Initial commit
commit ccbcebf0e8
14 changed files with 703 additions and 0 deletions
.gitignore (vendored, Normal file, +3)
@@ -0,0 +1,3 @@
env
data/*
__pycache__
Makefile (Normal file, +22)
@@ -0,0 +1,22 @@

DOCKER_COMPOSE := COMPOSE_MENU=0 docker compose --env-file ./env


default: up


up: env
	$(DOCKER_COMPOSE) up

build:
	$(DOCKER_COMPOSE) build

env:
	@echo "Please copy env.in to env and edit the contents to what you need."
	@false

signalbash:
	$(DOCKER_COMPOSE) run --rm signal-cli bash

import_factoids:
	$(DOCKER_COMPOSE) run --rm init-db
README.md (Normal file, +70)
@@ -0,0 +1,70 @@

# What the devil is this?

It's a Signal bot that emits Infobot-style factoids, with an LLM for fuzzy matching.

When a user asks a question, the bot first checks the database for a verbatim answer, and emits that if it finds one.

If there's no exact match:

- the LLM parses the question into one or more topics ("tell me about alice and bob" becomes `["alice", "bob"]`)
- the topics are vector-encoded and queried against the encodings in the factoid database
- the nearest match for each topic is sent back to the LLM, which is then asked to phrase it back to the user
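
In sketch form, the lookup amounts to this (condensed from `app/main.py` in this commit; pgvector's `<->` operator does the nearest-neighbour search):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

async def lookup(conn, topic: str):
    # Verbatim match first...
    row = await conn.fetchrow(
        "SELECT trigger, response, copula FROM factoids WHERE trigger = $1",
        topic,
    )
    if row is None:
        # ...otherwise the nearest neighbour in embedding space.
        emb = model.encode(topic)
        emb_str = f"[{', '.join(map(str, emb))}]"
        row = await conn.fetchrow(
            "SELECT trigger, response, copula FROM factoids"
            " ORDER BY embedding <-> $1::vector LIMIT 1",
            emb_str,
        )
    return row
```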
# Why?

Huh?


# How do I use it?

You need CUDA working in Docker, or to edit the docker compose files to take the GPU bits out (the `devices:` lists and the `deploy:` GPU reservations) and rely on CPU.

At present the LLM seems to require about 2GB of GPU RAM, which is really small as LLMs go. My PC works harder playing Balatro.

# What are its limitations?

It doesn't have heaps of feature parity with the old Perl infobot. The right way to get that might be to hack on the old bot code and use it as the main chat parser for this; I don't have a ton of desire to sit down and code my own implementation of the entire thing.

Some important stuff we're missing right now:

- Creating new factoids
- Understanding when questions are being asked of the bot, so it doesn't just respond to every single thing that's said
- Botsnacks

And some non-infobot stuff we could use:

- A better prompt for the LLM to integrate multiple factoids into a single response
- Some security precautions against prompt injection etc. (at the moment it's just "trust only those on the allowlist")

## Initialize the database

This took around an hour to do 300k factoids, and a fair amount of compute/GPU power. There's no consistency or duplicate checking at the moment, so you're best off trashing the Postgres data dir first.

- Dump the factoid database into `is.txt` and `are.txt` and put them in `scripts/` (tab-separated lines: `topic\tresponse`; see the sample below)
- `make import_factoids` (note that `docker-compose-initdb.yaml` declares the `dumant` network as external, so bring the main stack up once, or create the network by hand, before running this)
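
A couple of invented lines showing the expected shape of `scripts/is.txt` (the topics and responses here are made up; pipe-separated options and `<reply>` are honoured by the bot):

```
sky	blue | <reply>Look!
paris	the capital of France
```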
## Prepare a Signal account

Making signal-cli work can be fairly involved. Check the wiki at the signal-cli repo for the details; to get a shell inside the signal-cli container, run:

- `make signalbash`

## Create an env file

- Copy `env.in` to `env` and edit its contents to what you need.

## Start the server

- `make`

app/Dockerfile (Normal file, +9)
@@ -0,0 +1,9 @@
FROM python:3.11-slim

WORKDIR /app
COPY . /app

RUN pip install --no-cache-dir -r requirements.txt

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
app/main.py (Normal file, +244)
@@ -0,0 +1,244 @@
import os
import asyncpg
from fastapi import FastAPI, Request
from sentence_transformers import SentenceTransformer
import httpx
import random

DB_CONFIG = {
    "host": os.getenv("DB_HOST"),
    "port": int(os.getenv("DB_PORT", "5432")),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "database": os.getenv("DB_NAME"),
}
LLM_API_URL = os.getenv("LLM_API_URL", "http://llm:80")

app = FastAPI()
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


@app.on_event("startup")
async def startup():
    app.state.db = await asyncpg.create_pool(**DB_CONFIG)


@app.on_event("shutdown")
async def shutdown():
    await app.state.db.close()


async def extract_triggers(user_query: str) -> list[str]:
    """Ask the LLM to extract factoid-like terms from a user query."""

    # Few-shot ChatML prompt: the examples show the model it should reply
    # with nothing but a comma-separated list of triggers.
    prompt = f"""<|im_start|>system
You are a factoid trigger extractor. Extract a list of keywords or short phrases that might match entries in an infobot-style knowledge base.
Respond only with a comma-separated list of triggers. Do not answer the question or correct spelling or grammar.
<|im_end|>
<|im_start|>user
Who is steve?
<|im_end|>
<|im_start|>assistant
steve
<|im_end|>
<|im_start|>user
Tell me about HTTP status codes.
<|im_end|>
<|im_start|>assistant
HTTP status codes
<|im_end|>
<|im_start|>user
Who are you and your Gary
<|im_end|>
<|im_start|>assistant
you, Gary
<|im_end|>
<|im_start|>user
{user_query}
<|im_end|>
<|im_start|>assistant
"""

    payload = {
        "inputs": prompt,
        "parameters": {
            "temperature": 0.3,
            "max_new_tokens": 64,
        },
    }

    print("PAYLOAD:", payload)
    async with httpx.AsyncClient() as client:
        r = await client.post(
            f"{LLM_API_URL}/generate",
            json=payload,
            timeout=httpx.Timeout(120.0),
        )
    if r.status_code != 200:
        return []
    output = r.json().get("generated_text", "")

    print("OUTPUT:", output)

    # Parse the output as a comma-separated list, tolerating stray
    # brackets and quotes.
    triggers = []
    for line in output.splitlines():
        line = line.strip(" []\"'")
        if line:
            triggers.extend([t.strip(" \"'") for t in line.split(",") if t.strip()])
    return list(set(triggers))


@app.post("/ask")
async def ask(request: Request):
    body = await request.json()
    query = body.get("query", "").strip()
    if not query:
        return {"error": "Missing query"}
    responses = []

    # First, try to match the query verbatim.
    async with app.state.db.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT trigger, response, copula
            FROM factoids
            WHERE trigger = $1
            """,
            query,
        )
        if rows:
            # Flatten every matching row into one option per pipe-separated
            # response, then pick one at random.
            options = []
            for row in rows:
                for option in row["response"].split("|"):
                    options.append(dict(row) | {"response": option.strip()})
            row = random.choice(options)
            print("CHOICE:", row)
            triggers = [query]
            choices = [row["response"]]
            # Use <reply> if present, otherwise prepend the trigger
            for c in choices:
                if c.startswith("<reply>"):
                    responses.append(c[len("<reply>"):].strip())
                else:
                    responses.append(f"{row['trigger']} {row['copula']} {c}")

        else:
            triggers = await extract_triggers(query)
            if not triggers:
                return {"response": "I don't know that one."}

            print("triggers: ", triggers)
            for trigger in triggers:

                row = await conn.fetchrow(
                    """
                    SELECT trigger, response, copula
                    FROM factoids
                    WHERE trigger = $1
                    LIMIT 1
                    """,
                    trigger
                )

                if not row:
                    # No exact match for this trigger: fall back to the
                    # nearest neighbour in embedding space.
                    embedding = model.encode(trigger)
                    embedding_str = f"[{', '.join(map(str, embedding))}]"

                    row = await conn.fetchrow(
                        """
                        SELECT trigger, response, copula
                        FROM factoids
                        ORDER BY embedding <-> $1::vector
                        LIMIT 1
                        """,
                        embedding_str
                    )

                if row:
                    print("ROW:", row)
                    response = row["response"]
                    # Pick a random option from pipe-separated responses
                    options = [r.strip() for r in response.split("|") if r.strip()]
                    if options:
                        choices = [random.choice(options)]
                    else:
                        choices = [response]
                    # Use <reply> if present, otherwise prepend the trigger
                    for c in choices:
                        if c.startswith("<reply>"):
                            responses.append(c[len("<reply>"):].strip())
                        else:
                            responses.append(f"{row['trigger']} {row['copula']} {c}")

    if not responses:
        return {"response": "I don't know any of those."}

    # Ask the LLM to summarize the collected factoids.
    responses_str = "\nValue: ".join(responses)
    summary_prompt = f"""<|im_start|>system
You are a summarizer for a fact-based chatbot. Your task is to condense database entries into short, accurate one-line summaries. Do not speculate, define, or add new facts. Do not correct spelling or phrasing from the facts or triggers. Do not mix context from prior triggers.
<|im_end|>

<|im_start|>user
Summarize the following database entry.

Trigger: Paris
Value: Paris is the capital of France
Value: Paris is located in the north-central part of the country.
<|im_end|>

<|im_start|>assistant
Paris is the capital of France and located in the north-central part of the country.
<|im_end|>

<|im_start|>user
Summarize the following database entry.

Trigger: squinky, spacehobo
Value: spacehobo is a Citizen.
Value: squinky is kinky
<|im_end|>

<|im_start|>assistant
spacehobo is a Citizen and squinky is kinky
<|im_end|>

<|im_start|>user
Summarize the following database entry.

Trigger: sky
Value: sky is blue
Value: the sky is the big thing outside when you look up
Value: Look!
<|im_end|>

<|im_start|>assistant
Look up at that big blue thing outside!
<|im_end|>

<|im_start|>user
Summarize the following database entry.

Trigger: {query}
Value: {responses_str}
<|im_end|>"""

    print("SUMMARY PAYLOAD:", summary_prompt)
    async with httpx.AsyncClient() as client:
        r = await client.post(
            f"{LLM_API_URL}/generate",
            json={"inputs": summary_prompt, "parameters": {"temperature": 0.8, "max_new_tokens": 200}},
            timeout=httpx.Timeout(120.0),
        )
    final_response = r.json().get("generated_text", "\n".join(responses))
    print("FINAL RESPONSE:", final_response)

    return {"reply": final_response, "matches": responses, "triggers": triggers}
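
Once the stack is up, the /ask endpoint can be smoke-tested directly. A minimal sketch using httpx (already in app/requirements.txt), assuming the default "8000:8000" port mapping from docker-compose.yaml:

import httpx

# POST a question to the app's /ask endpoint.
r = httpx.post(
    "http://localhost:8000/ask",
    json={"query": "who is steve?"},
    timeout=120.0,
)
print(r.json())  # on a successful lookup: reply, matches, triggers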
app/requirements.txt (Normal file, +6)
@@ -0,0 +1,6 @@
fastapi
uvicorn[standard]
asyncpg
sentence-transformers
httpx
docker-compose-initdb.yaml (Normal file, +42)
@@ -0,0 +1,42 @@

services:

  init-db:
    build:
      context: .
      dockerfile: initdb.Dockerfile
    container_name: infobot-init
    depends_on:
      db:
        condition: service_healthy
    volumes:
      - ./scripts:/scripts:ro
      - ./data/hf_cache:/root/.cache/huggingface
    environment:
      DB_HOST: db
      DB_PORT: 5432
      DB_USER: infobot
      DB_PASSWORD: infobot
      DB_NAME: infobot
    entrypoint: ["python", "/scripts/init_and_load.py"]
    devices:
      - /dev/nvidia-uvm
      - /dev/nvidia-uvm-tools
      - /dev/nvidia-modeset
      - /dev/nvidiactl
      - /dev/nvidia0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: ["gpu"]
    networks: ["dumant"]


networks:
  dumant:
    name: dumant
    external: true
docker-compose.yaml (Normal file, +105)
@@ -0,0 +1,105 @@

services:
  db:
    image: ankane/pgvector
    container_name: infobot-db
    restart: unless-stopped
    ports:
      - "5432:5432"
    environment:
      POSTGRES_DB: infobot
      POSTGRES_USER: infobot
      POSTGRES_PASSWORD: infobot
    volumes:
      - ./data/postgres:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "infobot"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks: ["dumant"]

  llm:
    image: ghcr.io/huggingface/text-generation-inference:1.4
    container_name: infobot-llm
    restart: unless-stopped
    ports:
      - "8080:80"
    environment:
      HF_HUB_ENABLE_HF_TRANSFER: 0
      # PYTORCH_CUDA_ALLOC_CONF: max_split_size_mb:64
      MODEL_ID: ${MODEL_ID}
    volumes:
      - ./data/models:/data
    devices:
      - /dev/nvidia-uvm
      - /dev/nvidia-uvm-tools
      - /dev/nvidia-modeset
      - /dev/nvidiactl
      - /dev/nvidia0
    command:
      - --max-total-tokens=1024
      - --max-batch-prefill-tokens=256
      - --max-input-length=256
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: ["gpu"]
    networks: ["dumant"]

  app:
    build: ./app
    container_name: infobot-app
    restart: unless-stopped
    depends_on:
      db:
        condition: service_healthy
      llm:
        condition: service_started
    ports:
      - "8000:8000"
    volumes:
      - ./app:/app
    environment:
      DB_HOST: db
      DB_PORT: 5432
      DB_USER: infobot
      DB_PASSWORD: infobot
      DB_NAME: infobot
      LLM_API_URL: http://llm:80
    networks: ["dumant"]

  signal-app:
    build: ./signal-app
    container_name: infobot-signal-app
    restart: unless-stopped
    volumes:
      - ./signal-app:/app
    environment:
      # Pass the allowlists through to the bot; signal-app/main.py reads
      # these from the environment, and they are otherwise absent inside
      # the container.
      SIGNAL_USER_ALLOWLIST: ${SIGNAL_USER_ALLOWLIST}
      SIGNAL_GROUP_ALLOWLIST: ${SIGNAL_GROUP_ALLOWLIST}
    networks: ["dumant"]

  signal-cli:
    #image: dogukanakkaya/signal-cli
    image: registry.gitlab.com/packaging/signal-cli/signal-cli-native:latest
    container_name: infobot-signal-cli
    restart: unless-stopped
    volumes:
      - ./data/signal-cli:/var/lib/signal-cli
    ports:
      - "7583:7583"
    tmpfs:
      - "/tmp:exec"
    command: -a ${SIGNAL_ID} daemon --tcp=0.0.0.0:7583 --receive-mode=on-start
    networks: ["dumant"]


volumes:
  postgres:
  models:

networks:
  dumant:
    name: dumant
env.in (Normal file, +14)
@@ -0,0 +1,14 @@
# Set this to the valid phone number associated with your Signal account
SIGNAL_ID=+12345678901

# Accept DMs from this list of users only (space-separated)
SIGNAL_USER_ALLOWLIST="+12345678901 +12345678901 +12345678901 +12345678901 +12345678901"

# Accept messages from anyone in these groups
SIGNAL_GROUP_ALLOWLIST="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefg1234567890= ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefg1234567890="

# Use this LLM.
# Visit https://huggingface.co/models?pipeline_tag=text-generation&sort=trending and try to use an "instruct" model
# The one listed here by default is rather small but adequate for now
MODEL_ID="Gensyn/Qwen2.5-0.5B-Instruct"
initdb.Dockerfile (Normal file, +7)
@@ -0,0 +1,7 @@
FROM python:3
#FROM python:3.11-slim

RUN pip install --no-cache-dir \
    psycopg2 \
    sentence_transformers \
    && /bin/true
scripts/init_and_load.py (Normal file, +68)
@@ -0,0 +1,68 @@
import os
import psycopg2
import csv
from sentence_transformers import SentenceTransformer

DB_HOST = os.environ["DB_HOST"]
DB_PORT = os.environ["DB_PORT"]
DB_USER = os.environ["DB_USER"]
DB_PASSWORD = os.environ["DB_PASSWORD"]
DB_NAME = os.environ["DB_NAME"]
TSV_IS = "/scripts/is.txt"
TSV_ARE = "/scripts/are.txt"

# Connect to DB
conn = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD
)
conn.autocommit = True
cur = conn.cursor()

# Ensure pgvector extension and table exist
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
cur.execute("""
    CREATE TABLE IF NOT EXISTS factoids (
        id SERIAL PRIMARY KEY,
        trigger TEXT NOT NULL,
        copula TEXT NOT NULL DEFAULT 'is',
        response TEXT NOT NULL,
        embedding VECTOR(384)
    );
""")

# Load model
print("Loading embedding model...")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Process TSVs one line at a time
for copula, TSV_PATH in (("is", TSV_IS), ("are", TSV_ARE)):

    print(f"Loading and inserting from {TSV_PATH}...")
    with open(TSV_PATH, "r", encoding="utf-8", errors="replace") as f:
        reader = csv.reader(f, delimiter="\t")
        count = 0
        for row in reader:
            if len(row) != 2:
                continue
            if count % 100 == 0:
                print(count, row[0], copula, row[1])
            count += 1
            trigger, response = row[0].strip(), row[1].strip()
            if not trigger or not response:
                continue
            # The embedding goes over as a Postgres array, which pgvector
            # accepts for the vector column.
            embedding = model.encode(trigger).tolist()
            cur.execute(
                "INSERT INTO factoids (trigger, copula, response, embedding) VALUES (%s, %s, %s, %s)",
                (trigger, copula, response, embedding)
            )

print("All factoids loaded.")
cur.close()
conn.close()
signal-app/Dockerfile (Normal file, +9)
@@ -0,0 +1,9 @@
FROM python:3.11-slim

WORKDIR /app
COPY requirements.txt /tmp

RUN pip install --no-cache-dir -r /tmp/requirements.txt

CMD [ "python", "-u", "/app/main.py" ]
signal-app/main.py (Normal file, +102)
@@ -0,0 +1,102 @@
import socket
import json
import requests
import os

# Space-separated allowlists from the environment.
# (Note the direction: split the env value, not the literal string " ".)
SIGNAL_USER_ALLOWLIST = os.getenv("SIGNAL_USER_ALLOWLIST", "").split()
SIGNAL_GROUP_ALLOWLIST = os.getenv("SIGNAL_GROUP_ALLOWLIST", "").split()

LLM_API_URL = os.getenv("LLM_API_URL", "http://llm:80")
#SIGNAL_API = "http://signal-cli:7583"
BOT_API = "http://app:8000/ask"

def send_json_rpc(method, params=None, request_id=1, host="signal-cli", port=7583):
    """Send a JSON-RPC 2.0 request over TCP to signal-cli."""
    request = {
        "jsonrpc": "2.0",
        "id": request_id,
        "method": method,
        "params": params or {}
    }
    request_str = json.dumps(request) + "\n"

    with socket.create_connection((host, port)) as sock:
        sock.sendall(request_str.encode("utf-8"))
        response = recv_all(sock)
        return json.loads(response)

def recv_all(sock):
    """Read until newline (signal-cli terminates each JSON-RPC response with \\n)."""
    buffer = b""
    while not buffer.endswith(b"\n"):
        data = sock.recv(4096)
        if not data:
            break
        buffer += data
    return buffer.decode("utf-8")

def process_message(msg):
    """Forward a message to the bot's /ask endpoint and return its reply."""
    with requests.Session() as session:
        result = session.post(BOT_API, json={"query": str(msg)})
        j = result.json()
        return j["reply"]


def listen_for_notifications(host="signal-cli", port=7583):
    with socket.create_connection((host, port)) as sock:
        print("Connected to signal-cli JSON-RPC")
        buffer = b""

        while True:
            chunk = sock.recv(4096)
            if not chunk:
                break
            buffer += chunk

            while b"\n" in buffer:
                line, buffer = buffer.split(b"\n", 1)
                try:
                    msg = json.loads(line)
                    print(msg)
                    if "method" in msg and msg["method"] == "receive":
                        envelope = msg["params"]["envelope"]
                        source = envelope["source"]
                        if "dataMessage" in envelope:
                            msg = envelope["dataMessage"]["message"]  # there are non-message messages, like read receipts
                            if "groupInfo" in envelope["dataMessage"]:
                                group_id = envelope["dataMessage"]["groupInfo"]["groupId"]
                                if group_id not in SIGNAL_GROUP_ALLOWLIST:
                                    print(f"GROUP RECV DENIED ({source}): {envelope}")
                                    continue  # skip this message, keep draining the buffer
                                print(f"GROUP ({group_id}/{source}): {msg}")
                                params = {"recipient": group_id, "groupId": group_id}
                            else:
                                if source not in SIGNAL_USER_ALLOWLIST:
                                    print(f"RECV DENIED ({source}): {envelope}")
                                    continue  # skip this message, keep draining the buffer

                                print(f"RECV ({source}): {msg}")
                                params = {"recipient": source}

                            params["message"] = process_message(msg)
                            print("PARAMS:", params)
                            result = send_json_rpc(
                                method="send",
                                params=params
                            )
                except json.JSONDecodeError:
                    print("Invalid JSON:", line)


# Example usage:
if __name__ == "__main__":
    # result = send_json_rpc(
    #     method="listIdentities",
    #     params={},
    # )
    # print("Response:", json.dumps(result, indent=2))
    listen_for_notifications()
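
For ad-hoc testing from inside the signal-app container, send_json_rpc can also be driven by hand. A sketch mirroring the "send" call the listener makes (the recipient number is a placeholder):

# Send a one-off message through the signal-cli daemon.
result = send_json_rpc(
    method="send",
    params={"recipient": "+12345678901", "message": "hello from infobot"},
)
print(json.dumps(result, indent=2))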
signal-app/requirements.txt (Normal file, +2)
@@ -0,0 +1,2 @@
requests
jsonrpclib