// knowledge_base_indexer.ks — embed text files and store in SQLite
// Modules: curl plugin, sqlite plugin, Hash, File, Path, OS, List, String, Time, Konsol
// Usage:   minks knowledge_base_indexer.ks <api_key> <docs_dir> [db_path]
//
// Walks a directory of .txt files, generates an embedding for each file's
// content via the OpenAI Embeddings API, and stores the chunk + embedding
// in a SQLite database for later retrieval or similarity search.
//
// One file = one chunk (simpler for learning; split by paragraph for production).
// Sample docs ship alongside this script: note1.txt, note2.txt, note3.txt.

#include "curl"
#include "sqlite"

Konsol:Print("=== Knowledge Base Indexer ===");

// ── Step 1: Read arguments ────────────────────────────────────────────────────
// Positional arguments: 0 = API key, 1 = documents directory, 2 = optional
// database path (defaults to "kb.db").

Var:List args;
OS:Args(args);
Var:Number argc;
List:Size(args, argc);

if (argc < 2) {
    Konsol:Print("Usage: minks knowledge_base_indexer.ks <api_key> <docs_dir> [db_path]");
    OS:Exit(1);
}

Var:String apiKey;
Var:String docsDir;
List:Get(0, args, apiKey);
List:Get(1, args, docsDir);

Var:String dbPath = "kb.db";
if (argc >= 3) {
    List:Get(2, args, dbPath);
}

Var:Boolean dirOk;
Path:IsDir(docsDir, dirOk);
if (!dirOk) {
    Konsol:Print("Not a directory: ${docsDir}");
    OS:Exit(1);
}

// ── Step 2: Open the database and create the embeddings table ─────────────────

Var:Number db;
SQLite:Open(dbPath, db);

Var:Boolean ok;
SQLite:Exec(db, "CREATE TABLE IF NOT EXISTS embeddings (id TEXT PRIMARY KEY, source TEXT, chunk TEXT, embedding TEXT, indexed_at TEXT)", ok);

// Stop early if the schema could not be created: every later query and insert
// would fail as well, and the run would otherwise report misleading counts.
if (!ok) {
    Konsol:Print("Could not initialise database: ${dbPath}");
    OS:Exit(1);
}

// ── Step 3: List .txt files ───────────────────────────────────────────────────

Var:List entries;
OS:ListDir(docsDir, entries);
Var:Number entryCount;
List:Size(entries, entryCount);

Var:List txtFiles;
for (Number i = 0; i < entryCount; i++) {
    Var:String name;
    List:Get(i, entries, name);
    Var:String ext;
    Path:Extension(name, ext);
    if (ext == ".txt") {
        Var:String fp;
        Path:Join(docsDir, name, fp);
        List:Push(fp, txtFiles);
    }
}

Var:Number fileCount;
List:Size(txtFiles, fileCount);
Konsol:Print("Found ${fileCount} .txt file(s) in ${docsDir}");

// ── Step 4: Set shared curl headers ──────────────────────────────────────────

Curl:SetHeader("Authorization", "Bearer ${apiKey}");
Curl:SetHeader("Content-Type", "application/json");
Curl:SetTimeout(30);

// ── Step 5: Embed and store each file ────────────────────────────────────────

Var:Number indexed = 0;
Var:Number skipped = 0;

for (Number fi = 0; fi < fileCount; fi++) {
    Var:String filePath;
    List:Get(fi, txtFiles, filePath);
    Var:String fileName;
    Path:FileName(filePath, fileName);
    Var:Number num = fi + 1;
    Konsol:Print("[${num}/${fileCount}] ${fileName}");

    // Read file content line by line, re-joining the lines with "\n".
    Var:Number fh;
    File:Open(filePath, "r", fh);
    Var:String chunk = "";
    Var:String line;
    Var:Boolean eof;
    File:EOF(fh, eof);
    while (!eof) {
        File:ReadLine(fh, line);
        chunk = chunk + line + "\n";
        File:EOF(fh, eof);
    }
    File:Close(fh);
    String:Trim(chunk, chunk);

    if (chunk == "") {
        // Nothing to embed: an empty input would only cost a failed API call.
        Konsol:Print(" empty file — skipping.");
        skipped = skipped + 1;
    } else {
        // SHA-256 of the chunk content serves as a stable row ID. Content that
        // is already present in the table is skipped below, so unchanged files
        // are not sent to the API again.
        Var:String chunkId;
        Hash:SHA256(chunk, chunkId);

        // Check if already indexed. chunkId is a hash digest, not user text,
        // so it is interpolated into the query without further escaping.
        Var:String existing;
        SQLite:QueryOne(db, "SELECT id FROM embeddings WHERE id = '${chunkId}'", existing);

        if (existing != "" && existing != "null") {
            Konsol:Print(" already indexed — skipping.");
            skipped = skipped + 1;
        } else {
            // Escape the chunk for use inside a JSON string literal.
            // Backslashes go first so the escapes added afterwards are not
            // themselves doubled. Newlines must be escaped because the chunk
            // is multi-line and raw control characters are invalid in JSON.
            // NOTE(review): other control characters (below 0x20) are not
            // handled here — confirm the input files are plain text.
            Var:String safeChunk;
            String:Replace(chunk, "\\", "\\\\", safeChunk);
            String:Replace(safeChunk, "\"", "\\\"", safeChunk);
            String:Replace(safeChunk, "\n", "\\n", safeChunk);
            String:Replace(safeChunk, "\r", "\\r", safeChunk);
            String:Replace(safeChunk, "\t", "\\t", safeChunk);

            // Call the embeddings API.
            Var:String payload = "{\"model\":\"text-embedding-3-small\",\"input\":\"" + safeChunk + "\"}";
            Var:String body;
            Curl:Post("https://api.openai.com/v1/embeddings", payload, body);
            Var:Number status;
            Curl:Status(status);

            if (status != 200) {
                Konsol:Print(" API error ${status} — skipping.");
                skipped = skipped + 1;
            } else {
                // Store the raw embedding response body as a JSON string.
                // In production: extract data[0].embedding and store the float array.
                // Single quotes are doubled so the values can sit inside SQL
                // string literals.
                Var:String safeBody;
                String:Replace(body, "'", "''", safeBody);
                Var:String safeSource;
                String:Replace(fileName, "'", "''", safeSource);
                Var:String safeChunkSql;
                String:Replace(chunk, "'", "''", safeChunkSql);

                // Get current date for indexed_at (components are not zero-padded).
                Var:Number yr;
                Var:Number mo;
                Var:Number dy;
                Time:GetYear(yr);
                Time:GetMonth(mo);
                Time:GetDay(dy);
                Var:String ts = "${yr}-${mo}-${dy}";

                SQLite:Exec(db, "INSERT OR REPLACE INTO embeddings (id, source, chunk, embedding, indexed_at) VALUES ('${chunkId}', '${safeSource}', '${safeChunkSql}', '${safeBody}', '${ts}')", ok);

                // Only count the chunk as indexed when the row was written.
                if (!ok) {
                    Konsol:Print(" database write failed — skipping.");
                    skipped = skipped + 1;
                } else {
                    Var:Number chunkLen;
                    String:Length(chunk, chunkLen);
                    Konsol:Print(" indexed (${chunkLen} chars).");
                    indexed = indexed + 1;
                }
            }
        }
    }
}

// ── Step 6: Report ────────────────────────────────────────────────────────────

SQLite:Close(db);
Konsol:Print("Indexed ${indexed} chunk(s), skipped ${skipped} — database: ${dbPath}");
Konsol:Print("Done.");