Data + AI

Three recipes that combine structured data (CSV, text files) with LLM APIs: classify rows, normalize messy records, and build a searchable knowledge base.

Back to Kookbook


Dataset Annotation Runner

Modules: curl plugin · CSV · JSON · File · String · Konsol

Reads a CSV, sends each row's text field to an LLM for sentiment classification (positive / negative / neutral), and writes an annotated CSV with an added sentiment column.

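Key patterns: parsing the input with CSV:Parse and reading cells with CSV:Get; one Curl:Post per row wrapped in try/catch; extracting the reply with JSON:Get and the path choices.0.message.content; building the output with CSV:Set and CSV:Stringify.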

Ships with reviews_raw.csv (8 product reviews).

Usage

minks dataset_annotator.ks <api_key> <input.csv> <output.csv>

minks dataset_annotator.ks sk-... reviews_raw.csv reviews_labeled.csv

Sample output

=== Dataset Annotator ===
Loaded 9 rows (including header).
[1/8] 1: Great product - really fast delivery...
  → positive
[2/8] 2: Broke after one week...
  → negative
...
Annotated 8 rows → reviews_labeled.csv
Done.

Sample data

id,review_text
1,Great product - really fast delivery and excellent build quality!
2,Broke after one week. Very disappointed with the durability.
3,Average quality. Nothing special but it does the job.
4,Exceeded my expectations. Will definitely buy again!
5,Terrible customer service. Waited 3 weeks and got the wrong item.
6,Solid value for the price. Recommended.
7,Instructions were confusing but the product itself is fine.
8,Absolute garbage. Returned immediately.

Script

// dataset_annotator.ks - classify CSV rows with an LLM, write labeled output
// Modules: curl plugin, CSV, JSON, File, String, OS, List, Konsol
// Usage:  minks dataset_annotator.ks <api_key> <input.csv> <output.csv>
//
// Reads a CSV whose column 0 is a row id and whose column 1 is the text to
// classify, sends each row's text to the LLM for sentiment classification
// (positive / negative / neutral), and writes a new CSV with an added
// "sentiment" column.  A row whose request fails is still written, with the
// label "error", so the output has one row per input row.
//
// Sample input ships alongside this script: reviews_raw.csv

#include "curl"

Konsol:Print("=== Dataset Annotator ===");

// ── Step 1: Read arguments ────────────────────────────────────────────────────
Var:List args;
OS:Args(args);

Var:Number argc;
List:Size(args, argc);

if (argc < 3) {
    Konsol:Print("Usage: minks dataset_annotator.ks <api_key> <input.csv> <output.csv>");
    Konsol:Exit(1);
}

Var:String apiKey;
Var:String inPath;
Var:String outPath;
List:Get(0, args, apiKey);
List:Get(1, args, inPath);
List:Get(2, args, outPath);

// ── Step 2: Read and parse the input CSV ─────────────────────────────────────
// NOTE(review): the handle returned by File:Open is not checked - confirm how
// a missing or unreadable input file is reported and handle it here.
Var:Number rfh;
File:Open(inPath, "r", rfh);

// Read the file line by line and rebuild it as one newline-separated string.
Var:String csvContent = "";
Var:String rline;
Var:Boolean reof;
File:EOF(rfh, reof);
while (!reof) {
    File:ReadLine(rfh, rline);
    csvContent = csvContent + rline + "\n";
    File:EOF(rfh, reof);
}
File:Close(rfh);

CSV:Parse(csvContent, inp);

Var:Number rowCount;
CSV:Rows(inp, rowCount);
Konsol:Print("Loaded ${rowCount} rows (including header).");

// Without at least a header row there is nothing to copy or annotate.
if (rowCount < 1) {
    Konsol:Print("No rows found in ${inPath}.");
    CSV:Free(inp);
    Konsol:Exit(1);
}

// Number of data rows (everything after the header).  Computed once so the
// progress line and the final summary interpolate a plain variable.
Var:Number dataRows = rowCount - 1;

// ── Step 3: Set up the output CSV with an added "sentiment" column ────────────
// Copy the header row from input, append "sentiment".
Var:String h0;
Var:String h1;
CSV:Get(0, 0, inp, h0);
CSV:Get(0, 1, inp, h1);
CSV:Set(0, 0, h0, out);
CSV:Set(0, 1, h1, out);
CSV:Set(0, 2, "sentiment", out);

// ── Step 4: Set shared headers ────────────────────────────────────────────────
Curl:SetHeader("Authorization", "Bearer ${apiKey}");
Curl:SetHeader("Content-Type", "application/json");
Curl:SetTimeout(20);

// The system message is the same for every row, so build it once.
Var:String sysMsg = "You are a sentiment classifier. Respond with exactly one word: positive, negative, or neutral.";

// Rows that ended up labeled "error".
Var:Number failed = 0;

// ── Step 5: Annotate each data row ────────────────────────────────────────────
// Skip row 0 (header); process rows 1..rowCount-1.
for (Number i = 1; i < rowCount; i++) {
    Var:String idVal;
    Var:String reviewText;
    CSV:Get(i, 0, inp, idVal);
    CSV:Get(i, 1, inp, reviewText);

    Konsol:Print("[${i}/${dataRows}] ${idVal}: ${reviewText}");

    // Escape the text for embedding in a JSON string literal.  Backslashes
    // must be doubled first, otherwise the backslashes added for quotes and
    // newlines would themselves be doubled.
    // NOTE(review): tabs, carriage returns and other control characters are
    // not escaped here - add them if the input may contain any.
    Var:String escBackslash;
    Var:String escQuote;
    Var:String safeText;
    String:Replace(reviewText, "\\", "\\\\", escBackslash);
    String:Replace(escBackslash, "\"", "\\\"", escQuote);
    String:Replace(escQuote, "\n", "\\n", safeText);

    // Ask the LLM to classify sentiment with a single word.
    Var:String payload = """{
      "model": "gpt-4o-mini",
      "messages": [
        {"role": "system", "content": "${sysMsg}"},
        {"role": "user", "content": "${safeText}"}
      ],
      "max_tokens": 5
    }""";

    Var:String body;
    Var:String curlErr = "";
    try {
        Curl:Post("https://api.openai.com/v1/chat/completions", payload, body);
    } catch (CurlException e) {
        curlErr = e.message;
    }

    Var:Number status;
    Curl:Status(status);

    // "error" is the label written when the request or the reply is unusable.
    Var:String label = "error";
    if (curlErr == "" && status == 200) {
        // NOTE(review): resp is never released; if the JSON module has a
        // counterpart to CSV:Free, call it once the label has been read.
        JSON:Parse(body, resp);
        JSON:Get("choices.0.message.content", resp, label);
        String:Trim(label, label);
        if (label == "") {
            // A 200 response with no usable content is also a failure.
            label = "error";
        }
    } else {
        // Report why the row failed, like the other scripts on this page do.
        Var:String errMsg = curlErr != "" ? curlErr : "HTTP ${status}";
        Konsol:Print("  API error ${errMsg}");
    }

    if (label == "error") {
        failed = failed + 1;
    }

    Konsol:Print("  → ${label}");

    // Write the row with its new label.
    CSV:Set(i, 0, idVal, out);
    CSV:Set(i, 1, reviewText, out);
    CSV:Set(i, 2, label, out);
}

CSV:Free(inp);

// ── Step 6: Save the annotated CSV ────────────────────────────────────────────
Var:String csvText;
CSV:Stringify(out, csvText);
CSV:Free(out);

Var:Number wh;
File:Open(outPath, "w", wh);
File:Write(csvText, wh);
File:Close(wh);

Konsol:Print("Annotated ${dataRows} rows → ${outPath}");
if (failed != 0) {
    Konsol:Print("${failed} row(s) could not be classified and are labeled error.");
}
Konsol:Print("Done.");


AI-Assisted Data Cleaner

Modules: curl plugin · CSV · JSON · File · String · Konsol

Normalizes messy contact records: the LLM is instructed to return a corrected CSV row (proper-case name, E.164 phone, lowercase email), which is then parsed back into fields with CSV:Parse.

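Key patterns: constraining the LLM with a system message so it replies with a single CSV row; parsing that reply back into fields with CSV:Parse and CSV:Get; keeping the original row when the API call fails.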

Ships with dirty_contacts.csv (6 malformed contact records).

Usage

minks data_cleaner.ks <api_key> <input.csv> <output.csv>

minks data_cleaner.ks sk-... dirty_contacts.csv clean_contacts.csv

Sample output

=== AI Data Cleaner ===
Loaded 7 rows (including header).
[1] john smith,555 123 4567,john.smith@gmail.com
  → John Smith, +15551234567, john.smith@gmail.com
[2] ALICE JOHNSON,(415)555-9876,ALICE@EXAMPLE.COM
  → Alice Johnson, +14155559876, alice@example.com
...
Cleaned 6 rows → clean_contacts.csv
Done.

Sample data

name,phone,email
john smith,555 123 4567,john.smith@gmail.com
ALICE JOHNSON,(415)555-9876,ALICE@EXAMPLE.COM
Bob        Williams,+1 800 555 0100,bob.williams@  hotmail.com
sara jones,5551234,sara@jones
Mike OBrien,555.987.6543,mike@obrien.net
DR. PATRICIA LEE,1-555-222-3333,patricia.lee@hospital.org

Script

// data_cleaner.ks - normalize messy CSV rows with an LLM
// Modules: curl plugin, CSV, JSON, File, String, OS, List, Konsol
// Usage:  minks data_cleaner.ks <api_key> <input.csv> <output.csv>
//
// Reads a CSV of contact records (name, phone, email) with inconsistent
// formatting, asks the LLM to normalize each row (proper-case name, E.164
// phone, lowercase email), and writes a clean CSV.  The LLM is instructed to
// respond with CSV only so the output can be parsed directly.
//
// A row is written unchanged when the API call fails or when the reply does
// not parse into three non-empty fields, so the output always has one row
// per input row.
//
// Sample input ships alongside this script: dirty_contacts.csv

#include "curl"

Konsol:Print("=== AI Data Cleaner ===");

// ── Step 1: Read arguments ────────────────────────────────────────────────────
Var:List args;
OS:Args(args);

Var:Number argc;
List:Size(args, argc);

if (argc < 3) {
    Konsol:Print("Usage: minks data_cleaner.ks <api_key> <input.csv> <output.csv>");
    Konsol:Exit(1);
}

Var:String apiKey;
Var:String inPath;
Var:String outPath;
List:Get(0, args, apiKey);
List:Get(1, args, inPath);
List:Get(2, args, outPath);

// ── Step 2: Read the input CSV ────────────────────────────────────────────────
// NOTE(review): the handle returned by File:Open is not checked - confirm how
// a missing or unreadable input file is reported and handle it here.
Var:Number rfh;
File:Open(inPath, "r", rfh);

// Read the file line by line and rebuild it as one newline-separated string.
Var:String csvContent = "";
Var:String rline;
Var:Boolean reof;
File:EOF(rfh, reof);
while (!reof) {
    File:ReadLine(rfh, rline);
    csvContent = csvContent + rline + "\n";
    File:EOF(rfh, reof);
}
File:Close(rfh);

CSV:Parse(csvContent, inp);

Var:Number rowCount;
CSV:Rows(inp, rowCount);
Konsol:Print("Loaded ${rowCount} rows (including header).");

// Without at least a header row there is nothing to copy or clean.
if (rowCount < 1) {
    Konsol:Print("No rows found in ${inPath}.");
    CSV:Free(inp);
    Konsol:Exit(1);
}

// ── Step 3: Write the header row directly to the output ───────────────────────
Var:String h0;
Var:String h1;
Var:String h2;
CSV:Get(0, 0, inp, h0);
CSV:Get(0, 1, inp, h1);
CSV:Get(0, 2, inp, h2);
CSV:Set(0, 0, h0, out);
CSV:Set(0, 1, h1, out);
CSV:Set(0, 2, h2, out);

// ── Step 4: Set shared headers ────────────────────────────────────────────────
Curl:SetHeader("Authorization", "Bearer ${apiKey}");
Curl:SetHeader("Content-Type", "application/json");
Curl:SetTimeout(20);

// Instruction given to the LLM as a system message.
Var:String sysMsg = "You are a data normalization assistant. Given a CSV row with name, phone, email, return a single corrected CSV row in the same order: proper-case name, E.164 phone format, lowercase email. Respond with ONLY the CSV row - no explanation, no quotes around the whole line.";

// Rows written unchanged because the API call or the reply was unusable.
Var:Number kept = 0;

// ── Step 5: Clean each data row ───────────────────────────────────────────────
// Skip row 0 (header); process rows 1..rowCount-1.
for (Number i = 1; i < rowCount; i++) {
    Var:String name;
    Var:String phone;
    Var:String email;
    CSV:Get(i, 0, inp, name);
    CSV:Get(i, 1, inp, phone);
    CSV:Get(i, 2, inp, email);

    // Build the user message as a raw CSV row for the LLM to correct.
    // NOTE(review): the fields are joined without CSV quoting, so a field
    // that itself contains a comma is sent as an ambiguous row.
    Var:String rawRow = name + "," + phone + "," + email;
    Konsol:Print("[${i}] ${rawRow}");

    // Escape the row for embedding in a JSON string literal.  Backslashes
    // must be doubled first, otherwise the backslashes added for quotes and
    // newlines would themselves be doubled.
    Var:String escBackslash;
    Var:String escQuote;
    Var:String safeRow;
    String:Replace(rawRow, "\\", "\\\\", escBackslash);
    String:Replace(escBackslash, "\"", "\\\"", escQuote);
    String:Replace(escQuote, "\n", "\\n", safeRow);

    Var:String payload = """{
      "model": "gpt-4o-mini",
      "messages": [
        {"role": "system", "content": "${sysMsg}"},
        {"role": "user", "content": "${safeRow}"}
      ],
      "max_tokens": 60
    }""";

    Var:String body;
    Var:String curlErr = "";
    try {
        Curl:Post("https://api.openai.com/v1/chat/completions", payload, body);
    } catch (CurlException e) {
        curlErr = e.message;
    }

    Var:Number status;
    Curl:Status(status);

    // Start from the original values; they are replaced only when the LLM
    // returned a usable row, so the output row is written in one place.
    Var:String outName = name;
    Var:String outPhone = phone;
    Var:String outEmail = email;

    if (curlErr != "" || status != 200) {
        Var:String errMsg = curlErr != "" ? curlErr : "HTTP ${status}";
        Konsol:Print("  API error ${errMsg} - keeping original row.");
        kept = kept + 1;
    } else {
        // NOTE(review): resp is never released; if the JSON module has a
        // counterpart to CSV:Free, call it once the content has been read.
        JSON:Parse(body, resp);
        Var:String cleaned = "";
        JSON:Get("choices.0.message.content", resp, cleaned);
        String:Trim(cleaned, cleaned);

        // Parse the LLM's CSV response (one row) back into fields.
        // assumes CSV:Get leaves the target empty when the column is
        // missing from the reply - TODO confirm
        Var:String cName = "";
        Var:String cPhone = "";
        Var:String cEmail = "";
        if (cleaned != "") {
            CSV:Parse(cleaned, cleanedRow);
            CSV:Get(0, 0, cleanedRow, cName);
            CSV:Get(0, 1, cleanedRow, cPhone);
            CSV:Get(0, 2, cleanedRow, cEmail);
            CSV:Free(cleanedRow);
        }

        if (cName == "" || cPhone == "" || cEmail == "") {
            // Never overwrite real data with blanks from a malformed reply.
            Konsol:Print("  Malformed response - keeping original row.");
            kept = kept + 1;
        } else {
            Konsol:Print("  → ${cName}, ${cPhone}, ${cEmail}");
            outName = cName;
            outPhone = cPhone;
            outEmail = cEmail;
        }
    }

    CSV:Set(i, 0, outName, out);
    CSV:Set(i, 1, outPhone, out);
    CSV:Set(i, 2, outEmail, out);
}

CSV:Free(inp);

// ── Step 6: Save the cleaned CSV ─────────────────────────────────────────────
Var:String csvText;
CSV:Stringify(out, csvText);
CSV:Free(out);

Var:Number wh;
File:Open(outPath, "w", wh);
File:Write(csvText, wh);
File:Close(wh);

Var:Number dataRows = rowCount - 1;
Konsol:Print("Cleaned ${dataRows} rows → ${outPath}");
if (kept != 0) {
    Konsol:Print("${kept} row(s) were kept unchanged (API error or malformed response).");
}
Konsol:Print("Done.");


Knowledge Base Indexer

Modules: curl plugin · sqlite plugin · Hash · JSON · File · Path · OS · List · Time · Math · String · Konsol

Walks a directory of .txt files, generates an embedding for each via the OpenAI Embeddings API (text-embedding-3-small), and stores the chunk text plus the raw embedding response in SQLite for later retrieval or similarity search.

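Key patterns: listing a directory with OS:ListDirectory and filtering by Path:Extension; using Hash:SHA256 of the content as the primary key; skipping content that is already stored (SQLite:QueryOne); doubling single quotes before interpolating values into SQL.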

Ships with three sample notes: note1.txt, note2.txt, note3.txt.

Usage

minks knowledge_base_indexer.ks <api_key> <docs_dir> [db_path]

minks knowledge_base_indexer.ks sk-... kookbook/data-and-ai kb.db

Sample output

=== Knowledge Base Indexer ===
Found 3 .txt file(s) in kookbook/data-and-ai
[1/3] note1.txt
  indexed (312 chars).
[2/3] note2.txt
  indexed (298 chars).
[3/3] note3.txt
  indexed (341 chars).
Indexed 3 chunk(s), skipped 0 - database: kb.db
Done.

Script

// knowledge_base_indexer.ks - embed text files and store in SQLite
// Modules: curl plugin, sqlite plugin, Hash, JSON, File, Path, OS, List, Time, Math, String, Konsol
// Usage:  minks knowledge_base_indexer.ks <api_key> <docs_dir> [db_path]
//
// Walks a directory of .txt files, generates an embedding for each file's
// content via the OpenAI Embeddings API, and stores the chunk + embedding
// in a SQLite database for later retrieval or similarity search.
//
// db_path defaults to kb.db.  Files that are empty, already indexed, or whose
// API call or database insert fails are counted as skipped.
//
// One file = one chunk (simpler for learning; split by paragraph for production).
// Sample docs ship alongside this script: note1.txt, note2.txt, note3.txt.

#include "curl"
#include "sqlite"

Konsol:Print("=== Knowledge Base Indexer ===");

// ── Step 1: Read arguments ────────────────────────────────────────────────────
Var:List args;
OS:Args(args);

Var:Number argc;
List:Size(args, argc);

if (argc < 2) {
    Konsol:Print("Usage: minks knowledge_base_indexer.ks <api_key> <docs_dir> [db_path]");
    Konsol:Exit(1);
}

Var:String apiKey;
Var:String docsDir;
List:Get(0, args, apiKey);
List:Get(1, args, docsDir);

// Optional third argument overrides the default database path.
Var:String dbPath = "kb.db";
if (argc >= 3) {
    List:Get(2, args, dbPath);
}

Var:Boolean dirOk;
Path:IsDirectory(docsDir, dirOk);
if (!dirOk) {
    Konsol:Print("Not a directory: ${docsDir}");
    Konsol:Exit(1);
}

// ── Step 2: Open the database and create the embeddings table ─────────────────
Var:Number db;
SQLite:Open(dbPath, db);

Var:Boolean ok;
SQLite:Exec(db, "CREATE TABLE IF NOT EXISTS embeddings (id TEXT PRIMARY KEY, source TEXT, chunk TEXT, embedding TEXT, indexed_at TEXT)", ok);

// ── Step 3: List .txt files ───────────────────────────────────────────────────
Var:List entries;
OS:ListDirectory(docsDir, entries);

Var:Number entryCount;
List:Size(entries, entryCount);

// Keep only entries with a .txt extension, stored as paths joined to docsDir.
Var:List txtFiles;
for (Number i = 0; i < entryCount; i++) {
    Var:String name;
    List:Get(i, entries, name);
    Var:String ext;
    Path:Extension(name, ext);
    if (ext == ".txt") {
        Var:String fp;
        Path:Join(docsDir, name, fp);
        List:Push(fp, txtFiles);
    }
}

Var:Number fileCount;
List:Size(txtFiles, fileCount);
Konsol:Print("Found ${fileCount} .txt file(s) in ${docsDir}");

// ── Step 4: Set shared curl headers ──────────────────────────────────────────
Curl:SetHeader("Authorization", "Bearer ${apiKey}");
Curl:SetHeader("Content-Type", "application/json");
Curl:SetTimeout(30);

// ── Step 5: Embed and store each file ────────────────────────────────────────
Var:Number indexed = 0;
Var:Number skipped = 0;

for (Number fi = 0; fi < fileCount; fi++) {
    Var:String filePath;
    List:Get(fi, txtFiles, filePath);

    Var:String fileName;
    Path:FileName(filePath, fileName);
    Var:Number num = fi + 1;
    Konsol:Print("[${num}/${fileCount}] ${fileName}");

    // Read file content.
    Var:Number fh;
    File:Open(filePath, "r", fh);
    Var:String chunk = "";
    Var:String line;
    Var:Boolean eof;
    File:EOF(fh, eof);
    while (!eof) {
        File:ReadLine(fh, line);
        chunk = chunk + line + "\n";
        File:EOF(fh, eof);
    }
    File:Close(fh);
    String:Trim(chunk, chunk);

    // Decide whether this file can be skipped before spending an API call.
    // skipReason stays empty when the file must be embedded.
    Var:String skipReason = "";
    Var:String chunkId = "";

    if (chunk == "") {
        // Nothing to embed; do not send an empty input to the API.
        skipReason = "empty file";
    }

    if (skipReason == "") {
        // SHA-256 of the chunk content serves as a stable row ID.  A file
        // whose content is unchanged hashes to the same ID and is skipped
        // here; changed content hashes to a new ID and is stored as a new
        // row (the row for the old content is not removed).
        Hash:SHA256(chunk, chunkId);

        // Check if already indexed.
        Var:String existing;
        SQLite:QueryOne(db, "SELECT id FROM embeddings WHERE id = '${chunkId}'", existing);
        if (existing != "" && existing != "null") {
            skipReason = "already indexed";
        }
    }

    if (skipReason != "") {
        Konsol:Print("  ${skipReason} - skipping.");
        skipped = skipped + 1;
    } else {
        // Call the embeddings API.
        // Escape the chunk for embedding in a JSON string literal.  The chunk
        // is built by joining lines with "\n", so newlines must be escaped or
        // the request body is not valid JSON.  Backslashes are doubled first,
        // otherwise the backslashes added afterwards would themselves be doubled.
        // NOTE(review): tabs, carriage returns and other control characters
        // are not escaped here - add them if the documents may contain any.
        Var:String escBackslash;
        Var:String escQuote;
        Var:String safeChunk;
        String:Replace(chunk, "\\", "\\\\", escBackslash);
        String:Replace(escBackslash, "\"", "\\\"", escQuote);
        String:Replace(escQuote, "\n", "\\n", safeChunk);
        Var:String payload = """{
          "model": "text-embedding-3-small",
          "input": "${safeChunk}"
        }""";

        Var:String body;
        Var:String curlErr = "";
        try {
            Curl:Post("https://api.openai.com/v1/embeddings", payload, body);
        } catch (CurlException e) {
            curlErr = e.message;
        }

        Var:Number status;
        Curl:Status(status);

        if (curlErr != "" || status != 200) {
            Var:String errMsg = curlErr != "" ? curlErr : "HTTP ${status}";
            Konsol:Print("  API error ${errMsg} - skipping.");
            skipped = skipped + 1;
        } else {
            // Store the raw embedding response body as a JSON string.
            // In production: extract data[0].embedding and store the float array.
            // Single quotes are doubled so the values can sit inside
            // single-quoted SQL string literals.
            Var:String safeBody;
            String:Replace(body, "'", "''", safeBody);
            Var:String safeSource;
            String:Replace(fileName, "'", "''", safeSource);
            Var:String safeChunkSql;
            String:Replace(chunk, "'", "''", safeChunkSql);

            // Get current date for indexed_at, zero-padding month and day so
            // the stored text is always YYYY-MM-DD and sorts chronologically.
            Var:Number yr;
            Var:Number mo;
            Var:Number dy;
            Time:GetYear(yr);
            Time:GetMonth(mo);
            Time:GetDay(dy);
            Var:String moStr = "${mo}";
            if (mo < 10) {
                moStr = "0${mo}";
            }
            Var:String dyStr = "${dy}";
            if (dy < 10) {
                dyStr = "0${dy}";
            }
            Var:String ts = "${yr}-${moStr}-${dyStr}";

            SQLite:Exec(db, "INSERT OR REPLACE INTO embeddings (id, source, chunk, embedding, indexed_at) VALUES ('${chunkId}', '${safeSource}', '${safeChunkSql}', '${safeBody}', '${ts}')", ok);

            // Only count the file as indexed when the insert went through.
            // assumes SQLite:Exec sets ok to false when the statement fails - TODO confirm
            if (!ok) {
                Konsol:Print("  database insert failed - skipping.");
                skipped = skipped + 1;
            } else {
                Var:Number chunkLen;
                String:Length(chunk, chunkLen);
                Konsol:Print("  indexed (${chunkLen} chars).");
                indexed = indexed + 1;
            }
        }
    }
}

// ── Step 6: Report ────────────────────────────────────────────────────────────
SQLite:Close(db);
Konsol:Print("Indexed ${indexed} chunk(s), skipped ${skipped} - database: ${dbPath}");
Konsol:Print("Done.");


Back to Kookbook