diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0393e4b9..670b8b9e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -228,7 +228,7 @@ jobs: uses: actions/cache@v5 with: path: ~/.cache/huggingface - key: hf-models-${{ runner.os }}-${{ hashFiles('src/embedder.js') }} + key: hf-models-${{ runner.os }}-${{ hashFiles('src/embeddings/**') }} restore-keys: hf-models-${{ runner.os }}- - name: Build graph diff --git a/.github/workflows/embedding-regression.yml b/.github/workflows/embedding-regression.yml index 7cecee3f..a42cc6e7 100644 --- a/.github/workflows/embedding-regression.yml +++ b/.github/workflows/embedding-regression.yml @@ -6,7 +6,7 @@ on: workflow_dispatch: pull_request: paths: - - 'src/embedder.js' + - 'src/embeddings/**' - 'tests/search/**' - 'package.json' diff --git a/CLAUDE.md b/CLAUDE.md index 46119fff..1b25676b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -45,7 +45,7 @@ JS source is plain JavaScript (ES modules) in `src/`. No transpilation step. 
The | `builder.js` | Graph building: file collection, parsing, import resolution, incremental hashing | | `parser.js` | tree-sitter WASM wrapper; `LANGUAGE_REGISTRY` + per-language extractors for functions, classes, methods, imports, exports, call sites | | `queries.js` | Query functions: symbol search, file deps, impact analysis, diff-impact; `SYMBOL_KINDS` constant defines all node kinds | -| `embedder.js` | Semantic search with `@huggingface/transformers`; multi-query RRF ranking | +| `embeddings/` | Embedding subsystem: model management, vector generation, semantic/keyword/hybrid search, CLI formatting | | `db.js` | SQLite schema and operations (`better-sqlite3`) | | `mcp.js` | MCP server exposing graph queries to AI agents; single-repo by default, `--multi-repo` to enable cross-repo access | | `cycles.js` | Circular dependency detection | diff --git a/scripts/embedding-benchmark.js b/scripts/embedding-benchmark.js index 51738074..4bc3afec 100644 --- a/scripts/embedding-benchmark.js +++ b/scripts/embedding-benchmark.js @@ -26,7 +26,7 @@ const { version, srcDir, cleanup } = await resolveBenchmarkSource(); const dbPath = path.join(root, '.codegraph', 'graph.db'); const { buildEmbeddings, MODELS, searchData, disposeModel } = await import( - srcImport(srcDir, 'embedder.js') + srcImport(srcDir, 'embeddings/index.js') ); // Redirect console.log to stderr so only JSON goes to stdout diff --git a/src/cli/commands/embed.js b/src/cli/commands/embed.js index fcd908e9..075520cd 100644 --- a/src/cli/commands/embed.js +++ b/src/cli/commands/embed.js @@ -1,5 +1,5 @@ import path from 'node:path'; -import { buildEmbeddings, DEFAULT_MODEL, EMBEDDING_STRATEGIES } from '../../embedder.js'; +import { buildEmbeddings, DEFAULT_MODEL, EMBEDDING_STRATEGIES } from '../../embeddings/index.js'; export const command = { name: 'embed [dir]', diff --git a/src/cli/commands/models.js b/src/cli/commands/models.js index 6773f2c2..0763650a 100644 --- a/src/cli/commands/models.js +++ 
b/src/cli/commands/models.js @@ -1,4 +1,4 @@ -import { DEFAULT_MODEL, MODELS } from '../../embedder.js'; +import { DEFAULT_MODEL, MODELS } from '../../embeddings/index.js'; export const command = { name: 'models', diff --git a/src/cli/commands/search.js b/src/cli/commands/search.js index 312f734d..238b59a0 100644 --- a/src/cli/commands/search.js +++ b/src/cli/commands/search.js @@ -1,4 +1,4 @@ -import { search } from '../../embedder.js'; +import { search } from '../../embeddings/index.js'; export const command = { name: 'search ', diff --git a/src/embedder.js b/src/embedder.js deleted file mode 100644 index f8fbc527..00000000 --- a/src/embedder.js +++ /dev/null @@ -1,1097 +0,0 @@ -import { execFileSync } from 'node:child_process'; -import fs from 'node:fs'; -import path from 'node:path'; -import { createInterface } from 'node:readline'; -import { - closeDb, - findCalleeNames, - findCallerNames, - findDbPath, - openDb, - openReadonlyOrFail, -} from './db.js'; -import { ConfigError, DbError, EngineError } from './errors.js'; -import { info, warn } from './logger.js'; -import { normalizeSymbol } from './queries.js'; - -/** - * Split an identifier into readable words. - * camelCase/PascalCase → "camel Case", snake_case → "snake case", kebab-case → "kebab case" - */ -function splitIdentifier(name) { - return name - .replace(/([a-z])([A-Z])/g, '$1 $2') - .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') - .replace(/[_-]+/g, ' ') - .trim(); -} - -/** - * Match a file path against a glob pattern. - * Supports *, **, and ? wildcards. Zero dependencies. - */ -function globMatch(filePath, pattern) { - // Normalize separators to forward slashes - const normalized = filePath.replace(/\\/g, '/'); - // Escape regex specials except glob chars - let regex = pattern.replace(/\\/g, '/').replace(/[.+^${}()|[\]\\]/g, '\\$&'); - // Replace ** first (matches any path segment), then * and ? 
- regex = regex.replace(/\*\*/g, '\0'); - regex = regex.replace(/\*/g, '[^/]*'); - regex = regex.replace(/\0/g, '.*'); - regex = regex.replace(/\?/g, '[^/]'); - try { - return new RegExp(`^${regex}$`).test(normalized); - } catch { - // Malformed pattern — fall back to substring match - return normalized.includes(pattern); - } -} - -// Lazy-load transformers (heavy, optional module) -let pipeline = null; -let _cos_sim = null; -let extractor = null; -let activeModel = null; - -export const MODELS = { - minilm: { - name: 'Xenova/all-MiniLM-L6-v2', - dim: 384, - contextWindow: 256, - desc: 'Smallest, fastest (~23MB). General text.', - quantized: true, - }, - 'jina-small': { - name: 'Xenova/jina-embeddings-v2-small-en', - dim: 512, - contextWindow: 8192, - desc: 'Small, good quality (~33MB). General text.', - quantized: false, - }, - 'jina-base': { - name: 'Xenova/jina-embeddings-v2-base-en', - dim: 768, - contextWindow: 8192, - desc: 'Good quality (~137MB). General text, 8192 token context.', - quantized: false, - }, - 'jina-code': { - name: 'Xenova/jina-embeddings-v2-base-code', - dim: 768, - contextWindow: 8192, - desc: 'Code-aware (~137MB). Trained on code+text, best for code search.', - quantized: false, - }, - nomic: { - name: 'Xenova/nomic-embed-text-v1', - dim: 768, - contextWindow: 8192, - desc: 'Good local quality (~137MB). 8192 context.', - quantized: false, - }, - 'nomic-v1.5': { - name: 'nomic-ai/nomic-embed-text-v1.5', - dim: 768, - contextWindow: 8192, - desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.', - quantized: false, - }, - 'bge-large': { - name: 'Xenova/bge-large-en-v1.5', - dim: 1024, - contextWindow: 512, - desc: 'Best general retrieval (~335MB). 
Top MTEB scores.', - quantized: false, - }, -}; - -export const EMBEDDING_STRATEGIES = ['structured', 'source']; - -export const DEFAULT_MODEL = 'nomic-v1.5'; -const BATCH_SIZE_MAP = { - minilm: 32, - 'jina-small': 16, - 'jina-base': 8, - 'jina-code': 8, - nomic: 8, - 'nomic-v1.5': 8, - 'bge-large': 4, -}; -const DEFAULT_BATCH_SIZE = 32; - -function getModelConfig(modelKey) { - const key = modelKey || DEFAULT_MODEL; - const config = MODELS[key]; - if (!config) { - throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); - } - return config; -} - -/** - * Rough token estimate (~4 chars per token for code/English). - * Conservative — avoids adding a tokenizer dependency. - */ -export function estimateTokens(text) { - return Math.ceil(text.length / 4); -} - -/** - * Extract leading comment text (JSDoc, //, #, etc.) above a function line. - * Returns the cleaned comment text or null if none found. - */ -function extractLeadingComment(lines, fnLineIndex) { - if (fnLineIndex > lines.length) return null; - const raw = []; - for (let i = fnLineIndex - 1; i >= Math.max(0, fnLineIndex - 15); i--) { - if (i >= lines.length) continue; - const trimmed = lines[i].trim(); - if (/^(\/\/|\/\*|\*\/|\*|#|\/\/\/)/.test(trimmed)) { - raw.unshift(trimmed); - } else if (trimmed === '') { - if (raw.length > 0) break; - } else { - break; - } - } - if (raw.length === 0) return null; - return raw - .map((line) => - line - .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */ - .replace(/^\*\s?/, '') // middle * lines - .replace(/^\/\/\/?\s?/, '') // // or /// - .replace(/^#\s?/, '') // # (Python/Ruby) - .trim(), - ) - .filter((l) => l.length > 0) - .join(' '); -} - -/** - * Build graph-enriched text for a symbol using dependency context. - * Produces compact, semantic text (~100 tokens) instead of full source code. 
- */ -function buildStructuredText(node, file, lines, db) { - const readable = splitIdentifier(node.name); - const parts = [`${node.kind} ${node.name} (${readable}) in ${file}`]; - const startLine = Math.max(0, node.line - 1); - - // Extract parameters from signature (best-effort, single-line) - const sigLine = lines[startLine] || ''; - const paramMatch = sigLine.match(/\(([^)]*)\)/); - if (paramMatch?.[1]?.trim()) { - parts.push(`Parameters: ${paramMatch[1].trim()}`); - } - - // Graph context: callees (capped at 10) - const callees = findCalleeNames(db, node.id); - if (callees.length > 0) { - parts.push(`Calls: ${callees.slice(0, 10).join(', ')}`); - } - - // Graph context: callers (capped at 10) - const callers = findCallerNames(db, node.id); - if (callers.length > 0) { - parts.push(`Called by: ${callers.slice(0, 10).join(', ')}`); - } - - // Leading comment (high semantic value) or first few lines of code - const comment = extractLeadingComment(lines, startLine); - if (comment) { - parts.push(comment); - } else { - const endLine = Math.min(lines.length, startLine + 4); - const snippet = lines.slice(startLine, endLine).join('\n').trim(); - if (snippet) parts.push(snippet); - } - - return parts.join('\n'); -} - -/** - * Build raw source-code text for a symbol (original strategy). - */ -function buildSourceText(node, file, lines) { - const startLine = Math.max(0, node.line - 1); - const endLine = node.end_line - ? Math.min(lines.length, node.end_line) - : Math.min(lines.length, startLine + 15); - const context = lines.slice(startLine, endLine).join('\n'); - const readable = splitIdentifier(node.name); - return `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`; -} - -/** - * Prompt the user to install a missing package interactively. - * Returns true if the package was installed, false otherwise. - * Skips the prompt entirely in non-TTY environments (CI, piped stdin). 
- */ -function promptInstall(packageName) { - if (!process.stdin.isTTY) return Promise.resolve(false); - - return new Promise((resolve) => { - const rl = createInterface({ input: process.stdin, output: process.stderr }); - rl.question(`Semantic search requires ${packageName}. Install it now? [y/N] `, (answer) => { - rl.close(); - if (answer.trim().toLowerCase() !== 'y') return resolve(false); - try { - execFileSync('npm', ['install', packageName], { - stdio: 'inherit', - timeout: 300_000, - }); - resolve(true); - } catch { - resolve(false); - } - }); - }); -} - -/** - * Lazy-load @huggingface/transformers. - * If the package is missing, prompts the user to install it interactively. - * In non-TTY environments, prints an error and exits. - */ -async function loadTransformers() { - try { - return await import('@huggingface/transformers'); - } catch { - const pkg = '@huggingface/transformers'; - const installed = await promptInstall(pkg); - if (installed) { - try { - return await import(pkg); - } catch (loadErr) { - throw new EngineError( - `${pkg} was installed but failed to load. Please check your environment.`, - { cause: loadErr }, - ); - } - } - throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`); - } -} - -/** - * Dispose the current ONNX session and free memory. - * Safe to call when no model is loaded (no-op). 
- */ -export async function disposeModel() { - if (extractor) { - await extractor.dispose(); - extractor = null; - } - activeModel = null; -} - -async function loadModel(modelKey) { - const config = getModelConfig(modelKey); - - if (extractor && activeModel === config.name) return { extractor, config }; - - // Dispose previous model before loading a different one - await disposeModel(); - - const transformers = await loadTransformers(); - pipeline = transformers.pipeline; - _cos_sim = transformers.cos_sim; - - info(`Loading embedding model: ${config.name} (${config.dim}d)...`); - const pipelineOpts = config.quantized ? { quantized: true } : {}; - try { - extractor = await pipeline('feature-extraction', config.name, pipelineOpts); - } catch (err) { - const msg = err.message || String(err); - if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) { - throw new EngineError( - `Model "${config.name}" requires authentication.\n` + - `This model is gated on HuggingFace and needs an access token.\n\n` + - `Options:\n` + - ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` + - ` 2. Use a public model instead: codegraph embed --model minilm`, - { cause: err }, - ); - } - throw new EngineError( - `Failed to load model "${config.name}": ${msg}\n` + - `Try a different model: codegraph embed --model minilm`, - { cause: err }, - ); - } - activeModel = config.name; - info('Model loaded.'); - return { extractor, config }; -} - -/** - * Generate embeddings for an array of texts. 
- */ -export async function embed(texts, modelKey) { - const { extractor: ext, config } = await loadModel(modelKey); - const dim = config.dim; - const results = []; - const batchSize = BATCH_SIZE_MAP[modelKey || DEFAULT_MODEL] || DEFAULT_BATCH_SIZE; - - for (let i = 0; i < texts.length; i += batchSize) { - const batch = texts.slice(i, i + batchSize); - const output = await ext(batch, { pooling: 'mean', normalize: true }); - - for (let j = 0; j < batch.length; j++) { - const start = j * dim; - const vec = new Float32Array(dim); - for (let k = 0; k < dim; k++) { - vec[k] = output.data[start + k]; - } - results.push(vec); - } - - if (texts.length > batchSize) { - process.stdout.write(` Embedded ${Math.min(i + batchSize, texts.length)}/${texts.length}\r`); - } - } - - return { vectors: results, dim }; -} - -/** - * Cosine similarity between two Float32Arrays. - */ -export function cosineSim(a, b) { - let dot = 0, - normA = 0, - normB = 0; - for (let i = 0; i < a.length; i++) { - dot += a[i] * b[i]; - normA += a[i] * a[i]; - normB += b[i] * b[i]; - } - return dot / (Math.sqrt(normA) * Math.sqrt(normB)); -} - -function initEmbeddingsSchema(db) { - db.exec(` - CREATE TABLE IF NOT EXISTS embeddings ( - node_id INTEGER PRIMARY KEY, - vector BLOB NOT NULL, - text_preview TEXT, - FOREIGN KEY(node_id) REFERENCES nodes(id) - ); - CREATE TABLE IF NOT EXISTS embedding_meta ( - key TEXT PRIMARY KEY, - value TEXT - ); - `); - - // Add full_text column (idempotent — ignore if already exists) - try { - db.exec('ALTER TABLE embeddings ADD COLUMN full_text TEXT'); - } catch { - /* column already exists */ - } - - // FTS5 virtual table for BM25 keyword search - db.exec(` - CREATE VIRTUAL TABLE IF NOT EXISTS fts_index USING fts5( - name, - content, - tokenize='unicode61' - ); - `); -} - -/** - * Build embeddings for all functions/methods/classes in the graph. 
- * @param {string} rootDir - Project root directory - * @param {string} modelKey - Model identifier from MODELS registry - * @param {string} [customDbPath] - Override path to graph.db - * @param {object} [options] - Embedding options - * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code) - */ -export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) { - const strategy = options.strategy || 'structured'; - const dbPath = customDbPath || findDbPath(null); - - if (!fs.existsSync(dbPath)) { - throw new DbError( - `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, - { file: dbPath }, - ); - } - - const db = openDb(dbPath); - initEmbeddingsSchema(db); - - db.exec('DELETE FROM embeddings'); - db.exec('DELETE FROM embedding_meta'); - db.exec('DELETE FROM fts_index'); - - const nodes = db - .prepare( - `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`, - ) - .all(); - - console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`); - - const byFile = new Map(); - for (const node of nodes) { - if (!byFile.has(node.file)) byFile.set(node.file, []); - byFile.get(node.file).push(node); - } - - const texts = []; - const nodeIds = []; - const nodeNames = []; - const previews = []; - const config = getModelConfig(modelKey); - const contextWindow = config.contextWindow; - let overflowCount = 0; - - for (const [file, fileNodes] of byFile) { - const fullPath = path.join(rootDir, file); - let lines; - try { - lines = fs.readFileSync(fullPath, 'utf-8').split('\n'); - } catch (err) { - warn(`Cannot read ${file} for embeddings: ${err.message}`); - continue; - } - - for (const node of fileNodes) { - let text = - strategy === 'structured' - ? 
buildStructuredText(node, file, lines, db) - : buildSourceText(node, file, lines); - - // Detect and handle context window overflow - const tokens = estimateTokens(text); - if (tokens > contextWindow) { - overflowCount++; - const maxChars = contextWindow * 4; - text = text.slice(0, maxChars); - } - - texts.push(text); - nodeIds.push(node.id); - nodeNames.push(node.name); - previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`); - } - } - - if (overflowCount > 0) { - warn( - `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`, - ); - } - - console.log(`Embedding ${texts.length} symbols...`); - const { vectors, dim } = await embed(texts, modelKey); - - const insert = db.prepare( - 'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)', - ); - const insertFts = db.prepare('INSERT INTO fts_index(rowid, name, content) VALUES (?, ?, ?)'); - const insertMeta = db.prepare('INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)'); - const insertAll = db.transaction(() => { - for (let i = 0; i < vectors.length; i++) { - insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i], texts[i]); - insertFts.run(nodeIds[i], nodeNames[i], texts[i]); - } - insertMeta.run('model', config.name); - insertMeta.run('dim', String(dim)); - insertMeta.run('count', String(vectors.length)); - insertMeta.run('fts_count', String(vectors.length)); - insertMeta.run('strategy', strategy); - insertMeta.run('built_at', new Date().toISOString()); - if (overflowCount > 0) { - insertMeta.run('truncated_count', String(overflowCount)); - } - }); - insertAll(); - - console.log( - `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`, - ); - closeDb(db); -} - -/** - * Shared setup for search functions: opens DB, validates embeddings/model, loads rows. - * Returns { db, rows, modelKey, storedDim } or null on failure (prints error). 
- */ -function _prepareSearch(customDbPath, opts = {}) { - const db = openReadonlyOrFail(customDbPath); - - let count; - try { - count = db.prepare('SELECT COUNT(*) as c FROM embeddings').get().c; - } catch { - console.log('No embeddings table found. Run `codegraph embed` first.'); - db.close(); - return null; - } - if (count === 0) { - console.log('No embeddings found. Run `codegraph embed` first.'); - db.close(); - return null; - } - - let storedModel = null; - let storedDim = null; - try { - const modelRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'model'").get(); - const dimRow = db.prepare("SELECT value FROM embedding_meta WHERE key = 'dim'").get(); - if (modelRow) storedModel = modelRow.value; - if (dimRow) storedDim = parseInt(dimRow.value, 10); - } catch { - /* old DB without meta table */ - } - - let modelKey = opts.model || null; - if (!modelKey && storedModel) { - for (const [key, config] of Object.entries(MODELS)) { - if (config.name === storedModel) { - modelKey = key; - break; - } - } - } - - // Pre-filter: allow filtering by kind or file pattern to reduce search space - const noTests = opts.noTests || false; - const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; - let sql = ` - SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line, n.end_line, n.role - FROM embeddings e - JOIN nodes n ON e.node_id = n.id - `; - const params = []; - const conditions = []; - if (opts.kind) { - conditions.push('n.kind = ?'); - params.push(opts.kind); - } - const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); - if (opts.filePattern && !isGlob) { - conditions.push('n.file LIKE ?'); - params.push(`%${opts.filePattern}%`); - } - if (conditions.length > 0) { - sql += ` WHERE ${conditions.join(' AND ')}`; - } - - let rows = db.prepare(sql).all(...params); - if (isGlob) { - rows = rows.filter((row) => globMatch(row.file, opts.filePattern)); - } - if (noTests) { - rows = rows.filter((row) => 
!TEST_PATTERN.test(row.file)); - } - - return { db, rows, modelKey, storedDim }; -} - -/** - * Single-query semantic search — returns data instead of printing. - * Returns { results: [{ name, kind, file, line, similarity }] } or null on failure. - */ -export async function searchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const minScore = opts.minScore || 0.2; - - const prepared = _prepareSearch(customDbPath, opts); - if (!prepared) return null; - const { db, rows, modelKey, storedDim } = prepared; - - try { - const { - vectors: [queryVec], - dim, - } = await embed([query], modelKey); - - if (storedDim && dim !== storedDim) { - console.log( - `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, - ); - console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); - return null; - } - - const hc = new Map(); - const results = []; - for (const row of rows) { - const vec = new Float32Array(new Uint8Array(row.vector).buffer); - const sim = cosineSim(queryVec, vec); - - if (sim >= minScore) { - results.push({ - ...normalizeSymbol(row, db, hc), - similarity: sim, - }); - } - } - - results.sort((a, b) => b.similarity - a.similarity); - return { results: results.slice(0, limit) }; - } finally { - db.close(); - } -} - -/** - * Multi-query semantic search with Reciprocal Rank Fusion (RRF). - * Returns { results: [{ name, kind, file, line, rrf, queryScores }] } or null on failure. 
- */ -export async function multiSearchData(queries, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const minScore = opts.minScore || 0.2; - const k = opts.rrfK || 60; - - const prepared = _prepareSearch(customDbPath, opts); - if (!prepared) return null; - const { db, rows, modelKey, storedDim } = prepared; - - try { - const { vectors: queryVecs, dim } = await embed(queries, modelKey); - - // Warn about similar queries that may bias RRF results - const SIMILARITY_WARN_THRESHOLD = 0.85; - for (let i = 0; i < queryVecs.length; i++) { - for (let j = i + 1; j < queryVecs.length; j++) { - const sim = cosineSim(queryVecs[i], queryVecs[j]); - if (sim >= SIMILARITY_WARN_THRESHOLD) { - warn( - `Queries "${queries[i]}" and "${queries[j]}" are very similar ` + - `(${(sim * 100).toFixed(0)}% cosine similarity). ` + - `This may bias RRF results toward their shared matches. ` + - `Consider using more distinct queries.`, - ); - } - } - } - - if (storedDim && dim !== storedDim) { - console.log( - `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, - ); - console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); - return null; - } - - // Parse row vectors once - const rowVecs = rows.map((row) => new Float32Array(new Uint8Array(row.vector).buffer)); - - // For each query: compute similarities, filter by minScore, rank - const perQueryRanked = queries.map((_query, qi) => { - const scored = []; - for (let ri = 0; ri < rows.length; ri++) { - const sim = cosineSim(queryVecs[qi], rowVecs[ri]); - if (sim >= minScore) { - scored.push({ rowIndex: ri, similarity: sim }); - } - } - scored.sort((a, b) => b.similarity - a.similarity); - // Assign 1-indexed ranks - return scored.map((item, rank) => ({ ...item, rank: rank + 1 })); - }); - - // Fuse results using RRF: for each unique row, sum 1/(k + rank_i) across queries - const fusionMap = new Map(); // rowIndex -> { rrfScore, queryScores[] } - for (let qi = 
0; qi < queries.length; qi++) { - for (const item of perQueryRanked[qi]) { - if (!fusionMap.has(item.rowIndex)) { - fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] }); - } - const entry = fusionMap.get(item.rowIndex); - entry.rrfScore += 1 / (k + item.rank); - entry.queryScores.push({ - query: queries[qi], - similarity: item.similarity, - rank: item.rank, - }); - } - } - - // Build results sorted by RRF score - const hc = new Map(); - const results = []; - for (const [rowIndex, entry] of fusionMap) { - const row = rows[rowIndex]; - results.push({ - ...normalizeSymbol(row, db, hc), - rrf: entry.rrfScore, - queryScores: entry.queryScores, - }); - } - - results.sort((a, b) => b.rrf - a.rrf); - return { results: results.slice(0, limit) }; - } finally { - db.close(); - } -} - -/** - * Sanitize a user query for FTS5 MATCH syntax. - * Wraps each token as an implicit OR and escapes special FTS5 characters. - */ -function sanitizeFtsQuery(query) { - // Remove FTS5 special chars that could cause syntax errors - const cleaned = query.replace(/[*"():^{}~<>]/g, ' ').trim(); - if (!cleaned) return null; - // Split into tokens, wrap with OR for multi-token queries - const tokens = cleaned.split(/\s+/).filter((t) => t.length > 0); - if (tokens.length === 0) return null; - if (tokens.length === 1) return `"${tokens[0]}"`; - return tokens.map((t) => `"${t}"`).join(' OR '); -} - -/** - * Check if the FTS5 index exists in the database. - * Returns true if fts_index table exists and has rows, false otherwise. - */ -function hasFtsIndex(db) { - try { - const row = db.prepare('SELECT COUNT(*) as c FROM fts_index').get(); - return row.c > 0; - } catch { - return false; - } -} - -/** - * BM25 keyword search via FTS5. - * Returns { results: [{ name, kind, file, line, bm25Score }] } or null if no FTS5 index. 
- */ -export function ftsSearchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const noTests = opts.noTests || false; - const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; - - const db = openReadonlyOrFail(customDbPath); - - try { - if (!hasFtsIndex(db)) { - return null; - } - - const ftsQuery = sanitizeFtsQuery(query); - if (!ftsQuery) { - return { results: [] }; - } - - let sql = ` - SELECT f.rowid AS node_id, rank AS bm25_score, - n.name, n.kind, n.file, n.line, n.end_line, n.role - FROM fts_index f - JOIN nodes n ON f.rowid = n.id - WHERE fts_index MATCH ? - `; - const params = [ftsQuery]; - - if (opts.kind) { - sql += ' AND n.kind = ?'; - params.push(opts.kind); - } - - const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern); - if (opts.filePattern && !isGlob) { - sql += ' AND n.file LIKE ?'; - params.push(`%${opts.filePattern}%`); - } - - sql += ' ORDER BY rank LIMIT ?'; - params.push(limit * 5); // fetch generous set for post-filtering - - let rows; - try { - rows = db.prepare(sql).all(...params); - } catch { - // Invalid FTS5 query syntax — return empty - return { results: [] }; - } - - if (isGlob) { - rows = rows.filter((row) => globMatch(row.file, opts.filePattern)); - } - if (noTests) { - rows = rows.filter((row) => !TEST_PATTERN.test(row.file)); - } - - const hc = new Map(); - const results = rows.slice(0, limit).map((row) => ({ - ...normalizeSymbol(row, db, hc), - bm25Score: -row.bm25_score, // FTS5 rank is negative; negate for display - })); - - return { results }; - } finally { - db.close(); - } -} - -/** - * Hybrid BM25 + semantic search with RRF fusion. - * Returns { results: [{ name, kind, file, line, rrf, bm25Score, bm25Rank, similarity, semanticRank }] } - * or null if no FTS5 index (caller should fall back to semantic-only). 
- */ -export async function hybridSearchData(query, customDbPath, opts = {}) { - const limit = opts.limit || 15; - const k = opts.rrfK || 60; - const topK = (opts.limit || 15) * 5; - - // Split semicolons for multi-query support - const queries = - typeof query === 'string' - ? query - .split(';') - .map((q) => q.trim()) - .filter((q) => q.length > 0) - : [query]; - - // Check FTS5 availability first (sync, cheap) - const checkDb = openReadonlyOrFail(customDbPath); - const ftsAvailable = hasFtsIndex(checkDb); - checkDb.close(); - if (!ftsAvailable) return null; - - // Collect ranked lists: for each query, one BM25 list + one semantic list - const rankedLists = []; - - for (const q of queries) { - // BM25 ranked list (sync) - const bm25Data = ftsSearchData(q, customDbPath, { ...opts, limit: topK }); - if (bm25Data?.results) { - rankedLists.push( - bm25Data.results.map((r, idx) => ({ - key: `${r.name}:${r.file}:${r.line}`, - rank: idx + 1, - source: 'bm25', - ...r, - })), - ); - } - - // Semantic ranked list (async) - const semData = await searchData(q, customDbPath, { - ...opts, - limit: topK, - minScore: opts.minScore || 0.2, - }); - if (semData?.results) { - rankedLists.push( - semData.results.map((r, idx) => ({ - key: `${r.name}:${r.file}:${r.line}`, - rank: idx + 1, - source: 'semantic', - ...r, - })), - ); - } - } - - // RRF fusion across all ranked lists - const fusionMap = new Map(); - for (const list of rankedLists) { - for (const item of list) { - if (!fusionMap.has(item.key)) { - fusionMap.set(item.key, { - name: item.name, - kind: item.kind, - file: item.file, - line: item.line, - endLine: item.endLine ?? null, - role: item.role ?? null, - fileHash: item.fileHash ?? 
null, - rrfScore: 0, - bm25Score: null, - bm25Rank: null, - similarity: null, - semanticRank: null, - }); - } - const entry = fusionMap.get(item.key); - entry.rrfScore += 1 / (k + item.rank); - if (item.source === 'bm25') { - if (entry.bm25Rank === null || item.rank < entry.bm25Rank) { - entry.bm25Score = item.bm25Score; - entry.bm25Rank = item.rank; - } - } else { - if (entry.semanticRank === null || item.rank < entry.semanticRank) { - entry.similarity = item.similarity; - entry.semanticRank = item.rank; - } - } - } - } - - const results = [...fusionMap.values()] - .sort((a, b) => b.rrfScore - a.rrfScore) - .slice(0, limit) - .map((e) => ({ - name: e.name, - kind: e.kind, - file: e.file, - line: e.line, - endLine: e.endLine, - role: e.role, - fileHash: e.fileHash, - rrf: e.rrfScore, - bm25Score: e.bm25Score, - bm25Rank: e.bm25Rank, - similarity: e.similarity, - semanticRank: e.semanticRank, - })); - - return { results }; -} - -/** - * Search with mode support — CLI wrapper with multi-query detection. - * Modes: 'hybrid' (default), 'semantic', 'keyword' - */ -export async function search(query, customDbPath, opts = {}) { - const mode = opts.mode || 'hybrid'; - - // Split by semicolons, trim, filter empties - const queries = query - .split(';') - .map((q) => q.trim()) - .filter((q) => q.length > 0); - - const kindIcon = (kind) => (kind === 'function' ? 'f' : kind === 'class' ? '*' : 'o'); - - // ─── Keyword-only mode ────────────────────────────────────────────── - if (mode === 'keyword') { - const singleQuery = queries.length === 1 ? queries[0] : query; - const data = ftsSearchData(singleQuery, customDbPath, opts); - if (!data) { - console.log('No FTS5 index found. 
Run `codegraph embed` to build the keyword index.'); - return; - } - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nKeyword search: "${singleQuery}" (BM25)\n`); - if (data.results.length === 0) { - console.log(' No results found.'); - } else { - for (const r of data.results) { - console.log( - ` BM25 ${r.bm25Score.toFixed(2)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - } - } - console.log(`\n ${data.results.length} results shown\n`); - return; - } - - // ─── Semantic-only mode ───────────────────────────────────────────── - if (mode === 'semantic') { - if (queries.length <= 1) { - const singleQuery = queries[0] || query; - const data = await searchData(singleQuery, customDbPath, opts); - if (!data) return; - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nSemantic search: "${singleQuery}"\n`); - if (data.results.length === 0) { - console.log(' No results above threshold.'); - } else { - for (const r of data.results) { - const bar = '#'.repeat(Math.round(r.similarity * 20)); - console.log(` ${(r.similarity * 100).toFixed(1)}% ${bar}`); - console.log(` ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`); - } - } - console.log(`\n ${data.results.length} results shown\n`); - } else { - const data = await multiSearchData(queries, customDbPath, opts); - if (!data) return; - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - console.log(`\nMulti-query semantic search (RRF, k=${opts.rrfK || 60}):`); - for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); - console.log(); - if (data.results.length === 0) { - console.log(' No results above threshold.'); - } else { - for (const r of data.results) { - console.log( - ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - for (const qs of r.queryScores) { - const bar = '#'.repeat(Math.round(qs.similarity * 20)); - 
console.log( - ` [${queries.indexOf(qs.query) + 1}] ${(qs.similarity * 100).toFixed(1)}% ${bar} (rank ${qs.rank})`, - ); - } - } - } - console.log(`\n ${data.results.length} results shown\n`); - } - return; - } - - // ─── Hybrid mode (default) ────────────────────────────────────────── - const data = await hybridSearchData(query, customDbPath, opts); - - if (!data) { - // No FTS5 index — fall back to semantic-only - warn( - 'FTS5 index not found — using semantic search only. Re-run `codegraph embed` to enable hybrid mode.', - ); - return search(query, customDbPath, { ...opts, mode: 'semantic' }); - } - - if (opts.json) { - console.log(JSON.stringify(data, null, 2)); - return; - } - - const rrfK = opts.rrfK || 60; - if (queries.length <= 1) { - const singleQuery = queries[0] || query; - console.log(`\nHybrid search: "${singleQuery}" (BM25 + semantic, RRF k=${rrfK})\n`); - } else { - console.log(`\nHybrid multi-query search (BM25 + semantic, RRF k=${rrfK}):`); - for (let i = 0; i < queries.length; i++) console.log(` [${i + 1}] "${queries[i]}"`); - console.log(); - } - - if (data.results.length === 0) { - console.log(' No results found.'); - } else { - for (const r of data.results) { - console.log( - ` RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`, - ); - const parts = []; - if (r.bm25Rank != null) { - parts.push(`BM25: rank ${r.bm25Rank} (score ${r.bm25Score.toFixed(2)})`); - } - if (r.semanticRank != null) { - parts.push(`Semantic: rank ${r.semanticRank} (${(r.similarity * 100).toFixed(1)}%)`); - } - if (parts.length > 0) { - console.log(` ${parts.join(' | ')}`); - } - } - } - - console.log(`\n ${data.results.length} results shown\n`); -} diff --git a/src/embeddings/generator.js b/src/embeddings/generator.js new file mode 100644 index 00000000..b34f5934 --- /dev/null +++ b/src/embeddings/generator.js @@ -0,0 +1,163 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { closeDb, findDbPath, openDb } from '../db.js'; 
/**
 * Rough token estimate (~4 chars per token for code/English).
 * Conservative — avoids adding a tokenizer dependency.
 * @param {string} text
 * @returns {number} Estimated token count (always rounds up).
 */
export function estimateTokens(text) {
  return Math.ceil(text.length / 4);
}

/**
 * Create the embedding-related tables if they are missing.
 * Idempotent — safe to run on every `codegraph embed` invocation.
 * @param {import('better-sqlite3').Database} db - Open writable handle
 */
function initEmbeddingsSchema(db) {
  // Vector storage plus a free-form key/value metadata table.
  db.exec(`
    CREATE TABLE IF NOT EXISTS embeddings (
      node_id INTEGER PRIMARY KEY,
      vector BLOB NOT NULL,
      text_preview TEXT,
      FOREIGN KEY(node_id) REFERENCES nodes(id)
    );
    CREATE TABLE IF NOT EXISTS embedding_meta (
      key TEXT PRIMARY KEY,
      value TEXT
    );
  `);

  // Older databases predate full_text; ALTER throws once the column exists.
  try {
    db.exec('ALTER TABLE embeddings ADD COLUMN full_text TEXT');
  } catch {
    /* column already exists */
  }

  // FTS5 virtual table backing BM25 keyword search.
  db.exec(`
    CREATE VIRTUAL TABLE IF NOT EXISTS fts_index USING fts5(
      name,
      content,
      tokenize='unicode61'
    );
  `);
}
/**
 * Build embeddings for all functions/methods/classes in the graph.
 *
 * Reads each symbol's text (graph-enriched or raw source), embeds it with the
 * selected model, and stores vectors plus an FTS5 keyword index in graph.db.
 *
 * @param {string} rootDir - Project root directory
 * @param {string} modelKey - Model identifier from MODELS registry
 * @param {string} [customDbPath] - Override path to graph.db
 * @param {object} [options] - Embedding options
 * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code)
 * @throws {DbError} When no graph database exists yet.
 */
export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) {
  const strategy = options.strategy || 'structured';
  const dbPath = customDbPath || findDbPath(null);

  if (!fs.existsSync(dbPath)) {
    throw new DbError(
      `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`,
      { file: dbPath },
    );
  }

  const db = openDb(dbPath);
  initEmbeddingsSchema(db);

  // Full rebuild: drop any stale vectors/meta/keyword rows first.
  db.exec('DELETE FROM embeddings');
  db.exec('DELETE FROM embedding_meta');
  db.exec('DELETE FROM fts_index');

  const nodes = db
    .prepare(
      `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
    )
    .all();

  console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`);

  // Group by file so each source file is read from disk exactly once.
  const byFile = new Map();
  for (const node of nodes) {
    if (!byFile.has(node.file)) byFile.set(node.file, []);
    byFile.get(node.file).push(node);
  }

  const texts = [];
  const nodeIds = [];
  const nodeNames = [];
  const previews = [];
  const config = getModelConfig(modelKey);
  const contextWindow = config.contextWindow;
  let overflowCount = 0;

  for (const [file, fileNodes] of byFile) {
    const fullPath = path.join(rootDir, file);
    let lines;
    try {
      lines = fs.readFileSync(fullPath, 'utf-8').split('\n');
    } catch (err) {
      warn(`Cannot read ${file} for embeddings: ${err.message}`);
      continue;
    }

    for (const node of fileNodes) {
      let text =
        strategy === 'structured'
          ? buildStructuredText(node, file, lines, db)
          : buildSourceText(node, file, lines);

      // Truncate anything past the model's context window (~4 chars/token).
      const tokens = estimateTokens(text);
      if (tokens > contextWindow) {
        overflowCount++;
        const maxChars = contextWindow * 4;
        text = text.slice(0, maxChars);
      }

      texts.push(text);
      nodeIds.push(node.id);
      nodeNames.push(node.name);
      previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`);
    }
  }

  if (overflowCount > 0) {
    warn(
      `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`,
    );
  }

  // FIX: with zero symbols the old code still called embed([]), which loads
  // the (heavy) ONNX model for nothing. Skip the model entirely but still
  // write metadata so downstream tooling sees a consistent state.
  let vectors = [];
  let dim = config.dim;
  if (texts.length > 0) {
    console.log(`Embedding ${texts.length} symbols...`);
    ({ vectors, dim } = await embed(texts, modelKey));
  } else {
    console.log('No symbols to embed.');
  }

  const insert = db.prepare(
    'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)',
  );
  const insertFts = db.prepare('INSERT INTO fts_index(rowid, name, content) VALUES (?, ?, ?)');
  const insertMeta = db.prepare('INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)');
  const insertAll = db.transaction(() => {
    for (let i = 0; i < vectors.length; i++) {
      insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i], texts[i]);
      insertFts.run(nodeIds[i], nodeNames[i], texts[i]);
    }
    insertMeta.run('model', config.name);
    insertMeta.run('dim', String(dim));
    insertMeta.run('count', String(vectors.length));
    insertMeta.run('fts_count', String(vectors.length));
    insertMeta.run('strategy', strategy);
    insertMeta.run('built_at', new Date().toISOString());
    if (overflowCount > 0) {
      insertMeta.run('truncated_count', String(overflowCount));
    }
  });
  insertAll();

  console.log(
    `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`,
  );
  closeDb(db);
}
Embeddings subsystem — public API barrel. + * + * Re-exports everything consumers previously imported from `../embedder.js`. + */ + +export { buildEmbeddings, estimateTokens } from './generator.js'; +export { DEFAULT_MODEL, disposeModel, EMBEDDING_STRATEGIES, embed, MODELS } from './models.js'; +export { search } from './search/cli-formatter.js'; +export { hybridSearchData } from './search/hybrid.js'; +export { ftsSearchData } from './search/keyword.js'; +export { multiSearchData, searchData } from './search/semantic.js'; +export { cosineSim } from './stores/sqlite-blob.js'; diff --git a/src/embeddings/models.js b/src/embeddings/models.js new file mode 100644 index 00000000..1202dd28 --- /dev/null +++ b/src/embeddings/models.js @@ -0,0 +1,218 @@ +import { execFileSync } from 'node:child_process'; +import { createInterface } from 'node:readline'; +import { ConfigError, EngineError } from '../errors.js'; +import { info } from '../logger.js'; + +// Lazy-load transformers (heavy, optional module) +let pipeline = null; +let extractor = null; +let activeModel = null; + +export const MODELS = { + minilm: { + name: 'Xenova/all-MiniLM-L6-v2', + dim: 384, + contextWindow: 256, + desc: 'Smallest, fastest (~23MB). General text.', + quantized: true, + }, + 'jina-small': { + name: 'Xenova/jina-embeddings-v2-small-en', + dim: 512, + contextWindow: 8192, + desc: 'Small, good quality (~33MB). General text.', + quantized: false, + }, + 'jina-base': { + name: 'Xenova/jina-embeddings-v2-base-en', + dim: 768, + contextWindow: 8192, + desc: 'Good quality (~137MB). General text, 8192 token context.', + quantized: false, + }, + 'jina-code': { + name: 'Xenova/jina-embeddings-v2-base-code', + dim: 768, + contextWindow: 8192, + desc: 'Code-aware (~137MB). Trained on code+text, best for code search.', + quantized: false, + }, + nomic: { + name: 'Xenova/nomic-embed-text-v1', + dim: 768, + contextWindow: 8192, + desc: 'Good local quality (~137MB). 
8192 context.', + quantized: false, + }, + 'nomic-v1.5': { + name: 'nomic-ai/nomic-embed-text-v1.5', + dim: 768, + contextWindow: 8192, + desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.', + quantized: false, + }, + 'bge-large': { + name: 'Xenova/bge-large-en-v1.5', + dim: 1024, + contextWindow: 512, + desc: 'Best general retrieval (~335MB). Top MTEB scores.', + quantized: false, + }, +}; + +export const EMBEDDING_STRATEGIES = ['structured', 'source']; + +export const DEFAULT_MODEL = 'nomic-v1.5'; +const BATCH_SIZE_MAP = { + minilm: 32, + 'jina-small': 16, + 'jina-base': 8, + 'jina-code': 8, + nomic: 8, + 'nomic-v1.5': 8, + 'bge-large': 4, +}; +const DEFAULT_BATCH_SIZE = 32; + +/** @internal Used by generator.js — not part of the public barrel. */ +export function getModelConfig(modelKey) { + const key = modelKey || DEFAULT_MODEL; + const config = MODELS[key]; + if (!config) { + throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`); + } + return config; +} + +/** + * Prompt the user to install a missing package interactively. + * Returns true if the package was installed, false otherwise. + * Skips the prompt entirely in non-TTY environments (CI, piped stdin). + * @internal Not part of the public barrel. + */ +export function promptInstall(packageName) { + if (!process.stdin.isTTY) return Promise.resolve(false); + + return new Promise((resolve) => { + const rl = createInterface({ input: process.stdin, output: process.stderr }); + rl.question(`Semantic search requires ${packageName}. Install it now? [y/N] `, (answer) => { + rl.close(); + if (answer.trim().toLowerCase() !== 'y') return resolve(false); + try { + execFileSync('npm', ['install', packageName], { + stdio: 'inherit', + timeout: 300_000, + }); + resolve(true); + } catch { + resolve(false); + } + }); + }); +} + +/** + * Lazy-load @huggingface/transformers. + * If the package is missing, prompts the user to install it interactively. 
/**
 * Lazy-load @huggingface/transformers.
 * If the package is missing, prompts the user to install it interactively.
 * In non-TTY environments, prints an error and exits.
 * @internal Not part of the public barrel.
 */
export async function loadTransformers() {
  const pkg = '@huggingface/transformers';
  try {
    return await import('@huggingface/transformers');
  } catch {
    const installed = await promptInstall(pkg);
    if (!installed) {
      throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`);
    }
    try {
      return await import(pkg);
    } catch (loadErr) {
      throw new EngineError(
        `${pkg} was installed but failed to load. Please check your environment.`,
        { cause: loadErr },
      );
    }
  }
}

/**
 * Dispose the current ONNX session and free memory.
 * Safe to call when no model is loaded (no-op).
 */
export async function disposeModel() {
  if (!extractor) {
    activeModel = null;
    return;
  }
  await extractor.dispose();
  extractor = null;
  activeModel = null;
}

/**
 * Load (or reuse) the feature-extraction pipeline for a model key.
 * Swapping models disposes the previous session first.
 * @returns {Promise<{extractor: object, config: object}>}
 */
async function loadModel(modelKey) {
  const config = getModelConfig(modelKey);

  // Fast path: requested model is already resident.
  if (extractor && activeModel === config.name) return { extractor, config };

  // Dispose previous model before loading a different one
  await disposeModel();

  const transformers = await loadTransformers();
  pipeline = transformers.pipeline;

  info(`Loading embedding model: ${config.name} (${config.dim}d)...`);
  const pipelineOpts = config.quantized ? { quantized: true } : {};
  try {
    extractor = await pipeline('feature-extraction', config.name, pipelineOpts);
  } catch (err) {
    const msg = err.message || String(err);
    const gated = msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated');
    if (gated) {
      throw new EngineError(
        `Model "${config.name}" requires authentication.\n` +
          `This model is gated on HuggingFace and needs an access token.\n\n` +
          `Options:\n` +
          ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` +
          ` 2. Use a public model instead: codegraph embed --model minilm`,
        { cause: err },
      );
    }
    throw new EngineError(
      `Failed to load model "${config.name}": ${msg}\n` +
        `Try a different model: codegraph embed --model minilm`,
      { cause: err },
    );
  }
  activeModel = config.name;
  info('Model loaded.');
  return { extractor, config };
}

/**
 * Generate embeddings for an array of texts.
 * @param {string[]} texts
 * @param {string} [modelKey] - Registry key; defaults to DEFAULT_MODEL.
 * @returns {Promise<{vectors: Float32Array[], dim: number}>}
 */
export async function embed(texts, modelKey) {
  const { extractor: ext, config } = await loadModel(modelKey);
  const dim = config.dim;
  const batchSize = BATCH_SIZE_MAP[modelKey || DEFAULT_MODEL] || DEFAULT_BATCH_SIZE;
  const results = [];

  for (let offset = 0; offset < texts.length; offset += batchSize) {
    const batch = texts.slice(offset, offset + batchSize);
    const output = await ext(batch, { pooling: 'mean', normalize: true });

    // Output is a flat row-major [batch x dim] buffer; copy each row out.
    for (let row = 0; row < batch.length; row++) {
      const base = row * dim;
      const vec = new Float32Array(dim);
      for (let d = 0; d < dim; d++) {
        vec[d] = output.data[base + d];
      }
      results.push(vec);
    }

    if (texts.length > batchSize) {
      process.stdout.write(` Embedded ${Math.min(offset + batchSize, texts.length)}/${texts.length}\r`);
    }
  }

  return { vectors: results, dim };
}
/**
 * Search with mode support — CLI wrapper with multi-query detection.
 * Modes: 'hybrid' (default), 'semantic', 'keyword'
 *
 * Splits `query` on semicolons into sub-queries; multi-query input is fused
 * with Reciprocal Rank Fusion (RRF).
 *
 * @param {string} query - Raw query string (semicolon-separated for multi-query)
 * @param {string} [customDbPath] - Override path to graph.db
 * @param {object} [opts] - mode, json, rrfK, limit, minScore, etc.
 */
export async function search(query, customDbPath, opts = {}) {
  const mode = opts.mode || 'hybrid';

  // Split by semicolons, trim, filter empties
  const queries = query
    .split(';')
    .map((part) => part.trim())
    .filter(Boolean);

  const kindIcon = (kind) => {
    switch (kind) {
      case 'function':
        return 'f';
      case 'class':
        return '*';
      default:
        return 'o';
    }
  };

  const printCount = (n) => console.log(`\n  ${n} results shown\n`);
  const printQueryList = () => {
    for (let i = 0; i < queries.length; i++) console.log(`  [${i + 1}] "${queries[i]}"`);
  };

  // ─── Keyword-only mode ──────────────────────────────────────────────
  if (mode === 'keyword') {
    const singleQuery = queries.length === 1 ? queries[0] : query;
    const data = ftsSearchData(singleQuery, customDbPath, opts);
    if (!data) {
      console.log('No FTS5 index found. Run `codegraph embed` to build the keyword index.');
      return;
    }
    if (opts.json) {
      console.log(JSON.stringify(data, null, 2));
      return;
    }
    console.log(`\nKeyword search: "${singleQuery}" (BM25)\n`);
    if (data.results.length === 0) {
      console.log('  No results found.');
    } else {
      for (const r of data.results) {
        console.log(
          `  BM25 ${r.bm25Score.toFixed(2)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`,
        );
      }
    }
    printCount(data.results.length);
    return;
  }

  // ─── Semantic-only mode ─────────────────────────────────────────────
  if (mode === 'semantic') {
    if (queries.length <= 1) {
      const singleQuery = queries[0] || query;
      const data = await searchData(singleQuery, customDbPath, opts);
      if (!data) return;

      if (opts.json) {
        console.log(JSON.stringify(data, null, 2));
        return;
      }

      console.log(`\nSemantic search: "${singleQuery}"\n`);
      if (data.results.length === 0) {
        console.log('  No results above threshold.');
      } else {
        for (const r of data.results) {
          const bar = '#'.repeat(Math.round(r.similarity * 20));
          console.log(`  ${(r.similarity * 100).toFixed(1)}% ${bar}`);
          console.log(`     ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`);
        }
      }
      printCount(data.results.length);
      return;
    }

    const data = await multiSearchData(queries, customDbPath, opts);
    if (!data) return;

    if (opts.json) {
      console.log(JSON.stringify(data, null, 2));
      return;
    }

    console.log(`\nMulti-query semantic search (RRF, k=${opts.rrfK || 60}):`);
    printQueryList();
    console.log();
    if (data.results.length === 0) {
      console.log('  No results above threshold.');
    } else {
      for (const r of data.results) {
        console.log(
          `  RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`,
        );
        for (const qs of r.queryScores) {
          const bar = '#'.repeat(Math.round(qs.similarity * 20));
          console.log(
            `      [${queries.indexOf(qs.query) + 1}] ${(qs.similarity * 100).toFixed(1)}% ${bar} (rank ${qs.rank})`,
          );
        }
      }
    }
    printCount(data.results.length);
    return;
  }

  // ─── Hybrid mode (default) ──────────────────────────────────────────
  const data = await hybridSearchData(query, customDbPath, opts);

  if (!data) {
    // No FTS5 index — fall back to semantic-only
    warn(
      'FTS5 index not found — using semantic search only. Re-run `codegraph embed` to enable hybrid mode.',
    );
    return search(query, customDbPath, { ...opts, mode: 'semantic' });
  }

  if (opts.json) {
    console.log(JSON.stringify(data, null, 2));
    return;
  }

  const rrfK = opts.rrfK || 60;
  if (queries.length <= 1) {
    const singleQuery = queries[0] || query;
    console.log(`\nHybrid search: "${singleQuery}" (BM25 + semantic, RRF k=${rrfK})\n`);
  } else {
    console.log(`\nHybrid multi-query search (BM25 + semantic, RRF k=${rrfK}):`);
    printQueryList();
    console.log();
  }

  if (data.results.length === 0) {
    console.log('  No results found.');
  } else {
    for (const r of data.results) {
      console.log(
        `  RRF ${r.rrf.toFixed(4)} ${kindIcon(r.kind)} ${r.name} -- ${r.file}:${r.line}`,
      );
      const parts = [];
      if (r.bm25Rank != null) {
        parts.push(`BM25: rank ${r.bm25Rank} (score ${r.bm25Score.toFixed(2)})`);
      }
      if (r.semanticRank != null) {
        parts.push(`Semantic: rank ${r.semanticRank} (${(r.similarity * 100).toFixed(1)}%)`);
      }
      if (parts.length > 0) {
        console.log(`      ${parts.join(' | ')}`);
      }
    }
  }

  printCount(data.results.length);
}
+ regex = regex.replace(/\*\*/g, '\0'); + regex = regex.replace(/\*/g, '[^/]*'); + regex = regex.replace(/\0/g, '.*'); + regex = regex.replace(/\?/g, '[^/]'); + try { + return new RegExp(`^${regex}$`).test(normalized); + } catch { + // Malformed pattern — fall back to substring match + return normalized.includes(pattern); + } +} + +const TEST_PATTERN = /\.(test|spec)\.|__test__|__tests__|\.stories\./; + +/** + * Apply post-query filters (glob pattern, noTests) to a set of rows. + * Mutates nothing — returns a new filtered array. + * @param {Array} rows - Rows with at least a `file` property + * @param {object} opts + * @param {string} [opts.filePattern] - Glob pattern (only applied if it contains glob chars) + * @param {boolean} [opts.noTests] - Exclude test/spec files + * @param {boolean} [opts.isGlob] - Pre-computed: does filePattern contain glob chars? + * @returns {Array} + */ +export function applyFilters(rows, opts = {}) { + let filtered = rows; + const isGlob = + opts.isGlob !== undefined ? opts.isGlob : opts.filePattern && /[*?[\]]/.test(opts.filePattern); + if (isGlob) { + filtered = filtered.filter((row) => globMatch(row.file, opts.filePattern)); + } + if (opts.noTests) { + filtered = filtered.filter((row) => !TEST_PATTERN.test(row.file)); + } + return filtered; +} diff --git a/src/embeddings/search/hybrid.js b/src/embeddings/search/hybrid.js new file mode 100644 index 00000000..759e91c7 --- /dev/null +++ b/src/embeddings/search/hybrid.js @@ -0,0 +1,121 @@ +import { openReadonlyOrFail } from '../../db.js'; +import { hasFtsIndex } from '../stores/fts5.js'; +import { ftsSearchData } from './keyword.js'; +import { searchData } from './semantic.js'; + +/** + * Hybrid BM25 + semantic search with RRF fusion. + * Returns { results: [{ name, kind, file, line, rrf, bm25Score, bm25Rank, similarity, semanticRank }] } + * or null if no FTS5 index (caller should fall back to semantic-only). 
/**
 * Hybrid BM25 + semantic search with RRF fusion.
 * Returns { results: [{ name, kind, file, line, rrf, bm25Score, bm25Rank, similarity, semanticRank }] }
 * or null if no FTS5 index (caller should fall back to semantic-only).
 *
 * @param {string|string[]} query - Query string (semicolon-separated for multi-query)
 * @param {string} [customDbPath] - Override path to graph.db
 * @param {object} [opts] - limit, rrfK, minScore, kind, filePattern, noTests
 */
export async function hybridSearchData(query, customDbPath, opts = {}) {
  const limit = opts.limit || 15;
  const k = opts.rrfK || 60;
  const topK = limit * 5; // over-fetch each list so fusion has depth to work with

  // Split semicolons for multi-query support
  const queries =
    typeof query === 'string'
      ? query
          .split(';')
          .map((q) => q.trim())
          .filter((q) => q.length > 0)
      : [query];

  // Check FTS5 availability first (sync, cheap)
  const checkDb = openReadonlyOrFail(customDbPath);
  const ftsAvailable = hasFtsIndex(checkDb);
  checkDb.close();
  if (!ftsAvailable) return null;

  // Collect ranked lists: for each query, one BM25 list + one semantic list
  const rankedLists = [];

  for (const q of queries) {
    // BM25 ranked list (sync)
    const bm25Data = ftsSearchData(q, customDbPath, { ...opts, limit: topK });
    if (bm25Data?.results) {
      rankedLists.push(
        bm25Data.results.map((r, idx) => ({
          key: `${r.name}:${r.file}:${r.line}`,
          rank: idx + 1,
          source: 'bm25',
          ...r,
        })),
      );
    }

    // Semantic ranked list (async).
    // FIX: `??` (not `||`) so an explicit minScore of 0 is honored.
    const semData = await searchData(q, customDbPath, {
      ...opts,
      limit: topK,
      minScore: opts.minScore ?? 0.2,
    });
    if (semData?.results) {
      rankedLists.push(
        semData.results.map((r, idx) => ({
          key: `${r.name}:${r.file}:${r.line}`,
          rank: idx + 1,
          source: 'semantic',
          ...r,
        })),
      );
    }
  }

  // RRF fusion across all ranked lists
  const fusionMap = new Map();
  for (const list of rankedLists) {
    for (const item of list) {
      if (!fusionMap.has(item.key)) {
        fusionMap.set(item.key, {
          name: item.name,
          kind: item.kind,
          file: item.file,
          line: item.line,
          endLine: item.endLine ?? null,
          role: item.role ?? null,
          fileHash: item.fileHash ?? null,
          rrfScore: 0,
          bm25Score: null,
          bm25Rank: null,
          similarity: null,
          semanticRank: null,
        });
      }
      const entry = fusionMap.get(item.key);
      entry.rrfScore += 1 / (k + item.rank);
      // Keep the best (lowest) rank seen per source across queries.
      if (item.source === 'bm25') {
        if (entry.bm25Rank === null || item.rank < entry.bm25Rank) {
          entry.bm25Score = item.bm25Score;
          entry.bm25Rank = item.rank;
        }
      } else if (entry.semanticRank === null || item.rank < entry.semanticRank) {
        entry.similarity = item.similarity;
        entry.semanticRank = item.rank;
      }
    }
  }

  const results = [...fusionMap.values()]
    .sort((a, b) => b.rrfScore - a.rrfScore)
    .slice(0, limit)
    .map((e) => ({
      name: e.name,
      kind: e.kind,
      file: e.file,
      line: e.line,
      endLine: e.endLine,
      role: e.role,
      fileHash: e.fileHash,
      rrf: e.rrfScore,
      bm25Score: e.bm25Score,
      bm25Rank: e.bm25Rank,
      similarity: e.similarity,
      semanticRank: e.semanticRank,
    }));

  return { results };
}
/**
 * BM25 keyword search via FTS5.
 * Returns { results: [{ name, kind, file, line, bm25Score }] } or null if no FTS5 index.
 *
 * @param {string} query - Raw user query (sanitized for FTS5 MATCH syntax)
 * @param {string} [customDbPath] - Override path to graph.db
 * @param {object} [opts] - limit, kind, filePattern, noTests
 */
export function ftsSearchData(query, customDbPath, opts = {}) {
  const maxResults = opts.limit || 15;
  const db = openReadonlyOrFail(customDbPath);

  try {
    if (!hasFtsIndex(db)) return null;

    const ftsQuery = sanitizeFtsQuery(query);
    if (!ftsQuery) return { results: [] };

    const params = [ftsQuery];
    const extraClauses = [];
    if (opts.kind) {
      extraClauses.push('AND n.kind = ?');
      params.push(opts.kind);
    }
    const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern);
    if (opts.filePattern && !isGlob) {
      // Plain substrings are pushed down to SQL; globs are post-filtered.
      extraClauses.push('AND n.file LIKE ?');
      params.push(`%${opts.filePattern}%`);
    }

    const sql = `
      SELECT f.rowid AS node_id, rank AS bm25_score,
             n.name, n.kind, n.file, n.line, n.end_line, n.role
      FROM fts_index f
      JOIN nodes n ON f.rowid = n.id
      WHERE fts_index MATCH ?
      ${extraClauses.join(' ')}
      ORDER BY rank LIMIT ?
    `;
    params.push(maxResults * 5); // fetch generous set for post-filtering

    let rows;
    try {
      rows = db.prepare(sql).all(...params);
    } catch {
      // Invalid FTS5 query syntax — return empty
      return { results: [] };
    }

    const filtered = applyFilters(rows, { ...opts, isGlob });

    const hashCache = new Map();
    const results = filtered.slice(0, maxResults).map((row) => ({
      ...normalizeSymbol(row, db, hashCache),
      bm25Score: -row.bm25_score, // FTS5 rank is negative; negate for display
    }));

    return { results };
  } finally {
    db.close();
  }
}
/**
 * Shared setup for search functions: opens DB, validates embeddings/model, loads rows.
 * Returns { db, rows, modelKey, storedDim } or null on failure (prints error).
 * On null return, the DB is closed. On exception, the DB is also closed
 * (callers only need to close DB from the returned object on the happy path).
 *
 * @param {string} [customDbPath] - Override path to graph.db
 * @param {object} [opts] - model, kind, filePattern, noTests
 */
export function prepareSearch(customDbPath, opts = {}) {
  const db = openReadonlyOrFail(customDbPath);

  try {
    if (getEmbeddingCount(db) === 0) {
      console.log('No embeddings found. Run `codegraph embed` first.');
      db.close();
      return null;
    }

    const storedModel = getEmbeddingMeta(db, 'model') || null;
    const dimStr = getEmbeddingMeta(db, 'dim');
    const storedDim = dimStr ? parseInt(dimStr, 10) : null;

    // Resolve model key: explicit option wins, else reverse-look-up the
    // stored model name in the registry.
    let modelKey = opts.model || null;
    if (!modelKey && storedModel) {
      const match = Object.entries(MODELS).find(([, cfg]) => cfg.name === storedModel);
      if (match) modelKey = match[0];
    }

    // Pre-filter by kind / plain file substring to shrink the vector scan;
    // glob patterns are handled by applyFilters afterwards.
    const isGlob = opts.filePattern && /[*?[\]]/.test(opts.filePattern);
    const conditions = [];
    const params = [];
    if (opts.kind) {
      conditions.push('n.kind = ?');
      params.push(opts.kind);
    }
    if (opts.filePattern && !isGlob) {
      conditions.push('n.file LIKE ?');
      params.push(`%${opts.filePattern}%`);
    }

    let sql = `
      SELECT e.node_id, e.vector, e.text_preview, n.name, n.kind, n.file, n.line, n.end_line, n.role
      FROM embeddings e
      JOIN nodes n ON e.node_id = n.id
    `;
    if (conditions.length > 0) {
      sql += ` WHERE ${conditions.join(' AND ')}`;
    }

    const rows = applyFilters(db.prepare(sql).all(...params), { ...opts, isGlob });
    return { db, rows, modelKey, storedDim };
  } catch (err) {
    db.close();
    throw err;
  }
}
/**
 * Single-query semantic search — returns data instead of printing.
 * Returns { results: [{ name, kind, file, line, similarity }] } or null on failure.
 *
 * @param {string} query - Natural-language query
 * @param {string} [customDbPath] - Override path to graph.db
 * @param {object} [opts] - limit, minScore, model, kind, filePattern, noTests
 */
export async function searchData(query, customDbPath, opts = {}) {
  const limit = opts.limit || 15;
  // FIX: `??` (not `||`) so an explicit minScore of 0 is honored.
  const minScore = opts.minScore ?? 0.2;

  const prepared = prepareSearch(customDbPath, opts);
  if (!prepared) return null;
  const { db, rows, modelKey, storedDim } = prepared;

  try {
    const {
      vectors: [queryVec],
      dim,
    } = await embed([query], modelKey);

    if (storedDim && dim !== storedDim) {
      console.log(
        `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
      );
      console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
      return null;
    }

    const hc = new Map();
    const results = [];
    for (const row of rows) {
      // Stored vectors are raw Float32 blobs; copy out of the Buffer first.
      const vec = new Float32Array(new Uint8Array(row.vector).buffer);
      const sim = cosineSim(queryVec, vec);

      if (sim >= minScore) {
        results.push({
          ...normalizeSymbol(row, db, hc),
          similarity: sim,
        });
      }
    }

    results.sort((a, b) => b.similarity - a.similarity);
    return { results: results.slice(0, limit) };
  } finally {
    db.close();
  }
}
/**
 * Multi-query semantic search with Reciprocal Rank Fusion (RRF).
 * Returns { results: [{ name, kind, file, line, rrf, queryScores }] } or null on failure.
 *
 * @param {string[]} queries - Distinct natural-language queries to fuse
 * @param {string} [customDbPath] - Override path to graph.db
 * @param {object} [opts] - limit, minScore, rrfK, model, kind, filePattern, noTests
 */
export async function multiSearchData(queries, customDbPath, opts = {}) {
  const limit = opts.limit || 15;
  // FIX: `??` (not `||`) so an explicit minScore of 0 is honored.
  const minScore = opts.minScore ?? 0.2;
  const k = opts.rrfK || 60;

  const prepared = prepareSearch(customDbPath, opts);
  if (!prepared) return null;
  const { db, rows, modelKey, storedDim } = prepared;

  try {
    const { vectors: queryVecs, dim } = await embed(queries, modelKey);

    // Warn about similar queries that may bias RRF results
    const SIMILARITY_WARN_THRESHOLD = 0.85;
    for (let i = 0; i < queryVecs.length; i++) {
      for (let j = i + 1; j < queryVecs.length; j++) {
        const sim = cosineSim(queryVecs[i], queryVecs[j]);
        if (sim >= SIMILARITY_WARN_THRESHOLD) {
          warn(
            `Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
              `(${(sim * 100).toFixed(0)}% cosine similarity). ` +
              `This may bias RRF results toward their shared matches. ` +
              `Consider using more distinct queries.`,
          );
        }
      }
    }

    if (storedDim && dim !== storedDim) {
      console.log(
        `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
      );
      console.log(`  Re-run \`codegraph embed\` with the same model, or use --model to match.`);
      return null;
    }

    // Parse row vectors once
    const rowVecs = rows.map((row) => new Float32Array(new Uint8Array(row.vector).buffer));

    // For each query: compute similarities, filter by minScore, rank
    const perQueryRanked = queries.map((_query, qi) => {
      const scored = [];
      for (let ri = 0; ri < rows.length; ri++) {
        const sim = cosineSim(queryVecs[qi], rowVecs[ri]);
        if (sim >= minScore) {
          scored.push({ rowIndex: ri, similarity: sim });
        }
      }
      scored.sort((a, b) => b.similarity - a.similarity);
      // Assign 1-indexed ranks
      return scored.map((item, rank) => ({ ...item, rank: rank + 1 }));
    });

    // Fuse results using RRF: for each unique row, sum 1/(k + rank_i) across queries
    const fusionMap = new Map(); // rowIndex -> { rrfScore, queryScores[] }
    for (let qi = 0; qi < queries.length; qi++) {
      for (const item of perQueryRanked[qi]) {
        if (!fusionMap.has(item.rowIndex)) {
          fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] });
        }
        const entry = fusionMap.get(item.rowIndex);
        entry.rrfScore += 1 / (k + item.rank);
        entry.queryScores.push({
          query: queries[qi],
          similarity: item.similarity,
          rank: item.rank,
        });
      }
    }

    // Build results sorted by RRF score
    const hc = new Map();
    const results = [];
    for (const [rowIndex, entry] of fusionMap) {
      const row = rows[rowIndex];
      results.push({
        ...normalizeSymbol(row, db, hc),
        rrf: entry.rrfScore,
        queryScores: entry.queryScores,
      });
    }

    results.sort((a, b) => b.rrf - a.rrf);
    return { results: results.slice(0, limit) };
  } finally {
    db.close();
  }
}

/**
 * Sanitize a user query for FTS5 MATCH syntax.
 * Special FTS5 characters are stripped, each remaining token is double-quoted
 * (FTS5 string syntax), and tokens are OR-joined so multi-word queries match
 * documents containing any of the words.
 *
 * @param {string} query - Raw user query
 * @returns {string|null} Safe MATCH expression, or null if nothing usable remains
 */
export function sanitizeFtsQuery(query) {
  // Remove FTS5 special chars that could cause syntax errors
  const cleaned = query.replace(/[*"():^{}~<>]/g, ' ').trim();
  if (!cleaned) return null;
  // Split into tokens, wrap with OR for multi-token queries
  const tokens = cleaned.split(/\s+/).filter((t) => t.length > 0);
  if (tokens.length === 0) return null;
  if (tokens.length === 1) return `"${tokens[0]}"`;
  return tokens.map((t) => `"${t}"`).join(' OR ');
}
+ */ +export function hasFtsIndex(db) { + try { + const row = db.prepare('SELECT COUNT(*) as c FROM fts_index').get(); + return row.c > 0; + } catch { + return false; + } +} diff --git a/src/embeddings/stores/sqlite-blob.js b/src/embeddings/stores/sqlite-blob.js new file mode 100644 index 00000000..75037ffa --- /dev/null +++ b/src/embeddings/stores/sqlite-blob.js @@ -0,0 +1,24 @@ +/** + * @typedef {object} VectorStore + * @property {(queryVec: Float32Array, rows: Array<{vector: Buffer}>) => Array<{index: number, score: number}>} search + * Score every row against a query vector and return scored indices. + * + * Future implementations (e.g. HNSW via `hnsw.js`) implement this same shape + * for approximate nearest-neighbor search. + */ + +/** + * Cosine similarity between two Float32Arrays. + */ +export function cosineSim(a, b) { + let dot = 0, + normA = 0, + normB = 0; + for (let i = 0; i < a.length; i++) { + dot += a[i] * b[i]; + normA += a[i] * a[i]; + normB += b[i] * b[i]; + } + const denom = Math.sqrt(normA) * Math.sqrt(normB); + return denom === 0 ? 0 : dot / denom; +} diff --git a/src/embeddings/strategies/source.js b/src/embeddings/strategies/source.js new file mode 100644 index 00000000..3b25e0f3 --- /dev/null +++ b/src/embeddings/strategies/source.js @@ -0,0 +1,14 @@ +import { splitIdentifier } from './text-utils.js'; + +/** + * Build raw source-code text for a symbol (original strategy). + */ +export function buildSourceText(node, file, lines) { + const startLine = Math.max(0, node.line - 1); + const endLine = node.end_line + ? 
Math.min(lines.length, node.end_line) + : Math.min(lines.length, startLine + 15); + const context = lines.slice(startLine, endLine).join('\n'); + const readable = splitIdentifier(node.name); + return `${node.kind} ${node.name} (${readable}) in ${file}\n${context}`; +} diff --git a/src/embeddings/strategies/structured.js b/src/embeddings/strategies/structured.js new file mode 100644 index 00000000..c488d1c6 --- /dev/null +++ b/src/embeddings/strategies/structured.js @@ -0,0 +1,43 @@ +import { findCalleeNames, findCallerNames } from '../../db.js'; +import { extractLeadingComment, splitIdentifier } from './text-utils.js'; + +/** + * Build graph-enriched text for a symbol using dependency context. + * Produces compact, semantic text (~100 tokens) instead of full source code. + */ +export function buildStructuredText(node, file, lines, db) { + const readable = splitIdentifier(node.name); + const parts = [`${node.kind} ${node.name} (${readable}) in ${file}`]; + const startLine = Math.max(0, node.line - 1); + + // Extract parameters from signature (best-effort, single-line) + const sigLine = lines[startLine] || ''; + const paramMatch = sigLine.match(/\(([^)]*)\)/); + if (paramMatch?.[1]?.trim()) { + parts.push(`Parameters: ${paramMatch[1].trim()}`); + } + + // Graph context: callees (capped at 10) + const callees = findCalleeNames(db, node.id); + if (callees.length > 0) { + parts.push(`Calls: ${callees.slice(0, 10).join(', ')}`); + } + + // Graph context: callers (capped at 10) + const callers = findCallerNames(db, node.id); + if (callers.length > 0) { + parts.push(`Called by: ${callers.slice(0, 10).join(', ')}`); + } + + // Leading comment (high semantic value) or first few lines of code + const comment = extractLeadingComment(lines, startLine); + if (comment) { + parts.push(comment); + } else { + const endLine = Math.min(lines.length, startLine + 4); + const snippet = lines.slice(startLine, endLine).join('\n').trim(); + if (snippet) parts.push(snippet); + } + + return 
parts.join('\n'); +} diff --git a/src/embeddings/strategies/text-utils.js b/src/embeddings/strategies/text-utils.js new file mode 100644 index 00000000..fca8f29e --- /dev/null +++ b/src/embeddings/strategies/text-utils.js @@ -0,0 +1,43 @@ +/** + * Split an identifier into readable words. + * camelCase/PascalCase -> "camel Case", snake_case -> "snake case", kebab-case -> "kebab case" + */ +export function splitIdentifier(name) { + return name + .replace(/([a-z])([A-Z])/g, '$1 $2') + .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') + .replace(/[_-]+/g, ' ') + .trim(); +} + +/** + * Extract leading comment text (JSDoc, //, #, etc.) above a function line. + * Returns the cleaned comment text or null if none found. + */ +export function extractLeadingComment(lines, fnLineIndex) { + if (fnLineIndex > lines.length) return null; + const raw = []; + for (let i = fnLineIndex - 1; i >= Math.max(0, fnLineIndex - 15); i--) { + if (i >= lines.length) continue; + const trimmed = lines[i].trim(); + if (/^(\/\/|\/\*|\*\/|\*|#|\/\/\/)/.test(trimmed)) { + raw.unshift(trimmed); + } else if (trimmed === '') { + if (raw.length > 0) break; + } else { + break; + } + } + if (raw.length === 0) return null; + return raw + .map((line) => + line + .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */ + .replace(/^\*\s?/, '') // middle * lines + .replace(/^\/\/\/?\s?/, '') // // or /// + .replace(/^#\s?/, '') // # (Python/Ruby) + .trim(), + ) + .filter((l) => l.length > 0) + .join(' '); +} diff --git a/src/index.js b/src/index.js index bca2cec7..576efac6 100644 --- a/src/index.js +++ b/src/index.js @@ -23,7 +23,12 @@ export { loadConfig } from './config.js'; export { EXTENSIONS, IGNORE_DIRS } from './constants.js'; export { findCycles } from './cycles.js'; export { dataflowData } from './dataflow.js'; -export { buildEmbeddings, hybridSearchData, multiSearchData, searchData } from './embedder.js'; +export { + buildEmbeddings, + hybridSearchData, + multiSearchData, + searchData, +} 
from './embeddings/index.js'; export { AnalysisError, BoundaryError, diff --git a/src/mcp/tools/semantic-search.js b/src/mcp/tools/semantic-search.js index 06ef8354..2fa22ea5 100644 --- a/src/mcp/tools/semantic-search.js +++ b/src/mcp/tools/semantic-search.js @@ -11,7 +11,7 @@ export async function handler(args, ctx) { }; if (mode === 'keyword') { - const { ftsSearchData } = await import('../../embedder.js'); + const { ftsSearchData } = await import('../../embeddings/index.js'); const result = ftsSearchData(args.query, ctx.dbPath, searchOpts); if (result === null) { return { @@ -28,7 +28,7 @@ export async function handler(args, ctx) { } if (mode === 'semantic') { - const { searchData } = await import('../../embedder.js'); + const { searchData } = await import('../../embeddings/index.js'); const result = await searchData(args.query, ctx.dbPath, searchOpts); if (result === null) { return { @@ -45,7 +45,7 @@ export async function handler(args, ctx) { } // hybrid (default) — falls back to semantic if no FTS5 - const { hybridSearchData, searchData } = await import('../../embedder.js'); + const { hybridSearchData, searchData } = await import('../../embeddings/index.js'); let result = await hybridSearchData(args.query, ctx.dbPath, searchOpts); if (result === null) { result = await searchData(args.query, ctx.dbPath, searchOpts); diff --git a/tests/search/embedder-search.test.js b/tests/search/embedder-search.test.js index 93ea518c..86fe5543 100644 --- a/tests/search/embedder-search.test.js +++ b/tests/search/embedder-search.test.js @@ -38,7 +38,7 @@ import { multiSearchData, search, searchData, -} from '../../src/embedder.js'; +} from '../../src/embeddings/index.js'; // ─── Helpers ─────────────────────────────────────────────────────────── diff --git a/tests/search/embedding-regression.test.js b/tests/search/embedding-regression.test.js index f1004bf3..56222875 100644 --- a/tests/search/embedding-regression.test.js +++ b/tests/search/embedding-regression.test.js @@ -23,7 
+23,7 @@ try { // Lazy-import to avoid top-level errors when transformers is missing const { buildGraph } = await import('../../src/builder.js'); -const { buildEmbeddings, searchData } = await import('../../src/embedder.js'); +const { buildEmbeddings, searchData } = await import('../../src/embeddings/index.js'); // Same ES-module fixture files used by build.test.js const FIXTURE_FILES = { diff --git a/tests/search/embedding-strategy.test.js b/tests/search/embedding-strategy.test.js index e1553678..70215559 100644 --- a/tests/search/embedding-strategy.test.js +++ b/tests/search/embedding-strategy.test.js @@ -31,7 +31,7 @@ import { EMBEDDING_STRATEGIES, estimateTokens, MODELS, -} from '../../src/embedder.js'; +} from '../../src/embeddings/index.js'; // ─── Helpers ─────────────────────────────────────────────────────────── diff --git a/tests/unit/prompt-install.test.js b/tests/unit/prompt-install.test.js index 6a36c2de..f23a73f8 100644 --- a/tests/unit/prompt-install.test.js +++ b/tests/unit/prompt-install.test.js @@ -1,5 +1,5 @@ /** - * Unit tests for the interactive install prompt in src/embedder.js. + * Unit tests for the interactive install prompt in src/embeddings/models.js. * * Tests the promptInstall() + loadTransformers() flow when * @huggingface/transformers is missing. 
@@ -44,7 +44,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -71,7 +71,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -99,7 +99,7 @@ describe('loadTransformers install prompt', () => { throw new Error('Cannot find package'); }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); await expect(embed(['test'], 'minilm')).rejects.toThrow( 'Semantic search requires @huggingface/transformers', @@ -137,7 +137,7 @@ describe('loadTransformers install prompt', () => { }; }); - const { embed } = await import('../../src/embedder.js'); + const { embed } = await import('../../src/embeddings/index.js'); const result = await embed(['test text'], 'minilm'); expect(result.vectors).toHaveLength(1);