One-file CLI: semantic search

Everything in this book, distilled into one self-contained tool that does a genuinely useful job: semantic search over a text file. It builds vector embeddings, indexes them with our from-scratch HNSW, and answers queries by meaning — no machine-learning libraries required.

The steps

  1. Load documents (one per line) from a text file.
  2. Embed each document as a TF-IDF vector, built from scratch (term frequency × inverse document frequency — a classic, model-free text embedding).
  3. Index the vectors with HNSW (cosine distance).
  4. Query: embed the query the same way and HNSW-search for the nearest documents.
  5. Report the best matches with similarity scores.

There's also a VECTORS mode (--vectors embeddings.npy) to plug in real embeddings from a sentence model — the same index, better semantics.

Install & run

pip install numpy        # the only package TEXT mode needs (plus the local hnsw.py next to the script)

# one-shot query
python hnsw_search.py docs.txt --query "deep learning for recognizing speech"

# interactive
python hnsw_search.py docs.txt

# use real precomputed embeddings instead of TF-IDF
python hnsw_search.py docs.txt --vectors embeddings.npy --query "..."

It works — real output

With a small docs.txt of ten unrelated sentences:

$ python hnsw_search.py docs.txt --query "deep learning for recognizing speech" -k 3
loaded 10 documents from docs.txt
built TF-IDF index (vocab=81 terms)

query: 'deep learning for recognizing speech'
  1. (sim=0.312)  Neural networks are used for image and speech recognition tasks.
  2. (sim=0.217)  Machine learning models can recognize patterns in large datasets.
  3. (sim=0.096)  Vector embeddings turn words and sentences into numbers for search.
$ python hnsw_search.py docs.txt --query "how do search engines use vectors" -k 3
query: 'how do search engines use vectors'
  1. (sim=0.312)  Approximate nearest neighbor search powers modern vector databases.
  2. (sim=0.293)  Vector embeddings turn words and sentences into numbers for search.
  3. (sim=0.000)  The cat sat on the warm windowsill in the sunshine.
$ python hnsw_search.py docs.txt --query "baking homemade bread" -k 3
query: 'baking homemade bread'
  1. (sim=0.334)  Bake the bread at 220 degrees for about thirty minutes.

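Each query surfaces the most relevant document at the top. Keep in mind how TF-IDF gets there: it matches on shared words, not on meaning. The first query finds the neural-networks sentence through the shared terms "speech" and "for", and the third finds the bread sentence through "bread" — to this tokenizer "baking" and "bake" are different tokens. To match by meaning when the words differ ("deep learning" → "neural networks"; "baking" → "bake"), plug in real embeddings with VECTORS mode. Either way, the nearest-neighbor lookup itself is HNSW serving the search.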

The complete script

#!/usr/bin/env python3
"""
hnsw_search.py — a self-contained semantic search CLI built on HNSW.

Two modes:

  TEXT mode (default, no extra deps): build TF-IDF vectors from a text file
  (one document per line) entirely from scratch, index them with HNSW, and
  answer nearest-neighbor queries.

      python hnsw_search.py docs.txt --query "machine learning for search"
      python hnsw_search.py docs.txt          # interactive prompt

  VECTORS mode: use real precomputed embeddings (e.g. from a sentence model),
  one row per line in docs.txt.

      python hnsw_search.py docs.txt --vectors embeddings.npy --query "..."

The HNSW index is the from-scratch implementation in hnsw.py.

Requirements: numpy (and the local hnsw.py). No ML libraries needed for TEXT mode.
"""

from __future__ import annotations

import argparse
import re
import sys

import numpy as np

from hnsw import HNSW


# --------------------------------------------------------------------------- #
# From-scratch TF-IDF (so TEXT mode needs no ML libraries)
# --------------------------------------------------------------------------- #
def tokenize(text):
    """Return the lowercase ASCII alphanumeric tokens of *text*, in order.

    Any character outside ``a-z`` / ``0-9`` (after lowercasing) acts as a
    separator and is dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]


class Tfidf:
    """Bare-bones TF-IDF vectorizer.

    ``fit`` learns an alphabetically ordered vocabulary plus one smoothed IDF
    weight per term; ``transform`` turns documents into dense rows of
    (raw term count * IDF). Rows are not length-normalized.
    """

    def fit(self, docs):
        """Learn ``self.vocab`` (term -> column) and ``self.idf`` from *docs*.

        Returns ``self`` so the call can be chained.
        """
        doc_freq = {}
        for text in docs:
            # Iterate a set so each document counts a term at most once.
            for term in set(tokenize(text)):
                doc_freq[term] = doc_freq.get(term, 0) + 1

        # Columns follow the sorted order of the terms.
        self.vocab = {term: col for col, term in enumerate(sorted(doc_freq))}

        total = max(len(docs), 1)
        self.idf = np.zeros(len(self.vocab))
        for term, col in self.vocab.items():
            # smoothed idf
            self.idf[col] = np.log((1 + total) / (1 + doc_freq[term])) + 1.0
        return self

    def transform(self, docs):
        """Return a ``(len(docs), len(vocab))`` array of TF-IDF rows.

        Terms that are not in the learned vocabulary are ignored.
        """
        matrix = np.zeros((len(docs), len(self.vocab)))
        column_of = self.vocab.get
        # Each `row` is a view into `matrix`, so in-place updates land there.
        for row, text in zip(matrix, docs):
            for term in tokenize(text):
                col = column_of(term)
                if col is None:
                    continue
                row[col] += 1.0        # term frequency
            row *= self.idf            # * idf
        return matrix


# --------------------------------------------------------------------------- #
# Index building
# --------------------------------------------------------------------------- #
def build_text_index(docs, M, ef_construction):
    """Fit TF-IDF on *docs* and index the resulting vectors with HNSW.

    Returns ``(index, vectorizer)``. The fitted vectorizer is returned so
    that queries can later be embedded into the same vector space.
    """
    vectorizer = Tfidf()
    vectorizer.fit(docs)
    embeddings = vectorizer.transform(docs)
    # The precomputed-vector builder uses the same HNSW settings (cosine
    # distance, seed=1) and inserts the rows in the same order.
    return build_vector_index(embeddings, M, ef_construction), vectorizer


def build_vector_index(vectors, M, ef_construction):
    """Build a cosine-distance HNSW index over precomputed embeddings.

    Args:
        vectors: 2-D array-like, one embedding per row.
        M: graph degree passed to ``HNSW``.
        ef_construction: build breadth passed to ``HNSW``.

    Returns:
        The populated ``HNSW`` index; rows are added in their original order.

    Raises:
        ValueError: if *vectors* is not 2-D (for example a single flat
            vector), which would otherwise fail with an opaque IndexError.
    """
    X = np.asarray(vectors, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(
            f"expected a 2-D array of embeddings (one row per document), "
            f"got {X.ndim}-D with shape {X.shape}"
        )
    index = HNSW(dim=X.shape[1], M=M, ef_construction=ef_construction,
                 distance="cosine", seed=1)
    for row in X:
        index.add(row)
    return index


# --------------------------------------------------------------------------- #
# CLI
# --------------------------------------------------------------------------- #
def main(argv=None):
    """Parse command-line arguments, build the index, and answer queries.

    TEXT mode (default) embeds the documents and each query with TF-IDF and
    answers either the single ``--query`` or an interactive prompt loop.

    VECTORS mode (``--vectors``) indexes precomputed embeddings. The script
    cannot embed query text itself in that mode, so the query must be given
    as an embedding via ``--query-vector``; this is checked before any file
    is loaded or indexed, instead of failing after the index is built.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: on invalid arguments or unusable input files.
    """
    p = argparse.ArgumentParser(description="Semantic search over a text file "
                                            "using a from-scratch HNSW index.")
    p.add_argument("docs", help="text file, one document per line")
    p.add_argument("--query", help="query string (omit for interactive mode)")
    p.add_argument("-k", type=int, default=5, help="results to return (default 5)")
    p.add_argument("--ef", type=int, default=50, help="search breadth (default 50)")
    p.add_argument("--M", type=int, default=16, help="graph degree (default 16)")
    p.add_argument("--ef-construction", type=int, default=200,
                   help="build breadth (default 200)")
    p.add_argument("--vectors", help="optional .npy of precomputed embeddings")
    p.add_argument("--query-vector",
                   help="VECTORS mode: .npy holding the query's embedding "
                        "(same model and dimension as --vectors)")
    args = p.parse_args(argv)

    # Validate argument combinations up front so mistakes surface before any
    # (potentially slow) loading and index construction.
    if args.k < 1:
        p.error("-k must be at least 1")
    if args.query_vector and not args.vectors:
        p.error("--query-vector is only valid together with --vectors")
    if args.vectors and not args.query_vector:
        p.error("VECTORS mode needs precomputed query embeddings "
                "(pass --query-vector query.npy)")

    with open(args.docs, encoding="utf-8") as f:
        docs = [line.rstrip("\n") for line in f if line.strip()]
    print(f"loaded {len(docs)} documents from {args.docs}")
    if not docs:
        sys.exit(f"error: no documents found in {args.docs}")

    def show_results(label, q):
        """Search the index with vector *q* and print the ranked matches."""
        results = index.search(q, k=args.k, ef=args.ef)
        print(f"\nquery: {label!r}")
        for rank, (i, dist) in enumerate(results, 1):
            # The index reports distances; 1 - distance is shown as similarity.
            print(f"  {rank}. (sim={1 - dist:.3f})  {docs[i]}")

    if args.vectors:
        vectors = np.load(args.vectors)
        if vectors.ndim != 2:
            sys.exit("error: --vectors must hold a 2-D array "
                     "(one row per document)")
        if len(vectors) != len(docs):
            sys.exit("error: number of vectors != number of documents")
        # Accept the query embedding as shape (dim,) or (1, dim).
        q = np.asarray(np.load(args.query_vector), dtype=np.float64).ravel()
        if q.size != vectors.shape[1]:
            sys.exit(f"error: query vector has {q.size} values, "
                     f"expected {vectors.shape[1]}")
        index = build_vector_index(vectors, args.M, args.ef_construction)
        print(f"indexed precomputed embeddings (dim={vectors.shape[1]})")
        # Label the output with the query text when given, else the file name.
        show_results(args.query or args.query_vector, q)
        return

    index, vec = build_text_index(docs, args.M, args.ef_construction)
    print(f"built TF-IDF index (vocab={len(vec.vocab)} terms)")

    def run_query(text):
        """Embed *text* with the fitted TF-IDF model and print the matches."""
        q = vec.transform([text])[0]
        if not q.any():
            # An all-zero vector has no direction, so cosine distance to it is
            # undefined; report that instead of printing arbitrary neighbors.
            print(f"\nquery: {text!r}")
            print("  (no query terms occur in the indexed documents)")
            return
        show_results(text, q)

    if args.query:
        run_query(args.query)
    else:
        print("\ntype a query (empty line to quit):")
        try:
            while True:
                text = input("> ").strip()
                if not text:
                    break
                run_query(text)
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C simply ends the interactive session.
            pass


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

That's the whole tool: load → TF-IDF embed → HNSW index → search, in one file you can read top to bottom. Swap TF-IDF for a real embedding model and you have the core of a production semantic-search or RAG retriever.