One-file CLI: semantic search
Everything in this book, distilled into one self-contained tool that does a genuinely useful job: semantic search over a text file. It builds vector embeddings, indexes them with our from-scratch HNSW, and answers queries by meaning — no machine-learning libraries required.
The steps
- Load documents (one per line) from a text file.
- Embed each document as a TF-IDF vector, built from scratch (term frequency × inverse document frequency — a classic, model-free text embedding).
- Index the vectors with HNSW (cosine distance).
- Query: embed the query the same way and HNSW-search for the nearest documents.
- Report the best matches with similarity scores.
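There's also a VECTORS mode (--vectors embeddings.npy) to plug in real
embeddings from a sentence model — the same index, better semantics. The script cannot embed query text itself in this mode, so the query has to be supplied as a precomputed embedding too.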
Install & run
pip install numpy # that's all TEXT mode needs
# one-shot query
python hnsw_search.py docs.txt --query "deep learning for recognizing speech"
# interactive
python hnsw_search.py docs.txt
# use real precomputed embeddings instead of TF-IDF
python hnsw_search.py docs.txt --vectors embeddings.npy --query "..."
It works — real output
With a small docs.txt of ten unrelated sentences:
$ python hnsw_search.py docs.txt --query "deep learning for recognizing speech" -k 3
loaded 10 documents from docs.txt
built TF-IDF index (vocab=81 terms)
query: 'deep learning for recognizing speech'
1. (sim=0.312) Neural networks are used for image and speech recognition tasks.
2. (sim=0.217) Machine learning models can recognize patterns in large datasets.
3. (sim=0.096) Vector embeddings turn words and sentences into numbers for search.
$ python hnsw_search.py docs.txt --query "how do search engines use vectors" -k 3
query: 'how do search engines use vectors'
1. (sim=0.312) Approximate nearest neighbor search powers modern vector databases.
2. (sim=0.293) Vector embeddings turn words and sentences into numbers for search.
3. (sim=0.000) The cat sat on the warm windowsill in the sunshine.
$ python hnsw_search.py docs.txt --query "baking homemade bread" -k 3
query: 'baking homemade bread'
1. (sim=0.334) Bake the bread at 220 degrees for about thirty minutes.
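Each query surfaces the most relevant document at the top. Be clear about why, though: TF-IDF matches on shared terms, not on meaning. The top hits above are driven by exact token overlap ("speech", "search", "bread"), weighted by how rare each term is across the documents. The tokenizer does no stemming, so TF-IDF by itself cannot relate "deep learning" to "neural networks" or "baking" to "bake". For genuinely meaning-based matching, use VECTORS mode with real embeddings — HNSW serves the nearest neighbors the same way in both cases.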
The complete script
#!/usr/bin/env python3
"""
hnsw_search.py — a self-contained semantic search CLI built on HNSW.
Two modes:
TEXT mode (default, no extra deps): build TF-IDF vectors from a text file
(one document per line) entirely from scratch, index them with HNSW, and
answer nearest-neighbor queries.
python hnsw_search.py docs.txt --query "machine learning for search"
python hnsw_search.py docs.txt # interactive prompt
VECTORS mode: use real precomputed embeddings (e.g. from a sentence model),
one row per non-blank line of docs.txt (blank lines are skipped on load).
python hnsw_search.py docs.txt --vectors embeddings.npy --query "..."
The HNSW index is the from-scratch implementation in hnsw.py.
Requirements: numpy (and the local hnsw.py). No ML libraries needed for TEXT mode.
"""
from __future__ import annotations
import argparse
import re
import sys
import numpy as np
from hnsw import HNSW
# --------------------------------------------------------------------------- #
# From-scratch TF-IDF (so TEXT mode needs no ML libraries)
# --------------------------------------------------------------------------- #
def tokenize(text):
    """Split *text* into lowercase alphanumeric tokens.

    The text is lowercased first; every maximal run of the characters
    ``a-z`` / ``0-9`` then becomes one token and anything else acts as a
    separator. Returns the tokens as a list, in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]
class Tfidf:
    """Tiny TF-IDF vectorizer with a fit/transform interface.

    Attributes set by ``fit``:
        vocab: dict mapping each term to its column index; terms are
            numbered in sorted order.
        idf: 1-D float array holding one smoothed inverse-document-frequency
            weight per vocabulary column.
    """

    def fit(self, docs):
        """Learn the vocabulary and IDF weights from *docs* and return ``self``."""
        # Document frequency: the number of documents containing each term
        # at least once (hence the set() around the token list).
        doc_freq = {}
        for text in docs:
            for term in set(tokenize(text)):
                doc_freq[term] = doc_freq.get(term, 0) + 1

        self.vocab = {term: col for col, term in enumerate(sorted(doc_freq))}

        # max(..., 1) keeps the document count at 1 for an empty corpus.
        total = max(len(docs), 1)
        # Smoothed idf: log((1 + N) / (1 + df)) + 1, one weight per column.
        weights = [
            np.log((1 + total) / (1 + doc_freq[term])) + 1.0
            for term in self.vocab
        ]
        self.idf = np.array(weights, dtype=np.float64)
        return self

    def transform(self, docs):
        """Return a ``(len(docs), len(vocab))`` array of TF-IDF rows.

        Terms that are not in the fitted vocabulary are ignored.
        """
        matrix = np.zeros((len(docs), len(self.vocab)))
        column_of = self.vocab
        for row, text in zip(matrix, docs):
            # ``row`` is a view into ``matrix``, so updates land in place.
            for term in tokenize(text):
                col = column_of.get(term)
                if col is None:
                    continue
                row[col] += 1.0  # raw term frequency
        # Scale every row by the idf weights (broadcast across rows).
        matrix *= self.idf
        return matrix
# --------------------------------------------------------------------------- #
# Index building
# --------------------------------------------------------------------------- #
def build_text_index(docs, M, ef_construction):
    """Fit TF-IDF on *docs* and index the resulting vectors with HNSW.

    Args:
        docs: sequence of document strings.
        M: forwarded to the HNSW constructor as ``M``.
        ef_construction: forwarded to the HNSW constructor as
            ``ef_construction``.

    Returns:
        ``(index, vectorizer)``: the populated index and the fitted ``Tfidf``,
        which is needed later to embed queries the same way.
    """
    vectorizer = Tfidf().fit(docs)
    doc_matrix = vectorizer.transform(docs)
    # Index construction is the same as for precomputed embeddings
    # (cosine distance, seed=1, rows added in order), so reuse that builder.
    return build_vector_index(doc_matrix, M, ef_construction), vectorizer
def build_vector_index(vectors, M, ef_construction):
    """Build a cosine-distance HNSW index over precomputed embeddings.

    Args:
        vectors: 2-D array-like with one embedding per row; it is converted
            to a float64 array before indexing.
        M: forwarded to the HNSW constructor as ``M``.
        ef_construction: forwarded to the HNSW constructor as
            ``ef_construction``.

    Returns:
        The populated HNSW index. Rows are added in order.

    Raises:
        ValueError: if *vectors* is not two-dimensional.
    """
    X = np.asarray(vectors, dtype=np.float64)
    # Validate up front: a 1-D input would otherwise die on X.shape[1] with
    # an uninformative "tuple index out of range".
    if X.ndim != 2:
        raise ValueError(
            "vectors must be a 2-D array (one embedding per row), "
            f"got {X.ndim} dimension(s)"
        )
    index = HNSW(dim=X.shape[1], M=M, ef_construction=ef_construction,
                 distance="cosine", seed=1)
    for row in X:
        index.add(row)
    return index
# --------------------------------------------------------------------------- #
# CLI
# --------------------------------------------------------------------------- #
def main(argv=None):
    """Run the semantic-search command-line tool.

    TEXT mode (default) builds a TF-IDF index over the documents and answers
    either the single ``--query`` or queries typed at an interactive prompt.

    VECTORS mode (``--vectors``) indexes precomputed embeddings. This script
    cannot embed query text in that mode, so the query must be supplied as a
    precomputed embedding via ``--embedded-query``; ``--query``, if given, is
    then used only as the label printed above the results.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: on invalid arguments or unusable input (no documents,
            malformed or mismatched embeddings).
    """
    p = argparse.ArgumentParser(description="Semantic search over a text file "
                                            "using a from-scratch HNSW index.")
    p.add_argument("docs", help="text file, one document per line")
    p.add_argument("--query", help="query string (omit for interactive mode)")
    p.add_argument("-k", type=int, default=5, help="results to return (default 5)")
    p.add_argument("--ef", type=int, default=50, help="search breadth (default 50)")
    p.add_argument("--M", type=int, default=16, help="graph degree (default 16)")
    p.add_argument("--ef-construction", type=int, default=200,
                   help="build breadth (default 200)")
    p.add_argument("--vectors", help="optional .npy of precomputed embeddings")
    # Named so that it does not share a prefix with --query or --vectors:
    # argparse accepts unambiguous abbreviations, and existing ones must keep
    # resolving to the same option.
    p.add_argument("--embedded-query",
                   help=".npy holding one precomputed query embedding "
                        "(required with --vectors)")
    args = p.parse_args(argv)

    if args.embedded_query and not args.vectors:
        p.error("--embedded-query requires --vectors")
    if args.vectors and not args.embedded_query:
        # Fail before loading or indexing anything: without a query embedding
        # no search can run in this mode.
        sys.exit("VECTORS mode needs precomputed query embeddings "
                 "(pass --embedded-query query.npy)")

    with open(args.docs, encoding="utf-8") as f:
        # Blank lines are dropped, so row i of --vectors must correspond to
        # the i-th non-blank line.
        docs = [line.rstrip("\n") for line in f if line.strip()]
    if not docs:
        sys.exit(f"error: no documents found in {args.docs}")
    print(f"loaded {len(docs)} documents from {args.docs}")

    def search_and_print(label, q):
        """Search the index for vector *q* and print the ranked matches."""
        results = index.search(q, k=args.k, ef=args.ef)
        print(f"\nquery: {label!r}")
        for rank, (i, dist) in enumerate(results, 1):
            # The index uses cosine distance, so similarity is 1 - distance.
            print(f" {rank}. (sim={1 - dist:.3f}) {docs[i]}")

    if args.vectors:
        vectors = np.load(args.vectors)
        if vectors.ndim != 2:
            sys.exit("error: --vectors must hold a 2-D array "
                     "(one row per document)")
        if len(vectors) != len(docs):
            sys.exit("error: number of vectors != number of documents")
        # Accept the query as shape (dim,) or (1, dim).
        query_vec = np.asarray(np.load(args.embedded_query),
                               dtype=np.float64).reshape(-1)
        if query_vec.shape[0] != vectors.shape[1]:
            sys.exit(f"error: query embedding has {query_vec.shape[0]} values, "
                     f"expected {vectors.shape[1]}")
        index = build_vector_index(vectors, args.M, args.ef_construction)
        print(f"indexed precomputed embeddings (dim={vectors.shape[1]})")
        label = args.query if args.query is not None else args.embedded_query
        search_and_print(label, query_vec)
        return

    index, vec = build_text_index(docs, args.M, args.ef_construction)
    print(f"built TF-IDF index (vocab={len(vec.vocab)} terms)")

    def run_query(text):
        """Embed *text* with the fitted TF-IDF model and print its matches."""
        q = vec.transform([text])[0]
        if not q.any():
            # No query term is in the vocabulary, so the vector is all zeros
            # and cosine similarity is undefined; report that instead of
            # printing meaningless scores.
            print(f"\nquery: {text!r}")
            print(" (no query terms found in the indexed vocabulary)")
            return
        search_and_print(text, q)

    # "is not None" rather than truthiness: an explicit empty --query must not
    # silently drop into the interactive prompt.
    if args.query is not None:
        run_query(args.query)
        return

    print("\ntype a query (empty line to quit):")
    try:
        while True:
            text = input("> ").strip()
            if not text:
                break
            run_query(text)
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C is a normal way to leave the prompt.
        pass
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
That's the whole tool: load → TF-IDF embed → HNSW index → search, in one file you can read top to bottom. Swap TF-IDF for a real embedding model and you have the core of a production semantic-search or RAG retriever.