One-file CLI: the trade-off estimator
Everything in this book, distilled into one tool that answers the practical question every engineer asks before compressing vectors:
If I compress my embeddings with these settings, how much memory do I save, and what recall do I keep?
pqtool.py builds an IVFPQ index over your vectors (or synthetic data), measures
recall against exact search, and reports the memory savings — so you can pick
parameters with evidence instead of guesswork.
The steps
- Load vectors from an .npy file (shape [n, d]), or generate synthetic clustered data with --synthetic.
- Hold out some vectors as queries and compute their exact neighbors (ground truth).
- Build an IVFPQ index with your chosen nlist, m, and nprobe.
- Measure recall@k — both raw and with exact re-ranking.
- Report recall, memory (float32 vs PQ codes), and the fraction of cells scanned.
Install & run
pip install numpy # the only third-party dependency; the local ivfpq.py must also be importable
# try it on synthetic data (no files required)
python pqtool.py --synthetic --m 16 --nlist 128 --nprobe 8
# estimate for your own embeddings
python pqtool.py --npy embeddings.npy --m 16 --nlist 256 --nprobe 16 --rerank 100
It works — real output
$ python pqtool.py --synthetic --m 16 --nlist 128 --nprobe 8
synthetic: 10000 vectors x 64 dims
building IVFPQ (nlist=128, m=16) ...
--- results ---
recall@10 (raw) : 0.711
recall@10 (+rerank 100): 1.000
memory (float32 vectors) : 2.6 MB
memory (PQ codes) : 0.16 MB (16x smaller)
per query, IVF scans ~6.2% of cells
The read-out is the whole decision in one place: 16× less memory, recall
1.0 after re-ranking, scanning only 6% of cells. Change --m to trade
memory for raw recall, --nprobe to trade speed for recall, and --rerank to
trade a little query time for accuracy.
The complete script
#!/usr/bin/env python3
"""
pqtool.py — estimate the memory / recall / speed trade-off of compressing your
vectors with IVF + Product Quantization, using the from-scratch index in ivfpq.py.
Answers the practical question: "If I compress my embeddings with these settings,
how much memory do I save and what recall do I keep?"
Usage:
# try it on synthetic clustered data (no files needed)
python pqtool.py --synthetic
# estimate for your own embeddings (an .npy array of shape [n, d])
python pqtool.py --npy embeddings.npy --m 16 --nlist 256 --nprobe 16
Requirements: numpy (and the local ivfpq.py).
"""
from __future__ import annotations
import argparse
import numpy as np
from ivfpq import flat_knn, IVFPQ
def make_synthetic(n=10000, d=64, seed=0):
    """Generate ``n`` clustered ``d``-dimensional vectors for a quick demo.

    Cluster centers are drawn from a normal distribution with standard
    deviation 5 (one center per 250 points, but never fewer than two).
    Point ``i`` is center ``i % n_centers`` plus unit-variance Gaussian noise.

    Args:
        n: number of vectors to generate; must be at least 1.
        d: dimensionality of each vector.
        seed: seed passed to ``np.random.default_rng`` so runs are repeatable.

    Returns:
        A float64 array of shape ``(n, d)``.

    Raises:
        ValueError: if ``n`` is less than 1.
    """
    if n < 1:
        # Fail with a clear message instead of an obscure stacking error.
        raise ValueError(f"n must be at least 1, got {n}")
    rng = np.random.default_rng(seed)
    centers = rng.normal(scale=5, size=(max(n // 250, 2), d))
    # Round-robin assignment of points to cluster centers.
    assignment = np.arange(n) % len(centers)
    # A single (n, d) draw fills row by row, consuming the generator in the
    # same order as n separate (1, d) draws, so the output for a given seed
    # is unchanged while the Python-level loop and vstack disappear.
    return centers[assignment] + rng.normal(size=(n, d))
def main(argv=None):
    """Run the IVF-PQ trade-off estimator from the command line.

    Loads (or synthesizes) a dataset, uses perturbed copies of some vectors
    as queries, computes their exact neighbors with ``flat_knn``, builds an
    ``IVFPQ`` index, and prints recall@k (raw and, optionally, after exact
    re-ranking), the float32 vs PQ-code memory footprint, and the fraction
    of IVF cells probed per query.

    Args:
        argv: argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Raises:
        SystemExit: on invalid arguments or unusable input data.
    """
    p = argparse.ArgumentParser(description="Estimate IVF-PQ memory/recall trade-off.")
    p.add_argument("--npy", help="path to an .npy array of shape [n, d]")
    p.add_argument("--synthetic", action="store_true",
                   help="use generated clustered data instead of a file")
    p.add_argument("--n", type=int, default=10000, help="synthetic dataset size")
    p.add_argument("--d", type=int, default=64, help="synthetic dimensionality")
    p.add_argument("--m", type=int, default=16, help="PQ sub-vectors / bytes per code")
    p.add_argument("--nlist", type=int, default=128, help="IVF partitions")
    p.add_argument("--nprobe", type=int, default=8, help="IVF cells probed per query")
    p.add_argument("--rerank", type=int, default=100,
                   help="candidates to re-rank exactly (0 = no re-rank)")
    p.add_argument("-k", type=int, default=10, help="neighbors to return")
    p.add_argument("--queries", type=int, default=100, help="evaluation queries")
    args = p.parse_args(argv)

    # Reject values that would otherwise surface much later as a
    # ZeroDivisionError or a meaningless report.
    positive_flags = (
        ("--m", args.m),
        ("--nlist", args.nlist),
        ("--nprobe", args.nprobe),
        ("-k", args.k),
        ("--queries", args.queries),
    )
    for flag, value in positive_flags:
        if value < 1:
            p.error(f"{flag} must be a positive integer, got {value}")
    if args.rerank < 0:
        p.error(f"--rerank must be >= 0, got {args.rerank}")

    # --npy takes precedence; with neither flag the tool falls back to
    # synthetic data.
    if args.npy:
        try:
            data = np.load(args.npy).astype(np.float64)
        except (OSError, ValueError) as err:
            raise SystemExit(f"error: cannot load {args.npy}: {err}") from err
        if data.ndim != 2 or 0 in data.shape:
            raise SystemExit(f"error: {args.npy} must hold a non-empty 2-D array "
                             f"of shape [n, d], got shape {data.shape}")
        print(f"loaded {data.shape[0]} vectors x {data.shape[1]} dims from {args.npy}")
    else:
        if args.n < 1 or args.d < 1:
            p.error(f"--n and --d must be positive integers, got {args.n} and {args.d}")
        data = make_synthetic(args.n, args.d)
        print(f"synthetic: {data.shape[0]} vectors x {data.shape[1]} dims")

    N, D = data.shape
    if D % args.m != 0:
        raise SystemExit(f"error: dim {D} not divisible by --m {args.m}; "
                         f"pick m in {[x for x in (4,8,16,32) if D % x == 0]}")

    # Queries are noisy copies of randomly chosen dataset vectors; the fixed
    # seed keeps the evaluation repeatable across runs.
    rng = np.random.default_rng(123)
    qidx = rng.choice(N, size=min(args.queries, N), replace=False)
    queries = data[qidx] + rng.normal(scale=0.5, size=(len(qidx), D))
    exact = [set(flat_knn(data, q, args.k)[0].tolist()) for q in queries]

    print(f"building IVFPQ (nlist={args.nlist}, m={args.m}) ...")
    index = IVFPQ(nlist=args.nlist, m=args.m, ksub=256, seed=1).train(data)
    index.add(data)

    # Probing more cells than exist is meaningless; cap it so both the search
    # and the reported scan fraction stay within 100%.
    nprobe = min(args.nprobe, args.nlist)
    # The re-rank shortlist must hold at least k candidates, otherwise
    # "re-ranked" recall would be capped below the raw recall.
    shortlist = max(args.rerank, args.k)

    def recall(rerank):
        """Return mean recall@k over all queries, optionally re-ranked exactly."""
        use_rerank = bool(rerank and args.rerank)
        want = shortlist if use_rerank else args.k
        hits = 0
        for q, ex in zip(queries, exact):
            ids, _ = index.search(q, want, nprobe=nprobe)
            ids = np.asarray(ids)
            if use_rerank and len(ids):
                # Re-score the shortlist with exact squared L2 distances.
                d = ((data[ids] - q) ** 2).sum(1)
                ids = ids[np.argsort(d)[:args.k]]
            else:
                ids = ids[:args.k]
            hits += len(set(ids.tolist()) & ex)
        return hits / (len(queries) * args.k)

    flat_bytes = N * D * 4   # realistic float32 baseline
    code_bytes = N * args.m  # PQ codes (1 byte each)
    raw = recall(False)
    rr = recall(True) if args.rerank else None

    print("\n--- results ---")
    print(f" recall@{args.k} (raw) : {raw:.3f}")
    if rr is not None:
        print(f" recall@{args.k} (+rerank {args.rerank}): {rr:.3f}")
    print(f" memory (float32 vectors) : {flat_bytes / 1e6:.1f} MB")
    print(f" memory (PQ codes) : {code_bytes / 1e6:.2f} MB "
          f"({flat_bytes // code_bytes}x smaller)")
    print(f" per query, IVF scans ~{nprobe / args.nlist * 100:.1f}% of cells")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Point it at your real embeddings and you have an evidence-based way to choose IVFPQ parameters — the same decision FAISS users make every day, now with a tool you fully understand.