Skip to content

Indexers

HybridIndexProtocol

Bases: Protocol

Combined dense + sparse index with RRF or weighted fusion.

HybridIndex

HybridIndex

HybridIndex(dense: Any, sparse: Any, rrf_k: int = 60, over_fetch: int = 3)

Combined dense + sparse index with RRF fusion.

Parameters

dense: A DenseIndex instance (e.g. LanceDBIndex). sparse: A SparseIndex instance (e.g. BM25Index). rrf_k: RRF smoothing constant (default 60, from the original RRF paper). over_fetch: Multiplier on k when querying sub-indexes. Larger pools give RRF more candidates to promote chunks appearing in both lists.

Source code in src/verifiable_rag/indexers/hybrid.py
def __init__(
    self,
    dense: Any,
    sparse: Any,
    rrf_k: int = 60,
    over_fetch: int = 3,
) -> None:
    """Store the two sub-indexes and the fusion settings.

    Parameters
    ----------
    dense:
        Sub-index queried with the query embedding.
    sparse:
        Sub-index queried with the raw query string.
    rrf_k:
        RRF smoothing constant; must be strictly positive.
    over_fetch:
        Multiplier applied to ``k`` when querying each sub-index; must be
        at least 1.

    Raises
    ------
    ValueError
        If *rrf_k* is not positive or *over_fetch* is below 1.
    """
    # Validate before touching any attribute so a failed construction
    # leaves no half-initialised state behind.
    if rrf_k <= 0:
        message = f"rrf_k must be positive, got {rrf_k}"
        raise ValueError(message)
    if over_fetch < 1:
        message = f"over_fetch must be >= 1, got {over_fetch}"
        raise ValueError(message)

    self._dense, self._sparse = dense, sparse
    self._rrf_k, self._over_fetch = rrf_k, over_fetch

count property

count: int

Number of documents in the dense sub-index (both are kept in sync).

add

add(chunks: list[Chunk], embeddings: list[list[float]]) -> None

Index chunks into both sub-indexes.

Source code in src/verifiable_rag/indexers/hybrid.py
def add(self, chunks: list[Chunk], embeddings: list[list[float]]) -> None:
    """Add *chunks* to the dense and then the sparse sub-index.

    The dense sub-index receives the chunks together with *embeddings*;
    the sparse sub-index receives the chunks only.
    """
    dense_index, sparse_index = self._dense, self._sparse
    # Dense goes first: if it rejects the batch, the sparse side is left
    # untouched.
    dense_index.add(chunks, embeddings)
    sparse_index.add(chunks)

search

search(query: str, query_embedding: list[float], k: int) -> list[RetrievedChunk]

Return up to k chunks fused from dense and sparse results via RRF.

Source code in src/verifiable_rag/indexers/hybrid.py
def search(
    self,
    query: str,
    query_embedding: list[float],
    k: int,
) -> list[RetrievedChunk]:
    """Return up to *k* chunks fused from dense and sparse results via RRF.

    Each sub-index is asked for ``k * over_fetch`` candidates; the two
    ranked lists are then merged by ``_rrf_fuse`` using the configured
    ``rrf_k`` and the merged list is truncated to *k* entries.

    Parameters
    ----------
    query:
        Raw query text, passed to the sparse sub-index.
    query_embedding:
        Query vector, passed to the dense sub-index.
    k:
        Maximum number of results to return.

    Returns
    -------
    list[RetrievedChunk]
        At most *k* fused results; an empty list when *k* is not positive.
    """
    if k <= 0:
        # Without this guard a negative k would reach ``fused[:k]`` and
        # silently return all-but-the-last-|k| results, and k == 0 would
        # still issue two pointless sub-index queries.
        return []

    fetch = k * self._over_fetch
    dense_results = self._dense.search(query_embedding, fetch)
    sparse_results = self._sparse.search(query, fetch)
    fused = _rrf_fuse([dense_results, sparse_results], k=self._rrf_k)
    return fused[:k]

clear

clear() -> None

Clear both sub-indexes.

Source code in src/verifiable_rag/indexers/hybrid.py
def clear(self) -> None:
    """Empty the dense sub-index, then the sparse sub-index."""
    for sub_index in (self._dense, self._sparse):
        sub_index.clear()

LanceDBIndex

LanceDBIndex

LanceDBIndex(uri: str | Path = DEFAULT_URI, table_name: str = 'chunks', metric: str = 'cosine')

Dense ANN index backed by LanceDB.

Parameters

uri: Directory for the LanceDB database. Created on first write. table_name: Name of the table inside the database. Use different names to keep multiple collections in the same URI. metric: Distance metric: "cosine" (default, requires normalised vectors) or "l2".

Source code in src/verifiable_rag/indexers/lance.py
def __init__(
    self,
    uri: str | Path = DEFAULT_URI,
    table_name: str = "chunks",
    metric: str = "cosine",
) -> None:
    """Record the database location, table name and metric.

    Only attributes are assigned here; the database and table handles
    start out as ``None``.

    Parameters
    ----------
    uri:
        Database location; normalised to a ``Path``.
    table_name:
        Name of the table used by this index.
    metric:
        Metric name later used for score conversion and index building.
    """
    # Normalise first so an invalid *uri* fails before any state is set.
    self._uri = Path(uri)
    self._table_name, self._metric = table_name, metric

    # Handles start unset.
    self._db: Any = None
    self._table: Any = None

count property

count: int

Number of vectors currently stored.

add

add(chunks: list[Chunk], embeddings: list[list[float]]) -> None

Persist chunks with their embeddings. Appends if table exists.

Source code in src/verifiable_rag/indexers/lance.py
def add(self, chunks: list[Chunk], embeddings: list[list[float]]) -> None:
    """Persist *chunks* with their *embeddings*.  Appends if table exists."""
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"chunks and embeddings length mismatch: "
            f"{len(chunks)} vs {len(embeddings)}"
        )
    if not chunks:
        return

    records = [self._chunk_to_row(c, e) for c, e in zip(chunks, embeddings, strict=True)]
    db = self._connect()

    if self._table_exists(db):
        tbl = db.open_table(self._table_name)
        tbl.add(records)
    else:
        tbl = db.create_table(self._table_name, records)
    self._table = tbl

search

search(query_embedding: list[float], k: int) -> list[RetrievedChunk]

Return up to k chunks nearest to query_embedding.

Source code in src/verifiable_rag/indexers/lance.py
def search(
    self, query_embedding: list[float], k: int
) -> list[RetrievedChunk]:
    """Return up to *k* chunks nearest to *query_embedding*.

    Returns an empty list when no table is available. Each hit's score is
    derived from the row's ``_distance`` so that a higher score always
    means a closer match.
    """
    table = self._open_table()
    if table is None:
        return []

    hits: list[dict[str, Any]] = table.search(query_embedding).limit(k).to_list()
    cosine_mode = self._metric == "cosine"

    def _to_score(distance: float) -> float:
        # Cosine mode treats _distance as 2*(1 - cos_similarity) in [0, 4]
        # and maps it back to a similarity in [-1, 1]; any other metric
        # reports the negated distance so that larger is still better.
        # NOTE(review): the query above does not pass the metric
        # explicitly, so this conversion relies on the table's effective
        # distance type — confirm it matches, especially after
        # build_index().
        return (1.0 - distance / 2.0) if cosine_mode else -distance

    return [
        RetrievedChunk(
            chunk=self._row_to_chunk(hit),
            score=_to_score(float(hit["_distance"])),
            retrieval_method="dense",
        )
        for hit in hits
    ]

build_index

build_index() -> None

Build an HNSW ANN index. Call after bulk-loading for fast queries.

Source code in src/verifiable_rag/indexers/lance.py
def build_index(self) -> None:
    """Build an ANN index on the table using this index's metric.

    Intended to be called after bulk-loading so later queries are fast.

    NOTE(review): ``create_index`` is called with only ``metric=``, so the
    index type is whatever LanceDB defaults to — confirm that this is
    HNSW as previously documented.

    Raises
    ------
    RuntimeError
        If no table is available to index.
    """
    table = self._open_table()
    if table is not None:
        table.create_index(metric=self._metric)
        return
    raise RuntimeError(
        "Cannot build index: table is empty or does not exist yet."
    )

clear

clear() -> None

Drop the table and all indexed vectors.

Source code in src/verifiable_rag/indexers/lance.py
def clear(self) -> None:
    """Drop the table (if it exists) and forget the cached table handle."""
    database = self._connect()
    table_present = self._table_exists(database)
    if table_present:
        database.drop_table(self._table_name)
    # Reset the handle even when there was nothing to drop.
    self._table = None

BM25Index

BM25Index

BM25Index(tokenize: Callable[[list[str]], Any] | None = None, method: str = 'robertson')

Sparse BM25 retrieval index backed by bm25s.

Parameters

tokenize: Callable (texts: list[str]) -> list[list[str]] that converts raw texts into token lists. Defaults to bm25s.tokenize. method: BM25 variant passed to bm25s — "robertson" (default), "lucene", "atire", "bm25l", "bm25+".

Source code in src/verifiable_rag/indexers/sparse/bm25.py
def __init__(
    self,
    tokenize: Callable[[list[str]], Any] | None = None,
    method: str = "robertson",
) -> None:
    """Create an empty index.

    Parameters
    ----------
    tokenize:
        Optional callable applied to a list of texts; when ``None`` the
        library tokenizer is used at search time.
    method:
        BM25 variant name; defaults to ``"robertson"``.
    """
    self._tokenize, self._method = tokenize, method

    # Nothing is built yet: no index object and an empty corpus.
    self._index: Any = None
    self._corpus: list[dict[str, Any]] = []

    # Set whenever the corpus changes. The rebuild is postponed until the
    # next search()/save(), so a bulk ingest of N PDFs costs O(N) instead
    # of O(N²) full rebuilds.
    self._dirty: bool = False

count property

count: int

Number of documents currently indexed.

add

add(chunks: list[Chunk]) -> None

Append chunks to the corpus; rebuild is deferred to next search().

Lazy rebuild: the internal BM25 matrix is only recomputed on the next search() (or save()). This keeps a bulk ingest of N PDFs O(N) rather than O(N²) — previously every add() re-tokenised the entire corpus.

Source code in src/verifiable_rag/indexers/sparse/bm25.py
def add(self, chunks: list[Chunk]) -> None:
    """Queue *chunks* for indexing without rebuilding the BM25 matrix.

    The converted rows are appended to the in-memory corpus and the index
    is flagged dirty; the actual rebuild happens on the next ``search()``
    or ``save()``. Deferring it keeps a bulk ingest of N PDFs O(N) rather
    than O(N²), since the corpus is not re-tokenised on every call.

    An empty *chunks* list changes nothing.
    """
    if chunks:
        corpus = self._corpus
        for chunk in chunks:
            corpus.append(self._chunk_to_row(chunk))
        self._dirty = True

search

search(query: str, k: int) -> list[RetrievedChunk]

Return up to k chunks ranked by BM25 score for query.

Source code in src/verifiable_rag/indexers/sparse/bm25.py
def search(self, query: str, k: int) -> list[RetrievedChunk]:
    """Return up to *k* chunks ranked by BM25 score for *query*.

    An empty corpus yields an empty list. If the corpus changed since the
    last build (or nothing was built yet), the index is rebuilt first.
    """
    if not self._corpus:
        return []

    needs_rebuild = self._dirty or self._index is None
    if needs_rebuild:
        self._rebuild()

    bm25s = self._import_bm25s()
    # A user-supplied tokenizer wins over the library default.
    tokenize_fn = self._tokenize or bm25s.tokenize
    query_tokens = tokenize_fn([query])

    # Never request more hits than there are documents.
    limit = min(k, len(self._corpus))
    docs, doc_scores = self._index.retrieve(
        query_tokens,
        corpus=self._corpus,
        k=limit,
        show_progress=False,
    )

    # Only one query was submitted, so only the first result row is used.
    return [
        RetrievedChunk(
            chunk=self._row_to_chunk(row),
            score=float(raw_score),
            retrieval_method="sparse",
        )
        for row, raw_score in zip(docs[0], doc_scores[0], strict=True)
    ]

clear

clear() -> None

Remove all indexed documents.

Source code in src/verifiable_rag/indexers/sparse/bm25.py
def clear(self) -> None:
    """Discard the built index and the corpus, and reset the dirty flag."""
    self._index, self._corpus, self._dirty = None, [], False

save

save(path: str | Path) -> None

Persist the index and corpus to path directory.

Source code in src/verifiable_rag/indexers/sparse/bm25.py
def save(self, path: str | Path) -> None:
    """Write the index and corpus into the *path* directory.

    The directory (and any missing parents) is created. The index is
    stored under ``bm25`` and the corpus as ``corpus.json``. A pending
    rebuild is performed first so the saved index matches the corpus.

    Raises
    ------
    RuntimeError
        If the corpus is empty.
    """
    if not self._corpus:
        raise RuntimeError("Nothing to save — index is empty.")

    stale = self._dirty or self._index is None
    if stale:
        self._rebuild()

    target_dir = Path(path)
    target_dir.mkdir(parents=True, exist_ok=True)
    self._index.save(str(target_dir / "bm25"))

    # NOTE(review): the corpus keeps non-ASCII characters
    # (ensure_ascii=False) but is written with the platform default
    # encoding; consider pinning encoding="utf-8" here and in load()
    # together.
    corpus_file = target_dir / "corpus.json"
    corpus_json = json.dumps(self._corpus, ensure_ascii=False)
    corpus_file.write_text(corpus_json)

load classmethod

load(path: str | Path) -> BM25Index

Restore a previously saved BM25Index from path directory.

Source code in src/verifiable_rag/indexers/sparse/bm25.py
@classmethod
def load(cls, path: str | Path) -> BM25Index:
    """Rebuild a BM25Index from a directory previously written by save().

    The index is read from ``bm25`` (without its embedded corpus) and the
    corpus from ``corpus.json``. The instance is created with default
    constructor arguments, so a custom tokenizer or method is not
    restored.
    """
    bm25s = cls._import_bm25s()
    root = Path(path)

    restored = cls()
    restored._index = bm25s.BM25.load(str(root / "bm25"), load_corpus=False)

    corpus_text = (root / "corpus.json").read_text()
    restored._corpus = json.loads(corpus_text)
    return restored