Skip to content

Faiss manager

Incorporating hybrid retrieval into the FAISS-based database manager requires combining dense (vector-based) and sparse (keyword or traditional text-based) retrieval methods. This refined approach allows better retrieval by leveraging both embedding similarity and term-based matching.

This example uses SentenceTransformer for vector embeddings and TfidfVectorizer (from scikit-learn) for sparse keyword-based retrieval.


Complete Code with Hybrid Retrieval

import faiss
import os
import numpy as np
import pickle
import uuid
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class ProductionHybridFaiss:
    """Hybrid (dense + sparse) document store built on FAISS and TF-IDF.

    Dense retrieval uses SentenceTransformer embeddings stored in a FAISS
    IVF-Flat index with inner-product metric; vectors are L2-normalized so
    inner product equals cosine similarity. Sparse retrieval uses a TF-IDF
    matrix with cosine similarity. Final scores are a weighted blend
    controlled by ``alpha`` in :meth:`search`.
    """

    def __init__(self,
                 embedding_model='all-MiniLM-L6-v2',
                 base_dir="faiss_dbs",
                 nlist=100):
        """
        Args:
            embedding_model: SentenceTransformer model name to load.
            base_dir: directory under which each named DB is persisted.
            nlist: number of IVF clusters for the coarse quantizer.
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.base_dir = base_dir
        self.nlist = nlist  # number of IVF clusters

        os.makedirs(self.base_dir, exist_ok=True)

        self.index = None          # faiss.IndexIVFFlat once created/loaded
        self.vectorizer = None     # TfidfVectorizer for sparse retrieval
        self.tfidf_matrix = None   # cached TF-IDF matrix over self.documents
        self.documents = []        # list of {"id", "text", "metadata"} dicts
        self.dimension = self.embedding_model.get_sentence_embedding_dimension()
        self.is_trained = False    # IVF index must be trained before add()
        # No TF-IDF matrix exists yet, so the sparse side is out of sync.
        self.sparse_dirty = True

    # ----------------------------
    # Utility Paths
    # ----------------------------
    def _paths(self, db_name):
        """Return (and create) the on-disk file paths for ``db_name``."""
        db_path = os.path.join(self.base_dir, db_name)
        os.makedirs(db_path, exist_ok=True)

        return {
            "faiss": os.path.join(db_path, "index.faiss"),
            "docs": os.path.join(db_path, "documents.pkl"),
            "vectorizer": os.path.join(db_path, "vectorizer.pkl"),
            "tfidf": os.path.join(db_path, "tfidf.pkl"),
        }

    # ----------------------------
    # Create Database
    # ----------------------------
    def create(self, db_name):
        """Initialize a fresh, empty database named ``db_name`` on disk."""
        quantizer = faiss.IndexFlatIP(self.dimension)
        self.index = faiss.IndexIVFFlat(
            quantizer,
            self.dimension,
            self.nlist,
            faiss.METRIC_INNER_PRODUCT
        )

        self.vectorizer = TfidfVectorizer()
        self.documents = []
        # Drop any TF-IDF matrix left over from a previously loaded DB;
        # otherwise search() could score against stale documents.
        self.tfidf_matrix = None
        self.is_trained = False
        self.sparse_dirty = True

        self._persist(db_name)
        print(f"Production DB '{db_name}' created.")

    # ----------------------------
    # Insert Documents
    # ----------------------------
    def insert(self, db_name, docs):
        """Embed, index, and persist a batch of documents.

        docs: list of dicts:
        {
            "text": "...",
            "metadata": {...}
        }
        A random UUID "id" is assigned to any document lacking one.
        """
        # Empty batch: nothing to encode/train; avoid encode([]) edge case.
        if not docs:
            return

        texts = [d["text"] for d in docs]

        embeddings = self.embedding_model.encode(
            texts,
            convert_to_numpy=True
        ).astype("float32")

        # Normalize so inner product == cosine similarity.
        faiss.normalize_L2(embeddings)

        # Train IVF on the first inserted batch.
        # NOTE(review): FAISS k-means wants at least ``nlist`` (ideally
        # ~39*nlist) training vectors; a small first batch will emit a
        # clustering warning and yield poor centroids — prefer a large
        # initial insert, or lower ``nlist``.
        if not self.is_trained:
            self.index.train(embeddings)
            self.is_trained = True

        self.index.add(embeddings)

        # Store documents, assigning ids where missing.
        for doc in docs:
            if "id" not in doc:
                doc["id"] = str(uuid.uuid4())
            self.documents.append(doc)

        self.sparse_dirty = True
        self._persist(db_name)

        print(f"{len(docs)} documents inserted.")

    # ----------------------------
    # Rebuild Sparse Index (Lazy)
    # ----------------------------
    def _rebuild_sparse(self):
        """Refit the TF-IDF matrix over the current document set."""
        texts = [d["text"] for d in self.documents]
        # fit_transform([]) raises ValueError; leave matrix absent instead.
        if not texts:
            self.tfidf_matrix = None
            self.sparse_dirty = False
            return
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        self.sparse_dirty = False

    # ----------------------------
    # Hybrid Search
    # ----------------------------
    def search(self, query, alpha=0.7, k=5):
        """Return the top-``k`` documents for ``query`` as (score, doc) pairs.

        Args:
            query: free-text query string.
            alpha: weight of the dense score; (1 - alpha) weights sparse.
            k: number of results to return (also the dense candidate count).
        """
        # Guard against search before create()/load(), and empty DBs.
        if self.index is None or self.index.ntotal == 0:
            return []

        # Dense search
        query_embedding = self.embedding_model.encode(
            [query],
            convert_to_numpy=True
        ).astype("float32")

        faiss.normalize_L2(query_embedding)

        self.index.nprobe = min(10, self.nlist)
        dense_scores, dense_indices = self.index.search(query_embedding, k)

        # Map document position -> dense score in one pass.
        # FAISS pads missing hits with index -1, which we must skip.
        dense_by_pos = {
            int(idx): float(score)
            for score, idx in zip(dense_scores[0], dense_indices[0])
            if idx != -1
        }

        # Sparse search (rebuild TF-IDF lazily if documents changed)
        if self.sparse_dirty:
            self._rebuild_sparse()

        if self.tfidf_matrix is not None:
            tfidf_query = self.vectorizer.transform([query])
            sparse_scores = cosine_similarity(tfidf_query, self.tfidf_matrix)[0]
        else:
            # No sparse index available; fall back to dense-only scoring.
            sparse_scores = np.zeros(len(self.documents), dtype="float32")

        # Combine scores: weighted blend of dense and sparse similarity.
        combined = []
        for i, doc in enumerate(self.documents):
            score = (alpha * dense_by_pos.get(i, 0.0)
                     + (1 - alpha) * float(sparse_scores[i]))
            combined.append((score, doc))

        combined.sort(key=lambda x: x[0], reverse=True)

        return combined[:k]

    # ----------------------------
    # Persist
    # ----------------------------
    def _persist(self, db_name):
        """Write the FAISS index, documents, vectorizer, and TF-IDF cache."""
        paths = self._paths(db_name)

        if self.index:
            faiss.write_index(self.index, paths["faiss"])

        with open(paths["docs"], "wb") as f:
            pickle.dump(self.documents, f)

        with open(paths["vectorizer"], "wb") as f:
            pickle.dump(self.vectorizer, f)

        if self.tfidf_matrix is not None:
            with open(paths["tfidf"], "wb") as f:
                pickle.dump(self.tfidf_matrix, f)

    # ----------------------------
    # Load
    # ----------------------------
    def load(self, db_name):
        """Restore a previously persisted database named ``db_name``.

        NOTE(review): this unpickles files from disk — only load databases
        from trusted locations (pickle can execute arbitrary code).
        """
        paths = self._paths(db_name)

        self.index = faiss.read_index(paths["faiss"])

        with open(paths["docs"], "rb") as f:
            self.documents = pickle.load(f)

        with open(paths["vectorizer"], "rb") as f:
            self.vectorizer = pickle.load(f)

        if os.path.exists(paths["tfidf"]):
            with open(paths["tfidf"], "rb") as f:
                self.tfidf_matrix = pickle.load(f)
            self.sparse_dirty = False
        else:
            # TF-IDF cache absent: mark dirty so search() rebuilds it,
            # instead of crashing on a None matrix.
            self.tfidf_matrix = None
            self.sparse_dirty = True

        self.dimension = self.index.d
        # Trust the index's own trained flag rather than assuming True.
        self.is_trained = bool(self.index.is_trained)

        print(f"Production DB '{db_name}' loaded.")

Key Enhancements:

  1. Hybrid Retrieval Support:
    • Dense embeddings (from SentenceTransformers).
    • Sparse vectors (TF-IDF-based keyword search).
    • Controlled by alpha (balance between dense and sparse scores).
  2. Text Storage:
    • Stores the raw documents (text plus metadata) in self.documents.
  3. TF-IDF Integration:
    • Uses TfidfVectorizer to compute sparse vectors and similarity.
  4. Combine Scores:
    • Combines dense and sparse scores into a weighted hybrid score.

Example Query:

For the query "How is similarity search performed?":

  • Dense retrieval may pull "FAISS is great for similarity search."
  • Sparse retrieval may surface "This hybrid model combines embeddings and keyword search."
  • Hybrid retrieval adjusts based on alpha.

This approach gives a powerful retrieval mechanism suitable for hybrid use cases.