Faiss manager
Incorporating hybrid retrieval into the FAISS-based database manager requires combining dense (vector-based) and sparse (keyword or traditional text-based) retrieval methods. This refined approach allows better retrieval by leveraging both embedding similarity and term-based matching.
This example uses SentenceTransformer for vector embeddings and TfidfVectorizer (from scikit-learn) for sparse keyword-based retrieval.
Complete Code with Hybrid Retrieval
import faiss
import os
import numpy as np
import pickle
import uuid
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class ProductionHybridFaiss:
    """Hybrid (dense + sparse) document store built on FAISS and TF-IDF.

    Dense retrieval uses SentenceTransformer embeddings in an IVF FAISS
    index (inner product over L2-normalized vectors, i.e. cosine
    similarity); sparse retrieval uses a TF-IDF matrix with cosine
    similarity. ``search`` blends the two scores with a weight ``alpha``.
    """

    def __init__(self,
                 embedding_model='all-MiniLM-L6-v2',
                 base_dir="faiss_dbs",
                 nlist=100):
        """
        Args:
            embedding_model: SentenceTransformer model name.
            base_dir: directory under which each named DB is persisted.
            nlist: number of IVF clusters for the FAISS index.
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.base_dir = base_dir
        self.nlist = nlist  # number of IVF clusters
        os.makedirs(self.base_dir, exist_ok=True)
        self.index = None          # faiss.IndexIVFFlat after create()/load()
        self.vectorizer = None     # TfidfVectorizer for sparse retrieval
        self.tfidf_matrix = None   # cached TF-IDF matrix over self.documents
        self.documents = []        # list of {"id", "text", "metadata"} dicts
        self.dimension = self.embedding_model.get_sentence_embedding_dimension()
        self.is_trained = False    # IVF index must be trained before add()
        self.sparse_dirty = False  # True when tfidf_matrix is stale

    # ----------------------------
    # Utility Paths
    # ----------------------------
    def _paths(self, db_name):
        """Return (and create the directory for) the on-disk paths of *db_name*."""
        db_path = os.path.join(self.base_dir, db_name)
        os.makedirs(db_path, exist_ok=True)
        return {
            "faiss": os.path.join(db_path, "index.faiss"),
            "docs": os.path.join(db_path, "documents.pkl"),
            "vectorizer": os.path.join(db_path, "vectorizer.pkl"),
            "tfidf": os.path.join(db_path, "tfidf.pkl"),
        }

    # ----------------------------
    # Create Database
    # ----------------------------
    def create(self, db_name):
        """Create a fresh, empty database named *db_name* and persist it."""
        quantizer = faiss.IndexFlatIP(self.dimension)
        self.index = faiss.IndexIVFFlat(
            quantizer,
            self.dimension,
            self.nlist,
            faiss.METRIC_INNER_PRODUCT
        )
        self.vectorizer = TfidfVectorizer()
        self.documents = []
        self.is_trained = False
        self.sparse_dirty = True
        self._persist(db_name)
        print(f"Production DB '{db_name}' created.")

    # ----------------------------
    # Insert Documents
    # ----------------------------
    def insert(self, db_name, docs):
        """
        Embed and add documents to the dense index; mark the sparse index stale.

        docs: list of dicts:
            {
                "text": "...",
                "metadata": {...}
            }
        A missing "id" key is filled in with a fresh UUID4.

        Raises:
            RuntimeError: if neither create() nor load() has been called.
        """
        if self.index is None:
            raise RuntimeError("Call create() or load() before insert().")
        if not docs:
            # Nothing to embed; faiss train()/add() reject empty input.
            return
        texts = [d["text"] for d in docs]
        embeddings = self.embedding_model.encode(
            texts,
            convert_to_numpy=True
        ).astype("float32")
        # Normalize so inner product == cosine similarity.
        faiss.normalize_L2(embeddings)
        # Train IVF on the first batch only. NOTE(review): if that batch has
        # fewer than nlist vectors, faiss warns about too few training points;
        # consider accumulating a larger first batch.
        if not self.is_trained:
            self.index.train(embeddings)
            self.is_trained = True
        self.index.add(embeddings)
        # Store documents, assigning ids where missing.
        for doc in docs:
            if "id" not in doc:
                doc["id"] = str(uuid.uuid4())
            self.documents.append(doc)
        self.sparse_dirty = True
        self._persist(db_name)
        print(f"{len(docs)} documents inserted.")

    # ----------------------------
    # Rebuild Sparse Index (Lazy)
    # ----------------------------
    def _rebuild_sparse(self):
        """Refit the TF-IDF vectorizer over all stored documents."""
        texts = [d["text"] for d in self.documents]
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        self.sparse_dirty = False

    # ----------------------------
    # Hybrid Search
    # ----------------------------
    def search(self, query, alpha=0.7, k=5):
        """
        Return the top-k documents for *query* as (score, doc) tuples.

        score = alpha * dense_cosine + (1 - alpha) * tfidf_cosine

        Args:
            query: query text.
            alpha: weight of the dense score, in [0, 1].
            k: number of results to return.
        """
        if self.index is None or self.index.ntotal == 0:
            return []
        # Dense search.
        query_embedding = self.embedding_model.encode(
            [query],
            convert_to_numpy=True
        ).astype("float32")
        faiss.normalize_L2(query_embedding)
        self.index.nprobe = min(10, self.nlist)
        dense_scores, dense_indices = self.index.search(query_embedding, k)
        # Map FAISS row id -> score once (O(k)) instead of scanning the result
        # list per document (O(n*k)). FAISS pads missing results with -1.
        dense_by_idx = {
            int(idx): float(score)
            for score, idx in zip(dense_scores[0], dense_indices[0])
            if idx != -1
        }
        # Sparse search (rebuild TF-IDF lazily after inserts).
        if self.sparse_dirty:
            self._rebuild_sparse()
        tfidf_query = self.vectorizer.transform([query])
        sparse_scores = cosine_similarity(tfidf_query, self.tfidf_matrix)[0]
        # Blend the two scores for every stored document.
        combined = []
        for i, doc in enumerate(self.documents):
            dense_score = dense_by_idx.get(i, 0.0)
            sparse_score = sparse_scores[i]
            score = alpha * dense_score + (1 - alpha) * sparse_score
            combined.append((score, doc))
        combined.sort(key=lambda x: x[0], reverse=True)
        return combined[:k]

    # ----------------------------
    # Persist
    # ----------------------------
    def _persist(self, db_name):
        """Write index, documents, vectorizer, and TF-IDF matrix to disk."""
        paths = self._paths(db_name)
        if self.index is not None:
            faiss.write_index(self.index, paths["faiss"])
        with open(paths["docs"], "wb") as f:
            pickle.dump(self.documents, f)
        with open(paths["vectorizer"], "wb") as f:
            pickle.dump(self.vectorizer, f)
        if self.tfidf_matrix is not None:
            with open(paths["tfidf"], "wb") as f:
                pickle.dump(self.tfidf_matrix, f)

    # ----------------------------
    # Load
    # ----------------------------
    def load(self, db_name):
        """Load a previously persisted database named *db_name*."""
        paths = self._paths(db_name)
        self.index = faiss.read_index(paths["faiss"])
        with open(paths["docs"], "rb") as f:
            self.documents = pickle.load(f)
        with open(paths["vectorizer"], "rb") as f:
            self.vectorizer = pickle.load(f)
        # Reset first so a previously loaded DB's matrix cannot leak through.
        self.tfidf_matrix = None
        if os.path.exists(paths["tfidf"]):
            with open(paths["tfidf"], "rb") as f:
                self.tfidf_matrix = pickle.load(f)
        self.dimension = self.index.d
        # Trust the index's own flag: a freshly created, never-inserted DB
        # persists an untrained IVF index.
        self.is_trained = self.index.is_trained
        # If the TF-IDF matrix file was missing, force a lazy rebuild.
        self.sparse_dirty = self.tfidf_matrix is None
        print(f"Production DB '{db_name}' loaded.")
Key Enhancements:
- Hybrid Retrieval Support:
  - Dense embeddings (from SentenceTransformers).
  - Sparse vectors (TF-IDF-based keyword search).
  - Controlled by `alpha` (balance between dense and sparse scores).
- Text Storage:
  - Stores raw document text and metadata in `self.documents`.
- TF-IDF Integration:
  - Uses `TfidfVectorizer` to compute sparse vectors and similarity.
- Combined Scores:
  - Combines dense and sparse scores into a weighted hybrid score.
Example Query:
For the query "How is similarity search performed?":
- Dense retrieval may pull "FAISS is great for similarity search."
- Sparse retrieval may surface "This hybrid model combines embeddings and keyword search."
- Hybrid retrieval adjusts the ranking based on `alpha`.
This approach gives a powerful retrieval mechanism suitable for hybrid use cases.