Skip to content

Hybrid RAG

Build a retrieval-augmented generation pipeline that combines keyword precision (BM25) with semantic understanding (vector search).

What you'll build

  • A knowledge base table with text content and embeddings
  • BM25 + vector indexes on the same table
  • Hybrid retrieval with tunable weights
  • A retrieve_context function ready to plug into any LLM

Why hybrid

| Problem | Mode | What happens |
| --- | --- | --- |
| "Semantic drift" | Vector only | Returns vaguely related results that miss the point |
| "Keyword brittleness" | BM25 only | Misses relevant results that use different words |
| Both handled | Hybrid | BM25 anchors precision, vector fills semantic gaps |

Code

Setup

import requests

# Deep Lake REST endpoint and credentials — replace the placeholder
# token/workspace values before running.
API_URL = "https://api.deeplake.ai"
TOKEN = "YOUR_TOKEN"
WORKSPACE = "YOUR_WORKSPACE"
TABLE = "knowledge_base"

# Shared headers for every call to the table-query endpoint.
headers = {
    "Authorization": f"Bearer {TOKEN}",
    "Content-Type": "application/json",
}

def query(sql):
    """Execute a SQL statement against the workspace and return the parsed JSON.

    Raises:
        requests.HTTPError: on a non-2xx response, so API failures surface
            immediately instead of being returned as an opaque error payload.
    """
    res = requests.post(
        f"{API_URL}/workspaces/{WORKSPACE}/tables/query",
        headers=headers,
        json={"query": sql},
        timeout=30,  # don't hang forever on a stalled connection
    )
    res.raise_for_status()
    return res.json()

def embed_texts(texts):
    """Embed a list of strings; returns one embedding vector per input text.

    Replace with your embedding provider.

    Raises:
        requests.HTTPError: on a non-2xx response from the embeddings API.
    """
    res = requests.post(
        "https://openrouter.ai/api/v1/embeddings",
        headers={
            "Authorization": "Bearer YOUR_KEY",
            "Content-Type": "application/json",
        },
        json={"model": "openai/text-embedding-3-small", "input": texts},
        timeout=30,  # fail fast instead of blocking indefinitely
    )
    res.raise_for_status()
    return [item["embedding"] for item in res.json()["data"]]

Create the knowledge base

# Create the knowledge base table (idempotent): plain-text columns plus a
# float4[] column holding one embedding vector per row.
query(f"""
    CREATE TABLE IF NOT EXISTS "{WORKSPACE}"."{TABLE}" (
        id BIGSERIAL PRIMARY KEY,
        title TEXT,
        content TEXT,
        source TEXT,
        embedding FLOAT4[]
    ) USING deeplake
""")

Ingest documents with embeddings

docs = [
    {"title": "Auth overview", "content": "JWT tokens expire after 24 hours. Refresh tokens last 30 days.", "source": "auth-docs"},
    {"title": "Rate limits", "content": "API rate limit is 1000 requests per minute per API key.", "source": "api-docs"},
    {"title": "Error codes", "content": "Error 429 means rate limit exceeded. Retry after the Retry-After header.", "source": "api-docs"},
    {"title": "SSO setup", "content": "SAML SSO requires configuring the IdP with our ACS URL and entity ID.", "source": "auth-docs"},
]

def _sql_quote(value):
    """Escape single quotes so *value* is safe inside a SQL string literal."""
    return value.replace("'", "''")

for doc in docs:
    # Embed one document at a time; embed_texts returns a list of vectors.
    emb = embed_texts([doc["content"]])[0]
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

    # Escape EVERY interpolated string, not just the content — a stray quote
    # in a title or source would otherwise break (or inject into) the INSERT.
    query(f"""
        INSERT INTO "{WORKSPACE}"."{TABLE}" (title, content, source, embedding)
        VALUES ('{_sql_quote(doc["title"])}', '{_sql_quote(doc["content"])}',
                '{_sql_quote(doc["source"])}', {emb_literal})
    """)

Create both indexes

# Vector index — enables approximate nearest-neighbor search on the
# embedding column.
query(f"""
    CREATE INDEX IF NOT EXISTS idx_kb_vec
    ON "{WORKSPACE}"."{TABLE}" USING deeplake_index (embedding DESC)
""")

# BM25 index — enables keyword (lexical) ranking on the content column;
# both indexes live on the same table, which is what makes hybrid
# scoring possible.
query(f"""
    CREATE INDEX IF NOT EXISTS idx_kb_bm25
    ON "{WORKSPACE}"."{TABLE}" USING deeplake_index (content)
    WITH (index_type = 'bm25')
""")

Hybrid retrieval

def retrieve_context(question, top_k=5, vector_weight=0.5):
    """Retrieve relevant documents with hybrid (BM25 + vector) search.

    Args:
        question: Natural-language query text.
        top_k: Maximum number of rows to return.
        vector_weight: Fraction of the score taken from vector similarity,
            in [0, 1]. The BM25 weight is the complement (1 - vector_weight).

    Returns:
        List of (title, content, source, score) rows, best match first.

    Raises:
        ValueError: if vector_weight falls outside [0, 1] — otherwise the
            complementary BM25 weight would silently go negative.
    """
    if not 0.0 <= vector_weight <= 1.0:
        raise ValueError("vector_weight must be between 0 and 1")
    emb = embed_texts([question])[0]
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"
    bm25_weight = 1.0 - vector_weight
    # Escape quotes so the question is safe inside the SQL string literal.
    safe_q = question.replace("'", "''")

    result = query(f"""
        SELECT title, content, source,
               (embedding, content) <#> deeplake_hybrid_record(
                   {emb_literal},
                   '{safe_q}',
                   {vector_weight}, {bm25_weight}
               ) AS score
        FROM "{WORKSPACE}"."{TABLE}"
        WHERE embedding IS NOT NULL
        ORDER BY score ASC
        LIMIT {top_k}
    """)
    return result.get("rows", [])

Use it

# Keyword-heavy query → lean toward BM25
results = retrieve_context("error 429 rate limit", vector_weight=0.3)

# Conceptual query → lean toward vector
results = retrieve_context("how do I keep users logged in longer", vector_weight=0.7)

# Balanced default
results = retrieve_context("authentication token expiry", vector_weight=0.5)

# Each row is (title, content, source, score); lower score = better match.
for title, content, source, score in results:
    print(f"[{source}] {title}: {content[:80]}... (score: {score:.4f})")

Plug into an LLM

context = retrieve_context("how long do tokens last")

# Format each retrieved row as "[source] title: content", separated by
# blank lines, for pasting into the prompt.
context_text = "\n\n".join(
    f"[{source}] {title}: {content}"
    for title, content, source, _score in context
)

prompt = f"""Answer the question using only the context below.

Context:
{context_text}

Question: How long do tokens last?
"""

# Send to your LLM of choice

Filter by source

Combine hybrid search with SQL WHERE:

# Build the query inputs explicitly — in the earlier examples emb_literal
# and safe_q are locals of retrieve_context, so they must be recomputed
# here for the snippet to run on its own.
question = "how do refresh tokens work"
emb = embed_texts([question])[0]
emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"
safe_q = question.replace("'", "''")

result = query(f"""
    SELECT title, content,
           (embedding, content) <#> deeplake_hybrid_record(
               {emb_literal}, '{safe_q}', 0.5, 0.5
           ) AS score
    FROM "{WORKSPACE}"."{TABLE}"
    WHERE source = 'auth-docs'
    ORDER BY score ASC
    LIMIT 5
""")

Weight tuning guide

| Query type | Vector weight | BM25 weight | Example |
| --- | --- | --- | --- |
| Exact identifiers | 0.2 | 0.8 | "error 429", "JWT refresh" |
| Conceptual | 0.8 | 0.2 | "keep users logged in" |
| Mixed | 0.5 | 0.5 | "authentication token expiry" |

Start with 0.5, 0.5. Adjust based on your retrieval quality.

What to try next