Skip to content

Hybrid RAG

Build a retrieval-augmented generation pipeline that combines keyword precision (BM25) with semantic understanding (vector search).

What you'll build

  • A knowledge base table with text content and embeddings
  • BM25 + vector indexes on the same table
  • Hybrid retrieval with tunable weights
  • A retrieve_context function ready to plug into any LLM

Why hybrid

| Problem | Mode | What happens |
| --- | --- | --- |
| "Semantic drift" | Vector only | Returns vaguely related results that miss the point |
| "Keyword brittleness" | BM25 only | Misses relevant results that use different words |
| Both handled | Hybrid | BM25 anchors precision, vector fills semantic gaps |

Code

Setup

import requests

# Deep Lake REST endpoint and credentials — replace the placeholder
# token/workspace values before running.
API_URL = "https://api.deeplake.ai"
TOKEN = "YOUR_TOKEN"
WORKSPACE = "YOUR_WORKSPACE"
TABLE = "knowledge_base"

# Shared headers for every call to the table-query endpoint.
headers = {
    "Authorization": f"Bearer {TOKEN}",
    "Content-Type": "application/json",
}

def query(sql):
    """Execute a SQL statement against the workspace and return the parsed JSON.

    Raises:
        requests.HTTPError: on a non-2xx response, so API failures surface
            immediately instead of being returned as an opaque error payload.
    """
    res = requests.post(
        f"{API_URL}/workspaces/{WORKSPACE}/tables/query",
        headers=headers,
        json={"query": sql},
        timeout=30,  # don't hang forever on a stalled connection
    )
    res.raise_for_status()
    return res.json()

def embed_texts(texts):
    """Embed a list of strings; returns one embedding vector per input text.

    Replace with your embedding provider.

    Raises:
        requests.HTTPError: on a non-2xx response from the embeddings API.
    """
    res = requests.post(
        "https://openrouter.ai/api/v1/embeddings",
        headers={
            "Authorization": "Bearer YOUR_KEY",
            "Content-Type": "application/json",
        },
        json={"model": "openai/text-embedding-3-small", "input": texts},
        timeout=30,  # fail fast instead of blocking indefinitely
    )
    res.raise_for_status()
    return [item["embedding"] for item in res.json()["data"]]

Create the knowledge base

# Create the knowledge base table (idempotent): plain-text columns plus a
# float4[] column holding one embedding vector per row.
query(f"""
    CREATE TABLE IF NOT EXISTS "{WORKSPACE}"."{TABLE}" (
        id BIGSERIAL PRIMARY KEY,
        title TEXT,
        content TEXT,
        source TEXT,
        embedding FLOAT4[]
    ) USING deeplake
""")

Ingest documents with embeddings

docs = [
    {"title": "Auth overview", "content": "JWT tokens expire after 24 hours. Refresh tokens last 30 days.", "source": "auth-docs"},
    {"title": "Rate limits", "content": "API rate limit is 1000 requests per minute per API key.", "source": "api-docs"},
    {"title": "Error codes", "content": "Error 429 means rate limit exceeded. Retry after the Retry-After header.", "source": "api-docs"},
    {"title": "SSO setup", "content": "SAML SSO requires configuring the IdP with our ACS URL and entity ID.", "source": "auth-docs"},
]

def _sql_quote(value):
    """Escape single quotes so *value* is safe inside a SQL string literal."""
    return value.replace("'", "''")

for doc in docs:
    # Embed one document at a time; embed_texts returns a list of vectors.
    emb = embed_texts([doc["content"]])[0]
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

    # Escape EVERY interpolated string, not just the content — a stray quote
    # in a title or source would otherwise break (or inject into) the INSERT.
    query(f"""
        INSERT INTO "{WORKSPACE}"."{TABLE}" (title, content, source, embedding)
        VALUES ('{_sql_quote(doc["title"])}', '{_sql_quote(doc["content"])}',
                '{_sql_quote(doc["source"])}', {emb_literal})
    """)

Create both indexes

# Vector index — enables approximate nearest-neighbor search on the
# embedding column.
query(f"""
    CREATE INDEX IF NOT EXISTS idx_kb_vec
    ON "{WORKSPACE}"."{TABLE}" USING deeplake_index (embedding DESC)
""")

# BM25 index — enables keyword (lexical) ranking on the content column;
# both indexes live on the same table, which is what makes hybrid
# scoring possible.
query(f"""
    CREATE INDEX IF NOT EXISTS idx_kb_bm25
    ON "{WORKSPACE}"."{TABLE}" USING deeplake_index (content)
    WITH (index_type = 'bm25')
""")

Hybrid retrieval

def retrieve_context(question, top_k=5, vector_weight=0.5):
    """Retrieve relevant documents with hybrid (BM25 + vector) search.

    Args:
        question: Natural-language query text.
        top_k: Maximum number of rows to return.
        vector_weight: Fraction of the score taken from vector similarity,
            in [0, 1]. The BM25 weight is the complement (1 - vector_weight).

    Returns:
        List of (title, content, source, score) rows, best match first.

    Raises:
        ValueError: if vector_weight falls outside [0, 1] — otherwise the
            complementary BM25 weight would silently go negative.
    """
    if not 0.0 <= vector_weight <= 1.0:
        raise ValueError("vector_weight must be between 0 and 1")
    emb = embed_texts([question])[0]
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"
    bm25_weight = 1.0 - vector_weight
    # Escape quotes so the question is safe inside the SQL string literal.
    safe_q = question.replace("'", "''")

    result = query(f"""
        SELECT title, content, source,
               (embedding, content) <#> deeplake_hybrid_record(
                   {emb_literal},
                   '{safe_q}',
                   {vector_weight}, {bm25_weight}
               ) AS score
        FROM "{WORKSPACE}"."{TABLE}"
        WHERE embedding IS NOT NULL
        ORDER BY score ASC
        LIMIT {top_k}
    """)
    return result.get("rows", [])

Use it

# Keyword-heavy query → lean toward BM25
results = retrieve_context("error 429 rate limit", vector_weight=0.3)

# Conceptual query → lean toward vector
results = retrieve_context("how do I keep users logged in longer", vector_weight=0.7)

# Balanced default
results = retrieve_context("authentication token expiry", vector_weight=0.5)

# Each row is (title, content, source, score); lower score = better match.
for title, content, source, score in results:
    print(f"[{source}] {title}: {content[:80]}... (score: {score:.4f})")

Plug into an LLM

context = retrieve_context("how long do tokens last")

# Format each retrieved row as "[source] title: content", separated by
# blank lines, for pasting into the prompt.
context_text = "\n\n".join(
    f"[{source}] {title}: {content}"
    for title, content, source, _score in context
)

prompt = f"""Answer the question using only the context below.

Context:
{context_text}

Question: How long do tokens last?
"""

# Send to your LLM of choice

Filter by source

Combine hybrid search with SQL WHERE:

# Build the query inputs explicitly — in the earlier examples emb_literal
# and safe_q are locals of retrieve_context, so they must be recomputed
# here for the snippet to run on its own.
question = "how do refresh tokens work"
emb = embed_texts([question])[0]
emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"
safe_q = question.replace("'", "''")

result = query(f"""
    SELECT title, content,
           (embedding, content) <#> deeplake_hybrid_record(
               {emb_literal}, '{safe_q}', 0.5, 0.5
           ) AS score
    FROM "{WORKSPACE}"."{TABLE}"
    WHERE source = 'auth-docs'
    ORDER BY score ASC
    LIMIT 5
""")

Weight tuning guide

| Query type | Vector weight | BM25 weight | Example |
| --- | --- | --- | --- |
| Exact identifiers | 0.2 | 0.8 | "error 429", "JWT refresh" |
| Conceptual | 0.8 | 0.2 | "keep users logged in" |
| Mixed | 0.5 | 0.5 | "authentication token expiry" |

Start with 0.5, 0.5. Adjust based on your retrieval quality.

What to try next