Retrieval to Training

Describe what you want in natural language. Retrieve the top-k matches. Train on that slice. No data export. No ETL pipeline.

What you'll build

  • A table with embeddings and file references
  • A retrieval query driven by a training objective
  • A download loop that pulls matched files
  • A training-ready dataset from a single SQL query

The idea

Traditional workflow:

Full dataset → export → filter → transform → train

Deep Lake workflow:

Training intent (text) → SQL query → top-k rows + files → train

Your training set is a query. Change the query, change the training set. Instantly reproducible.

Code

Setup

import os
import requests

# --- Connection settings -------------------------------------------------
# Fill these in for your own Deep Lake deployment.
API_URL = "https://api.deeplake.ai"
TOKEN = "YOUR_TOKEN"          # personal access token
WORKSPACE = "YOUR_WORKSPACE"  # workspace that owns the table
TABLE = "video_chunks"        # table holding embeddings + file references

# The query endpoint takes a JSON body, so it needs an explicit Content-Type.
headers = {
    "Authorization": f"Bearer {TOKEN}",
    "Content-Type": "application/json",
}
# File downloads send no body; only the auth header is required.
auth_headers = {"Authorization": f"Bearer {TOKEN}"}

def query(sql, timeout=30):
    """Run a SQL query against the workspace's query endpoint.

    Args:
        sql: The SQL statement to execute.
        timeout: Seconds to wait for the HTTP response (default 30).

    Returns:
        The decoded JSON response body (a dict).

    Raises:
        requests.HTTPError: If the server returns a non-2xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    res = requests.post(
        f"{API_URL}/workspaces/{WORKSPACE}/tables/query",
        headers=headers,
        json={"query": sql},
        timeout=timeout,  # without this, a stalled connection hangs forever
    )
    # Fail loudly on HTTP errors instead of trying to decode an error page
    # as if it were a query result.
    res.raise_for_status()
    return res.json()

Define training objectives as queries

# Each objective is a natural-language description of the data slice to
# train on; it gets embedded and matched against the table's embeddings.
training_objectives = [
    "robot arm picking up small objects from a table",
    "autonomous vehicle navigating through narrow corridors",
    "human-robot handover of tools in assembly line",
]

Retrieve matching data

def _embedding_sql_literal(emb):
    """Render an embedding as a SQL float4 array literal.

    Accepts either a single vector (list of floats) or a multi-vector
    (list of lists of floats) and emits the matching array cast.
    """
    if isinstance(emb[0], list):
        # Multi-vector: one ARRAY[...] per row, cast to a 2-D float array.
        inner = ", ".join(
            "ARRAY[" + ",".join(str(v) for v in row) + "]" for row in emb
        )
        return f"ARRAY[{inner}]::float4[][]"
    # Single-vector: a flat ARRAY[...], cast to a 1-D float array.
    return "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"


def retrieve_training_set(objective, top_k=20):
    """Retrieve the rows (and file references) matching a training objective.

    Args:
        objective: Natural-language description of the desired data slice.
        top_k: Maximum number of rows to return (default 20).

    Returns:
        A list of result rows shaped (id, file_id, metadata, score);
        empty if the query returned no rows.
    """
    emb = encode_text(objective)  # your encoder
    emb_literal = _embedding_sql_literal(emb)

    # Coerce top_k to int so a caller-supplied value cannot smuggle SQL
    # into the f-string below.
    top_k = int(top_k)

    result = query(f"""
        SELECT id, file_id, metadata,
               embedding <#> {emb_literal} AS score
        FROM "{WORKSPACE}"."{TABLE}"
        ORDER BY score DESC
        LIMIT {top_k}
    """)
    return result.get("rows", [])

Download matched files

def download_training_set(rows, output_dir="/tmp/training_data"):
    """Download the files referenced by query result rows.

    Args:
        rows: Result rows shaped (id, file_id, ...); rows with a falsy
            file_id are skipped.
        output_dir: Directory to write files into (created if missing).

    Returns:
        A list of local paths for the files that downloaded successfully.
        Failed downloads are skipped (best-effort), not raised.
    """
    os.makedirs(output_dir, exist_ok=True)
    paths = []

    for row in rows:
        row_id, file_id = row[0], row[1]
        if not file_id:
            # Row has no backing file (e.g. a metadata-only chunk) — skip.
            continue

        # Stream the response so large video files are written to disk in
        # chunks instead of being buffered whole in memory.
        res = requests.get(
            f"{API_URL}/workspaces/{WORKSPACE}/files/{file_id}/content",
            headers=auth_headers,
            timeout=60,  # don't hang forever on a stalled download
            stream=True,
        )
        if res.status_code == 200:
            path = os.path.join(output_dir, f"sample_{row_id}.mp4")
            with open(path, "wb") as f:
                for chunk in res.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
            paths.append(path)
        # Non-200 responses fall through silently, preserving the original
        # best-effort behavior of omitting failed files from the result.

    return paths

Put it together

# Full pipeline, once per objective: embed the text, retrieve the top
# matches, download the referenced files, then hand them to training.
for objective in training_objectives:
    print(f"\n--- Objective: {objective} ---")
    rows = retrieve_training_set(objective, top_k=10)
    paths = download_training_set(rows)
    print(f"Downloaded {len(paths)} files for training")

    # Feed paths to your training loop
    # train(paths, objective=objective)

Add SQL filters for precision

Narrow the retrieval with structured filters:

# Same vector retrieval as before, narrowed with structured predicates:
# a JSON lookup on the metadata column and a scalar range filter.
# `emb_literal` is the embedding literal built as in retrieve_training_set.
result = query(f"""
    SELECT id, file_id, metadata,
           embedding <#> {emb_literal} AS score
    FROM "{WORKSPACE}"."{TABLE}"
    WHERE metadata->>'camera' = 'cam1'
      AND start_s > 100.0
    ORDER BY score DESC
    LIMIT 20
""")

Reproducible training sets

Every training set is defined by a query. Log the query, reproduce the set:

# Everything needed to re-create this exact training set later: the
# natural-language objective, the table queried, the retrieval size, the
# structured filters, and when the query was run.
training_config = {
    "objective": "robot arm picking up small objects",
    "table": "video_chunks",
    "top_k": 20,
    "filters": {"camera": "cam1"},
    "timestamp": "2025-06-01T12:00:00Z",
}
# Save this config alongside your model checkpoint

What to try next