Multimodal Asset Library

Build a single searchable library for images, video, audio, and text. "Find assets like this" becomes a query.

What you'll build

A unified table for mixed media types
File uploads for binary assets
Cross-modal search: describe what you want in text, find it in any format
Filtered retrieval by media type, tags, or metadata

Code

Setup

import requests
import json

# Connection settings: replace the placeholder values with your own.
API_URL = "https://api.deeplake.ai"
TOKEN = "YOUR_TOKEN"
WORKSPACE = "YOUR_WORKSPACE"
TABLE = "media_library"

# Credentials only: used for multipart uploads, where the HTTP client
# must choose the Content-Type itself.
auth_headers = {"Authorization": "Bearer " + TOKEN}
# JSON requests carry the same credentials plus an explicit content type.
headers = {**auth_headers, "Content-Type": "application/json"}

def query(sql, timeout=60):
    """Run a SQL statement against the workspace's query endpoint.

    Args:
        sql: SQL text, sent as the ``query`` field of the JSON request body.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.

    Note:
        The HTTP status code is not inspected here; callers see whatever
        JSON the server returned, including error payloads.
    """
    res = requests.post(
        f"{API_URL}/workspaces/{WORKSPACE}/tables/query",
        headers=headers,
        json={"query": sql},
        timeout=timeout,
    )
    return res.json()

Create the library table

One table for all media types. The media_type column distinguishes them.

# Create the shared asset table if it does not exist yet.
# Each row holds descriptive text (title, description), a media_type label,
# JSONB tags, a float4[] embedding, and the UUID of the uploaded binary file.
query(f"""
    CREATE TABLE IF NOT EXISTS "{WORKSPACE}"."{TABLE}" (
        id BIGSERIAL PRIMARY KEY,
        title TEXT,
        description TEXT,
        media_type TEXT,
        tags JSONB,
        embedding FLOAT4[],
        file_id UUID,
        created_at TIMESTAMPTZ DEFAULT NOW()
    ) USING deeplake
""")

Ingest mixed assets

def upload_file(file_path, mime_type):
    """Upload a local file to the workspace's file endpoint.

    Args:
        file_path: Path of the file to send; it is opened in binary mode.
        mime_type: Content type declared for the uploaded part.

    Returns:
        The ``id`` field of the JSON response when the server answers
        200 or 201, otherwise ``None``.
    """
    endpoint = f"{API_URL}/workspaces/{WORKSPACE}/files"
    # Only the last "/"-separated path component is sent as the filename.
    filename = file_path.rpartition("/")[2]
    with open(file_path, "rb") as handle:
        response = requests.post(
            endpoint,
            headers=auth_headers,
            files={"file": (filename, handle, mime_type)},
            timeout=120,
        )
    if response.status_code not in (200, 201):
        return None
    return response.json().get("id")

def ingest_asset(title, description, media_type, tags, file_path, mime_type):
    """Upload one asset and insert its metadata row into the library table.

    The binary is uploaded first; if that fails (no file id comes back)
    nothing is inserted and the function returns silently.

    Args:
        title: Human-readable asset title.
        description: Free-text description; this is the text that gets
            embedded, so search quality depends on it.
        media_type: Label stored in the ``media_type`` column
            (the examples use "image", "video", "audio").
        tags: JSON-serialisable value stored in the ``tags`` JSONB column.
        file_path: Local path of the binary to upload.
        mime_type: Content type passed to ``upload_file``.

    Returns:
        None.

    Note:
        ``embed_text`` is not defined in this file; supply your own encoder.
    """

    def sql_quote(value):
        # Double embedded single quotes so the value cannot terminate the
        # surrounding '...' SQL string literal.
        return str(value).replace("'", "''")

    # Upload binary
    file_id = upload_file(file_path, mime_type)
    if not file_id:
        return

    # Embed the description (text-based cross-modal search)
    emb = embed_text(description)  # your encoder
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

    # Every string value goes through the same quoting. The original left
    # title and media_type raw, so an apostrophe in either broke the INSERT.
    safe_title = sql_quote(title)
    safe_desc = sql_quote(description)
    safe_media_type = sql_quote(media_type)
    tags_json = sql_quote(json.dumps(tags))
    safe_file_id = sql_quote(file_id)

    query(f"""
        INSERT INTO "{WORKSPACE}"."{TABLE}" (title, description, media_type, tags, embedding, file_id)
        VALUES ('{safe_title}', '{safe_desc}', '{safe_media_type}',
                '{tags_json}'::jsonb, {emb_literal}, '{safe_file_id}'::uuid)
    """)

# Ingest one asset of each media type: an image, a video, and an audio clip.
ingest_asset(
    title="Beach sunset",
    description="Golden hour waves crashing on sandy beach",
    media_type="image",
    tags=["nature", "ocean", "golden-hour"],
    file_path="beach.jpg",
    mime_type="image/jpeg",
)

ingest_asset(
    title="Warehouse walkthrough",
    description="Camera moving through aisles of a logistics warehouse",
    media_type="video",
    tags=["industrial", "logistics", "indoor"],
    file_path="warehouse.mp4",
    mime_type="video/mp4",
)

ingest_asset(
    title="Rain ambience",
    description="Gentle rain falling on leaves with distant thunder",
    media_type="audio",
    tags=["ambience", "nature", "rain"],
    file_path="rain.wav",
    mime_type="audio/wav",
)

Create indexes

# Index over the embedding column (the column the search queries rank
# with the `<#>` operator).
# NOTE(review): the DESC qualifier is passed through as written; confirm its
# meaning against the Deeplake index documentation.
query(f"""
    CREATE INDEX IF NOT EXISTS idx_media_vec
    ON "{WORKSPACE}"."{TABLE}" USING deeplake_index (embedding DESC)
""")

# Second index over the description text, created with index_type = 'bm25'.
# No query shown in this section uses it; presumably it is there for
# keyword or hybrid search. TODO confirm.
query(f"""
    CREATE INDEX IF NOT EXISTS idx_media_bm25
    ON "{WORKSPACE}"."{TABLE}" USING deeplake_index (description)
    WITH (index_type = 'bm25')
""")

Search across modalities

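Describe what you want in natural language. The query text is embedded and compared against the embedding of every asset's description, so a single search covers images, video, and audio alike.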

def search_library(text, media_type=None, top_k=10):
    """Rank library assets against a natural-language query.

    The query text is embedded with ``embed_text`` (not defined in this
    file; supply your own encoder) and compared to each row's stored
    embedding with the ``<#>`` operator. Rows come back ordered by that
    score, descending.

    Args:
        text: Natural-language description of what to find.
        media_type: Optional value to match against the ``media_type``
            column; a falsy value disables the filter.
        top_k: Maximum number of rows to return.

    Returns:
        The ``rows`` list from the query response, or ``[]`` when the
        response has no ``rows`` key. Each row is
        (title, description, media_type, file_id, score).

    Raises:
        ValueError, TypeError: If ``top_k`` cannot be converted to an int.
    """
    emb = embed_text(text)
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

    # Coerce before interpolating so only a plain integer reaches the SQL.
    limit = int(top_k)

    where = "WHERE embedding IS NOT NULL"
    if media_type:
        # Double single quotes so the value stays inside its SQL literal.
        safe_media_type = str(media_type).replace("'", "''")
        where += f" AND media_type = '{safe_media_type}'"

    result = query(f"""
        SELECT title, description, media_type, file_id,
               embedding <#> {emb_literal} AS score
        FROM "{WORKSPACE}"."{TABLE}"
        {where}
        ORDER BY score DESC
        LIMIT {limit}
    """)
    return result.get("rows", [])

# Search all types (media_type stays None, so no type filter is added).
# Each call below rebinds `results`; inspect it before running the next one.
results = search_library("dramatic lighting on water")

# Search only video
results = search_library("indoor warehouse footage", media_type="video")

# Search only audio
results = search_library("rain sounds", media_type="audio")

Filter by tags

# Build the query embedding here: emb_literal only exists as a local inside
# the helper functions, so this snippet has to define its own.
tag_query_emb = embed_text("waves at sunset")  # your encoder
emb_literal = "ARRAY[" + ",".join(str(v) for v in tag_query_emb) + "]::float4[]"

# Keep only rows whose JSONB tags contain 'nature', then order them by
# their score against the query embedding, descending.
result = query(f"""
    SELECT title, description, file_id,
           embedding <#> {emb_literal} AS score
    FROM "{WORKSPACE}"."{TABLE}"
    WHERE tags ? 'nature'
    ORDER BY score DESC
    LIMIT 10
""")

"More like this"

Find assets similar to one you already have:

def more_like_this(asset_id, top_k=5):
    """Find the assets whose embeddings score highest against a given asset.

    Looks up the stored embedding of ``asset_id``, then ranks every other
    row against it with the ``<#>`` operator, ordered by score descending.

    Args:
        asset_id: Integer ``id`` of the reference row.
        top_k: Maximum number of rows to return.

    Returns:
        A list of (title, description, media_type, file_id, score) rows.
        Returns ``[]`` when the reference row does not exist or has no
        embedding stored.

    Raises:
        ValueError, TypeError: If ``asset_id`` or ``top_k`` cannot be
            converted to an int.
    """
    # Both values are spliced into SQL below; coercing to int guarantees
    # that nothing but a number can end up in the statement.
    asset_id = int(asset_id)
    limit = int(top_k)

    # Get the embedding of the reference asset
    ref = query(f'SELECT embedding FROM "{WORKSPACE}"."{TABLE}" WHERE id = {asset_id}')
    rows = ref.get("rows")
    if not rows:
        return []

    emb = rows[0][0]
    # The embedding column is nullable; without a vector there is nothing
    # to compare against.
    if emb is None or len(emb) == 0:
        return []
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

    result = query(f"""
        SELECT title, description, media_type, file_id,
               embedding <#> {emb_literal} AS score
        FROM "{WORKSPACE}"."{TABLE}"
        WHERE id != {asset_id}
        ORDER BY score DESC
        LIMIT {limit}
    """)
    return result.get("rows", [])

# "Show me more assets like asset #42"
# top_k is left at its default, so at most 5 rows come back.
similar = more_like_this(42)

What to try next

Image search — focused image workflow
Video retrieval — chunk-level video search
Hybrid RAG — combine semantic + keyword for precision