# Multimodal Asset Library
Build a single searchable library for images, video, audio, and text. "Find assets like this" becomes a query.
## What you'll build
- A unified table for mixed media types
- File uploads for binary assets
- Cross-modal search: describe what you want in text, find it in any format
- Filtered retrieval by media type, tags, or metadata
## Code
### Setup
import requests
import json
# Connection settings; the token and workspace values are placeholders.
API_URL = "https://api.deeplake.ai"
TOKEN = "YOUR_TOKEN"
WORKSPACE = "YOUR_WORKSPACE"
TABLE = "media_library"

# Token-only header set, used by the multipart file upload, which must not
# force a JSON content type.
auth_headers = dict(Authorization=f"Bearer {TOKEN}")

# JSON requests carry the same bearer token plus an explicit content type.
headers = {**auth_headers, "Content-Type": "application/json"}
def query(sql, timeout=120):
    """Send one SQL statement to the workspace query endpoint.

    Args:
        sql: The SQL text to execute.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.
            The default matches the timeout used for file uploads.

    Returns:
        The decoded JSON body of the response.
    """
    res = requests.post(
        f"{API_URL}/workspaces/{WORKSPACE}/tables/query",
        headers=headers,
        json={"query": sql},
        timeout=timeout,
    )
    return res.json()
### Create the library table
One table for all media types. The media_type column distinguishes them.
# Schema for the shared library table: descriptive text columns, a JSONB tag
# list, the embedding vector, and the id of the uploaded binary.
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS "{WORKSPACE}"."{TABLE}" (
id BIGSERIAL PRIMARY KEY,
title TEXT,
description TEXT,
media_type TEXT,
tags JSONB,
embedding FLOAT4[],
file_id UUID,
created_at TIMESTAMPTZ DEFAULT NOW()
) USING deeplake
"""
query(create_table_sql)
### Ingest mixed assets
def upload_file(file_path, mime_type):
    """Upload a binary file to the workspace and return its file id.

    Args:
        file_path: Path of the file to upload (``str`` or path-like).
        mime_type: MIME type sent with the multipart part, e.g. "image/jpeg".

    Returns:
        The "id" field of the JSON response when the server answers 200 or
        201, otherwise None.
    """
    import os  # local import keeps this snippet self-contained

    # basename() handles the platform's separators and path-like objects,
    # unlike splitting on "/" by hand.
    file_name = os.path.basename(file_path)

    with open(file_path, "rb") as f:
        res = requests.post(
            f"{API_URL}/workspaces/{WORKSPACE}/files",
            headers=auth_headers,
            files={"file": (file_name, f, mime_type)},
            timeout=120,
        )

    if res.status_code not in (200, 201):
        return None
    return res.json().get("id")
def ingest_asset(title, description, media_type, tags, file_path, mime_type):
    """Upload one asset's binary and insert its metadata row.

    The description is embedded so the asset can be found by text search
    regardless of its media type. ``embed_text`` is not defined in this
    snippet; supply your own encoder.

    Args:
        title: Short display title.
        description: Free-text description; this is the text that gets embedded.
        media_type: Value stored in the media_type column, e.g. "image".
        tags: JSON-serialisable tag list stored in the JSONB column.
        file_path: Path of the binary to upload.
        mime_type: MIME type of the binary.

    Returns:
        None. Nothing is inserted when the upload does not yield a file id.
    """

    def sql_quote(value):
        # Double embedded single quotes so the value stays inside its SQL literal.
        return str(value).replace("'", "''")

    # Upload binary
    file_id = upload_file(file_path, mime_type)
    if not file_id:
        return

    # Embed the description (text-based cross-modal search)
    emb = embed_text(description)  # your encoder
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

    # Escape every string value, not just the description: a title such as
    # "Director's cut" would otherwise terminate the literal early.
    safe_title = sql_quote(title)
    safe_desc = sql_quote(description)
    safe_media_type = sql_quote(media_type)
    tags_json = sql_quote(json.dumps(tags))
    safe_file_id = sql_quote(file_id)

    query(f"""
        INSERT INTO "{WORKSPACE}"."{TABLE}" (title, description, media_type, tags, embedding, file_id)
        VALUES ('{safe_title}', '{safe_desc}', '{safe_media_type}',
                '{tags_json}'::jsonb, {emb_literal}, '{safe_file_id}'::uuid)
    """)
# Ingest different media types
# Each entry is (title, description, media_type, tags, file_path, mime_type).
sample_assets = [
    (
        "Beach sunset", "Golden hour waves crashing on sandy beach",
        "image", ["nature", "ocean", "golden-hour"],
        "beach.jpg", "image/jpeg",
    ),
    (
        "Warehouse walkthrough", "Camera moving through aisles of a logistics warehouse",
        "video", ["industrial", "logistics", "indoor"],
        "warehouse.mp4", "video/mp4",
    ),
    (
        "Rain ambience", "Gentle rain falling on leaves with distant thunder",
        "audio", ["ambience", "nature", "rain"],
        "rain.wav", "audio/wav",
    ),
]
for asset_args in sample_assets:
    ingest_asset(*asset_args)
### Create indexes
# Index on the embedding column, used by the similarity queries.
vector_index_sql = f"""
CREATE INDEX IF NOT EXISTS idx_media_vec
ON "{WORKSPACE}"."{TABLE}" USING deeplake_index (embedding DESC)
"""

# BM25 index on the description text.
bm25_index_sql = f"""
CREATE INDEX IF NOT EXISTS idx_media_bm25
ON "{WORKSPACE}"."{TABLE}" USING deeplake_index (description)
WITH (index_type = 'bm25')
"""

# Run the statements in the same order: vector index first, then BM25.
for index_sql in (vector_index_sql, bm25_index_sql):
    query(index_sql)
### Search across modalities
Describe what you want in natural language. The query runs against every media type in the table unless you pass a `media_type` filter.
def search_library(text, media_type=None, top_k=10):
    """Rank library assets by similarity to a text query.

    ``embed_text`` is not defined in this snippet; supply your own encoder.

    Args:
        text: Natural-language description of what to find.
        media_type: Optional media_type value to restrict the search to,
            e.g. "video". When falsy, all media types are searched.
        top_k: Maximum number of rows to return.

    Returns:
        The "rows" list from the query response, or an empty list when the
        response has no "rows" key.
    """
    emb = embed_text(text)
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

    # Rows without an embedding cannot be scored, so they are excluded.
    where = "WHERE embedding IS NOT NULL"
    if media_type:
        # Double embedded quotes so the filter value cannot break out of its literal.
        safe_media_type = str(media_type).replace("'", "''")
        where += f" AND media_type = '{safe_media_type}'"

    # Coerce to int so only a number can ever reach the LIMIT clause.
    limit = int(top_k)

    result = query(f"""
        SELECT title, description, media_type, file_id,
               embedding <#> {emb_literal} AS score
        FROM "{WORKSPACE}"."{TABLE}"
        {where}
        ORDER BY score DESC
        LIMIT {limit}
    """)
    return result.get("rows", [])
# No media_type filter: every asset in the library is a candidate.
results = search_library(text="dramatic lighting on water")

# Restrict the candidates to video assets.
results = search_library(text="indoor warehouse footage", media_type="video")

# Restrict the candidates to audio assets.
results = search_library(text="rain sounds", media_type="audio")
### Filter by tags
# Build the query vector here: emb_literal is only a local variable inside
# the search functions, so referencing it at module level raises NameError.
# The query text is an example; replace it with your own.
query_text = "dramatic lighting on water"
emb = embed_text(query_text)  # your encoder
emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

# Keep only assets tagged 'nature' and skip rows without an embedding,
# which cannot be scored.
result = query(f"""
    SELECT title, description, file_id,
           embedding <#> {emb_literal} AS score
    FROM "{WORKSPACE}"."{TABLE}"
    WHERE tags ? 'nature' AND embedding IS NOT NULL
    ORDER BY score DESC
    LIMIT 10
""")
### "More like this"
Find assets similar to one you already have:
def more_like_this(asset_id, top_k=5):
    """Find the assets most similar to an existing asset.

    Args:
        asset_id: Integer id of the reference asset.
        top_k: Maximum number of rows to return.

    Returns:
        The "rows" list from the similarity query. An empty list is returned
        when the reference asset does not exist, has no embedding, or the
        response has no "rows" key.

    Raises:
        ValueError: if asset_id or top_k cannot be converted to an integer.
    """
    # Coerce to int so only numbers can ever reach the SQL text.
    ref_id = int(asset_id)
    limit = int(top_k)

    # Get the embedding of the reference asset
    ref = query(f'SELECT embedding FROM "{WORKSPACE}"."{TABLE}" WHERE id = {ref_id}')
    rows = ref.get("rows")
    if not rows:
        return []

    emb = rows[0][0]
    # A stored row may have a NULL or empty embedding; there is nothing to
    # compare against in that case.
    if emb is None or len(emb) == 0:
        return []
    emb_literal = "ARRAY[" + ",".join(str(v) for v in emb) + "]::float4[]"

    # Exclude the reference itself and rows without an embedding, which
    # cannot be scored.
    result = query(f"""
        SELECT title, description, media_type, file_id,
               embedding <#> {emb_literal} AS score
        FROM "{WORKSPACE}"."{TABLE}"
        WHERE id != {ref_id} AND embedding IS NOT NULL
        ORDER BY score DESC
        LIMIT {limit}
    """)
    return result.get("rows", [])
# Example: fetch the nearest neighbours of asset #42 (default top_k).
similar = more_like_this(asset_id=42)
## What to try next
- Image search — focused image workflow
- Video retrieval — chunk-level video search
- Hybrid RAG — combine semantic + keyword for precision