Skip to main content

ChromaDB Integration

ChromaDB is our vector database for semantic food search. Let's set it up and create our retriever.

ChromaDB Basics

ChromaDB organizes data into collections (like tables). Each item in a collection has:

  • ID - Unique identifier
  • Document - The text content
  • Embedding - Vector representation (auto-generated or provided)
  • Metadata - Key-value pairs for filtering

Step 1: Create the Retriever

Create app/rag/retriever.py:

app/rag/retriever.py
import json
import chromadb
from chromadb.config import Settings

from app.core.config import settings

class FoodRetriever:
    """Retriever backed by a ChromaDB collection of food items."""

    def __init__(self) -> None:
        """Connect to the ChromaDB server and open the ``foods`` collection."""
        # Connect to ChromaDB server
        self.client = chromadb.HttpClient(
            host=settings.chroma_host,
            port=settings.chroma_port,
        )

        # Get or create our collection
        self.collection = self.client.get_or_create_collection(
            name="foods",
            metadata={"description": "Indian vegetarian food database"},
        )

    def add_foods(self, foods: list[dict]) -> int:
        """Add food items to the collection.

        Args:
            foods: Food records. Each one must provide ``id``, ``name``,
                ``description``, ``cuisine``, ``spice_level`` and
                ``meal_type`` (a list of strings); ``allergens``, ``region``,
                ``ingredients`` and the ``is_*`` flags are optional.

        Returns:
            The number of items upserted (0 when ``foods`` is empty).

        Raises:
            KeyError: If a record is missing one of the required fields.
        """
        ids = []
        documents = []
        metadatas = []

        for food in foods:
            ids.append(food["id"])

            # Create rich document for embedding
            documents.append(self._create_document(food))

            # Store metadata for filtering; list-valued fields are kept as
            # JSON strings and must be decoded with json.loads when read back
            metadatas.append({
                "name": food["name"],
                "cuisine": food["cuisine"],
                "spice_level": food["spice_level"],
                "meal_type": json.dumps(food["meal_type"]),
                "allergens": json.dumps(food.get("allergens", [])),
                "is_vegetarian": food.get("is_vegetarian", True),
                "is_vegan": food.get("is_vegan", False),
                "is_high_protein": food.get("is_high_protein", False),
                "is_low_carb": food.get("is_low_carb", False),
            })

        # Nothing to send: ChromaDB validates that the id list is non-empty,
        # so skip the round trip instead of surfacing a validation error
        if not ids:
            return 0

        # Upsert (insert or update)
        self.collection.upsert(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
        )

        return len(ids)

    def _create_document(self, food: dict) -> str:
        """Create a rich text representation for embedding.

        The result is one ``label: value`` line per attribute, joined with
        newlines. ``region`` falls back to ``India`` and ``ingredients`` to an
        empty list when absent.
        """
        parts = [
            f"{food['name']}: {food['description']}",
            f"Cuisine: {food['cuisine']}",
            f"Region: {food.get('region', 'India')}",
            f"Meal types: {', '.join(food['meal_type'])}",
            f"Spice level: {food['spice_level']}",
            f"Ingredients: {', '.join(food.get('ingredients', []))}",
        ]

        # Add health tags so they take part in semantic matching
        if food.get("is_high_protein"):
            parts.append("High protein dish")
        if food.get("is_low_carb"):
            parts.append("Low carb option")

        return "\n".join(parts)

Key Decisions

Why create a rich document?

The document is what gets embedded. More context = better semantic matching:

# Bad: Just the name
"Masala Dosa"

# Good: Rich context
"""
Masala Dosa: A crispy, savory crepe made from fermented rice batter
Cuisine: south_indian
Region: Karnataka
Meal types: breakfast, dinner
Spice level: medium
Ingredients: rice, urad dal, potato, onion, spices
"""

Why store metadata separately?

Metadata enables exact filtering that doesn't rely on embeddings:

# "Find vegan dishes" - metadata filter is exact
where={"is_vegan": True}

# "Find something spicy" - relies on embedding similarity
query_texts=["spicy food"]

Step 2: Add Search Method

Continue in app/rag/retriever.py:

app/rag/retriever.py (continued)
    def search(
self,
query: str,
cuisine: str = None,
spice_level: str = None,
meal_type: str = None,
exclude_allergens: list[str] = None,
dietary_type: str = None,
top_k: int = 5
) -> list[dict]:
"""Search for foods matching the query and filters."""

# Build where clause
where_conditions = []

if cuisine:
where_conditions.append({"cuisine": cuisine})

if spice_level:
where_conditions.append({"spice_level": spice_level})

if dietary_type == "vegan":
where_conditions.append({"is_vegan": True})

# Build the where clause
where = None
if len(where_conditions) == 1:
where = where_conditions[0]
elif len(where_conditions) > 1:
where = {"$and": where_conditions}

# Execute query
results = self.collection.query(
query_texts=[query],
n_results=top_k * 2, # Get extra, we'll filter more
where=where
)

# Post-filter for allergens (ChromaDB doesn't support $nin on arrays well)
foods = []
for i, id in enumerate(results["ids"][0]):
metadata = results["metadatas"][0][i]

# Check allergens
if exclude_allergens:
food_allergens = json.loads(metadata.get("allergens", "[]"))
if any(a in food_allergens for a in exclude_allergens):
continue

foods.append({
"id": id,
"name": metadata["name"],
"cuisine": metadata["cuisine"],
"spice_level": metadata["spice_level"],
"document": results["documents"][0][i],
"score": results["distances"][0][i] if results["distances"] else None
})

if len(foods) >= top_k:
break

return foods

def get_by_name(self, name: str) -> dict | None:
"""Get a specific food by name."""
results = self.collection.get(
where={"name": name},
limit=1
)

if not results["ids"]:
return None

return {
"id": results["ids"][0],
"name": results["metadatas"][0]["name"],
"document": results["documents"][0],
**results["metadatas"][0]
}

def get_count(self) -> int:
"""Get total number of foods in collection."""
return self.collection.count()


# Singleton instance shared by everything that imports this module.
# NOTE: instantiating FoodRetriever runs __init__, which creates the ChromaDB
# HTTP client and gets/creates the "foods" collection, so that work happens
# when this module is imported.
retriever = FoodRetriever()

Step 3: Understanding the Search Flow

User query: "high protein breakfast"

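Step 1: Build filters (from the explicit arguments; the query text itself is not parsed)
where = {"is_vegan": True}  # e.g. when called with dietary_type="vegan"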

Step 2: Query ChromaDB
query_texts=["high protein breakfast"]

ChromaDB internally:
1. Embeds "high protein breakfast" → [0.23, 0.45, ...]
2. Finds similar document embeddings
3. Applies metadata filter
4. Returns ranked results

Step 3: Post-filter allergens
Remove any dishes with user's allergens

Step 4: Return formatted results (score is the ChromaDB distance, so lower means a closer match)
[
{"name": "Pesarattu", "score": 0.21, ...},
{"name": "Sprouts Salad", "score": 0.34, ...}
]

Step 4: Export the Retriever

Create app/rag/__init__.py:

app/rag/__init__.py
from .retriever import retriever, FoodRetriever

__all__ = ["retriever", "FoodRetriever"]

Step 5: Create the Admin Ingest Endpoint

Update app/api/admin.py:

app/api/admin.py
import json
from pathlib import Path
from fastapi import APIRouter

from app.rag import retriever

router = APIRouter(tags=["admin"])

@router.post("/admin/ingest")
async def ingest_data():
    """Load food data into ChromaDB.

    Reads ``data/foods.json``, located relative to this file, and passes the
    parsed records to ``retriever.add_foods``.

    Returns:
        A dict with a human-readable ``message`` and ``total_in_db``, the
        collection size reported by the retriever after ingestion.
    """
    # Load from JSON file
    data_path = Path(__file__).parent.parent.parent / "data" / "foods.json"

    # Explicit encoding: without it the file is decoded with the platform's
    # locale default, which is not UTF-8 everywhere
    with data_path.open(encoding="utf-8") as f:
        foods = json.load(f)

    # Add to ChromaDB
    count = retriever.add_foods(foods)

    return {
        "message": f"Ingested {count} foods",
        "total_in_db": retriever.get_count(),
    }

@router.get("/admin/stats")
async def get_stats():
    """Get database statistics.

    Returns:
        A dict whose ``foods_count`` is the item count reported by the
        retriever.
    """
    return {
        "foods_count": retriever.get_count(),
    }

Testing the Retriever

After starting ChromaDB and ingesting data:

# Ingest food data
curl -X POST http://localhost:8080/api/admin/ingest

# Test search (once chat endpoint is built)
curl -X POST http://localhost:8080/api/chat \
-H "Content-Type: application/json" \
-d '{"message": "I want something spicy for breakfast"}'

ChromaDB Query Options

# Plain similarity search, capped at five results
results = collection.query(
    query_texts=["breakfast"],
    n_results=5,
)

# Similarity search restricted by one metadata field
results = collection.query(
    query_texts=["breakfast"],
    where={"cuisine": "south_indian"},
)

# Every condition must hold (AND)
results = collection.query(
    query_texts=["breakfast"],
    where={
        "$and": [
            {"cuisine": "south_indian"},
            {"spice_level": "mild"},
        ],
    },
)

# Any one condition is enough (OR)
results = collection.query(
    query_texts=["breakfast"],
    where={
        "$or": [
            {"cuisine": "south_indian"},
            {"cuisine": "north_indian"},
        ],
    },
)

# Choose which fields come back in the result
results = collection.query(
    query_texts=["breakfast"],
    include=["documents", "metadatas", "distances"],
)

Next, let's create the OpenAI client for generating responses.