source code: Add Multimodal RAG with Elasticsearch Gotham City tutorial #390

Merged (25 commits) on Feb 28, 2025

Commits
a843f13
Add Multimodal RAG with Elasticsearch Gotham City tutorial
salgado Feb 8, 2025
e47c5e7
Add Multimodal RAG with Elasticsearch Gotham City tutorial
salgado Feb 8, 2025
1557fb2
docs: add OpenAI API key setup instructions
salgado Feb 10, 2025
39674b2
docs: exclude licence
salgado Feb 10, 2025
d2b1b19
fix: fixed comments
salgado Feb 10, 2025
47d6240
docs: added env template
salgado Feb 10, 2025
3748cef
issues fixed 1st review
salgado Feb 14, 2025
fc0f06a
foo
codefromthecrypt Feb 25, 2025
4182f10
foo
codefromthecrypt Feb 25, 2025
76475fa
polish-and-docker
codefromthecrypt Feb 26, 2025
3244b2a
env-example
codefromthecrypt Feb 26, 2025
1217ef6
env-example
codefromthecrypt Feb 26, 2025
55904f1
fix glitch
codefromthecrypt Feb 26, 2025
e24ca5b
remove spurios log
codefromthecrypt Feb 26, 2025
fc2b80d
Add Jupyter notebook implementation of Multimodal RAG
salgado Feb 27, 2025
e381c57
Add Jupyter notebook implementation of Multimodal RAG
salgado Feb 27, 2025
312baa4
Add Jupyter notebook implementation of Multimodal RAG
salgado Feb 27, 2025
c90ba3d
Update documentation with simpler README and Docker setup guide
salgado Feb 27, 2025
36cd475
adding changes from review
JessicaGarson Feb 27, 2025
6866d48
Update 01-mmrag-blog-quick-start.ipynb
JessicaGarson Feb 27, 2025
d34ab33
remove wrong folder
salgado Feb 27, 2025
112c8fa
Remove Docker configuration files
salgado Feb 27, 2025
d7f2472
remove coker references
salgado Feb 27, 2025
a3abfb2
fixing first line notebook to test branch
salgado Feb 28, 2025
be1a03b
fixing first line notebook to main repo
salgado Feb 28, 2025
@@ -0,0 +1,8 @@
# Ignore everything
**

# Allow specific files and directories
!requirements.txt
!data/
!src/
!stages/
@@ -0,0 +1,36 @@
# Use non-slim image due to OS dependencies of python packages. This gives us
# git, build-essential, libglib2 (opencv) and gomp (torchaudio).
FROM python:3.12

COPY /requirements.txt .

# Our python requirements have some OS dependencies beyond the base layer:
#
# * imagebind pulls in cartopy which has OS dependencies on geos and proj
# * opencv has a runtime OS dependency on libgl1-mesa-glx
#
# The dev dependencies are installed temporarily to compile the wheels.
# We leave only the runtime dependencies to keep the image smaller.
RUN apt-get update && \
    # install build and runtime dependencies
    apt-get install -y --no-install-recommends \
        libgeos-dev \
        libproj-dev \
        libgeos-c1v5 \
        libproj25 \
        libgl1-mesa-glx && \
    # Install everything except xformers first
    grep -v "\bxformers\b" requirements.txt > /tmp/r.txt && pip install -r /tmp/r.txt && \
    # Now, install xformers, as it should be able to see torch now
    grep "\bxformers\b" requirements.txt > /tmp/r.txt && pip install -r /tmp/r.txt && \
    # remove build dependencies
    apt-get purge -y libgeos-dev libproj-dev && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app
RUN mkdir -p ./data ./src ./stages
COPY ./data ./data
COPY ./src ./src
COPY ./stages ./stages

@@ -0,0 +1,80 @@
# Building a Multimodal RAG Pipeline with Elasticsearch: The Story of Gotham City

This repository contains the code for implementing a Multimodal Retrieval-Augmented Generation (RAG) system using Elasticsearch. The system processes and analyzes different types of evidence (images, audio, text, and depth maps) to solve a crime in Gotham City.

## Overview

The pipeline demonstrates how to:
- Generate unified embeddings for multiple modalities using ImageBind
- Store and search vectors efficiently in Elasticsearch
- Analyze evidence using GPT-4 to generate forensic reports

## Prerequisites

- A Docker runtime with 8GB+ of free RAM
- A GPU is optional, but recommended
- Elasticsearch cluster (cloud or local)
- OpenAI API key - Set up an OpenAI account and create a [secret key](https://platform.openai.com/docs/quickstart)

## Quick Start

This example runs four stages as Docker Compose services:

```mermaid
graph TD
verify-file-structure --> generate-embeddings
generate-embeddings --> index-content
index-content --> search-and-analyze
```

First, copy [env.example](env.example) to `.env` and fill in values noted inside.
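
As a reference for filling it in, the connection variables read by `src/elastic_manager.py` are `ELASTICSEARCH_URL` plus either `ELASTICSEARCH_USER`/`ELASTICSEARCH_PASSWORD` or `ELASTICSEARCH_API_KEY`; `OPENAI_API_KEY` is assumed here as the standard variable the OpenAI client reads. A minimal `.env` might look like this (all values are placeholders):

```
# Elasticsearch connection: URL plus either basic auth or an API key
ELASTICSEARCH_URL=https://localhost:9200
ELASTICSEARCH_USER=elastic
ELASTICSEARCH_PASSWORD=changeme
# ELASTICSEARCH_API_KEY=...

# OpenAI key for the GPT-4 analysis stage (assumed variable name)
OPENAI_API_KEY=sk-...
```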

Now, run the following command to execute the full pipeline:
```bash
docker compose run --build --rm search-and-analyze
```

The first run takes a while, as it builds the image and downloads the ImageBind weights.

If you want to re-run just one stage, add `--no-deps` like this:
```bash
docker compose run --no-deps --build --rm search-and-analyze
```

## Project Structure

```
├── README.md
├── requirements.txt
├── src/
│   ├── embedding_generator.py   # ImageBind wrapper
│   ├── elastic_manager.py       # Elasticsearch operations
│   └── llm_analyzer.py          # GPT-4 integration
├── stages/
│   ├── 01-stage/                # File organization
│   ├── 02-stage/                # Embedding generation
│   ├── 03-stage/                # Elasticsearch indexing/search
│   └── 04-stage/                # Evidence analysis
└── data/                        # Sample data
    ├── images/
    ├── audios/
    ├── texts/
    └── depths/
```

## Sample Data

The repository includes sample evidence files:
- Images: Crime scene photos and security camera footage
- Audio: Suspicious sound recordings
- Text: Mysterious notes and riddles
- Depth Maps: 3D scene captures

## How It Works

1. **Evidence Collection**: Files are organized by modality in the `data/` directory
2. **Embedding Generation**: ImageBind converts each piece of evidence into a 1024-dimensional vector (see the sketch after this list)
3. **Vector Storage**: Elasticsearch stores embeddings with metadata for efficient retrieval
4. **Similarity Search**: New evidence is compared against the database using k-NN search
5. **Analysis**: GPT-4 analyzes the connections between evidence to identify suspects
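
As a rough illustration of step 2, the snippet below shows what producing a 1024-dimensional embedding looks like with the upstream ImageBind API. The project's own `src/embedding_generator.py` wrapper is not shown in this part of the diff, so treat the exact calls and the file path as an approximation rather than the repository's code:

```python
import torch
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained imagebind_huge checkpoint (a >4GB download on first run)
model = imagebind_model.imagebind_huge(pretrained=True).eval().to(device)

# Embed a single piece of visual evidence (illustrative path)
inputs = {
    ModalityType.VISION: data.load_and_transform_vision_data(
        ["data/images/crime_scene1.jpg"], device
    )
}
with torch.no_grad():
    embeddings = model(inputs)

vector = embeddings[ModalityType.VISION][0]  # a 1024-dimensional tensor
```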

(Binary sample evidence files, i.e. the images, audio recordings, and depth maps under data/, are not rendered here.)
@@ -0,0 +1,8 @@
Why so serious?

The show has just begun and you're already running
While clowns are dancing and the city's stunning
In the abandoned theater, a surprise awaits
Come play with me before it's too late!

HAHAHAHAHA!
@@ -0,0 +1,15 @@
PRELIMINARY REPORT - GCPD
Date: 01/28/2025
Time: 22:30

Incident: Break-in and Vandalism
Location: Gotham Central Bank
Evidence Found:
- Playing cards scattered
- Smile graffiti on walls
- Suspicious audio recording
- Witnesses report maniacal laughter

Status: Under Investigation
Priority Level: MAXIMUM
Primary Suspect: Unknown (possible Joker involvement)
@@ -0,0 +1,16 @@
HAHAHA!

Dear Detective,

In a city of endless night, a new game unfolds
Where chaos reigns and fear takes hold
I left a gift at Gotham Central Bank
Time's ticking, your mind goes blank

The clues are there, scattered with care
Each laugh echoes everywhere
Midnight strikes, you won't catch me
In Gotham's heart, chaos runs free!

With a smile,
?
@@ -0,0 +1,5 @@
Incident Log:
1. Gotham Central Bank - 22:15 - Alarm triggered
2. Monarch Theater - 22:45 - Suspicious laughter reported
3. Abandoned Amusement Park - 23:00 - Strange lights
4. Ace Chemical Plant - 23:30 - Suspicious movement
@@ -0,0 +1,61 @@
name: gotham-city-crime-analysis

services:
  verify-file-structure:
    build:
      context: .
    container_name: verify-file-structure
    restart: 'no'  # one-shot stage; no need to re-verify the file structure
    env_file:
      - .env
    command: python stages/01-stage/files_check.py
    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"

  generate-embeddings:
    depends_on:
      verify-file-structure:
        condition: service_completed_successfully
    build:
      context: .
    container_name: generate-embeddings
    restart: 'no'  # one-shot stage; no need to re-generate embeddings
    env_file:
      - .env
    command: python stages/02-stage/test_embedding_generation.py
    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"
    volumes:
      - torch-checkpoints:/root/cache/torch/checkpoints/

  index-content:
    depends_on:
      generate-embeddings:
        condition: service_completed_successfully
    build:
      context: .
    container_name: index-content
    restart: 'no'  # one-shot stage; no need to re-index content
    env_file:
      - .env
    command: python stages/03-stage/index_all_modalities.py
    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"

  search-and-analyze:
    depends_on:
      index-content:
        condition: service_completed_successfully
    build:
      context: .
    container_name: search-and-analyze
    restart: 'no'  # one-shot stage; re-run manually when needed
    env_file:
      - .env
    command: python stages/04-stage/rag_crime_analyze.py
    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"

volumes:
  # Avoid re-downloading a >4GB model checkpoint
  torch-checkpoints:
@@ -0,0 +1,15 @@
elasticsearch~=8.17.1
torch~=2.6.0
torchvision~=0.21.0
torchaudio~=2.6.0
imagebind @ git+https://github.com/hkchengrex/ImageBind.git
openai~=1.64.0
python-dotenv~=1.0.1
numpy~=2.1.3
pillow~=11.1.0
opencv-python~=4.11.0
librosa~=0.10.2
matplotlib~=3.10.0
wheel~=0.45.1
setuptools
xformers~=0.0.29
@@ -0,0 +1,110 @@
from elasticsearch import Elasticsearch, helpers
import base64
import os
from dotenv import load_dotenv
import numpy as np


class ElasticsearchManager:
    """Manages multimodal operations in Elasticsearch"""

    def __init__(self):
        load_dotenv()  # Load variables from .env
        self.es = self._connect_elastic()
        self.index_name = "multimodal_content"
        self._setup_index()

    def _connect_elastic(self):
        """Connects to Elasticsearch"""
        ELASTICSEARCH_URL = os.getenv("ELASTICSEARCH_URL")
        ELASTICSEARCH_USER = os.getenv("ELASTICSEARCH_USER")
        ELASTICSEARCH_PASSWORD = os.getenv("ELASTICSEARCH_PASSWORD")
        ELASTICSEARCH_API_KEY = os.getenv("ELASTICSEARCH_API_KEY")

        if ELASTICSEARCH_USER:
            return Elasticsearch(
                hosts=[ELASTICSEARCH_URL],
                basic_auth=(ELASTICSEARCH_USER, ELASTICSEARCH_PASSWORD),
            )
        elif ELASTICSEARCH_API_KEY:
            return Elasticsearch(
                hosts=[ELASTICSEARCH_URL], api_key=ELASTICSEARCH_API_KEY
            )
        else:
            raise ValueError(
                "Please provide either ELASTICSEARCH_USER or ELASTICSEARCH_API_KEY"
            )

    def _setup_index(self):
        """Sets up the index if it doesn't exist"""
        if not self.es.indices.exists(index=self.index_name):
            mapping = {
                "mappings": {
                    "properties": {
                        "embedding": {
                            "type": "dense_vector",
                            "dims": 1024,
                            "index": True,
                            "similarity": "cosine",
                        },
                        "modality": {"type": "keyword"},
                        "content": {"type": "binary"},
                        "description": {"type": "text"},
                        "metadata": {"type": "object"},
                        "content_path": {"type": "text"},
                    }
                }
            }
            self.es.indices.create(index=self.index_name, body=mapping)

    def index_content(
        self,
        embedding,
        modality,
        content=None,
        description="",
        metadata=None,
        content_path=None,
    ):
        """Indexes multimodal content"""
        doc = {
            "embedding": embedding.tolist(),
            "modality": modality,
            "description": description,
            "metadata": metadata or {},
            "content_path": content_path,
        }

        if content:
            doc["content"] = (
                base64.b64encode(content).decode()
                if isinstance(content, bytes)
                else content
            )

        return self.es.index(index=self.index_name, document=doc)

    def search_similar(self, query_embedding, modality=None, k=5):
        """Searches for similar contents"""
        query = {
            "knn": {
                "field": "embedding",
                "query_vector": query_embedding.tolist(),
                "k": k,
                "num_candidates": 100,
                "filter": [{"term": {"modality": modality}}] if modality else [],
            }
        }

        try:
            response = self.es.search(index=self.index_name, query=query, size=k)

            # Return both source data and score for each hit
            return [
                {**hit["_source"], "score": hit["_score"]}
                for hit in response["hits"]["hits"]
            ]

        except Exception as e:
            print(f"Error: processing search_evidence: {str(e)}")
            return "Error generating search evidence"
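
For orientation, here is a small usage sketch of the class above, run outside the staged pipeline. The import path follows the README's project structure, the embedding is a random stand-in for a real ImageBind vector, and the description and file path are illustrative only:

```python
import numpy as np

from src.elastic_manager import ElasticsearchManager  # path per the README's project structure

manager = ElasticsearchManager()  # loads .env and creates the index if missing

# Random 1024-dim vector standing in for a real ImageBind embedding
fake_embedding = np.random.rand(1024).astype(np.float32)

manager.index_content(
    embedding=fake_embedding,
    modality="text",
    description="Mysterious note found at Gotham Central Bank",
    content_path="data/texts/riddle.txt",  # illustrative path
)

# k-NN search restricted to the "text" modality
for hit in manager.search_similar(fake_embedding, modality="text", k=3):
    print(hit["score"], hit.get("description"))
```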