
Commit 4f92c8f

Add project for the article "Building a recipe search with Elasticsearch" (#350)

* This PR contains the scripts used for the article "Building a recipe search with Elasticsearch"
* Fix code
* Fix reformatted files
1 parent 0c4d06f commit 4f92c8f

File tree

9 files changed: +12663 −0 lines changed

supporting-blog-content/building-a-recipe-search-with-elasticsearch/README.md

Lines changed: 66 additions & 0 deletions
# Building a Recipe Search with Elasticsearch

This project demonstrates how to implement a semantic search using Elastic's ELSER and compare its results with a traditional lexical search. The setup is made practical and efficient by using a cluster created in Elastic Cloud, which simplifies the use of ELSER and accelerates development.

> **Tip:** To learn more about Elastic Cloud and how to use it, visit: [https://www.elastic.co/pt/cloud](https://www.elastic.co/pt/cloud)

## Project Objectives

1. **Configure the Elasticsearch infrastructure** to support semantic and lexical search indexes.
2. **Ingest data**: Use Python scripts to populate the indexes with grocery product data.
3. **Compare search types**: Perform searches and display the results for comparison.

## Prerequisites

- **Elasticsearch v8.15** (recommended): Required to support ELSER.
- **Python 3.x**: Required to run the ingestion and search scripts.
- **Python libraries**: The required libraries are listed in the `requirements.txt` file.

To install the dependencies, use the following command:

```bash
pip install -r requirements.txt
```
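
The scripts connect to Elastic Cloud through a `config.yml` file read by `elasticsearch_connection.py`. A minimal sketch of that file, with placeholder values for the two keys the connection class expects:

```yaml
# Placeholder values; use the Cloud ID and API key of your own deployment.
cloud_id: "deployment-name:abcdefg..."
api_key: "your-api-key"
```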

## Creating the Indexes

To create the semantic and lexical search indexes, run the following scripts:

### Semantic Index

```bash
python infra.py
```

### Lexical Index

```bash
python infra_lexical_index.py
```

These scripts automatically configure the indexes in Elasticsearch.

## Data Ingestion

To ingest the grocery product data into the indexes, use the commands below:

### Ingest Data into the Semantic Index

```bash
python ingestion.py
```

### Ingest Data into the Lexical Index

```bash
python ingestion_lexical_index.py
```

## Search

To perform searches and obtain results from both the semantic and lexical searches, run the following command:

```bash
python search.py
```

This script performs searches in both indexes and displays the results in the console, making it easy to compare the two approaches. Note that the semantic and lexical scores come from different scoring functions (ELSER relevance vs. BM25), so compare the rankings rather than the raw score values.
supporting-blog-content/building-a-recipe-search-with-elasticsearch/elasticsearch_connection.py

Lines changed: 25 additions & 0 deletions
import yaml
from elasticsearch import Elasticsearch, AsyncElasticsearch


class ElasticsearchConnection:
    """Creates Elasticsearch clients from the credentials in config.yml."""

    def __init__(self, config_file="config.yml"):
        with open(config_file, "r") as f:
            config = yaml.safe_load(f)
        self.client = Elasticsearch(
            cloud_id=config["cloud_id"], api_key=config["api_key"]
        )

    def get_client(self):
        return self.client

    def get_async_client(self):
        # The async client gets a longer request timeout to accommodate
        # bulk ingestion with ELSER inference.
        with open("config.yml", "r") as f:
            config = yaml.safe_load(f)
        self.client = AsyncElasticsearch(
            cloud_id=config["cloud_id"],
            api_key=config["api_key"],
            request_timeout=240,
        )
        return self.client

supporting-blog-content/building-a-recipe-search-with-elasticsearch/files/output.json

Lines changed: 12301 additions & 0 deletions
Large diffs are not rendered by default.
supporting-blog-content/building-a-recipe-search-with-elasticsearch/infra.py

Lines changed: 44 additions & 0 deletions
from elasticsearch_connection import ElasticsearchConnection

client = ElasticsearchConnection().get_client()


def create_index_embedding():
    # Semantic index: "description" is copied into a semantic_text field
    # that is embedded by the ELSER endpoint at ingest time.
    response = client.indices.create(
        index="grocery-catalog-elser",
        mappings={
            "properties": {
                "id": {"type": "integer"},
                "name": {"type": "text"},
                "description": {"type": "text", "copy_to": "description_embedding"},
                "category": {"type": "keyword"},
                "brand": {"type": "keyword"},
                "price": {"type": "float"},
                "unit": {"type": "keyword"},
                "description_embedding": {
                    "type": "semantic_text",
                    "inference_id": "elser_embeddings",
                },
            }
        },
    )
    print(response)


def create_inference():
    # Registers the ELSER sparse-embedding endpoint referenced by the mapping above.
    response = client.inference.put(
        inference_id="elser_embeddings",
        task_type="sparse_embedding",
        body={
            "service": "elser",
            "service_settings": {"num_allocations": 1, "num_threads": 1},
        },
    )
    print(response)


if __name__ == "__main__":
    # The inference endpoint must exist before the index that references it.
    create_inference()
    create_index_embedding()
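
Once `infra.py` has run, both the inference endpoint and the semantic index should exist. A minimal sketch of a sanity check, assuming the `elasticsearch_connection` helper above and a Python client version that exposes the inference APIs:

```python
from elasticsearch_connection import ElasticsearchConnection

client = ElasticsearchConnection().get_client()

# Both calls should succeed once infra.py has completed.
print(client.inference.get(inference_id="elser_embeddings"))
print(client.indices.exists(index="grocery-catalog-elser"))
```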
supporting-blog-content/building-a-recipe-search-with-elasticsearch/infra_lexical_index.py

Lines changed: 25 additions & 0 deletions
from elasticsearch_connection import ElasticsearchConnection

client = ElasticsearchConnection().get_client()


def create_index():
    # Lexical index: the same fields as the semantic index, minus the
    # embedding field. (The original "copy_to": "description_embedding"
    # pointed at a field that does not exist in this mapping, so it is
    # dropped here.)
    response = client.indices.create(
        index="grocery-catalog",
        mappings={
            "properties": {
                "id": {"type": "integer"},
                "name": {"type": "text"},
                "description": {"type": "text"},
                "category": {"type": "keyword"},
                "brand": {"type": "keyword"},
                "price": {"type": "float"},
                "unit": {"type": "keyword"},
            }
        },
    )
    print(response)


if __name__ == "__main__":
    create_index()
supporting-blog-content/building-a-recipe-search-with-elasticsearch/ingestion.py

Lines changed: 47 additions & 0 deletions
import asyncio
import json

from elasticsearch import helpers

from elasticsearch_connection import ElasticsearchConnection

async_client = ElasticsearchConnection().get_async_client()


def partition_list(lst, chunk_size):
    # Split a list into chunks of at most chunk_size items.
    return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]


async def index_data():
    with open("files/output.json", "r") as file:
        data_json = json.load(file)

    # Wrap each record as a bulk action targeting the semantic index.
    documents = [
        {"_index": "grocery-catalog-elser", "_source": doc} for doc in data_json
    ]

    # Index in batches of 500 documents.
    partitions = partition_list(documents, 500)
    for i, partition in enumerate(partitions):
        print(f"partition {i + 1}")
        await async_bulk_indexing(async_client, partition)


async def async_bulk_indexing(client, documents):
    success, failed = await helpers.async_bulk(client, documents)
    print(
        f"Successfully indexed {success} documents. Failed to index {failed} documents."
    )


async def main():
    await index_data()
    await async_client.close()  # avoid leaving the aiohttp session open


asyncio.run(main())
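
For reference, a hypothetical example of the record shape `files/output.json` is assumed to contain, with field names taken from the mapping in `infra.py` (the values here are illustrative, not from the actual dataset):

```python
# Hypothetical record; only the field names are grounded in the mapping.
example_doc = {
    "id": 1,
    "name": "Atlantic Salmon Fillet",
    "description": "Fresh salmon fillet, ideal for grilling or baking.",
    "category": "Seafood",
    "brand": "SeaHarvest",
    "price": 12.99,
    "unit": "lb",
}
```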
supporting-blog-content/building-a-recipe-search-with-elasticsearch/ingestion_lexical_index.py

Lines changed: 47 additions & 0 deletions
import asyncio
import json

from elasticsearch import helpers

from elasticsearch_connection import ElasticsearchConnection

async_client = ElasticsearchConnection().get_async_client()


def partition_list(lst, chunk_size):
    return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]


async def index_data():
    # Mirrors ingestion.py, but targets the lexical index.
    with open("files/output.json", "r") as file:
        data_json = json.load(file)

    documents = [{"_index": "grocery-catalog", "_source": doc} for doc in data_json]

    partitions = partition_list(documents, 500)
    for i, partition in enumerate(partitions):
        print(f"partition {i + 1}")
        await async_bulk_indexing(async_client, partition)


async def async_bulk_indexing(client, documents):
    success, failed = await helpers.async_bulk(client, documents)
    print(
        f"Successfully indexed {success} documents. Failed to index {failed} documents."
    )


async def main():
    await index_data()
    await async_client.close()


asyncio.run(main())
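
After both ingestion scripts finish, the two indexes should hold the same number of documents. A minimal sketch of a sanity check, assuming the same connection helper:

```python
from elasticsearch_connection import ElasticsearchConnection

client = ElasticsearchConnection().get_client()

# Refresh so freshly ingested documents are visible to count and search.
for index in ["grocery-catalog-elser", "grocery-catalog"]:
    client.indices.refresh(index=index)
    print(index, client.count(index=index)["count"])
```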
supporting-blog-content/building-a-recipe-search-with-elasticsearch/requirements.txt

Lines changed: 4 additions & 0 deletions
elasticsearch
aiohttp
pyyaml
pandas
supporting-blog-content/building-a-recipe-search-with-elasticsearch/search.py

Lines changed: 104 additions & 0 deletions
import pandas as pd

from elasticsearch_connection import ElasticsearchConnection

es_client = ElasticsearchConnection().get_client()

term = "seafood for grilling"
size = 5


def format_text(description, line_length=120):
    # Truncate a description to at most line_length words.
    words = description.split()
    if len(words) <= line_length:
        return description
    return " ".join(words[:line_length]) + "..."


def search_semantic(term):
    # Semantic search against the ELSER-backed index.
    result = []
    response = es_client.search(
        index="grocery-catalog-elser",
        size=size,
        source_excludes="description_embedding",
        query={"semantic": {"field": "description_embedding", "query": term}},
    )

    for hit in response["hits"]["hits"]:
        result.append(
            {
                "score": hit["_score"],
                "name": format_text(hit["_source"]["name"], line_length=10),
                "description": format_text(hit["_source"]["description"]),
            }
        )
    return result


def search_lexical(term):
    # Lexical (BM25) search. Fixed: this now targets the plain
    # "grocery-catalog" index rather than the ELSER index, so the two
    # approaches are actually compared across the two indexes.
    result = []
    response = es_client.search(
        index="grocery-catalog",
        size=size,
        query={"multi_match": {"query": term, "fields": ["name", "description"]}},
    )

    for hit in response["hits"]["hits"]:
        result.append(
            {
                "score": hit["_score"],
                "name": format_text(hit["_source"]["name"], line_length=10),
                "description": hit["_source"]["description"],
            }
        )
    return result


if __name__ == "__main__":
    rs1 = search_semantic(term)
    rs2 = search_lexical(term)

    # Build one DataFrame per search type; fall back to an empty frame
    # when a search returns no hits.
    df1 = (
        pd.DataFrame(rs1)[["name", "score"]]
        if rs1
        else pd.DataFrame(columns=["name", "score"])
    )
    df1["Search Type"] = "Semantic"

    df2 = (
        pd.DataFrame(rs2)[["name", "score"]]
        if rs2
        else pd.DataFrame(columns=["name", "score"])
    )
    df2["Search Type"] = "Lexical"

    tabela = pd.concat([df1, df2], axis=0).reset_index(drop=True)
    tabela = tabela[["Search Type", "name", "score"]]
    tabela.columns = ["Search Type", "Name", "Score"]

    # Pad columns for readable console output.
    tabela["Name"] = tabela["Name"].astype(str).str.ljust(15)
    tabela["Score"] = tabela["Score"].astype(str).str.ljust(5)

    print(tabela.to_string(index=False))
