Commit 99d091f

Added support for the semantic_text field and semantic query type
1 parent ec60616 commit 99d091f

4 files changed: 315 additions, 0 deletions

elasticsearch_dsl/field.py

Lines changed: 4 additions & 0 deletions
@@ -560,3 +560,7 @@ class TokenCount(Field):
 
 class Murmur3(Field):
     name = "murmur3"
+
+
+class SemanticText(Field):
+    name = "semantic_text"

elasticsearch_dsl/query.py

Lines changed: 4 additions & 0 deletions
@@ -527,6 +527,10 @@ class Shape(Query):
     name = "shape"
 
 
+class Semantic(Query):
+    name = "semantic"
+
+
 class SimpleQueryString(Query):
     name = "simple_query_string"
 
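
Registering the name "semantic" lets the query be built by name through the standard Search interface, which is exactly how the examples below use it. A minimal sketch (index and field names are illustrative):

import elasticsearch_dsl as dsl

s = dsl.Search(index="passages").query(
    "semantic", field="content", query="can I bring a bird to work?"
)

# The resulting request body is the plain semantic query:
# {"query": {"semantic": {"field": "content",
#                         "query": "can I bring a bird to work?"}}}
print(s.to_dict())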

examples/async/semantic_text.py

Lines changed: 154 additions & 0 deletions
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
# Semantic text example

Requirements:

$ pip install tqdm elasticsearch-dsl[async]

Before running this example, an ELSER inference endpoint must be created in the
Elasticsearch cluster. This can be done manually from Kibana, or with the
following curl command from a terminal, adjusting the endpoint as needed:

curl -X PUT \
  "$ELASTICSEARCH_URL/_inference/sparse_embedding/my-elser-endpoint" \
  -H "Content-Type: application/json" \
  -d '{"service":"elser","service_settings":{"num_allocations":1,"num_threads":1}}'

To run the example:

$ python semantic_text.py "text to search"

The index will be created automatically if it does not exist. Add
`--recreate-index` to regenerate it.

The example dataset includes a selection of workplace documents. The
following are good example queries to try out with this dataset:

$ python semantic_text.py "work from home"
$ python semantic_text.py "vacation time"
$ python semantic_text.py "can I bring a bird to work?"

When the index is created, the inference service will split the documents into
short passages, and for each passage a sparse embedding is generated using
Elastic's ELSER v2 model.

The documents that are returned as search results are those that have the
highest scored passages.
"""

import argparse
import asyncio
import json
import os
from datetime import datetime
from typing import Any, Optional
from urllib.request import urlopen

from tqdm import tqdm

import elasticsearch_dsl as dsl

DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"


class WorkplaceDoc(dsl.AsyncDocument):
    class Index:
        name = "workplace_documents_semantic"

    name: str
    summary: str
    content: Any = dsl.mapped_field(
        dsl.field.SemanticText(inference_id="my-elser-endpoint")
    )
    created: datetime
    updated: Optional[datetime]
    url: str = dsl.mapped_field(dsl.Keyword())
    category: str = dsl.mapped_field(dsl.Keyword())


async def create() -> None:
    # create the index
    await WorkplaceDoc._index.delete(ignore_unavailable=True)
    await WorkplaceDoc.init()

    # download the data
    dataset = json.loads(urlopen(DATASET_URL).read())

    # import the dataset
    for data in tqdm(dataset, desc="Indexing documents..."):
        doc = WorkplaceDoc(
            name=data["name"],
            summary=data["summary"],
            content=data["content"],
            created=data.get("created_on"),
            updated=data.get("updated_at"),
            url=data["url"],
            category=data["category"],
        )
        await doc.save()

    # refresh the index
    await WorkplaceDoc._index.refresh()


async def search(query: str) -> dsl.AsyncSearch[WorkplaceDoc]:
    return WorkplaceDoc.search()[:5].query(
        "semantic",
        field=WorkplaceDoc.content,
        query=query,
    )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Vector database with Elasticsearch")
    parser.add_argument(
        "--recreate-index", action="store_true", help="Recreate and populate the index"
    )
    parser.add_argument("query", action="store", help="The search query")
    return parser.parse_args()


async def main() -> None:
    args = parse_args()

    # initiate the default connection to elasticsearch
    dsl.async_connections.create_connection(hosts=[os.environ["ELASTICSEARCH_URL"]])

    if args.recreate_index or not await WorkplaceDoc._index.exists():
        await create()

    results = await search(args.query)

    async for hit in results:
        print(
            f"Document: {hit.name} [Category: {hit.category}] [Score: {hit.meta.score}]"
        )
        print(f"Content: {hit.content.text}")
        print("--------------------\n")

    # close the connection
    await dsl.async_connections.get_connection().close()


if __name__ == "__main__":
    asyncio.run(main())
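
The docstring's curl command can also be expressed in plain Python, which may be handy when provisioning the endpoint from a setup script. A minimal sketch using only the standard library (same endpoint name and settings as the curl command; cluster authentication is omitted and would need to be added for a secured deployment):

import json
import os
from urllib.request import Request, urlopen

# One-time setup: create the ELSER inference endpoint that the
# SemanticText field's inference_id refers to.
request = Request(
    os.environ["ELASTICSEARCH_URL"]
    + "/_inference/sparse_embedding/my-elser-endpoint",
    data=json.dumps(
        {
            "service": "elser",
            "service_settings": {"num_allocations": 1, "num_threads": 1},
        }
    ).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="PUT",
)
with urlopen(request) as response:
    print(response.status)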

examples/semantic_text.py

Lines changed: 153 additions & 0 deletions
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
# Semantic text example

Requirements:

$ pip install tqdm elasticsearch-dsl

Before running this example, an ELSER inference endpoint must be created in the
Elasticsearch cluster. This can be done manually from Kibana, or with the
following curl command from a terminal, adjusting the endpoint as needed:

curl -X PUT \
  "$ELASTICSEARCH_URL/_inference/sparse_embedding/my-elser-endpoint" \
  -H "Content-Type: application/json" \
  -d '{"service":"elser","service_settings":{"num_allocations":1,"num_threads":1}}'

To run the example:

$ python semantic_text.py "text to search"

The index will be created automatically if it does not exist. Add
`--recreate-index` to regenerate it.

The example dataset includes a selection of workplace documents. The
following are good example queries to try out with this dataset:

$ python semantic_text.py "work from home"
$ python semantic_text.py "vacation time"
$ python semantic_text.py "can I bring a bird to work?"

When the index is created, the inference service will split the documents into
short passages, and for each passage a sparse embedding is generated using
Elastic's ELSER v2 model.

The documents that are returned as search results are those that have the
highest scored passages.
"""

import argparse
import json
import os
from datetime import datetime
from typing import Any, Optional
from urllib.request import urlopen

from tqdm import tqdm

import elasticsearch_dsl as dsl

DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"


class WorkplaceDoc(dsl.Document):
    class Index:
        name = "workplace_documents_semantic"

    name: str
    summary: str
    content: Any = dsl.mapped_field(
        dsl.field.SemanticText(inference_id="my-elser-endpoint")
    )
    created: datetime
    updated: Optional[datetime]
    url: str = dsl.mapped_field(dsl.Keyword())
    category: str = dsl.mapped_field(dsl.Keyword())


def create() -> None:
    # create the index
    WorkplaceDoc._index.delete(ignore_unavailable=True)
    WorkplaceDoc.init()

    # download the data
    dataset = json.loads(urlopen(DATASET_URL).read())

    # import the dataset
    for data in tqdm(dataset, desc="Indexing documents..."):
        doc = WorkplaceDoc(
            name=data["name"],
            summary=data["summary"],
            content=data["content"],
            created=data.get("created_on"),
            updated=data.get("updated_at"),
            url=data["url"],
            category=data["category"],
        )
        doc.save()

    # refresh the index
    WorkplaceDoc._index.refresh()


def search(query: str) -> dsl.Search[WorkplaceDoc]:
    return WorkplaceDoc.search()[:5].query(
        "semantic",
        field=WorkplaceDoc.content,
        query=query,
    )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Vector database with Elasticsearch")
    parser.add_argument(
        "--recreate-index", action="store_true", help="Recreate and populate the index"
    )
    parser.add_argument("query", action="store", help="The search query")
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    # initiate the default connection to elasticsearch
    dsl.connections.create_connection(hosts=[os.environ["ELASTICSEARCH_URL"]])

    if args.recreate_index or not WorkplaceDoc._index.exists():
        create()

    results = search(args.query)

    for hit in results:
        print(
            f"Document: {hit.name} [Category: {hit.category}] [Score: {hit.meta.score}]"
        )
        print(f"Content: {hit.content.text}")
        print("--------------------\n")

    # close the connection
    dsl.connections.get_connection().close()


if __name__ == "__main__":
    main()
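
For orientation, the mapping that `WorkplaceDoc.init()` creates should resemble the sketch below; the exact JSON depends on the Elasticsearch and elasticsearch-dsl versions, so treat this as an assumption rather than a guaranteed output:

# Expected shape of the generated mapping (assumption, for orientation):
expected_mapping = {
    "properties": {
        "name": {"type": "text"},
        "summary": {"type": "text"},
        "content": {"type": "semantic_text", "inference_id": "my-elser-endpoint"},
        "created": {"type": "date"},
        "updated": {"type": "date"},
        "url": {"type": "keyword"},
        "category": {"type": "keyword"},
    }
}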
