Commit cb0147b

Updated vector examples to use the punkt_tab tokenizer
1 parent 6bd7aa3 commit cb0147b

File tree

4 files changed: +12 −12 lines changed


examples/async/sparse_vectors.py

Lines changed: 3 additions & 3 deletions
@@ -67,7 +67,7 @@
 from typing import Any, Dict, List, Optional
 from urllib.request import urlopen

-import nltk  # type: ignore
+from nltk.tokenize import PunktTokenizer  # type: ignore
 from tqdm import tqdm

 from elasticsearch_dsl import (
@@ -84,7 +84,7 @@
 DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"

 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+tok = PunktTokenizer()


 class Passage(InnerDoc):
@@ -110,7 +110,7 @@ class Index:

     def clean(self) -> None:
         # split the content into sentences
-        passages = nltk.sent_tokenize(self.content)
+        passages = tok.tokenize(self.content)

         # generate an embedding for each passage and save it as a nested document
         for passage in passages:
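The same three-line substitution repeats in the remaining three files below: the module-level nltk.download("punkt", quiet=True) call becomes a single PunktTokenizer instance, and nltk.sent_tokenize(...) becomes tok.tokenize(...). As a standalone illustration of the new API, here is a minimal sketch, assuming NLTK 3.9 or later (where PunktTokenizer and the punkt_tab data package are available); the sample text and the explicit punkt_tab download are illustrative assumptions, not part of the files being changed:

# Minimal sketch of punkt_tab-based sentence splitting (assumes nltk >= 3.9).
import nltk
from nltk.tokenize import PunktTokenizer

# The punkt_tab parameters may still need to be fetched once per environment;
# this download call is an illustrative assumption, not part of the diff above.
nltk.download("punkt_tab", quiet=True)

tok = PunktTokenizer()  # loads the English punkt_tab parameters by default

text = "Sparse vectors keep only the salient terms. Each sentence becomes one passage."
for sentence in tok.tokenize(text):
    print(sentence)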

examples/async/vectors.py

Lines changed: 3 additions & 3 deletions
@@ -51,7 +51,7 @@
 from typing import Any, List, Optional, cast
 from urllib.request import urlopen

-import nltk  # type: ignore
+from nltk.tokenize import PunktTokenizer  # type: ignore
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm

@@ -70,7 +70,7 @@
 MODEL_NAME = "all-MiniLM-L6-v2"

 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+tok = PunktTokenizer()

 # this will be the embedding model
 embedding_model: Any = None
@@ -103,7 +103,7 @@ def get_embedding(cls, input: str) -> List[float]:

     def clean(self) -> None:
         # split the content into sentences
-        passages = cast(List[str], nltk.sent_tokenize(self.content))
+        passages = cast(List[str], tok.tokenize(self.content))

         # generate an embedding for each passage and save it as a nested document
         for passage in passages:

examples/sparse_vectors.py

Lines changed: 3 additions & 3 deletions
@@ -66,7 +66,7 @@
 from typing import Any, Dict, List, Optional
 from urllib.request import urlopen

-import nltk  # type: ignore
+from nltk.tokenize import PunktTokenizer  # type: ignore
 from tqdm import tqdm

 from elasticsearch_dsl import (
@@ -83,7 +83,7 @@
 DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"

 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+tok = PunktTokenizer()


 class Passage(InnerDoc):
@@ -109,7 +109,7 @@ class Index:

     def clean(self) -> None:
         # split the content into sentences
-        passages = nltk.sent_tokenize(self.content)
+        passages = tok.tokenize(self.content)

         # generate an embedding for each passage and save it as a nested document
         for passage in passages:

examples/vectors.py

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@
 from typing import Any, List, Optional, cast
 from urllib.request import urlopen

-import nltk  # type: ignore
+from nltk.tokenize import PunktTokenizer  # type: ignore
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm

@@ -69,7 +69,7 @@
 MODEL_NAME = "all-MiniLM-L6-v2"

 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+tok = PunktTokenizer()

 # this will be the embedding model
 embedding_model: Any = None
@@ -102,7 +102,7 @@ def get_embedding(cls, input: str) -> List[float]:

     def clean(self) -> None:
         # split the content into sentences
-        passages = cast(List[str], nltk.sent_tokenize(self.content))
+        passages = cast(List[str], tok.tokenize(self.content))

         # generate an embedding for each passage and save it as a nested document
         for passage in passages:
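One practical note for anyone running the updated examples: PunktTokenizer() resolves its parameters from the punkt_tab data package rather than the old punkt pickle, and construction raises a LookupError when that data is absent. A small defensive sketch under that assumption (the ensure_punkt_tab helper is hypothetical, not part of these examples):

import nltk
from nltk.tokenize import PunktTokenizer


def ensure_punkt_tab(lang: str = "english") -> PunktTokenizer:
    # Hypothetical helper: construct the tokenizer, downloading punkt_tab on first use.
    try:
        return PunktTokenizer(lang)
    except LookupError:
        # punkt_tab parameters are not installed yet; fetch them once, then retry.
        nltk.download("punkt_tab", quiet=True)
        return PunktTokenizer(lang)


tok = ensure_punkt_tab()
print(tok.tokenize("First sentence. Second sentence."))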

0 commit comments
