Commit cb0147b

Updated vector examples to use the punkt_tab tokenizer
1 parent 6bd7aa3 commit cb0147b

File tree

4 files changed: +12 −12 lines changed


examples/async/sparse_vectors.py

Lines changed: 3 additions & 3 deletions
@@ -67,7 +67,7 @@
 from typing import Any, Dict, List, Optional
 from urllib.request import urlopen

-import nltk  # type: ignore
+from nltk.tokenize import PunktTokenizer  # type: ignore
 from tqdm import tqdm

 from elasticsearch_dsl import (
@@ -84,7 +84,7 @@
 DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"

 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+tok = PunktTokenizer()


 class Passage(InnerDoc):
@@ -110,7 +110,7 @@ class Index:

     def clean(self) -> None:
         # split the content into sentences
-        passages = nltk.sent_tokenize(self.content)
+        passages = tok.tokenize(self.content)

         # generate an embedding for each passage and save it as a nested document
         for passage in passages:
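The same three-line substitution repeats in the remaining three files below: the module-level nltk.download("punkt", quiet=True) call becomes a single PunktTokenizer instance, and nltk.sent_tokenize(...) becomes tok.tokenize(...). As a standalone illustration of the new API, here is a minimal sketch, assuming NLTK 3.9 or later (where PunktTokenizer and the punkt_tab data package are available); the sample text and the explicit punkt_tab download are illustrative assumptions, not part of the files being changed:

# Minimal sketch of punkt_tab-based sentence splitting (assumes nltk >= 3.9).
import nltk
from nltk.tokenize import PunktTokenizer

# The punkt_tab parameters may still need to be fetched once per environment;
# this download call is an illustrative assumption, not part of the diff above.
nltk.download("punkt_tab", quiet=True)

tok = PunktTokenizer()  # loads the English punkt_tab parameters by default

text = "Sparse vectors keep only the salient terms. Each sentence becomes one passage."
for sentence in tok.tokenize(text):
    print(sentence)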

examples/async/vectors.py

Lines changed: 3 additions & 3 deletions
@@ -51,7 +51,7 @@
 from typing import Any, List, Optional, cast
 from urllib.request import urlopen

-import nltk  # type: ignore
+from nltk.tokenize import PunktTokenizer  # type: ignore
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm

@@ -70,7 +70,7 @@
 MODEL_NAME = "all-MiniLM-L6-v2"

 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+tok = PunktTokenizer()

 # this will be the embedding model
 embedding_model: Any = None
@@ -103,7 +103,7 @@ def get_embedding(cls, input: str) -> List[float]:

     def clean(self) -> None:
         # split the content into sentences
-        passages = cast(List[str], nltk.sent_tokenize(self.content))
+        passages = cast(List[str], tok.tokenize(self.content))

         # generate an embedding for each passage and save it as a nested document
         for passage in passages:

examples/sparse_vectors.py

Lines changed: 3 additions & 3 deletions
@@ -66,7 +66,7 @@
 from typing import Any, Dict, List, Optional
 from urllib.request import urlopen

-import nltk  # type: ignore
+from nltk.tokenize import PunktTokenizer  # type: ignore
 from tqdm import tqdm

 from elasticsearch_dsl import (
@@ -83,7 +83,7 @@
 DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"

 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+tok = PunktTokenizer()


 class Passage(InnerDoc):
@@ -109,7 +109,7 @@ class Index:

     def clean(self) -> None:
         # split the content into sentences
-        passages = nltk.sent_tokenize(self.content)
+        passages = tok.tokenize(self.content)

         # generate an embedding for each passage and save it as a nested document
         for passage in passages:

examples/vectors.py

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@
 from typing import Any, List, Optional, cast
 from urllib.request import urlopen

-import nltk  # type: ignore
+from nltk.tokenize import PunktTokenizer  # type: ignore
 from sentence_transformers import SentenceTransformer
 from tqdm import tqdm

@@ -69,7 +69,7 @@
 MODEL_NAME = "all-MiniLM-L6-v2"

 # initialize sentence tokenizer
-nltk.download("punkt", quiet=True)
+tok = PunktTokenizer()

 # this will be the embedding model
 embedding_model: Any = None
@@ -102,7 +102,7 @@ def get_embedding(cls, input: str) -> List[float]:

     def clean(self) -> None:
         # split the content into sentences
-        passages = cast(List[str], nltk.sent_tokenize(self.content))
+        passages = cast(List[str], tok.tokenize(self.content))

         # generate an embedding for each passage and save it as a nested document
         for passage in passages:
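One practical note for anyone running the updated examples: PunktTokenizer() resolves its parameters from the punkt_tab data package rather than the old punkt pickle, and construction raises a LookupError when that data is absent. A small defensive sketch under that assumption (the ensure_punkt_tab helper is hypothetical, not part of these examples):

import nltk
from nltk.tokenize import PunktTokenizer


def ensure_punkt_tab(lang: str = "english") -> PunktTokenizer:
    # Hypothetical helper: construct the tokenizer, downloading punkt_tab on first use.
    try:
        return PunktTokenizer(lang)
    except LookupError:
        # punkt_tab parameters are not installed yet; fetch them once, then retry.
        nltk.download("punkt_tab", quiet=True)
        return PunktTokenizer(lang)


tok = ensure_punkt_tab()
print(tok.tokenize("First sentence. Second sentence."))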

0 commit comments
