File tree Expand file tree Collapse file tree 4 files changed +12
-12
lines changed Expand file tree Collapse file tree 4 files changed +12
-12
lines changed Original file line number Diff line number Diff line change 67
67
from typing import Any , Dict , List , Optional
68
68
from urllib .request import urlopen
69
69
70
- import nltk # type: ignore
70
+ from nltk . tokenize import PunktTokenizer # type: ignore
71
71
from tqdm import tqdm
72
72
73
73
from elasticsearch_dsl import (
84
84
DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"
85
85
86
86
# initialize sentence tokenizer
87
- nltk . download ( "punkt" , quiet = True )
87
+ tok = PunktTokenizer ( )
88
88
89
89
90
90
class Passage (InnerDoc ):
@@ -110,7 +110,7 @@ class Index:
110
110
111
111
def clean (self ) -> None :
112
112
# split the content into sentences
113
- passages = nltk . sent_tokenize (self .content )
113
+ passages = tok . tokenize (self .content )
114
114
115
115
# generate an embedding for each passage and save it as a nested document
116
116
for passage in passages :
Original file line number Diff line number Diff line change 51
51
from typing import Any , List , Optional , cast
52
52
from urllib .request import urlopen
53
53
54
- import nltk # type: ignore
54
+ from nltk . tokenize import PunktTokenizer # type: ignore
55
55
from sentence_transformers import SentenceTransformer
56
56
from tqdm import tqdm
57
57
70
70
MODEL_NAME = "all-MiniLM-L6-v2"
71
71
72
72
# initialize sentence tokenizer
73
- nltk . download ( "punkt" , quiet = True )
73
+ tok = PunktTokenizer ( )
74
74
75
75
# this will be the embedding model
76
76
embedding_model : Any = None
@@ -103,7 +103,7 @@ def get_embedding(cls, input: str) -> List[float]:
103
103
104
104
def clean (self ) -> None :
105
105
# split the content into sentences
106
- passages = cast (List [str ], nltk . sent_tokenize (self .content ))
106
+ passages = cast (List [str ], tok . tokenize (self .content ))
107
107
108
108
# generate an embedding for each passage and save it as a nested document
109
109
for passage in passages :
Original file line number Diff line number Diff line change 66
66
from typing import Any , Dict , List , Optional
67
67
from urllib .request import urlopen
68
68
69
- import nltk # type: ignore
69
+ from nltk . tokenize import PunktTokenizer # type: ignore
70
70
from tqdm import tqdm
71
71
72
72
from elasticsearch_dsl import (
83
83
DATASET_URL = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/datasets/workplace-documents.json"
84
84
85
85
# initialize sentence tokenizer
86
- nltk . download ( "punkt" , quiet = True )
86
+ tok = PunktTokenizer ( )
87
87
88
88
89
89
class Passage (InnerDoc ):
@@ -109,7 +109,7 @@ class Index:
109
109
110
110
def clean (self ) -> None :
111
111
# split the content into sentences
112
- passages = nltk . sent_tokenize (self .content )
112
+ passages = tok . tokenize (self .content )
113
113
114
114
# generate an embedding for each passage and save it as a nested document
115
115
for passage in passages :
Original file line number Diff line number Diff line change 50
50
from typing import Any , List , Optional , cast
51
51
from urllib .request import urlopen
52
52
53
- import nltk # type: ignore
53
+ from nltk . tokenize import PunktTokenizer # type: ignore
54
54
from sentence_transformers import SentenceTransformer
55
55
from tqdm import tqdm
56
56
69
69
MODEL_NAME = "all-MiniLM-L6-v2"
70
70
71
71
# initialize sentence tokenizer
72
- nltk . download ( "punkt" , quiet = True )
72
+ tok = PunktTokenizer ( )
73
73
74
74
# this will be the embedding model
75
75
embedding_model : Any = None
@@ -102,7 +102,7 @@ def get_embedding(cls, input: str) -> List[float]:
102
102
103
103
def clean (self ) -> None :
104
104
# split the content into sentences
105
- passages = cast (List [str ], nltk . sent_tokenize (self .content ))
105
+ passages = cast (List [str ], tok . tokenize (self .content ))
106
106
107
107
# generate an embedding for each passage and save it as a nested document
108
108
for passage in passages :
You can’t perform that action at this time.
0 commit comments