Skip to content

Commit ec6b164

Browse files
committed
feat: refactoring of the tokenization function
1 parent 4ab26a2 commit ec6b164

File tree

4 files changed

+53
-13
lines changed

4 files changed

+53
-13
lines changed

requirements-dev.lock

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ cycler==0.12.1
7171
dataclasses-json==0.6.7
7272
# via langchain-community
7373
dill==0.3.8
74+
# via multiprocess
7475
# via pylint
7576
distro==1.9.0
7677
# via openai
@@ -87,6 +88,7 @@ fastapi-pagination==0.12.26
8788
# via burr
8889
filelock==3.15.4
8990
# via huggingface-hub
91+
# via transformers
9092
fonttools==4.53.1
9193
# via matplotlib
9294
free-proxy==1.1.1
@@ -152,6 +154,7 @@ httpx-sse==0.4.0
152154
# via langchain-mistralai
153155
huggingface-hub==0.24.5
154156
# via tokenizers
157+
# via transformers
155158
idna==3.7
156159
# via anyio
157160
# via httpx
@@ -235,9 +238,13 @@ mdurl==0.1.2
235238
# via markdown-it-py
236239
minify-html==0.15.0
237240
# via scrapegraphai
241+
mpire==2.10.2
242+
# via semchunk
238243
multidict==6.0.5
239244
# via aiohttp
240245
# via yarl
246+
multiprocess==0.70.16
247+
# via mpire
241248
mypy-extensions==1.0.0
242249
# via typing-inspect
243250
narwhals==1.3.0
@@ -254,6 +261,7 @@ numpy==1.26.4
254261
# via pydeck
255262
# via sf-hamilton
256263
# via streamlit
264+
# via transformers
257265
ollama==0.3.2
258266
# via langchain-ollama
259267
openai==1.40.3
@@ -271,6 +279,7 @@ packaging==24.1
271279
# via pytest
272280
# via sphinx
273281
# via streamlit
282+
# via transformers
274283
pandas==2.2.2
275284
# via scrapegraphai
276285
# via sf-hamilton
@@ -320,6 +329,7 @@ pyee==11.1.0
320329
# via playwright
321330
pygments==2.18.0
322331
# via furo
332+
# via mpire
323333
# via rich
324334
# via sphinx
325335
pylint==3.2.6
@@ -342,11 +352,13 @@ pyyaml==6.0.2
342352
# via langchain
343353
# via langchain-community
344354
# via langchain-core
355+
# via transformers
345356
referencing==0.35.1
346357
# via jsonschema
347358
# via jsonschema-specifications
348359
regex==2024.7.24
349360
# via tiktoken
361+
# via transformers
350362
requests==2.32.3
351363
# via burr
352364
# via free-proxy
@@ -358,6 +370,7 @@ requests==2.32.3
358370
# via sphinx
359371
# via streamlit
360372
# via tiktoken
373+
# via transformers
361374
rich==13.7.1
362375
# via streamlit
363376
rpds-py==0.20.0
@@ -367,6 +380,10 @@ rsa==4.9
367380
# via google-auth
368381
s3transfer==0.10.2
369382
# via boto3
383+
safetensors==0.4.5
384+
# via transformers
385+
semchunk==2.2.0
386+
# via scrapegraphai
370387
sf-hamilton==1.73.1
371388
# via burr
372389
six==1.16.0
@@ -416,6 +433,7 @@ tiktoken==0.7.0
416433
# via scrapegraphai
417434
tokenizers==0.19.1
418435
# via langchain-mistralai
436+
# via transformers
419437
toml==0.10.2
420438
# via streamlit
421439
tomli==2.0.1
@@ -428,8 +446,13 @@ tornado==6.4.1
428446
tqdm==4.66.5
429447
# via google-generativeai
430448
# via huggingface-hub
449+
# via mpire
431450
# via openai
432451
# via scrapegraphai
452+
# via semchunk
453+
# via transformers
454+
transformers==4.44.2
455+
# via scrapegraphai
433456
typing-extensions==4.12.2
434457
# via altair
435458
# via anyio

requirements.lock

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ charset-normalizer==3.3.2
4141
# via requests
4242
dataclasses-json==0.6.7
4343
# via langchain-community
44+
dill==0.3.8
45+
# via multiprocess
4446
distro==1.9.0
4547
# via openai
4648
exceptiongroup==1.2.2
@@ -49,6 +51,7 @@ faiss-cpu==1.8.0.post1
4951
# via scrapegraphai
5052
filelock==3.15.4
5153
# via huggingface-hub
54+
# via transformers
5255
free-proxy==1.1.1
5356
# via scrapegraphai
5457
frozenlist==1.4.1
@@ -103,6 +106,7 @@ httpx-sse==0.4.0
103106
# via langchain-mistralai
104107
huggingface-hub==0.24.1
105108
# via tokenizers
109+
# via transformers
106110
idna==3.7
107111
# via anyio
108112
# via httpx
@@ -153,9 +157,13 @@ marshmallow==3.21.3
153157
# via dataclasses-json
154158
minify-html==0.15.0
155159
# via scrapegraphai
160+
mpire==2.10.2
161+
# via semchunk
156162
multidict==6.0.5
157163
# via aiohttp
158164
# via yarl
165+
multiprocess==0.70.16
166+
# via mpire
159167
mypy-extensions==1.0.0
160168
# via typing-inspect
161169
numpy==1.26.4
@@ -164,6 +172,7 @@ numpy==1.26.4
164172
# via langchain-aws
165173
# via langchain-community
166174
# via pandas
175+
# via transformers
167176
ollama==0.3.2
168177
# via langchain-ollama
169178
openai==1.41.0
@@ -175,6 +184,7 @@ packaging==24.1
175184
# via huggingface-hub
176185
# via langchain-core
177186
# via marshmallow
187+
# via transformers
178188
pandas==2.2.2
179189
# via scrapegraphai
180190
playwright==1.45.1
@@ -205,6 +215,8 @@ pydantic-core==2.20.1
205215
# via pydantic
206216
pyee==11.1.0
207217
# via playwright
218+
pygments==2.18.0
219+
# via mpire
208220
pyparsing==3.1.2
209221
# via httplib2
210222
python-dateutil==2.9.0.post0
@@ -219,8 +231,10 @@ pyyaml==6.0.1
219231
# via langchain
220232
# via langchain-community
221233
# via langchain-core
234+
# via transformers
222235
regex==2024.5.15
223236
# via tiktoken
237+
# via transformers
224238
requests==2.32.3
225239
# via free-proxy
226240
# via google-api-core
@@ -229,10 +243,15 @@ requests==2.32.3
229243
# via langchain-community
230244
# via langsmith
231245
# via tiktoken
246+
# via transformers
232247
rsa==4.9
233248
# via google-auth
234249
s3transfer==0.10.2
235250
# via boto3
251+
safetensors==0.4.5
252+
# via transformers
253+
semchunk==2.2.0
254+
# via scrapegraphai
236255
six==1.16.0
237256
# via python-dateutil
238257
sniffio==1.3.1
@@ -253,11 +272,17 @@ tiktoken==0.7.0
253272
# via scrapegraphai
254273
tokenizers==0.19.1
255274
# via langchain-mistralai
275+
# via transformers
256276
tqdm==4.66.4
257277
# via google-generativeai
258278
# via huggingface-hub
279+
# via mpire
259280
# via openai
260281
# via scrapegraphai
282+
# via semchunk
283+
# via transformers
284+
transformers==4.44.2
285+
# via scrapegraphai
261286
typing-extensions==4.12.2
262287
# via anyio
263288
# via google-generativeai

scrapegraphai/utils/tokenizer.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
2323
num_tokens_fn = num_tokens_ollama
2424

2525
else:
26-
raise NotImplementedError(f"There is no tokenization implementation for model '{llm_model}'")
27-
26+
from .tokenizers.tokenizer_openai import num_tokens_openai
27+
num_tokens_fn = num_tokens_openai
28+
2829
num_tokens = num_tokens_fn(string, llm_model)
2930
return num_tokens

scrapegraphai/utils/tokenizers/tokenizer_openai.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,8 @@ def num_tokens_openai(text: str, llm_model:BaseChatModel) -> int:
2121
logger = get_logger()
2222

2323
logger.debug(f"Counting tokens for text of {len(text)} characters")
24-
try:
25-
model = llm_model.model_name
26-
except AttributeError:
27-
raise NotImplementedError(f"The model provider you are using ('{llm_model}') "
28-
"does not give us a model name so we cannot identify which encoding to use")
2924

30-
try:
31-
encoding = tiktoken.encoding_for_model(model)
32-
except KeyError:
33-
raise NotImplementedError(f"Tiktoken does not support identifying the encoding for "
34-
"the model '{model}'")
35-
25+
encoding = tiktoken.encoding_for_model("gpt-4")
26+
3627
num_tokens = len(encoding.encode(text))
3728
return num_tokens

0 commit comments

Comments (0)