Skip to content

Commit 85cb957

Browse files
feat: finished basic version of deep scraper
Co-Authored-By: Matteo Vedovati <[email protected]>
1 parent 4b371f4 commit 85cb957

10 files changed

+149
-38
lines changed

examples/openai/depth_search_graph_openai.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,22 +1,28 @@
11
"""
22
depth_search_graph_openai example
33
"""
4+
import os
5+
from dotenv import load_dotenv
46
from scrapegraphai.graphs import DepthSearchGraph
57

8+
load_dotenv()
9+
10+
openai_key = os.getenv("OPENAI_APIKEY")
11+
612
graph_config = {
713
"llm": {
8-
"api_key":"YOUR_API_KEY",
14+
"api_key": openai_key,
915
"model": "openai/gpt-4o-mini",
1016
},
1117
"verbose": True,
1218
"headless": False,
1319
"depth": 2,
14-
"only_inside_links": True,
20+
"only_inside_links": False,
1521
}
1622

1723
search_graph = DepthSearchGraph(
1824
prompt="List me all the projects with their description",
19-
source="https://perinim.github.io/projects/",
25+
source="https://perinim.github.io",
2026
config=graph_config
2127
)
2228

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,9 @@ dependencies = [
3131
"google>=3.0.0",
3232
"langchain-ollama>=0.1.3",
3333
"semchunk==2.2.0",
34-
"transformers==4.44.2"
34+
"transformers==4.44.2",
35+
"qdrant-client>=1.11.3",
36+
"fastembed>=0.3.6"
3537
]
3638

3739
license = "MIT"
@@ -99,7 +101,7 @@ screenshot_scraper = [
99101
"pillow>=10.4.0",
100102
]
101103

102-
# Group 5: Faiss CPU
104+
# Group 5: qdrant
103105
qdrant = [
104106
"qdrant-client>=1.11.3",
105107
"fastembed>=0.3.6"

requirements-dev.lock

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,8 @@ click==8.1.7
6464
# via burr
6565
# via streamlit
6666
# via uvicorn
67+
coloredlogs==15.0.1
68+
# via onnxruntime
6769
contourpy==1.2.1
6870
# via matplotlib
6971
cycler==0.12.1
@@ -84,9 +86,13 @@ fastapi==0.112.0
8486
# via burr
8587
fastapi-pagination==0.12.26
8688
# via burr
89+
fastembed==0.3.6
90+
# via scrapegraphai
8791
filelock==3.15.4
8892
# via huggingface-hub
8993
# via transformers
94+
flatbuffers==24.3.25
95+
# via onnxruntime
9096
fonttools==4.53.1
9197
# via matplotlib
9298
free-proxy==1.1.1
@@ -132,11 +138,19 @@ greenlet==3.0.3
132138
grpcio==1.65.4
133139
# via google-api-core
134140
# via grpcio-status
141+
# via grpcio-tools
142+
# via qdrant-client
135143
grpcio-status==1.62.3
136144
# via google-api-core
145+
grpcio-tools==1.62.3
146+
# via qdrant-client
137147
h11==0.14.0
138148
# via httpcore
139149
# via uvicorn
150+
h2==4.1.0
151+
# via httpx
152+
hpack==4.0.0
153+
# via h2
140154
html2text==2024.2.26
141155
# via scrapegraphai
142156
httpcore==1.0.5
@@ -149,11 +163,17 @@ httpx==0.27.0
149163
# via langsmith
150164
# via ollama
151165
# via openai
166+
# via qdrant-client
152167
httpx-sse==0.4.0
153168
# via langchain-mistralai
154169
huggingface-hub==0.24.5
170+
# via fastembed
155171
# via tokenizers
156172
# via transformers
173+
humanfriendly==10.0
174+
# via coloredlogs
175+
hyperframe==6.0.1
176+
# via h2
157177
idna==3.7
158178
# via anyio
159179
# via httpx
@@ -218,6 +238,7 @@ langsmith==0.1.121
218238
# via langchain-core
219239
loguru==0.7.2
220240
# via burr
241+
# via fastembed
221242
lxml==5.3.0
222243
# via free-proxy
223244
markdown-it-py==3.0.0
@@ -236,8 +257,12 @@ minify-html==0.15.0
236257
# via scrapegraphai
237258
mistral-common==1.4.1
238259
# via scrapegraphai
260+
mmh3==4.1.0
261+
# via fastembed
239262
mpire==2.10.2
240263
# via semchunk
264+
mpmath==1.3.0
265+
# via sympy
241266
multidict==6.0.5
242267
# via aiohttp
243268
# via yarl
@@ -249,19 +274,27 @@ narwhals==1.3.0
249274
# via altair
250275
numpy==1.26.4
251276
# via contourpy
277+
# via fastembed
252278
# via langchain
253279
# via langchain-aws
254280
# via langchain-community
255281
# via matplotlib
282+
# via onnx
283+
# via onnxruntime
256284
# via opencv-python-headless
257285
# via pandas
258286
# via pyarrow
259287
# via pydeck
288+
# via qdrant-client
260289
# via sf-hamilton
261290
# via streamlit
262291
# via transformers
263292
ollama==0.3.2
264293
# via langchain-ollama
294+
onnx==1.17.0
295+
# via fastembed
296+
onnxruntime==1.19.2
297+
# via fastembed
265298
openai==1.40.3
266299
# via burr
267300
# via langchain-openai
@@ -275,6 +308,7 @@ packaging==24.1
275308
# via langchain-core
276309
# via marshmallow
277310
# via matplotlib
311+
# via onnxruntime
278312
# via pytest
279313
# via sphinx
280314
# via streamlit
@@ -284,6 +318,7 @@ pandas==2.2.2
284318
# via sf-hamilton
285319
# via streamlit
286320
pillow==10.4.0
321+
# via fastembed
287322
# via matplotlib
288323
# via mistral-common
289324
# via streamlit
@@ -294,6 +329,8 @@ playwright==1.45.1
294329
# via undetected-playwright
295330
pluggy==1.5.0
296331
# via pytest
332+
portalocker==2.10.1
333+
# via qdrant-client
297334
proto-plus==1.24.0
298335
# via google-ai-generativelanguage
299336
# via google-api-core
@@ -303,6 +340,9 @@ protobuf==4.25.4
303340
# via google-generativeai
304341
# via googleapis-common-protos
305342
# via grpcio-status
343+
# via grpcio-tools
344+
# via onnx
345+
# via onnxruntime
306346
# via proto-plus
307347
# via streamlit
308348
pyarrow==17.0.0
@@ -326,6 +366,7 @@ pydantic==2.8.2
326366
# via mistral-common
327367
# via openai
328368
# via pydantic-settings
369+
# via qdrant-client
329370
pydantic-core==2.20.1
330371
# via pydantic
331372
pydantic-settings==2.5.2
@@ -343,6 +384,8 @@ pylint==3.2.6
343384
pyparsing==3.1.2
344385
# via httplib2
345386
# via matplotlib
387+
pystemmer==2.2.0.1
388+
# via fastembed
346389
pytest==8.0.0
347390
# via pytest-mock
348391
pytest-mock==3.14.0
@@ -361,6 +404,8 @@ pyyaml==6.0.2
361404
# via langchain-community
362405
# via langchain-core
363406
# via transformers
407+
qdrant-client==1.11.3
408+
# via scrapegraphai
364409
referencing==0.35.1
365410
# via jsonschema
366411
# via jsonschema-specifications
@@ -369,6 +414,7 @@ regex==2024.7.24
369414
# via transformers
370415
requests==2.32.3
371416
# via burr
417+
# via fastembed
372418
# via free-proxy
373419
# via google-api-core
374420
# via huggingface-hub
@@ -395,6 +441,8 @@ semchunk==2.2.0
395441
# via scrapegraphai
396442
sentencepiece==0.2.0
397443
# via mistral-common
444+
setuptools==75.1.0
445+
# via grpcio-tools
398446
sf-hamilton==1.73.1
399447
# via burr
400448
six==1.16.0
@@ -406,6 +454,7 @@ sniffio==1.3.1
406454
# via httpx
407455
# via openai
408456
snowballstemmer==2.2.0
457+
# via fastembed
409458
# via sphinx
410459
soupsieve==2.5
411460
# via beautifulsoup4
@@ -434,6 +483,8 @@ starlette==0.37.2
434483
# via fastapi
435484
streamlit==1.37.1
436485
# via burr
486+
sympy==1.13.3
487+
# via onnxruntime
437488
tenacity==8.5.0
438489
# via langchain
439490
# via langchain-community
@@ -444,6 +495,7 @@ tiktoken==0.7.0
444495
# via mistral-common
445496
# via scrapegraphai
446497
tokenizers==0.19.1
498+
# via fastembed
447499
# via langchain-mistralai
448500
# via transformers
449501
toml==0.10.2
@@ -456,6 +508,7 @@ tomlkit==0.13.0
456508
tornado==6.4.1
457509
# via streamlit
458510
tqdm==4.66.5
511+
# via fastembed
459512
# via google-generativeai
460513
# via huggingface-hub
461514
# via mpire
@@ -495,6 +548,7 @@ uritemplate==4.1.1
495548
# via google-api-python-client
496549
urllib3==1.26.19
497550
# via botocore
551+
# via qdrant-client
498552
# via requests
499553
uvicorn==0.30.5
500554
# via burr

0 commit comments

Comments
 (0)