Skip to content

Commit 602dd00

Browse files
committed
feat: refactoring_to_md function
1 parent bb62439 commit 602dd00

File tree

5 files changed

+6
-72
lines changed

5 files changed

+6
-72
lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ dependencies = [
3434
"undetected-playwright==0.3.0",
3535
"semchunk==1.0.1",
3636
"html2text==2024.2.26",
37-
"trafilatura==1.10.0",
3837
"langchain-fireworks==0.1.3"
3938
]
4039

requirements-dev.lock

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ attrs==23.2.0
4141
# via jsonschema
4242
# via referencing
4343
babel==2.15.0
44-
# via courlan
4544
# via sphinx
4645
beautifulsoup4==4.12.3
4746
# via furo
@@ -63,27 +62,20 @@ certifi==2024.2.2
6362
# via httpcore
6463
# via httpx
6564
# via requests
66-
# via trafilatura
6765
charset-normalizer==3.3.2
68-
# via htmldate
6966
# via requests
70-
# via trafilatura
7167
click==8.1.7
7268
# via burr
7369
# via streamlit
7470
# via typer
7571
# via uvicorn
7672
contourpy==1.2.1
7773
# via matplotlib
78-
courlan==1.2.0
79-
# via trafilatura
8074
cycler==0.12.1
8175
# via matplotlib
8276
dataclasses-json==0.6.6
8377
# via langchain
8478
# via langchain-community
85-
dateparser==1.2.0
86-
# via htmldate
8779
defusedxml==0.7.1
8880
# via langchain-anthropic
8981
dill==0.3.8
@@ -204,8 +196,6 @@ h11==0.14.0
204196
# via uvicorn
205197
html2text==2024.2.26
206198
# via scrapegraphai
207-
htmldate==1.8.1
208-
# via trafilatura
209199
httpcore==1.0.5
210200
# via httpx
211201
httplib2==0.22.0
@@ -259,8 +249,6 @@ jsonschema==4.22.0
259249
# via altair
260250
jsonschema-specifications==2023.12.1
261251
# via jsonschema
262-
justext==3.0.1
263-
# via trafilatura
264252
kiwisolver==1.4.5
265253
# via matplotlib
266254
langchain==0.1.15
@@ -302,12 +290,6 @@ loguru==0.7.2
302290
# via burr
303291
lxml==5.2.2
304292
# via free-proxy
305-
# via htmldate
306-
# via justext
307-
# via lxml-html-clean
308-
# via trafilatura
309-
lxml-html-clean==0.1.1
310-
# via lxml
311293
markdown-it-py==3.0.0
312294
# via rich
313295
markupsafe==2.1.5
@@ -430,9 +412,7 @@ pytest==8.0.0
430412
pytest-mock==3.14.0
431413
python-dateutil==2.9.0.post0
432414
# via botocore
433-
# via dateparser
434415
# via google-cloud-bigquery
435-
# via htmldate
436416
# via matplotlib
437417
# via pandas
438418
python-dotenv==1.0.1
@@ -441,7 +421,6 @@ python-dotenv==1.0.1
441421
python-multipart==0.0.9
442422
# via fastapi
443423
pytz==2024.1
444-
# via dateparser
445424
# via pandas
446425
pyyaml==6.0.1
447426
# via huggingface-hub
@@ -453,7 +432,6 @@ referencing==0.35.1
453432
# via jsonschema
454433
# via jsonschema-specifications
455434
regex==2024.5.15
456-
# via dateparser
457435
# via tiktoken
458436
requests==2.32.2
459437
# via burr
@@ -534,8 +512,6 @@ tenacity==8.3.0
534512
tiktoken==0.6.0
535513
# via langchain-openai
536514
# via scrapegraphai
537-
tld==0.13
538-
# via courlan
539515
tokenizers==0.19.1
540516
# via anthropic
541517
toml==0.10.2
@@ -555,8 +531,6 @@ tqdm==4.66.4
555531
# via openai
556532
# via scrapegraphai
557533
# via semchunk
558-
trafilatura==1.10.0
559-
# via scrapegraphai
560534
typer==0.12.3
561535
# via fastapi-cli
562536
typing-extensions==4.12.0
@@ -586,8 +560,6 @@ typing-inspect==0.9.0
586560
# via sf-hamilton
587561
tzdata==2024.1
588562
# via pandas
589-
tzlocal==5.2
590-
# via dateparser
591563
ujson==5.10.0
592564
# via fastapi
593565
undetected-playwright==0.3.0
@@ -596,10 +568,7 @@ uritemplate==4.1.1
596568
# via google-api-python-client
597569
urllib3==1.26.18
598570
# via botocore
599-
# via courlan
600-
# via htmldate
601571
# via requests
602-
# via trafilatura
603572
uvicorn==0.29.0
604573
# via burr
605574
# via fastapi

requirements.lock

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@ async-timeout==4.0.3
2828
# via langchain
2929
attrs==23.2.0
3030
# via aiohttp
31-
babel==2.15.0
32-
# via courlan
3331
beautifulsoup4==4.12.3
3432
# via google
3533
# via scrapegraphai
@@ -44,18 +42,11 @@ certifi==2024.2.2
4442
# via httpcore
4543
# via httpx
4644
# via requests
47-
# via trafilatura
4845
charset-normalizer==3.3.2
49-
# via htmldate
5046
# via requests
51-
# via trafilatura
52-
courlan==1.2.0
53-
# via trafilatura
5447
dataclasses-json==0.6.6
5548
# via langchain
5649
# via langchain-community
57-
dateparser==1.2.0
58-
# via htmldate
5950
defusedxml==0.7.1
6051
# via langchain-anthropic
6152
distro==1.9.0
@@ -150,8 +141,6 @@ h11==0.14.0
150141
# via httpcore
151142
html2text==2024.2.26
152143
# via scrapegraphai
153-
htmldate==1.8.1
154-
# via trafilatura
155144
httpcore==1.0.5
156145
# via httpx
157146
httplib2==0.22.0
@@ -181,8 +170,6 @@ jsonpatch==1.33
181170
# via langchain-core
182171
jsonpointer==2.4
183172
# via jsonpatch
184-
justext==3.0.1
185-
# via trafilatura
186173
langchain==0.1.15
187174
# via scrapegraphai
188175
langchain-anthropic==0.1.11
@@ -220,12 +207,6 @@ langsmith==0.1.63
220207
# via langchain-core
221208
lxml==5.2.2
222209
# via free-proxy
223-
# via htmldate
224-
# via justext
225-
# via lxml-html-clean
226-
# via trafilatura
227-
lxml-html-clean==0.1.1
228-
# via lxml
229210
marshmallow==3.21.2
230211
# via dataclasses-json
231212
minify-html==0.15.0
@@ -298,22 +279,18 @@ pyparsing==3.1.2
298279
# via httplib2
299280
python-dateutil==2.9.0.post0
300281
# via botocore
301-
# via dateparser
302282
# via google-cloud-bigquery
303-
# via htmldate
304283
# via pandas
305284
python-dotenv==1.0.1
306285
# via scrapegraphai
307286
pytz==2024.1
308-
# via dateparser
309287
# via pandas
310288
pyyaml==6.0.1
311289
# via huggingface-hub
312290
# via langchain
313291
# via langchain-community
314292
# via langchain-core
315293
regex==2024.5.15
316-
# via dateparser
317294
# via tiktoken
318295
requests==2.32.2
319296
# via free-proxy
@@ -354,8 +331,6 @@ tenacity==8.3.0
354331
tiktoken==0.6.0
355332
# via langchain-openai
356333
# via scrapegraphai
357-
tld==0.13
358-
# via courlan
359334
tokenizers==0.19.1
360335
# via anthropic
361336
tqdm==4.66.4
@@ -364,8 +339,6 @@ tqdm==4.66.4
364339
# via openai
365340
# via scrapegraphai
366341
# via semchunk
367-
trafilatura==1.10.0
368-
# via scrapegraphai
369342
typing-extensions==4.12.0
370343
# via anthropic
371344
# via anyio
@@ -382,17 +355,12 @@ typing-inspect==0.9.0
382355
# via dataclasses-json
383356
tzdata==2024.1
384357
# via pandas
385-
tzlocal==5.2
386-
# via dateparser
387358
undetected-playwright==0.3.0
388359
# via scrapegraphai
389360
uritemplate==4.1.1
390361
# via google-api-python-client
391362
urllib3==1.26.18
392363
# via botocore
393-
# via courlan
394-
# via htmldate
395364
# via requests
396-
# via trafilatura
397365
yarl==1.9.4
398366
# via aiohttp

scrapegraphai/utils/convert_to_md.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
convert_to_md module
33
"""
44
import html2text
5-
from trafilatura import extract
6-
75

86
def convert_to_md(html):
97
""" Convert HTML to Markdown.
@@ -20,6 +18,6 @@ def convert_to_md(html):
2018
'This is a paragraph.\n\n# This is a heading.'
2119
2220
Note: Styles are ignored during the conversion; links are preserved. """
23-
24-
return extract(filecontent=html,include_images=True,
25-
include_links=True, include_tables=True, output_format="markdown")
21+
h = html2text.HTML2Text()
22+
h.ignore_links = False
23+
return h.handle(html)

tests/utils/convert_to_md_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ def test_basic_html_to_md():
77

88
def test_html_with_links_and_images():
99
html = '<p>This is a <a href="https://example.com">link</a> and this is an <img src="https://example.com/image.jpg" alt="image"></p>'
10-
assert convert_to_md(html) is None
10+
assert convert_to_md(html) is not None
1111

1212
def test_html_with_tables():
1313
html = '''
@@ -17,11 +17,11 @@ def test_html_with_tables():
1717
<tr><td>Row 2, Cell 1</td><td>Row 2, Cell 2</td></tr>
1818
</table>
1919
'''
20-
assert convert_to_md(html) is None
20+
assert convert_to_md(html) is not None
2121

2222
def test_empty_html():
2323
html = ""
24-
assert convert_to_md(html) is None
24+
assert convert_to_md(html) is not None
2525

2626
def test_complex_html_structure():
2727
html = '''

0 commit comments

Comments
 (0)