
Commit 8bacd53

Merge pull request #724 from ScrapeGraphAI/tem ("allignment")
2 parents: 4f65be4 + 99aac5b

File tree: 8 files changed (+42 / -41 lines)

CHANGELOG.md

Lines changed: 8 additions & 0 deletions

```diff
@@ -1,3 +1,11 @@
+## [1.25.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.1...v1.25.2) (2024-10-03)
+
+
+### Bug Fixes
+
+* update dependencies ([7579d0e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7579d0e2599d63c0003b1b7a0918132511a9c8f1))
+
+## [1.25.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.25.1) (2024-09-29)
 ## [1.26.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.2...v1.26.0-beta.3) (2024-10-04)
 
 
```
README.md

Lines changed: 5 additions & 29 deletions

````diff
@@ -98,7 +98,6 @@ The output will be a dictionary like the following:
     "contact_email": "[email protected]"
 }
 ```
-
 There are other pipelines that can be used to extract information from multiple pages, generate Python scripts, or even generate audio files.
 
 | Pipeline Name | Description |
@@ -110,6 +109,8 @@ There are other pipelines that can be used to extract information from multiple
 | SmartScraperMultiGraph | Multi-page scraper that extracts information from multiple pages given a single prompt and a list of sources. |
 | ScriptCreatorMultiGraph | Multi-page scraper that generates a Python script for extracting information from multiple pages and sources. |
 
+Each of these graphs also has a multi version, which allows LLM calls to be made in parallel.
+
 It is possible to use different LLM through APIs, such as **OpenAI**, **Groq**, **Azure** and **Gemini**, or local models using **Ollama**.
 
 Remember to have [Ollama](https://ollama.com/) installed and download the models using the **ollama pull** command, if you want to use local models.
@@ -140,6 +141,9 @@ Check out also the Docusaurus [here](https://scrapegraph-doc.onrender.com/).
   <a href="https://2ly.link/1zNj1">
     <img src="https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/transparent_stat.png" alt="Stats" style="width: 15%;">
   </a>
+  <a href="https://scrape.do">
+    <img src="https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/scrapedo.png" alt="Stats" style="width: 11%;">
+  </a>
 </div>
 
 ## 🤝 Contributing
@@ -152,34 +156,6 @@ Please see the [contributing guidelines](https://github.com/VinciGit00/Scrapegra
 [![My Skills](https://skillicons.dev/icons?i=linkedin)](https://www.linkedin.com/company/scrapegraphai/)
 [![My Skills](https://skillicons.dev/icons?i=twitter)](https://twitter.com/scrapegraphai)
 
-## 🗺️ Roadmap
-
-We are working on the following features! If you are interested in collaborating right-click on the feature and open in a new tab to file a PR. If you have doubts and wanna discuss them with us, just contact us on [discord](https://discord.gg/uJN7TYcpNa) or open a [Discussion](https://github.com/VinciGit00/Scrapegraph-ai/discussions) here on Github!
-
-```mermaid
-%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#5C4B9B', 'edgeLabelBackground':'#ffffff', 'tertiaryColor': '#ffffff', 'primaryBorderColor': '#5C4B9B', 'fontFamily': 'Arial', 'fontSize': '16px', 'textColor': '#5C4B9B' }}}%%
-graph LR
-    A[DeepSearch Graph] --> F[Use Existing Chromium Instances]
-    F --> B[Page Caching]
-    B --> C[Screenshot Scraping]
-    C --> D[Handle Dynamic Content]
-    D --> E[New Webdrivers]
-
-    style A fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
-    style F fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
-    style B fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
-    style C fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
-    style D fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
-    style E fill:#ffffff,stroke:#5C4B9B,stroke-width:2px,rx:10,ry:10
-
-    click A href "https://github.com/VinciGit00/Scrapegraph-ai/issues/260" "Open DeepSearch Graph Issue"
-    click F href "https://github.com/VinciGit00/Scrapegraph-ai/issues/329" "Open Chromium Instances Issue"
-    click B href "https://github.com/VinciGit00/Scrapegraph-ai/issues/197" "Open Page Caching Issue"
-    click C href "https://github.com/VinciGit00/Scrapegraph-ai/issues/197" "Open Screenshot Scraping Issue"
-    click D href "https://github.com/VinciGit00/Scrapegraph-ai/issues/279" "Open Handle Dynamic Content Issue"
-    click E href "https://github.com/VinciGit00/Scrapegraph-ai/issues/171" "Open New Webdrivers Issue"
-```
-
 ## 📈 Telemetry
 We collect anonymous usage metrics to enhance our package's quality and user experience. The data helps us prioritize improvements and ensure compatibility. If you wish to opt-out, set the environment variable SCRAPEGRAPHAI_TELEMETRY_ENABLED=false. For more information, please refer to the documentation [here](https://scrapegraph-ai.readthedocs.io/en/latest/scrapers/telemetry.html).
````
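As the telemetry paragraph above notes, opting out is a single environment variable. A minimal shell example (the variable name is taken from the README; the session-scoped `export` is just one way to set it):

```shell
# Disable ScrapeGraphAI's anonymous usage metrics for this shell session
export SCRAPEGRAPHAI_TELEMETRY_ENABLED=false
```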

docs/assets/scrapedo.png (binary, 19.2 KB)

pyproject.toml

Lines changed: 2 additions & 1 deletion

```diff
@@ -1,7 +1,7 @@
 [project]
 name = "scrapegraphai"
 
-version = "1.26.0b3"
+version = "1.25.2"
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
@@ -30,6 +30,7 @@ dependencies = [
     "undetected-playwright>=0.3.0",
     "google>=3.0.0",
     "langchain-ollama>=0.1.3",
+
     "semchunk==2.2.0",
     "transformers==4.44.2",
     "qdrant-client>=1.11.3",
```

scrapegraphai/utils/cleanup_code.py

Lines changed: 3 additions & 0 deletions

````diff
@@ -4,6 +4,9 @@
 import re
 
 def extract_code(code: str) -> str:
+    """
+    Module for extracting code
+    """
     pattern = r'```(?:python)?\n(.*?)```'
 
     match = re.search(pattern, code, re.DOTALL)
````
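The regex added above pulls the body of the first fenced code block out of an LLM reply. A standalone sketch of that pattern (the fallback-to-raw-input behavior is an assumption of this sketch, not necessarily what the library does):

````python
import re

def extract_code(code: str) -> str:
    # Match a fenced block, optionally tagged "python"; re.DOTALL lets
    # the non-greedy "(.*?)" span newlines up to the first closing fence.
    pattern = r'```(?:python)?\n(.*?)```'
    match = re.search(pattern, code, re.DOTALL)
    # Assumption: fall back to the raw string when no fence is found.
    return match.group(1) if match else code

reply = "Sure:\n```python\nprint('hi')\n```\nAnything else?"
print(extract_code(reply))  # captured body is "print('hi')\n"
````

Because the quantifier is non-greedy, a reply containing several fenced blocks yields only the first one.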

scrapegraphai/utils/cleanup_html.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -101,7 +101,7 @@ def reduce_html(html, reduction):
         for attr in list(tag.attrs):
             if attr not in attrs_to_keep:
                 del tag[attr]
-
+
     if reduction == 1:
         return minify_html(str(soup))
 
```
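The loop in this hunk keeps only whitelisted attributes on each tag. The same idea can be sketched with just the standard library (`reduce_html` itself iterates a BeautifulSoup tree; the `ATTRS_TO_KEEP` whitelist and helper names here are illustrative, not the library's):

```python
from html.parser import HTMLParser

ATTRS_TO_KEEP = {"class", "id", "href", "src"}  # assumption: example whitelist

class AttrStripper(HTMLParser):
    """Re-emit HTML, dropping attributes outside the whitelist."""
    def __init__(self):
        super().__init__()
        self.out = []
    def _kept(self, attrs):
        return "".join(f' {k}="{v}"' for k, v in attrs if k in ATTRS_TO_KEEP)
    def handle_starttag(self, tag, attrs):
        self.out.append(f"<{tag}{self._kept(attrs)}>")
    def handle_startendtag(self, tag, attrs):
        self.out.append(f"<{tag}{self._kept(attrs)}/>")
    def handle_endtag(self, tag):
        self.out.append(f"</{tag}>")
    def handle_data(self, data):
        self.out.append(data)

def strip_attrs(html: str) -> str:
    parser = AttrStripper()
    parser.feed(html)
    return "".join(parser.out)

print(strip_attrs('<a href="/x" onclick="evil()" style="color:red">link</a>'))
# → <a href="/x">link</a>
```

Dropping inline handlers and styles this way is what makes the reduced HTML much cheaper to feed to an LLM.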

scrapegraphai/utils/code_error_analysis.py

Lines changed: 12 additions & 5 deletions

```diff
@@ -2,24 +2,27 @@
 This module contains the functions that are used to generate the prompts for the code error analysis.
 """
 from typing import Any, Dict
+import json
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-import json
 from ..prompts import (
     TEMPLATE_SYNTAX_ANALYSIS, TEMPLATE_EXECUTION_ANALYSIS,
     TEMPLATE_VALIDATION_ANALYSIS, TEMPLATE_SEMANTIC_ANALYSIS
 )
 
 def syntax_focused_analysis(state: dict, llm_model) -> str:
-    prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS, input_variables=["generated_code", "errors"])
+    prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS,
+                            input_variables=["generated_code", "errors"])
     chain = prompt | llm_model | StrOutputParser()
     return chain.invoke({
         "generated_code": state["generated_code"],
         "errors": state["errors"]["syntax"]
     })
 
 def execution_focused_analysis(state: dict, llm_model) -> str:
-    prompt = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS, input_variables=["generated_code", "errors", "html_code", "html_analysis"])
+    prompt = PromptTemplate(template=TEMPLATE_EXECUTION_ANALYSIS,
+                            input_variables=["generated_code", "errors",
+                                             "html_code", "html_analysis"])
     chain = prompt | llm_model | StrOutputParser()
     return chain.invoke({
         "generated_code": state["generated_code"],
@@ -29,7 +32,9 @@ def execution_focused_analysis(state: dict, llm_model) -> str:
     })
 
 def validation_focused_analysis(state: dict, llm_model) -> str:
-    prompt = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS, input_variables=["generated_code", "errors", "json_schema", "execution_result"])
+    prompt = PromptTemplate(template=TEMPLATE_VALIDATION_ANALYSIS,
+                            input_variables=["generated_code", "errors",
+                                             "json_schema", "execution_result"])
     chain = prompt | llm_model | StrOutputParser()
     return chain.invoke({
         "generated_code": state["generated_code"],
@@ -39,7 +44,9 @@ def validation_focused_analysis(state: dict, llm_model) -> str:
     })
 
 def semantic_focused_analysis(state: dict, comparison_result: Dict[str, Any], llm_model) -> str:
-    prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS, input_variables=["generated_code", "differences", "explanation"])
+    prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_ANALYSIS,
+                            input_variables=["generated_code",
+                                             "differences", "explanation"])
     chain = prompt | llm_model | StrOutputParser()
     return chain.invoke({
         "generated_code": state["generated_code"],
```

scrapegraphai/utils/code_error_correction.py

Lines changed: 11 additions & 5 deletions

```diff
@@ -10,32 +10,38 @@
 )
 
 def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
-    prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION, input_variables=["analysis", "generated_code"])
+    prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION,
+                            input_variables=["analysis", "generated_code"])
     chain = prompt | llm_model | StrOutputParser()
     return chain.invoke({
         "analysis": analysis,
         "generated_code": state["generated_code"]
     })
 
 def execution_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
-    prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION, input_variables=["analysis", "generated_code"])
+    prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION,
+                            input_variables=["analysis", "generated_code"])
     chain = prompt | llm_model | StrOutputParser()
     return chain.invoke({
         "analysis": analysis,
         "generated_code": state["generated_code"]
     })
 
 def validation_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
-    prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION, input_variables=["analysis", "generated_code", "json_schema"])
+    prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION,
+                            input_variables=["analysis", "generated_code",
+                                             "json_schema"])
     chain = prompt | llm_model | StrOutputParser()
     return chain.invoke({
         "analysis": analysis,
         "generated_code": state["generated_code"],
         "json_schema": state["json_schema"]
     })
-
+
 def semantic_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
-    prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION, input_variables=["analysis", "generated_code", "generated_result", "reference_result"])
+    prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION,
+                            input_variables=["analysis", "generated_code",
+                                             "generated_result", "reference_result"])
     chain = prompt | llm_model | StrOutputParser()
     return chain.invoke({
         "analysis": analysis,
```
