
Commit fd0a902

Merge pull request #619 from tm-robinson/543-ScriptCreatorGraph-only-use-first-chunk
543 script creator graph only use first chunk
2 parents ba5c7ad + e741602 commit fd0a902

14 files changed: +73 / -22 lines

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
@@ -1,4 +1,5 @@
-## [1.16.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.3...v1.16.0-beta.4) (2024-09-02)
+## [1.16.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.2...v1.16.0) (2024-09-01)
+


 ### Features
@@ -11,6 +12,9 @@
 * deepcopy fail for coping model_instance config ([cd07418](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cd07418474112cecd53ab47866262f2f31294223))
 * fix pydantic object copy ([553527a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/553527a269cdd70c0c174ad5c78cbf35c00b22c1))

+## [1.15.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.15.1...v1.15.2) (2024-09-01)
+
+
 ## [1.16.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.16.0-beta.2...v1.16.0-beta.3) (2024-09-01)


@@ -27,6 +31,7 @@



+
 ### Bug Fixes

 * pyproject.toml ([360ce1c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/360ce1c0e468c959e63555120ac7cecf55563846))

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 [project]
 name = "scrapegraphai"
+
 version = "1.16.0b4"

 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."

scrapegraphai/graphs/abstract_graph.py

Lines changed: 1 addition & 1 deletion
@@ -141,7 +141,7 @@ def _create_llm(self, llm_config: dict) -> object:
         try:
             self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
         except KeyError:
-            print("Model not found, using default token size (8192)")
+            print(f"Model {llm_params['model_provider']}/{llm_params['model']} not found, using default token size (8192)")
             self.model_token = 8192

         try:

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -75,7 +75,8 @@ def _create_repeated_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )

scrapegraphai/graphs/markdown_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -60,7 +60,8 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "parse_html": False,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         generate_answer_node = GenerateAnswerNode(

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -74,7 +74,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         image_to_text_node = ImageToTextNode(

scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "parse_html": False,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 3 additions & 2 deletions
@@ -73,11 +73,12 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={"chunk_size": self.model_token,
-                         "parse_html": False
+                         "parse_html": False,
+                         "llm_model": self.llm_model
                          }
         )
         generate_scraper_node = GenerateScraperNode(
-            input="user_prompt & (doc)",
+            input="user_prompt & (parsed_doc)",
             output=["answer"],
             node_config={
                 "llm_model": self.llm_model,

scrapegraphai/graphs/search_link_graph.py

Lines changed: 2 additions & 1 deletion
@@ -64,7 +64,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         search_link_node = SearchLinkNode(

scrapegraphai/graphs/speech_graph.py

Lines changed: 2 additions & 1 deletion
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         generate_answer_node = GenerateAnswerNode(

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 13 additions & 3 deletions
@@ -102,9 +102,19 @@ def execute(self, state: dict) -> dict:
             TEMPLATE_NO_CHUNKS += self.additional_info

         if len(doc) > 1:
-            raise NotImplementedError(
-                "Currently GenerateScraperNode cannot handle more than 1 context chunks"
-            )
+            # Short term partial fix for issue #543 (Context length exceeded)
+            # If there are more than one chunks returned by ParseNode we just use the first one
+            # on the basis that the structure of the remainder of the HTML page is probably
+            # very similar to the first chunk therefore the generated script should still work.
+            # The better fix is to generate multiple scripts then use the LLM to merge them.
+
+            #raise NotImplementedError(
+            #    "Currently GenerateScraperNode cannot handle more than 1 context chunks"
+            #)
+            self.logger.warn(f"Warning: {self.node_name} Node provided with {len(doc)} chunks but can only "
+                             "support 1, ignoring remaining chunks")
+            doc = [doc[0]]
+            template = TEMPLATE_NO_CHUNKS
         else:
             template = TEMPLATE_NO_CHUNKS
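The new comment block flags a longer-term fix: generate one script per chunk, then have the LLM merge them. A minimal sketch of that idea, assuming only a generic `llm` callable that maps a prompt string to a completion; none of the names or prompts below are ScrapeGraphAI APIs, and the real node would use its own templates:

```python
from typing import Callable, List


def generate_merged_script(llm: Callable[[str], str], chunks: List[str], user_prompt: str) -> str:
    """Sketch: generate one scraping script per chunk, then ask the LLM to merge them."""
    partial_scripts = [
        llm(
            f"Write a Python scraping script that satisfies: {user_prompt}\n\n"
            f"Base it only on this HTML fragment:\n{chunk_text}"
        )
        for chunk_text in chunks
    ]
    if len(partial_scripts) == 1:
        return partial_scripts[0]

    joined = "\n\n# --- next partial script ---\n\n".join(partial_scripts)
    return llm(
        "Merge the following partial scraping scripts into one script that "
        "handles the whole page, removing duplicated code:\n\n" + joined
    )
```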

scrapegraphai/nodes/parse_node.py

Lines changed: 8 additions & 5 deletions
@@ -40,6 +40,7 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
+        self.llm_model = node_config['llm_model']

     def execute(self, state: dict) -> dict:
         """
@@ -64,31 +65,33 @@ def execute(self, state: dict) -> dict:
         input_data = [state[key] for key in input_keys]
         docs_transformed = input_data[0]

+        def count_tokens(text):
+            from ..utils import token_count
+            return token_count(text, self.llm_model.model_name)
+
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]

             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                           token_counter=lambda text: len(text.split()),
+                           token_counter=count_tokens,
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                                chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
+                               token_counter=count_tokens,
                                memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                chunk_size=chunk_size,
-                               token_counter=lambda text: len(text.split()),
+                               token_counter=count_tokens,
                                memoize=False)

         state.update({self.output[0]: chunks})
-
         return state
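The switch away from `len(text.split())` matters because HTML packs many model tokens into few whitespace-separated words, so the old counter undercounted chunk sizes badly. A standalone comparison under assumptions: the helper mirrors what `token_count` does, the model name is only an example, and `tiktoken` must be recent enough to recognize it:

```python
import tiktoken


def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """Count tokens with the target model's own encoding, as the new counter does."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


# Illustrative markup-heavy input: few whitespace words, many tokens.
html = "<tr><td class='price'>$1,299.99</td><td class='sku'>AB-42</td></tr>" * 40
print("whitespace 'tokens':", len(html.split()))   # counts only space-separated words
print("model tokens:       ", count_tokens(html))  # what actually fills the context window
```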

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@
 from .cleanup_html import cleanup_html
 from .logging import *
 from .convert_to_md import convert_to_md
+from .token_calculator import *

scrapegraphai/utils/token_calculator.py

Lines changed: 28 additions & 4 deletions
@@ -6,27 +6,26 @@
 from ..helpers.models_tokens import models_tokens


-def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
+def truncate_text_tokens(text: str, model: str) -> List[str]:
     """
     Truncates text into chunks that are small enough to be processed by specified llm models.

     Args:
         text (str): The input text to be truncated.
         model (str): The name of the llm model to determine the maximum token limit.
-        encoding_name (str): The encoding strategy used to encode the text before truncation.

     Returns:
         List[str]: A list of text chunks, each within the token limit of the specified model.

     Example:
-        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
+        >>> truncate_text_tokens("This is a sample text for truncation.", "gpt-4o-mini")
         ["This is a sample text", "for truncation."]

     This function ensures that each chunk of text can be tokenized
     by the specified model without exceeding the model's token limit.
     """

-    encoding = tiktoken.get_encoding(encoding_name)
+    encoding = tiktoken.encoding_for_model(model)
     max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9))
     encoded_text = encoding.encode(text)

@@ -36,3 +35,28 @@ def truncate_text_tokens(text: str, model: str) -> List[str]
     result = [encoding.decode(chunk) for chunk in chunks]

     return result
+
+
+def token_count(text: str, model: str) -> List[str]:
+    """
+    Return the number of tokens within the text, based on the encoding of the specified model.
+
+    Args:
+        text (str): The input text to be counted.
+        model (str): The name of the llm model to determine the encoding.
+
+    Returns:
+        int: Number of tokens.
+
+    Example:
+        >>> token_count("This is a sample text for counting.", "gpt-4o-mini")
+        9
+
+
+    This function ensures that each chunk of text can be tokenized
+    by the specified model without exceeding the model's token limit.
+    """
+
+    encoding = tiktoken.encoding_for_model(model)
+    num_tokens = len(encoding.encode(text))
+
+    return num_tokens
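With the `__init__.py` change above, the new utility is importable from the package. A small usage sketch, with assumptions: the model name is only an example, `tiktoken` must recognize it, and the exact token count depends on that encoding:

```python
from functools import partial

from scrapegraphai.utils import token_count

text = "ScrapeGraphAI turns a natural-language prompt into a scraping pipeline."
print(token_count(text, "gpt-4o-mini"))  # number of tokens under that model's encoding

# Adapted into the single-argument counter shape that ParseNode passes to its chunker:
counter = partial(token_count, model="gpt-4o-mini")
print(counter(text))
```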
