|
| 1 | +""" |
| 2 | +BlocksIndentifier Module |
| 3 | +""" |
| 4 | + |
| 5 | +from typing import List, Optional |
| 6 | +from langchain_community.document_loaders import AsyncChromiumLoader |
| 7 | +from langchain_core.documents import Document |
| 8 | +from .base_node import BaseNode |
| 9 | + |
| 10 | + |
| 11 | + |
class BlocksIndentifier(BaseNode):
    """
    A node meant to identify the repeated blocks in a page's HTML content,
    e.g. products on an e-commerce site or flights on a travel website.

    NOTE(review): the block-identification step itself is not implemented in
    this class yet; `execute` currently only resolves its inputs and returns
    the state unchanged.

    Attributes:
        headless (bool): A flag indicating whether the browser should run in headless mode.
            It is stored here but not read anywhere else in this class.
        verbose (bool): A flag indicating whether to print verbose output during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (Optional[dict]): Additional configuration for the node. Recognised
            keys are "headless" and "verbose". Defaults to None.
        node_name (str): The unique identifier name for the node, defaulting to
            "BlocksIndentifier".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "BlocksIndentifier",
    ):
        # The trailing 1 is forwarded to BaseNode as-is; presumably the minimum
        # number of input keys -- TODO confirm against BaseNode.__init__.
        super().__init__(node_name, "node", input, output, 1)

        if node_config is None:
            self.headless = True
            # NOTE(review): verbose is True when no config is given, but False
            # when a config is given without a "verbose" key -- confirm that
            # this asymmetry is intended.
            self.verbose = True
        else:
            self.headless = node_config.get("headless", True)
            self.verbose = node_config.get("verbose", False)

    def execute(self, state: dict) -> dict:
        """
        Executes the node's logic, characterized by a pre-processing of the HTML content
        and subsequent identification of the blocks in the HTML content.

        Only the input resolution is implemented at the moment: the input keys are
        resolved, their values are read from the state, and the state is handed back
        without modification.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                to fetch the correct data types from the state.

        Returns:
            dict: The same state object that was passed in, currently unmodified.

        Raises:
            KeyError: If a resolved input key is not found in the state, indicating that
                the necessary information to perform the operation is missing.
        """
        if self.verbose:
            print(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys. The values are not
        # consumed yet, but the lookup surfaces a missing key as a KeyError early.
        input_data = [state[key] for key in input_keys]  # noqa: F841

        # Hand the state back so the caller always receives a dict, as documented,
        # instead of the implicit None the method used to return.
        return state
0 commit comments