Skip to content

docs(gpt-4o-mini): added new gpt, fixed chromium lazy loading, #477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/scrapers/graph_config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Some interesting ones are:
- `burr_kwargs`: A dictionary with additional parameters to enable `Burr` graphical user interface.
- `max_images`: The maximum number of images to be analyzed. Useful in `OmniScraperGraph` and `OmniSearchGraph`.
- `cache_path`: The path where the cache files will be saved. If already exists, the cache will be loaded from this path.

- `additional_info`: Add additional text to default prompts defined in the graphs.

.. _Burr:

Burr Integration
Expand Down
5 changes: 5 additions & 0 deletions docs/source/scrapers/telemetry.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,13 @@ Additionally, the following properties are collected:
"llm_model": llm_model_name,
"embedder_model": embedder_model_name,
"source_type": source_type,
"source": source,
"execution_time": execution_time,
"prompt": prompt,
"schema": schema,
"error_node": error_node_name,
"exception": exception,
"response": response,
"total_tokens": total_tokens,
}

Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/docloaders/chromium.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ async def ascrape_playwright(self, url: str) -> str:
context = await browser.new_context()
await Malenia.apply_stealth(context)
page = await context.new_page()
await page.goto(url)
await page.goto(url, wait_until="domcontentloaded")
await page.wait_for_load_state(self.load_state)
results = await page.content() # Simply get the HTML content
logger.info("Content scraped")
Expand Down
35 changes: 34 additions & 1 deletion scrapegraphai/graphs/base_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,32 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
source_type = None
llm_model = None
embedder_model = None
source = []
prompt = None
schema = None

while current_node_name:
curr_time = time.time()
current_node = next(node for node in self.nodes if node.node_name == current_node_name)


# check if there is a "source" key in the node config
if current_node.__class__.__name__ == "FetchNode":
# get the second key name of the state dictionary
source_type = list(state.keys())[1]
if state.get("user_prompt", None):
prompt = state["user_prompt"] if type(state["user_prompt"]) == str else None
# quick fix for local_dir source type
if source_type == "local_dir":
source_type = "html_dir"
elif source_type == "url":
if type(state[source_type]) == list:
# iterate through the list of urls and see if they are strings
for url in state[source_type]:
if type(url) == str:
source.append(url)
elif type(state[source_type]) == str:
source.append(state[source_type])

# check if there is an "llm_model" variable in the class
if hasattr(current_node, "llm_model") and llm_model is None:
Expand All @@ -135,6 +149,16 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
elif hasattr(embedder_model, "model"):
embedder_model = embedder_model.model

if hasattr(current_node, "node_config"):
if type(current_node.node_config) is dict:
if current_node.node_config.get("schema", None) and schema is None:
if type(current_node.node_config["schema"]) is not dict:
# convert to dict
try:
schema = current_node.node_config["schema"].schema()
except Exception as e:
schema = None

with get_openai_callback() as cb:
try:
result = current_node.execute(state)
Expand All @@ -144,11 +168,15 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
graph_execution_time = time.time() - start_time
log_graph_execution(
graph_name=self.graph_name,
source=source,
prompt=prompt,
schema=schema,
llm_model=llm_model,
embedder_model=embedder_model,
source_type=source_type,
execution_time=graph_execution_time,
error_node=error_node
error_node=error_node,
exception=str(e)
)
raise e
node_exec_time = time.time() - curr_time
Expand Down Expand Up @@ -191,11 +219,16 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:

# Log the graph execution telemetry
graph_execution_time = time.time() - start_time
response = state.get("answer", None) if source_type == "url" else None
log_graph_execution(
graph_name=self.graph_name,
source=source,
prompt=prompt,
schema=schema,
llm_model=llm_model,
embedder_model=embedder_model,
source_type=source_type,
response=response,
execution_time=graph_execution_time,
total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,
)
Expand Down
4 changes: 2 additions & 2 deletions scrapegraphai/helpers/models_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
"gpt-4-32k": 32768,
"gpt-4-32k-0613": 32768,
"gpt-4o": 128000,
"gpt-4o-mini": 128000,
"gpt-4o-mini":128000,

},
"azure": {
"gpt-3.5-turbo-0125": 16385,
"gpt-3.5-turbo-0125": 16385,
"gpt-3.5": 4096,
"gpt-3.5-turbo": 16385,
"gpt-3.5-turbo-1106": 16385,
Expand Down
3 changes: 2 additions & 1 deletion scrapegraphai/nodes/graph_iterator_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ async def _async_run(graph):
for url in urls:
instance = copy.copy(graph_instance)
instance.source = url

if url.startswith("http"):
instance.input_key = "url"
participants.append(instance)

futures = [_async_run(graph) for graph in participants]
Expand Down
7 changes: 6 additions & 1 deletion scrapegraphai/telemetry/telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,19 @@ def log_event(event: str, properties: Dict[str, any]):
send_event_json(event_json)


def log_graph_execution(graph_name: str, llm_model: str, embedder_model: str, source_type: str, execution_time: float, error_node: str = None, total_tokens: int = None):
def log_graph_execution(graph_name: str, source: list, prompt: str, schema: dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
    """
    Log a "graph_execution" telemetry event with the details of one graph run.

    Args:
        graph_name (str): Name of the executed graph.
        source (list): Source URLs/paths scraped during the run
            (collected as a list by the caller — see base_graph).
        prompt (str): The user prompt given to the graph, if any.
        schema (dict): Output schema supplied in the graph config, already
            converted to a plain dict by the caller; may be None.
        llm_model (str): Name of the LLM model used.
        embedder_model (str): Name of the embedder model used, if any.
        source_type (str): Kind of source scraped (e.g. "url", "html_dir").
        execution_time (float): Total wall-clock execution time in seconds.
        response (dict, optional): The graph's answer, when available.
        error_node (str, optional): Name of the node that raised, on failure.
        exception (str, optional): Stringified exception, on failure.
        total_tokens (int, optional): Total LLM tokens consumed, if tracked.
    """
    properties = {
        "graph_name": graph_name,
        "source": source,
        "prompt": prompt,
        "schema": schema,
        "llm_model": llm_model,
        "embedder_model": embedder_model,
        "source_type": source_type,
        "response": response,
        "execution_time": execution_time,
        "error_node": error_node,
        "exception": exception,
        "total_tokens": total_tokens,
    }
    # Delegate to the generic event logger; it handles opt-out and transport.
    log_event("graph_execution", properties)
Expand Down
Loading