
Commit 99dc849

docs(gpt-4o-mini): added new gpt, fixed chromium lazy loading,
add documentation and metrics
1 parent b4b90b3 commit 99dc849

File tree

7 files changed: +52 / -6 lines changed


docs/source/scrapers/graph_config.rst

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ Some interesting ones are:
 - `burr_kwargs`: A dictionary with additional parameters to enable `Burr` graphical user interface.
 - `max_images`: The maximum number of images to be analyzed. Useful in `OmniScraperGraph` and `OmniSearchGraph`.
 - `cache_path`: The path where the cache files will be saved. If already exists, the cache will be loaded from this path.
-
+- `additional_info`: Add additional text to default prompts defined in the graphs.
 .. _Burr:
 
 Burr Integration
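
For context, the new `additional_info` option travels through the same graph config dictionary as the other options listed above. A minimal usage sketch, assuming the usual `SmartScraperGraph` constructor; the API key, prompt, and URL below are placeholders:

from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "api_key": "YOUR_OPENAI_API_KEY",  # placeholder
        "model": "gpt-4o-mini",            # model added by this commit
    },
    "cache_path": "./cache",               # reused if it already exists
    "additional_info": "Answer concisely and keep field names lowercase.",
}

scraper = SmartScraperGraph(
    prompt="List all article titles on the page",
    source="https://example.com/blog",
    config=graph_config,
)
print(scraper.run())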

docs/source/scrapers/telemetry.rst

Lines changed: 5 additions & 0 deletions

@@ -27,8 +27,13 @@ Additionally, the following properties are collected:
     "llm_model": llm_model_name,
     "embedder_model": embedder_model_name,
     "source_type": source_type,
+    "source": source,
     "execution_time": execution_time,
+    "prompt": prompt,
+    "schema": schema,
     "error_node": error_node_name,
+    "exception": exception,
+    "response": response,
     "total_tokens": total_tokens,
 }
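
Put together, a single `graph_execution` event collected after this change might look like the following. This is purely illustrative; every value below is invented:

event_properties = {
    "graph_name": "SmartScraperGraph",
    "source": ["https://example.com/blog"],
    "prompt": "List all article titles on the page",
    "schema": None,                    # filled when a schema is supplied
    "llm_model": "gpt-4o-mini",
    "embedder_model": None,
    "source_type": "url",
    "response": {"titles": ["..."]},
    "execution_time": 4.2,
    "error_node": None,
    "exception": None,
    "total_tokens": 1234,
}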

scrapegraphai/docloaders/chromium.py

Lines changed: 1 addition & 1 deletion

@@ -82,7 +82,7 @@ async def ascrape_playwright(self, url: str) -> str:
             context = await browser.new_context()
             await Malenia.apply_stealth(context)
             page = await context.new_page()
-            await page.goto(url)
+            await page.goto(url, wait_until="domcontentloaded")
             await page.wait_for_load_state(self.load_state)
             results = await page.content()  # Simply get the HTML content
             logger.info("Content scraped")
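
The lazy-loading fix hinges on Playwright's `wait_until` option: `"domcontentloaded"` resolves `goto()` as soon as the DOM is parsed instead of blocking on the full `load` event (all images, iframes, and so on), and the later `wait_for_load_state(self.load_state)` still lets callers opt into a stricter state. A standalone sketch of the same pattern, assuming only that Playwright is installed; the URL is a placeholder:

import asyncio
from playwright.async_api import async_playwright

async def fetch_html(url: str, load_state: str = "domcontentloaded") -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # Return as soon as the DOM is parsed; don't block on every asset.
        await page.goto(url, wait_until="domcontentloaded")
        # Callers can still wait for a stricter state afterwards.
        await page.wait_for_load_state(load_state)
        html = await page.content()
        await browser.close()
        return html

print(asyncio.run(fetch_html("https://example.com"))[:200])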

scrapegraphai/graphs/base_graph.py

Lines changed: 34 additions & 1 deletion

@@ -106,18 +106,32 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
         source_type = None
         llm_model = None
         embedder_model = None
+        source = []
+        prompt = None
+        schema = None

         while current_node_name:
             curr_time = time.time()
             current_node = next(node for node in self.nodes if node.node_name == current_node_name)

+
             # check if there is a "source" key in the node config
             if current_node.__class__.__name__ == "FetchNode":
                 # get the second key name of the state dictionary
                 source_type = list(state.keys())[1]
+                if state.get("user_prompt", None):
+                    prompt = state["user_prompt"] if type(state["user_prompt"]) == str else None
                 # quick fix for local_dir source type
                 if source_type == "local_dir":
                     source_type = "html_dir"
+                elif source_type == "url":
+                    if type(state[source_type]) == list:
+                        # iterate through the list of urls and see if they are strings
+                        for url in state[source_type]:
+                            if type(url) == str:
+                                source.append(url)
+                    elif type(state[source_type]) == str:
+                        source.append(state[source_type])

             # check if there is an "llm_model" variable in the class
             if hasattr(current_node, "llm_model") and llm_model is None:

@@ -135,6 +149,16 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
                 elif hasattr(embedder_model, "model"):
                     embedder_model = embedder_model.model

+            if hasattr(current_node, "node_config"):
+                if type(current_node.node_config) is dict:
+                    if current_node.node_config.get("schema", None) and schema is None:
+                        if type(current_node.node_config["schema"]) is not dict:
+                            # convert to dict
+                            try:
+                                schema = current_node.node_config["schema"].schema()
+                            except Exception as e:
+                                schema = None
+
             with get_openai_callback() as cb:
                 try:
                     result = current_node.execute(state)

@@ -144,11 +168,15 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
                     graph_execution_time = time.time() - start_time
                     log_graph_execution(
                         graph_name=self.graph_name,
+                        source=source,
+                        prompt=prompt,
+                        schema=schema,
                         llm_model=llm_model,
                         embedder_model=embedder_model,
                         source_type=source_type,
                         execution_time=graph_execution_time,
-                        error_node=error_node
+                        error_node=error_node,
+                        exception=str(e)
                     )
                     raise e
             node_exec_time = time.time() - curr_time

@@ -191,11 +219,16 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:

         # Log the graph execution telemetry
         graph_execution_time = time.time() - start_time
+        response = state.get("answer", None) if source_type == "url" else None
         log_graph_execution(
             graph_name=self.graph_name,
+            source=source,
+            prompt=prompt,
+            schema=schema,
             llm_model=llm_model,
             embedder_model=embedder_model,
             source_type=source_type,
+            response=response,
             execution_time=graph_execution_time,
             total_tokens=cb_total["total_tokens"] if cb_total["total_tokens"] > 0 else None,
         )
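
The try/except around `.schema()` exists because `node_config["schema"]` may hold a Pydantic model class rather than a plain dict; calling `.schema()` on a model class returns its JSON schema as a dict (Pydantic v1-style API, still available as a deprecated alias in v2). A hypothetical illustration, with the `Article` model invented for the example:

from pydantic import BaseModel

class Article(BaseModel):
    title: str
    url: str

node_config = {"schema": Article}  # invented config for illustration

schema = None
if not isinstance(node_config["schema"], dict):
    try:
        schema = node_config["schema"].schema()  # JSON schema as a plain dict
    except Exception:
        schema = None

print(schema["title"], sorted(schema["properties"]))  # Article ['title', 'url']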

scrapegraphai/helpers/models_tokens.py

Lines changed: 3 additions & 1 deletion

@@ -16,9 +16,10 @@
         "gpt-4-32k": 32768,
         "gpt-4-32k-0613": 32768,
         "gpt-4o": 128000,
+        "gpt-4o-mini":128000,
     },
     "azure": {
-        "gpt-3.5-turbo-0125": 16385,
+        "gpt-3.5-turbo-0125": 16385,
         "gpt-3.5": 4096,
         "gpt-3.5-turbo": 16385,
         "gpt-3.5-turbo-1106": 16385,

@@ -34,6 +35,7 @@
         "gpt-4-32k": 32768,
         "gpt-4-32k-0613": 32768,
         "gpt-4o": 128000,
+        "gpt-4o-mini":128000,
     },
     "gemini": {
         "gemini-pro": 128000,

scrapegraphai/nodes/graph_iterator_node.py

Lines changed: 2 additions & 1 deletion

@@ -126,7 +126,8 @@ async def _async_run(graph):
         for url in urls:
             instance = copy.copy(graph_instance)
             instance.source = url
-
+            if url.startswith("http"):
+                instance.input_key = "url"
             participants.append(instance)

         futures = [_async_run(graph) for graph in participants]
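
The effect of the new check is that each cloned graph instance switches its input key to "url" only for genuine web URLs, while local paths keep the existing key. A simplified stand-in sketch; `DummyGraph` and its default input key are invented, since the real node clones full graph instances:

import copy

class DummyGraph:
    def __init__(self):
        self.source = None
        self.input_key = "local_dir"  # invented default for illustration

graph_instance = DummyGraph()
urls = ["https://example.com/a", "page.html"]

participants = []
for url in urls:
    instance = copy.copy(graph_instance)
    instance.source = url
    if url.startswith("http"):  # only real web URLs switch the input key
        instance.input_key = "url"
    participants.append(instance)

print([(p.source, p.input_key) for p in participants])
# [('https://example.com/a', 'url'), ('page.html', 'local_dir')]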

scrapegraphai/telemetry/telemetry.py

Lines changed: 6 additions & 1 deletion

@@ -156,14 +156,19 @@ def log_event(event: str, properties: Dict[str, any]):
     send_event_json(event_json)


-def log_graph_execution(graph_name: str, llm_model: str, embedder_model: str, source_type: str, execution_time: float, error_node: str = None, total_tokens: int = None):
+def log_graph_execution(graph_name: str, source: str, prompt:str, schema:dict, llm_model: str, embedder_model: str, source_type: str, execution_time: float, response: dict = None, error_node: str = None, exception: str = None, total_tokens: int = None):
     properties = {
         "graph_name": graph_name,
+        "source": source,
+        "prompt": prompt,
+        "schema": schema,
         "llm_model": llm_model,
         "embedder_model": embedder_model,
         "source_type": source_type,
+        "response": response,
         "execution_time": execution_time,
         "error_node": error_node,
+        "exception": exception,
         "total_tokens": total_tokens,
     }
     log_event("graph_execution", properties)
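
A call matching the widened signature might look like this. All values are invented, and the import path assumes the function is reachable from the telemetry module:

from scrapegraphai.telemetry.telemetry import log_graph_execution

log_graph_execution(
    graph_name="SmartScraperGraph",
    source=["https://example.com"],
    prompt="List all article titles",
    schema=None,
    llm_model="gpt-4o-mini",
    embedder_model=None,
    source_type="url",
    execution_time=3.7,
    response={"titles": ["..."]},
    error_node=None,
    exception=None,
    total_tokens=1500,
)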
