Commit 227a0e7

refactor: use comment field in annotated to pass metric-related information (#1385)
1 parent 3363d8b

8 files changed: +60, -59 lines changed

lib/engines/mistralrs/src/lib.rs

Lines changed: 0 additions & 6 deletions
@@ -418,9 +418,6 @@ impl
             id: None,
             data: Some(delta),
             event: None,
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: None,
         };
         yield ann;
@@ -585,9 +582,6 @@ impl AsyncEngine<SingleIn<NvCreateCompletionRequest>, ManyOut<Annotated<Completi
             id: None,
             data: Some(inner),
             event: None,
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: None,
         };
         yield ann;

lib/llm/src/engines.rs

Lines changed: 4 additions & 4 deletions
@@ -202,15 +202,15 @@ impl
             let response = NvCreateChatCompletionStreamResponse {
                 inner,
             };
-            yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, chunk_tokens: None, input_tokens: None, output_tokens: None, comment: None };
+            yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, comment: None };
             id += 1;
         }

         let inner = deltas.create_choice(0, None, Some(async_openai::types::FinishReason::Stop), None);
         let response = NvCreateChatCompletionStreamResponse {
             inner,
         };
-        yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, chunk_tokens: None, input_tokens: None, output_tokens: None, comment: None };
+        yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };
     };

     Ok(ResponseStream::new(Box::pin(output), ctx))
@@ -234,11 +234,11 @@ impl AsyncEngine<SingleIn<NvCreateCompletionRequest>, ManyOut<Annotated<Completi
         for c in chars_string.chars() {
            tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
            let response = deltas.create_choice(0, Some(c.to_string()), None);
-           yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, chunk_tokens: None, input_tokens: None, output_tokens: None, comment: None };
+           yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, comment: None };
            id += 1;
        }
        let response = deltas.create_choice(0, None, Some("stop".to_string()));
-       yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, chunk_tokens: None, input_tokens: None, output_tokens: None, comment: None };
+       yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };

    };

lib/llm/src/http/service/openai.rs

Lines changed: 7 additions & 10 deletions
@@ -27,6 +27,7 @@ use super::{
     service_v2, RouteDoc,
 };

+use crate::preprocessor::LLMMetricAnnotation;
 use crate::protocols::openai::embeddings::{NvCreateEmbeddingRequest, NvCreateEmbeddingResponse};
 use crate::protocols::openai::{
     chat_completions::NvCreateChatCompletionResponse, completions::CompletionResponse,
@@ -500,6 +501,12 @@ fn process_event_converter<T: Serialize>(
 ) -> Result<Event, axum::Error> {
     let annotated = annotated.0;

+    // update metrics
+    if let Ok(Some(metrics)) = LLMMetricAnnotation::from_annotation(&annotated) {
+        response_collector.observe_current_osl(metrics.output_tokens);
+        response_collector.observe_response(metrics.input_tokens, metrics.chunk_tokens);
+    }
+
     let mut event = Event::default();

     if let Some(data) = annotated.data {
@@ -516,16 +523,6 @@
             event = event.event(msg);
         }

-        if let Some(osl) = annotated.output_tokens {
-            response_collector.observe_current_osl(osl);
-        }
-
-        if let Some(isl) = annotated.input_tokens {
-            if let Some(chunk_tokens) = annotated.chunk_tokens {
-                response_collector.observe_response(isl, chunk_tokens);
-            }
-        }
-
     if let Some(comments) = annotated.comment {
         for comment in comments {
             event = event.comment(comment);
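With this change a metrics-bearing chunk is an ordinary annotation event, so the HTTP layer recovers everything from the event name and the comment payload. Below is a minimal, self-contained sketch of that decoding; AnnotatedChunk and LlmMetrics are stand-in types for illustration, not the crate's own structs.

// Illustrative only: mirrors the lookup done in process_event_converter above.
use serde::Deserialize;

#[derive(Deserialize)]
struct LlmMetrics {
    input_tokens: usize,
    output_tokens: usize,
    chunk_tokens: usize,
}

struct AnnotatedChunk {
    event: Option<String>,
    comment: Option<Vec<String>>,
}

fn metrics_from_chunk(chunk: &AnnotatedChunk) -> Option<LlmMetrics> {
    // Only chunks tagged with the "llm_metrics" event carry metrics.
    if chunk.event.as_deref() != Some("llm_metrics") {
        return None;
    }
    // The metrics struct rides as a single JSON string in the comment vector.
    let comments = chunk.comment.as_ref()?;
    serde_json::from_str(comments.first()?).ok()
}

fn main() {
    let chunk = AnnotatedChunk {
        event: Some("llm_metrics".to_string()),
        comment: Some(vec![
            r#"{"input_tokens":12,"output_tokens":3,"chunk_tokens":1}"#.to_string(),
        ]),
    };
    let m = metrics_from_chunk(&chunk).expect("metrics present");
    println!("isl={} osl={} chunk={}", m.input_tokens, m.output_tokens, m.chunk_tokens);
}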

lib/llm/src/preprocessor.rs

Lines changed: 49 additions & 3 deletions
@@ -59,6 +59,41 @@ pub use crate::protocols::common::llm_backend::{BackendOutput, PreprocessedReque

 pub const ANNOTATION_FORMATTED_PROMPT: &str = "formatted_prompt";
 pub const ANNOTATION_TOKEN_IDS: &str = "token_ids";
+pub const ANNOTATION_LLM_METRICS: &str = "llm_metrics";
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct LLMMetricAnnotation {
+    pub input_tokens: usize,
+    pub output_tokens: usize,
+    pub chunk_tokens: usize,
+}
+
+impl LLMMetricAnnotation {
+    /// Convert this metrics struct to an Annotated event
+    pub fn to_annotation<T>(&self) -> Result<Annotated<T>, serde_json::Error> {
+        Annotated::from_annotation(ANNOTATION_LLM_METRICS, self)
+    }
+
+    /// Extract LLM metrics from an Annotated event, if present
+    pub fn from_annotation<T>(
+        annotation: &Annotated<T>,
+    ) -> Result<Option<LLMMetricAnnotation>, Box<dyn std::error::Error>> {
+        if annotation.event.is_none() {
+            return Ok(None);
+        }
+        if annotation.event.as_ref().unwrap() != ANNOTATION_LLM_METRICS {
+            return Ok(None);
+        }
+        let comments = annotation
+            .comment
+            .as_ref()
+            .ok_or("missing comments block")?;
+        if comments.len() != 1 {
+            return Err("malformed comments block - expected exactly 1 comment".into());
+        }
+        let metrics: LLMMetricAnnotation = serde_json::from_str(&comments[0])?;
+        Ok(Some(metrics))
+    }
+}

 pub struct OpenAIPreprocessor {
     mdcsum: String,
@@ -251,9 +286,20 @@ impl OpenAIPreprocessor {
                 .map_err(|e| e.to_string())
             });

-            response.chunk_tokens = Some(chunk_tokens);
-            response.input_tokens = Some(isl);
-            response.output_tokens = Some(current_osl);
+            // Create LLM metrics annotation
+            let llm_metrics = LLMMetricAnnotation {
+                input_tokens: isl,
+                output_tokens: current_osl,
+                chunk_tokens,
+            };
+
+            if let Ok(metrics_annotated) = llm_metrics.to_annotation::<()>() {
+                // Only set event if not already set to avoid overriding existing events (like errors)
+                if response.event.is_none() {
+                    response.event = metrics_annotated.event;
+                }
+                response.comment = metrics_annotated.comment;
+            }

            tracing::trace!(
                request_id = inner.context.id(),
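The new struct round-trips through the generic Annotated envelope instead of dedicated fields. A short usage sketch built only on the helpers added above; the crate paths dynamo_llm and dynamo_runtime are assumptions and may differ in the repository.

// Assumed crate paths; only the LLMMetricAnnotation API shown in this diff is relied on.
use dynamo_llm::preprocessor::LLMMetricAnnotation;
use dynamo_runtime::protocols::annotated::Annotated;

fn round_trip() -> Result<(), Box<dyn std::error::Error>> {
    let metrics = LLMMetricAnnotation { input_tokens: 12, output_tokens: 3, chunk_tokens: 1 };

    // Producer side (preprocessor): metrics become an event name plus one JSON comment.
    let annotated: Annotated<()> = metrics.to_annotation()?;
    assert_eq!(annotated.event.as_deref(), Some("llm_metrics"));

    // Consumer side (HTTP service): recover the struct from the same envelope.
    let decoded = LLMMetricAnnotation::from_annotation(&annotated)?
        .expect("llm_metrics event should carry metrics");
    assert_eq!(decoded.input_tokens, 12);
    assert_eq!(decoded.chunk_tokens, 1);
    Ok(())
}

One apparent upshot of this design is that further per-chunk metadata can be added as new named annotations without touching the runtime's Annotated type again.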

lib/llm/src/protocols/codec.rs

Lines changed: 0 additions & 3 deletions
@@ -118,9 +118,6 @@ where
             data,
             id: value.id,
             event: value.event,
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: value.comments,
         })
     }

lib/llm/src/protocols/openai/chat_completions/aggregator.rs

Lines changed: 0 additions & 6 deletions
@@ -284,9 +284,6 @@ mod tests {
             data: Some(data),
             id: Some("test_id".to_string()),
             event: None,
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: None,
         }
     }
@@ -430,9 +427,6 @@ mod tests {
             data: Some(data),
             id: Some("test_id".to_string()),
             event: None,
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: None,
         };
         let stream = Box::pin(stream::iter(vec![annotated_delta]));

lib/llm/src/protocols/openai/completions/aggregator.rs

Lines changed: 0 additions & 6 deletions
@@ -205,9 +205,6 @@ mod tests {
             }),
             id: Some("test_id".to_string()),
             event: None,
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: None,
         }
     }
@@ -317,9 +314,6 @@ mod tests {
             }),
             id: Some("test_id".to_string()),
             event: None,
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: None,
         };

lib/runtime/src/protocols/annotated.rs

Lines changed: 0 additions & 21 deletions
@@ -37,12 +37,6 @@ pub struct Annotated<R> {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub event: Option<String>,
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub chunk_tokens: Option<usize>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub input_tokens: Option<usize>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub output_tokens: Option<usize>,
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub comment: Option<Vec<String>>,
 }

@@ -53,9 +47,6 @@ impl<R> Annotated<R> {
             data: None,
             id: None,
             event: Some("error".to_string()),
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: Some(vec![error]),
         }
     }
@@ -66,9 +57,6 @@ impl<R> Annotated<R> {
             data: Some(data),
             id: None,
             event: None,
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: None,
         }
     }
@@ -84,9 +72,6 @@ impl<R> Annotated<R> {
             data: None,
             id: None,
             event: Some(name.into()),
-            chunk_tokens: None,
-            input_tokens: None,
-            output_tokens: None,
             comment: Some(vec![serde_json::to_string(value)?]),
         })
     }
@@ -122,9 +107,6 @@ impl<R> Annotated<R> {
             data,
             id: self.id,
             event: self.event,
-            chunk_tokens: self.chunk_tokens,
-            input_tokens: self.input_tokens,
-            output_tokens: self.output_tokens,
             comment: self.comment,
         }
     }
@@ -140,9 +122,6 @@ impl<R> Annotated<R> {
                 data,
                 id: self.id,
                 event: self.event,
-                chunk_tokens: self.chunk_tokens,
-                input_tokens: self.input_tokens,
-                output_tokens: self.output_tokens,
                 comment: self.comment,
             },
             Err(e) => Annotated::from_error(e),
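Pieced together from the hunks above, the envelope that remains after this commit carries only the generic fields; metric payloads ride in comment like any other named annotation. A sketch of the resulting shape follows; the data and id declarations and the derive list are inferred from how the struct is built elsewhere in this commit, not shown verbatim here.

// Inferred shape of the slimmed-down envelope (derives and the data/id fields are assumptions).
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Annotated<R> {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data: Option<R>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub event: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub comment: Option<Vec<String>>,
}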
