|
| 1 | +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | +from typing import Optional |
| 4 | + |
| 5 | +from metric_attribute_generator import MetricAttributeGenerator |
| 6 | +from typing_extensions import override |
| 7 | + |
| 8 | +from opentelemetry.context import Context |
| 9 | +from opentelemetry.metrics import Histogram |
| 10 | +from opentelemetry.sdk.resources import Resource |
| 11 | +from opentelemetry.sdk.trace import BoundedAttributes, ReadableSpan, Span, SpanProcessor, StatusCode |
| 12 | +from opentelemetry.semconv.trace import SpanAttributes |
| 13 | + |
# Span attribute key used to look up the HTTP response status code.
_HTTP_STATUS_CODE = SpanAttributes.HTTP_STATUS_CODE
# Divisor that converts a duration in nanoseconds to milliseconds.
_NANOS_TO_MILLIS: float = 1_000_000.0

# Constants for deriving error and fault metrics.
# HTTP status codes in [400, 499] are counted as errors.
_ERROR_CODE_LOWER_BOUND: int = 400
_ERROR_CODE_UPPER_BOUND: int = 499
# HTTP status codes in [500, 599] are counted as faults.
_FAULT_CODE_LOWER_BOUND: int = 500
_FAULT_CODE_UPPER_BOUND: int = 599
| 22 | + |
| 23 | + |
class AwsSpanMetricsProcessor(SpanProcessor):
    """AwsSpanMetricsProcessor is a SpanProcessor that generates metrics from spans.

    This processor will generate metrics based on span data. It depends on a MetricAttributeGenerator being provided on
    instantiation, which will provide a means to determine attributes which should be used to create metrics. A Resource
    must also be provided, which is used to generate metrics. Finally, three Histograms must be provided, which will be
    used to actually create desired metrics (see below).

    AwsSpanMetricsProcessor produces metrics for errors (e.g. HTTP 4XX status codes), faults (e.g. HTTP 5XX status
    codes), and latency (in Milliseconds). Errors and faults are recorded as 0/1 values on their histograms, while
    latency is recorded as a duration. Metrics are emitted with attributes derived from span attributes.

    For highest fidelity metrics, this processor should be coupled with the AlwaysRecordSampler, which will result in
    100% of spans being sent to the processor.
    """

    # Metric instruments
    _error_histogram: Histogram
    _fault_histogram: Histogram
    _latency_histogram: Histogram

    # Produces the metric attribute sets for a span; the resource is passed to it on every call.
    _generator: MetricAttributeGenerator
    _resource: Resource

    def __init__(
        self,
        error_histogram: Histogram,
        fault_histogram: Histogram,
        latency_histogram: Histogram,
        generator: MetricAttributeGenerator,
        resource: Resource,
    ):
        """Store the instruments, attribute generator and resource used when spans end.

        Args:
            error_histogram: Histogram that receives 1 for an error span and 0 otherwise.
            fault_histogram: Histogram that receives 1 for a fault span and 0 otherwise.
            latency_histogram: Histogram that receives the span duration in milliseconds.
            generator: Generator that derives metric attributes from a span and the resource.
            resource: Resource handed to the generator alongside every span.
        """
        self._error_histogram = error_histogram
        self._fault_histogram = fault_histogram
        self._latency_histogram = latency_histogram
        self._generator = generator
        self._resource = resource

    # pylint: disable=no-self-use
    @override
    def on_start(self, span: Span, parent_context: Optional[Context] = None) -> None:
        """Do nothing; metrics are only produced when a span ends."""
        return

    @override
    def on_end(self, span: ReadableSpan) -> None:
        """Generate metric attributes for the ended span and record metrics for each attribute set."""
        attribute_dict: dict[str, BoundedAttributes] = self._generator.generate_metric_attributes_dict_from_span(
            span, self._resource
        )
        # Iterate explicitly: map() is lazy in Python 3, so an unconsumed map object would never call
        # _record_metrics and no metrics would be emitted.
        for attributes in attribute_dict.values():
            self._record_metrics(span, attributes)

    @override
    def shutdown(self) -> None:
        """Flush on shutdown by delegating to force_flush."""
        self.force_flush()

    # pylint: disable=no-self-use
    @override
    def force_flush(self, timeout_millis: Optional[int] = None) -> bool:
        """Report success immediately; this processor does no buffering of its own.

        Args:
            timeout_millis: Accepted for interface compatibility and ignored.

        Returns:
            Always True.
        """
        return True

    def _record_metrics(self, span: ReadableSpan, attributes: BoundedAttributes) -> None:
        """Record error/fault and latency metrics for one attribute set, skipping empty sets."""
        # Only record metrics if non-empty attributes are returned.
        if len(attributes) > 0:
            self._record_error_or_fault(span, attributes)
            self._record_latency(span, attributes)

    def _record_error_or_fault(self, span: ReadableSpan, attributes: BoundedAttributes) -> None:
        """Record 0/1 values on the error and fault histograms based on HTTP status code and span status.

        The HTTP status code is read from the span attributes first and, when absent there, from the generated
        metric attributes. A code in the error range records an error; a code in the fault range records a fault.
        When the code is missing or outside both ranges, a fault is recorded only if the span status is ERROR.
        """
        # The logic to record error and fault should be kept in sync with the aws-xray exporter whenever possible except
        # for the throttle.
        # https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/exporter/awsxrayexporter/internal/translator/cause.go#L121-L160
        http_status_code: Optional[int] = span.attributes.get(_HTTP_STATUS_CODE)
        status_code: StatusCode = span.status.status_code

        if http_status_code is None:
            # Fall back to the generated metric attributes when the span itself carries no status code.
            http_status_code = attributes.get(_HTTP_STATUS_CODE)

        if _is_not_error_or_fault(http_status_code):
            # No usable HTTP status code: the span status alone decides whether this is a fault.
            if StatusCode.ERROR == status_code:
                self._error_histogram.record(0, attributes)
                self._fault_histogram.record(1, attributes)
            else:
                self._error_histogram.record(0, attributes)
                self._fault_histogram.record(0, attributes)
        elif _ERROR_CODE_LOWER_BOUND <= http_status_code <= _ERROR_CODE_UPPER_BOUND:
            self._error_histogram.record(1, attributes)
            self._fault_histogram.record(0, attributes)
        elif _FAULT_CODE_LOWER_BOUND <= http_status_code <= _FAULT_CODE_UPPER_BOUND:
            self._error_histogram.record(0, attributes)
            self._fault_histogram.record(1, attributes)

    def _record_latency(self, span: ReadableSpan, attributes: BoundedAttributes) -> None:
        """Record the span duration (end_time - start_time) on the latency histogram, in milliseconds."""
        nanos: int = span.end_time - span.start_time
        millis: float = nanos / _NANOS_TO_MILLIS
        self._latency_histogram.record(millis, attributes)
| 117 | + |
| 118 | + |
def _is_not_error_or_fault(http_status_code: int) -> bool:
    """Return True when the status code is missing or lies outside the error/fault range.

    The range spans from the lower error bound to the upper fault bound, inclusive.
    """
    # A missing status code can never classify a span as an error or a fault.
    if http_status_code is None:
        return True
    # Below the error range.
    if http_status_code < _ERROR_CODE_LOWER_BOUND:
        return True
    # Otherwise the code is only out of range when it sits above the fault range.
    return http_status_code > _FAULT_CODE_UPPER_BOUND
0 commit comments