Skip to content

Commit 77ac67b

Browse files
authored
Fix UnknownRemoteOperation for Database instrumentation. (#50)
Description of changes: When `"db.operation"` is not provided in the span attributes, `"aws.remote.operation"` is set to `UnknownRemoteOperation`. This PR updates the `_set_remote_service_and_operation` function to parse the `"db.statement"` attribute when `"db.operation"` is missing and to extract a valid `"aws.remote.operation"` from it (we define a fixed set of valid database-related `"aws.remote.operation"` values for security reasons): 1. Add a JSON file, `configuration/sql_dialect_keywords.json`, containing all the valid keywords. In this file the order of the keywords matters: longer keywords are placed towards the front of the list so that they are matched first. 2. We take only the first 27 characters of `"db.statement"` to avoid processing very large statements, and use a regular expression to match the keyword. The expression is anchored at the beginning of the string; if the start of the string does not conform to the regular expression, the match fails. 3. Add a unit test for how `_set_remote_service_and_operation` performs when `_DB_STATEMENT` is/is not present and is/is not valid. 4. Add unit tests covering different `"db.statement"` cases: a. Only one valid keyword matches b. More than one valid keyword matches; we want to pick the longest match c. More than one valid keyword matches, but the other keywords are not at the start of SpanAttributes.DB_STATEMENT; we want to pick only the match at the start d. No valid match e. A valid keyword is present but it is not at the start of SpanAttributes.DB_STATEMENT f. Valid keywords are present; match the longest one g. Valid keywords are present; match the first word h. Valid keywords are present; match regardless of upper/lower case 5. Add unit tests for the keyword order in the JSON file: a. Confirm the keywords are sorted in descending order of character length b. Confirm the maximum keyword length does not exceed MAX_KEYWORD_LENGTH 6. 
Exclude `configuration/sql_dialect_keywords.json` from the codespell check because the keywords are fixed SQL terms, and codespell does not recognize comments in JSON. Testing: Tested by deploying the code changes to EKS and nodes, where we can see the actual operation instead of `UnknownRemoteOperation`: ![Screenshot 2024-02-13 at 6 21 34 PM](https://github.com/aws-observability/aws-otel-python-instrumentation/assets/146124015/cd4cb2d5-f069-45c9-8487-45a709ef6e5a) By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
1 parent 07a4cb2 commit 77ac67b

File tree

8 files changed

+1005
-5
lines changed

8 files changed

+1005
-5
lines changed

.codespellrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[codespell]
22
# skipping auto generated folders
3-
skip = ./.tox,./.mypy_cache,./target,*/LICENSE,./venv
3+
skip = ./.tox,./.mypy_cache,./target,*/LICENSE,./venv,*/sql_dialect_keywords.json
44
ignore-words-list = ot

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/_aws_metric_attribute_generator.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
3+
import re
34
from logging import DEBUG, Logger, getLogger
4-
from typing import Optional
5+
from typing import Match, Optional
56
from urllib.parse import ParseResult, urlparse
67

78
from amazon.opentelemetry.distro._aws_attribute_keys import (
@@ -14,6 +15,8 @@
1415
)
1516
from amazon.opentelemetry.distro._aws_span_processing_util import (
1617
LOCAL_ROOT,
18+
MAX_KEYWORD_LENGTH,
19+
SQL_KEYWORD_PATTERN,
1720
UNKNOWN_OPERATION,
1821
UNKNOWN_REMOTE_OPERATION,
1922
UNKNOWN_REMOTE_SERVICE,
@@ -38,6 +41,7 @@
3841
# Pertinent OTEL attribute keys
3942
_SERVICE_NAME: str = ResourceAttributes.SERVICE_NAME
4043
_DB_OPERATION: str = SpanAttributes.DB_OPERATION
44+
_DB_STATEMENT: str = SpanAttributes.DB_STATEMENT
4145
_DB_SYSTEM: str = SpanAttributes.DB_SYSTEM
4246
_FAAS_INVOKED_NAME: str = SpanAttributes.FAAS_INVOKED_NAME
4347
_FAAS_TRIGGER: str = SpanAttributes.FAAS_TRIGGER
@@ -189,9 +193,12 @@ def _set_remote_service_and_operation(span: ReadableSpan, attributes: BoundedAtt
189193
elif is_key_present(span, _RPC_SERVICE) or is_key_present(span, _RPC_METHOD):
190194
remote_service = _normalize_service_name(span, _get_remote_service(span, _RPC_SERVICE))
191195
remote_operation = _get_remote_operation(span, _RPC_METHOD)
192-
elif is_key_present(span, _DB_SYSTEM) or is_key_present(span, _DB_OPERATION):
196+
elif is_key_present(span, _DB_SYSTEM) or is_key_present(span, _DB_OPERATION) or is_key_present(span, _DB_STATEMENT):
193197
remote_service = _get_remote_service(span, _DB_SYSTEM)
194-
remote_operation = _get_remote_operation(span, _DB_OPERATION)
198+
if is_key_present(span, _DB_OPERATION):
199+
remote_operation = _get_remote_operation(span, _DB_OPERATION)
200+
else:
201+
remote_operation = _get_db_statement_remote_operation(span, _DB_STATEMENT)
195202
elif is_key_present(span, _FAAS_INVOKED_NAME) or is_key_present(span, _FAAS_TRIGGER):
196203
remote_service = _get_remote_service(span, _FAAS_INVOKED_NAME)
197204
remote_operation = _get_remote_operation(span, _FAAS_TRIGGER)
@@ -232,6 +239,28 @@ def _get_remote_operation(span: ReadableSpan, remote_operation_key: str) -> str:
232239
return remote_operation
233240

234241

242+
def _get_db_statement_remote_operation(span: ReadableSpan, statement_key: str) -> str:
    """
    Derive a remote operation from the SQL statement stored on the span.

    This is a best-effort fallback for spans that carry no db.operation attribute:
    the beginning of the statement is compared against the known SQL keywords
    described by SQL_KEYWORD_PATTERN, and the keyword that matches becomes the
    remote operation.

    Returns the matched keyword in upper case, or UNKNOWN_REMOTE_OPERATION when the
    attribute is missing, is not a string, or does not start with a known keyword.
    """
    statement = span.attributes.get(statement_key)

    # A missing attribute yields None; any other non-string value cannot be parsed either.
    if not isinstance(statement, str):
        return UNKNOWN_REMOTE_OPERATION

    # Only the head of the statement (after leading whitespace/newlines) is examined, so very
    # large statements stay cheap to process. One character beyond MAX_KEYWORD_LENGTH is kept
    # on purpose: the pattern ends with a word boundary, and cutting exactly at
    # MAX_KEYWORD_LENGTH would let that boundary match at the artificial end of the slice even
    # when the real statement continues with a word character right after the keyword.
    statement_head: str = statement.lstrip()[: MAX_KEYWORD_LENGTH + 1].upper()
    match: Optional[Match[str]] = re.match(SQL_KEYWORD_PATTERN, statement_head)

    return match.group(0) if match else UNKNOWN_REMOTE_OPERATION
262+
263+
235264
def _normalize_service_name(span: ReadableSpan, service_name: str) -> str:
236265
"""
237266
TODO: Determine if problems in Java instrumentation are relevant here. Do we need normalization? If so, probably we

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/_aws_span_processing_util.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
33
"""Utility module designed to support shared logic across AWS Span Processors."""
4+
import json
5+
import os
6+
from typing import Dict, List
7+
48
from amazon.opentelemetry.distro._aws_attribute_keys import AWS_CONSUMER_PARENT_SPAN_KIND, AWS_LOCAL_OPERATION
59
from opentelemetry.sdk.trace import InstrumentationScope, ReadableSpan
610
from opentelemetry.semconv.trace import MessagingOperationValues, SpanAttributes
@@ -18,6 +22,23 @@
1822
_SQS_RECEIVE_MESSAGE_SPAN_NAME: str = "Sqs.ReceiveMessage"
1923
_AWS_SDK_INSTRUMENTATION_SCOPE_PREFIX: str = "io.opentelemetry.aws-sdk-"
2024

25+
# Max keyword length supported by parsing into remote_operation from DB_STATEMENT
26+
MAX_KEYWORD_LENGTH = 27
27+
28+
29+
# Get dialect keywords retrieved from dialect_keywords.json file.
30+
# Only meant to be invoked by SQL_KEYWORD_PATTERN and unit tests
31+
def _get_dialect_keywords() -> List[str]:
32+
current_dir: str = os.path.dirname(__file__)
33+
file_path: str = os.path.join(current_dir, "configuration/sql_dialect_keywords.json")
34+
with open(file_path, "r", encoding="utf-8") as json_file:
35+
keywords_data: Dict[str, str] = json.load(json_file)
36+
return keywords_data["keywords"]
37+
38+
39+
# A regular expression pattern to match SQL keywords.
40+
# Built once at import time from the keywords file (this reads the JSON file on module import).
# The pattern is anchored at the start of the input ("^") and must end on a word boundary ("\b"),
# so a keyword only matches at the very beginning of the input and never as a prefix of a longer
# word. Alternatives are tried left to right, in the order returned by _get_dialect_keywords().
SQL_KEYWORD_PATTERN = r"^(?:" + "|".join(_get_dialect_keywords()) + r")\b"
41+
2142

2243
def get_ingress_operation(__, span: ReadableSpan) -> str:
2344
"""

0 commit comments

Comments
 (0)