Skip to content

Commit

Permalink
fix(eap): Correctly map eap columns to sentry prefixed keys (getsentry#79793)
Browse files Browse the repository at this point in the history

The old data is going to be deleted soon, so stop coalescing the
prefixed and unprefixed versions together.
  • Loading branch information
Zylphrex authored Oct 28, 2024
1 parent 34fc1c2 commit 3d1b0ad
Show file tree
Hide file tree
Showing 4 changed files with 235 additions and 100 deletions.
50 changes: 5 additions & 45 deletions src/sentry/search/events/builder/spans_indexed.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,67 +83,27 @@ def resolve_field(self, raw_field: str, alias: bool = False) -> Column:
# attr field is less permissive than tags, we can't have - in them
or "-" in field
):
# Temporary until at least after 22 Dec 2024 when old data rotates out, otherwise we should just call super
# here and return default_field without any extra work
default_field = super().resolve_field(raw_field, alias)
if (
isinstance(default_field, Column)
and default_field.subscriptable == "attr_str"
or isinstance(default_field, AliasedExpression)
and default_field.exp.subscriptable == "attr_str"
):
key = (
default_field.key
if isinstance(default_field, Column)
else default_field.exp.key
)
unprefixed_field = Column(f"attr_str[{key}]")
prefixed_field = Column(f"attr_str[sentry.{key}]")
return Function(
"if",
[
Function("mapContains", [Column("attr_str"), key]),
unprefixed_field,
prefixed_field,
],
raw_field if alias else None,
)
else:
return default_field
return super().resolve_field(raw_field, alias)

if field_type not in ["number", "string"]:
raise InvalidSearchQuery(
f"Unknown type for field {raw_field}, only string and number are supported"
)

if field_type == "string":
attr_type = "attr_str"
field_col = Column(f"attr_str[{field}]")
else:
attr_type = "attr_num"
field_col = Column(f"attr_num[{field}]")

if alias:
field_alias = f"tags_{field}@{field_type}"

self.typed_tag_to_alias_map[raw_field] = field_alias
self.alias_to_typed_tag_map[field_alias] = raw_field
else:
field_alias = None

# Temporary until at least after 22 Dec 2024 when old data rotates out
unprefixed_field = field_col
prefixed_field = Column(f"{attr_type}[sentry.{field}]")
col = Function(
"if",
[
Function("mapContains", [Column(attr_type), field]),
unprefixed_field,
prefixed_field,
],
field_alias,
)

return col
field_col = AliasedExpression(field_col, field_alias)

return field_col


class TimeseriesSpanIndexedQueryBuilder(SpansIndexedQueryBuilderMixin, TimeseriesQueryBuilder):
Expand Down
57 changes: 17 additions & 40 deletions src/sentry/search/events/datasets/spans_indexed.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ def _resolve_span_duration(self, alias: str) -> SelectType:

def _resolve_bounded_sample(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str,
) -> SelectType:
base_condition = Function(
Expand Down Expand Up @@ -447,7 +447,7 @@ def _resolve_bounded_sample(

def _resolve_rounded_time(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str,
) -> SelectType:
start, end = self.builder.start, self.builder.end
Expand All @@ -473,7 +473,7 @@ def _resolve_rounded_time(

def _resolve_percentile(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str,
fixed_percentile: float | None = None,
) -> SelectType:
Expand All @@ -493,7 +493,7 @@ def _resolve_percentile(

def _resolve_random_samples(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str,
) -> SelectType:
offset = 0 if self.builder.offset is None else self.builder.offset.offset
Expand Down Expand Up @@ -567,9 +567,9 @@ def _resolve_span_duration(self, alias: str) -> SelectType:

def _resolve_aggregate_if(
self, aggregate: str
) -> Callable[[Mapping[str, str | Column | SelectType | int | float], str | None], SelectType]:
) -> Callable[[Mapping[str, str | SelectType | int | float], str | None], SelectType]:
def resolve_aggregate_if(
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str | None = None,
) -> SelectType:
attr = extract_attr(args["column"])
Expand Down Expand Up @@ -945,7 +945,7 @@ def field_alias_converter(self) -> Mapping[str, Callable[[str], SelectType]]:

def _resolve_sum_weighted(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str | None = None,
) -> SelectType:
attr = extract_attr(args["column"])
Expand Down Expand Up @@ -988,7 +988,7 @@ def _resolve_sum_weighted(

def _resolve_count_weighted(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str | None = None,
) -> SelectType:
attr = extract_attr(args["column"])
Expand Down Expand Up @@ -1027,7 +1027,7 @@ def _resolve_count_weighted(

def _resolve_percentile_weighted(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str,
fixed_percentile: float | None = None,
) -> SelectType:
Expand Down Expand Up @@ -1081,7 +1081,7 @@ def _zscore(self):

def _resolve_margin_of_error(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str | None = None,
) -> SelectType:
"""Calculates the Margin of error for a given value, but unfortunately basis the total count based on
Expand Down Expand Up @@ -1127,7 +1127,7 @@ def _resolve_unadjusted_margin(

def _resolve_finite_population_correction(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
total_samples: SelectType,
population_size: int | float,
) -> SelectType:
Expand All @@ -1152,7 +1152,7 @@ def _resolve_finite_population_correction(

def _resolve_lower_limit(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str,
) -> SelectType:
"""round(max(0, proportion_by_sample - margin_of_error) * total_population)"""
Expand Down Expand Up @@ -1198,7 +1198,7 @@ def _resolve_lower_limit(

def _resolve_upper_limit(
self,
args: Mapping[str, str | Column | SelectType | int | float],
args: Mapping[str, str | SelectType | int | float],
alias: str,
) -> SelectType:
"""round(max(0, proportion_by_sample + margin_of_error) * total_population)"""
Expand Down Expand Up @@ -1236,31 +1236,8 @@ def _resolve_upper_limit(


def extract_attr(
    column: str | SelectType | int | float,
) -> tuple[Column, str] | None:
    """Extract the attribute bucket and key from a resolved attr column.

    After the removal of the temporary prefixed/unprefixed coalescing, resolved
    attribute columns are plain subscriptable ``Column``s such as
    ``attr_str[sentry.op]`` or ``attr_num[ai_total_cost]``.

    Returns:
        A ``(Column(bucket), key)`` pair — e.g. ``(Column("attr_str"), "sentry.op")`` —
        when *column* is a subscripted ``attr_str``/``attr_num`` column, otherwise
        ``None`` so callers can fall back to using the value unchanged.
    """
    # Only subscripted attr columns carry an extractable (bucket, key) pair;
    # raw strings, numbers, and other expressions do not.
    if isinstance(column, Column) and column.subscriptable in {"attr_str", "attr_num"}:
        return Column(column.subscriptable), column.key
    return None
41 changes: 26 additions & 15 deletions src/sentry/utils/snuba.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def log_snuba_info(content):
"user.id": "sentry_tags[user.id]",
"user.email": "sentry_tags[user.email]",
"user.username": "sentry_tags[user.username]",
"user.ip": "sentry_tags[user.ip]",
"profile.id": "profile_id",
"cache.hit": "sentry_tags[cache.hit]",
"transaction.method": "sentry_tags[transaction.method]",
Expand Down Expand Up @@ -181,7 +182,7 @@ def log_snuba_info(content):
"project": "project_id",
"project.id": "project_id",
"project_id": "project_id",
"span.action": "attr_str[action]",
"span.action": "attr_str[sentry.action]",
# For some reason the decision was made to store description as name? It's called description everywhere else, though.
"span.description": "name",
"description": "name",
Expand All @@ -191,12 +192,12 @@ def log_snuba_info(content):
# These sample columns are for debugging only and shouldn't be used
"sampling_weight": "sampling_weight",
"sampling_factor": "sampling_factor",
"span.domain": "attr_str[domain]",
"span.group": "attr_str[group]",
"span.op": "attr_str[op]",
"span.category": "attr_str[category]",
"span.domain": "attr_str[sentry.domain]",
"span.group": "attr_str[sentry.group]",
"span.op": "attr_str[sentry.op]",
"span.category": "attr_str[sentry.category]",
"span.self_time": "exclusive_time_ms",
"span.status": "attr_str[status]",
"span.status": "attr_str[sentry.status]",
"timestamp": "timestamp",
"trace": "trace_id",
"transaction": "segment_name",
Expand All @@ -205,21 +206,30 @@ def log_snuba_info(content):
# txn event id(32 char uuid). EAP will no longer be storing this.
"transaction.id": "segment_id",
"transaction.span_id": "segment_id",
"transaction.method": "attr_str[transaction.method]",
"transaction.method": "attr_str[sentry.transaction.method]",
"is_transaction": "is_segment",
"segment.id": "segment_id",
# We should be able to delete origin.transaction and just use transaction
"origin.transaction": "segment_name",
# Copy paste, unsure if this is truth in production
"messaging.destination.name": "attr_str[messaging.destination.name]",
"messaging.message.id": "attr_str[messaging.message.id]",
"span.status_code": "attr_str[status_code]",
"replay.id": "attr_str[replay_id]",
"span.ai.pipeline.group": "attr_str[ai_pipeline_group]",
"trace.status": "attr_str[trace.status]",
"browser.name": "attr_str[browser.name]",
"messaging.destination.name": "attr_str[sentry.messaging.destination.name]",
"messaging.message.id": "attr_str[sentry.messaging.message.id]",
"span.status_code": "attr_str[sentry.status_code]",
"replay.id": "attr_str[sentry.replay_id]",
"span.ai.pipeline.group": "attr_str[sentry.ai_pipeline_group]",
"trace.status": "attr_str[sentry.trace.status]",
"browser.name": "attr_str[sentry.browser.name]",
"ai.total_tokens.used": "attr_num[ai_total_tokens_used]",
"ai.total_cost": "attr_num[ai_total_cost]",
"sdk.name": "attr_str[sentry.sdk.name]",
"release": "attr_str[release]",
"user": "attr_str[sentry.user]",
"user.id": "attr_str[sentry.user.id]",
"user.email": "attr_str[sentry.user.email]",
"user.username": "attr_str[sentry.user.username]",
"user.ip": "attr_str[sentry.user.ip]",
"user.geo.subregion": "attr_str[sentry.user.geo.subregion]",
"user.geo.country_code": "attr_str[sentry.user.geo.country_code]",
}

METRICS_SUMMARIES_COLUMN_MAP = {
Expand Down Expand Up @@ -1437,7 +1447,8 @@ def _resolve_column(col):
elif dataset == Dataset.EventsAnalyticsPlatform:
if isinstance(col, str) and col.startswith("sentry_tags["):
# Replace the first instance of sentry tags with attr str instead
return col.replace("sentry_tags", "attr_str", 1)
# And sentry tags are always prefixed with `sentry.`
return col.replace("sentry_tags[", "attr_str[sentry.", 1)
if isinstance(col, str) and col.startswith("tags["):
# Replace the first instance of tags with attr_str instead
return col.replace("tags", "attr_str", 1)
Expand Down
Loading

0 comments on commit 3d1b0ad

Please sign in to comment.