Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## Unreleased

**Features**:

- Normalize deprecated attribute names according to `sentry-conventions` for logs and V2 spans. ([#5257](https://github.com/getsentry/relay/pull/5257))

## 25.10.0

**Features**:
Expand Down
171 changes: 168 additions & 3 deletions relay-event-normalization/src/eap/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
use chrono::{DateTime, Utc};
use relay_common::time::UnixTimestamp;
use relay_conventions::{
BROWSER_NAME, BROWSER_VERSION, OBSERVED_TIMESTAMP_NANOS, USER_GEO_CITY, USER_GEO_COUNTRY_CODE,
USER_GEO_REGION, USER_GEO_SUBDIVISION,
AttributeInfo, BROWSER_NAME, BROWSER_VERSION, OBSERVED_TIMESTAMP_NANOS, USER_GEO_CITY,
USER_GEO_COUNTRY_CODE, USER_GEO_REGION, USER_GEO_SUBDIVISION, WriteBehavior,
};
use relay_event_schema::protocol::{AttributeType, Attributes, BrowserContext, Geo};
use relay_protocol::{Annotated, ErrorKind, Value};
use relay_protocol::{Annotated, ErrorKind, Remark, RemarkType, Value};

use crate::{ClientHints, FromUserAgentInfo as _, RawUserAgentInfo};

Expand Down Expand Up @@ -132,6 +132,58 @@ pub fn normalize_user_geo(
attributes.insert_if_missing(USER_GEO_REGION, || geo.region);
}

/// Normalizes deprecated attributes according to `sentry-conventions`.
///
/// Attributes with a status of `"normalize"` will be moved to their replacement name.
/// If there is already a value present under the replacement name, it will be left alone,
/// but the deprecated attribute is removed anyway.
///
/// Attributes with a status of `"backfill"` will be copied to their replacement name if the
/// replacement name is not present. In any case, the original name is left alone.
pub fn normalize_attribute_names(attributes: &mut Annotated<Attributes>) {
normalize_attribute_names_inner(attributes, relay_conventions::attribute_info)
}

fn normalize_attribute_names_inner(
attributes: &mut Annotated<Attributes>,
attribute_info: fn(&str) -> Option<&'static AttributeInfo>,
) {
let attributes = attributes.get_or_insert_with(Default::default);
let attribute_names: Vec<_> = attributes.keys().cloned().collect();

for name in attribute_names {
let Some(attribute_info) = attribute_info(&name) else {
continue;
};

match attribute_info.write_behavior {
WriteBehavior::CurrentName => continue,
WriteBehavior::NewName(new_name) => {
let Some(old_attribute) = attributes.get_raw_mut(&name) else {
continue;
};

let new_attribute = old_attribute.clone();
old_attribute.set_value(None);
old_attribute
.meta_mut()
.add_remark(Remark::new(RemarkType::Removed, "attribute.deprecated"));
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there something better that we could add to this remark as the "rule ID"?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we'd want to differentiate here on the remarks:

  • removed
  • moved (renamed)
  • removed, not moved (because new attribute already exists)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. What is the difference between the first and the third?
  2. Which RemarkType would you use for the second?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the difference between the first and the third?

  1. it's just a deletion
  2. it was supposed to be a rename, but the target already existed so it was removed. This rename does degrade to a remove, but to better understand what it is happening, I imagine a hint that something was supposed to be moved but it degraded to a remove is helpful.

Which RemarkType would you use for the second?

Removed fits closest imo, we can further differentiate on the rule_id imo. Not sure if it's worth introducing a rename, when that is essentially a just a remove.
Alternatively, we can also only mark the new value that it was renamed.


if !attributes.contains_key(new_name) {
attributes.insert_raw(new_name.to_owned(), new_attribute);
}
}
WriteBehavior::BothNames(new_name) => {
if !attributes.contains_key(new_name)
&& let Some(current_attribute) = attributes.get_raw(&name).cloned()
{
attributes.insert_raw(new_name.to_owned(), current_attribute);
}
}
}
}
}

#[cfg(test)]
mod tests {
use relay_protocol::SerializableAnnotated;
Expand Down Expand Up @@ -464,4 +516,117 @@ mod tests {
"#,
);
}

#[test]
fn test_normalize_attributes() {
fn mock_attribute_info(name: &str) -> Option<&'static AttributeInfo> {
use relay_conventions::Pii;

match name {
"replace.empty" => Some(&AttributeInfo {
write_behavior: WriteBehavior::NewName("replaced"),
pii: Pii::Maybe,
aliases: &["replaced"],
}),
"replace.existing" => Some(&AttributeInfo {
write_behavior: WriteBehavior::NewName("not.replaced"),
pii: Pii::Maybe,
aliases: &["not.replaced"],
}),
"backfill.empty" => Some(&AttributeInfo {
write_behavior: WriteBehavior::BothNames("backfilled"),
pii: Pii::Maybe,
aliases: &["backfilled"],
}),
"backfill.existing" => Some(&AttributeInfo {
write_behavior: WriteBehavior::BothNames("not.backfilled"),
pii: Pii::Maybe,
aliases: &["not.backfilled"],
}),
_ => None,
}
}

let mut attributes = Annotated::new(Attributes::from([
(
"replace.empty".to_owned(),
Annotated::new("Should be moved".to_owned().into()),
),
(
"replace.existing".to_owned(),
Annotated::new("Should be removed".to_owned().into()),
),
(
"not.replaced".to_owned(),
Annotated::new("Should be left alone".to_owned().into()),
),
(
"backfill.empty".to_owned(),
Annotated::new("Should be copied".to_owned().into()),
),
(
"backfill.existing".to_owned(),
Annotated::new("Should be left alone".to_owned().into()),
),
(
"not.backfilled".to_owned(),
Annotated::new("Should be left alone".to_owned().into()),
),
]));

normalize_attribute_names_inner(&mut attributes, mock_attribute_info);

insta::assert_json_snapshot!(SerializableAnnotated(&attributes), @r###"
{
"backfill.empty": {
"type": "string",
"value": "Should be copied"
},
"backfill.existing": {
"type": "string",
"value": "Should be left alone"
},
"backfilled": {
"type": "string",
"value": "Should be copied"
},
"not.backfilled": {
"type": "string",
"value": "Should be left alone"
},
"not.replaced": {
"type": "string",
"value": "Should be left alone"
},
"replace.empty": null,
"replace.existing": null,
"replaced": {
"type": "string",
"value": "Should be moved"
},
"_meta": {
"replace.empty": {
"": {
"rem": [
[
"attribute.deprecated",
"x"
]
]
}
},
"replace.existing": {
"": {
"rem": [
[
"attribute.deprecated",
"x"
]
]
}
}
}
}
"###);
}
}
31 changes: 31 additions & 0 deletions relay-event-schema/src/protocol/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,24 @@ impl Attributes {
Some(&self.0.get(key)?.value()?.value.value)
}

/// Returns the annotated attribute with the given key.
pub fn get_raw<Q>(&self, key: &Q) -> Option<&Annotated<Attribute>>
where
String: Borrow<Q>,
Q: Ord + ?Sized,
{
self.0.get(key)
}

/// Mutably returns the annotated attribute with the given key.
pub fn get_raw_mut<Q>(&mut self, key: &Q) -> Option<&mut Annotated<Attribute>>
where
String: Borrow<Q>,
Q: Ord + ?Sized,
{
self.0.get_mut(key)
}

/// Inserts an attribute with the given value into this collection.
pub fn insert<K: Into<String>, V: Into<AttributeValue>>(&mut self, key: K, value: V) {
fn inner(slf: &mut Attributes, key: String, value: AttributeValue) {
Expand Down Expand Up @@ -296,6 +314,19 @@ impl Attributes {
) -> std::collections::btree_map::IterMut<'_, String, Annotated<Attribute>> {
self.0.iter_mut()
}

/// Returns an iterator over the keys in this collection.
pub fn keys(&self) -> std::collections::btree_map::Keys<'_, String, Annotated<Attribute>> {
self.0.keys()
}

/// Provides mutable access to an entry in this collection.
pub fn entry(
&mut self,
key: String,
) -> std::collections::btree_map::Entry<'_, String, Annotated<Attribute>> {
self.0.entry(key)
}
}

impl IntoIterator for Attributes {
Expand Down
1 change: 1 addition & 0 deletions relay-server/src/processing/logs/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ fn normalize_log(log: &mut Annotated<OurLog>, meta: &RequestMeta) -> Result<()>
eap::normalize_received(&mut log.attributes, meta.received_at());
eap::normalize_user_agent(&mut log.attributes, meta.user_agent(), meta.client_hints());
eap::normalize_attribute_types(&mut log.attributes);
eap::normalize_attribute_names(&mut log.attributes);

Ok(())
}
Expand Down
2 changes: 1 addition & 1 deletion relay-server/src/processing/logs/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::envelope::WithHeader;

/// Returns the calculated size of a [`OurLog`].
///
/// Unlike [`relay_ourlogs::calculate_size`], this access the already manifested byte size
/// Unlike [`relay_ourlogs::calculate_size`], this accesses the already manifested byte size
/// of the log, instead of calculating it.
///
/// When compiled with debug assertion the function asserts the presence of a manifested byte size.
Expand Down
3 changes: 2 additions & 1 deletion relay-server/src/processing/spans/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ fn normalize_span(
if let Some(span) = span.value_mut() {
eap::normalize_received(&mut span.attributes, meta.received_at());
eap::normalize_user_agent(&mut span.attributes, meta.user_agent(), meta.client_hints());
eap::normalize_attribute_types(&mut span.attributes);
eap::normalize_user_geo(&mut span.attributes, || {
meta.client_addr().and_then(|ip| geo_lookup.lookup(ip))
});
eap::normalize_attribute_types(&mut span.attributes);
eap::normalize_attribute_names(&mut span.attributes);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did we agree on an approach how to handle normalization and PII scrubbing? Was it that PII scrubbing rules need to be migrated on the sentry side? IMO the best approach would be if PII Selectors would match on both field names, but that might be a stretch for this PR.

Even if we don't solve the PII problem right away, I would add a test case that documents current behavior (i.e. user has an advanced rule for a deprecated field, but the normalized new field does not get scrubbed).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was it that PII scrubbing rules need to be migrated on the sentry side?

Maybe. It has its upsides, but also massive downsides :(

IMO the best approach would be if PII Selectors would match on both field names

I think that's maybe what we'd have to do, when resolving pii we'd need to consider all aliases.

Even if we don't solve the PII problem right away, I would add a test case that documents current behavior

👍

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Attribute Normalization Order Causes Data Loss

The normalize_attribute_names function runs after normalize_user_agent and normalize_user_geo in log and span processing. This means deprecated attribute names aren't normalized to canonical names before other functions access them. Consequently, user-provided values in deprecated attributes may be lost, with automatically inferred values taking precedence.

Additional Locations (1)

Fix in Cursor Fix in Web


// TODO: ai model costs
} else {
Expand Down
43 changes: 15 additions & 28 deletions tests/integration/test_ourlogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ def timestamps(ts: datetime):
"sentry.observed_timestamp_nanos": {
"stringValue": time_within(ts, expect_resolution="ns")
},
"sentry._internal.observed_timestamp_nanos": {
"stringValue": time_within(ts, expect_resolution="ns")
},
"sentry.timestamp_nanos": {
"stringValue": time_within_delta(
ts, delta=timedelta(seconds=0), expect_resolution="ns", precision="us"
Expand Down Expand Up @@ -130,16 +133,16 @@ def test_ourlog_multiple_containers_not_allowed(
@pytest.mark.parametrize(
"external_mode,expected_byte_size",
[
# 260 here is a billing relevant metric, do not arbitrarily change it,
# 296 here is a billing relevant metric, do not arbitrarily change it,
# this value is supposed to be static and purely based on data received,
# independent of any normalization.
(None, 260),
(None, 296),
# Same applies as above, a proxy Relay does not need to run normalization.
("proxy", 260),
("proxy", 296),
# If an external Relay/Client makes modifications, sizes can change,
# this is fuzzy due to slight changes in sizes due to added timestamps
# and may need to be adjusted when changing normalization.
("managed", 454),
("managed", 641),
],
)
def test_ourlog_extraction_with_sentry_logs(
Expand Down Expand Up @@ -200,6 +203,7 @@ def test_ourlog_extraction_with_sentry_logs(
"string.attribute": {"value": "some string", "type": "string"},
"pii": {"value": "4242 4242 4242 4242", "type": "string"},
"sentry.severity_text": {"value": "info", "type": "string"},
"http.response_content_length": {"value": 17, "type": "integer"},
"unknown_type": {"value": "info", "type": "unknown"},
"broken_type": {"value": "info", "type": "not_a_real_type"},
"mismatched_type": {"value": "some string", "type": "boolean"},
Expand Down Expand Up @@ -263,6 +267,8 @@ def test_ourlog_extraction_with_sentry_logs(
"sentry.browser.version": {"stringValue": "2.32"},
"sentry.severity_text": {"stringValue": "info"},
"sentry.payload_size_bytes": {"intValue": mock.ANY},
"http.response_content_length": {"intValue": "17"},
"http.response.body.size": {"intValue": "17"},
"sentry.span_id": {"stringValue": "eee19b7ec3c1b174"},
"string.attribute": {"stringValue": "some string"},
"valid_string_with_other": {"stringValue": "test"},
Expand Down Expand Up @@ -355,6 +361,10 @@ def test_ourlog_extraction_with_string_pii_scrubbing(
"type": "string",
"value": time_within(ts, expect_resolution="ns"),
},
"sentry._internal.observed_timestamp_nanos": {
"type": "string",
"value": time_within(ts, expect_resolution="ns"),
},
},
"__header": {"byte_size": mock.ANY},
"_meta": {
Expand Down Expand Up @@ -496,36 +506,13 @@ def test_ourlog_extraction_default_pii_scrubbing_does_not_scrub_default_attribut
"custom_field": {"stringValue": "[REDACTED]"},
"sentry.body": {"stringValue": "[REDACTED]"},
"sentry.severity_text": {"stringValue": "info"},
"sentry.observed_timestamp_nanos": {
"stringValue": time_within_delta(
ts,
delta=timedelta(seconds=1),
expect_resolution="ns",
precision="us",
)
},
"sentry.span_id": {"stringValue": "eee19b7ec3c1b174"},
"sentry.payload_size_bytes": mock.ANY,
"sentry.browser.name": {"stringValue": "Python Requests"},
"sentry._meta.fields.body": {
"stringValue": '{"meta":{"":{"rem":[["remove_custom_field","s",0,10]],"len":8}}}'
},
"sentry.timestamp_nanos": {
"stringValue": time_within_delta(
ts,
delta=timedelta(seconds=0),
expect_resolution="ns",
precision="us",
)
},
"sentry.timestamp_precise": {
"intValue": time_within_delta(
ts,
delta=timedelta(seconds=0),
expect_resolution="ns",
precision="us",
)
},
**timestamps(ts),
},
"clientSampleRate": 1.0,
"itemId": mock.ANY,
Expand Down
Loading
Loading