Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions sentry_sdk/ai/consts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import re

# Matches data URLs with base64-encoded content, e.g. "data:image/png;base64,iVBORw0K..."
#
# The pattern is anchored at both ends and requires, in order:
#   - the literal "data:" scheme prefix
#   - a "type/subtype" media type (each part starts with an alphanumeric character)
#   - zero or more ";name=value" parameters
#   - the literal ";base64," marker
#   - a non-empty payload drawn from the standard ("+", "/") or URL-safe ("-", "_")
#     base64 alphabets, followed by at most two "=" padding characters
#
# Only the character set of the payload is checked; the pattern does not verify
# that the payload actually decodes as base64.
DATA_URL_BASE64_REGEX = re.compile(
r"^data:(?:[a-zA-Z0-9][a-zA-Z0-9.+\-]*/[a-zA-Z0-9][a-zA-Z0-9.+\-]*)(?:;[a-zA-Z0-9\-]+=[^;,]*)*;base64,(?:[A-Za-z0-9+/\-_]+={0,2})$"
)

Check warning on line 6 in sentry_sdk/ai/consts.py

View check run for this annotation

@sentry/warden / warden: find-bugs

[7XB-5A9] TypeError when redacting image_url that is a string instead of a dict (additional location)

Line 685 assumes `item["image_url"]` is a dict when performing `item["image_url"]["url"] = BLOB_DATA_SUBSTITUTE`. If `image_url` is a string (which is valid per OpenAI's format), this will raise `TypeError: 'str' object does not support item assignment`. This is a separate issue from the detection bug since even if detection were fixed, the redaction would still fail.
26 changes: 23 additions & 3 deletions sentry_sdk/ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import TYPE_CHECKING

from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
from sentry_sdk.ai.consts import DATA_URL_BASE64_REGEX

if TYPE_CHECKING:
from typing import Any, Callable, Dict, List, Optional, Tuple
Expand Down Expand Up @@ -588,6 +589,20 @@
return 0


def _is_image_type_with_blob_content(item: "Dict[str, Any]") -> bool:
    """
    Return True if ``item`` is an ``image_url`` content block holding a base64 data URL.

    Some content blocks contain an image_url property with base64 content as its value.
    This is used to identify those while not leading to unnecessary copying of data
    when the image URL does not contain base64 content.

    Two shapes of the ``image_url`` property are recognised:

    * a dict carrying the URL under the ``"url"`` key
    * a plain string that is the URL itself (string shorthand)

    Any other shape (missing property, ``None``, non-string URL) is reported as
    "no blob content" instead of raising.

    NOTE(review): a True result does not say which of the two shapes was seen;
    code that rewrites the URL after this check must handle both.

    :param item: a single content part of a message.
    :return: True only when the part's type is ``"image_url"`` and its URL
        matches ``DATA_URL_BASE64_REGEX``.
    """
    if item.get("type") != "image_url":
        return False

    image_url = item.get("image_url")

    # Dict form keeps the URL under "url"; in the shorthand form the property
    # value is the URL.
    url = image_url.get("url") if isinstance(image_url, dict) else image_url

    # Pattern.match() raises TypeError on non-str input, so reject it up front.
    if not isinstance(url, str):
        return False

    return DATA_URL_BASE64_REGEX.match(url) is not None


def redact_blob_message_parts(
messages: "List[Dict[str, Any]]",
) -> "List[Dict[str, Any]]":
Expand Down Expand Up @@ -640,7 +655,9 @@
content = message.get("content")
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get("type") == "blob":
if isinstance(item, dict) and (
item.get("type") == "blob" or _is_image_type_with_blob_content(item)
):

Check failure on line 660 in sentry_sdk/ai/utils.py

View check run for this annotation

@sentry/warden / warden: code-review

[8C7-DQ7] AttributeError when image_url content is a string instead of dict (additional location)

The `_is_image_type_with_blob_content` function at line 600 assumes `image_url` is always a dict by calling `.get("url", "")`. However, OpenAI's format allows `image_url` to be a string directly (e.g., `{"type": "image_url", "image_url": "https://..."}`), as handled in `transform_openai_content_part` (lines 142-143). When a string is passed, this will raise `AttributeError: 'str' object has no attribute 'get'`, causing a runtime crash during blob redaction.

Check warning on line 660 in sentry_sdk/ai/utils.py

View check run for this annotation

@sentry/warden / warden: find-bugs

[R44-S4Y] AttributeError when image_url is a string instead of a dict (additional location)

The `_is_image_type_with_blob_content` function assumes `image_url` is always a dict, but OpenAI's format also supports a string shorthand (e.g., `{"type": "image_url", "image_url": "data:image/jpeg;base64,..."}`). When `image_url` is a string, calling `.get("url", "")` on it will raise `AttributeError: 'str' object has no attribute 'get'`. This causes `redact_blob_message_parts` to crash when processing messages with the string format, potentially leaking base64 image content to Sentry span data.
has_blobs = True
break
if has_blobs:
Expand All @@ -661,8 +678,11 @@
content = message.get("content")
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get("type") == "blob":
item["content"] = BLOB_DATA_SUBSTITUTE
if isinstance(item, dict):
if item.get("type") == "blob":
item["content"] = BLOB_DATA_SUBSTITUTE
elif _is_image_type_with_blob_content(item):
item["image_url"]["url"] = BLOB_DATA_SUBSTITUTE

Check warning on line 685 in sentry_sdk/ai/utils.py

View check run for this annotation

@sentry/warden / warden: find-bugs

TypeError when redacting image_url that is a string instead of a dict

Line 685 assumes `item["image_url"]` is a dict when performing `item["image_url"]["url"] = BLOB_DATA_SUBSTITUTE`. If `image_url` is a string (which is valid per OpenAI's format), this will raise `TypeError: 'str' object does not support item assignment`. This is a separate issue from the detection bug since even if detection were fixed, the redaction would still fail.

return messages_copy

Expand Down
7 changes: 0 additions & 7 deletions sentry_sdk/integrations/pydantic_ai/consts.py
Original file line number Diff line number Diff line change
@@ -1,8 +1 @@
import re

SPAN_ORIGIN = "auto.ai.pydantic_ai"

# Matches data URLs with base64-encoded content, e.g. "data:image/png;base64,iVBORw0K..."
#
# The pattern is anchored at both ends and requires, in order:
#   - the literal "data:" scheme prefix
#   - a "type/subtype" media type (each part starts with an alphanumeric character)
#   - zero or more ";name=value" parameters
#   - the literal ";base64," marker
#   - a non-empty payload drawn from the standard ("+", "/") or URL-safe ("-", "_")
#     base64 alphabets, followed by at most two "=" padding characters
#
# Only the character set of the payload is checked; the pattern does not verify
# that the payload actually decodes as base64.
DATA_URL_BASE64_REGEX = re.compile(
r"^data:(?:[a-zA-Z0-9][a-zA-Z0-9.+\-]*/[a-zA-Z0-9][a-zA-Z0-9.+\-]*)(?:;[a-zA-Z0-9\-]+=[^;,]*)*;base64,(?:[A-Za-z0-9+/\-_]+={0,2})$"
)
2 changes: 1 addition & 1 deletion sentry_sdk/integrations/pydantic_ai/spans/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sentry_sdk.ai.utils import get_modality_from_mime_type
from sentry_sdk.consts import SPANDATA

from ..consts import DATA_URL_BASE64_REGEX
from sentry_sdk.ai.consts import DATA_URL_BASE64_REGEX

from typing import TYPE_CHECKING

Expand Down
65 changes: 65 additions & 0 deletions tests/test_ai_monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,71 @@ def test_redacts_blobs_in_multiple_messages(self):
assert result[1]["content"] == "I see the image." # Unchanged
assert result[2]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE

def test_redacts_single_blob_within_image_url_content(self):
    """A base64 data URL inside an image_url part is redacted in the result, not in the input."""
    data_url = "data:image/jpeg;base64,/9j/4AAQSkZJRg=="
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "text": "How many ponies do you see in the image?",
                    "type": "text",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": data_url},
                },
            ],
        }
    ]

    result = redact_blob_message_parts(messages)

    # Compare the input against a freshly built expected value. A reference
    # taken from ``messages`` would alias the same dict, so the comparison
    # would still pass if the input had been redacted in place.
    assert messages[0]["content"][1] == {
        "type": "image_url",
        "image_url": {"url": data_url},
    }

    assert (
        result[0]["content"][0]["text"]
        == "How many ponies do you see in the image?"
    )
    assert result[0]["content"][0]["type"] == "text"
    assert result[0]["content"][1]["type"] == "image_url"
    assert result[0]["content"][1]["image_url"]["url"] == BLOB_DATA_SUBSTITUTE

def test_does_not_redact_image_url_content_with_non_blobs(self):
    """An image_url part pointing at a regular URL is left untouched."""
    remote_url = "https://example.com/image.jpg"
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "text": "How many ponies do you see in the image?",
                    "type": "text",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": remote_url},
                },
            ],
        }
    ]

    result = redact_blob_message_parts(messages)

    # Compare the input against a freshly built expected value. A reference
    # taken from ``messages`` would alias the same dict, so the comparison
    # could never detect an in-place modification.
    assert messages[0]["content"][1] == {
        "type": "image_url",
        "image_url": {"url": remote_url},
    }

    assert (
        result[0]["content"][0]["text"]
        == "How many ponies do you see in the image?"
    )
    assert result[0]["content"][0]["type"] == "text"
    assert result[0]["content"][1]["type"] == "image_url"
    assert result[0]["content"][1]["image_url"]["url"] == remote_url

def test_no_blobs_returns_original_list(self):
"""Test that messages without blobs are returned as-is (performance optimization)"""
messages = [
Expand Down
Loading