Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/release-history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Release History
In Development
--------------

n/a
- Treat `binary/octet-stream` as a generic media type, just like `application/octet-stream`, when trying to determine whether content is not HTML. Even though `binary/octet-stream` is not a registered IANA media type, it turns out some AWS SDKs use it when uploading files to S3, so it’s somewhat common.


Version 0.1.4 (2024-01-01)
Expand Down
2 changes: 2 additions & 0 deletions web_monitoring_diff/content_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
# Matches Content Types that *could* be acceptable for diffing as HTML.
# These are generic media types, so a match here does not by itself say
# whether the content is HTML. `binary/octet-stream` is not a registered
# IANA media type, but some AWS SDKs use it when uploading files to S3.
UNKNOWN_CONTENT_TYPE_PATTERN = re.compile(
    r'^(%s)$' % '|'.join([
        r'application/octet-stream',
        r'binary/octet-stream',
        r'application/x-download',
        r'text/.+',
    ])
)
Expand Down Expand Up @@ -70,6 +71,7 @@ def is_not_html(text, headers=None, check_options='normal'):
- `nosniff` uses the `Content-Type` header but does not sniff.
- `ignore` doesn’t do any checking at all.
"""
print(f'#is_not_html: check_options="{check_options}", headers={headers}, text={text[:500]}')
if headers and (check_options == 'normal' or check_options == 'nosniff'):
content_type = headers.get('Content-Type', '').split(';', 1)[0].strip()
if content_type and VALID_CONTENT_TYPE_PATTERN.match(content_type):
Expand Down
8 changes: 8 additions & 0 deletions web_monitoring_diff/tests/test_html_diff_validity.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,14 @@ def test_html_diff_render_should_not_check_content_type_header_if_header_is_malf
b_headers={'Content-Type': 'text/html'})


def test_html_diff_render_should_not_check_content_type_header_if_header_is_generic():
    """Rendering an HTML diff succeeds when both headers are generic types."""
    # The test has no explicit assertions: it passes as long as
    # `html_diff_render` completes without raising for these headers.
    generic_a_headers = {'Content-Type': 'binary/octet-stream'}
    generic_b_headers = {'Content-Type': 'application/x-download'}
    html_diff_render(
        '<p>Just a little HTML</p>',
        '<p>Just some HTML</p>',
        a_headers=generic_a_headers,
        b_headers=generic_b_headers)


def test_html_diff_render_should_not_check_content_type_header_if_content_type_options_is_nocheck():
html_diff_render(
'<p>Just a little HTML</p>',
Expand Down