Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions bleach/html5lib_shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,37 @@ def __iter__(self):
last_error_token = None
yield token

elif (
last_error_token["data"]
in (
"invalid-character-in-attribute-name",
"invalid-character-after-attribute-name",
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This handles two cases: `<foo <b>text</b>`, which produces an invalid-character-in-attribute-name error, and `<foo <bar <b>text</b>`, which produces both an invalid-character-in-attribute-name and an invalid-character-after-attribute-name error.

)
and token["type"] == TAG_TOKEN_TYPE_CHARACTERS
and token.get("data")
and " " in token["data"]
):
# token["data"] has something that starts with a left angle
# bracket, then has some characters followed by a space
# followed by another left angle bracket and ending with
# a right angle bracket. That part could be a real tag, so
# we don't want it to get treated as Characters. For
# example, something in this shape: <nottag <...>
# If so, we want to take off the first bit that is
# definitely not a tag and reparse the rest.
head, rest = token["data"].split(" ", 1)
if rest.strip().startswith("<"):
# yield the not-a-tag plus the space we split on
token["data"] = head + " "
yield token

# shove the rest back in the stream for the parser to look
# at
for c in reversed(rest):
self.stream.unget(c)
else:
yield token

elif (
last_error_token["data"] == "expected-closing-tag-but-got-char"
and self.parser.tags is not None
Expand Down
13 changes: 13 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,19 @@ def test_bare_entities_get_escaped_correctly(text, expected):
("<some thing thing", "&lt;some thing thing"),
# this is an expected-end-of-tag-but-got-eof parser error
("<some thing thing2 ", "&lt;some thing thing2 "),
# handle invalid-character-in-attribute-name correctly tests
("<tag <b><em>text</em></b> <a></a>", "&lt;tag <b><em>text</em></b> <a></a>"),
("<foo <a>link text</a>", "&lt;foo <a>link text</a>"),
# keep the tag and add an end tag
("<foo <b>", "&lt;foo <b></b>"),
# escape disallowed tags
("<foo <p>text</p>", "&lt;foo &lt;p&gt;text&lt;/p&gt;"),
# keep tags with attributes
('<foo <a href="x">text</a>', '&lt;foo <a href="x">text</a>'),
# multiple spaces
("<foo <a>link text</a>", "&lt;foo <a>link text</a>"),
("text <foo <b>text</b>", "text &lt;foo <b>text</b>"),
("text <foo <bar <b>text</b>", "text &lt;foo &lt;bar <b>text</b>"),
],
)
def test_lessthan_escaping(text, expected):
Expand Down
Loading