diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index f083db75..e8027017 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -346,6 +346,37 @@ def __iter__(self): last_error_token = None yield token + elif ( + last_error_token["data"] + in ( + "invalid-character-in-attribute-name", + "invalid-character-after-attribute-name", + ) + and token["type"] == TAG_TOKEN_TYPE_CHARACTERS + and token.get("data") + and " " in token["data"] + ): + # token["data"] has something that starts with a left angle + # bracket, then has some characters followed by a space + # followed by another left angle bracket and ending with + # a right angle bracket. That part could be a real tag, so + # we don't want it to get treated as Characters. For + # example, something in this shape: "<foo <bar>". + # If so, we want to take off the first bit that is + # definitely not a tag and reparse the rest. + head, rest = token["data"].split(" ", 1) + if rest.strip().startswith("<"): + # yield the not-a-tag plus the space we split on + token["data"] = head + " " + yield token + + # shove the rest back in the stream for the parser to look + # at + for c in reversed(rest): + self.stream.unget(c) + else: + yield token + elif ( last_error_token["data"] == "expected-closing-tag-but-got-char" and self.parser.tags is not None diff --git a/tests/test_clean.py b/tests/test_clean.py index 5a13cfaf..4021ef14 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -171,6 +171,19 @@ def test_bare_entities_get_escaped_correctly(text, expected): ("text ", "<tag text "), + ("link text", "<foo link text"), + # keep the tag and add an end tag + ("", "<foo "), + # escape disallowed tags + ("text

", "<foo <p>text</p>"), + # keep tags with attributes + ('text', '<foo text'), + # multiple spaces + ("link text", "<foo link text"), + ("text text", "text <foo text"), + ("text text", "text <foo <bar text"), ], ) def test_lessthan_escaping(text, expected):