mozilla · willkg · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · willkg
@@ -346,6 +346,37 @@ def __iter__(self):
                     last_error_token = None
                     yield token
 
+                elif (
+                    last_error_token["data"]
+                    in (
+                        "invalid-character-in-attribute-name",
+                        "invalid-character-after-attribute-name",
+                    )
+                    and token["type"] == TAG_TOKEN_TYPE_CHARACTERS
+                    and token.get("data")
+                    and " " in token["data"]
+                ):
+                    # token["data"] has something that starts with a left angle
+                    # bracket, then has some characters followed by a space
+                    # followed by another left angle bracket and ending with
+                    # a right angle bracket. That part could be a real tag, so
+                    # we don't want it to get treated as Characters. For
+                    # example, something in this shape: <nottag <...>
+                    # If so, we want to take off the first bit that is
+                    # definitely not a tag and reparse the rest.
+                    head, rest = token["data"].split(" ", 1)
+                    if rest.strip().startswith("<"):
+                        # yield the not-a-tag plus the space we split on
+                        token["data"] = head + " "
+                        yield token
+
+                        # shove the rest back in the stream for the parser to look
+                        # at
+                        for c in reversed(rest):
+                            self.stream.unget(c)
+                    else:
+                        yield token
+
                 elif (
                     last_error_token["data"] == "expected-closing-tag-but-got-char"
                     and self.parser.tags is not None

@@ -171,6 +171,19 @@ def test_bare_entities_get_escaped_correctly(text, expected):
         ("<some thing thing", "&lt;some thing thing"),
         # this is an expected-end-of-tag-but-got-eof parser error
         ("<some thing thing2 ", "&lt;some thing thing2 "),
+        # tests that invalid-character-in-attribute-name is handled correctly
+        ("<tag <b><em>text</em></b> <a></a>", "&lt;tag <b><em>text</em></b> <a></a>"),
+        ("<foo <a>link text</a>", "&lt;foo <a>link text</a>"),
+        # keep the tag and add an end tag
+        ("<foo <b>", "&lt;foo <b></b>"),
+        # escape disallowed tags
+        ("<foo <p>text</p>", "&lt;foo &lt;p&gt;text&lt;/p&gt;"),
+        # keep tags with attributes
+        ('<foo <a href="x">text</a>', '&lt;foo <a href="x">text</a>'),
+        # multiple spaces
+        ("<foo   <a>link text</a>", "&lt;foo   <a>link text</a>"),
+        ("text <foo <b>text</b>", "text &lt;foo <b>text</b>"),
+        ("text <foo <bar <b>text</b>", "text &lt;foo &lt;bar <b>text</b>"),
     ],
 )
 def test_lessthan_escaping(text, expected):