From 58f0d09e888cebaa2a7441855d4b371a005d74a8 Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Sun, 27 Jul 2025 14:50:44 +0000 Subject: [PATCH 1/9] [PATCH] urllib.parse: Restrict IPv6 ZoneID characters to RFC 6874-compliant set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current parsing logic for IPv6 addresses with Zone Identifiers (ZoneIDs) uses the `ipaddress` module, which validates ZoneIDs according to RFC 4007, allowing any non-empty string. However, when used in URLs, ZoneIDs must follow the percent-encoded format defined in RFC 6874. This patch adds a check to restrict ZoneIDs to the allowed characters: ALPHA / DIGIT / "-" / "." / "_" / "~" / "%" HEXDIG HEXDIG RFC 6874 §2 specifies the format of an IPv6 address with a ZoneID in a URI as: `IPv6addrz = IPv6address "%25" ZoneID` Additionally, RFC 6874 recommends accepting a bare `%` without hex digits as a liberal extension, but that flexibility still requires ZoneID content to conform to a safe character set. This patch enforces that ZoneIDs do not include characters outside the permitted range. ### Before the fix: ```py >>> import urllib.parse >>> urllib.parse.urlparse("http://[::1%2|test]/path") ParseResult(scheme='http', netloc='[::1%2|test]', path='/path', ...) ``` Invalid characters such as `|` were incorrectly accepted in ZoneIDs. ### After the fix: ```py >>> import urllib.parse >>> urllib.parse.urlparse("http://[::1%2|test]/path") Traceback (most recent call last): ... ValueError: IPv6 ZoneID is invalid ``` This patch ensures `urllib.parse` properly rejects ZoneIDs with invalid characters, improving compliance with the URI standards and helping prevent subtle bugs or security vulnerabilities. 
--- Lib/urllib/parse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 67d9bbea0d3150..fab7871b423b34 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -466,6 +466,8 @@ def _check_bracketed_host(hostname): ip = ipaddress.ip_address(hostname) # Throws Value Error if not IPv6 or IPv4 if isinstance(ip, ipaddress.IPv4Address): raise ValueError(f"An IPv4 address cannot be in brackets") + if "%" in hostname and not re.match(r"\A(%[a-fA-F0-9]{2}|[\w\.~-])+\z", hostname.split("%", 1)[1]): + raise ValueError(f"IPv6 ZoneID is invalid") # typed=True avoids BytesWarnings being emitted during cache key # comparison since this API supports both bytes and str input. From e33f0505bb0fbadf75a1fb7d7998b7f0138533dc Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Sun, 27 Jul 2025 15:26:25 +0000 Subject: [PATCH 2/9] Add blurb entries for IPv6 ZoneID validation fixes --- .../next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst diff --git a/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst b/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst new file mode 100644 index 00000000000000..816035b581d89e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst @@ -0,0 +1 @@ +Validate IPv6 ZoneID characters in bracketed hostnames to match RFC 6874. `urllib.parse` now rejects ZoneIDs containing invalid or unsafe characters. 
From 0e2684dd1cc4942d0a980d372f43f41227d4e6b5 Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Mon, 28 Jul 2025 20:39:31 +0000 Subject: [PATCH 3/9] gh-137146: Fix unicode characters in IPv6 Zone ID --- Lib/urllib/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index fab7871b423b34..3f10077966b6de 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -466,7 +466,7 @@ def _check_bracketed_host(hostname): ip = ipaddress.ip_address(hostname) # Throws Value Error if not IPv6 or IPv4 if isinstance(ip, ipaddress.IPv4Address): raise ValueError(f"An IPv4 address cannot be in brackets") - if "%" in hostname and not re.match(r"\A(%[a-fA-F0-9]{2}|[\w\.~-])+\z", hostname.split("%", 1)[1]): + if "%" in hostname and not re.match(r"\A(%[a-fA-F0-9]{2}|[\w\.~-])+\z", hostname.split("%", 1)[1], flags=re.ASCII): raise ValueError(f"IPv6 ZoneID is invalid") # typed=True avoids BytesWarnings being emitted during cache key From be71b378779afaa0902985a9ca3914e8b6be9313 Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Mon, 28 Jul 2025 20:43:08 +0000 Subject: [PATCH 4/9] gh-137146: Add tests on IPv6 Zone ID checks --- Lib/test/test_urlparse.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index b2bde5a9b1d696..1c553dbedf5886 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -3,6 +3,7 @@ import unittest import urllib.parse from test import support +from string import ascii_letters, digits RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -1419,6 +1420,15 @@ def test_invalid_bracketed_hosts(self): self.assertRaises(ValueError, urllib.parse.urlsplit, 'scheme://prefix]v6a.ip[suffix') self.assertRaises(ValueError, urllib.parse.urlsplit, 'scheme://prefix]v6a.ip') self.assertRaises(ValueError, 
urllib.parse.urlsplit, 'scheme://v6a.ip[suffix') + # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + unreserved = ascii_letters + digits + "-" + "." + "_" + "~" + zoneid_authorized_characters = unreserved + removed_characters = "\t\n\r" + for character in range(256): + character = chr(character) + if character in zoneid_authorized_characters or character in removed_characters: + continue + self.assertRaises(ValueError, parse.urlsplit, f'scheme://[::1%invalid{character}invalid]/') def test_splitting_bracketed_hosts(self): p1 = urllib.parse.urlsplit('scheme://user@[v6a.ip]:1234/path?query') From 41feeeef6dc071567fa9373e5516b5aa0c3a9da4 Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Mon, 28 Jul 2025 20:44:28 +0000 Subject: [PATCH 5/9] Fix: reStructuredText language syntax --- .../next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst b/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst index 816035b581d89e..ae91bffebcbeb6 100644 --- a/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst +++ b/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst @@ -1 +1 @@ -Validate IPv6 ZoneID characters in bracketed hostnames to match RFC 6874. `urllib.parse` now rejects ZoneIDs containing invalid or unsafe characters. +Validate IPv6 ZoneID characters in bracketed hostnames to match RFC 6874. :func:`urllib.parse.urlparse` now rejects ZoneIDs containing invalid or unsafe characters. 
From b93e903a1768fd7a325c0a42f0a99c32e704a858 Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Mon, 28 Jul 2025 21:07:53 +0000 Subject: [PATCH 6/9] gh-137146: Fix tests on IPv6 Zone ID checks --- Lib/test/test_urlparse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 1c553dbedf5886..79c68ae844ba3a 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1428,7 +1428,7 @@ def test_invalid_bracketed_hosts(self): character = chr(character) if character in zoneid_authorized_characters or character in removed_characters: continue - self.assertRaises(ValueError, parse.urlsplit, f'scheme://[::1%invalid{character}invalid]/') + self.assertRaises(ValueError, urllib.parse.urlsplit, f'scheme://[::1%invalid{character}invalid]/') def test_splitting_bracketed_hosts(self): p1 = urllib.parse.urlsplit('scheme://user@[v6a.ip]:1234/path?query') From de4bfc7852a9ba772ec173649e040932c9437524 Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Sat, 28 Feb 2026 15:53:38 +0100 Subject: [PATCH 7/9] Refactor: make implementation more compliant with standards --- Lib/urllib/parse.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 3f10077966b6de..8f8cb8ea04e542 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -91,6 +91,9 @@ # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] +# Zone ID regex as defined in RFC 6874 +zone_id_regex = re.compile(r"(%[a-fA-F0-9]{2}|[\w\.~-])+") + def clear_cache(): """Clear internal performance caches. 
Undocumented; some tests want it.""" urlsplit.cache_clear() @@ -461,13 +464,13 @@ def _check_bracketed_netloc(netloc): def _check_bracketed_host(hostname): if hostname.startswith('v'): if not re.match(r"\Av[a-fA-F0-9]+\..+\z", hostname): - raise ValueError(f"IPvFuture address is invalid") + raise ValueError("IPvFuture address is invalid") else: ip = ipaddress.ip_address(hostname) # Throws Value Error if not IPv6 or IPv4 if isinstance(ip, ipaddress.IPv4Address): - raise ValueError(f"An IPv4 address cannot be in brackets") - if "%" in hostname and not re.match(r"\A(%[a-fA-F0-9]{2}|[\w\.~-])+\z", hostname.split("%", 1)[1], flags=re.ASCII): - raise ValueError(f"IPv6 ZoneID is invalid") + raise ValueError("An IPv4 address cannot be in brackets") + if "%" in hostname and not zone_id_regex.fullmatch(hostname.split("%", 1)[1], flags=re.ASCII): + raise ValueError("IPv6 ZoneID is invalid") # typed=True avoids BytesWarnings being emitted during cache key # comparison since this API supports both bytes and str input. From 21ab53a2d4f8e5bb6fb5bb710b01b028e9fd5510 Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Sat, 28 Feb 2026 15:56:30 +0100 Subject: [PATCH 8/9] Adjust NEWS entry to follow conventions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- .../Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst b/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst index ae91bffebcbeb6..ac78a808277033 100644 --- a/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst +++ b/Misc/NEWS.d/next/Library/2025-07-27-15-23-32.gh-issue-137146.BE_ylT.rst @@ -1 +1,2 @@ -Validate IPv6 ZoneID characters in bracketed hostnames to match RFC 6874. 
:func:`urllib.parse.urlparse` now rejects ZoneIDs containing invalid or unsafe characters. +:func:`urllib.parse.urlparse` now rejects IPv6 ZoneIDs containing +invalid or unsafe characters as per :rfc:`6874`. From f49014a0ff3cee3594cc514a5bf0311fe199e771 Mon Sep 17 00:00:00 2001 From: Maurice Lambert <50479118+mauricelambert@users.noreply.github.com> Date: Sat, 28 Feb 2026 16:20:04 +0100 Subject: [PATCH 9/9] Fix: flags in compiled regex --- Lib/urllib/parse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 8f8cb8ea04e542..ae4fac6bcd35c4 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -92,7 +92,7 @@ _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] # Zone ID regex as defined in RFC 6874 -zone_id_regex = re.compile(r"(%[a-fA-F0-9]{2}|[\w\.~-])+") +zone_id_regex = re.compile(r"(%[a-fA-F0-9]{2}|[\w\.~-])+", flags=re.ASCII) def clear_cache(): """Clear internal performance caches. Undocumented; some tests want it.""" @@ -469,7 +469,7 @@ def _check_bracketed_host(hostname): ip = ipaddress.ip_address(hostname) # Throws Value Error if not IPv6 or IPv4 if isinstance(ip, ipaddress.IPv4Address): raise ValueError("An IPv4 address cannot be in brackets") - if "%" in hostname and not zone_id_regex.fullmatch(hostname.split("%", 1)[1], flags=re.ASCII): + if "%" in hostname and not zone_id_regex.fullmatch(hostname.split("%", 1)[1]): raise ValueError("IPv6 ZoneID is invalid") # typed=True avoids BytesWarnings being emitted during cache key