From 23da08087ced3e870fd1acf862dec7f4f8bdaf87 Mon Sep 17 00:00:00 2001 From: Berend Klein Haneveld Date: Tue, 26 May 2026 11:30:48 +0200 Subject: [PATCH] perf: linear reverse-op construction in diff_dicts/diff_sets Replace the per-iteration rops.insert(0, ...) calls and the key_rops.extend(rops) accumulator in diff_dicts with bucket-based assembly: collect input-only, output-only, and common reverse-op chunks separately, then splice them together once at the end. Same treatment for diff_sets. Op order is preserved bit-for-bit (existing tests pass unchanged); time drops from O(n^2) to O(n) for dicts with large common-key sets. Co-Authored-By: Claude Opus 4.7 --- benchmarks/benchmark.py | 9 +++++ patchdiff/diff.py | 79 ++++++++++++++++++++++++----------------- tests/test_diff.py | 59 +++++++++++++++++++++++++++++- 3 files changed, 114 insertions(+), 33 deletions(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index 063abf2..0bbc6c4 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -149,6 +149,15 @@ def test_dict_diff_nested(benchmark): benchmark(diff, a, b) +@pytest.mark.benchmark(group="dict-diff") +@pytest.mark.parametrize("n", [500, 1000, 2000]) +def test_dict_diff_large_common(benchmark, n): + """Benchmark: dicts where every key is common and every value changes.""" + a = {f"k{i}": {"v": i} for i in range(n)} + b = {f"k{i}": {"v": i + 1} for i in range(n)} + benchmark(diff, a, b) + + # ======================================== # Set Diff Benchmarks # ======================================== diff --git a/patchdiff/diff.py b/patchdiff/diff.py index e74cba8..98d7335 100644 --- a/patchdiff/diff.py +++ b/patchdiff/diff.py @@ -135,44 +135,59 @@ def diff_lists(input: List, output: List, ptr: Pointer) -> Tuple[List, List]: def diff_dicts(input: Dict, output: Dict, ptr: Pointer) -> Tuple[List, List]: - ops, rops = [], [] + ops: List = [] + input_only_rops: List = [] + output_only_rops: List = [] + common_rops_chunks: List[List] = [] + input_keys = set(input.keys()) if input else set() output_keys = set(output.keys()) if output else set() - if input_only := input_keys - output_keys: - for key in input_only: - key_ptr = ptr.append(key) - ops.append({"op": "remove", "path": key_ptr}) - rops.insert(0, {"op": "add", "path": key_ptr, "value": input[key]}) - if output_only := output_keys - input_keys: - for key in output_only: - key_ptr = ptr.append(key) - ops.append( - { - "op": "add", - "path": key_ptr, - "value": output[key], - } - ) - rops.insert(0, {"op": "remove", "path": key_ptr}) - if common := input_keys & output_keys: - for key in common: - key_ops, key_rops = diff(input[key], output[key], ptr.append(key)) - ops.extend(key_ops) - key_rops.extend(rops) - rops = key_rops + + for key in input_keys - output_keys: + key_ptr = ptr.append(key) + ops.append({"op": "remove", "path": key_ptr}) + input_only_rops.append({"op": "add", "path": key_ptr, "value": input[key]}) + input_only_rops.reverse() + + for key in output_keys - input_keys: + key_ptr = ptr.append(key) + ops.append({"op": "add", "path": key_ptr, "value": output[key]}) + output_only_rops.append({"op": "remove", "path": key_ptr}) + output_only_rops.reverse() + + for key in input_keys & output_keys: + key_ops, key_rops = diff(input[key], output[key], ptr.append(key)) + ops.extend(key_ops) + if key_rops: + common_rops_chunks.append(key_rops) + + # Match the historical insert(0,…) + key_rops.extend(rops) layering: + # later common chunks went in front of earlier ones, and the input/output + # singletons sat behind them in reverse iteration order. + rops: List = [] + for chunk in reversed(common_rops_chunks): + rops.extend(chunk) + rops.extend(output_only_rops) + rops.extend(input_only_rops) return ops, rops def diff_sets(input: Set, output: Set, ptr: Pointer) -> Tuple[List, List]: - ops, rops = [], [] - if input_only := input - output: - for value in input_only: - ops.append({"op": "remove", "path": ptr.append(value)}) - rops.insert(0, {"op": "add", "path": ptr.append("-"), "value": value}) - if output_only := output - input: - for value in output_only: - ops.append({"op": "add", "path": ptr.append("-"), "value": value}) - rops.insert(0, {"op": "remove", "path": ptr.append(value)}) + ops: List = [] + input_only_rops: List = [] + output_only_rops: List = [] + + for value in input - output: + ops.append({"op": "remove", "path": ptr.append(value)}) + input_only_rops.append({"op": "add", "path": ptr.append("-"), "value": value}) + input_only_rops.reverse() + + for value in output - input: + ops.append({"op": "add", "path": ptr.append("-"), "value": value}) + output_only_rops.append({"op": "remove", "path": ptr.append(value)}) + output_only_rops.reverse() + + rops = output_only_rops + input_only_rops return ops, rops diff --git a/tests/test_diff.py b/tests/test_diff.py index 7f456c4..f9050c8 100644 --- a/tests/test_diff.py +++ b/tests/test_diff.py @@ -1,4 +1,6 @@ -from patchdiff import diff +import random + +from patchdiff import apply, diff from patchdiff.pointer import Pointer @@ -168,3 +170,58 @@ def test_mixed(): {"op": "add", "path": Pointer(["a", 3, "-"]), "value": "a"}, {"op": "remove", "path": Pointer(["c"])}, ] + + +def _random_dict(rng, n_keys, value_pool): + return {f"k{i}": rng.choice(value_pool) for i in range(n_keys)} + + +def _mutate_dict(rng, base, value_pool): + result = dict(base) + keys = list(result.keys()) + # Replace + for key in rng.sample(keys, k=max(1, len(keys) // 3)): + result[key] = rng.choice(value_pool) + # Remove + for key in rng.sample(keys, k=max(1, len(keys) // 4)): + result.pop(key, None) + # Add + for i in range(max(1, len(keys) // 4)): + result[f"new_{i}_{rng.randint(0, 10_000)}"] = rng.choice(value_pool) + return result + + +def test_dict_diff_roundtrip_property(): + rng = random.Random(20260526) + value_pool = [ + 0, + 1, + "x", + "y", + (1, 2), + {"nested": 1}, + [1, 2, 3], + {"a", "b"}, + ] + cases = 25 + for _ in range(cases): + n_keys = rng.randint(0, 30) + a = _random_dict(rng, n_keys, value_pool) + b = _mutate_dict(rng, a, value_pool) if a else _random_dict(rng, 5, value_pool) + ops, rops = diff(a, b) + assert apply(a, ops) == b + assert apply(b, rops) == a + + +def test_set_diff_roundtrip_property(): + rng = random.Random(20260527) + universe = list(range(50)) + [f"s{i}" for i in range(50)] + cases = 25 + for _ in range(cases): + size_a = rng.randint(0, 30) + size_b = rng.randint(0, 30) + a = set(rng.sample(universe, k=size_a)) + b = set(rng.sample(universe, k=size_b)) + ops, rops = diff(a, b) + assert apply(a, ops) == b + assert apply(b, rops) == a