semantic-reasoning · justinjoy · Jun 21, 2026 · Jun 21, 2026
diff --git a/tests/unit/test_dedup_anchor.py b/tests/unit/test_dedup_anchor.py
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for normalize_rows anchor-insensitive dedup (#135).
+
+The raw candidate dedup key must strip any '#anchor', consistent with the
+status-preservation keys (existing_superseded_keys / existing_engine_keys /
+existing_review_keys).  Two rows sharing (subject, relation, object, source-file)
+that differ only by anchor collapse to one row, and the surviving 'source' is
+chosen deterministically: the full source that sorts lexicographically first
+(bare path < any anchored variant), independent of input order.
+"""
+from __future__ import annotations
+
+import merge_candidates as mc
+
+
+def _root_with_source(tmp_path, name="a.md"):
+    """Return *tmp_path* as a KB root whose sources/ holds the real file *name*, so
+    rows citing it pass normalize_rows' source-existence check; repeat calls are safe."""
+    sources = tmp_path / "sources"
+    sources.mkdir(exist_ok=True)  # tolerate a second call that adds another file
+    (sources / name).write_text("# heading\n", encoding="utf-8")
+    return tmp_path
+
+
+def _row(subject, relation, obj, source, status="candidate", confidence="0.50", note=""):
+    """Assemble one input row for normalize_rows as a plain seven-field dict.
+
+    Only the triple and its source are required; status, confidence and note
+    fall back to filler defaults when omitted.
+    """
+    fields = ("subject", "relation", "object", "source", "status", "confidence", "note")
+    values = (subject, relation, obj, source, status, confidence, note)
+    # Key order mirrors the positional parameter order.
+    return dict(zip(fields, values))
+
+
+class TestAnchorInsensitiveDedup:
+    def test_bare_and_anchored_collapse_to_one_row(self, tmp_path):
+        """A bare path and its anchored twin count as one fact."""
+        kb_root = _root_with_source(tmp_path)
+        variants = ("sources/a.md", "sources/a.md#sec1")
+        merged = mc.normalize_rows(kb_root, [_row("A", "rel", "B", src) for src in variants])
+        # One fact cited twice in the same file: a single row must remain, and
+        # it carries the bare path because that sorts ahead of the anchored one.
+        assert len(merged) == 1
+        (survivor,) = merged
+        assert survivor["source"] == "sources/a.md"
+
+    def test_two_anchors_collapse_keeping_lexicographically_first(self, tmp_path):
+        """With no bare variant present, the smaller anchored source is kept."""
+        kb_root = _root_with_source(tmp_path)
+        variants = ("sources/a.md#sec2", "sources/a.md#sec1")
+        merged = mc.normalize_rows(kb_root, [_row("A", "rel", "B", src) for src in variants])
+        # '#sec1' compares less than '#sec2', so sec1 survives even though it
+        # was supplied second.
+        assert len(merged) == 1
+        (survivor,) = merged
+        assert survivor["source"] == "sources/a.md#sec1"
+
+    def test_surviving_source_is_order_independent(self, tmp_path):
+        """Feeding the same rows forwards and backwards yields the same survivor."""
+        kb_root = _root_with_source(tmp_path)
+        in_order = [_row("A", "rel", "B", src) for src in ("sources/a.md#sec1", "sources/a.md")]
+        flipped = in_order[::-1]
+        first_pass = mc.normalize_rows(kb_root, in_order)
+        second_pass = mc.normalize_rows(kb_root, flipped)
+        assert len(first_pass) == 1
+        assert len(second_pass) == 1
+        # The surviving source is decided by value, so both orderings keep the bare path.
+        assert first_pass[0]["source"] == "sources/a.md"
+        assert second_pass[0]["source"] == "sources/a.md"
+
+    def test_distinct_triples_on_same_file_are_kept(self, tmp_path):
+        """Sharing a source file is not enough to merge rows with different triples."""
+        kb_root = _root_with_source(tmp_path)
+        first = _row("A", "rel", "B", "sources/a.md#sec1")
+        second = _row("C", "rel", "D", "sources/a.md#sec2")
+        merged = mc.normalize_rows(kb_root, [first, second])
+        # Different (subject, relation, object) means different dedup keys.
+        assert len(merged) == 2
diff --git a/tools/merge_candidates.py b/tools/merge_candidates.py
@@ -251,14 +251,25 @@ def normalize_rows(
 
     - status is normalised to 'needs_review' if not in VALID_STATUSES
     - confidence is clamped to [0.00, 1.00]
-    - exact duplicate (subject, relation, object, source) tuples are dropped
+    - duplicate (subject, relation, object, source-file) tuples collapse to one
+      row.  The dedup key strips any '#anchor' (so 'sources/a.md' and
+      'sources/a.md#sec1' are one fact), consistent with the anchor-insensitive
+      status-preservation keys (existing_superseded_keys / existing_engine_keys
+      / existing_review_keys).  The surviving row is chosen deterministically:
+      the one whose full 'source' sorts lexicographically first — a bare
+      'sources/a.md' (a prefix of, thus less than, any 'sources/a.md#anchor')
+      wins over every anchored variant, otherwise the lexicographically-first
+      anchor wins.  This is independent of input row order.
     - result is sorted by (source, subject, relation, object)
 
     If *strict* is True, any dropped row causes a non-zero exit.
     """
     known_sources = source_file_refs(root)
-    seen: set[tuple[str, str, str, str]] = set()
-    normalized: list[dict[str, str]] = []
+    # Anchor-insensitive dedup: key on (subject, relation, object, source-file),
+    # mapping to the surviving row.  On collision the row whose full 'source'
+    # sorts lexicographically first wins (bare path < anchored variant), so the
+    # surviving source is fixed by value; equal full sources keep the first seen.
+    dedup: dict[tuple[str, str, str, str], dict[str, str]] = {}
     dropped = 0
 
     for row in rows:
@@ -287,18 +298,23 @@ def normalize_rows(
         clean["source"] = source  # NFC-normalised canonical source
         clean["status"] = clean["status"] if clean["status"] in VALID_STATUSES else "needs_review"
         clean["confidence"] = normalize_confidence(clean["confidence"])
-        key = (clean["subject"], clean["relation"], clean["object"], clean["source"])
-        if key in seen:
+        # Anchor-insensitive key: source_file is the pre-'#anchor' portion
+        # (already computed above for the source-existence check).
+        key = (clean["subject"], clean["relation"], clean["object"], source_file)
+        existing = dedup.get(key)
+        if existing is None:
+            dedup[key] = clean
+        else:
             dropped += 1
-            continue
-        seen.add(key)
-        normalized.append(clean)
+            # Keep the row whose full source sorts first (bare < anchored).
+            if clean["source"] < existing["source"]:
+                dedup[key] = clean
 
     if dropped:
         print(f"  warning: {dropped} row(s) dropped during normalise/dedup", file=sys.stderr)
 
     return sorted(
-        normalized,
+        dedup.values(),
         key=lambda item: (item["source"], item["subject"], item["relation"], item["object"]),
     )