Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions tests/unit/test_dedup_anchor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# SPDX-License-Identifier: Apache-2.0
"""Unit tests for normalize_rows anchor-insensitive dedup (#135).

The raw candidate dedup key must strip any '#anchor', consistent with the
status-preservation keys (existing_superseded_keys / existing_engine_keys /
existing_review_keys). Two rows sharing (subject, relation, object, source-file)
that differ only by anchor collapse to one row, and the surviving 'source' is
chosen deterministically: the full source that sorts lexicographically first
(bare path < any anchored variant), independent of input order.
"""
from __future__ import annotations

import merge_candidates as mc


def _root_with_source(tmp_path, name="a.md"):
    """Create ``sources/<name>`` under *tmp_path* and return *tmp_path*.

    The result is a KB root whose ``sources/`` directory holds one real file,
    so rows referencing it pass the source-existence check inside
    normalize_rows.

    The helper may be called repeatedly on the same root (e.g. once per
    distinct *name*) and *name* may contain sub-directories; any missing
    parent directories are created and an existing ``sources/`` is reused.
    """
    source_path = tmp_path / "sources" / name
    # exist_ok: a second call on the same root must not raise FileExistsError.
    # parents: lets *name* point into a nested directory such as "sub/a.md".
    source_path.parent.mkdir(parents=True, exist_ok=True)
    source_path.write_text("# heading\n", encoding="utf-8")
    return tmp_path


def _row(subject, relation, obj, source, status="candidate", confidence="0.50", note=""):
    """Build one candidate-row dict in the column order the tests rely on.

    The positional arguments fill the 'subject', 'relation', 'object' and
    'source' columns; the keyword arguments supply defaults for 'status',
    'confidence' and 'note'.
    """
    # The fact itself: who relates to what.
    triple = {"subject": subject, "relation": relation, "object": obj}
    # Where the fact came from and how it is currently rated.
    provenance = {
        "source": source,
        "status": status,
        "confidence": confidence,
        "note": note,
    }
    # Merging keeps insertion order: triple columns first, then provenance.
    return {**triple, **provenance}


class TestAnchorInsensitiveDedup:
    """normalize_rows treats rows differing only by '#anchor' as duplicates."""

    def test_bare_and_anchored_collapse_to_one_row(self, tmp_path):
        """A bare path and an anchored variant of it yield a single row."""
        kb_root = _root_with_source(tmp_path)
        bare = _row("A", "rel", "B", "sources/a.md")
        anchored = _row("A", "rel", "B", "sources/a.md#sec1")
        result = mc.normalize_rows(kb_root, [bare, anchored])
        assert len(result) == 1
        # The bare path compares less than its anchored variant, so it is kept.
        assert result[0]["source"] == "sources/a.md"

    def test_two_anchors_collapse_keeping_lexicographically_first(self, tmp_path):
        """With no bare variant, the smaller anchored source is the survivor."""
        kb_root = _root_with_source(tmp_path)
        later_anchor = _row("A", "rel", "B", "sources/a.md#sec2")
        earlier_anchor = _row("A", "rel", "B", "sources/a.md#sec1")
        # Deliberately feed the larger anchor first.
        result = mc.normalize_rows(kb_root, [later_anchor, earlier_anchor])
        assert len(result) == 1
        # '#sec1' sorts before '#sec2'; there is no bare row to beat it.
        assert result[0]["source"] == "sources/a.md#sec1"

    def test_surviving_source_is_order_independent(self, tmp_path):
        """Reversing the input order does not change which source survives."""
        kb_root = _root_with_source(tmp_path)
        forward = [
            _row("A", "rel", "B", "sources/a.md#sec1"),
            _row("A", "rel", "B", "sources/a.md"),
        ]
        backward = forward[::-1]
        from_forward = mc.normalize_rows(kb_root, forward)
        from_backward = mc.normalize_rows(kb_root, backward)
        assert len(from_forward) == len(from_backward) == 1
        # Both arrival orders must settle on the same (bare) source.
        assert from_forward[0]["source"] == from_backward[0]["source"] == "sources/a.md"

    def test_distinct_triples_on_same_file_are_kept(self, tmp_path):
        """Different triples stay separate even when they cite the same file."""
        kb_root = _root_with_source(tmp_path)
        first_fact = _row("A", "rel", "B", "sources/a.md#sec1")
        second_fact = _row("C", "rel", "D", "sources/a.md#sec2")
        result = mc.normalize_rows(kb_root, [first_fact, second_fact])
        assert len(result) == 2
34 changes: 25 additions & 9 deletions tools/merge_candidates.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,14 +251,25 @@ def normalize_rows(

- status is normalised to 'needs_review' if not in VALID_STATUSES
- confidence is clamped to [0.00, 1.00]
- exact duplicate (subject, relation, object, source) tuples are dropped
- duplicate (subject, relation, object, source-file) tuples collapse to one
row. The dedup key strips any '#anchor' (so 'sources/a.md' and
'sources/a.md#sec1' are one fact), consistent with the anchor-insensitive
status-preservation keys (existing_superseded_keys / existing_engine_keys
/ existing_review_keys). The surviving row is chosen deterministically:
the one whose full 'source' sorts lexicographically first — a bare
'sources/a.md' (a prefix of, thus less than, any 'sources/a.md#anchor')
wins over every anchored variant, otherwise the lexicographically-first
anchor wins. This is independent of input row order.
- result is sorted by (source, subject, relation, object)

If *strict* is True, any dropped row causes a non-zero exit.
"""
known_sources = source_file_refs(root)
seen: set[tuple[str, str, str, str]] = set()
normalized: list[dict[str, str]] = []
# Anchor-insensitive dedup: key on (subject, relation, object, source-file),
# mapping to the surviving row. On collision the row whose full 'source'
# sorts lexicographically first wins (bare path < anchored variant), so the
# winner is fixed by value, not input order.
dedup: dict[tuple[str, str, str, str], dict[str, str]] = {}
dropped = 0

for row in rows:
Expand Down Expand Up @@ -287,18 +298,23 @@ def normalize_rows(
clean["source"] = source # NFC-normalised canonical source
clean["status"] = clean["status"] if clean["status"] in VALID_STATUSES else "needs_review"
clean["confidence"] = normalize_confidence(clean["confidence"])
key = (clean["subject"], clean["relation"], clean["object"], clean["source"])
if key in seen:
# Anchor-insensitive key: source_file is the pre-'#anchor' portion
# (already computed above for the source-existence check).
key = (clean["subject"], clean["relation"], clean["object"], source_file)
existing = dedup.get(key)
if existing is None:
dedup[key] = clean
else:
dropped += 1
continue
seen.add(key)
normalized.append(clean)
# Keep the row whose full source sorts first (bare < anchored).
if clean["source"] < existing["source"]:
dedup[key] = clean

if dropped:
print(f" warning: {dropped} row(s) dropped during normalise/dedup", file=sys.stderr)

return sorted(
normalized,
dedup.values(),
key=lambda item: (item["source"], item["subject"], item["relation"], item["object"]),
)

Expand Down