Fix from_embeddings keeping duplicate embeddings for exact-duplicate records

Pringled · Pringled · commit 97f25961eb6f · 2026-01-09T16:43:22.000+01:00
diff --git a/semhash/semhash.py b/semhash/semhash.py
@@ -143,10 +143,19 @@ def from_embeddings(
                     break
 
         # Build index mapping for embeddings (accounting for removed exact duplicates)
+        # We need to keep only the first occurrence of each unique record
+        col_set = set(columns)
+        seen_hashes = set()
         embedding_indices = []
+
         for i, record in enumerate(dict_records):
-            if record in deduplicated_records:
-                embedding_indices.append(i)
+            frozen = to_frozendict(record, col_set)
+            # Only keep the first occurrence of each unique record
+            if frozen not in seen_hashes:
+                # Keep it only if it matches one of the deduplicated records
+                if any(to_frozendict(dedup_rec, col_set) == frozen for dedup_rec in deduplicated_records):
+                    embedding_indices.append(i)
+                    seen_hashes.add(frozen)
 
         # Select embeddings for non-exact-duplicate records
         deduplicated_embeddings = embeddings[embedding_indices]
diff --git a/tests/test_semhash.py b/tests/test_semhash.py
@@ -233,3 +233,77 @@ def test_from_embeddings(model: Encoder, train_texts: list[str]) -> None:
 
     assert len(result1.selected) == len(result2.selected)
     assert len(result1.filtered) == len(result2.filtered)
+
+
+def test_from_embeddings_with_exact_duplicates(model: Encoder) -> None:
+    """
+    Test that from_embeddings correctly handles exact duplicates in input.
+
+    This is a regression test for Issue #1: the bug where duplicate records
+    would cause duplicate embeddings to be kept in the index.
+    """
+    # Six records; "apple" and "banana" each repeat an earlier entry
+    records = [
+        "apple",  # 0
+        "banana",  # 1
+        "apple",  # 2 - duplicate of 0
+        "cherry",  # 3
+        "banana",  # 4 - duplicate of 1
+        "date",  # 5
+    ]
+
+    # One embedding per input record, so duplicates get their own rows too
+    embeddings = model.encode(records)
+
+    # Build SemHash from the precomputed embeddings and the raw records
+    semhash = SemHash.from_embeddings(embeddings=embeddings, records=records, model=model)
+
+    # The index should only contain 4 unique records (apple, banana, cherry, date)
+    assert len(semhash.index.vectors) == 4, f"Expected 4 unique vectors, got {len(semhash.index.vectors)}"
+    assert len(semhash.index.items) == 4, f"Expected 4 items, got {len(semhash.index.items)}"
+
+    # Map each item's text to its group size; an item is a list of exact
+    # duplicates, so its first record's "text" identifies the whole group
+    items_by_text = {}
+    for item in semhash.index.items:
+        text = item[0]["text"]
+        items_by_text[text] = len(item)
+
+    # apple and banana should have 2 records each (original + duplicate)
+    # cherry and date should have 1 record each
+    assert items_by_text["apple"] == 2, "apple should have 2 records"
+    assert items_by_text["banana"] == 2, "banana should have 2 records"
+    assert items_by_text["cherry"] == 1, "cherry should have 1 record"
+    assert items_by_text["date"] == 1, "date should have 1 record"
+
+    # NOTE(review): the kept vectors are expected to be the first-occurrence
+    # embeddings (indices [0, 1, 3, 5]), but index order may vary, so only the
+    # row count is checked here; the vector values themselves are not compared.
+    # TODO: add an order-insensitive comparison against those embeddings.
+    assert semhash.index.vectors.shape[0] == 4
+
+
+def test_from_embeddings_dict_records_with_duplicates(model: Encoder) -> None:
+    """Test that from_embeddings handles duplicates correctly with dictionary records."""
+    records = [
+        {"id": "1", "text": "apple"},
+        {"id": "2", "text": "banana"},
+        {"id": "3", "text": "apple"},  # Same 'text' as id 1; only 'id' differs
+        {"id": "4", "text": "cherry"},
+    ]
+
+    # Embed the 'text' field of every record, duplicates included
+    texts = [r["text"] for r in records]
+    embeddings = model.encode(texts)
+
+    # Create SemHash using only 'text' column for deduplication
+    semhash = SemHash.from_embeddings(embeddings=embeddings, records=records, columns=["text"], model=model)
+
+    # ids 1 and 3 share the same 'text', so 3 unique entries are expected
+    assert len(semhash.index.vectors) == 3
+    assert len(semhash.index.items) == 3
+
+    # Select the item groups whose first record has text "apple"
+    apple_items = [item for item in semhash.index.items if item[0]["text"] == "apple"]
+    assert len(apple_items) == 1, "Should find exactly one item group for 'apple'"
+    assert len(apple_items[0]) == 2, "The 'apple' item should contain 2 records"