KRLabsOrg · adaamko · Jun 24, 2026 · Jun 22, 2026
diff --git a/lettucedetect/datasets/hallucination_dataset.py b/lettucedetect/datasets/hallucination_dataset.py
@@ -76,8 +76,8 @@ def from_json(cls, json_dict: dict) -> "HallucinationSample":
             labels=json_dict["labels"],
             split=json_dict["split"],
             task_type=json_dict["task_type"],
-            dataset=json_dict["dataset"],
-            language=json_dict["language"],
+            dataset=json_dict.get("dataset", "unknown"),
+            language=json_dict.get("language", "en"),
             context_modality=json_dict.get("context_modality", "prose"),
             category=json_dict.get("category"),
             subcategory=json_dict.get("subcategory"),

diff --git a/tests/test_datasets_pytest.py b/tests/test_datasets_pytest.py
@@ -0,0 +1,40 @@
+"""Pytest tests for the datasets module."""
+
+from lettucedetect.datasets.hallucination_dataset import HallucinationSample
+
+
+def test_from_json_with_minimal_keys():
+    """Check that from_json defaults dataset to 'unknown' and language to 'en' when absent."""
+    sample = HallucinationSample.from_json(
+        {
+            "prompt": "User request: What is AI?\n\nAI is artificial intelligence.",
+            "answer": "AI stands for Artificial Intelligence.",
+            "labels": [],
+            "split": "test",
+            "task_type": "qa",
+        }
+    )
+    assert sample.prompt == "User request: What is AI?\n\nAI is artificial intelligence."
+    assert sample.answer == "AI stands for Artificial Intelligence."
+    assert sample.labels == []
+    assert sample.split == "test"
+    assert sample.task_type == "qa"
+    assert sample.dataset == "unknown"
+    assert sample.language == "en"
+
+
+def test_from_json_explicit_dataset_language():
+    """Check that dataset and language values given in the JSON dict are kept as supplied."""
+    sample = HallucinationSample.from_json(
+        {
+            "prompt": "Test prompt",
+            "answer": "Test answer",
+            "labels": [{"start": 0, "end": 4, "label": "intrinsic"}],
+            "split": "train",
+            "task_type": "summarization",
+            "dataset": "cnn_dailymail",
+            "language": "de",
+        }
+    )
+    assert sample.dataset == "cnn_dailymail"
+    assert sample.language == "de"