scitix · jack-scitix-ai · Jun 22, 2026 · Jun 22, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/sieval/community/mbpp.py b/sieval/community/mbpp.py
@@ -0,0 +1,94 @@
+"""MBPP prompt helpers adapted from lm-evaluation-harness.
+
+The fixed task_id 2/3/4 few-shot examples are copied verbatim (including their
+``\r\n`` line endings) from lm-evaluation-harness:
+https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mbpp/utils.py
+lm-evaluation-harness is distributed under the MIT License
+(Copyright (c) 2020 EleutherAI). The example data itself originates from the
+MBPP dataset (Austin et al., 2021), CC-BY-4.0.
+
+AI-Generated Code - Claude Opus 4.8 (1M context) (Anthropic)
+"""
+
+from typing import TypedDict
+
+
+class MBPPFewShotSample(TypedDict):
+    """One fixed MBPP few-shot example as returned by ``list_fewshot_samples``."""
+
+    # MBPP task identifier of the example.
+    task_id: int
+    # Natural-language statement of the programming task.
+    text: str
+    # Python source of the example solution, stored as a single string.
+    code: str
+    # ``assert`` statements that exercise the solution.
+    test_list: list[str]
+    # Set to ``True`` on every sample built in this module.
+    is_fewshot: bool
+
+
+def list_fewshot_samples() -> list[MBPPFewShotSample]:
+    """Return the three fixed few-shot examples (task_id 2, 3 and 4).
+
+    A new list of new dicts is built on every call. The string literals are
+    kept exactly as written, including CRLF line endings inside ``code`` and
+    trailing spaces in some entries, so do not reformat them.
+    """
+    return [
+        {
+            "task_id": 2,
+            "text": (
+                "Write a function to find the similar elements from the given two "
+                "tuple lists."
+            ),
+            "code": (
+                "def similar_elements(test_tup1, test_tup2):\r\n"
+                "  res = tuple(set(test_tup1) & set(test_tup2))\r\n"
+                "  return (res) "
+            ),
+            "test_list": [
+                "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
+                "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
+                (
+                    "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) "
+                    "== (13, 14)"
+                ),
+            ],
+            "is_fewshot": True,
+        },
+        {
+            "task_id": 3,
+            "text": "Write a python function to identify non-prime numbers.",
+            "code": (
+                "import math\r\n"
+                "def is_not_prime(n):\r\n"
+                "    result = False\r\n"
+                "    for i in range(2,int(math.sqrt(n)) + 1):\r\n"
+                "        if n % i == 0:\r\n"
+                "            result = True\r\n"
+                "    return result"
+            ),
+            "test_list": [
+                "assert is_not_prime(2) == False",
+                "assert is_not_prime(10) == True",
+                "assert is_not_prime(35) == True",
+            ],
+            "is_fewshot": True,
+        },
+        {
+            "task_id": 4,
+            "text": (
+                "Write a function to find the largest integers from a given list "
+                "of numbers using heap queue algorithm."
+            ),
+            "code": (
+                "import heapq as hq\r\n"
+                "def heap_queue_largest(nums,n):\r\n"
+                "  largest_nums = hq.nlargest(n, nums)\r\n"
+                "  return largest_nums"
+            ),
+            "test_list": [
+                (
+                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
+                    "22, 58],3)==[85, 75, 65] "
+                ),
+                (
+                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
+                    "22, 58],2)==[85, 75] "
+                ),
+                (
+                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
+                    "22, 58],5)==[85, 75, 65, 58, 35]"
+                ),
+            ],
+            "is_fewshot": True,
+        },
+    ]
diff --git a/sieval/datasets/__init__.pyi b/sieval/datasets/__init__.pyi
@@ -37,6 +37,10 @@ from .math_500 import (
     MATH500Dataset,
     MATH500DatasetSample,
 )
+from .mbpp import (
+    MBPPDataset,
+    MBPPDatasetSample,
+)
 from .mmlu import (
     MMLUDataset,
     MMLUDatasetSample,
@@ -69,6 +73,8 @@ __all__ = [
     "LiveCodeBenchDatasetSample",
     "MATH500Dataset",
     "MATH500DatasetSample",
+    "MBPPDataset",
+    "MBPPDatasetSample",
     "MMLUDataset",
     "MMLUDatasetSample",
     "MMLUProDataset",

diff --git a/sieval/datasets/downloaders/url.py b/sieval/datasets/downloaders/url.py
@@ -34,8 +34,14 @@ def download(
         try:
             with httpx.stream("GET", url, timeout=_TIMEOUT, follow_redirects=True) as r:
                 r.raise_for_status()
-                # Catches 2xx responses that dropped the connection mid-stream.
-                expected = _parse_content_length(r.headers.get("content-length"))
+                # Catches 2xx identity responses that dropped the connection
+                # mid-stream. For compressed transport, Content-Length is the
+                # encoded byte count while iter_bytes() yields decoded bytes.
+                expected = (
+                    None
+                    if _has_compressed_content(r.headers)
+                    else _parse_content_length(r.headers.get("content-length"))
+                )
                 written = 0
                 with tmp.open("wb") as f:
                     for chunk in r.iter_bytes(chunk_size=1 << 16):
@@ -87,3 +93,10 @@ def _parse_content_length(raw: str | None) -> int | None:
         return int(raw.strip())
     except ValueError:
         return None
+
+
+def _has_compressed_content(headers: httpx.Headers | dict[str, str]) -> bool:
+    """Report whether ``Content-Encoding`` names something other than identity.
+
+    Returns ``False`` when the header is absent, blank, or ``identity``
+    (compared case-insensitively after stripping whitespace); ``True`` for any
+    other value.
+    """
+    raw_value = headers.get("content-encoding")
+    if raw_value is None:
+        return False
+    normalized = raw_value.strip().lower()
+    # Blank and "identity" both mean the body is sent without a content coding.
+    return normalized != "" and normalized != "identity"
diff --git a/sieval/datasets/mbpp.py b/sieval/datasets/mbpp.py
@@ -0,0 +1,113 @@
+"""
+MBPP dataset loader (Mostly Basic Python Problems).
+
+Loads the upstream ``mbpp.jsonl`` and rebuilds the four official splits by
+``task_id`` range: prompt (1-10), test (11-510), validation (511-600), and
+train (601-974).
+
+AI-Generated Code - Claude Opus 4.8 (1M context) (Anthropic)
+"""
+
+from pathlib import Path
+from typing import TypedDict, override
+
+from datasets import DatasetDict as HFDatasetDict
+from datasets import load_dataset
+
+from sieval.core.datasets import (
+    Category,
+    Dataset,
+    Level1Category,
+    sieval_dataset,
+)
+from sieval.core.utils.hf import ensure_dataset_dict
+
+# Upstream MBPP JSONL file. The URL names a 40-character hex ref rather than a
+# branch, so it always points at the same revision of the file.
+MBPP_JSONL_URL = (
+    "https://raw.githubusercontent.com/google-research/google-research/"
+    "2529b9bcfb930399929b047731804d40dc9a9e2a/mbpp/mbpp.jsonl"
+)
+# File name looked up when ``_resolve_data_file`` is handed a directory.
+_MBPP_FILENAME = "mbpp.jsonl"
+
+
+class MBPPDatasetSample(TypedDict):
+    """Normalized MBPP row as produced by ``_process_sample``."""
+
+    # Copied unchanged from the input row.
+    task_id: int
+    # Task statement: the row's ``text``, else its ``prompt``, else ``""``.
+    text: str
+    # Copied unchanged from the input row.
+    code: str
+    # Copied unchanged from the input row.
+    test_list: list[str]
+    # ``""`` when the row has no (or a null/empty) ``test_setup_code``.
+    test_setup_code: str
+    # ``[]`` when the row has no (or a null/empty) ``challenge_test_list``.
+    challenge_test_list: list[str]
+
+
+def _process_sample(sample: dict) -> MBPPDatasetSample:
+    """Normalize one raw MBPP row into an ``MBPPDatasetSample``.
+
+    Args:
+        sample: Raw row. ``task_id``, ``code`` and ``test_list`` are required;
+            ``text``/``prompt``, ``test_setup_code`` and
+            ``challenge_test_list`` may be missing or null.
+
+    Returns:
+        A dict with exactly the ``MBPPDatasetSample`` keys, where every
+        optional field is replaced by an empty value of the declared type.
+
+    Raises:
+        KeyError: If ``task_id``, ``code`` or ``test_list`` is missing.
+    """
+    return {
+        "task_id": sample["task_id"],
+        # Chain ``or ""`` last so a present-but-null ``prompt`` cannot leak
+        # ``None`` into a field declared as ``str``.
+        "text": sample.get("text") or sample.get("prompt") or "",
+        "code": sample["code"],
+        "test_list": sample["test_list"],
+        "test_setup_code": sample.get("test_setup_code") or "",
+        "challenge_test_list": sample.get("challenge_test_list") or [],
+    }
+
+
+def _resolve_data_file(name_or_path: str) -> str:
+    """Turn a URL, file path, or directory path into a loadable data file.
+
+    ``http://`` and ``https://`` inputs are returned untouched. A directory is
+    expanded to the MBPP JSONL file inside it; any other path is used as-is.
+
+    Raises:
+        FileNotFoundError: If the resolved local path does not exist.
+    """
+    is_remote = name_or_path.startswith(("http://", "https://"))
+    if is_remote:
+        return name_or_path
+
+    candidate = Path(name_or_path)
+    resolved = candidate / _MBPP_FILENAME if candidate.is_dir() else candidate
+    if resolved.exists():
+        return str(resolved)
+    raise FileNotFoundError(
+        f"MBPP data file not found: {resolved}\n"
+        "Tip: run `sieval dataset download mbpp` to fetch the dataset."
+    )
+
+
+@sieval_dataset(
+    name="mbpp",
+    display_name="MBPP",
+    description="Mostly Basic Python Problems: 974 entry-level Python tasks.",
+    source=f"url:{MBPP_JSONL_URL}",
+    categories=(Category(Level1Category.CODE, "CodeGeneration"),),
+    tags=("english", "python", "code-exec"),
+    license="CC-BY-4.0",
+)
+class MBPPDataset(Dataset[MBPPDatasetSample]):
+    """MBPP loader that cuts one JSONL file into four ``task_id``-based splits."""
+
+    @override
+    def load(
+        self,
+        name_or_path: str,
+        config: str | None = None,
+        **kwargs,
+    ) -> HFDatasetDict:
+        """Load the MBPP rows and partition them by ``task_id`` range.
+
+        Args:
+            name_or_path: URL, JSONL file path, or directory holding the file.
+            config: Forwarded to ``load_dataset`` as its second positional
+                argument.
+            **kwargs: Forwarded to ``load_dataset`` unchanged.
+
+        Returns:
+            A dataset dict with ``prompt``, ``test``, ``validation`` and
+            ``train`` splits, each row normalized by ``_process_sample``.
+
+        Raises:
+            FileNotFoundError: If a local data file cannot be found.
+            ValueError: If any of the four splits ends up with no rows.
+        """
+
+        def task_id_between(low: int, high: int):
+            # Factory so every predicate closes over its own pair of bounds.
+            return lambda row: low <= row["task_id"] <= high
+
+        source_file = _resolve_data_file(name_or_path)
+        loaded = load_dataset(
+            "json",
+            config,
+            data_files={"full": source_file},
+            **kwargs,
+        )
+        rows = ensure_dataset_dict(loaded)["full"].map(_process_sample)
+
+        splits = HFDatasetDict(
+            {
+                "prompt": rows.filter(task_id_between(1, 10)),
+                "test": rows.filter(task_id_between(11, 510)),
+                "validation": rows.filter(task_id_between(511, 600)),
+                "train": rows.filter(task_id_between(601, 974)),
+            }
+        )
+
+        # An empty split means the file does not cover the expected id ranges.
+        vacant = [name for name, split in splits.items() if len(split) == 0]
+        if not vacant:
+            return splits
+        raise ValueError(
+            f"MBPP produced empty split(s) {vacant} from {len(rows)} rows. "
+            "Expected task_id ranges prompt(1-10)/test(11-510)/"
+            "validation(511-600)/train(601-974); the data file may be "
+            "truncated or use unexpected task_id values."
+        )
diff --git a/sieval/meta/index.json b/sieval/meta/index.json
@@ -190,6 +190,27 @@
       "deps_group": null,
       "license": "MIT"
     },
+    {
+      "name": "mbpp",
+      "display_name": "MBPP",
+      "description": "Mostly Basic Python Problems: 974 entry-level Python tasks.",
+      "source": [
+        "url:https://raw.githubusercontent.com/google-research/google-research/2529b9bcfb930399929b047731804d40dc9a9e2a/mbpp/mbpp.jsonl"
+      ],
+      "categories": [
+        {
+          "level1": "Code",
+          "level2": "CodeGeneration"
+        }
+      ],
+      "tags": [
+        "english",
+        "python",
+        "code-exec"
+      ],
+      "deps_group": null,
+      "license": "CC-BY-4.0"
+    },
     {
       "name": "mmlu",
       "display_name": "MMLU",
@@ -438,6 +459,28 @@
       },
       "status": "stable"
     },
+    {
+      "name": "mbpp_kshot_base_gen",
+      "display_name": "MBPP (few-shot, base generative)",
+      "description": "MBPP few-shot code generation with pass@k execution scoring.",
+      "dataset": "mbpp",
+      "eval_mode": "gen",
+      "n_shot": 3,
+      "tags": [
+        "english",
+        "python",
+        "code-exec",
+        "base-model"
+      ],
+      "deps_group": null,
+      "model_type": "gen",
+      "reference_impl": {
+        "source": "lm-evaluation-harness",
+        "url": "https://github.com/EleutherAI/lm-evaluation-harness/blob/1dd931087362abba74e0375c8c631295559f48b2/lm_eval/tasks/mbpp/mbpp.yaml",
+        "notes": "Prompt, [DONE] stop token, and default task_id 2/3/4 few-shot samples mirror lm-eval MBPP; num_shots is configurable via YAML task args. Greedy generation (temperature=0, top_p=1, max_tokens=1024). Published Qwen2.5-72B-Base MBPP 3-shot Pass@1 is 76.0 (Qwen3 report, Table 3) and 72.6 (DeepSeek-V3 report, Table 3)."
+      },
+      "status": "stable"
+    },
     {
       "name": "mmlu_0shot_gen",
       "display_name": "MMLU (0-shot, generative)",

diff --git a/sieval/tasks/__init__.pyi b/sieval/tasks/__init__.pyi
@@ -28,6 +28,9 @@ from .livecodebench_code_generation_0shot_gen import (
 from .math_500_0shot_gen import (
     MATH500ZeroShotGenTask,
 )
+from .mbpp_kshot_base_gen import (
+    MBPPFewShotBaseGenTask,
+)
 from .mmlu_0shot_gen import (
     MMLUZeroShotGenTask,
 )
@@ -48,6 +51,7 @@ __all__ = [
     "IFEvalZeroShotGenTask",
     "LiveCodeBenchCodeGenerationZeroShotGenTask",
     "MATH500ZeroShotGenTask",
+    "MBPPFewShotBaseGenTask",
     "MMLUProZeroShotGenTask",
     "MMLUZeroShotGenTask",
     "TEvalBeforeCallingZeroShotGenTask",