scitix · jack-scitix-ai · Jun 22, 2026 · Jun 22, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/sieval/community/mbpp.py b/sieval/community/mbpp.py
@@ -0,0 +1,94 @@
+"""MBPP prompt helpers adapted from lm-evaluation-harness.
+
+The fixed task_id 2/3/4 few-shot examples are copied verbatim (including their
+CRLF, i.e. ``\\r\\n``, line endings) from lm-evaluation-harness:
+https://github.com/EleutherAI/lm-evaluation-harness/blob/1dd931087362abba74e0375c8c631295559f48b2/lm_eval/tasks/mbpp/utils.py
+lm-evaluation-harness is distributed under the MIT License
+(Copyright (c) 2020 EleutherAI). The example data itself originates from the
+MBPP dataset (Austin et al., 2021), CC-BY-4.0.
+
+AI-Generated Code - Claude Opus 4.8 (1M context) (Anthropic)
+"""
+
+from typing import TypedDict
+
+
+class MBPPFewShotSample(TypedDict):
+    task_id: int
+    text: str
+    code: str
+    test_list: list[str]
+    is_fewshot: bool
+
+
+def list_fewshot_samples() -> list[MBPPFewShotSample]:
+    return [
+        {
+            "task_id": 2,
+            "text": (
+                "Write a function to find the similar elements from the given two "
+                "tuple lists."
+            ),
+            "code": (
+                "def similar_elements(test_tup1, test_tup2):\r\n"
+                "  res = tuple(set(test_tup1) & set(test_tup2))\r\n"
+                "  return (res) "
+            ),
+            "test_list": [
+                "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
+                "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
+                (
+                    "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) "
+                    "== (13, 14)"
+                ),
+            ],
+            "is_fewshot": True,
+        },
+        {
+            "task_id": 3,
+            "text": "Write a python function to identify non-prime numbers.",
+            "code": (
+                "import math\r\n"
+                "def is_not_prime(n):\r\n"
+                "    result = False\r\n"
+                "    for i in range(2,int(math.sqrt(n)) + 1):\r\n"
+                "        if n % i == 0:\r\n"
+                "            result = True\r\n"
+                "    return result"
+            ),
+            "test_list": [
+                "assert is_not_prime(2) == False",
+                "assert is_not_prime(10) == True",
+                "assert is_not_prime(35) == True",
+            ],
+            "is_fewshot": True,
+        },
+        {
+            "task_id": 4,
+            "text": (
+                "Write a function to find the largest integers from a given list "
+                "of numbers using heap queue algorithm."
+            ),
+            "code": (
+                "import heapq as hq\r\n"
+                "def heap_queue_largest(nums,n):\r\n"
+                "  largest_nums = hq.nlargest(n, nums)\r\n"
+                "  return largest_nums"
+            ),
+            "test_list": [
+                (
+                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
+                    "22, 58],3)==[85, 75, 65] "
+                ),
+                (
+                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
+                    "22, 58],2)==[85, 75] "
+                ),
+                (
+                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
+                    "22, 58],5)==[85, 75, 65, 58, 35]"
+                ),
+            ],
+            "is_fewshot": True,
+        },
+    ]
diff --git a/sieval/datasets/__init__.pyi b/sieval/datasets/__init__.pyi
@@ -37,6 +37,10 @@ from .math_500 import (
     MATH500Dataset,
     MATH500DatasetSample,
 )
+from .mbpp import (
+    MBPPDataset,
+    MBPPDatasetSample,
+)
 from .mmlu import (
     MMLUDataset,
     MMLUDatasetSample,
@@ -69,6 +73,8 @@ __all__ = [
     "LiveCodeBenchDatasetSample",
     "MATH500Dataset",
     "MATH500DatasetSample",
+    "MBPPDataset",
+    "MBPPDatasetSample",
     "MMLUDataset",
     "MMLUDatasetSample",
     "MMLUProDataset",

diff --git a/sieval/datasets/mbpp.py b/sieval/datasets/mbpp.py
@@ -0,0 +1,53 @@
+"""
+MBPP dataset loader (Mostly Basic Python Problems).
+
+Loads ``google-research-datasets/mbpp`` config ``full`` — the same repo and
+config lm-evaluation-harness uses. The repo natively ships the four official
+splits — prompt (10), test (500), validation (90), and train (374) — so no
+task_id-range split rebuild is needed here.
+
+AI-Generated Code - Claude Opus 4.8 (1M context) (Anthropic)
+"""
+
+from typing import TypedDict, override
+
+from datasets import DatasetDict as HFDatasetDict
+from datasets import load_dataset
+
+from sieval.core.datasets import (
+    Category,
+    Dataset,
+    Level1Category,
+    sieval_dataset,
+)
+from sieval.core.utils.hf import ensure_dataset_dict
+
+
+class MBPPDatasetSample(TypedDict):
+    task_id: int
+    text: str
+    code: str
+    test_list: list[str]
+    test_setup_code: str
+    challenge_test_list: list[str]
+
+
+@sieval_dataset(
+    name="mbpp",
+    display_name="MBPP",
+    description="Mostly Basic Python Problems: 974 entry-level Python tasks.",
+    source="hf:google-research-datasets/mbpp@4bb6404fdc6cacfda99d4ac4205087b89d32030c",
+    categories=(Category(Level1Category.CODE, "CodeGeneration"),),
+    tags=("english", "python", "code-exec"),
+    license="CC-BY-4.0",
+)
+class MBPPDataset(Dataset[MBPPDatasetSample]):
+    @override
+    def load(
+        self,
+        name_or_path: str,
+        config: str | None = "full",
+        **kwargs,
+    ) -> HFDatasetDict:
+        dataset = load_dataset(name_or_path, config, **kwargs)
+        return ensure_dataset_dict(dataset)
diff --git a/sieval/meta/index.json b/sieval/meta/index.json
@@ -190,6 +190,27 @@
       "deps_group": null,
       "license": "MIT"
     },
+    {
+      "name": "mbpp",
+      "display_name": "MBPP",
+      "description": "Mostly Basic Python Problems: 974 entry-level Python tasks.",
+      "source": [
+        "hf:google-research-datasets/mbpp@4bb6404fdc6cacfda99d4ac4205087b89d32030c"
+      ],
+      "categories": [
+        {
+          "level1": "Code",
+          "level2": "CodeGeneration"
+        }
+      ],
+      "tags": [
+        "english",
+        "python",
+        "code-exec"
+      ],
+      "deps_group": null,
+      "license": "CC-BY-4.0"
+    },
     {
       "name": "mmlu",
       "display_name": "MMLU",
@@ -438,6 +459,28 @@
       },
       "status": "stable"
     },
+    {
+      "name": "mbpp_kshot_base_gen",
+      "display_name": "MBPP (few-shot, base generative)",
+      "description": "MBPP few-shot code generation with pass@k execution scoring.",
+      "dataset": "mbpp",
+      "eval_mode": "gen",
+      "n_shot": 3,
+      "tags": [
+        "english",
+        "python",
+        "code-exec",
+        "base-model"
+      ],
+      "deps_group": null,
+      "model_type": "gen",
+      "reference_impl": {
+        "source": "lm-evaluation-harness",
+        "url": "https://github.com/EleutherAI/lm-evaluation-harness/blob/1dd931087362abba74e0375c8c631295559f48b2/lm_eval/tasks/mbpp/mbpp.yaml",
+        "notes": "Prompt, [DONE] stop token, and default task_id 2/3/4 few-shot samples mirror lm-eval MBPP; k (few-shot count) is configurable via YAML task args. Greedy generation (temperature=0, top_p=1, max_tokens=1024). Published Qwen2.5-72B-Base MBPP 3-shot Pass@1 is 76.0 (Qwen3 report, Table 3) and 72.6 (DeepSeek-V3 report, Table 3); DeepSeek-V3 leaves its MBPP protocol unspecified, so the gap to the Qwen-aligned number is a protocol difference, not an implementation error."
+      },
+      "status": "stable"
+    },
     {
       "name": "mmlu_0shot_gen",
       "display_name": "MMLU (0-shot, generative)",

diff --git a/sieval/tasks/__init__.pyi b/sieval/tasks/__init__.pyi
@@ -28,6 +28,9 @@ from .livecodebench_code_generation_0shot_gen import (
 from .math_500_0shot_gen import (
     MATH500ZeroShotGenTask,
 )
+from .mbpp_kshot_base_gen import (
+    MBPPFewShotBaseGenTask,
+)
 from .mmlu_0shot_gen import (
     MMLUZeroShotGenTask,
 )
@@ -48,6 +51,7 @@ __all__ = [
     "IFEvalZeroShotGenTask",
     "LiveCodeBenchCodeGenerationZeroShotGenTask",
     "MATH500ZeroShotGenTask",
+    "MBPPFewShotBaseGenTask",
     "MMLUProZeroShotGenTask",
     "MMLUZeroShotGenTask",
     "TEvalBeforeCallingZeroShotGenTask",