Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions sieval/community/mbpp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""MBPP prompt helpers adapted from lm-evaluation-harness.

The fixed task_id 2/3/4 few-shot examples are copied verbatim (including their
``\\r\\n`` line endings) from lm-evaluation-harness:
https://github.com/EleutherAI/lm-evaluation-harness/blob/1dd931087362abba74e0375c8c631295559f48b2/lm_eval/tasks/mbpp/utils.py
lm-evaluation-harness is distributed under the MIT License
(Copyright (c) 2020 EleutherAI). The example data itself originates from the
MBPP dataset (Austin et al., 2021), CC-BY-4.0.

AI-Generated Code - Claude Opus 4.8 (1M context) (Anthropic)
"""

from typing import TypedDict


class MBPPFewShotSample(TypedDict):
    """Schema of one hard-coded MBPP few-shot exemplar.

    Every dict returned by ``list_fewshot_samples`` follows this layout.
    """

    # MBPP task identifier of the exemplar.
    task_id: int
    # Natural-language statement of the programming problem.
    text: str
    # Source code of the solution shown for the problem.
    code: str
    # ``assert`` statements that exercise the solution.
    test_list: list[str]
    # Flag carried by each exemplar record.
    is_fewshot: bool


def list_fewshot_samples() -> list[MBPPFewShotSample]:
    """Return the fixed MBPP few-shot exemplars (task_id 2, 3 and 4).

    Each ``code`` value keeps its ``\\r\\n`` line endings and the indentation
    of its function body, so every exemplar is runnable Python and the
    assertions in its ``test_list`` pass against it. Odd spacing inside the
    strings (trailing blanks, missing spaces after commas) is deliberate:
    the text is meant to match the upstream data character for character.

    Returns:
        A newly built list of three samples in ascending ``task_id`` order.
        A fresh list is created on every call, so callers may mutate it.
    """
    return [
        {
            "task_id": 2,
            "text": (
                "Write a function to find the similar elements from the given two "
                "tuple lists."
            ),
            # Body lines are indented inside the literal; the indentation is
            # part of the prompt text shown to the model.
            "code": (
                "def similar_elements(test_tup1, test_tup2):\r\n"
                "  res = tuple(set(test_tup1) & set(test_tup2))\r\n"
                "  return (res) "
            ),
            "test_list": [
                "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
                "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
                (
                    "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) "
                    "== (13, 14)"
                ),
            ],
            "is_fewshot": True,
        },
        {
            "task_id": 3,
            "text": "Write a python function to identify non-prime numbers.",
            # Nested blocks need distinct indentation levels; with a flat
            # indent the ``for``/``if`` bodies would be a syntax error.
            "code": (
                "import math\r\n"
                "def is_not_prime(n):\r\n"
                "    result = False\r\n"
                "    for i in range(2,int(math.sqrt(n)) + 1):\r\n"
                "        if n % i == 0:\r\n"
                "            result = True\r\n"
                "    return result"
            ),
            "test_list": [
                "assert is_not_prime(2) == False",
                "assert is_not_prime(10) == True",
                "assert is_not_prime(35) == True",
            ],
            "is_fewshot": True,
        },
        {
            "task_id": 4,
            "text": (
                "Write a function to find the largest integers from a given list "
                "of numbers using heap queue algorithm."
            ),
            "code": (
                "import heapq as hq\r\n"
                "def heap_queue_largest(nums,n):\r\n"
                "  largest_nums = hq.nlargest(n, nums)\r\n"
                "  return largest_nums"
            ),
            "test_list": [
                (
                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
                    "22, 58],3)==[85, 75, 65] "
                ),
                (
                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
                    "22, 58],2)==[85, 75] "
                ),
                (
                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
                    "22, 58],5)==[85, 75, 65, 58, 35]"
                ),
            ],
            "is_fewshot": True,
        },
    ]
6 changes: 6 additions & 0 deletions sieval/datasets/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ from .math_500 import (
MATH500Dataset,
MATH500DatasetSample,
)
from .mbpp import (
MBPPDataset,
MBPPDatasetSample,
)
from .mmlu import (
MMLUDataset,
MMLUDatasetSample,
Expand Down Expand Up @@ -69,6 +73,8 @@ __all__ = [
"LiveCodeBenchDatasetSample",
"MATH500Dataset",
"MATH500DatasetSample",
"MBPPDataset",
"MBPPDatasetSample",
"MMLUDataset",
"MMLUDatasetSample",
"MMLUProDataset",
Expand Down
53 changes: 53 additions & 0 deletions sieval/datasets/mbpp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""
MBPP dataset loader (Mostly Basic Python Problems).

Loads ``google-research-datasets/mbpp`` config ``full`` — the same repo and
config lm-evaluation-harness uses. The repo natively ships the four official
splits prompt (10), test (500), validation (90), and train (374), so no
task_id-range split rebuild is needed here.

AI-Generated Code - Claude Opus 4.8 (1M context) (Anthropic)
"""

from typing import TypedDict, override

from datasets import DatasetDict as HFDatasetDict
from datasets import load_dataset

from sieval.core.datasets import (
Category,
Dataset,
Level1Category,
sieval_dataset,
)
from sieval.core.utils.hf import ensure_dataset_dict


class MBPPDatasetSample(TypedDict):
    """Typed record for a single MBPP dataset row.

    NOTE(review): the keys presumably mirror the column names of the Hugging
    Face ``full`` config -- confirm against the dataset card.
    """

    # Numeric identifier of the task.
    task_id: int
    # Problem statement (presumably natural language -- verify).
    text: str
    # Solution source code (presumably the reference answer -- verify).
    code: str
    # Test statements for the task, one string per entry.
    test_list: list[str]
    # Presumably code to run before the tests; may be empty -- TODO confirm.
    test_setup_code: str
    # Presumably additional, harder tests -- TODO confirm.
    challenge_test_list: list[str]


@sieval_dataset(
    name="mbpp",
    display_name="MBPP",
    description="Mostly Basic Python Problems: 974 entry-level Python tasks.",
    source="hf:google-research-datasets/mbpp@4bb6404fdc6cacfda99d4ac4205087b89d32030c",
    categories=(Category(Level1Category.CODE, "CodeGeneration"),),
    tags=("english", "python", "code-exec"),
    license="CC-BY-4.0",
)
class MBPPDataset(Dataset[MBPPDatasetSample]):
    """MBPP dataset, registered with sieval under the name ``mbpp``."""

    @override
    def load(
        self,
        name_or_path: str,
        config: str | None = "full",
        **kwargs,
    ) -> HFDatasetDict:
        """Load MBPP through ``datasets.load_dataset``.

        Args:
            name_or_path: Dataset repository name or local path, passed as
                the first positional argument to ``load_dataset``.
            config: Dataset configuration name, passed as the second
                positional argument; defaults to ``"full"``.
            **kwargs: Extra keyword arguments forwarded unchanged to
                ``load_dataset``.

        Returns:
            Whatever ``ensure_dataset_dict`` produces from the loaded object
            (presumably a normalized ``DatasetDict`` -- verify in
            ``sieval.core.utils.hf``).
        """
        return ensure_dataset_dict(
            load_dataset(name_or_path, config, **kwargs),
        )
43 changes: 43 additions & 0 deletions sieval/meta/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,27 @@
"deps_group": null,
"license": "MIT"
},
{
"name": "mbpp",
"display_name": "MBPP",
"description": "Mostly Basic Python Problems: 974 entry-level Python tasks.",
"source": [
"hf:google-research-datasets/mbpp@4bb6404fdc6cacfda99d4ac4205087b89d32030c"
],
"categories": [
{
"level1": "Code",
"level2": "CodeGeneration"
}
],
"tags": [
"english",
"python",
"code-exec"
],
"deps_group": null,
"license": "CC-BY-4.0"
},
{
"name": "mmlu",
"display_name": "MMLU",
Expand Down Expand Up @@ -438,6 +459,28 @@
},
"status": "stable"
},
{
"name": "mbpp_kshot_base_gen",
"display_name": "MBPP (few-shot, base generative)",
"description": "MBPP few-shot code generation with pass@k execution scoring.",
"dataset": "mbpp",
"eval_mode": "gen",
"n_shot": 3,
"tags": [
"english",
"python",
"code-exec",
"base-model"
],
"deps_group": null,
"model_type": "gen",
"reference_impl": {
"source": "lm-evaluation-harness",
"url": "https://github.com/EleutherAI/lm-evaluation-harness/blob/1dd931087362abba74e0375c8c631295559f48b2/lm_eval/tasks/mbpp/mbpp.yaml",
"notes": "Prompt, [DONE] stop token, and default task_id 2/3/4 few-shot samples mirror lm-eval MBPP; k (few-shot count) is configurable via YAML task args. Greedy generation (temperature=0, top_p=1, max_tokens=1024). Published Qwen2.5-72B-Base MBPP 3-shot Pass@1 is 76.0 (Qwen3 report, Table 3) and 72.6 (DeepSeek-V3 report, Table 3); DeepSeek-V3 leaves its MBPP protocol unspecified, so the gap to the Qwen-aligned number is a protocol difference, not an implementation error."
},
"status": "stable"
},
{
"name": "mmlu_0shot_gen",
"display_name": "MMLU (0-shot, generative)",
Expand Down
4 changes: 4 additions & 0 deletions sieval/tasks/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ from .livecodebench_code_generation_0shot_gen import (
from .math_500_0shot_gen import (
MATH500ZeroShotGenTask,
)
from .mbpp_kshot_base_gen import (
MBPPFewShotBaseGenTask,
)
from .mmlu_0shot_gen import (
MMLUZeroShotGenTask,
)
Expand All @@ -48,6 +51,7 @@ __all__ = [
"IFEvalZeroShotGenTask",
"LiveCodeBenchCodeGenerationZeroShotGenTask",
"MATH500ZeroShotGenTask",
"MBPPFewShotBaseGenTask",
"MMLUProZeroShotGenTask",
"MMLUZeroShotGenTask",
"TEvalBeforeCallingZeroShotGenTask",
Expand Down
Loading
Loading