Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions sieval/community/mbpp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""MBPP prompt helpers adapted from lm-evaluation-harness.

The fixed task_id 2/3/4 few-shot examples are copied verbatim (including their
``\r\n`` line endings) from lm-evaluation-harness:
https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mbpp/utils.py
lm-evaluation-harness is distributed under the MIT License
(Copyright (c) 2020 EleutherAI). The example data itself originates from the
MBPP dataset (Austin et al., 2021), CC-BY-4.0.

AI-Generated Code - Claude Opus 4.8 (1M context) (Anthropic)
"""

from typing import TypedDict


class MBPPFewShotSample(TypedDict):
    """Shape of one fixed MBPP few-shot exemplar."""

    # MBPP task identifier of the exemplar.
    task_id: int
    # Natural-language problem statement.
    text: str
    # Reference solution source code.
    code: str
    # Assert statements (one per entry) that exercise the solution.
    test_list: list[str]
    # Marker flag carried on each exemplar.
    is_fewshot: bool


def list_fewshot_samples() -> list[MBPPFewShotSample]:
    """Return the three fixed MBPP few-shot exemplars (task_id 2, 3 and 4).

    Each exemplar carries the problem text, a reference solution, and the
    assert statements that the solution must satisfy.

    The ``code`` strings keep CRLF line endings and their original, slightly
    irregular whitespace (two-space indents for tasks 2 and 4, four-space
    indents for task 3, a trailing space after ``return (res)``). The
    indentation inside these strings is significant: every exemplar must be
    executable Python, otherwise the few-shot prompt shows broken code.

    Returns:
        A new list of exemplar dicts, ordered by ``task_id``.
    """
    return [
        {
            "task_id": 2,
            "text": (
                "Write a function to find the similar elements from the given two "
                "tuple lists."
            ),
            "code": (
                "def similar_elements(test_tup1, test_tup2):\r\n"
                "  res = tuple(set(test_tup1) & set(test_tup2))\r\n"
                "  return (res) "
            ),
            "test_list": [
                "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
                "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
                (
                    "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) "
                    "== (13, 14)"
                ),
            ],
            "is_fewshot": True,
        },
        {
            "task_id": 3,
            "text": "Write a python function to identify non-prime numbers.",
            "code": (
                "import math\r\n"
                "def is_not_prime(n):\r\n"
                "    result = False\r\n"
                "    for i in range(2,int(math.sqrt(n)) + 1):\r\n"
                # The loop body must be nested one level deeper than the
                # ``for``/``if`` headers for the exemplar to compile.
                "        if n % i == 0:\r\n"
                "            result = True\r\n"
                "    return result"
            ),
            "test_list": [
                "assert is_not_prime(2) == False",
                "assert is_not_prime(10) == True",
                "assert is_not_prime(35) == True",
            ],
            "is_fewshot": True,
        },
        {
            "task_id": 4,
            "text": (
                "Write a function to find the largest integers from a given list "
                "of numbers using heap queue algorithm."
            ),
            "code": (
                "import heapq as hq\r\n"
                "def heap_queue_largest(nums,n):\r\n"
                "  largest_nums = hq.nlargest(n, nums)\r\n"
                "  return largest_nums"
            ),
            "test_list": [
                (
                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
                    "22, 58],3)==[85, 75, 65] "
                ),
                (
                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
                    "22, 58],2)==[85, 75] "
                ),
                (
                    "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, "
                    "22, 58],5)==[85, 75, 65, 58, 35]"
                ),
            ],
            "is_fewshot": True,
        },
    ]
6 changes: 6 additions & 0 deletions sieval/datasets/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ from .math_500 import (
MATH500Dataset,
MATH500DatasetSample,
)
from .mbpp import (
MBPPDataset,
MBPPDatasetSample,
)
from .mmlu import (
MMLUDataset,
MMLUDatasetSample,
Expand Down Expand Up @@ -69,6 +73,8 @@ __all__ = [
"LiveCodeBenchDatasetSample",
"MATH500Dataset",
"MATH500DatasetSample",
"MBPPDataset",
"MBPPDatasetSample",
"MMLUDataset",
"MMLUDatasetSample",
"MMLUProDataset",
Expand Down
17 changes: 15 additions & 2 deletions sieval/datasets/downloaders/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,14 @@ def download(
try:
with httpx.stream("GET", url, timeout=_TIMEOUT, follow_redirects=True) as r:
r.raise_for_status()
# Catches 2xx responses that dropped the connection mid-stream.
expected = _parse_content_length(r.headers.get("content-length"))
# Catches 2xx identity responses that dropped the connection
# mid-stream. For compressed transport, Content-Length is the
# encoded byte count while iter_bytes() yields decoded bytes.
expected = (
None
if _has_compressed_content(r.headers)
else _parse_content_length(r.headers.get("content-length"))
)
written = 0
with tmp.open("wb") as f:
for chunk in r.iter_bytes(chunk_size=1 << 16):
Expand Down Expand Up @@ -87,3 +93,10 @@ def _parse_content_length(raw: str | None) -> int | None:
return int(raw.strip())
except ValueError:
return None


def _has_compressed_content(headers: httpx.Headers | dict[str, str]) -> bool:
encoding = headers.get("content-encoding")
if encoding is None:
return False
return encoding.strip().lower() not in {"", "identity"}
113 changes: 113 additions & 0 deletions sieval/datasets/mbpp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
MBPP dataset loader (Mostly Basic Python Problems).

Loads the upstream ``mbpp.jsonl`` and rebuilds the four official splits by
``task_id`` range: prompt (1-10), test (11-510), validation (511-600), and
train (601-974).

AI-Generated Code - Claude Opus 4.8 (1M context) (Anthropic)
"""

from pathlib import Path
from typing import TypedDict, override

from datasets import DatasetDict as HFDatasetDict
from datasets import load_dataset

from sieval.core.datasets import (
Category,
Dataset,
Level1Category,
sieval_dataset,
)
from sieval.core.utils.hf import ensure_dataset_dict

MBPP_JSONL_URL = (
"https://raw.githubusercontent.com/google-research/google-research/"
"2529b9bcfb930399929b047731804d40dc9a9e2a/mbpp/mbpp.jsonl"
)
_MBPP_FILENAME = "mbpp.jsonl"


class MBPPDatasetSample(TypedDict):
    """Shape of one normalized MBPP row."""

    # MBPP task identifier; the loader in this file splits rows by its range.
    task_id: int
    # Natural-language problem statement.
    text: str
    # Reference solution source code.
    code: str
    # Assert statements for the task.
    test_list: list[str]
    # Setup code for the tests; normalized to "" when absent.
    test_setup_code: str
    # Additional assert statements; normalized to [] when absent.
    challenge_test_list: list[str]


def _process_sample(sample: dict) -> MBPPDatasetSample:
return {
"task_id": sample["task_id"],
"text": sample.get("text") or sample.get("prompt", ""),
"code": sample["code"],
"test_list": sample["test_list"],
"test_setup_code": sample.get("test_setup_code") or "",
"challenge_test_list": sample.get("challenge_test_list") or [],
}


def _resolve_data_file(name_or_path: str) -> str:
if name_or_path.startswith(("http://", "https://")):
return name_or_path

path = Path(name_or_path)
if path.is_dir():
path = path / _MBPP_FILENAME
if not path.exists():
raise FileNotFoundError(
f"MBPP data file not found: {path}\n"
"Tip: run `sieval dataset download mbpp` to fetch the dataset."
)
return str(path)


@sieval_dataset(
    name="mbpp",
    display_name="MBPP",
    description="Mostly Basic Python Problems: 974 entry-level Python tasks.",
    source=f"url:{MBPP_JSONL_URL}",
    categories=(Category(Level1Category.CODE, "CodeGeneration"),),
    tags=("english", "python", "code-exec"),
    license="CC-BY-4.0",
)
class MBPPDataset(Dataset[MBPPDatasetSample]):
    """MBPP loader that rebuilds the splits from ``task_id`` ranges."""

    @override
    def load(
        self,
        name_or_path: str,
        config: str | None = None,
        **kwargs,
    ) -> HFDatasetDict:
        """Load the MBPP JSONL file and partition it into four splits.

        Args:
            name_or_path: URL, file path, or directory accepted by
                ``_resolve_data_file``.
            config: Forwarded as the second positional argument of
                ``load_dataset``.
            **kwargs: Extra keyword arguments forwarded to ``load_dataset``.

        Returns:
            A dataset dict with ``prompt``, ``test``, ``validation`` and
            ``train`` splits.

        Raises:
            FileNotFoundError: If a local data file cannot be found.
            ValueError: If any of the four splits ends up empty.
        """
        raw = load_dataset(
            "json",
            config,
            data_files={"full": _resolve_data_file(name_or_path)},
            **kwargs,
        )
        full = ensure_dataset_dict(raw)["full"].map(_process_sample)

        def task_id_between(low: int, high: int):
            # Inclusive range predicate over a row's task_id.
            return lambda sample: low <= sample["task_id"] <= high

        splits = HFDatasetDict(
            {
                "prompt": full.filter(task_id_between(1, 10)),
                "test": full.filter(task_id_between(11, 510)),
                "validation": full.filter(task_id_between(511, 600)),
                "train": full.filter(task_id_between(601, 974)),
            }
        )

        # An empty split means the file does not cover the expected id ranges.
        empty = [name for name, split in splits.items() if len(split) == 0]
        if not empty:
            return splits

        raise ValueError(
            f"MBPP produced empty split(s) {empty} from {len(full)} rows. "
            "Expected task_id ranges prompt(1-10)/test(11-510)/"
            "validation(511-600)/train(601-974); the data file may be "
            "truncated or use unexpected task_id values."
        )
43 changes: 43 additions & 0 deletions sieval/meta/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,27 @@
"deps_group": null,
"license": "MIT"
},
{
"name": "mbpp",
"display_name": "MBPP",
"description": "Mostly Basic Python Problems: 974 entry-level Python tasks.",
"source": [
"url:https://raw.githubusercontent.com/google-research/google-research/2529b9bcfb930399929b047731804d40dc9a9e2a/mbpp/mbpp.jsonl"
],
"categories": [
{
"level1": "Code",
"level2": "CodeGeneration"
}
],
"tags": [
"english",
"python",
"code-exec"
],
"deps_group": null,
"license": "CC-BY-4.0"
},
{
"name": "mmlu",
"display_name": "MMLU",
Expand Down Expand Up @@ -438,6 +459,28 @@
},
"status": "stable"
},
{
"name": "mbpp_kshot_base_gen",
"display_name": "MBPP (few-shot, base generative)",
"description": "MBPP few-shot code generation with pass@k execution scoring.",
"dataset": "mbpp",
"eval_mode": "gen",
"n_shot": 3,
"tags": [
"english",
"python",
"code-exec",
"base-model"
],
"deps_group": null,
"model_type": "gen",
"reference_impl": {
"source": "lm-evaluation-harness",
"url": "https://github.com/EleutherAI/lm-evaluation-harness/blob/1dd931087362abba74e0375c8c631295559f48b2/lm_eval/tasks/mbpp/mbpp.yaml",
"notes": "Prompt, [DONE] stop token, and default task_id 2/3/4 few-shot samples mirror lm-eval MBPP; num_shots is configurable via YAML task args. Greedy generation (temperature=0, top_p=1, max_tokens=1024). Published Qwen2.5-72B-Base MBPP 3-shot Pass@1 is 76.0 (Qwen3 report, Table 3) and 72.6 (DeepSeek-V3 report, Table 3)."
},
"status": "stable"
},
{
"name": "mmlu_0shot_gen",
"display_name": "MMLU (0-shot, generative)",
Expand Down
4 changes: 4 additions & 0 deletions sieval/tasks/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ from .livecodebench_code_generation_0shot_gen import (
from .math_500_0shot_gen import (
MATH500ZeroShotGenTask,
)
from .mbpp_kshot_base_gen import (
MBPPFewShotBaseGenTask,
)
from .mmlu_0shot_gen import (
MMLUZeroShotGenTask,
)
Expand All @@ -48,6 +51,7 @@ __all__ = [
"IFEvalZeroShotGenTask",
"LiveCodeBenchCodeGenerationZeroShotGenTask",
"MATH500ZeroShotGenTask",
"MBPPFewShotBaseGenTask",
"MMLUProZeroShotGenTask",
"MMLUZeroShotGenTask",
"TEvalBeforeCallingZeroShotGenTask",
Expand Down
Loading
Loading