Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions sieval/community/livecodebench/prompts/code_generation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,20 @@
# adapted from https://github.com/LiveCodeBench/LiveCodeBench/blob/28fef95ea8c9f7a547c8329f2cd3d32b92c1fa24/lcb_runner/prompts/code_generation.py
import json
from pathlib import Path

_FEWSHOT_DIR = Path(__file__).parent / "few_shot_examples" / "generation"

# Upstream loads these few-shot pools at module level; we only fix the path to
# be package-relative. Each pool is the 2 upstream examples verbatim plus 1
# sieval-authored example (marked with a `_source` key, ignored by the template)
# so the base-model template can reach a 3-shot setting — upstream ships only 2.
# Pool used for problems that come with starter code (selected when
# `has_starter` is true in `get_base_model_fewshot_prefix`).
with (_FEWSHOT_DIR / "func.json").open(encoding="utf-8") as _f:
    func = json.loads(_f.read())

# Pool used for problems without starter code (the `stdin` branch of the same
# selection).
with (_FEWSHOT_DIR / "stdin.json").open(encoding="utf-8") as _f:
    stdin = json.loads(_f.read())


class PromptConstants:
SYSTEM_MESSAGE_GENERIC = "You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests."

Expand Down Expand Up @@ -46,3 +62,62 @@ def get_generic_question_template_answer(question: dict, cot: bool = False) -> s
else:
prompt += "### Answer: (use the provided format with backticks)\n\n"
return prompt


# adapted from upstream `get_base_model_question_template_answer`. Two divergences
# from upstream, both documented on the consuming task: (1) generalized from
# upstream's hardcoded single example to `n_shot` in-context examples; (2) takes a
# `dict` instead of a `CodeGenerationProblem` (matching this repo's
# `get_generic_question_template_answer` convention). Decomposed into prefix +
# target so the fixed few-shot prefix can be cached once per run instead of rebuilt
# per sample; `get_base_model_question_template_answer` is retained as the faithful
# entry point and, for `n_shot == 1`, its output is byte-identical to upstream.
def _format_base_example(example: dict, has_starter: bool) -> str:
    """Render a single example in the base-model prompt layout.

    The output is a ``### Question`` section, an optional ``### Starter Code``
    section, and an ``### Answer`` section. A trailing blank line is appended
    only when the answer is non-empty, so an example with an empty answer ends
    right after the ``### Answer`` header and leaves the answer open.

    Args:
        example: Mapping with ``"question"`` and ``"answer"`` keys;
            ``"sample_code"`` is read only when ``has_starter`` is true.
        has_starter: Whether to emit the ``### Starter Code`` section.

    Returns:
        The formatted example text.
    """
    pieces = ["### Question\n", example["question"], "\n\n"]
    if has_starter:
        pieces += ["### Starter Code\n", example["sample_code"], "\n\n"]
    answer = example["answer"]
    pieces += ["### Answer\n\n", answer]
    if answer:
        # Separate a completed answer from whatever block follows it.
        pieces.append("\n\n")
    return "".join(pieces)


def get_base_model_fewshot_prefix(has_starter: bool, n_shot: int = 1) -> str:
    """Build the in-context prefix from the first `n_shot` pool examples.

    The pool is `func` when `has_starter` is true and `stdin` otherwise. The
    target question is not included. The result depends only on
    `has_starter`, `n_shot` and the module-level pools, so callers can compute
    it once and reuse it across samples.

    Args:
        has_starter: Selects the pool and whether each example shows starter code.
        n_shot: Number of leading pool examples to include; 0 yields "".

    Returns:
        The concatenated formatted examples.

    Raises:
        ValueError: If `n_shot` is negative or larger than the selected pool.
    """
    pool = func if has_starter else stdin
    if n_shot < 0:
        raise ValueError(f"n_shot must be >= 0, got {n_shot}")
    available = len(pool)
    if n_shot > available:
        problem_kind = "starter-code" if has_starter else "stdin"
        raise ValueError(
            f"n_shot={n_shot} exceeds the {available} available few-shot "
            f"examples for {problem_kind} problems."
        )
    rendered = [_format_base_example(shot, has_starter) for shot in pool[:n_shot]]
    return "".join(rendered)


def get_base_model_target_block(question_content: str, starter_code: str) -> str:
    """Format the target question as a final example whose answer is left empty.

    Args:
        question_content: Problem statement of the target question.
        starter_code: Starter code for the target; a falsy value omits the
            starter-code section.

    Returns:
        The formatted block, ending right after the answer header.
    """
    target = {
        "question": question_content,
        "sample_code": starter_code,
        "answer": "",
    }
    show_starter = bool(starter_code)
    return _format_base_example(target, show_starter)


def get_base_model_question_template_answer(question: dict, n_shot: int = 1) -> str:
    """Assemble the full base-model prompt: few-shot prefix plus target block.

    Args:
        question: Mapping with ``"question_content"`` and ``"starter_code"`` keys.
        n_shot: Number of in-context examples placed before the target.

    Returns:
        The few-shot prefix followed by the target question with an open answer.

    Raises:
        ValueError: Propagated from `get_base_model_fewshot_prefix` when
            `n_shot` is out of range for the selected pool.
    """
    uses_starter = bool(question["starter_code"])
    prefix = get_base_model_fewshot_prefix(uses_starter, n_shot)
    target = get_base_model_target_block(
        question["question_content"], question["starter_code"]
    )
    return prefix + target
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[
{
"question": "You are given a 0-indexed array of positive integers nums. Find the number of triplets (i, j, k) that meet the following conditions:\n\n0 <= i < j < k < nums.length\nnums[i], nums[j], and nums[k] are pairwise distinct.\n\t\nIn other words, nums[i] != nums[j], nums[i] != nums[k], and nums[j] != nums[k].\n\n\n\nReturn the number of triplets that meet the conditions.\n \nExample 1:\n\nInput: nums = [4,4,2,4,3]\nOutput: 3\nExplanation: The following triplets meet the conditions:\n- (0, 2, 4) because 4 != 2 != 3\n- (1, 2, 4) because 4 != 2 != 3\n- (2, 3, 4) because 2 != 4 != 3\nSince there are 3 triplets, we return 3.\nNote that (2, 0, 4) is not a valid triplet because 2 > 0.\n\nExample 2:\n\nInput: nums = [1,1,1,1,1]\nOutput: 0\nExplanation: No triplets meet the conditions so we return 0.\n\n \nConstraints:\n\n3 <= nums.length <= 100\n1 <= nums[i] <= 1000\n\n",
"sample_code": "class Solution:\n def unequalTriplets(self, nums: List[int]) -> int:\n ",
"answer": "class Solution:\n def unequalTriplets(self, a: List[int]) -> int:\n ans = 0\n n = len(a)\n for i in range(n):\n for j in range(i + 1, n):\n for k in range(j + 1, n):\n ans += len({a[i], a[j], a[k]}) == 3\n return ans"
},
{
"question": "You are given two strings s and t consisting of only lowercase English letters.\nReturn the minimum number of characters that need to be appended to the end of s so that t becomes a subsequence of s.\nA subsequence is a string that can be derived from another string by deleting some or no characters without changing the order of the remaining characters.\n \nExample 1:\n\nInput: s = \"coaching\", t = \"coding\"\nOutput: 4\nExplanation: Append the characters \"ding\" to the end of s so that s = \"coachingding\".\nNow, t is a subsequence of s (\"coachingding\").\nIt can be shown that appending any 3 characters to the end of s will never make t a subsequence.\n\nExample 2:\n\nInput: s = \"abcde\", t = \"a\"\nOutput: 0\nExplanation: t is already a subsequence of s (\"abcde\").\n\nExample 3:\n\nInput: s = \"z\", t = \"abcde\"\nOutput: 5\nExplanation: Append the characters \"abcde\" to the end of s so that s = \"zabcde\".\nNow, t is a subsequence of s (\"zabcde\").\nIt can be shown that appending any 4 characters to the end of s will never make t a subsequence.\n\n \nConstraints:\n\n1 <= s.length, t.length <= 10^5\ns and t consist only of lowercase English letters.\n\n",
"sample_code": "class Solution:\n def appendCharacters(self, s: str, t: str) -> int:\n ",
"answer": "class Solution:\n def appendCharacters(self, s: str, t: str) -> int:\n i = 0\n for char in s:\n if i < len(t) and char == t[i]:\n i += 1\n return len(t) - i"
},
{
"_source": "sieval-authored example (NOT from upstream LiveCodeBench)",
"question": "You are given an array nums consisting of integers. The running sum of an array is defined as runningSum[i] = sum of nums[0] through nums[i].\n\nReturn the running sum of nums.\n\n \nExample 1:\n\nInput: nums = [1,2,3,4]\nOutput: [1,3,6,10]\nExplanation: Running sum is obtained as follows: [1, 1+2, 1+2+3, 1+2+3+4].\n\nExample 2:\n\nInput: nums = [1,1,1,1,1]\nOutput: [1,2,3,4,5]\nExplanation: Running sum is obtained as follows: [1, 1+1, 1+1+1, 1+1+1+1, 1+1+1+1+1].\n\n \nConstraints:\n\n1 <= nums.length <= 1000\n-10^6 <= nums[i] <= 10^6\n\n",
"sample_code": "class Solution:\n def runningSum(self, nums: List[int]) -> List[int]:\n ",
"answer": "class Solution:\n def runningSum(self, nums: List[int]) -> List[int]:\n ans = []\n total = 0\n for x in nums:\n total += x\n ans.append(total)\n return ans"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[
{
"question": "You have $n$ gifts and you want to give all of them to children. Of course, you don't want to offend anyone, so all gifts should be equal between each other. The $i$-th gift consists of $a_i$ candies and $b_i$ oranges.\n\nDuring one move, you can choose some gift $1 \\le i \\le n$ and do one of the following operations:\n\n eat exactly one candy from this gift (decrease $a_i$ by one); eat exactly one orange from this gift (decrease $b_i$ by one); eat exactly one candy and exactly one orange from this gift (decrease both $a_i$ and $b_i$ by one). \n\nOf course, you can not eat a candy or orange if it's not present in the gift (so neither $a_i$ nor $b_i$ can become less than zero).\n\nAs said above, all gifts should be equal. This means that after some sequence of moves the following two conditions should be satisfied: $a_1 = a_2 = \\dots = a_n$ and $b_1 = b_2 = \\dots = b_n$ (and $a_i$ equals $b_i$ is not necessary).\n\nYour task is to find the minimum number of moves required to equalize all the given gifts.\n\nYou have to answer $t$ independent test cases.\n\n\n-----Input-----\n\nThe first line of the input contains one integer $t$ ($1 \\le t \\le 1000$) — the number of test cases. Then $t$ test cases follow.\n\nThe first line of the test case contains one integer $n$ ($1 \\le n \\le 50$) — the number of gifts. The second line of the test case contains $n$ integers $a_1, a_2, \\dots, a_n$ ($1 \\le a_i \\le 10^9$), where $a_i$ is the number of candies in the $i$-th gift. The third line of the test case contains $n$ integers $b_1, b_2, \\dots, b_n$ ($1 \\le b_i \\le 10^9$), where $b_i$ is the number of oranges in the $i$-th gift.\n\n\n-----Output-----\n\nFor each test case, print one integer: the minimum number of moves required to equalize all the given gifts.\n\n\n-----Example-----\nInput\n5\n3\n3 5 6\n3 2 3\n5\n1 2 3 4 5\n5 4 3 2 1\n3\n1 1 1\n2 2 2\n6\n1 1000000000 1000000000 1000000000 1000000000 1000000000\n1 1 1 1 1 1\n3\n10 12 8\n7 5 4\n\nOutput\n6\n16\n0\n4999999995\n7\n\n\n\n-----Note-----\n\nIn the first test case of the example, we can perform the following sequence of moves:\n\n choose the first gift and eat one orange from it, so $a = [3, 5, 6]$ and $b = [2, 2, 3]$; choose the second gift and eat one candy from it, so $a = [3, 4, 6]$ and $b = [2, 2, 3]$; choose the second gift and eat one candy from it, so $a = [3, 3, 6]$ and $b = [2, 2, 3]$; choose the third gift and eat one candy and one orange from it, so $a = [3, 3, 5]$ and $b = [2, 2, 2]$; choose the third gift and eat one candy from it, so $a = [3, 3, 4]$ and $b = [2, 2, 2]$; choose the third gift and eat one candy from it, so $a = [3, 3, 3]$ and $b = [2, 2, 2]$.",
"answer": "def minimum_moves(t, test_cases):\n for _ in range(t):\n n = test_cases[_][0]\n candies = test_cases[_][1]\n oranges = test_cases[_][2]\n min_candies = min(candies)\n min_oranges = min(oranges)\n ans = 0\n for i in range(n):\n ans += max(candies[i] - min_candies, oranges[i] - min_oranges)\n print(ans)\n\n\ndef main():\n t = int(input())\n test_cases = []\n for _ in range(t):\n n = int(input())\n candies = list(map(int, input().split()))\n oranges = list(map(int, input().split()))\n test_cases.append((n, candies, oranges))\n minimum_moves(t, test_cases)\n\n\nmain()\n"
},
{
"question": "Let's call a string a phone number if it has length 11 and fits the pattern \"8xxxxxxxxxx\", where each \"x\" is replaced by a digit.\n\nFor example, \"80123456789\" and \"80000000000\" are phone numbers, while \"8012345678\" and \"79000000000\" are not.\n\nYou have n cards with digits, and you want to use them to make as many phone numbers as possible. Each card must be used in at most one phone number, and you don't have to use all cards. The phone numbers do not necessarily have to be distinct.\n\nInput\n\nThe first line contains an integer n — the number of cards with digits that you have (1 ≤ n ≤ 100).\n\nThe second line contains a string of n digits (characters \"0\", \"1\", ..., \"9\") s_1, s_2, …, s_n. The string will not contain any other characters, such as leading or trailing spaces.\n\nOutput\n\nIf at least one phone number can be made from these cards, output the maximum number of phone numbers that can be made. Otherwise, output 0.\n\nExamples\n\nInput\n\n11\n00000000008\n\n\nOutput\n\n1\n\n\nInput\n\n22\n0011223344556677889988\n\n\nOutput\n\n2\n\n\nInput\n\n11\n31415926535\n\n\nOutput\n\n0\n\nNote\n\nIn the first example, one phone number, \"8000000000\", can be made from these cards.\n\nIn the second example, you can make two phone numbers from the cards, for example, \"80123456789\" and \"80123456789\".\n\nIn the third example you can't make any phone number from the given cards.",
"answer": "def count_phone_numbers(num_cards, card_digits):\n count_eights = card_digits.count(\"8\")\n max_phone_numbers = num_cards // 11\n max_possible = min(count_eights, max_phone_numbers)\n return max_possible\n\ndef main():\n num_cards = int(input())\n card_digits = input().strip()\n max_possible = count_phone_numbers(num_cards, card_digits)\n print(max_possible)\n\nmain()"
},
{
"_source": "sieval-authored example (NOT from upstream LiveCodeBench)",
"question": "You are given t test cases. Each test case consists of two integers a and b. For each test case, print the sum a + b on its own line.\n\nInput\n\nThe first line contains one integer t (1 <= t <= 100) - the number of test cases.\n\nEach of the next t lines contains two integers a and b (1 <= a, b <= 10^9).\n\nOutput\n\nFor each test case, print one integer - the value of a + b.\n\nExample\n\nInput\n\n3\n1 2\n10 20\n100 200\n\nOutput\n\n3\n30\n300\n",
"answer": "def main():\n t = int(input())\n for _ in range(t):\n a, b = map(int, input().split())\n print(a + b)\n\n\nmain()\n"
}
]
4 changes: 3 additions & 1 deletion sieval/datasets/livecodebench_code_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
)
from sieval.core.utils.hf import ensure_dataset_dict

LIVECODEBENCH_REVISION = "0fe84c3912ea0c4d4a78037083943e8f0c4dd505"

VERSION_FILES = {
"release_v0": [], # placeholder for initial version
"release_v1": ["test.jsonl"],
Expand Down Expand Up @@ -50,7 +52,7 @@ class LiveCodeBenchDatasetSample(TypedDict):
name="livecodebench_code_generation",
display_name="LiveCodeBench Code Generation",
description="LiveCodeBench code generation lite — contamination-free benchmark.",
source="hf:livecodebench/code_generation_lite",
source=f"hf:livecodebench/code_generation_lite@{LIVECODEBENCH_REVISION}",
categories=(Category(Level1Category.CODE, "CodeGeneration"),),
tags=("english", "python", "code-exec"),
# Mirrors upstream HF label verbatim (unversioned 'cc'); the license
Expand Down
24 changes: 23 additions & 1 deletion sieval/meta/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@
"display_name": "LiveCodeBench Code Generation",
"description": "LiveCodeBench code generation lite — contamination-free benchmark.",
"source": [
"hf:livecodebench/code_generation_lite"
"hf:livecodebench/code_generation_lite@0fe84c3912ea0c4d4a78037083943e8f0c4dd505"
],
"categories": [
{
Expand Down Expand Up @@ -418,6 +418,28 @@
},
"status": "stable"
},
{
"name": "livecodebench_code_generation_kshot_base_gen",
"display_name": "LiveCodeBench Code Generation (few-shot, base generative)",
"description": "LiveCodeBench — contamination-free code benchmark, base-model few-shot generation subset.",
"dataset": "livecodebench_code_generation",
"eval_mode": "gen",
"n_shot": 3,
"tags": [
"english",
"python",
"code-exec",
"base-model"
],
"deps_group": null,
"model_type": "gen",
"reference_impl": {
"source": "livecodebench",
"url": "https://github.com/LiveCodeBench/LiveCodeBench/blob/28fef95ea8c9f7a547c8329f2cd3d32b92c1fa24/lcb_runner/prompts/code_generation.py",
"notes": "Base-model few-shot template vendored from lcb_runner/prompts.get_base_model_question_template_answer and extract_code(lmstyle='GenericBase') from lcb_runner/utils. Default n_shot=3 and stop=('###',) follow DeepSeek-V3 Table 3 'LiveCodeBench-Base (Pass@1), 3-shot' and upstream's runner default. Upstream ships only 2 few-shot examples per pool, so the 3rd example is sieval-authored. Recommended problem window 2024-08-01..2024-11-01 is configured via dataset YAML args (version_tag/start_date/end_date)."
},
"status": "stable"
},
{
"name": "math_500_0shot_gen",
"display_name": "MATH-500 (0-shot, generative)",
Expand Down
4 changes: 4 additions & 0 deletions sieval/tasks/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ from .ifeval_0shot_gen import (
from .livecodebench_code_generation_0shot_gen import (
LiveCodeBenchCodeGenerationZeroShotGenTask,
)
from .livecodebench_code_generation_kshot_base_gen import (
LiveCodeBenchCodeGenerationFewShotBaseGenTask,
)
from .math_500_0shot_gen import (
MATH500ZeroShotGenTask,
)
Expand All @@ -46,6 +49,7 @@ __all__ = [
"GSM8KFewShotBaseGenTask",
"HumanEvalZeroShotGenTask",
"IFEvalZeroShotGenTask",
"LiveCodeBenchCodeGenerationFewShotBaseGenTask",
"LiveCodeBenchCodeGenerationZeroShotGenTask",
"MATH500ZeroShotGenTask",
"MMLUProZeroShotGenTask",
Expand Down
Loading
Loading