PrimeIntellect-ai · mikasenghaas · Apr 19, 2026 · Apr 19, 2026
diff --git a/configs/acereason_math/stage1.toml b/configs/acereason_math/stage1.toml
@@ -19,11 +19,11 @@ name = "stage1"
 batch_size = 1024
 rollouts_per_example = 8
 
-[orchestrator.sampling]
+[orchestrator.train.sampling]
 temperature = 0.6
-max_tokens = 8192
+max_completion_tokens = 8192
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 args = { dataset_name = "nvidia/AceReason-Math", dataset_subset = "default", question_key = "problem", math_verify_max_workers = 128, math_verify_timeout = 60 }
 

diff --git a/configs/acereason_math/stage2.toml b/configs/acereason_math/stage2.toml
@@ -20,11 +20,11 @@ name = "stage2"
 batch_size = 2048
 rollouts_per_example = 16
 
-[orchestrator.sampling]
+[orchestrator.train.sampling]
 temperature = 0.6
-max_tokens = 16384
+max_completion_tokens = 16384
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 args = { dataset_name = "nvidia/AceReason-Math", dataset_subset = "default", question_key = "problem", math_verify_max_workers = 128, math_verify_timeout = 60 }
 

diff --git a/configs/alphabet_sort/rl.toml b/configs/alphabet_sort/rl.toml
@@ -13,10 +13,10 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 batch_size = 512
 rollouts_per_example = 16
 
-[orchestrator.sampling]
-max_tokens = 512
+[orchestrator.train.sampling]
+max_completion_tokens = 512
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "alphabet-sort"
 name = "alphabet-sort"
 args = { min_turns = 2, max_turns = 2 }

diff --git a/configs/ci/integration/alphabet_sort/start.toml b/configs/ci/integration/alphabet_sort/start.toml
@@ -13,10 +13,10 @@ lr = 1e-5
 batch_size = 128
 rollouts_per_example = 8
 
-[orchestrator.sampling]
-max_tokens = 384
+[orchestrator.train.sampling]
+max_completion_tokens = 384
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "alphabet-sort"
 name = "alphabet-sort"
 args = { min_turns = 2, max_turns = 2, min_names_per_turn = 1, max_names_per_turn = 3, similarity_power = 4, power_per_turn = false }

diff --git a/configs/ci/integration/alphabet_sort_branch/start.toml b/configs/ci/integration/alphabet_sort_branch/start.toml
@@ -18,10 +18,10 @@ lr = 1e-5
 batch_size = 128
 rollouts_per_example = 8
 
-[orchestrator.sampling]
-max_tokens = 1024
+[orchestrator.train.sampling]
+max_completion_tokens = 1024
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "primeintellect/alphabet-sort"
 name = "alphabet-sort"
 args = { min_turns = 2, max_turns = 2, min_names_per_turn = 1, max_names_per_turn = 3, similarity_power = 4, power_per_turn = false }

diff --git a/configs/ci/integration/rl/resume.toml b/configs/ci/integration/rl/resume.toml
@@ -14,10 +14,10 @@ lr = 3e-6
 batch_size = 128
 rollouts_per_example = 16
 
-[orchestrator.sampling]
-max_tokens = 128
+[orchestrator.train.sampling]
+max_completion_tokens = 128
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "reverse-text"
 
 [inference]
diff --git a/configs/ci/integration/rl/start.toml b/configs/ci/integration/rl/start.toml
@@ -13,10 +13,10 @@ lr = 3e-6
 batch_size = 128
 rollouts_per_example = 16
 
-[orchestrator.sampling]
-max_tokens = 128
+[orchestrator.train.sampling]
+max_completion_tokens = 128
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "reverse-text"
 
 [inference]
diff --git a/configs/ci/integration/rl_lora/resume.toml b/configs/ci/integration/rl_lora/resume.toml
@@ -23,10 +23,10 @@ rollouts_per_example = 16
 [orchestrator.model.lora]
 name = "r8-1e-4"
 
-[orchestrator.sampling]
-max_tokens = 128
+[orchestrator.train.sampling]
+max_completion_tokens = 128
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "reverse-text"
 
 [inference]

diff --git a/configs/ci/integration/rl_lora/start.toml b/configs/ci/integration/rl_lora/start.toml
@@ -22,10 +22,10 @@ rollouts_per_example = 16
 [orchestrator.model.lora]
 name = "r8-1e-4"
 
-[orchestrator.sampling]
-max_tokens = 128
+[orchestrator.train.sampling]
+max_completion_tokens = 128
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "reverse-text"
 
 [inference]

diff --git a/configs/ci/integration/rl_moe/start.toml b/configs/ci/integration/rl_moe/start.toml
@@ -13,10 +13,10 @@ lr = 3e-6
 batch_size = 128
 rollouts_per_example = 16
 
-[orchestrator.sampling]
-max_tokens = 128
+[orchestrator.train.sampling]
+max_completion_tokens = 128
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "reverse-text"
 
 [inference]

diff --git a/configs/ci/integration/rl_multi_run/orchestrator.toml b/configs/ci/integration/rl_multi_run/orchestrator.toml
@@ -11,10 +11,10 @@ name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"
 [optim]
 lr = 3e-5
 
-[sampling]
-max_tokens = 128
+[train.sampling]
+max_completion_tokens = 128
 
-[[env]]
+[[train.env]]
 id = "reverse-text"
 
 [ckpt]

diff --git a/configs/ci/nightly/acereason_math.toml b/configs/ci/nightly/acereason_math.toml
@@ -17,11 +17,11 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 batch_size = 1024
 rollouts_per_example = 8
 
-[orchestrator.sampling]
+[orchestrator.train.sampling]
 temperature = 0.6
-max_tokens = 8192
+max_completion_tokens = 8192
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "acereason-math"
 args = { dataset_name = "nvidia/AceReason-Math", dataset_subset = "default", question_key = "problem", math_verify_max_workers = 128, math_verify_timeout = 60 }

diff --git a/configs/ci/nightly/multimodal_color_codeword.toml b/configs/ci/nightly/multimodal_color_codeword.toml
@@ -17,10 +17,10 @@ language_model_attr = "model.language_model"
 batch_size = 256
 rollouts_per_example = 16
 
-[orchestrator.sampling]
-max_tokens = 64
+[orchestrator.train.sampling]
+max_completion_tokens = 64
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "color-codeword"
 args = { images_per_turn = 1, max_turns = 3, num_examples = 1000, seed = 42 }
 

diff --git a/configs/debug/orch.toml b/configs/debug/orch.toml
@@ -2,6 +2,6 @@ max_steps = 5
 max_async_level = 5
 batch_size = 16
 
-[sampling]
-max_tokens = 16
+[train.sampling]
+max_completion_tokens = 16
 
diff --git a/configs/deepscaler/stage1.toml b/configs/deepscaler/stage1.toml
@@ -19,11 +19,11 @@ interval = 100
 batch_size = 1024
 rollouts_per_example = 8
 
-[orchestrator.sampling]
+[orchestrator.train.sampling]
 temperature = 0.6
-max_tokens = 8192
+max_completion_tokens = 8192
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "deepscaler"
 args = { dataset_name= "agentica-org/DeepScaleR-Preview-Dataset", dataset_subset = "default", question_key = "problem", answer_key = "solution", math_verify_max_workers = 128, math_verify_timeout = 60 }

diff --git a/configs/deepscaler/stage2.toml b/configs/deepscaler/stage2.toml
@@ -20,11 +20,11 @@ resume_step = 500
 batch_size = 1024
 rollouts_per_example = 8
 
-[orchestrator.sampling]
+[orchestrator.train.sampling]
 temperature = 0.6
-max_tokens = 16384
+max_completion_tokens = 16384
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "deepscaler"
 args = { dataset_name= "agentica-org/DeepScaleR-Preview-Dataset", dataset_subset = "default", question_key = "problem", answer_key = "solution", math_verify_max_workers = 128, math_verify_timeout = 60 }

diff --git a/configs/deepscaler/stage3.toml b/configs/deepscaler/stage3.toml
@@ -20,11 +20,11 @@ resume_step = 1000
 batch_size = 1024
 rollouts_per_example = 8
 
-[orchestrator.sampling]
+[orchestrator.train.sampling]
 temperature = 0.6
-max_tokens = 24576
+max_completion_tokens = 24576
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "deepscaler"
 args = { dataset_name= "agentica-org/DeepScaleR-Preview-Dataset", dataset_subset = "default", question_key = "problem", answer_key = "solution", math_verify_max_workers = 128, math_verify_timeout = 60 }

diff --git a/configs/elastic/rl.toml b/configs/elastic/rl.toml
@@ -32,15 +32,15 @@ lr = 1e-5
 batch_size = 512
 rollouts_per_example = 8
 
-[orchestrator.sampling]
-max_tokens = 768
+[orchestrator.train.sampling]
+max_completion_tokens = 768
 
 [orchestrator.client.elastic]
 hostname = "localhost"
 port = 8000
 sync_interval = 5.0
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "alphabet-sort"
 name = "alphabet-sort"
 args = { min_turns = 3, max_turns = 5, power_per_turn = false }
diff --git a/configs/env_mix/env_mix.toml b/configs/env_mix/env_mix.toml
@@ -16,27 +16,27 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 batch_size = 512
 rollouts_per_example = 16
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "math"
 args = { min_avg_reward = 0.1, rubric_max_workers = 128 }
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "code-env"
 name = "code"
 args = { min_avg_reward = 0.1 }
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "science-env"
 name = "science"
 args = { min_avg_reward = 0.1, pool_size = 128, rubric_max_workers = 128 }
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "logic-env"
 name = "logic"
 args = { min_avg_reward = 0.1 }
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "math-python"
 args = { min_avg_reward = 0.1, sandbox_client_max_workers = 128, sandbox_timeout_minutes = 10, sandbox_memory_gb = 1, sandbox_disk_size_gb = 1, pip_install_packages = "numpy sympy", python_tool = true, rubric_max_workers = 128 }

diff --git a/configs/gsm8k/rl.toml b/configs/gsm8k/rl.toml
@@ -12,10 +12,10 @@ name = "PrimeIntellect/Qwen3-0.6B"
 batch_size = 512
 rollouts_per_example = 16
 
-[orchestrator.sampling]
-max_tokens = 2048
+[orchestrator.train.sampling]
+max_completion_tokens = 2048
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "gsm8k"
 args = { dataset_name = "openai/gsm8k", dataset_subset = "main", math_verify_max_workers = 128, math_verify_timeout = 60 }

diff --git a/configs/hendrycks_math/rl.toml b/configs/hendrycks_math/rl.toml
@@ -12,10 +12,10 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 batch_size = 512
 rollouts_per_example = 16
 
-[orchestrator.sampling]
-max_tokens = 2048
+[orchestrator.train.sampling]
+max_completion_tokens = 2048
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "hendrycks-math"
 args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "default", math_verify_max_workers = 128, math_verify_timeout = 60 }
@@ -28,7 +28,7 @@ hard_threshold = 0.0
 interval = 10
 
 [orchestrator.eval.sampling]
-max_tokens = 2048
+max_completion_tokens = 2048
 
 [[orchestrator.eval.env]]
 id = "math500"

diff --git a/configs/hendrycks_math/sanity.toml b/configs/hendrycks_math/sanity.toml
@@ -12,7 +12,7 @@ name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 batch_size = 512
 rollouts_per_example = 8
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 args = { dataset_name = "mikasenghaas/Sanity-Test-R1D-1.5B", dataset_subset = "default" }
 name = "hendrycks-math"

diff --git a/configs/math_group/rl.toml b/configs/math_group/rl.toml
@@ -11,13 +11,13 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 batch_size = 256
 rollouts_per_example = 8
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "hendrycks-math"
 args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "default" }
 ratio = 0.5
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-env"
 name = "acereason-math"
 args = { dataset_name = "nvidia/AceReason-Math", dataset_subset = "default", question_key = "problem" }

diff --git a/configs/math_python/math_python.toml b/configs/math_python/math_python.toml
@@ -12,10 +12,10 @@ name = "Qwen/Qwen3-4B-Instruct-2507"
 batch_size = 512
 rollouts_per_example = 16
 
-[orchestrator.sampling]
-max_tokens = 512
+[orchestrator.train.sampling]
+max_completion_tokens = 512
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "math-python"
 args = { sandbox_client_max_workers = 128, sandbox_timeout_minutes = 10, sandbox_memory_gb = 1, sandbox_disk_size_gb = 1, pip_install_packages = "numpy sympy" }
 

diff --git a/configs/multimodal/rl_color_codeword.toml b/configs/multimodal/rl_color_codeword.toml
@@ -13,10 +13,10 @@ batch_size = 256
 rollouts_per_example = 16
 
 
-[orchestrator.sampling]
-max_tokens = 64
+[orchestrator.train.sampling]
+max_completion_tokens = 64
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "color-codeword"
 args = { images_per_turn = 1, max_turns = 3, num_examples = 1000, seed = 42 }
 

diff --git a/configs/multimodal/rl_color_codeword_test.toml b/configs/multimodal/rl_color_codeword_test.toml
@@ -13,10 +13,10 @@ language_model_attr = "model.language_model"
 batch_size = 16
 rollouts_per_example = 2
 
-[orchestrator.sampling]
-max_tokens = 32
+[orchestrator.train.sampling]
+max_completion_tokens = 32
 
-[[orchestrator.env]]
+[[orchestrator.train.env]]
 id = "color-codeword"
 args = { images_per_turn = 1, max_turns = 2, num_examples = 100, seed = 42 }