-
Notifications
You must be signed in to change notification settings - Fork 0
β‘ Bolt: [performance improvement] Replace dict.setdefault with explicit membership checks to avoid eager instantiation #21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. Weβll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
|
|
||
| ## 2024-05-18 - Eager instantiation with dict.setdefault | ||
| **Learning:** `dict.setdefault(key, expensive_default)` inside hot loops eagerly evaluates `expensive_default` on *every* iteration, which is a major performance bottleneck for Python loops where defaults are objects or lists. | ||
| **Action:** Always replace `setdefault` with an explicit `if key not in dict: dict[key] = ...` membership check to prevent eager evaluation. | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -17,7 +17,6 @@ | |||||||||||||||||||
| HardmaxResult, | ||||||||||||||||||||
| NumberLike, | ||||||||||||||||||||
| ValueLike, | ||||||||||||||||||||
| _as_fraction, | ||||||||||||||||||||
| _coerce_key, | ||||||||||||||||||||
| _coerce_value, | ||||||||||||||||||||
| _normalize_number, | ||||||||||||||||||||
|
|
@@ -205,10 +204,12 @@ def _rebuild_if_needed(self) -> None: | |||||||||||||||||||
| total_value_sum = [Fraction(0) for _ in range(self._value_width or 0)] | ||||||||||||||||||||
|
|
||||||||||||||||||||
| for index, (key, value) in enumerate(self._entries): | ||||||||||||||||||||
| bucket = aggregates.setdefault( | ||||||||||||||||||||
| key, | ||||||||||||||||||||
| {"value_sum": [Fraction(0) for _ in value], "count": 0, "entry_indices": []}, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| # Avoid eager instantiation of expensive default dict/list for every iteration | ||||||||||||||||||||
| # by using an explicit membership check instead of setdefault. | ||||||||||||||||||||
| if key not in aggregates: | ||||||||||||||||||||
| aggregates[key] = {"value_sum": [Fraction(0) for _ in value], "count": 0, "entry_indices": []} | ||||||||||||||||||||
| bucket = aggregates[key] | ||||||||||||||||||||
|
Comment on lines
+208
to
+211
|
||||||||||||||||||||
| # by using an explicit membership check instead of setdefault. | |
| if key not in aggregates: | |
| aggregates[key] = {"value_sum": [Fraction(0) for _ in value], "count": 0, "entry_indices": []} | |
| bucket = aggregates[key] | |
| # by initializing the bucket lazily only when the key is first seen. | |
| bucket = aggregates.get(key) | |
| if bucket is None: | |
| bucket = {"value_sum": [Fraction(0) for _ in value], "count": 0, "entry_indices": []} | |
| aggregates[key] = bucket |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -697,14 +697,14 @@ def evaluate_free_running_programs( | |
|
|
||
| outcomes.append(outcome) | ||
| bucket = bucket_name(outcome.program_steps) | ||
| bucket_state = per_bucket.setdefault( | ||
| bucket, | ||
| { | ||
| # Avoid eager instantiation of default dict on every iteration | ||
| if bucket not in per_bucket: | ||
| per_bucket[bucket] = { | ||
| "program_count": 0, | ||
|
Comment on lines
699
to
703
|
||
| "exact_trace_count": 0, | ||
| "exact_final_state_count": 0, | ||
| }, | ||
| ) | ||
| } | ||
| bucket_state = per_bucket[bucket] | ||
| bucket_state["program_count"] += 1 | ||
| bucket_state["exact_trace_count"] += int(outcome.exact_trace_match) | ||
| bucket_state["exact_final_state_count"] += int(outcome.exact_final_state_match) | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -455,7 +455,10 @@ def fit_transition_library( | |||||||||||||||||||
| examples = build_transition_examples(programs, interpreter=interpreter) | ||||||||||||||||||||
| by_opcode: dict[Opcode, list[TransitionExample]] = {} | ||||||||||||||||||||
| for example in examples: | ||||||||||||||||||||
| by_opcode.setdefault(example.opcode, []).append(example) | ||||||||||||||||||||
| # Avoid eager instantiation of default list on every iteration | ||||||||||||||||||||
| if example.opcode not in by_opcode: | ||||||||||||||||||||
| by_opcode[example.opcode] = [] | ||||||||||||||||||||
| by_opcode[example.opcode].append(example) | ||||||||||||||||||||
|
Comment on lines
+458
to
+461
|
||||||||||||||||||||
| # Avoid eager instantiation of default list on every iteration | |
| if example.opcode not in by_opcode: | |
| by_opcode[example.opcode] = [] | |
| by_opcode[example.opcode].append(example) | |
| examples_for_opcode = by_opcode.get(example.opcode) | |
| if examples_for_opcode is None: | |
| examples_for_opcode = [] | |
| by_opcode[example.opcode] = examples_for_opcode | |
| examples_for_opcode.append(example) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -582,15 +582,15 @@ def evaluate_teacher_forced_model( | |
| total_correct += correct | ||
|
|
||
| bucket = baseline_bucket_name(example.program_steps) | ||
| bucket_state = per_bucket.setdefault( | ||
| bucket, | ||
| { | ||
| # Avoid eager instantiation of default dict on every iteration | ||
| if bucket not in per_bucket: | ||
| per_bucket[bucket] = { | ||
| "example_count": 0, | ||
|
Comment on lines
584
to
588
|
||
| "token_count": 0, | ||
| "correct_tokens": 0, | ||
| "weighted_loss": 0.0, | ||
| }, | ||
| ) | ||
| } | ||
| bucket_state = per_bucket[bucket] | ||
| bucket_state["example_count"] = int(bucket_state["example_count"]) + 1 | ||
| bucket_state["token_count"] = int(bucket_state["token_count"]) + token_count | ||
| bucket_state["correct_tokens"] = int(bucket_state["correct_tokens"]) + correct | ||
|
|
@@ -753,7 +753,10 @@ def evaluate_free_running_rollout( | |
| ) | ||
|
|
||
| bucket = baseline_bucket_name(example.program_steps) | ||
| bucket_state = per_bucket.setdefault(bucket, {"example_count": 0, "exact_count": 0}) | ||
| # Avoid eager instantiation of default dict on every iteration | ||
| if bucket not in per_bucket: | ||
| per_bucket[bucket] = {"example_count": 0, "exact_count": 0} | ||
| bucket_state = per_bucket[bucket] | ||
|
Comment on lines
755
to
+759
|
||
| bucket_state["example_count"] = int(bucket_state["example_count"]) + 1 | ||
| bucket_state["exact_count"] = int(bucket_state["exact_count"]) + int(exact) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -171,7 +171,10 @@ def exact_program_accuracy(scorer: TrainableLatestWriteScorer, samples: Sequence | |||||||||||||||||
| return 0.0 | ||||||||||||||||||
| per_program: dict[str, list[bool]] = {} | ||||||||||||||||||
| for sample in samples: | ||||||||||||||||||
| per_program.setdefault(sample.program_name, []).append(scorer.predict_index(sample) == sample.target_index) | ||||||||||||||||||
| # Avoid eager instantiation of default list on every iteration | ||||||||||||||||||
| if sample.program_name not in per_program: | ||||||||||||||||||
| per_program[sample.program_name] = [] | ||||||||||||||||||
| per_program[sample.program_name].append(scorer.predict_index(sample) == sample.target_index) | ||||||||||||||||||
|
Comment on lines
+174
to
+177
|
||||||||||||||||||
| exact = sum(1 for outcomes in per_program.values() if all(outcomes)) | ||||||||||||||||||
| return exact / len(per_program) | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
@@ -198,12 +201,21 @@ def evaluate_scorer( | |||||||||||||||||
| correct_samples += int(correct) | ||||||||||||||||||
|
|
||||||||||||||||||
| bucket = bucket_name(sample.program_steps) | ||||||||||||||||||
| bucket_state = per_bucket.setdefault(bucket, {"sample_count": 0, "sample_correct": 0, "programs": {}}) | ||||||||||||||||||
| # Avoid eager instantiation of default dict on every iteration | ||||||||||||||||||
| if bucket not in per_bucket: | ||||||||||||||||||
| per_bucket[bucket] = {"sample_count": 0, "sample_correct": 0, "programs": {}} | ||||||||||||||||||
| bucket_state = per_bucket[bucket] | ||||||||||||||||||
|
Comment on lines
+205
to
+207
|
||||||||||||||||||
| if bucket not in per_bucket: | |
| per_bucket[bucket] = {"sample_count": 0, "sample_correct": 0, "programs": {}} | |
| bucket_state = per_bucket[bucket] | |
| bucket_state = per_bucket.get(bucket) | |
| if bucket_state is None: | |
| bucket_state = {"sample_count": 0, "sample_correct": 0, "programs": {}} | |
| per_bucket[bucket] = bucket_state |
Copilot
AI
Apr 17, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
programs_dict = bucket_state["programs"] is followed by a membership check and then another index lookup. If this loop is performance-sensitive, consider using lst = programs_dict.get(sample.program_name) and initializing only when missing to reduce dict lookups.
| if sample.program_name not in programs_dict: | |
| programs_dict[sample.program_name] = [] | |
| programs_dict[sample.program_name].append(correct) | |
| program_outcomes = programs_dict.get(sample.program_name) | |
| if program_outcomes is None: | |
| program_outcomes = [] | |
| programs_dict[sample.program_name] = program_outcomes | |
| program_outcomes.append(correct) |
Copilot
AI
Apr 17, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This uses if sample.program_name not in per_program followed by per_program[sample.program_name], which does two lookups for existing keys. Using .get() to fetch the list and only allocating on miss avoids the extra lookup.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The guidance here is written as an absolute (βAlways replace
setdefaultβ¦β), butsetdefaultis only problematic when the default expression allocates/is expensive (and especially inside hot loops). Consider qualifying this recommendation (and/or recommending the one-lookupd.get(key)+ initialize-on-miss pattern) to avoid encouraging unnecessary rewrites or potential slowdowns where the default is cheap/constant.