pyronear · rensortino · May 26, 2026 · May 26, 2026 · May 26, 2026 · Jun 2, 2026
diff --git a/experiments/temporal-models/temporal-model-leaderboard/dvc.yaml b/experiments/temporal-models/temporal-model-leaderboard/dvc.yaml
@@ -79,6 +79,26 @@ stages:
     outs:
       - data/07_model_output/bbox-tube-temporal-gru-convnext-finetune
 
+  # Best combination from the tube-multiscale-fusion ablation sweep:
+  # tubelet spatial x transformer temporal (the dinov2_multiscale default),
+  # which had the top val F1 and perfect recall at the lowest latency — see
+  # ../tube-multiscale-fusion/COMBINED_VARIATIONS.md.
+  evaluate_tube_multiscale_fusion_dinov2_multiscale:
+    cmd: >-
+      uv run python scripts/evaluate.py
+      --model-name tube-multiscale-fusion-dinov2-multiscale
+      --model-type tube-multiscale-fusion
+      --model-package data/01_raw/models/tube-multiscale-fusion-dinov2-multiscale.zip
+      --test-dir data/01_raw/sequential_test/test
+      --output-dir data/07_model_output/tube-multiscale-fusion-dinov2-multiscale
+    deps:
+      - scripts/evaluate.py
+      - src/temporal_model_leaderboard
+      - data/01_raw/sequential_test
+      - data/01_raw/models/tube-multiscale-fusion-dinov2-multiscale.zip
+    outs:
+      - data/07_model_output/tube-multiscale-fusion-dinov2-multiscale
+
   leaderboard:
     cmd: >-
       uv run python scripts/leaderboard.py

diff --git a/experiments/temporal-models/temporal-model-leaderboard/pyproject.toml b/experiments/temporal-models/temporal-model-leaderboard/pyproject.toml
@@ -9,6 +9,7 @@ dependencies = [
     "mtb-change-detection",
     "pyro-detector-baseline",
     "bbox-tube-temporal",
+    "tube-multiscale-fusion",
     "tqdm>=4.67.3",
 ]
 
@@ -18,6 +19,7 @@ tracking-fsm-baseline = { path = "../tracking-fsm-baseline" }
 mtb-change-detection = { path = "../mtb-change-detection" }
 pyro-detector-baseline = { path = "../pyro-detector-baseline" }
 bbox-tube-temporal = { path = "../bbox-tube-temporal" }
+tube-multiscale-fusion = { path = "../tube-multiscale-fusion" }
 
 [dependency-groups]
 dev = [

diff --git a/...nts/temporal-models/temporal-model-leaderboard/src/temporal_model_leaderboard/registry.py b/...nts/temporal-models/temporal-model-leaderboard/src/temporal_model_leaderboard/registry.py
@@ -26,6 +26,10 @@
         "bbox_tube_temporal.model",
         "BboxTubeTemporalModel",
     ),
+    "tube-multiscale-fusion": (
+        "tube_multiscale_fusion.model",
+        "TubeMultiscaleFusionModel",
+    ),
 }
 
 

diff --git a/experiments/temporal-models/temporal-model-leaderboard/uv.lock b/experiments/temporal-models/temporal-model-leaderboard/uv.lock
diff --git a/experiments/temporal-models/tube-multiscale-fusion/.dvc/.gitignore b/experiments/temporal-models/tube-multiscale-fusion/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/experiments/temporal-models/tube-multiscale-fusion/.dvc/config b/experiments/temporal-models/tube-multiscale-fusion/.dvc/config
@@ -0,0 +1,5 @@
+[core]
+    remote = s3remote
+    analytics = false
+['remote "s3remote"']
+    url = s3://pyro-vision-rd/dvc/experiments/tube-multiscale-fusion/
diff --git a/experiments/temporal-models/tube-multiscale-fusion/.dvcignore b/experiments/temporal-models/tube-multiscale-fusion/.dvcignore
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore
diff --git a/experiments/temporal-models/tube-multiscale-fusion/.gitattributes b/experiments/temporal-models/tube-multiscale-fusion/.gitattributes
@@ -0,0 +1,2 @@
+*.ipynb filter=nbstripout
+*.ipynb diff=ipynb
diff --git a/experiments/temporal-models/tube-multiscale-fusion/.gitignore b/experiments/temporal-models/tube-multiscale-fusion/.gitignore
@@ -0,0 +1,8 @@
+__pycache__/
+*.pyc
+.venv/
+.ipynb_checkpoints/
+.pytest_cache/
+.ruff_cache/
+logs/
+datasets_full/
diff --git a/experiments/temporal-models/tube-multiscale-fusion/.python-version b/experiments/temporal-models/tube-multiscale-fusion/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/experiments/temporal-models/tube-multiscale-fusion/ABLATIONS.md b/experiments/temporal-models/tube-multiscale-fusion/ABLATIONS.md
@@ -0,0 +1,119 @@
+# Ablation study — tube-multiscale-fusion
+
+This study decomposes the full two-branch model by removing or simplifying one
+component at a time, to attribute the model's performance to its parts. All
+runs use the **same data, seed (42), augmentation, training schedule, and tube
+geometry** (2×2 grid of 112×112 cells × 4-frame windows at stride 2 → 28 tubes
+per sequence) as the default model; only the named component changes. Metrics
+are on the **val split (280 tubes: 135 smoke / 145 fp)**, single seed.
+
+## Variants
+
+| # | Variant | What changed vs. full model |
+|---|---------|------------------------------|
+| 0 | **Full** | Global DINOv2 sequence branch + local tube branch + cross-attention fusion (self-attn over tubes, global=query). |
+| 1 | **− temporal** (`no_temporal`) | **Global branch removed.** Local branch + fusion kept; the fusion's global query is replaced by a learnable token. |
+| 2 | **− spatial** (`no_spatial`) | **Local branch removed.** Global branch only → MLP head; no fusion. |
+| 3 | **fusion → weighted-mean** (`weighted_mean`) | Both branches kept; **cross-attention fusion replaced** by a learned weighted mean over tubes + a learned gate between the two branches (no attention). |
+
+Each variant is a clean, single-component change: the kept modules are
+byte-for-byte identical to the full model.
+
+## Results (val, 280 tubes)
+
+### Classification quality
+
+| Variant | F1 | Accuracy | Precision | Recall | PR-AUC | FP | FN |
+|---------|---:|---:|---:|---:|---:|---:|---:|
+| **Full** | **0.9783** | 0.9786 | 0.9574 | **1.0000** | 0.9936 | 6 | 0 |
+| fusion → weighted-mean | 0.9779 | 0.9786 | **0.9708** | 0.9852 | **0.9952** | **4** | 2 |
+| − spatial (global only) | 0.9708 | 0.9714 | 0.9568 | 0.9852 | 0.9901 | 6 | 2 |
+| − temporal (local only) | 0.8716 | 0.8643 | 0.8012 | 0.9556 | 0.9019 | 32 | 6 |
+
+### Operating point & compute
+
+| Variant | Recall@FPR=1% | @5% | @10% | Params | Trainable | GFLOPs/seq | Latency (ms/seq)¹ |
+|---------|---:|---:|---:|---:|---:|---:|---:|
+| **Full** | **0.874** | **1.000** | **1.000** | 36.4 M | 16.6 M | 226.5 | 18.9 |
+| fusion → weighted-mean | 0.837 | 0.993 | 1.000 | 29.3 M | 9.5 M | 226.3 | 24.1 |
+| − spatial (global only) | 0.770 | 0.993 | 0.993 | 25.3 M | 5.4 M | 196.1 | 20.6 |
+| − temporal (local only) | 0.193 | 0.563 | 0.748 | 11.3 M | 11.3 M | 30.5 | 6.2 |
+
+¹ Batch-1, single RTX 4090, one consistent measurement pass. Batch-1 latency is
+dominated by kernel-launch overhead and is *not* strictly proportional to FLOPs
+(hence `no_spatial` ≈ `full` despite fewer FLOPs); treat GFLOPs as the reliable
+compute metric.
+
+## Findings
+
+### 1. The temporal (global) module is by far the most important component.
+
+Removing it (`no_temporal`) is **catastrophic**: F1 collapses 0.978 → 0.872,
+precision 0.957 → 0.801 (false positives jump 6 → 32), and recall at a strict
+1 % FPR falls off a cliff, 0.874 → 0.193. Recall holds up best (1.000 → 0.956),
+so the local branch alone still *finds* smoke — it simply cannot *reject*
+slow-moving look-alikes (cloud, fog, haze) without the long-range sequence
+context. The global branch is the workhorse; everything else refines it.
+
+### 2. The spatial (local) module adds a modest but real gain — concentrated at the strict operating point.
+
+Global-only (`no_spatial`) is already strong (F1 0.971), so the local branch
+buys only ~0.7 F1 points overall. But its value shows up where it matters for
+deployment: **recall @ 1 % FPR improves 0.770 → 0.874** and PR-AUC 0.990 →
+0.994 when the local branch is added back. The high-frequency turbulence cue
+from the tubes sharpens the model's confidence precisely in the
+low-false-alarm regime — consistent with the design rationale that local motion
+detail disambiguates the hardest cases.
+
+### 3. The cross-attention fusion is **not** clearly better than a simple weighted mean.
+
+Replacing the attention-based fusion with a learned weighted mean
+(`weighted_mean`) **matches the full model**: F1 0.9779 vs 0.9783 (six errors
+each, within noise), with *higher* precision (0.971 vs 0.957, 4 FP vs 6), lower
+recall (0.985 vs 1.000, 2 FN vs 0), and *higher* PR-AUC (0.9952 vs 0.9936). It
+trails the full model mainly at the strictest 1 % FPR point (0.837 vs 0.874).
+
+The takeaway: on this dataset the gain comes from **having both branches**, not
+from the attention mechanism in the fusion step. Since the fusion is a
+negligible fraction of compute (both variants are ~226 GFLOPs — the cost is
+the 16 DINOv2 passes), the cross-attention buys a small edge at the extreme
+operating point for ~7 M extra parameters. The weighted-mean fusion is a strong
+**simplification candidate** if that extreme-FPR margin is not required.
+
+### Summary
+
+```
+contribution to val F1 (full = 0.978):
+  temporal/global module : +0.106   (0.872 → 0.978)   ← dominant
+  spatial/local module   : +0.007   (0.971 → 0.978)   ← modest, helps low-FPR
+  cross-attn over w-mean : +0.000   (0.978 → 0.978)   ← negligible on this val set
+```
+
+The architecture's accuracy is driven overwhelmingly by the **global temporal
+context**, with the **local tube branch** adding a small but deployment-relevant
+gain at low false-alarm rates. The **attention-based fusion** is the most
+expendable piece: a learned weighted mean is statistically indistinguishable
+here and even slightly more precise.
+
+## Caveats
+
+- **Single seed, small val set (280 tubes).** Differences under ~0.005 F1
+  (e.g. full vs. weighted-mean) are within run-to-run noise; the temporal and
+  spatial effects are large enough to be robust, but the fusion comparison
+  should be confirmed with multiple seeds before acting on it.
+- All variants share the default 2×2 tube geometry; interactions between the
+  fusion choice and finer/denser tube geometries were not explored.
+
+## Reproduce
+
+```bash
+# Full model
+uv run dvc repro train_dinov2_multiscale evaluate_dinov2_multiscale
+# Ablations
+uv run dvc repro train_ablation_no_temporal      evaluate_ablation_no_temporal
+uv run dvc repro train_ablation_no_spatial       evaluate_ablation_no_spatial
+uv run dvc repro train_ablation_weighted_mean    evaluate_ablation_weighted_mean
+```
+
+Per-variant metrics, confusion matrices, and PR/ROC curves are written to
+`data/08_reporting/<variant>/{train,val}/`.
diff --git a/experiments/temporal-models/tube-multiscale-fusion/COMBINED_VARIATIONS.md b/experiments/temporal-models/tube-multiscale-fusion/COMBINED_VARIATIONS.md
@@ -0,0 +1,106 @@
+# Combined spatial × temporal variations — tube-multiscale-fusion
+
+The [spatial](SPATIAL_COMPARISON.md) and [temporal](TEMPORAL_COMPARISON.md)
+comparisons each varied one module *in isolation* (local-only / global-only).
+This study puts both modules back together and sweeps the **cartesian product**
+of the strongest candidates **inside the full two-branch model** (global DINOv2
+branch + local tube branch + cross-attention fusion + head):
+
+- **spatial** (local per-tube encoder): `tubelet transformer`, `ViViT`, `3D ResNet`
+- **temporal** (global aggregator): `transformer`, `LSTM`
+
+= 6 full models. Same data, seed (42), schedule, augmentation, and 2×2 tube
+geometry; only the two module kinds change. `tubelet × transformer` is the
+production default (= `dinov2_multiscale`). Metrics on the **val split
+(280 tubes)**, single seed.
+
+## Results (val, 280 tubes)
+
+| Spatial | Temporal | F1 | Acc | Prec | Recall | PR-AUC | FP | FN | Params | GFLOPs | ms/seq¹ | R@FPR=1% | @5% |
+|---------|----------|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| **tubelet** | **transformer** (default) | **0.9783** | 0.9786 | 0.9574 | **1.0000** | 0.9936 | 6 | **0** | 36.4 M | 226.5 | 10.8 | 0.874 | **1.000** |
+| **resnet3d** | **LSTM** | 0.9779 | 0.9786 | **0.9708** | 0.9852 | **0.9952** | **4** | 2 | 63.4 M | 800.1 | 23.1 | 0.911 | 0.993 |
+| vivit | LSTM | 0.9708 | 0.9714 | 0.9568 | 0.9852 | 0.9911 | 6 | 2 | 37.6 M | 250.9 | 12.8 | **0.933** | 0.993 |
+| resnet3d | transformer | 0.9675 | 0.9679 | 0.9437 | 0.9926 | 0.9946 | 8 | 1 | 65.7 M | 800.2 | 22.9 | 0.867 | 0.993 |
+| vivit | transformer | 0.9675 | 0.9679 | 0.9437 | 0.9926 | 0.9931 | 8 | 1 | 40.0 M | 251.0 | 12.6 | 0.793 | 0.993 |
+| tubelet | LSTM | 0.9673 | 0.9679 | 0.9500 | 0.9852 | 0.9950 | 7 | 2 | 34.1 M | 226.4 | 10.9 | 0.874 | 0.985 |
+
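+(sorted by F1) ¹ Batch-1, single RTX 4090. Compare latencies within this table only: [ABLATIONS.md](ABLATIONS.md) reports 18.9 ms/seq for the same default model.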
+
+## Findings
+
+### 1. In the full model, the spatial × temporal choice barely moves the needle.
+
+All six combinations land within **F1 0.967–0.978 (~1.1 points)** — essentially
+a tie on this val set. This is the key result and it contrasts sharply with the
+isolated [spatial study](SPATIAL_COMPARISON.md), where the encoder choice spanned
+9 F1 points (0.85–0.94). The explanation is the [ablation](ABLATIONS.md) finding:
+once the strong **global DINOv2 branch** is present, it carries the prediction
+and the local branch contributes only ~0.7 F1. So swapping the local encoder
+(tubelet → ViViT → 3D ResNet) or the aggregator (transformer ↔ LSTM) only
+reshuffles that small residual — within noise. **The architecture is robust to
+these choices; none of the expensive upgrades is worth it in the full model.**
+
+### 2. The cheapest model is also (statistically) the best.
+
+`tubelet × transformer` — the current default and tied for the **least
+expensive** (226 GFLOPs, 10.8 ms, 36 M params) — has the top F1 (0.978), perfect
+recall (0 missed smokes), and R@5%FPR = 1.000. Nothing beats it by more than
+noise, and only `tubelet × LSTM` matches its cost (226 GFLOPs, 10.9 ms, 34 M).
+
+### 3. `resnet3d × LSTM` is the only combo that rivals it — at 3.5× the compute.
+
+It edges the default on **precision (0.971 vs 0.957, 4 FP vs 6)** and **PR-AUC
+(0.9952)**, i.e. slightly cleaner false-alarm behaviour. But it costs **800
+GFLOPs (3.5×) and 23 ms (2.1×)** for a difference well inside single-seed noise.
+Not a worthwhile trade unless false positives are extremely costly *and* the
+compute budget is large.
+
+### 4. Temporal aggregator: LSTM ≈ transformer; spatial encoder: pretraining helps precision, not F1.
+
+- LSTM vs transformer is a wash across spatial backbones (LSTM wins for
+  resnet3d/vivit, transformer wins for tubelet — all within noise), and the
+  aggregator is compute-negligible (LSTM is marginally smaller).
+- The pretrained `3D ResNet` spatial encoder shows up only as a small
+  precision/PR-AUC edge (best with LSTM), **not** as an F1 win — consistent with
+  the local branch being a minor contributor here. Its standalone dominance in
+  the [spatial study](SPATIAL_COMPARISON.md) does **not** transfer to the full
+  model, because the global branch already supplies most of the signal.
+
+### Recommendation
+
+**Keep the current default, `tubelet × transformer`.** It is tied for cheapest
+and scores best on this val set (top F1, perfect recall). If a deployment
+is acutely false-alarm-sensitive and compute is not a constraint,
+`resnet3d × LSTM` offers marginally higher precision/PR-AUC at 3.5× the FLOPs —
+otherwise it is not justified. `vivit × LSTM` is a middle option (best
+recall@1%FPR, +11% FLOPs) but no better on F1.
+
+The broader lesson across all four studies: **the global temporal branch is the
+workhorse; the local spatial branch and the specific module architectures are
+second-order.** Spend complexity budget on the global branch, not on heavier
+local encoders.
+
+## Caveats
+
+- **Single seed, small val set (280).** Every F1 gap here is < 0.012 — within
+  run-to-run noise. The compute/precision differences are robust; the F1
+  ranking is not. A multi-seed sweep would be needed to call a winner beyond the
+  default.
+- `tubelet × transformer` is reused from the existing `dinov2_multiscale` run
+  (identical config). The two `resnet3d` combos trained at batch size 8 (others
+  16) for memory; this affects optimisation noise only, not the reported
+  FLOPs/latency.
+
+## Reproduce
+
+```bash
+# Default cell (tubelet x transformer):
+uv run dvc repro train_dinov2_multiscale evaluate_dinov2_multiscale
+# The other 5 combos:
+uv run dvc repro train_full_combo evaluate_full_combo
+```
+
+Per-combo metrics, confusion matrices, and PR/ROC curves are in
+`data/08_reporting/full_<spatial>_<temporal>/{train,val}/` (and
+`data/08_reporting/dinov2_multiscale/` for the default cell).
diff --git a/experiments/temporal-models/tube-multiscale-fusion/Makefile b/experiments/temporal-models/tube-multiscale-fusion/Makefile
@@ -0,0 +1,20 @@
+.PHONY: install lint format test help notebook
+
+help: ## Show this help
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m %s\n", $$1, $$2}'
+
+install: ## Install dependencies and set up notebook output stripping
+	uv sync
+	uv run nbstripout --install
+
+lint: ## Run ruff linter on code and notebooks
+	uv run ruff check .
+
+format: ## Format code and notebooks with ruff
+	uv run ruff format .
+
+test: ## Run tests with pytest
+	uv run pytest tests/ -v
+
+notebook: ## Launch JupyterLab for notebooks
+	uv run jupyter lab notebooks/