Standardize benchmark methodology and enforce single-threaded execution (#112)

ViralBShah · claude · web-flow · commit 54bfbd8669b2 · 2026-03-01T23:41:47.000-05:00
* Add Dependabot for monthly GitHub Actions updates

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Standardize benchmark methodology and enforce single-threaded execution

- Go: replace testing.Benchmark (adaptive iterations, average time) with
  the standard 5-iteration minimum pattern used by all other languages
- Lua: replace adaptive timing loop (ran until 2s elapsed) with fixed
  5 iterations reporting minimum, matching all other languages
- Add GOMAXPROCS=1, JULIA_NUM_THREADS=1, NUMBA_NUM_THREADS=1, and
  MKL_NUM_THREADS=1 to both the Makefile and CI workflow to enforce
  single-threaded execution across all language runtimes
- Update Methodology and Notes docs to accurately describe how benchmarks
  are run: 5 internal iterations per script, 3 Makefile invocations,
  overall minimum via collect.jl, and all environment variables used

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -18,6 +18,15 @@ permissions:
   pull-requests: read
   statuses: write
 
+# Force single-threaded execution for all benchmarks
+env:
+  OMP_NUM_THREADS: 1
+  OPENBLAS_NUM_THREADS: 1
+  MKL_NUM_THREADS: 1
+  GOMAXPROCS: 1
+  JULIA_NUM_THREADS: 1
+  NUMBA_NUM_THREADS: 1
+
 # ---------------------------------------------------------------------------
 # Per-language benchmark jobs
 # ---------------------------------------------------------------------------
diff --git a/Makefile b/Makefile
@@ -31,6 +31,10 @@ default: benchmarks.html
 
 export OMP_NUM_THREADS=1
 export OPENBLAS_NUM_THREADS=1
+export MKL_NUM_THREADS=1
+export GOMAXPROCS=1
+export JULIA_NUM_THREADS=1
+export NUMBA_NUM_THREADS=1
 
 dsfmt:
 	mkdir -p dSFMT
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -27,9 +27,10 @@ Times are normalized relative to C.
 
 ## Methodology
 
-- Each benchmark runs 5 iterations; the minimum time is taken
-- Single-threaded execution (`OMP_NUM_THREADS=1`, `OPENBLAS_NUM_THREADS=1`)
-- Julia results exclude compile time
+- Each language's benchmark script internally runs each benchmark 5 times and records the minimum time
+- The Makefile invokes each script 3 times; `collect.jl` takes the overall minimum across all runs
+- JIT languages (Julia, Numba) include a warmup pass before timing to exclude compilation overhead
+- Single-threaded execution enforced via environment variables (`OMP_NUM_THREADS=1`, `OPENBLAS_NUM_THREADS=1`, `MKL_NUM_THREADS=1`, `GOMAXPROCS=1`, `JULIA_NUM_THREADS=1`, `NUMBA_NUM_THREADS=1`)
 - Runs on GitHub Actions `ubuntu-latest` (x86_64, single core used)
 - Benchmarks test equivalent code patterns, not peak-optimized implementations
 
diff --git a/docs/src/notes.md b/docs/src/notes.md
@@ -17,9 +17,22 @@ benchmarks use the same for-loop.
 
 ## Timing methodology
 
-- Each benchmark is run 5 times; the minimum time is reported
-- Julia discards the first iteration as JIT warmup
-- Both `OMP_NUM_THREADS` and `OPENBLAS_NUM_THREADS` are set to 1 for deterministic single-threaded execution
+All languages follow the same pattern:
+
+- Each benchmark is run 5 times internally; the minimum time is reported
+- The Makefile invokes each language's script 3 times (`ITERATIONS=3`), producing multiple sets of results
+- `bin/collect.jl` takes the overall minimum across all runs, so the final reported time is the best of up to 15 measurements
+- JIT languages (Julia, Numba) include a warmup pass before the 5 timed iterations to exclude compilation overhead
+
+Environment:
+
+- The following environment variables are set to 1 (via the Makefile and the GitHub Actions workflow) for deterministic single-threaded execution:
+  - `OMP_NUM_THREADS=1` — OpenMP threads
+  - `OPENBLAS_NUM_THREADS=1` — OpenBLAS threads
+  - `MKL_NUM_THREADS=1` — Intel MKL threads (if linked)
+  - `GOMAXPROCS=1` — maximum number of OS threads executing Go code simultaneously
+  - `JULIA_NUM_THREADS=1` — Julia threads
+  - `NUMBA_NUM_THREADS=1` — Numba parallel threads
 - Runs on GitHub Actions `ubuntu-latest` runners (x86_64)
 
 ## Matrix benchmarks and BLAS
diff --git a/go/perf.go b/go/perf.go
@@ -17,14 +17,12 @@ package main
 
 import (
 	"bufio"
-	"errors"
 	"fmt"
-	"log"
 	"math"
 	"math/rand"
 	"os"
 	"strconv"
-	"testing"
+	"time"
 
 	"gonum.org/v1/gonum/mat"
 	"gonum.org/v1/gonum/stat"
@@ -215,133 +213,85 @@ func pisum() float64 {
 	return sum
 }
 
-func print_perf(name string, time float64) {
-	fmt.Printf("go,%v,%v\n", name, time*1000)
-}
+const NITER = 5
 
-// run tests
+func print_perf(name string, t float64) {
+	fmt.Printf("go,%v,%v\n", name, t*1000)
+}
 
-func assert(b *testing.B, t bool) {
-	if t != true {
-		b.Fatal("assert failed")
+func timeit(name string, fn func()) {
+	tmin := math.Inf(1)
+	for i := 0; i < NITER; i++ {
+		t := time.Now()
+		fn()
+		elapsed := time.Since(t).Seconds()
+		if elapsed < tmin {
+			tmin = elapsed
+		}
 	}
+	print_perf(name, tmin)
 }
 
+// run benchmarks
+
 func main() {
-	for _, bm := range benchmarks {
-		seconds, err := runBenchmarkFor(bm.fn)
-		if err != nil {
-			log.Fatalf("%s %s", bm.name, err)
+	n := 20
+	sink = &n // prevent constant propagation of fib argument
+	if fib(n) != 6765 {
+		panic("unexpected value for fib(20)")
+	}
+	timeit("recursion_fibonacci", func() {
+		fib(n)
+	})
+
+	timeit("parse_integers", func() {
+		for k := 0; k < 1000; k++ {
+			n := rnd.Uint32()
+			m, _ := strconv.ParseUint(strconv.FormatUint(uint64(n), 16), 16, 32)
+			if uint32(m) != n {
+				panic("incorrect value for m")
+			}
 		}
-		print_perf(bm.name, seconds)
+	})
+
+	if mandelperf() != 14791 {
+		panic("unexpected value for mandelperf")
 	}
-}
+	timeit("userfunc_mandelbrot", func() {
+		mandelperf()
+	})
+
+	timeit("recursion_quicksort", func() {
+		lst := make([]float64, 5000)
+		for k := range lst {
+			lst[k] = rnd.Float64()
+		}
+		qsort_kernel(lst, 0, len(lst)-1)
+	})
 
-func runBenchmarkFor(fn func(*testing.B)) (seconds float64, err error) {
-	bm := testing.Benchmark(fn)
-	if (bm.N == 0) {
-		return 0, errors.New("failed")
+	if math.Abs(pisum()-1.644834071848065) >= 1e-6 {
+		panic("pi_sum out of range")
 	}
-	return bm.T.Seconds() / float64(bm.N), nil
-}
+	timeit("iteration_pi_sum", func() {
+		pisum()
+	})
 
-var benchmarks = []struct {
-	name string
-	fn   func(*testing.B)
-}{
-	{
-		name: "recursion_fibonacci",
-		fn: func(b *testing.B) {
-			n := 20
-			sink = &n // prevent constant propagation of fib argument
-			for i := 0; i < b.N; i++ {
-				if fib(n) != 6765 {
-					b.Fatal("unexpected value for fib(20)")
-				}
-			}
-		},
-	},
-
-	{
-		name: "parse_integers",
-		fn: func(b *testing.B) {
-			for i := 0; i < b.N; i++ {
-				for k := 0; k < 1000; k++ {
-					n := rnd.Uint32()
-					m, _ := strconv.ParseUint(strconv.FormatUint(uint64(n), 16), 16, 32)
-					if uint32(m) != n {
-						b.Fatal("incorrect value for m")
-					}
-				}
-			}
-		},
-	},
-
-	{
-		name: "userfunc_mandelbrot",
-		fn: func(b *testing.B) {
-			for i := 0; i < b.N; i++ {
-				if mandelperf() != 14791 {
-					b.Fatal("unexpected value for mandelperf")
-				}
-			}
-		},
-	},
-
-	{
-		name: "print_to_file",
-		fn: func(b *testing.B) {
-			for i := 0; i < b.N; i++ {
-				printfd(100000)
-			}
-		},
-	},
-
-	{
-		name: "recursion_quicksort",
-		fn: func(b *testing.B) {
-			lst := make([]float64, 5000)
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				for k := range lst {
-					lst[k] = rnd.Float64()
-				}
-				qsort_kernel(lst, 0, len(lst)-1)
-			}
-		},
-	},
-
-	{
-		name: "iteration_pi_sum",
-		fn: func(b *testing.B) {
-			for i := 0; i < b.N; i++ {
-				if math.Abs(pisum()-1.644834071848065) >= 1e-6 {
-					b.Fatal("pi_sum out of range")
-				}
-			}
-		},
-	},
-
-	{
-		name: "matrix_statistics",
-		fn: func(b *testing.B) {
-			for i := 0; i < b.N; i++ {
-				c1, c2 := randmatstat(1000)
-				assert(b, 0.5 < c1)
-				assert(b, c1 < 1.0)
-				assert(b, 0.5 < c2)
-				assert(b, c2 < 1.0)
-			}
-		},
-	},
-
-	{
-		name: "matrix_multiply",
-		fn: func(b *testing.B) {
-			for i := 0; i < b.N; i++ {
-				c := randmatmul(1000)
-				assert(b, c.At(0, 0) >= 0)
-			}
-		},
-	},
+	c1, c2 := randmatstat(1000)
+	if !(0.5 < c1 && c1 < 1.0 && 0.5 < c2 && c2 < 1.0) {
+		panic("randmatstat out of range")
+	}
+	timeit("matrix_statistics", func() {
+		randmatstat(1000)
+	})
+
+	timeit("matrix_multiply", func() {
+		c := randmatmul(1000)
+		if c.At(0, 0) < 0 {
+			panic("unexpected negative value")
+		}
+	})
+
+	timeit("print_to_file", func() {
+		printfd(100000)
+	})
 }
diff --git a/lua/perf.lua b/lua/perf.lua
@@ -39,16 +39,16 @@ local function elapsed(f)
     return t1 - t0, val1, val2
 end
 
+local NITER = 5
+
 local function timeit(f, name, check)
-    local t, k, s = 1/0, 0, now_ms()
-    while true do
-        k = k + 1
+    local t = 1/0
+    for k = 1, NITER do
         local tx, val1, val2 = elapsed(f)
         t = min(t, tx)
         if check then
             check(val1, val2)
         end
-        if k > 5 and (now_ms() - s) >= 2000 then break end
     end
     io.write(format('lua,%s,%g\n', name, t))
 end