huggingface · EricLBuehler · Jun 10, 2026 · Jun 3, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/candle-core/benches/benchmarks/broadcast.rs b/candle-core/benches/benchmarks/broadcast.rs
@@ -18,7 +18,9 @@ fn run_bias_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &s
     let x = Tensor::ones((batch_size, ch, m, m), dtype, device).unwrap();
     let bias = Tensor::ones((1, bias_size, 1, 1), dtype, device).unwrap();
 
-    let flops = batch_size * ch * m * bias_size * dtype.size_in_bytes();
+    let output_size = batch_size * bias_size * m * m;
+
+    let flops = output_size * dtype.size_in_bytes();
 
     let mut group = c.benchmark_group(device.bench_name(name));
     group.throughput(Throughput::Bytes(flops as u64));
@@ -56,9 +58,36 @@ fn run_scalar_broadcast_benchmark(c: &mut Criterion, device: &Device, dtype: DTy
     group.finish();
 }
 
+/// Benchmarks the file's `run` helper (defined outside this hunk) on two tensors of ones
+/// that share the shape `(128, 126, 126)`.
+///
+/// The group is named `device.bench_name(name)` and reports throughput in bytes. The
+/// factor of 3 presumably counts two operand reads plus one result write per element.
+fn run_contiguous_add_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
+    let channels = 128;
+    let side = 126;
+
+    let lhs = Tensor::ones((channels, side, side), dtype, device).unwrap();
+    let rhs = Tensor::ones((channels, side, side), dtype, device).unwrap();
+
+    let bytes_moved = 3 * channels * side * side * dtype.size_in_bytes();
+
+    let mut group = c.benchmark_group(device.bench_name(name));
+    group.throughput(Throughput::Bytes(bytes_moved as u64));
+    group.bench_function("iter", move |bencher| {
+        bencher.iter_custom(|iters| {
+            let started = Instant::now();
+            (0..iters).for_each(|_| {
+                run(black_box(&lhs), black_box(&rhs));
+            });
+            // Synchronize with the device before the elapsed time is read.
+            device.sync().unwrap();
+            started.elapsed()
+        })
+    });
+    group.finish();
+}
+
 fn criterion_benchmark(c: &mut Criterion) {
     let handler = BenchDeviceHandler::new().unwrap();
     for device in handler.devices {
+        run_contiguous_add_benchmark(c, &device, DType::F32, "broadcast_add_contiguous_f32");
+        run_contiguous_add_benchmark(c, &device, DType::F16, "broadcast_add_contiguous_f16");
+        run_contiguous_add_benchmark(c, &device, DType::BF16, "broadcast_add_contiguous_bf16");
         run_bias_benchmark(c, &device, DType::F32, "broadcast_add_f32");
         run_bias_benchmark(c, &device, DType::F16, "broadcast_add_f16");
         run_bias_benchmark(c, &device, DType::BF16, "broadcast_add_bf16");

diff --git a/candle-core/src/cpu/kernels.rs b/candle-core/src/cpu/kernels.rs
@@ -2,6 +2,21 @@ pub trait VecOps: num_traits::NumAssign + Copy {
     fn min(self, rhs: Self) -> Self;
     fn max(self, rhs: Self) -> Self;
 
+    /// Writes `lhs[i] + rhs[i]` into `res[i]` for every index present in all three slices.
+    ///
+    /// The zipped iteration ends at the shortest slice; `res` elements beyond that point
+    /// are left untouched.
+    #[inline(always)]
+    fn vec_add(lhs: &[Self], rhs: &[Self], res: &mut [Self]) {
+        for (out, (&a, &b)) in res.iter_mut().zip(lhs.iter().zip(rhs)) {
+            *out = a + b;
+        }
+    }
+
+    /// Adds `scalar` to each element of `xs` and stores the sums in `ys`:
+    /// `ys[i] = xs[i] + scalar`.
+    ///
+    /// The zipped iteration ends at the shorter of the two slices.
+    #[inline(always)]
+    fn scalar_add(scalar: Self, xs: &[Self], ys: &mut [Self]) {
+        for (y, &x) in ys.iter_mut().zip(xs) {
+            *y = x + scalar;
+        }
+    }
+
     /// Dot-product of two vectors.
     ///
     /// # Safety
@@ -70,6 +85,18 @@ impl VecOps for f32 {
         Self::max(self, other)
     }
 
+    /// `f32` element-wise addition of `lhs` and `rhs` into `res`.
+    ///
+    /// Exactly one of three bodies is compiled in: `crate::mkl::vs_add` when the `mkl`
+    /// feature is enabled, `crate::accelerate::vs_add` when only `accelerate` is enabled,
+    /// and a plain zipped loop when neither feature is enabled.
+    fn vec_add(lhs: &[Self], rhs: &[Self], res: &mut [Self]) {
+        #[cfg(feature = "mkl")]
+        crate::mkl::vs_add(lhs, rhs, res);
+        #[cfg(all(not(feature = "mkl"), feature = "accelerate"))]
+        crate::accelerate::vs_add(lhs, rhs, res);
+        #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
+        res.iter_mut()
+            .zip(lhs.iter().zip(rhs))
+            .for_each(|(out, (&a, &b))| *out = a + b)
+    }
+
     #[inline(always)]
     unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, len: usize) {
         super::vec_dot_f32(lhs, rhs, res, len)
@@ -92,6 +119,15 @@ impl VecOps for half::f16 {
         Self::max(self, other)
     }
 
+    /// `f16` element-wise addition (`res[i] = lhs[i] + rhs[i]`), delegating to
+    /// `super::vec_add_f16`.
+    ///
+    /// Only the first `min(lhs.len(), rhs.len(), res.len())` elements are processed. This is
+    /// the same truncation the trait's default zipped implementation applies, and it keeps
+    /// this safe function from reading or writing past the end of a shorter slice.
+    #[inline(always)]
+    fn vec_add(lhs: &[Self], rhs: &[Self], res: &mut [Self]) {
+        let len = lhs.len().min(rhs.len()).min(res.len());
+        // SAFETY: `len` does not exceed the length of any of the three slices, so every
+        // element the kernel touches through these pointers is in bounds. `res` is a
+        // unique borrow, so it cannot overlap `lhs` or `rhs`.
+        unsafe { super::vec_add_f16(lhs.as_ptr(), rhs.as_ptr(), res.as_mut_ptr(), len) }
+    }
+
+    /// `f16` scalar broadcast addition (`ys[i] = xs[i] + scalar`), delegating to
+    /// `super::vec_scalar_add_f16`.
+    ///
+    /// Only the first `min(xs.len(), ys.len())` elements are processed, matching the
+    /// truncation of the trait's default zipped implementation and preventing an
+    /// out-of-bounds write when `ys` is shorter than `xs`.
+    #[inline(always)]
+    fn scalar_add(scalar: Self, xs: &[Self], ys: &mut [Self]) {
+        let len = xs.len().min(ys.len());
+        // SAFETY: `len` does not exceed the length of either slice, so every element the
+        // kernel touches through these pointers is in bounds. `ys` is a unique borrow,
+        // so it cannot overlap `xs`.
+        unsafe { super::vec_scalar_add_f16(scalar, xs.as_ptr(), ys.as_mut_ptr(), len) }
+    }
+
     #[inline(always)]
     unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, len: usize) {
         let mut res_f32 = 0f32;
@@ -110,6 +146,19 @@ impl VecOps for f64 {
     fn max(self, other: Self) -> Self {
         Self::max(self, other)
     }
+
+    /// `f64` element-wise addition of `lhs` and `rhs` into `res`.
+    ///
+    /// Exactly one of three bodies is compiled in: `crate::mkl::vd_add` when the `mkl`
+    /// feature is enabled, `crate::accelerate::vd_add` when only `accelerate` is enabled,
+    /// and a plain zipped loop when neither feature is enabled.
+    #[inline(always)]
+    fn vec_add(lhs: &[f64], rhs: &[f64], res: &mut [f64]) {
+        #[cfg(feature = "mkl")]
+        crate::mkl::vd_add(lhs, rhs, res);
+        #[cfg(all(not(feature = "mkl"), feature = "accelerate"))]
+        crate::accelerate::vd_add(lhs, rhs, res);
+        #[cfg(all(not(feature = "mkl"), not(feature = "accelerate")))]
+        res.iter_mut()
+            .zip(lhs.iter().zip(rhs))
+            .for_each(|(out, (&a, &b))| *out = a + b)
+    }
 }
 impl VecOps for half::bf16 {
     #[inline(always)]
@@ -122,6 +171,16 @@ impl VecOps for half::bf16 {
         Self::max(self, other)
     }
 
+    /// `bf16` element-wise addition (`res[i] = lhs[i] + rhs[i]`), delegating to
+    /// `super::vec_add_bf16`.
+    ///
+    /// Only the first `min(lhs.len(), rhs.len(), res.len())` elements are processed. This is
+    /// the same truncation the trait's default zipped implementation applies, and it keeps
+    /// this safe function from reading or writing past the end of a shorter slice.
+    #[inline(always)]
+    fn vec_add(lhs: &[Self], rhs: &[Self], res: &mut [Self]) {
+        let len = lhs.len().min(rhs.len()).min(res.len());
+        // SAFETY: `len` does not exceed the length of any of the three slices, so every
+        // element the kernel touches through these pointers is in bounds. `res` is a
+        // unique borrow, so it cannot overlap `lhs` or `rhs`.
+        unsafe { super::vec_add_bf16(lhs.as_ptr(), rhs.as_ptr(), res.as_mut_ptr(), len) }
+    }
+
+    /// `bf16` scalar broadcast addition (`ys[i] = xs[i] + scalar`), delegating to
+    /// `super::vec_scalar_add_bf16`.
+    ///
+    /// Only the first `min(xs.len(), ys.len())` elements are processed, matching the
+    /// truncation of the trait's default zipped implementation and preventing an
+    /// out-of-bounds write when `ys` is shorter than `xs`.
+    #[inline(always)]
+    fn scalar_add(scalar: Self, xs: &[Self], ys: &mut [Self]) {
+        let len = xs.len().min(ys.len());
+        // SAFETY: `len` does not exceed the length of either slice, so every element the
+        // kernel touches through these pointers is in bounds. `ys` is a unique borrow,
+        // so it cannot overlap `xs`.
+        unsafe { super::vec_scalar_add_bf16(scalar, xs.as_ptr(), ys.as_mut_ptr(), len) }
+    }
+
     #[inline(always)]
     unsafe fn vec_dot(lhs: *const Self, rhs: *const Self, res: *mut Self, len: usize) {
         let mut res_f32 = 0f32;

diff --git a/candle-core/src/cpu/mod.rs b/candle-core/src/cpu/mod.rs
@@ -248,3 +248,152 @@ pub(crate) unsafe fn vec_dot_bf16(a_row: *const bf16, b_row: *const bf16, c: *mu
     }
     *c = sum;
 }
+
+/// Element-wise `f16` addition over raw pointers: `c[n] = a_row[n] + b_row[n]` for `n < k`.
+///
+/// Compiled when one of the `neon`, `avx2` or `simd128` target features is enabled.
+/// Whole groups of `CurrentCpuF16::STEP` elements go through `CurrentCpuF16` vector
+/// load/add/store calls; the remaining elements are added one at a time.
+///
+/// # Safety
+///
+/// `a_row` and `b_row` must be valid for reads of `k` elements and `c` must be valid for
+/// writes of `k` elements.
+// NOTE(review): the vector loop stays within `k` only if STEP == ARR * EPR — confirm against
+// the `CurrentCpuF16` definition.
+#[cfg(any(
+    target_feature = "neon",
+    target_feature = "avx2",
+    target_feature = "simd128"
+))]
+pub(crate) unsafe fn vec_add_f16(a_row: *const f16, b_row: *const f16, c: *mut f16, k: usize) {
+    let mut base = 0;
+    while base + CurrentCpuF16::STEP <= k {
+        for lane in 0..CurrentCpuF16::ARR {
+            let offset = base + lane * CurrentCpuF16::EPR;
+            let lhs = CurrentCpuF16::load(a_row.add(offset));
+            let rhs = CurrentCpuF16::load(b_row.add(offset));
+            CurrentCpuF16::vec_store(c.add(offset), CurrentCpuF16::vec_add(lhs, rhs));
+        }
+        base += CurrentCpuF16::STEP;
+    }
+
+    // Scalar tail: elements that did not fill a whole step.
+    for idx in base..k {
+        *c.add(idx) = *a_row.add(idx) + *b_row.add(idx);
+    }
+}
+
+/// Scalar `f16` addition over raw pointers: `c[n] = a_row[n] + b_row[n]` for `n < k`.
+///
+/// Compiled when none of the `neon`, `avx2` or `simd128` target features is enabled.
+///
+/// # Safety
+///
+/// `a_row` and `b_row` must be valid for reads of `k` elements and `c` must be valid for
+/// writes of `k` elements.
+#[cfg(not(any(
+    target_feature = "neon",
+    target_feature = "avx2",
+    target_feature = "simd128"
+)))]
+#[inline(always)]
+pub(crate) unsafe fn vec_add_f16(a_row: *const f16, b_row: *const f16, c: *mut f16, k: usize) {
+    for idx in 0..k {
+        let sum = *a_row.add(idx) + *b_row.add(idx);
+        *c.add(idx) = sum;
+    }
+}
+
+/// Element-wise `bf16` addition over raw pointers: `c[n] = a_row[n] + b_row[n]` for `n < k`.
+///
+/// Compiled when one of the `neon`, `avx2` or `simd128` target features is enabled.
+/// Whole groups of `CurrentCpuBF16::STEP` elements go through `CurrentCpuBF16` vector
+/// load/add/store calls; the remaining elements are added one at a time.
+///
+/// # Safety
+///
+/// `a_row` and `b_row` must be valid for reads of `k` elements and `c` must be valid for
+/// writes of `k` elements.
+// NOTE(review): the vector loop stays within `k` only if STEP == ARR * EPR — confirm against
+// the `CurrentCpuBF16` definition.
+#[cfg(any(
+    target_feature = "neon",
+    target_feature = "avx2",
+    target_feature = "simd128"
+))]
+pub(crate) unsafe fn vec_add_bf16(a_row: *const bf16, b_row: *const bf16, c: *mut bf16, k: usize) {
+    let mut base = 0;
+    while base + CurrentCpuBF16::STEP <= k {
+        for lane in 0..CurrentCpuBF16::ARR {
+            let offset = base + lane * CurrentCpuBF16::EPR;
+            let lhs = CurrentCpuBF16::load(a_row.add(offset));
+            let rhs = CurrentCpuBF16::load(b_row.add(offset));
+            CurrentCpuBF16::vec_store(c.add(offset), CurrentCpuBF16::vec_add(lhs, rhs));
+        }
+        base += CurrentCpuBF16::STEP;
+    }
+
+    // Scalar tail: elements that did not fill a whole step.
+    for idx in base..k {
+        *c.add(idx) = *a_row.add(idx) + *b_row.add(idx);
+    }
+}
+
+/// Scalar `bf16` addition over raw pointers: `c[n] = a_row[n] + b_row[n]` for `n < k`.
+///
+/// Compiled when none of the `neon`, `avx2` or `simd128` target features is enabled.
+///
+/// # Safety
+///
+/// `a_row` and `b_row` must be valid for reads of `k` elements and `c` must be valid for
+/// writes of `k` elements.
+#[cfg(not(any(
+    target_feature = "neon",
+    target_feature = "avx2",
+    target_feature = "simd128"
+)))]
+#[inline(always)]
+pub(crate) unsafe fn vec_add_bf16(a_row: *const bf16, b_row: *const bf16, c: *mut bf16, k: usize) {
+    for idx in 0..k {
+        let sum = *a_row.add(idx) + *b_row.add(idx);
+        *c.add(idx) = sum;
+    }
+}
+
+/// Adds `scalar` to `k` `f16` elements over raw pointers: `ys[n] = xs[n] + scalar` for `n < k`.
+///
+/// Compiled when one of the `neon`, `avx2` or `simd128` target features is enabled.
+/// `scalar` is converted once via `to_f32` and `CurrentCpuF16::from_f32` (presumably a
+/// vector with every lane set to the scalar — confirm). Whole groups of
+/// `CurrentCpuF16::STEP` elements are then processed with vector calls, and the remaining
+/// elements are added one at a time.
+///
+/// # Safety
+///
+/// `xs` must be valid for reads of `k` elements and `ys` must be valid for writes of `k`
+/// elements.
+// NOTE(review): the vector loop stays within `k` only if STEP == ARR * EPR — confirm against
+// the `CurrentCpuF16` definition.
+#[cfg(any(
+    target_feature = "neon",
+    target_feature = "avx2",
+    target_feature = "simd128"
+))]
+#[inline(always)]
+pub(crate) unsafe fn vec_scalar_add_f16(scalar: f16, xs: *const f16, ys: *mut f16, k: usize) {
+    let splat = CurrentCpuF16::from_f32(scalar.to_f32());
+    let mut base = 0;
+    while base + CurrentCpuF16::STEP <= k {
+        for lane in 0..CurrentCpuF16::ARR {
+            let offset = base + lane * CurrentCpuF16::EPR;
+            let loaded = CurrentCpuF16::load(xs.add(offset));
+            CurrentCpuF16::vec_store(ys.add(offset), CurrentCpuF16::vec_add(loaded, splat));
+        }
+        base += CurrentCpuF16::STEP;
+    }
+    // Scalar tail: elements that did not fill a whole step.
+    for idx in base..k {
+        *ys.add(idx) = *xs.add(idx) + scalar;
+    }
+}
+
+/// Scalar fallback that adds `scalar` to `k` `f16` elements: `ys[n] = xs[n] + scalar` for
+/// `n < k`.
+///
+/// Compiled when none of the `neon`, `avx2` or `simd128` target features is enabled.
+///
+/// # Safety
+///
+/// `xs` must be valid for reads of `k` elements and `ys` must be valid for writes of `k`
+/// elements.
+#[cfg(not(any(
+    target_feature = "neon",
+    target_feature = "avx2",
+    target_feature = "simd128"
+)))]
+#[inline(always)]
+pub(crate) unsafe fn vec_scalar_add_f16(scalar: f16, xs: *const f16, ys: *mut f16, k: usize) {
+    for idx in 0..k {
+        let sum = *xs.add(idx) + scalar;
+        *ys.add(idx) = sum;
+    }
+}
+
+/// Adds `scalar` to `k` `bf16` elements over raw pointers: `ys[n] = xs[n] + scalar` for
+/// `n < k`.
+///
+/// Compiled when one of the `neon`, `avx2` or `simd128` target features is enabled.
+/// `scalar` is converted once via `to_f32` and `CurrentCpuBF16::from_f32` (presumably a
+/// vector with every lane set to the scalar — confirm). Whole groups of
+/// `CurrentCpuBF16::STEP` elements are then processed with vector calls, and the remaining
+/// elements are added one at a time.
+///
+/// # Safety
+///
+/// `xs` must be valid for reads of `k` elements and `ys` must be valid for writes of `k`
+/// elements.
+// NOTE(review): the vector loop stays within `k` only if STEP == ARR * EPR — confirm against
+// the `CurrentCpuBF16` definition.
+#[cfg(any(
+    target_feature = "neon",
+    target_feature = "avx2",
+    target_feature = "simd128"
+))]
+#[inline(always)]
+pub(crate) unsafe fn vec_scalar_add_bf16(scalar: bf16, xs: *const bf16, ys: *mut bf16, k: usize) {
+    let splat = CurrentCpuBF16::from_f32(scalar.to_f32());
+    let mut base = 0;
+    while base + CurrentCpuBF16::STEP <= k {
+        for lane in 0..CurrentCpuBF16::ARR {
+            let offset = base + lane * CurrentCpuBF16::EPR;
+            let loaded = CurrentCpuBF16::load(xs.add(offset));
+            CurrentCpuBF16::vec_store(ys.add(offset), CurrentCpuBF16::vec_add(loaded, splat));
+        }
+        base += CurrentCpuBF16::STEP;
+    }
+    // Scalar tail: elements that did not fill a whole step.
+    for idx in base..k {
+        *ys.add(idx) = *xs.add(idx) + scalar;
+    }
+}
+
+/// Scalar fallback that adds `scalar` to `k` `bf16` elements: `ys[n] = xs[n] + scalar` for
+/// `n < k`.
+///
+/// Compiled when none of the `neon`, `avx2` or `simd128` target features is enabled.
+///
+/// # Safety
+///
+/// `xs` must be valid for reads of `k` elements and `ys` must be valid for writes of `k`
+/// elements.
+#[cfg(not(any(
+    target_feature = "neon",
+    target_feature = "avx2",
+    target_feature = "simd128"
+)))]
+#[inline(always)]
+pub(crate) unsafe fn vec_scalar_add_bf16(scalar: bf16, xs: *const bf16, ys: *mut bf16, k: usize) {
+    for idx in 0..k {
+        let sum = *xs.add(idx) + scalar;
+        *ys.add(idx) = sum;
+    }
+}