Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ readme = "README.md"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[workspace]
members = ["snmalloc-sys"]
members = ["snmalloc-sys", "xtask"]

[dependencies]
snmalloc-sys = { version = "0.3.8", path = "snmalloc-sys", default-features = false }
Expand All @@ -34,3 +34,9 @@ notls = ["snmalloc-sys/notls"]
stats = ["snmalloc-sys/stats"]
usewait-on-address = ["snmalloc-sys/usewait-on-address"]
libc-api = ["snmalloc-sys/libc-api"]
tracing = ["snmalloc-sys/tracing"]
fuzzing = ["snmalloc-sys/fuzzing"]
vendored-stl = ["snmalloc-sys/vendored-stl"]
check-loads = ["snmalloc-sys/check-loads"]
pageid = ["snmalloc-sys/pageid"]
gwp-asan = ["snmalloc-sys/gwp-asan"]
27 changes: 20 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,12 @@ the [`snmalloc` paper](https://github.com/microsoft/snmalloc/blob/master/snmallo
are listed at
[bench_suite](https://github.com/SchrodingerZhu/bench_suite). The following features are defined in this crate:

- `debug`: Enable the `Debug` mode in `snmalloc`.
- ~~`1mib`: Use the `1mib` chunk configuration. From `0.2.17`, this is set as a default feature~~ (removed since 0.3.0)
- ~~`16mib`: Use the `16mib` chunk configuration.~~ (removed since 0.3.0)
- ~~`cache-friendly`: Make the allocator more cache friendly (setting `CACHE_FRIENDLY_OFFSET` to `64` in building the
library).~~ (removed since 0.3.0)
- `debug`: Enable the `Debug` mode in `snmalloc`. This is also automatically enabled if Cargo's `DEBUG` environment variable is set to `true`.
- `native-cpu`: Optimize `snmalloc` for the native CPU of the host machine. (this is not a default behavior
since `0.2.14`)
- `qemu`: Work around the `madvise` problem in QEMU environments.
- ~~`stats`: Enable statistics~~ (removed since 0.3.0)
- `local_dynamic_tls`: Work around the "cannot allocate memory in static TLS block" error.
- `build_cc`: Use the `cc` crate instead of `cmake` as the builder (more platform-agnostic; `cmake` remains the default).
- ~~`usecxx20`: Enable C++20 standard if available~~ (removed since 0.3.0)
- `usecxx17`: Use C++17 standard
- `check`: Enable extra checks to improve security, see upstream [security docs](https://github.com/microsoft/snmalloc/tree/main/docs/security).
Note that the `memcpy` protection is not enabled in Rust.
Expand All @@ -40,6 +34,25 @@ are listed at
- `notls`: Enables the library to be loaded dynamically; TLS is therefore disabled.
- `stats`: Enables allocation statistics.
- `libc-api`: Enables libc API backed by snmalloc.
- `usewait-on-address`: Enable `WaitOnAddress` support on Windows (enabled by default).
- `tracing`: Enable structured tracing/logging.
- `fuzzing`: Enable fuzzing support.
- `vendored-stl`: Use self-vendored STL.
- `check-loads`: Enable check loads feature.
- `pageid`: Enable page ID feature.
- `gwp-asan`: Enable GWP-ASan integration. Requires `SNMALLOC_GWP_ASAN_INCLUDE_PATH` and `SNMALLOC_GWP_ASAN_LIBRARY_PATH`.

## Build Configuration

The build script ensures architectural alignment between the Rust profile and the underlying `snmalloc` allocator:

### Environment Variables
The following environment variables are automatically detected and propagated:
- `DEBUG`: Synchronizes the `snmalloc` build type with the Cargo profile. If `true`, `snmalloc` is built in `Debug` mode.
- `OPT_LEVEL`: Propagated to the C++ compiler to ensure optimization parity between Rust and C++ components.

### Windows CRT Consistency
On Windows, the build script enforces static CRT linking (`/MT` or `/MTd`) across both `cc` and `cmake` builders. This prevents linker errors and ensures consistency when `snmalloc` is used as a global allocator.

**To get the crates compiled, you need to choose either `1mib` or `16mib` to determine the chunk configuration**

Expand Down
76 changes: 76 additions & 0 deletions examples/bench_contention.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
use std::sync::mpsc::channel;
use std::thread;
use std::time::Instant;
use std::alloc::Layout;

// Install snmalloc as the global allocator so that the `std::alloc` calls made
// by this benchmark are served by it.
#[global_allocator]
static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;

/// Size in bytes of every block the workers allocate and pass around.
const BLOCK_SIZE: usize = 64;
/// Number of allocate/send/receive/free rounds each worker performs.
const ITERATIONS: usize = 1_000_000;

/// Raw pointer to a heap block, wrapped so that it can be moved through a channel.
struct Ptr(*mut u8);
// SAFETY: raw pointers are not `Send` by default. In this benchmark each `Ptr`
// is sent through a channel once and afterwards only the receiving thread
// touches it (to free the block), so the pointee is never accessed from two
// threads at the same time.
unsafe impl Send for Ptr {}

/// Runs a cross-thread allocation contention benchmark and prints the result.
///
/// One worker is spawned per available hardware thread (falling back to 4 when
/// the parallelism cannot be queried). The workers form a ring of channels: on
/// every iteration a worker allocates a `BLOCK_SIZE`-byte block, hands it to its
/// successor, and frees the block it receives from its predecessor. Blocks are
/// therefore freed by a different thread than the one that allocated them
/// (unless only a single worker exists).
///
/// # Panics
///
/// Panics if a worker thread panics, or if a channel peer disappears before the
/// benchmark has finished.
fn main() {
    let thread_count = thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(4);
    println!(
        "Running contention benchmark with {} threads, {} iterations per thread",
        thread_count, ITERATIONS
    );

    // `thread_count` workers plus the main thread: the main thread takes part
    // in the barrier so it can start the clock when the workers are released.
    let barrier = std::sync::Arc::new(std::sync::Barrier::new(thread_count + 1));

    // One channel per worker; channel `i` is the inbox of worker `i`.
    let (senders, receivers): (Vec<_>, Vec<_>) =
        (0..thread_count).map(|_| channel::<Ptr>()).unzip();

    let handles: Vec<_> = receivers
        .into_iter()
        .enumerate()
        .map(|(i, rx)| {
            let barrier = std::sync::Arc::clone(&barrier);
            // Worker `i` sends to worker `(i + 1) % N` and receives on its own inbox.
            let tx = senders[(i + 1) % thread_count].clone();

            thread::spawn(move || {
                let layout = Layout::from_size_align(BLOCK_SIZE, 8)
                    .expect("BLOCK_SIZE with 8-byte alignment must form a valid layout");

                barrier.wait(); // Synchronize start

                for _ in 0..ITERATIONS {
                    // 1. Allocate a new block.
                    // SAFETY: `alloc` requires a layout of non-zero size, which
                    // holds as long as `BLOCK_SIZE` is non-zero.
                    let ptr = unsafe { std::alloc::alloc(layout) };
                    if ptr.is_null() {
                        // Never pass a null pointer on: freeing it would be
                        // undefined behaviour.
                        std::alloc::handle_alloc_error(layout);
                    }

                    // 2. Send to the next neighbor (who will free it).
                    tx.send(Ptr(ptr)).unwrap();

                    // 3. Receive from the previous neighbor (who allocated it).
                    let received = rx.recv().unwrap();

                    // 4. Free the received block.
                    // SAFETY: every pointer in the ring was returned non-null by
                    // `alloc` with this same layout, and each one is received
                    // (and hence freed) exactly once.
                    unsafe { std::alloc::dealloc(received.0, layout) };
                }
            })
        })
        .collect();

    // Drop the original senders so that each inbox is kept open only by the
    // worker feeding it. If a worker panics, its successor's `recv` then fails
    // instead of blocking forever.
    drop(senders);

    // Release the workers and start timing.
    barrier.wait();
    let loop_start = Instant::now();

    for h in handles {
        h.join().unwrap();
    }

    let duration = loop_start.elapsed();
    println!("Benchmark completed in {:.2?}", duration);
    println!(
        "Throughput: {:.2} Mops/sec",
        (thread_count * ITERATIONS) as f64 / duration.as_secs_f64() / 1_000_000.0
    );
}
8 changes: 7 additions & 1 deletion snmalloc-sys/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ cc = { version = "1.0", optional = true }
cmake = { version = "0.1", optional = true }

[features]
default = ["build_cmake"]
default = ["build_cmake", "usewait-on-address"]
build_cc = ["cc"]
build_cmake = ["cmake"]
qemu = []
Expand All @@ -33,3 +33,9 @@ notls = []
stats = []
usewait-on-address = []
libc-api = []
tracing = []
fuzzing = []
vendored-stl = []
check-loads = []
pageid = []
gwp-asan = []
Loading
Loading