eth-easl · elvingerpaul · Nov 19, 2025 · Nov 17, 2025 · Nov 19, 2025 · Nov 19, 2025
diff --git a/inter_sm/gcontext_test.py b/inter_sm/gcontext_test.py
diff --git a/inter_sm/l2_cache/l2_config.json b/inter_sm/l2_cache/l2_config.json
@@ -1,5 +1,5 @@
 {
-  "kernel_name": "CopyKernel",
+  "kernel_name": "L2Kernel",
   "init_args": {
     "shared_lib": "/home/elpaul/vllm_profile/vllm/interference/build/libinterference_kernels.so",
     "use_gcontexts": true,

diff --git a/inter_sm/mem_bw/README.md b/inter_sm/mem_bw/README.md
@@ -1,13 +1,25 @@
-# Measuring/Plotting TPOT latency with full model
+# Memory Bandwidth Interference
 
-1. Compile for GPU architecture you're running on if not already done
+This subfolder contains supporting code and scripts for measuring the sensitivity of memory bandwidth interference. The relevant section in the paper is **4.3**. We use a `CopyKernel` to put pressure on the memory bandwidth. By using vectorized instructions and increasing the number of thread blocks (thereby increasing the number of threads) of the interference kernel, we increase the amount of data copied per unit of time, putting higher pressure on the bandwidth. The paper experiment was executed on an NVIDIA H100 GPU.
 
-2. Gather necessary NCU metrics from the interference (copy) kernel
+## Prerequisites
+1. Don't forget to compile the interference kernels for your GPU architecture into the `libinterference_kernels.so` library.
+2. In your `mem_bw_config.json` file, ensure that
+    - `shared_lib` points to your `libinterference_kernels.so` library.
+    - `num_tb` and `num_threads_per_block` are set with respect to the GPU that you are running on.
+    - `inter_sm` and `use_gcontexts` are both set to `true`, since we are running inter-SM experiments and want to use CUDA Green Contexts.
+
+## Measuring TBT latency of the full model in the decode phase
 ```bash
-bash inter_sm/mem_bw/ncu_profile_mem_interf_kernel.sh
+bash bench_full_decode.sh
 ```
 
-3. Measure the TPOT latency for the full model
+The `bench_full_decode.sh` script measures the Time Between Tokens (TBT) latency of an LLM. In addition, it generates an `nsys` trace so you can verify that token generation and the interference kernel overlap. You may adjust the values of the different variables as needed. Pay attention to:
+- **Changing the LLM model**: Ensure that you profile the entire model with all its hidden layers by updating the `NUM_HIDDEN_LAYERS` variable accordingly. This variable primarily serves as a safeguard to prevent accidentally running the model with fewer hidden layers due to a previously cached version.
+- **Verifying the `nsys` trace**: Check that the interference kernel runs for the full duration of token generation. If it does not, update the number of iterations in `mem_bw_config.json`.
+
+## Collect the NCU profile of the interference kernel
 ```bash
-bash inter_sm/mem_bw/nsys_benchmark_mem_full_model.sh
-```
+bash ncu_profile_mem_interf_kernel.sh
+```
+Run the above script to collect the NCU metrics for the interference kernel when running on a restricted number of SMs using CUDA Green Contexts.
diff --git a/...m/mem_bw/nsys_benchmark_mem_full_model.sh → inter_sm/mem_bw/bench_full_decode.sh b/...m/mem_bw/nsys_benchmark_mem_full_model.sh → inter_sm/mem_bw/bench_full_decode.sh
@@ -2,7 +2,7 @@
 
 # Model to benchmark, update the model config file path accordingly
 MODEL=llama
-MODEL_CONFIG_FILE=~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
+MODEL_CONFIG_FILE=/home/elpaul/vllm_profile/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/config.json
 
 # configure number of hidden layers to profile by changing the model's config.json file
 NUM_HIDDEN_LAYERS=32
@@ -13,7 +13,7 @@ mv tmp.json $MODEL_CONFIG_FILE
 INTERFERENCE_CONFIG="mem_bw_config.json"
 PROFILE_TYPE=decode
 NUM_TOKENS=10
-NUM_REQUESTS=5
+NUM_WARMUP=5
 
 
 for BATCH_SIZE in 8; do
@@ -36,19 +36,20 @@ for BATCH_SIZE in 8; do
                 --cuda-graph-trace node \
                 --force-overwrite true \
                 -o "$OUTPUT_FILE".nsys-rep \
-            python3 inter_sm/gcontext_test.py \
+            python3 ../../main.py \
                 --model $MODEL \
+                --num_warmup $NUM_WARMUP \
                 --profile $PROFILE_TYPE \
                 --prompt_size $PROMPT_SIZE \
                 --num_tokens $NUM_TOKENS \
                 --batch_size $BATCH_SIZE \
-                --num_requests $NUM_REQUESTS \
                 --interference_config_json $INTERFERENCE_CONFIG \
-                --inter_tb $TB \
+                --num_tb $TB \
+                --num_threads_per_block 1024 \
                 --inter_decode > "$OUTPUT_FILE".txt
         done
 
-        python plot_mem_tpot_latency.py --output_dir $OUTPUT_FOLDER --fig_path "$OUTPUT_FOLDER"/tpot.png --num_tokens $NUM_TOKENS
+        # python plot_mem_tpot_latency.py --output_dir $OUTPUT_FOLDER --fig_path "$OUTPUT_FOLDER"/tpot.png --num_tokens $NUM_TOKENS
     done
 
 done
diff --git a/inter_sm/mem_bw/mem_bw_config.json b/inter_sm/mem_bw/mem_bw_config.json
@@ -12,7 +12,8 @@
     "num_itrs": 100,
     "set_percentage": false
   },
-  "num_requests": 3,
   "inter_sm": true,
-  "intra_sm": false
+  "intra_sm": false,
+  "num_warmup": 5,
+  "num_requests": 6
 }
diff --git a/inter_sm/mem_bw/ncu_profile_mem_interf_kernel.sh b/inter_sm/mem_bw/ncu_profile_mem_interf_kernel.sh
@@ -2,7 +2,7 @@
 
 INTER_FILE=mem_bw_config.json
 
-OUTPUT_FOLDER="results/inter_ncu_profiles_l2_poll"
+OUTPUT_FOLDER="results/inter_ncu_profiles_mem_bw"
 mkdir -p $OUTPUT_FOLDER
 
 # adjust the number of thread blocks based on GPU that you are running on
@@ -15,7 +15,7 @@ for TB in 34 68 102 136; do
         python3 run_inter_kernel.py \
         --interference_config_json $INTER_FILE \
         --inter_tb $TB \
-        --inter_num_floats 4194304 # TODO check if this is enough
+        --inter_num_floats 1073741824
 
     ncu --import "$NCU_FILE".ncu-rep --csv > "$NCU_FILE".csv
 done
diff --git a/inter_sm/mem_bw/plot_mem_tpot_latency.py b/inter_sm/mem_bw/plot_mem_tpot_latency.py
diff --git a/inter_sm/mem_bw/run_inter_kernel.py b/inter_sm/mem_bw/run_inter_kernel.py
@@ -10,13 +10,10 @@
 import numpy as np
 
 import torch
-import threading
 from vllm.interference.inter_funcs import kernel_name_dict, GlobalData
 
 
 def run_inter(inter_config_dict, args, data, idx):
-    print(f"Start interference kernel, idx {idx}")
-
     init_args=inter_config_dict["init_args"]
     run_args=inter_config_dict["run_args"]
 
@@ -28,20 +25,11 @@ def run_inter(inter_config_dict, args, data, idx):
     run_args["num_tb"] = args.inter_tb
     run_args["num_floats"] = args.inter_num_floats
 
-
     interference_class=kernel_name_dict[inter_config_dict["kernel_name"]]
     inter_kernel = interference_class(**init_args)
     inter_kernel.run(**run_args)
     torch.cuda.synchronize()
 
-
-    # print(f"Start kernel, run_args is {run_args}")
-    # for _ in range(10):
-    #     start = time.time()
-    #     inter_kernel.run(**run_args)
-    #     torch.cuda.synchronize()
-    #     print(f"total time is {(time.time()-start)*1000} ms")
-    # torch.cuda.synchronize()
     print(f"Exiting ....")
 
 if __name__ == "__main__":