Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# results
*.txt

# version file generated by setuptools-scm
/vllm/_version.py

Expand Down
44 changes: 0 additions & 44 deletions inter_sm/benchmark_l2.sh

This file was deleted.

60 changes: 0 additions & 60 deletions inter_sm/benchmark_l2_nsys_single.sh

This file was deleted.

45 changes: 0 additions & 45 deletions inter_sm/benchmark_mem.sh

This file was deleted.

53 changes: 0 additions & 53 deletions inter_sm/benchmark_mem_nsys_single.sh

This file was deleted.

50 changes: 0 additions & 50 deletions inter_sm/extract_lats.py

This file was deleted.

85 changes: 85 additions & 0 deletions inter_sm/extract_single_layer_lats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import argparse
import os
import pandas as pd
import sqlite3

# Column labels used for the DataFrame built from an nsys trace.
# (INDEX is not referenced in the visible part of this script.)
INDEX = "index"
KERNEL_NAME = "Kernel Name"
START = "Start (ms)"
END = "End (ms)"
DURATION = "Duration (ms)"

# Name fragments of the interference kernels. Any traced kernel whose name
# contains one of these substrings is dropped before latencies are extracted.
interference_kernels = [
    "mul_fp32_ilp4",
    "fma_fp32_ilp1",
    "fma_fp32_ilp2",
    "fma_fp32_ilp3",
    "fma_fp32_ilp4",
    "interf_copy_kernel",
    "sleep_kernel",
]

# Known kernel counts per traced configuration:
#   (batch_size, prompt_size) -> (num prefill kernels, num decode kernels)
num_kernels = {
    (1, 100): (29, 29),
    (1, 1000): (27, 29),
    (8, 100): (34, 36),
    (8, 1000): (34, 36),
    (16, 100): (42, 44),
    (16, 1000): (42, 44),
}

def read_sqlite_trace(nsys_trace: str) -> pd.DataFrame:
    """Load per-kernel durations from an nsys SQLite export as a one-row frame.

    Kernel events are read from ``CUPTI_ACTIVITY_KIND_KERNEL`` and joined with
    ``StringIds`` to resolve each event's demangled kernel name. Kernels whose
    name contains any entry of ``interference_kernels`` are discarded.

    Args:
        nsys_trace: Path to the ``.sqlite`` trace file.

    Returns:
        A DataFrame with exactly one row. Each column corresponds to one
        remaining kernel launch, labelled with the kernel name (labels may
        repeat) and holding that launch's duration in milliseconds. Columns
        are ordered by kernel start time.
    """
    conn = sqlite3.connect(nsys_trace)
    try:
        # ORDER BY makes the column order chronological and deterministic;
        # callers slice the result positionally, and SQL gives no ordering
        # guarantee without it.
        df = pd.read_sql("""
            SELECT * FROM
            CUPTI_ACTIVITY_KIND_KERNEL kernel_events
            LEFT JOIN StringIds kernel_names ON kernel_events.demangledName = kernel_names.id
            ORDER BY kernel_events.start;
        """, conn)
    finally:
        # sqlite3 connections are not closed automatically; release the file
        # handle even if the query fails.
        conn.close()

    df[KERNEL_NAME] = df["value"]
    # Raw timestamps are divided by 1e6 to obtain milliseconds
    # (i.e. they are presumably nanoseconds -- confirm against the nsys schema).
    df[START] = df["start"] / 1e6
    df[END] = df["end"] / 1e6
    df[DURATION] = df[END] - df[START]

    # Drop interference kernels. The LEFT JOIN can yield a NULL name, which is
    # never treated as an interference kernel. Building an explicit boolean
    # Series keeps the negation valid for an empty trace as well.
    is_interference = pd.Series(
        [
            isinstance(name, str) and any(ik in name for ik in interference_kernels)
            for name in df[KERNEL_NAME]
        ],
        index=df.index,
        dtype=bool,
    )
    df = df[~is_interference].reset_index(drop=True)

    # Pivot to a single row: one column per kernel launch, value = duration.
    lats_df = pd.DataFrame.from_records([tuple(df[DURATION])], columns=df[KERNEL_NAME])

    return lats_df


def extract_last_decode_kernel_lats(args, num_decode_kernels: int):
    """Collect decode-kernel latencies from every trace in a folder into a CSV.

    Each ``.sqlite`` file in ``args.input_folder`` is parsed with
    ``read_sqlite_trace`` and contributes one row. From each trace only the
    trailing ``num_decode_kernels * (args.num_tokens - 1)`` kernel columns are
    kept. The combined table is written to ``latencies.csv`` inside
    ``args.input_folder``.

    Args:
        args: Parsed CLI namespace; ``input_folder`` and ``num_tokens`` are read.
        num_decode_kernels: Number of kernels per decode iteration.

    Raises:
        ValueError: If the requested number of trailing columns is not
            positive (e.g. ``num_tokens < 2``), or if the folder contains no
            ``.sqlite`` traces.
    """
    num_cols = num_decode_kernels * (args.num_tokens - 1)
    if num_cols < 1:
        # A zero count would turn the slice into ``iloc[:, -0:]``, which
        # selects *every* column instead of none.
        raise ValueError(
            "No decode kernels to extract: need num_tokens >= 2 and "
            f"num_decode_kernels >= 1 (got num_tokens={args.num_tokens}, "
            f"num_decode_kernels={num_decode_kernels})"
        )

    # Sort so that the CSV row order is reproducible; os.listdir order is arbitrary.
    trace_files = sorted(
        name for name in os.listdir(args.input_folder) if name.endswith(".sqlite")
    )
    if not trace_files:
        raise ValueError(f"No .sqlite traces found in {args.input_folder}")

    # Keep only the trailing decode-kernel columns of each trace.
    lats = [
        read_sqlite_trace(os.path.join(args.input_folder, name)).iloc[:, -num_cols:]
        for name in trace_files
    ]

    # Combine all latencies into a single DataFrame (one row per trace).
    all_lats_df = pd.concat(lats, ignore_index=True)
    all_lats_df.to_csv(os.path.join(args.input_folder, "latencies.csv"), index=False)


if __name__ == "__main__":
    # CLI entry point: look up the kernel counts for the traced configuration
    # and, when --decode is given, write the decode-kernel latencies to CSV.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_folder", type=str, required=True, help="Folder to single layer nsys traces")
    parser.add_argument("--batch_size", type=int, required=True, help="Batch size used in the trace")
    parser.add_argument("--prompt_size", type=int, required=True, help="Prompt size used in the trace")
    parser.add_argument("--num_tokens", type=int, default=2, help="Number of tokens generated per request")
    parser.add_argument("--decode", action="store_true", help="Extract latencies for decode kernels only")
    args = parser.parse_args()

    kernel_counts = num_kernels.get((args.batch_size, args.prompt_size))
    if kernel_counts is None:
        # Report bad input through argparse rather than `assert`, which is
        # stripped when Python runs with -O.
        parser.error(
            f"Number of prefill and decode kernels unknown for batch size {args.batch_size} and prompt size {args.prompt_size}"
        )
    # The prefill count is unpacked but not used by this script.
    num_prefill_kernels, num_decode_kernels = kernel_counts

    # Without --decode nothing is extracted.
    if args.decode:
        extract_last_decode_kernel_lats(args, num_decode_kernels)

2 changes: 1 addition & 1 deletion inter_sm/gcontext_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


model_names = {
"llama": "meta-llama/Llama-3.1-8B",
"llama": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"mistral": "mistralai/Ministral-8B-Instruct-2410",
"qwen": "Qwen/Qwen2.5-7B-Instruct",
"gemma-9b": "gogole/gemma-2-9b-it",
Expand Down
34 changes: 34 additions & 0 deletions inter_sm/l2_cache/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Measuring/Plotting TPOT latency with full model

1. Compile for the GPU architecture you are running on, if not already done.

2. Gather necessary NCU metrics from the interference (copy) kernel
```bash
bash inter_sm/l2_cache/ncu_profile_l2_interf_kernel.sh
```

3. Measure the TPOT latency for the full model
```bash
bash inter_sm/l2_cache/nsys_benchmark_l2_full_model.sh
```

# Measuring/Plotting slowdown per kernel

1. Compile for the GPU architecture you are running on, if not already done.

2. Benchmark the latency of the model with a single hidden layer while it is co-located with the copy kernel, progressively increasing the amount of data the copy kernel copies

```bash
bash inter_sm/l2_cache/nsys_benchmark_l2_single_layer.sh
```

3. Collect the NCU metrics of each kernel of the model with a single layer if not already done

```bash
bash inter_sm/ncu_prof_model_single_layer.sh
```

4. Generate the plots, adjusting the batch size and prompt size as needed
```bash
bash inter_sm/l2_cache/generate_l2_kernel_trace_plots.sh
```
Loading
Loading