MagicEnc/bench.py at main · dreaming-panda/MagicEnc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Prefill-throughput benchmark: load the long-context Llama checkpoint, push
# SEQ_LEN random tokens through the model's `prefilling` method, and report
# how many tokens per second were processed.
import os
import time

# Select the GPU BEFORE anything initializes CUDA. The CUDA runtime reads
# CUDA_VISIBLE_DEVICES only once, at context creation; setting it after a
# torch.cuda call that initializes the context has no effect, and the model
# would silently land on physical GPU 0 instead.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from llama import LlamaForCausalLM

# With a single visible device, physical GPU 6 is exposed to this process as
# index 0, so the 50% memory cap must target index 0 to apply to the card
# that actually runs the benchmark.
torch.cuda.set_per_process_memory_fraction(0.5, 0)

name_or_path = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
tokenizer = AutoTokenizer.from_pretrained(name_or_path, trust_remote_code=True)
llm: LlamaForCausalLM = LlamaForCausalLM.from_pretrained(
    name_or_path, torch_dtype=torch.bfloat16
).cuda()

# Number of prompt tokens fed to the model in this run.
SEQ_LEN = 262144
# Random token ids in [3, 30000); only throughput is measured, so the
# content of the prompt does not matter.
input_ids = torch.randint(low=3, high=30000, size=(1, SEQ_LEN), device="cuda")

# CUDA kernels launch asynchronously: synchronize before reading the clock on
# both sides so the interval covers the GPU work, not just its launch.
torch.cuda.synchronize()
t1 = time.perf_counter()
# `prefilling` is provided by the project-local LlamaForCausalLM; it returns
# the logits and a cache object (kept in `cache` for a follow-up decode run).
logits, cache = llm.prefilling(input_ids, chunk_size=4096, allow_fp16_qk_reduction=False, swap_memory=True, reserved_layers=0)
torch.cuda.synchronize()
t2 = time.perf_counter()
print("Prefill throughput {} token/s".format((SEQ_LEN/(t2 - t1))))

# GEN_LEN = 256
# input_ids = torch.randint(low=3, high=30000, size=(1, 1))
# past_key_values = cache
# torch.cuda.synchronize()
# t1 = time.perf_counter()
# with torch.inference_mode():
#     for _ in range(GEN_LEN):
#         output = llm(input_ids=input_ids, past_key_values=past_key_values, use_cache=True)
#         past_key_values = output.past_key_values
# torch.cuda.synchronize()
# t2 = time.perf_counter()
# print("Decode throughput {} token/s".format((GEN_LEN/(t2 - t1))))