Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 79 additions & 13 deletions kv_cache_manager/optimizer/analysis/script/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,15 @@ bazel run //kv_cache_manager/optimizer/analysis/script:optimizer_run -- -c confi

### 输出

- `<output_result_path>/*_hit_rates.csv` — 每个 instance 的命中率时序数据
- `<output_result_path>/multi_instance_cache_analysis.png` — 命中率时序图(需 `--draw-chart`)
- `<output_result_path>/*_lifecycle.csv` — block 生命周期数据(需 `--export-lifecycle`)
```
<output_result_path>/
├── *_hit_rates.csv # 每个 instance 的命中率时序数据
├── *_template_prefix_traces.csv # per-trace 模板归属明细
├── *_template_prefix_summary.csv # 模板级汇总
├── *_lifecycle.csv # block 生命周期数据(需 --export-lifecycle)
└── timeseries/
└── multi_instance_cache_analysis.png # 命中率时序图(需 --draw-chart)
```

---

Expand Down Expand Up @@ -106,9 +112,19 @@ bazel run //kv_cache_manager/optimizer/analysis/script:tradeoff -- \

### 输出

- `pareto_curve_<type>.png` — 单策略 Pareto 散点图
- `multi_policy_<type>.png` — 多策略对比子图
- `csv_results/cap_<capacity>_<policy>/` — 每次运行的 CSV(需 `--save-csv`)
```
<output_result_path>/
├── pareto/
│ ├── pareto_curve_<type>.png # 单策略 Pareto 散点图
│ └── multi_policy_<type>.png # 多策略对比子图
├── timeseries/
│ └── multi_instance_cache_analysis.png # 时序图(需 --plot-timeseries)
└── csv_results/ # 需 --save-csv
└── cap_<capacity>_<policy>/
├── *_hit_rates.csv
├── *_template_prefix_traces.csv
└── *_template_prefix_summary.csv
```

---

Expand Down Expand Up @@ -180,9 +196,13 @@ python kv_cache_manager/optimizer/analysis/script/plot/radix_tree_plot.py \

### 输出

- `<instance>_radix_tree.json` — 前缀树结构数据
- `<instance>_radix_tree.png` — 完整树可视化
- `<instance>_hot_paths.png` — 热点路径可视化
```
<output_result_path>/
└── radix_tree/
├── <instance>_radix_tree.json # 前缀树结构数据
├── <instance>_radix_tree.png # 完整树可视化
└── <instance>_hot_paths.png # 热点路径可视化
```

---

Expand Down Expand Up @@ -227,10 +247,15 @@ bazel run //kv_cache_manager/optimizer/analysis/script:analyze_lifecycle -- \

### 输出

- 控制台统计报告
- `<instance>_physical_lifespan_cdf.png` — Physical Lifespan CDF(全量 + Evicted)
- `<instance>_active_lifespan_cdf.png` — Active Lifespan CDF
- `<instance>_access_count.png` — Access Count 直方图(全量 + 去零两张子图)
```
<output_result_path>/
└── lifecycle/
├── <instance>_physical_lifespan_cdf.png # Physical Lifespan CDF(全量 + Evicted)
├── <instance>_active_lifespan_cdf.png # Active Lifespan CDF
└── <instance>_access_count.png # Access Count 直方图(全量 + 去零两张子图)
```

控制台同步输出统计报告。

---

Expand Down Expand Up @@ -285,4 +310,45 @@ script/
├── optimizer_runner.py # optimizer 运行封装
├── csv_loader.py # CSV 加载 + 容量列表
└── plot_utils.py # 绘图风格 + Pareto 绘图
```

---

## 输出目录总览

所有脚本共享同一个根目录 `<output_result_path>`(来自 config.json `output_result_path` 字段)。

```
<output_result_path>/
│ # ── C++ optimizer 原始数据输出 ──────────────────────────────
├── *_hit_rates.csv # 命中率时序(每条 trace 上报)
├── *_template_prefix_traces.csv # per-trace 模板归属明细
├── *_template_prefix_summary.csv # 模板级汇总
├── *_lifecycle.csv # block 生命周期(需 --export-lifecycle)
│ # ── Python 图表输出 ────────────────────────────────────────
├── pareto/ # tradeoff
│ ├── pareto_curve_<type>.png
│ └── multi_policy_<type>.png
├── timeseries/ # optimizer_run --draw-chart / tradeoff --plot-timeseries
│ └── multi_instance_cache_analysis.png
├── lifecycle/ # analyze_lifecycle
│ ├── *_physical_lifespan_cdf.png
│ ├── *_active_lifespan_cdf.png
│ └── *_access_count.png
├── radix_tree/ # export_tree
│ ├── *_radix_tree.json
│ ├── *_radix_tree.png
│ └── *_hot_paths.png
│ # ── tradeoff --save-csv 实验中间数据 ─────────────────────────
└── csv_results/
└── cap_<N>_<policy>/
├── *_hit_rates.csv
├── *_template_prefix_traces.csv
└── *_template_prefix_summary.csv
```
76 changes: 72 additions & 4 deletions kv_cache_manager/optimizer/analysis/script/plot/hit_rate_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,49 @@ def read_csv_file(csv_file_path):
print(f"Error reading {csv_file_path}: {str(e)}")
return None

def plot_multi_instance_analysis(csv_dir):
def _load_sp_cumulative(csv_dir, instance_name):
    """
    Compute the cumulative system-prompt (SP) hit-rate time series of one instance.

    The data is read from ``<csv_dir>/<basename>_template_prefix_traces.csv``,
    where ``<basename>`` is ``instance_name`` with the text ``_hit_rates``
    removed.

    Args:
        csv_dir: Directory that holds the per-instance CSV files.
        instance_name: Instance name, optionally carrying the ``_hit_rates``
            suffix of the hit-rate CSV it was derived from.

    Returns:
        ``None`` when the template-prefix trace CSV does not exist; otherwise a
        DataFrame with columns ``[TimestampUs, AccSpHitRate]`` ordered by
        timestamp, where

            AccSpHitRate = cumsum(min(HitBlocks, TemplateDepth)) / cumsum(TotalBlocks)

        Rows without a template (``TemplateId == 'NONE'`` or
        ``TemplateDepth <= 0``) contribute zero SP hits. The rate is 0.0 for
        as long as the cumulative total is still 0.
    """
    basename = instance_name.replace("_hit_rates", "")
    sp_path = os.path.join(csv_dir, f"{basename}_template_prefix_traces.csv")
    if not os.path.exists(sp_path):
        return None

    df = pd.read_csv(sp_path)
    # trace_id format: trace_<instance>_<timestamp_us>
    df['TimestampUs'] = df['TraceId'].str.rsplit('_', n=1).str[-1].astype(np.int64)
    df = df.sort_values('TimestampUs')

    # Hits attributable to the system prompt are capped by the template depth.
    sp_hits = np.where(
        (df['TemplateId'] != 'NONE') & (df['TemplateDepth'] > 0),
        np.minimum(df['HitBlocks'].values, df['TemplateDepth'].values),
        0,
    )

    cum_sp_hits = np.cumsum(sp_hits)
    cum_total = np.cumsum(df['TotalBlocks'].values)

    # Masked division: np.where(mask, a / b, 0.0) would still evaluate a / b
    # for every row and divide by zero while the cumulative total is 0.
    acc_sp_rate = np.divide(
        cum_sp_hits,
        cum_total,
        out=np.zeros(len(cum_total), dtype=float),
        where=cum_total > 0,
    )

    return pd.DataFrame({
        'TimestampUs': df['TimestampUs'].values,
        'AccSpHitRate': acc_sp_rate,
    })


def plot_multi_instance_analysis(csv_dir, output_dir: str = None):
"""
读取 csv_dir 下的命中率 CSV,生成时序分析图。

Args:
csv_dir: CSV 数据目录
output_dir: 图表根输出目录,图表保存至 output_dir/timeseries/
默认为 csv_dir(向后兼容)
"""
csv_files = sorted(glob.glob(os.path.join(csv_dir, "*_hit_rates.csv")))
if not csv_files:
print(f"Error: No CSV files found in directory: {csv_dir}")
Expand Down Expand Up @@ -77,8 +119,8 @@ def plot_multi_instance_analysis(csv_dir):
base = pd.DataFrame({'t': base_timestamps}) # 用于merge_asof

all_acc_hit, all_acc_external_hit, all_time_ranges = [], [], []
# 用于瞬时命中率计算:累积读块数 / 累积命中块数(反推)
all_acc_read_blocks, all_acc_hit_blocks, all_acc_ext_hit_blocks = [], [], []
all_acc_sp_hit = []
global_updates_list = []
for df in dataframes:
d = df.copy()
Expand Down Expand Up @@ -114,6 +156,22 @@ def plot_multi_instance_analysis(csv_dir):
all_acc_hit_blocks.append(aligned['AccHitBlocks'].to_numpy(float))
all_acc_ext_hit_blocks.append(aligned['AccExtHitBlocks'].to_numpy(float))

# ---- SP 累积命中率对齐 ----
for idx, name in enumerate(instance_names):
sp_df = _load_sp_cumulative(csv_dir, name)
if sp_df is None:
all_acc_sp_hit.append(None)
continue
sp_df['t'] = (sp_df['TimestampUs'] - min_timestamp) / 1e6
sp_df = sp_df.sort_values('t')
sp_aligned = pd.merge_asof(
base, sp_df[['t', 'AccSpHitRate']], on='t',
direction='backward', allow_exact_matches=True
)
t0, _ = all_time_ranges[idx]
sp_aligned.loc[sp_aligned['t'] < t0, 'AccSpHitRate'] = np.nan
all_acc_sp_hit.append(sp_aligned['AccSpHitRate'].to_numpy(float))

global_updates = pd.concat(global_updates_list, ignore_index=True)
global_updates = global_updates.dropna(subset=['t', 'CachedBlocksAllInstance']).sort_values('t')

Expand Down Expand Up @@ -233,7 +291,7 @@ def window_hit_rate(timestamps, acc_hit_blocks, acc_read_blocks, window_seconds=
top_lines = [ax_top.lines[0]]
bot_lines = [ax_bot.lines[0]]

# 上图:累计命中率
# 上图:累计命中率 + system prompt 累积命中率
for i, name in enumerate(instance_names):
t0, t1 = all_time_ranges[i]
valid = (base_timestamps >= t0) & (base_timestamps <= t1)
Expand All @@ -246,6 +304,14 @@ def window_hit_rate(timestamps, acc_hit_blocks, acc_read_blocks, window_seconds=
linewidth=1.5, drawstyle='steps-post')
top_lines += l1

if all_acc_sp_hit[i] is not None:
sp_line = ax_top_r.plot(
base_timestamps[valid], np.array(all_acc_sp_hit[i])[valid],
color=colors[i], linestyle=':', linewidth=2.5, alpha=0.9,
label=f'{name} - SP AccHitRate',
drawstyle='steps-post')
top_lines += sp_line

# 下图:时间窗口内真实命中率(累积量差值)+ 按时间降采样
downsample_interval_s = 10 # 每隔 10 秒取一个代表点
window_seconds = 10 # 窗口内累积命中率的统计时间跨度
Expand Down Expand Up @@ -300,7 +366,9 @@ def window_hit_rate(timestamps, acc_hit_blocks, acc_read_blocks, window_seconds=
ax_top.set_title(f'Cache Analysis - {len(instance_names)} Instances', fontsize=15, fontweight='bold', pad=12)

fig.tight_layout()
output_file = os.path.join(csv_dir, "multi_instance_cache_analysis.png")
timeseries_dir = os.path.join(output_dir or csv_dir, "timeseries")
os.makedirs(timeseries_dir, exist_ok=True)
output_file = os.path.join(timeseries_dir, "multi_instance_cache_analysis.png")
plt.savefig(output_file, dpi=300, bbox_inches='tight', facecolor='white')
print(f"Chart saved to: {output_file}")
plt.close()
Expand Down
20 changes: 12 additions & 8 deletions kv_cache_manager/optimizer/analysis/script/plot/lifecycle_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,18 @@ def _annotate_percentiles(ax, sorted_data, percentiles, x_max):


def plot_physical_lifespan_cdf(all_sorted, evicted_sorted, instance_name, output_path):
"""Physical Lifespan CDF: 全量 + Evicted"""
"""Physical Lifespan CDF: 全量 + Evicted,统计标注基于 Evicted Only"""
n_all, n_ev = len(all_sorted), len(evicted_sorted)
if n_all == 0:
print(" 跳过 Physical Lifespan CDF: 无数据")
return

# 统计标注优先用 evicted_sorted(真实驱逐分布),无驱逐数据时 fallback 到 all
stat_data = evicted_sorted if n_ev > 0 else all_sorted

cdf_all = np.arange(1, n_all + 1) / n_all * 100
p99 = float(np.percentile(all_sorted, 99))
x_max = min(p99 * 1.5, float(all_sorted[-1])) if all_sorted[-1] > 0 else 1.0
p99 = float(np.percentile(stat_data, 99))
x_max = min(p99 * 1.5, float(stat_data[-1])) if stat_data[-1] > 0 else 1.0

fig, ax = plt.subplots(figsize=(14, 8))

Expand All @@ -55,14 +58,15 @@ def plot_physical_lifespan_cdf(all_sorted, evicted_sorted, instance_name, output
label=f"Evicted Only (n={n_ev:,})", alpha=0.8)
ax.fill_between(evicted_sorted, cdf_ev, alpha=0.1, color="red")

_annotate_percentiles(ax, all_sorted, [50, 75, 90, 95, 99], x_max)
_annotate_percentiles(ax, stat_data, [50, 75, 90, 95, 99], x_max)

mean_val = float(np.mean(all_sorted))
median_val = float(np.median(all_sorted))
mean_val = float(np.mean(stat_data))
median_val = float(np.median(stat_data))
stat_label = "Evicted" if n_ev > 0 else "All"
ax.axvline(mean_val, color="blue", linestyle="--", linewidth=2,
label=f"Mean: {mean_val:.1f}s", alpha=0.7)
label=f"Mean ({stat_label}): {mean_val:.1f}s", alpha=0.7)
ax.axvline(median_val, color="orange", linestyle="--", linewidth=2,
label=f"Median: {median_val:.1f}s", alpha=0.7)
label=f"Median ({stat_label}): {median_val:.1f}s", alpha=0.7)

ax.set_xlim([0, x_max])
ax.set_ylim([0, 105])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@


def analyze_single(csv_path: str, output_dir: str, stats_only: bool = False):
"""分析单个 lifecycle CSV"""
"""分析单个 lifecycle CSV,图表保存至 output_dir/lifecycle/"""
name = Path(csv_path).stem.replace("_lifecycle", "")
print(f"\n{'='*60}")
print(f"分析: {name}")
Expand All @@ -48,21 +48,22 @@ def analyze_single(csv_path: str, output_dir: str, stats_only: bool = False):
if stats_only:
return

os.makedirs(output_dir, exist_ok=True)
lifecycle_dir = os.path.join(output_dir, "lifecycle")
os.makedirs(lifecycle_dir, exist_ok=True)
plot_data = extract_plot_data(df)

print(f"\n生成图表:")
plot_physical_lifespan_cdf(
plot_data["physical_all"], plot_data["physical_evicted"],
name, os.path.join(output_dir, f"{name}_physical_lifespan_cdf.png"))
name, os.path.join(lifecycle_dir, f"{name}_physical_lifespan_cdf.png"))

plot_active_lifespan_cdf(
plot_data["active_all"],
name, os.path.join(output_dir, f"{name}_active_lifespan_cdf.png"))
name, os.path.join(lifecycle_dir, f"{name}_active_lifespan_cdf.png"))

plot_access_count_histogram(
plot_data["access_counts"],
name, os.path.join(output_dir, f"{name}_access_count.png"))
name, os.path.join(lifecycle_dir, f"{name}_access_count.png"))


def main():
Expand Down
7 changes: 4 additions & 3 deletions kv_cache_manager/optimizer/analysis/script/run/export_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,15 @@ def main():
sys.exit(1)
config = config_loader.config()

output_dir = Path(args.output_dir) if args.output_dir else Path(config.output_result_path())
root_dir = Path(args.output_dir) if args.output_dir else Path(config.output_result_path())

This comment was marked as resolved.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed output_dir in export_tree.py script

output_dir = root_dir / "radix_tree"
output_dir.mkdir(parents=True, exist_ok=True)

print("=" * 80)
print("Radix Tree Export and Visualization")
print("=" * 80)
print("Config: {}".format(args.config))
print("Output: {}".format(output_dir))
print("Output: {}".format(root_dir))
print()

optimizer = kvcm_py_optimizer.OptimizerManager(config)
Expand Down Expand Up @@ -160,7 +161,7 @@ def main():
)

print("\n" + "=" * 80)
print("Done! Output: {}".format(output_dir))
print("Done! Output: {}".format(root_dir))
print("=" * 80)

kvcm_py_optimizer.LoggerBroker.DestroyLogger()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def main():
if args.draw_chart:
t5 = time.time()
print("\n[5/5] Generating charts...")
plot_multi_instance_analysis(output_path)
plot_multi_instance_analysis(output_path, output_path)
print(" Charts done: {:.2f}s".format(time.time() - t5))
else:
print("\n[5/5] Skipping chart generation.")
Expand Down
8 changes: 4 additions & 4 deletions kv_cache_manager/optimizer/analysis/script/run/tradeoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ def _print_multi_policy_table(results_by_policy, policies):
# 时序图
# ============================================================================

def _plot_timeseries(csv_save_dir, results_by_policy, target_caps=None):
"""为指定容量点生成命中率时序图"""
def _plot_timeseries(csv_save_dir, results_by_policy, output_dir, target_caps=None):
"""为指定容量点生成命中率时序图,图表保存至 output_dir/timeseries/"""
print("\n" + "=" * 60)
print("Generating Timeseries Plots")
print("=" * 60)
Expand All @@ -118,7 +118,7 @@ def _plot_timeseries(csv_save_dir, results_by_policy, target_caps=None):
if os.path.exists(cap_dir):
print("Plotting {} capacity={}...".format(pol, cap))
try:
plot_multi_instance_analysis(cap_dir)
plot_multi_instance_analysis(cap_dir, output_dir)
count += 1
except Exception as e:
print(" Failed: {}".format(e))
Expand Down Expand Up @@ -264,7 +264,7 @@ def main():
# ----------------------------------------------------------------
has_csv = args.save_csv or args.skip_run
if args.plot_timeseries and has_csv:
_plot_timeseries(csv_save_dir, results_by_policy, args.plot_capacity)
_plot_timeseries(csv_save_dir, results_by_policy, output_dir, args.plot_capacity)
elif args.plot_timeseries and not has_csv:
print("\nWarning: --plot-timeseries requires --save-csv or --skip-run")

Expand Down
Loading
Loading