Skip to content

[NPU] Add optimized NPU mhc #1173

Open
lowdy1 wants to merge 3 commits into linkedin:main from
lowdy1:mhc_npu
Open

[NPU] Add optimized NPU mhc #1173
lowdy1 wants to merge 3 commits into linkedin:main from
lowdy1:mhc_npu

Conversation

@lowdy1
Copy link
Copy Markdown
Contributor

@lowdy1 lowdy1 commented Mar 28, 2026

Add Ascend NPU Triton kernels for the mHC sub-operators:

  • Fused matmul + RMS normalization (forward/backward)
  • Sinkhorn routing with split pre/post/residual coefficients (forward/backward)
  • Pre-aggregate weighted sum (forward/backward)
  • Post + residual mixing (forward/backward)

NPU optimizations applied:

  • Unified UB tiling via compute_default_tiling_strategy for matrix
  • Persistent grid-stride loops (tl.range + num_programs)
  • Adaptive BLOCK_N/BLOCK_M for core utilisation at small seq_len
  • Fused backward coefficient assembly kernel

Hardware Type: Atlas 800I A2

  • run `make test` to ensure correctness
  • run `make checkstyle` to ensure code style
  • run `make test-convergence` to ensure convergence

@noemotiovon noemotiovon mentioned this pull request Mar 30, 2026
3 tasks
@lowdy1
Copy link
Copy Markdown
Contributor Author

lowdy1 commented Apr 8, 2026

mhc_coeffs_speed_full_token_length mhc_pre_speed_full_token_length mhc_post_res_speed_full_token_length

@lowdy1
Copy link
Copy Markdown
Contributor Author

lowdy1 commented Apr 8, 2026

**************************************
     BENCHMARKING SPEED for MHC_COEFFS
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "mhc_coeffs",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      3.1525299549102783,
      3.397739887237549,
      3.635780096054077,
      4.77672004699707,
      6.1290998458862305
    ],
    "y_values_20": [
      3.0985679626464844,
      3.295844078063965,
      3.5463480949401855,
      4.69598388671875,
      6.091179847717285
    ],
    "y_values_80": [
      3.2731680870056152,
      3.4446959495544434,
      3.736743927001953,
      4.858716011047363,
      6.218291759490967
    ],
    "timestamp": "2026-04-08 08:35:35",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"coeffs\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_coeffs",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      3.1888198852539062,
      3.000959873199463,
      3.083699941635132,
      4.059080123901367,
      6.724510192871094
    ],
    "y_values_20": [
      3.0566399097442627,
      2.9898040294647217,
      3.0508079528808594,
      4.04995584487915,
      6.71122407913208
    ],
    "y_values_80": [
      3.3086600303649902,
      3.0244319438934326,
      3.2188119888305664,
      4.067448139190674,
      6.734888076782227
    ],
    "timestamp": "2026-04-08 08:35:36",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"coeffs\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_coeffs",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      4.272759914398193,
      4.66588020324707,
      5.566180229187012,
      9.460880279541016,
      13.468339920043945
    ],
    "y_values_20": [
      4.251659870147705,
      4.625919818878174,
      5.538640022277832,
      9.351024627685547,
      13.289239883422852
    ],
    "y_values_80": [
      4.3094000816345215,
      4.691267967224121,
      5.596799850463867,
      9.647804260253906,
      13.659379959106445
    ],
    "timestamp": "2026-04-08 08:35:37",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"coeffs\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_coeffs",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      7.03741979598999,
      7.107659816741943,
      7.500259876251221,
      12.01867961883545,
      20.657928466796875
    ],
    "y_values_20": [
      6.9990763664245605,
      7.088927745819092,
      7.483860015869141,
      11.998956680297852,
      20.630111694335938
    ],
    "y_values_80": [
      7.074220180511475,
      7.138556003570557,
      7.509880065917969,
      12.028979301452637,
      20.665538787841797
    ],
    "timestamp": "2026-04-08 08:35:38",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"coeffs\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_coeffs",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      7.522640228271484,
      8.09965991973877,
      9.331859588623047,
      14.421659469604492,
      19.808300018310547
    ],
    "y_values_20": [
      7.415900230407715,
      8.050780296325684,
      9.25543212890625,
      14.342860221862793,
      19.66875648498535
    ],
    "y_values_80": [
      7.565244197845459,
      8.157699584960938,
      9.395907402038574,
      14.620200157165527,
      19.97071647644043
    ],
    "timestamp": "2026-04-08 08:35:39",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"coeffs\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_coeffs",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      11.815910339355469,
      11.890480041503906,
      11.838839530944824,
      15.964380264282227,
      27.55221939086914
    ],
    "y_values_20": [
      11.780595779418945,
      11.821011543273926,
      11.827340126037598,
      15.9473876953125,
      27.534530639648438
    ],
    "y_values_80": [
      11.898123741149902,
      11.930831909179688,
      11.867332458496094,
      15.97535228729248,
      27.575956344604492
    ],
    "timestamp": "2026-04-08 08:35:40",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"coeffs\"}",
    "liger_version": "0.7.0"
  }
]
**************************************
     BENCHMARKING MEMORY for MHC_COEFFS
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "mhc_coeffs",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      83.0322265625,
      164.548828125,
      325.3349609375,
      648.4052734375,
      1294.5458984375
    ],
    "y_values_20": [
      83.0322265625,
      164.548828125,
      325.3349609375,
      648.4052734375,
      1294.5458984375
    ],
    "y_values_80": [
      83.0322265625,
      164.548828125,
      325.3349609375,
      648.4052734375,
      1294.5458984375
    ],
    "timestamp": "2026-04-08 08:35:40",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"coeffs\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_coeffs",
    "kernel_provider": "torch",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      225.55810546875,
      449.60498046875,
      897.69873046875,
      1793.88623046875,
      3586.26123046875
    ],
    "y_values_20": [
      225.55810546875,
      449.60498046875,
      897.69873046875,
      1793.88623046875,
      3586.26123046875
    ],
    "y_values_80": [
      225.55810546875,
      449.60498046875,
      897.69873046875,
      1793.88623046875,
      3586.26123046875
    ],
    "timestamp": "2026-04-08 08:35:41",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"coeffs\"}",
    "liger_version": "0.7.0"
  }
]
**************************************
     BENCHMARKING SPEED for MHC_PRE
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "mhc_pre",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.2706499993801117,
      0.2808300256729126,
      0.305759996175766,
      0.36250001192092896,
      0.4489800035953522
    ],
    "y_values_20": [
      0.26260799169540405,
      0.27570000290870667,
      0.2987399995326996,
      0.35867199301719666,
      0.44839999079704285
    ],
    "y_values_80": [
      0.2788279950618744,
      0.2866800129413605,
      0.3097600042819977,
      0.3701480031013489,
      0.4495680034160614
    ],
    "timestamp": "2026-04-08 08:35:42",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"pre\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_pre",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.19262000918388367,
      0.4295499920845032,
      1.0013999938964844,
      1.9880599975585938,
      3.943240165710449
    ],
    "y_values_20": [
      0.19025199115276337,
      0.4275040030479431,
      0.9994999766349792,
      1.9841400384902954,
      3.9388718605041504
    ],
    "y_values_80": [
      0.19528800249099731,
      0.43254798650741577,
      1.0030479431152344,
      1.9931199550628662,
      3.9482040405273438
    ],
    "timestamp": "2026-04-08 08:35:43",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"pre\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_pre",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.6086199879646301,
      0.6297100186347961,
      0.8999999761581421,
      1.6647599935531616,
      3.4969398975372314
    ],
    "y_values_20": [
      0.6029640436172485,
      0.6237280368804932,
      0.8988360166549683,
      1.6631840467453003,
      3.4953360557556152
    ],
    "y_values_80": [
      0.6203519701957703,
      0.6414719820022583,
      0.9022400379180908,
      1.6677119731903076,
      3.4976561069488525
    ],
    "timestamp": "2026-04-08 08:35:43",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"pre\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_pre",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.5481699705123901,
      1.2595800161361694,
      2.5191898345947266,
      5.325829982757568,
      11.012419700622559
    ],
    "y_values_20": [
      0.5460720062255859,
      1.2563040256500244,
      2.5147640705108643,
      5.323080062866211,
      11.002732276916504
    ],
    "y_values_80": [
      0.5500959753990173,
      1.261944055557251,
      2.524019956588745,
      5.337560176849365,
      11.01311206817627
    ],
    "timestamp": "2026-04-08 08:35:44",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"pre\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_pre",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.9966100454330444,
      1.050469994544983,
      1.1100000143051147,
      2.0029001235961914,
      4.172019958496094
    ],
    "y_values_20": [
      0.9887400269508362,
      1.0352760553359985,
      1.1074119806289673,
      2.0001919269561768,
      4.169595718383789
    ],
    "y_values_80": [
      1.009220004081726,
      1.0630000829696655,
      1.1152480840682983,
      2.0053679943084717,
      4.175796031951904
    ],
    "timestamp": "2026-04-08 08:35:45",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"pre\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_pre",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.7002800107002258,
      1.7106200456619263,
      3.548689842224121,
      7.470200061798096,
      15.546460151672363
    ],
    "y_values_20": [
      0.6967999935150146,
      1.7071239948272705,
      3.5425760746002197,
      7.467360019683838,
      15.540656089782715
    ],
    "y_values_80": [
      0.7029759883880615,
      1.7146079540252686,
      3.5539281368255615,
      7.475180149078369,
      15.551715850830078
    ],
    "timestamp": "2026-04-08 08:35:46",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"pre\"}",
    "liger_version": "0.7.0"
  }
]
**************************************
     BENCHMARKING MEMORY for MHC_PRE
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "mhc_pre",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      96.8134765625,
      192.8681640625,
      384.9775390625,
      769.19677734375,
      1537.6337890625
    ],
    "y_values_20": [
      96.8134765625,
      192.8681640625,
      384.9775390625,
      769.19677734375,
      1537.6337890625
    ],
    "y_values_80": [
      96.8134765625,
      192.8681640625,
      384.9775390625,
      769.19677734375,
      1537.6337890625
    ],
    "timestamp": "2026-04-08 08:35:46",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"pre\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_pre",
    "kernel_provider": "torch",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      176.80615234375,
      352.85302734375,
      704.94677734375,
      1409.13427734375,
      2817.50927734375
    ],
    "y_values_20": [
      176.80615234375,
      352.85302734375,
      704.94677734375,
      1409.13427734375,
      2817.50927734375
    ],
    "y_values_80": [
      176.80615234375,
      352.85302734375,
      704.94677734375,
      1409.13427734375,
      2817.50927734375
    ],
    "timestamp": "2026-04-08 08:35:46",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"pre\"}",
    "liger_version": "0.7.0"
  }
]
**************************************
     BENCHMARKING SPEED for MHC_POST_RES
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "mhc_post_res",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.292279988527298,
      0.3346799910068512,
      0.4829599857330322,
      0.8054200410842896,
      1.4581600427627563
    ],
    "y_values_20": [
      0.28380000591278076,
      0.3301520049571991,
      0.48169201612472534,
      0.8045399785041809,
      1.4570159912109375
    ],
    "y_values_80": [
      0.30197200179100037,
      0.3397560119628906,
      0.48456400632858276,
      0.8062599897384644,
      1.4608960151672363
    ],
    "timestamp": "2026-04-08 08:35:47",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"post_res\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_post_res",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.4698199927806854,
      1.1576600074768066,
      2.262889862060547,
      4.440579891204834,
      8.987939834594727
    ],
    "y_values_20": [
      0.4691239893436432,
      1.1566799879074097,
      2.261120080947876,
      4.438340187072754,
      8.980844497680664
    ],
    "y_values_80": [
      0.4705960154533386,
      1.159567952156067,
      2.264523983001709,
      4.44212007522583,
      8.997260093688965
    ],
    "timestamp": "2026-04-08 08:35:48",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"post_res\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_post_res",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.7601000070571899,
      1.263659954071045,
      2.6509499549865723,
      5.435160160064697,
      10.660539627075195
    ],
    "y_values_20": [
      0.7533800005912781,
      1.2619999647140503,
      2.6485278606414795,
      5.434067726135254,
      10.657916069030762
    ],
    "y_values_80": [
      0.7742800116539001,
      1.2660000324249268,
      2.654279947280884,
      5.438051700592041,
      10.671500205993652
    ],
    "timestamp": "2026-04-08 08:35:48",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"post_res\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_post_res",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      1.2056500911712646,
      2.8477399349212646,
      5.860099792480469,
      11.544690132141113,
      22.9312801361084
    ],
    "y_values_20": [
      1.2019200325012207,
      2.8453400135040283,
      5.8562116622924805,
      11.530872344970703,
      22.918855667114258
    ],
    "y_values_80": [
      1.2080999612808228,
      2.854140043258667,
      5.875947952270508,
      11.552400588989258,
      22.938936233520508
    ],
    "timestamp": "2026-04-08 08:35:49",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"post_res\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_post_res",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      1.240920066833496,
      1.6016199588775635,
      3.378469944000244,
      6.856750011444092,
      13.33169937133789
    ],
    "y_values_20": [
      1.229915976524353,
      1.598215937614441,
      3.375324010848999,
      6.855123996734619,
      13.329719543457031
    ],
    "y_values_80": [
      1.2539280652999878,
      1.6045440435409546,
      3.3836960792541504,
      6.865488052368164,
      13.33292007446289
    ],
    "timestamp": "2026-04-08 08:35:50",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"post_res\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_post_res",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      1.676259994506836,
      4.242280006408691,
      8.69873046875,
      17.18317985534668,
      34.343929290771484
    ],
    "y_values_20": [
      1.6736639738082886,
      4.237880229949951,
      8.694124221801758,
      17.176347732543945,
      34.336544036865234
    ],
    "y_values_80": [
      1.6791640520095825,
      4.245220184326172,
      8.705008506774902,
      17.1884708404541,
      34.351314544677734
    ],
    "timestamp": "2026-04-08 08:35:51",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"post_res\"}",
    "liger_version": "0.7.0"
  }
]
**************************************
     BENCHMARKING MEMORY for MHC_POST_RES
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "mhc_post_res",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      164.88818359375,
      329.01220703125,
      657.26220703125,
      1313.76220703125,
      2626.76220703125
    ],
    "y_values_20": [
      164.88818359375,
      329.01220703125,
      657.26220703125,
      1313.76220703125,
      2626.76220703125
    ],
    "y_values_80": [
      164.88818359375,
      329.01220703125,
      657.26220703125,
      1313.76220703125,
      2626.76220703125
    ],
    "timestamp": "2026-04-08 08:35:51",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"post_res\"}",
    "liger_version": "0.7.0"
  },
  {
    "kernel_name": "mhc_post_res",
    "kernel_provider": "torch",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "T",
    "x_label": "Sequence Length (T)",
    "x_values": [
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      240.84716796875,
      480.93310546875,
      961.10498046875,
      1921.44873046875,
      3842.13623046875
    ],
    "y_values_20": [
      240.84716796875,
      480.93310546875,
      961.10498046875,
      1921.44873046875,
      3842.13623046875
    ],
    "y_values_80": [
      240.84716796875,
      480.93310546875,
      961.10498046875,
      1921.44873046875,
      3842.13623046875
    ],
    "timestamp": "2026-04-08 08:35:52",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 4, \"HC\": 4, \"C\": 4096, \"tmax\": 20, \"rms_eps\": 1e-06, \"pre_eps\": 0.0, \"sinkhorn_eps\": 1e-06, \"post_mult\": 2.0, \"sub_kernel\": \"post_res\"}",
    "liger_version": "0.7.0"
  }
]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant