You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
<spanclass="bp">self</span><spanclass="o">.</span><spanclass="n">mean</span><spanclass="o">=</span><spanclass="n">Mean</span><spanclass="p">(</span><spanclass="n">dim</span><spanclass="o">=</span><spanclass="mi">1</span><spanclass="p">)</span><spanclass="c1"># average over the query heads</span>
318
+
<spanclass="bp">self</span><spanclass="o">.</span><spanclass="n">gemm</span><spanclass="o">=</span><spanclass="n">GeMM</span><spanclass="p">()</span><spanclass="c1"># GeMM(x, y) = y @ xᵀ</span>
319
+
<spanclass="bp">self</span><spanclass="o">.</span><spanclass="n">output_func</span><spanclass="o">=</span><spanclass="n">topK</span><spanclass="p">()</span><spanclass="c1"># must end in topK / approxTopK</span>
310
320
<spanclass="c1"># Cache-side ops (run once per finished page)</span>
<spanclass="bp">self</span><spanclass="o">.</span><spanclass="n">reduction</span><spanclass="o">=</span><spanclass="n">CMean</span><spanclass="p">(</span><spanclass="n">dim</span><spanclass="o">=</span><spanclass="mi">1</span><spanclass="p">)</span><spanclass="c1"># one centroid (mean key) per page</span>
@@ -317,9 +327,10 @@ <h2>Quick Example<a class="headerlink" href="#quick-example" title="Link to this
317
327
<spanclass="n">cache</span><spanclass="p">:</span><spanclass="n">Dict</span><spanclass="p">[</span><spanclass="nb">str</span><spanclass="p">,</span><spanclass="n">torch</span><spanclass="o">.</span><spanclass="n">Tensor</span><spanclass="p">],</span><spanclass="c1"># viewed as [S, r, c] per create_cache()</span>
<spanclass="bp">self</span><spanclass="o">.</span><spanclass="n">output_func</span><spanclass="p">(</span><spanclass="n">score</span><spanclass="p">,</span><spanclass="n">o</span><spanclass="p">,</span><spanclass="n">ctx</span><spanclass="o">=</span><spanclass="n">ctx</span><spanclass="p">)</span><spanclass="c1"># must end in topK / approxTopK</span>
330
+
<spanclass="c1"># No native torch ops here — every tensor flows through vortex ops.</span>
<spanclass="n">attention_backend</span><spanclass="o">=</span><spanclass="s2">"flashinfer"</span><spanclass="p">,</span><spanclass="c1"># SGLang's base backend</span>
353
+
<spanclass="n">attention_backend</span><spanclass="o">=</span><spanclass="s2">"flashinfer"</span><spanclass="p">,</span><spanclass="c1"># mandatory: SGLang's base backend</span>
354
+
<spanclass="n">disable_overlap_schedule</span><spanclass="o">=</span><spanclass="kc">True</span><spanclass="p">,</span><spanclass="c1"># mandatory for vortex sparsity</span>
343
355
<spanclass="n">enable_vortex_sparsity</span><spanclass="o">=</span><spanclass="kc">True</span><spanclass="p">,</span><spanclass="c1"># otherwise computes full attention</span>
344
356
<spanclass="n">vortex_topk_val</span><spanclass="o">=</span><spanclass="mi">30</span><spanclass="p">,</span><spanclass="c1"># pages kept per request</span>
0 commit comments