Skip to content

Commit 9fa4ac0

Browse files
Merge pull request #111 from DataScienceUIBK/demo
Demo
2 parents dfb7252 + 92881be commit 9fa4ac0

2 files changed

Lines changed: 122 additions & 129 deletions

File tree

demo-web/src/app/arena/page.tsx

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,8 @@ export default function ArenaPage() {
220220

221221
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
222222
{/* Result Card A */}
223-
<div className="bg-white p-6 rounded-2xl border border-slate-200 shadow-sm flex flex-col gap-6 relative overflow-hidden">
224-
{results.pipeline_a.mrr_10 > results.pipeline_b.mrr_10 && <div className="absolute top-0 right-0 w-16 h-16 bg-emerald-500 transform rotate-45 translate-x-8 -translate-y-8" />}
223+
<div className="bg-white p-6 rounded-2xl border border-slate-200 shadow-sm flex flex-col gap-4 relative overflow-hidden">
224+
{results.pipeline_a.ndcg_10 > results.pipeline_b.ndcg_10 && <div className="absolute top-0 right-0 w-16 h-16 bg-emerald-500 transform rotate-45 translate-x-8 -translate-y-8" />}
225225

226226
<div className="flex items-center gap-3">
227227
<div className="w-10 h-10 rounded-xl bg-slate-100 text-slate-600 flex items-center justify-center font-bold text-lg">A</div>
@@ -230,16 +230,31 @@ export default function ArenaPage() {
230230
<div className="text-xs text-slate-500">BM25 {pipeA.method !== "none" ? `+ ${pipeA.method} (${pipeA.model})` : "(No Reranker)"}</div>
231231
</div>
232232
</div>
233-
<div className="flex flex-col gap-4 mt-2">
234-
<MetricBar label="NDCG@10 (Ranking Quality)" value={results.pipeline_a.ndcg_10} max={100} format="percent" />
233+
234+
<div className="text-[10px] font-bold text-slate-400 uppercase tracking-wider mt-1">Ranking Quality</div>
235+
<div className="flex flex-col gap-3">
236+
<MetricBar label="NDCG@1" value={results.pipeline_a.ndcg_1} max={100} format="percent" />
237+
<MetricBar label="NDCG@5" value={results.pipeline_a.ndcg_5} max={100} format="percent" />
238+
<MetricBar label="NDCG@10" value={results.pipeline_a.ndcg_10} max={100} format="percent" />
239+
</div>
240+
241+
<div className="text-[10px] font-bold text-slate-400 uppercase tracking-wider mt-1">Mean Average Precision</div>
242+
<div className="flex flex-col gap-3">
243+
<MetricBar label="MAP@1" value={results.pipeline_a.map_1} max={100} format="percent" />
244+
<MetricBar label="MAP@5" value={results.pipeline_a.map_5} max={100} format="percent" />
245+
<MetricBar label="MAP@10" value={results.pipeline_a.map_10} max={100} format="percent" />
246+
</div>
247+
248+
<div className="text-[10px] font-bold text-slate-400 uppercase tracking-wider mt-1">Relevance & Efficiency</div>
249+
<div className="flex flex-col gap-3">
235250
<MetricBar label="MRR@10 (Top Relevance)" value={results.pipeline_a.mrr_10} max={100} format="percent" />
236-
<MetricBar label="Avg End-to-End Latency" value={results.pipeline_a.latency_ms} max={3000} format="ms" />
251+
<MetricBar label="Avg Reranking Latency" value={results.pipeline_a.latency_ms} max={3000} format="ms" />
237252
</div>
238253
</div>
239254

240255
{/* Result Card B */}
241-
<div className="bg-white p-6 rounded-2xl border border-indigo-200 shadow-md flex flex-col gap-6 relative overflow-hidden ring-1 ring-indigo-500 ring-opacity-20">
242-
{results.pipeline_b.mrr_10 > results.pipeline_a.mrr_10 && <div className="absolute top-0 right-0 w-16 h-16 bg-emerald-500 transform rotate-45 translate-x-8 -translate-y-8 flex items-end justify-center pb-1"><Trophy className="w-4 h-4 text-white -rotate-45" /></div>}
256+
<div className="bg-white p-6 rounded-2xl border border-indigo-200 shadow-md flex flex-col gap-4 relative overflow-hidden ring-1 ring-indigo-500 ring-opacity-20">
257+
{results.pipeline_b.ndcg_10 > results.pipeline_a.ndcg_10 && <div className="absolute top-0 right-0 w-16 h-16 bg-emerald-500 transform rotate-45 translate-x-8 -translate-y-8 flex items-end justify-center pb-1"><Trophy className="w-4 h-4 text-white -rotate-45" /></div>}
243258

244259
<div className="flex items-center gap-3">
245260
<div className="w-10 h-10 rounded-xl bg-indigo-100 text-indigo-700 flex items-center justify-center font-bold text-lg">B</div>
@@ -248,14 +263,30 @@ export default function ArenaPage() {
248263
<div className="text-xs text-slate-500">BM25 {pipeB.method !== "none" ? `+ ${pipeB.method} (${pipeB.model})` : "(No Reranker)"}</div>
249264
</div>
250265
</div>
251-
<div className="flex flex-col gap-4 mt-2">
252-
<MetricBar label="NDCG@10 (Ranking Quality)" value={results.pipeline_b.ndcg_10} max={100} format="percent" />
266+
267+
<div className="text-[10px] font-bold text-slate-400 uppercase tracking-wider mt-1">Ranking Quality</div>
268+
<div className="flex flex-col gap-3">
269+
<MetricBar label="NDCG@1" value={results.pipeline_b.ndcg_1} max={100} format="percent" />
270+
<MetricBar label="NDCG@5" value={results.pipeline_b.ndcg_5} max={100} format="percent" />
271+
<MetricBar label="NDCG@10" value={results.pipeline_b.ndcg_10} max={100} format="percent" />
272+
</div>
273+
274+
<div className="text-[10px] font-bold text-slate-400 uppercase tracking-wider mt-1">Mean Average Precision</div>
275+
<div className="flex flex-col gap-3">
276+
<MetricBar label="MAP@1" value={results.pipeline_b.map_1} max={100} format="percent" />
277+
<MetricBar label="MAP@5" value={results.pipeline_b.map_5} max={100} format="percent" />
278+
<MetricBar label="MAP@10" value={results.pipeline_b.map_10} max={100} format="percent" />
279+
</div>
280+
281+
<div className="text-[10px] font-bold text-slate-400 uppercase tracking-wider mt-1">Relevance & Efficiency</div>
282+
<div className="flex flex-col gap-3">
253283
<MetricBar label="MRR@10 (Top Relevance)" value={results.pipeline_b.mrr_10} max={100} format="percent" />
254-
<MetricBar label="Avg End-to-End Latency" value={results.pipeline_b.latency_ms} max={3000} format="ms" />
284+
<MetricBar label="Avg Reranking Latency" value={results.pipeline_b.latency_ms} max={3000} format="ms" />
255285
</div>
256286
</div>
257287
</div>
258288
</div>
289+
259290
)}
260291

261292
</div>

demo_server.py

Lines changed: 81 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -482,149 +482,111 @@ async def gen():
482482

483483
@app.post("/api/arena/run")
484484
async def arena_run(req: ArenaRequest):
485-
"""Compare two pipelines on a dataset using Rankify's BEIR evaluation."""
486-
import copy, math, tempfile, os, requests
485+
"""Compare two BEIR pipelines using Rankify's Metrics.calculate_trec_metrics()."""
486+
import copy, os, requests
487+
488+
# Publicly accessible QREL files from castorini/anserini-tools (verified HTTP 200)
489+
ANSERINI_BASE = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/"
490+
QREL_URLS = {
491+
"dl19": ANSERINI_BASE + "qrels.dl19-passage.txt",
492+
"dl20": ANSERINI_BASE + "qrels.dl20-passage.txt",
493+
"covid": ANSERINI_BASE + "qrels.beir-v1.0.0-trec-covid.test.txt",
494+
"nfc": ANSERINI_BASE + "qrels.beir-v1.0.0-nfcorpus.test.txt",
495+
"touche": ANSERINI_BASE + "qrels.beir-v1.0.0-webis-touche2020.test.txt",
496+
"dbpedia": ANSERINI_BASE + "qrels.beir-v1.0.0-dbpedia-entity.test.txt",
497+
"scifact": ANSERINI_BASE + "qrels.beir-v1.0.0-scifact.test.txt",
498+
"signal": ANSERINI_BASE + "qrels.beir-v1.0.0-signal1m.test.txt",
499+
"news": ANSERINI_BASE + "qrels.beir-v1.0.0-trec-news.test.txt",
500+
"robust04":ANSERINI_BASE + "qrels.beir-v1.0.0-robust04.test.txt",
501+
"arguana": ANSERINI_BASE + "qrels.beir-v1.0.0-arguana.test.txt",
502+
"fever": ANSERINI_BASE + "qrels.beir-v1.0.0-fever.test.txt",
503+
"fiqa": ANSERINI_BASE + "qrels.beir-v1.0.0-fiqa.test.txt",
504+
"quora": ANSERINI_BASE + "qrels.beir-v1.0.0-quora.test.txt",
505+
"scidocs": ANSERINI_BASE + "qrels.beir-v1.0.0-scidocs.test.txt",
506+
}
487507

488508
try:
489509
from rankify.dataset.dataset import Dataset
490510
from rankify.metrics.metrics import Metrics
491511

492-
logger.info(f"Arena: Running benchmark on {req.dataset}")
493-
494-
# ── QREL file download ──────────────────────────────────────────────
495-
# Pyserini is broken on Python 3.13 (jar issue), so we download qrel
496-
# files directly from the HuggingFace mirror that pyserini uses.
497-
# pyserini dataset-id → HF path on castorini/anserini-tools
498-
PYSERINI_QREL_URLS = {
499-
"dl19": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/dl19-passage.trec",
500-
"dl20": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/dl20-passage.trec",
501-
"covid": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.covid.qrels",
502-
"nfc": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.nfcorpus.qrels",
503-
"touche": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.touche.qrels",
504-
"dbpedia": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.dbpedia.qrels",
505-
"scifact": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.scifact.qrels",
506-
"signal": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.signal.qrels",
507-
"news": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.news.qrels",
508-
"robust04":"https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.robust04.qrels",
509-
"arguana": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.arguana.qrels",
510-
"fever": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.fever.qrels",
511-
"fiqa": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.fiqa.qrels",
512-
"quora": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.quora.qrels",
513-
"scidocs": "https://huggingface.co/datasets/castorini/beir-qrels/resolve/main/test.scidocs.qrels",
514-
}
515-
516-
# Determine the short qrel key from dataset name (e.g. "beir-covid" → "covid")
517-
dataset_key = req.dataset
518-
if req.dataset.startswith("beir-"):
519-
dataset_key = req.dataset.split("-", 1)[1]
520-
521-
# Download qrel file (cached per run)
522-
qrel_path = None
523-
qrel_cache_dir = os.path.join(os.environ.get("RERANKING_CACHE_DIR", "./cache"), "qrels")
524-
os.makedirs(qrel_cache_dir, exist_ok=True)
525-
qrel_cache_file = os.path.join(qrel_cache_dir, f"{dataset_key}.qrel")
526-
527-
if os.path.exists(qrel_cache_file):
528-
qrel_path = qrel_cache_file
529-
logger.info(f"Using cached QREL: {qrel_cache_file}")
530-
elif dataset_key in PYSERINI_QREL_URLS:
531-
url = PYSERINI_QREL_URLS[dataset_key]
532-
logger.info(f"Downloading QREL from {url}")
533-
try:
534-
resp = requests.get(url, timeout=30)
535-
if resp.status_code == 200:
536-
with open(qrel_cache_file, "w") as f:
537-
f.write(resp.text)
538-
qrel_path = qrel_cache_file
539-
logger.info(f"QREL downloaded to {qrel_cache_file}, {len(resp.text)} chars")
540-
else:
541-
logger.warning(f"QREL download failed: HTTP {resp.status_code}")
542-
except Exception as e:
543-
logger.warning(f"QREL download error: {e}")
544-
545-
# ── Dataset download ────────────────────────────────────────────────
512+
logger.info(f"Arena eval start: {req.dataset}")
513+
514+
# Map dataset name to qrel key (e.g. "beir-covid" -> "covid")
515+
if req.dataset in ["dl19", "dl20"]:
516+
qrel_key = req.dataset
517+
else:
518+
qrel_key = req.dataset.split("-", 1)[1] if req.dataset.startswith("beir-") else req.dataset
519+
520+
# Pre-download QREL file locally (Pyserini Java QREL download fails on this server)
521+
# Metrics.calculate_trec_metrics() accepts a local file path via os.path.exists() check
522+
qrel_dir = os.path.join(os.getcwd(), "cache", "qrels")
523+
os.makedirs(qrel_dir, exist_ok=True)
524+
qrel_path = os.path.join(qrel_dir, f"{qrel_key}.qrel")
525+
526+
if not os.path.exists(qrel_path) and qrel_key in QREL_URLS:
527+
url = QREL_URLS[qrel_key]
528+
logger.info(f"Downloading QREL: {url}")
529+
resp = requests.get(url, timeout=30)
530+
if resp.status_code == 200:
531+
with open(qrel_path, "w") as f:
532+
f.write(resp.text)
533+
logger.info(f"QREL saved: {qrel_path}")
534+
else:
535+
logger.warning(f"QREL download failed HTTP {resp.status_code}")
536+
537+
if not os.path.exists(qrel_path):
538+
raise ValueError(f"QREL file not available for dataset '{qrel_key}'")
539+
540+
# Download BEIR dataset (BM25 pre-retrieved, doc.id = query_id, ctx.id = passage_id)
546541
ds = Dataset(retriever="bm25", dataset_name=req.dataset, n_docs=req.n_docs)
547-
documents = ds.download(force_download=False)
548-
if not documents:
542+
data = ds.download(force_download=False)
543+
if not data:
549544
raise ValueError(f"Failed to load dataset: {req.dataset}")
550545

551546
import random
552-
eval_docs = random.sample(documents, min(req.n_queries, len(documents)))
553-
logger.info(f"Evaluating {len(eval_docs)} queries from {req.dataset}")
547+
eval_docs = random.sample(data, min(req.n_queries, len(data)))
548+
logger.info(f"Evaluating {len(eval_docs)} queries")
554549

555-
# ── Per-pipeline evaluation ─────────────────────────────────────────
556550
def evaluate_pipeline(pipeline_cfg: ArenaPipeline, docs):
557551
docs_copy = copy.deepcopy(docs)
558552
rr_latency = 0.0
559-
ret_results = docs_copy
560553

561-
# Reranking (retrieval already done — BEIR datasets come pre-retrieved)
554+
# Apply reranking if configured
562555
reranker = get_reranker(pipeline_cfg.rerankerCategory, pipeline_cfg.rerankerModel)
563556
if reranker:
564557
t1 = time.time()
565-
ret_results = reranker.rank(ret_results)
558+
reranker.rank(docs_copy)
566559
rr_latency = (time.time() - t1) * 1000 / max(1, len(docs_copy))
567560

568561
use_rr = reranker is not None
569562

570-
# ── Try Rankify's calculate_trec_metrics with downloaded QREL file ──
571-
ndcg_10, mrr_10 = 0.0, 0.0
572-
if qrel_path:
573-
try:
574-
metrics_obj = Metrics(ret_results)
575-
trec = metrics_obj.calculate_trec_metrics(
576-
ndcg_cuts=[10],
577-
map_cuts=[10],
578-
mrr_cuts=[10],
579-
qrel=qrel_path,
580-
use_reordered=use_rr,
581-
)
582-
ndcg_10 = trec.get("ndcg@10", 0.0) * 100
583-
mrr_10 = trec.get("mrr@10", 0.0) * 100
584-
logger.info(f"TREC eval: NDCG@10={ndcg_10:.2f}% MRR@10={mrr_10:.2f}%")
585-
except Exception as e:
586-
logger.warning(f"calculate_trec_metrics failed ({e}), using binary fallback")
587-
588-
# ── Pure-Python binary fallback using has_answer ─────────────────
589-
if ndcg_10 == 0.0 and mrr_10 == 0.0:
590-
mrr_sum, ndcg_sum = 0.0, 0.0
591-
for doc in ret_results:
592-
ctxs = (
593-
doc.reorder_contexts
594-
if (use_rr and getattr(doc, "reorder_contexts", None))
595-
else doc.contexts
596-
)
597-
if not ctxs:
598-
continue
599-
for i, ctx in enumerate(ctxs[:10]):
600-
if getattr(ctx, "has_answer", False):
601-
mrr_sum += 1.0 / (i + 1)
602-
break
603-
dcg, rels = 0.0, []
604-
for i, ctx in enumerate(ctxs[:10]):
605-
rel = 1 if getattr(ctx, "has_answer", False) else 0
606-
rels.append(rel)
607-
if rel:
608-
dcg += 1.0 / math.log2(i + 2)
609-
idcg = sum(r / math.log2(i + 2) for i, r in enumerate(sorted(rels, reverse=True)) if r)
610-
if idcg > 0:
611-
ndcg_sum += dcg / idcg
612-
n = len(ret_results)
613-
mrr_10 = (mrr_sum / n) * 100 if n > 0 else 0.0
614-
ndcg_10 = (ndcg_sum / n) * 100 if n > 0 else 0.0
615-
logger.info(f"Binary fallback: NDCG@10={ndcg_10:.2f}% MRR@10={mrr_10:.2f}%")
616-
617-
return {"mrr_10": mrr_10, "ndcg_10": ndcg_10, "latency_ms": rr_latency}
563+
# Use Rankify's Metrics.calculate_trec_metrics() with pre-downloaded local QREL path
564+
# (same as the Gradio demo but passing local file path to bypass Java download)
565+
metrics_obj = Metrics(docs_copy)
566+
trec = metrics_obj.calculate_trec_metrics(
567+
ndcg_cuts=[1, 5, 10],
568+
map_cuts=[1, 5, 10],
569+
mrr_cuts=[10],
570+
qrel=qrel_path, # local file path — framework checks os.path.exists()
571+
use_reordered=use_rr,
572+
)
573+
def pct(key): return round(trec.get(key, 0.0) * 100, 2)
574+
logger.info(f"Pipeline [{pipeline_cfg.rerankerCategory}/{pipeline_cfg.rerankerModel}]: NDCG@10={pct('ndcg@10')}% MRR@10={pct('mrr@10')}%")
575+
return {
576+
"ndcg_1": pct("ndcg@1"),
577+
"ndcg_5": pct("ndcg@5"),
578+
"ndcg_10": pct("ndcg@10"),
579+
"map_1": pct("map@1"),
580+
"map_5": pct("map@5"),
581+
"map_10": pct("map@10"),
582+
"mrr_10": pct("mrr@10"),
583+
"latency_ms": rr_latency,
584+
}
618585

619586
res_a = evaluate_pipeline(req.pipeline_a, eval_docs)
620587
res_b = evaluate_pipeline(req.pipeline_b, eval_docs)
621-
622-
return {
623-
"num_queries": len(eval_docs),
624-
"pipeline_a": res_a,
625-
"pipeline_b": res_b
626-
}
627-
588+
return {"num_queries": len(eval_docs), "pipeline_a": res_a, "pipeline_b": res_b}
589+
628590
except Exception as e:
629591
logger.error(traceback.format_exc())
630592
raise HTTPException(status_code=500, detail=str(e))

0 commit comments

Comments
 (0)