@@ -482,149 +482,111 @@ async def gen():
482482
# Publicly accessible QREL files from castorini/anserini-tools.
_ANSERINI_QREL_BASE = (
    "https://raw.githubusercontent.com/castorini/anserini-tools/"
    "master/topics-and-qrels/"
)

# Short dataset key -> QREL file name under _ANSERINI_QREL_BASE.
_QREL_FILES = {
    "dl19": "qrels.dl19-passage.txt",
    "dl20": "qrels.dl20-passage.txt",
    "covid": "qrels.beir-v1.0.0-trec-covid.test.txt",
    "nfc": "qrels.beir-v1.0.0-nfcorpus.test.txt",
    "touche": "qrels.beir-v1.0.0-webis-touche2020.test.txt",
    "dbpedia": "qrels.beir-v1.0.0-dbpedia-entity.test.txt",
    "scifact": "qrels.beir-v1.0.0-scifact.test.txt",
    "signal": "qrels.beir-v1.0.0-signal1m.test.txt",
    "news": "qrels.beir-v1.0.0-trec-news.test.txt",
    "robust04": "qrels.beir-v1.0.0-robust04.test.txt",
    "arguana": "qrels.beir-v1.0.0-arguana.test.txt",
    "fever": "qrels.beir-v1.0.0-fever.test.txt",
    "fiqa": "qrels.beir-v1.0.0-fiqa.test.txt",
    "quora": "qrels.beir-v1.0.0-quora.test.txt",
    "scidocs": "qrels.beir-v1.0.0-scidocs.test.txt",
}


def _resolve_qrel_key(dataset_name):
    """Map a dataset name to its short QREL key (e.g. "beir-covid" -> "covid").

    Names without the "beir-" prefix (such as "dl19") are returned unchanged.

    Raises:
        ValueError: if the resulting key is empty or contains a path
            separator. The key is later embedded in a cache file name and
            comes from the request body, so it must not be able to point
            outside the cache directory.
    """
    prefix = "beir-"
    key = dataset_name[len(prefix):] if dataset_name.startswith(prefix) else dataset_name
    if not key or "/" in key or "\\" in key:
        raise ValueError(f"Invalid dataset name: '{dataset_name}'")
    return key


def _ensure_qrel_file(qrel_key):
    """Return the local path of the QREL file for *qrel_key*, fetching it if absent.

    The file is cached as ``<cwd>/cache/qrels/<qrel_key>.qrel``. An already
    existing file is reused as-is, including one placed there manually for a
    key that has no download URL.

    This function performs blocking network and disk I/O; call it from a
    worker thread when running inside an event loop.

    Raises:
        ValueError: if there is no cached file and the QREL cannot be
            downloaded (unknown key, non-200 response, or empty body).
    """
    import os
    import uuid

    import requests

    qrel_dir = os.path.join(os.getcwd(), "cache", "qrels")
    os.makedirs(qrel_dir, exist_ok=True)
    qrel_path = os.path.join(qrel_dir, f"{qrel_key}.qrel")
    if os.path.exists(qrel_path):
        return qrel_path

    file_name = _QREL_FILES.get(qrel_key)
    if file_name is None:
        raise ValueError(f"QREL file not available for dataset '{qrel_key}'")

    url = _ANSERINI_QREL_BASE + file_name
    logger.info(f"Downloading QREL: {url}")
    resp = requests.get(url, timeout=30)
    if resp.status_code != 200 or not resp.content:
        logger.warning(f"QREL download failed HTTP {resp.status_code}")
        raise ValueError(f"QREL file not available for dataset '{qrel_key}'")

    # Write the raw bytes to a unique sibling file and rename it into place.
    # The cache check above is a bare existence test, so a partially written
    # file (crash, full disk, concurrent request) must never become visible
    # under the final name. Writing bytes also avoids re-encoding the payload
    # with the platform default text encoding.
    tmp_path = f"{qrel_path}.{uuid.uuid4().hex}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            f.write(resp.content)
        os.replace(tmp_path, qrel_path)
    finally:
        # After a successful replace the temp name no longer exists.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    logger.info(f"QREL saved: {qrel_path}")
    return qrel_path


@app.post("/api/arena/run")
async def arena_run(req: ArenaRequest):
    """Compare two reranking pipelines on one dataset with Rankify's TREC metrics.

    Both pipelines are scored on the same random sample of ``req.n_queries``
    queries taken from the BM25 pre-retrieved dataset ``req.dataset``, using
    ``Metrics.calculate_trec_metrics()`` against a locally cached QREL file.

    Args:
        req: Request carrying ``dataset``, ``n_docs``, ``n_queries`` and the
            two pipeline configurations ``pipeline_a`` / ``pipeline_b``.

    Returns:
        A dict with ``num_queries`` and, under ``pipeline_a`` / ``pipeline_b``,
        the metrics ``ndcg_1``, ``ndcg_5``, ``ndcg_10``, ``map_1``, ``map_5``,
        ``map_10``, ``mrr_10`` (percentages rounded to two decimals) and
        ``latency_ms`` (reranking time per query in milliseconds).

    Raises:
        HTTPException: status 500 with the error text for any failure
            (missing QREL, dataset load failure, import or evaluation error).
            An HTTPException raised by a helper is passed through unchanged.
    """
    import asyncio
    import copy
    import random

    try:
        from rankify.dataset.dataset import Dataset
        from rankify.metrics.metrics import Metrics

        logger.info(f"Arena eval start: {req.dataset}")

        # The QREL is fetched up front and handed to Rankify as a local path
        # (per the original note, the Pyserini/Java QREL download fails on
        # this server). The blocking HTTP request runs in the default
        # executor so it cannot stall the event loop for up to 30 seconds.
        qrel_key = _resolve_qrel_key(req.dataset)
        loop = asyncio.get_running_loop()
        qrel_path = await loop.run_in_executor(None, _ensure_qrel_file, qrel_key)

        # NOTE(review): the dataset download and the evaluation below are also
        # blocking and still run on the event loop. They are left in place
        # because the thread-safety of the rerankers is not visible here.
        ds = Dataset(retriever="bm25", dataset_name=req.dataset, n_docs=req.n_docs)
        data = ds.download(force_download=False)
        if not data:
            raise ValueError(f"Failed to load dataset: {req.dataset}")

        eval_docs = random.sample(data, min(req.n_queries, len(data)))
        logger.info(f"Evaluating {len(eval_docs)} queries")

        def evaluate_pipeline(pipeline_cfg: ArenaPipeline, docs):
            """Rerank a private copy of *docs* (if configured) and score it."""
            # Deep copy so the second pipeline starts from untouched documents.
            docs_copy = copy.deepcopy(docs)
            n_docs = max(1, len(docs_copy))
            rr_latency = 0.0

            reranker = get_reranker(pipeline_cfg.rerankerCategory, pipeline_cfg.rerankerModel)
            use_rr = reranker is not None
            if use_rr:
                started = time.perf_counter()
                reranked = reranker.rank(docs_copy)
                rr_latency = (time.perf_counter() - started) * 1000 / n_docs
                # Prefer the returned documents when there are any, so that
                # both in-place and copy-returning rerankers are scored on
                # their reranked output.
                # NOTE(review): assumes rank() returns the document list or
                # None - confirm against the Rankify rerankers in use.
                if reranked is not None:
                    docs_copy = reranked

            trec = Metrics(docs_copy).calculate_trec_metrics(
                ndcg_cuts=[1, 5, 10],
                map_cuts=[1, 5, 10],
                mrr_cuts=[10],
                qrel=qrel_path,  # local file path instead of a dataset id
                use_reordered=use_rr,
            )

            def pct(key):
                # A metric missing from the result is reported as 0.0.
                return round(trec.get(key, 0.0) * 100, 2)

            logger.info(
                f"Pipeline [{pipeline_cfg.rerankerCategory}/{pipeline_cfg.rerankerModel}]: "
                f"NDCG@10={pct('ndcg@10')}% MRR@10={pct('mrr@10')}%"
            )
            return {
                "ndcg_1": pct("ndcg@1"),
                "ndcg_5": pct("ndcg@5"),
                "ndcg_10": pct("ndcg@10"),
                "map_1": pct("map@1"),
                "map_5": pct("map@5"),
                "map_10": pct("map@10"),
                "mrr_10": pct("mrr@10"),
                "latency_ms": rr_latency,
            }

        res_a = evaluate_pipeline(req.pipeline_a, eval_docs)
        res_b = evaluate_pipeline(req.pipeline_b, eval_docs)
        return {"num_queries": len(eval_docs), "pipeline_a": res_a, "pipeline_b": res_b}

    except HTTPException:
        # Keep the status code chosen by whoever raised it.
        raise
    except Exception as e:
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail=str(e)) from e
0 commit comments