Add theme momentum coverage quality metrics

Pigbibi · Pigbibi · commit b0333eebde8b · 2026-05-31T14:06:54.000+08:00
diff --git a/README.md b/README.md
@@ -84,6 +84,19 @@ plugin contract.
 6. Downstream runtimes must treat the artifact as advisory context only until a
    separate deterministic policy engine explicitly consumes it.
 
+
+## Name and Horizon Boundary
+
+The repository name remains acceptable for now because this repo owns the
+long-horizon AI shadow context and cross-sector theme research artifacts.
+Short/medium/long final recommendations are produced by
+`QuantAdvisorResearch`, not by this repository.
+
+If the theme-momentum layer later becomes broader than AI context, a future
+rename such as `LongHorizonResearchSignals` can be considered, but that should
+be a separate migration because GitHub repo links, cross-repo checkout paths,
+and documentation references would all need updates.
+
 ## GitHub Configuration
 
 The model API keys are centralized in `CodexAuditBridge`; do not add
@@ -263,5 +276,7 @@ failures are recorded in `data_quality.missing_price_symbols` by default;
 `--strict-downloads` turns those into hard failures.
 
 The snapshot records fixed 12-1m, 6-1m, and 3m momentum windows, breadth, risk
-penalties, top symbols per theme, and a policy block that keeps the artifact
-research-only.
+penalties, top symbols per theme, source metadata, and a policy block that keeps
+the artifact research-only. `data_quality.coverage` now records configured
+symbol count, priced symbol count, price coverage ratio, and the count of symbols
+with insufficient price history (listed in `data_quality.insufficient_history_symbols`).
diff --git a/README.zh-CN.md b/README.zh-CN.md
@@ -68,6 +68,13 @@ data/output/signal_history/2026-05-28.json
 5. 所有 AI 生成的 artifact 必须保持 `mode=shadow`，并通过本地 schema validation。
 6. 下游系统在单独的确定性 policy engine 显式消费前，只能把 artifact 当作 advisory context。
 
+
+## 名称和周期边界
+
+当前暂不建议改仓库名。`AiLongHorizonSignalPipelines` 仍然准确描述了本仓库的核心职责：维护长周期 AI shadow context 和跨板块主题研究 artifact。短线/中线/长线最终推荐是在 `QuantAdvisorResearch` 里生成的，不由本仓库直接输出。
+
+如果后续主题动量层明显扩展成更通用的研究信号仓，可以单独评估改名，例如 `LongHorizonResearchSignals`。这需要迁移 GitHub 仓库链接、跨仓 checkout 路径和文档引用，不建议和本次数据质量增强混在一起做。
+
 ## GitHub 配置
 
 模型 API key 集中在 `CodexAuditBridge`；不要把 `OPENAI_API_KEY` 或 `ANTHROPIC_API_KEY` 放到本仓库。
@@ -216,6 +223,7 @@ python scripts/build_theme_momentum_snapshot.py \
 - `theme_ranks`：主题排名、动量分、breadth、风险惩罚和主题内 top symbols
 - `methodology`：固定窗口和权重，便于后续 walk-forward replay
 - `policy`：明确这是研究排序，不允许下单或仓位分配
+- `data_quality.coverage`：配置标的数、已有价格标的数、价格覆盖率和价格历史不足标的数量（具体标的列在 `data_quality.insufficient_history_symbols`）
 
 当前固定窗口：
 
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -138,3 +138,12 @@ The artifact is point-in-time research context.  It ranks themes and highlights
 strong members inside a theme, but it does not encode orders, target weights, or
 execution policy.  Future replay must consume saved snapshots rather than
 recomputing old theme ranks with revised constituents or revised weights.
+
+## Repository Name Decision
+
+Keep `AiLongHorizonSignalPipelines` for now. The short/medium/long final
+recommendation buckets live in `QuantAdvisorResearch`; this repository still
+provides long-horizon AI shadow context plus cross-sector theme artifacts. A
+rename to `LongHorizonResearchSignals` may be reasonable later, but it should be
+a deliberate migration because cross-repository workflow checkouts and public
+links would need updates.
diff --git a/scripts/build_theme_momentum_snapshot.py b/scripts/build_theme_momentum_snapshot.py
@@ -107,7 +107,9 @@ def main() -> int:
                 "ranked_theme_count": snapshot["summary"]["ranked_theme_count"],
                 "priced_symbol_count": snapshot["summary"]["priced_symbol_count"],
                 "top_theme_ids": snapshot["summary"]["top_theme_ids"],
+                "price_coverage_ratio": snapshot["data_quality"]["coverage"]["price_coverage_ratio"],
                 "missing_price_symbols": snapshot["data_quality"]["missing_price_symbols"],
+                "insufficient_history_symbols": snapshot["data_quality"].get("insufficient_history_symbols", []),
             }
         )
     )
diff --git a/src/ai_long_horizon_signal_pipelines/theme_momentum.py b/src/ai_long_horizon_signal_pipelines/theme_momentum.py
@@ -133,6 +133,11 @@ def build_theme_momentum_snapshot(
         for symbol, symbol_rows in sorted(rows_by_symbol.items())
         if symbol_rows
     }
+    exposure_symbols = sorted({symbol.upper() for symbol in exposures})
+    priced_exposure_symbols = [symbol for symbol in exposure_symbols if symbol in symbol_scores]
+    insufficient_history_symbols = sorted(
+        symbol for symbol in priced_exposure_symbols if symbol_scores[symbol]["momentum_score"] is None
+    )
     latest_dates = [parse_price_date(item["as_of"]) for item in symbol_scores.values()]
     snapshot_as_of = (as_of_date or max(latest_dates)).isoformat() if latest_dates or as_of_date else dt.date.today().isoformat()
 
@@ -231,7 +236,16 @@ def build_theme_momentum_snapshot(
         },
         "theme_ranks": theme_ranks,
         "data_quality": {
+            "coverage": {
+                "configured_symbol_count": len(exposure_symbols),
+                "priced_symbol_count": len(priced_exposure_symbols),
+                "price_coverage_ratio": round_optional(
+                    len(priced_exposure_symbols) / len(exposure_symbols) if exposure_symbols else None
+                ),
+                "insufficient_history_symbol_count": len(insufficient_history_symbols),
+            },
             "missing_price_symbols": sorted(missing_price_symbols),
+            "insufficient_history_symbols": insufficient_history_symbols,
             "unranked_themes": sorted(unranked_themes),
         },
         "policy": {
diff --git a/tests/test_theme_momentum.py b/tests/test_theme_momentum.py
@@ -63,6 +63,7 @@ def test_theme_momentum_ranks_strong_broad_theme_first() -> None:
     assert [item["symbol"] for item in ranked[0]["top_symbols"]] == ["MU", "HBM2"]
     assert ranked[0]["momentum_score"] > ranked[1]["momentum_score"]
     assert snapshot["policy"]["execution_allowed"] is False
+    assert snapshot["data_quality"]["coverage"]["price_coverage_ratio"] == 1.0
 
 
 def test_theme_momentum_records_missing_price_coverage() -> None:
@@ -86,5 +87,8 @@ def test_theme_momentum_records_missing_price_coverage() -> None:
     snapshot = build_theme_momentum_snapshot(rows, themes=themes, exposures=exposures)
 
     assert snapshot["data_quality"]["missing_price_symbols"] == ["SMCI"]
+    assert snapshot["data_quality"]["coverage"]["configured_symbol_count"] == 2
+    assert snapshot["data_quality"]["coverage"]["priced_symbol_count"] == 1
+    assert snapshot["data_quality"]["coverage"]["price_coverage_ratio"] == 0.5
     assert snapshot["theme_ranks"][0]["component_count"] == 2
     assert snapshot["theme_ranks"][0]["priced_symbol_count"] == 1

Original file line number	Diff line number	Diff line change
`@@ -107,7 +107,9 @@ def main() -> int:`
`107`	`107`	`"ranked_theme_count": snapshot["summary"]["ranked_theme_count"],`
`108`	`108`	`"priced_symbol_count": snapshot["summary"]["priced_symbol_count"],`
`109`	`109`	`"top_theme_ids": snapshot["summary"]["top_theme_ids"],`
	`110`	`+ "price_coverage_ratio": snapshot["data_quality"]["coverage"]["price_coverage_ratio"],`
`110`	`111`	`"missing_price_symbols": snapshot["data_quality"]["missing_price_symbols"],`
	`112`	`+ "insufficient_history_symbols": snapshot["data_quality"].get("insufficient_history_symbols", []),`
`111`	`113`	`}`
`112`	`114`	`)`
`113`	`115`	`)`