hiero-hackers · prajeeta15 · Apr 8, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 17, 2026
diff --git a/src/hiero_analytics/run_contributor_churn_analysis.py b/src/hiero_analytics/run_contributor_churn_analysis.py
@@ -0,0 +1,191 @@
+import pathlib
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from datetime import datetime, timedelta
+
+from hiero_analytics.config.logging import setup_logging
+from hiero_analytics.config.paths import ORG, ensure_repo_dirs
+from hiero_analytics.data_sources.github_client import GitHubClient
+from hiero_analytics.data_sources.github_ingest import fetch_repo_merged_pr_difficulty_graphql
+from hiero_analytics.analysis.prs import prs_to_dataframe
+from hiero_analytics.domain.labels import DIFFICULTY_LEVELS
+from hiero_analytics.plotting.base import create_figure, finalize_chart
+from hiero_analytics.config.charts import PRIMARY_PALETTE
+
+setup_logging()
+
+ORG_NAME = ORG
+REPO = "hiero-sdk-python"
+short_repo = REPO.split("/")[-1]
+
+def generate_mock_data():
+    """Generate mock PR data for testing when no token is present."""
+    print("Generating mock data for analysis...")
+    data = []
+    authors = [f"author_{i}" for i in range(100)]
+
+    start_date = datetime(2023, 1, 1)
+
+    for author in authors:
+        # Determine max level for this mock author
+        r = np.random.random()
+        if r < 0.6: # 60% stop at GFI
+            max_level_idx = 0
+        elif r < 0.85: # 25% reach Beginner
+            max_level_idx = 1
+        elif r < 0.95: # 10% reach Intermediate
+            max_level_idx = 2
+        else: # 5% reach Advanced
+            max_level_idx = 3
+
+        current_date = start_date + timedelta(days=np.random.randint(0, 300))
+
+        # Always start with GFI
+        levels_to_achieve = list(range(max_level_idx + 1))
+
+        for l_idx in levels_to_achieve:
+            level_name = DIFFICULTY_LEVELS[l_idx].name
+            num_prs = np.random.randint(1, 4) if l_idx == max_level_idx else 1
+            for _ in range(num_prs):
+                data.append({
+                    "author": author,
+                    "pr_merged_at": current_date,
+                    "level": level_name,
+                    "issue_labels": list(DIFFICULTY_LEVELS[l_idx].labels)
+                })
+                current_date += timedelta(days=np.random.randint(1, 30))
+
+    return pd.DataFrame(data)
+
+def get_contributor_level(labels: set[str]) -> str:
+    for spec in reversed(DIFFICULTY_LEVELS): # advanced, intermediate, beginner, gfi
+        if spec.matches(labels):
+            return spec.name
+    return "Unknown"
+
+def run_prediction_analysis(df):
+    """Simple prediction analysis using 80/20 split as requested."""
+    print("\n--- ML Prediction Analysis (80/20 Split) ---")
+
+    # Feature engineering: characteristics of contributors
+    # target: reached advanced
+    df["is_advanced"] = (df["max_level"] == "Advanced").astype(int)
+
+    # Shuffle and split
+    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
+    split_idx = int(len(df) * 0.8)
+    train_df = df.iloc[:split_idx]
+    test_df = df.iloc[split_idx:]
+
+    # Simple characteristic-based prediction: 
+    # If they have high PR count and stay active for > 60 days, predict Advanced
+    def predict(row):
+        return 1 if row["pr_count"] > 3 and row["tenure_days"] > 60 else 0
+
+    test_df["prediction"] = test_df.apply(predict, axis=1)
+
+    accuracy = (test_df["prediction"] == test_df["is_advanced"]).mean()
+    print(f"Training set size: {len(train_df)}")
+    print(f"Test set size: {len(test_df)}")
+    print(f"Prediction Accuracy (based on early characteristics): {accuracy:.2f}")
+
+def run():
+    repo_data_dir, repo_charts_dir = ensure_repo_dirs(f"{ORG_NAME}/{REPO}")
+
+    import os
+    if not os.getenv("GITHUB_TOKEN"):
+        print("GITHUB_TOKEN not set. Skipping API call and using mock data.")
+        df = generate_mock_data()
+    else:
+        try:
+            client = GitHubClient()
+            print(f"Attempting to fetch PR data for {ORG_NAME}/{REPO}...")
+            prs = fetch_repo_merged_pr_difficulty_graphql(
+                client,
+                owner=ORG_NAME,
+                repo=REPO,
+                use_cache=True
+            )
+            df = prs_to_dataframe(prs)
+            if df.empty:
+                df = generate_mock_data()
+            else:
+                df["level"] = df["issue_labels"].apply(lambda labels: get_contributor_level(set(labels or [])))
+        except Exception as e:
+            print(f"Error fetching data: {e}")
+            df = generate_mock_data()
+
+    df = df.dropna(subset=["author", "pr_merged_at"]).sort_values(["author", "pr_merged_at"])
+
+    # Progression Analysis
+    progression = df.groupby("author").agg({
+        "level": list,
+        "pr_merged_at": ["min", "max", "count"]
+    })
+    progression.columns = ["levels", "first_seen", "last_seen", "pr_count"]
+
+    level_order = {spec.name: i for i, spec in enumerate(DIFFICULTY_LEVELS)}
+    level_order["Unknown"] = -1
+
+    progression["max_level"] = progression["levels"].apply(lambda lvls: max(lvls, key=lambda l: level_order.get(l, -1)))
+    progression["start_level"] = progression["levels"].apply(lambda lvls: lvls[0])
+    progression["tenure_days"] = (progression["last_seen"] - progression["first_seen"]).dt.days
+
+    gfi_starters = progression[progression["start_level"] == "Good First Issue"].copy()
+    total_gfi = len(gfi_starters)
+
+    if total_gfi == 0:
+        print("No GFI starters found.")
+        return
+
+    # Stats
+    counts = gfi_starters["max_level"].value_counts()
+    reached_beginner = len(gfi_starters[gfi_starters["max_level"].isin(["Beginner", "Intermediate", "Advanced"])])
+    reached_intermediate = len(gfi_starters[gfi_starters["max_level"].isin(["Intermediate", "Advanced"])])
+    reached_advanced = len(gfi_starters[gfi_starters["max_level"] == "Advanced"])
+
+    stats = {
+        "Total GFI Starters": total_gfi,
+        "Stop at GFI": counts.get("Good First Issue", 0),
+        "Progress to Beginner+": reached_beginner,
+        "Progress to Intermediate+": reached_intermediate,
+        "Progress to Advanced": reached_advanced,
+    }
+
+    print("\n--- Contributor Churn Analysis ---")
+    for k, v in stats.items():
+        print(f"{k}: {v} ({v/total_gfi*100:.1f}%)")
+
+    run_prediction_analysis(gfi_starters)
+
+    # Visualizations
+    plot_progression_funnel(stats, repo_charts_dir / "contributor_churn_funnel.png")
+    plot_retention(gfi_starters, repo_charts_dir / "contributor_retention.png")
+
+def plot_progression_funnel(stats, output_path):
+    fig, ax = create_figure()
+    labels = ["GFI Starters", "Beginner+", "Intermediate+", "Advanced"]
+    values = [stats["Total GFI Starters"], stats["Progress to Beginner+"], stats["Progress to Intermediate+"], stats["Progress to Advanced"]]
+
+    y_pos = np.arange(len(labels))
+    ax.barh(y_pos, values, color=PRIMARY_PALETTE[0:4], height=0.6)
+    ax.set_yticks(y_pos)
+    ax.set_yticklabels(labels)
+    ax.invert_yaxis()
+
+    for i, v in enumerate(values):
+        pct = (v / values[0]) * 100
+        ax.text(v + 0.5, i, f"{v} ({pct:.1f}%)", va='center', fontweight='bold')
+
+    finalize_chart(fig=fig, ax=ax, title=f"{short_repo}: Contributor Progression Funnel", xlabel="Contributors", output_path=output_path)
+
+def plot_retention(df, output_path):
+    fig, ax = create_figure()
+    retention = [len(df[df["pr_count"] >= i]) for i in range(1, 11)]
+    ax.plot(range(1, 11), retention, marker='o', color=PRIMARY_PALETTE[0])
+    ax.set_xticks(range(1, 11))
+    finalize_chart(fig=fig, ax=ax, title=f"{short_repo}: Contributor Retention by PR Count", xlabel="Minimum PRs Merged", ylabel="Contributors", output_path=output_path)
+
+if __name__ == "__main__":
+    run()