Skip to content
191 changes: 191 additions & 0 deletions src/hiero_analytics/run_contributor_churn_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import pathlib
Comment thread
exploreriii marked this conversation as resolved.
Outdated
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from hiero_analytics.config.logging import setup_logging
from hiero_analytics.config.paths import ORG, ensure_repo_dirs
from hiero_analytics.data_sources.github_client import GitHubClient
from hiero_analytics.data_sources.github_ingest import fetch_repo_merged_pr_difficulty_graphql
from hiero_analytics.analysis.prs import prs_to_dataframe
from hiero_analytics.domain.labels import DIFFICULTY_LEVELS
from hiero_analytics.plotting.base import create_figure, finalize_chart
from hiero_analytics.config.charts import PRIMARY_PALETTE

setup_logging()

ORG_NAME = ORG
REPO = "hiero-sdk-python"
short_repo = REPO.split("/")[-1]

def generate_mock_data():
"""Generate mock PR data for testing when no token is present."""
print("Generating mock data for analysis...")
data = []
authors = [f"author_{i}" for i in range(100)]

start_date = datetime(2023, 1, 1)

for author in authors:
Comment thread
exploreriii marked this conversation as resolved.
Outdated
# Determine max level for this mock author
r = np.random.random()
if r < 0.6: # 60% stop at GFI
max_level_idx = 0
elif r < 0.85: # 25% reach Beginner
max_level_idx = 1
elif r < 0.95: # 10% reach Intermediate
max_level_idx = 2
else: # 5% reach Advanced
max_level_idx = 3

current_date = start_date + timedelta(days=np.random.randint(0, 300))

# Always start with GFI
levels_to_achieve = list(range(max_level_idx + 1))

for l_idx in levels_to_achieve:
level_name = DIFFICULTY_LEVELS[l_idx].name
num_prs = np.random.randint(1, 4) if l_idx == max_level_idx else 1
for _ in range(num_prs):
data.append({
"author": author,
"pr_merged_at": current_date,
"level": level_name,
"issue_labels": list(DIFFICULTY_LEVELS[l_idx].labels)
})
current_date += timedelta(days=np.random.randint(1, 30))

return pd.DataFrame(data)

def get_contributor_level(labels: set[str]) -> str:
for spec in reversed(DIFFICULTY_LEVELS): # advanced, intermediate, beginner, gfi
if spec.matches(labels):
return spec.name
return "Unknown"

def run_prediction_analysis(df):
"""Simple prediction analysis using 80/20 split as requested."""
print("\n--- ML Prediction Analysis (80/20 Split) ---")

# Feature engineering: characteristics of contributors
# target: reached advanced
df["is_advanced"] = (df["max_level"] == "Advanced").astype(int)

# Shuffle and split
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
split_idx = int(len(df) * 0.8)
train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]

# Simple characteristic-based prediction:
# If they have high PR count and stay active for > 60 days, predict Advanced
def predict(row):
return 1 if row["pr_count"] > 3 and row["tenure_days"] > 60 else 0

test_df["prediction"] = test_df.apply(predict, axis=1)

accuracy = (test_df["prediction"] == test_df["is_advanced"]).mean()
print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Prediction Accuracy (based on early characteristics): {accuracy:.2f}")

def run():
Comment thread
exploreriii marked this conversation as resolved.
repo_data_dir, repo_charts_dir = ensure_repo_dirs(f"{ORG_NAME}/{REPO}")

import os
if not os.getenv("GITHUB_TOKEN"):
print("GITHUB_TOKEN not set. Skipping API call and using mock data.")
df = generate_mock_data()
else:
try:
client = GitHubClient()
print(f"Attempting to fetch PR data for {ORG_NAME}/{REPO}...")
prs = fetch_repo_merged_pr_difficulty_graphql(
client,
owner=ORG_NAME,
repo=REPO,
use_cache=True
)
df = prs_to_dataframe(prs)
if df.empty:
df = generate_mock_data()
else:
df["level"] = df["issue_labels"].apply(lambda labels: get_contributor_level(set(labels or [])))
except Exception as e:
print(f"Error fetching data: {e}")
df = generate_mock_data()

df = df.dropna(subset=["author", "pr_merged_at"]).sort_values(["author", "pr_merged_at"])

# Progression Analysis
Comment thread
prajeeta15 marked this conversation as resolved.
Outdated
progression = df.groupby("author").agg({
"level": list,
"pr_merged_at": ["min", "max", "count"]
})
progression.columns = ["levels", "first_seen", "last_seen", "pr_count"]
Comment thread
prajeeta15 marked this conversation as resolved.
Outdated

level_order = {spec.name: i for i, spec in enumerate(DIFFICULTY_LEVELS)}
level_order["Unknown"] = -1

progression["max_level"] = progression["levels"].apply(lambda lvls: max(lvls, key=lambda l: level_order.get(l, -1)))
progression["start_level"] = progression["levels"].apply(lambda lvls: lvls[0])
progression["tenure_days"] = (progression["last_seen"] - progression["first_seen"]).dt.days

gfi_starters = progression[progression["start_level"] == "Good First Issue"].copy()
total_gfi = len(gfi_starters)

if total_gfi == 0:
print("No GFI starters found.")
return

# Stats
counts = gfi_starters["max_level"].value_counts()
reached_beginner = len(gfi_starters[gfi_starters["max_level"].isin(["Beginner", "Intermediate", "Advanced"])])
reached_intermediate = len(gfi_starters[gfi_starters["max_level"].isin(["Intermediate", "Advanced"])])
reached_advanced = len(gfi_starters[gfi_starters["max_level"] == "Advanced"])

stats = {
"Total GFI Starters": total_gfi,
"Stop at GFI": counts.get("Good First Issue", 0),
"Progress to Beginner+": reached_beginner,
"Progress to Intermediate+": reached_intermediate,
"Progress to Advanced": reached_advanced,
}

print("\n--- Contributor Churn Analysis ---")
for k, v in stats.items():
print(f"{k}: {v} ({v/total_gfi*100:.1f}%)")

run_prediction_analysis(gfi_starters)

# Visualizations
plot_progression_funnel(stats, repo_charts_dir / "contributor_churn_funnel.png")
plot_retention(gfi_starters, repo_charts_dir / "contributor_retention.png")

def plot_progression_funnel(stats, output_path):
fig, ax = create_figure()
Comment thread
exploreriii marked this conversation as resolved.
Outdated
labels = ["GFI Starters", "Beginner+", "Intermediate+", "Advanced"]
values = [stats["Total GFI Starters"], stats["Progress to Beginner+"], stats["Progress to Intermediate+"], stats["Progress to Advanced"]]

y_pos = np.arange(len(labels))
ax.barh(y_pos, values, color=PRIMARY_PALETTE[0:4], height=0.6)
ax.set_yticks(y_pos)
ax.set_yticklabels(labels)
ax.invert_yaxis()

for i, v in enumerate(values):
pct = (v / values[0]) * 100
ax.text(v + 0.5, i, f"{v} ({pct:.1f}%)", va='center', fontweight='bold')

finalize_chart(fig=fig, ax=ax, title=f"{short_repo}: Contributor Progression Funnel", xlabel="Contributors", output_path=output_path)
Comment thread
exploreriii marked this conversation as resolved.
Outdated

def plot_retention(df, output_path):
fig, ax = create_figure()
Comment thread
exploreriii marked this conversation as resolved.
Outdated
retention = [len(df[df["pr_count"] >= i]) for i in range(1, 11)]
ax.plot(range(1, 11), retention, marker='o', color=PRIMARY_PALETTE[0])
ax.set_xticks(range(1, 11))
finalize_chart(fig=fig, ax=ax, title=f"{short_repo}: Contributor Retention by PR Count", xlabel="Minimum PRs Merged", ylabel="Contributors", output_path=output_path)

if __name__ == "__main__":
run()