-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdvc.yaml
More file actions
122 lines (116 loc) · 4.43 KB
/
Copy pathdvc.yaml
File metadata and controls
122 lines (116 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# =============================================================================
# HireLens — DVC Pipeline
# Run the full pipeline: dvc repro
# Run a single stage: dvc repro <stage>
# Show DAG: dvc dag
# Check metrics: dvc metrics show
# Compare runs: dvc metrics diff
# =============================================================================
# Pipeline stages. Stages are linked through their files: a path listed under
# one stage's `outs` and another stage's `deps` makes the second stage depend
# on the first. `${...}` expressions in `cmd` are DVC templating placeholders
# (resolved from params.yaml by default).
stages:
  # ── 1. Data Ingestion ────────────────────────────────────────────────────────
  # Produces the cleaned raw CSVs consumed by the preprocess stage.
  ingest:
    cmd: python -m src.data.data_pipeline --stage ingest
    deps:
      - src/data/ingestion.py
      - src/data/data_pipeline.py
      - configs/config.yaml
    params:
      # Keys tracked in params.yaml; a change to any of them invalidates the stage.
      - params.yaml:
          - data.random_seed
          - data.min_resume_chars
          - data.max_resume_chars
    outs:
      - data/raw/resumes_clean.csv
      - data/raw/jobs_clean.csv
    metrics:
      # cache: false keeps the metrics file out of the DVC cache.
      - logs/dataset_distributions.json:
          cache: false
    desc: "Download and clean raw resume + job-posting data from Kaggle"

  # ── 2. Preprocessing ─────────────────────────────────────────────────────────
  # Reads the ingest outputs and writes processed tables plus the
  # train/val/test pair files.
  preprocess:
    cmd: python -m src.data.data_pipeline --stage preprocess
    deps:
      - src/data/preprocessing.py
      - src/data/data_pipeline.py
      - data/raw/resumes_clean.csv
      - data/raw/jobs_clean.csv
      - configs/config.yaml
    params:
      - params.yaml:
          - data.train_val_test_split
          - data.random_seed
          - data.remove_pii
          - data.normalize_whitespace
    outs:
      - data/processed/resumes_processed.csv
      - data/processed/jobs_processed.csv
      - data/processed/train_pairs.csv
      - data/processed/val_pairs.csv
      - data/processed/test_pairs.csv
    desc: "Build train/val/test pairs with weak-supervision labels"

  # ── 3. Synthetic Augmentation ────────────────────────────────────────────────
  # Depends on the training pairs from preprocess and the skills taxonomy file.
  synthetic:
    # Folded scalar (>-): the lines below are joined with spaces into a single
    # command line with no trailing newline.
    cmd: >-
      python -m src.data.data_pipeline --stage synthetic
      --n-pairs ${synthetic.num_samples}
    deps:
      - src/data/synthetic_gen.py
      - src/data/data_pipeline.py
      - data/processed/train_pairs.csv
      - data/skills_taxonomy.json
      - configs/config.yaml
    params:
      - params.yaml:
          - synthetic.num_samples
          - synthetic.noise_level
          - synthetic.random_seed
    outs:
      - data/synthetic/synthetic_pairs.csv
    desc: "Augment training data with skill-permutation synthetic pairs"

  # ── 4. Model Training ────────────────────────────────────────────────────────
  # Consumes the train/val pairs and the synthetic pairs; writes the fine-tuned
  # model directory and a training-metrics file.
  train:
    cmd: >-
      python -m src.models.train
      --epochs ${fine_tuning.num_train_epochs}
      --batch-size ${fine_tuning.per_device_train_batch_size}
      --run-name dvc-auto
    deps:
      - src/models/train.py
      - src/features/embeddings.py
      - data/processed/train_pairs.csv
      - data/processed/val_pairs.csv
      - data/synthetic/synthetic_pairs.csv
      - configs/config.yaml
    params:
      - params.yaml:
          # The whole `fine_tuning` section is tracked, not individual keys.
          - fine_tuning
          - model.base_name
          - model.max_seq_length
    outs:
      - models/fine_tuned/hirelens_matcher:
          cache: true
          # persist: true tells DVC not to remove this output before the
          # stage is reproduced.
          persist: true
    metrics:
      - logs/training_metrics.json:
          cache: false
    desc: "Fine-tune all-MiniLM-L6-v2 with CosineSimilarityLoss on GTX 1650"

  # ── 5. Evaluation ────────────────────────────────────────────────────────────
  # Depends on the trained model directory and the held-out test pairs; its
  # only declared output is the evaluation metrics report.
  evaluate:
    cmd: python -m src.evaluation.metrics
    deps:
      - src/evaluation/metrics.py
      - src/models/scorer.py
      - src/features/ner.py
      - models/fine_tuned/hirelens_matcher
      - data/processed/test_pairs.csv
      - configs/config.yaml
    params:
      - params.yaml:
          - evaluation.k_values
          - evaluation.num_eval_samples
          - evaluation.target_metrics
          - scoring.weights
    metrics:
      - logs/evaluation_report.json:
          cache: false
    desc: "Compute P@K, NDCG@K, AUC-ROC, MRR, and NER F1 on held-out test set"