Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,4 @@ setup-consultation:
$(if $(DIR),--dir $(DIR)) \
$(if $(RESPONSES),--responses $(RESPONSES)) \
$(if $(QU),--qu $(QU)) \
$(if $(UPLOAD),--upload) \
$(if $(VALIDATE_ONLY),--validate-only)
$(if $(UNTIL),--until $(UNTIL))
22 changes: 16 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,19 +154,29 @@ This will:

| Makefile variable | Description |
|---|---|
| `VALIDATE_ONLY=1` | Run validation checks only — no files are written and no upload is performed. Useful for a quick sanity-check before committing to ingestion. |
| `UNTIL=validate` | Stop after validation — no files are written and no upload is performed. Useful for a quick sanity-check. |
| `UNTIL=build` | Validate and write local files, but skip uploading to S3. |
| `DIR=<path>` | Path to an existing consultation directory (skips the interactive prompt to create one). |
| `RESPONSES=<path>` | Path to the response data file (skips file-selection prompt). |
| `QU=<path>` | Path to the question understanding Excel file (skips file-selection prompt). |
| `UPLOAD=1` | Upload the generated input files to S3 after ingestion. |

Example — validate without ingesting:
By default, the pipeline runs all three stages: **validate → build → upload**.

Examples:

```bash
# Full pipeline (validate + build + upload to S3)
make setup-consultation NAME=my_consultation

# Validate only
make setup-consultation NAME=my_consultation UNTIL=validate

# Build files locally (skipping the S3 upload), with the input file paths specified explicitly
make setup-consultation NAME=my_consultation \
RESPONSES=path/to/responses.xlsx \
QU=path/to/question_understanding.xlsx \
VALIDATE_ONLY=1
RESPONSES=path/to/responses.xlsx \
QU=path/to/question_understanding.xlsx \
UNTIL=build

```

For further instructions on setting up the consultation in the app, see the [Confluence guide](https://incubatorforartificialintelligence.atlassian.net/wiki/spaces/Consult/pages/136445956/1.2+Set+up+the+consultation+in+the+app).
Expand Down
54 changes: 33 additions & 21 deletions setup_consultation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import json
import logging
import re
import shutil
import sys
from difflib import SequenceMatcher
from pathlib import Path

import boto3
import botocore.exceptions
import pandas as pd

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -801,15 +803,18 @@ def prompt_file_selection(files: list[Path], role: str) -> Path:
print("Invalid choice, try again.")


def run_ingestion(
PIPELINE_STAGES = ("validate", "build", "upload")


def run_pipeline(
responses_path: Path,
question_understanding_path: Path,
output_dir: Path,
validate_only: bool = False,
until: str = "upload",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like how this now moves in phases "until" a condition is reached. Consider making "until" an Enum?

) -> None:
"""Run the ingestion pipeline: load → validate → ingest."""
"""Run the setup pipeline: load → validate → build → upload."""

# ── Phase 1: Load ─────────────────────────────────────────────────
# ── Load ──────────────────────────────────────────────────────────
print(f"\nLoading responses from: {responses_path.name}")
responses_df, original_headers = load_responses(responses_path)
print(f" Loaded {len(responses_df)} responses")
Expand All @@ -827,21 +832,23 @@ def run_ingestion(
label.replace("/", "-") for label in demographic_info["label"].tolist()
]

# ── Phase 2: Validate ─────────────────────────────────────────────
# ── Validate ──────────────────────────────────────────────────────
validate_data(
question_sheets,
original_headers,
responses_df,
demographic_columns,
demographic_labels,
interactive=not validate_only,
interactive=until != "validate",
)

if validate_only:
if until == "validate":
return

# ── Phase 3: Ingest ───────────────────────────────────────────────
output_dir.mkdir(parents=True, exist_ok=True)
# ── Build ─────────────────────────────────────────────────────────
if output_dir.exists():
shutil.rmtree(output_dir)
output_dir.mkdir(parents=True)

# Demographics
if demographic_columns and demographic_labels:
Expand Down Expand Up @@ -945,12 +952,10 @@ def main() -> None:
help="Path to question understanding file (skip file selection)",
)
parser.add_argument(
"--validate-only",
action="store_true",
help="Run validation only, skip ingestion and upload",
)
parser.add_argument(
"--upload", action="store_true", help="Upload inputs to S3 after ingestion"
"--until",
choices=PIPELINE_STAGES,
default="upload",
help="How far to run the pipeline: validate, build, or upload (default: upload)",
)
args = parser.parse_args()

Expand Down Expand Up @@ -1024,17 +1029,24 @@ def main() -> None:
remaining, "template question understanding data"
)

# Step 3: Run ingestion (or validation only)
# Step 3: Run pipeline
output_dir = consultation_dir / "inputs"
run_ingestion(responses_path, qu_path, output_dir, validate_only=args.validate_only)
run_pipeline(responses_path, qu_path, output_dir, until=args.until)

if args.validate_only:
if args.until == "validate":
return

# Step 4: Upload inputs to S3 (only if --upload flag is set)
if args.upload:
# Step 4: Upload inputs to S3
if args.until == "upload":
s3_prefix = f"app_data/consultations/{name}/inputs/"
upload_inputs_to_s3(output_dir, "i-dot-ai-prod-consult-data", s3_prefix)
try:
upload_inputs_to_s3(output_dir, "i-dot-ai-prod-consult-data", s3_prefix)
except botocore.exceptions.NoCredentialsError as e:
print(f"\nAWS error: {e}")
print("\nTo fix, either:")
print("1. Run: aws-vault exec first")
print("2. Re-run with UNTIL=build to skip the upload step")
sys.exit(1)

# Step 5: Point to Confluence
print("\n" + "=" * 60)
Expand Down
Loading