Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
model/quantized/** filter=lfs diff=lfs merge=lfs -text
Comment thread
Davidnet marked this conversation as resolved.
*.onnx filter=lfs diff=lfs merge=lfs -text
94 changes: 93 additions & 1 deletion model/dataset/huggingface/upload_dataset_to_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

import json
import os
import shutil
import subprocess
import sys
import tempfile
from collections import Counter
from pathlib import Path

Expand Down Expand Up @@ -277,6 +280,79 @@ def _generate_dataset_card(
return card.strip() + "\n"


def _upload_binary_via_git(
    file_path: Path,
    path_in_repo: str,
    repo_id: str,
    token: str,
) -> None:
    """Upload a binary file to HuggingFace via git clone + LFS push.

    The HF Hub API rejects binary files and requires Xet/LFS storage.
    This clones the repo, adds the file with LFS tracking, and pushes.

    Args:
        file_path: Local path of the file to upload. Must have a file
            extension so an LFS tracking pattern can be derived.
        path_in_repo: Destination path inside the dataset repo; may contain
            subdirectories, which are created as needed.
        repo_id: HuggingFace dataset repo id, e.g. "org/name".
        token: HF access token with write permission.

    Raises:
        ValueError: If ``file_path`` has no extension (a safe LFS pattern
            cannot be derived — a bare "*" rule would track everything).
        subprocess.CalledProcessError: If any git command fails.
    """
    # Keep the token out of argv and out of the cloned repo's remote URL:
    # pass it as an HTTP header via git's environment-based config
    # (GIT_CONFIG_COUNT, git 2.31+) instead of embedding it in the clone
    # URL, where it could leak through process listings, shell history,
    # or git error messages.
    git_env = {
        **os.environ,
        "GIT_LFS_SKIP_SMUDGE": "1",  # don't download existing LFS blobs on clone
        "GIT_TERMINAL_PROMPT": "0",  # fail fast instead of prompting for creds
        "GIT_CONFIG_COUNT": "1",
        "GIT_CONFIG_KEY_0": "http.extraheader",
        "GIT_CONFIG_VALUE_0": f"Authorization: Bearer {token}",
    }
    tmpdir = Path(tempfile.mkdtemp())
    try:
        repo_url = f"https://huggingface.co/datasets/{repo_id}"
        repo_dir = tmpdir / "repo"
        subprocess.run(
            ["git", "clone", "--depth", "1", repo_url, str(repo_dir)],
            check=True,
            capture_output=True,
            env=git_env,
        )

        # Ensure the file extension is tracked by LFS. An empty suffix would
        # produce a catch-all "*" pattern that LFS-tracks every file in the
        # repo, so refuse it explicitly.
        ext = file_path.suffix  # e.g. ".tsv"
        if not ext:
            raise ValueError(
                f"Cannot infer an LFS tracking pattern for '{file_path.name}' "
                "because it has no file extension."
            )
        gitattributes = repo_dir / ".gitattributes"
        lfs_pattern = f"*{ext} filter=lfs diff=lfs merge=lfs -text"
        if gitattributes.exists():
            if lfs_pattern not in gitattributes.read_text():
                with gitattributes.open("a") as f:
                    f.write(f"\n{lfs_pattern}\n")
        else:
            gitattributes.write_text(f"{lfs_pattern}\n")

        # Create intermediate directories so nested path_in_repo values work.
        destination = repo_dir / path_in_repo
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(file_path, destination)

        subprocess.run(
            ["git", "add", ".gitattributes", path_in_repo],
            cwd=repo_dir,
            check=True,
            capture_output=True,
            env=git_env,
        )

        # "git commit" exits non-zero when there is nothing to commit (e.g.
        # re-running after an identical file was already pushed). Detect the
        # no-op case up front instead of letting check=True abort the upload.
        status = subprocess.run(
            ["git", "status", "--porcelain"],
            cwd=repo_dir,
            check=True,
            capture_output=True,
            text=True,
            env=git_env,
        )
        if not status.stdout.strip():
            print(f" {path_in_repo} already up to date, nothing to push")
            return

        subprocess.run(
            [
                "git",
                "-c",
                "user.name=kiji",
                "-c",
                "user.email=kiji@575.ai",
                "commit",
                "-m",
                f"Add {path_in_repo}",
            ],
            cwd=repo_dir,
            check=True,
            capture_output=True,
            env=git_env,
        )
        subprocess.run(
            ["git", "push"],
            cwd=repo_dir,
            check=True,
            capture_output=True,
            text=True,
            env=git_env,
        )
        print(f" Pushed {path_in_repo} via git LFS")
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)


def upload_to_huggingface(
samples_dir: str = "model/dataset/data_samples/training_samples",
repo_id: str | None = None,
Expand Down Expand Up @@ -343,7 +419,7 @@ def upload_to_huggingface(
print(f"\nPushing to {repo_id} (private={private})...")
dataset_dict.push_to_hub(repo_id, token=token)

# Generate and upload dataset card
# Generate dataset card
print("Generating dataset card...")
stats = _compute_dataset_stats(samples)
card = _generate_dataset_card(
Expand All @@ -352,13 +428,29 @@ def upload_to_huggingface(
train_count=len(dataset_dict["train"]),
test_count=len(dataset_dict["test"]),
)

# Upload dataset card (text file, via API)
api.upload_file(
path_or_fileobj=card.encode(),
path_in_repo="README.md",
repo_id=repo_id,
repo_type="dataset",
)

# Upload audit ledger via git clone + LFS (HF rejects binary files via API,
# requiring Xet/LFS storage via git push)
audit_ledger_path = Path(samples_dir) / "audit_ledger.tsv"
if audit_ledger_path.exists():
print(f"Uploading audit ledger via git LFS: {audit_ledger_path}")
_upload_binary_via_git(
file_path=audit_ledger_path,
path_in_repo="audit_ledger.tsv",
repo_id=repo_id,
token=token,
)
else:
print(f"Warning: audit_ledger.tsv not found at {audit_ledger_path}, skipping")

print(f"Done! Dataset available at: https://huggingface.co/datasets/{repo_id}")
print("\nUsage:")
print(" from datasets import load_dataset")
Expand Down
2 changes: 1 addition & 1 deletion model/flows/training_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ subsample_count = 0
num_ai4privacy_samples = -1
# Path to audit_allowlist.txt to filter training samples (empty = no filtering)
# Generate with: uv run python model/dataset/audit_dataset.py --samples-dir <dir>
audit_allowlist = ""
audit_allowlist = ""
Copy link

Copilot AI Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

audit_allowlist is set to an absolute, machine-specific path. This will break other developers and CI runs that don’t have /home/hannes/....

Use a repo-relative path (e.g. under model/dataset/...) or make this configurable via an environment variable / CLI arg, and keep the default as an empty string (no filtering) or a relative path checked into the repo.

Suggested change
audit_allowlist = "/home/hannes/kiji-proxy/model/dataset/data_samples/training_samples/audit_ledger.tsv"
audit_allowlist = ""

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

audit_allowlist is set to an absolute, developer-specific path (/home/hannes/...), which will break training runs on other machines/CI. This should remain empty by default or be a repo-relative path (or configurable via env/CLI) so the config is portable.

Suggested change
audit_allowlist = "/home/hannes/kiji-proxy/model/dataset/data_samples/training_samples/audit_ledger.tsv"
audit_allowlist = ""

Copilot uses AI. Check for mistakes.

[pipeline]
# Skip Label Studio export and use existing data from paths.training_samples_dir
Expand Down
3 changes: 3 additions & 0 deletions model/quantized/added_tokens.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"[MASK]": 128000
}
1 change: 1 addition & 0 deletions model/quantized/crf_transitions.json

Large diffs are not rendered by default.

118 changes: 115 additions & 3 deletions model/quantized/label_mappings.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,115 @@
version https://git-lfs.github.com/spec/v1
oid sha256:25751d603bba29f7a3269f38f7d3e402770ea31cc0ceff68a9eeda79efce3aa6
size 2933
{
"pii": {
"label2id": {
"O": 0,
"B-SURNAME": 1,
"I-SURNAME": 2,
"B-FIRSTNAME": 3,
"I-FIRSTNAME": 4,
"B-BUILDINGNUM": 5,
"I-BUILDINGNUM": 6,
"B-DATEOFBIRTH": 7,
"I-DATEOFBIRTH": 8,
"B-EMAIL": 9,
"I-EMAIL": 10,
"B-PHONENUMBER": 11,
"I-PHONENUMBER": 12,
"B-CITY": 13,
"I-CITY": 14,
"B-URL": 15,
"I-URL": 16,
"B-COMPANYNAME": 17,
"I-COMPANYNAME": 18,
"B-STATE": 19,
"I-STATE": 20,
"B-ZIP": 21,
"I-ZIP": 22,
"B-STREET": 23,
"I-STREET": 24,
"B-COUNTRY": 25,
"I-COUNTRY": 26,
"B-SSN": 27,
"I-SSN": 28,
"B-DRIVERLICENSENUM": 29,
"I-DRIVERLICENSENUM": 30,
"B-PASSPORTID": 31,
"I-PASSPORTID": 32,
"B-NATIONALID": 33,
"I-NATIONALID": 34,
"B-IDCARDNUM": 35,
"I-IDCARDNUM": 36,
"B-TAXNUM": 37,
"I-TAXNUM": 38,
"B-LICENSEPLATENUM": 39,
"I-LICENSEPLATENUM": 40,
"B-PASSWORD": 41,
"I-PASSWORD": 42,
"B-IBAN": 43,
"I-IBAN": 44,
"B-AGE": 45,
"I-AGE": 46,
"B-SECURITYTOKEN": 47,
"I-SECURITYTOKEN": 48,
"B-CREDITCARDNUMBER": 49,
"I-CREDITCARDNUMBER": 50,
"B-USERNAME": 51,
"I-USERNAME": 52
},
"id2label": {
"0": "O",
"1": "B-SURNAME",
"2": "I-SURNAME",
"3": "B-FIRSTNAME",
"4": "I-FIRSTNAME",
"5": "B-BUILDINGNUM",
"6": "I-BUILDINGNUM",
"7": "B-DATEOFBIRTH",
"8": "I-DATEOFBIRTH",
"9": "B-EMAIL",
"10": "I-EMAIL",
"11": "B-PHONENUMBER",
"12": "I-PHONENUMBER",
"13": "B-CITY",
"14": "I-CITY",
"15": "B-URL",
"16": "I-URL",
"17": "B-COMPANYNAME",
"18": "I-COMPANYNAME",
"19": "B-STATE",
"20": "I-STATE",
"21": "B-ZIP",
"22": "I-ZIP",
"23": "B-STREET",
"24": "I-STREET",
"25": "B-COUNTRY",
"26": "I-COUNTRY",
"27": "B-SSN",
"28": "I-SSN",
"29": "B-DRIVERLICENSENUM",
"30": "I-DRIVERLICENSENUM",
"31": "B-PASSPORTID",
"32": "I-PASSPORTID",
"33": "B-NATIONALID",
"34": "I-NATIONALID",
"35": "B-IDCARDNUM",
"36": "I-IDCARDNUM",
"37": "B-TAXNUM",
"38": "I-TAXNUM",
"39": "B-LICENSEPLATENUM",
"40": "I-LICENSEPLATENUM",
"41": "B-PASSWORD",
"42": "I-PASSWORD",
"43": "B-IBAN",
"44": "I-IBAN",
"45": "B-AGE",
"46": "I-AGE",
"47": "B-SECURITYTOKEN",
"48": "I-SECURITYTOKEN",
"49": "B-CREDITCARDNUMBER",
"50": "I-CREDITCARDNUMBER",
"51": "B-USERNAME",
"52": "I-USERNAME",
"-100": "IGNORE"
}
}
}
4 changes: 2 additions & 2 deletions model/quantized/model_quantized.onnx
Git LFS file not shown
36 changes: 33 additions & 3 deletions model/quantized/ort_config.json
Git LFS file not shown
54 changes: 51 additions & 3 deletions model/quantized/special_tokens_map.json
Git LFS file not shown
Binary file added model/quantized/spm.model
Binary file not shown.
Loading
Loading