-
Notifications
You must be signed in to change notification settings - Fork 36
feat: base model switched to roberta rebase #325
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
e2764d2
22cd73b
877ef05
24d5176
a0672f1
0c01603
63040bf
7d56696
80627b8
fc8588b
ec8ed50
e777de2
5f08bb6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1,2 @@ | ||
| model/quantized/** filter=lfs diff=lfs merge=lfs -text | ||
| *.onnx filter=lfs diff=lfs merge=lfs -text | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -7,7 +7,10 @@ | |||||||||||||||
|
|
||||||||||||||||
| import json | ||||||||||||||||
| import os | ||||||||||||||||
| import shutil | ||||||||||||||||
| import subprocess | ||||||||||||||||
| import sys | ||||||||||||||||
| import tempfile | ||||||||||||||||
| from collections import Counter | ||||||||||||||||
| from pathlib import Path | ||||||||||||||||
|
|
||||||||||||||||
|
|
@@ -277,6 +280,79 @@ def _generate_dataset_card( | |||||||||||||||
| return card.strip() + "\n" | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| def _upload_binary_via_git( | ||||||||||||||||
| file_path: Path, | ||||||||||||||||
| path_in_repo: str, | ||||||||||||||||
| repo_id: str, | ||||||||||||||||
| token: str, | ||||||||||||||||
| ) -> None: | ||||||||||||||||
| """Upload a binary file to HuggingFace via git clone + LFS push. | ||||||||||||||||
|
|
||||||||||||||||
| The HF Hub API rejects binary files and requires Xet/LFS storage. | ||||||||||||||||
| This clones the repo, adds the file with LFS tracking, and pushes. | ||||||||||||||||
| """ | ||||||||||||||||
| tmpdir = Path(tempfile.mkdtemp()) | ||||||||||||||||
| try: | ||||||||||||||||
| repo_url = f"https://x-access-token:{token}@huggingface.co/datasets/{repo_id}" | ||||||||||||||||
| subprocess.run( | ||||||||||||||||
| ["git", "clone", "--depth", "1", repo_url, str(tmpdir / "repo")], | ||||||||||||||||
| check=True, | ||||||||||||||||
| capture_output=True, | ||||||||||||||||
| env={**os.environ, "GIT_LFS_SKIP_SMUDGE": "1"}, | ||||||||||||||||
|
||||||||||||||||
| ) | ||||||||||||||||
|
||||||||||||||||
| repo_dir = tmpdir / "repo" | ||||||||||||||||
|
|
||||||||||||||||
| # Ensure the file extension is tracked by LFS | ||||||||||||||||
| ext = file_path.suffix # e.g. ".tsv" | ||||||||||||||||
|
||||||||||||||||
| ext = file_path.suffix # e.g. ".tsv" | |
| ext = file_path.suffix # e.g. ".tsv" | |
| if not ext: | |
| raise ValueError( | |
| f"Cannot infer an LFS tracking pattern for '{file_path.name}' because it has no file extension. " | |
| "Refusing to write a catch-all '*' rule to .gitattributes." | |
| ) |
Copilot
AI
Apr 15, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The shutil.copy(file_path, repo_dir / path_in_repo) call will fail if path_in_repo includes directories that don't exist in the cloned repo. Consider creating the destination's parent directory first, e.g. (repo_dir / path_in_repo).parent.mkdir(parents=True, exist_ok=True), so that this helper is safe for non-root paths.
| shutil.copy(file_path, repo_dir / path_in_repo) | |
| destination_path = repo_dir / path_in_repo | |
| destination_path.parent.mkdir(parents=True, exist_ok=True) | |
| shutil.copy(file_path, destination_path) |
Copilot
AI
Apr 14, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
git commit will fail (exit status 1) when there are no changes (e.g., the target file already exists with identical content). With check=True, that raises and aborts the upload.
Handle the no-op case explicitly (e.g., check git status --porcelain before committing, or allow a non-zero return code specifically for “nothing to commit”).
Copilot
AI
Apr 15, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
git commit will fail with a non-zero exit code when there are no changes to commit (e.g., re-running the script after the file already exists), which will abort the upload. Consider checking git status --porcelain and skipping commit/push when clean, or using git commit --allow-empty only when appropriate.
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -39,7 +39,7 @@ subsample_count = 0 | |||||||||
| num_ai4privacy_samples = -1 | ||||||||||
| # Path to audit_allowlist.txt to filter training samples (empty = no filtering) | ||||||||||
| # Generate with: uv run python model/dataset/audit_dataset.py --samples-dir <dir> | ||||||||||
| audit_allowlist = "" | ||||||||||
| audit_allowlist = "/home/hannes/kiji-proxy/model/dataset/data_samples/training_samples/audit_ledger.tsv" | ||||||||||
|
||||||||||
| audit_allowlist = "/home/hannes/kiji-proxy/model/dataset/data_samples/training_samples/audit_ledger.tsv" | |
| audit_allowlist = "" |
Copilot
AI
Apr 15, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
audit_allowlist is set to an absolute, developer-specific path (/home/hannes/...), which will break training runs on other machines/CI. This should remain empty by default or be a repo-relative path (or configurable via env/CLI) so the config is portable.
| audit_allowlist = "/home/hannes/kiji-proxy/model/dataset/data_samples/training_samples/audit_ledger.tsv" | |
| audit_allowlist = "" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| { | ||
| "[MASK]": 128000 | ||
| } |
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,115 @@ | ||
| version https://git-lfs.github.com/spec/v1 | ||
| oid sha256:25751d603bba29f7a3269f38f7d3e402770ea31cc0ceff68a9eeda79efce3aa6 | ||
| size 2933 | ||
| { | ||
| "pii": { | ||
| "label2id": { | ||
| "O": 0, | ||
| "B-SURNAME": 1, | ||
| "I-SURNAME": 2, | ||
| "B-FIRSTNAME": 3, | ||
| "I-FIRSTNAME": 4, | ||
| "B-BUILDINGNUM": 5, | ||
| "I-BUILDINGNUM": 6, | ||
| "B-DATEOFBIRTH": 7, | ||
| "I-DATEOFBIRTH": 8, | ||
| "B-EMAIL": 9, | ||
| "I-EMAIL": 10, | ||
| "B-PHONENUMBER": 11, | ||
| "I-PHONENUMBER": 12, | ||
| "B-CITY": 13, | ||
| "I-CITY": 14, | ||
| "B-URL": 15, | ||
| "I-URL": 16, | ||
| "B-COMPANYNAME": 17, | ||
| "I-COMPANYNAME": 18, | ||
| "B-STATE": 19, | ||
| "I-STATE": 20, | ||
| "B-ZIP": 21, | ||
| "I-ZIP": 22, | ||
| "B-STREET": 23, | ||
| "I-STREET": 24, | ||
| "B-COUNTRY": 25, | ||
| "I-COUNTRY": 26, | ||
| "B-SSN": 27, | ||
| "I-SSN": 28, | ||
| "B-DRIVERLICENSENUM": 29, | ||
| "I-DRIVERLICENSENUM": 30, | ||
| "B-PASSPORTID": 31, | ||
| "I-PASSPORTID": 32, | ||
| "B-NATIONALID": 33, | ||
| "I-NATIONALID": 34, | ||
| "B-IDCARDNUM": 35, | ||
| "I-IDCARDNUM": 36, | ||
| "B-TAXNUM": 37, | ||
| "I-TAXNUM": 38, | ||
| "B-LICENSEPLATENUM": 39, | ||
| "I-LICENSEPLATENUM": 40, | ||
| "B-PASSWORD": 41, | ||
| "I-PASSWORD": 42, | ||
| "B-IBAN": 43, | ||
| "I-IBAN": 44, | ||
| "B-AGE": 45, | ||
| "I-AGE": 46, | ||
| "B-SECURITYTOKEN": 47, | ||
| "I-SECURITYTOKEN": 48, | ||
| "B-CREDITCARDNUMBER": 49, | ||
| "I-CREDITCARDNUMBER": 50, | ||
| "B-USERNAME": 51, | ||
| "I-USERNAME": 52 | ||
| }, | ||
| "id2label": { | ||
| "0": "O", | ||
| "1": "B-SURNAME", | ||
| "2": "I-SURNAME", | ||
| "3": "B-FIRSTNAME", | ||
| "4": "I-FIRSTNAME", | ||
| "5": "B-BUILDINGNUM", | ||
| "6": "I-BUILDINGNUM", | ||
| "7": "B-DATEOFBIRTH", | ||
| "8": "I-DATEOFBIRTH", | ||
| "9": "B-EMAIL", | ||
| "10": "I-EMAIL", | ||
| "11": "B-PHONENUMBER", | ||
| "12": "I-PHONENUMBER", | ||
| "13": "B-CITY", | ||
| "14": "I-CITY", | ||
| "15": "B-URL", | ||
| "16": "I-URL", | ||
| "17": "B-COMPANYNAME", | ||
| "18": "I-COMPANYNAME", | ||
| "19": "B-STATE", | ||
| "20": "I-STATE", | ||
| "21": "B-ZIP", | ||
| "22": "I-ZIP", | ||
| "23": "B-STREET", | ||
| "24": "I-STREET", | ||
| "25": "B-COUNTRY", | ||
| "26": "I-COUNTRY", | ||
| "27": "B-SSN", | ||
| "28": "I-SSN", | ||
| "29": "B-DRIVERLICENSENUM", | ||
| "30": "I-DRIVERLICENSENUM", | ||
| "31": "B-PASSPORTID", | ||
| "32": "I-PASSPORTID", | ||
| "33": "B-NATIONALID", | ||
| "34": "I-NATIONALID", | ||
| "35": "B-IDCARDNUM", | ||
| "36": "I-IDCARDNUM", | ||
| "37": "B-TAXNUM", | ||
| "38": "I-TAXNUM", | ||
| "39": "B-LICENSEPLATENUM", | ||
| "40": "I-LICENSEPLATENUM", | ||
| "41": "B-PASSWORD", | ||
| "42": "I-PASSWORD", | ||
| "43": "B-IBAN", | ||
| "44": "I-IBAN", | ||
| "45": "B-AGE", | ||
| "46": "I-AGE", | ||
| "47": "B-SECURITYTOKEN", | ||
| "48": "I-SECURITYTOKEN", | ||
| "49": "B-CREDITCARDNUMBER", | ||
| "50": "I-CREDITCARDNUMBER", | ||
| "51": "B-USERNAME", | ||
| "52": "I-USERNAME", | ||
| "-100": "IGNORE" | ||
| } | ||
| } | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.