-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert_mlm.py
More file actions
26 lines (19 loc) · 755 Bytes
/
Copy pathconvert_mlm.py
File metadata and controls
26 lines (19 loc) · 755 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from pathlib import Path
import pandas as pd
from tqdm import tqdm
# Input dataset root and output root, both resolved against the current
# working directory. The output root is created up front if it is missing.
data_dir = Path("./ST2024").resolve()
output_dir = Path("./converted_data").resolve()
output_dir.mkdir(exist_ok=True)

# Language codes are discovered from the training directory. The conversion
# loop later rebuilds each path as "<lang>_<split>.tsv", so only files that
# actually follow the "<lang>_train.tsv" pattern are considered, and the code
# is obtained by stripping that exact suffix. This keeps stray files (editor
# backups, OS metadata, READMEs) from producing bogus language codes, and keeps
# codes that themselves contain an underscore intact.
_TRAIN_SUFFIX = "_train.tsv"
langs = [
    path.name[: -len(_TRAIN_SUFFIX)]
    for path in (data_dir / "fill_mask_word" / "train").iterdir()
    if path.is_file() and path.name.endswith(_TRAIN_SUFFIX)
]
# For every discovered language and for both splits, read the TSV file and
# write its "src" column to a plain-text file, one entry per line (no trailing
# newline after the last entry).
for lang in tqdm(langs):
    for split in ("train", "valid"):
        data_path = data_dir / "fill_mask_word" / split / f"{lang}_{split}.tsv"
        file_path = output_dir / "mlm" / split / f"{lang}_{split}.txt"

        # dtype=str + keep_default_na=False: keep every cell as the literal
        # text found in the file. With the pandas defaults, cells such as an
        # empty string, "NA", "null" or "nan" are parsed as float NaN, which
        # makes the str.join below raise TypeError.
        # NOTE(review): quotechar="^" presumably exists so that ordinary
        # double quotes inside the text are not interpreted as CSV quoting --
        # confirm against the data.
        data = pd.read_csv(
            data_path,
            sep="\t",
            quotechar="^",
            dtype=str,
            keep_default_na=False,
        )

        file_path.parent.mkdir(exist_ok=True, parents=True)

        # Explicit UTF-8: without it the platform's locale encoding is used,
        # which can fail or produce non-UTF-8 output for multilingual text.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write("\n".join(data["src"].tolist()))