-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathconfig.yaml
More file actions
100 lines (88 loc) · 3 KB
/
Copy pathconfig.yaml
File metadata and controls
100 lines (88 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Dataset location and audio front-end settings.
data:
  data_dir: "training_dataset"  # folder containing the *.wav and *.lab files
  # 16 kHz rather than 44.1 kHz: both encoders were trained on 16 kHz audio,
  # and ASR is lighter at the lower sampling rate.
  sample_rate: 16000
  num_val_files: 10  # number of validation files
  # null = no limit on the number of phonemes in a segment (full segment)
  max_seq_len: null
  frame_duration: 0.02  # 20 ms; can be changed if encoder_type isn't "whisper"
  n_mels: 80  # for the mel features when encoder_type is "none"
# Model architecture and loss weighting.
model:
  encoder_type: "whisper"  # "wavlm", "whisper", or "none"
  # See model_list.txt for more model types.
  whisper_model: "openai/whisper-base"
  # See model_list.txt here too; the more advanced the base model, the bigger
  # it is and the slower it trains.
  wavlm_model: "microsoft/wavlm-base-plus"
  freeze_encoder: false  # false = fine-tune the encoder, true = leave it frozen
  enable_bilstm: true
  bilstm_num_layer: 2
  enable_dilated_conv: true
  dilated_conv_depth: 2
  dilated_conv_kernel: 3
  segmental_loss_weight: 1.0
  segmental_loss_weights: [1.0, 1.0, 2.0]  # [start_error, end_error, 1 - IoU]
  subframe_loss_weight: 3.0
  num_conformer_layers: 2
  conformer_heads: 2
  conformer_ff_expansion: 2
  conformer_kernel_size: 31
  conformer_dropout: 0.15
  lang_emb_dim: 64
  # Updated automatically when the config is saved after preprocessing.
  num_languages: 0
# Optimisation, schedule, checkpointing and logging settings.
training:
  batch_size: 16  # number of samples per batch
  num_workers: 4
  optimizer: "RAdam"
  # optimizer_params: {}
  # Example for RAdam:
  #   optimizer: "RAdam"
  optimizer_params:
    betas: [0.9, 0.999]
    eps: 1.0e-8
  #
  # Example for Ranger (RAdam + Lookahead):
  #   optimizer: "Ranger"
  #   optimizer_params:
  #     alpha: 0.5
  #     k: 6
  #     N_sma_threshhold: 5
  #     betas: [0.9, 0.999]
  #     eps: 1.0e-8
  #
  # Example for Prodigy:
  #   optimizer: "Prodigy"
  #   optimizer_params:
  #     lr: 1.0
  #     weight_decay: 0.01
  #     decouple: true
  #     d_coef: 2.0
  #     use_bias_correction: true
  #     safeguard_warmup: true
  #
  # Example for Muon:
  #   optimizer: "Muon"
  #   optimizer_params:
  #     lr: 1.0
  #     weight_decay: 0.01
  #     momentum: 0.95
  #     nesterov: true
  #     ns_steps: 5
  learning_rate: 0.001
  # Learning-rate decay gamma, applied every val_check_interval.
  lr_decay_gamma: 0.98
  weight_decay: 0.00001
  label_smoothing: 0.1  # prevents overconfident labels
  max_steps: 500000
  val_check_interval: 2500
  max_checkpoints: 5
  # Path to the logs folder for TensorBoard.
  log_dir: "/content/drive/MyDrive/WFL_13_mel_20ms/logs"
  # Groups of phonemes across languages that should share a label.
  # NOTE(review): placement under `training` is inferred from key order only;
  # confirm against the config loader.
  merged_phoneme_groups: []
# Augmentation settings. The keys are nested so that `enable` is scoped to
# this section instead of being a top-level key.
augmentation:
  enable: true
  noise_std: 0.005
  prob: 0.5
  volume_range: [0.9, 1.1]
# Fine-tuning settings. The keys are nested so that `enable` is scoped to
# this section instead of being a top-level key.
finetuning:
  enable: false  # set to true to enable WFL fine-tuning
  model_path: null  # path to the model to fine-tune
# Output location.
output:
  save_dir: "/content/drive/MyDrive/WFL_13_mel_20ms"  # path to the save folder
# Post-processing applied to predictions.
postprocess:
  # 1 = no smoothing; the higher the value, the more the prediction is smoothed.
  median_filter: 1
  merge_segments: "right"  # right, left, previous, or none
  # Range 0.0-1.0: threshold for uncertain labels; 0 disables it.
  confidence_threshold: 0.5