ChiefTheLord commited on
Commit
4c501cb
·
verified ·
1 Parent(s): 8fd26e5

Upload folder using huggingface_hub

Browse files
checkpoints-v4.1-discrete-conditional/checkpoint-1792/eval_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints-v4.1-discrete-conditional/checkpoint-1792/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed2509dff585539fb2a1d77839df6c78d36a2083d34eae519057c444be78102
3
+ size 24391688
checkpoints-v4.1-discrete-conditional/checkpoint-1792/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bffd4123f8189559f8a0b76d997a9de9894a398e8462f935a31b0f8959cfcc1
3
+ size 762635
checkpoints-v4.1-discrete-conditional/checkpoint-1792/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b717c1ad3cde0191a1a66b46199c309efa6f1cfe69c4facb92560e3f24a3881
3
+ size 14645
checkpoints-v4.1-discrete-conditional/checkpoint-1792/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a42e33465160c5ace903c63375f0694cfab8943854b6c37c46848f754e8871c0
3
+ size 1383
checkpoints-v4.1-discrete-conditional/checkpoint-1792/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:970abb983a5a0197c365bc7fdfdc8155569e58b62f56513c3b9d937587189b2d
3
+ size 1465
checkpoints-v4.1-discrete-conditional/checkpoint-1792/trainer_state.json ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.8849382716049383,
6
+ "eval_steps": 256,
7
+ "global_step": 1792,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06320987654320988,
14
+ "grad_norm": 0.04219071567058563,
15
+ "learning_rate": 0.000248046875,
16
+ "loss": 0.006440815050154924,
17
+ "step": 128
18
+ },
19
+ {
20
+ "epoch": 0.12641975308641976,
21
+ "grad_norm": 0.04216855764389038,
22
+ "learning_rate": 0.000498046875,
23
+ "loss": 0.006011463236063719,
24
+ "step": 256
25
+ },
26
+ {
27
+ "epoch": 0.12641975308641976,
28
+ "eval_bleu": 0.0,
29
+ "eval_cos_loss": 0.005870252109428004,
30
+ "eval_loss": 0.005870252109428004,
31
+ "eval_mse_loss": 443.6981372368045,
32
+ "step": 256
33
+ },
34
+ {
35
+ "epoch": 0.12641975308641976,
36
+ "eval_bleu": 0.0,
37
+ "eval_cos_loss": 0.005870252109428004,
38
+ "eval_loss": 0.005870252109428004,
39
+ "eval_mse_loss": 443.6981372368045,
40
+ "eval_runtime": 8.5982,
41
+ "eval_samples_per_second": 304.482,
42
+ "eval_steps_per_second": 4.768,
43
+ "step": 256
44
+ },
45
+ {
46
+ "epoch": 0.18962962962962962,
47
+ "grad_norm": 0.04382074996829033,
48
+ "learning_rate": 0.000748046875,
49
+ "loss": 0.005682823713868856,
50
+ "step": 384
51
+ },
52
+ {
53
+ "epoch": 0.2528395061728395,
54
+ "grad_norm": 0.04206819832324982,
55
+ "learning_rate": 0.000998046875,
56
+ "loss": 0.005469319876283407,
57
+ "step": 512
58
+ },
59
+ {
60
+ "epoch": 0.2528395061728395,
61
+ "eval_bleu": 0.0,
62
+ "eval_cos_loss": 0.005397265197754633,
63
+ "eval_loss": 0.005397265197754633,
64
+ "eval_mse_loss": 463.40619789681784,
65
+ "step": 512
66
+ },
67
+ {
68
+ "epoch": 0.2528395061728395,
69
+ "eval_bleu": 0.0,
70
+ "eval_cos_loss": 0.005397265197754633,
71
+ "eval_loss": 0.005397265197754633,
72
+ "eval_mse_loss": 463.40619789681784,
73
+ "eval_runtime": 7.4659,
74
+ "eval_samples_per_second": 350.662,
75
+ "eval_steps_per_second": 5.492,
76
+ "step": 512
77
+ },
78
+ {
79
+ "epoch": 0.3160493827160494,
80
+ "grad_norm": 0.040812306106090546,
81
+ "learning_rate": 0.0009827157247249464,
82
+ "loss": 0.005451996345072985,
83
+ "step": 640
84
+ },
85
+ {
86
+ "epoch": 0.37925925925925924,
87
+ "grad_norm": 0.05150594562292099,
88
+ "learning_rate": 0.0009315344337660421,
89
+ "loss": 0.005260218400508165,
90
+ "step": 768
91
+ },
92
+ {
93
+ "epoch": 0.37925925925925924,
94
+ "eval_bleu": 0.0,
95
+ "eval_cos_loss": 0.005222608051376372,
96
+ "eval_loss": 0.005222608051376372,
97
+ "eval_mse_loss": 478.69413199075836,
98
+ "step": 768
99
+ },
100
+ {
101
+ "epoch": 0.37925925925925924,
102
+ "eval_bleu": 0.0,
103
+ "eval_cos_loss": 0.005222608051376372,
104
+ "eval_loss": 0.005222608051376372,
105
+ "eval_mse_loss": 478.69413199075836,
106
+ "eval_runtime": 8.0676,
107
+ "eval_samples_per_second": 324.508,
108
+ "eval_steps_per_second": 5.082,
109
+ "step": 768
110
+ },
111
+ {
112
+ "epoch": 0.44246913580246916,
113
+ "grad_norm": 0.0297068003565073,
114
+ "learning_rate": 0.0008500491898731988,
115
+ "loss": 0.005236003547906876,
116
+ "step": 896
117
+ },
118
+ {
119
+ "epoch": 0.505679012345679,
120
+ "grad_norm": 0.03605964779853821,
121
+ "learning_rate": 0.0007439821899385376,
122
+ "loss": 0.005152056459337473,
123
+ "step": 1024
124
+ },
125
+ {
126
+ "epoch": 0.505679012345679,
127
+ "eval_bleu": 0.0,
128
+ "eval_cos_loss": 0.005151342339359406,
129
+ "eval_loss": 0.005151342339359406,
130
+ "eval_mse_loss": 487.98453800852707,
131
+ "step": 1024
132
+ },
133
+ {
134
+ "epoch": 0.505679012345679,
135
+ "eval_bleu": 0.0,
136
+ "eval_cos_loss": 0.005151342339359406,
137
+ "eval_loss": 0.005151342339359406,
138
+ "eval_mse_loss": 487.98453800852707,
139
+ "eval_runtime": 8.1175,
140
+ "eval_samples_per_second": 322.512,
141
+ "eval_steps_per_second": 5.051,
142
+ "step": 1024
143
+ },
144
+ {
145
+ "epoch": 0.5688888888888889,
146
+ "grad_norm": 0.03870987519621849,
147
+ "learning_rate": 0.0006207818531897271,
148
+ "loss": 0.005106513388454914,
149
+ "step": 1152
150
+ },
151
+ {
152
+ "epoch": 0.6320987654320988,
153
+ "grad_norm": 0.024118751287460327,
154
+ "learning_rate": 0.0004890997654891032,
155
+ "loss": 0.005096105858683586,
156
+ "step": 1280
157
+ },
158
+ {
159
+ "epoch": 0.6320987654320988,
160
+ "eval_bleu": 0.0,
161
+ "eval_cos_loss": 0.005091924810918366,
162
+ "eval_loss": 0.005091924810918366,
163
+ "eval_mse_loss": 497.0737781059451,
164
+ "step": 1280
165
+ },
166
+ {
167
+ "epoch": 0.6320987654320988,
168
+ "eval_bleu": 0.0,
169
+ "eval_cos_loss": 0.005091924810918366,
170
+ "eval_loss": 0.005091924810918366,
171
+ "eval_mse_loss": 497.0737781059451,
172
+ "eval_runtime": 8.4448,
173
+ "eval_samples_per_second": 310.013,
174
+ "eval_steps_per_second": 4.855,
175
+ "step": 1280
176
+ },
177
+ {
178
+ "epoch": 0.6953086419753086,
179
+ "grad_norm": 0.02920917421579361,
180
+ "learning_rate": 0.00035818313279679524,
181
+ "loss": 0.005061750765889883,
182
+ "step": 1408
183
+ },
184
+ {
185
+ "epoch": 0.7585185185185185,
186
+ "grad_norm": 0.018508030101656914,
187
+ "learning_rate": 0.00023722540797531234,
188
+ "loss": 0.005014664493501186,
189
+ "step": 1536
190
+ },
191
+ {
192
+ "epoch": 0.7585185185185185,
193
+ "eval_bleu": 0.0,
194
+ "eval_cos_loss": 0.00502042164571765,
195
+ "eval_loss": 0.00502042164571765,
196
+ "eval_mse_loss": 502.0919985887481,
197
+ "step": 1536
198
+ },
199
+ {
200
+ "epoch": 0.7585185185185185,
201
+ "eval_bleu": 0.0,
202
+ "eval_cos_loss": 0.00502042164571765,
203
+ "eval_loss": 0.00502042164571765,
204
+ "eval_mse_loss": 502.0919985887481,
205
+ "eval_runtime": 7.5012,
206
+ "eval_samples_per_second": 349.012,
207
+ "eval_steps_per_second": 5.466,
208
+ "step": 1536
209
+ },
210
+ {
211
+ "epoch": 0.8217283950617283,
212
+ "grad_norm": 0.020617935806512833,
213
+ "learning_rate": 0.00013472069233656453,
214
+ "loss": 0.004991861991584301,
215
+ "step": 1664
216
+ },
217
+ {
218
+ "epoch": 0.8849382716049383,
219
+ "grad_norm": 0.016280537471175194,
220
+ "learning_rate": 5.786724825584927e-05,
221
+ "loss": 0.004958455916494131,
222
+ "step": 1792
223
+ },
224
+ {
225
+ "epoch": 0.8849382716049383,
226
+ "eval_bleu": 0.0,
227
+ "eval_cos_loss": 0.0049657613470605235,
228
+ "eval_loss": 0.0049657613470605235,
229
+ "eval_mse_loss": 505.7769038502763,
230
+ "step": 1792
231
+ },
232
+ {
233
+ "epoch": 0.8849382716049383,
234
+ "eval_bleu": 0.0,
235
+ "eval_cos_loss": 0.0049657613470605235,
236
+ "eval_loss": 0.0049657613470605235,
237
+ "eval_mse_loss": 505.7769038502763,
238
+ "eval_runtime": 8.1016,
239
+ "eval_samples_per_second": 323.148,
240
+ "eval_steps_per_second": 5.061,
241
+ "step": 1792
242
+ }
243
+ ],
244
+ "logging_steps": 128,
245
+ "max_steps": 2025,
246
+ "num_input_tokens_seen": 0,
247
+ "num_train_epochs": 1,
248
+ "save_steps": 256,
249
+ "stateful_callbacks": {
250
+ "TrainerControl": {
251
+ "args": {
252
+ "should_epoch_stop": false,
253
+ "should_evaluate": false,
254
+ "should_log": false,
255
+ "should_save": true,
256
+ "should_training_stop": false
257
+ },
258
+ "attributes": {}
259
+ }
260
+ },
261
+ "total_flos": 0.0,
262
+ "train_batch_size": 64,
263
+ "trial_name": null,
264
+ "trial_params": null
265
+ }
checkpoints-v4.1-discrete-conditional/checkpoint-1792/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0839bffbc58eb6068cc228e4d756dbb22a9adf723766e40a7bc2a03aca92630
3
+ size 5137