-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
100 lines (87 loc) · 2.08 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Training profiles, keyed by dataset name. Each profile below is a complete,
# self-contained set of settings; nothing is shared or inherited between them.
#
# NOTE: floats in scientific notation are written with a decimal point in the
# mantissa (5.0e-5, not 5e-5). YAML 1.1 loaders such as PyYAML resolve "5e-5"
# as a string rather than a float.
config:
  # use wikitext-103-raw-v1 dataset
  wikitext-103-raw-v1:
    # I/O settings
    out_dir: "output_directory"
    model_name: "tinyGPT"
    eval_interval: 500
    log_interval: 10
    eval_iters: 200
    eval_only: false
    always_save_checkpoint: true
    init_from: "scratch"  # "scratch" or "resume"

    # Wandb logging
    wandb_log: true
    wandb_project: "owt"
    wandb_run_name: "LanguageModel"

    # Data settings
    dataset: "wikitext-103-raw-v1"
    gradient_accumulation_steps: 8  # Simulate larger batch sizes
    batch_size: 16
    block_size: 256
    vocab_size: 32768

    # Model architecture settings
    n_layer: 8
    n_head: 8
    n_embd: 768
    dropout: 0.1
    bias: true

    # AdamW optimizer settings
    learning_rate: 5.0e-5
    max_iters: 100000
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0

    # Learning rate decay settings
    decay_lr: true
    warmup_iters: 2000
    lr_decay_iters: 100000
    min_lr: 1.0e-6

    # System settings
    device: "cuda"
    dtype: "bfloat16"
    compile: false

  # use tiny_shakespeare dataset
  tiny_shakespeare_data:
    # I/O settings
    # NOTE(review): same out_dir as the wikitext profile; presumably
    # model_name keeps the checkpoints apart - confirm in the training script.
    out_dir: "output_directory"
    model_name: "tiny_shakespeare"
    eval_interval: 200
    log_interval: 10
    eval_iters: 200
    eval_only: false
    always_save_checkpoint: true
    init_from: "scratch"  # "scratch" or "resume"

    # Wandb logging
    wandb_log: true
    wandb_project: "owt"
    wandb_run_name: "LanguageModel"

    # Data settings
    dataset: "tiny_shakespeare_data"
    gradient_accumulation_steps: 8  # Simulate larger batch sizes
    batch_size: 16
    block_size: 128
    vocab_size: 32768

    # Model architecture settings
    n_layer: 4
    n_head: 4
    n_embd: 512
    dropout: 0.1
    bias: true

    # AdamW optimizer settings
    learning_rate: 6.0e-6
    max_iters: 10000
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0

    # Learning rate decay settings
    # NOTE(review): lr_decay_iters (8000) is lower than max_iters (10000);
    # what the schedule does for the remaining iterations depends on the
    # training script - confirm this gap is intended.
    decay_lr: true
    warmup_iters: 1000
    lr_decay_iters: 8000
    min_lr: 6.0e-7

    # System settings
    device: "cuda"
    dtype: "bfloat16"
    compile: false