# train_stage2.yaml
# Training configuration with four top-level sections: `model`, `dataset`,
# `batch_transform` and `train`.
#
# Most components are described by a `target` (dotted path of a class) and,
# where present, a `params` mapping — presumably the constructor arguments
# passed by the consuming code; verify against the loader.
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped (every key sat at column 0, producing a flat
# mapping full of duplicate keys). Two placements are assumptions to confirm
# against the consuming code:
#   - `batch_transform` is placed at the top level (not under `dataset`);
#   - `layer` is placed directly under `clip_cfg` (not under `text_cfg`).

model:
  cldm:
    target: diffbir.model.cldm.ControlLDM
    params:
      latent_scale_factor: 0.18215
      unet_cfg:
        use_checkpoint: true
        image_size: 32  # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: true
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        legacy: false
      vae_cfg:
        embed_dim: 4
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
      clip_cfg:
        embed_dim: 1024
        vision_cfg:
          image_size: 224
          layers: 32
          width: 1280
          head_width: 80
          patch_size: 14
        text_cfg:
          context_length: 77
          vocab_size: 49408
          width: 1024
          heads: 16
          layers: 24
        layer: "penultimate"
      # Mirrors unet_cfg above, except that it has `hint_channels` and no
      # `out_channels`.
      controlnet_cfg:
        use_checkpoint: true
        image_size: 32  # unused
        in_channels: 4
        hint_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: true
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        legacy: false

  swinir:
    target: diffbir.model.swinir.SwinIR
    params:
      img_size: 64
      patch_size: 1
      in_chans: 3
      embed_dim: 180
      depths: [6, 6, 6, 6, 6, 6, 6, 6]
      num_heads: [6, 6, 6, 6, 6, 6, 6, 6]
      window_size: 8
      mlp_ratio: 2
      sf: 8
      img_range: 1.0
      upsampler: "nearest+conv"
      resi_connection: "1conv"
      unshuffle: true
      unshuffle_scale: 8

  diffusion:
    target: diffbir.model.gaussian_diffusion.Diffusion
    params:
      linear_start: 0.00085
      linear_end: 0.0120
      timesteps: 1000
      zero_snr: false
      parameterization: eps

dataset:
  train:
    target: diffbir.dataset.codeformer.CodeformerDataset
    params:
      # training file list path (left unset in this template)
      file_list: null
      file_backend_cfg:
        target: diffbir.dataset.file_backend.HardDiskBackend
      out_size: 512
      crop_type: center
      blur_kernel_size: 41
      kernel_list: ['iso', 'aniso']
      kernel_prob: [0.5, 0.5]
      blur_sigma: [0.1, 12]
      downsample_range: [1, 12]
      noise_range: [0, 15]
      jpeg_range: [30, 100]

batch_transform:
  target: diffbir.dataset.batch_transform.IdentityBatchTransform

train:
  # pretrained sd v2.1 path (left unset in this template)
  sd_path: null
  # experiment directory path (left unset in this template)
  exp_dir: null
  # stage 1 swinir path (left unset in this template).
  # In our paper, we use SwinIR trained on ImageNet-1k with codeformer
  # degradation.
  swinir_path: null
  # Written with a decimal point so that YAML 1.1 loaders (e.g. PyYAML) resolve
  # it as a float; a bare `1e-4` is read as a string by such loaders.
  learning_rate: 1.0e-4
  # ImageNet 1k (1.3M images)
  # batch size = 192, lr = 1e-4, total training steps = 25k
  # Our filtered laion2b-en (15M images)
  # batch size = 256, lr = 1e-4 (first 30k), 1e-5 (next 50k),
  # total training steps = 80k
  batch_size: 256
  # left unset in this template
  num_workers: null
  train_steps: 30000
  log_every: 50
  ckpt_every: 10000
  image_every: 1000
  resume: null
  noise_aug_timestep: 0