-
Notifications
You must be signed in to change notification settings - Fork 75
refactor Flux transformer to use scanned blocks, dynamic checkpointing, and decoupled projections #417
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
refactor Flux transformer to use scanned blocks, dynamic checkpointing, and decoupled projections #417
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -214,6 +214,11 @@ def load_diffusers_checkpoint(self): | |
| dtype=self.config.activations_dtype, | ||
| weights_dtype=self.config.weights_dtype, | ||
| precision=max_utils.get_precision(self.config), | ||
| use_base2_exp=self.config.use_base2_exp, | ||
| use_experimental_scheduler=self.config.use_experimental_scheduler, | ||
| remat_policy=self.config.remat_policy, | ||
| names_which_can_be_saved=self.config.names_which_can_be_saved, | ||
| names_which_can_be_offloaded=self.config.names_which_can_be_offloaded, | ||
| ) | ||
| transformer_eval_params = transformer.init_weights( | ||
| rngs=self.rng, max_sequence_length=self.config.max_sequence_length, eval_only=True | ||
|
|
@@ -279,6 +284,11 @@ def load_checkpoint(self, step=None, scheduler_class=None): | |
| weights_dtype=self.config.weights_dtype, | ||
| precision=max_utils.get_precision(self.config), | ||
| from_pt=self.config.from_pt, | ||
| use_base2_exp=self.config.use_base2_exp, | ||
| use_experimental_scheduler=self.config.use_experimental_scheduler, | ||
| remat_policy=self.config.remat_policy, | ||
| names_which_can_be_saved=self.config.names_which_can_be_saved, | ||
| names_which_can_be_offloaded=self.config.names_which_can_be_offloaded, | ||
| ) | ||
|
Comment on lines
+290
to
292
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above |
||
|
|
||
| pipeline = FluxPipeline( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,6 +63,8 @@ jit_initializers: True | |
| from_pt: True | ||
| split_head_dim: True | ||
| attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te | ||
| use_base2_exp: False | ||
| use_experimental_scheduler: False | ||
| # If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens. | ||
| # Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster. | ||
| # However, when padding tokens are significant, this will lead to worse quality and should be set to True. | ||
|
|
@@ -73,18 +75,18 @@ mask_padding_tokens: True | |
| # in cross attention q. | ||
| attention_sharding_uniform: True | ||
|
|
||
| flash_block_sizes: {} | ||
| #flash_block_sizes: {} | ||
| # Use the following flash_block_sizes on v6e (Trillium) due to larger vmem. | ||
| # flash_block_sizes: { | ||
| # "block_q" : 1536, | ||
| # "block_kv_compute" : 1536, | ||
| # "block_kv" : 1536, | ||
| # "block_q_dkv" : 1536, | ||
| # "block_kv_dkv" : 1536, | ||
| # "block_kv_dkv_compute" : 1536, | ||
| # "block_q_dq" : 1536, | ||
| # "block_kv_dq" : 1536 | ||
| # } | ||
| flash_block_sizes: { | ||
| "block_q" : 1536, | ||
| "block_kv_compute" : 1536, | ||
| "block_kv" : 1536, | ||
| "block_q_dkv" : 1536, | ||
| "block_kv_dkv" : 1536, | ||
| "block_kv_dkv_compute" : 1536, | ||
| "block_q_dq" : 1536, | ||
| "block_kv_dq" : 1536 | ||
| } | ||
| # GroupNorm groups | ||
| norm_num_groups: 32 | ||
|
|
||
|
|
@@ -147,9 +149,11 @@ mesh_axes: ['data', 'fsdp', 'context', 'tensor'] | |
| # conv_in : conv.shape[2] weight | ||
| # conv_out : conv.shape[-1] weight | ||
| logical_axis_rules: [ | ||
| ['batch', 'data'], | ||
| ['batch', ['data','fsdp']], | ||
| ['activation_batch', ['data','fsdp']], | ||
| ['activation_heads', 'tensor'], | ||
| ['activation_heads', 'fsdp'], | ||
| ['activation_length', 'context'], | ||
| ['activation_kv_length', 'context'], | ||
| ['activation_kv', 'tensor'], | ||
| ['mlp','tensor'], | ||
| ['embed','fsdp'], | ||
|
|
@@ -188,7 +192,7 @@ dataset_type: 'tfrecord' # Options: 'tfrecord', 'hf', 'tf', 'grain', 'synthetic | |
| # 2. Optionally set synthetic_num_samples (null=infinite, or a number like 10000) | ||
| # 3. Optionally override dimensions | ||
| # | ||
| # synthetic_num_samples: null # null for infinite, or set a number | ||
| synthetic_num_samples: 1000 # null for infinite, or set a number | ||
| # | ||
| # Optional dimension overrides: | ||
| # resolution: 512 | ||
|
|
@@ -218,6 +222,21 @@ transform_images_num_proc: 4 | |
| reuse_example_batch: False | ||
| enable_data_shuffling: True | ||
|
|
||
| # Defines the type of gradient checkpoint to enable. | ||
| # NONE - means no gradient checkpoint | ||
| # FULL - means full gradient checkpoint, whenever possible (minimum memory usage) | ||
| # MATMUL_WITHOUT_BATCH - means gradient checkpoint for every linear/matmul operation, | ||
| # except for ones that involve batch dimension - that means that all attention and projection | ||
| # layers will have gradient checkpoint, but not the backward with respect to the parameters. | ||
| # OFFLOAD_MATMUL_WITHOUT_BATCH - same as MATMUL_WITHOUT_BATCH but offload instead of recomputing. | ||
| # CUSTOM - set names to offload and save. | ||
| remat_policy: "FLUX_OPTIMIZED" | ||
| # For CUSTOM policy set below, current annotations are for: attn_output, query_proj, key_proj, value_proj | ||
| # xq_out, xk_out, ffn_activation | ||
| names_which_can_be_saved: [] | ||
| names_which_can_be_offloaded: [] | ||
|
Comment on lines
+236
to
+237
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same as above |
||
| flash_min_seq_length: 0 | ||
|
|
||
| # checkpoint every number of samples, -1 means don't checkpoint. | ||
| checkpoint_every: -1 | ||
| # enables one replica to read the ckpt then broadcast to the rest | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -314,6 +314,9 @@ def run(config): | |
| dtype=config.activations_dtype, | ||
| weights_dtype=config.weights_dtype, | ||
| precision=get_precision(config), | ||
| remat_policy=config.remat_policy, | ||
| names_which_can_be_saved=config.names_which_can_be_saved, | ||
| names_which_can_be_offloaded=config.names_which_can_be_offloaded, | ||
|
Comment on lines
+318
to
+319
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please change variable names |
||
| ) | ||
|
|
||
| num_channels_latents = transformer.in_channels // 4 | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we make these variable names a little more explicit? Maybe something like
`saved_transformer_layer_names` or `savable_transformer_layer_names`, and `offloaded_transformer_layer_names` or `offloadable_transformer_layer_names`? I will leave it up to you.