if args.fused_backward_pass:
    # use fused optimizer for backward pass: other optimizers will be supported in the future
    import library.adafactor_fused

    library.adafactor_fused.patch_adafactor_fused(optimizer)
    for param_group in optimizer.param_groups:
        for parameter in param_group["params"]:
            if parameter.requires_grad:

                def __grad_hook(tensor: torch.Tensor, param_group=param_group):
                    if accelerator.sync_gradients and args.max_grad_norm != 0.0:
                        accelerator.clip_grad_norm_(tensor, args.max_grad_norm)
                    optimizer.step_param(tensor, param_group)
                    tensor.grad = None

                parameter.register_post_accumulate_grad_hook(__grad_hook)
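
# A minimal, self-contained sketch of the same pattern, not the library's code:
# register_post_accumulate_grad_hook (PyTorch 2.1+) fires once a parameter's
# gradient has been fully accumulated, so the parameter can be updated and its
# gradient freed immediately instead of holding every gradient until a single
# optimizer.step(). Plain SGD stands in for the fused Adafactor step here;
# `lr` and `make_hook` are illustrative names, not part of the library.
import torch

model = torch.nn.Linear(16, 4)
lr = 1e-2  # hypothetical learning rate for the sketch

def make_hook(lr: float):
    def hook(param: torch.Tensor) -> None:
        # Called by autograd right after param.grad has been accumulated.
        with torch.no_grad():
            param.add_(param.grad, alpha=-lr)  # apply this parameter's update now
        param.grad = None  # release the gradient to keep peak memory low
    return hook

for p in model.parameters():
    if p.requires_grad:
        p.register_post_accumulate_grad_hook(make_hook(lr))

loss = model(torch.randn(8, 16)).square().mean()
loss.backward()  # parameters are updated inside the hooks; no optimizer.step() call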