
    
3jO                        S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SK Jr  S SKJ	r	  S SK
JrJr  S SKrS SKrS SKJs  Jr  \" \SS5      b  S SKJrJr  S SKJr  S SKJr  S	S
KJr  S	SKJr  S	SK J!r!  S	SK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*  \*" 5       (       a1  S SK+r+\+RX                  RZ                  R]                  5       (       a  S SK-r-\&" 5       (       a  S SK/J0r0  \'" 5       (       a  S SK1J2r2  \)" 5       (       a  S SK3J4r4  \(" 5       (       a  S SK5r5S\64S jr7S r8SSSSS.S\Rr                  S\Rr                  S\:S\:S\Rr                  S-  S\6S\;\Rr                  \Rr                  \Rr                  4   4S jjr<S \=4S! jr> SNS"\S#\!S$\Rr                  S%\Rr                  S&\Rr                  S'\Rr                  S(\Rr                  S)\:S\;\Rr                  \Rr                  4   4S* jjr?S"\S\@\=\Rr                  4   4S+ jrA\R                  4S,\R"                  R                  \D\R"                  R                     -  4S- jjrES.\@\=\Rr                  4   S/\=S0\R"                  R                  4S1 jrFS2\@\=\R"                  R                  4   S\@\=\4   4S3 jrG     SOS4\=S5\6S6\:S7\:S8\:S9\R                  \=-  S:\R                  S-  4S; jjrJSPS4\=4S< jjrKS= rL\S>S?.S@\R"                  R                  \-  S9\=\R                  -  SA\M4SB jj5       rNSC rOSD rPS\@4SE jrQS\@4SF jrR     SQS,\R"                  R                  S9\=\R                  -  SA\MSG\MSH\MSI\@\=\4   S-  SJ\S\T\R"                  R                        S-  S\4SK jjrU " SL SM5      rVg)R    N)contextmanager)partial)AnyIterabledistributed)
CPUOffloadShardingStrategy)FullyShardedDataParallel)transformer_auto_wrap_policy   )UNet2DConditionModel)DiffusionPipeline)SchedulerMixin)convert_state_dict_to_diffusersconvert_state_dict_to_peft	deprecateis_accelerate_availableis_peft_availableis_torch_npu_availableis_torchvision_availableis_transformers_available)
get_logger)set_peft_model_state_dict)
transformsseedc                 6   [         R                  " U 5        [        R                   R                  U 5        [        R                  " U 5        [        5       (       a   [        R                  R                  U 5        g[        R                  R                  U 5        g)z
Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.

Args:
    seed (`int`): The seed to set.

Returns:
    `None`
N)	randomr   nptorchmanual_seedr   npumanual_seed_allcuda)r   s    R/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/training_utils.pyset_seedr%   8   s[     KKIINN4	d		!!$'

""4(    c                    U R                   nUS-  nSU-
  S-  nUR                  UR                  S9U   R                  5       n[	        UR
                  5      [	        UR
                  5      :  a3  US   n[	        UR
                  5      [	        UR
                  5      :  a  M3  UR                  UR
                  5      nUR                  UR                  S9U   R                  5       n[	        UR
                  5      [	        UR
                  5      :  a3  US   n[	        UR
                  5      [	        UR
                  5      :  a  M3  UR                  UR
                  5      nXV-  S-  nU$ )a_  
Computes SNR as per
https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
for the given timesteps using the provided noise scheduler.

Args:
    noise_scheduler (`NoiseScheduler`):
        An object containing the noise schedule parameters, specifically `alphas_cumprod`, which is used to compute
        the SNR values.
    timesteps (`torch.Tensor`):
        A tensor of timesteps for which the SNR is computed.

Returns:
    `torch.Tensor`: A tensor containing the computed SNR values for each timestep.
      ?      ?device).N   )alphas_cumprodtor+   floatlenshapeexpand)noise_scheduler	timestepsr-   sqrt_alphas_cumprodsqrt_one_minus_alphas_cumprodalphasigmasnrs           r$   compute_snrr:   L   sF     %33N(#-%(>%9c$A! .00	8H8H0I)TZZ\
!''
(3y+?
?1)< !''
(3y+?
?&&y7E$A$D$DIL\L\$D$]^g$h$n$n$p!
+11
2S5I
I(Ei(P% +11
2S5I
I)00AE =Q
CJr&           r)   i)lambda_conftemperatureper_token_weightsignore_indexlogitslabelsr<   r=   r>   r?   returnc                h   U R                   S:  a"  [        S[        U R                  5       S35      eUR                  U R                  SS :w  a8  [        S[        UR                  5       S[        U R                  5       S35      eUS::  a  [        S	U S35      eUR	                  U5      nUc  [
        R                  " XR                  S
9nOkUR                  UR                  :w  a8  [        S[        UR                  5       S[        UR                  5       S35      eUR                  U R                  S
9nU R                  S   n[        R                  " U R                  SU5      UR                  S5      SUS9R                  U5      n	XvR                  UR                  5      -  R                  5       R                  S5      n
X-  UR                  U	R                  5      -  R                  5       U
-  nUS:X  a.  [
        R                  " SU R                   UR                  S9nXU4$ [
        R"                  " 5          U R%                  SS9nXmR'                  U5      -  nSSS5        U R)                  5       nUS:w  a  U[)        U5      -  n[
        R*                  " USS9n[
        R,                  " UR                  5      R.                  n[
        R0                  " UR                  U5      5      nUU-  R                  SS9R                  U R                  S
9* nUWR                  UR                  5      -  R                  5       R                  S5      nUU-  UR                  UR                  5      -  R                  5       U-  nU[)        U5      U-  -   nUX4$ ! , (       d  f       GNC= f)a  
Computes a confidence-aware training loss for token classification-style heads.

This loss combines:
  - `loss_sft`: standard supervised cross-entropy on all non-ignored labels.
  - `loss_conf`: an entropy penalty applied only on tokens that are already predicted correctly.

Args:
    logits (`torch.Tensor`): Logits of shape `(..., vocab_size)`.
    labels (`torch.Tensor`): Labels of shape `(...)`, matching `logits.shape[:-1]`. Values set to `ignore_index`
        are excluded from both losses.
    lambda_conf (`float`, *optional*, defaults to `0.0`): Weight for the confidence term.
    temperature (`float`, *optional*, defaults to `1.0`): Temperature used for the entropy term only. Lower values
        sharpen the distribution and change the strength of the confidence gradients.
    per_token_weights (`torch.Tensor`, *optional*): Optional weights of shape `(...)` to reweight both losses per
        token (e.g. schedule-aware weights). Tokens with weight `0` contribute nothing.
    ignore_index (`int`, *optional*, defaults to `-100`): Ignore index for labels.

Returns:
    `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]`: `(loss, loss_sft, loss_conf)`.
r,   z.`logits` must have at least 2 dims, got shape .Nz:`labels` shape must match `logits.shape[:-1]`, got labels=z logits=r   z`temperature` must be > 0, got )dtypez9`per_token_weights` shape must match `labels` shape, got z != none)	reductionr?   r   r;    )r+   rF   )dimr)   )ndim
ValueErrortupler1   ner   	ones_likerF   r.   Fcross_entropyreshape
reshape_assum	clamp_minzerosr+   no_gradargmaxeqr/   softmaxfinfotinylog)r@   rA   r<   r=   r>   r?   validweights
vocab_sizeper_token_nll	denom_sftloss_sft	loss_confpredcorrectscaled_logitsprobseps	log_probsentropy
denom_conflosss                         r$   compute_confidence_aware_lossrn   q   sQ   < {{QI%PVP\P\J]I^^_`aa||v||CR((Hv||I\H]]efklrlxlxfyezz{|
 	
 a:;-qIJJIIl#E //&=""fll2KERcRiRiLjKkkopuv|  wC  wC  qD  pE  EF  G  $&&V\\&: b!JOOr:&r!	
 j  88GMM22779CCAFI'%((=3F3F*GGLLNQZZH cKK6==O	9,,	}}}$''&/) 
 LLNMc%k(::MM-R0E
++ekk
"
'
'C		%//#./I	!&&2&.111EEGGJJw}}55::<FFqIJ7"WZZ%>>CCE
RIeK(944D$$# 
s   #N""
N1interpolation_typec                 0   [        5       (       d  [        S5      eU S:X  a  [        R                  R                  nU$ U S:X  a  [        R                  R
                  nU$ U S:X  a  [        R                  R                  nU$ U S:X  a  [        R                  R                  nU$ U S:X  a  [        R                  R                  nU$ U S:X  a  [        R                  R                  nU$ U S:X  a  [        R                  R                  nU$ [        S	U  S
35      e)a  
Maps a string describing an interpolation function to the corresponding torchvision `InterpolationMode` enum. The
full list of supported enums is documented at
https://pytorch.org/vision/0.9/transforms.html#torchvision.transforms.functional.InterpolationMode.

Args:
    interpolation_type (`str`):
        A string describing an interpolation method. Currently, `bilinear`, `bicubic`, `box`, `nearest`,
        `nearest_exact`, `hamming`, and `lanczos` are supported, corresponding to the supported interpolation modes
        in torchvision.

Returns:
    `torchvision.transforms.InterpolationMode`: an `InterpolationMode` enum used by torchvision's `resize`
    transform.
zhPlease make sure to install `torchvision` to be able to use the `resolve_interpolation_mode()` function.bilinearbicubicboxnearestnearest_exacthamminglanczoszThe given interpolation mode z is not supported. Currently supported interpolation modes are `bilinear`, `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`.)r   ImportErrorr   InterpolationModeBILINEARBICUBICBOXNEARESTNEAREST_EXACTHAMMINGLANCZOSrL   )ro   interpolation_modes     r$   resolve_interpolation_moder      s>     $%%v
 	
 Z''99BB& % 
y	('99AA" ! 
u	$'99==  
y	('99AA  
	.'99GG  
y	('99AA  
y	('99AA  +,>+? @m n
 	
r&   unetr3   r4   noisenoisy_latentstargetencoder_hidden_statesdream_detail_preservationc                 \   UR                   R                  UR                  5      USSS4   nSU-
  S-  n	X-  n
Sn[        R                  " 5          U " XBU5      R
                  nSSS5        Su  pUR                  R                  S:X  aL  UnX>-
  R                  5       nUR                  U
5        UR                  X-  5      nUR                  U5      nX4$ UR                  R                  S:X  a  [        S5      e[        SUR                  R                   35      e! , (       d  f       N= f)	a  
Implements "DREAM (Diffusion Rectification and Estimation-Adaptive Models)" from
https://huggingface.co/papers/2312.00210. DREAM helps align training with sampling to help training be more
efficient and accurate at the cost of an extra forward step without gradients.

Args:
    `unet`: The state unet to use to make a prediction.
    `noise_scheduler`: The noise scheduler used to add noise for the given timestep.
    `timesteps`: The timesteps for the noise_scheduler to user.
    `noise`: A tensor of noise in the shape of noisy_latents.
    `noisy_latents`: Previously noise latents from the training loop.
    `target`: The ground-truth tensor to predict after eps is removed.
    `encoder_hidden_states`: Text embeddings from the text model.
    `dream_detail_preservation`: A float value that indicates detail preservation level.
      See reference.

Returns:
    `tuple[torch.Tensor, torch.Tensor]`: Adjusted noisy_latents and target.
Nr)   r(   )NNepsilonv_predictionz/DREAM has not been implemented for v-predictionzUnknown prediction type )r-   r.   r+   r   rW   sampleconfigprediction_typedetachmul_addNotImplementedErrorrL   )r   r3   r4   r   r   r   r   r   r-   r6   dream_lambdare   _noisy_latents_targetpredicted_noisedelta_noises                   r$    compute_dream_and_update_latentsr      s*   : %3366y7G7GHTXZ^`dIdeN%(>%9c$A! 1KLD	M.CDKK 
 +N--:.668&&**+H+VW**[) "" 
			/	/>	A!"STT3O4J4J4Z4Z3[\]] 
s   D
D+c                     0 nU R                  5        H[  u  p#[        US5      (       d  M  [        US5      nUc  M)  UR                  5       nUR	                  5        H  u  pgXqU SU 3'   M     M]     U$ )z@
Returns:
    A state dict containing just the LoRA parameters.
set_lora_layer
lora_layerz.lora.)named_moduleshasattrgetattr
state_dictitems)r   lora_state_dictnamemoduler   current_lora_layer_sdlora_layer_matrix_name
lora_params           r$   unet_lora_state_dictr   )  s    
 O**,6+,, 6J%(2(=(=(?%:O:U:U:W6*OYtfF3I2J$KL ;X - r&   modelc                     [        U [        5      (       d  U /n U  HC  nUR                  5        H,  nUR                  (       d  M  UR	                  U5      Ul        M.     ME     g)z
Casts the training parameters of the model to the specified data type.

Args:
    model: The PyTorch model whose parameters will be cast.
    dtype: The data type to which the model parameters will be cast.
N)
isinstancelist
parametersrequires_gradr.   data)r   rF   mparams       r$   cast_training_paramsr   <  sM     eT""\\^E""""XXe_
 $ r&   r   prefixtext_encoderc                     U R                  5        VVs0 s H1  u  p4UR                  U5      (       d  M  UR                  US5       U_M3     nnn[        [	        U5      5      n[        X%SS9  gs  snnf )a,  
Sets the `lora_state_dict` into `text_encoder` coming from `transformers`.

Args:
    lora_state_dict: The state dictionary to be set.
    prefix: String identifier to retrieve the portion of the state dict that belongs to `text_encoder`.
    text_encoder: Where the `lora_state_dict` is to be set.
 default)adapter_nameN)r   
startswithreplacer   r   r   )r   r   r   kvtext_encoder_state_dicts         r$   !_set_state_dict_into_text_encoderr   M  ss     3B2G2G2I2I$!Q\\Z`Ma%199VR 
!A%2I   99XYp9qrlR[\	s
   A.A.modules_to_savec                     0 nU R                  5        H,  u  p#Uc  M
  UR                  S   R                  5       X S3'   M.     U$ )Nr   _lora_adapter_metadata)r   peft_configto_dict)r   	metadatasmodule_namer   s       r$   _collate_lora_metadatar   `  sP    I.446@F@R@RS\@]@e@e@gI%;<=  7 r&   weighting_scheme
batch_size
logit_mean	logit_std
mode_scaler+   	generatorc                 v   U S:X  aB  [         R                  " X#U4XVS9n[         R                  R                  R	                  U5      nU$ U S:X  aT  [         R
                  " U4XVS9nSU-
  U[         R                  " [        R                  U-  S-  5      S-  S-
  U-   -  -
  nU$ [         R
                  " U4XVS9nU$ )z
Compute the density for sampling the timesteps when doing SD3 training.

Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

SD3 paper reference: https://huggingface.co/papers/2403.03206v1.
logit_normal)meanstdsizer+   r   mode)r   r+   r   r   r,   )	r   normalnn
functionalsigmoidrandcosmathpi)r   r   r   r   r   r+   r   us           r$   %compute_density_for_timestep_samplingr   h  s      >)LLjzmTZpHH''* H 
V	#JJZM&NEJ%))DGGaK!O"<"AA"E"IJJ H JJZM&NHr&   c                     U S:X  a  US-  R                  5       nU$ U S:X  a)  SSU-  -
  SUS-  -  -   nS[        R                  U-  -  nU$ [        R                  " U5      nU$ )z
Computes loss weighting scheme for SD3 training.

Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

SD3 paper reference: https://huggingface.co/papers/2403.03206v1.

sigma_sqrtg       cosmapr   r,   )r/   r   r   r   rO   )r   sigmas	weightingbots       r$   compute_loss_weighting_for_sd3r     sz     <'T\((*	  
X	%!f*nq619},3'	  OOF+	r&   c                  V   [         R                  " 5         [        R                  R	                  5       (       a  [        R                  R                  5         g[        R                  R                  R	                  5       (       a  [        R                  R                  5         g[        5       (       a  [        R                  R                  5         g[        [        S5      (       aC  [        R                  R	                  5       (       a  [        R                  R                  5         ggg)zN
Runs garbage collection. Then clears the cache of the available accelerator.
xpuN)gccollectr   r#   is_availableempty_cachebackendsmpsr   	torch_npur!   r   r   rI   r&   r$   free_memoryr     s     JJLzz  

 				(	(	*	*				!	!!!#			599#9#9#;#;		 $<	r&   T)offloadmodulesr   c              '   
  #    U(       a  [        S U 5       5      (       + nU(       a4  U Vs/ s H&  n[        UR                  5       5      R                  PM(     nnO![	        U5      S:X  d   eUS   R                  /nU H  nUR                  U 5        M      Sv   U(       a'  [        UW5       H  u  pFUR                  U5        M     ggs  snf ! U(       a'  [        UW5       H  u  pFUR                  U5        M     f f = f7f)a  
Context manager that, if offload=True, moves each module to `device` on enter, then moves it back to its original
device on exit.

Args:
    device (`str` or `torch.Device`): Device to move the `modules` to.
    offload (`bool`): Flag to enable offloading.
c              3   B   #    U  H  n[        U[        5      v   M     g 7fN)r   r   ).0r   s     r$   	<genexpr>!offload_models.<locals>.<genexpr>  s     MW:a):;;Ws   r   r   N)anynextr   r+   r0   r.   zip)r+   r   r   is_modelr   original_devicesorig_devs          r$   offload_modelsr     s      MWMMMELMWQ\\^ 4 ; ;WMw<1$$$ '
 1 12ADDL "7,<=X  >   N "7,<=X  > s(   *D-C>DC 4D0D  Dc                 ^   U (       d  [        S5      eU R                  5       R                  S5      n/ nU H  n[        R                  " SU5      nU(       d  [        SU S35      e [        UR                  S5      5      n[        UR                  S5      5      nUS::  d  US::  a  [        S	5      eUS
-  S:w  d	  US
-  S:w  a  [        R                  " SU SU S35        UR                  XV45        M     U(       d  [        S5      eU$ ! [          a  n[        SU SU 35      UeSnAff = f)zGParses a string defining buckets into a list of (height, width) tuples.zBucket string cannot be empty.;z^\s*(\d+)\s*,\s*(\d+)\s*$zInvalid bucket format: 'z'. Expected 'height,width'.r   r,   r   z,Bucket dimensions must be positive integers.   zBucket dimension (,z.) not divisible by 8. This might cause issues.z Invalid integer in bucket pair 'z': Nz.No valid buckets found in the provided string.)
rL   stripsplitrematchintgroupwarningswarnappend)buckets_strbucket_pairsparsed_bucketspair_strr  heightwidthes           r$   parse_buckets_stringr    s/   9::$$&,,S1LN 5x@7zA\]^^		YQ(FA'E{eqj !OPPzQ%!)q. 26(!E7Bpqr!!6/2 ! IJJ  	Y?zQCPQWXX	Ys   'BD
D,D''D,c                     [        S5      nSn[        U5       H%  u  nu  pg[        X-  X-  -
  5      nX::  d  M!  UnUnM'     U$ )z6Finds the closes bucket to the given height and width.infN)r/   	enumerateabs)	hwbucket_options
min_metricbest_bucket_idx
bucket_idxbucket_hbucket_wmetrics	            r$   find_nearest_bucketr    sS    uJO,5n,E(
(XQ\AL01J(O	 -F
 r&   c           	          U R                  5        VVs0 s HS  u  pU[        U[        R                  5      (       a,  UR	                  5       R                  5       R                  5       OU_MU     snn$ s  snnf r   )r   r   r   Tensorr   cpu
contiguous)state_dictsr   r   s      r$   _to_cpu_contiguousr$    sW    _j_p_p_rs_rW[WXA
1ell0K0Kqxxz~~**,QRR_rssss   AA2c                     0 n[        U R                  SS5      nUc  [        S5      eU R                  R                  nUc  [        R
                  US'   U$ UR                  =(       d    [        R
                  US'   U$ )zL
Extract and convert FSDP config from Accelerator into PyTorch FSDP kwargs.
fsdp_pluginNzLAccelerate isn't configured to handle FSDP. Please update your installation.sharding_strategy)r   staterL   r&  r	   
FULL_SHARDr'  )acceleratorkwargs
fsdp_stater&  s       r$    get_fsdp_kwargs_from_acceleratorr-    s    
 F**M4@Jghh##//K&6&A&A"#
 M '2&C&C&bGWGbGb"#Mr&   use_orig_paramslimit_all_gathersfsdp_kwargstransformer_layer_clsc                 T   [        [        5      nUcJ  [        U R                  R                  R
                  S   5      nUR                  SUR                   35        [        [        U1S9nUU(       a	  [        US9OSUUUS.n	U(       a  U	R                  U5        [        U 40 U	D6n
U
$ )u)  
Wrap a model with FSDP using common defaults and optional transformer auto-wrapping.

Args:
    model: Model to wrap
    device: Target device (e.g., accelerator.device)
    offload: Whether to enable CPU parameter offloading
    use_orig_params: Whether to use original parameters
    limit_all_gathers: Whether to limit all gathers
    fsdp_kwargs: FSDP arguments (sharding_strategy, etc.) — usually from Accelerate config
    transformer_layer_cls: Classes for auto-wrapping (if not using policy from fsdp_kwargs)

Returns:
    FSDP-wrapped model
Nr   z8transformer_layer_cls is not provided, auto-inferred as )r1  )offload_params)	device_idcpu_offloadr.  r/  auto_wrap_policy)r   __name__typer   language_modellayersinfor   r   r   updateFSDP)r   r+   r   r.  r/  r0  r1  loggerr6  r   
fsdp_models              r$   wrap_with_fsdpr@    s    2 !F$ $U[[%?%?%F%Fq%I JNOdOmOmNnop ;TiSjk =Dz9$*.,F k"e&v&Jr&   c                   <   \ rS rSrSr         SS\\R                  R                     S\	S\	S\
S\S	\	\
-  S
\	\
-  S\S\S-  S\\\4   S-  4S jjr\SS S jj5       rS rS\
S\	4S jr\R(                  " 5       S\\R                  R                     4S j5       rS\\R                  R                     SS4S jrS!S jrS"S!S jjrS\4S jrS\\R                  R                     SS4S jrS\\R                  R                     SS4S jrS\SS4S jrSrg)#EMAModeli;  z.
Exponential Moving Average of models weights
Nr   decay	min_decayupdate_after_stepuse_ema_warmup	inv_gammapowerforeach	model_clsmodel_configc                    [        U[        R                  R                  5      (       a   Sn[	        SSUSS9  UR                  5       nSnUR                  SS5      b  S	n[	        SSUSS9  US   nUR                  S
S5      b  Sn[	        S
SUSS9  US
   n[        U5      nU Vs/ s H   oR                  5       R                  5       PM"     snU l
        UR                  SS5      b   Sn[	        SSUSS9  U R                  US   S9  SU l        X l        X0l        X@l        XPl        X`l        Xpl        SU l        SU l        Xl        Xl        Xl        gs  snf )a  
Args:
    parameters (Iterable[torch.nn.Parameter]): The parameters to track.
    decay (float): The decay factor for the exponential moving average.
    min_decay (float): The minimum decay factor for the exponential moving average.
    update_after_step (int): The number of steps to wait before starting to update the EMA weights.
    use_ema_warmup (bool): Whether to use EMA warmup.
    inv_gamma (float):
        Inverse multiplicative factor of EMA warmup. Default: 1. Only used if `use_ema_warmup` is True.
    power (float): Exponential factor of EMA warmup. Default: 2/3. Only used if `use_ema_warmup` is True.
    foreach (bool): Use torch._foreach functions for updating shadow parameters. Should be faster.
    device (str | torch.device | None): The device to store the EMA weights on. If None, the EMA
                weights will be stored on CPU.

@crowsonkb's notes on EMA Warmup:
    If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
    to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
    gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
    at 215.4k steps).
zzPassing a `torch.nn.Module` to `ExponentialMovingAverage` is deprecated. Please pass the parameters of the module instead.z9passing a `torch.nn.Module` to `ExponentialMovingAverage`1.0.0Fstandard_warnT	max_valueNzCThe `max_value` argument is deprecated. Please use `decay` instead.	min_valuezGThe `min_value` argument is deprecated. Please use `min_decay` instead.r+   z=The `device` argument is deprecated. Please use `to` instead.r*   r   )r   r   r   Moduler   r   getr   cloner   shadow_paramsr.   temp_stored_paramsrC  rD  rE  rF  rG  rH  optimization_stepcur_decay_valuerI  rJ  rK  )selfr   rC  rD  rE  rF  rG  rH  rI  rJ  rK  r+  deprecation_messageps                 r$   __init__EMAModel.__init__@  sg   F j%((//22D   K##	 $..0J "N::k4(4"gk7,?uU;'E::k4(4"kk7,?uU{+I*%
:DE*Qggi..0*E::h%1"ah)<ERGG6(+G,"&
"!2,"
!"#"() Fs   %'ErB   c                     UR                  USS9u  pEUR                  U5      nU " UR                  5       X&R                  US9nUR	                  U5        U$ )NT)return_unused_kwargs)rJ  rK  rI  )from_configfrom_pretrainedr   r   load_state_dict)clspathrJ  rI  _
ema_kwargsr   	ema_models           r$   ra  EMAModel.from_pretrained  sY    !--d-N))$/((*illdkl	!!*-r&   c                 v   U R                   c  [        S5      eU R                  c  [        S5      eU R                   R                  U R                  5      nU R	                  5       nUR                  SS 5        UR                  " S0 UD6  U R                  UR                  5       5        UR                  U5        g )NzJ`save_pretrained` can only be used if `model_cls` was defined at __init__.zM`save_pretrained` can only be used if `model_config` was defined at __init__.rU  rI   )
rJ  rL   rK  r`  r   popregister_to_configcopy_tor   save_pretrained)rY  rd  r   r   s       r$   rm  EMAModel.save_pretrained  s    >>!ijj$lmm**4+<+<=__&
-  .:.U%%'(d#r&   rW  c                    [        SXR                  -
  S-
  5      nUS::  a  gU R                  (       a#  SSX R                  -  -   U R                  * -  -
  nOSU-   SU-   -  n[        X0R                  5      n[        X0R                  5      nU$ )z>
Compute the decay factor for the exponential moving average.
r   r   r;   
   )maxrE  rF  rG  rH  minrC  rD  )rY  rW  steprX  s       r$   	get_decayEMAModel.get_decay  s     1'*@*@@1DE191tnn'<#<$**"LLO 4xBI6Oozz:o~~>r&   c           	      T   [        U[        R                  R                  5      (       a  Sn[	        SSUSS9  UR                  5       n[        U5      nU =R                  S-  sl        U R                  U R                  5      nX0l	        SU-
  n[        R                  " 5       nU R                  (       Ga~  [        5       (       aK  [        R                  R                   R#                  5       (       a  [         R$                  R'                  US S9nU   U Vs/ s H  ofR(                  (       d  M  UPM     nn[+        U R,                  U5       VVs/ s H  u  pUR(                  (       d  M  UPM     n	nn[/        U5      [/        U5      :  au  [        R0                  " [+        U R,                  U5       VVs/ s H  u  pUR(                  (       a  M  UPM     snnU Vs/ s H  ofR(                  (       a  M  UPM     snSS	9  [        R2                  " U	[        R4                  " X5      US
9  S S S 5        g [+        U R,                  U5       H  u  p[        5       (       aK  [        R                  R                   R#                  5       (       a  [         R$                  R'                  US S9nU   UR(                  (       a  UR7                  XHU-
  -  5        OUR9                  U5        S S S 5        M     g s  snf s  snnf s  snnf s  snf ! , (       d  f       g = f! , (       d  f       M  = f)NzPassing a `torch.nn.Module` to `ExponentialMovingAverage.step` is deprecated. Please pass the parameters of the module instead.z>passing a `torch.nn.Module` to `ExponentialMovingAverage.step`rM  FrN  r   )modifier_rankT)non_blocking)r7   )r   r   r   rR  r   r   r   rW  rt  rX  
contextlibnullcontextrI  r   transformersintegrations	deepspeedis_deepspeed_zero3_enabledzeroGatheredParametersr   r   rU  r0   _foreach_copy__foreach_sub__foreach_subsub_copy_)
rY  r   rZ  rC  one_minus_decaycontext_managerr   params_grads_params_params_grads
             r$   rs  EMAModel.step  so   j%((//22D   P##	 $..0J*%
!# t556$e)$002<<<(**|/H/H/R/R/m/m/o/o"+.."C"CJ^b"C"c 2<T*@S@Su*T25d6H6H*2U!2UY^YlYlG2U  ! {#c*o5((7:4;M;Mz7Zv7Z^Wbgbubu7Zv,6RJ5>Q>QJR%) ##!5#5#5m#QYh !$ #&d&8&8*"E,..<3L3L3V3V3q3q3s3s&/nn&G&G]a&G&bO$**_%%HIe,	 %_	 #F# U! wR !, %_sg   LK1.K14LK6*K60AL4K<K<LL4L:0L*:L1L
L
L'	c           
         [        U5      nU R                  (       a~  [        R                  " U Vs/ s H  o"R                  PM     sn[        U R                  U5       VVs/ s H*  u  p2UR                  UR                  5      R                  PM,     snn5        g[        U R                  U5       HC  u  p2UR                  R                  UR                  UR                  5      R                  5        ME     gs  snf s  snnf )a1  
Copy current averaged parameters into given collection of parameters.

Args:
    parameters: Iterable of `torch.nn.Parameter`; the parameters to be
        updated with the stored moving averages. If `None`, the parameters with which this
        `ExponentialMovingAverage` was initialized will be used.
N)
r   rI  r   r  r   r   rU  r.   r+   r  )rY  r   r   r  s       r$   rl  EMAModel.copy_to  s     *%
<<  )344EHI[I[]gEhiEh>7ELL)..Ehi
 #&d&8&8*"E

  ELL!9!>!>? #F	 5is   C9!1C>c                 l    U R                    Vs/ s H  oR                  5       PM     snU l         gs  snf )z
Move internal buffers of the ExponentialMovingAverage to pinned memory. Useful for non-blocking transfers for
offloading EMA params to the host.
N)rU  
pin_memory)rY  r[  s     r$   r  EMAModel.pin_memory  s*     7;6H6HI6Hlln6HIIs   1c           	          U R                    Vs/ s H7  nUR                  5       (       a  UR                  XUS9OUR                  XS9PM9     snU l         gs  snf )z
Move internal buffers of the ExponentialMovingAverage to `device`.

Args:
    device: like `device` argument to `torch.Tensor.to`
)r+   rF   rx  )r+   rx  N)rU  is_floating_pointr.   )rY  r+   rF   rx  r[  s        r$   r.   EMAModel.to  se     ''	
 ( ""$$ DD,DGV?@ (	
 
s   >Ac           	          U R                   U R                  U R                  U R                  U R                  U R
                  U R                  U R                  S.$ )z
Returns the state of the ExponentialMovingAverage as a dict. This method is used by accelerate during
checkpointing to save the ema state dict.
rC  rD  rW  rE  rF  rG  rH  rU  r  )rY  s    r$   r   EMAModel.state_dict$  sN     ZZ!%!7!7!%!7!7"11ZZ!//	
 		
r&   c                     U Vs/ s H.  o"R                  5       R                  5       R                  5       PM0     snU l        gs  snf )z
Saves the current parameters for restoring later.

Args:
    parameters: Iterable of `torch.nn.Parameter`. The parameters to be temporarily stored.
N)r   r!  rT  rV  )rY  r   r   s      r$   storeEMAModel.store7  s6     NX"XZE<<>#5#5#7#=#=#?Z"X"Xs   5Ac                    U R                   c  [        S5      eU R                  (       aV  [        R                  " U Vs/ s H  o"R
                  PM     snU R                    Vs/ s H  o3R
                  PM     sn5        OD[        U R                   U5       H*  u  p2UR
                  R                  UR
                  5        M,     SU l         gs  snf s  snf )a  
Restore the parameters stored with the `store` method. Useful to validate the model with EMA parameters
without: affecting the original optimization process. Store the parameters before the `copy_to()` method. After
validation (or model saving), use this to restore the former parameters.

Args:
    parameters: Iterable of `torch.nn.Parameter`; the parameters to be
        updated with the stored parameters. If `None`, the parameters with which this
        `ExponentialMovingAverage` was initialized will be used.
NzGThis ExponentialMovingAverage has no `store()`ed weights to `restore()`)rV  RuntimeErrorrI  r   r  r   r   r  )rY  r   r   c_params       r$   restoreEMAModel.restore@  s     ""*hii<<  )344SWSjSj6kSj||Sj6k #&d&=&=z"J

  . #K #' 56ks   C#C
r   c                 H   [         R                  " U5      nUR                  SU R                  5      U l        U R                  S:  d  U R                  S:  a  [	        S5      eUR                  SU R
                  5      U l        [        U R
                  [        5      (       d  [	        S5      eUR                  SU R                  5      U l        [        U R                  [        5      (       d  [	        S5      eUR                  S	U R                  5      U l
        [        U R                  [        5      (       d  [	        S
5      eUR                  SU R                  5      U l        [        U R                  [        5      (       d  [	        S5      eUR                  SU R                  5      U l        [        U R                  [        [        45      (       d  [	        S5      eUR                  SU R                  5      U l        [        U R                  [        [        45      (       d  [	        S5      eUR                  SS5      nUb]  X l        [        U R                  [         5      (       d  [	        S5      e[#        S U R                   5       5      (       d  [	        S5      egg)z
Loads the ExponentialMovingAverage state. This method is used by accelerate during checkpointing to save the
ema state dict.

Args:
    state_dict (dict): EMA state. Should be an object returned
        from a call to :meth:`state_dict`.
rC  r;   r)   zDecay must be between 0 and 1rD  zInvalid min_decayrW  zInvalid optimization_steprE  zInvalid update_after_steprF  zInvalid use_ema_warmuprG  zInvalid inv_gammarH  zInvalid powerrU  Nzshadow_params must be a listc              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7fr   )r   r   r   )r   r[  s     r$   r   +EMAModel.load_state_dict.<locals>.<genexpr>  s      O<Nqz!U\\22<Ns   ')z!shadow_params must all be Tensors)copydeepcopyrS  rC  rL   rD  r   r/   rW  r  rE  rF  boolrG  rH  rU  r   all)rY  r   rU  s      r$   rb  EMAModel.load_state_dictY  s    ]]:.
^^GTZZ8
::tzzC/<==#T^^D$..%00011!+0CTE[E[!\$00#66899!+0CTE[E[!\$00#66899(nn-=t?R?RS$--t44566#T^^D$..5#,77011^^GTZZ8
$**ucl33_--"=$!.d00$77 !?@@OD<N<NOOO !DEE P	 %r&   )rX  rC  rI  rG  rD  rJ  rK  rW  rH  rU  rV  rE  rF  )	gH.?r;   r   Fr)   gUUUUUU?FNN)F)rB   rB  )rB   N)NNF)r7  
__module____qualname____firstlineno____doc__r   r   r   	Parameterr/   r  r  r   dictstrr\  classmethodra  rm  rt  rW   rs  rl  r  r.   r   r  r  rb  __static_attributes__rI   r&   r$   rB  rB  ;  s    !"$!$" $.2R)UXX//0R) R) 	R)
 R) R) 3;R) s{R) R) :R) 38nt+R)h  $3 5 & ]]_7-x(:(:; 7- 7-r@(588+=+="> @4 @&J

D 
&Y);); < Y Y'(588+=+="> '4 '2.F$ .F4 .Fr&   rB  )r)   )NNNr!  Nr   )TTTNN)Wry  r  r   r   r   r  r  r   	functoolsr   typingr   r   numpyr   r   torch.nn.functionalr   r   rP   r   torch.distributed.fsdpr   r	   r
   r=  torch.distributed.fsdp.wrapr   modelsr   	pipelinesr   
schedulersr   utilsr   r   r   r   r   r   r   r   r{  r|  r}  r~  accelerate.loggingr   peftr   torchvisionr   r   r  r%   r:   r   r/   rM   rn   r  r   r   r  r   float32rR  r   r   r   r   r+   	Generatorr   r   r   r  r   r  r  r$  r-  setr8  r@  rB  rI   r&   r$   <module>r     s0     	   	  %        5-&2CGH ( ( &	 	 	   **EEGG-.&)3 )("R -1S%LLS%LLS% 	S%
 S% ||d*S% S% 5<<u||34S%l)3 )h (+3#
3##3# ||3# <<	3#
 <<3# LL3# !<<3#  %3# 5<<%&3#l3 S%,,=N8O & PU}} -$uxx2G G -"]#u||+,]69]IN]&Dehhoo1E,F 4PSUXPX>  !&(,  	
  LL3 %6S $   nr UXX__/@@ #PUP\P\J\ gk  >8	tt tT 4  ")-?C/88??/%,,/ / 	/
 / c3h$&/ tEHHOO45</ 
/fLF LFr&   