
    
9j              	       B   S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKJ	r	  S SK
Jr  S SKJr  S SKJrJr  S SKrSSKJr  S	S
KJrJrJrJr  S	SKJr  S	SKJrJr  S	SKJrJ r J!r!  S	SK"J#r#  \" \$5      r%S r&S r'S2S jr(S2S jr)S r*S3S jr+S3S jr,S4S jr-S3S jr.S5S\/S\/S\04S jjr1 S6S\/S\/S\0S\04S jjr2S\Rf                  Rh                  S \Rj                  4S! jr6S7S\Rf                  Rh                  S"\7S#\04S$ jjr8S%\Rr                  Rt                  S&\74S' jr;S\Rf                  Rh                  4S( jr<S\Rf                  Rh                  S)\Rf                  Rh                  S-  4S* jr=S\Rf                  Rh                  S)\Rf                  Rh                  4S+ jr>S\Rf                  Rh                  S)\\Rf                  Rh                  /\04   S-  4S, jr?S- r@S.\7S)\74S/ jrAS0\\	\Rf                  Rh                     \/4   S)\B\Rf                  R                     4S1 jrDg)8    N)defaultdict)Iterable)nullcontext)Path)CallableUnion   )
get_logger   )FSDP_MODEL_NAMEOPTIMIZER_NAMESAFE_WEIGHTS_NAMEWEIGHTS_NAME)get_module_class_from_name)get_non_persistent_buffersis_peft_model)get_module_children_bottom_upis_compiled_modulesave)is_torch_versionc                  x    S[         R                  ;  a  S[         R                  S'   S[         R                  S'   g)zS
Enables RAM efficient loading of Hugging Face models for FSDP in the environment.
ACCELERATE_USE_FSDPTrueFSDP_CPU_RAM_EFFICIENT_LOADINGNosenviron     U/home/wildlama/miniconda3/lib/python3.13/site-packages/accelerate/utils/fsdp_utils.py!enable_fsdp_ram_efficient_loadingr!   '   s.    
 BJJ.,2

()39BJJ/0r   c                  *    S[         R                  S'   g)zT
Disables RAM efficient loading of Hugging Face models for FSDP in the environment.
Falser   Nr   r   r   r    "disable_fsdp_ram_efficient_loadingr$   1   s     4;BJJ/0r   c                     U(       a&  [        U 5      (       a  SSKJn  U" X R                  S9$ Ub  SSKJn  U" XS9$ U R                  5       $ )Nr   )get_peft_model_state_dictadapter_name)get_model_state_dictoptions)r   peftr&   active_adapter'torch.distributed.checkpoint.state_dictr)   
state_dict)modeladapter_only
sd_optionsr&   r)   s        r    _get_model_state_dictr3   8   sH    e,,2(=Q=QRR P#E>>!!r   c                     U(       a'  [        U 5      (       a  SSKJn  U" XU R                  S9$ Ub  SSKJn  U" XUS9$ U R                  U5      $ )Nr   )set_peft_model_state_dictr'   )set_model_state_dictr*   )r   r,   r5   r-   r.   r6   load_state_dict)r0   r/   r1   r2   r5   r6   s         r    _set_model_state_dictr8   G   sN    e,,2(I]I]^^ P#EzJJ$$Z00r   c           	          S nU R                   S:X  aU  SSKJn  SSKJn  U" U R
                  UR                  :H  [        U R                  SS5      [        U R                  SS5      S9nU$ )	Nr	   r   )StateDictOptionsStateDictTypeoffload_to_cpuF
rank0_only)full_state_dictcpu_offloadbroadcast_from_rank0)	fsdp_versionr.   r:   2torch.distributed.fsdp.fully_sharded_data_parallelr<   state_dict_typeFULL_STATE_DICTgetattrstate_dict_config)fsdp_pluginr2   r:   r<   s       r    _prepare_sd_optionsrI   V   sj    J 1$LT%'77=;X;XX = =?OQVW!()F)FV[!\

 r   c                    SS K Js  Jn  SSKJn  SSKJn  SSKJn	  [        R                  " USS9  U R                  U	R                  :X  a/  UR                  S:  n
XR                  l        XR                  l        U R                   S:X  a1  UR                  X R                  U R                  U R"                  5      O	[%        5       n['        U 5      nU   [)        X%US9nU R                  U	R                  :X  a  US:X  a	  [*         S	3O[*         S
U S	3n[        R,                  R/                  X>5      nUR0                  S:X  aF  [2        R5                  SU 35        [6        R8                  " X5        [2        R5                  SU 35        GOfU R                  U	R:                  :X  a  US:X  a  [*         SUR0                   S	3O[*         S
U SUR0                   S	3n[        R,                  R/                  X>5      n[2        R5                  SU 35        [6        R8                  " X5        [2        R5                  SU 35        OU R                  U	R<                  :X  a  [        R,                  R/                  U[*         S
U 35      n[        R                  " USS9  [2        R5                  SU 35        SU0nUR9                  UUR?                  U5      U" 5       S9  [2        R5                  SU 35        S S S 5        g ! , (       d  f       g = f)Nr   DefaultSavePlannerFullyShardedDataParallelr;   Texist_okr   r1   r2   .bin_zSaving model to zModel saved to _rankr0   r/   storage_writerplanner) torch.distributed.checkpointdistributed
checkpoint,torch.distributed.checkpoint.default_plannerrL   rC   rN   r<   r   makedirsrD   rE   num_processesrG   r=   r>   rB   optim_state_dict_configr   rI   r3   r   pathjoinprocess_indexloggerinfotorchr   LOCAL_STATE_DICTSHARDED_STATE_DICTFileSystemWriter)rH   acceleratorr0   
output_dirmodel_indexr1   dist_cprL   FSDPr<   is_multi_processctxr2   r/   weights_nameoutput_model_fileckpt_dirs                    r    save_fsdp_modelrr   g   s   22OcPKK
T*""m&C&CC '44q87G%%43C%%0 ##q( 	..0M0M{OrOr	
 ]  %[1J	*5Xbc
&&-*G*GG7Ba7Go.d3P_O``abmannrMsL "Z F((A-./@.ABC

:9o.?-@AB((M,J,JJ !# ##5)B)B(C4H'(+eK<U<U;VVZ[ 
 !#Z FKK*+<*=>?JJz5KK/*;)<=>((M,L,LLww||J?2C1[M0RSHKK40KK*8*56!:.JLL%&77A*,  
 KK/(45? 
s   HK88
Lc                    SS K Js  Jn  SSKJn  SSKJn  SSKJn	  UR                  5         U R                  U	R                  :X  a/  UR                  S:  n
XR                  l        XR                  l        U R                  S:X  a1  UR                  X R                  U R                  U R                   5      O	[#        5       n[%        U 5      nU   U R                  U	R                  :X  Ga  ['        U5      ULaW  UR(                  S:w  aG  UR*                  (       d6  U R,                  (       d  U R                  S:X  a  [/        S5      e S S S 5        g US:X  a	  [0         S3O[0         SU S3n[2        R4                  R7                  X=5      n[8        R;                  S	U 35        UR*                  (       + =(       d    UR<                  nU(       a  [>        R@                  " US
S9nO0 n[8        R;                  SU 35        GOjU R                  U	RB                  :X  a  US:X  a  [0         SUR(                   S3O[0         SU SUR(                   S3n[2        R4                  R7                  X=5      n[8        R;                  S	U 35        [>        R@                  " US
S9n[8        R;                  SU 35        OU R                  U	RD                  :X  a  [0         U;  a)  [2        R4                  R7                  U[0         SU 35      OUn[8        R;                  S	U 35        S[G        X%US90nURA                  UURI                  U5      U" 5       S9  US   n[8        R;                  SU 35        [K        UWX\S9nS S S 5        U$ ! , (       d  f       W$ = f)Nr   )DefaultLoadPlannerrM   r;   r   zzSet the `sync_module_states` flag to `True` so that model states are synced across processes when initializing FSDP objectrR   rS   zLoading model from Tweights_onlyzModel loaded from rT   r0   rQ   )r/   storage_readerrW   )&rX   rY   rZ   r[   rt   rC   rN   r<   wait_for_everyonerD   rE   r]   rG   r=   r>   rB   r^   r   rI   typera   is_fsdp2sync_module_states
ValueErrorr   r   r_   r`   rb   rc   is_main_processrd   loadre   rf   r3   FileSystemReaderr8   )rH   rh   r0   	input_dirrj   r1   rk   rt   rl   r<   rm   rn   r2   ro   input_model_file
load_modelr/   rq   load_results                      r    load_fsdp_modelr      sO   22OcP!!#""m&C&CC '44q87G%%43C%%0 ##q( 	..0M0M{OrOr	
 ]  %[1J	&&-*G*GGE{$&;+D+D+IR]RfRf"55+:R:RVW:W$3   
 8Ca7Go.d3P_O``abmannrMsL!ww||IDKK-.>-?@A(111P[5P5PJ"ZZ(8tL

KK,-=,>?@((M,J,JJ !# ##5)B)B(C4H'(+eK<U<U;VVZ[ 
  "ww||IDKK-.>-?@A$44HJKK,-=,>?@((M,L,LL &&y8 Y?*;1[M(JK 
 KK-hZ89!#8fp#qrJLL%&77A*,  
 $G,JKK,XJ78+E:Lp[ 
\ ] 
\ s   A8M<H$M<<
Nc                 &   SS K Js  Jn  SSKJn  SSKJn  SSKJn	  [        R                  " USS9  U R                  S:X  a1  UR                  X0R                  U R                  U R                  5      O	[        5       n
[        U 5      nU
   U R                  S:X  a  SS	KJn  U" X2US
9nOUR%                  X25      nU R                  U	R&                  :X  a  UR(                  S:X  a  US:X  a	  [*         S3O[*         SU S3n[        R,                  R/                  XN5      n[0        R3                  SU 35        [4        R6                  " X5        [0        R3                  SU 35        O[        R,                  R/                  U[*         SU 35      n[        R                  " USS9  [0        R3                  SU 35        UR7                  SU0UR9                  U5      U" 5       S9  [0        R3                  SU 35        S S S 5        g ! , (       d  f       g = f)Nr   rK   rM   r;   TrO   r   r	   get_optimizer_state_dictr*   rR   rS   zSaving Optimizer state to zOptimizer state saved in 	optimizerrU   )rX   rY   rZ   r[   rL   rC   rN   r<   r   r\   rB   rD   rG   r^   r   rI   r.   r   optim_state_dictrE   ra   r   r_   r`   rb   rc   rd   r   rg   )rH   rh   r   r0   ri   optimizer_indexrk   rL   rl   r<   rn   r2   r   optim_stateoptim_state_nameoutput_optimizer_filerq   s                    r    save_fsdp_optimizerr      s   22OcPKK
T* ##q( 	..0M0M{OrOr	
 ]  %[1J	##q(X25ZXK//AK&&-*G*GG((A-/>!/C~&d+NK[[\]l\mmqIr ! )+Z(R%89N8OPQ

;>78M7NOPww||J>2B!OCT0UVHKK40KK4XJ?@LL'5&77A*,  
 KK3H:>?5 
s   	E0H
Hc                    SS K Js  Jn  SSKJn  SSKJn	  UR                  5         U R                  S:X  a1  UR                  X0R                  U R                  U R                  5      O	[        5       n
[        U 5      nU
   U R                  U	R                  :X  a  S nUR                  S:X  d  U R                  R                  (       d  US:X  a	  [          S3O[          SU S3n["        R$                  R'                  XM5      n[(        R+                  SU 35        [,        R.                  " USS	9n[(        R+                  S
U 35        O[          U;  a)  ["        R$                  R'                  U[          SU 35      OUn[(        R+                  SU 35        U R                  S:X  a  SSKJn  U" X2US9nOUR5                  X25      nSU0nUR/                  UUUR7                  U5      S9  US   n[(        R+                  SU 35        U R                  S:X  a"  UR9                  X2US9nUR;                  U5        OSSKJn  U" X2XS9  S S S 5        g ! , (       d  f       g = f)Nr   rM   r;   r   rR   rS   zLoading Optimizer state from Tru   zOptimizer state loaded from zLoading Optimizer from r	   r   r*   r   )checkpoint_idrw   zOptimizer loaded from )r0   optimr   )set_optimizer_state_dict)rX   rY   rZ   rC   rN   r<   rx   rB   rD   rG   r^   r   rI   rE   ra   r>   r   r   r_   r`   rb   rc   rd   r~   r.   r   r   r   optim_state_dict_to_loadr7   r   )rH   rh   r   r0   r   r   r1   rk   rl   r<   rn   r2   r   optimizer_nameinput_optimizer_filerq   r   flattened_osdr   s                      r    load_fsdp_optimizerr     s4   22cP!!#
 ##q( 	..0M0M{OrOr	
 ]  %[1J	&&-*G*GGK((A-[5X5X5c5c/>!/C~&d+NK[[\]l\mmqIr  (*ww||I'N$;<P;QRS#jj)=DQ:;O:PQR %%i7 Y>*:!O;L(MN 
 KK1(<=''1,\6uQ[\"33EE&4KLL&&77A  
 &k2KKK0
;<##q( 99it9uM%%m4X$U{WO 
s   >GI
Icheckpoint_dir	save_pathsafe_serializationc                 h   SSK Js  Jn  SSKJs  Js  Jn  0 n[        U5      nUR                  SS9  UR                  UUR                  U 5      UR                  5       SS9  U(       a	  U[        -  OU[        -  n[        UR                  5       5      S:X  a  U[        U5      S      n[        XQUS9  U$ )z
Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`

Will save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
r   NTrO   )rw   rW   no_distr   )r   )rX   rY   rZ   )torch.distributed.checkpoint.format_utilsformat_utilsr   mkdir_load_state_dictr   _EmptyStateDictLoadPlannerr   r   lenkeyslistr   )r   r   r   rk   dist_cp_format_utilsr/   s         r    )_distributed_checkpoint_to_merged_weightsr   R  s     32LLJYIOOTO"))//?$??A	 *  2D	--UaIaI :??"Z 0 34
3EFr   output_pathremove_checkpoint_dirc                    [        U 5      n SSKJn  [        SS5      (       d  [	        S5      eU R                  5       (       d  U S-  R                  5       nU S-  R                  5       nSU  S	3nU(       a  U(       a  US
-  nUSU  SU  S3-  nUS-  nO+U(       a  US-  nUSU  S3-  nOU(       a  US-  nUSU  S3-  n[	        U5      eU" 5       nUR                  (       aq  [        R                  SU  35        [        XU5      n	[        R                  SU	 35        U(       a.  [        R                  SU  35        [        R                  " U 5        UR                  5         g)a  
Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
`SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
`safe_serialization` else `pytorch_model.bin`.

Note: this is a CPU-bound process.

Args:
    checkpoint_dir (`str`):
        The directory containing the FSDP checkpoints (can be either the model or optimizer).
    output_path (`str`):
        The path to save the merged checkpoint.
    safe_serialization (`bool`, *optional*, defaults to `True`):
        Whether to save the merged weights with safetensors (recommended).
    remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
        Whether to remove the checkpoint directory after merging.
r   )PartialState>=z2.3.0z/`merge_fsdp_weights` requires PyTorch >= 2.3.0`pytorch_model_fsdp_0optimizer_0zTried to load from z) but couldn't find a valid metadata file.zE However, potential model and optimizer checkpoint directories exist.zPlease pass in either z/pytorch_model_fsdp_0 or z/optimizer_0zinstead.z8 However, a potential model checkpoint directory exists.zPlease try passing in z/pytorch_model_fsdp_0 instead.z< However, a potential optimizer checkpoint directory exists.z/optimizer_0 instead.zMerging FSDP weights from z.Successfully merged FSDP weights and saved to z"Removing old checkpoint directory N)r   accelerate.stater   r   r|   existsr}   rb   rc   r   shutilrmtreerx   )
r   r   r   r   r   model_path_existsoptimizer_path_existserrstater   s
             r    merge_fsdp_weightsr   n  sx   ( .)N-D'**JKK   ""+.DDLLN!/-!? G G I#N#33\]!6ZZC+N+;;TUcTddpqqC:CMMC+N+;;YZZC"QQC+N+;;PQQCo NE00@AB=n[mn	DYKPQ KK<^<LMNMM.)	r   r0   devicec                 8  ^ ^	 [        USS 5      nU(       d  T $ 0 m	U HV  nUR                  S5      nSR                  US S 5      US   pTUR                  U5      n[        Xe5      nS T	[	        U5      '   MX     S[
        R                  R                  4U	U 4S jjnU$ )N_tied_weights_keys.modulec                 N  > [        [        5      nU R                  SS9 H3  u  p#[        U5      T;   d  M  U[        U5         R	                  U5        M5     T" U 5      n UR                  5        H2  u  pEU H'  nTU   nUc  [        X5      TU'   M  [        XU5        M)     M4     U $ )NF)recurse)r   r   named_parametersidappenditemsrF   setattr)	r   params_to_tienparamid_key_param_names
param_name_tied_paramsparam_init_fns	          r    param_init_fn_tied_param7ensure_weights_retied.<locals>.param_init_fn_tied_param  s     $D)///>HA%yL(bi(//2 ? v& %2$7$7$9 F*
$V,= ,36+FL(F6 + %: r   )rF   splitr`   get_submoduler   rd   nnModule)
r   r0   r   _tied_namesnamer   modr   r   r   s
   `        @r    ensure_weights_retiedr     s    %!5t<K Lzz#88D"I.Rj!!$'("&RY   2 $#r   full_sdr@   c                 :   SSK Jn  SSKJnJn  UR                  5       n0 nS n	S n
U R                  (       Ga
  UR                  5        H  u  pX;  a'  [        SU S[        U5       S[        U5       S	35      eX+   nUR                  nUR                  5       R                  UR                  5      n[        X5      (       a  UR                  5       nUR!                  USUR"                  R$                  S
9  U" XUR&                  5      nU	" UUU5      u  nnU
" UUU5      nU(       a  UR                  S5      nXU'   M     OUR                  5        H  u  pUR                  n[(        R*                  " UR-                  5       UR                  UR.                  S9nUR!                  USUR"                  R$                  S
9  U" UXR&                  5      nU	" UUU5      u  nnU
" UUU5      nU(       a  UR                  S5      nXU'   M     UR1                  USS9  U$ )au  
Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the
parameters from rank 0 to all other ranks. This function modifies the model in-place.

Args:
    accelerator (`Accelerator`): The accelerator instance
    model (`torch.nn.Module`):
        The model to load the state dict into, expected to be on meta device or a VRAM spike can occur
    full_sd (`dict`): The full state dict to load, can only be on rank 0
    cpu_offload (`bool`, defaults to `False`):
        If True, move sharded parameters to CPU after distribution. Required when FSDP CPU offloading is enabled.
r   N)DTensordistribute_tensorc                     U R                  U5      n[        [        S5      nS nU=(       a    UR                  [        R                  :H  n	UR                  R                  (       a  U	(       d  UR                  nUS L=(       a    UR                  5       U4$ ! [         a3    UR                  SS5      u  pEU R                  U5      n[	        Xe5      n Nf = f)Nr   r   float8_e4m3fn)get_parameter_or_bufferAttributeErrorrsplitr   rF   hasattrrd   dtyper   is_floating_pointis_contiguous)
r0   r   empty_param	old_parambase_param_namelocal_param_name	submoduleis_torch_e4m3fn_availablecasting_dtypeis_param_float8_e4m3fns
             r    _infer_parameter_dtype:fsdp2_load_full_state_dict.<locals>._infer_parameter_dtype  s    	=55jAI %,E?$C!!:!g{?P?PTYTgTg?g..7M%OOM$B)@)@)BMQQ  	=0:0A0A#q0I-O++O<I	<I		=s   B :CCc                 X    Ub  U R                  US9n U(       a  U R                  5       n U $ )N)r   )to
contiguous)tensorto_contiguousr   s      r    _cast_and_contiguous8fsdp2_load_full_state_dict.<locals>._cast_and_contiguous  s.    YYUY+F&&(Fr   zParameter 'zZ' found in sharded model state dict but missing from full state dict. Full state dict has z keys, sharded has z keys.)srcgroupcpu)r   r   T)assign)torch.distributedrY   torch.distributed.tensorr   r   r/   r}   r   KeyErrorr   device_meshdetachr   device_type
isinstanceto_local	broadcastr   WORLD
placementsrd   emptysizer   r7   )rh   r0   r   r@   distr   r   meta_sharded_sd
sharded_sdr   r   r   sharded_param
full_paramr   sharded_tensorr   r   full_tensors                      r    fsdp2_load_full_state_dictr    s'    %C &&(OJR$ """)8)>)>)@%J(!* .++.w<.8KCP_L`Kaagi  !,J'33K#**,//0G0GHJ*.. (002
NN:1DJJ4D4DNE.zH`H`aN+A,(M=
 2.-Q^_N!/!2!25!9%3z"3 *A8 *9)>)>)@%J'33K++m&8&8&:;CZCZbobubuvKNN;ATZZ5E5ENF.{KIaIabN+A,(M=
 2.-Q^_N!/!2!25!9%3z" *A" 
*T2Lr   r   mappingc                     SSK Jn  0 nSX2'    U R                   H(  nUS    Vs/ s H  oQUR                     PM     snUS'   M*     gs  snf ! [         a    [	        S5      ef = f)a  
Switches the parameters of the optimizer to new ones (sharded parameters in usual case). This function modifies the
optimizer in-place.

Args:
    optimizer (`torch.optim.Optimizer`): Optimizer instance which contains the original model parameters
    mapping (`dict`): Mapping from the original parameter (specified by `data_ptr`) to the sharded parameter

Raises:
    KeyError:
        If a parameter in the optimizer couldn't be switched to its sharded version. This should never happen and
        indicates a bug. If we kept the original params instead of raising, the training wouldn't be numerically
        correct and weights wouldn't get updated.
r   )r   _local_tensorparamszA parameter in the optimizer couldn't be switched to its sharded version. This breaks the training. Please raise an issue on GitHub.N)r   r   param_groupsdata_ptrr   )r   r  r   accessor_mappingparam_groupps         r    !fsdp2_switch_optimizer_parametersr  3  sy     1 /
$11KBMhBW$XBWQQZZ%8BW$XK! 2$X 
  S
 	

s   A A
A A A"c                 n   SSK Jn  [        U R                  R                  U5      n[        USS9SS  H}  u  pE[        UR                  S5      5      S:  a  UR                  SS5      u  pgOSnUnU(       a  UR                  U5      OUnU" U5      (       d  Me  U" US	S
9nUR                  Xu5        M     U$ )a  
Applies the activation checkpointing to the model.

Args:
    accelerator (`Accelerator`): The accelerator instance
    model (`torch.nn.Module`): The model to apply the activation checkpointing to

Returns:
    `torch.nn.Module`: The model with the activation checkpointing applied
r   )checkpoint_wrapperT)return_fqnsNr   r   r   F)preserve_rng_state);torch.distributed.algorithms._checkpoint.checkpoint_wrapperr  fsdp2_prepare_auto_wrap_policyr   rH   r   r   r   r   r   register_module)	rh   r0   r  auto_wrap_policy_func
layer_namelayerparent_name
child_nameparent_modules	            r    fsdp2_apply_acr$  R  s     ;;;L;L;X;XZ_`:5dSTWUWX
z$%)&0&7&7Q&?#KK#J<G++K8U //&uGE))*< Y Lr   returnc                    [        U SS5      nU(       a  [        XS5      OU n[        U[        R                  R                  5      (       d  gSnUR                  5        H"  u  pES[        U5      R                  ;   d  M   UnM$     U$ )a  Find the final normalization layer before the output head.

The final norm is conventionally a direct child of the base model (e.g. `model.norm`
for Llama, `transformer.ln_f` for GPT-2), so we only scan the base model's direct
children. Returns the last norm found there, or None.
base_model_prefix NNorm)rF   r   rd   r   r   named_childrenry   __name__)r0   base_prefix
base_model
final_normrS   r   s         r    _find_final_normr/  s  st     %!4b9K6AT2uJj%((//22J..0	T&\***J 1 r   c           	      F   SSK JnJnJn  [	        X5      =(       d(    [        U5      =(       a    [	        UR                  U5      nU(       a  U$ U R                  R                  nUR                  U5        [        U SS5      nUR                  UR                  UR                  =(       d    U" 5       Ub"  U[        U R                  R                   5         OSS.n[#        SS5      (       a0  UR$                  b#  ['        UR$                  XR(                  5      US'   S	n	[+        5       n
UR-                  5        Hu  u  pUR.                  R0                  S
:X  d  M!  Sn	UR2                  (       a  M6  UR5                  5       (       a  MM  UR7                  5       (       a  Md  U
R9                  U5        Mw     U
(       ao  [#        SS5      (       a^  [+        UR;                  S[+        5       5      5      nX-  US'   U R<                  (       a#  [>        R@                  " S[C        U
5       S35        U RD                  S:w  a  U	(       d  / nUR-                  5        Hw  u  pUR2                  (       d  M  URF                  [H        RJ                  :w  d  M8  URM                  U5        URN                  RQ                  [H        RJ                  5      Ul'        My     U R<                  (       a,  U(       a%  [R        RU                  S[C        U5       SU S35        URW                  5       nURX                  (       a  U	(       d  [[        USSS9n[\        R^                  " URa                  5        VVs0 s H  u  nnUU;   d  M  UU_M     snn5      nURQ                  [H        R(                  " S5      5      n[c        US5      (       a  URe                  5         [g        Xa5      nUb@  [i        U5      SS  H.  nU" U5      (       d  M  [	        UU5      (       a  M%  U" U40 UD6  M0     [        USS 5      " 5       n[        USS 5      " 5       n[        USS5      n[        USS5      nUSL=(       a    UUL nU(       d  Ub  [	        UU5      (       d	  U" U40 UD6  [k        U5      nU(       a  UOUnUU4 Vs/ s H  nUc  M  [	        UU5      (       a  M  UPM     nnU(       a  U" U40 0 UESS	0ED6  [	        X5      (       d7  URm                  5        VVs0 s H  u  nnUS:w  d  M  UU_M     nnnU" U40 UD6  URX                  (       a,  U	(       d%  SSK J7n   [q        XU[	        UR                  U 5      S9  URX                  (       a  U	(       d  WRm                  5        Hc  u  n!n"U"RQ                  U R(                  5      n"SU!;   a'  U!Rs                  SS5      u  n#n$URu                  U#5      n%OU!n$Un%U%Rw                  U$U"S	S 9  Me     [c        US5      (       a  URe                  5         U$ s  snnf s  snf s  snnf )!a
  Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model.

Args:
    accelerator (`Accelerator`): The accelerator instance
    model (`torch.nn.Module`): The model to prepare

Returns:
    `torch.nn.Module`: Prepared model
r   )
FSDPModuleMixedPrecisionPolicyfully_shardtorch_device_meshN)reshard_after_forwardoffload_policy	mp_policymeshr   z2.7.0ignored_paramsF
Params4bitTzFound z non-floating frozen Params4bit. Excluding from FSDP2 sharding to prevent quant_state corruption.To enable memory-efficient sharding of 4-bit weights, setbnb_4bit_quant_storage to a floating dtype (e.g. bf16).nozFSDP upcast of low precision parameters to fp32 (since mixed_precision != 'no') may affect the precision of model checkpoints. This effects z parameters: z...)r   fqnsmetatie_weightsr   get_input_embeddingsc                      g Nr   r   r   r    <lambda>%fsdp2_prepare_model.<locals>.<lambda>  s    r   get_output_embeddingsc                      g rA  r   r   r   r    rB  rC    s    4r   weightr5  )CPUOffloadPolicy)r@   r   r   )
persistent)<torch.distributed.fsdpr1  r2  r3  r   r   	_orig_modr   rH   set_auto_wrap_policyrF   r5  r@   mixed_precision_policytupleparallelism_configfsdp_dim_namesr   ignored_modulesget_parameters_from_modulesr   setr   	__class__r+  requires_gradr   
is_complexaddgetr}   warningswarnr   mixed_precisionr   rd   float32r   datar   rb   rc   r/   cpu_ram_efficient_loadingr   copydeepcopynamed_buffersr   r>  r  r   r/  r   rG  r  r   r   register_buffer)&rh   r0   r1  r2  r3  is_type_fsdpfsdp2_pluginr8  fsdp2_kwargsmodel_has_params4bitincompatible_params4bitr   r   ignoredupcasted_paramsoriginal_sdnon_persistent_buffer_fqnskvoriginal_non_persistent_buffersr  r   input_embedoutput_embedinput_weightoutput_weightis_weights_tiedr.  
tail_embedmtailroot_kwargsrG  fqnbuffer_tensor
parent_fqnlocal_buffer_namer#  s&                                         r    fsdp2_prepare_modelr{    sx    UTe0 5!Mj*&M  $$00L%%e,; 3T:D ".!C!C&22!88R<P<RNRN^U;99HHIJdhL g&&<+G+G+S)D((%1C1C*
%& !!e--/ ??##|3#'  '''%2I2I2K2KV[VfVfVhVh'++E2 0 #3D'#B#Bl&&'7?@)0)J%&&&MM456 7J J ""d*3G 113KD"""u{{emm'C&&t,"ZZ]]5==9
 4 &&?KK  #O 45]?BSSVX ""$K--6J &@tZ^%_"*.--#113W3daq<V7VTQT3W+
' f-. 5-((:<O(3E:3B?F$V,,Z
5S5SF3l3 @ %!7FHK5"9<HJL;$7LL(D9M"$.P<=3PO{6z+Wa?b?bK0<0!%(J !0\J"J/c/!1ATUWaIbA/DcDML|L-DeLMe(( )5(:(:(<](<1E\@\tq!t(<]E)[)--6J 	<"LD\D\^n9o	
 --6J"A"G"G"IC),,[-?-?@Mcz03

30B-
- % 3 3J ?$'! %))*;]W\)] #J  5-((LQ X> d ^s*   X
.X
 X+X>X?XXc                 ,  ^ ^
 SSK JnJn  T R                  n[	        U[
        R                  5      (       a  UR                  nXCL a  [        USS5      nUc  / n[        U5      nT R                  b  T R                  n[        5       m
U H1  n[        X5      nUc  [        SU S35      eT
R                  U5        M3     S[        R                   R"                  S[$        4U
4S	 jjn	U	$ XBL a*  S[        R                   R"                  S[$        4U 4S
 jjn	U	$ g)a  Prepares the auto wrap policy based on its type, done to mimic the behaviour of FSDP1 auto wrap policy.

Args:
    fsdp2_plugin (`FullyShardedDataParallelPlugin`):
        Instance of `FullyShardedDataParallelPlugin` containing the configuration options
    model (`torch.nn.Module`):
        The model to wrap

Returns:
    `Callable[[torch.nn.Module], bool] | None`:
        The auto wrap policy function to be applied to the model or `None`
r   )size_based_auto_wrap_policytransformer_auto_wrap_policy_no_split_modulesNz+Could not find the transformer layer class z in the model.r   r%  c                 >   > T(       d  g[        U [        T5      5      $ )NF)r   rM  )r   transformer_cls_to_wraps    r    policy.fsdp2_prepare_auto_wrap_policy.<locals>.policyM  s    *fe,C&DEEr   c                 b   > [        S U R                  5        5       5      nUTR                  :  $ )Nc              3   @   #    U  H  oR                  5       v   M     g 7frA  )numel).0r  s     r    	<genexpr>Afsdp2_prepare_auto_wrap_policy.<locals>.policy.<locals>.<genexpr>U  s     #K7J!GGII7Js   )sum
parametersmin_num_params)r   module_num_paramsrc  s     r    r  r  T  s.     ##Kv7H7H7J#K K$|'B'BBBr   )torch.distributed.fsdp.wrapr}  r~  auto_wrap_policyr   	functoolspartialfuncrF   r   transformer_cls_names_to_wraprR  r   r|   rV  rd   r   r   bool)rc  r0   r}  r~  fnno_split_modulesr  layer_classtransformer_clsr  r  s   `         @r    r  r  *  s    f		&	&B"i''((WW	)"5*=tD#!(,-=(>%55A,8,V,V)"%%8K8LO& #N{m[i!jkk#''8	 9	F588?? 	Ft 	F M 
	*	C588?? 	Ct 	C M r   c                      SSK Jn  U" S0 U D6$ )z
Returns a `GradScaler` for FSDP2, as the current implementation of `get_grad_scaler` doesn't accept other args. We
need this as current `get_grad_scaler` accepts only `distributed_type` as arg, which doesn't differentiate between
FSDP1 and FSDP2
r   )
GradScalerr   )torch.amp.grad_scalerr  )kwargsr  s     r    get_fsdp2_grad_scalerr  ]  s     1r   named_paramsc                    U R                  5        VVs0 s H  u  pUR                  SS5      U_M     n nnU R                  5        VVs0 s H0  u  pUR                  S5      (       a  UR                  SS5      OUU_M2     n nnU R                  5        VVs0 s H  u  pUR                  SS5      U_M     n nnU $ s  snnf s  snnf s  snnf )a  Removes parameter name modifiers in order to map them back to their original names.

See huggingface/accelerate#3554 for more context.

Args:
    named_params (`dict`): The named parameters dictionary to canonicalize.

Returns:
    `dict`: The canonicalized named parameters dictionary
z._checkpoint_wrapped_moduler(  z
_orig_mod.z
._orig_mod)r   replace
startswith)r  rk  rl  s      r    fsdp2_canonicalize_namesr  h  s     Q]PbPbPdePdAII;R@!CPdLeXdXjXjXlXlPTPQq||L'A'A		,#q!KXl   @L?Q?Q?ST?StqAIIlB/2?SLT f Us   B:	7C Cmodulesc                    U c
  [        5       $ / n[        U [        5      (       am  [        R                  " U 5      n/ nUR                  5        H?  u  pgUR                  U5      (       d  M  UR                  U5        UR                  U5        MA     Un U  H+  nUR                  [        UR                  5       5      5        M-     [        U5      $ )zConverts modules to parameters where modules can be a string or list of torch.nn.Module

Args:
    modules (`Union[Iterable[torch.nn.Module], str]`): List of modules

Returns:
    `set[torch.nn.Parameter]`: List of parameters
)rR  r   strrecompilenamed_modules	fullmatchr   r   extendr   r  )r  r0   r   r  regmapped_modulesr   r   s           r    rQ  rQ  {  s     uJ'3jj!!//1LD}}T""		&!%%f- 2 !$v00234 z?r   )FN)r   F)r   )T)TF)F)Er^  r  r   r  r   rX  collectionsr   collections.abcr   
contextlibr   pathlibr   typingr   r   rd   loggingr
   	constantsr   r   r   r   dataclassesr   modelingr   r   otherr   r   r   versionsr   r+  rb   r!   r$   r3   r8   rI   rr   r   r   r   r  r  r   r   r   r   r   r   dictr  r   	Optimizerr  r$  r/  r{  r  r  r  rR  	ParameterrQ  r   r   r    <module>r     s%     	 	   # $ "  "    W W 3 ? J J & 
H	:;"1"76tEP-@`6Xrc c gk : kp44&)4?C4cg4n+$ +$ +$\]588?? ]T ]`d ]@
1F1F 
QU 
>uxx BEHHOO $0F $bEHHOO b bJ0 0HV[V^V^VeVeUfhlUlLmptLt 0f 4 D &8EHHOO,c12		r   