
    
9jT                     x   S SK r S SKrS SKrS SKrS SKJr  S SK Jr  S SKJrJ	r	  S SK
Jr  S SKJr  S SKrS SKrS SKJr  S SKJr  S	S
KJr  S	SKJr  S	SKJr  SSKJr  SSKJr  SSK J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+  \" \,5      r-\$" 5       (       a	  S SK.J/s  J0r1  S\Rd                  Rf                  S\44S jr5S\Rd                  Rf                  S\44S jr6S\Rd                  Rf                  S\44S jr7S\Rd                  Rf                  S\44S jr8S\Rd                  Rf                  S\Rd                  Rf                  4S jr9S\Rd                  Rf                  4S jr:S\Rd                  Rf                  S\Rd                  Rf                  4S jr;S\Rd                  Rf                  S\44S jr< S7S\4S \4S!\44S" jjr=S# r>S$\?4S% jr@S8S&\4S'\44S( jjrA\"" S)5      (       a  \R                  O\R^                  rC\CR                  R                  \R                  \\R                  /rH\"" S*5      (       a%  \HR                  \R                  R                  5        S9S+ jrLS, rMS- rNS9S.\\O   S\44S/ jjrPS\O4S0 jrQS1 rRS2 rSS3\T4S4 jrUS:S\Rd                  Rf                  S5\4S\V\Rd                  Rf                     4S6 jjrWg);    N)encode)OrderedDict)partialreduce)
MethodType)Optional)Version)	save_file   )write_basic_config)
get_logger)PartialState   )FSDP_PYTORCH_VERSION)DistributedType)is_deepspeed_availableis_numpy_availableis_torch_distributed_availableis_torch_xla_availableis_weights_only_available)id_tensor_storage)convert_model)is_torch_versionmodulereturnc                     [        [        S5      (       d  g[        U [        R                  R                  R
                  5      $ )z<
Check whether the module was compiled with torch.compile()
_dynamoF)hasattrtorch
isinstancer   
eval_frameOptimizedModuler   s    P/home/wildlama/miniconda3/lib/python3.13/site-packages/accelerate/utils/other.pyis_compiled_moduler%   6   s1     5)$$femm66FFGG    c                     [        [        S5      (       d  gU R                  (       aL  U R                  5        H8  n[	        U[        R
                  R                  R                  5      (       d  M8    g   g)zT
Check whether the module has submodules that were compiled with `torch.compile()`.
r   FT)r   r   _modulesmodulesr    r   r!   r"   r   	submodules     r$   has_compiled_regionsr,   @   sQ     5)$$)I)U]]%=%=%M%MNN * r&   c                    ^  [        T [        R                  R                  5      =(       a*    [	        T 5      S:  =(       a    [        U 4S jT  5       5      $ )z
Check whether the module is a repeated block, i.e. `torch.nn.ModuleList` with all children of the same class. This
is useful to determine whether we should apply regional compilation to the module.
r   c              3   V   >#    U  H  n[        UTS    R                  5      v   M      g7f)r   N)r    	__class__).0mr   s     r$   	<genexpr>%is_repeated_blocks.<locals>.<genexpr>X   s%     CFq
1fQi1122Fs   &))r    r   nn
ModuleListlenallr#   s   `r$   is_repeated_blocksr8   O   sC     	6588../ 	DK!O	DCFCCr&   c                 x    U R                   (       a)  U R                  5        H  n[        U5      (       d  M    g   g)z
Check whether the module has repeated blocks, i.e. `torch.nn.ModuleList` with all children of the same class, at
any level of the module hierarchy. This is useful to determine whether we should apply regional compilation to the
module.
TF)r(   r)   r8   r*   s     r$   has_repeated_blocksr:   \   s/     )I!),, * r&   c                    ^ S[         R                  R                  S[         R                  R                  4U4S jjmT" U 40 UD6nSUR                  ;  a  XR                  S'   U$ )a  
Performs regional compilation where we target repeated blocks of the same class and compile them sequentially to
hit the compiler's cache. For example, in `GPT2LMHeadModel`, the repeated block/class is `GPT2Block`, and can be
accessed as `model.transformer.h[0]`. The rest of the model (e.g. model.lm_head) is compiled separately.

This allows us to speed up the compilation overhead / cold start of models like LLMs and Transformers in general.
See https://pytorch.org/tutorials/recipes/regional_compilation.html for more details.

Args:
    module (`torch.nn.Module`):
        The model to compile.
    **compile_kwargs:
        Additional keyword arguments to pass to `torch.compile()`.

Returns:
    `torch.nn.Module`: A new instance of the model with some compiled regions.

Example:
```python
>>> from accelerate.utils import compile_regions
>>> from transformers import AutoModelForCausalLM

>>> model = AutoModelForCausalLM.from_pretrained("gpt2")
>>> compiled_model = compile_regions(model, mode="reduce-overhead")
>>> compiled_model.transformer.h[0]
OptimizedModule(
    (_orig_mod): GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
        )
    )
)
```
r   r   c           	        > [        U 5      (       aO  [        R                  R                  5       nU  H)  nUR	                  [        R
                  " U40 UD65        M+     U$ [        U 5      (       a  U R                  R                  U R                  5      nUR                  R                  U R                  5        0 Ul        U R                  5        H  u  pCUR                  UT" U40 UD65        M      U$ [        R
                  " U 40 UD6nU$ N)r8   r   r4   r5   appendcompiler:   r/   __new____dict__updater(   named_children
add_module)r   compile_kwargs
new_moduler+   name_compile_regionss        r$   rH   )compile_regions.<locals>._compile_regions   s    f%%,,.J#	!!%--	"L^"LM $  !(())11&2B2BCJ&&v7"$J#)#8#8#:%%d,<Y,Y.,YZ $;
  v@@Jr&   	_orig_mod)r   r4   ModulerA   )r   rE   rF   rH   s      @r$   compile_regionsrL   j   sZ    ^ uxx   "&;N;J*---+1K(r&   c                     [        U 5      (       a  U  H  nUR                  " S0 UD6  M     g[        U 5      (       a$  U R                  5        H  n[	        U40 UD6  M     gU R                  " S0 UD6  g)a  
Performs regional compilation the same way as `compile_regions`, but specifically for `DeepSpeedEngine.module`.
Since the model is wrapped in a `DeepSpeedEngine` and has many added hooks, offloaded parameters, etc that
`torch.compile(...)` interferes with, version of trgional compilation uses the inplace `module.compile()` method
instead.

Args:
    module (`torch.nn.Module`):
        The model to compile.
    **compile_kwargs:
        Additional keyword arguments to pass to `module.compile()`.
N )r8   r?   r:   childrencompile_regions_deepspeedr   rE   r+   childs       r$   rP   rP      sd     &!!I//  	V	$	$__&E%e>~> ' 	((r&   c                     [        U 5      (       a  U  H  nUR                  " S0 UD6  M     U $ [        U 5      (       a%  U R                  5        H  n[	        U40 UD6  M     U $ U R                  " S0 UD6  U $ )a  
Like `compile_regions`, but uses the in-place `module.compile()` instead of `torch.compile(module)`.

Needed for the FSDP2 prepare path: `torch.compile(module)` returns an `OptimizedModule` whose `__call__`
bypasses `nn.Module._call_impl`, so forward/pre hooks added later by `fully_shard` never fire and per-layer
all-gather/reshard is lost. The in-place `module.compile()` keeps `_call_impl` (and its runtime hook check)
on the call path, so FSDP hooks installed afterwards still run.

Args:
    module (`torch.nn.Module`):
        The model to compile.
    **compile_kwargs:
        Additional keyword arguments to pass to `module.compile()`.
rN   )r8   r?   r:   rO   compile_regions_fsdp2rQ   s       r$   rT   rT      sy     &!!I//   M 
V	$	$__&E!%:>: ' M 	((Mr&   modelc                    ^ [        SS5      (       a  SSKJm  OSSKJm  [	        U4S jU R                  5        5       5      $ )z
Check if the model has DTensor parameters.

Args:
    model (`torch.nn.Module`):
        The model to check.

Returns:
    `bool`: Whether the model has DTensor parameters.
>=z2.5.0r   )DTensorc              3   <   >#    U  H  n[        UT5      v   M     g 7fr=   )r    )r0   prX   s     r$   r2   $model_has_dtensor.<locals>.<genexpr>   s     B/A!z!W%%/As   )r   torch.distributed.tensorrX   torch.distributed._tensorany
parameters)rU   rX   s    @r$   model_has_dtensorr`      s4     g&&4 	6Bu/?/?/ABBBr&   keep_fp32_wrapperkeep_torch_compile	recursivec                   ^ [         R                  R                  R                  [         R                  R                  4n[        U 5      n[        U 5      nSnU(       a  U nU R                  n O(U(       a!  SU R                  ;   a  U nU R                  S   n [        5       (       a  SSK
Jn  XH4-  n[        S[        5      (       a  [        5       (       a  SSKJn	  XI4-  n[#        X5      (       a  U R$                  n [#        X5      (       a  M  U(       a  U4S jmT" U 5      n U(       d  U R&                  n
U R                  R)                  SS5      nUbF  [+        U
S	5      (       a%  U
R,                  n
X:X  a  O[+        U
S	5      (       a  M%  [/        X5      U l        [1        U S
S5      (       a
  [3        U SS9  U(       a+  Ub(  U(       a
  Xl        Un U $ U(       a  XR                  S'   Un U $ )a_  
Extract a model from its distributed containers.

Args:
    model (`torch.nn.Module`):
        The model to extract.
    keep_fp32_wrapper (`bool`, *optional*):
        Whether to remove mixed precision hooks from the model.
    keep_torch_compile (`bool`, *optional*):
        Whether to unwrap compiled model.
    recursive (`bool`, *optional*, defaults to `False`):
        Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
        recursively, not just the top-level distributed containers.

Returns:
    `torch.nn.Module`: The extracted model.
NrJ   r   )DeepSpeedEnginerW   )FullyShardedDataParallelc                    > [        U S5      (       a  T" U R                  5      nOU nUR                  5        H  u  p#[        XT" U5      5        M     U$ )Nr   )r   r   rC   setattr)r   unwrapped_modulerG   rR   _recursive_unwraps       r$   rj   6extract_model_from_parallel.<locals>._recursive_unwrap*  sT     vx((#4V]]#C #) />>@(0A%0HI  A##r&   _original_forward__wrapped__ _converted_to_transformer_engineF)to_transformer_engine)r   r4   parallelDistributedDataParallelDataParallelr%   r,   rJ   rA   r   	deepspeedre   r   r   r   2torch.distributed.fsdp.fully_sharded_data_parallelrf   r    r   forwardpopr   rm   r   getattrr   )rU   ra   rb   rc   optionsis_compiledhas_compiledcompiled_modelre   FSDPru   original_forwardrj   s               @r$   extract_model_from_parallelr~      s   ( xx  88%((:O:OPG$U+K'.LN	%..("NNN;/E-%%2338V8X8Xg7
U
$
$ U
$
$ 
	$ "%(-- >>--.A4H''=11!--. '=11 'w6EM5<eDD%u=n8',$"E
 L	 38##K0"ELr&   c                  4    [        5       R                  5         g)z
Introduces a blocking point in the script, making sure all processes have reached this point before continuing.

<Tip warning={true}>

Make sure all processes will reach this instruction otherwise one of your processes will hang forever.

</Tip>
N)r   wait_for_everyonerN   r&   r$   r   r   P  s     N$$&r&   
state_dictc           	         [         R                  " [        5      nU R                  5        H9  u  p#[	        U[
        5      (       a  M  U[        U5         R                  U5        M;     UR                  5        VVs0 s H  u  pE[        U5      S:  d  M  XE_M     nnn[        5       nUR                  5        H>  nU Vs/ s H  o"U ;   d  M
  UPM     nnUR                  USS 5        USS  H  nX	 M     M@     [        U5      S:  a  [        R                  SU S35        U R                  5        V	V
s0 s H7  u  pU	[	        U
[        R                  5      (       a  U
R!                  5       OU
_M9     n n	n
U $ s  snnf s  snf s  sn
n	f )z
Cleans the state dictionary from a model and removes tensor aliasing if present.

Args:
    state_dict (`dict`):
        The state dictionary from a model
r   Nr   zRemoved shared tensor zk while saving. This should be OK, but check by verifying that you don't receive any warning while reloading)collectionsdefaultdictlistitemsr    strr   r>   r6   setvaluesrB   loggerwarningr   Tensor
contiguous)r   ptrsrG   tensorptrnamesshared_ptrs
warn_namesfound_nameskvs              r$    clean_state_dict_for_safetensorsr   ]  sa    ""4(D"((*&#&&"6*+2248 +
 15

O*#E
Q:3:KOJ##% ).D1CtD+ab/*OD  $ & :$ZL  1\  ]	
 WaVfVfVhiVhda!z!U\\'B'BQ\\^IVhJi# P E js   ;E7E7>	E=E=4>Fsave_on_each_nodesafe_serializationc                    [        5       R                  [        R                  :X  a  [        R
                  " U 5      n U(       a1  [        [        SS0S9n[        U [        5      (       a  [        U 5      n O[        R                  n[        5       R                  (       a  U(       d	  U" X5        g[        5       R                  (       a  U(       a	  U" X5        ggg)a  
Save the data to disk. Use in place of `torch.save()`.

Args:
    obj:
        The data to save
    f:
        The file (or file-like object) to use to save the data
    save_on_each_node (`bool`, *optional*, defaults to `False`):
        Whether to only save on the global main process
    safe_serialization (`bool`, *optional*, defaults to `False`):
        Whether to save `obj` using `safetensors` or the traditional PyTorch way (that uses `pickle`).
formatpt)metadataN)r   distributed_typer   XLAxm_maybe_convert_to_cpur   safe_save_filer    r   r   r   saveis_main_processis_local_main_process)objfr   r   	save_funcs        r$   r   r     s    $ ~&&/*=*==&&s+Nh5EF	c;''237CJJ	~%%.?#		-	-2C# 3D	-r&   z2.0.0z1.25.0c                 p    [        5       (       aM  [        R                  R                  5       nSU;  a  SUS'   [        R                  R	                  [
        5        OUR                  SS5        [        R                  " U 4SU0UD6n[        5       (       aD  [        R                  R                  5         W(       a  [        R                  R	                  U5        U$ ! [        5       (       aF  [        R                  R                  5         W(       a   [        R                  R	                  U5        f f f = f)a  
Compatible drop-in replacement of `torch.load()` which allows for `weights_only` to be used if `torch` version is
2.4.0 or higher. Otherwise will ignore the kwarg.

Will also add (and then remove) an exception for numpy arrays

Args:
    f:
        The file (or file-like object) to use to load the data
    map_location:
        a function, `torch.device`, string or a dict specifying how to remap storage locations
    **kwargs:
        Additional keyword arguments to pass to `torch.load()`.
weights_onlyTNmap_location)	r   r   serializationget_safe_globalsadd_safe_globalsTORCH_SAFE_GLOBALSrv   loadclear_safe_globals)r   r   kwargsold_safe_globals
loaded_objs        r$   r   r     s    G$&&$22CCEV+)-~&001CDJJ~t,ZZGGG
$&&224##445EF	 %&&224##445EF   's   BC AD5c                     [        U S5      (       d  [        U S5      (       d  [        U SU 5      n [        U S5      (       a  U R                  $ [        U S5      (       a  U R                  $ [	        U 5      $ )z 
Gets a pretty name from `obj`.
__qualname____name__r/   )r   rw   r   r   r   )r   s    r$   get_pretty_namer     sd     3''Z0H0Hc;,sN##sJ||s8Or&   c                     U R                  5        H=  u  p#[        U[        5      (       a  UR                  U0 5      n[	        X45        M9  X1U'   M?     U$ )z
Recursively merges two dictionaries.

Args:
    source (`dict`): The dictionary to merge into `destination`.
    destination (`dict`): The dictionary to merge `source` into.
)r   r    dict
setdefaultmerge_dicts)sourcedestinationkeyvaluenodes        r$   r   r     sM     lln
eT""))#r2D$$ % r&   portc                     U c  Sn [         R                   " [         R                  [         R                  5       nUR                  SU 45      S:H  sSSS5        $ ! , (       d  f       g= f)z
Checks if a port is in use on `localhost`. Useful for checking if multiple `accelerate launch` commands have been
run and need to see if the port is already in use.
Ni<s  	localhostr   )socketAF_INETSOCK_STREAM
connect_ex)r   ss     r$   is_port_in_user     sJ    
 |	v~~v'9'9	:a||[$/0A5 
;	:	:s   A
A'c                      [         R                   " [         R                  [         R                  5       n U R                  S5        U R	                  5       S   sSSS5        $ ! , (       d  f       g= f)z
Gets a free port on `localhost`. Useful for automatic port selection when port 0 is specified in distributed
training scenarios.

Returns:
    int: An available port number
) r   r   N)r   r   r   bindgetsockname)r   s    r$   get_free_portr     sC     
v~~v'9'9	:a	w}}q! 
;	:	:s   $A""
A0c                 n    S H!  nU S:  a  [        U S5       SU 3s  $ U S-  n M#     [        U S5       S3$ )z7Converts `size` from bytes to the largest possible unit)bytesKBMBGBTBg      @r    z PB)round)sizexs     r$   convert_bytesr   	  sJ    .&=D!n%Qqc** /
 D!nS!!r&   c                  
   [         R                  " 5       n U R                  nUS:w  a  g[        R                  " SU R
                  5      tp#nSn[        U5      [        U5      :  a  SU SU S3n[        R                  USS	9  gg)
zFWarns if the kernel version is below the recommended minimum on Linux.LinuxNz(\d+\.\d+\.\d+)z5.5.0zDetected kernel version z,, which is below the recommended minimum of zo; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.T)main_process_only)	platformunamesystemresplitreleaser	   r   r   )infor   _versionmin_versionmsgs         r$   check_os_kernelr     s     >>D[[FXX0$,,?NAKw'+..&wi/[\g[h is s 	 	sd3 /r&   attrc                 D    S n[        X /UR                  S5      -   5      $ )z
Recursive `getattr`.

Args:
    obj:
        A class instance holding the attribute.
    attr (`str`):
        The attribute that is to be retrieved, e.g. 'attribute1.attribute2'.
c                     [        X5      $ r=   )rw   )r   r   s     r$   _getattr#recursive_getattr.<locals>._getattr0  s    s!!r&   .)r   r   )r   r   r   s      r$   recursive_getattrr   %  s#    " (EDJJsO344r&   return_fqnsc                    U(       d  U OSU 4nU/n/ nU(       a  UR                  5       nU(       a  Uu  peUR                  5        Hm  u  px[        U[        R                  R
                  5      (       d  M0  U(       a%  W(       a  US-   U-   OUn	UR                  X45        M\  UR                  U5        Mo     U(       a  UR                  WU45        OUR                  U5        U(       a  M  USSS2   $ )a)  Traverse the model in bottom-up order and return the children modules in that order.

Args:
    model (`torch.nn.Module`): the model to get the children of

Returns:
    `list[torch.nn.Module]`: a list of children modules of `model` in bottom-up order. The last element is the
    `model` itself.
r   r   N)rv   rC   r    r   r4   rK   r>   )
rU   r   topstackordered_modulescurrent_modulecurrent_module_namerG   r   
child_names
             r$   get_module_children_bottom_upr   6  s     #%UCEEO
2@/(779JD$00EX!4s!:T!A^bJLL*!34LL& : ""$7#HI"">2 % 4R4  r&   )TTF)FFr=   )F)Xr   r   r   r   codecsr   r   	functoolsr   r   typesr   typingr   numpynpr   packaging.versionr	   safetensors.torchr
   r   commands.config.defaultr   loggingr   stater   	constantsr   dataclassesr   importsr   r   r   r   r   modelingr   transformer_enginer   versionsr   r   r   torch_xla.core.xla_modelcore	xla_modelr   r4   rK   boolr%   r,   r8   r:   rL   rP   rT   r`   r~   r   r   r   r   _corenp_core
multiarray_reconstructndarraydtyper   r>   dtypesUInt32DTyper   r   r   intr   r   r   r   r   r   r   r   rN   r&   r$   <module>r     s     	   # %     % 9 8     + (  ( - & 
H	 ))Huxx H4 H T 
uxx 
4 
 D EEHHOO E%((// EP)ehhoo )0%((//  4CUXX__ C C* _dU"U?CUW[Up
'   FD d H )11"((rww ##JJ
HH  hbii334@
$6# 6$ 6
"s 
""4$5 5"! !t !X\]b]e]e]l]lXm !r&   