
    
9jQ                        S r SSKrSSKrSSKJr  SSKJrJr  SSKJ	r	J
r
  SSKJr  SSKrSSKJrJr  S	S
KJr  S	SKJrJr  S	SKJrJrJr  S	SKJr  \" 5       (       a	  SSKJs  Jr  \" 5       (       a  SSK J!r!  S r"S r#S r$S r%S r&\"SS.S jr'SGS jr(S r)S r*S r+S r,S r-S r.S r/S r0 " S  S!\15      r2S" r3S# r4\3S$ 5       r5S%\4S& jr6S%\4S' jr7S%\4S( jr8SHS) jr9SIS* jr:\Rv                  S	\Rx                  S\Rz                  S+\R|                  S,\R~                  S-\R                  S.\R                  S/\R                  S0\R                  S1\R                  S20
rE\ER                  5        V Vs0 s H  u  pX_M	     snn rGS3 rHSJS4\R                  4S5 jjrJ\3SHS6\K4S7 jj5       rLSHS6\K4S8 jjrMSHS6\K4S9 jjrNSKS: jrOSHS; jrP " S< S=\Q5      rR\4SLS> j5       rSSHS? jrT\3SMS@ j5       rUSA rV " SB SC5      rWSD rXSE rY\SNSF j5       rZgs  snn f )OzB
A set of basic tensor ops compatible with tpu, gpu, and multigpu
    N)Mapping)contextmanagernullcontext)update_wrapperwraps)Any   )AcceleratorStatePartialState   )!TORCH_DISTRIBUTED_OPERATION_TYPES)DistributedTypeTensorInformation)is_npu_availableis_torch_distributed_availableis_torch_xla_available)is_torch_version)ReduceOpc                 6    [        U [        R                  5      $ N)
isinstancetorchTensortensors    U/home/wildlama/miniconda3/lib/python3.13/site-packages/accelerate/utils/operations.pyis_torch_tensorr   -   s    fell++    c           
      v   [        U [        R                  R                  [        R                  R                  [        R                  R
                  [        R                  R                  [        R                  R                  [        R                  R                  [        R                  R                  5      $ r   )
r   r   xpuFloatTensor
ByteTensor	IntTensor
LongTensor
HalfTensorDoubleTensorBFloat16Tensorr   s    r   is_torch_xpu_tensorr(   1   sm    														  	 	r   c                 "    [        U [        5      $ r   )r   r   tensor_infos    r   is_tensor_informationr,   >   s    k#455r   c                 n    [        U [        5      =(       a    [        U S5      =(       a    [        U S5      $ )z
Checks if `data` is a `namedtuple` or not. Can have false positives, but only if a user is trying to mimic a
`namedtuple` perfectly.
_asdict_fields)r   tuplehasattrdatas    r   is_namedtupler4   B   s*    
 dE"\wtY'?\GDR[D\\r   c                 r    [        U 5      (       a  [        U 5      " [        U5      6 $ [        U 5      " U5      $ )zG
Cast a generator to the same type as obj (list, tuple, or namedtuple)
)r4   typelist)obj	generators     r   
honor_typer:   J   s1    
 SCy$y/**Cy##r   F	test_typeerror_on_other_typec                  ^ ^^^^ [        U[        [        45      (       a  [        UUUU UU4S jU 5       5      $ [        U[        5      (       aF  [        U5      " UR                  5        VVs0 s H  u  pgU[        T U/TQ7TTS.TD6_M     snn5      $ T" U5      (       a  T " U/TQ70 TD6$ T(       a2  [        S[        U5       ST R                   STR                   S35      eU$ s  snnf )a  
Recursively apply a function on a data structure that is a nested list/tuple/dictionary of a given base type.

Args:
    func (`callable`):
        The function to recursively apply.
    data (nested list/tuple/dictionary of `main_type`):
        The data on which to apply `func`
    *args:
        Positional arguments that will be passed to `func` when applied on the unpacked data.
    main_type (`type`, *optional*, defaults to `torch.Tensor`):
        The base type of the objects to which apply `func`.
    error_on_other_type (`bool`, *optional*, defaults to `False`):
        Whether to return an error or not if after unpacking `data`, we get on an object that is not of type
        `main_type`. If `False`, the function will leave objects of types different than `main_type` unchanged.
    **kwargs (additional keyword arguments, *optional*):
        Keyword arguments that will be passed to `func` when applied on the unpacked data.

Returns:
    The same data structure as `data` with `func` applied to every object of type `main_type`.
c              3   J   >#    U  H  n[        TU/TQ7TTS .TD6v   M     g7f)r;   Nrecursively_apply).0oargsr=   funckwargsr<   s     r   	<genexpr>$recursively_apply.<locals>.<genexpr>n   sA       A "!".7M`dj s    #r;   zUnsupported types (z) passed to `z?`. Only nested list/tuple/dicts of objects that are valid for `z` should be passed.)
r   r0   r7   r:   r   r6   itemsrA   	TypeError__name__)rE   r3   r<   r=   rD   rF   kvs   ` ````  r   rA   rA   U   s   , $&&  	
 	
 
D'	"	"Dz
 !JJL	 )DA $!".7M`dj  )	
 	
 
4D*4*6**	!$t*]4==/ J++4+=+=*>>QS
 	
 Ks   . C*
c                   ^^^ [        U 5      (       d  [        U S5      (       a  TS:X  a  Sm U R                  TTS9$ [        U [        [        45      (       a  [        U UUU4S jU  5       5      $ [        U [        5      (       ad  [        T[        5      (       a  T/mOTc  / m[        U 5      " U R                  5        VVs0 s H  u  pVXUT;   a  UO[        UTTTS9_M     snn5      $ U $ ! [         a    U R                  T5      s $ [         a5  n[        5       (       a  [        T[        5      (       a  ST 3m SnAO
UeSnAff = f U R                  TTS9$ ! [         a    U R                  T5      s $ f = fs  snnf )	aO  
Recursively sends the elements in a nested list/tuple/dictionary of tensors to a given device.

Args:
    tensor (nested list/tuple/dictionary of `torch.Tensor`):
        The data to send to a given device.
    device (`torch.device`):
        The device to send the data to.
    non_blocking (`bool`, *optional*, defaults to `False`):
        If `True`, the transfer to the device is performed asynchronously, which can overlap
        data movement with computation. Only effective when the device supports it (e.g. CUDA).
    skip_keys (`str` or `List[str]`, *optional*):
        A key or list of keys in a dictionary `tensor` whose values should not be sent to
        the given `device`. Entries with these keys are left on their original device.

Returns:
    The same data structure as `tensor` with all tensors sent to the proper device.
tonpuznpu:0)non_blockingznpu:Nc              3   <   >#    U  H  n[        UTTTS 9v   M     g7f)rQ   	skip_keysN)send_to_device)rB   tdevicerQ   rT   s     r   rG   !send_to_device.<locals>.<genexpr>   s      ohncd^AvLT]^hns   rS   )r   r1   rO   rJ   AssertionErrorr   r   intr0   r7   r:   r   strr6   rI   rU   )r   rW   rQ   rT   errorrL   rV   s    ```   r   rU   rU      s{   & v'&$"7"7U?F	99V,9?? 
FUDM	*	*ohno
 	
 
FG	$	$i%%"IIF| #LLN*DA Y1N1fS_kt,uu*
 	
 =  	%99V$$ 	  !!fc**#F8_F		%99V,9?? 	%99V$$	%s;   C,  E<
,E		E)E EEE E98E9c                     S n[        X5      $ )a/  
Recursively gathers the information needed to rebuild a nested list/tuple/dictionary of tensors.

Args:
    data (nested list/tuple/dictionary of `torch.Tensor`):
        The data to send to analyze.

Returns:
    The same data structure as `data` with [`~utils.TensorInformation`] instead of tensors.
c                 >    [        U R                  U R                  S9$ )N)shapedtype)r   r_   r`   r   s    r   _get_data_structure/get_data_structure.<locals>._get_data_structure   s     v||6<<HHr   r@   )r3   ra   s     r   get_data_structurerc      s    I 077r   c                     S n[        X5      $ )a  
Recursively gathers the shape of a nested list/tuple/dictionary of tensors as a list.

Args:
    data (nested list/tuple/dictionary of `torch.Tensor`):
        The data to send to analyze.

Returns:
    The same data structure as `data` with lists of tensor shapes instead of tensors.
c                 ,    [        U R                  5      $ r   )r7   r_   r   s    r   
_get_shapeget_shape.<locals>._get_shape   s    FLL!!r   r@   )r3   rf   s     r   	get_shaperh      s    " Z..r   c                 $    S n[        X[        S9$ )z
Recursively initializes tensors from a nested list/tuple/dictionary of [`~utils.TensorInformation`].

Returns:
    The same data structure as `data` with tensors instead of [`~utils.TensorInformation`].
c                 V    [         R                  " U R                  SU R                  06$ Nr`   )r   emptyr_   r`   r*   s    r   _initialize_tensor.initialize_tensors.<locals>._initialize_tensor   s"    {{K--G[5F5FGGr   r<   )rA   r,   )data_structurerm   s     r   initialize_tensorsrq      s    H /K`aar   c                    [        U [        [        [        45      (       a'  [	        U 5      S:X  a  [        S[        U 5       S35      e[        U [        [        45      (       a  [        U S   5      $ [        U [        5      (       a%  U R                  5        H  n[        X   5      s  $    O7[        U [        R                  5      (       d  [        S[        U 5       S35      eU R                  S   $ )z
Recursively finds the batch size in a nested list/tuple/dictionary of lists of tensors.

Args:
    data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

Returns:
    `int`: The batch size.
r   z&Cannot find the batch size from empty .z0Can only find the batch size of tensors but got )r   r0   r7   r   len
ValueErrorr6   find_batch_sizekeysr   r   rJ   r_   )r3   rL   s     r   rv   rv      s     $g.//SY!^A$t*QOPP$&&tAw''	D'	"	"A"47++ ell++J4PT:,VWXYY::a=r   c                 F     [        U 5      $ ! [        [        4 a     gf = f)a   
Same as [`utils.operations.find_batch_size`] except will ignore if `ValueError` and `TypeErrors` are raised

Args:
    data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

Returns:
    `int`: The batch size.
N)rv   ru   rJ   r2   s    r   ignorant_find_batch_sizery     s,    t$$	" s   
   c                     S n[        X5      $ )a;  
Recursively finds tensors in a nested list/tuple/dictionary and converts them to a list of numbers.

Args:
    data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to convert to regular numbers.

Returns:
    The same data structure as `data` with lists of numbers instead of `torch.Tensor`.
c                     U R                  5       R                  5       n U R                  [        R                  :X  a  U R                  [        R                  5      n U R                  5       $ r   )detachcpur`   r   bfloat16rO   float32tolistr   s    r   _convert_to_list!listify.<locals>._convert_to_list'  sF    $$&<<5>>) YYu}}-F}}r   r@   )r3   r   s     r   listifyr     s     -44r   c                 J    S n[        XSS9n[        R                  " 5         U$ )Nc                     U R                   S:X  a  U R                  5       S    n U R                  5       (       d  U R                  5       n [        R
                  " U 5      $ )Nr   )ndimcloneis_contiguous
contiguousxm
all_gatherr   s    r   _tpu_gather_one$_tpu_gather.<locals>._tpu_gather_one4  sL    ;;!\\^D)F ##%%&&(F}}V$$r   Tr=   )rA   r   	mark_step)r   r   ress      r   _tpu_gatherr   3  s#    % O
NCLLNJr   c                   ^^ [        5       m[        R                  R                  mTR                  R
                  S:X  a/  [        SS5      (       a  [        R                  R                  5         UU4S jn[        XSS9$ )Nr    z<=z2.8c                   > U R                   S:X  a  U R                  5       S    n U R                  5       (       d  U R                  5       n TR                  b  TR                  S:w  aq  [
        R                  " TR                  U R                  5       -  U R                  TR                  S9nT" X5        UR                  " S/U R                  5       SS  Q76 $ [        TR                  5       Vs/ s H  n[
        R                  " U 5      PM     nn[
        R                  R!                  X5        [
        R"                  " USS9$ s  snf )Nr   gloor`   rW   r   dim)r   r   r   r   backendr   rl   num_processesnumelr`   rW   viewsizerange
empty_likedistributedr   cat)r   output_tensors_	gather_opstates      r   _gpu_gather_one$_gpu_gather.<locals>._gpu_gather_oneJ  s   ;;!\\^D)F ##%%&&(F==$&)@
 #[[##flln4ll||N
 n-!&&r>FKKM!",=>>
 AFeFYFY@Z[@Z1e..v6@ZN[((@99^33 \s   / ETr   )
r   r   r   all_gather_into_tensorrW   r6   r   r    synchronizerA   )r   r   r   r   s     @@r   _gpu_gatherr   B  s`    NE!!88I ||E!&6tU&C&C		48 _$OOr   c                       \ rS rSrSrSrg)DistributedOperationExceptionii  z}
An exception class for distributed operations. Raised if the operation cannot be performed due to the shape of the
tensors.
 N)rK   
__module____qualname____firstlineno____doc____static_attributes__r   r   r   r   r   i  s    
 	r   r   c                 0   ^  [        T 5      U 4S j5       nU$ )zn
Verifies that `tensor` is the same shape across all processes. Only ran if `PartialState().debug` is `True`.
c                  R  > [        5       R                  [        R                  :X  d  [        5       R                  (       d  T
" U 0 UD6$ T
R
                   ST
R                   3nSU;   a  US   nOU S   n[        5       R                  R                  [        U5      R                  :w  ag  [        SU SUR                  R                   S[        5       R                  R                   S[        5       R                  R                   SU S35      e[        U5      n[        U/5      nUS   bm  UR                  US   5      [        U5      :H  nU(       dF  S	R                  [!        U5       VVs/ s H  u  pxS
U SU 3PM     snn5      n	[        SU SU	 35      eT
" U 0 UD6$ s  snnf )Nrs   r   r   z%One or more of the tensors passed to z were not on the z+ while the `Accelerator` is configured for z. Please move it to the z before calling z
  - zProcess z: znCannot apply desired operation due to shape mismatches. All shapes across devices must be valid.

Operation: `z`
Input shapes:
  - )r   distributed_typer   NOdebugr   rK   rW   r6   find_devicer   rh   gather_objectcountrt   join	enumerate)rD   rF   	operationr   shapesoutputare_sameir_   process_shape_strfunctions             r   wrapper!verify_operation.<locals>.wrapperw  s   >**o.@.@@H\H\T,V,,**+1X->->,?@	vH%F!WF>  %%V)<)A)AA/7	{BSTZTaTaTfTfSg  hS  T`  Tb  Ti  Ti  Tn  Tn  So o))5)>)>)C)C(DDTU^T__`b  6"x(!9 ||F1I.#f+=H$,MM[dek[l2m[lxqXaS5'3J[l2m$n!3''0k1GHYGZ\ 
 ((( 3ns   .F#
r   r   r   s   ` r   verify_operationr   r  s"    
 8_) )4 Nr   c                 0   ^  [        T 5      U 4S j5       nU$ )z
Checks that `verify_operation` failed and if so reports a more helpful error chaining the existing
`DistributedOperationException`.
c                     >  T" U 0 UD6$ ! [          a0  nTR                   STR                   3n[        SU S35      UeS nAff = f)Nrs   zError found while calling `z1`. Please see the earlier error for more details.)r   r   rK   )rD   rF   er   r   s       r   r   "chained_operation.<locals>.wrapper  sc    	T,V,,, 	#../q1B1B0CDI/-i[8ij	s    
A+A  Ar   r   s   ` r   chained_operationr     s"     8_  Nr   c                     [        5       R                  [        R                  :X  a  [	        U 5      $ [        5       R                  [
        ;   a  [        U 5      $ U $ )a  
Recursively gather tensor in a nested list/tuple/dictionary of tensors from all devices.

Args:
    tensor (nested list/tuple/dictionary of `torch.Tensor`):
        The data to gather.

Returns:
    The same data structure as `tensor` with all tensors sent to the proper device.
)r   r   r   XLAr   r   r   r   s    r   gatherr     sF     ~&&/*=*==6""		(	(,M	M6""r   objectc           	      *   SSK n[        5       nUR                  nUR                  nUR	                  5       n[
        R                  " X5        UR                  5       n[        U5      n[        R                  " U/[        R                  US9n[        U5       V	s/ s H'  n	[        R                  " S[        R                  US9PM)     n
n	[        R                  R                  X5        [!        [#        S U
 5       5      5      nSnX:  a  US-  nX:  a  M  [        R$                  R'                  U5      n[        R(                  " U5      R+                  U5      nUR-                  U5        [        R.                  " X-  [        R0                  US9n[        U5       Vs/ s H  nXU-  UUS-   -   PM     nn[        R                  R                  UU5        / n[3        U5       H  u  nn[!        U
U   R5                  5       5      nUSU R7                  5       R9                  5       R;                  5       n[
        R<                  " U5      n[?        U[@        5      (       a  URC                  U5        M  URE                  U5        M     U$ s  sn	f s  snf )a  Gather picklable objects from all ranks with padded allgather sizes.

On Neuron devices every unique tensor shape triggers a new NEFF compilation. The standard
``all_gather_object`` sizes its byte tensor to the exact max pickle size, which varies per call
and causes unbounded compilation cache growth.  This variant rounds the allgather size up to
the next power of 2 so the number of distinct compiled shapes stays bounded to O(log(max_size)).
r   Nr   r   c              3   @   #    U  H  oR                  5       v   M     g 7fr   )item)rB   ss     r   rG   (_neuron_gather_object.<locals>.<genexpr>  s     3Avvxxs      )#ior   rW   r   BytesIOpickledumpgetvaluert   r   r   longr   zerosr   r   rZ   maxByteStorage_from_bufferr"   rO   resize_rl   uint8r   r   r}   numpytobytesloadsr   r7   extendappend)r   r   r   rW   
group_sizebuf	raw_bytes
local_sizelocal_size_tensorr   	size_listmax_sizepadded_sizebyte_storageinput_tensor	coalescedr   r   resultr   obj_size	obj_bytesr8   s                          r   _neuron_gather_objectr     s    NE\\F$$J **,C
KKIYJ j\FSJOPZJ[\J[QQejj@J[I\	  >33334H K

  
  $$11)<L##L144V<L% K4EKKPVWIRWXbRcdRcQia+Q2GHRcNd	  > F~.	6y|((*+9H%))+113;;=	ll9%c4  MM#MM# / M; ]  es   .JJc                     [        [        5       R                  5       Vs/ s H  nS PM     nn[        R                  R                  X 5        U VVs/ s H  o3  H  oDPM     M     snn$ s  snf s  snnf r   )r   r   r   r   r   all_gather_object)r   r   output_objectsyxs        r   _gpu_gather_objectr     sa    $),.*F*F$GH$Gqd$GNH	''?%1~!q!AqA~11 I 2s   A*A/c                    [        5       R                  [        R                  :X  a  [	        S5      e[        5       R                  [
        ;   a8  [        5       R                  R                  S:X  a  [        U 5      $ [        U 5      $ U $ )a  
Recursively gather object in a nested list/tuple/dictionary of objects from all devices.

Args:
    object (nested list/tuple/dictionary of picklable object):
        The data to gather.

Returns:
    The same data structure as `object` with all the objects sent to every device.
z&gather objects in TPU is not supportedneuron)
r   r   r   r   NotImplementedErrorr   rW   r6   r   r   )r   s    r   r   r     sh     ~&&/*=*==!"JKK		(	(,M	M>  %%1(00!&))r   c                 "    SS jn[        X SUS9$ )Nc                 @    [         R                  R                  XS9  U $ )Nsrc)r   r   	broadcast)r   r  s     r   _gpu_broadcast_one*_gpu_broadcast.<locals>._gpu_broadcast_one  s    ##F#4r   T)r=   r  r   r@   )r3   r  r  s      r   _gpu_broadcastr	    s     /4UXYYr   c                 l  ^^ [        U [        [        45      (       a  [        U U4S j[	        U 5       5       5      $ [        U [
        5      (       aB  [        U 5      " U R                  5        VVs0 s H  u  p4U[        UT SU 3S9_M     snn5      $ [        R                  " TU U4S j5      $ s  snnf )Nc              3   F   >#    U  H  u  p[        UT S U 3S9v   M     g7f)r   nameN)_tpu_broadcast)rB   r   rV   r  s      r   rG   !_tpu_broadcast.<locals>.<genexpr>  s&     "gUfTQ>!TF!A3-#HUfs   !r   r  c                    > U T   $ r   r   )r   r  s    r   <lambda> _tpu_broadcast.<locals>.<lambda>  s	    !C&r   )r   r7   r0   r:   r   r   r6   rI   r  r   mesh_reduce)r   r  r  rL   rM   s    ``  r   r  r    s    &4-((&"gU^_eUf"ghh	FG	$	$F|RXR^R^R`aR`$!Qq$q} EER`abb>>$(899 bs   0B0
                  	   
   c                    Sn[        5       n[        R                  " U[        R                  UR                  S9nU bT  U R
                  n[        U R                     n[        R                  " [        U5      U/-   [        S9US[        U5      S-   & [        USS9nX3R                  5          n[        USS S	   5      nUSS nX64$ )
z]
Grabs the shape of `tensor` only available on one process and returns a tensor of its shape
r   r   Nr`   r   sum	reductionr   r   )r   r   rl   rZ   rW   r_   TENSOR_TYPE_TO_INTr`   r   r7   rt   reducenonzero)r   max_tensor_dimensionr   base_tensorr_   tensor_dtyper`   s          r   gather_tensor_shaper'  .  s    
 !NE++2%))ELLYK
 )&,,7(-T%[L>5QY\(]$c%j1n%6K1134KBC #$Ecr"Kr   returnc                     [        5       n[        U 5      u  p#U c5  [        R                  " U[        U   S9R                  UR                  5      n [        U SS9$ )at  
Copies a tensor that only exists on a single device and broadcasts it to other devices. Differs from `broadcast` as
each worker doesn't need to know its shape when used (and tensor can be `None`)

Args:
    tensor (`torch.tensor`):
        The tensor that should be sent to all devices. Must only have it be defined on a single device, the rest
        should be `None`.
r  r  r  )r   r'  r   r   TENSOR_INT_TO_DTYPErO   rW   r"  )r   r   r_   r`   s       r   copy_tensor_to_devicesr+  G  sN     NE&v.LE~U*=e*DEHHV&E**r   from_processc                     [        5       R                  [        R                  :X  a
  [	        XSS9$ [        5       R                  [
        ;   a	  [        XS9$ U $ )a  
Recursively broadcast tensor in a nested list/tuple/dictionary of tensors to all devices.

Args:
    tensor (nested list/tuple/dictionary of `torch.Tensor`):
        The data to gather.
    from_process (`int`, *optional*, defaults to 0):
        The process from which to send the data

Returns:
    The same data structure as `tensor` with all tensors broadcasted to the proper device.
zaccelerate.utils.broadcast)r  r  r  )r   r   r   r   r  r   r	  )r   r,  s     r   r  r  X  sI     ~&&/*=*==f=YZZ		(	(,M	Mf77r   c                    SSK n[        5       nUR                  nUR                  U:X  aB  UR	                  5       n[
        R                  " X5        UR                  5       n[        U5      nOSnSn[        R                  " U/[        R                  US9n[        R                  R                  U5        [        UR                  5       5      n	Sn
X:  a  U
S-  n
X:  a  M  UR                  U:X  aV  [        R                   R#                  U5      n[        R$                  " U5      R'                  U5      nUR)                  U
5        O$[        R*                  " U
[        R,                  US9n[        R                  R/                  XS9  [
        R0                  " USU	 R3                  5       R5                  5       R7                  5       5      n[9        [        U 5      5       H	  nX   X'   M     U $ )a  Broadcast a list of picklable objects with padded tensor sizes.

On Neuron devices ``ProcessGroupNeuron.broadcast()`` is implemented as an allreduce,
so every unique tensor shape triggers a new NEFF compilation.  The standard
``broadcast_object_list`` sizes its byte tensor to the exact serialized length, which
varies per call and causes unbounded compilation cache growth.  This variant rounds the
broadcast tensor size up to the next power of 2, bounding the number of distinct
compiled shapes to ~O(log(max_size)).
r   Nr   r   r   r   r  )r   r   rW   process_indexr   r   r   r   rt   r   r   r   r   
all_reducerZ   r   r   r   r"   rO   r   r   r   r  r   r}   r   r   r   )object_listr,  r   r   rW   r   r   r   size_tensor	real_sizer   r   data_tensorr   r   s                  r   _neuron_broadcast_object_listr5  n  s    NE\\F l*jjlK%LLN	^
	
 ,,
|5::fMK	  -K$$&'I K

! 
! l*((55i@&&|477?K(kk+U[[P	> \\+jy1557==?GGIJF3{#$ %r   c                   ^ [        5       R                  [        R                  :X  a4  [	        U 5       H#  u  p#[
        R                  " SUU4S j5      X'   M%     U $ [        5       R                  [        ;   aL  [        5       R                  R                  S:X  a  [        U TS9  U $ [        R                  R                  U TS9  U $ )ar  
Broadcast a list of picklable objects from one process to the others.

Args:
    object_list (list of picklable objects):
        The list of objects to broadcast. This list will be modified inplace.
    from_process (`int`, *optional*, defaults to 0):
        The process from which to send the data.

Returns:
    The same list containing the objects from process 0.
z&accelerate.utils.broadcast_object_listc                    > U T   $ r   r   )r   r,  s    r   r  'broadcast_object_list.<locals>.<lambda>  s    efgsetr   r   )r,  r  )r   r   r   r   r   r   r  r   rW   r6   r5  r   r   broadcast_object_list)r1  r,  r   r8   s    `  r   r9  r9    s     ~&&/*=*==,FA^^,TVY[tuKN -  
	(	(,M	M>  %%1)+LQ  33K\3Rr   c                      S n[        X@U5      $ )a*  
Recursively takes a slice in a nested list/tuple/dictionary of tensors.

Args:
    data (nested list/tuple/dictionary of `torch.Tensor`):
        The data to slice.
    tensor_slice (`slice`):
        The slice to take.

Returns:
    The same data structure as `data` with all the tensors slices.
c                 
    X   $ r   r   )r   tensor_slices     r   _slice_tensor$slice_tensors.<locals>._slice_tensor  s    ##r   r@   )r3   r<  r/  r   r=  s        r   slice_tensorsr?    s    $ ],??r   c                   ^ ^ [        T S   [        [        45      (       a/  [        T S   U U4S j[	        [        T S   5      5       5       5      $ [        T S   [        5      (       aS  [        T S   5      " T S   R                  5        VVs0 s H   o"[        T  Vs/ s H  o3U   PM	     snTS9_M"     snn5      $ [        T S   [        R                  5      (       a  [        R                  " T TS9$ [        T [        [        45      (       a  [        T 5      S:X  a  T S   $ [        S[        T S   5       35      es  snf s  snnf )a  
Recursively concatenate the tensors in a nested list/tuple/dictionary of lists of tensors with the same shape.
If there is only a single batch of data, it is returned as-is.

Args:
    data (nested list/tuple/dictionary of lists of tensors `torch.Tensor`):
        The data to concatenate.
    dim (`int`, *optional*, defaults to 0):
        The dimension on which to concatenate.

Returns:
    The same data structure as `data` with all the tensors concatenated.
r   c              3   h   >#    U  H"  n[        T Vs/ s H  o"U   PM	     snTS 9v   M$     gs  snf 7f)r   N)concatenate)rB   r   dr3   r   s      r   rG   concatenate.<locals>.<genexpr>  s/     #lXkSTKt0Dt!1t0D#$NXk0Ds   2-
2r   r   z%Can only concatenate tensors but got )r   r0   r7   r:   r   rt   r   r6   rw   rB  r   r   r   rJ   )r3   r   rL   rC  s   ``  r   rB  rB    s    $q'E4=))$q'#lX]^abfghbi^jXk#lmm	DGW	%	%DG}UYZ[U\UaUaUcdUcPQD-ADqdD-As!KKUcdee	DGU\\	*	*yy3''	D5$-	(	(SY!^Aw?T!WOPP .Bds   E 
D;)
E 
;E 
c                       \ rS rSrSrg)CannotPadNestedTensorWarningi  r   N)rK   r   r   r   r   r   r   r   rF  rF    s    r   rF  c           	      $    SS jn[        X@SXUS9$ )a  
Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so they
can safely be gathered.

Args:
    tensor (nested list/tuple/dictionary of `torch.Tensor`):
        The data to gather.
    dim (`int`, *optional*, defaults to 0):
        The dimension on which to pad.
    pad_index (`int`, *optional*, defaults to 0):
        The value with which to pad.
    pad_first (`bool`, *optional*, defaults to `False`):
        Whether to pad at the beginning or the end.
c                   ^^	^
 [        U SS5      (       a  [        R                  " S[        5        U $ T[	        U R
                  5      :  d  T[	        U R
                  5      * :  a  U $ TS:  a  T[	        U R
                  5      -  m[        R                  " U R
                  U R                  S9S    n[        U5      R                  5       n[        U4S jU 5       5      m	T	U R
                  T   :X  a  U $ U R
                  m
[        T
5      nT	UT'   U R                  [        U5      5      U-   nU(       a*  [        UU	U
4S j[        [	        U5      5       5       5      nO([        UU
4S j[        [	        U5      5       5       5      nXU'   U$ )	N	is_nestedFzHCannot pad nested tensors without more information. Leaving unprocessed.r   )rW   c              3   ,   >#    U  H	  oT   v   M     g 7fr   r   )rB   r   r   s     r   rG   Fpad_across_processes.<locals>._pad_across_processes.<locals>.<genexpr>  s     -u!vus   c              3   h   >#    U  H'  oT:X  a  [        TTT   -
  T5      O
[        S 5      v   M)     g 7fr   slice)rB   r   r   r   old_sizes     r   rG   rK    s4      `t[\#Xh#.95QU;V`ts   /2c              3   b   >#    U  H$  oT:X  a  [        S TT   5      O
[        S5      v   M&     g7fr   NrM  rB   r   r   rO  s     r   rG   rK    s,     oZnUVCxE!Xc]3U4[PZn   ,/)getattrwarningswarnrF  rt   r_   r   r   rW   r   r}   r   r7   	new_zerosr0   r   )r   r   	pad_index	pad_firstr   sizesnew_size
new_tensorindicesr   rO  s    `       @@r   _pad_across_processes3pad_across_processes.<locals>._pad_across_processes  sL   6;..MMZ, M#fll##sc&,,.?-?'?M73v||$$C ||FLL?Et  "-u--v||C((M<<> %%eHo6B
 `efijrfs`t G oZ_`cdl`mZnooG$7r   T)r=   r   rX  rY  r   r   Fr@   )r   r   rX  rY  r^  s        r   pad_across_processesra    s!    " D 4Sir r   c           	      (    SS jn[        UU SUUUS9$ )z
Takes a `tensor` of arbitrary size and pads it so that it can work given `num_processes` needed dimensions.

New tensors are just the last input repeated.

E.g.:
  Tensor: ([3,4,4]) Num processes: 4 Expected result shape: ([4,4,4])

c                 <  ^^
 X-  nXU-  -
  nX-  S:X  a  X!-
  nOX!U-  -
  nXVU-  s=:  a  S:  a  O  OXV-
  nU R                   m
[        T
5      nX-   US'   U R                  [        U5      5      n[        UU
4S j[	        [        U5      5       5       5      n	XU	'   U$ )Nr   r   c              3   b   >#    U  H$  oT:X  a  [        S TT   5      O
[        S5      v   M&     g7frQ  rM  rR  s     r   rG   @pad_input_tensors.<locals>._pad_input_tensors.<locals>.<genexpr>?  s,     kVjQR8a#/tLVjrS  )r_   r7   rW  r0   r   rt   )r   
batch_sizer   r   	remainderlast_inputsto_padr[  r\  r]  rO  s      `      @r   _pad_input_tensors-pad_input_tensors.<locals>._pad_input_tensors0  s    /	 $=>&!+"/F"M&ABF &,1, )F<<> )%%eHo6
kV[\_`h\iVjkk$7r   T)r=   rf  r   r   r  r@   )r   rf  r   r   rj  s        r   pad_input_tensorsrl  %  s(    &  # r   c                 "    SS jn[        X0SXS9$ )a.  
Recursively reduce the tensors in a nested list/tuple/dictionary of lists of tensors across all processes by the
mean of a given operation.

Args:
    tensor (nested list/tuple/dictionary of `torch.Tensor`):
        The data to reduce.
    reduction (`str`, *optional*, defaults to `"mean"`):
        A reduction method. Can be of "mean", "sum", "max", or "none"
    scale (`float`, *optional*):
        A default scaling value to be applied after the reduce, only valid on XLA.

Returns:
    The same data structure as `data` with all the tensors reduced.
c                 v   [        5       nU R                  5       nUR                  [        R                  :X  a  U$ UR                  [        R
                  :X  ai  [        R                  " 5         US:X  a  [        R                  O[        R                  n[        R                  " XT/U5        [        R                  " 5         OcUR                  R                  [        ;   aE  US:X  a  [        R                  O[        R                  n[         R"                  R                  XF5        US:X  a  XCR$                  -  nU$ )Nr   mean)r   r   r   r   r   r   r   r   
REDUCE_MAX
REDUCE_SUMr0  valuer   r   MAXSUMr   r   r   )r   r   scaler   cloned_tensorxla_optorch_ops          r   _reduce_across_processes(reduce.<locals>._reduce_across_processes_  s    !!_%7%77  !!_%8%88
 LLN&/5&8R]]bmmFMM&/59LLN##))-NN'0E'9x||x||H((A000Mr   T)r=   r   ru  ro  g      ?r@   )r   r   ru  ry  s       r   r"  r"  M  s    $*  di r   c                 "    S nS n[        XUS9$ )aZ  
Recursively converts the elements nested list/tuple/dictionary of tensors in FP16/BF16 precision to FP32.

Args:
    tensor (nested list/tuple/dictionary of `torch.Tensor`):
        The data to convert from FP16/BF16 to FP32.

Returns:
    The same data structure as `tensor` with all tensors that were in FP16/BF16 precision converted to FP32.
c                 "    U R                  5       $ r   )floatr   s    r   _convert_to_fp32)convert_to_fp32.<locals>._convert_to_fp32  s    ||~r   c                     [        U 5      =(       d    [        U S5      =(       a-    U R                  [        R                  [        R
                  4;   $ rk   )r   r1   r`   r   float16r~   r   s    r   _is_fp16_bf16_tensor-convert_to_fp32.<locals>._is_fp16_bf16_tensor  s@    'C767+C 
MMNNZ
 J
 	
r   ro   r@   )r   r  r  s      r   convert_to_fp32r  y  s    
 -AUVVr   c                   *    \ rS rSrSrS rS rS rSrg)ConvertOutputsToFp32i  aE  
Decorator to apply to a function outputting tensors (like a model forward pass) that ensures the outputs in FP16
precision will be convert back to FP32.

Args:
    model_forward (`Callable`):
        The function which outputs we want to treat.

Returns:
    The same function as `model_forward` but with converted outputs.
c                 &    Xl         [        X5        g r   )model_forwardr   )selfr  s     r   __init__ConvertOutputsToFp32.__init__  s    *t+r   c                 8    [        U R                  " U0 UD65      $ r   )r  r  )r  rD   rF   s      r   __call__ConvertOutputsToFp32.__call__  s    t114B6BCCr   c                 .    [         R                  " S5      e)NzCannot pickle a prepared model with automatic mixed precision, please unwrap the model with `Accelerator.unwrap_model(model)` before pickling it.)r   PicklingError)r  s    r   __getstate__!ConvertOutputsToFp32.__getstate__  s    "" `
 	
r   )r  N)	rK   r   r   r   r   r  r  r  r   r   r   r   r  r    s    
,D
r   r  c                 8   ^  [        T 5      m U 4S jnT Ul        U$ )Nc                     > T" U 0 UD6$ r   r   )rD   rF   r  s     r   forward(convert_outputs_to_fp32.<locals>.forward  s    d-f--r   )r  __wrapped__)r  r  s   ` r   convert_outputs_to_fp32r    s!    (7M. (GNr   c                 F   [        U [        5      (       a*  U R                  5        H  n[        U5      nUc  M  Us  $    g[        U [        [
        45      (       a  U  H  n[        U5      nUc  M  Us  $    g[        U [        R                  5      (       a  U R                  $ g)z
Finds the device on which a nested dict/list/tuple of tensors lies (assuming they are all on the same device).

Args:
    (nested list/tuple/dictionary of `torch.Tensor`): The data we want to know the device of.
N)	r   r   valuesr   r0   r7   r   r   rW   )r3   r8   rW   s      r   r   r     s     $  ;;=C %F! ! 
D5$-	(	(C %F!  
D%,,	'	'{{ 
(r   c              #   b  #    [        5       R                  [        R                  :w  d<  [        5       R                  b2  [        5       R                  R                  5       (       d  [        5       nOSSKnUR                  R                  XX#S9nU   Sv   SSS5        g! , (       d  f       g= f7f)z|
Wrapper around `deepspeed.runtime.zero.GatheredParameters`, but if Zero-3 is not enabled, will be a no-op context
manager.
Nr   )modifier_rank
fwd_moduleenabled)
r
   r   r   	DEEPSPEEDdeepspeed_pluginis_zero3_init_enabledr   	deepspeedzeroGatheredParameters)paramsr  r  r  gather_param_contextr  s         r   r  r    s      **o.G.GG++7 "33IIKK*}(~~@@J  A  
 
 
		s   BB/B	B/
B,(B/)FNr  )r   zbroadcast tensorr   )NNr`  r{  )NNT)[r   r   rU  collections.abcr   
contextlibr   r   	functoolsr   r   typingr   r   r   r
   r   	constantsr   dataclassesr   r   importsr   r   r   versionsr   torch_xla.core.xla_modelcore	xla_modelr   torch.distributedr   r   r(   r,   r4   r:   rA   rU   rc   rh   rq   rv   ry   r   r   r   	Exceptionr   r   r   r   r   r   r   r	  r  r~  doublehalfr~   r   int8int16int32int64boolr!  rI   r*  r'  r   r+  rZ   r  r5  r9  r?  rB  UserWarningrF  ra  rl  r"  r  r  r  r   r  )rL   rM   s   00r   <module>r     s[     # 2 +   2 8 ; 
 ' ))!##*,
6]$ 4CX] 0f7t8$/$b."5.$PN	I 	 F&  &3# 3l2s 2# *Z: 
KK	LL!	JJ	NNA	KK	JJ	KK	KK	KK	JJ  );(@(@(BC(Bqt(BC 2+5<< +" C  *2S 2jS 0@(Q4	; 	 4 4n%P ( (VW0
 
4	*  E Ds   1H