
    3j U                     >   S SK r S SKrS SKJr  S SKJr  S SKrS SKJr  S SK	J
r
  S SKJr  S SKJr  S SKJrJrJr  / S	Qr\ R(                  " \5      r S
r " S S5      r " S S\5      r\" \R4                  " S5      \R6                  5      rS r " S S5      r " S S5      rS\S\ S\!\   4S jr"S\RF                  S\S\ S\\RF                     4S jr$S r%  S$S\&\S4   S\'\(\4   S-  S\ S \&\S4   S-  S!\'\(\4   S-  S\&\!\&   \!\'   4   4S" jjr)S\!\   4S# jr*g)%    N)Sequence)Any)DTensor)	local_mapmap_aggregate)	BlockMask)tree_flattentree_maptree_unflatten)TensorChunkSpecsplit_args_kwargs_into_chunksmerge_chunksFc                       \ rS rSrSrS rSrg)_CustomReducer    a   
Custom reducer class that can be used to specify a custom operation that
reduces losses of multiple microbatches into one value.

Example:
>>> # xdoctest: +SKIP
>>> sum_reducer = _CustomReducer(
>>>     torch.tensor(0.0),
>>>     lambda a, b: a + b
>>> )
c                     Xl         X l        g N
init_value	reduce_fn)selfr   r   s      a/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/distributed/pipelining/microbatch.py__init___CustomReducer.__init__-   s    $"    r   N)__name__
__module____qualname____firstlineno____doc__r   __static_attributes__ r   r   r   r       s    
#r   r   c                       \ rS rSrSrg)_LossReducer2   r#   Nr   r   r   r    r"   r#   r   r   r%   r%   2       r   r%   g        c                   z    \ rS rSr% SrS r\\S'   S rS r	\
S\\S4   4S	 j5       r\
S\\\4   4S
 j5       rSrg)r   =   z*
Class used to specify chunking of inputs
c                     Xl         g r   	split_dim)r   r-   s     r   r   TensorChunkSpec.__init__B   s    "r   r-   c                 |    U R                   R                   SU R                   R                   SU R                   S3$ )N.())	__class__r   r   r-   r   s    r   __repr__TensorChunkSpec.__repr__G   s9    ~~(()4>>+B+B*C1T^^DTTUV	
r   c                 "    SU R                    S3$ )NzTensorChunkSpec(r2   r,   r4   s    r   __str__TensorChunkSpec.__str__L   s    !$..!133r   
chunk_dims.c                      [        U S 5      nU$ )aJ  
A helper for creating a tuple of `TensorChunkSpec` from a tuple of chunk
dimensions (int's).
Example:
    >>> # xdoctest: +SKIP
    >>> # There are three positional arguments to the model, and
    >>> # we are chunking them along dimension 0, 0 and 1, respectively
    >>> args_chunk_spec = TensorChunkSpec.from_tuple((0, 0, 1))
c                     [        U 5      $ r   r   dims    r   <lambda>,TensorChunkSpec.from_tuple.<locals>.<lambda>^   	    ,r   r   )r:   args_chunk_specs     r   
from_tupleTensorChunkSpec.from_tupleO   s     (,
 r   c                      [        U S 5      nU$ )a$  
A helper for creating a dictionary of `TensorChunkSpec` from a
dictionary of chunk dimensions (int's).
Example:
    >>> # xdoctest: +SKIP
    >>> # Chunk dimension 0 for the "id" argument, 1 for the "mask" argument
    >>> kwargs_chunk_spec = TensorChunkSpec.from_dict({"id": 0, "mask": 1})
c                     [        U 5      $ r   r=   r>   s    r   r@   +TensorChunkSpec.from_dict.<locals>.<lambda>p   rB   r   r   )r:   kwargs_chunk_specs     r   	from_dictTensorChunkSpec.from_dictb   s     *,
 ! r   r,   N)r   r   r   r    r!   r   int__annotations__r5   r8   staticmethodtuplerD   dictstrrJ   r"   r#   r   r   r   r   =   se    # N

4 #s(O $ !cN! !r   r   c                       \ rS rSrSrg)
_Replicatev   r#   Nr'   r#   r   r   rS   rS   v   r(   r   rS   
block_mask
num_chunksreturnc                   ^  T R                   R                  S5      S:X  a  T /U-  $ T R                   R                  S5      U:  d  [        S5      eSn[        R                  " T R                   X5      n[        R                  " T R
                  X5      nT R                  b!  [        R                  " T R                  X5      OS/U-  nT R                  b!  [        R                  " T R                  X5      OS/U-  n/ nSn[        U5       Hj  n	U 4S jn
UR                  [        R                  " X9   XI   XY   Xi   T R                  U
" U5      T R                  S95        XU	   R                  S5      -  nMl     U$ )zGiven a block mask, split the block mask along the batch dimension (dim0).

Args:
    block_mask: Block mask to split
    num_chunks: Number of chunks to split the block mask into

Returns:
    chunk_block_masks: List of chunked block masks
r      z;Block mask has fewer batch size than the number of chunks. Nc                    >^  UU 4S jnU$ )Nc                 \   > [         R                  " U T5      nTR                  X-   XU5      $ r   )torch	full_likemask_mod)bhq_idxkv_idxb_offsetrU   idxs        r   batch_offset_mask_modI_split_block_mask.<locals>.create_mask_mod.<locals>.batch_offset_mask_mod   s*     ??1c2!**1<6JJr   r#   )rd   re   rU   s   ` r   create_mask_mod*_split_block_mask.<locals>.create_mask_mod   s    K )(r   )kv_num_blocks
kv_indicesfull_kv_num_blocksfull_kv_indices
BLOCK_SIZEr^   seq_lengths)ri   sizeAssertionErrorr\   tensor_splitrj   rk   rl   rangeappendr	   from_kv_blocksrm   rn   )rU   rV   	batch_dimkv_num_blocks_chunkskv_indices_chunksfull_kv_num_blocks_chunksfull_kv_indices_chunkschunk_block_masksbatch_offset	chunk_idxrg   s   `          r   _split_block_maskr}   z   s    $$Q'1,|j((##((+z9I
 	
 I --  * **:+@+@*X ((4 	:88*PVj   %%1 	:55zMVj   L:&		) 	  $$2=,7#<#G 6 A%00(6&22
	
 	Y7<<Q??) '* r   tensorspecc                   ^^ U R                  TR                  5      T:  d(  [        SU R                  TR                  5       S35      e[        U [        5      nU(       a*  U R
                  n[        UU4S jU4T-  U4S9nU" U 5      nO"[        R                  " U TTR                  5      nU R                  (       a*  U R                  (       a  U H  nUR                  5         M     [        (       d  U$ S[        R                  S[        R                  S[        [        R                  S4   4U4S	 jjnU(       a@  U R
                  n[        U5      n	[        UU4U	-  U4U4U	-  -   S9n
[!        U
" U /UQ76 5      $ [!        U" U /UQ76 5      $ )
zGiven a tensor, and a chunking spec, split the tensor.
Args:

    tensor: Tensor to split
    spec: Chunking spec
    num_chunks: Number of chunks to split the tensor into

Returns:
    chunk_tensors: List of chunked tensors
zTensor size z is smaller than num_chunksc                 H   > [         R                  " U TTR                  5      $ r   )r\   rq   r-   )trV   r   s    r   r@   _split_tensor.<locals>.<lambda>   s    e((JGr   out_placementsin_placementsorigchunksrW   .c                 `  > / nSnU H  n[         R                  " U 5      nX4R                  TR                  5      -   n[	        S 5      /UR
                  -  n[	        X65      UTR                  '   XEU'   UR                  U5        X4R                  TR                  5      -  nM     [        U5      $ )Nr   )r\   
zeros_likero   r-   slicendimrs   rO   )	r   r   expandedrd   chunknew_valupperslicesr   s	           r   _expand_chunks%_split_tensor.<locals>._expand_chunks   s     E&&t,G**T^^44E#(;-',,">F%*3%6F4>>"#FOOOG$::dnn--C  Xr   )ro   r-   rp   
isinstancer   
placementsr   r\   rq   requires_gradis_leafretain_grad_debug_mask_minibatchesTensorrO   lenlist)r~   r   rV   _is_dtensorr   split_fnchunk_tensorsr   r   n	expand_fns    ``        r   _split_tensorr      sv     ;;t~~&*46;;t~~677RS
 	
 VW-K
 &&
G&=:5%-

 190@**6:t~~N
 "E # #"ll%*\\	u||S 	! &&
&=1,%-:-!*;;
	
 If5}566N6:M:;;r   c           	      z   U (       d  [        U5       Vs/ s H  n0 PM     sn$ [        U 5      [        U5      :X  d?  [        S[        U R	                  5       5       S[        UR	                  5       5       35      eUc  [        S5      e[        U S S9u  pE[        US S9u  pc/ n[        XFSS9 GHy  u  pU	[        L d  [        U	[        5      (       a  UR                  U5        M7  [        U[        R                  5      (       aX  [        U	[        5      (       d  [        S	[        U	5       35      eUR                  UR                  U	R                  5      5        M  [        U[         5      (       a  [        U	[        5      (       d  [        S	[        U	5       35      eU	R                  S
:X  d  [        S5      eUR"                  R                  S
5      S:X  a  UR                  U5        GM=  UR                  UR"                  R                  S
5      5        GMj  [%        SU	 SU S35      e   ['        / UQUP76 n
[        U
5       Vs/ s H  n/ PM     nn[        XFSS9 H  u  p/ nU	[        L d  [        U	[        5      (       a  U/U
-  nO_[        U[        R                  5      (       a  [)        XU
5      nO3[        U[         5      (       a  [+        X5      nO[%        SU	 SU S35      e[        XSS9 H  u  pUR                  U5        M     M     U Vs/ s H  n[-        X5      PM     sn$ s  snf s  snf s  snf )a3  
Given a dictionary of args, and a dictionary of chunking specs, shard the
args according to the chunking specs.

Args:
    args_dict: Dictionary of args
    args_chunk_spec: Dictionary of chunking specs
    num_chunks: Number of chunks to shard the args into

Returns:
    args_split: List of sharded args
zargs_dict.keys() = z args_chunk_spec.keys() = z.args_chunk_spec should have been set by callerc                 "    [        U [        5      $ r   r   r	   xs    r   r@   %_shard_dict_of_args.<locals>.<lambda>%  s    Z9%=r   r   c                 "    [        U [        5      $ r   r   r   s    r   r@   r   (  s    :a+Cr   TstrictzExpected TensorChunkSpec, got r   z#BlockMask only supports split_dim=0rY   zUnsupported chunk spec: z and value: z combination.)rr   r   rp   r   keysr
   ziprS   r   rs   r\   r   r   typero   r-   r	   ri   
ValueErrorminr   r}   r   )	args_dictrC   rV   _values	tree_specchunk_specssplit_sizesvr   result_num_chunksflat_split_resultsv_splits_flat_split_result_v_splits                  r   _shard_dict_of_argsr     s   $ !*-.-q-..y>S11!$y~~'7"8!9 :((,_-A-A-C(D'EG
 	
 MNN$=F "!CNK
 Kv48 :D*!=!=z*5<<((dO44$'Ed4j\%RSSqvvdnn569%%dO44$'Ed4j\%RSS>>Q&$%JKK##A&!+"":.""1??#7#7#:;*4&QC}M ) 9. 5[5*5167H1I$J1IAR1I$Jv48"$:D*!=!=s..H5<<(($Q.?@H9%%(>H*4&QC}M  -0-
( %%h/-
 9( #5"4 	)5"4  /X %K&s   L.L3L8args.kwargsr   rC   rI   c                 j  ^	 Uc  0 nS nUc  [        XPS S9nUc  [        XQS S9n[        [        [        U 5      5      [        [        U5      5      U5      n[	        U5      n[        UUU5      n[	        U5      U:  a<  [	        U5      n[        [        [        U 5      5      [        [        U5      5      U5      n[	        U5      [	        U5      :w  a#  [        S[	        U5       S[	        U5       35      eU V	^	s/ s H*  m	[        U	4S j[        [	        T	5      5       5       5      PM,     n
n	X4$ s  sn	f )a  
Given a sequence of args and kwargs, split them into a number of chunks
according to  their respective chunking specs.

Args:
    args: Tuple of args
    kwargs: Dict of kwargs
    chunks: Number of chunks to split the args and kwargs into
    args_chunk_spec: chunking specs for args, in same shape as args
    kwargs_chunk_spec: chunking specs for kwargs, in same shape as kwargs

Returns:
    args_split: List of sharded args
    kwargs_split: List of sharded kwargs
c                     [        U [        R                  [        -  5      (       a  [	        [
        5      $ [        5       $ r   )r   r\   r   r	   r   DEFAULT_CHUNK_DIMrS   r   s    r   default_spec3split_args_kwargs_into_chunks.<locals>.default_spec  s,    a	122"#455<r   c                 "    [        U [        5      $ r   r   r   s    r   r@   /split_args_kwargs_into_chunks.<locals>.<lambda>  s    *Q	2Jr   r   c                 "    [        U [        5      $ r   r   r   s    r   r@   r     s    Jq)4Lr   z;args and kwargs are split into different number of chunks: z, c              3   .   >#    U  H
  nTU   v   M     g 7fr   r#   ).0i
chunk_argss     r   	<genexpr>0split_args_kwargs_into_chunks.<locals>.<genexpr>  s     <%;jm%;s   )r   r   rP   	enumerater   RuntimeErrorrO   rr   )r   r   r   rC   rI   r   args_split_dictreal_num_chunkskwargs_splitr   
args_splits            ` r   r   r   _  sT   p ~  "(J
  $*L
 *Yt_Y'(O
 /*O&L <?* l+-4!?+,
 ?s<00I?#$Bs<'8&9;
 	
 *)J 	<U3z?%;<<)  
 ##s   :1D0c                 R  ^^ Ub  [        U5      u  p#O,[        U S   5      u  pC[        [        5      /[        U5      -  n/ nU  HJ  n[        U5      u  px[        U5      [        U5      :w  a  [	        SU SU 35      eUR                  U5        ML     / n	[        U5       GHp  u  n
m[        T[        5      (       Ga  [        [        U5      5       Vs/ s H
  nX[   U
   PM     nn[        (       GaA  US   R                  nUSS  H.  nUR                  U:X  a  M  [        SU SUR                   35      e   [        R                  " [        R                  " USS	06[        U5      TR                  S
9n/ nSn[        U5      [        U5      :X  d#  [        S[        U5       S[        U5       35      e[!        XSS9 Hp  u  nnUUR#                  TR                  5      -   n[%        SSS5      /UR&                  -  n[%        UU5      UTR                  '   UU   nUR                  U5        UnMr     OUnU Vs/ s H  n[        U[(        5      PM     nn[+        U5      (       a  [-        U5      (       d  [        S5      eUS   R.                  m[        USS S5       H4  u  nnUR.                  T:w  d  M  [        SU ST SUR.                   35      e   [1        U4S jT4[3        U4S j[        [        U5      5       5       5      S9nU	R                  U" U6 5        GM  U	R                  [        R4                  " UTR                  S95        GM  [        T[6        5      (       aR  TR8                  n[        [        U5      5       H  nTR;                  UX[   U
   5      nM     U	R                  U5        GM  US   U
   n[        S[        U5      5       H$  nX[   U
   U:X  a  M  [        SU SX[   U
    35      e   U	R                  U5        GMs     [=        X5      $ s  snf s  snf )z
Given a list of chunks, merge them into a single value according to
the chunk spec.

Args:
    chunks: list of chunks
    chunk_spec: Chunking spec for the chunks

Returns:
    value: Merged value
Nr   zChunk z did not match chunk spec rY   zExpected shape z, got devicemeta)sectionsr?   z6Expected len(partial_values) == len(meta_chunks), got z != Tr   zRmerge_chunks: expected all values to be DTensors or none to be DTensors, got a mixz*merge_chunks: placement mismatch at chunk z: expected c                  B   > [         R                  " U TR                  S9$ )Nr>   )r\   catr-   )r   args    r   r@   merge_chunks.<locals>.<lambda>S  s    EIIf#--$Hr   c              3   (   >#    U  H  nTv   M	     g 7fr   r#   )r   r   r   s     r   r   merge_chunks.<locals>.<genexpr>U  s     'V<Uq
<Us   r   r>   z	Expected )r
   r   r   r   r   rs   r   r   rr   r   shaperp   r\   rq   emptyr-   r   ro   r   r   r   anyallr   r   rO   r   r   r   r   r   )r   
chunk_specspec_flattenedflatten_specchunk0_flatchunks_flattenedr   chunk_flattenedr   args_flattenedarg_idxr|   partial_valuesoverall_shapevalmeta_chunksvalues_to_catchunk_start_idxpartial_value
meta_chunkchunk_end_idxslice_indicesslicedr   dtensor_flagsr   cat_fnreduced_valvaluer   r   s                                @@r   r   r     sD   Z '3J'?$ %1$;!)*;<=K@PP )%03~#66veW,FzlSTT0  N!.1c?++ "'s+;'<!=!=I !+G4!=  
 '& .q 1 7 7)!"-C995,-m_F399+N  .
 $00KK>v> 0 !#"#>*c+.>>(PQTUcQdPeeijmnyjzi{|  25"2-M: %4joocmm6T$TM%*4t%<$=@R@R$RM38-3XM#--0*=9F!((0&3O2 !/ >KK]Z73]MK=!!=))(9 
 +1-88
%mAB&7;DAq||z1,H L((2|6!,,I  < #H$.="''VE#mBT<U'V"V
 %%fm&<=%%eii3==&QR^,,..K"3'7#89	!mm!1!<W!E :
 !!+.$Q'0E"1c*:&;<	'27;uD(#E7&1A1LW1U0VW  =
 !!%(i 2n .77kV Ls   P8P$)NN)+loggingoperatorcollections.abcr   typingr   r\   torch.distributed.tensorr   %torch.distributed.tensor.experimentalr   torch.fx.noder   !torch.nn.attention.flex_attentionr	   torch.utils._pytreer
   r   r   __all__	getLoggerr   loggerr   r   r%   r~   addsum_reducerr   r   rS   rL   r   r}   r   r   r   rO   rP   rQ   r   r   r#   r   r   <module>r	     s     $   , ; ' 7 F F 
		8	$
   # #$	> 	 5<<,hll;  5! 5!r	 	>>> 
)_>BI<LLI<
I< I< ell	I<XUx ;?;?p$
S/p$cNT!p$ p$ ?C/047	p$
 C01D8p$ 4;T
"#p$f[8I[8r   