
    3j              	        	   S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKJrJ	r	  S SK
Jr  S SK Jr  S SKJr  S SKJrJr  S SKJr  S SKJrJrJr  S S	KJr  S SKrS SKJr  S SKJr  S SKJs  J r!  S S
K"J#r#  S SK$J%r%  S SK&J'r'J(r(J)r*  S SK+J,r,  S SK-J.r.J/r/  S SK0J1r1  S SK2J3r3J4r4J5r5  S SK6J7r7  S SK8J9r9J:r:J;r;  S SK<J=r=J>r>J?r?  S SK@JArAJBrBJCrCJDrD  S SKJErEJFrF  S SKGJHrI  S SKJJKrKJLrLJMrMJNrNJOrO  S SKPJQrQJRrRJSrSJTrTJUrUJVrVJWrW  S SKXJYrY  \V(       a)  \Z" S\[" S\R                  R                  5       5      5      r^OSr^\T(       a  Sr_Sr`\R                  R                  5       r^O8\U(       a  Sr_Sr`O,\W(       a  S r_S!r`\R                  R                  5       r^OS"r_S#r`S$r^ " S% S&\5      rb " S' S(\5      rc " S) S*\R                  \5      reS+\R                  S,\R                  S-\4S. jrgS/ rh  SS+\R                  S0\i4S1 jjrjSS2 jrkS3 rlS4 rmSS+\R                  S5\i4S6 jjrnS+\R                  S7\i4S8 jroS+\R                  S9\i4S: jrp " S; S<5      rq " S= S>\e5      rr " S? S@\e5      rs " SA SB\s5      rt " SC SD\s5      ru " SE SF\e5      rv " SG SH\v5      rw " SI SJ\R                  5      rx " SK SL\s5      ry " SM SN\R                  5      rz " SO SP\R                  5      r| " SQ SR\R                  5      r}\ R                  SS\4ST j5       r\ R                  SU\4SV j5       r\ R                  SW\4SX j5       r\ R                  SY\4SZ j5       r\ R                  S[\4S\ j5       r\\ R                  S]\4S^ j5       5       r\\ R                  S_\4S` j5       5       r\\ R                  Sa\4Sb j5       5       r\\ R                  Sc\4Sd j5       5       rSe\S-\Sf\Sg\4Sh jr SSi\R                  Sj\R                  Sk\\Sl4   4Sm jjr\GR                  " \WSn5       " So Sp\M5      5       r " Sq Sr5      r " Ss St\\L5      r " Su Sv\\K5      rSSw\S-  4Sx jjr " Sy Sz\R                  5      r " S{ S|\R                  5      r " S} S~\R                  5      rg)    N)ABCabstractmethod)Callable)nullcontext)deepcopy)autoEnumwraps)Anycastno_type_check)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcContinuousTestMultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_msset_rng_seed	TEST_CUDATEST_HPUTEST_WITH_ROCMTEST_XPU)
has_triton      cudancclzhpu:0hcclxpuxcclcpugloo   c                   0    \ rS rSr\" 5       r\" 5       rSrg)FSDPInitMode^    N)__name__
__module____qualname____firstlineno__r   NO_FSDP	RECURSIVE__static_attributes__rD       ]/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/testing/_internal/common_fsdp.pyrB   rB   ^   s    fGIrL   rB   c                   >    \ rS rSr\" 5       r\" 5       r\" 5       rSrg)DEVICEInitModeg   rD   N)	rE   rF   rG   rH   r   DEVICE_BEFOREDEVICE_AFTERDEVICE_NEVERrK   rD   rL   rM   rO   rO   g   s    FM6L6LrL   rO   c                       \ rS rSrSr\S\\R                  S4   4S j5       r	\S\R                  4S j5       r
\SS j5       r\\S	\S
\S\R                  4S j5       5       rSrg)FSDPTestModelp   zVThis defines the interface expected from all models used commonly for
FSDP unit tests.return.c                     g)z+Returns an input for the model as as tuple.NrD   selfdevices     rM   	get_inputFSDPTestModel.get_inputt        	rL   c                     g)z,Returns the loss given the input and output.NrD   )rZ   inputoutputs      rM   get_lossFSDPTestModel.get_lossy   r^   rL   Nc                     g)z<Runs the backward pass (e.g. including ``loss.backward()``).NrD   rZ   losss     rM   run_backwardFSDPTestModel.run_backward~   r^   rL   argskwargsc                      g)z&Initializes an instance of this model.NrD   )ri   rj   s     rM   initFSDPTestModel.init   s     	rL   rD   rW   N)rE   rF   rG   rH   __doc__r   tupletorchTensorr\   rb   rg   staticmethodr   nnModulerl   rK   rD   rL   rM   rU   rU   p   s     5s):#;        C 3 299   rL   rU   modelprocess_group	assert_fnc                 X   U R                  5        VVs/ s H$  u  p4X4R                  5       R                  5       4PM&     nnnUU R                  5        VVs/ s H$  u  pgXgR                  5       R                  5       4PM&     snn-  n[        R
                  " U5      n[        U5       V	s/ s H  n	SPM     n
n	[        R                  " XUS9  U
S   nUc  [        S5      eU
SS  H1  nUc  [        S5      e[        XSS9 H  u  u  pu  pU" X5        M     M3     gs  snnf s  snnf s  sn	f )	z
All-gathers module states across ranks and calls ``assert_fn`` on each pair
of corresponding states from rank 0 and a nonzero rank. For example, if
``assert_fn`` is ``self.assertEqual()``, then this checks that all module
states are equal across ranks.
Ngroupr   z$Expected rank0_states to not be Noner@   zExpected state to not be NoneTstrict)
named_parametersdetachr>   named_buffersdistget_world_sizerangeall_gather_objectAssertionErrorzip)rv   rw   rx   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  rM   _assert_module_statesr      s3    "'!7!7!9!9J 
\\^'')*!9   #(#6#6#8#8K 
mmo))+,#8  $$]3J ,-,aT,E-5]K8LCDDqr= !@AA #L EGQWab !F 
 .s   +D+D!*D'c                  6    [         R                  " [        5      $ N)rq   r[   DEVICE_TYPErD   rL   rM   get_devtyper      s    <<$$rL   zero_buffersc                    U(       a  [         R                  " U 5      O	[        5       nU   U R                  5        H1  n[        R
                  " 5          UR                  5         SSS5        M3     U(       aE  U R                  5        H1  n[        R
                  " 5          UR                  5         SSS5        M3     SSS5        g! , (       d  f       M  = f! , (       d  f       M_  = f! , (       d  f       g= f)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersrq   no_gradzero_buffers)rv   r   summon_fullctxr   r   s         rM   _zero_modelr      s     -8$
!
!%
([]C	%%'E ! ( --/]]_LLN %_ * 
  %_ 
s;   )C)C$;C)C0C)
CC)
C&!C))
C7c                     U(       d  U R                  [        5      n U(       a  U R                  5         U R                  5       $ r   )tor   half
state_dict)rv   cpu_offloadr   s      rM   _get_state_dictr      s.    %

rL   c           	      p    SR                  U Vs/ s H  o"b  U [        U5         OSPM     sn5      $ s  snf )Nr   none)joinstr)test_name_mappingri   ss      rM   subtest_namer      s9    88IMNAm	3q6	"	?N Ns   3c                    UR                  5        H=  u  p#UR                  [        R                  " S5      :w  d  M+  UR                  5       X'   M?     U S:X  a  UOS /n[        R
                  " U5        [        [        [        [        R                  4   US   5      nU H  nX   R                  [        5      X'   M     U$ )Nr>   r   )itemsr[   rq   r>   r   broadcast_object_listr   dictr   rr   r   r   )rankr   r   r   r   s        rM   _broadcast_state_dictr      s     (--/
<<5<<..%*YY[J" 0  19Z$/Eu%d3,-uQx8J 
!+!7!:!:;!G
 !rL   recursec                     [         R                  " XS9   [        [        U R	                  5       5      5      sSSS5        $ ! , (       d  f       g= f)a?  
Returns the full unsharded parameters of ``model``. Any FSDP-managed
parameters offloaded to CPU are moved to GPU in the returned list.

Args:
    recurse (bool): If ``False``, only unshards the parameters immediate to
        ``model``; if ``True``, recurses through the module hierarchy
        rooted at ``model``.
)r   N)r   r   r   listr   )rv   r   s     rM   get_full_paramsr      s4     
	 	 	8U--/01 
9	8	8s   "A
Amove_to_devicec                 >    U(       a  U R                  [        5      $ U $ r   )r   r   )rv   r   s     rM   _move_to_devicer      s    $2588K ==rL   	wrap_fsdpc                 2    U(       d  U $ [        U /UQ70 UD6$ r   r   )rv   r   ri   rj   s       rM   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCrL   c                   H    \ rS rSrS\S\4S jrS\4S jrS\4S jrS rS	r	g
)DummyProcessGroup   r   sizec                     Xl         X l        g r   _rank_size)rZ   r   r   s      rM   __init__DummyProcessGroup.__init__   s    

rL   rW   c                     U R                   $ r   )r   rZ   s    rM   r   DummyProcessGroup.rank       zzrL   c                     U R                   $ r   )r   r   s    rM   r   DummyProcessGroup.size   r   rL   c                 B    [         R                  " 5       nS nXCl        U$ )Nc                  d    [         R                  R                  5       n U R                  S5        U $ )Nr@   )rq   futuresFuture
set_result)futures    rM   
get_future/DummyProcessGroup.allreduce.<locals>.get_future  s'    +0==+?+?+AFa MrL   )r   Mockr   )rZ   ri   rj   	dist_waitr   s        rM   	allreduceDummyProcessGroup.allreduce  s     IIK		
  *rL   r   N)
rE   rF   rG   rH   intr   r   r   r   rK   rD   rL   rM   r   r      s2    S  c c 	rL   r   c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjrS r	S r
S	 rS
 r\   SS\R
                  S\S\S\\\4   S-  S\S\S\R&                  \-  4S jj5       rS rSrU =r$ )TransformerWithSharedParamsi  r{   device_init_modeadd_bndeterministicc                   > [         TU ]  5         UR                  5       U l        UR                  5       U l        U(       a  [
        R                  " S5        SnSn[        R                  " XV5      U l	        [        R                  " USSSSS9U l        [        R                  " Xe5      U l        U R                  R                  U R                  l        U R                  SU R                  R                  R!                  U45      5        U R                  S	[
        R"                  " U R$                  [
        R&                  S
95        SU l        U(       a)  [
        R                  R+                  U R(                  5      O[
        R                  R-                  5       U l        U[0        R2                  :X  a  U R5                  [6        5      n U(       a  U R9                  5         g g )Nr         r8      g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   r   rq   manual_seedrt   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrO   rQ   r   r   eval)rZ   r{   r   r   r   d_vocabr   	__class__s          rM   r   $TransformerWithSharedParams.__init__  s]    	JJL	**,a LL:>>  
 99W6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R~;;;77;'DIIK rL   c                 $   [         R                  " SU R                  -   5        [         R                  " SUS9R	                  SU R
                  5      n[         R                  " U R
                  S-  US9R	                  SU R
                  5      nX#4$ )Nr@      r[      r7   )rq   r   r   arangeviewr   )rZ   r[   srctgts       rM   r\   %TransformerWithSharedParams.get_input8  sj    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGzrL   c                     U R                  U5      nX0R                  -   U R                  R                  U5      -   nU R                  U5      nU R	                  U5      nU R                  X45      nU R                  U5      $ r   )r   r   r   type_asr   r   r   )rZ   src_idstgt_idsr
  r  xs         rM   forward#TransformerWithSharedParams.forward>  sr    (OO#d&6&6&>&>s&CC(ggclS&""rL   c                     Uu  p4[         R                  R                  UR                  SUR	                  S5      5      UR                  S5      SS9$ )Nsum)	reduction)rt   
functionalcross_entropyr	  r   )rZ   r`   ra   r   r  s        rM   rb   $TransformerWithSharedParams.get_lossF  sG    }}**KKFKKO,chhrle + 
 	
rL   c                 $    UR                  5         g r   backwardre   s     rM   rg   (TransformerWithSharedParams.run_backwardL      rL   Nfsdp_init_modefsdp_kwargsrW   c                 x   Uc  0 nU[         R                  :X  a)  [        U [        5      (       a  U S   nOU n[	        XbXT5      $ U[         R
                  :X  a  SU;  a  [        [        [        15      nOUR                  S5      nSU;   a?  US   [        R                  [        R                  1;   a  [        U [        5      (       d  SnOU n[        U [        5      (       a  U S   n	OU n	[	        XXT5      n
[        U
U4SU0UD6nU[        R                  :X  a  UR!                  ["        5      nU$ [%        SU 35      e)a  
Initializes a :class:`TransformerWithSharedParams` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps with
        top-level FSDP. By default, the top-level FSDP uses the
        ``ModuleWrapPolicy`` for encoder and decoder layers, but a
        different auto wrap policy may be specified via
        ``fsdp_kwargs``.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
    add_bn (bool): Whether to include batch norm in the model.
Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )rB   rI   
isinstancerp   r   rJ   r   r(   r'   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rO   rR   r   r   
ValueError)r{   r   r   r!  r   r   pgr#  fsdp_pg
tformer_pgm
fsdp_models               rM   rl    TransformerWithSharedParams.initO  sW   6 K\111%''1X.f  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%00%''"1X
"
+fA  "2 	J  >#>#>>']];7
77GHIIrL   c                     U R                   /$ r   )r   r   s    rM   get_ignored_modules/TransformerWithSharedParams.get_ignored_modules  s      !!rL   )r   r   r   r   r   r   r   )NFT)rE   rF   rG   rH   r   ProcessGrouprO   boolr   r\   r  rb   rg   rs   rB   r   r   r   rt   ru   r   rl   r2  rK   __classcell__r  s   @rM   r   r     s    (  ( )( 	(
 (T#
 
 .2#KJ  KJ$KJ )KJ #s(^d*	KJ
 KJ KJ 
T	KJ KJZ" "rL   r   c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjrS r	S r
S	 rS
 r\  SS\R
                  S\S\S\\\4   S-  S\S\R&                  4S jj5       rSrU =r$ )NestedWrappedModulei  r{   r   r   r   c                   >^^^ [         TU ]  5         TR                  5       U l        TR                  5       U l        U[
        R                  :H  nUUU4S jnU(       a  [        R                  " S5        [        R                  " [        [        R                  " SS5      U5      U" [        R                  " U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      5      U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      U l        g )Nc                 0   > T(       a  [        U T40 TD6$ U $ r   r   layerr!  r{   r   s    rM   _maybe_wrap1NestedWrappedModule.__init__.<locals>._maybe_wrap      E58K88LrL   r   r   r7   r   )r   r   r   r   r   rO   rQ   rq   r   rt   
Sequentialr   r   module	rZ   r{   r   r   r   r!  r   r>  r  s	    ``  `  rM   r   NestedWrappedModule.__init__  s     	JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F 		"a(8.IJBIIaO^<

rL   c                 v    [         R                  " SU R                  -   5        [         R                  " SSUS94$ )Nr@   r7   r   r  )rq   r   r   randrY   s     rM   r\   NestedWrappedModule.get_input  s.    !dii-(

1a/11rL   c                 $    U R                  U5      $ r   rB  rZ   r  s     rM   r  NestedWrappedModule.forward      {{1~rL   c                 &    UR                  5       nU$ r   )r  rZ   r`   ra   rf   s       rM   rb   NestedWrappedModule.get_loss  s    zz|rL   c                 $    UR                  5         g r   r  re   s     rM   rg    NestedWrappedModule.run_backward  r  rL   Nr   r!  rW   c                    Uc  0 nU[         R                  :X  a  [        U SUUS9$ U[         R                  :X  a;  [        U 4SUUS.UD6nU[        R
                  :X  a  UR                  [        5      nU$ [        SU 35      e)a  
Initializes a :class:`NestedWrappedModule` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps some nested
        modules with FSDP but not the top-level module. The model may
        later be wrapped with a top-level FSDP external to this method
        if desired.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
Fr   r   r   Tr%  )	rB   rI   r9  rJ   rO   rR   r   r   r*  )r{   r   r   r!  r   r/  s         rM   rl   NestedWrappedModule.init  s    . K\111&!1+	  |555,!1+	
 J  >#>#>>']];7
77GHIIrL   rB  r   r   NF)rE   rF   rG   rH   r   r4  r5  rO   r   r\   r  rb   rg   rs   rB   r   r   r   rt   ru   rl   rK   r6  r7  s   @rM   r9  r9    s    
  
 
 )	

 
@2 
 .2#+J  +J$+J )+J #s(^d*	+J
 +J 
+J +JrL   r9  c                   v   ^  \ rS rSr\  S
S\R                  S\S\S\	\
\4   S-  S\4
U 4S jjj5       rS	rU =r$ )AlwaysWrapNestedWrappedModulei  Nr{   r   r   r!  r   c                 :  > [         [        [        ]   U [        R                  UUUS9nU[        R                  :X  a  U$ U[        R
                  :X  aH  U=(       d    0 n[        U4S[        0UD6nU[        R                  :X  a  UR                  [        5      nU$ g)z
Initializes a :class:`NestedWrappedModule` instance, but unlike
:meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
policy.
)r{   r   r   r!  r   r#  N)r   rX  rl   rB   rI   rJ   r   r   rO   rR   r   r   )r{   r   r   r!  r   rv   r/  r  s          rM   rl   "AlwaysWrapNestedWrappedModule.init   s     )+H
'//-#'  
 	 \111L|555%+KeX6HXKXJ>#>#>>']];7
 6rL   rD   rV  )rE   rF   rG   rH   rs   r   r4  rB   rO   r   r   r   r5  rl   rK   r6  r7  s   @rM   rX  rX    s^    
 .2#  $ ) #s(^d*	
  rL   rX  c                      ^  \ rS rSrS\R
                  S\S\S\4U 4S jjr\	SS j5       r
\	  SS\R
                  S	\S\S
\\\4   S-  S\4
S jj5       rSrU =r$ )NonUniformReqGradNWMi!  r{   r   r   r   c                   >^^^ [         [        U ]  5         TR                  5       U l        TR	                  5       U l        U[        R                  :H  nUUU4S jnU(       a  [        R                  " S5        [        R                  " [        [        R                  " SS5      U5      U" [        R                  " U" [        [        R                  " SS5      U5      5      [        [        R                  " SS5      U5      5      5      U" [        R                  " [        [        R                  " SS5      U5      [        [        R                  " SS5      U5      5      5      5      U l        g )Nc                 0   > T(       a  [        U T40 TD6$ U $ r   r   r<  s    rM   r>  2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap5  r@  rL   r   r   r7   r   )r   r9  r   r   r   r   rO   rQ   rq   r   rt   rA  r   r   rB  rC  s	    ``  `  rM   r   NonUniformReqGradNWM.__init__"  s     	!413 JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F #BIIb!$4nE#BIIaO^D
rL   Nc                     U R                  5        H3  u  p#[        R                  " X5      (       a  M"  UR                  S5        M5     g rV  )r~   rematchrequires_grad_)rv   req_grad_masknps       rM   _set_nonuniform_req_grad-NonUniformReqGradNWM._set_nonuniform_req_gradL  s4    **,DA88M--  ' -rL   r   r!  c                    [         R                  " S5      nU[        R                  :X  a#  [	        U SUUS9n[        R                  Xe5        U$ U[        R                  :X  aU  Uc  0 n[	        U 4SUUS.UD6nU[        R                  :X  a  UR                  [        5      n[        R                  Xu5        U$ [        SU 35      e)a  
Initializes a :class:`NestedWrappedModule` instance, but unlike
:meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
container to enable the desired non-uniform ``requires_grad``
``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
init modes, freezes all parameters except the last two to validate
``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
FSDP ``use_orig_params=True`` mode.
zmodule\.2.*\.1.*FrS  Tr%  )rb  compilerB   rI   r\  rh  rJ   rO   rR   r   r   r*  )r{   r   r   r!  r   req_grad_pattern	ddp_modelr/  s           rM   rl   NonUniformReqGradNWM.initR  s    ( ::&9:\111,!1+	I !99)V|555" -!1+	
 J  >#>#>>']];7
 99*W77GHIIrL   rU  rn   rV  )rE   rF   rG   rH   r   r4  r5  rO   r   rs   rh  rB   r   r   r   rl   rK   r6  r7  s   @rM   r\  r\  !  s    (
  (
 (
 )	(

 (
T ( (
 
 .2#+J  +J$+J )+J #s(^d*	+J
 +J +JrL   r\  c                      ^  \ rS rSrSrS\R                  S\S\4U 4S jjrS r	S r
S	 rS
 r\S\\   S\S\S\S\4
S j5       rSrU =r$ )ModuleWithDelayi  zThis class wraps a :class:`FSDPTestModel` to optionally add a delay
after computing the loss and/or before the gradient reduction.rB  delay_after_loss_msdelay_before_reduction_msc                 F   > [         TU ]  5         X l        X0l        Xl        g r   )r   r   rq  rr  rB  )rZ   rB  rq  rr  r  s       rM   r   ModuleWithDelay.__init__  s!     	#6 )B&rL   c                 8    U R                   R                  U5      $ r   )rB  r\   rY   s     rM   r\   ModuleWithDelay.get_input  s    {{$$V,,rL   c                 $    U R                  U5      $ r   rI  rJ  s     rM   r  ModuleWithDelay.forward  rL  rL   c                 b   U R                   R                  X5      nU R                  S:  a  [        (       d  [        (       a%  [
        R                  " U R                  S-  5        U$ [        (       a=  [        R                  R                  [        U R                  [        5       -  5      5        U$ Nr     )rB  rb   rq  r3   r5   timesleepr2   rq   r9   _sleepr   r0   rN  s       rM   rb   ModuleWithDelay.get_loss  s}    {{##E2##a'x88

433d:;  

!!#d&>&>ARAT&T"UVrL   c                    ^ ^ [         R                  R                  mUU 4S jn[        R                  " SU5         T R
                  R                  U5        S S S 5        g ! , (       d  f       g = f)Nc                  8  > TR                   S:  a  [        (       a>  [        R                  R	                  [        TR                   [        5       -  5      5        O9[        (       d  [        (       a#  [        R                  " TR                   S-  5        T" U 0 UD6$ rz  )rr  r2   rq   r9   r~  r   r0   r3   r5   r|  r}  )ri   rj   orig_reduce_scatterrZ   s     rM   _delayed_reduce_scatter=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter  sq    --19JJ%%D::=N=PPQ XJJt==DE&777rL   z'torch.distributed.reduce_scatter_tensor)rq   distributedreduce_scatter_tensorr   patchrB  rg   )rZ   rf   r  r  s   `  @rM   rg   ModuleWithDelay.run_backward  sR    #//EE	8 ZZ57N
 KK$$T*
 
 
s   A  
A.module_class
model_argsmodel_kwargsc                <    [        U R                  " U0 UD6UU5      $ )a  
Args:
    module_class (Type[FSDPTestModel]): Wrapped module class to which
        to add delays.
    model_args: Positional arguments forwarded to the ``module_class``
        ``init()``.
    delay_after_loss_ms (int): Delay after computing the loss/before
        the optimizer step (in ms).
    delay_before_reduction_ms (int): Delay before reduce-scattering
        gradients (in ms).
    model_kwargs: Keyword arguments forwarded to the ``module_class``
        ``init()``.
)rp  rl   )r  rq  rr  r  r  s        rM   rl   ModuleWithDelay.init  s*    * z:\:%
 	
rL   )rq  rr  rB  )rE   rF   rG   rH   ro   rt   ru   r   r   r\   r  rb   rg   rs   typerU   r   rl   rK   r6  r7  s   @rM   rp  rp    s    F				 !	 $'		-+$ 
=)

 !
 $'	

 
 
rL   rp  c                       \ rS rSr\\R                  SSSS4S\R                  S\	S\S\
\\4   S-  S	\S
\S\4S jj5       rSrg)NestedWrappedModuleWithDelayi  NFr   r{   r   r   r!  r   rq  rr  c                 >    [         R                  [        U UUUUUUS9$ )Nr{   r   r   r!  r   rq  rr  )rp  rl   r9  r  s          rM   rl   !NestedWrappedModuleWithDelay.init  s4     ##)-#' 3&? $ 	
 		
rL   rD   )rE   rF   rG   rH   rs   rO   rR   r   r4  rB   r   r   r   r5  r   rl   rK   rD   rL   rM   r  r    s     ,:+F+F-1##$)*
  
$
 )
 #s(^d*	

 
 !
 $'
 
rL   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DummyDDPi  c                 .   > [         TU ]  5         Xl        g r   )r   r   rB  )rZ   rB  r  s     rM   r   DummyDDP.__init__  s    rL   c                 &    U R                   " U0 UD6$ r   rI  rZ   ri   rj   s      rM   r  DummyDDP.forward  s    {{D+F++rL   rI  rE   rF   rG   rH   r   r  rK   r6  r7  s   @rM   r  r    s    , ,rL   r  c                      ^  \ rS rSrS\R
                  S\S\S\S\4
U 4S jjr	S r
S	 r\   SS\R
                  S\S\S\\\4   S
-  S\S\4S jj5       rSrU =r$ )MixtureOfExpertsi  r{   r   r   delay_before_free_msr   c                   > [         TU ]  UUUUS9  Xl        X@l        X l        U[
        R                  :H  U l        U(       a#  [        R                  " SU R                  -   5        SnSnSn	[        [        R                  " Xx5      U R                  5      n
[        S U
R                  5        5       5      U l        U
R                  5        H
  nSUl        M     U(       a  [        R                  " S5        [        [        R                  " X5      U R                  5      nU(       aF  [        R$                  R'                  UR                  5       /5      n[)        X40 UD6n
[)        X40 UD6n[        R*                  " [        [        R                  " X5      U R                  5      UU
[        [        R                  " X5      U R                  5      5      U l        g )	N)r{   r   r   r   *   r   r  r   c              3   @   #    U  H  oR                  5       v   M     g 7fr   )numel).0rg  s     rM   	<genexpr>,MixtureOfExperts.__init__.<locals>.<genexpr>
  s     $L8K1WWYY8K   Tr   )r   r   r{   r  r   rO   rQ   r   rq   r   r   r   rt   r   r  r   num_expert_paramsexpertr  	new_groupr   rA  rB  )rZ   r{   r   r   r  r   r!  d_expertd_sharedd_inputr  rg  sharedexpert_groupr  s                 rM   r   MixtureOfExperts.__init__  s{    	-'	 	 	
 
$8!"..2N2NNb499n- 8!>@S@ST!$$L8I8I8K$L!L""$AAH % a  8!>@S@ST ,,66L &>+>F&7;7FmmBIIg8$:M:MNBIIh8$:M:MN	
rL   c                   ^ ^ T R                   S:  a  T R                  S   n[        U[        5      (       ag  [        R
                  R                  R                  R                  mUU 4S jn[        R                  " SU5         T R                  U5      sS S S 5        $ T R                  U5      $ ! , (       d  f       N= f)Nr   r8   c                    > [         (       a>  [        R                  R                  [	        TR
                  [        5       -  5      5        O9[        (       d  [        (       a#  [        R                  " TR
                  S-  5        T" U 0 UD6$ )Nr{  )r2   rq   r9   r~  r   r  r0   r3   r5   r|  r}  )ri   rj   orig_reshardrZ   s     rM   _delayed_reshard2MixtureOfExperts.forward.<locals>._delayed_reshard)  sc     y

)) 9 9<M<O OP "XX

4#<#<t#CD'888rL   z.torch.distributed.fsdp._runtime_utils._reshard)r  rB  r&  r   rq   r  fsdp_runtime_utils_reshardr   r  )rZ   r  r  r  r  s   `   @rM   r  MixtureOfExperts.forward#  s    $$q([[^F&$''$0055DDMM9 ZZDFV  ;;q> 
 {{1~ s   B//
B=c                    UR                  5         U R                  (       d  [        R                  " 5          U R	                  5        H|  n[        US5      (       a  M  UR                  c  M%  UR                  R                  U R                  5        [        R                  R                  UR                  U R                  S9  M~     S S S 5        g g ! , (       d  f       g = f)Nr  rz   )r  r   rq   r   r   hasattrgraddiv_r   r  
all_reducer{   )rZ   rf   rg  s      rM   rg   MixtureOfExperts.run_backward;  s    ~~*Aq(++ vv)DOO4))44QVV4::4N + !  s   2C-AC
C Nr   r!  c                 
   Uc  0 nU[         R                  :X  a  [        U SUUUS9$ U[         R                  :X  a<  [        U 4SUUUS.UD6nU[        R
                  :X  a  UR                  [        5      nU$ [        SU 35      e)a  
Initializes a :class:`MixtureOfExperts` instance.

Args:
    fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
        any modules with FSDP. If ``RECURSIVE``, then wraps some nested
        modules with FSDP, including the expert and shared layers, but
        not the top-level module. The model may later be wrapped with a
        top-level FSDP external to this method if desired.
    device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
    fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
        forwarded to the FSDP constructor.
    deterministic (bool): Whether to make the model deterministic
        across constructions.
    delay_before_free_ms (int): Delay before resharding expert
        parameters in the forward pass (in ms).
F)r   r   r  r   Tr%  )	rB   rI   r  rJ   rO   rR   r   r   r*  )r{   r   r   r!  r   r  r/  s          rM   rl   MixtureOfExperts.initG  s    4 K\111#!1%9+  |555)!1%9+ J  >#>#>>']];7
77GHIIrL   )r  r{   rB  r   r  r   )NFr   )rE   rF   rG   rH   r   r4  r5  rO   r   r   r  rg   rs   rB   r   r   r   rl   rK   r6  r7  s   @rM   r  r    s    2
  2
 2
 )	2

 "2
 2
h0
O 
 .2#$%0J  0J$0J )0J #s(^d*	0J
 0J "0J 0JrL   r  c                      ^  \ rS rSr SSSSS.S\S\R                  S-  S	\S
\S\4
U 4S jjjjrS\R                  S\R                  4S jr
S rSrU =r$ )MLPi{  NTFr7   )biaswith_bufferdim_multiplierdimr[   r  r  r  c                   > [         TU ]  5         [        R                  " XU-  X#S9U l        [        R                  " XQ-  XUS9U l        U(       a'  U R                  S[        R                  " U4US95        g S U l	        g )N)r[   r  r   r  )
r   r   rt   r   in_projout_projr   rq   randnr   )rZ   r  r[   r  r  r  r  s         rM   r   MLP.__init__|  sf     	yys&:6U		."6QUV  5;;vf+MNDKrL   r  rW   c                     U R                  U5      n[        R                  " U5      nU R                  U5      n[        R                  " U5      nU R                  b  X R                  -   nU$ r   )r  Frelur  r   )rZ   r  zs      rM   r  MLP.forward  sQ    LLOFF1IMM!FF1I;;"KKArL   c                     U R                   b4  [        R                  R                  R	                  U R                   5        g g r   )r   rq   rt   rl   normal_r   s    rM   reset_parametersMLP.reset_parameters  s+    ;;"HHMM!!$++. #rL   )r   r  r  r   )rE   rF   rG   rH   r   rq   r[   r5  r   rr   r  r  rK   r6  r7  s   @rM   r  r  {  s     '+
 ! t#
    " %,, / /rL   r  c                   V   ^  \ rS rSrSS.S\S\4U 4S jjjrS\S\S	\S
S 4S jrSr	U =r
$ )MLPStacki  F)with_seq_parallelmlp_dimr  c                   > [        USS9[        U5      [        USS9/nU(       a$  UR                  [        R                  " USS95        [        TU ]  " U6   X l        g )N   )r  Fr  )r  appendrt   	LayerNormr   r   r  )rZ   r  r  modulesr  s       rM   r   MLPStack.__init__  sV     *L*	$
 NN2<<e<='"!2rL   tp_meshdp_meshuse_activation_checkpointingrW   c           
         [        SS9[        SS9[        SS9[        SS9[        SS9U R                  (       a  [        [        S5      S9O	[        5       S.nU R                  (       a  [	        SS9US'   [        XUS9  U  HD  n[        U[        R                  5      (       a  M$  U(       a  [        U5        [        U4S	U0UD6  MF     [        U 4S	U0UD6  U $ )
NF)use_local_outputr@   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r#   r%   r  r"   r&   r$   r&  rt   r  r   r   )rZ   r  r  r  r!  r  rB  s          rM   parallelizeMLPStack.parallelize  s     )%@)5A(%@)5A(%@%% *qB "
 !!$4!$DS!4GWXF&",,//+6"<W<<  	D6w6+6rL   )rE   rF   rG   rH   r   r5  r   r   r  rK   r6  r7  s   @rM   r  r    sN    BG 
3 
34 
3 
3  '+	 
 rL   r  c                      ^  \ rS rSrSrS
S\S\4U 4S jjjrS\R                  S\
\R                  \R                  4   \R                  -  4S jrS	rU =r$ )DoubleLineari  z
This can be used for returning multiple outputs from a module
(``use_second_linear=True``) or for having an unused module (``False``).
r  use_second_linearc                    > [         TU ]  5         [        R                  " X5      U l        [        R                  " X5      U l        [        R                  " 5       U l        X l        g r   )	r   r   rt   r   lin1lin2ReLUr  r  )rZ   r  r  r  s      rM   r   DoubleLinear.__init__  sA    IIc'	IIc'	GGI	!2rL   r  rW   c                     U R                   (       a@  U R                  U R                  U5      5      U R                  U R                  U5      5      4$ U R                  U R                  U5      5      $ r   )r  r  r  r  rJ  s     rM   r  DoubleLinear.forward  sQ     !!99TYYq\*DIIdiil,CCCyy1&&rL   )r  r  r  r  T)rE   rF   rG   rH   ro   r   r5  r   rq   rr   rp   r  rK   r6  r7  s   @rM   r  r    sY    
3C 3D 3 3''	u||U\\)	*U\\	9' 'rL   r  new_all_gather_into_tensorc              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   all_gather_into_tensorbarrier)r  orig_all_gathers     rM   patch_all_gatherr    sN     11OLLN"<D6&5# 	&5#   1A>A !A>"A;;A>new_foreach_all_gatherc              #   @  #    [         R                  R                  R                  R                  R
                  n[        R                  " 5         U [         R                  R                  R                  R                  l         S v   [        R                  " 5         U[         R                  R                  R                  R                  l        g ! [        R                  " 5         U[         R                  R                  R                  R                  l        f = f7fr   )rq   r  r  _fully_shard_fsdp_param_groupforeach_all_gatherr   r  )r  orig_foreach_all_gathers     rM   patch_foreach_all_gatherr    s      	++==PP  	LLN 
''99L
# 	++==P 	# 	++==P    BDC A	DA
DDnew_foreach_reducec              #   @  #    [         R                  R                  R                  R                  R
                  n[        R                  " 5         U [         R                  R                  R                  R                  l         S v   [        R                  " 5         U[         R                  R                  R                  R                  l        g ! [        R                  " 5         U[         R                  R                  R                  R                  l        f = f7fr   )rq   r  r  r  r  foreach_reducer   r  )r  orig_foreach_foreach_reduces     rM   patch_foreach_reducer    s      	++==LL   	LLN 
''99H
' 	++==L 	' 	++==Lr  new_reduce_scatter_tensorc              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   r  r  )r  r  s     rM   patch_reduce_scatterr    sO     44LLN!:D9%8" 	%8"r  new_all_reducec              #      #    [         R                  n[         R                  " 5         U [         l         S v   [         R                  " 5         U[         l        g ! [         R                  " 5         U[         l        f = f7fr   )r   r  r  )r  orig_all_reduces     rM   patch_all_reducer    sI     ooOLLN$DO*) 	)r  new_unshardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   unshardr   r  )r  orig_unshards     rM   patch_unshardr  +  P      "))LLLN(N.!- 	!-r  new_reshardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   reshardr   r  )r  r  s     rM   patch_reshardr  8  r  r  new_post_backwardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   post_backwardr   r  )r  orig_post_backwards     rM   patch_post_backwardr!  E  sQ      (55LLN#4N :'9$ 	'9$r  new_backwardc              #      #    [         R                  n[        R                  " 5         U [         l         S v   [        R                  " 5         U[         l        g ! [        R                  " 5         U[         l        f = f7fr   )r   r  r   r  )r"  orig_backwards     rM   *patch_register_post_backward_hook_backwardr%  R  sS      199MLLN,8 )>0=$- 	0=$-r  r  ri   rj   c                     [        U5      S:  a  US   nOSU;   a  US   nO[        SU SU 35      eU" U5        U" U0 UD6$ )Nr   ra   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenr   )clsr  rx   ri   rj   ra   s         rM   reduce_scatter_with_assertr)  _  sa     4y1}a	V	!;D6F8T
 	
 f///rL   replicated_modulesharded_moduleprefixes_to_ignore.c                    [        UR                  5       UR                  5       SS9 GH  u  u  pEu  pgUnU H  n	UR                  U	S5      nM     U R                  XH5        U R	                  U[
        5        [        U[
        5      (       d  [        S5      eUR                  UR                  p[        U5      [        S5      [        S5      4:X  a  [        S5      e[        XZU5      nU R                  UR                  5       UR                  5       5        UR                  c  U R                  UR                  5        GM  U R!                  UR                  5        [        UR                  X5      nU R	                  UR                  [
        5        [        UR                  [
        5      (       d  [        S5      eU R                  UR                  R                  5       UR                  5       5        GM     g )NTr|    z&Expected sharded_param to be a DTensorr   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using itz+Expected sharded_param.grad to be a DTensor)r   r~   replaceassertEqualassertIsInstancer!   r&  r   r  
placementsrp   r"   r    to_localr  assertIsNoneassertIsNotNone)r(  r*  r+  r,  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r2  sharded_ref_paramsharded_ref_grads                 rM   check_sharded_parityr>  r  s    OR**,'')OJ+-Jl
 *(F!3!;!;FB!G )<]G4-11 !IJJ(44m6N6Njq58 44 ;  ..>jQ..02C2L2L2NO  (]//0M../,-=-B-BDU]//9-,,g66 !NOO**3357G7P7P7RS9OrL   znot-support-multithreadc                   J   ^  \ rS rSr\S 5       rU 4S jrS rS rS r	Sr
U =r$ )FSDPTestMultiThreadi  c                     [         $ r   DEVICE_COUNTr   s    rM   r   FSDPTestMultiThread.world_size      rL   c                 B   > [         TU ]  5         U R                  5         g r   )r   setUp_spawn_threadsrZ   r  s    rM   rG  FSDPTestMultiThread.setUp  s    rL   c                      [        U /UQ70 UD6$ r   r-   r  s      rM   r-    FSDPTestMultiThread.run_subtests      D242622rL   c                 @    [         R                  R                  5         g r   rq   _dynamoresetr   s    rM   perThreadSetUp"FSDPTestMultiThread.perThreadSetUp      rL   c                 @    [         R                  R                  5         g r   rP  r   s    rM   perThreadTearDown%FSDPTestMultiThread.perThreadTearDown  rU  rL   rD   )rE   rF   rG   rH   propertyr   rG  r-   rS  rW  rK   r6  r7  s   @rM   r@  r@    s.     3 rL   r@  c            #       d   \ rS rSrSrS rS rS rS r\	S 5       r
       S$S
\R                  S\S\S\S\S-  S\S\S-  S\S\S\\\4   S-  4S jjrSSS\" 5       SSSS	S	S	S	SS4S\\   S\S\S\S-  S\S\S\S\S-  S\S-  S\S-  S\S \S\S\S!\\\4   S-  S\\\4   S-  4 S" jjrS#rg)%FSDPTestMixini  z
Mixin class containing shared test utilities for FSDP tests.
Provides common helper methods for both FSDPTest and FSDPTestContinuous.
c                 :    U R                  X!R                  5        g r   )r0  r   )rZ   r/  r   s      rM   _check_cpu_offload FSDPTestMixin._check_cpu_offload  s    &<&<=rL   c                 :    U R                  X!R                  5        g r   )r0  backward_prefetch)rZ   r/  r`  s      rM   _check_backward_prefetch&FSDPTestMixin._check_backward_prefetch  s    *,H,HIrL   c                 :    U R                  X!R                  5        g r   )r0  forward_prefetch)rZ   r/  rd  s      rM   _check_forward_prefetch%FSDPTestMixin._check_forward_prefetch  s    )+F+FGrL   c                      [        U /UQ70 UD6$ r   rL  r  s      rM   r-   FSDPTestMixin.run_subtests  rN  rL   c                    U " U5      nXl         X6l        UR                  SS5      n[        SUR                    SUR                   35        [
        S:w  a`  [        R                  R                  5       UR                  :  a4  [        R                  " [        SUR                   3   R                  5         U(       a^  [        R                  R                  R                  R                   R#                  5       n[$        R&                  " SUR                  UUS9  OC[$        R&                  " UR(                  [*        [-        UR                  5      UR                   S	9   S n
UR                   [2        -  n[4        (       d  [6        (       a  [        R                  R9                  U5        U/n
[$        R:                  " U
S9  [        R<                  R?                  5         [A        5         URC                  X$5        [        R<                  R?                  5         [$        R:                  " U
S9  [$        RD                  " 5         g ! [.         a@  n	S
U	R0                  S   ;   a'  [        R                  " [        S   R                  5        e S n	A	ff = f)Nfake_pgFdist init r=, world=r>   
multi-gpu-fakebackendr   r   storeinit_methodrp  r   r   	recompiler   backend_unavailable
device_ids)#r   	file_namegetprintr   r   rq   acceleratordevice_countsysexitr.   	exit_codetesting	_internalr  rj  	FakeStorer   init_process_grouprs  DISTRIBUTED_BACKENDr   RuntimeErrorri   rC  r2   r5   set_device_indexr  rQ  rR  r1   run_testdestroy_process_groupr(  r   	test_namerx  piperj   rZ   rj  rq  erw  	device_ids               rM   _runFSDPTestMixin._run  s   9~	"**Y.TYYKx/@AB%E$5$5$B$B$Dt$VHHZ*T__,= >?IIJ	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	9..y9[

 	
+i&
+""$1  	affQi'$9:DDE		s    7A$I AI 
J;JJNFrv   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         U=(       a    UR                   n[        UR                  5       5      R                  nU
c  0 n
[	        S	SU0U
D6n[
        R                  R                  UR                  5       USS9n[        U5       GHt  nUR                  5         [
        R                  R                  [        US9   UR                  R                  [
        R                  " [        5      5      nU	(       d  U(       aW  [        U[         5      (       dB  [        U[
        R"                  5      (       a  UR%                  5       nO['        S U 5       5      nU" U6 nU(       ap  [        U[         5      (       a[  UR(                  [*        ;  aG  UR                  5        H3  nU R-                  UR                  [
        R                  " S5      5        M5     UR                  R/                  UU5      R1                  U5      nS S S 5        UR3                  W5      nU(       d1  U	(       d*  UR4                  [
        R6                  :w  a  [9        S5      eOU	(       a+  U R-                  UR4                  [
        R:                  5        Ot[        U[         5      (       a5  Uc  [9        S5      eU R-                  UR4                  UR<                  5        O*U R-                  UR4                  [
        R6                  5        UR                  R?                  U5        U(       a\  [        U[         5      (       aG  UR                  5        H3  nU R-                  UR                  [
        R                  " S5      5        M5     URA                  U5        URC                  5         U(       d  GM  URE                  5       RG                  5        VVs0 s H  u  nnUURI                  5       _M     nnn[K        U5        URM                  U5        GMw     [        U[         5      (       a  URO                  [P        RR                  5        WRU                  5       $ ! , (       d  f       GNN= fs  snnf )
Nenabledg?)r  momentum)r  c              3   @   #    U  H  oR                  5       v   M     g 7fr   )r   )r  r  s     rM   r  9FSDPTestMixin._train_for_several_steps.<locals>.<genexpr>  s     %>1ffhhr  r>   zQloss data type should be float32, as the original parameter data type is float32.z'Expected mixed_precision to not be NonerD   )+offload_paramsnextr   r[   r   rq   optimSGDr   	zero_gradampr  r   rB  r\   r&  r   rr   r   rp   r$  r   r0  rb   r   scaler   float32r   float16param_dtyperg   stepupdater   r   cloner   load_state_dict_assert_stater   IDLEr   )rZ   rv   r  r  r  r  r  r  r  r  r  cpu_offload_paramsmodel_devicesharded_grad_scalerr  r   r`   ra   rg  rf   kvr   s                          rM   _train_for_several_steps&FSDPTestMixin._train_for_several_steps  s@    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy!AOO##K#B..u||K/HI _Zt=T=T!%66 %

 %%>%> > '"5$// //>? #--/((5<<3FG 0 ||,,UF;>>|L- C. ',,T2D"=::.(:  / !$$TZZ?t,,&.,E  $$TZZ1L1LM$$TZZ?LL%%d+!j&=&=))+A$$QXXu||E/BC ,  $$U+&&(z7<7G7G7I7O7O7QR7Qtq!al7Q
R E"%%j1{ "~ eT"" 2 23{{} CBn Ss   9E Q :Q 
Q	r8   Tmodel_classr   r   ref_init_fn	num_itersr   r`  r$  rd  use_orig_paramsinit_kwargsc                 @   U[         R                  :X  a  [        S5      eUc  0 nSnU R                  R	                  5       nUR
                  " U R                  [         R                  [        R                  4SS0UD6nUcC  [        (       a  [        U[        /[        S9nO+[        S:X  a  [        U5      nO[        UU/US9nOU" U5      nU(       a  UR                  5       nU R                  UUU
SLUUU
UUUS9	n[        UR                  5       5      nUR                  UUU	U
UUS	.5         UR
                  " U R                  UUU4SS0UD6n['        U[(        5      (       d  [)        UU R                  40 UD6nU(       a  UR                  5       nU[        R*                  :X  a  UR-                  [        5      nUSL=(       a    UR.                  nU=(       a    U[        R*                  :H  nU=(       a    U[        R*                  :g  nU(       aI  [0        R2                  " S5      nUR                  5        H  nU R5                  UR2                  U5        M!     U(       a  U R7                  [8        S[         35      O	[;        5       nU   U R                  UUSUUUU
UUUS9
n SSS5        U(       a  gU(       a^  [0        R2                  " S5      nUR                  5        H  nU R5                  UR2                  U5        M!     W R-                  [        5      n [=        U5      n![0        R>                  RA                  UW SS9  U
c  U(       d  U R5                  UU!SSS9  ggg! [          a   n[#        S
U S[%        U5       35      UeSnAff = f! , (       d  f       N= f)a  
Tests FSDP training against a reference, which defaults to DDP but
may be customized with ``ref_init_fn``.

Args:
    model_class (Type[FSDPTestModel]): A model class that inherits from
        ``FSDPTestModel``, which defines the expected interface.
    fsdp_init_mode (FSDPInitMode): The mode to initialize the
        FSDP-wrapped model. This should not be ``NO_FSDP``.
    ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
        non-wrapped model to construct the reference model, where this
        wrapper should provide data parallel semantics. If ``None``,
        then the callable defaults to the DDP constructor.
z.Expects an FSDP init mode that wraps with FSDPN{Gz?r   T)rw  output_devicer>   )r  r  r  r  r  r  r  )r   r`  r$  r  rd  r  zInitializing z raised error zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)r  r  r  r  r  r  r  r  )check_dtypezFSDP did not match DDP)exact_devicemsg)!rB   rI   r   rw   r   rl   rO   rQ   r3   DDPr   r   r  r   r   r  	Exceptionr*  r   r&  r   rR   r   r  rq   r[   r0  assertRaisesRegexr  r   r   r  assert_close)"rZ   r  r   r   r  r  r  r   r`  r$  r  rd  r  r  r  r  r  r!  r  r   rv   	ref_modelref_loss
ddp_paramsr/  r  r  expects_device_errorexpects_cpu_device
cpu_devicer   context	fsdp_lossfsdp_unsharded_paramss"                                     rM   _test_fsdp_parityFSDPTestMixin._test_fsdp_parityO  s   D \111 !QRRK!!&&(    ((
 	

 
 x{m;	 %J	4&M	#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y$))"" 	
 # J *d++ j$*<*<LLJ#*J~:::#{3J$D0O[5O5O
 N/>3N3NN 	 N/>3N3NN 	 e,J#..0  z: 1 $ ""%%0M3  	 55!,% /+E++E 6 I    e,J#..0  z: 1![1I /
 ; 	""8YE"J "=%!,	   ,9"K  	Y}[MAxPQWXX	YF Ws$   ."M" N"
N,NN
Nrx  r   )r  NFNFFN)rE   rF   rG   rH   ro   r]  ra  re  r-   classmethodr  rt   ru   r   r5  floatr   r   r   r   r   r  r  rU   rB   rO   r   r   r   r  rK   rD   rL   rM   r[  r[    s   
>JH3 4% 4%v .2 15+0#<@YyyY Y 	Y
 Y %t+Y Y ($.Y %)Y Y %)cNT$9Y@ (,",,595915!& %+0#-1<@#h-(h %h )	h
 _h h h  h ,d2h ,d2h ($.h h h %)h h  #s(^d*!h" %)cNT$9#h hrL   r[  c                      ^  \ rS rSrU 4S jr\S 5       r\S 5       r\S\4S j5       r	\S 5       r
\S 5       rS	rU =r$ )
FSDPTesti  c                 h   > [         TU ]  5         S[        R                  S'   U R	                  5         g )N0TORCH_NCCL_DESYNC_DEBUG)r   rG  osenviron_spawn_processesrI  s    rM   rG  FSDPTest.setUp  s)     14

,-rL   c                     [         $ r   rB  r   s    rM   r   FSDPTest.world_size  rE  rL   c                 >    [         R                  R                  5       $ r   )r   distributed_c10d_get_default_groupr   s    rM   rw   FSDPTest.process_group  s    $$7799rL   rW   c                     grV  rD   r   s    rM   destroy_pg_upon_exitFSDPTest.destroy_pg_upon_exit  s     rL   c                 *    [          U R                   3$ r   )r/   rx  r   s    rM   rs  FSDPTest.init_method  s    t~~.//rL   c                    U " U5      nXl         X6l        UR                  SS5      n[        SUR                    SUR                   35        [
        R                  R                  5       UR                  :  a4  [        R                  " [        SUR                   3   R                  5         U(       a^  [
        R                  R                  R                  R                  R!                  5       n["        R$                  " SUR                  UUS9  OC["        R$                  " UR&                  [(        [+        UR                  5      UR                   S9   S n
UR                   [0        -  n[2        (       d  [4        (       a  [
        R                  R7                  U5        U/n
["        R8                  " U
S9  [
        R:                  R=                  5         [?        5         URA                  X$5        [
        R:                  R=                  5         ["        R8                  " U
S9  ["        RB                  " 5         g ! [,         a@  n	S	U	R.                  S
   ;   a'  [        R                  " [        S   R                  5        e S n	A	ff = f)Nrj  Frk  rl  rm  rn  ro  rr  rt  r   ru  rv  )"r   rx  ry  rz  r   rq   r{  r|  r}  r~  r.   r  r  r  r  rj  r  r   r  rs  r  r   r  ri   rC  r2   r5   r  r  rQ  rR  r1   r  r  r  s               rM   r  FSDPTest._run  s   9~	"**Y.TYYKx/@AB))+doo=HHZ*T__,= >?IIJ	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	9..y9[

 	
+i&
+""$1  	affQi'$9:DDE		s    -A$H7 AH7 7
J;I<<Jr  )rE   rF   rG   rH   rG  rY  r   rw   r5  r  rs  r  r  rK   r6  r7  s   @rM   r  r    ss        : : d   0 0 4% 4%rL   r  c                      ^  \ rS rSr% Sr\r\\S'   \	S\
4S j5       r\	S\
4S j5       r\	U 4S j5       rU 4S jrU 4S	 jr\S
 5       rSrU =r$ )FSDPTestContinuousiL  z
FSDP test base class using MultiProcContinuousTest for faster test execution.
This class reuses worker processes across tests, reducing process spawn overhead.
Use this for tests that don't require fresh process state between tests.
r   rW   c                     [         $ r   )r  r(  s    rM   backend_strFSDPTestContinuous.backend_strU  s    ""rL   c                     [         $ r   )r   r  s    rM   device_typeFSDPTestContinuous.device_typeY  s    rL   c                 `  > S[         R                  S'   [        R                  R	                  5       U:  a*  [
        R                  " [        SU 3   R                  5        U[        -  n[        (       d  [        (       a  [        R                  R                  U5        [        TU ]9  XU5        g )Nr  r  rm  )r  r  rq   r{  r|  r}  r~  r.   r  rC  r2   r5   r  r   _init_pg)r(  r   r   	rdvz_filer  r  s        rM   r  FSDPTestContinuous._init_pg]  s    
 14

,-))+j8HHZ*ZL 9:DDE<'	9..y995rL   c                    > [         TU ]  5         U R                  U R                  :w  a  [        R
                  " 5         [        R                  R                  5         [        5         g r   )
r   rG  r   MAIN_PROCESS_RANKr   r  rq   rQ  rR  r1   rI  s    rM   rG  FSDPTestContinuous.setUpm  s>     99...LLNrL   c                    > U R                   U R                  :w  a  [        R                  " 5         [        TU ]  5         [        R                  R                  5         g r   )	r   r  r   r  r   tearDownrq   rQ  rR  rI  s    rM   r  FSDPTestContinuous.tearDownw  s9    99...LLNrL   c                 .    U R                   R                  $ r   )r  r+  r   s    rM   rw    FSDPTestContinuous.process_group~  s    ~~   rL   rD   )rE   rF   rG   rH   ro   rC  r   r   __annotations__r  r   r  r  r  rG  r  rY  rw   rK   r6  r7  s   @rM   r  r  L  sz     #J"#C # # C   6 6 ! !rL   r  compile_compute_on_modulec                 D   ^ ^^ U 4S jm " S S[         5      mUU4S jnU$ )Nc                     > [         R                  R                  R                  " U 0 UD6  Tb  [	        U S   T5      (       a  U S   R                  5         g g )Nr   )rq   r  r  r   r&  rk  )ri   rj   r  s     rM   !fully_shard_with_compiled_compute=compiled_fsdp_test.<locals>.fully_shard_with_compiled_compute  sS    **D;F;$,
G.1
 1
 GOO1
rL   c                   0    \ rS rSr\" 5       r\" 5       rSrg)*compiled_fsdp_test.<locals>.FullyShardModei  rD   N)rE   rF   rG   rH   r   EAGERCOMPILED_COMPUTErK   rD   rL   rM   FullyShardModer    s    6rL   r  c                 6   >^  [        T 5      UUU 4S j5       nU$ )Nc                    > [         R                  R                  R                  nT GHF  nUTR                  :w  a&  [        5       (       d  [        R                  " SSS9  M:  [         R                  R                  R                  n[         R                  R                  5         UTR                  :X  a  UnO@UTR                  :X  a"  S[         R                  R                  l
        TnO[        SU 35      eUTR                  UR                  '   T" U 0 UD6  [         R                  R                  5         UTR                  UR                  '   U[         R                  R                  l
        GMI     g )Nz0Inductor on GPU needs Triton and recent GPU archr8   )
stacklevelr@   z!Need to implement FullyShardMode=)rq   r  r  r   r  r6   warningswarn	_inductorconfigcompile_threadsr  r  NotImplementedError__globals__rE   )	ri   rj   original_fully_shardmodeoriginal_compile_threadsfully_shard_patchr  r  funcs	         rM   wrapper6compiled_fsdp_test.<locals>.decorator.<locals>.wrapper  s,   (-(9(9(>(>(J(J &>///
MMJWX +0??+A+A+Q+Q(!!))+>///(<%^<<<=>EOO**:(I%-;D6B  CT  !5!>!>?d%f%!!))+BV  !5!>!>?9Q&&67 'rL   r
   )r  r  r  r  s   ` rM   	decorator%compiled_fsdp_test.<locals>.decorator  s"    	t	R 
	R> rL   )r	   )r  r  r  r  s   ` @@rM   compiled_fsdp_testr    s"    " "!F rL   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )
SkipModulei  c                 X   > [         TU ]  5         [        R                  " SSSS9U l        g N
   Fr  )r   r   rt   r   linrI  s    rM   r   SkipModule.__init__  s"    99R%0rL   c                 $    U R                  U5      $ r   r!  rJ  s     rM   r  SkipModule.forward  s    xx{rL   r$  rn   r  r7  s   @rM   r  r    s    1 rL   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )NestedLineari  c                    > [         TU ]  5         U(       a8  [        [        R                  " SSSS9R                  [        5      5      U l        g [        R                  " SSSS9R                  [        5      U l        g r  )r   r   r   rt   r   r   r   nested_linear)rZ   	fsdp_wrapr  s     rM   r   NestedLinear.__init__  sV    !%biiBU&C&F&F{&S!TD!#2r!>!A!A+!NDrL   c                 $    U R                  U5      $ r   r)  rJ  s     rM   r  NestedLinear.forward  s    !!!$$rL   r-  r  r7  s   @rM   r'  r'    s    O% %rL   r'  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	SkipModeli  c                    > [         TU ]  5         [        R                  " SSSS9R	                  [
        5      U l        [        5       R	                  [
        5      U l        [        [        US9[
        S9U l        g )Nr   Fr  )r*  )r  )r   r   rt   r   r   r   linearr  linear_skipr   r'  r)  )rZ   double_nestr  s     rM   r   SkipModel.__init__  sW    iiBU366{C%<??;7!;/;
rL   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   r2  r3  r)  rJ  s     rM   r  SkipModel.forward  s4    KKNQq!rL   r7  r  r7  s   @rM   r0  r0    s    
 rL   r0  )FT)FFr  )rD   r   )
contextlibr  rb  r}  r|  unittestr  abcr   r   collections.abcr   r   copyr   enumr   r	   	functoolsr   typingr   r   r   r   rq   torch.distributedr  r   torch.nnrt   torch.nn.functionalr  r  torch.distributed._composabler   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   r   $torch.distributed.fsdp._common_utilsr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r   r   torch.distributed.tensorr    r!   r"   !torch.distributed.tensor.parallelr#   r$   r%   r&   r'   r(   torch.nn.parallel.distributedr)   r  *torch.testing._internal.common_distributedr*   r+   r,   r-   r.   $torch.testing._internal.common_utilsr/   r0   r1   r2   r3   r4   r5   torch.utils._tritonr6   minmaxr9   r|  rC  r   r  r<   rB   rO   ru   rU   r4  r   r   r5  r   r   r   r   r   r   r   r   r   r9  rX  r\  rp  r  r  r  r  rA  r  r  contextmanagerr  r  r  r  r  r  r  r!  r%  r)  rp   r   r>  skipIfr@  r[  r  r  r  r  r  r'  r0  rD   rL   rM   <module>rW     s    	 	 
    # $ "    + +        4 4 
 ? S 
 I R R F F  F H    + q#a!8!8!:;<LLK ::**,LK K 99))+LK L4 T BIIs 499$$ B% #99##""2299 2t 2>299 >d >DBII D$ D .Q"- Q"h[J- [J|$7 D]J. ]J@J
m J
Z
? 
.,ryy ,JJ* JJZ/")) /@*r}} *Z'299 '6 6 6 6 
X 
 
" 
X 
 
" 9H 9 9 *X * * .x .  . .x .  . :8 :  : >X >  >0!0 0 	0
 0. +-	"Tyy"T II"T c3h	"TJ 
45/  6&L L^
O%}2 O%d4!(? 4!n/$+ /d 	%299 	%		 rL   