
    
9j                     8   S SK r S SKrS SKrS SKJr  S SKJr  S SKrS SKJ	s  J
r  S SKJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  \" 5       (       a  S SKJrJr  S SKJr  S SKJr  S SK J!r!  S SK"J#r#  S SK$J%r%  S SK&J'r'J(r(  S SK)J*r*  S SK+J,r,  S SK-J.r.  S SK/J0r0J1r1  S SK2J3r3  S SK4J5r5J6r6J7r7J8r8  S SK9J:r:J;r;J<r<J=r=J>r>  S SK?J@r@JArAJBrB  S SKCJDrD  S SKEJFrF  S SKGJHrHJIrIJJrJJKrKJLrLJMrM  S SKNJOrO  S SKPJQrQJRrRJSrSJTrTJUrUJVrV  S SKWJXrXJYrYJZrZ  S<S  jr[S! r\ " S" S#5      r]S$ r^S% r_ " S& S'\5      r`S( ra " S) S*5      rb " S+ S,\5      rcS- rd " S. S/\5      re " S0 S1\e5      rf " S2 S3\e5      rg " S4 S5\e5      rhS6 riS=S7 jrj " S8 S9\R                  R                  5      rlS: rmS; rng)>    N)ABC)partial)BCEWithLogitsLossCrossEntropyLossMSELoss   )AcceleratedOptimizer)AcceleratedScheduler   )is_megatron_lm_available)recursively_applysend_to_device)mputensor_parallel)DistributedDataParallel)finalize_model_grads)	ModelType)get_num_microbatches)get_megatron_optimizer)get_tensor_model_parallel_group"get_tensor_model_parallel_src_rank)get_forward_backward_func)get_model_config)build_train_valid_test_datasets)	BertModelT5Model)Classification)get_argsget_tensorboard_writerget_tokenizerprint_rank_last)_add_data_args_add_validation_args!core_transformer_config_from_args
parse_argsvalidate_args)load_args_from_checkpointload_checkpointsave_checkpoint)set_global_variables)gpt_builder)_compile_dependencies_init_autoresume_initialize_distributed_set_random_seedset_jit_fusion_optionswrite_args_to_tensorboard)_vocab_size_with_padding)%build_train_valid_test_data_iteratorsget_optimizer_param_schedulernum_floating_point_operationssetup_model_and_optimizer
train_steptraining_log))average_losses_across_data_parallel_groupcalc_params_l2_normget_ltor_masks_and_position_idsc           
      T   [        5       nUR                  (       a  SOSnUR                  S:X  a'  [        SUR                   SU S35        [        S5        [        U5      nUR                  S:X  aZ  UR                  (       a/  UR                  (       a  S	OSn[        UUUR                  S
U US9nU$ [        UUR                  S	U US9n U$ UR                  S:X  a  SUl
        [        X@USSS9nU$ UR                  S:X  a  [        USS
U UUUS9nU$ [        SUR                   35      e)zBuild the model.zpre-trainingzfine-tuningr   z	Building z model in the z mode.zThe Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.bertr   T)confignum_tokentypesadd_binary_headparallel_outputpre_processpost_process)r>   num_classesr?   rB   rC   gptFN)vp_stager>   t5)r>   r?   rA   rB   rC   add_encoderadd_decoderUnsupported model type: )r   pretraining_flagrankprintmodel_type_namer$   bert_binary_headr   r   
num_labelsuse_legacy_modelsr+   r   
ValueError)	rB   rC   rH   rI   argsmoder>   r?   models	            V/home/wildlama/miniconda3/lib/python3.13/site-packages/accelerate/utils/megatron_lm.pymodel_provider_funcrW   U   sS   :D!22>DyyA~	$../~dV6JKx	
 /t4Fv%  "&"7"7QQN- $ 5 5 $')E@ L/ # OO ')E. L! 
			&!&D|dSWX L 
			% #%##
 L 3D4H4H3IJKK    c                    U R                  S5        [        5       nU R                  R                  R                  b  U R                  R                  R
                  c  [        S5      eU R                  R                  R
                  nU R                  R                  R	                  U5      n[        X5      n[        XS S9nO[        R                  nUR                  S:X  a  [        R                  n[        nU R                  R                  R
                  b   U R                  R                  R
                  n[        UU5      u  p4n[        U5      Ul        X4U4$ )Nz#Preparing model optimizer schedulerzaYou must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`.)	schedulerrG   )rM   r   statemegatron_lm_plugincustom_prepare_model_functioncustom_model_provider_functionrR   prepare_optimizerprepare_schedulerr   encoder_or_decoderrN   encoder_and_decoderrW   r6   len	model_len)acceleratorrS   custom_model_provider_funcrU   	optimizerrZ   
model_typemodel_provider_func_s           rV   !prepare_model_optimizer_schedulerrj      s,   ;<:D++IIU//NNVs  &1%6%6%I%I%h%h"!!44RRSmn%k9	%kM	11
4'"66J2//NNZ#.#4#4#G#G#f#f (A )
%9 ZDNY&&rX   c                   0    \ rS rSrSrS rS rS rS rSr	g)	MegatronLMDummyDataLoader   z
Dummy dataloader presents model parameters or param groups, this is primarily used to follow conventional training

Args:
    **dataset_kwargs: Megatron data arguments.
c                     [         R                  " 5       n[        U5      n[        U5      nUR	                  5       n[        US   5      U l        U R                  R                  U5        SU R                  S'   g )Nr   Tmegatron_dataset_flag)argparseArgumentParserr"   r#   parse_known_argsvarsdataset_argsupdate)selfdataset_kwargsparser	data_argss       rV   __init__"MegatronLMDummyDataLoader.__init__   sh    ((*'%f-++-	 1.  05912rX   c                     [        5       nU R                  R                  5        H9  u  p#[        XS5      nXC:w  a  [	        SU SU SU SU 35        [        XU5        M;     g )N z<WARNING: MegatronLMDummyDataLoader overriding arguments for : with )r   rt   itemsgetattrrM   setattr)rv   rS   keyvalue	old_values        rV   set_megatron_data_args0MegatronLMDummyDataLoader.set_megatron_data_args   sq    z++113JC2.I!RSVRWWXYbXccijminnopuovw Du% 4rX   c                    S nUR                   R                  R                  b   UR                   R                  R                  $  [        5       nUR                  S:X  a  SSKJn  SUl        U$ UR                  S:X  a  SSKJn  SUl        U$ UR                  S:X  a  SSK	Jn  SUl        U$  U$ ! [         a     U$ f = f)Nc                 Z   [        5       n[        UR                  [        [        45      (       a  UR                  OUR                  /UR
                  U UR                  S.nUR                  S:X  a)  UR                  UR                  UR                  S.5        OUR                  S:X  a  UR                  SUR                  05        ORUR                  S:X  a*  UR                  UR                  UR                  SS.5        O[        SUR                   35      e[        S	0 UD6u  p4nX4U4$ )
z&Build train, valid, and test datasets.)data_prefixsplits_stringtrain_valid_test_num_samplesseedr=   )max_seq_lengthbinary_headrE   r   rG   )r   max_seq_length_decdataset_typerJ    )r   
isinstance	data_pathlisttuplesplitr   rN   ru   
seq_lengthrO   encoder_seq_lengthdecoder_seq_lengthrR   r   )train_val_test_num_samplesrS   rt   train_dsvalid_dstest_dss         rV   "train_valid_test_datasets_providerlMegatronLMDummyDataLoader.get_train_valid_test_datasets_provider.<locals>.train_valid_test_datasets_provider   s   :D1;DNNTSXM1Z1Zt~~aeaoao`p!%0J			L ##v-##*.//'+'<'< %%.##($//
 %%-##*.*A*A.2.E.E(, !#;D<P<P;Q!RSS*I*YL*Y'Hw..rX   r=   r   )r   TrE   rG   )r[   r\   *custom_megatron_datasets_provider_functionr   rN   pretrain_bertr   is_distributedpretrain_gptpretrain_t5ImportError)rv   re   r   rS   s       rV   &get_train_valid_test_datasets_provider@MegatronLMDummyDataLoader.get_train_valid_test_datasets_provider   s    !	/F //ZZf$$77bbb	:D##v-LDH2A99%%.KDH2A99%%-JDH2A99	 . 21  	11	s   (B0 /B0 B0 0
B>=B>c                 x   [        5       nU R                  U5      nUR                  b  / n/ n/ n[        [	        USS5      5       H`  n[
        R                  " U5        [        U5      nUR                  US   5        UR                  US   5        UR                  US   5        Mb     O[        U5      u  pEnXEU4$ )Nrd   r   r   r   )	r   r   $virtual_pipeline_model_parallel_sizeranger   r   (set_virtual_pipeline_model_parallel_rankr3   append)	rv   re   rS   !train_valid_test_dataset_providertrain_data_iteratorvalid_data_iteratortest_data_iteratori	iteratorss	            rV   r3   ?MegatronLMDummyDataLoader.build_train_valid_test_data_iterators   s    z,0,W,WXc,d)44@"$"$!#74a89<<Q?ABcd	#**9Q<8#**9Q<8")))A,7 : Lq1LH6H #9KKKrX   )rt   N)
__name__
__module____qualname____firstlineno____doc__rz   r   r   r3   __static_attributes__r   rX   rV   rl   rl      s    :&:2xLrX   rl   c                     " S S5      nUS L n[         R                  " U[         R                  U R                  S9n[         R                  R                  U[        5       [        5       S9  U(       d  U(       a  U" 5       $ U$ )Nc                        \ rS rSrS rS rSrg)?_handle_megatron_data_iterator.<locals>.DummyMegatronDataloaderi  c                     U $ Nr   rv   s    rV   __iter__H_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__iter__  s    KrX   c                     0 $ r   r   r   s    rV   __next__H_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__next__  s    IrX   r   N)r   r   r   r   r   r   r   r   rX   rV   DummyMegatronDataloaderr     s    		rX   r   dtypedevicegroup)torchtensorboolr   distributed	broadcastr   r   )re   data_iteratorr   is_data_iterator_emptyis_src_data_iterator_emptys        rV   _handle_megatron_data_iteratorr     sw      +d2!&.DEJJ_j_q_q!r	"$F$HPoPq    &*@&((rX   c                 0   U R                  S5        [        5       nUR                  (       Gd3  SSKJnJn  UR                  UR                  -  nU Vs0 s H  of[        XX6   5      _M     nnUS   cS  [        US   [        R                  R                  R                  5      (       a
  XWS   l        OUS	 US	 US	 XWS   l        OUS	 XWS'   [        R                  R                  R                  " UR                   40 UD6nU" UU R"                  [$        R&                  " 5       [$        R(                  " 5       SS	U R*                  R-                  5       U R.                  S
9$ UR0                  b   UR0                  u  Ul        Ul        Ul        OSu  Ul        Ul        Ul        UR                  UR                  -  Ul        UR9                  U 5      u  nn	n
UR                  UR                  -  Ul        [;        XS9n[;        X	S9n	[;        X
S9n
XU
4$ s  snf )NzPreparing dataloaderr   )_PYTORCH_DATALOADER_KWARGSprepare_data_loader
batch_sizesamplershufflebatch_samplerFT)num_processesprocess_indexsplit_batchesput_on_device	rng_typesdispatch_batches)r   r   r   )re   r   )rM   r   ro   data_loaderr   r   micro_batch_sizenum_micro_batchesr   r   r   utilsdataBatchSamplerr   
DataLoaderdatasetr   r   get_data_parallel_world_sizeget_data_parallel_rankr   copyr   consumed_samplesconsumed_train_samplesconsumed_valid_samplesconsumed_test_samplesr3   r   )re   
dataloaderrS   r   r   r   kkwargsr   r   r   s              rV   r   r   !  s   ,-:D%%%Q0043I3IITnoTnqWZ,F,IJJTno,'&+U[[-=-=-J-JKK/?y!,9%9%<(5E'2'#3< [[%%001C1CNvN
 #::<446!++002(99	
 		
   ,
 %%	++* dk`D')DdF` $ 5 58N8N N <<[I		
 $ 5 59O9O O<#
 =#
 <v"9KKKo ps   Hc                   H   ^  \ rS rSrU 4S jrSS jrS r\S 5       rSr	U =r
$ )MegatronLMOptimizerWrapperic  c                 $   > [         TU ]  USS S9  g )NF)device_placementscalersuperrz   )rv   rg   	__class__s     rV   rz   #MegatronLMOptimizerWrapper.__init__d  s    U4HrX   c                     g r   r   )rv   set_to_nones     rV   	zero_grad$MegatronLMOptimizerWrapper.zero_gradg      rX   c                     g r   r   r   s    rV   stepMegatronLMOptimizerWrapper.stepj  r  rX   c                 .    U R                   R                  $ )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.)rg   skipped_iterr   s    rV   step_was_skipped+MegatronLMOptimizerWrapper.step_was_skippedm  s     ~~***rX   r   r   )r   r   r   r   rz   r   r  propertyr  r   __classcell__r   s   @rV   r   r   c  s'    I + +rX   r   c                     U R                  S5        [        5       n[        XR                  UR                  UR
                  5      $ )NzPreparing optimizer)rM   r   r   no_wd_decay_condscale_lr_condlr_mult)re   rU   rS   s      rV   r_   r_   s  s:    +,:D!%)>)>@R@RTXT`T`aarX   c                   "    \ rS rSrSrSS jrSrg)MegatronLMDummyScheduleriz  a  
Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
loop when scheduler config is specified in the deepspeed config file.

Args:
    optimizer (`torch.optim.optimizer.Optimizer`):
        The optimizer to wrap.
    total_num_steps (int):
        Total number of steps.
    warmup_num_steps (int):
        Number of steps for warmup.
    **kwargs (additional keyword arguments, *optional*):
        Other arguments.
Nc                 4    Xl         X l        X0l        X@l        g r   )rg   total_num_stepswarmup_num_stepsr   )rv   rg   r  r  r   s        rV   rz   !MegatronLMDummyScheduler.__init__  s    ". 0rX   )r   rg   r  r  Nr   )r   r   r   r   r   rz   r   r   rX   rV   r  r  z  s    rX   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )MegatronLMSchedulerWrapperi  c                 $   > [         TU ]  X5        g r   r   )rv   rZ   
optimizersr   s      rV   rz   #MegatronLMSchedulerWrapper.__init__  s    /rX   c                     g r   r   )rv   rS   r   s      rV   r  MegatronLMSchedulerWrapper.step  s    rX   r   )r   r   r   r   rz   r  r   r
  r  s   @rV   r  r    s    0 rX   r  c                 >    U R                  S5        [        U5      nU$ )NzPreparing scheduler)rM   r4   )re   rg   rZ   s      rV   r`   r`     s!    +,-i8IrX   c                   >   ^  \ rS rSrSrU 4S jrS rS rS rSr	U =r
$ )AbstractTrainStepi  z;Abstract class for batching, forward pass and loss handler.c                 .   > [         TU ]  5         Xl        g r   )r   rz   name)rv   r"  r   s     rV   rz   AbstractTrainStep.__init__  s    	rX   c                     g r   r   )rv   re   ro   s      rV   get_batch_func AbstractTrainStep.get_batch_func  r  rX   c                     g r   r   r   s    rV   get_forward_step_func'AbstractTrainStep.get_forward_step_func  r  rX   c                     g r   r   )rv   re   s     rV   get_loss_funcAbstractTrainStep.get_loss_func  r  rX   )r"  )r   r   r   r   r   rz   r%  r(  r+  r   r
  r  s   @rV   r   r     s    E rX   r   c                   >   ^  \ rS rSrSrU 4S jrS rS rS rSr	U =r
$ )BertTrainStepi  zW
Bert train step class.

Args:
    args (`argparse.Namespace`): Megatron-LM arguments.
c                 Z  > [         TU ]  S5        U R                  XR                  5      U l        U R                  XR                  UR                  5      U l        U R                  UR                  UR                  5      U l        UR                  (       d  S U l        g SSKJn  X0l        g )Nr.  r   )SequenceClassifierOutput)r   rz   r%  ro   	get_batchr+  rK   rP   	loss_funcr(  rO   forward_stepmodel_return_dictmodel_output_classtransformers.modeling_outputsr0  )rv   re   rS   r0  r   s       rV   rz   BertTrainStep.__init__  s    ),,[:T:TU++K9N9NPTP_P_` 66t7L7LdNcNcd%%&*D#N&>#rX   c                     S nS nUR                   R                  R                  b   UR                   R                  R                  $ U(       a	   SSKJn  U$ U$ ! [
         a     U$ f = f)Nc                 h   / SQn[         R                  nU b  [        U 5      nOSn[        R                  " XU5      nUS   R                  5       nUS   R                  5       nUS   R                  5       nUS   R                  5       nUS   R                  5       n	US   R                  5       n
XVXxX4$ )	Build the batch.)texttypeslabels	is_random	loss_maskpadding_maskNr;  r<  r>  r?  r=  r@  r   int64nextr   broadcast_datalongfloat)r   keysdatatyper   data_btokensr<  sentence_orderr?  	lm_labelsr@  s              rV   get_batch_megatron8BertTrainStep.get_batch_func.<locals>.get_batch_megatron  s     YD{{H (M*$33DIF F^((*F7O((*E#K0557N{+113Ix(--/I!.1668L.YTTrX   c                    [        U 5      n[        U[        R                  R	                  5       5      nUS   R                  5       nUS   R                  5       nSU;   a  US   R                  5       nOSnSU;   a9  US   R                  5       nUS   S:g  R                  [        R                  5      nOSnSnSU;   a  US   R                  5       nOSnX$XvXS4$ )r:  	input_idsattention_masktoken_type_idsNr=  next_sentence_label)rC  r   r   cudacurrent_devicerE  torF  )r   r   rJ  r@  r<  rL  r?  rK  s           rV   get_batch_transformer;BertTrainStep.get_batch_func.<locals>.get_batch_transformer  s    &D!$

(A(A(CDD +&++-F 01668L4'-.3354 N//1	!(^t377D	 	 	$,!%&;!<!A!A!C!%.YTTrX   r   r1  )r[   r\   custom_get_batch_functionr   r1  r   rv   re   ro   rM  rX  r1  s         rV   r%  BertTrainStep.get_batch_func  ss    	U0	U2 //IIU$$77QQQ 3  
 )(	  %%   A 
A('A(c                    ^ ^ S nUU 4S jnUR                   R                  R                  b   UR                   R                  R                  $ U(       a  U$ U$ )Nc                    Uu  p4UR                  5       nU R                  5       n [        R                  " UR                  S5      U R	                  S5      -  5      U R                  5       -  nUbo  [
        R                  " UR                  SS5      R                  5       UR                  S5      SS9nUR                  5       nXV-   n[        XV/5      nXxS   US   S.4$ Un[        U/5      nUSUS   04$ )Nr   )ignore_indexr   r   )lm losszsop lossrc  )rF  r   sumviewreshapeFcross_entropyr9   )	r?  rK  output_tensorlm_loss_
sop_logitslm_losssop_losslossaveraged_lossess	            rV   loss_func_pretrain7BertTrainStep.get_loss_func.<locals>.loss_func_pretrain  s    #0 H~~'H!)Iiib 1I4E4Eb4I IJY]]_\G%??:??2q+A+G+G+I>K^K^_aKbqst#>>+)"KWL_"`);YZI[\\\ "KWI"Vi);<<<rX   c                   > TS:X  a2  [        5       nU" UR                  S5      U R                  S5      5      nOTR                  S:  aa  U R                  [        R
                  [        R                  4;   a3  [        5       nU" UR                  ST5      U R                  S5      5      nO[        5       nU" X5      n[        U/5      nUSUS   04$ )Nr   ra  rn  r   )
r   re  rP   r   r   rE  intr   r   r9   )r=  logitsloss_fctrn  ro  rP   rv   s        rV   loss_func_finetune7BertTrainStep.get_loss_func.<locals>.loss_func_finetune  s    Q"9BRA1$&,,5::uyy:Q*Q+-B
 ;V[[_M,./GOO&/!"4555rX   r[   r\   custom_loss_function)rv   re   rK   rP   rp  rv  s   `  `  rV   r+  BertTrainStep.get_loss_func  sN    	=&	6 //DDP$$77LLL%%%%rX   c                    ^ ^^ UUU 4S jnU$ )Nc                    > TR                  U 5      u  p#pEpgT
(       d  SnT(       a  U" X'X6S9nU[        TR                  XT5      4$ U" X'US9n	U	[        TR                  U5      4$ )Forward step.Ntokentype_idsrL  )r  r1  r   r2  )r   rU   rJ  r<  rK  r?  r=  r@  ri  rt  rO   rK   rv   s             rV   r3  9BertTrainStep.get_forward_step_func.<locals>.forward_step.  sh    MQ^^\iMjJF>f# %f% b$gdnni&XXXv5Iwt~~v>>>rX   r   )rv   rK   rO   r3  s   ``` rV   r(  #BertTrainStep.get_forward_step_func-  s    	? rX   r3  r1  r2  r5  r   r   r   r   r   rz   r%  r+  r(  r   r
  r  s   @rV   r.  r.    s#    
?>)@'&R rX   r.  c                   >   ^  \ rS rSrSrU 4S jrS rS rS rSr	U =r
$ )GPTTrainStepi>  zV
GPT train step class.

Args:
    args (`argparse.Namespace`): Megatron-LM arguments.
c                   > [         TU ]  S5        U R                  XR                  5      U l        U R                  U5      U l        U R                  5       U l        UR                  b  [        5       nUR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                   U l        UR"                  U l        UR$                  (       d  S U l        g SSKJn  X@l        g )Nr  r   )!CausalLMOutputWithCrossAttentions)r   rz   r%  ro   r1  r+  r2  r(  r3  
vocab_filer    eod	eod_tokeneos_token_id	pad_tokenreset_position_idsreset_attention_maskeod_mask_lossr4  r5  r6  r  )rv   re   rS   	tokenizerr  r   s        rV   rz   GPTTrainStep.__init__F  s    (,,[:T:TU++K8 668??&%I&]]DN****"&"9"9$($=$=!!//%%&*D#W&G#rX   c                    ^  U 4S jnU 4S jnUR                   R                  R                  b   UR                   R                  R                  $ U(       a	   SSKJn  U$ U$ ! [
         a     U$ f = f)Nc           
        > S/n[         R                  nU b  [        U 5      nOSn[        R                  " XU5      nUS   R                  5       nUSS2SS24   R                  5       nUSS2SS24   R                  5       n[        UTR                  TR                  TR                  TR                  TR                  SS9u  pn
XvXU
4$ )zGenerate a batchr;  Nr   ra  Tr  r  r  r  r  pad_mask_loss)r   rB  rC  r   rD  rE  
contiguousr;   r  r  r  r  )r   rG  rH  r   rI  tokens_r=  rJ  rQ  r?  position_idsrv   s              rV   rM  7GPTTrainStep.get_batch_func.<locals>.get_batch_megatron[  s     8D{{H (M*$33DIF Vn))+GQU^..0FQV_//1F 7V....#'#:#:%)%>%>"00"73N| 9lJJrX   c           
      L  > [        U 5      nSUS   0n[        U[        R                  R	                  5       5      nUS   R                  5       n[        R                  " UR                  S   S4UR                  UR                  S9T	R                  -   n[        R                  " X#/SS9nUS S 2SS 24   R                  5       nUS S 2S S24   R                  5       n[        UT	R                  T	R                  T	R                  T	R                  T	R                   SS9u  pgnXTXvU4$ )	NrP  r   r   r   dimra  Tr  )rC  r   r   rU  rV  rE  zerosshaper   r   r  concatr  r;   r  r  r  )
r   r   r  paddingr=  rJ  rQ  r?  r  rv   s
            rV   rX  :GPTTrainStep.get_batch_func.<locals>.get_batch_transformery  s   &Dk!23D!$

(A(A(CDD;',,.Gkk7==#3Q"7w}}U\UcUcdgkguguuGllG#51=GQU^..0FQV_//1F6U....#'#:#:%)%>%>"00"73N| 9lJJrX   r   rZ  )r[   r\   r[  r   r1  r   r\  s   `     rV   r%  GPTTrainStep.get_batch_funcZ  st    	K<	K, //IIU$$77QQQ 2  
 )(	  %%s   A! !
A/.A/c                    ^ [        5       mU4S jnUR                  R                  R                  b   UR                  R                  R                  $ U$ )Nc                   > TR                   (       a  Uu  p#OUnUR                  5       nU R                  S5      R                  5       n TR                  S:  a  [        R
                  " [        R                  " UR                  S5      U -  5      R                  S5      U R                  5       R                  S5      /5      n[        R                  R                  U[        R                  " 5       S9  US   US   -  nO9[        R                  " UR                  S5      U -  5      U R                  5       -  nTR                  (       au  [        R                  R                  5       nUR                  5       (       aB   SU S[        R                  R                  5        S[         R"                  " 5       S    35       e[%        U/5      nSUS   0nTR                   (       a  UR'                  S	W05        XG4$ )
Nra  r   r   r   zRank z7: found NaN in local forward loss calculation. Device: z, node: rc  rt  )return_logitsrF  re  context_parallel_sizer   catrd  r   
all_reducer   get_context_parallel_groupcheck_for_nan_in_loss_and_gradget_rankisnanrU  rV  osunamer9   ru   )	r?  ri  lossesrt  rn  global_rankaveraged_lossoutput_dictrS   s	           rV   r2  -GPTTrainStep.get_loss_func.<locals>.loss_func  s   !!!.&\\^F!r*002I))A-yy%))FKKOi,G"H"M"Ma"PR[R_R_RaRfRfghRi!jk!!,,T9W9W9Y,ZAwa(yyR9!<=	O 22#//88:::<< K= )$zz88:;8BHHJqM?T' FtfMM$mA&67K!!""Hf#56$$rX   )r   r[   r\   ry  )rv   re   r2  rS   s      @rV   r+  GPTTrainStep.get_loss_func  sG    z	%< //DDP$$77LLLrX   c                    ^  U 4S jnU$ )Nc                 l   > TR                  U 5      u  p#pEnU" X&XSS9nU[        TR                  U5      4$ )r}  )r=  r  )	r   rU   rJ  r=  r?  rQ  r  ri  rv   s	           rV   r3  8GPTTrainStep.get_forward_step_func.<locals>.forward_step  s?     GKnnUbFcCFI|!&VM '$..)"DDDrX   r   rv   r3  s   ` rV   r(  "GPTTrainStep.get_forward_step_func  s    	E rX   )	r  r  r3  r1  r2  r5  r  r  r  r  r  s   @rV   r  r  >  s%    H(A)F#J	 	rX   r  c                   n   ^  \ rS rSrSrU 4S jr\S 5       r\S 5       r\S 5       r	S r
S rS	 rS
rU =r$ )T5TrainStepi  zU
T5 train step class.

Args:
    args (`argparse.Namespace`): Megatron-LM arguments.
c                   > [         TU ]  S5        U R                  XR                  5      U l        U R                  U5      U l        U R                  5       U l        UR                  (       d  S U l
        g SSKJn  X0l
        g )Nr  r   )Seq2SeqLMOutput)r   rz   r%  ro   r1  r+  r2  r(  r3  r4  r5  r6  r  )rv   re   rS   r  r   s       rV   rz   T5TrainStep.__init__  se    ',,[:T:TU++K8 668%%&*D#E&5#rX   c                 \    U R                  S5      nU R                  S5      nX-  nUS:  nU$ )Nr   r         ?)	unsqueeze)rQ  attention_mask_b1sattention_mask_bs1attention_mask_bssextended_attention_masks        rV   attn_mask_postprocess!T5TrainStep.attn_mask_postprocess  s@     ,55a8+55a8/D"4s":&&rX   c                 f    [         R                  " [         R                  " SX 4US95      nUS:  nU$ Nr   r   r  )r   trilones)r   r   rQ  s      rV   get_decoder_maskT5TrainStep.get_decoder_mask  s1    EJJ:/JSY$Z['#-rX   c                     U R                   u  p4U R                  S5      n[        R                  " X1S4US9nXe-  nUS:  nU$ r  )r  r  r   r  )	rQ  dec_seq_lengthr   r   _r  r  r  r  s	            rV   get_enc_dec_maskT5TrainStep.get_enc_dec_mask  sS    &,,
 ,55a8"ZZQ(GPVW/D"4s":&&rX   c                     S nS nUR                   R                  R                  b   UR                   R                  R                  $ U(       a	   SSKJn  U$ U$ ! [
         a     U$ f = f)Nc                 N   / SQn[         R                  nU b  [        U 5      nOSn[        R                  " XU5      nUS   R                  5       nUS   R                  5       nUS   R                  5       nUS   R                  5       nUS   S:  n	US	   S:  n
US
   S:  nXVXXU4$ )r:  )text_enctext_decr=  r?  enc_maskdec_maskenc_dec_maskNr  r  r=  r?  r  r  r  r  rA  )r   rG  rH  r   rI  
tokens_enc
tokens_decr=  r?  r  r  r  s               rV   rM  6T5TrainStep.get_batch_func.<locals>.get_batch_megatron  s     kD{{H (M*$33DIF  
+002J
+002JH%**,F{+113Ij)C/Hj)C/H!.1C7L9hR^^^rX   c                 2   [        U 5      n[        U[        R                  R	                  5       5      nUS   R                  5       nUS   R                  5       nUS:g  R                  [        R                  5      nSU;   a  US   R                  5       nOkUR                  UR                  UR                  [        R
                  S9nUSSS24   R                  5       USS	S24'   S
US'   UR                  US:H  S
5        [        R                  US   R                  5       5      n[        R                  UR                  S	   UR                  5      n[        R!                  US   R                  5       UR                  S	   UR                  5      nX%XCXgU4$ )r:  rP  r=  rS  decoder_input_ids)r   r   .Nra  r   r   ).r   rQ  )rC  r   r   rU  rV  rE  rW  rF  	new_zerosr  r   clonemasked_fill_r  r  r  r  )	r   r   r  r=  r?  r  r  r  r  s	            rV   rX  9T5TrainStep.get_batch_func.<locals>.get_batch_transformer  st   &D!$

(A(A(CDDk*//1J(^((*F4++EKK8I"d*!"56;;=
#--fll6==X]XbXb-c
&,S#2#X&6&<&<&>
37#%&
6"''
d(:A>"88>N9O9T9T9VWH"33J4D4DQ4GIZIZ[H&77%&++-z/?/?/BJDUDUL 9hR^^^rX   r   rZ  )r[   r\   r[  r   r1  r   r\  s         rV   r%  T5TrainStep.get_batch_func  ss    	_2	_. //IIU$$77QQQ 1  
 )(	  %%r^  c                     S nUR                   R                  R                  b   UR                   R                  R                  $ U$ )Nc                     UR                  5       n[        R                  " UR                  S5      U R	                  S5      -  5      U R                  5       -  nUn[        U/5      nUSUS   04$ )Nra  rc  r   )rF  r   rd  re  rf  r9   )r?  ri  rj  rl  rn  ro  s         rV   r2  ,T5TrainStep.get_loss_func.<locals>.loss_funcA  sh    $**,Hiib 1I4E4Eb4I IJY]]_\GDG	RO)_Q%7888rX   rx  )rv   re   r2  s      rV   r+  T5TrainStep.get_loss_func@  s?    	9 //DDP$$77LLLrX   c                    ^  U 4S jnU$ )Nc           
      t   > T
R                  U 5      u  p#pEpgnU" X#XgUSUS9n	U	[        T
R                  U5      4$ )r}  Nr~  r  )r   rU   r  r  r?  rL  r  r  r  ri  rv   s             rV   r3  7T5TrainStep.get_forward_step_func.<locals>.forward_stepO  sU     ^b]k]k^ZJI(l "LX\hqM !'$..)"DDDrX   r   r  s   ` rV   r(  !T5TrainStep.get_forward_step_funcN  s    	E rX   r  )r   r   r   r   r   rz   staticmethodr  r  r  r%  r+  r(  r   r
  r  s   @rV   r  r    s^    
6 
' 
'  
 	' 	'=)~ rX   r  c                      [        5       n [        S S S 5        U R                  S:X  a  [        SU R                   S35        [        U R                  U R                  5        g )Nr   z> setting random seeds to z ...)r   r.   rL   rM   r   r/   data_parallel_random_init)rS   s    rV   finish_mpu_initr  _  sL    :DD$- yyA~*499+T:;TYY > >?rX   c                 V   Uc  0 nU R                  S5        [        R                  R                  5       (       d   S5       e[	        USS9nUR                  5        HM  u  pE[        X4S 5      b/  UR                  S:X  a  [        SU S[        X45       SU SU 3SS	9  [        X4U5        MO     UR                  (       d  UR                  S
S5      (       a  UR                  c   S5       e[        U5        [        U5        [        USS9  [        5         [!        5         [#        5         [%        5         ['        5       n[        USS 5      c  [)        UR*                  U5      Ul        UR.                  S:X  a)  UR0                  (       a  UR2                  S:X  a  SUl        OSUl        SUl        g )NzInitializing Megatron-LMzMegatron requires CUDA.T)ignore_unknown_argsr   z*WARNING: overriding default arguments for r~   r   )flushuse_checkpoint_argsFz/--use-checkpoints-args requires --load argument)build_tokenizerpadded_vocab_sizer=   r   )rM   r   rU  is_availabler%   r   r   rL   r   r  getloadr'   r&   r*   r  r-   r,   r0   r   r2   orig_vocab_sizer  rN   rK   rP   rO   	iteration)re   extra_args_providerargs_defaultsrS   r   r   s         rV   
initializer  l  s   01::""$$?&??$ )tDD $))+
4d#/yyA~@QwtGYFZZ`ad`eefglfmn 	5! , =#4#45JE#R#Ryy$W&WW$!$'$ u5    :Dt($/7!9$:N:NPT!Uv%$*?*?DOOWXDX $ %DNrX   c                   h   ^  \ rS rSrSrU 4S jrS rS rS rS r	S r
S	 rS
 rS rS rS rSrU =r$ )MegatronEnginei  z
Megatron-LM model wrapper

Args:
    accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
    model: Megatron-LM model
    optimizer: Megatron-LM optimizer
    lr_scheduler: Megatron-LM lr scheduler
c                   > [         TU ]  5         X l        US   U l        X0l        X@l        [        5       nUR                  R                  R                  bK  UR                  R                  R                  " U40 UR                  R                  R                  D6U l        O{UR                  S:X  a  [        X5      U l        OZUR                  S:X  a  [        X5      U l        O9UR                  S:X  a  [        X5      U l        O[!        SUR                   35      eSU R                  l        0 U l        0 U l        SU l        SU l        SU l        S U l        UR0                  b  [3        5         g g )Nr   r=   rE   rG   rJ   FT)r   rz   module
base_modelrg   rZ   r   r[   r\   custom_train_step_classcustom_train_step_kwargstrain_step_handlerrN   r.  r  r  rR   r  total_loss_dicteval_total_loss_dictr  report_memory_flag$num_floating_point_operations_so_farmodule_configtensorboard_dirr1   )rv   re   rU   rg   rZ   rS   r   s         rV   rz   MegatronEngine.__init__  sD   (""z//GGS&1&7&7&J&J&b&b'#))<<UU'D# !!V+&3K&FD#!!U*&2;&ED#!!T)&1+&DD#78L8L7MNOO&+#  "$&!"&451!+%' ,rX   c                   ^ ^ [        5       n[        T R                  S   5      nT R                  R                  Ul        [        T R                  S   [        5      (       a  UR                  (       a  UR                  b   S5       eT R                   Vs/ s H  o3R                  PM     snUl	        [        T R                  5      S:X  a  UR                  S   Ul	        UR                  (       aX  T R                   Vs/ s H  o3R                  PM     snUl        [        T R                  5      S:X  a  UR                  S   Ul        UR                  (       ax  UR                   (       ag  [#        [        T R                  5      5       V^s/ s H
  mUU 4S jPM     snUl        [        T R                  5      S:X  a  UR$                  S   Ul        [&        Ul        U$ s  snf s  snf s  snf )Nr   zWhen overlap_grad_reduce is True, config.no_sync_func must be None; a custom no_sync_func is not supported when overlapping grad-reducer   c                 <   > TR                   R                  TU 5      $ r   )rg   finish_param_sync)xmodel_indexrv   s    rV   <lambda>2MegatronEngine.get_module_config.<locals>.<lambda>  s    $..::;JrX   )r   r   r  rg   
scale_lossgrad_scale_funcr   LocalDDPoverlap_grad_reduceno_sync_funcno_syncrc   delay_grad_reducestart_grad_syncgrad_sync_funcoverlap_param_gatherdelay_param_gatherr   param_sync_funcr   finalize_model_grads_func)rv   rS   r>   model_chunkr  s   `   `rV   get_module_config MegatronEngine.get_module_config  s   z!$++a.1!%!:!:dkk!nh//D4L4L&&. V. KO++"V+;#6#6+"VF4;;1$&,&9&9!&<#%%X\XcXc(dXc)D)DXc(d%t{{#q(,2,A,A!,DF)$$)@)@^cdghlhshsdt^u&^u{JJ^u&F" 4;;1$)/)?)?)B&+?( #W )e&s   G>G#G(c                     U R                    H  nUR                  5         M     U R                  c  U R                  5       U l        U R	                  5         g r   )r  trainr  r$  log_eval_resultsrv   model_modules     rV   r'  MegatronEngine.train  sG     KKL  ( %!%!7!7!9DrX   c                     U R                    H  nUR                  5         M     U R                  c  U R                  5       U l        g g r   )r  evalr  r$  r)  s     rV   r-  MegatronEngine.eval  s@     KKL ( %!%!7!7!9D &rX   c                    [        5       n/ n[        U5      S:  a  UR                  S:  aq  [        SUR                  5       HV  nUR	                  UR                  5        VVs0 s H&  u  pVXVXBR                  -  US-   UR                  -   _M(     snn5        MX     OU/n[        U R                  5      S:  ad  [        U5      S:  a:  [        [        U R                  5      5       Vs/ s H  n[        U5      PM     snnU$ S /[        U R                  5      -  nU$ [        U5      S:  a  [        U5      OS nU$ s  snnf s  snf )Nr   r   )	r   rc   r   r   r   r   r   r  iter)	rv   
batch_datarS   data_chunksr   r   vr  batch_data_iterators	            rV   get_batch_data_iterator&MegatronEngine.get_batch_data_iterator  sC   zz?Q%%)q$"8"89A&& )3(8(8(:(: %:%:!:a!etG\G\=\]](: :  *lt{{a z?Q& -2#dkk2B,CD,Cqk",CD   #"	 Vc$++..   #" 8;:7J${"3PT""! Es   #-D;(Ec           
         U R                  U5      n[        U R                  R                  UU R                  U R
                  U R                  U R                  [        5       S9u  p4    pVnUS:H  U R
                  l	        X4Xg4$ )z`
Training step for Megatron-LM

Args:
    batch_data (:obj:`dict`): The batch data to train on.
)forward_step_funcr   rU   rg   opt_param_schedulerr>   forward_backward_funcr   )
r5  r7   r  r3  r  rg   rZ   r  r   r  )rv   r1  r4  loss_reducedr  r  	grad_normnum_zeros_in_grads           rV   r7   MegatronEngine.train_step  s     #:::FLV"55BB-++nn $%%";"=M
IAq!8I '3a&7#9GGrX   c           
         [        5       nU R                  U5      n[        5       nU" U R                  R                  UU R
                  [        5       UR                  UR                  SS9nUR                  S:  a  [        R                  R                  5         U=R                  [        R                  " 5       UR                  -  [        5       -  -  sl        [        R                   " SS9(       as  0 nUS    Hf  nU Vs/ s H  oU   PM	     n	n[#        U	S   R$                  5      S:X  a  ['        U	5      [#        U	5      -  Xg'   MN  [        R(                  " U	5      Xg'   Mh     U$ 0 $ s  snf )ze
Evaluation step for Megatron-LM

Args:
    batch_data (:obj:`dict`): The batch data to evaluate on.
T)r8  r   rU   num_microbatchesr   r   forward_onlyr   )ignore_virtualr   )r   r5  r   r  r3  r  r   r   r   empty_unused_memory_levelr   rU  empty_cacher   r   r   is_pipeline_last_stagerc   r  rd  r  )
rv   r1  rS   r4  r:  
loss_dictsr;  r   r  losses_reduced_for_keys
             rV   	eval_stepMegatronEngine.eval_step#  sC    z":::F 9 ;*"55BB-++13!22

 ))Q.JJ""$##,,.1F1FFI]I__	
# %%T:L!!}:D)E*QC&*&)E-a06671<(+,B(CcJ`Fa(aL%(-5K(LL% %  	 *Fs   E(c                    [        5       nU R                  S   R                  (       Ga9  U R                  " S
0 UD6u  p4pVU =R                  S-  sl        [
        R                  " 5       UR                  -  [        5       -  nU=R                  U-  sl	        U =R                  [        X'5      -  sl
        UR                  b  U R                  R                  5       R                  5       nS n	UR                   (       a  [#        U R$                  5      n	['        UU R(                  U R                  R*                  S   S   U R                  UU R,                  UUU	U5
      U l        OU R.                  " S
0 UD6nUR                  b  U H  n
U R0                  R3                  U
[4        R6                  R9                  S/5      5      X:   -   U R0                  U
'   U R0                  R3                  U
S-   [4        R6                  R9                  S/5      5      [4        R6                  R9                  S/5      -   U R0                  U
S-   '   M     [4        R:                  " S[4        R6                  R=                  5       S9nU H'  n
[?        X:   R@                  5      S:X  d  M   XU
   -  nM)     S nSU;   a  US   nU RB                  RD                  b  U RB                  RE                  XS	9$ U$ )Nr   r   lrg        
_num_itersg      ?r  rt  )rn  rt  r   )#r   r  trainingr7   r  r   r   r   r   r   r  r5   r  rg   get_loss_scaleitemlog_params_normr:   rU   r8   r  param_groupsr
  rH  r	  r  r   rU  FloatTensorr   rV  rc   r  r  r5  )rv   r1  rS   	loss_dictr  r<  r=  r   
loss_scaleparams_normr   rn  rt  s                rV   forwardMegatronEngine.forwardK  s    z;;q>"""DHOODaV`DaAIYNNaN99;d>S>SSVjVllJ'':5'559VW[9hh5##/!^^::<AAC
"''"5djj"AK*6((NN//248NN++ %+' 44I##/$C1155c5::;Q;QSVRW;XY\e\jj --c2 EID]D]DaDal*EJJ,B,BC5,IE

..u5E6D--cL.@A	 % ||C

(A(A(CDC9>''(A-#&  y x(F""55A**==4=WWrX   c                 &   [        5       nUR                  b  U R                  S:X  a  g [        5       n[        5       nSU R                   S3nU R                   H  nUR                  S5      (       a  M  U R                  U   U R                  US-      -  nX4 SU S3-  n[        R                  " [        SUR                  5       5      5      nUR                  (       a
  X4 SU S3-  nU(       d  M  UR                  U S3UR                  5       U R                  5        UR                  (       d  M  UR                  U S	3X`R                  5        M     [        U5      S
-   n[        SU-  5        [        U5        [        SU-  5        0 U l        g )Nr   zvalidation loss at iteration z | rL  z value:    z PPL: z validationz validation pplr   -)r   r  r  r   r	  endswithmathexpminrO  rK   
add_scalarrc   r!   )rv   rS   writerstringr   r   ppllengths           rV   r(  MegatronEngine.log_eval_results  sa   z'4>>Q+>z')00@D,,C||L))--c2T5N5NsUaOa5bbEXeWC00F((3r5::<01C$$EuC00v!!SE"5uzz|T^^T(((%%_&=sNNS - Vqf%f%$&!rX   c                 :   U R                  5         [        5       nXl        [        R                  R                  5         [        U R                  U R                  U R                  U R                  U R                  S9  [        R                  R                  5         g )N)r  )r(  r   saver   r   barrierr)   r  r  rg   rZ   r  )rv   
output_dirrS   s      rV   r)   MegatronEngine.save_checkpoint  sm    z	!!#NNKKNNNN151Z1Z	
 	!!#rX   c                    [        5       nXl        SUl        SUl        [        R
                  R                  5         [        U R                  U R                  U R                  5      u  p4[        R
                  R                  5         X0l        X@l        UR                  (       a,  U R                  S:X  a  U R                  R                  5         g g g r  )r   r  r   r   r   r   rg  r(   r  rg   rZ   r  r  fp16reload_model_params)rv   	input_dirrS   r  r  s        rV   r(   MegatronEngine.load_checkpoint  s    z	&'#&'#!!#:I$++W[WeWegkgugu:v7	!!#"4X1991,NN..0 -9rX   )r  r	  r  r  r  r  rg   r
  rZ   r  r  )r   r   r   r   r   rz   r$  r'  r-  r5  r7   rH  rV  r(  r)   r(   r   r
  r  s   @rV   r  r    sG    (>4 :#2H0&P=~'4$1 1rX   r  c                     [        U 5      $ )z
Average losses across data parallel group.

Args:
    losses (List[Tensor]): List of losses to average across data parallel group.
)r9   )r  s    rV   %avg_losses_across_data_parallel_grouprp    s     5V<<rX   c                     S n[        XSS9$ )z
Recursively gather tensor in a nested list/tuple/dictionary of tensors from data parallel ranks.

Args:
    tensor (nested list/tuple/dictionary of `torch.Tensor`):
        The data to gather across data parallel ranks.

c                    U R                   S:X  a  U R                  5       S    n [        [        R                  R                  [        R                  " 5       S95       Vs/ s H  n[        R                  " U 5      PM     nn[        R                  R                  X [        R                  " 5       S9  [        R                  " USS9$ s  snf )Nr   r   r  )ndimr  r   r   r   get_world_sizer   get_data_parallel_group
empty_like
all_gatherr  )r   r  output_tensorss      rV   _gpu_gather_one;gather_across_data_parallel_groups.<locals>._gpu_gather_one  s    ;;!\\^D)F 5,,;;#B]B]B_;`a
a V$a 	 
 	$$^3C^C^C`$ayyQ//
s    CT)error_on_other_type)r   )r   ry  s     rV   "gather_across_data_parallel_groupsr|    s    0 _$OOrX   )TTTT)NN)orp   r\  r  abcr   	functoolsr   r   torch.nn.functionalnn
functionalrg  torch.nnr   r   r   rg   r	   rZ   r
   importsr   
operationsr   r   megatron.corer   r   megatron.core.distributedr   r  r   megatron.core.enumsr   )megatron.core.num_microbatches_calculatorr   megatron.core.optimizerr   megatron.core.parallel_stater   r   megatron.core.pipeline_parallelr   megatron.core.utilsr   "megatron.legacy.data.dataset_utilsr   megatron.legacy.modelr   r   $megatron.legacy.model.classificationr   megatron.trainingr   r   r    r!   megatron.training.argumentsr"   r#   r$   r%   r&   megatron.training.checkpointingr'   r(   r)   megatron.training.global_varsr*   megatron.training.gpt_buildersr+   megatron.training.initializer,   r-   r.   r/   r0   r1   %megatron.training.tokenizer.tokenizerr2   megatron.training.trainingr3   r4   r5   r6   r7   r8   megatron.training.utilsr9   r:   r;   rW   rj   rl   r   r   r   r_   r  r  r`   r   r.  r  r  r  r  Moduler  rp  r|  r   rX   rV   <module>r     sf     	      A A , , - 9 2M>-N>pI4R8C   lkB:  O  .b'8jL jLZ$>LD+!5 + b .!5  "K% K\M$ M`N# Nb	@/d_1UXX__ _1F	=PrX   