
    3jX                        S SK r S SKJr  S SKJr  S SKrS SKJr  S SKJ	r	  SSK
Jr  SSKJrJrJrJr  SSKJr  SS	KJrJrJr  SS
KJrJrJrJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%J&r&J'r'  SSK(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/J0r0J1r1  SSK2J3r3  SSK4J5r5J6r6  SSK7J8r8  SSK9J:r:J;r;J<r<J=r=J>r>J?r?  \1R                  " \A5      rB\/" SS9\	 " S S\85      5       5       rC\/" SS9\	 " S S\5      5       5       rD " S S\=5      rE " S S\;5      rF " S  S!\>5      rG " S" S#\:5      rH " S$ S%\:5      rI " S& S'\ 5      rJ " S( S)\ 5      rK " S* S+\R                  5      rM " S, S-\R                  5      rN\/ " S. S/\<5      5       rOS0\R                  S-  S1\R                  S2\RS-  S3\R                  4S4 jrS " S5 S6\O5      rT " S7 S8\O5      rU\/ " S9 S:\O5      5       rV\/ " S; S<\O5      5       rW " S= S>\O\5      rX\/ " S? S@\O5      5       rY\/ " SA SB\O5      5       rZ/ SCQr[g)D    N)Callable)Any)strict   )initialization)CacheDynamicCacheEncoderDecoderCacheStaticCache)PreTrainedConfig)GenerationConfigGenerationMixinGenerationMode)create_bidirectional_mask(create_bidirectional_sliding_window_maskcreate_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )Gemma2Config)Gemma2Attention	Gemma2MLPGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingeager_attention_forwardzgoogle/t5_gemma_module-7b)
checkpointc                   6    \ rS rSr% SrSr\\S'   \" 5       r	Sr
g)T5GemmaModuleConfigC   a  
query_pre_attn_scalar (`float`, *optional*, defaults to 256):
    scaling factor used on the attention scores
final_logit_softcapping (`float`, *optional*, defaults to 30.0):
    scaling factor when applying tanh softcapping on the logits.
attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
    scaling factor when applying tanh softcapping on the attention scores.

```python
>>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
>>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
>>> configuration = T5GemmaModuleConfig()
>>> # Initializing a model from the t5_gemma_module-7b style configuration
>>> model = T5GemmaModuleModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```F
is_decoder N)__name__
__module____qualname____firstlineno____doc__r2   bool__annotations__AttributeErroruse_bidirectional_attention__static_attributes__r3       e/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/t5gemma/modular_t5gemma.pyr0   r0   C   s    $ J"0"2r>   r0   c                      ^  \ rS rSr% SrSrS/r\\S.rSr	\\
\\4   -  S-  \S'   Sr\\
\\4   -  S-  \S'   S	r\\S
'   Sr\\-  \S'   Sr\\-  \S'   Sr\\-  \S'   S	r\\S'   Sr\\S'   U 4S jrSrU =r$ )T5GemmaConfig\   a  
encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
    Configuration for the encoder.
decoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
    Configuration for the decoder.

Example:

```python
>>> from transformers import T5GemmaConfig, T5GemmaModel
>>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
>>> model = T5GemmaModel(t5gemma_config)
```t5gemmapast_key_values)encoderdecoderNrE   rF   Tis_encoder_decoder        dropout_rateclassifier_dropout_rateattention_dropouttie_word_embeddingsi  
vocab_sizec                   > [        U R                  [        5      (       a  [        S0 U R                  D6U l        OU R                  c  [        5       U l        [        U R                  [        5      (       a  [        S0 U R                  D6U l        OU R                  c  [        5       U l        SU R                  l        U R                  U R                  l        U R                  U R                  l        SU R                  l        SU R                  l        U R                  U R                  l        U R                  U R                  l        U R                  R                  U R                  l
        UR                  SU R                  R                  5      U l        S H"  nX!;  d  M
  [        U R                  U5      X'   M$     [        TU ]<  " S0 UD6  g )NFTinitializer_range)bos_token_idpad_token_ideos_token_idr3   )
isinstancerE   dictr0   rF   r2   rI   rK   	use_cachehidden_sizecross_attention_hidden_sizepoprO   getattrsuper__post_init__)selfkwargsspecial_token_key	__class__s      r?   r[   T5GemmaConfig.__post_init__z   sK   dllD)).>>DL\\!.0DLdllD)).>>DL\\!.0DL"'$($5$5!)-)?)?&"&!%$($5$5!)-)?)?&37<<3K3K0!',?A_A_!`!Q .,3DLLBS,T) "R 	''r>   )rF   rE   rO   )r4   r5   r6   r7   r8   
model_typekeys_to_ignore_at_inferencer0   sub_configsrE   rT   r   r:   rF   rG   r9   rI   intfloatrJ   rK   rL   rM   r[   r=   __classcell__r_   s   @r?   rA   rA   \   s     J#4"51>QRK;?G 4S>1D8?;?G 4S>1D8?## #L#+#+.S5[.%(us{( $$J( (r>   rA   c                       \ rS rSrSrg)T5GemmaRMSNorm   r3   Nr4   r5   r6   r7   r=   r3   r>   r?   ri   ri          r>   ri   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
T5GemmaMLP   c                 n   > [         TU ]  U5        [        R                  " UR                  5      U l        g N)rZ   __init__nnDropoutrI   dropoutr\   configr_   s     r?   rr   T5GemmaMLP.__init__   s&     zz&"5"56r>   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      nU R	                  U5      nU$ rq   )act_fn	gate_projup_projru   	down_proj)r\   xhidden_statesr}   s       r?   forwardT5GemmaMLP.forward   sH    DNN1$56aH]3NN=1	r>   )ru   )r4   r5   r6   r7   rr   r   r=   rf   rg   s   @r?   rn   rn      s    7 r>   rn   c                       \ rS rSrSrg)T5GemmaRotaryEmbedding   r3   Nrk   r3   r>   r?   r   r      rl   r>   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )T5GemmaSelfAttention   rw   	layer_idxc                 F   > [         TU ]  X5        UR                  U l        g rq   )rZ   rr   r2   	is_causalr\   rw   r   r_   s      r?   rr   T5GemmaSelfAttention.__init__   s    +**r>   )r   )	r4   r5   r6   r7   r0   rd   rr   r=   rf   rg   s   @r?   r   r      s    +2 +s + +r>   r   c                   
  ^  \ rS rSrS\S\4U 4S jjr SS\R                  S\R                  S-  S\R                  S-  S	\	S-  S
\
\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )T5GemmaCrossAttention   rw   r   c                   > [         TU ]  X5        U ?U ?SU l        UR
                  c  [        S5      e[        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        g )NFzBCross-attention needs cross_attention_hidden_size to be specified.bias)rZ   rr   sliding_window
layer_typer   rW   
ValueErrorrs   Linearnum_key_value_headshead_dimattention_biask_projv_projr   s      r?   rr   T5GemmaCrossAttention.__init__   s    +O--5abbii..0J0JT]]0Zagavav
 ii..0J0JT]]0Zagavav
r>   Nr   attention_maskencoder_hidden_statesrD   r]   returnc                    Uc  [        S5      eUR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nUb1  UR                  R                  U R                  5      n	UR                  n
Ub  W	(       d  UR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nU R                  U5      R	                  U5      R                  SS5      nUb7  W
R                  XU R                  5      u  pSUR                  U R                  '   OFW
R                  U R                     R                  nU
R                  U R                     R                  n[         R"                  " U R$                  R&                  [(        5      nU" U UUUU4U R*                  (       a  U R,                  OSU R.                  S U R0                  S.UD6u  nnUR2                  " / UQSP76 R5                  5       nU R7                  U5      nUU4$ )Nz5Encoder hidden state is required for cross attention.   r&   TrH   )ru   scalingr   softcap)r   shaper   q_projview	transpose
is_updatedgetr   cross_attention_cacher   r   updatelayerskeysvaluesr   get_interfacerw   _attn_implementationr-   trainingrK   r   attn_logit_softcappingreshape
contiguouso_proj)r\   r   r   r   rD   r]   input_shapehidden_shapequery_statesr   curr_past_key_valuesencoder_input_shapeencoder_hidden_shape
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                     r?   r   T5GemmaCrossAttention.forward   s9    !(TUU#))#2.88b8$--8{{=166|DNNqRST&(3377GJ#2#H#H "*"7"="=cr"B#L%8#L"#Ldmm#L %:;@@AUV``abdefJ;;'<=BBCWXbbcdfghL*+?+F+Fzaeaoao+p(
=A**4>>:-44T^^DIIJ/66t~~FMML(?(M(MKK,,.E)
 %8%
 /3mmD**LL//%
 %
!\ "));;;;FFHkk+.L((r>   )r   r   r   rq   )r4   r5   r6   r7   r0   rd   rr   torchTensorr   r   r   tupler   r=   rf   rg   s   @r?   r   r      s    
2 
s 
* )-3)||3) t+3)  %||d2	3)
 3) -.3) 
u||U\\D0%2E2LL	M3) 3)r>   r   c                      ^  \ rS rSrSrS\4U 4S jjr   SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\	\R                  4   4
S jjrSrU =r$ )T5GemmaEncoderLayer   zEncoder sub-layer.r   c                 $  > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        g N)rw   r   eps)rZ   rr   rV   rw   r   layer_typesattention_typer   	self_attnri   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrn   mlppre_feedforward_layernormpost_feedforward_layernormrs   rt   rI   ru   r   s      r?   rr   T5GemmaEncoderLayer.__init__   s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56r>   Nr   position_embeddingsr   position_idsr   c           	      8   UnU R                  U5      nU R                  " SUUUUS S.UD6u  pU R                  U5      nX`R                  U5      -   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX`R                  U5      -   nU$ )N)r   r   r   r   rD   r3   )r   r   r   ru   r   r   r   )r\   r   r   r   r   r]   residual_s           r?   r   T5GemmaEncoderLayer.forward  s     !44]C>> 
' 3)% 
 
 55mD <<#>> 66}E/77F <<#>>r>   )r   rw   ru   rV   r   r   r   r   r   r   r   )NNN)r4   r5   r6   r7   r8   rd   rr   r   r   r   
LongTensorFloatTensorr   r=   rf   rg   s   @r?   r   r      s    7# 7. IM.204|| #5<<#=>E t+	
 &&- 
u  !	" r>   r   c                   H  ^  \ rS rSrSrS\4U 4S jjr       SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  4S jjrSrU =r$ )T5GemmaDecoderLayeri.  z2Decoder sub-layer: an extra cross-attention layer.r   c                   > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        [+        XS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g r   )rZ   rr   rV   rw   r   r   r   r   r   ri   r   r   r   rn   r   r   r   rs   rt   rI   ru   r   
cross_attnpre_cross_attn_layernormpost_cross_attn_layernormr   s      r?   rr   T5GemmaDecoderLayer.__init__1  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56/vS(6v7I7IvObOb(c%)78J8JPVPcPc)d&r>   Nr   r   r   r   rD   rU   r   encoder_attention_maskr   c	           
         Un
U R                  U5      nU R                  " SUUUUUb  UR                  OS US.U	D6u  pU R                  U5      nXR	                  U5      -   nUn
U R                  U5      nU R                  " SUUUUUS.U	D6u  pU R                  U5      nXR	                  U5      -   nUn
U R                  U5      nU R                  U5      nU R                  U5      nXR	                  U5      -   nU$ )N)r   r   r   r   rD   rU   )r   r   r   rD   rU   r3   )r   r   self_attention_cacher   ru   r   r   r   r   r   r   )r\   r   r   r   r   rD   rU   r   r   r]   r   r   s               r?   r   T5GemmaDecoderLayer.forwardH  s-    !44]C>> 
' 3)%DSD_O@@ei
 
 55mD <<#>> 55mD?? 
'"71+
 
 66}E <<#>> 66}E/77F <<#>>r>   )r   rw   r   ru   rV   r   r   r   r   r   r   r   r   r   )NNNNFNN)r4   r5   r6   r7   r8   rd   rr   r   r   r   r   r
   r9   r   r   r=   rf   rg   s   @r?   r   r   .  s    <e# e4 IM.2046:!&596:,||, #5<<#=>E, t+	,
 &&-, -t3, $;,  %||d2, !&t 3, 
		, ,r>   r   c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaClassificationHeadiw  z-Head for sentence-level classification tasks.rV   
num_labelsrJ   c                    > [         TU ]  5         [        R                  " US9U l        [        R
                  " X5      U l        g )N)p)rZ   rr   rs   rt   ru   r   out_proj)r\   rV   r   rJ   r_   s       r?   rr   "T5GemmaClassificationHead.__init__z  s/    zz$;<		+:r>   r   r   c                 J    U R                  U5      nU R                  U5      nU$ rq   ru   r   )r\   r   s     r?   r   !T5GemmaClassificationHead.forward  s$    ]3m4r>   r   )rH   )r4   r5   r6   r7   r8   rd   re   rr   r   r   r   r=   rf   rg   s   @r?   r   r   w  sF    7;C ;S ;SX ; ;
U\\ ell  r>   r   c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaLMHeadi  z.Head for language modeling (generation) tasks.rV   rM   r   c                 V   > [         TU ]  5         [        R                  " XUS9U l        g )Nr   )rZ   rr   rs   r   r   )r\   rV   rM   r   r_   s       r?   rr   T5GemmaLMHead.__init__  s     		+Er>   r   r   c                 (    U R                  U5      nU$ rq   r   )r\   r   logitss      r?   r   T5GemmaLMHead.forward  s    }-r>   r   )F)r4   r5   r6   r7   r8   rd   r9   rr   r   r   r   r=   rf   rg   s   @r?   r   r     sJ    8FC FS F F FU\\ ell  r>   r   c            	           \ rS rSr% \\S'   SrSrSS/r\	\
" \SSS	9\
" \SS
S	9\
" \SS
S	9/S.r\R                  " 5       S 5       rS rSrg)T5GemmaPreTrainedModeli  rw   modelTr   r   r   r   )index
layer_namer   )r   
attentionsc                 f   [         R                  " X5        U R                  R                  n[	        U[
        5      (       a  UR                  R                  R                  S   S-  n[        R                  " UR                  R                  SX#-  S9  [        UR                  S5      (       aC  UR                  R                  b+  [        R                  " UR                  R                  5        g g g [	        U[        5      (       ao  U R                  R                  (       dS  UR                  R                  R                  S   S-  n[        R                  " UR                  R                  SX#-  S9  g g SUR                   R"                  ;   a!  [        R                  " UR                  5        g g )Nr   g      rH   )meanstdr   RMSNorm)r   _init_weightsrw   rO   rS   r   r   weightr   initnormal_hasattrr   zeros_r   rL   r_   r4   )r\   moduler  scales       r?   r  $T5GemmaPreTrainedModel._init_weights  s.    	%%d3kk++f788OO**003t;ELL//cs{Kv//FOO4H4H4TFOO001 5U/..;;22..44Q74?V__33#3;O 3 &**333KK& 4r>   c                 b   U R                   R                  R                  nU R                   R                  R                  nUc  [	        S5      eUR                  UR                  5      nUSSS24   R                  5       USSS24'   X$S'   Uc  [	        S5      eUR                  US:H  U5        U$ )	z
Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
pad_token_id replacement for labels that were -100.
This is a common preparation step for decoder inputs in sequence-to-sequence models.
Nz:self.model.config.decoder.bos_token_id has to be defined. .r   r   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rw   rF   rP   rQ   r   	new_zerosr   clonemasked_fill_)r\   	input_idsdecoder_start_token_idrQ   shifted_input_idss        r?   _shift_right#T5GemmaPreTrainedModel._shift_right  s     "&!4!4!A!A{{**77!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r>   r3   N)r4   r5   r6   r7   rA   r:   base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r$   r   r   _can_record_outputsr   no_gradr  r  r=   r3   r>   r?   r   r     sv    &*#.0EF,/q[Q/q\R0lS
 ]]_' '"!r>   r   	token_idsr   rQ   r   c                    U b<  Uc  [        S5      eX:g  R                  UR                  [        R                  5      nU$ [        R
                  " UR                  S   UR                  S   4UR                  [        R                  S9nU$ )z%Construct the default attention mask.z3`pad_token_id` is required for padding information.r   r   devicedtype)r   tor   r   longonesr   )r  r   rQ   r   s       r?   make_default_2d_attention_maskr%    s     RSS#3778L8LejjY
    #]%8%8%;<]EYEYafakak
 r>   c                      ^  \ rS rSr\\S.rU 4S jr\\	    SS\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S	\\   S
\\-  4S jj5       5       rSrU =r$ )T5GemmaEncoderi  )r  r   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [
        R$                  " UR&                  5      U l        [+        US9U l        U R/                  5         g s  snf Nr   Frw   )rZ   rr   rQ   padding_idxrM   rs   	EmbeddingrV   embed_tokensri   r   normgradient_checkpointing
ModuleListrangenum_hidden_layersr   r   rt   rI   ru   r   
rotary_emb	post_initr   s      r?   rr   T5GemmaEncoder.__init__       !.. ++LL):):F<N<NPTP`P`a"6#5#56;N;NO	&+#mmEJ6KcKcEdeEd	 3Ede
 zz&"5"560? 	 f   ?D$Nr  r   r   inputs_embedsr]   r   c                    US L US L-  (       a  [        S5      eUR                  SS 5        Uc  U R                  U5      nUc=  [        R                  " UR
                  S   UR                  S9nUR                  S5      nUc   [        XU R                  R                  5      n[        U=n[        5      (       d'  U R                  UUS.n[        S0 UD6[        S0 UD6S.nUn[        R                  " U R                  R                   S-  UR"                  S	9n	X-  nU R%                  U5      nU R'                  X5      n
[)        U R*                  S U R                  R,                   5       H*  u  pU" UU
X`R                  R.                  U      U40 UD6nM,     U R1                  U5      nU R%                  U5      n[3        US
9$ )N:You must specify exactly one of input_ids or inputs_embedsrD   r   r   r   )rw   r8  r   full_attentionsliding_attention      ?r!  )last_hidden_stater3   )r   rX   r-  r   aranger   r   	unsqueezer%  rw   rQ   rS   rT   r   r   tensorrV   r!  ru   r3  	enumerater   r2  r   r.  r   )r\   r  r   r   r8  r]   self_attn_mask_mappingmask_kwargsr   
normalizerr   ilayer_modules                r?   r   T5GemmaEncoder.forward  s    -t";<YZZ 	

$d+  --i8M <<(;(;A(>}G[G[\L'11!4L!;IVZVaVaVnVnoNNB0DII++!."0K #<"Jk"J%M%\P[%\&"
 &\\$++"9"93">mFYFYZ
%2]3"oomJ(5Tt{{7T7T)UVOA(#&{{'>'>q'AB	
 M  W 		-0]3+
 	
r>   ru   r-  r/  r   r.  r+  r3  rM   NNNN)r4   r5   r6   r7   r   r   r  rr   r#   r%   r   r   r   r   r   r   r   r   r   r=   rf   rg   s   @r?   r'  r'    s    *,
$   .2.204266
##d*6
 t+6
 &&-	6

 ((4/6
 +,6
 
	 6
   6
r>   r'  c                   V  ^  \ rS rSr\" \SS9\" \SS9\S.rU 4S jr	\
\        SS\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                   S-  S\S-  S\R                  S-  S\R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )T5GemmaDecoderi1  r   )r   )r  cross_attentionsr   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [
        R$                  " UR&                  5      U l        [+        US9U l        U R/                  5         g s  snf r)  )rZ   rr   rQ   r+  rM   rs   r,  rV   r-  ri   r   r.  r/  r0  r1  r2  r   r   rt   rI   ru   r   r3  r4  r   s      r?   rr   T5GemmaDecoder.__init__8  r6  r7  Nr  r   r   rD   r8  rU   r   r   r]   r   c	                    US L US L-  (       a  [        S5      eUc  [        S5      eUc  U R                  U5      nU R                  (       d/  U(       a(  Uc%  [        [	        U R
                  S9[	        5       5      nUcU  Ub  UR                  5       OSn
[        R                  " UR                  S   UR                  S9U
-   nUR                  S5      nUc#  Uc   [        XU R
                  R                  5      n[        U=n[        5      (       d8  U R
                  UUUb  UR                   OS US.n[#        S0 UD6[%        S0 UD6S.n[        U=n[        5      (       d  S	['        U R
                  UUUS
90nUn[        R(                  " U R
                  R*                  S-  UR,                  S9nX-  nU R/                  U5      nU R1                  X5      n[3        U R4                  S U R
                  R6                   5       H2  u  nnU" UUXR
                  R8                  U      UUUUUS	   40 U	D6nM4     U R;                  U5      nU R/                  U5      n[=        UUS9$ )Nr:  z0`encoder_hidden_states` must be given in decoderr*  r   r   r;  )rw   r8  r   rD   r   r<  r=  )rw   r8  r   r   r?  r@  )rA  rD   r3   )r   r-  r   r
   r	   rw   get_seq_lengthr   rB  r   r   rC  r%  rQ   rS   rT   r   r   r   r   rD  rV   r!  ru   r3  rE  r   r2  r   r.  r   )r\   r  r   r   rD   r8  rU   r   r   r]   past_seen_tokensrF  rG  cross_attn_mask_mappingr   rH  r   rI  rJ  s                      r?   r   T5GemmaDecoder.forwardJ  so    -t";<YZZ (OPP  --i8M}}/F 2,dkk2RT`TbcOCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L!o&=;IVZVaVaVnVnoNNB0DII++!."0KZKf?#G#Glp ,K #5"C{"C%F%U%U&"
 5KK1TRR ";;;"/#9*?	#'# &\\$++"9"93">mFYFYZ
%2]3"oomJ(5Tt{{7T7T)UVOA|(#&{{'>'>q'AB%'(89
 
M  W 		-0]38++
 	
r>   rL  )NNNNNNNN)r4   r5   r6   r7   r$   r   r   r   r  rr   r#   r%   r   r   r   r
   r   r9   r   r   r   r   r   r=   rf   rg   s   @r?   rO  rO  1  s   $%9C*+@J,$   .2.2046:26!%596:P
##d*P
 t+P
 &&-	P

 -t3P
 ((4/P
 $;P
  %||d2P
 !&t 3P
 +,P
 
:	:P
   P
r>   rO  c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	           SS\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\S-  S\
R                   S-  S\
R                   S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaModeli  rw   c                    > [         TU ]  U5        UR                  (       d  [        S5      e[	        UR
                  5      U l        [        UR                  5      U l        U R                  5         g )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	rZ   rr   rG   r   r'  rE   rO  rF   r4  rv   s     r?   rr   T5GemmaModel.__init__  sO     ((uvv%fnn5%fnn5r>   c                 6    U R                   R                  5       $ rq   rE   get_input_embeddingsr\   s    r?   r^  !T5GemmaModel.get_input_embeddings      ||0022r>   c                 8    U R                   R                  U5      $ rq   rE   set_input_embeddingsr\   new_embeddingss     r?   rd  !T5GemmaModel.set_input_embeddings      ||00@@r>   Nr  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsrD   r8  decoder_inputs_embedsrU   r]   r   c                    Uc  U R                   " SUUUU	S.UD6nUR                  nU R                  " SUUUU
UUUUS.UD6n[        UR                  UR                  UR                  SS5      (       a  UR                  OUR                  4UR                  UR                  UR                  UR                  UR                  S9$ )a8  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
r  r   r   r8  )r  r   r   r8  rD   r   r   rU   output_hidden_statesF)rA  rD   decoder_hidden_statesdecoder_attentionsrP  encoder_last_hidden_stater   encoder_attentionsr3   )	rE   rA  rF   r   rD   r   r   r  rP  )r\   r  r   r   ri  rj  rk  rl  rD   r8  rm  rU   r]   r   decoder_outputss                  r?   r   T5GemmaModel.forward  s    , ""ll #-)+	
 O !0 A A,, 

'1-/+"7#1

 

 "-??+;;zz0%88 #2"?"?!335.99,==&5&G&G"1"?"?.99
 	
r>   )rF   rE   )NNNNNNNNNNN)r4   r5   r6   r7   rA   rr   r^  rd  r!   r    r   r   r   
BoolTensorr   r
   r   r9   r   r   r   r   r=   rf   rg   s   @r?   rY  rY    sA   	} 	3A  .2370459:>8<266:-159!%6
##d*6
 ))D06
 &&-	6

 !++d26
 !& 0 04 76
 $..56
 )4/6
 -t36
 ||d*6
  %||d26
 $;6
 +,6
 
6
  6
r>   rY  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	    SS\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaEncoderModeli  rw   c                    > [         TU ]  U5        UR                  (       a  [        S5      e[	        UR
                  5      U l        U R                  5         g )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)rZ   rr   rG   r   r'  rE   r4  rv   s     r?   rr   T5GemmaEncoderModel.__init__  s?     $$pqq%fnn5r>   c                 6    U R                   R                  5       $ rq   r]  r_  s    r?   r^  (T5GemmaEncoderModel.get_input_embeddings  ra  r>   c                 8    U R                   R                  U5      $ rq   rc  re  s     r?   rd  (T5GemmaEncoderModel.set_input_embeddings  rh  r>   Nr  r   r   r8  r]   r   c                 4    U R                   " SUUUUS.UD6nU$ )Nro  r3   rE   )r\   r  r   r   r8  r]   rl  s          r?   r   T5GemmaEncoderModel.forward  s5     ,, 
)%'	

 
 r>   r  rM  )r4   r5   r6   r7   rA   rr   r^  rd  r!   r    r   r   r   r   r   r   r   r   r=   rf   rg   s   @r?   ry  ry    s    } 3A  .23704-1##d* ))D0 &&-	
 ||d* +, 
  r>   ry  c            "       Z  ^  \ rS rSrSS0rSS0rSS/S/40rS\4U 4S	 jjrS
 r	S r
\\             S%S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                   S-  S\R                  S-  S\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\R*                  -  S\\   S\\R                     \-  4S jj5       5       rS\R*                  4S jrS\S\S \S!\S"\S\4U 4S# jjrS$r U =r!$ )&T5GemmaForConditionalGenerationi  zlm_head.out_proj.weightz!model.decoder.embed_tokens.weightzlm_head.out_projcolwise_gather_outputr   r   rw   c                   > SUl         [        TU ]	  U5        [        U5      U l        UR
                  R                  U l        [        UR
                  R                  U R                  5      U l	        SU l
        U R                  5         g )NTForMaskedLM)rG   rZ   rr   rY  r   rF   rM   r   rV   lm_head	loss_typer4  rv   s     r?   rr   (T5GemmaForConditionalGeneration.__init__  sb    $(! !&)
 ..33$V^^%?%?Q&r>   c                 2   XR                   l        U R                  R                  (       al  UR                  U R
                  R                  R                  l        UR                  R                  S   U R
                  R                  R                  l	        g g )Nr   )
r  r   rw   rL   r  r   rF   r-  r   num_embeddingsre  s     r?   set_output_embeddings5T5GemmaForConditionalGeneration.set_output_embeddings"  sh     . ;;**5C5J5JDJJ++2=K=R=R=X=XYZ=[DJJ++: +r>   c                 .    U R                   R                  $ rq   )r  r   r_  s    r?   get_output_embeddings5T5GemmaForConditionalGeneration.get_output_embeddings+  s    ||$$$r>   Nr  r   r   ri  rj  rk  rl  rD   r8  rm  labelsrU   logits_to_keepr]   r   c                    Ub  Uc  U
c  U R                  U5      nU R                  " SUUUUUUUUU	U
US.UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  5       R                  nUR                  b4  UUR                  -  n[        R                  " U5      nUUR                  -  nSnUb  U R                  " UXR                  40 UD6n[        UUUR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9	$ )a  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)r  r   r   ri  rj  rk  rl  rD   r8  rm  rU   )	lossr   rD   rq  rr  rP  rs  r   rt  r3   )r  r   rA  rS   rd   slicer  get_decoderrw   final_logit_softcappingr   tanhloss_functionrM   r   rD   rq  rr  rP  rs  r   rt  )r\   r  r   r   ri  rj  rk  rl  rD   r8  rm  r  rU   r  r]   ru  r   slice_indicesr   decoder_configr  s                        r?   r   'T5GemmaForConditionalGeneration.forward.  sp   : "3";@U@] $ 1 1& 9.2jj /
)%/#9!5++'"7/
 /
 (998B>SV8W8W~ot4]kmA}a,?@A))+2211=nDDDFZZ'FnDDDF%%ffooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r>   c                 $    U R                  U5      $ rq   )r  )r\   r  s     r?   %prepare_decoder_input_ids_from_labelsET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labelsy  s      ((r>   generation_configmodel_kwargsgeneration_mode
batch_sizemax_cache_lengthc           
        > [         TU ]  UUUUU5        UR                  SL a  gUR                  nUc  SnOSUR                  ;   n[        R
                  " U R                  R                  SS95      nSUl        S/UR                  -  Ul
        UUS.n	UR                  S5      n
U
b  [        U
[        5      (       d  [        S	5      e[        U
R                   5      S
:  a!  U
R                   R                  S
5      (       a  g[#        U
R$                  5      nU[&        :X  a  US   S
   R(                  S   U	S'   U" S0 U	D6U
l        O:[        [+        S0 U R                  R                  SS9US.D6[+        5       5      US'   [-        U S5      (       aC  U R.                  b5  [        U R.                  [        5      (       d  [        S5      eUS   U l        ggg)a  Override cache preparation to force full attention on the cross-attention cache.

The decoder config may declare sliding-window layers, but cross-attention must always use full attention.
The default `_prepare_cache_for_generation` would otherwise build a sliding cross-attention cache.
FN	offloadedT)rF   r=  )rw   
offloadingrD   z`The `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma model.r   rl  r   max_cache_len_cachezKThe internal cache must be of type `EncoderDecoderCache` for T5Gemma model.r3   )rZ   _prepare_cache_for_generationrU   cache_implementationcopydeepcopyrw   get_text_configr   r2  r   r   rS   r
   r   lenr   typer   r   r   r	   r
  r  )r\   r  r  r  r  r  r  offload_cachecross_attn_configcross_attn_cache_kwargsrD   cross_attn_clsr_   s               r?   r  =T5GemmaForConditionalGeneration._prepare_cache_for_generation|  s    	-	
 &&%/0EE'!M'+<+Q+QQM !MM$++*E*Ed*E*ST+/()9(:=N=`=`(`% ('#

 '**+<=&o/BCC v 
 ?--.27Q7Q7U7UVW7X7X!/"G"GHN,;GHY;Z[\;];c;cde;f'84B4]E\4]O1 /B "&++"="=d"="K&3 /L*+ 4""t{{'>dkk+>?? !noo&'89DK	 (?"r>   )r  r  r  r   rM   )NNNNNNNNNNNNr   )"r4   r5   r6   r7   _tied_weights_keys_tp_plan_pp_planrA   rr   r  r  r!   r    r   r   r   rw  r   r
   r9   rd   r   r   r   r   r   r   r  r   rT   r   r  r=   rf   rg   s   @r?   r  r    s   35XY"$;<H"o%6
$CDH	} 	\%  .2370459:>8<266:26:>*.!%-.G
##d*G
 ))D0G
 &&-	G

 !++d2G
 !& 0 04 7G
 $..5G
 )4/G
 -t3G
 ((4/G
  %0047G
   4'G
 $;G
 ell*G
 +,G
  
u  	!O	3!G
  G
R)ELL )I:+I: I: (	I:
 I: I: 
I: I:r>   r  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjrS rS r\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ ) T5GemmaForSequenceClassificationi  Nrw   rG   c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
NrJ   皙?rG   rZ   rr   r   rY  r   ry  rE   rV   rF   rY   r   scorer4  r\   rw   rG   rV   classifier_dropoutr_   s        r?   rr   )T5GemmaForSequenceClassification.__init__  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r>   c                 6    U R                   R                  5       $ rq   r   r^  r_  s    r?   r^  5T5GemmaForSequenceClassification.get_input_embeddings      zz..00r>   c                 :    U R                   R                  U5        g rq   r   rd  r\   values     r?   rd  5T5GemmaForSequenceClassification.set_input_embeddings      

''.r>   r  r   r   ri  rj  rk  rl  r8  rm  r  r]   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R                  c  US	:w  a  [        S
5      eU R                   R                  c  SnGOUb  XR                   R                  :g  R!                  UR"                  [$        R&                  5      n[$        R(                  " UR                  S   UR"                  [$        R&                  S9nUU-  R+                  S5      nU R                   R                  (       a*  US	-  n[$        R,                  " UUR                  S   S	-
  S9nO.Sn[.        R1                  U R                  R                   S35        U[$        R(                  " UUR"                  S9U4   nSnU
b  U R3                  UU
UU R                   S9n[5        UUUUS9$ )  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   ri  rj  rk  rl  r8  rm  rU   r   r   r8  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r;  )r   r  pooled_logitsrw   r  r   r   r  )rw   rG   NotImplementedErrorr_   r4   r   r  r   rA  rq  rr  r   r  r  r   rQ   r"  r   r   int32rB  argmaxclamploggerwarning_oncer  r   )r\   r  r   r   ri  rj  rk  rl  r8  rm  r  r]   outputsrA  r   r  r   r  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                          r?   r   (T5GemmaForSequenceClassification.forward  s   2 ;;))y/@]E^%J4>>KbKbJcc|} 
 ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-. "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J{{--"a'"%*[[1CIZI`I`acIdghIh%i"!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r>   r   r   r  rq   
NNNNNNNNNN)r4   r5   r6   r7   rA   r9   rr   r^  rd  r!   r    r   r   r   r   r   r   r   r   r   r=   rf   rg   s   @r?   r  r    sS   } $+  .1/  .2.204596:8<2626:>*.i
##d*i
 t+i
 &&-	i

 !++d2i
 !&t 3i
 $..5i
 )4/i
 ((4/i
  %0047i
   4'i
 +,i
 
"i
  i
r>   r  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjrS rS r\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaForTokenClassificationiU  Nrw   rG   c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for token classification. When set to False, only encoder is used.
NrJ   r  r  r  s        r?   rr   &T5GemmaForTokenClassification.__init__W  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r>   c                 6    U R                   R                  5       $ rq   r  r_  s    r?   r^  2T5GemmaForTokenClassification.get_input_embeddingso  r  r>   c                 :    U R                   R                  U5        g rq   r  r  s     r?   rd  2T5GemmaForTokenClassification.set_input_embeddingsr  r  r>   r  r   r   ri  rj  rk  rl  r8  rm  r  r]   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nSnU
b  U R                  UXR                   5      n[        UUUUS9$ )	r  Nr  r  r  Fr  r  r  )rw   rG   r  r_   r4   r   r  r   rA  rq  rr  r   r  r  r  r   )r\   r  r   r   ri  rj  rk  rl  r8  rm  r  r]   r  rA  r   r  r   r  s                     r?   r   %T5GemmaForTokenClassification.forwardu  s   4 ;;))y/@]E^%J4>>KbKbJcc|}  ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-.%%ffkkBD$'!	
 	
r>   r  rq   r  )r4   r5   r6   r7   rA   r9   rr   r^  rd  r!   r    r   r   r   r   r   r   r   r   r   r=   rf   rg   s   @r?   r  r  U  sS   } $+  01/  .2.204596:8<2626:>*.N
##d*N
 t+N
 &&-	N

 !++d2N
 !&t 3N
 $..5N
 )4/N
 ((4/N
  %0047N
   4'N
 +,N
 
N
  N
r>   r  )rA   r0   r  rY  ry  r   r  r  )\r  collections.abcr   typingr   r   torch.nnrs   huggingface_hub.dataclassesr    r   r  cache_utilsr   r	   r
   r   configuration_utilsr   
generationr   r   r   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r    r!   r"   utils.genericr#   utils.output_capturingr$   r%   gemma2.configuration_gemma2r'   gemma2.modeling_gemma2r(   r)   r*   r+   r,   r-   
get_loggerr4   r  r0   rA   ri   rn   r   r   r   r   r   Moduler   r   r   r   r   rd   r%  r'  rO  rY  ry  r  r  r  __all__r3   r>   r?   <module>r     sg    $    . & P P 3 K K  C 9  G &  8 E 6  
		H	% 673, 3  83. 677($ 7(  87(t	] 		 		2 	+? +D)O D)N14 1hF4 FR		 	BII 	 8!2 8! 8!v$&<< * \\	"P
+ P
fk
+ k
\ J
) J
 J
Z !0 ! !Hs:&<o s:l I
'= I
 I
X o
$: o
 o
d	r>   