
    3j^                        S SK r S SKJr  S SKJr  S SKrS SKJr  SSKJ	r
  SSKJr  SSKJrJrJrJr  SSKJrJrJr  SS	KJrJr  SS
KJrJrJrJr  SSKJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*J+r+  SSK,J-r-J.r.  SSK/J0r0  SSK1J2r2J3r3J4r4J5r5  SSK6J7r7J8r8  SSK9J:r:J;r;  SSK<J=r=J>r>  \5R~                  " \@5      rA " S S\R                  5      rC " S S\R                  5      rD " S S\R                  5      rES rF\" S5      SLS j5       rGS\R                  S \IS!\R                  4S" jrJ   SMS#\R                  S$\R                  S%\R                  S&\R                  S'\R                  S-  S(\K\I-  S)\KS-  S*\KS-  S!\L\R                  \R                  4   4S+ jjrM\" \G5       " S, S-\R                  5      5       rN\" \G5       " S. S/\R                  5      5       rO " S0 S1\!5      rP " S2 S3\!5      rQ " S4 S5\R                  5      rR " S6 S7\R                  5      rS\3 " S8 S9\.5      5       rTS:\R                  S-  S\R                  S;\IS-  S!\R                  4S< jrV " S= S>\T5      rW " S? S@\T5      rX\3 " SA SB\T5      5       rY\3 " SC SD\T5      5       rZ " SE SF\T\5      r[\3 " SG SH\T5      5       r\\3 " SI SJ\T5      5       r]/ SKQr^g)N    N)Callable)Optional   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCacheStaticCache)GenerationConfigGenerationMixinGenerationMode)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_mask(create_bidirectional_sliding_window_maskcreate_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )T5GemmaConfigT5GemmaModuleConfigc                   J   ^  \ rS rSrS	S\S\4U 4S jjjrS rS rS r	Sr
U =r$ )
T5GemmaRMSNorm=   dimepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g N)super__init__r1   nn	Parametertorchzerosweight)selfr0   r1   	__class__s      f/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/t5gemma/modeling_t5gemma.pyr5   T5GemmaRMSNorm.__init__>   s,    ll5;;s#34    c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )N   T)keepdim)r8   rsqrtpowmeanr1   )r;   xs     r=   _normT5GemmaRMSNorm._normC   s4    5;;quuQx}}R}>IJJJr?   c                     U R                  UR                  5       5      nUSU R                  R                  5       -   -  nUR                  U5      $ )N      ?)rH   floatr:   type_as)r;   rG   outputs      r=   forwardT5GemmaRMSNorm.forwardF   sC    AGGI& 3!2!2!445~~a  r?   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler:   shaper1   r;   s    r=   
extra_reprT5GemmaRMSNorm.extra_reprM   s'    ))*+6$((<<r?   )r1   r:   )gư>)__name__
__module____qualname____firstlineno__intrL   r5   rH   rO   rU   __static_attributes____classcell__r<   s   @r=   r.   r.   =   s0    5C 5e 5 5
K!= =r?   r.   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
T5GemmaMLPQ   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        [
        R                  " UR                  5      U l        g )NFbias)r4   r5   confighidden_sizeintermediate_sizer6   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr;   re   r<   s     r=   r5   T5GemmaMLP.__init__R   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556zz&"5"56r?   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      nU R	                  U5      nU$ r3   )rm   ri   rj   rp   rk   )r;   rG   hidden_statesrk   s       r=   rO   T5GemmaMLP.forward]   sH    DNN1$56aH]3NN=1	r?   )rm   re   rk   rp   ri   rf   rg   rj   )rW   rX   rY   rZ   r5   rO   r\   r]   r^   s   @r=   r`   r`   Q   s    	7 r?   r`   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\
S   S\S-  S	\S
\4   4S jj5       r\R                  " 5       \S 5       5       rSrU =r$ )T5GemmaRotaryEmbeddingd   inv_freqNre   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultry   F)
persistentoriginal_inv_freq)r4   r5   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenre   rope_parametersr{   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r;   re   devicerope_init_fnry   r<   s        r=   r5   T5GemmaRotaryEmbedding.__init__g   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUr?   r   ztorch.deviceseq_lenreturnztorch.Tensorc           	         U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXe4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetahead_dimNrK   r   rA   dtyper   r   )	r   getattrrf   num_attention_headsr8   arangeint64torL   )re   r   r   baser0   attention_factorry   s          r=   r   6T5GemmaRotaryEmbedding.compute_default_rope_parametersw   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r?   c                 L   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rB   r*   mpscpuF)device_typeenabledrA   r0   r   )ry   rL   expandrS   r   r   
isinstancetypestrr&   	transposer8   catcosr   sinr   )
r;   rG   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r=   rO   T5GemmaRotaryEmbedding.forward   sN    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   BF
F#)r   re   r   r   r{   r3   NNN)rW   rX   rY   rZ   r8   Tensor__annotations__r+   r5   staticmethodr   r[   rR   rL   r   no_gradr   rO   r\   r]   r^   s   @r=   rw   rw   d   s    llV} V V  '++/"*$*(* t* 
~u$	%	* *: ]]_<  <r?   rw   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..NrB   rA   r   )rS   r8   r   )rG   x1x2s      r=   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r?   rotary_pos_embc                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXV4$ )aI  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r=   apply_rotary_pos_embr      sS    & --
&C
--
&Cw;q>C/0Gw;q>C/0Gr?   rt   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r*   N)rS   r   reshape)rt   r   batchnum_key_value_headsslenr   s         r=   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr?   modulequerykeyvalueattention_maskrp   scalingsoftcapc                 j   Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb  X-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R                  5       nX4$ )N      rA   r   rB   )r0   r   )ptrainingr*   )r   r   num_key_value_groupsr8   matmulr   tanhr6   
functionalsoftmaxfloat32r   r   rp   r   
contiguous)r   r   r   r   r   rp   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs                r=   eager_attention_forwardr      s    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!#4 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r?   c                   0  ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
\R                  \R                  4   S-  S	\R                  S-  S
\S-  S\\   S\
\R                  \R                  S-  \
\R                     S-  4   4S jjrSrU =r$ )T5GemmaSelfAttention   =Multi-headed attention from 'Attention Is All You Need' paperre   	layer_idxc                 N  > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        [        USUR                  UR                  -  5      U l
        UR                  UR                  -  U l        UR                  S-  U l        U R
                  R                  U l        UR                   U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  U R                  -  UR                  UR(                  S9U l        U R
                  R2                  U l        U R                  S:X  a  UR4                  U l        g S U l        g )Nlayer_typesr   r   rc   sliding_attention)r4   r5   hasattrr   
layer_typere   r   r   rf   r   r   r   r   query_pre_attn_scalarr   attention_dropout
is_decoder	is_causalr6   rh   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowr;   re   r   r<   s      r=   r5   T5GemmaSelfAttention.__init__   s   ;B6=;Y;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>**ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;J]7]f33cgr?   Nrt   position_embeddingsr   past_key_valuesr   r   c                 4   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
Uu  p[        XX5      u  pUb  UR                  XU R                  5      u  p[        R                  " U R                  R                  [        5      nU" U UU	U
U4U R                  (       a  U R                   OSU R"                  U R$                  U R&                  S.UD6u  pUR(                  " / UQSP76 R+                  5       nU R-                  U5      nX4$ )NrB   r*   rA           rp   r   r   r   )rS   r   r   viewr   r   r   r   updater   r   get_interfacere   _attn_implementationr   r   r   r   r   r   r   r   r   )r;   rt   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   attention_interfacer   r   s                   r=   rO   T5GemmaSelfAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &'6'='=jX\XfXf'g$J(?(M(MKK,,.E)
 %8%
 /3mmD**LL..//%
 %
! "));;;;FFHkk+.((r?   )r   r   re   r   r   r   r   r   r   r   r   r   r   r   r   )rW   rX   rY   rZ   __doc__r,   r[   r5   r8   r   rR   r   r!   r   rO   r\   r]   r^   s   @r=   r   r      s    Gh2 hs h< IM.2(,()||() #5<<#=>E() t+	()
 () -.() 
u||U\\D0%2E2LL	M() ()r?   r   c                     ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\R                  S-  S	\R                  S-  S
\
S-  S\\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )T5GemmaCrossAttentioni>  r   re   r   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  S-  U l        U R                  R                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR$                  UR                  U R                  -  UR                   S9U l        [        R                  " UR$                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR
                  UR                   S9U l        U R                  R,                  U l        UR$                  c  [/        S5      eg )Nr   r   Frc   zBCross-attention needs cross_attention_hidden_size to be specified.)r4   r5   re   r   r   rf   r   r   r   r   r   r   r   r   r6   rh   r   r   cross_attention_hidden_sizer   r   r   r   
ValueErrorr   s      r=   r5   T5GemmaCrossAttention.__init__B  s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii..0J0JT]]0Zagavav
 ii..0J0JT]]0Zagavav
 ii&&68J8JQWQfQf
 '+kk&H&H#--5abb 6r?   Nrt   r   encoder_hidden_statesr   r   r   c                    Uc  [        S5      eUR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nUb1  UR                  R                  U R                  5      n	UR                  n
Ub  W	(       d  UR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nU R                  U5      R	                  U5      R                  SS5      nUb7  W
R                  XU R                  5      u  pSUR                  U R                  '   OFW
R                  U R                     R                  nU
R                  U R                     R                  n[         R"                  " U R$                  R&                  [(        5      nU" U UUUU4U R*                  (       a  U R,                  OSU R.                  S U R0                  S.UD6u  nnUR2                  " / UQSP76 R5                  5       nU R7                  U5      nUU4$ )Nz5Encoder hidden state is required for cross attention.rB   r*   rA   Tr   r   )r	  rS   r   r   r   r   
is_updatedgetr   cross_attention_cacher   r   r   layerskeysvaluesr   r   re   r   r   r   r   r   r   r   r   r   )r;   rt   r   r  r   r   r   r   r  r  curr_past_key_valuesencoder_input_shapeencoder_hidden_shaper   r   r  r   r   s                     r=   rO   T5GemmaCrossAttention.forward^  s9    !(TUU#))#2.88b8$--8{{=166|DNNqRST&(3377GJ#2#H#H "*"7"="=cr"B#L%8#L"#Ldmm#L %:;@@AUV``abdefJ;;'<=BBCWXbbcdfghL*+?+F+Fzaeaoao+p(
=A**4>>:-44T^^DIIJ/66t~~FMML(?(M(MKK,,.E)
 %8%
 /3mmD**LL//%
 %
!\ "));;;;FFHkk+.L((r?   )r   r   re   r   r   r   r   r   r   r   r   r   r3   )rW   rX   rY   rZ   r  r,   r[   r5   r8   r   r   r!   r   rR   rO   r\   r]   r^   s   @r=   r  r  >  s    Gc2 cs cB )-3)||3) t+3)  %||d2	3)
 3) -.3) 
u||U\\D0%2E2LL	M3) 3)r?   r  c                      ^  \ rS rSrSrS\4U 4S jjr   SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\	\R                  4   4
S jjrSrU =r$ )T5GemmaEncoderLayeri  zEncoder sub-layer.r   c                 $  > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        g N)re   r   r1   )r4   r5   rf   re   r   r   attention_typer   	self_attnr.   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormr`   mlppre_feedforward_layernormpost_feedforward_layernormr6   rn   ro   rp   r   s      r=   r5   T5GemmaEncoderLayer.__init__  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56r?   Nrt   r   r   r   r   c           	      8   UnU R                  U5      nU R                  " SUUUUS S.UD6u  pU R                  U5      nX`R                  U5      -   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX`R                  U5      -   nU$ )N)rt   r   r   r   r    )r  r  r   rp   r"  r!  r#  )r;   rt   r   r   r   r   residual_s           r=   rO   T5GemmaEncoderLayer.forward  s     !44]C>> 
' 3)% 
 
 55mD <<#>> 66}E/77F <<#>>r?   )r  re   rp   rf   r   r!  r#  r   r"  r  r  r   )rW   rX   rY   rZ   r  r[   r5   r8   r   rR   
LongTensorFloatTensorrO   r\   r]   r^   s   @r=   r  r    s    7# 7. IM.204|| #5<<#=>E t+	
 &&- 
u  !	" r?   r  c                   H  ^  \ rS rSrSrS\4U 4S jjr       SS\R                  S\	\R                  \R                  4   S-  S\R                  S-  S	\R                  S-  S
\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  4S jjrSrU =r$ )T5GemmaDecoderLayeri  z2Decoder sub-layer: an extra cross-attention layer.r   c                   > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        [+        XS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g r  )r4   r5   rf   re   r   r   r  r   r  r.   r  r  r   r`   r!  r"  r#  r6   rn   ro   rp   r  
cross_attnpre_cross_attn_layernormpost_cross_attn_layernormr   s      r=   r5   T5GemmaDecoderLayer.__init__  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56/vS(6v7I7IvObOb(c%)78J8JPVPcPc)d&r?   Nrt   r   r   r   r   	use_cacher  encoder_attention_maskr   c	           
         Un
U R                  U5      nU R                  " SUUUUUb  UR                  OS US.U	D6u  pU R                  U5      nXR	                  U5      -   nUn
U R                  U5      nU R                  " SUUUUUS.U	D6u  pU R                  U5      nXR	                  U5      -   nUn
U R                  U5      nU R                  U5      nU R                  U5      nXR	                  U5      -   nU$ )N)rt   r   r   r   r   r3  )rt   r  r   r   r3  r&  )r  r  self_attention_cacher   rp   r0  r/  r1  r"  r!  r#  )r;   rt   r   r   r   r   r3  r  r4  r   r'  r(  s               r=   rO   T5GemmaDecoderLayer.forward  s-    !44]C>> 
' 3)%DSD_O@@ei
 
 55mD <<#>> 55mD?? 
'"71+
 
 66}E <<#>> 66}E/77F <<#>>r?   )r  re   r/  rp   rf   r   r!  r1  r#  r   r0  r"  r  r  )NNNNFNN)rW   rX   rY   rZ   r  r[   r5   r8   r   rR   r*  r
   boolr+  rO   r\   r]   r^   s   @r=   r-  r-    s    <e# e4 IM.2046:!&596:,||, #5<<#=>E, t+	,
 &&-, -t3, $;,  %||d2, !&t 3, 
		, ,r?   r-  c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaClassificationHeadi  z-Head for sentence-level classification tasks.rf   
num_labelsclassifier_dropout_ratec                    > [         TU ]  5         [        R                  " US9U l        [        R
                  " X5      U l        g )N)r   )r4   r5   r6   rn   rp   rh   out_proj)r;   rf   r;  r<  r<   s       r=   r5   "T5GemmaClassificationHead.__init__  s/    zz$;<		+:r?   rt   r   c                 J    U R                  U5      nU R                  U5      nU$ r3   rp   r>  )r;   rt   s     r=   rO   !T5GemmaClassificationHead.forward  s$    ]3m4r?   rA  )r   )rW   rX   rY   rZ   r  r[   rL   r5   r8   r   rO   r\   r]   r^   s   @r=   r:  r:    sF    7;C ;S ;SX ; ;
U\\ ell  r?   r:  c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaLMHeadi  z.Head for language modeling (generation) tasks.rf   
vocab_sizerd   c                 V   > [         TU ]  5         [        R                  " XUS9U l        g )Nrc   )r4   r5   r6   rh   r>  )r;   rf   rE  rd   r<   s       r=   r5   T5GemmaLMHead.__init__"  s     		+Er?   rt   r   c                 (    U R                  U5      nU$ r3   r>  )r;   rt   logitss      r=   rO   T5GemmaLMHead.forward&  s    }-r?   rI  )F)rW   rX   rY   rZ   r  r[   r8  r5   r8   r   rO   r\   r]   r^   s   @r=   rD  rD    sJ    8FC FS F F FU\\ ell  r?   rD  c            	          ^  \ rS rSr% \\S'   SrSrSS/rS/r	Sr
SrSrSrSr\\" \SS	S
9\" \SSS
9\" \SSS
9/S.r\R*                  " 5       U 4S j5       rS rSrU =r$ )T5GemmaPreTrainedModeli+  re   modelTr  r-  r   r*   r  )index
layer_namer/  )rt   
attentionsc                 Z  > [         TU ]  U5        U R                  R                  n[	        U[
        5      (       a  UR                  R                  R                  S   S-  n[        R                  " UR                  R                  SX#-  S9  [        UR                  S5      (       aC  UR                  R                  b+  [        R                  " UR                  R                  5        g g g [	        U[        5      (       ao  U R                  R                  (       dS  UR                  R                  R                  S   S-  n[        R                  " UR                  R                  SX#-  S9  g g SUR                   R"                  ;   a!  [        R                  " UR                  5        g g )Nr   r   r   )rF   stdrd   RMSNorm)r4   _init_weightsre   initializer_ranger   r:  r>  r:   rS   initnormal_r   rd   zeros_rD  tie_word_embeddingsr<   rW   )r;   r   rS  scaler<   s       r=   rU  $T5GemmaPreTrainedModel._init_weightsA  s.    	f%kk++f788OO**003t;ELL//cs{Kv//FOO4H4H4TFOO001 5U/..;;22..44Q74?V__33#3;O 3 &**333KK& 4r?   c                 b   U R                   R                  R                  nU R                   R                  R                  nUc  [	        S5      eUR                  UR                  5      nUSSS24   R                  5       USSS24'   X$S'   Uc  [	        S5      eUR                  US:H  U5        U$ )	z
Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
pad_token_id replacement for labels that were -100.
This is a common preparation step for decoder inputs in sequence-to-sequence models.
Nz:self.model.config.decoder.bos_token_id has to be defined. .rB   r*   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	re   decoderbos_token_idpad_token_idr	  	new_zerosrS   r   masked_fill_)r;   	input_idsdecoder_start_token_idr`  shifted_input_idss        r=   _shift_right#T5GemmaPreTrainedModel._shift_rightS  s     "&!4!4!A!A{{**77!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r?   r&  )rW   rX   rY   rZ   r+   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr-  r(   r   r  _can_record_outputsr8   r   rU  rf  r\   r]   r^   s   @r=   rM  rM  +  s    &*#.0EF#4"5N!"&,/q[Q/q\R0lS
 ]]_' '"! !r?   rM  	token_idsr`  c                    U b<  Uc  [        S5      eX:g  R                  UR                  [        R                  5      nU$ [        R
                  " UR                  S   UR                  S   4UR                  [        R                  S9nU$ )z%Construct the default attention mask.z3`pad_token_id` is required for padding information.r   r*   r   )r	  r   r   r8   longonesrS   )rr  rt   r`  r   s       r=   make_default_2d_attention_maskrv  n  s     RSS#3778L8LejjY
    #]%8%8%;<]EYEYafakak
 r?   c                      ^  \ rS rSr\\S.rU 4S jr\\	    SS\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S	\\   S
\\-  4S jj5       5       rSrU =r$ )T5GemmaEncoderi  )rQ  rt   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [
        R$                  " UR&                  5      U l        [+        US9U l        U R/                  5         g s  snf Nr  Fre   )r4   r5   r`  padding_idxrE  r6   	Embeddingrf   embed_tokensr.   r  normgradient_checkpointing
ModuleListrangenum_hidden_layersr  r  rn   ro   rp   rw   
rotary_emb	post_initr   s      r=   r5   T5GemmaEncoder.__init__       !.. ++LL):):F<N<NPTP`P`a"6#5#56;N;NO	&+#mmEJ6KcKcEdeEd	 3Ede
 zz&"5"560? 	 f   ?D$Nrc  r   r   inputs_embedsr   r   c                    US L US L-  (       a  [        S5      eUR                  SS 5        Uc  U R                  U5      nUc=  [        R                  " UR
                  S   UR                  S9nUR                  S5      nUc   [        XU R                  R                  5      n[        U=n[        5      (       d'  U R                  UUS.n[        S0 UD6[        S0 UD6S.nUn[        R                  " U R                  R                   S-  UR"                  S	9n	X-  nU R%                  U5      nU R'                  X5      n
[)        U R*                  S U R                  R,                   5       H*  u  pU" UU
X`R                  R.                  U      U40 UD6nM,     U R1                  U5      nU R%                  U5      n[3        US
9$ )N:You must specify exactly one of input_ids or inputs_embedsr   r*   r   r   )re   r  r   full_attentionr         ?r   )last_hidden_stater&  )r	  popr~  r8   r   rS   r   r   rv  re   r`  r   dictr   r   tensorrf   r   rp   r  	enumerater  r  r   r  r   )r;   rc  r   r   r  r   self_attn_mask_mappingmask_kwargsrt   
normalizerr   ilayer_modules                r=   rO   T5GemmaEncoder.forward  s    -t";<YZZ 	

$d+  --i8M <<(;(;A(>}G[G[\L'11!4L!;IVZVaVaVnVnoNNB0DII++!."0K #<"Jk"J%M%\P[%\&"
 &\\$++"9"93">mFYFYZ
%2]3"oomJ(5Tt{{7T7T)UVOA(#&{{'>'>q'AB	
 M  W 		-0]3+
 	
r?   rp   r~  r  r  r  r|  r  rE  NNNN)rW   rX   rY   rZ   r   r  rq  r5   r'   r)   r8   r*  r   r+  r!   r"   rR   r   rO   r\   r]   r^   s   @r=   rx  rx    s    *,
$   .2.204266
##d*6
 t+6
 &&-	6

 ((4/6
 +,6
 
	 6
   6
r?   rx  c                   V  ^  \ rS rSr\" \SS9\" \SS9\S.rU 4S jr	\
\        SS\R                  S-  S\R                  S-  S	\R                  S-  S
\S-  S\R                   S-  S\S-  S\R                  S-  S\R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )T5GemmaDecoderi  r*   )rO  )rQ  cross_attentionsrt   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[!        X5      PM     sn5      U l        [
        R$                  " UR&                  5      U l        [+        US9U l        U R/                  5         g s  snf rz  )r4   r5   r`  r|  rE  r6   r}  rf   r~  r.   r  r  r  r  r  r  r-  r  rn   ro   rp   rw   r  r  r   s      r=   r5   T5GemmaDecoder.__init__  r  r  Nrc  r   r   r   r  r3  r  r4  r   r   c	                    US L US L-  (       a  [        S5      eUc  [        S5      eUc  U R                  U5      nU R                  (       d/  U(       a(  Uc%  [        [	        U R
                  S9[	        5       5      nUcU  Ub  UR                  5       OSn
[        R                  " UR                  S   UR                  S9U
-   nUR                  S5      nUc#  Uc   [        XU R
                  R                  5      n[        U=n[        5      (       d8  U R
                  UUUb  UR                   OS US.n[#        S0 UD6[%        S0 UD6S.n[        U=n[        5      (       d  S	['        U R
                  UUUS
90nUn[        R(                  " U R
                  R*                  S-  UR,                  S9nX-  nU R/                  U5      nU R1                  X5      n[3        U R4                  S U R
                  R6                   5       H2  u  nnU" UUXR
                  R8                  U      UUUUUS	   40 U	D6nM4     U R;                  U5      nU R/                  U5      n[=        UUS9$ )Nr  z0`encoder_hidden_states` must be given in decoderr{  r   r*   r  )re   r  r   r   r   r  r  )re   r  r   r  r  r   )r  r   r&  )r	  r~  r   r
   r	   re   get_seq_lengthr8   r   rS   r   r   rv  r`  r   r  r6  r   r   r   r  rf   r   rp   r  r  r  r  r   r  r   )r;   rc  r   r   r   r  r3  r  r4  r   past_seen_tokensr  r  cross_attn_mask_mappingrt   r  r   r  r  s                      r=   rO   T5GemmaDecoder.forward  so    -t";<YZZ (OPP  --i8M}}/F 2,dkk2RT`TbcOCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L!o&=;IVZVaVaVnVnoNNB0DII++!."0KZKf?#G#Glp ,K #5"C{"C%F%U%U&"
 5KK1TRR ";;;"/#9*?	#'# &\\$++"9"93">mFYFYZ
%2]3"oomJ(5Tt{{7T7T)UVOA|(#&{{'>'>q'AB%'(89
 
M  W 		-0]38++
 	
r?   r  )NNNNNNNN)rW   rX   rY   rZ   r(   r   r  r-  rq  r5   r'   r)   r8   r*  r   r
   r+  r8  r!   r"   rR   r   rO   r\   r]   r^   s   @r=   r  r    s   $%9C*+@J,$   .2.2046:26!%596:P
##d*P
 t+P
 &&-	P

 -t3P
 ((4/P
 $;P
  %||d2P
 !&t 3P
 +,P
 
:	:P
   P
r?   r  c                     ^  \ rS rSrS\4U 4S jjrS rS r\\	           SS\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\S-  S\
R                   S-  S\
R                   S-  S\S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaModeli@  re   c                    > [         TU ]  U5        UR                  (       d  [        S5      e[	        UR
                  5      U l        [        UR                  5      U l        U R                  5         g )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	r4   r5   is_encoder_decoderr	  rx  encoderr  r^  r  rq   s     r=   r5   T5GemmaModel.__init__B  sO     ((uvv%fnn5%fnn5r?   c                 6    U R                   R                  5       $ r3   r  get_input_embeddingsrT   s    r=   r  !T5GemmaModel.get_input_embeddingsM      ||0022r?   c                 8    U R                   R                  U5      $ r3   r  set_input_embeddingsr;   new_embeddingss     r=   r  !T5GemmaModel.set_input_embeddingsP      ||00@@r?   Nrc  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   r  decoder_inputs_embedsr3  r   r   c                    Uc  U R                   " SUUUU	S.UD6nUR                  nU R                  " SUUUU
UUUUS.UD6n[        UR                  UR                  UR                  SS5      (       a  UR                  OUR                  4UR                  UR                  UR                  UR                  UR                  S9$ )a8  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
rc  r   r   r  )rc  r   r   r  r   r  r4  r3  output_hidden_statesF)r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentionsr&  )	r  r  r^  r   r   r  rt   rQ  r  )r;   rc  r   r   r  r  r  r  r   r  r  r3  r   r  decoder_outputss                  r=   rO   T5GemmaModel.forwardS  s    , ""ll #-)+	
 O !0 A A,, 

'1-/+"7#1

 

 "-??+;;zz0%88 #2"?"?!335.99,==&5&G&G"1"?"?.99
 	
r?   )r^  r  )NNNNNNNNNNN)rW   rX   rY   rZ   r+   r5   r  r  r$   r#   r8   r*  r+  
BoolTensorr   r
   r   r8  r!   r"   r   rO   r\   r]   r^   s   @r=   r  r  @  sA   	} 	3A  .2370459:>8<266:-159!%6
##d*6
 ))D06
 &&-	6

 !++d26
 !& 0 04 76
 $..56
 )4/6
 -t36
 ||d*6
  %||d26
 $;6
 +,6
 
6
  6
r?   r  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	    SS\
R                  S-  S\
R                  S-  S	\
R                  S-  S
\
R                  S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaEncoderModeli  re   c                    > [         TU ]  U5        UR                  (       a  [        S5      e[	        UR
                  5      U l        U R                  5         g )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)r4   r5   r  r	  rx  r  r  rq   s     r=   r5   T5GemmaEncoderModel.__init__  s?     $$pqq%fnn5r?   c                 6    U R                   R                  5       $ r3   r  rT   s    r=   r  (T5GemmaEncoderModel.get_input_embeddings  r  r?   c                 8    U R                   R                  U5      $ r3   r  r  s     r=   r  (T5GemmaEncoderModel.set_input_embeddings  r  r?   Nrc  r   r   r  r   r   c                 4    U R                   " SUUUUS.UD6nU$ )Nr  r&  r  )r;   rc  r   r   r  r   r  s          r=   rO   T5GemmaEncoderModel.forward  s5     ,, 
)%'	

 
 r?   r  r  )rW   rX   rY   rZ   r+   r5   r  r  r$   r#   r8   r*  r+  r   r!   r"   r   rO   r\   r]   r^   s   @r=   r  r    s    } 3A  .23704-1##d* ))D0 &&-	
 ||d* +, 
  r?   r  c            "       Z  ^  \ rS rSrSS0rSS0rSS/S/40rS\4U 4S	 jjrS
 r	S r
\\             S%S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                   S-  S\R                  S-  S\S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\R*                  -  S\\   S\\R                     \-  4S jj5       5       rS\R*                  4S jrS\S\S \S!\S"\S\4U 4S# jjrS$r U =r!$ )&T5GemmaForConditionalGenerationi  zlm_head.out_proj.weightz!model.decoder.embed_tokens.weightzlm_head.out_projcolwise_gather_outputrt   rJ  re   c                   > SUl         [        TU ]	  U5        [        U5      U l        UR
                  R                  U l        [        UR
                  R                  U R                  5      U l	        SU l
        U R                  5         g )NTForMaskedLM)r  r4   r5   r  rN  r^  rE  rD  rf   lm_head	loss_typer  rq   s     r=   r5   (T5GemmaForConditionalGeneration.__init__  sb    $(! !&)
 ..33$V^^%?%?Q&r?   c                 2   XR                   l        U R                  R                  (       al  UR                  U R
                  R                  R                  l        UR                  R                  S   U R
                  R                  R                  l	        g g )Nr   )
r  r>  re   rZ  r:   rN  r^  r~  rS   num_embeddingsr  s     r=   set_output_embeddings5T5GemmaForConditionalGeneration.set_output_embeddings  sh     . ;;**5C5J5JDJJ++2=K=R=R=X=XYZ=[DJJ++: +r?   c                 .    U R                   R                  $ r3   )r  r>  rT   s    r=   get_output_embeddings5T5GemmaForConditionalGeneration.get_output_embeddings  s    ||$$$r?   Nrc  r   r   r  r  r  r  r   r  r  labelsr3  logits_to_keepr   r   c                    Ub  Uc  U
c  U R                  U5      nU R                  " SUUUUUUUUU	U
US.UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  5       R                  nUR                  b4  UUR                  -  n[        R                  " U5      nUUR                  -  nSnUb  U R                  " UXR                  40 UD6n[        UUUR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9	$ )a  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)rc  r   r   r  r  r  r  r   r  r  r3  )	lossrJ  r   r  r  r  r  r  r  r&  )rf  rN  r  r   r[   slicer  get_decoderre   final_logit_softcappingr8   r   loss_functionrE  r   r   r  r  r  r  r  r  )r;   rc  r   r   r  r  r  r  r   r  r  r  r3  r  r   r  rt   slice_indicesrJ  decoder_configr  s                        r=   rO   'T5GemmaForConditionalGeneration.forward  sp   : "3";@U@] $ 1 1& 9.2jj /
)%/#9!5++'"7/
 /
 (998B>SV8W8W~ot4]kmA}a,?@A))+2211=nDDDFZZ'FnDDDF%%ffooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r?   c                 $    U R                  U5      $ r3   )rf  )r;   r  s     r=   %prepare_decoder_input_ids_from_labelsET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r?   generation_configmodel_kwargsgeneration_mode
batch_sizemax_cache_lengthc           
        > [         TU ]  UUUUU5        UR                  SL a  gUR                  nUc  SnOSUR                  ;   n[        R
                  " U R                  R                  SS95      nSUl        S/UR                  -  Ul
        UUS.n	UR                  S5      n
U
b  [        U
[        5      (       d  [        S	5      e[        U
R                   5      S
:  a!  U
R                   R                  S
5      (       a  g[#        U
R$                  5      nU[&        :X  a  US   S
   R(                  S   U	S'   U" S0 U	D6U
l        O:[        [+        S0 U R                  R                  SS9US.D6[+        5       5      US'   [-        U S5      (       aC  U R.                  b5  [        U R.                  [        5      (       d  [        S5      eUS   U l        ggg)a  Override cache preparation to force full attention on the cross-attention cache.

The decoder config may declare sliding-window layers, but cross-attention must always use full attention.
The default `_prepare_cache_for_generation` would otherwise build a sliding cross-attention cache.
FN	offloadedT)r^  r  )re   
offloadingr   z`The `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma model.r   r  r*   max_cache_len_cachezKThe internal cache must be of type `EncoderDecoderCache` for T5Gemma model.r&  )r4   _prepare_cache_for_generationr3  cache_implementationcopydeepcopyre   get_text_configr   r  r   r  r   r
   r	  lenr  r   r  r   rS   r	   r   r  )r;   r  r  r  r  r  r  offload_cachecross_attn_configcross_attn_cache_kwargsr   cross_attn_clsr<   s               r=   r  =T5GemmaForConditionalGeneration._prepare_cache_for_generation  s    	-	
 &&%/0EE'!M'+<+Q+QQM !MM$++*E*Ed*E*ST+/()9(:=N=`=`(`% ('#

 '**+<=&o/BCC v 
 ?--.27Q7Q7U7UVW7X7X!/"G"GHN,;GHY;Z[\;];c;cde;f'84B4]E\4]O1 /B "&++"="=d"="K&3 /L*+ 4""t{{'>dkk+>?? !noo&'89DK	 (?"r?   )r  r  r  rN  rE  )NNNNNNNNNNNNr   )"rW   rX   rY   rZ   _tied_weights_keys_tp_plan_pp_planr+   r5   r  r  r$   r#   r8   r*  r+  r  r   r
   r8  r[   r   r!   r"   rR   r   rO   r  r   r  r   r  r\   r]   r^   s   @r=   r  r    s   35XY"$;<H"o%6
$CDH	} 	\%  .2370459:>8<266:26:>*.!%-.G
##d*G
 ))D0G
 &&-	G

 !++d2G
 !& 0 04 7G
 $..5G
 )4/G
 -t3G
 ((4/G
  %0047G
   4'G
 $;G
 ell*G
 +,G
  
u  	!O	3!G
  G
R)ELL )I:+I: I: (	I:
 I: I: 
I: I:r?   r  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjrS rS r\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ ) T5GemmaForSequenceClassificationii  Nre   r  c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
Nr<  皙?r  r4   r5   r;  r  rN  r  r  rf   r^  r   r:  scorer  r;   re   r  rf   classifier_dropoutr<   s        r=   r5   )T5GemmaForSequenceClassification.__init__k  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r?   c                 6    U R                   R                  5       $ r3   rN  r  rT   s    r=   r  5T5GemmaForSequenceClassification.get_input_embeddings      zz..00r?   c                 :    U R                   R                  U5        g r3   rN  r  r;   r   s     r=   r  5T5GemmaForSequenceClassification.set_input_embeddings      

''.r?   rc  r   r   r  r  r  r  r  r  r  r   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R                  c  US	:w  a  [        S
5      eU R                   R                  c  SnGOUb  XR                   R                  :g  R!                  UR"                  [$        R&                  5      n[$        R(                  " UR                  S   UR"                  [$        R&                  S9nUU-  R+                  S5      nU R                   R                  (       a*  US	-  n[$        R,                  " UUR                  S   S	-
  S9nO.Sn[.        R1                  U R                  R                   S35        U[$        R(                  " UUR"                  S9U4   nSnU
b  U R3                  UU
UU R                   S9n[5        UUUUS9$ )  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   r  r  r  r  r  r  r3  r   r   r  r   r*   z=Cannot handle batch sizes > 1 if no padding token is defined.rB   r   )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )rJ  r  pooled_logitsre   r  rJ  rt   rQ  )re   r  NotImplementedErrorr<   rW   r	  rf  rN  r  r  r  rt   rQ  r  rS   r`  r   r   r8   int32r   argmaxclamploggerwarning_oncer  r   )r;   rc  r   r   r  r  r  r  r  r  r  r   outputsr  rt   rQ  rJ  r  last_non_pad_tokennon_pad_masktoken_indicesr"  r  s                          r=   rO   (T5GemmaForSequenceClassification.forward  s   2 ;;))y/@]E^%J4>>KbKbJcc|} 
 ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-. "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J{{--"a'"%*[[1CIZI`I`acIdghIh%i"!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r?   rN  r;  r  r3   
NNNNNNNNNN)rW   rX   rY   rZ   r+   r8  r5   r  r  r$   r#   r8   r*  r   r   r+  r!   r"   r   rO   r\   r]   r^   s   @r=   r	  r	  i  sS   } $+  .1/  .2.204596:8<2626:>*.i
##d*i
 t+i
 &&-	i

 !++d2i
 !&t 3i
 $..5i
 )4/i
 ((4/i
  %0047i
   4'i
 +,i
 
"i
  i
r?   r	  c                     ^  \ rS rSrSS\S\S-  4U 4S jjjrS rS r\	\
          SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ )T5GemmaForTokenClassificationi  Nre   r  c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for token classification. When set to False, only encoder is used.
Nr<  r  r  r  s        r=   r5   &T5GemmaForTokenClassification.__init__  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r?   c                 6    U R                   R                  5       $ r3   r  rT   s    r=   r  2T5GemmaForTokenClassification.get_input_embeddings  r  r?   c                 :    U R                   R                  U5        g r3   r  r  s     r=   r  2T5GemmaForTokenClassification.set_input_embeddings  r  r?   rc  r   r   r  r  r  r  r  r  r  r   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nSnU
b  U R                  UXR                   5      n[        UUUUS9$ )	r  Nr  r  r  Fr  r   r#  )re   r  r$  r<   rW   r	  rf  rN  r  r  r  rt   rQ  r  r  r   )r;   rc  r   r   r  r  r  r  r  r  r  r   r*  r  rt   rQ  rJ  r  s                     r=   rO   %T5GemmaForTokenClassification.forward  s   4 ;;))y/@]E^%J4>>KbKbJcc|}  ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-.%%ffkkBD$'!	
 	
r?   r/  r3   r0  )rW   rX   rY   rZ   r+   r8  r5   r  r  r$   r#   r8   r*  r   r   r+  r!   r"   r   rO   r\   r]   r^   s   @r=   r2  r2    sS   } $+  01/  .2.204596:8<2626:>*.N
##d*N
 t+N
 &&-	N

 !++d2N
 !&t 3N
 $..5N
 )4/N
 ((4/N
  %0047N
   4'N
 +,N
 
N
  N
r?   r2  )r  r  r  rM  r	  r2  )r*   )r   NN)_r  collections.abcr   typingr   r8   torch.nnr6    r   rW  activationsr   cache_utilsr   r	   r
   r   
generationr   r   r   integrationsr   r   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r    processing_utilsr!   utilsr"   r#   r$   r%   utils.genericr&   r'   utils.output_capturingr(   r)   configuration_t5gemmar+   r,   
get_loggerrW   r(  Moduler.   r`   rw   r   r   r   r[   r   rL   rR   r   r   r  r  r-  r:  rD  rM  r*  rv  rx  r  r  r  r  r	  r2  __all__r&  r?   r=   <module>rQ     s+  *  $    & ! P P K K I  C 9  L F & R R G E E 
		H	%=RYY =( &><RYY ><B( *+ ,2	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%D )*F)299 F) +F)R )*R)BII R) +R)j14 1hF4 FR		 	BII 	 ?!_ ?! ?!D$&<< * \\	"P
+ P
fk
+ k
\ J
) J
 J
Z !0 ! !Hs:&<o s:l I
'= I
 I
X o
$: o
 o
dr?   