
    3j                    	   S SK r S SKJr  S SKJr  S SKJr  S SKJr  S SK	J
r
  S SKrS SKJr  S SKJr  S	S
KJr  S	SKJr  S	SKJrJr  S	SKJr  S	SKJr  S	SKJr  S	SKJrJ r J!r!J"r"  S	SK#J$r$  S	SK%J&r&  S	SK'J(r(J)r)  S	SK*J+r+J,r,  S	SK-J.r.J/r/  S	SK0J1r1  S	SK2J3r3J4r4J5r5J6r6J7r7J8r8  S	SK9J:r:J;r;  S	SK<J=r=J>r>  SSK?J@r@  SSKAJBrBJCrCJDrDJErE  \7" 5       (       a  S SKFJGrG  \5" SS 9\ " S! S"\(5      5       5       rH\5" S#S 9\ " S$ S%\35      5       5       rI\ " S& S'\(5      5       rJ\5\ " S( S)\)5      5       5       rK " S* S+\R                  5      rM " S, S-\R                  5      rN " S. S/\R                  5      rO " S0 S1\R                  5      rP " S2 S3\R                  5      rQ " S4 S5\R                  5      rR " S6 S7\R                  5      rS " S8 S9\R                  5      rU " S: S;\R                  5      rV " S< S=\R                  5      rW " S> S?\R                  5      rX " S@ SA\R                  5      rY " SB SC\R                  5      rZ " SD SE\R                  5      r[SF r\SSG\R                  SH\R                  SI\R                  SJ\^4SK jjr_SL\R                  SM\^SN\R                  4SO jr`   SSP\R                  SQ\R                  SR\R                  SS\R                  ST\R                  S-  SU\a\^-  SV\aS-  SW\aS-  SN\b\R                  \R                  4   4SX jjrc SSG\R                  SH\R                  SI\R                  SY\R                  SJ\^SN\R                  4SZ jjrd " S[ S\\R                  5      re " S] S^\&5      rf " S_ S`\R                  5      rg " Sa Sb\R                  5      rh " Sc Sd\R                  5      ri " Se Sf\R                  5      rj\ " Sg Sh\R                  5      5       rk " Si Sj\R                  5      rl " Sk Sl\&5      rm " Sm Sn\R                  5      ro\5 " So Sp\/5      5       rp\5" SqS 9 " Sr Ss\p5      5       rq\5" StS 9 " Su Sv\p\5      5       rrSw\b\^\^4   SN\4Sx jrs " Sy Sz\p5      rt " S{ S|\p5      ru " S} S~\R                  5      rvS\R                  S\R                  SN\R                  4S jrx\5" SS 9 " S S\p5      5       ry\5" SS 9 " S S\p\5      5       rz/ SQr{g)    N)UserDict)Callable)	dataclass)cached_property)Optional)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)use_experts_implementation)create_bidirectional_maskcreate_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleis_accelerate_availabletorch_compilable_check)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel   )Gemma4AudioConfigGemma4ConfigGemma4TextConfigGemma4VisionConfig)add_hook_to_modulezK
    Base class for Gemma4 outputs, with hidden states and attentions.
    custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\\\R                  \R                  4   4   S-  \S'   Srg)Gemma4ModelOutputWithPastD   aw  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
shared_kv_states (`dict`, *optional*):
    Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
    Used to pass shared KV states between layers during KV sharing.
Nimage_hidden_statesaudio_hidden_statesshared_kv_states )__name__
__module____qualname____firstlineno____doc__r6   torchFloatTensor__annotations__r7   r8   dictstrtupleTensor__static_attributes__r9       d/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/gemma4/modeling_gemma4.pyr4   r4   D   sa    " 59**T1848**T18LPd3ellELL&@ AABTIPrG   r4   zR
    Base class for Gemma4 causal language model (or autoregressive) outputs.
    c                   z   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   Sr\R                  S-  \S
'   Sr\\\\R(                  \R(                  4   4   S-  \S'   Srg)Gemma4CausalLMOutputWithPastc   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
shared_kv_states (`dict`, *optional*):
    Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
    Used to pass shared KV states between layers during KV sharing.
Nlosslogitspast_key_valueshidden_states
attentionsr6   r7   r8   r9   )r:   r;   r<   r=   r>   rL   r?   r@   rA   rM   rN   r   rO   rD   rP   r6   r7   r8   rB   rC   rE   rF   r9   rG   rH   rJ   rJ   c   s    * &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T1848**T18LPd3ellELL&@ AABTIPrG   rJ   c                   j    \ rS rSr% SrSr\\\\	R                  \	R                  4   4   S-  \S'   Srg)Gemma4TextModelOutputWithPast   a!  
BaseModelOutputWithPast extended with shared_kv_states for KV sharing.

Args:
    shared_kv_states (`dict`, *optional*):
        Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
        Used to pass shared KV states between layers during KV sharing.
Nr8   r9   )r:   r;   r<   r=   r>   r8   rB   rC   rD   r?   rE   rA   rF   r9   rG   rH   rR   rR      s7     MQd3ellELL&@ AABTIPrG   rR   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma4AudioModelOutput   z
attention_mask (`torch.BoolTensor`, *optional*):
    A torch.BoolTensor of shape `(batch_size, num_frames)`. True for valid positions, False for padding.
Nattention_maskr9   )
r:   r;   r<   r=   r>   rW   r?   
BoolTensorrA   rF   r9   rG   rH   rU   rU      s    
 /3NE$$t+2rG   rU   c                   |   ^  \ rS rSrS\\-  S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr
S
rU =r$ )Gemma4ClippableLinear   configin_featuresout_featuresreturnNc                   > [         TU ]  5         UR                  U l        [        R                  " X#SS9U l        U R                  (       a  U R                  S[        R                  " [        S5      * 5      5        U R                  S[        R                  " [        S5      5      5        U R                  S[        R                  " [        S5      * 5      5        U R                  S[        R                  " [        S5      5      5        g g )NFbias	input_mininf	input_max
output_min
output_max)
super__init__use_clipped_linearsr   Linearlinearregister_bufferr?   tensorfloat)selfr\   r]   r^   	__class__s       rH   ri   Gemma4ClippableLinear.__init__   s     	#)#=#= iiF##  ellE%L=.IJ  ell5<.HI  u||U5\M/JK  u||E%L/IJ	 $rG   rO   c                    U R                   (       a+  [        R                  " XR                  U R                  5      nU R                  U5      nU R                   (       a+  [        R                  " XR                  U R                  5      nU$ N)rj   r?   clamprc   re   rl   rf   rg   )rp   rO   s     rH   forwardGemma4ClippableLinear.forward   sX    ##!KK~~t~~VMM2##!KKXMrG   )rl   rj   )r:   r;   r<   r=   r/   r,   intri   r?   rE   rv   rF   __classcell__rq   s   @rH   rZ   rZ      sY    K"%66K K 	K
 
K 	U\\ 	ell 	 	rG   rZ   c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  4S jr
S\R                  S\R                  4S	 jrS
rU =r$ )Gemma4RMSNorm   dimeps
with_scalec                    > [         TU ]  5         X l        X0l        U R                  (       a/  [        R
                  " [        R                  " U5      SS9U l        g g )NT)requires_grad)	rh   ri   r   r   r   	Parameterr?   onesweight)rp   r~   r   r   rq   s       rH   ri   Gemma4RMSNorm.__init__   s>    $??,,uzz#dKDK rG   rO   c                     UR                  S5      R                  SSS9U R                  -   nU[        R                   " US5      -  $ )Nr)   T)keepdim      )powmeanr   r?   )rp   rO   mean_squareds      rH   _normGemma4RMSNorm._norm   sA    $((+00T0BTXXMuyyt<<<rG   r_   c                     U R                  UR                  5       5      nU R                  (       a  X R                  R                  5       -  nUR	                  U5      $ rt   )r   ro   r   r   type_as)rp   rO   normed_outputs      rH   rv   Gemma4RMSNorm.forward   sF    

=#6#6#89??)KK,=,=,??M$$]33rG   )r   r   r   )gư>T)r:   r;   r<   r=   rx   ro   boolri   r?   rE   r   rv   rF   ry   rz   s   @rH   r|   r|      sW    LC Le L L L=5<< =
4U\\ 4ell 4 4rG   r|   c                      ^  \ rS rSr% Sr\R                  \S'   S\4U 4S jjr	\R                  " 5       S\R                  S\R                  4S j5       rS	rU =r$ )
 Gemma4AudioRelPositionalEncoding   zSinusoidal relative positional encoding for the audio encoder.

Produces position embeddings of shape [1, context_size // 2 + 1, hidden_size] with
concatenated [sin..., cos...] layout matching the original Gemma4 convention.
inv_timescalesr\   c                   > [         TU ]  5         UR                  U l        UR                  UR                  -   S-
  UR
                  -   U l        SnSnU R                  S-  n[        R                  " X2-  5      [        US-
  S5      -  nU[        R                  " [        R                  " U5      U* -  5      -  nU R                  SUR                  S5      R                  S5      SS9  g )	Nr+         ?     @r)   r   r   F
persistent)rh   ri   hidden_sizeattention_chunk_sizeattention_context_leftattention_context_rightcontext_sizemathlogmaxr?   exparangerm   	unsqueeze)rp   r\   min_timescalemax_timescalenum_timescaleslog_timescale_incrementr   rq   s          rH   ri   )Gemma4AudioRelPositionalEncoding.__init__   s    !--''&*G*GG!KfNlNll 	 ))Q."&((=+H"ICP^abPbdeLf"f&5<<3OSjRj3j)kk-~/G/G/J/T/TUV/WdijrG   rO   r_   c                 b   [         R                  " U R                  S-  SSUR                  S9nUS   nX R                  R                  UR                  S9-  n[         R                  " [         R                  " U5      [         R                  " U5      /SS9nUR                  UR                  S9$ )Nr)   r   device.Nr~   dtype)
r?   r   r   r   r   tocatsincosr   )rp   rO   position_idsscaled_time	pos_embeds        rH   rv   (Gemma4AudioRelPositionalEncoding.forward   s    ||D$5$5$:B=K_K_`#I."%8%8%;%;=CWCW%;%XXIIuyy5uyy7MNTVW	||-"5"5|66rG   )r   r   )r:   r;   r<   r=   r>   r?   rE   rA   r,   ri   no_gradrv   rF   ry   rz   s   @rH   r   r      sS     LL k0 k ]]_7U\\ 7ell 7 7rG   r   c                   f  ^  \ rS rSrSrS\S\4U 4S jjrS\R                  S\R                  4S jr
S\R                  S\R                  4S	 jrS
\R                  S\R                  4S jr SS\R                  S\R                  S\R                  S-  S\\R                  S4   4S jjrSrU =r$ )Gemma4AudioAttention   z3Chunked local attention with relative position biasr\   	layer_idxc                   > [         TU ]  5         Xl        X l        UR                  U l        UR                  UR                  -  U l        UR                  U l	        U R                  S-  [        R                  " S5      -  U l        [        R                  " S[        R                  -   5      [        R                  " S5      -  U l        UR                  U l        UR"                  S-
  U l        UR&                  U l        U R                   U R$                  -   U R(                  -   U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  UR                  5      U l        [6        R8                  " UR                  U R                  U R                  -  SS9U l        [6        R<                  " [>        R@                  " U R                  5      5      U l!        U RE                  S[>        RF                  " U R
                  5      SS9  g )Nr   r)   r+   Fra   softcapr   )$rh   ri   r\   r   attention_logit_capattention_logits_soft_capr   num_attention_headshead_dim	num_headsr   r   q_scaleek_scaler   
chunk_sizer   max_past_horizonr   max_future_horizonr   rZ   q_projk_projv_projpostr   rk   relative_k_projr   r?   zerosper_dim_scalerm   rn   rp   r\   r   rq   s      rH   ri   Gemma4AudioAttention.__init__   s   ")/)C)C&**f.H.HH33t+txx{:xxDFF
+dhhqk9 55 & = = A"("@"@ OOd.C.CCdF]F]]+F4F4FY]YfYfHfg+F4F4FY]YfYfHfg+F4F4FY]YfYfHfg)&2D2DfFXFXY	!yy););T^^dmm=[bgh\\%++dmm*DEYT5S5S(TafgrG   rO   r_   c           	         UR                   u  p#pEX0R                  -   S-
  U R                  -  nX`R                  -  U-
  n[        R                  " USSSSSU45      nUR	                  X&U R                  XE5      R                  5       $ )zSplits a `(batch_size, seq_len, num_heads, head_dim)` tensor into non-overlapping blocks of `chunk_size` along the sequence dim.r+   r   )shaper   Fpadreshape
contiguous)rp   rO   
batch_sizeseq_lenr   r   
num_blocksr   s           rH   _convert_to_block&Gemma4AudioAttention._convert_to_block  s|    3@3F3F0
Y/!3G
??*W4maAq!S-AB$$ZT__ibmmoorG   c           
      @   UR                   u  p#pE[        R                  " USSSSU R                  U R                  U R
                  -   S-
  45      nUR                  SU R                  U R
                  5      n[        R                  " USS5      nUR                  5       $ )z`Extracts overlapping context windows of `context_size` for every block, strided by `chunk_size`.r   r+   r   r)   )r   r   r   r   r   r   unfoldr   r?   movedimr   )rp   rO   r   r   r   r   s         rH   _extract_block_context+Gemma4AudioAttention._extract_block_context  s    3@3F3F0
YAq!Q(=(=t?V?VY]YhYh?hkl?lm
 &,,Q0A0A4??SmR;''))rG   xc                     UR                   u  p#pEnU R                  n[        R                  " USUS-   U-
  45      nUR	                  X#XEUS-   -  5      nUSSXW-  24   nUR	                  X#XEU5      $ )zjRelative position shift for blocked attention. See appendix B of https://huggingface.co/papers/1901.02860.r   r+   .N)r   r   r   r   view)rp   r   r   r   r   
block_sizeposition_lengthr   s           rH   
_rel_shiftGemma4AudioAttention._rel_shift$  s    IJF
z((EE!a)O;<=FF:*LSTDT6UVc.Z.../vvjZ\RRrG   Nposition_embeddingsrW   c                    UR                   u  pEnXEU R                  U R                  4nU R                  U5      R	                  5       R                  U5      nU R                  U5      R	                  5       R                  U5      n	U R                  U5      R	                  5       R                  U5      n
XR                  -  [        R                  " U R                  5      -  nXR                  -  n	U R                  U5      nU R                  U	5      n	U R                  U
5      n
UR                   S   nU R                  U5      nUR                  SU R                  U R                  5      nUR!                  UR"                  S9nUR%                  SSSSS5      nXR%                  SSSSS5      -  nUR'                  X@R                  SU R                  5      nXR%                  SSS5      -  nUR'                  X@R                  XR(                  S5      nU R+                  U5      nUU-   nUU R,                  -  n[.        R0                  " U5      nUU R,                  -  nUb4  UR3                  UR5                  5       U R6                  R8                  5      n[        R:                  " US[.        R<                  S9R!                  U
R"                  5      nUU
R%                  SSSSS5      -  nUR%                  SSSSS5      R'                  XKU R(                  -  S5      nUS S 2S U24   R?                  5       nU RA                  UR!                  UR"                  5      5      nUU4$ )	Nr+   r   r   r   r
   r)      r~   r   )!r   r   r   r   ro   r   r   r   r   r   softplusr   r   r   r   r   r   r   permuter   r   r   r   r?   tanhmasked_filllogical_notr\   attention_invalid_logits_valuesoftmaxfloat32r   r   )rp   rO   r   rW   r   
seq_length_hidden_shapequery_states
key_statesvalue_statesr   relative_key_statesqueries	matrix_acqueries_flat	matrix_bdattn_weightsattn_outputs                      rH   rv   Gemma4AudioAttention.forward-  s    %2$7$7!
"N{{=1779>>|L[[/557<<\J
{{=1779>>|L#ll2QZZ@R@R5SS,,.
--l;00<
22<@!''*
"223FG166r4>>4==Y144<;M;M4N&&q!Q1500Aq!Q??	z>>2t}}U #>#>q!Q#GG	%%j..*oo_ab	OOI.	 9,#dll2zz,/#dll2%'33**,dkk.X.XL yy2U]]KNN|OaOab"\%9%9!Q1a%HH!))!Q1a8@@Z^ZiZiMikmn!![j[.1<<>ii}/B/B CDL((rG   )r   r   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rt   )r:   r;   r<   r=   r>   r,   rx   ri   r?   rE   r   r   r   rX   rD   rv   rF   ry   rz   s   @rH   r   r      s    =h0 hS h4pu|| p p*ELL *U\\ *SELL SU\\ S 37	1)||1) #\\1) ((4/	1)
 
u||T!	"1) 1)rG   r   c                   l   ^  \ rS rSrU 4S jrSS\R                  S\R                  S-  4S jjrSrU =r	$ )	'Gemma4AudioSubSampleConvProjectionLayeria  c           	         > [         TU ]  5         [        R                  " UUSSSSS9U l        [        R
                  " X#SSS9U l        [        R                  " 5       U l        g )N)r
   r
   )r)   r)   r+   F)in_channelsout_channelskernel_sizestridepaddingrb   T)r   elementwise_affinerb   )	rh   ri   r   Conv2dconv	LayerNormnormReLUact)rp   r  r  norm_epsrq   s       rH   ri   0Gemma4AudioSubSampleConvProjectionLayer.__init__b  sU    II#%
	 LLPT[`a	779rG   NrO   maskc           
         Ub(  UR                  UR                  S9nXS S 2S S S 2S 4   -  nU R                  UR                  U R                  R                  R                  5      5      nU R                  U R                  UR                  SSSS5      5      R                  SSSS5      R                  5       5      nUb  US S 2S S S24   nX4$ )Nr   r   r)   r
   r+   )	r   r   r  r   r   r  r  r   r   )rp   rO   r  s      rH   rv   /Gemma4AudioSubSampleConvProjectionLayer.forwardo  s    77-"6"677D)D!T1A,BBM		-"2"24993C3C3I3I"JK=+@+@Aq!+L!M!U!UVWYZ\]_`!a!l!l!no3Q3<D""rG   )r  r  r  rt   )
r:   r;   r<   r=   ri   r?   rE   rv   rF   ry   rz   s   @rH   r
  r
  a  s-    #U\\ #9L # #rG   r
  c            	          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S-  S\\R                  \R                  4   4S jjr	S	r
U =r$ )"Gemma4AudioSubSampleConvProjectioni}  r\   c                 d  > [         TU ]  5         [        SUR                  S   UR                  S9U l        [        UR                  S   UR                  S   UR                  S9U l        UR                  S   S-  UR                  S   -  n[        R                  " X!R                  SS9U l
        g )Nr+   r   )r  r  r  r   Fra   )rh   ri   r
  subsampling_conv_channelsrms_norm_epslayer0layer1r   rk   r   input_proj_linear)rp   r\   proj_input_dimrq   s      rH   ri   +Gemma4AudioSubSampleConvProjection.__init__~  s    =99!<((

 >88;99!<((

 !::1=BfFfFfghFii!#>;M;MTY!ZrG   Ninput_featuresinput_features_maskr_   c                    UR                  S5      nU R                  X25      u  p4U R                  X45      u  p4UR                  u  pVpvUR	                  SSSS5      R                  5       R                  XWS5      nU R                  U5      U4$ )Nr+   r   r)   r
   r   )r   r"  r#  r   r   r   r   r$  )rp   r'  r(  rO   r  r   r   r   s           rH   rv   *Gemma4AudioSubSampleConvProjection.forward  s    
 '003"kk-M"kk->$1$7$7!
w%--aAq9DDFNNzdfg%%m4d::rG   )r$  r"  r#  rt   )r:   r;   r<   r=   r,   ri   r?   rE   rD   rv   rF   ry   rz   s   @rH   r  r  }  s\    [0 [$ 48;; #\\D0; 
u||U\\)	*	; ;rG   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Gemma4AudioFeedForwardi  r\   c                   > [         TU ]  5         Xl        [        XR                  UR                  S-  5      U l        [        XR                  S-  UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l	        [        UR                     U l        UR                  U l        UR                  U l        g )Nr   )rh   ri   r\   rZ   r   ffw_layer_1ffw_layer_2r|   pre_layer_normpost_layer_normr   
hidden_actact_fngradient_clippingresidual_weightpost_layer_scalerp   r\   rq   s     rH   ri   Gemma4AudioFeedForward.__init__  s    09K9KVM_M_bcMcd09K9Ka9OQWQcQcd+F,>,>?,V-?-?@V../!'!9!9 & 6 6rG   rO   r_   c                    [        U R                  [        R                  " UR                  5      R
                  5      nUn[        R                  " X* U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      n[        R                  " X* U5      nU R                  U5      nXR                  -  nX-  nU$ rt   )minr4  r?   finfor   r   ru   r0  r.  r3  r/  r1  r6  )rp   rO   r4  residuals       rH   rv   Gemma4AudioFeedForward.forward  s     6 6MDWDW8X8\8\] M3EGXY++M:((7M2((7M3EGXY,,];...!rG   )r3  r\   r.  r/  r4  r1  r6  r0  r:   r;   r<   r=   r,   ri   r?   rE   rv   rF   ry   rz   s   @rH   r,  r,    s0    70 7U\\ ell  rG   r,  c                   l   ^  \ rS rSr\S 5       rS\R                  S\R                  4U 4S jjrSr	U =r
$ )Gemma4AudioCausalConv1di  c                 n    U R                   S   S-
  U R                  S   -  S-   nXR                  S   -
  $ )Nr   r+   )r  dilationr  )rp   effective_kernel_sizes     rH   left_pad Gemma4AudioCausalConv1d.left_pad  s<    !%!1!1!!4q!8DMM!<L Lq P${{1~55rG   r   r_   c                 x   > [         R                  R                  XR                  S45      n[        TU ]  U5      $ Nr   )r   r	   r   rD  rh   rv   )rp   r   rq   s     rH   rv   Gemma4AudioCausalConv1d.forward  s1     MMa--!34wq!!rG   r9   )r:   r;   r<   r=   r   rD  r?   rE   rv   rF   ry   rz   s   @rH   r@  r@    s;     6 6"<<" 
	" "rG   r@  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Gemma4AudioLightConv1di  r\   c                   > [         TU ]  5         Xl        [        XR                  UR                  S-  5      U l        [        XR                  UR                  5      U l        [        UR                  UR                  UR                  UR                  SS9U l	        [        UR                  UR                  SS9U l        [        UR                  UR                  SS9U l        [        UR                     U l        UR"                  U l        g )Nr)   F)r  r  r  groupsrb   Tr   r   )rh   ri   r\   rZ   r   linear_start
linear_endr@  conv_kernel_sizedepthwise_conv1dr|   r!  r0  	conv_normr   r2  r3  r4  r7  s     rH   ri   Gemma4AudioLightConv1d.__init__  s    1&:L:LfN`N`cdNde/8J8JFL^L^_ 7**++//%%!
 ,F,>,>FDWDWdhi&v'9'9v?R?R_cdV../!'!9!9rG   rO   r_   c                    UnU R                  U5      nU R                  U5      n[        R                  R	                  USS9nU R                  UR                  SS5      5      R                  SS5      n[        U R                  [        R                  " UR                  5      R                  5      n[        R                  " X* U5      nU R                  U5      nU R                  U5      nU R!                  U5      nX-  nU$ )Nr   r   r+   r)   )r0  rN  r   r	   glurQ  	transposer:  r4  r?   r;  r   r   ru   rR  r3  rO  )rp   rO   r<  r4  s       rH   rv   Gemma4AudioLightConv1d.forward  s     ++M:))-8))-R)@--m.E.Ea.KLVVWXZ[\   6 6MDWDW8X8\8\]M3EGXY}5M26!rG   )r3  r\   rR  rQ  r4  rO  rN  r0  r>  rz   s   @rH   rJ  rJ    s0    :0 :(U\\ ell  rG   rJ  c            
          ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  S-  S\R                  S	\
\   S
\R                  4
S jrSrU =r$ )Gemma4AudioLayeri	  r\   r   c                 l  > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        [        X5      U l        [        U5      U l	        [        UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l        UR                  U l        g rt   )rh   ri   r\   r,  feed_forward1feed_forward2r   	self_attnrJ  lconv1dr|   r   norm_pre_attnnorm_post_attnnorm_outr4  r   s      rH   ri   Gemma4AudioLayer.__init__
  s    3F;3F;-f@-f5*6+=+=>+F,>,>?%f&8&89!'!9!9rG   rO   rW   Nr   kwargsr_   c                 8   [        U R                  [        R                  " U R                  R
                  R                  5      R                  5      nU R                  U5      nUn[        R                  " X* U5      nU R	                  U5      nU R                  UUUS9u  p[        R                  " X* U5      nU R                  U5      nX-  nU R                  U5      nU R                  U5      n[        R                  " X* U5      nU R                  U5      nU$ )N)rO   r   rW   )r:  r4  r?   r;  r_  r   r   r   r[  ru   r]  r`  r^  r\  ra  )rp   rO   rW   r   rc  r4  r<  r   s           rH   rv   Gemma4AudioLayer.forward  s      6 6DDVDVD]D]DcDc8d8h8hi**=9 M3EGXY**=9>>' 3) * 
 M3EGXY++M:!]3**=9M3EGXYm4rG   )	r\   r[  r\  r4  r^  ra  r`  r_  r]  )r:   r;   r<   r=   r,   rx   ri   r?   rE   rX   r   r    rv   rF   ry   rz   s   @rH   rY  rY  	  sn    :0 :S : ||  ((4/  #\\	 
 +,  
   rG   rY  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrS\R                  S\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )Gemma4VisionPatchEmbedderi?  r\   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR
                  U l        [        R                  " SU R                  S-  -  U R                  SS9U l        [        R                  " [        R                  " SU R
                  U R                  5      5      U l        g )Nr
   r)   Fra   )rh   ri   r\   r   
patch_sizeposition_embedding_sizer   rk   
input_projr   r?   r   position_embedding_tabler7  s     rH   ri   "Gemma4VisionPatchEmbedder.__init__@  s    !-- ++'-'E'E$))A(:$:D<L<LSXY(*UZZ4C_C_aeaqaq5r(s%rG   pixel_position_idspadding_positionsr_   c                    UR                  SS9n[        R                  " US   U R                  S   5      n[        R                  " US   U R                  S   5      nXE-   n[        R
                  " UR                  S5      SU5      nU$ )aC  Compute 2-D patch position embeddings via embedding lookup.

``pixel_position_ids`` has shape ``(batch, num_patches, 2)`` where the
last dimension holds (x, y) indices into ``position_embedding_table``
(shape ``(2, position_embedding_size, hidden_size)``).  The result is the
sum of the x- and y-embeddings for each patch.
r   r:  .r   .r+   r+   r           )ru   r   	embeddingrl  r?   wherer   )rp   rn  ro  clamped_positionsx_emby_embr   s          rH   _position_embeddings.Gemma4VisionPatchEmbedder._position_embeddingsJ  s     /444; -f5t7T7TUV7WX-f5t7T7TUV7WX#m#kk*;*E*Eb*I3Pcd""rG   pixel_valuesc                     SUS-
  -  nU R                  UR                  U R                   R                  R                  5      5      nU R	                  X#5      nXE-   $ )Nr)         ?)rk  r   r   r   rz  )rp   r|  rn  ro  rO   r   s         rH   rv   !Gemma4VisionPatchEmbedder.forward`  sU     L3./8N8N8T8T(UV"778J^22rG   )r\   r   rk  ri  rj  rl  )r:   r;   r<   r=   r/   ri   r?   rE   rz  rv   rF   ry   rz   s   @rH   rg  rg  ?  sz    t1 t#u|| #X]XdXd #iniuiu #,3!LL3>Cll3_d_k_k3	3 3rG   rg  c                   @  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	S\
\R                  \R                  4   4S	 jr SS\R                  S\R                  S\R                  S\	S
-  S\
\R                  \R                  4   4
S jjrSrU =r$ )Gemma4VisionPoolerij  aK  Spatial pooling and ``sqrt(hidden_size)`` scaling for vision encodings.

The scaling expands the activation magnitude, which can exceed the float16 range, so it is
computed in float32 and the pooled features are returned in float32. The caller
(``Gemma4VisionModel.forward``) standardizes them and casts back to the working dtype.
r\   c                 l   > [         TU ]  5         UR                  U l        U R                  S-  U l        g )Nr~  )rh   ri   r   root_hidden_sizer7  s     rH   ri   Gemma4VisionPooler.__init__r  s/    !-- $ 0 0# 5rG   rO   rn  lengthr_   c                 z   UR                   S   n[        XC-  S-  5      nUS-  nXc-  U:w  a'  [        SUR                    SU SU< SU< SU S	35      eUR                  S
S9nUS   R	                  SSS9S
   S-   n[
        R                  " XuSS9n	U	S   X-  U	S   -  -   n	[        R                  " U	R                  5       U5      R                  5       U-  n
U
R                  SS5      UR                  5       -  n[
        R                  " U
S
:H  R                  SS95      nUR                  UR                  5      U4$ )z
2D spatial pooling according to patch positions.
Pools the input tokens by averaging patches within a `k^2` grid, where `k` is determined by the ratio between
input and output lengths
r+   r~  r)   zCannot pool z to z: k=z^2 times length=z	 must be .r   rq  rr  r   Tr~   r   floor)rounding_moders  r   )r   rx   
ValueErrorru   r   r?   divr   one_hotlongro   rV  r   allr   r   )rp   rO   rn  r  input_seq_lenk	k_squaredrw  max_xkernel_idxsweightsoutputr  s                rH   _avg_pool_by_positions)Gemma4VisionPooler._avg_pool_by_positionsw  s`    &++A.(S01qD	.}2234xu!EVviW`an`oopq  /444;!&)--"d-CAFJii 1GL!&)UZ;v;N,NN))K,,.7==?)K""1a(=+>+>+@@  'Q,!3!3!3!:;yy,,-t33rG   Nro  output_lengthc                 4   XAR                   S   :  a  [        SU SUR                   S    S35      eUR                  UR                  S5      S5      nUR                   S   U:w  a  U R	                  XU5      u  pUR                  5       U R                  -  nX4$ )Nr+   z*Cannot output more soft tokens (requested z) than there are patches (z9). Change the value of `num_soft_tokens` when processing.r   rt  )r   r  r   r   r  ro   r  )rp   rO   rn  ro  r  s        rH   rv   Gemma4VisionPooler.forward  s     ..q11<]O L"((+,,eg 
 &112C2M2Mb2QSVWq!]2/3/J/J=0,M &++-0E0EE//rG   )r   r  rt   )r:   r;   r<   r=   r>   r/   ri   r?   rE   rx   rD   r  rv   rF   ry   rz   s   @rH   r  r  j  s    61 6
4"\\4?D||4UX4	u||U\\)	*4@ %)0||0 "LL0 !<<	0
 Tz0 
u||U\\)	*0 0rG   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Gemma4VisionMLPi  r\   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [        XR                  U R                  5      U l        [        XR                  U R                  5      U l        [        XR                  U R                  5      U l        [        UR                     U l        g rt   )rh   ri   r\   r   intermediate_sizerZ   	gate_projup_proj	down_projr   hidden_activationr3  r7  s     rH   ri   Gemma4VisionMLP.__init__  s    !--!'!9!9.v7G7GI_I_`,V5E5EtG]G]^.v7M7MtO_O_`V556rG   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rt   r  r3  r  r  rp   r   r  s      rH   rv   Gemma4VisionMLP.forward  6    NN4;;t~~a/@#ADLLQRO#ST	rG   r3  r\   r  r  r   r  r  )	r:   r;   r<   r=   r/   ri   rv   rF   ry   rz   s   @rH   r  r    s    71 7 rG   r  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	   SS\S-  S\R                  S-  S\S-  S\S	\4   4S
 jj5       r\R                  " 5       \S 5       5       rSrU =r$ )Gemma4VisionRotaryEmbeddingi  inv_freqNr\   c                   > [         TU ]  5         UR                  U l        UR                  U l        Xl        U R
                  R                  S   U l        U R                  nU R                  S:w  a  [        U R                     nU" U R
                  U5      u  o@l
        U R                  SUSS9  U R                  SUR                  5       SS9  g )N	rope_typedefaultr  Fr   original_inv_freq)rh   ri   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr\   rope_parametersr  compute_default_rope_parametersr   attention_scalingrm   clone)rp   r\   r   rope_init_fnr  rq   s        rH   ri   $Gemma4VisionRotaryEmbedding.__init__  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L((ZeD0(..2BuUrG   r   r   r_   torch.Tensorc           	      "   U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nUS-  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetar   Nr)   r   r   r   r   r   	r  getattrr   r   r?   r   int64r   ro   )r\   r   r   baser~   spatial_dimattention_factorr  s           rH   r  ;Gemma4VisionRotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c QhQQekkBEEV[`[f[fEgjuuw
 ))rG   c                 $   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn/ / pe[        S5       H  nUS S 2S S 2U4   nUS S 2S S S 24   R                  5       n	[        USS9   UR                  5       U	R                  5       -  R                  SS5      n
[        R                  " X4SS	9nUR                  5       U R                  -  nUR!                  5       U R                  -  nS S S 5        UR#                  W5        UR#                  W5        M     [        R                  " USS	9R	                  UR$                  S
9n[        R                  " USS	9R	                  UR$                  S
9nX4$ ! , (       d  f       N= f)Nr   r   r+   mpscpur)   Fdevice_typeenabledr   r   )r  ro   expandr   r   r   
isinstancetyperC   ranger%   rV  r?   r   r   r  r   appendr   )rp   r   r   inv_freq_expandedr  all_cosall_sinidim_position_idsdim_position_ids_expandedfreqsembr   r   s                 rH   rv   #Gemma4VisionRotaryEmbedding.forward  s    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr'1!((--'E'E!((--[`J`ahhmmfk rqA+Aq!G4(8D!(D(J(J(L%KG*0025N5T5T5VVaabcefgiiB7ggi$"8"88ggi$"8"88	 H
 NN3NN3  iiR(++!''+:iiR(++!''+:x HGs   6BH
H	)r  r\   r  r  r  rt   NNN)r:   r;   r<   r=   r?   rE   rA   r/   ri   staticmethodr   rx   rD   ro   r  r   r   rv   rF   ry   rz   s   @rH   r  r    s    llV1 V V  ,0&*" *"T) *t# * t * 
~u$	%	 *  *D ]]_  rG   r  c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr   r)   r   )r   r?   r   )r   x1x2s      rH   rotate_halfr    sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''rG   r   r   r   unsqueeze_dimc                 l    UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   $ )a$  Applies Rotary Position Embedding to the query and key tensors.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r   r  r   r   r   r  s       rH   apply_rotary_pos_embr    s6    " --
&C
--
&CGA,--rG   rO   n_repr_   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r+   N)r   r  r   )rO   r  batchnum_key_value_headsslenr   s         rH   	repeat_kvr  )  s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTrG   modulequerykeyvaluerW   dropoutscalingr   c                 j   Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb  X-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr   r)   r
   r   r   )ptrainingr+   )r   r  num_key_value_groupsr?   matmulrV  r   r   r	   r   r   r   r   r  r  r   )r  r  r  r  rW   r  r  r   rc  r   r   r  r  s                rH   eager_attention_forwardr  5  s    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!#4 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$rG   r   c           
         UR                   S   nU R                   S   nSUSU-  -  -  nUS::  a  [        SU SU SU S35      eU/U-  n[        R                  " XSS9n	[        R                  " XSS9n
[        R                  " X(SS9n[	        U5       Vs/ s H  n[        X   X   X   US	9PM     nn[        R                  " USS9$ s  snf )
a#  Applies multidimensional RoPE to inputs.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        If position_ids.ndim + 2 == x.ndim, then this function passes through to `apply_rotary_pos_emb()`.
        Otherwise, position_ids is used to split the inputs, x, into multiple pieces, where each piece is fed to
        `apply_rotary_pos_emb()`, and then concatenated back together.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.

Returns:
  Tensor of shape [B, L, N, H] with RoPE applied.
r   r)   r   zEInvalid configuration: num_rotated_channels_per_dim must be > 0, got z (num_input_channels=z, ndim=)r   r  )r   r  r?   splitr  r  r   )r   r   r   r   r  ndimnum_input_channelsnum_rotated_channels_per_dimsplit_sizesx_parts	cos_parts	sin_partsr  y_partss                 rH   apply_multidimensional_roper  W  s
   8 b!D#$(:q4x(H#I #q(,--BCUBV WF!
 	
 0047Kkk!b1GC"5IC"5I t A 	j'		
    99W"%%s   C
c                      ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\R                  S	\R                  S-  S
\R                  S-  S\\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Gemma4VisionAttentioni  =Multi-headed attention from 'Attention Is All You Need' paperr\   r   c                   > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        [        USUR                  UR                  -  5      U l
        UR                  UR                  -  U l        SU l        U R
                  R                  U l        SU l        [!        XR                  UR                  U R                  -  5      U l        [!        XR                  UR                  U R                  -  5      U l        [!        XR                  UR                  U R                  -  5      U l        [!        XR                  U R                  -  UR                  5      U l        [+        UR                  UR,                  S9U l        [+        UR                  UR,                  S9U l        [+        U R                  UR,                  SS9U l        g )Nlayer_typesr   r   Fr~   r   rM  )rh   ri   hasattrr  
layer_typer\   r   r  r   r   r   r  r  r  attention_dropout	is_causalrZ   r   r   r   o_projr|   r!  q_normk_normv_normr   s      rH   ri   Gemma4VisionAttention.__init__  sw   ;B6=;Y;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>+F4F4FHbHbeiererHrs+F4F4FHbHbeiererHrs+F4F4FHbHbeiererHrs+F4N4NQUQ^Q^4^`f`r`rs#V=P=PQ#V=P=PQ#DMMv7J7JW\]rG   NrO   r   rW   r   rc  r_   c                 L   UR                   S S n/ UQSPU R                  P7nUu  pU R                  U5      R                  U5      n
U R	                  U
5      n
[        XX5      n
U
R                  SS5      n
U R                  U5      R                  U5      nU R                  U5      n[        XX5      nUR                  SS5      nU R                  U5      R                  U5      nU R                  U5      nUR                  SS5      n[        R                  " U R                  R                  [        5      nU" U U
UUU4U R                   (       a  U R"                  OSU R$                  S.UD6u  pUR&                  " / UQSP76 R)                  5       nU R+                  U5      nX4$ )Nr   r+   r)   rt  )r  r  )r   r   r   r   r  r  rV  r   r  r   r  r   get_interfacer\   _attn_implementationr  r  r
  r  r   r   r  )rp   rO   r   rW   r   rc  input_shaper   r   r   r   r   r   attention_interfacer  r  s                   rH   rv   Gemma4VisionAttention.forward  s    $))#2.88b8$--8&{{=166|D{{<02<cX#--a3[[/44\B
[[,
0#T
))!Q/
{{=166|D{{<0#--a3(?(M(MKK,,.E)
 %8	%
 /3mmD**LL	%
 	%
! "));;;;FFHkk+.((rG   )r
  r\   r   r  r  r   r   r	  r  r  r  r   r  r  r   r  )r:   r;   r<   r=   r>   r/   rx   ri   r?   rE   
LongTensorr   r    rD   rv   rF   ry   rz   s   @rH   r  r    s    G^1 ^c ^, -1.204,)||,) #\\,) t+	,)
 &&-,) +,,) 
u||U\\D0%2E2LL	M,) ,)rG   r  c                     ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\
\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma4VisionEncoderLayeri  r\   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        [        XS9U l        [        U5      U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        g )Nr\   r   r   )rh   ri   r\   r   r   r  r]  r  mlpr|   r!  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   s      rH   ri   !Gemma4VisionEncoderLayer.__init__  s    !--".fR"6*,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'rG   NrO   r   rW   r   rc  r_   c                     UnU R                  U5      nU R                  " SUUUUS.UD6u  pU R                  U5      nXa-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nXa-   nU$ )N)rO   r   rW   r   r9   )r  r]  r  r   r  r!  )rp   rO   r   rW   r   rc  r<  r   s           rH   rv    Gemma4VisionEncoderLayer.forward  s     !,,];>> 
' 3)%	

 
 55mD 0 66}E/77F 0rG   )	r\   r   r  r   r  r  r!  r   r]  r  )r:   r;   r<   r=   r/   rx   ri   r?   rE   r  r   r    rD   r@   rv   rF   ry   rz   s   @rH   r  r    s    
c1 
cc 
c -1.204|| #\\ t+	
 &&- +, 
u  %(9(95;L;L(L"MPT"TT	U rG   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S-  S\	\
   S	\4
S
 jjrSrU =r$ )Gemma4VisionEncoderi   r\   c           
        > [         TU ]  5         Xl        UR                  U l        [        U5      U l        [        R                  " [        U R                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf )Nr  )rh   ri   r\   num_hidden_layers
num_layersr  
rotary_embr   
ModuleListr  r  layers)rp   r\   r  rq   s      rH   ri   Gemma4VisionEncoder.__init__  se     225f=mmKPQUQ`Q`KabKaa%VAKab
bs   A>Ninputs_embedsrW   rn  rc  r_   c                     [        U R                  UUS9nUnU R                  XS5      nU R                  SU R                  R                    H  nU" U4UUUS.UD6nM     [        US9$ )zw
pixel_position_ids (torch.Tensor):
    Patch positions as (x, y) coordinates in the image as [batch, num_patches, 2].
)r\   r.  rW   N)rW   r   r   last_hidden_state)r   r\   r*  r,  r(  r   )rp   r.  rW   rn  rc  rO   r   decoder_layers           rH   rv   Gemma4VisionEncoder.forward
  s     3;;')
 &"oomP "[[)H4;;+H+HIM)-$7/	
 M J 'GGrG   )r\   r,  r)  r*  rt   )r:   r;   r<   r=   r/   ri   r?   rE   r  r   r    r   rv   rF   ry   rz   s   @rH   r&  r&     so    
1 
 7;	H||H H ",,t3	H
 +,H 
!H HrG   r&  c                   :   ^  \ rS rSrS\S\4U 4S jjrS rSrU =r	$ )Gemma4TextMLPi,  r\   r   c                 X  > [         TU ]  5         UR                  UR                  -
  nX#s=:  =(       a    S:  Os  nUR                  =(       a    UnXl        UR                  U l        UR                  U(       a  SOS-  U l        [        R                  " U R                  U R                  SS9U l
        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        UR                     U l        g )Nr   r)   r+   Fra   )rh   ri   r(  num_kv_shared_layersuse_double_wide_mlpr\   r   r  r   rk   r  r  r  r   r  r3  )rp   r\   r   first_kv_shared_layer_idxis_kv_shared_layerr8  rq   s         rH   ri   Gemma4TextMLP.__init__-  s    $*$<$<v?Z?Z$Z!&GGaG$88O=O!--!'!9!9BUQ[\!]4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556rG   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rt   r  r  s      rH   rv   Gemma4TextMLP.forward:  r  rG   r  )
r:   r;   r<   r=   r.   rx   ri   rv   rF   ry   rz   s   @rH   r5  r5  ,  s!    7/ 7C 7 rG   r5  c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\	    SS\S-  S\
S   S\S-  S	\S-  S
\S\4   4
S jj5       r\R                   " 5       \SS j5       5       rSrU =r$ )Gemma4TextRotaryEmbeddingi?  r  Nr\   c                 |  > [         T
U ]  5         UR                  U l        UR                  U l        Xl        [        UR                  5      U l        0 U l        0 U l	        U R                   H  nU R
                  R                  U   nUc  M!  US   =nS:w  a
  [        U   nOU R                  nX`R                  U'   XPR                  U'   X#S.nUS:X  a  US:X  a  SUS'   U" U R
                  40 UD6u  pU R                  U S3US	S
9  U R                  U S3UR                  5       S	S
9  [        X S3U	5        M     g )Nr  r  )r   r	  full_attentionproportionalglobal_head_dimhead_dim_key	_inv_freqFr   _original_inv_freq_attention_scaling)rh   ri   r  r  r  r\   setr  rope_init_fnsr  r  r   r  rm   r  setattr)rp   r\   r   r	  rope_paramsr  r  rope_init_fn_kwargscurr_inv_freqcurr_attention_scalingrq   s             rH   ri   "Gemma4TextRotaryEmbedding.__init__B  sQ   "("@"@$*$B$B!v112SU)+**J++55jAK"(55	)C29=#CC-9z*)2NN:&-3"N--)~2M6G#N34@4dPc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST) +rG   r   ztorch.devicer   r	  r_   r  c           	         U R                   U   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
r  r   Nr   r   r)   r   r  r  )r\   r   r   r	  r  r~   r  r  s           rH   r  9Gemma4TextRotaryEmbedding.compute_default_rope_parametersb  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))rG   c                 H   [        X S35      n[        X S35      nUS S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS	9   UR                  5       UR                  5       -  R                  SS
5      n	[        R                  " X4SS9n
U
R                  5       U-  nU
R                  5       U-  nS S S 5        WR	                  UR                  S9WR	                  UR                  S94$ ! , (       d  f       N@= f)NrE  rG  r   r   r+   r  r  Fr  r)   r   r   )r  ro   r  r   r   r   r  r  rC   r%   rV  r?   r   r   r   r   )rp   r   r   r	  r  r  r  position_ids_expandedr  r  r  r   r   s                rH   rv   !Gemma4TextRotaryEmbedding.forward  sd    4<y!9:#DL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')//C'')//C	 D vvAGGv$cff177f&;;; DCs   +A.F
F!)r\   r  r  r  rI  r  NN)NNNNrt   )r:   r;   r<   r=   r?   rE   rA   r.   ri   r  r   rx   rC   rD   ro   r  r   r   rv   rF   ry   rz   s   @rH   r?  r?  ?  s    llU/ U U@ *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F ]]_<  <rG   r?  c                   &  ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\R                  S	\R                  S-  S
\
\\\R                  \R                  4   4   S\S-  S\\   S\\R                  \R                  S-  4   4S jjrSrU =r$ )Gemma4TextAttentioni  r  r\   r   c                    > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        U R                  S:H  U l        U R                  (       a  UR                  OS U l        U R                  (       d  UR                  (       a  UR                  OUR                  U l
        UR                  =(       a    U R                  (       + U l        U R                  (       a  UR                  OUR                  nUR                  U-  U l        SU l        U R
                  R$                  U l        UR&                  S:g  U l        U R
                  R*                  [-        U R
                  SS5      -
  nX$s=:  =(       a    S:  Os  U l        UR                  S U nU R.                  (       + =(       a6    U[1        U5      S-
  US S S2   R3                  UR                  U   5      -
  :H  U l        [6        R8                  " UR:                  UR                  U R                  -  UR<                  S	9U l        [A        U R                  URB                  S
9U l"        U R.                  (       d  [A        U R                  URB                  S
9U l#        [A        U R                  URB                  SS9U l$        [6        R8                  " UR:                  X0R                  -  UR<                  S	9U l%        U R                  (       d6  [6        R8                  " UR:                  X0R                  -  UR<                  S	9OS U l&        [6        R8                  " UR                  U R                  -  UR:                  UR<                  S	9U l'        g )Nr  sliding_attentionr   r  r7  r   r+   r   ra   r  FrM  )(rh   ri   r  r  r	  r\   r   
is_slidingsliding_windowrC  r   attention_k_eq_vuse_alternative_attentionnum_global_key_value_headsr  r   r  r  r
  use_bidirectional_attentionr  r(  r  r:  lenindexstore_full_length_kvr   rk   r   attention_biasr   r|   r!  r  r  r  r   r   r  )rp   r\   r   r  r9  prev_layersrq   s         rH   ri   Gemma4TextAttention.__init__  s   ;B6=;Y;Y&,,Y7_c"//-@@7;f33D6:oo&J`J`..flfufu)/)@)@)XEX&151O1OF--U[UoUo 	 %+$>$>BU$U!!%!>!>;;uD %)KK$A$AGDKKYoqrDs$s!"+"M"MA"M(()C*CD(,(?(?$? %/IQTU`QadeQehsbDi

%""9-
.R/ E/! ii : :T]] JQWQfQf
 $6;N;NO &&'DMMv?R?RSDK'6;N;N[`aDK))""$7--$GfNcNcDK
 55 		&,,.AMM.QX^XmXmn K ii&&68J8JQWQfQf
rG   NrO   r   rW   r8   rN   rc  r_   c                    UR                   S S n/ UQSPU R                  P7nUu  pU R                  U5      R                  U5      nU R	                  U5      n[        XU
SS9nUR                  SS5      nU R                  (       aG  X@R                     u  pUR                  UR                  5      nUR                  UR                  5      nOU R                  U5      R                  U5      nU R                  b   U R                  U5      R                  U5      OUnU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  U5      nUR                  SS5      nUb/  U R                  (       d  UR                  XU R                   5      u  pU R"                  (       a  X4X@R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U UUUU4U R.                  (       a  U R0                  OSU R2                  U R4                  S.UD6u  nnUR6                  " / UQSP76 R9                  5       nU R;                  U5      nUU4$ )Nr   r)   )r  r+   rt  )r  r  r[  )r   r   r   r   r  r  rV  r:  r	  r   r   r   r   r  r  updater   rb  r   r  r\   r  r  r  r
  r  r[  r   r   r  )rp   rO   r   rW   r8   rN   rc  r  r   r   r   r   r   r   r  r  r  s                    rH   rv   Gemma4TextAttention.forward  s>    $))#2.88b8$--8&{{=166|D{{<0+LsRST#--a3
 ""'7'H$J#|':':;J'??<+>+>?L]388FJLPKKLc4;;}5::<HisLZ0J-jsRSTJ#--a3J;;|4L'11!Q7L&t/F/F'6'='=jX\XfXf'g$J$$0:0H__-(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((rG   )r
  r\   r   r  r:  rZ  r  r   r   r	  r  r  r  r   r  r[  rb  r]  r  r   rt   )r:   r;   r<   r=   r>   r.   rx   ri   r?   rE   rB   rC   rD   r   r   r   rv   rF   ry   rz   s   @rH   rW  rW    s    G/
/ /
C /
n )-=)||=) #\\=) t+	=)
 sE%,,*D$EEF=) =) -.=) 
u||U\\D00	1=) =)rG   rW  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )Gemma4TextExpertsi  z2Collection of expert weights stored as 3D tensors.r\   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        [        R                  " [        R                  " U R                  SU R                  -  U R                  5      5      U l        [        R                  " [        R                  " U R                  U R                  U R                  5      5      U l        [        UR                     U l        g )Nr)   )rh   ri   num_expertsr   
hidden_dimmoe_intermediate_sizeintermediate_dimr   r   r?   emptygate_up_projr  r   r  r3  r7  s     rH   ri   Gemma4TextExperts.__init__  s    !-- ,, & < <LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV556rG   rO   top_k_indextop_k_weightsr_   c                 X   [         R                  " U5      n[         R                  " 5          [         R                  R                  R                  X R                  S9nUR                  SSS5      n[         R                  " UR                  SS9S5      R                  5       nS S S 5        W H  nUS   nXpR                  :X  a  M  [         R                  " WU   5      u  pX   n
[        R                  R                  XR                  U   5      R                  SSS9u  pU R                  U5      U-  n[        R                  R                  XR                   U   5      nXXS 4   -  nUR#                  SXR%                  UR&                  5      5        M     U$ ! , (       d  f       N= f)N)num_classesr)   r+   r   )r   r   r   )r?   
zeros_liker   r   r	   r  rl  r   greatersumnonzerorv  rl   rq  chunkr3  r  
index_add_r   r   )rp   rO   rs  rt  final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statess                 rH   rv   Gemma4TextExperts.forward  so    $..}=]]_((--55kO_O_5`K%--aA6K{8'DaHPPRJ 
 %J#AJ---#(;;{:/F#G I)4M}}++M;L;LZ;XY__`agi_jHD$(KK$5$:!$&MM$8$89NP^P^_iPj$k!$9)`dJd<e$e!**1i9Q9QReRkRk9lm % #"# _s   A7F
F))r3  r  rq  rm  ro  rl  )r:   r;   r<   r=   r>   r.   ri   r?   rE   rv   rF   ry   rz   s   @rH   rj  rj    sS    <7/ 7#||# \\# ||	#
 
# #rG   rj  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                  \R                  4   4S jr	Sr
U =r$ )Gemma4TextRouteri5  r\   c                 $  > [         TU ]  5         Xl        UR                  U l        U R                  S-  U l        UR
                  U l        [        U R                  U R                  SS9U l        [        R                  " UR                  UR                  SS9U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   FrM  ra   )rh   ri   r\   r   scalar_root_sizer!  r   r|   r  r   rk   rl  projr   r?   r   scaleper_expert_scaler7  s     rH   ri   Gemma4TextRouter.__init__6  s    !-- $ 0 0$ 6&&!$"2"2US	IIf00&2D2D5Q	\\%**T-=-=">?
 "UZZ8J8J-K LrG   rO   r_   c                 ^   U R                  U5      nXR                  -  U R                  -  nU R                  U5      n[        R
                  R                  USS9n[        R                  " UU R                  R                  SS9u  pEXDR                  SSS9-  nX@R                  U   -  nX4U4$ )Nr   r   )r  r~   Tr  )r  r  r  r  r   r	   r   r?   topkr\   top_k_expertsrz  r  )rp   rO   expert_scoresrouter_probabilitiesrt  rs  s         rH   rv   Gemma4TextRouter.forwardB  s    		-0%

2T5J5JJ		-0!}}44]4K &+ZZ kk''&
" 	**r4*@@ &(=(=k(JJ#K??rG   )r\   r   r   r  r  r  r  r  )r:   r;   r<   r=   r.   ri   r?   rE   rD   rv   rF   ry   rz   s   @rH   r  r  5  sD    
M/ 
M@U\\ @eELL%,,<V6W @ @rG   r  c                   @  ^  \ rS rSrS\\-  S\4U 4S jjr      SS\R                  S\R                  S\
\\\R                  \R                  4   4   S-  S	\R                  S
\R                  S-  S\R                  S-  S\S-  S\R                  4S jjrSrU =r$ )Gemma4TextDecoderLayeriY  r\   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        [        XS9U l        [        X5      U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        U R                  S[         R"                  " S5      5        UR$                  U l        U R$                  (       a  [&        UR(                     U l        [,        R.                  " U R                  U R$                  SS9U l        [,        R.                  " U R$                  U R                  SS9U l        [        U R                  UR                  S9U l        UR6                  U l        U R6                  (       a  [9        U5      U l        [=        U5      U l        [        U R                  UR                  S9U l         [        U R                  UR                  S9U l!        [        U R                  UR                  S9U l"        g g )Nr  r  layer_scalarr+   Fra   )#rh   ri   r\   r   r   rW  r]  r5  r  r|   r!  r  r  r   r!  rm   r?   r   hidden_size_per_layer_inputr   r  r3  r   rk   per_layer_input_gateper_layer_projectionpost_per_layer_input_normenable_moe_blockr  routerrj  expertspost_feedforward_layernorm_1post_feedforward_layernorm_2pre_feedforward_layernorm_2r   s      rH   ri   Gemma4TextDecoderLayer.__init__Z  s   !--",FP 3,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'^UZZ];+1+M+M(++ !9!9:DK(*		$2B2BDDdDdkp(qD%(*		$2R2RTXTdTdkp(qD%-:4;K;KQWQdQd-eD* & 7 7  *62DK,V4DL0=d>N>NTZTgTg0hD-0=d>N>NTZTgTg0hD-/<T=M=MSYSfSf/gD, !rG   NrO   per_layer_inputr8   r   rW   r   rN   r_   c           
      (   Un	U R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nUn	U R                  U5      nU R	                  U5      nU R
                  (       a  U R                  U5      nU	R                  SU	R                  S   5      nU R                  U5      u  pnU R                  U5      nU R                  XU5      nUR                  U	R                  5      nU R                  U5      nX-   nU R                  U5      nX-   nU R                  (       aN  Un	U R                  U5      nU R!                  U5      nX-  nU R#                  U5      nU R%                  U5      nX-   nXR&                  -  nU$ )N)rO   r   rW   r8   r   rN   r   r9   )r  r]  r  r   r  r  r  r   r   r  r  r  r  r!  r  r  r3  r  r  r  )rp   rO   r  r8   r   rW   r   rN   rc  r<  r   hidden_states_1hidden_states_flatrt  rs  hidden_states_2s                   rH   rv   Gemma4TextDecoderLayer.forwardv  s    !,,];>> 
' 3)-%+
 
 55mD 0 66}E/  "??NO "*!1!1"hnnR6H!I,0KK8J,K)Ak">>?QRO"ll?WO-55hnnEO"??PO ,=M77F 0++$H 55mDM KK6M);M 55mDM ::=IM$4M***rG   )r3  r\   r  r  r   r  r  r   r  r  r  r  r!  r  r  r  r   r  r  r]  )NNNNNN)r:   r;   r<   r=   r.   r/   rx   ri   r?   rE   rB   rC   rD   r  r   rv   rF   ry   rz   s   @rH   r  r  Y  s    h/2DD hQT h> )-PT,0.204(,9||9 9 sE%,,*D$EEFM	9
 #\\9 t+9 &&-9 9 
9 9rG   r  c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma4TextScaledWordEmbeddingi  zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 |   > [         TU ]  XU5        X@l        U R                  S[        R
                  " U5      SS9  g )Nr  Fr   )rh   ri   scalar_embed_scalerm   r?   rn   )rp   r  r  r  r  rq   s        rH   ri   &Gemma4TextScaledWordEmbedding.__init__  s7    D"-]ELL,ERWXrG   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ rt   )rh   rv   r  r   r   r   )rp   r  rq   s     rH   rv   %Gemma4TextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRrG   )r  )r   )r:   r;   r<   r=   r>   rx   ro   ri   r?   rE   rv   rF   ry   rz   s   @rH   r  r    sM    Ys Y3 YS Y_d Y Y
S S SrG   r  c            	         ^  \ rS rSr% \\S'   SrSr/ SQrSS/r	Sr
SrSrSrSrSrS	r\R$                  " 5       U 4S
 j5       rS rS r   SS\S-  S\S-  S\S\R2                  4U 4S jjjr   SS\S-  S\S-  S\4S jjrSrU =r$ )Gemma4PreTrainedModeli  r\   modelT)r  r  rg  rY  rN   r8   N)imagetextvideoaudioc                 n
  > [         TU ]  U5        [        U[        5      (       a!  [        R
                  " UR                  5        g [        U[        5      (       a  SnSnUR                  S-  n[        R                  " X2-  5      [        US-
  S5      -  nU[        R                  " [        R                  " U5      U* -  5      -  n[        R                  " UR                   UR#                  S5      R#                  S5      5        g [        U[$        5      (       aL  [        R&                  " UR(                  UR*                  5        [        R,                  " UR.                  5        g [        U[0        5      (       a  UR2                  R5                  5        H  u  pxSU0n	US:X  a  UR6                  U   S:X  a  S	U	S
'   U" UR8                  40 U	D6u  p[        R                  " [;        X S35      U
5        [        R                  " [;        X S35      U
5        M     g [        U[<        5      (       a  UR6                  S:w  a  [>        UR6                     OUR@                  nU" UR8                  5      u  p[        R                  " URB                  U5        [        R                  " URD                  U5        g [        U[F        5      (       a,  [        R&                  " URH                  URJ                  5        g [        U[L        5      (       aA  [        R
                  " URN                  5        [        R
                  " URP                  5        g [        U[R        5      (       aW  U R8                  RT                  n[        RV                  " URX                  SUS9  [        RV                  " URZ                  SUS9  g [        U[\        5      (       a!  [        R
                  " UR^                  5        g [        U[`        5      (       a  URb                  (       a  [        R&                  " URd                  [g        S5      * 5        [        R&                  " URh                  [g        S5      5        [        R&                  " URj                  [g        S5      * 5        [        R&                  " URl                  [g        S5      5        g [        U[n        5      (       a]  UR8                  Rp                  (       aA  [        R,                  " URr                  5        [        R
                  " URt                  5        g g g )Nr   r   r)   r+   r   r	  rA  rB  rC  rD  rE  rF  r  rt  )r   stdrd   );rh   _init_weightsr  rg  initones_rl  r   r   r   r   r   r?   r   r   copy_r   r   r   	constant_r   r   zeros_r   r?  rI  itemsr  r\   r  r  r   r  r  r  r  r  r  r  r  r  rj  initializer_rangenormal_rq  r  r  r  rZ   rj   rc   ro   re   rf   rg   Gemma4VisionModelstandardizestd_bias	std_scale)rp   r  r   r   r   r   r   r	  r  rL  rM  r   rope_fnbuffer_valuer  rq   s                  rH   r  #Gemma4PreTrainedModel._init_weights  s   f%f788JJv667 @AAM#M#//14N&*hh}/L&MPSTbefTfhiPj&j#*UYYu||N7SWnVn7n-ooNJJv,,n.F.Fq.I.S.STU.VW 455NN6>>6+K+KLKK,,- 9::,2,@,@,F,F,H(
'3Z&@#!11f6F6Fz6RVd6d:K'7#/#UAT#U 

76\+CDmT

76\9K+LM}] -I  ;<< ##y0 $F$4$45;; 
 &fmm4OLJJv5JJv//> =>>NN6--v/H/HI 011JJv||$JJv../ 122++//CLL,,3C@LL))= 677JJv**+ 5666;U;UNN6++eEl];NN6++U5\:NN6,,uU|m<NN6,,eEl; 122v}}7P7PKK(JJv''( 8Q2rG   c                 .    U R                   R                  $ rt   
base_modelembed_tokens_per_layerrp   s    rH   get_per_layer_input_embeddings4Gemma4PreTrainedModel.get_per_layer_input_embeddings
  s    555rG   c                 $    XR                   l        g rt   r  rp   r  s     rH   set_per_layer_input_embeddings4Gemma4PreTrainedModel.set_per_layer_input_embeddings  s    16.rG   new_num_tokenspad_to_multiple_ofmean_resizingr_   c                 J   > [         TU ]  UUUS9nU R                  XU5        U$ )N)r  r  r  )rh   resize_token_embeddings_resize_per_layer_embeddings)rp   r  r  r  r.  rq   s        rH   r  -Gemma4PreTrainedModel.resize_token_embeddings  s:     7)1' 8 

 	)).m\rG   c                    U R                   U R                  R                  5       l        U R                  R                  5       R                  (       a  U R                  5       nU R                  XAX#5      n[        US5      (       a  UR                  n[        XV5        UR                  UR                  R                  5        U R                  U5        g g )N_hf_hook)
vocab_sizer\   get_text_configvocab_size_per_layer_inputr  r  _get_resized_embeddingsr  r  r0   requires_grad_r   r   r  )rp   r  r  r  r  new_embeddings_per_layerhooks          rH   r  2Gemma4PreTrainedModel._resize_per_layer_embeddings  s     DH??##%@;;&&(DD%)%H%H%J"'+'C'C&8J($ -z::-66"#;B$334J4Q4Q4_4_`//0HI ErG   r9   )NNT)r:   r;   r<   r=   r-   rA   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backend_can_record_outputsinput_modalitiesr?   r   r  r  r  rx   r   r   	Embeddingr  r  rF   ry   rz   s   @rH   r  r    s    &*# $56H"IN!"&:
]]_2) 2)h67
 &*)-"	d
  $J 	
 
   &*)-"	Jd
J  $JJ 	J JrG   r  zAThe base Gemma 4 language model without a language modeling head.c                     ^  \ rS rSr% \\S'   Sr\" \SS9\	\
S.rS\4U 4S jjr\\\       SS	\R"                  S-  S
\R$                  S-  S\R"                  S-  S\S-  S\R(                  S-  S\R$                  S-  S\S-  S\\   S\4S jj5       5       5       rS	\R$                  S-  S\R$                  S-  S\R$                  4S jr SS\R$                  S\R$                  S-  S\R$                  4S jjrSrU =r$ )Gemma4TextModeli1  r\   )r  r   )ra  )router_logitsrO   rP   c           
      &  > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  UR                  U R                  U R                  R                  S-  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        [%        U5      U l        SU l        [+        U R                  R,                  5      U l        UR0                  U l        U R0                  (       a  [        UR2                  UR                  UR0                  -  U R                  UR0                  S-  S9U l        SU l        [        R8                  " UR                  UR                  UR0                  -  SS9U l        UR                  S-  U l        [        UR0                  UR                   S9U l        / U l         [C        U R                  5       HT  u  p4URD                  RF                  (       d  M"  U R@                  RI                  S Vs/ s H  nS	U S
U 3PM     sn5        MV     U RK                  5         g s  snf s  snf )Nr~  )r  r  Fg;f?ra   r   )r   r   r  r  zlayers.z.self_attn.)&rh   ri   pad_token_idr  r  r  r   r\   embed_tokensr   r+  r  r(  r  r,  r|   r!  r  r?  r*  gradient_checkpointingrH  r  unique_layer_typesr  r  r  per_layer_input_scalerk   per_layer_model_projection per_layer_model_projection_scaleper_layer_projection_norm"_keys_to_ignore_on_load_unexpected	enumerater]  r:  extend	post_init)rp   r\   r   r  layernamerq   s         rH   ri   Gemma4TextModel.__init__;  s*    !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmHMfNfNfHghHg9#F6Hgh
 "&"4"4&:M:MN	3F;&+#"%dkk&=&=">
 ,2+M+M(++*G11((6+M+MM  ">>C	+D' *3D&.0ii""((6+M+MM/D+
 5;4F4F4LD1-:6;];]cicvcv-wD* 35/!$++.HA11177>>@hi@hwqcTF3@hi / 	I i@ js    J	J
Nr  rW   r   rN   r.  per_layer_inputs	use_cacherc  r_   c           
      d   USL USL-  (       a  [        S5      eUb  Ub  [        S5      eUb  U R                  U5      nU R                  (       a%  Uc  U R                  X5      nU R	                  XV5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR                  5       OSn	[        R                  " UR                  S   UR                  S9U	-   nUR                  S5      n[        U=n
[        5      (       d)  U R                  UUUUS.n[        S0 UD6[!        S0 UD6S	.n
Un0 nU R"                   H  nU R%                  XU5      X'   M     UR'                  S
[)        5       5      n[+        U R,                  SU R                  R.                   5       H\  u  nnUb  USS2SS2USS24   OSnU" UU4UXR                  R0                  U      XR                  R0                  U      UUS.UD6nM^     U R3                  U5      n[5        UUUR7                  SS5      (       a  US9$ SS9$ )a3  
per_layer_inputs (`torch.Tensor`, *optional*):
    Pre-computed per-layer input text embeddings of shape `(batch_size, sequence_length, num_hidden_layers,
    hidden_size_per_layer_input)`. When provided, these are used directly instead of being computed from `input_ids`
    via `get_per_layer_inputs()` in the text model. If calling the `forward` with `inputs_embeds` instead of `input_ids`,
    you should probably precompute them and forward them along `inputs_embeds`, otherwise recomputing them needs
    to reverse the main embedding, which is expensive.
N:You must specify exactly one of input_ids or inputs_embeds<You cannot specify per_layer_inputs if input_ids is providedr\   r   r+   r   r\   r.  rW   rN   r   )rA  rY  r8   )r8   r   rW   r   rN   return_shared_kv_statesF)r1  rN   r8   r9   )r  r  r  get_per_layer_inputsproject_per_layer_inputsr   r\   get_seq_lengthr?   r   r   r   r   r  rB   r   r   r   r*  popr   r  r,  r(  r  r  rR   get)rp   r  rW   r   rN   r.  r  r  rc  past_seen_tokenscausal_mask_mappingmask_kwargsrO   r   r	  r8   r  r2  r  s                      rH   rv   Gemma4TextModel.forwardk  sk   , -t";<YZZ %5%A[\\  --i8M++'#'#<#<Y#V #<<]]0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%F%U%U# & 11J.2oom[e.f+ 2 "::&8(*E !*$++6U8U8U*V WA}>N>Z.q!Qz:`dO)	 "2$78O8OPQ8R$S2;;3J3J13MN) /	 	M !X 		-0,++17<UW\1]1]-
 	
 dh
 	
rG   c                    U R                   (       d  [        SU R                   35      eUc  [        R                  " 5          USS2SS2SSS24   U R
                  R                  SSSS2SS24   U R                  R                  S-  -  :H  R                  SS9R                  5       SS2S4   n UR                  UR                  SS 5      n SSS5        U R                  U5      R                  " / UR                  QU R                  R                  PU R                   P76 $ ! [         a    [        S5      ef = f! , (       d  f       Nt= f)a  Compute the token-identity component of Per-Layer Embeddings (PLE).

Looks up `input_ids` in `embed_tokens_per_layer` (a scaled embedding that multiplies
by `sqrt(hidden_size_per_layer_input)`) and reshapes the packed output from
`[batch, seq, num_hidden_layers * hidden_size_per_layer_input]` to
`[batch, seq, num_hidden_layers, hidden_size_per_layer_input]`.

If only `inputs_embeds` is provided (no `input_ids`), reverses the main embedding
to recover `input_ids` for the PLE lookup.
z}Attempting to call get_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. Nr~  r
   r   r)   a)  It seems like you tried to call `forward` from `inputs_embeds` without providing `input_ids`, and that the `inputs_embeds` you provided do not exactly match the embedding weights. Since Gemma4 needs to reverse the embedding to compute another embedding, make sure you provide exact `inputs_embeds`)r  RuntimeErrorr\   r?   r   r  r   r   r  r{  r   r   r  r   r(  )rp   r  r.  s      rH   r  $Gemma4TextModel.get_per_layer_inputs  sK    //**.++8   &aD!m4,,33D$14DEH_H_adHdde SQSZWYq!t%  )}/B/B2A/F GI !$ **95== 
__
KK))
 ,,
 	
 $ &r  !s   A.D>1D%%D;;D>>
Ec                 `   U R                   (       d  [        SU R                   35      eU R                  U5      U R                  -  nUR
                  " / UR                  SS QU R                  R                  PU R                   P76 nU R                  U5      nUc  U$ X2-   U R                  -  $ )aL  Compute the context-aware component of PLE and combine with token-identity.

Projects `inputs_embeds` through `per_layer_model_projection` (Linear), scales by
`1/sqrt(hidden_size)`, reshapes to `[batch, seq, num_layers, ple_dim]`, and normalizes
with `per_layer_projection_norm` (RMSNorm).

If `per_layer_inputs` (the token-identity component from `get_per_layer_inputs()`)
is provided, combines both: `(context_projection + token_identity) * (1/sqrt(2))`.
If `per_layer_inputs` is None (e.g. for multimodal inputs where input_ids are not
available), returns just the context projection.
zAttempting to call project_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. Nr   )
r  r  r\   r  r  r   r   r(  r  r  )rp   r.  r  r  s       rH   r  (Gemma4TextModel.project_per_layer_inputs  s      //226++@ 
  $>>}MPTPuPuu3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''$74;U;UUUrG   )r  r  r  r  r  r,  r  r  r  r  r  r  r*  r   r  )NNNNNNNrt   )r:   r;   r<   r=   r.   rA   r  r'   r  r  rW  r  ri   r&   r(   r!   r?   r  rE   r   r@   r   r   r    rR   rv   r  r  rF   ry   rz   s   @rH   r  r  1  s    '(8B/)./ .`   .2.204(,2604!%Y
##d*Y
 t+Y
 &&-	Y

 Y
 ((4/Y
  ,,-Y
 $;Y
 +,Y
 
'Y
    Y
v*
ellT.A *
RWR^R^aeRe *
jojvjv *
^ 15!V||!V  ,,-!V 
	!V !VrG   r  z>The base Gemma 4 language model with a language modeling head.c                     ^  \ rS rSr% SS0rSS0rSS/S/40r\\S'   S	r	S\4U 4S
 jjr
\\         SS\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R"                  S-  S\R                  S-  S\S-  S\\R                  -  S\R                  S-  S\\   S\4S jj5       5       rSrU =r$ )Gemma4ForCausalLMi  lm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrO   rM   r\   r  c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g NFra   )
rh   ri   r  r  r  r   rk   r   r%  r  r7  s     rH   ri   Gemma4ForCausalLM.__init__!  sU     $V,
 ++yy!3!3V5F5FUS 	rG   Nr  rW   r   rN   r.  labelsr  logits_to_keepr  rc  r_   c
                 8   U R                   " SUUUUUU	US.U
D6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bF  XR                  R                  -  n[        R                  " U5      nXR                  R                  -  nSnUb  U R                  " XU R                  40 U
D6n[        UUUR                  UR                  UR                  UR                   S9$ )aU  
per_layer_inputs (`torch.Tensor`, *optional*):
    Pre-computed per-layer input text embeddings of shape `(batch_size, sequence_length, num_hidden_layers,
    hidden_size_per_layer_input)`. When provided, these are used directly instead of being computed from `input_ids`
    via `get_per_layer_inputs()` in the text model. If calling the `forward` with `inputs_embeds` instead of `input_ids`,
    you should probably precompute them and forward them along `inputs_embeds`, otherwise recomputing them needs
    to reverse the main embedding, which is expensive.

Example:

```python
>>> from transformers import AutoTokenizer, Gemma4ForCausalLM

>>> model = Gemma4ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```)r  rW   r   rN   r.  r  r  N)rL   rM   rN   rO   rP   r8   r9   )r  r1  r  rx   slicer%  r\   final_logit_softcappingr?   r   loss_functionr  rJ   rN   rO   rP   r8   )rp   r  rW   r   rN   r.  r*  r  r+  r  rc  outputsrO   slice_indicesrM   rL   s                   rH   rv   Gemma4ForCausalLM.forward*  s   P 26 	2
)%+'-	2
 	2
  118B>SV8W8W~ot4]kmA}a,?@A;;..:kkAAAFZZ'FkkAAAF%%fdooPPD+#33!//))$55
 	
rG   )r%  r  r  )	NNNNNNNr   N)r:   r;   r<   r=   _tied_weights_keys_tp_plan_pp_planr.   rA   r  ri   r"   r!   r?   r  rE   r   r@   r   rx   r   r    rJ   rv   rF   ry   rz   s   @rH   r#  r#    sN   *,GH23H_-z:;H/   .2.204(,26*.!%-.04E
##d*E
 t+E
 &&-	E

 E
 ((4/E
   4'E
 $;E
 ell*E
  ,,-E
 +,E
 
&E
  E
rG   r#  r[  c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )zD
This creates uni/bidirectional attention mask with sliding window.
	batch_idxhead_idxq_idxkv_idxr_   c                 H   > T	u  pEX#-
  nUS:  Xd:  -  nUS:  U* U:  -  nXx-  $ rG  r9   )
r7  r8  r9  r:  left_window_sizeright_window_sizedist	left_mask
right_maskr[  s
            rH   
inner_mask0sliding_window_mask_function.<locals>.inner_masky  sC    .<+~QY4#:;	QhD5+<#<=
%%rG   )rx   r   )r[  rA  s   ` rH   sliding_window_mask_functionrC  t  s3    
&c &S & &c &d & rG   c                   8  ^  \ rS rSr% Sr\\S'   SrSr\	\
S.rS\4U 4S jjrS\R                  S	\R                  4S
 jr\\\" SS9 SS\R                  S\R                  S-  S\\   S	\\R                  \R,                  4   4S jj5       5       5       rSrU =r$ )Gemma4AudioModeli  znAn audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.r\   r'  zmodel.audio_towerrO   rP   c           	        > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        R                  " UR                  UR                  SS9U l        U R#                  5         g s  snf )NTra   )rh   ri   r\   r  subsample_conv_projectionr   rel_pos_encr   r+  r  r(  rY  r,  rk   r   output_proj_dimsoutput_projr  r   s      rH   ri   Gemma4AudioModel.__init__  s     )KF)S&;FCmmBGH`H`BabBaYf0Bab
 99V%7%79P9PW[\	 cs   B?mask_4dr_   c                 ^   UR                   u  p#pCUR                  nU R                  R                  nU R                  R                  S-
  nU R                  R
                  nXF-   S-
  U-  n	X-  n
X-
  n[        R                  " USUSU4SS9nUR                  USXU
5      n[        R                  " XU4SS9n[        R                  " XS9U-  n[        R                  " Xg-   U-   US9nUSS2S4   USSS24   -   nUSSSS2SSS24   R                  USSUS5      nUR                  SU5      $ )z
Convert a standard 4D attention mask `[batch_size, 1, seq_len, seq_len]` to the 5D blocked format
`[batch_size, 1, num_blocks, chunk_size, context_size]` expected by the chunked local attention,
r+   r   F)r  r   Nr   )r   r   r\   r   r   r   r   r   r   r?   r   r  gather)rp   rM  r   r   r   r   r   r   r   r   padded_seq_len
pad_amountmask_5dblock_startsoffsets
kv_indicess                   rH   _convert_4d_mask_to_blocked_5d/Gemma4AudioModel._convert_4d_mask_to_blocked_5d  s:   
 %,MM!
w[[55
;;==A![[@@*Q.:=
#0#-
%%!ZJ!?uM//*aX%%4F!GuU||J>K,,z<?QQZ`a!!T'*WT1W-==
dAtQ 67>>z1bR\^`a
~~b*--rG   z&Encodes audio features to soft tokens.r1   NrW   rc  c           	         U R                  X5      u  pEU R                  U5      n[        U R                  UU[	        U R                  R
                  S-
  U R                  R                  45      S9nUb  U R                  U5      nU R                  S U R                  R                    H  nU" U4UUS.UD6nM     U R                  U5      n[        XES9$ )Nr+   )r\   r.  rW   and_mask_function)rW   r   )r1  rW   )rH  rI  r   r\   rC  r   r   rV  r,  r(  rK  rU   )rp   r'  rW   rc  rO   output_maskr   encoder_layers           rH   rv   Gemma4AudioModel.forward  s     &*%C%CN%c""..}=2;;'&:33a79\9\]	
 %!@@PN![[)H4;;+H+HIM)-$7 	M J ((7%bbrG   )r\   r,  rK  rI  rH  rt   )r:   r;   r<   r=   r>   r,   rA   main_input_namer  rY  r   r  ri   r?   rE   rV  r&   r(   r!   r   r    rD   rX   rv   rF   ry   rz   s   @rH   rE  rE    s    x&O+)*
0 .ell .u|| .6  !IJ /3cc t+c +,	c
 
u||U---	.c K   crG   rE  c                      ^  \ rS rSrSr\r\\S.r	S\4U 4S jjr
\\\" SS9S\R                  S	\R                   S
\\   S\4S j5       5       5       rSrU =r$ )r  i  zThe Gemma 4 Vision Encoder.rF  r\   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  R                  (       at  U R                  S[        R                  " U R                  R                  5      5        U R                  S[        R                  " U R                  R                  5      5        U R                  5         g )Nr  r  )rh   ri   rg  patch_embedderr&  encoderr  poolerr\   r  rm   r?   rp  r   r  r7  s     rH   ri   Gemma4VisionModel.__init__  s     7?*62(0;;""  U[[9P9P-QR  ekk$++:Q:Q.RSrG   z1Encodes image pixels to soft tokens from patches.r1   r|  rn  rc  r_   c                    U R                   R                  nUR                  S   XD-  -  nUS:H  R                  SS9nU R	                  XU5      nU R
                  " SUU) US.UD6nU R                  UR                  UUUS9u  pX   n	U R                   R                  (       a7  XR                  R                  5       -
  U R                  R                  5       -  n	U	R                  UR                  5      n	[        U	S9$ )a  
pixel_values (`torch.FloatTensor` or `list[torch.FloatTensor]`):
    The images to encode. Either a single `[batch, channels, height, width]` tensor
    (all images same size) or a list of `[1, channels, height, width]` tensors (different sizes).
pixel_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`):
    The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
rw  r   r   )r.  rW   rn  )rO   rn  ro  r  r0  r9   )r\   pooling_kernel_sizer   r  r`  ra  rb  r1  r  r  ro   r  r   r   r   )rp   r|  rn  rc  re  r  ro  r.  r  rO   pooler_masks              rH   rv   Gemma4VisionModel.forward  s     #kk==$**2.3F3\]/25::r:B++LN_` 
'--1
 	
 &*[[ 221/'	 &1 &
" &2 ;;""*]]-@-@-BBdnnFZFZF\\M%(()<)<=&GGrG   )ra  r`  rb  )r:   r;   r<   r=   r>   r/   r\   r  r  r  ri   r&   r(   r!   r?   r@   r  r   r    r   rv   rF   ry   rz   s   @rH   r  r    s    %F1+

1 
  !TU)H'')H ",,)H +,	)H
 
!)H V   )HrG   r  c                   x   ^  \ rS rSrSrS\\-  S\4U 4S jjrS\	R                  S\	R                  4S jrS	rU =r$ )
Gemma4MultimodalEmbedderi  zQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                 D  > [         TU ]  5         [        USUR                  5      U l        UR
                  U l        UR                  U l        [        R                  " U R                  U R                  SS9U l
        [        U R                  U R                  SS9U l        g )NrJ  Fra   rM  )rh   ri   r  r   multimodal_hidden_sizer!  r   text_hidden_sizer   rk   embedding_projectionr|   embedding_pre_projection_norm)rp   rj  rk  rq   s      rH   ri   !Gemma4MultimodalEmbedder.__init__!  s    
 	&-.?ASUfUrUr&s#$11 + 7 7$&IId.I.I4K`K`gl$m!-:4;V;V\`\d\dqv-w*rG   r.  r_   c                 F    U R                  U5      nU R                  U5      $ )a  Embeds token ids or soft tokens for multimodal content into language model space.
Args:
    inputs_embeds: A torch.Tensor containing the soft tokens to embed.
Returns:
    A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
)rp  ro  )rp   r.  embs_normeds      rH   rv    Gemma4MultimodalEmbedder.forward.  s%     88G((55rG   )rp  ro  r   rm  rn  )r:   r;   r<   r=   r>   r,   r/   r.   ri   r?   rE   rv   rF   ry   rz   s   @rH   ri  ri    sF    [x,/AAx &x6U\\ 6ell 6 6rG   ri  mm_token_type_idsr   c                     U R                  U5      n U S:H  U S:H  -  n[        R                  " USSS9nSUS'   X#) -  n[        R                  " UR	                  5       SS9S-
  n[        R
                  " X%S5      nU$ )Nr+   r)   r   )shiftsdimsFrr  r   )r   r?   rollcumsumrx   rv  )ru  r   	is_visionis_prev_visionnew_vision_startsvision_group_idsblock_sequence_idss          rH   get_block_sequence_ids_for_maskr  9  s    ),,V4"a',=,BCIZZ	!"=N"N6!O3||$5$9$9$;CaGY"ErG   z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c            $         ^  \ rS rSrSrS\4U 4S jjr\\" SS9 S"S\	R                  S	\	R                  S-  S
\\   S\4S jj5       5       r  S#S\	R                  S-  S\	R                  S-  S\\	R"                  \	R"                  \	R"                  4   4S jjr\\\              S$S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R(                  S-  S\	R(                  S-  S\	R                  S-  S\S-  S\	R                  S-  S\	R                  S-  S\S-  S	\	R                  S-  S\	R                  S-  S\	R(                  S-  S
\\   S\4 S jj5       5       5       rS rS r\\" SS9S\	R(                  S\	R(                  S
\\   S\\-  4S j5       5       r\\" SS9 S"S\	R                  S\	R                  S-  S
\\   S\4S  jj5       5       rS!rU =r$ )%Gemma4ModeliE  Fr\   c                   > [         TU ]  U5        UR                  b   [        R                  " UR                  5      OS U l        UR                  R                  U l        [        R                  " UR                  S9nX l        UR                  R                  U l	        UR                  b   [        R                  " UR                  5      OS U l        UR                  b   [        UR                  UR                  5      OS U l        UR                  b   [        UR                  UR                  5      OS U l        U R                  5         g )Nr  )rh   ri   vision_configr*   from_configvision_towerrk  r  language_modelr  audio_configaudio_towerri  embed_visionembed_audior  )rp   r\   r  rq   s      rH   ri   Gemma4Model.__init__O  s    KQK_K_KkI11&2F2FGqu ,,77"..f6H6HI,*0*<*<*W*W'IOI\I\Ih9001D1DEnr ##/ %V%9%96;M;MN 	 "". %V%8%8&:L:LM 	
 	rG   zOProjects the last hidden state from the vision model into language model space.r1   Nr|  image_position_idsrc  r_   c                 p    U R                   " SUUS.UD6nUR                  nU R                  US9Ul        U$ )z
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
r|  rn  r.  r9   )r  r1  r  pooler_output)rp   r|  r  rc  vision_outputsr1  s         rH   get_image_featuresGemma4Model.get_image_featuresd  sT     ** 
%1
 

 +<<'+'8'8GX'8'Y$rG   r  r.  c           	         UbJ  XR                   R                  :H  nXR                   R                  :H  nXR                   R                  :H  nGO8UU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nUU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nUU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nX4U4$ )a;  
Obtains mask for multimodal placeholders (replaced by soft tokens) and hard text tokens.

Masks will be obtained from `mm_token_type_ids`, `input_ids`, or `inputs_embeds` as available and in that
precedence order. If passing `input_ids` or `inputs_embeds`, the image mask will be derived using
`config.image_token_id`. Same goes for audio and video masks

Args:
    input_ids: A tensor containing the hard token IDs from the text tokenizer.
    inputs_embeds: A tensor containing the embeddings for all hard text tokens.

Returns:
    image_mask, video_mask, audio_mask
)r   r   r   )
r\   image_token_idvideo_token_idaudio_token_idget_input_embeddingsr?   rn   r  r   r  )rp   r  r.  special_image_maskspecial_video_maskspecial_audio_masks         rH   get_placeholder_mask Gemma4Model.get_placeholder_masky  sA   &  !*kk.H.H!H!*kk.H.H!H!*kk.H.H!H ,,.LL!;!;5::VcVjVjk c"g  ,,.LL!;!;5::VcVjVjk c"g  ,,.LL!;!;5::VcVjVjk c"g  "7IIIrG   pixel_values_videosr'  rW   r(  r   rN   ru  r  video_position_idsr  c                    USL U
SL-  (       a  [        S5      eUb  Ub  [        S5      eU R                  X5      u  nnnUU-  U-  nSnU
c\  UR                  5       n[        R                  " UU R
                  R                  R                  U5      nU R                  5       " U5      n
Uc  U R
                  R                  5       R                  (       a  U R                  R                  R                  U R
                  R                  R                  SS24   nUR                  U
R                  5      n[        R                  " US   UR!                  SSS5      U
5      nU R                  R#                  UU5      nUGb  U R%                  X,SS9R&                  nUR                  U
R                  U
R(                  5      nUR+                  5       nUR-                  S5      R/                  U
5      R                  U
R                  5      n[1        U
U   R3                  5       UR3                  5       :H  S	U S
UR4                  S    35        U
R7                  UR                  U
R                  5      UR                  U
R                  5      5      n
UGb  U R9                  X=SS9R&                  nUR                  U
R                  U
R(                  5      nUR+                  5       nUR-                  S5      R/                  U
5      R                  U
R                  5      n[1        U
U   R3                  5       UR3                  5       :H  SU S
UR4                  S    35        U
R7                  UR                  U
R                  5      UR                  U
R                  5      5      n
UGb(  UGb$  U R;                  XFSS9nUR&                  nUR<                  nUUR                  UR                  5         nUR+                  5       nUR-                  S5      R/                  U
5      R                  U
R                  5      n[1        U
U   R3                  5       UR3                  5       :H  SU S
UR4                  S   UR4                  S   -   35        U
R7                  UR                  U
R                  5      UR                  U
R                  5      5      n
UcU  Ub  UR?                  5       OSn[        R@                  " U
R4                  S   U
R                  S9U-   nUR-                  S5      n[C        U=n [D        5      (       d  U R
                  R                  5       U
UUUS.n!U R
                  R                  5       RF                  S:X  aN  [        RH                  " / U
RK                  5       SS QSU
R                  S9n"U	b  [M        XR                  S9n"U"U!S'   [O        S0 U!D6n U R                  " SUU UUU
USS.UD6n#[Q        U#RR                  U#RT                  U#RV                  U#RX                  Ub  WOSUb  WOSU#RZ                  S9$ )  
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
    2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
per_layer_inputs (`torch.Tensor`, *optional*):
    Pre-computed per-layer input text embeddings of shape `(batch_size, sequence_length, num_hidden_layers,
    hidden_size_per_layer_input)`. When provided, these are used directly instead of being computed from `input_ids`
    via `get_per_layer_inputs()` in the text model. If calling the `forward` with `inputs_embeds` instead of `input_ids`,
    you should probably precompute them and forward them along `inputs_embeds`, otherwise recomputing them needs
    to reverse the main embedding, which is expensive.
Nr  r  r   r+   r   T)return_dictz6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: z6Audio features and audio tokens do not match, tokens: r   r  visionr  )r  rW   r   rN   r.  r  r  )r1  rN   rO   rP   r6   r7   r8   r9   ).r  r  r  r?   rv  r\   rk  r  r  r  r  r  r  r   r   r   r   r  r  r  r   rz  r   	expand_asr$   numelr   masked_scatterget_video_featuresget_audio_featuresrW   r  r   r  rB   r_  fullsizer  r   r4   r1  rN   rO   rP   r8   )$rp   r  r|  r  r'  rW   r(  r   rN   ru  r.  r  r  r  r  rc  
image_mask
video_mask
audio_maskmultimodal_maskllm_input_idspad_embeddingllm_inputs_embedsimage_featuresn_image_tokensvideo_featuresn_video_tokensaudio_outputaudio_featuresaudio_mask_from_encodern_audio_tokensr  r  r  r  r0  s$                                       rH   rv   Gemma4Model.forward  s   J -t";<YZZ %5%A[\\-1-F-Fy-`*
J
$z1J>  %OO-M!KK9P9P9]9]_lmM 557FM#(C(C(E(a(a //<<CCDKKD[D[DhDhjkDklM-001E1EFO %OI,FHZHZ[\^_acHdfs t#22GGWhi #!44\cg4hvvN+..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M *!44#T 5 m  ,..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M
 %*=*I22>dh2iL)77N&2&A&A#
 ,,C,F,F~G\G\,]^N'^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+n.B.B1.EEFH *88m223^5F5F}G[G[5\M
 CRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-FF++557!."0#2 ,K {{**,HHHT%*ZZ0L-2D2D2Fs2K0LbYfYmYm%n"$0)H)2F2F*& 5G01 #<"Jk"J%% 	
-.%+'	
 	
 )%77#33!//))2>2JPT2@2LRV$55
 	
rG   c                 .    U R                   R                  $ rt   r  r  r  s    rH   r  *Gemma4Model.get_per_layer_input_embeddingsO	  s    ""999rG   c                 $    XR                   l        g rt   r  r  s     rH   r  *Gemma4Model.set_per_layer_input_embeddingsR	  s    5:2rG   zPProjects the last hidden state from the audio encoder into language model space.c                     U R                   c  [        S5      eU R                   " X4SS0UD6nU R                  UR                  S9Ul        U$ )a  
input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
    The tensors corresponding to the input audio.
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
zAudio features were requested, but the model was initialized without an audio_config. Cannot process audio without an audio tower and audio embedder.r  Tr  )r  r  r  r1  r  )rp   r'  r(  rc  audio_outputss        rH   r  Gemma4Model.get_audio_featuresU	  sc     #R 
 ((iZ^ibhi&*&6&6]EdEd&6&e#rG   zQProjects the last hidden state from the vision encoder into language model space.c                     UR                  SS5      nUR                  SS5      nU R                  " SUUS.UD6nUR                  nU R                  US9Ul        U$ )a  
video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
    2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
r   r+   r  r  r9   )flattenr  r1  r  r  )rp   r  r  rc  r  r1  s         rH   r  Gemma4Model.get_video_featuresn	  sz     299!Q?/771=** 
,1
 

 +<<'+'8'8GX'8'Y$rG   )r  r  r  r  r  r  r  rt   rU  )NNNNNNNNNNNNNN) r:   r;   r<   r=   accepts_loss_kwargsr-   ri   r"   r!   r?   r@   r  r   r    r   r  rD   rX   r  r&   rE   r   r   r4   rv   r  r  rU   r  r  rF   ry   rz   s   @rH   r  r  E  s7     | * !rs 7;'' ",,t3 +,	
 
$ t * .226+J##d*+J ((4/+J 
u!1!153C3CC	D	+JZ   .2158<37.23704(,5926!%6:6:04d
##d*d
 ''$.d
 #..5	d

 ))D0d
 t+d
 #\\D0d
 &&-d
 d
 !++d2d
 ((4/d
 $;d
 ",,t3d
 ",,t3d
  ,,-d
  +,!d
" 
##d
    d
L:; !st #\\ +,	
 
'	' u . !tu 7;".. ",,t3 +,	
 
$ v rG   r  z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            '         ^  \ rS rSrSS0rSrSrS\4U 4S jjr\	 S#S	\
R                  S
\
R                  S-  S\\   4S jj5       r\\	                S$S\
R                  S-  S	\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R"                  S-  S\
R"                  S-  S\
R                  S-  S
\
R                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\
R                  S-  S\S-  S\\
R"                  -  S\
R"                  S-  S\\   S\4$S jj5       5       r             S%U 4S jjrS rS r\  S&S\S\
R"                  S\
R"                  S-  S\S-  S\
R"                  S-  S\
R"                  S-  S \S-  S\4S! jj5       rS"rU =r$ )'Gemma4ForConditionalGenerationi	  r$  z(model.language_model.embed_tokens.weightFr  r\   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g r(  )rh   ri   r  r  r   rk   rk  r   r  r%  r  r7  s     rH   ri   'Gemma4ForConditionalGeneration.__init__	  sS      (
yy!3!3!?!?ASASA^A^ejkrG   Nr|  r  rc  c                 <    U R                   R                  " X40 UD6$ )a  
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
)r  r  )rp   r|  r  rc  s       rH   r  1Gemma4ForConditionalGeneration.get_image_features	  s     zz,,\XQWXXrG   r  r  r'  rW   r(  r   r  rN   ru  r.  r*  r  r+  r  r_   c                    U R                   " S0 SU_SU_SU_SU_SU_SU_SU_SU
_S	U_S
U_SU_SU_SU_SU_SU	_SS_UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  5       R                  =nb   UU-  n[        R                  " U5      nUU-  nSnUb6  U R                  " UXR                  R                  5       R                  40 UD6n[        UUUR                  UR                  UR                   UR"                  UR$                  UR&                  S9$ )r  r  r|  r  r'  rW   r(  r   rN   ru  r.  r  r*  r  r  r  r  TN)rL   rM   rN   rO   rP   r6   r7   r8   r9   )r  r1  r  rx   r-  r%  r\   r  r.  r?   r   r/  r  rJ   rN   rO   rP   r6   r7   r8   )rp   r  r|  r  r'  rW   r(  r   r  r  rN   ru  r.  r*  r  r+  r  rc  r0  rO   r1  rM   r.  rL   s                           rH   rv   &Gemma4ForConditionalGeneration.forward	  s   L ** 

%
 !4
 *	

 *
 !4
 &
 ,
 0
 (
 .
 
  
  2
  2
  #
(  118B>SV8W8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F%%ffkk6Q6Q6S6^6^ibhiD+#33!//)) ' ; ; ' ; ;$55	
 		
rG   c                    > [         TU ]  " U4UUUUUUU
US.UD6nU(       d  U(       d  UUS'   UUS'   UUS'   U	US'   OS US'   U(       d  UR                  SS 5      nU$ )N)rN   r.  rW   r   r  r+  token_type_idsis_first_iterationr|  r  r'  r(  ru  r  )rh   prepare_inputs_for_generationr  )rp   r  rN   r.  r   r|  r  r'  rW   r(  r  r  r+  r*  r  rc  model_inputsr   rq   s                     rH   r  <Gemma4ForConditionalGeneration.prepare_inputs_for_generation	  s    & w<
+')%))1
 
 Y+7L(2EL./-;L)*2EL./ 15L,- "  !3T:ArG   c                 6    U R                   R                  5       $ rt   )r  r  r  s    rH   r  =Gemma4ForConditionalGeneration.get_per_layer_input_embeddings(
  s    zz88::rG   c                 :    U R                   R                  U5        g rt   )r  r  r  s     rH   r  =Gemma4ForConditionalGeneration.set_per_layer_input_embeddings+
  s    

11%8rG   r  c                    U R                  5       UUUUS.n[        U R                  5       SS 5      S:X  aM  [        R                  " / UR	                  5       S S QSUR
                  S9n	Ub  [        XQR
                  S9n	XS'   [        S0 UD6$ )Nr  r_  r  r   r   r  r9   )r  r  r?   r  r  r   r  r   )
r\   r.  rW   rN   r   ru  r  rc  r  r  s
             rH   r   8Gemma4ForConditionalGeneration.create_masks_for_generate.
  s     ,,.*,.(
 6))+-JDQU]]!&,Hm.@.@.B3B.G,H"UbUiUi!j ,%DEV_s_s%t"0B,-(7;77rG   )r%  r  rt   )NNNNNNNNNNNNNNr   N)NNNNNNNNNTNNF)NF) r:   r;   r<   r=   r3  r  r  r-   ri   r!   r?   r@   r  r   r    r  r"   rE   r   r   rx   rJ   rv   r  r  r  r  r   rB   r   rF   ry   rz   s   @rH   r  r  	  s    +,VW|   7;Y''Y ",,t3Y +,	Y Y  .2158<37.237046:6:(,5926*.!%-.04#N
##d*N
 ''$.N
 #..5	N

 ))D0N
 t+N
 #\\D0N
 &&-N
 ",,t3N
 ",,t3N
 N
 !++d2N
 ((4/N
   4'N
 $;N
  ell*!N
"  ,,-#N
$ +,%N
& 
&'N
  N
f    .`;9  26*/8 8||8 t+8 	8
 llT)8 !<<$.8 !4K8 
8 8rG   r  )rE  r#  r  r  r  r  r  )r+   )rt  NN)r)   )|r   collectionsr   collections.abcr   dataclassesr   	functoolsr   typingr   r?   r   torch.nnr	   r    r   r  activationsr   cache_utilsr   r   configuration_utilsr   
generationr   integrationsr   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r    r!   r"   r#   r$   utils.genericr%   r&   utils.output_capturingr'   r(   auto.modeling_autor*   configuration_gemma4r,   r-   r.   r/   accelerate.hooksr0   r4   rJ   rR   rU   ModulerZ   r|   r   r   r
  r  r,  Conv1dr@  rJ  rY  rg  r  r  r  r  rE   rx   r  r  ro   rD   r  r  r  r  r&  r5  r?  rW  rj  r  r  r  r  r  r  r#  rC  rE  r  ri  r   r  r  r  __all__r9   rG   rH   <module>r     sk  *    $ ! %    $ & ! . 3 ) 6  C 9 S K F &  H E * g g 3 
 Q 7 Q Q2 
 Q; Q QD 
Q$; 
Q 
Q 
37 3  3BII :4BII 4*7ryy 7>i)299 i)X#bii #8; ;< RYY  H"bii "B&RYY &R0ryy 0l(3		 (3V@0 @0Fbii  L")) L^(.ELL .u|| .%,, ._b .,	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%N 5&||5&	5& 
5& ,,	5&
 5& \\5&pB)BII B)J)9 )X)H")) )HXBII &W<		 W<tq)")) q)h $#		 $# $#N!@ryy !@HV7 VrSBLL S mJO mJ mJ` `adV+ dV bdVN ]^W
- W
 _W
tsCx X  Sc, SclAH- AHH6ryy 66	u|| 	U\\ 	^c^j^j 	 y' yyx	 }8%:O }8}8@rG   