
    3j                       S SK r S SKJr  S SKJr  S SKJr  S SKJr  S SK	r	S SK	J
r
  S SKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-J.r.J/r/J0r0  SSK1J2r2J3r3J4r4  SSK5J6r6J7r7  SSK8J9r9  SSK:J;r;J<r<J=r=J>r>J?r?J@r@JArA  SSKBJCrCJDrDJErEJFrFJGrGJHrHJIrIJJrJJKrK  SSKLJMrM  SSKNJOrO  SSKPJQrQ  SSKRJSrSJTrTJUrUJVrV  \." 5       (       a   \/R                  " \X5      rY " S S \F5      rZ " S! S"\C5      r[\ " S# S$\ 5      5       r\\,\ " S% S&\!5      5       5       r] " S' S(\
R                  5      r_ " S) S*\I5      r` " S+ S,\
R                  5      ra " S- S.\
R                  5      rb " S/ S0\
R                  5      rc " S1 S2\
R                  5      rd " S3 S4\
R                  5      re " S5 S6\
R                  5      rg " S7 S8\
R                  5      rh " S9 S:\
R                  5      ri " S; S<\
R                  5      rj " S= S>\
R                  5      rk " S? S@\>5      rl SzSA\	R                  SB\	R                  SC\	R                  SD\	R                  SE\nSF\	R                  4SG jjro " SH SI\M5      rp\4 " SJ SK\;5      5       rq " SL SM\<5      rr " SN SO\
R                  5      rs " SP SQ\>5      rt " SR SS\?5      ru " ST SU\
R                  5      rv " SV SW\O5      rw " SX SY\
R                  5      rx " SZ S[\<5      ry " S\ S]\A5      rz " S^ S_\H5      r{\," S`Sa9 " Sb Sc\@5      5       r|\," SdSa9 " Se Sf\=5      5       r} " Sg Sh\{5      r~ " Si Sj\{5      r " Sk Sl\G5      rSm\	R                  S-  Sn\	R                  S-  SF\S-  4So jrSp\	R                  Sq\	GR                  SF\	R                  4Sr jr\," SsSa9 " St Su\E5      5       r\," SvSa9 " Sw Sx\D5      5       r/ SyQrg){    N)UserDict)Callable)	dataclass)cached_property)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)create_bidirectional_maskcreate_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_accelerate_availableloggingtorch_compilable_check)maybe_autocastmerge_with_config_defaultsno_inherit_decorator)OutputRecordercapture_outputs   )	AutoModel)Gemma3AttentionGemma3DecoderLayerGemma3ForCausalLM	Gemma3MLPGemma3RotaryEmbeddingGemma3TextModelGemma3TextScaledWordEmbedding)	Gemma3nCausalLMOutputWithPastGemma3nForConditionalGenerationGemma3nModelGemma3nModelOutputWithPastGemma3nMultimodalEmbedderGemma3nPreTrainedModelGemma3nRMSNormapply_rotary_pos_embeager_attention_forward)LlamaRotaryEmbedding)MixtralExperts)sliding_window_mask_function   )Gemma4AudioConfigGemma4ConfigGemma4TextConfigGemma4VisionConfigc                   j    \ rS rSr% SrSr\\\\	R                  \	R                  4   4   S-  \S'   Srg)Gemma4ModelOutputWithPastT   aw  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
shared_kv_states (`dict`, *optional*):
    Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
    Used to pass shared KV states between layers during KV sharing.
Nshared_kv_states __name__
__module____qualname____firstlineno____doc__rC   dictstrtupletorchTensor__annotations____static_attributes__rD       c/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/gemma4/modular_gemma4.pyrA   rA   T   s7    " MQd3ellELL&@ AABTIPrR   rA   c                   j    \ rS rSr% SrSr\\\\	R                  \	R                  4   4   S-  \S'   Srg)Gemma4CausalLMOutputWithPasti   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
shared_kv_states (`dict`, *optional*):
    Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
    Used to pass shared KV states between layers during KV sharing.
NrC   rD   rE   rD   rR   rS   rU   rU   i   s7    * MQd3ellELL&@ AABTIPrR   rU   c                   j    \ rS rSr% SrSr\\\\	R                  \	R                  4   4   S-  \S'   Srg)Gemma4TextModelOutputWithPast   a!  
BaseModelOutputWithPast extended with shared_kv_states for KV sharing.

Args:
    shared_kv_states (`dict`, *optional*):
        Dictionary mapping layer type strings to tuples of (key_states, value_states) tensors.
        Used to pass shared KV states between layers during KV sharing.
NrC   rD   rE   rD   rR   rS   rX   rX      s7     MQd3ellELL&@ AABTIPrR   rX   c                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma4AudioModelOutput   z
attention_mask (`torch.BoolTensor`, *optional*):
    A torch.BoolTensor of shape `(batch_size, num_frames)`. True for valid positions, False for padding.
Nattention_maskrD   )
rF   rG   rH   rI   rJ   r]   rN   
BoolTensorrP   rQ   rD   rR   rS   r[   r[      s    
 /3NE$$t+2rR   r[   c                   |   ^  \ rS rSrS\\-  S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr
S
rU =r$ )Gemma4ClippableLinear   configin_featuresout_featuresreturnNc                   > [         TU ]  5         UR                  U l        [        R                  " X#SS9U l        U R                  (       a  U R                  S[        R                  " [        S5      * 5      5        U R                  S[        R                  " [        S5      5      5        U R                  S[        R                  " [        S5      * 5      5        U R                  S[        R                  " [        S5      5      5        g g )NFbias	input_mininf	input_max
output_min
output_max)
super__init__use_clipped_linearsr   Linearlinearregister_bufferrN   tensorfloat)selfrb   rc   rd   	__class__s       rS   ro   Gemma4ClippableLinear.__init__   s     	#)#=#= iiF##  ellE%L=.IJ  ell5<.HI  u||U5\M/JK  u||E%L/IJ	 $rR   hidden_statesc                    U R                   (       a+  [        R                  " XR                  U R                  5      nU R                  U5      nU R                   (       a+  [        R                  " XR                  U R                  5      nU$ N)rp   rN   clampri   rk   rr   rl   rm   )rv   ry   s     rS   forwardGemma4ClippableLinear.forward   sX    ##!KK~~t~~VMM2##!KKXMrR   )rr   rp   )rF   rG   rH   rI   r?   r<   intro   rN   rO   r}   rQ   __classcell__rw   s   @rS   r`   r`      sY    K"%66K K 	K
 
K 	U\\ 	ell 	 	rR   r`   c                       \ rS rSrSrg)Gemma4RMSNorm   rD   NrF   rG   rH   rI   rQ   rD   rR   rS   r   r          rR   r   c                      ^  \ rS rSr% Sr\R                  \S'   S\4U 4S jjr	\R                  " 5       S\R                  S\R                  4S j5       rS	rU =r$ )
 Gemma4AudioRelPositionalEncoding   zSinusoidal relative positional encoding for the audio encoder.

Produces position embeddings of shape [1, context_size // 2 + 1, hidden_size] with
concatenated [sin..., cos...] layout matching the original Gemma4 convention.
inv_timescalesrb   c                   > [         TU ]  5         UR                  U l        UR                  UR                  -   S-
  UR
                  -   U l        SnSnU R                  S-  n[        R                  " X2-  5      [        US-
  S5      -  nU[        R                  " [        R                  " U5      U* -  5      -  nU R                  SUR                  S5      R                  S5      SS9  g )	Nr;         ?     @r&   r   r   F
persistent)rn   ro   hidden_sizeattention_chunk_sizeattention_context_leftattention_context_rightcontext_sizemathlogmaxrN   exparangers   	unsqueeze)rv   rb   min_timescalemax_timescalenum_timescaleslog_timescale_incrementr   rw   s          rS   ro   )Gemma4AudioRelPositionalEncoding.__init__   s    !--''&*G*GG!KfNlNll 	 ))Q."&((=+H"ICP^abPbdeLf"f&5<<3OSjRj3j)kk-~/G/G/J/T/TUV/WdijrR   ry   re   c                 b   [         R                  " U R                  S-  SSUR                  S9nUS   nX R                  R                  UR                  S9-  n[         R                  " [         R                  " U5      [         R                  " U5      /SS9nUR                  UR                  S9$ )Nr&   device.Ndimdtype)
rN   r   r   r   r   tocatsincosr   )rv   ry   position_idsscaled_time	pos_embeds        rS   r}   (Gemma4AudioRelPositionalEncoding.forward   s    ||D$5$5$:B=K_K_`#I."%8%8%;%;=CWCW%;%XXIIuyy5uyy7MNTVW	||-"5"5|66rR   )r   r   )rF   rG   rH   rI   rJ   rN   rO   rP   r<   ro   no_gradr}   rQ   r   r   s   @rS   r   r      sS     LL k0 k ]]_7U\\ 7ell 7 7rR   r   c                   f  ^  \ rS rSrSrS\S\4U 4S jjrS\R                  S\R                  4S jr
S\R                  S\R                  4S	 jrS
\R                  S\R                  4S jr SS\R                  S\R                  S\R                  S-  S\\R                  S4   4S jjrSrU =r$ )Gemma4AudioAttention   z3Chunked local attention with relative position biasrb   	layer_idxc                   > [         TU ]  5         Xl        X l        UR                  U l        UR                  UR                  -  U l        UR                  U l	        U R                  S-  [        R                  " S5      -  U l        [        R                  " S[        R                  -   5      [        R                  " S5      -  U l        UR                  U l        UR"                  S-
  U l        UR&                  U l        U R                   U R$                  -   U R(                  -   U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  U R                  U R                  -  5      U l        [-        XR                  UR                  5      U l        [6        R8                  " UR                  U R                  U R                  -  SS9U l        [6        R<                  " [>        R@                  " U R                  5      5      U l!        U RE                  S[>        RF                  " U R
                  5      SS9  g )N      r&   r;   Frg   softcapr   )$rn   ro   rb   r   attention_logit_capattention_logits_soft_capr   num_attention_headshead_dim	num_headsr   r   q_scaleek_scaler   
chunk_sizer   max_past_horizonr   max_future_horizonr   r`   q_projk_projv_projpostr   rq   relative_k_proj	ParameterrN   zerosper_dim_scalers   rt   rv   rb   r   rw   s      rS   ro   Gemma4AudioAttention.__init__   s   ")/)C)C&**f.H.HH33t+txx{:xxDFF
+dhhqk9 55 & = = A"("@"@ OOd.C.CCdF]F]]+F4F4FY]YfYfHfg+F4F4FY]YfYfHfg+F4F4FY]YfYfHfg)&2D2DfFXFXY	!yy););T^^dmm=[bgh\\%++dmm*DEYT5S5S(TafgrR   ry   re   c           	         UR                   u  p#pEX0R                  -   S-
  U R                  -  nX`R                  -  U-
  n[        R                  " USSSSSU45      nUR	                  X&U R                  XE5      R                  5       $ )zSplits a `(batch_size, seq_len, num_heads, head_dim)` tensor into non-overlapping blocks of `chunk_size` along the sequence dim.r;   r   )shaper   Fpadreshape
contiguous)rv   ry   
batch_sizeseq_lenr   r   
num_blocksr   s           rS   _convert_to_block&Gemma4AudioAttention._convert_to_block   s|    3@3F3F0
Y/!3G
??*W4maAq!S-AB$$ZT__ibmmoorR   c           
      @   UR                   u  p#pE[        R                  " USSSSU R                  U R                  U R
                  -   S-
  45      nUR                  SU R                  U R
                  5      n[        R                  " USS5      nUR                  5       $ )z`Extracts overlapping context windows of `context_size` for every block, strided by `chunk_size`.r   r;   r   r&   )r   r   r   r   r   r   unfoldr   rN   movedimr   )rv   ry   r   r   r   r   s         rS   _extract_block_context+Gemma4AudioAttention._extract_block_context   s    3@3F3F0
YAq!Q(=(=t?V?VY]YhYh?hkl?lm
 &,,Q0A0A4??SmR;''))rR   xc                     UR                   u  p#pEnU R                  n[        R                  " USUS-   U-
  45      nUR	                  X#XEUS-   -  5      nUSSXW-  24   nUR	                  X#XEU5      $ )zjRelative position shift for blocked attention. See appendix B of https://huggingface.co/papers/1901.02860.r   r;   .N)r   r   r   r   view)rv   r   r   r   r   
block_sizeposition_lengthr   s           rS   
_rel_shiftGemma4AudioAttention._rel_shift
  s    IJF
z((EE!a)O;<=FF:*LSTDT6UVc.Z.../vvjZ\RRrR   Nposition_embeddingsr]   c                    UR                   u  pEnXEU R                  U R                  4nU R                  U5      R	                  5       R                  U5      nU R                  U5      R	                  5       R                  U5      n	U R                  U5      R	                  5       R                  U5      n
XR                  -  [        R                  " U R                  5      -  nXR                  -  n	U R                  U5      nU R                  U	5      n	U R                  U
5      n
UR                   S   nU R                  U5      nUR                  SU R                  U R                  5      nUR!                  UR"                  S9nUR%                  SSSSS5      nXR%                  SSSSS5      -  nUR'                  X@R                  SU R                  5      nXR%                  SSS5      -  nUR'                  X@R                  XR(                  S5      nU R+                  U5      nUU-   nUU R,                  -  n[.        R0                  " U5      nUU R,                  -  nUb4  UR3                  UR5                  5       U R6                  R8                  5      n[        R:                  " US[.        R<                  S9R!                  U
R"                  5      nUU
R%                  SSSSS5      -  nUR%                  SSSSS5      R'                  XKU R(                  -  S5      nUS S 2S U24   R?                  5       nU RA                  UR!                  UR"                  5      5      nUU4$ )	Nr;   r   r   r   r	   r&      )r   r   )!r   r   r   r   ru   r   r   r   r   r   softplusr   r   r   r   r   r   r   permuter   r   r   r   rN   tanhmasked_filllogical_notrb   attention_invalid_logits_valuesoftmaxfloat32r   r   )rv   ry   r   r]   r   
seq_length_hidden_shapequery_states
key_statesvalue_statesr   relative_key_statesqueries	matrix_acqueries_flat	matrix_bdattn_weightsattn_outputs                      rS   r}   Gemma4AudioAttention.forward  s    %2$7$7!
"N{{=1779>>|L[[/557<<\J
{{=1779>>|L#ll2QZZ@R@R5SS,,.
--l;00<
22<@!''*
"223FG166r4>>4==Y144<;M;M4N&&q!Q1500Aq!Q??	z>>2t}}U #>#>q!Q#GG	%%j..*oo_ab	OOI.	 9,#dll2zz,/#dll2%'33**,dkk.X.XL yy2U]]KNN|OaOab"\%9%9!Q1a%HH!))!Q1a8@@Z^ZiZiMikmn!![j[.1<<>ii}/B/B CDL((rR   )r   r   rb   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r{   )rF   rG   rH   rI   rJ   r<   r   ro   rN   rO   r   r   r   r^   rM   r}   rQ   r   r   s   @rS   r   r      s    =h0 hS h4pu|| p p*ELL *U\\ *SELL SU\\ S 37	1)||1) #\\1) ((4/	1)
 
u||T!	"1) 1)rR   r   c                   l   ^  \ rS rSrU 4S jrSS\R                  S\R                  S-  4S jjrSrU =r	$ )	'Gemma4AudioSubSampleConvProjectionLayeriG  c           	         > [         TU ]  5         [        R                  " UUSSSSS9U l        [        R
                  " X#SSS9U l        [        R                  " 5       U l        g )N)r	   r	   )r&   r&   r;   F)in_channelsout_channelskernel_sizestridepaddingrh   T)epselementwise_affinerh   )	rn   ro   r   Conv2dconv	LayerNormnormReLUact)rv   r  r  norm_epsrw   s       rS   ro   0Gemma4AudioSubSampleConvProjectionLayer.__init__H  sU    II#%
	 LLPT[`a	779rR   Nry   maskc           
         Ub(  UR                  UR                  S9nXS S 2S S S 2S 4   -  nU R                  UR                  U R                  R                  R                  5      5      nU R                  U R                  UR                  SSSS5      5      R                  SSSS5      R                  5       5      nUb  US S 2S S S24   nX4$ )Nr   r   r&   r	   r;   )	r   r   r	  weightr   r  r  r   r   )rv   ry   r  s      rS   r}   /Gemma4AudioSubSampleConvProjectionLayer.forwardU  s    77-"6"677D)D!T1A,BBM		-"2"24993C3C3I3I"JK=+@+@Aq!+L!M!U!UVWYZ\]_`!a!l!l!no3Q3<D""rR   )r  r	  r  r{   )
rF   rG   rH   rI   ro   rN   rO   r}   rQ   r   r   s   @rS   r   r   G  s-    #U\\ #9L # #rR   r   c            	          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S-  S\\R                  \R                  4   4S jjr	S	r
U =r$ )"Gemma4AudioSubSampleConvProjectionic  rb   c                 d  > [         TU ]  5         [        SUR                  S   UR                  S9U l        [        UR                  S   UR                  S   UR                  S9U l        UR                  S   S-  UR                  S   -  n[        R                  " X!R                  SS9U l
        g )Nr;   r   )r  r  r  r   Frg   )rn   ro   r   subsampling_conv_channelsrms_norm_epslayer0layer1r   rq   r   input_proj_linear)rv   rb   proj_input_dimrw   s      rS   ro   +Gemma4AudioSubSampleConvProjection.__init__d  s    =99!<((

 >88;99!<((

 !::1=BfFfFfghFii!#>;M;MTY!ZrR   Ninput_featuresinput_features_maskre   c                    UR                  S5      nU R                  X25      u  p4U R                  X45      u  p4UR                  u  pVpvUR	                  SSSS5      R                  5       R                  XWS5      nU R                  U5      U4$ )Nr;   r   r&   r	   r   )r   r  r  r   r   r   r   r  )rv   r  r  ry   r  r   r   r   s           rS   r}   *Gemma4AudioSubSampleConvProjection.forwards  s    
 '003"kk-M"kk->$1$7$7!
w%--aAq9DDFNNzdfg%%m4d::rR   )r  r  r  r{   )rF   rG   rH   rI   r<   ro   rN   rO   rM   r}   rQ   r   r   s   @rS   r  r  c  s\    [0 [$ 48;; #\\D0; 
u||U\\)	*	; ;rR   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Gemma4AudioFeedForwardi  rb   c                   > [         TU ]  5         Xl        [        XR                  UR                  S-  5      U l        [        XR                  S-  UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l	        [        UR                     U l        UR                  U l        UR                  U l        g )Nr   )rn   ro   rb   r`   r   ffw_layer_1ffw_layer_2r   pre_layer_normpost_layer_normr   
hidden_actact_fngradient_clippingresidual_weightpost_layer_scalerv   rb   rw   s     rS   ro   Gemma4AudioFeedForward.__init__  s    09K9KVM_M_bcMcd09K9Ka9OQWQcQcd+F,>,>?,V-?-?@V../!'!9!9 & 6 6rR   ry   re   c                    [        U R                  [        R                  " UR                  5      R
                  5      nUn[        R                  " X* U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      n[        R                  " X* U5      nU R                  U5      nXR                  -  nX-  nU$ r{   )minr+  rN   finfor   r   r|   r'  r%  r*  r&  r(  r-  )rv   ry   r+  residuals       rS   r}   Gemma4AudioFeedForward.forward  s     6 6MDWDW8X8\8\] M3EGXY++M:((7M2((7M3EGXY,,];...!rR   )r*  rb   r%  r&  r+  r(  r-  r'  rF   rG   rH   rI   r<   ro   rN   rO   r}   rQ   r   r   s   @rS   r#  r#    s0    70 7U\\ ell  rR   r#  c                   l   ^  \ rS rSr\S 5       rS\R                  S\R                  4U 4S jjrSr	U =r
$ )Gemma4AudioCausalConv1di  c                 n    U R                   S   S-
  U R                  S   -  S-   nXR                  S   -
  $ )Nr   r;   )r  dilationr  )rv   effective_kernel_sizes     rS   left_pad Gemma4AudioCausalConv1d.left_pad  s<    !%!1!1!!4q!8DMM!<L Lq P${{1~55rR   r   re   c                 x   > [         R                  R                  XR                  S45      n[        TU ]  U5      $ )Nr   )r   r   r   r;  rn   r}   )rv   r   rw   s     rS   r}   Gemma4AudioCausalConv1d.forward  s1     MMa--!34wq!!rR   rD   )rF   rG   rH   rI   r   r;  rN   rO   r}   rQ   r   r   s   @rS   r7  r7    s;     6 6"<<" 
	" "rR   r7  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Gemma4AudioLightConv1di  rb   c                   > [         TU ]  5         Xl        [        XR                  UR                  S-  5      U l        [        XR                  UR                  5      U l        [        UR                  UR                  UR                  UR                  SS9U l	        [        UR                  UR                  SS9U l        [        UR                  UR                  SS9U l        [        UR                     U l        UR"                  U l        g )Nr&   F)r  r  r  groupsrh   Tr  
with_scale)rn   ro   rb   r`   r   linear_start
linear_endr7  conv_kernel_sizedepthwise_conv1dr   r  r'  	conv_normr   r)  r*  r+  r.  s     rS   ro   Gemma4AudioLightConv1d.__init__  s    1&:L:LfN`N`cdNde/8J8JFL^L^_ 7**++//%%!
 ,F,>,>FDWDWdhi&v'9'9v?R?R_cdV../!'!9!9rR   ry   re   c                    UnU R                  U5      nU R                  U5      n[        R                  R	                  USS9nU R                  UR                  SS5      5      R                  SS5      n[        U R                  [        R                  " UR                  5      R                  5      n[        R                  " X* U5      nU R                  U5      nU R                  U5      nU R!                  U5      nX-  nU$ )Nr   r   r;   r&   )r'  rE  r   r   glurH  	transposer1  r+  rN   r2  r   r   r|   rI  r*  rF  )rv   ry   r3  r+  s       rS   r}   Gemma4AudioLightConv1d.forward  s     ++M:))-8))-R)@--m.E.Ea.KLVVWXZ[\   6 6MDWDW8X8\8\]M3EGXY}5M26!rR   )r*  rb   rI  rH  r+  rF  rE  r'  r5  r   s   @rS   r@  r@    s0    :0 :(U\\ ell  rR   r@  c            
          ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  S-  S\R                  S	\
\   S
\R                  4
S jrSrU =r$ )Gemma4AudioLayeri  rb   r   c                 l  > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        [        X5      U l        [        U5      U l	        [        UR                  5      U l        [        UR                  5      U l        [        UR                  5      U l        UR                  U l        g r{   )rn   ro   rb   r#  feed_forward1feed_forward2r   	self_attnr@  lconv1dr   r   norm_pre_attnnorm_post_attnnorm_outr+  r   s      rS   ro   Gemma4AudioLayer.__init__  s    3F;3F;-f@-f5*6+=+=>+F,>,>?%f&8&89!'!9!9rR   ry   r]   Nr   kwargsre   c                 8   [        U R                  [        R                  " U R                  R
                  R                  5      R                  5      nU R                  U5      nUn[        R                  " X* U5      nU R	                  U5      nU R                  UUUS9u  p[        R                  " X* U5      nU R                  U5      nX-  nU R                  U5      nU R                  U5      n[        R                  " X* U5      nU R                  U5      nU$ )N)ry   r   r]   )r1  r+  rN   r2  rV  r  r   r   rR  r|   rT  rW  rU  rS  rX  )rv   ry   r]   r   rZ  r+  r3  r   s           rS   r}   Gemma4AudioLayer.forward  s      6 6DDVDVD]D]DcDc8d8h8hi**=9 M3EGXY**=9>>' 3) * 
 M3EGXY++M:!]3**=9M3EGXYm4rR   )	rb   rR  rS  r+  rU  rX  rW  rV  rT  )rF   rG   rH   rI   r<   r   ro   rN   rO   r^   r   r   r}   rQ   r   r   s   @rS   rP  rP    sn    :0 :S : ||  ((4/  #\\	 
 +,  
   rR   rP  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrS\R                  S\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )Gemma4VisionPatchEmbedderi%  rb   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR
                  U l        [        R                  " SU R                  S-  -  U R                  SS9U l        [        R                  " [        R                  " SU R
                  U R                  5      5      U l        g )Nr	   r&   Frg   )rn   ro   rb   r   
patch_sizeposition_embedding_sizer   rq   
input_projr   rN   onesposition_embedding_tabler.  s     rS   ro   "Gemma4VisionPatchEmbedder.__init__&  s    !-- ++'-'E'E$))A(:$:D<L<LSXY(*UZZ4C_C_aeaqaq5r(s%rR   pixel_position_idspadding_positionsre   c                    UR                  SS9n[        R                  " US   U R                  S   5      n[        R                  " US   U R                  S   5      nXE-   n[        R
                  " UR                  S5      SU5      nU$ )aC  Compute 2-D patch position embeddings via embedding lookup.

``pixel_position_ids`` has shape ``(batch, num_patches, 2)`` where the
last dimension holds (x, y) indices into ``position_embedding_table``
(shape ``(2, position_embedding_size, hidden_size)``).  The result is the
sum of the x- and y-embeddings for each patch.
r   r1  .r   .r;   r;   r           )r|   r   	embeddingrd  rN   wherer   )rv   rf  rg  clamped_positionsx_emby_embr   s          rS   _position_embeddings.Gemma4VisionPatchEmbedder._position_embeddings0  s     /444; -f5t7T7TUV7WX-f5t7T7TUV7WX#m#kk*;*E*Eb*I3Pcd""rR   pixel_valuesc                     SUS-
  -  nU R                  UR                  U R                   R                  R                  5      5      nU R	                  X#5      nXE-   $ )Nr&         ?)rb  r   r  r   rr  )rv   rt  rf  rg  ry   r   s         rS   r}   !Gemma4VisionPatchEmbedder.forwardF  sU     L3./8N8N8T8T(UV"778J^22rR   )rb   r   rb  r`  ra  rd  )rF   rG   rH   rI   r?   ro   rN   rO   rr  r}   rQ   r   r   s   @rS   r^  r^  %  sz    t1 t#u|| #X]XdXd #iniuiu #,3!LL3>Cll3_d_k_k3	3 3rR   r^  c                   @  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\	S\
\R                  \R                  4   4S	 jr SS\R                  S\R                  S\R                  S\	S
-  S\
\R                  \R                  4   4
S jjrSrU =r$ )Gemma4VisionPooleriP  aK  Spatial pooling and ``sqrt(hidden_size)`` scaling for vision encodings.

The scaling expands the activation magnitude, which can exceed the float16 range, so it is
computed in float32 and the pooled features are returned in float32. The caller
(``Gemma4VisionModel.forward``) standardizes them and casts back to the working dtype.
rb   c                 l   > [         TU ]  5         UR                  U l        U R                  S-  U l        g )Nrv  )rn   ro   r   root_hidden_sizer.  s     rS   ro   Gemma4VisionPooler.__init__X  s/    !-- $ 0 0# 5rR   ry   rf  lengthre   c                 z   UR                   S   n[        XC-  S-  5      nUS-  nXc-  U:w  a'  [        SUR                    SU SU< SU< SU S	35      eUR                  S
S9nUS   R	                  SSS9S
   S-   n[
        R                  " XuSS9n	U	S   X-  U	S   -  -   n	[        R                  " U	R                  5       U5      R                  5       U-  n
U
R                  SS5      UR                  5       -  n[
        R                  " U
S
:H  R                  SS95      nUR                  UR                  5      U4$ )z
2D spatial pooling according to patch positions.
Pools the input tokens by averaging patches within a `k^2` grid, where `k` is determined by the ratio between
input and output lengths
r;   rv  r&   zCannot pool z to z: k=z^2 times length=z	 must be .r   ri  rj  r   Tr   keepdimfloor)rounding_moderk  r   )r   r   
ValueErrorr|   r   rN   divr   one_hotlongru   rM  r   allr   r   )rv   ry   rf  r}  input_seq_lenk	k_squaredro  max_xkernel_idxsweightsoutputr  s                rS   _avg_pool_by_positions)Gemma4VisionPooler._avg_pool_by_positions]  s`    &++A.(S01qD	.}2234xu!EVviW`an`oopq  /444;!&)--"d-CAFJii 1GL!&)UZ;v;N,NN))K,,.7==?)K""1a(=+>+>+@@  'Q,!3!3!3!:;yy,,-t33rR   Nrg  output_lengthc                 4   XAR                   S   :  a  [        SU SUR                   S    S35      eUR                  UR                  S5      S5      nUR                   S   U:w  a  U R	                  XU5      u  pUR                  5       U R                  -  nX4$ )Nr;   z*Cannot output more soft tokens (requested z) than there are patches (z9). Change the value of `num_soft_tokens` when processing.r   rl  )r   r  r   r   r  ru   r{  )rv   ry   rf  rg  r  s        rS   r}   Gemma4VisionPooler.forwardx  s     ..q11<]O L"((+,,eg 
 &112C2M2Mb2QSVWq!]2/3/J/J=0,M &++-0E0EE//rR   )r   r{  r{   )rF   rG   rH   rI   rJ   r?   ro   rN   rO   r   rM   r  r}   rQ   r   r   s   @rS   ry  ry  P  s    61 6
4"\\4?D||4UX4	u||U\\)	*4@ %)0||0 "LL0 !<<	0
 Tz0 
u||U\\)	*0 0rR   ry  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Gemma4VisionMLPi  rb   c                   > [         TU ]  X5        [        XR                  U R                  5      U l        [        XR                  U R                  5      U l        [        XR                  U R                  5      U l        g r{   )rn   ro   r`   r   intermediate_size	gate_projup_proj	down_projr.  s     rS   ro   Gemma4VisionMLP.__init__  s^    &.v7G7GI_I_`,V5E5EtG]G]^.v7M7MtO_O_`rR   )r  r  r  )rF   rG   rH   rI   r?   ro   rQ   r   r   s   @rS   r  r    s    a1 a arR   r  r   r   r   r   unsqueeze_dimre   c           
         UR                   S   nU R                   S   nSUSU-  -  -  nUS::  a  [        SU SU SU S35      eU/U-  n[        R                  " XSS9n	[        R                  " XSS9n
[        R                  " X(SS9n[	        U5       Vs/ s H  n[        X   X   X   US	9PM     nn[        R                  " USS9$ s  snf )
a#  Applies multidimensional RoPE to inputs.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        If position_ids.ndim + 2 == x.ndim, then this function passes through to `apply_rotary_pos_emb()`.
        Otherwise, position_ids is used to split the inputs, x, into multiple pieces, where each piece is fed to
        `apply_rotary_pos_emb()`, and then concatenated back together.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.

Returns:
  Tensor of shape [B, L, N, H] with RoPE applied.
r   r&   r   zEInvalid configuration: num_rotated_channels_per_dim must be > 0, got z (num_input_channels=z, ndim=)r   )r   r   r   r  )r   r  rN   splitranger6   r   )r   r   r   r   r  ndimnum_input_channelsnum_rotated_channels_per_dimsplit_sizesx_parts	cos_parts	sin_partsr  y_partss                 rS   apply_multidimensional_roper    s
   8 b!D#$(:q4x(H#I #q(,--BCUBV WF!
 	
 0047Kkk!b1GC"5IC"5I t A 	j'		
    99W"%%s   C
c                       \ rS rSr\   SS\S-  S\R                  S-  S\S-  S\	S\
4   4S jj5       r\R                  " 5       \S	 5       5       rS
rg)Gemma4VisionRotaryEmbeddingi  Nrb   r   r   re   ztorch.Tensorc           	      "   U R                   S   n[        U SS5      =(       d    U R                  U R                  -  nUS-  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	aH  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetar   Nr&   r   r   r   )r   r   )	rope_parametersgetattrr   r   rN   r   int64r   ru   )rb   r   r   baser   spatial_dimattention_factorinv_freqs           rS   compute_default_rope_parameters;Gemma4VisionRotaryEmbedding.compute_default_rope_parameters  s    & %%l3fj$/c63E3EIcIc3c QhQQekkBEEV[`[f[fEgjuuw
 ))rR   c                 $   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn/ / pe[        S5       H  nUS S 2S S 2U4   nUS S 2S S S 24   R                  5       n	[        USS9   UR                  5       U	R                  5       -  R                  SS5      n
[        R                  " X4SS	9nUR                  5       U R                  -  nUR!                  5       U R                  -  nS S S 5        UR#                  W5        UR#                  W5        M     [        R                  " USS	9R	                  UR$                  S
9n[        R                  " USS	9R	                  UR$                  S
9nX4$ ! , (       d  f       N= f)Nr   r   r;   mpscpur&   F)device_typeenabledr   r   )r  ru   expandr   r   r   
isinstancetyperL   r  r!   rM  rN   r   r   attention_scalingr   appendr   )rv   r   r   inv_freq_expandedr  all_cosall_sinidim_position_idsdim_position_ids_expandedfreqsembr   r   s                 rS   r}   #Gemma4VisionRotaryEmbedding.forward  s    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr'1!((--'E'E!((--[`J`ahhmmfk rqA+Aq!G4(8D!(D(J(J(L%KG*0025N5T5T5VVaabcefgiiB7ggi$"8"88ggi$"8"88	 H
 NN3NN3  iiR(++!''+:iiR(++!''+:x HGs   6BH
H	rD   NNN)rF   rG   rH   rI   staticmethodr?   rN   r   r   rM   ru   r  r   r   r}   rQ   rD   rR   rS   r  r    s    ,0&*" *"T) *t# * t * 
~u$	%	 *  *D ]]_  rR   r  c                     ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\
\   S\\R                  \R                  S-  \\R                     S-  4   4S jjrSrU =r$ )Gemma4VisionAttentioni  rb   r   c                 &  > [         TU ]  XU5        U ?U ?U ?SU l        SU l        [        XR                  UR                  U R                  -  5      U l        [        XR                  UR                  U R                  -  5      U l        [        XR                  UR                  U R                  -  5      U l        [        XR                  U R                  -  UR                  5      U l        [!        U R                  UR"                  SS9U l        g )Nr   FrC  )rn   ro   attn_logit_softcappingsliding_window
is_slidingscaling	is_causalr`   r   num_key_value_headsr   r   r   r   r   o_projr   r  v_normr   s      rS   ro   Gemma4VisionAttention.__init__  s    y1'O+F4F4FHbHbeiererHrs+F4F4FHbHbeiererHrs+F4F4FHbHbeiererHrs+F4N4NQUQ^Q^4^`f`r`rs#DMMv7J7JW\]rR   Nry   r   r]   r   rZ  re   c                 L   UR                   S S n/ UQSPU R                  P7nUu  pU R                  U5      R                  U5      n
U R	                  U
5      n
[        XX5      n
U
R                  SS5      n
U R                  U5      R                  U5      nU R                  U5      n[        XX5      nUR                  SS5      nU R                  U5      R                  U5      nU R                  U5      nUR                  SS5      n[        R                  " U R                  R                  [        5      nU" U U
UUU4U R                   (       a  U R"                  OSU R$                  S.UD6u  pUR&                  " / UQSP76 R)                  5       nU R+                  U5      nX4$ )Nr   r;   r&   rl  )dropoutr  )r   r   r   r   q_normr  rM  r   k_normr   r  r   get_interfacerb   _attn_implementationr7   trainingattention_dropoutr  r   r   r  )rv   ry   r   r]   r   rZ  input_shaper   r   r   r   r   r   attention_interfacer   r   s                   rS   r}   Gemma4VisionAttention.forward  s    $))#2.88b8$--8&{{=166|D{{<02<cX#--a3[[/44\B
[[,
0#T
))!Q/
{{=166|D{{<0#--a3(?(M(MKK,,.E)
 %8	%
 /3mmD**LL	%
 	%
! "));;;;FFHkk+.((rR   )r  r   r  r   r  r  r   r  )rF   rG   rH   rI   r?   r   ro   rN   rO   
LongTensorr   r   rM   r}   rQ   r   r   s   @rS   r  r    s    ^1 ^c ^  -1.204,)||,) #\\,) t+	,)
 &&-,) +,,) 
u||U\\D0%2E2LL	M,) ,)rR   r  c                     ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\
\   S\\R                  \\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma4VisionEncoderLayeriO  rb   r   c                 b   > [         TU ]  XU5        [        XS9U l        [	        U5      U l        g Nrb   r   )rn   ro   r  rT  r  mlpr   s      rS   ro   !Gemma4VisionEncoderLayer.__init__P  s*    y1.fR"6*rR   Nry   r   r]   r   rZ  re   c                     UnU R                  U5      nU R                  " SUUUUS.UD6u  pU R                  U5      nXa-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nXa-   nU$ )N)ry   r   r]   r   rD   )input_layernormrT  post_attention_layernormpre_feedforward_layernormr  post_feedforward_layernorm)rv   ry   r   r]   r   rZ  r3  r   s           rS   r}    Gemma4VisionEncoderLayer.forwardU  s     !,,];>> 
' 3)%	

 
 55mD 0 66}E/77F 0rR   )r  rT  r  )rF   rG   rH   rI   r?   r   ro   rN   rO   r  r   r   rM   FloatTensorr}   rQ   r   r   s   @rS   r  r  O  s    +1 +c + -1.204|| #\\ t+	
 &&- +, 
u  %(9(95;L;L(L"MPT"TT	U rR   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S-  S\	\
   S	\4
S
 jjrSrU =r$ )Gemma4VisionEncoderit  rb   c           
        > [         TU ]  5         Xl        UR                  U l        [        U5      U l        [        R                  " [        U R                  5       Vs/ s H  n[        XS9PM     sn5      U l        g s  snf r  )rn   ro   rb   num_hidden_layers
num_layersr  
rotary_embr   
ModuleListr  r  layers)rv   rb   r  rw   s      rS   ro   Gemma4VisionEncoder.__init__u  se     225f=mmKPQUQ`Q`KabKaa%VAKab
bs   A>Ninputs_embedsr]   rf  rZ  re   c                     [        U R                  UUS9nUnU R                  XS5      nU R                  SU R                  R                    H  nU" U4UUUS.UD6nM     [        US9$ )zw
pixel_position_ids (torch.Tensor):
    Patch positions as (x, y) coordinates in the image as [batch, num_patches, 2].
)rb   r  r]   N)r]   r   r   last_hidden_state)r   rb   r  r  r  r   )rv   r  r]   rf  rZ  ry   r   decoder_layers           rS   r}   Gemma4VisionEncoder.forward~  s     3;;')
 &"oomP "[[)H4;;+H+HIM)-$7/	
 M J 'GGrR   )rb   r  r  r  r{   )rF   rG   rH   rI   r?   ro   rN   rO   r  r   r   r   r}   rQ   r   r   s   @rS   r  r  t  so    
1 
 7;	H||H H ",,t3	H
 +,H 
!H HrR   r  c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )Gemma4TextMLPi  rb   r   c                    > UR                   UR                  -
  nX#s=:  =(       a    S:  Os  nUR                  =(       a    Un[        TU ]  5         UR
                  U(       a  SOS-  U l        g )Nr   r&   r;   )r  num_kv_shared_layersuse_double_wide_mlprn   ro   r  )rv   rb   r   first_kv_shared_layer_idxis_kv_shared_layerr  rw   s         rS   ro   Gemma4TextMLP.__init__  sa    $*$<$<v?Z?Z$Z!&GGaG$88O=O!'!9!9BUQ[\!]rR   )r  )	rF   rG   rH   rI   r>   r   ro   rQ   r   r   s   @rS   r  r    s     ^/ ^C ^ ^rR   r  c                   &    \ rS rSrSS\4S jjrSrg)Gemma4TextRotaryEmbeddingi  Nrb   c                    [         R                  R                  U 5        UR                  U l        UR                  U l        Xl        [        UR                  5      U l        0 U l	        0 U l
        U R                   H  nU R                  R                  U   nUc  M!  US   =nS:w  a
  [        U   nOU R                  nX`R                  U'   XPR                  U'   X#S.nUS:X  a  US:X  a  SUS'   U" U R                  40 UD6u  pU R                  U S3US	S
9  U R                  U S3UR                  5       S	S
9  [!        X S3U	5        M     g )N	rope_typedefault)r   
layer_typefull_attentionproportionalglobal_head_dimhead_dim_key	_inv_freqFr   _original_inv_freq_attention_scaling)r   Modulero   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrb   setlayer_typesrope_init_fnsr  r  r   r  rs   clonesetattr)
rv   rb   r   r  rope_paramsr  rope_init_fnrope_init_fn_kwargscurr_inv_freqcurr_attention_scalings
             rS   ro   "Gemma4TextRotaryEmbedding.__init__  sW   
		4 "("@"@$*$B$B!v112SU)+**J++55jAK"(55	)C29=#CC-9z*)2NN:&-3"N--)~2M6G#N34@4dPc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST) +rR   )rb   r  r  r  r  r  NN)rF   rG   rH   rI   r>   ro   rQ   rD   rR   rS   r  r    s    U/ U UrR   r  c                   &  ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\R                  S	\R                  S-  S
\
\\\R                  \R                  4   4   S\S-  S\\   S\\R                  \R                  S-  4   4S jjrSrU =r$ )Gemma4TextAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperrb   r   c                    > [         TU ]  5         [        US5      (       a  UR                  U   OS U l        Xl        X l        U R                  S:H  U l        U R                  (       a  UR                  OS U l        U R                  (       d  UR                  (       a  UR                  OUR                  U l
        UR                  =(       a    U R                  (       + U l        U R                  (       a  UR                  OUR                  nUR                  U-  U l        SU l        U R
                  R$                  U l        UR&                  S:g  U l        U R
                  R*                  [-        U R
                  SS5      -
  nX$s=:  =(       a    S:  Os  U l        UR                  S U nU R.                  (       + =(       a6    U[1        U5      S-
  US S S2   R3                  UR                  U   5      -
  :H  U l        [6        R8                  " UR:                  UR                  U R                  -  UR<                  S	9U l        [A        U R                  URB                  S
9U l"        U R.                  (       d  [A        U R                  URB                  S
9U l#        [A        U R                  URB                  SS9U l$        [6        R8                  " UR:                  X0R                  -  UR<                  S	9U l%        U R                  (       d6  [6        R8                  " UR:                  X0R                  -  UR<                  S	9OS U l&        [6        R8                  " UR                  U R                  -  UR:                  UR<                  S	9U l'        g )Nr  sliding_attentionr   r  r  r   r;   r   rg   )r   r  FrC  )(rn   ro   hasattrr  r  rb   r   r  r  r  r   attention_k_eq_vuse_alternative_attentionnum_global_key_value_headsr  r   num_key_value_groupsr  r  use_bidirectional_attentionr  r  r  r  lenindexstore_full_length_kvr   rq   r   attention_biasr   r   r  r  r  r  r   r   r  )rv   rb   r   r  r  prev_layersrw   s         rS   ro   Gemma4TextAttention.__init__  s   ;B6=;Y;Y&,,Y7_c"//-@@7;f33D6:oo&J`J`..flfufu)/)@)@)XEX&151O1OF--U[UoUo 	 %+$>$>BU$U!!%!>!>;;uD %)KK$A$AGDKKYoqrDs$s!"+"M"MA"M(()C*CD(,(?(?$? %/IQTU`QadeQehsbDi

%""9-
.R/ E/! ii : :T]] JQWQfQf
 $6;N;NO &&'DMMv?R?RSDK'6;N;N[`aDK))""$7--$GfNcNcDK
 55 		&,,.AMM.QX^XmXmn K ii&&68J8JQWQfQf
rR   Nry   r   r]   rC   past_key_valuesrZ  re   c                    UR                   S S n/ UQSPU R                  P7nUu  pU R                  U5      R                  U5      nU R	                  U5      n[        XU
SS9nUR                  SS5      nU R                  (       aG  X@R                     u  pUR                  UR                  5      nUR                  UR                  5      nOU R                  U5      R                  U5      nU R                  b   U R                  U5      R                  U5      OUnU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  U5      nUR                  SS5      nUb/  U R                  (       d  UR                  XU R                   5      u  pU R"                  (       a  X4X@R                  '   [$        R&                  " U R(                  R*                  [,        5      nU" U UUUU4U R.                  (       a  U R0                  OSU R2                  U R4                  S.UD6u  nnUR6                  " / UQSP76 R9                  5       nU R;                  U5      nUU4$ )Nr   r&   )r  r;   rl  )r  r  r  )r   r   r   r   r  r6   rM  r  r  r   r   r   r   r  r  updater   r3  r   r  rb   r  r7   r  r  r  r  r   r   r  )rv   ry   r   r]   rC   r7  rZ  r  r   r   r   r   r   r   r  r   r   s                    rS   r}   Gemma4TextAttention.forward  s>    $))#2.88b8$--8&{{=166|D{{<0+LsRST#--a3
 ""'7'H$J#|':':;J'??<+>+>?L]388FJLPKKLc4;;}5::<HisLZ0J-jsRSTJ#--a3J;;|4L'11!Q7L&t/F/F'6'='=jX\XfXf'g$J$$0:0H__-(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((rR   )r  rb   r   r  r  r  r  r   r   r  r/  r  r  r   r  r  r3  r-  r  r   r{   )rF   rG   rH   rI   rJ   r>   r   ro   rN   rO   rK   rL   rM   r   r   r   r}   rQ   r   r   s   @rS   r(  r(    s    G/
/ /
C /
n )-=)||=) #\\=) t+	=)
 sE%,,*D$EEF=) =) -.=) 
u||U\\D00	1=) =)rR   r(  c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )Gemma4TextExpertsiB  rb   c                    > [         TU ]  5         UR                  U l        UR                  U l        [
        UR                     U l        g r{   )rn   ro   num_expertsmoe_intermediate_sizeintermediate_dimr   hidden_activationr*  r.  s     rS   ro   Gemma4TextExperts.__init__C  s<    !-- & < <V556rR   )r*  r@  r>  )rF   rG   rH   rI   r>   ro   rQ   r   r   s   @rS   r<  r<  B  s    7/ 7 7rR   r<  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                  \R                  4   4S jr	Sr
U =r$ )Gemma4TextRouteriJ  rb   c                 $  > [         TU ]  5         Xl        UR                  U l        U R                  S-  U l        UR
                  U l        [        U R                  U R                  SS9U l        [        R                  " UR                  UR                  SS9U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   FrC  rg   )rn   ro   rb   r   scalar_root_sizer  r  r   r  r   rq   r>  projr   rN   rc  scaleper_expert_scaler.  s     rS   ro   Gemma4TextRouter.__init__K  s    !-- $ 0 0$ 6&&!$"2"2US	IIf00&2D2D5Q	\\%**T-=-=">?
 "UZZ8J8J-K LrR   ry   re   c                 ^   U R                  U5      nXR                  -  U R                  -  nU R                  U5      n[        R
                  R                  USS9n[        R                  " UU R                  R                  SS9u  pEXDR                  SSS9-  nX@R                  U   -  nX4U4$ )Nr   r   )r  r   Tr  )r  rH  rF  rG  r   r   r   rN   topkrb   top_k_expertssumrI  )rv   ry   expert_scoresrouter_probabilitiestop_k_weightstop_k_indexs         rS   r}   Gemma4TextRouter.forwardW  s    		-0%

2T5J5JJ		-0!}}44]4K &+ZZ kk''&
" 	**r4*@@ &(=(=k(JJ#K??rR   )rb   r  r   r  rI  rG  rF  rH  )rF   rG   rH   rI   r>   ro   rN   rO   rM   r}   rQ   r   r   s   @rS   rD  rD  J  sD    
M/ 
M@U\\ @eELL%,,<V6W @ @rR   rD  c                   @  ^  \ rS rSrS\\-  S\4U 4S jjr      SS\R                  S\R                  S\
\\\R                  \R                  4   4   S-  S	\R                  S
\R                  S-  S\R                  S-  S\S-  S\R                  4S jjrSrU =r$ )Gemma4TextDecoderLayerin  rb   r   c                 z  > [         TU ]  X5        [        XS9U l        [	        X5      U l        U R                  S[        R                  " S5      5        UR                  U l	        U R                  (       a  [        UR                     U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [%        U R                  UR&                  S9U l        UR*                  U l        U R*                  (       a  [-        U5      U l        [1        U5      U l        [%        U R                  UR&                  S9U l        [%        U R                  UR&                  S9U l        [%        U R                  UR&                  S9U l        g g )Nr  layer_scalarr;   Frg   r  )rn   ro   r(  rT  r  r  rs   rN   rc  hidden_size_per_layer_inputr   rA  r*  r   rq   r   per_layer_input_gateper_layer_projectionr   r  post_per_layer_input_normenable_moe_blockrD  routerr<  expertspost_feedforward_layernorm_1post_feedforward_layernorm_2pre_feedforward_layernorm_2r   s      rS   ro   Gemma4TextDecoderLayer.__init__o  sR   +,FP 3^UZZ];+1+M+M(++ !9!9:DK(*		$2B2BDDdDdkp(qD%(*		$2R2RTXTdTdkp(qD%-:4;K;KQWQdQd-eD* & 7 7  *62DK,V4DL0=d>N>NTZTgTg0hD-0=d>N>NTZTgTg0hD-/<T=M=MSYSfSf/gD, !rR   Nry   per_layer_inputrC   r   r]   r   r7  re   c           
      (   Un	U R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nUn	U R                  U5      nU R	                  U5      nU R
                  (       a  U R                  U5      nU	R                  SU	R                  S   5      nU R                  U5      u  pnU R                  U5      nU R                  XU5      nUR                  U	R                  5      nU R                  U5      nX-   nU R                  U5      nX-   nU R                  (       aN  Un	U R                  U5      nU R!                  U5      nX-  nU R#                  U5      nU R%                  U5      nX-   nXR&                  -  nU$ )N)ry   r   r]   rC   r   r7  r   rD   )r  rT  r  r  r  r]  r`  r   r   r^  rb  r_  ra  r  rY  rZ  r*  r[  r\  rW  )rv   ry   rd  rC   r   r]   r   r7  rZ  r3  r   hidden_states_1hidden_states_flatrQ  rR  hidden_states_2s                   rS   r}   Gemma4TextDecoderLayer.forward  s    !,,];>> 
' 3)-%+
 
 55mD 0 66}E/  "??NO "*!1!1"hnnR6H!I,0KK8J,K)Ak">>?QRO"ll?WO-55hnnEO"??PO ,=M77F 0++$H 55mDM KK6M);M 55mDM ::=IM$4M***rR   )r*  r]  r_  rY  r  rZ  r[  r`  ra  r\  rb  r^  rT  )NNNNNN)rF   rG   rH   rI   r>   r?   r   ro   rN   rO   rK   rL   rM   r  r   r}   rQ   r   r   s   @rS   rU  rU  n  s    h/2DD hQT h0 )-PT,0.204(,9||9 9 sE%,,*D$EEFM	9
 #\\9 t+9 &&-9 9 
9 9rR   rU  c                       \ rS rSrSrg)Gemma4TextScaledWordEmbeddingi  rD   Nr   rD   rR   rS   rk  rk    r   rR   rk  c                   R    \ rS rSr/ SQrSrSr\R                  " 5       S 5       r	Sr
g)Gemma4PreTrainedModeli  )rU  r  r^  rP  )imagetextvideoaudioNc                 z
   [         R                  " U5        [        U[        5      (       a!  [        R
                  " UR                  5        g [        U[        5      (       a  SnSnUR                  S-  n[        R                  " X2-  5      [        US-
  S5      -  nU[        R                  " [        R                  " U5      U* -  5      -  n[        R                  " UR                   UR#                  S5      R#                  S5      5        g [        U[$        5      (       aL  [        R&                  " UR(                  UR*                  5        [        R,                  " UR.                  5        g [        U[0        5      (       a  UR2                  R5                  5        H  u  pxSU0n	US:X  a  UR6                  U   S:X  a  S	U	S
'   U" UR8                  40 U	D6u  p[        R                  " [;        X S35      U
5        [        R                  " [;        X S35      U
5        M     g [        U[<        5      (       a  UR6                  S:w  a  [>        UR6                     OUR@                  nU" UR8                  5      u  p[        R                  " URB                  U5        [        R                  " URD                  U5        g [        U[F        5      (       a,  [        R&                  " URH                  URJ                  5        g [        U[L        5      (       aA  [        R
                  " URN                  5        [        R
                  " URP                  5        g [        U[R        5      (       aW  U R8                  RT                  n[        RV                  " URX                  SUS9  [        RV                  " URZ                  SUS9  g [        U[\        5      (       a!  [        R
                  " UR^                  5        g [        U[`        5      (       a  URb                  (       a  [        R&                  " URd                  [g        S5      * 5        [        R&                  " URh                  [g        S5      5        [        R&                  " URj                  [g        S5      * 5        [        R&                  " URl                  [g        S5      5        g [        U[n        5      (       a]  UR8                  Rp                  (       aA  [        R,                  " URr                  5        [        R
                  " URt                  5        g g g )Nr   r   r&   r;   r   r  r  r  r  r  r  r  r  rl  )meanstdrj   );r   _init_weightsr  r^  initones_rd  r   r   r   r   r   rN   r   r   copy_r   r   r   	constant_r   r   zeros_r   r  r  itemsr  rb   r  r  r   r  r  original_inv_freqrk  embed_scalescalar_embed_scalerD  rH  rI  r<  initializer_rangenormal_gate_up_projr  rU  rW  r`   rp   ri   ru   rk   rl   rm   Gemma4VisionModelstandardizestd_bias	std_scale)rv   moduler   r   r   r   r   r  r!  r"  r#  r   rope_fnbuffer_valuert  s                  rS   ru  #Gemma4PreTrainedModel._init_weights  s   %%f-f788JJv667 @AAM#M#//14N&*hh}/L&MPSTbefTfhiPj&j#*UYYu||N7SWnVn7n-ooNJJv,,n.F.Fq.I.S.STU.VW 455NN6>>6+K+KLKK,,- 9::,2,@,@,F,F,H(
'3Z&@#!11f6F6Fz6RVd6d:K'7#/#UAT#U 

76\+CDmT

76\9K+LM}] -I  ;<< ##y0 $F$4$45;; 
 &fmm4OLJJv5JJv//> =>>NN6--v/H/HI 011JJv||$JJv../ 122++//CLL,,3C@LL))= 677JJv**+ 5666;U;UNN6++eEl];NN6++U5\:NN6,,uU|m<NN6,,eEl; 122v}}7P7PKK(JJv''( 8Q2rR   rD   )rF   rG   rH   rI   _no_split_modulesinput_modalities_can_record_outputsrN   r   ru  rQ   rD   rR   rS   rm  rm    s0     ;
]]_2) 2)rR   rm  zAThe base Gemma 4 language model without a language modeling head.custom_introc                      ^  \ rS rSr% \\S'   \" \SS9\\	S.r
S\4U 4S jjrS\R                  S-  S	\R                  S-  S
\R                  4S jr SS	\R                  S\R                  S-  S
\R                  4S jjr\\\       SS\R&                  S-  S\R                  S-  S\R&                  S-  S\S-  S	\R*                  S-  S\R                  S-  S\S-  S\\   S
\4S jj5       5       5       rSrU =r$ )Gemma4TextModeli  rb   r   )r2  )router_logitsry   
attentionsc           
        > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        U5      U l	        [        U R                  R                  5      U l        UR                  U l        U R                  (       a  [        UR                   UR
                  UR                  -  U R"                  UR                  S-  S9U l        SU l        [        R(                  " UR*                  UR
                  UR                  -  SS9U l        UR*                  S-  U l        [1        UR                  UR2                  S9U l        / U l        [9        U R                  5       HT  u  p4UR:                  R<                  (       d  M"  U R6                  R?                  S Vs/ s H  nS	U S
U 3PM     sn5        MV     g s  snf s  snf )Nrv  )r}  g;f?Frg   r   rX  )r   r   r  r  zlayers.z.self_attn.) rn   ro   r   r  r  r  rU  r  r  r  r  rb   r  unique_layer_typesrY  rk  vocab_size_per_layer_inputpadding_idxembed_tokens_per_layerper_layer_input_scalerq   r   per_layer_model_projection per_layer_model_projection_scaler   r  per_layer_projection_norm"_keys_to_ignore_on_load_unexpected	enumeraterT  r  extend)rv   rb   r   r  layernamerw   s         rS   ro   Gemma4TextModel.__init__  s    mmHMfNfNfHghHg9#F6Hgh
 4F;"%dkk&=&=">
 ,2+M+M(++*G11((6+M+MM  ">>C	+D' *3D&.0ii""((6+M+MM/D+
 5;4F4F4LD1-:6;];]cicvcv-wD* 35/!$++.HA11177>>@hi@hwqcTF3@hi /7 i< js   G'	G,
	input_idsNr  re   c                    U R                   (       d  [        SU R                   35      eUc  [        R                  " 5          USS2SS2SSS24   U R
                  R                  SSSS2SS24   U R                  R                  S-  -  :H  R                  SS9R                  5       SS2S4   n UR                  UR                  SS 5      n SSS5        U R                  U5      R                  " / UR                  QU R                  R                  PU R                   P76 $ ! [         a    [        S5      ef = f! , (       d  f       Nt= f)a  Compute the token-identity component of Per-Layer Embeddings (PLE).

Looks up `input_ids` in `embed_tokens_per_layer` (a scaled embedding that multiplies
by `sqrt(hidden_size_per_layer_input)`) and reshapes the packed output from
`[batch, seq, num_hidden_layers * hidden_size_per_layer_input]` to
`[batch, seq, num_hidden_layers, hidden_size_per_layer_input]`.

If only `inputs_embeds` is provided (no `input_ids`), reverses the main embedding
to recover `input_ids` for the PLE lookup.
z}Attempting to call get_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. Nrv  r	   r   r&   a)  It seems like you tried to call `forward` from `inputs_embeds` without providing `input_ids`, and that the `inputs_embeds` you provided do not exactly match the embedding weights. Since Gemma4 needs to reverse the embedding to compute another embedding, make sure you provide exact `inputs_embeds`)rY  RuntimeErrorrb   rN   r   embed_tokensr  r   r  nonzeror   r   r  r   r  )rv   r  r  s      rS   get_per_layer_inputs$Gemma4TextModel.get_per_layer_inputs4  sK    //**.++8   &aD!m4,,33D$14DEH_H_adHdde SQSZWYq!t%  )}/B/B2A/F GI !$ **95== 
__
KK))
 ,,
 	
 $ &r  !s   A.D>1D%%D;;D>>
Eper_layer_inputsc                 `   U R                   (       d  [        SU R                   35      eU R                  U5      U R                  -  nUR
                  " / UR                  SS QU R                  R                  PU R                   P76 nU R                  U5      nUc  U$ X2-   U R                  -  $ )aL  Compute the context-aware component of PLE and combine with token-identity.

Projects `inputs_embeds` through `per_layer_model_projection` (Linear), scales by
`1/sqrt(hidden_size)`, reshapes to `[batch, seq, num_layers, ple_dim]`, and normalizes
with `per_layer_projection_norm` (RMSNorm).

If `per_layer_inputs` (the token-identity component from `get_per_layer_inputs()`)
is provided, combines both: `(context_projection + token_identity) * (1/sqrt(2))`.
If `per_layer_inputs` is None (e.g. for multimodal inputs where input_ids are not
available), returns just the context projection.
zAttempting to call project_per_layer_inputs() from a model initialized with a config that does not support per-layer embeddings. Nr   )
rY  r  rb   r  r  r   r   r  r  r  )rv   r  r  r[  s       rS   project_per_layer_inputs(Gemma4TextModel.project_per_layer_inputs`  s      //226++@ 
  $>>}MPTPuPuu3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''$74;U;UUUrR   r]   r   r7  	use_cacherZ  c           
      d   USL USL-  (       a  [        S5      eUb  Ub  [        S5      eUb  U R                  U5      nU R                  (       a%  Uc  U R                  X5      nU R	                  XV5      nU(       a  Uc  [        U R                  S9nUcU  Ub  UR                  5       OSn	[        R                  " UR                  S   UR                  S9U	-   nUR                  S5      n[        U=n
[        5      (       d)  U R                  UUUUS.n[        S0 UD6[!        S0 UD6S	.n
Un0 nU R"                   H  nU R%                  XU5      X'   M     UR'                  S
[)        5       5      n[+        U R,                  SU R                  R.                   5       H\  u  nnUb  USS2SS2USS24   OSnU" UU4UXR                  R0                  U      XR                  R0                  U      UUS.UD6nM^     U R3                  U5      n[5        UUUR7                  SS5      (       a  US9$ SS9$ )a3  
per_layer_inputs (`torch.Tensor`, *optional*):
    Pre-computed per-layer input text embeddings of shape `(batch_size, sequence_length, num_hidden_layers,
    hidden_size_per_layer_input)`. When provided, these are used directly instead of being computed from `input_ids`
    via `get_per_layer_inputs()` in the text model. If calling the `forward` with `inputs_embeds` instead of `input_ids`,
    you should probably precompute them and forward them along `inputs_embeds`, otherwise recomputing them needs
    to reverse the main embedding, which is expensive.
N:You must specify exactly one of input_ids or inputs_embeds<You cannot specify per_layer_inputs if input_ids is provided)rb   r   r;   r   rb   r  r]   r7  r   )r  r*  rC   )rC   r   r]   r   r7  return_shared_kv_statesF)r  r7  rC   rD   )r  r  rY  r  r  r   rb   get_seq_lengthrN   r   r   r   r   r  rK   r   r   r  r  popr   r  r  r  r  r  rX   get)rv   r  r]   r   r7  r  r  r  rZ  past_seen_tokenscausal_mask_mappingmask_kwargsry   r   r  rC   r  r   rd  s                      rS   r}   Gemma4TextModel.forward  sk   , -t";<YZZ %5%A[\\  --i8M++'#'#<#<Y#V #<<]]0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%F%U%U# & 11J.2oom[e.f+ 2 "::&8(*E !*$++6U8U8U*V WA}>N>Z.q!Qz:`dO)	 "2$78O8OPQ8R$S2;;3J3J13MN) /	 	M !X 		-0,++17<UW\1]1]-
 	
 dh
 	
rR   )
r  r  rY  r  r  r  r  r  r  r  r{   )NNNNNNN)rF   rG   rH   rI   r>   rP   r$   rD  rU  r(  r  ro   rN   rO   r  r  r"   r%   r   r  r   r  boolr   r   rX   r}   rQ   r   r   s   @rS   r  r    s   '(8B/)"/ "H*
ellT.A *
RWR^R^aeRe *
jojvjv *
^ 15!V||!V  ,,-!V 
	!VF   .2.204(,2604!%Y
##d*Y
 t+Y
 &&-	Y

 Y
 ((4/Y
  ,,-Y
 $;Y
 +,Y
 
'Y
    Y
rR   r  z>The base Gemma 4 language model with a language modeling head.c                   @   \ rS rSrSr\\         SS\R                  S-  S\R                  S-  S\R                  S-  S\
S-  S\R                  S-  S	\R                  S-  S
\S-  S\\R                  -  S\R                  S-  S\\   S\4S jj5       5       rSrg)Gemma4ForCausalLMi  modelNr  r]   r   r7  r  labelsr  logits_to_keepr  rZ  re   c
                 8   U R                   " SUUUUUU	US.U
D6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bF  XR                  R                  -  n[        R                  " U5      nXR                  R                  -  nSnUb  U R                  " XU R                  40 U
D6n[        UUUR                  UR                  UR                  UR                   S9$ )aU  
per_layer_inputs (`torch.Tensor`, *optional*):
    Pre-computed per-layer input text embeddings of shape `(batch_size, sequence_length, num_hidden_layers,
    hidden_size_per_layer_input)`. When provided, these are used directly instead of being computed from `input_ids`
    via `get_per_layer_inputs()` in the text model. If calling the `forward` with `inputs_embeds` instead of `input_ids`,
    you should probably precompute them and forward them along `inputs_embeds`, otherwise recomputing them needs
    to reverse the main embedding, which is expensive.

Example:

```python
>>> from transformers import AutoTokenizer, Gemma4ForCausalLM

>>> model = Gemma4ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```)r  r]   r   r7  r  r  r  N)losslogitsr7  ry   r  rC   rD   )r  r  r  r   slicelm_headrb   final_logit_softcappingrN   r   loss_function
vocab_sizerU   r7  ry   r  rC   )rv   r  r]   r   r7  r  r  r  r  r  rZ  outputsry   slice_indicesr  r  s                   rS   r}   Gemma4ForCausalLM.forward  s   P 26 	2
)%+'-	2
 	2
  118B>SV8W8W~ot4]kmA}a,?@A;;..:kkAAAFZZ'FkkAAAF%%fdooPPD+#33!//))$55
 	
rR   rD   )	NNNNNNNr   N)rF   rG   rH   rI   base_model_prefixr   r   rN   r  rO   r   r  r  r   r   r   rU   r}   rQ   rD   rR   rS   r  r    s    .2.204(,26*.!%-.04E
##d*E
 t+E
 &&-	E

 E
 ((4/E
   4'E
 $;E
 ell*E
  ,,-E
 +,E
 
&E
  E
rR   r  c                   8  ^  \ rS rSr% Sr\\S'   SrSr\	\
S.rS\4U 4S jjrS\R                  S	\R                  4S
 jr\\\" SS9 SS\R                  S\R                  S-  S\\   S	\\R                  \R,                  4   4S jj5       5       5       rSrU =r$ )Gemma4AudioModeli0  znAn audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.rb   r  zmodel.audio_towerry   r  c           	        > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        R                  " UR                  UR                  SS9U l        U R#                  5         g s  snf )NTrg   )rn   ro   rb   r  subsample_conv_projectionr   rel_pos_encr   r  r  r  rP  r  rq   r   output_proj_dimsoutput_proj	post_initr   s      rS   ro   Gemma4AudioModel.__init__;  s     )KF)S&;FCmmBGH`H`BabBaYf0Bab
 99V%7%79P9PW[\	 cs   B?mask_4dre   c                 ^   UR                   u  p#pCUR                  nU R                  R                  nU R                  R                  S-
  nU R                  R
                  nXF-   S-
  U-  n	X-  n
X-
  n[        R                  " USUSU4SS9nUR                  USXU
5      n[        R                  " XU4SS9n[        R                  " XS9U-  n[        R                  " Xg-   U-   US9nUSS2S4   USSS24   -   nUSSSS2SSS24   R                  USSUS5      nUR                  SU5      $ )z
Convert a standard 4D attention mask `[batch_size, 1, seq_len, seq_len]` to the 5D blocked format
`[batch_size, 1, num_blocks, chunk_size, context_size]` expected by the chunked local attention,
r;   r   F)valuer   Nr   )r   r   rb   r   r   r   r   r   r   rN   r   r  gather)rv   r  r   r   r   r   r   r   r   r   padded_seq_len
pad_amountmask_5dblock_startsoffsets
kv_indicess                   rS   _convert_4d_mask_to_blocked_5d/Gemma4AudioModel._convert_4d_mask_to_blocked_5dH  s:   
 %,MM!
w[[55
;;==A![[@@*Q.:=
#0#-
%%!ZJ!?uM//*aX%%4F!GuU||J>K,,z<?QQZ`a!!T'*WT1W-==
dAtQ 67>>z1bR\^`a
~~b*--rR   z&Encodes audio features to soft tokens.r  Nr]   rZ  c           	         U R                  X5      u  pEU R                  U5      n[        U R                  UU[	        U R                  R
                  S-
  U R                  R                  45      S9nUb  U R                  U5      nU R                  S U R                  R                    H  nU" U4UUS.UD6nM     U R                  U5      n[        XES9$ )Nr;   )rb   r  r]   and_mask_function)r]   r   )r  r]   )r  r  r   rb   r:   r   r   r  r  r  r  r[   )rv   r  r]   rZ  ry   output_maskr   encoder_layers           rS   r}   Gemma4AudioModel.forwardc  s     &*%C%CN%c""..}=2;;'&:33a79\9\]	
 %!@@PN![[)H4;;+H+HIM)-$7 	M J ((7%bbrR   )rb   r  r  r  r  r{   )rF   rG   rH   rI   rJ   r<   rP   main_input_namer  rP  r   r  ro   rN   rO   r  r"   r%   r   r   r   rM   r^   r}   rQ   r   r   s   @rS   r  r  0  s    x&O+)*
0 .ell .u|| .6  !IJ /3cc t+c +,	c
 
u||U---	.c K   crR   r  c                      ^  \ rS rSrSr\r\\S.r	S\4U 4S jjr
\\\" SS9S\R                  S	\R                   S
\\   S\4S j5       5       5       rSrU =r$ )r  i  zThe Gemma 4 Vision Encoder.r  rb   c                   > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        U R                  R                  (       at  U R                  S[        R                  " U R                  R                  5      5        U R                  S[        R                  " U R                  R                  5      5        U R                  5         g )Nr  r  )rn   ro   r^  patch_embedderr  encoderry  poolerrb   r  rs   rN   emptyr   r  r.  s     rS   ro   Gemma4VisionModel.__init__  s     7?*62(0;;""  U[[9P9P-QR  ekk$++:Q:Q.RSrR   z1Encodes image pixels to soft tokens from patches.r  rt  rf  rZ  re   c                    U R                   R                  nUR                  S   XD-  -  nUS:H  R                  SS9nU R	                  XU5      nU R
                  " SUU) US.UD6nU R                  UR                  UUUS9u  pX   n	U R                   R                  (       a7  XR                  R                  5       -
  U R                  R                  5       -  n	U	R                  UR                  5      n	[        U	S9$ )a  
pixel_values (`torch.FloatTensor` or `list[torch.FloatTensor]`):
    The images to encode. Either a single `[batch, channels, height, width]` tensor
    (all images same size) or a list of `[1, channels, height, width]` tensors (different sizes).
pixel_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`):
    The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
r   r   )r  r]   rf  )ry   rf  rg  r  r  rD   )rb   pooling_kernel_sizer   r  r  r  r  r  r  r  ru   r  r   r   r   )rv   rt  rf  rZ  r  r  rg  r  r  ry   pooler_masks              rS   r}   Gemma4VisionModel.forward  s     #kk==$**2.3F3\]/25::r:B++LN_` 
'--1
 	
 &*[[ 221/'	 &1 &
" &2 ;;""*]]-@-@-BBdnnFZFZF\\M%(()<)<=&GGrR   )r  r  r  )rF   rG   rH   rI   rJ   r?   rb   r  r  r  ro   r"   r%   r   rN   r  r  r   r   r   r}   rQ   r   r   s   @rS   r  r    s    %F1+

1 
  !TU)H'')H ",,)H +,	)H
 
!)H V   )HrR   r  c                   t   ^  \ rS rSrS\\-  S\4U 4S jjrS\R                  S\R                  4S jr
SrU =r$ )	Gemma4MultimodalEmbedderi  multimodal_configtext_configc                    > [         TU ]  X5        U ?U ?U ?U ?U ?U ?[        USUR                  5      U l
        [        U R                  U R                  SS9U l        g )Nr  FrC  )rn   ro   rm  hard_embedding_normsoft_embedding_normvocab_offsetr  embedding_post_projection_normr  r   multimodal_hidden_sizer   r  embedding_pre_projection_norm)rv   r  r  rw   s      rS   ro   !Gemma4MultimodalEmbedder.__init__  sn     	*8N$$O/&-.?ASUfUrUr&s#-:4;V;V\`\d\dqv-w*rR   r  re   c                 F    U R                  U5      nU R                  U5      $ )a  Embeds token ids or soft tokens for multimodal content into language model space.
Args:
    inputs_embeds: A torch.Tensor containing the soft tokens to embed.
Returns:
    A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
)r  embedding_projection)rv   r  embs_normeds      rS   r}    Gemma4MultimodalEmbedder.forward  s%     88G((55rR   )r  r  )rF   rG   rH   rI   r<   r?   r>   ro   rN   rO   r}   rQ   r   r   s   @rS   r  r    sC    x,/AAx &x$6U\\ 6ell 6 6rR   r  token_type_idsimage_group_idsc           
      \   ^ U c  gS[         S[         S[         S[         S[        4
U4S jjnU$ )z
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
N	batch_idxhead_idxq_idxkv_idxre   c                    > T	R                   S   nUR                  US-
  S9nUR                  US-
  S9nT	X4   nT	X4   n[        R                  " X$:  US5      n[        R                  " X4:  US5      nXx:H  US:  -  $ )Nr   r;   )r   r   )r   r|   rN   rn  )
r  r  r	  r
  r   q_idx_clampedkv_idx_clampedq_groupkv_groupr  s
            rS   
inner_mask0token_type_ids_mask_function.<locals>.inner_mask  s    $**2.
 
Q7*q.9 ")":;"9#<=++e0'2>;;v2HbA#155rR   )r   r  )r  r  r  s    ` rS   token_type_ids_mask_functionr    s>     6c 6S 6 6c 6d 6 rR   mm_token_type_idsr   c                     U R                  U5      n U S:H  U S:H  -  n[        R                  " USSS9nSUS'   X#) -  n[        R                  " UR	                  5       SS9S-
  n[        R
                  " X%S5      nU$ )Nr;   r&   r   )shiftsdimsFrj  r   )r   rN   rollcumsumr   rn  )r  r   	is_visionis_prev_visionnew_vision_startsvision_group_idsblock_sequence_idss          rS   get_block_sequence_ids_for_maskr    s    ),,V4"a',=,BCIZZ	!"=N"N6!O3||$5$9$9$;CaGY"ErR   z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c            $         ^  \ rS rSrS\4U 4S jjrS rS r\\	" SS9 S!S	\
R                  S
\
R                  S-  S\\   S\4S jj5       5       r\\	" SS9 S!S\
R                  S\
R                  S-  S\\   S\4S jj5       5       r  S"S\
R                  S-  S\
R                  S-  S\\
R&                  \
R&                  \
R&                  4   4S jjr\\\	              S#S\
R                  S-  S	\
R                  S-  S\
R                  S-  S\
R                  S-  S\
R,                  S-  S\
R,                  S-  S\
R                  S-  S\S-  S\
R                  S-  S\
R                  S-  S\S-  S
\
R                  S-  S\
R                  S-  S\
R,                  S-  S\\   S\4 S jj5       5       5       r\\	" SS9S\
R,                  S\
R,                  S\\   S\\-  4S j5       5       rS rU =r$ )$Gemma4Modeli  rb   c                   > [         TU ]  U5        UR                  b   [        R                  " UR                  5      OS U l        UR                  b   [        UR                  UR                  5      OS U l        UR                  b   [        R                  " UR                  5      OS U l
        UR                  b&  [        UR                  UR                  5      U l        g S U l        g r{   )rn   ro   vision_configr'   from_configvision_towerr  r  embed_visionaudio_configaudio_towerembed_audior.  s     rS   ro   Gemma4Model.__init__  s     KQK_K_KkI11&2F2FGqu ##/ %V%9%96;M;MN 	
 JPI\I\Ih9001D1DEnr "". %V%8%8&:L:LM 	  	rR   c                 .    U R                   R                  $ r{   language_modelr  rv   s    rS   get_per_layer_input_embeddings*Gemma4Model.get_per_layer_input_embeddings(  s    ""999rR   c                 $    XR                   l        g r{   r+  rv   r  s     rS   set_per_layer_input_embeddings*Gemma4Model.set_per_layer_input_embeddings+  s    5:2rR   zOProjects the last hidden state from the vision model into language model space.r  Nrt  image_position_idsrZ  re   c                 p    U R                   " SUUS.UD6nUR                  nU R                  US9Ul        U$ )z
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    The patch positions as (x, y) coordinates in the image. Padding patches are indicated by (-1, -1).
rt  rf  r  rD   )r$  r  r%  pooler_output)rv   rt  r4  rZ  vision_outputsr  s         rS   get_image_featuresGemma4Model.get_image_features.  sT     ** 
%1
 

 +<<'+'8'8GX'8'Y$rR   zQProjects the last hidden state from the vision encoder into language model space.pixel_values_videosvideo_position_idsc                     UR                  SS5      nUR                  SS5      nU R                  " SUUS.UD6nUR                  nU R                  US9Ul        U$ )a  
video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
    2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
r   r;   r6  r7  rD   )flattenr$  r  r%  r8  )rv   r<  r=  rZ  r9  r  s         rS   get_video_featuresGemma4Model.get_video_featuresC  sz     299!Q?/771=** 
,1
 

 +<<'+'8'8GX'8'Y$rR   r  r  c           	         UbJ  XR                   R                  :H  nXR                   R                  :H  nXR                   R                  :H  nGO8UU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nUU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nUU R	                  5       " [
        R                  " U R                   R                  [
        R                  UR                  S95      :H  R                  S5      nX4U4$ )a;  
Obtains mask for multimodal placeholders (replaced by soft tokens) and hard text tokens.

Masks will be obtained from `mm_token_type_ids`, `input_ids`, or `inputs_embeds` as available and in that
precedence order. If passing `input_ids` or `inputs_embeds`, the image mask will be derived using
`config.image_token_id`. Same goes for audio and video masks

Args:
    input_ids: A tensor containing the hard token IDs from the text tokenizer.
    inputs_embeds: A tensor containing the embeddings for all hard text tokens.

Returns:
    image_mask, video_mask, audio_mask
)r   r   r   )
rb   image_token_idvideo_token_idaudio_token_idget_input_embeddingsrN   rt   r  r   r  )rv   r  r  special_image_maskspecial_video_maskspecial_audio_masks         rS   get_placeholder_mask Gemma4Model.get_placeholder_mask[  sA   &  !*kk.H.H!H!*kk.H.H!H!*kk.H.H!H ,,.LL!;!;5::VcVjVjk c"g  ,,.LL!;!;5::VcVjVjk c"g  ,,.LL!;!;5::VcVjVjk c"g  "7IIIrR   r  r]   r  r   r7  r  r  r  c                    USL U
SL-  (       a  [        S5      eUb  Ub  [        S5      eU R                  X5      u  nnnUU-  U-  nSnU
c\  UR                  5       n[        R                  " UU R
                  R                  R                  U5      nU R                  5       " U5      n
Uc  U R
                  R                  5       R                  (       a  U R                  R                  R                  U R
                  R                  R                  SS24   nUR                  U
R                  5      n[        R                  " US   UR!                  SSS5      U
5      nU R                  R#                  UU5      nUGb  U R%                  X,SS9R&                  nUR                  U
R                  U
R(                  5      nUR+                  5       nUR-                  S5      R/                  U
5      R                  U
R                  5      n[1        U
U   R3                  5       UR3                  5       :H  S	U S
UR4                  S    35        U
R7                  UR                  U
R                  5      UR                  U
R                  5      5      n
UGb  U R9                  X=SS9R&                  nUR                  U
R                  U
R(                  5      nUR+                  5       nUR-                  S5      R/                  U
5      R                  U
R                  5      n[1        U
U   R3                  5       UR3                  5       :H  SU S
UR4                  S    35        U
R7                  UR                  U
R                  5      UR                  U
R                  5      5      n
UGb(  UGb$  U R;                  XFSS9nUR&                  nUR<                  nUUR                  UR                  5         nUR+                  5       nUR-                  S5      R/                  U
5      R                  U
R                  5      n[1        U
U   R3                  5       UR3                  5       :H  SU S
UR4                  S   UR4                  S   -   35        U
R7                  UR                  U
R                  5      UR                  U
R                  5      5      n
UcU  Ub  UR?                  5       OSn[        R@                  " U
R4                  S   U
R                  S9U-   nUR-                  S5      n[C        U=n [D        5      (       d  U R
                  R                  5       U
UUUS.n!U R
                  R                  5       RF                  S:X  aN  [        RH                  " / U
RK                  5       SS QSU
R                  S9n"U	b  [M        XR                  S9n"U"U!S'   [O        S0 U!D6n U R                  " SUU UUU
USS.UD6n#[Q        U#RR                  U#RT                  U#RV                  U#RX                  Ub  WOSUb  WOSU#RZ                  S9$ )  
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
video_position_ids (`torch.LongTensor` of shape `(num_videos, num_frames, max_patches, 2)`, *optional*):
    2D patch position coordinates from the video processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
per_layer_inputs (`torch.Tensor`, *optional*):
    Pre-computed per-layer input text embeddings of shape `(batch_size, sequence_length, num_hidden_layers,
    hidden_size_per_layer_input)`. When provided, these are used directly instead of being computed from `input_ids`
    via `get_per_layer_inputs()` in the text model. If calling the `forward` with `inputs_embeds` instead of `input_ids`,
    you should probably precompute them and forward them along `inputs_embeds`, otherwise recomputing them needs
    to reverse the main embedding, which is expensive.
Nr  r  r   r;   r   T)return_dictz6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: z6Audio features and audio tokens do not match, tokens: r   r  visionr  )r  r]   r   r7  r  r  rN  )r  r7  ry   r  image_hidden_statesaudio_hidden_statesrC   rD   ).r  rJ  r  rN   rn  rb   r  pad_token_idrF  get_text_configrY  r,  r  r  r   r   r   r  r:  r8  r   rN  r   	expand_asr    numelr   masked_scatterr@  get_audio_featuresr]   r  r   r  rK   r0  fullsizer  r   rA   r  r7  ry   r  rC   )$rv   r  rt  r<  r  r]   r  r   r7  r  r  r  r4  r=  r  rZ  
image_mask
video_mask
audio_maskmultimodal_maskllm_input_idspad_embeddingllm_inputs_embedsimage_featuresn_image_tokensvideo_featuresn_video_tokensaudio_outputaudio_featuresaudio_mask_from_encodern_audio_tokensr  r  r  r  r  s$                                       rS   r}   Gemma4Model.forward  s   J -t";<YZZ %5%A[\\-1-F-Fy-`*
J
$z1J>  %OO-M!KK9P9P9]9]_lmM 557FM#(C(C(E(a(a //<<CCDKKD[D[DhDhjkDklM-001E1EFO %OI,FHZHZ[\^_acHdfs t#22GGWhi #!44\cg4hvvN+..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M *!44#T 5 m  ,..}/C/C]EXEXYN (^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+,. *88m223^5F5F}G[G[5\M
 %*=*I22>dh2iL)77N&2&A&A#
 ,,C,F,F~G\G\,]^N'^^-N#--b1;;MJMMmNbNbcJ"j)//1^5I5I5KKHHX Y"((+n.B.B1.EEFH *88m223^5F5F}G[G[5\M
 CRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L?-FF++557!."0#2 ,K {{**,HHHT%*ZZ0L-2D2D2Fs2K0LbYfYmYm%n"$0)H)2F2F*& 5G01 #<"Jk"J%% 	
-.%+'	
 	
 )%77#33!//))2>2JPT2@2LRV$55
 	
rR   zPProjects the last hidden state from the audio encoder into language model space.c                     U R                   c  [        S5      eU R                   " X4SS0UD6nU R                  UR                  S9Ul        U$ )a  
input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
    The tensors corresponding to the input audio.
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
zAudio features were requested, but the model was initialized without an audio_config. Cannot process audio without an audio tower and audio embedder.rN  Tr7  )r'  r  r(  r  r8  )rv   r  r  rZ  audio_outputss        rS   rW  Gemma4Model.get_audio_features1  sc     #R 
 ((iZ^ibhi&*&6&6]EdEd&6&e#rR   )r'  r(  r%  r$  r{   r&  )NNNNNNNNNNNNNN)rF   rG   rH   rI   r=   ro   r.  r2  r   r   rN   r  r  r   r   r   r:  r@  rM   r^   rJ  r"   rO   r   r  rA   r}   r[   rW  rQ   r   r   s   @rS   r   r     s/   
| 
:; !rs 7;'' ",,t3 +,	
 
$ t & !tu 7;".. ",,t3 +,	
 
$ v 0 .226+J##d*+J ((4/+J 
u!1!153C3CC	D	+JZ   .2158<37.23704(,5926!%6:6:04d
##d*d
 ''$.d
 #..5	d

 ))D0d
 t+d
 #\\D0d
 &&-d
 d
 !++d2d
 ((4/d
 $;d
 ",,t3d
 ",,t3d
  ,,-d
  +,!d
" 
##d
    d
L !st #\\ +,	
 
'	' u rR   r   z
    The base Gemma 4 model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            %       N  ^  \ rS rSrSrS rS r                SS\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\\R                  -  S\R                  S-  S\\   S\4$S jjr\ S S\R                  S\R                  S-  S\\   4S jj5       r\  S!S\S\R                  S
\R                  S-  S\S-  S\R                  S-  S\R                  S-  S\S-  S\4S jj5       r             S"U 4S jjrSrU =r$ )#Gemma4ForConditionalGenerationiK  r  c                 6    U R                   R                  5       $ r{   )r  r.  r-  s    rS   r.  =Gemma4ForConditionalGeneration.get_per_layer_input_embeddingsT  s    zz88::rR   c                 :    U R                   R                  U5        g r{   )r  r2  r1  s     rS   r2  =Gemma4ForConditionalGeneration.set_per_layer_input_embeddingsW  s    

11%8rR   Nr  rt  r<  r  r]   r  r   r4  r=  r7  r  r  r  r  r  r  rZ  re   c                    U R                   " S0 SU_SU_SU_SU_SU_SU_SU_SU
_S	U_S
U_SU_SU_SU_SU_SU	_SS_UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  5       R                  =nb   UU-  n[        R                  " U5      nUU-  nSnUb6  U R                  " UXR                  R                  5       R                  40 UD6n[        UUUR                  UR                  UR                   UR"                  UR$                  UR&                  S9$ )rM  r  rt  r<  r  r]   r  r   r7  r  r  r  r  r  r4  r=  rN  TN)r  r  r7  ry   r  rP  rQ  rC   rD   )r  r  r  r   r  r  rb   rS  r  rN   r   r  r  rU   r7  ry   r  rP  rQ  rC   )rv   r  rt  r<  r  r]   r  r   r4  r=  r7  r  r  r  r  r  r  rZ  r  ry   r  r  r  r  s                           rS   r}   &Gemma4ForConditionalGeneration.forwardZ  s   H ** 

%
 !4
 *	

 *
 !4
 &
 ,
 0
 (
 .
 
  
  2
  2
  #
(  118B>SV8W8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F%%ffkk6Q6Q6S6^6^ibhiD+#33!//)) ' ; ; ' ; ;$55	
 		
rR   c                 <    U R                   R                  " X40 UD6$ )a  
image_position_ids (`torch.LongTensor` of shape `(batch_size, max_patches, 2)`, *optional*):
    2D patch position coordinates from the image processor, with `(-1, -1)` indicating padding.
    Passed through to the vision encoder for positional embedding computation.
)r  r:  )rv   rt  r4  rZ  s       rS   r:  1Gemma4ForConditionalGeneration.get_image_features  s     zz,,\XQWXXrR   rb   is_first_iterationc                    U R                  5       UUUUS.n[        U R                  5       SS 5      S:X  aM  [        R                  " / UR	                  5       S S QSUR
                  S9n	Ub  [        XQR
                  S9n	XS'   [        S0 UD6$ )Nr  r0  rO  r   r   r  rD   )rS  r  rN   rX  rY  r   r  r   )
rb   r  r]   r7  r   r  rw  rZ  r  r  s
             rS   r   8Gemma4ForConditionalGeneration.create_masks_for_generate  s     ,,.*,.(
 6))+-JDQU]]!&,Hm.@.@.B3B.G,H"UbUiUi!j ,%DEV_s_s%t"0B,-(7;77rR   c                    > [         TU ]  " U4UUUUUUU
US.UD6nU(       d  U(       d  UUS'   UUS'   UUS'   U	US'   OS US'   U(       d  UR                  SS 5      nU$ )N)r7  r  r]   r   r  r  r  rw  rt  r<  r  r  r  r  )rn   prepare_inputs_for_generationr  )rv   r  r7  r  r   rt  r<  r  r]   r  r  r  r  r  rw  rZ  model_inputsr   rw   s                     rS   r{  <Gemma4ForConditionalGeneration.prepare_inputs_for_generation  s    & w<
+')%))1
 
 Y+7L(2EL./-;L)*2EL./ 15L,- "  !3T:ArR   rD   )NNNNNNNNNNNNNNr   Nr{   )NF)NNNNNNNNNTNNF)rF   rG   rH   rI   r  r.  r2  rN   r  r  rO   r   r  r   r   r   rU   r}   r   r:  r  r   rK   r   r{  rQ   r   r   s   @rS   rn  rn  K  s     ;9
 .2158<37.237046:6:(,5926*.!%-.04#N
##d*N
 ''$.N
 #..5	N

 ))D0N
 t+N
 #\\D0N
 &&-N
 ",,t3N
 ",,t3N
 N
 !++d2N
 ((4/N
   4'N
 $;N
  ell*!N
"  ,,-#N
$ +,%N
& 
&'N
`  7;Y''Y ",,t3Y +,	Y Y  26*/8 8||8 t+8 	8
 llT)8 !<<$.8 !4K8 
8 8@    . .rR   rn  )r  r  rn  r   rm  r  r  )r&   )r   collectionsr   collections.abcr   dataclassesr   	functoolsr   rN   r   torch.nnr   r    r
   rv  activationsr   cache_utilsr   r   configuration_utilsr   masking_utilsr   r   r   r   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r    utils.genericr!   r"   r#   utils.output_capturingr$   r%   auto.modeling_autor'   gemma3.modeling_gemma3r(   r)   r*   r+   r,   r-   r.   gemma3n.modeling_gemma3nr/   r0   r1   r2   r3   r4   r5   r6   r7   llama.modeling_llamar8   mixtral.modeling_mixtralr9   0moonshine_streaming.modeling_moonshine_streamingr:   configuration_gemma4r<   r=   r>   r?   
get_loggerrF   loggerrA   rU   rX   r[   r  r`   r   r   r   r   r  r#  Conv1dr7  r@  rP  r^  ry  r  rO   r   r  r  r  r  r  r  r  r(  r<  rD  rU  rk  rm  r  r  r  r  r  r  r   r  r   rn  __all__rD   rR   rS   <module>r     s8      $ ! %   $ & ! . 3  C S K F &  ^ ] E *  
 
 
 8 5 [ g g  
		H	%Q : Q*Q#@ Q2 
Q$; 
Q 
Q 
37 3  3BII :	N 	7ryy 7>i)299 i)X#bii #8; ;< RYY  H"bii "B&RYY &R0ryy 0l(3		 (3V@0 @0Fai a 5&||5&	5& 
5& ,,	5&
 5& \\5&p:"6 :z :)O :) :)|"1 "J)H")) )H^^I ^U 5 UDq)")) q)h7 7!@ryy !@HO/ Od	$A 	=)2 =)@ `aW
o W
 bW
t ]^J
) J
 _J
ZSc, SclAH- AHH68 6>LL4'\\D( _:	u|| 	U\\ 	^c^j^j 	 p, ppf	 s%D sslrR   