
    3j"                    N   S SK r S SKJr  S SKJrJr  S SKJr  S SKJ	r	  S SK
r
S SKJr  S SKJs  Jr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJ r J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.J/r/J0r0J1r1  SSK2J3r3J4r4  SSK5J6r6  SSK7J8r8  SSK9J:r:J;r;J<r<J=r=  \0" 5       (       a  S SK>J?r?  \.\ " S S\!5      5       5       r@\." SS9\ " S S\ 5      5       5       rA\." SS9\ " S  S!\,5      5       5       rB " S" S#\R                  5      rD " S$ S%\R                  5      rE " S& S'\R                  5      rF " S( S)\R                  5      rG " S* S+\R                  5      rH " S, S-\R                  5      rI " S. S/\R                  5      rJ " S0 S1\R                  5      rK " S2 S3\R                  5      rL " S4 S5\R                  5      rM " S6 S7\R                  5      rO " S8 S9\R                  5      rP " S: S;\R                  5      rQ " S< S=\R                  5      rRS> rSS?\
R                  S@\USA\
R                  4SB jrV   SjSC\R                  SD\
R                  SE\
R                  SF\
R                  SG\
R                  S-  SH\W\U-  SI\WS-  SJ\WS-  SA\X\
R                  \
R                  4   4SK jjrYSkSL\
R                  SM\
R                  SN\
R                  SO\U4SP jjrZ " SQ SR\R                  5      r[ " SS ST\5      r\\. " SU SV\(5      5       r] " SW SX\]5      r^ " SY SZ\R                  5      r_\." S[S9 " S\ S]\]5      5       r`\." S^S9 " S_ S`\]\5      5       ra " Sa Sb\R                  5      rb\." ScS9 " Sd Se\]5      5       rc\." SfS9 " Sg Sh\]\5      5       rd/ SiQreg)l    N)UserDict)CallableSequence)	dataclass)Optional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleis_accelerate_availabletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfig)add_hook_to_modulec                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)Gemma3nAudioEncoderModelOutput;   zm
audio_mel_mask (`torch.BoolTensor`, *optional*):
    A torch.BoolTensor of shape `(batch_size, num_frames)`
Naudio_mel_mask )
__name__
__module____qualname____firstlineno____doc__r-   torch
BoolTensor__annotations____static_attributes__r.       f/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr+   r+   ;   s    
 /3NE$$t+2r8   r+   zL
    Base class for Gemma3n outputs, with hidden states and attentions.
    custom_introc                   j    \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
g)Gemma3nModelOutputWithPastF   a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
Nimage_hidden_statesaudio_hidden_statesr.   )r/   r0   r1   r2   r3   r?   r4   FloatTensorr6   r@   r7   r.   r8   r9   r=   r=   F   s5     59**T1848**T18r8   r=   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                   *   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   Sr\R                  S-  \S
'   Srg)Gemma3nCausalLMOutputWithPast`   a
  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr?   r@   r.   )r/   r0   r1   r2   r3   rE   r4   rA   r6   rF   rG   r   rH   tuplerI   r?   r@   r7   r.   r8   r9   rC   rC   `   s    $ &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T1848**T18r8   rC   c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  4S jr
S\R                  S\R                  4S	 jrS
rU =r$ )Gemma3nRMSNorm   dimeps
with_scalec                    > [         TU ]  5         X l        X0l        U R                  (       a/  [        R
                  " [        R                  " U5      SS9U l        g g )NT)requires_grad)	super__init__rO   rP   nn	Parameterr4   onesweight)selfrN   rO   rP   	__class__s       r9   rT   Gemma3nRMSNorm.__init__   s>    $??,,uzz#dKDK r8   rH   c                     UR                  S5      R                  SSS9U R                  -   nU[        R                   " US5      -  $ )Nr"   T)keepdim      )powmeanrO   r4   )rY   rH   mean_squareds      r9   _normGemma3nRMSNorm._norm   sA    $((+00T0BTXXMuyyt<<<r8   returnc                     U R                  UR                  5       5      nU R                  (       a  X R                  R                  5       -  nUR	                  U5      $ N)rc   floatrP   rX   type_as)rY   rH   normed_outputs      r9   forwardGemma3nRMSNorm.forward   sF    

=#6#6#89??)KK,=,=,??M$$]33r8   )rO   rX   rP   )gư>T)r/   r0   r1   r2   intrh   boolrT   r4   Tensorrc   rk   r7   __classcell__rZ   s   @r9   rL   rL      sW    LC Le L L L=5<< =
4U\\ 4ell 4 4r8   rL   c                   &  ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S\R                  S	\
S
\
S\
S\
S\
S\
S\R                  4S jrS\R                  S\R                  S\R                  4S jrSrU =r$ )%Gemma3nAudioRelativePositionEmbedding   configc                 L  > [         TU ]  5         Xl        U R                  R                  U l        U R                  R
                  U l        U R                  U R                  -  U l        [        SU R                  R                  S-
  5      U l
        U R                  R                  U l        [        R                  " U R                  U R                  U R                  -  SS9U l        SnSnU R                  S-  n[         R"                  " [%        U5      [%        U5      -  5      [        US-
  S5      -  nU[&        R(                  " [&        R*                  " U5      U* -  5      -  nU R-                  SUR%                  5       R/                  S5      R/                  S5      SS	9  g )
Nr   r$   Fbias      ?     @r"   inv_timescales
persistent)rS   rT   ru   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrU   Linearpos_projmathlogrh   r4   exparangeregister_buffer	unsqueeze)rY   ru   min_timescalemax_timescalenum_timescaleslog_timescale_incrementr{   rZ   s          r9   rT   .Gemma3nAudioRelativePositionEmbedding.__init__   sJ   ==//74;;#J#JQ#NO;;CC		$--$--1OV[\!+"&((5+?%BV+V"WZ]^lop^prsZt"t&5<<3OSjRj3j)kk  ",,Q/99!< 	 	
r8   positiondtypere   c                 H   UR                  5       R                  S5      nXR                  R                  UR                  [
        R                  S9-  n[
        R                  " [
        R                  " U5      [
        R                  " U5      /SS9nUR                  U5      $ )Nr]   devicer   rN   )rh   r   r{   tor   r4   float32catsincostype)rY   r   r   scaled_timetiming_signals        r9   _get_timing_signal_1d_pos?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos   s{    >>#--b1!4!4!7!7xV[VcVc!7!dd		599[#9599[;Q"RXZ[!!%((r8   term_bd_before_shift
batch_sizer   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                     US-   U-
  nSU4n	[         R                  R                  X5      n
U
R                  UUUXVS-   -  45      nUSS2SS2SS2SXV-  24   nUR                  UUUUU45      nU$ )a"  Performs the relative shift.

Args:
  term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
    (B), num_heads (N), num_query_blocks (U), query_block_size (W),
    key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

Returns:
  Tensor of shape [B, N, U, W, C].
r$   r   N)rU   
functionalpadreshape)rY   r   r   r   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shifteds                 r9   _relative_shift5Gemma3nAudioRelativePositionEmbedding._relative_shift   s    4  0!3F /0**+?O
 *11  q$89	
 *!Q3X5E5X3X*XY )00   
 r8   querieskeysc           	      >   UR                   u  p4pVnUR                   u    p  n[        R                  " U R                  U R                  * S-
  SUR
                  S9R                  S5      n
U
R                   S   nU R                  XR                  S9nU R                  U5      nUR                  SXR                  U R                  5      R                  S5      nUR                  SSSSS5      nUR                  SSSSS5      n[        R                  " UU5      nUR                  SSSSS5      nUR                  SSS5      nUR                  X6XE-  U5      n[        R                  " UU5      nUR                  UUUUU5      nU R!                  UUUUUU	U5      nUU-   $ )	Nr$   r]   r   r   r   r   r"      )shaper4   r   r   r   r   r   r   r   r   r   r   r   squeezepermutematmulr   )rY   r   r   r   r   r   r   r   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   s                           r9   rk   -Gemma3nAudioRelativePositionEmbedding.forward   s    OVmmK
&68'+zz$11 ll4#4#4t7G7G6G!6KRX_XfXfgqq
 &++A. $ > >}} !? !

 !MM*?@#++APTP]P]^ff
 OOAq!Q2	<<1aA.,,y(3 __Q1a3
 __Q1-
  ''
?O?bdlm

 #(,,z:"F 3::
 ..
 ((r8   )r   ru   r   r   r   r   r   )r/   r0   r1   r2   r%   rT   r4   ro   r   r   rm   r   rk   r7   rp   rq   s   @r9   rs   rs      s    
1 
.)%,, )u{{ )W\WcWc );#ll; ; 	;
 ; ; ; ; 
;zL)u|| L)5<< L)ELL L) L)r8   rs   c                   >  ^  \ rS rSrS\4U 4S jjrS rS\R                  S\	S\	S\R                  4S	 jr
S
\R                  S\R                  4S jrS
\R                  S\R                  4S jrS
\R                  S\R                  S\R                  4S jrSrU =r$ )Gemma3nAudioAttentioniE  ru   c                   > [         TU ]  5         Xl        U R                  R                  U l        U R                  R
                  U l        U R
                  U R                  -  U l        U R                  R                  U l        U R                  R                  U l
        [        SU R                  R                  S-
  5      U l        U R                  R                  U l        U R                  U R                  -   U R                  -   U l        [#        U5      U l        [&        R(                  " [*        R,                  " U R                  45      5      U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        U R                  S-  nS[*        R&                  R8                  R;                  [*        R<                  " S5      5      -  nU R?                  SX#-  RA                  5       RC                  5       SS	9  U RE                  5       nU R?                  S
USS	9  U R?                  S[*        R<                  " U R                  5      RG                  5       SS	9  g )Nr   r$   Frw   r_   ry           q_scaler|   local_causal_valid_masksoftcap)$rS   rT   ru   r~   r   r   r   conf_attention_chunk_size
chunk_sizer   max_future_horizonr   r   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizers   relative_position_embeddingrU   rV   r4   zerosper_dim_scaler   q_projk_projv_projr   softplustensorr   clonedetachcreate_local_causal_valid_maskrh   )rY   ru   r   r_softplus_0r   rZ   s        r9   rT   Gemma3nAudioAttention.__init__F  s%   ==;;22((DNN:++??"&++"J"J #At{{'N'NQR'R S)-)M)M& OOd.C.CCdF]F]]+PQW+X(\\%++t}}6F*GHii 0 0$..4==2PW\]ii 0 0$..4==2PW\]ii 0 0$..4==2PW\]--%UXX0099%,,s:KLLY)?(F(F(H(O(O(Q^cd"&"E"E"G68O\abLL778>>@ 	 	
r8   c                    [         R                  " [         R                  " U R                  U R                  4[         R
                  S9SS9R                  n[         R                  " [         R                  " U R                  U R                  4[         R
                  S9U R                  U R                  -   S9n[         R                  " U R                  U R                  4[         R
                  S9nX1-  U-  nU$ )Nr   r   )diagonal)	r4   trilrW   r   r   rn   Tr   r   )rY   lower_causal_maskupper_causal_maskr   s       r9   r   4Gemma3nAudioAttention.create_local_causal_valid_maskh  s    !JJJJ))4??;5::N
 ! 	 "JJJJ):):;5::N**T-D-DD
 #(**doot?P?P-QY^YcYc"d"9"MPa"a&&r8   xpad_left	pad_rightre   c                     UR                   tpEnUR                  XB/UQ75      nUR                  XC/UQ75      n[        R                  " XqU/SS9nU$ )Nr$   r   )r   	new_zerosr4   r   )	rY   r   r   r   batchr   
tail_shapeleftrights	            r9   	_pad_dim1Gemma3nAudioAttention._pad_dim1u  sV     !:{{E9j9:U;
;<IIt&A.r8   rH   c                 "   UR                   nUSS u  p4X@R                  -   S-
  U R                  -  nXPR                  -  U-
  =nS:  a  U R                  USU5      nX5U R                  4USS -   nUR                  U5      R	                  5       nU$ )a  Turns a sequence to non overlapping blocks.

Args:
    hidden_states: a tensor of [batch, time, ...].

Returns:
    A tensor of [batch, num_blocks, block_size, ...], with necessary
    paddings,
    where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
Nr"   r$   r   )r   r   r   r   
contiguous)rY   rH   r   bt
num_blockspadding_lenpermute_dimss           r9   _convert_to_block'Gemma3nAudioAttention._convert_to_block|  s     ##Ray//)A-$//A
%7!;;Kq@ NN=![IMt7%)C%--l;FFHr8   c                 R   U R                   nU R                  U R                  -   S-
  nU R                  XU5      nU R                  nU R                  nUR                  SXES9nUR                  S:  a&  UR                  S:  a  [        R                  " USSS9nUR                  5       $ )a  Extracts temporal context for every block.

Args:
    hidden_states: a tensor of [batch, time, ...].

Returns:
    A tensor of [batch, num_blocks, context_size, ...], with necessary
    paddings,
    where context_size = block_size + left_context + right_context,
    and output[:, i, ...] are x[:, start-left_context:end+right_context,
    ...],
    start = i * block_size, end = (i + 1) * block_size.
r$   )	dimensionsizestepr"   r   r]   )sourcedestination)
r   r   r   r   r   unfoldndimr4   movedimr   )rY   rH   r   r   	frame_len
frame_step
x_unfoldeds          r9   _extract_block_context,Gemma3nAudioAttention._extract_block_context  s     (( ++doo=A	}	J%%	__
 #))AI)W
 !joo&9 z"!LJ$$&&r8   maskc                    / UR                   S S QU R                  PU R                  P7nU R                  U5      R	                  U5      R                  5       nU R                  U5      R	                  U5      R                  5       nU R                  U5      R	                  U5      R                  5       n[        R                  R                  R                  U R                  5      nSSSU R                  4nUR                  U5      n	X@R                  -  U	-  nUR                   S S u  pU R                  U5      nU R!                  U5      nU R!                  U5      nUR                   S   nU) nU R!                  U5      nUR"                  S:X  aI  UR                   S   UR                   S   -  U R$                  :X  a  UR	                  XU R$                  5      nUR                   U
UU R$                  4:w  a,  ['        SUR                    SU
 SU SU R$                   S	3	5      eUR)                  S5      R)                  S
5      nU R*                  R)                  S5      R)                  S5      R)                  S5      n[        R,                  " UUR/                  UR0                  5      5      nU R3                  X5      nU R4                  R/                  UR0                  5      nUU-  n[        R6                  " U5      nUU-  n[        R8                  " UU[        R:                  " UR<                  5      R>                  5      n[        R                  R                  RA                  US[        RB                  S9R/                  UR<                  S9nUR                   u  nnnnnUR                   S   nURE                  SSSSS5      R	                  SUU5      nURE                  SSSSS5      R	                  SUU5      n[        RF                  " UU5      n U R	                  UUUUU5      RE                  SSSSS5      n!U!R	                  U
XRH                  -  U R                  U R                  45      n!U!S S 2S U24   n!U!$ )Nr]   r$   r"   r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   rN   r   r   )%r   r   r   r   r   r   r   r   r4   rU   r   r   r   viewr   r  r  r  r   
ValueErrorr   r   logical_andr   r   r   r   tanhwherefinfor   minsoftmaxr   r   bmmr   )"rY   rH   r  	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_whererF   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorss"                                     r9   rk   Gemma3nAudioAttention.forward  sI   Nm))#2.NNN	{{=199)DOOQ[[/77	BMMO
{{=199)DOOQ 88..778J8JKaDMM2%5%:%:?%K"#ll25OO)//3
--l;00<
22<@'--a0  $e '+&A&ABU&V# (,,1+11!47R7X7XYZ7[[_c_p_pp*E*M*Md.?.?+' ',,1
 

 /556i
| L$%R(9(9'::TV  )D(M(Ma(P(Z(Z[](^% $(#?#?#I#I!#L#V#VWX#Y#c#cde#f 
 %*$5$5)$''(E(L(LM%
! 11,K lloofmm4+%F#+% 6FLL@Y@]@]^++33F%--3X[[bnbtbt[u -:,?,?)ueUE""2& ((Aq!Q7??E5Q$$Q1a3;;BuMYYx/
$,,UE5%OWWXY[\^_abdef)11 ??2	
 *!WfW*5r8   )r   r   ru   r   r   r   r   r   r   r   r   r   r   r   )r/   r0   r1   r2   r%   rT   r   r4   ro   rm   r   r  r  r5   rk   r7   rp   rq   s   @r9   r   r   E  s     
1  
D'5<< 3 3 5<< u||  ,.'ELL .'U\\ .'`dU\\ d9I9I dell d dr8   r   c                      ^  \ rS rSrSr SS\S\\   S\4U 4S jjjrS\	R                  S\	R                  4S	 jrS
rU =r$ )Gemma3nAudioCumulativeGroupNormi)  a  Applies Group Normalization cumulatively over the time dimension.

This layer normalizes the input by calculating the mean and variance
cumulatively over the time dimension (dim 1). The statistics are computed
over all feature dimensions (specified by `feature_dims` and `num_channels`)
for elements marked as valid by the optional `mask`.

If a `mask` is provided (True for valid, False for invalid/padded),
invalid time steps do not contribute to the statistics calculation, and
their corresponding output values are zeroed out.

Scale and bias, if enabled, are applied per-channel (last dimension).
This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
and `cumulative=True`.
num_channelsfeature_dimsrO   c           	        > [         TU ]  5         Xl        [        U5      U l        X0l        [        R                  " [        R                  " U5      5      U l
        [        [        SS[        U R                  5      -   S-   5      5      U l        g )Nr"   r$   )rS   rT   r?  rJ   r@  rO   rU   rV   r4   rW   rX   rangelenreduction_axes)rY   r?  r@  rO   rZ   s       r9   rT   (Gemma3nAudioCumulativeGroupNorm.__init__:  sn     	(!,/ ll5::l#;< $E!QT5F5F1G-G!-K$LMr8   rH   re   c                    U R                   U R                  4-   nUR                  SS U:w  a  [        SUR                  SS  SU 35      eUR                  n[
        R                  nUR                  U5      n[
        R                  " XTS9n[
        R                  " XPR                  SS9n[
        R                  " USS	9n[
        R                  " X`R                  SS9n	[
        R                  " U	SS	9n
[
        R                  " U
S
S9nX-  nX\-
  R                  S5      n[
        R                  " XR                  SS9n[
        R                  " USS	9nX-  nX\-
  [
        R                  " UU R                  -   5      -  nU R                   R                  U5      nS/UR#                  5       S-
  -  U R                  /-   nUUR%                  U5      -  nUU-  nUR                  U5      $ )zApplies cumulative group norm, optionally using a mask.

Args:
  hidden_states: Input tensor, shape [B, T, *feature_dims, C].

Returns:
  Normalized tensor with the same shape as x.
r"   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   TrN   r^   r$   r   ry   )r  )r@  r?  r   r  r   r4   r   r   	ones_likesumrD  cumsumclampr`   rsqrtrO   rX   rN   r  )rY   rH   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputs                        r9   rk   'Gemma3nAudioCumulativeGroupNorm.forwardL  s    !% 1 1T5F5F4H Hqr"&;;,]-@-@-D,E F99N8OQ 
 $))]]
!!*- OOF=	  ))F0C0CTRo1= "'9:M:MW[!\"\\*@aH"'++.@c"J ";
 #)"3!8!8!; 99%;ATAT^bc  ,,'7Q? '@ )U[[9P-QQ z*3-"3"3"5"9:d>O>O=PP#ejj1A&BB $i/{++r8   )rO   r@  r?  rD  rX   )gMbP?)r/   r0   r1   r2   r3   rm   r   rh   rT   r4   ro   rk   r7   rp   rq   s   @r9   r>  r>  )  s`    ( 	NN smN 	N N$G,U\\ G,ell G, G,r8   r>  c                      ^  \ rS rSrSr SS\S\S\S\\\\\4   4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )Gemma3nAudioSSCPConvBlocki  zA single convolution block for the SubSampleConvProjection.

This block consists of a 2D convolution, followed by CumulativeGroupNorm,
and a ReLU activation. It handles manual padding for the convolution.
ru   idxinput_freq_dimmanual_paddingc           	      6  > [         TU ]  5         Xl        X@l        US:X  a  SOU R                  R                  US-
     nU R                  R                  U   nU R                  R
                  U   u  pxU R                  R                  U   u  p[        R                  " UUUU4X4SSS9U l	        X0R                  S   -   U R                  S   -   nX-
  U
-  S-   n[        UU4U R                  R                  S9U l        [        R                  " 5       U l        g )Nr   r$   )r   r   F)in_channelsout_channelskernel_sizestridepaddingrx   )r?  r@  rO   )rS   rT   ru   re  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerU   Conv2dconvr>  sscp_conv_group_norm_epsnormReLU
activation)rY   ru   rc  rd  re  rg  rh  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrZ   s                r9   rT   "Gemma3nAudioSSCPConvBlock.__init__  s    	, !8a)K)KCRSG)T{{99#>![[>>sC![[>>sCII#% '

	 %':':1'==@S@STU@VV!,9A=
3%$44
	 '')r8   audio_encodingsre   c                    [         R                  " XR                  SSS9R                  U R                  R
                  R                  5      nU R	                  U5      nUR                  SSSS5      R                  5       nU R                  U5      nUR                  SSSS5      R                  5       nU R                  U5      $ )Nconstantr   )modevaluer   r"   r   r$   )Fr   re  r   rp  rX   r   r   r   rr  rt  )rY   r|  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          r9   rk   !Gemma3nAudioSSCPConvBlock.forward  s     "#8K8KR\dg!h!k!kII"""

  $yy)?@ *11!Q1=HHJ
99Z(!)!1!1!Q1!=!H!H!J566r8   )rt  ru   rp  re  rr  ))r   r   r   r   )r/   r0   r1   r2   r3   r%   rm   rJ   rT   r4   ro   rk   r7   rp   rq   s   @r9   rb  rb    sm     5A)$")$ )$ 	)$
 c3S01)$ )$V7u|| 7 7 7r8   rb  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )#Gemma3nAudioSubSampleConvProjectioni  ru   c                 Z  > [         TU ]  5         Xl        UR                  n/ n/ n[	        S5       Hk  nUR
                  U   u  pgUR                  U   u  pSn
US-
  nSnSnUUU
U4nUR                  U5        X,-   U-   nX-
  U	-  S-   nUR                  U5        UnMm     [        SUR                  UUS   S9U l	        [        SUS   UUS   S9U l
        UR                  S   nUS   nUU-  U l        [        R                  " U R                  U R                  R                  SS9U l        g )Nr"   r   r$   )rc  rd  ru   re  r]   Frw   )rS   rT   ru   input_feat_sizerB  rm  rn  appendrb  conv_0conv_1rl  input_proj_in_featuresrU   r   r   input_proj_linear)rY   ru   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsiru  rv  rw  rx  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tuplery  f_out_after_convfinal_c_outfinal_f_outrZ   s                      r9   rT   ,Gemma3nAudioSubSampleConvProjection.__init__  sr   $*$:$:!#%  "qA!'!=!=a!@H!'!=!=a!@H I#a<L JK 	$  %++,@A 4@;NK + 68CaG!(()9:(8%= @ 0!113A6	
 0033A6	
 33B7+B/&1K&?#!#4+F+FH_H_fk!lr8   r|  re   c                    UR                  S5      nU R                  U5      nU R                  U5      nUR                  u  pEpgUR	                  SSSS5      R                  5       nUR                  XFXu-  5      n	U R                  U	5      n
U
$ )Nr$   r   r"   r   )r   r  r  r   r   r   r  r  )rY   r|  audio_encodings_reshapedr   r   c_outt_outf_out
x_permutedoutput_flattenedoutputs              r9   rk   +Gemma3nAudioSubSampleConvProjection.forward  s     $3#<#<Q#? KK01KKN!"%YYq!Q*557
%??1U]C''(89r8   )ru   r  r  r  r  r/   r0   r1   r2   r%   rT   r4   ro   rk   r7   rp   rq   s   @r9   r  r    s3    7m1 7mru||   r8   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nAudioConformerAttentioni$  ru   c                   > [         TU ]  5         Xl        U R                  R                  U l        U R                  S[        R                  " U R                  R                  5      SS9  [        U R                  R                  5      U l
        [        U5      U l        [        R                  " U R                  U R                  R                  SS9U l        [        U R                  R                  5      U l        g )Ngradient_clippingFr|   rw   )rS   rT   ru   r   post_in_featuresr   r4   r   r  rL   pre_attn_normr   attnrU   r   post	post_normrY   ru   rZ   s     r9   rT   'Gemma3nAudioConformerAttention.__init__%  s     $ 7 70%,,t{{?\?\2]jop+DKK,C,CD)&1	IId33T[[5L5LSXY	'(?(?@r8   r|  r-   re   c                    Un[         R                  " XR                  * U R                  5      nU R                  U5      nU R	                  XB5      nUR
                  u  pgpUR                  XgX-  5      n
U R                  U
5      n[         R                  " XR                  * U R                  5      nX0R                  U5      -   $ rg   )	r4   rK  r  r  r  r   r   r  r  )rY   r|  r-   audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   r   r   r  s              r9   rk   &Gemma3nAudioConformerAttention.forward/  s    (7%++o8N8N7NPTPfPfg#11/B#'99-A#R  %=$B$B!i#;#C#CA)J^#_ ))$<=++o8N8N7NPTPfPfg,~~o/NNNr8   )r  ru   r  r  r  r  r/   r0   r1   r2   r%   rT   r4   ro   r5   rk   r7   rp   rq   s   @r9   r  r  $  sG    A1 AOu|| OUEUEU OZ_ZfZf O Or8   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ ) Gemma3nAudioConformerFeedForwardi@  ru   c                 ~  > [         TU ]  5         Xl        U R                  S[        R
                  " U R                  R                  5      SS9  [        U R                  R                  5      U l	        [        R                  " U R                  R                  U R                  R                  S-  SS9U l        [        R                  " U R                  R                  S-  U R                  R                  SS9U l        [        U R                  R                  5      U l        U R                  R                  U l        g )Nr  Fr|   r   rw   )rS   rT   ru   r   r4   r   r  rL   r   pre_layer_normrU   r   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scaler  s     r9   rT   )Gemma3nAudioConformerFeedForward.__init__A  s    0%,,t{{?\?\2]jop,T[[-D-DE99T[[%<%<dkk>U>UXY>Y`ef99T[[%<%<q%@$++BYBY`ef-dkk.E.EF $ @ @r8   r|  re   c                    Un[         R                  " XR                  * U R                  5      nU R                  U5      nU R	                  U5      n[
        R                  R                  U5      nU R                  U5      n[         R                  " XR                  * U R                  5      nU R                  U5      nX!U R                  -  -   $ rg   )r4   rK  r  r  r  rU   r   silur  r  r  )rY   r|  residuals      r9   rk   (Gemma3nAudioConformerFeedForward.forwardM  s    "++o8N8N7NPTPfPfg--o>(,(8(8(I--,,_=(,(8(8(I++o8N8N7NPTPfPfg..?T-B-BBCCr8   )ru   r  r  r  r  r  r  rq   s   @r9   r  r  @  s6    
A1 
A	Du|| 	D 	D 	Dr8   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ ) Gemma3nAudioConformerLightConv1diY  ru   c           
        > [         TU ]  5         Xl        [        U R                  R                  U R                  R
                  S9U l        [        R                  " U R                  R                  U R                  R                  S-  SS9U l	        [        R                  " U R                  R                  U R                  R                  U R                  R                  SSU R                  R                  SS9U l        U R                  S[        R                  " U R                  R                   5      SS	9  [        U R                  R                  U R                  R
                  S9U l        [        R                  " U R                  R                  U R                  R                  SS9U l        U R                  R                  S-
  U l        g )
NrO   r"   Frw   r$   r   )rg  rh  ri  rj  rk  groupsrx   r  r|   )rS   rT   ru   rL   r   rms_norm_epsr  rU   r   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1dr   r4   r   r  	conv_norm
linear_endcausal_paddingr  s     r9   rT   )Gemma3nAudioConformerLightConv1d.__init__Z  sB   ,T[[-D-D$++JbJbcIIdkk&=&=t{{?V?VYZ?Zafg "		//0099;;**!
 	0%,,t{{?\?\2]jop'(?(?T[[E]E]^))DKK$;$;T[[=T=T[`a"kk??!Cr8   r|  re   c                 2   UnU R                  U5      nU R                  U5      n[        R                  R                  R                  USS9nUR                  SSS5      n[        R                  " X0R                  S45      nU R                  U5      nUR                  SSS5      n[        R                  " XR                  * U R                  5      nU R                  U5      n[        R                  R                  U5      nU R                  U5      nX-   nU$ )Nr]   r   r   r"   r$   )r  r  r4   rU   r   glur   r  r   r  r  rK  r  r  r  r  )rY   r|  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedr  s         r9   rk   (Gemma3nAudioConformerLightConv1d.forwardo  s    #2 --o>++O<((--11/r1J#2#:#:1a#C *+%%0HK^K^`aJb*c'//0OP)11!Q:++o8N8N7NPTPfPfg..9--,,_=///: ;r8   )r  ru   r  r  r  r  r  r  rq   s   @r9   r  r  Y  s2    D1 D*u||   r8   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nAudioConformerBlocki  ru   c                   > [         TU ]  5         Xl        [        U R                  5      U l        [        U R                  5      U l        [        U R                  5      U l        [        U R                  5      U l	        U R                  S[        R                  " U R                  R                  5      SS9  [        U R                  R                  5      U l        g )Nr  Fr|   )rS   rT   ru   r  ffw_layer_startr  	attentionr  lconv1dffw_layer_endr   r4   r   r  rL   r   rr  r  s     r9   rT   #Gemma3nAudioConformerBlock.__init__  s    ?L7D7D=dkkJ0%,,t{{?\?\2]jop"4;;#:#:;	r8   r|  r-   re   c                 f   U R                  U5      nU R                  X5      nU) nXR                  S5      R                  UR                  5      -  nU R                  U5      nU R                  U5      n[        R                  " XR                  * U R                  5      nU R                  U5      nU$ )Nr]   )r  r  r   r   r   r  r  r4   rK  r  rr  )rY   r|  r-   validity_mask_for_lconvaudio_encodings_for_lconv_inputr  s         r9   rk   "Gemma3nAudioConformerBlock.forward  s    ..?..I#1/*9<]<]^`<a<d<d!!=
 +
' ,,'FG,,_=++o8N8N7NPTPfPfg?+r8   )r  ru   r  r  r  rr  r  rq   s   @r9   r  r    s@    	<1 	<u|| UEUEU Z_ZfZf  r8   r  c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3nTextScaledWordEmbeddingi  zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 |   > [         TU ]  XU5        X@l        U R                  S[        R
                  " U5      SS9  g )Nr  Fr|   )rS   rT   scalar_embed_scaler   r4   r   )rY   r  r  r  r  rZ   s        r9   rT   'Gemma3nTextScaledWordEmbedding.__init__  s7    D"-]ELL,ERWXr8   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ rg   )rS   rk   r  r   rX   r   )rY   r  rZ   s     r9   rk   &Gemma3nTextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRr8   )r  )ry   )r/   r0   r1   r2   r3   rm   rh   rT   r4   ro   rk   r7   rp   rq   s   @r9   r  r    sM    Ys Y3 YS Y_d Y Y
S S Sr8   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nTextLaurelBlocki  z Learned Augmented Residual Layerru   c                   > [         TU ]  5         Xl        [        R                  " U R                  R
                  U R                  R                  SS9U l        [        R                  " U R                  R                  U R                  R
                  SS9U l        [        U R                  R
                  U R                  R                  S9U l        g )NFrw   r  )rS   rT   ru   rU   r   r   laurel_ranklinear_leftlinear_rightrL   r  post_laurel_normr  s     r9   rT   Gemma3nTextLaurelBlock.__init__  s    99T[[%<%<dkk>U>U\abIIdkk&=&=t{{?V?V]bc .t{{/F/FDKKLdLd er8   rH   re   c                 p    U R                  U5      nU R                  U5      nU R                  U5      nX-   $ rg   )r  r  r  )rY   rH   laurel_hidden_statesnormed_laurel_hidden_statess       r9   rk   Gemma3nTextLaurelBlock.forward  s@    -1-=-=m-L-1->->?S-T&*&;&;<P&Q#::r8   )ru   r  r  r  )r/   r0   r1   r2   r3   r'   rT   r4   ro   rk   r7   rp   rq   s   @r9   r  r    s5    *f0 f;U\\ ;ell ; ;r8   r  c                      ^  \ rS rSrSS\S\4U 4S jjjrS\R                  S\R                  4S jr	S\R                  S\R                  4S	 jr
S
rU =r$ )Gemma3nTextMLPi  ru   	layer_idxc                   > [         TU ]  5         Xl        UR                  U l        UR                  U   U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        UR                  U   U l        g NFrw   )rS   rT   ru   r   intermediate_sizerU   r   	gate_projup_proj	down_projr
   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrY   ru   r  rZ   s      r9   rT   Gemma3nTextMLP.__init__  s    !--!'!9!9)!D4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556#)#E#Ei#P r8   rH   re   c                     U R                  U5      nU R                  S:  a  U R                  U5      nU R                  U5      nU R	                  U5      nU R                  X4-  5      nU$ )Nr   )r  r  _gaussian_topkr	  r  r  )rY   rH   r  activationsr  r  s         r9   rk   Gemma3nTextMLP.forward  sa    NN=1	##c)++I6Ikk),,,}-NN;#89	r8   inputsc                    [         R                  " U R                  [         R                  UR                  S9n[         R
                  R                  R                  SS5      nUR                  U5      nUR                  UR                  5      n[         R                  " USSS9n[         R                  " USSSS9nXVU-  -   n[        R                  R                  X-
  5      $ )	Nr   r   r   r$   r]   TrG  F)rN   r^   unbiased)r4   r   r  r   r   distributionsnormalNormalicdfr   r   ra   stdrU   r   relu)rY   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           r9   r  Gemma3nTextMLP._gaussian_topk  s    !&d.F.Femmdjdqdq!r ))00771='2'7'78N'O',,V\\:jjR>YYv2teL
n!<<}}!!&"344r8   )r	  r  ru   r  r  r   r  r  )r   )r/   r0   r1   r2   r'   rm   rT   r4   ro   rk   r  r7   rp   rq   s   @r9   r   r     s[    	Q0 	QS 	Q 	QU\\ ell 5U\\ 5ell 5 5r8   r   c                   n  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	S\R                  S\R                  4S	 jr
S
\R                  S\R                  S\R                  4S jrS\R                  S\R                  4S jrS\R                  S\R                  4S jrSrU =r$ )Gemma3nTextAltUpi  a  Alternating Updates (AltUp)

The AltUp module wraps transformer layers. The `predict` step modifies the
input to the transformer layer, and the `correct` step propagates the output
of the transformer layer to the sparsely updated dimensions.

See more in the research paper:

https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
ru   c                 2  > [         TU ]  5         Xl        [        R                  " [
        R                  " U R                  R                  5      5      U l        [        R                  " U R                  R                  U R                  R                  SS9U l        [        R                  " U R                  R                  U R                  R                  S-  SS9U l        [        R                  " U R                  R                  U R                  R                  SS9U l        [        U R                  R                  U R                  R                  S9U l        U R#                  S[
        R$                  " U R                  R                  S-  5      SS9  g )NFrw   r"   r  router_input_scale      r|   )rS   rT   ru   rU   rV   r4   r   r   correct_output_scaler   altup_num_inputscorrection_coefsprediction_coefsmodality_routerrL   r  router_normr   r   r  s     r9   rT   Gemma3nTextAltUp.__init__  s   $&LLT[[=T=T1U$V! "		$++*F*FHdHdkp q "		$++*F*FHdHdfgHgns t!yy)@)@$++B^B^ejk)$++*A*At{{G_G_`15<<@W@WY]@]3^kpqr8   r   re   c                     U R                  U5      U R                  -  nU R                  U5      n[        R                  " UR                  5       5      R                  U5      $ rg   )r-  r&  r,  r4   r  rh   ri   )rY   r   router_inputsrouteds       r9   compute_router_modalities*Gemma3nTextAltUp.compute_router_modalities  sM    ((+d.E.EE%%m4zz&,,.)11!44r8   rH   c                    U R                  XR                  R                     5      nU R                  (       ap  U R                  R                  bY  U R
                  R                  R                  R                  U R                  R                  * U R                  R                  5        U R                  U5      R                  " / UR                  SS QU R                  R                  PU R                  R                  P76 R                  SSSS5      n[        R                  " UR                  SSSS5      U5      nUR                  SSSS5      nXA-  nUR                  5       R!                  U5      $ )a  Predicts the output of a layer using a trainable map.

Args:
    hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
        stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

Returns:
    A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
Nr]   r   r$   r   r"   )r2  ru   altup_active_idxtrainingaltup_coef_clipr+  rX   dataclamp_r   r   r)  r   r4   r   r   ri   )rY   rH   
modalities	all_coefspredictionss        r9   predictGemma3nTextAltUp.predict   s?    33M++B^B^4_`
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSno !!*-W i &&s+i-1[[-I-IiKO;;KgKgiWQ1a  	 ll=#8#8Aq!#DiP!))!Q15$%%'//>>r8   r<  	activatedc                    U R                  U5      nX!U R                  R                     -
  nUR                  U R                  R                  SSS5      nU R
                  (       a  U R                  R                  b{  U R                  R                  R                  U R                  R                  * U R                  R                  5      n[        R                  R                  R                  X5SS9S-   nOU R                  U5      S-   nUR                  SSS5      R                  S5      n[        R                   " XF5      nXq-  nUR#                  5       R%                  U5      $ )a  Corrects the predictions relative to the

Args:
    predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
        stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
    activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

Returns:
    A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
        predictions relative to the activated input embeddings.
r$   Nrw   ry   r"   r   r]   )r2  ru   r5  repeatr)  r6  r7  r*  rX   rK  r4   rU   r   linearr   r   mulr   ri   )rY   r<  r?  r:  
innovationrX   r;  	correcteds           r9   correctGemma3nTextAltUp.correct  s$    33I>
T[[-I-I!JJ
&&t{{'C'CQ1M
==T[[88D**11779T9T8TVZVaVaVqVqrF++22:D2QTWWI--j9C?I
 %%aA.88<	IIj4	 	##%--i88r8   rE  c                 p    UR                  U R                  5      U R                  -  R                  U5      $ )z
This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
(which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
`scale_corrected_output`
)ri   r(  rY   rE  s     r9   rk   Gemma3nTextAltUp.forward;  s2     !!$";";<t?X?XXaabkllr8   c                 $    U R                  U5      $ )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)rk   rI  s     r9   scale_corrected_output'Gemma3nTextAltUp.scale_corrected_outputC  s    ||I&&r8   )ru   r(  r*  r,  r+  r-  )r/   r0   r1   r2   r3   r'   rT   r4   ro   r2  r=  rF  rk   rL  r7   rp   rq   s   @r9   r$  r$    s    	r0 r55<< 5ELL 5
?U\\ ?ell ?895<< 9ELL 9U\\ 9>m m%,, m' ' ' 'r8   r$  c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr]   r"   r   )r   r4   r   )r   x1x2s      r9   rotate_halfrQ  H  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r8   rH   n_repre   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r$   N)r   expandr   )rH   rR  r   num_key_value_headsslenr   s         r9   	repeat_kvrW  O  s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr8   modulequerykeyr  attention_maskdropoutscalingr   c                 j   Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb  X-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr_   r"   r   r]   r  )pr6  r$   )r   rW  num_key_value_groupsr4   r   	transposer  rU   r   r  r   r   r   r\  r6  r   )rX  rY  rZ  r  r[  r\  r]  r   kwargsr"  r#  attn_weightsattn_outputs                r9   eager_attention_forwardre  [  s    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!#4 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r8   r   r   r   unsqueeze_dimc                 l    UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   $ )a$  Applies Rotary Position Embedding to the query and key tensors.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r   rQ  )r   r   r   rf  s       r9   apply_rotary_pos_embrh  }  s6    " --
&C
--
&CGA,--r8   c                   *  ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S\R                  S-  S	\	S-  S
\
\\\R                  \R                  4   4   S-  S\\   S\\R                  \R                  S-  4   4S jjrSrU =r$ )Gemma3nTextAttentioni  ru   r  c                   > [         TU ]  5         Xl        X l        [	        US5      (       a  UR
                  U   OS U l        U R                  S:H  U l        U R                  (       a  UR                  OS U l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l        SU l        U R                  R                   U l        SU l        U R                  R$                  U R                  R&                  -
  nX#s=:  =(       a    S:  Os  U l        UR
                  S U nU R(                  (       a@  [+        U5      S-
  US S S2   R-                  UR
                  U   5      -
  U l        S	U l        OBS U l        U[+        U5      S-
  US S S2   R-                  UR
                  U   5      -
  :H  U l        [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l        [;        UR                  UR<                  S9U l        U R(                  (       d  [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l         [2        R4                  " UR                  UR                  U R                  -  UR6                  S
9U l!        [;        UR                  UR<                  S9U l"        [;        UR                  UR<                  S	S9U l#        [2        R4                  " UR                  U R                  -  UR                  UR6                  S
9U l$        g )Nlayer_typessliding_attentionr   ry   Tr   r$   r]   Frw   )rN   rO   )rN   rO   rP   )%rS   rT   ru   r  hasattrrl  
layer_type
is_slidingsliding_windowgetattrr   num_attention_headsr   rU  r`  r]  attention_dropout	is_causalnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerrC  indexkv_shared_layer_indexstore_full_length_kvrU   r   attention_biasr   rL   r  q_normr   r   k_normv_normo_proj)rY   ru   r  first_kv_shared_layer_idxprev_layersrZ   s        r9   rT   Gemma3nTextAttention.__init__  s   ";B6=;Y;Y&,,Y7_c//-@@7;f33D
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>$(KK$A$ADKKDdDd$d!"+"L"L1"L(()C*CD""),[)9A)=DbD@Q@W@WX^XjXjktXu@v)vD&(-D%)-D&(1S5E5IKX\Z\X\L]LcLc""9-M 6 )D% ii : :T]] JQWQfQf
 %f>Q>QR &&))""F$>$>$NU[UjUjDK ))""F$>$>$NU[UjUjDK )V__&BUBUVDK(V__&BUBUbghDKii&&68J8JQWQfQf
r8   NrH   position_embeddingsr[  rG   shared_kv_statesrb  re   c                    UR                   S S n/ UQSPU R                  R                  P7nUu  pU R                  U5      R	                  U5      nU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  (       aG  XPR                     u  pUR                  UR                  5      nUR                  UR                  5      nOU R                  U5      R	                  U5      nU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  U5      R	                  U5      nU R                  U5      nUR                  SS5      nUb/  U R                  (       d  UR!                  XU R"                  5      u  pU R$                  (       a  X4XPR"                  '   [&        R(                  " U R                  R*                  [,        5      nU" U UUUU4U R.                  (       a  U R0                  OSU R2                  U R4                  S.UD6u  nnUR6                  " / UQSP76 R9                  5       nU R;                  U5      nUU4$ )Nr]   r"   )rf  r$   r   )r\  r]  rq  )r   ru   r   r   r  r}  rh  ra  rx  rz  r   r   r   r~  r   r  updater  r{  r   get_interface_attn_implementationre  r6  rt  r]  rq  r   r   r  )rY   rH   r  r[  rG   r  rb  input_shapehidden_shaper   r   r!  r"  r#  attention_interfacerd  rc  s                    r9   rk   Gemma3nTextAttention.forward  s:    $))#2.??b?$++*>*>?&{{=166|D{{<0+LsRST#--a3
 ""'78R8R'S$J#|':':;J'??<+>+>?L]388FJZ0J-jsRSTJ#--a3J;;}5::<HL;;|4L'11!Q7L&t/F/F'6'='=jX\XfXf'g$J$$/9/G^^,(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((r8   )rt  ru   r   ru  rx  rp  r~  r   rz  r  ro  r`  r  r}  r   r]  rq  r{  r  r   NN)r/   r0   r1   r2   r'   rm   rT   r4   ro   r   dictrJ   r   r   rk   r7   rp   rq   s   @r9   rj  rj    s    .
0 .
S .
j )-PT;)||;) #\\;) t+	;)
 ;) sE%,,*D$EEFM;) +,;) 
u||U\\D00	1;) ;)r8   rj  c                     ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\R                  S\R                  S	\	\\
\R                  \R                  4   4   S-  S
\R                  S-  S\R                  S-  S\S-  S\\   S\
\R                  \
\R                  \R                  4   S-  4   4S jjrSrU =r$ )Gemma3nTextDecoderLayeri  ru   r  c                 D  > [         TU ]  5         Xl        UR                  U l        X l        [        X5      U l        [        XS9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        UR                  U l        [         UR"                     U l        ['        U5      U l        [+        U5      U l        [.        R0                  " U R                  U R                  SS9U l        [.        R0                  " U R                  U R                  SS9U l        [        U R                  UR                  S9U l        g )N)r  r  Frw   )rS   rT   ru   r   r  rj  	self_attnr   mlprL   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr
   r  r	  r$  altupr  laurelrU   r   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr  s      r9   rT    Gemma3nTextDecoderLayer.__init__  sB   !--"-f@!&>-d.>.>FDWDWX(6t7G7GVM`M`(a%)78H8HfNaNa)b&*89I9IvObOb*c'+1+M+M(V556%f-
,V4$&IId.>.>@`@`gl$m!$&IId.N.NPTP`P`gl$m!)78H8HfNaNa)b&r8   NrH   r  per_layer_inputr  r[  position_idsrG   rb  re   c           
      z   U R                   R                  U5      n	XR                  R                     n
U R	                  U
5      nU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R                  U5      nX-   nX-   [        R                  " S5      -  nU R                  U5      nU R                  U5      nU R                  U5      nUU-   nU R                   R                  U	U5      nUU R                  R                     R                  5       nU R                  R                  (       a  U R                   R!                  U5      nU R#                  U5      nU R%                  U5      n[&        R(                  " UU5      nU R+                  U5      nU R-                  U5      nUSS === U-  sss& U$ )N)rH   r[  r  r  r  rG   r"   r$   r.   )r  r=  ru   r5  r  r  r  r  r   sqrtr  r  r  rF  r   altup_correct_scalerL  r  r	  r4   multiplyr  r  )rY   rH   r  r  r  r[  r  rG   rb  r<  active_predictionactive_prediction_normedlaurel_outputr  r   
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictions                          r9   rk   Gemma3nTextDecoderLayer.forward  s    jj((7'(D(DE#'#7#78I#J $<=.. 
2)-% 3+
 
 ,,T2&-
!1TYYq\A22;?	88I&77A +m ; $

 2 2;@U V01M1MNTTV;;**#zz@@AQR  445EF;;'78 >>*:OL  445EF99:JKab!%55!$$r8   )r	  r  ru   r   r  r  r  r  r  r  r  r  r  r  r  r  )NNNNNN)r/   r0   r1   r2   r'   rm   rT   r4   ro   r  rJ   
LongTensorr   r   r   rA   rk   r7   rp   rq   s   @r9   r  r    s   c0 cS c0 -1(,PT.204(,3%||3% #\\3% 	3%
 sE%,,*D$EEFM3% t+3% &&-3% 3% +,3% 
u||U5#4#4e6G6G#GH4OO	P3% 3%r8   r  c            	         ^  \ rS rSr% \\S'   SrSrS/rSS/r	Sr
SrSrSrSr\\S.rS	r\R(                  " 5       U 4S
 j5       rS rS r   SS\S-  S\S-  S\S\R6                  4U 4S jjjr   SS\S-  S\S-  S\4S jjrSrU =r$ )Gemma3nPreTrainedModeliN  ru   modelTr  rG   r  )rH   rI   )imagetextaudioc                   > [         TU ]  U5        [        U[        5      (       a"  [        R
                  " UR                  5        GO[        U[        5      (       a  [        R                  " UR                  5        UR                  S-  nS[        R                  R                  R                  [        R                  " S5      5      -  n[        R                   " UR"                  X#-  5        [        R$                  " UR&                  UR(                  5        [        R                   " UR*                  UR-                  5       5        GO[        U[.        5      (       a-  [        R$                  " UR0                  UR2                  5        GO[        U[4        5      (       aZ  [        R                  " UR6                  5        [        R$                  " UR8                  U R:                  R<                  S-  5        GO[        U[>        5      (       a  Su  pEUR@                  S-  n[B        RD                  " [G        U5      [G        U5      -  5      [I        US-
  S5      -  nU[        RJ                  " [        RL                  " U5      U* -  5      -  n[        R                   " URN                  URG                  5       RQ                  S5      RQ                  S5      5        GO2[        U[R        5      (       ag  [        R$                  " URT                  U R<                  S-  5        [        R$                  " URV                  S[B        RX                  " S	5      -  5        O[        U[Z        5      (       a  UR\                   H  n	UR^                  n
UR`                  U	   S
:w  a  [b        UR`                  U	      n
U
" UR:                  U	S9u  p[        R                   " [e        X S35      U5        [        R                   " [e        X S35      U5        M     [g        US5      (       a6  [        R$                  " URh                  U R:                  Rh                  5        g g )Nr_   ry   r   r'  )ry   rz   r"   r$   r          @defaultro  	_inv_freq_original_inv_freqr  )5rS   _init_weights
isinstancer>  initones_rX   r   zeros_r   r   r4   rU   r   r   r   copy_r   	constant_r   r   r   r   r  r  r  r$  r(  r&  ru   r   rs   r   r   r   rh   r   r   r   r{   r   Gemma3nTextModelper_layer_projection_scaleper_layer_input_scaler  Gemma3nRotaryEmbeddingrl  compute_default_rope_parameters	rope_typer   rr  rn  r  )rY   rX  r   r   r   r   r   r   r{   ro  rope_init_fncurr_inv_freqr   rZ   s                r9   r  $Gemma3nPreTrainedModel._init_weightsa  s	   f%f=>>JJv}}% 566KK,,-oot+G!4!4!=!=ell3>O!PPLJJv~~w'=>NN6>>6+K+KLJJv55v7\7\7^_ >??NN6--v/H/HI 011KK334NN644dkk6M6Mt6ST EFF+5(M#__1N&*hhu]/CeMFZ/Z&[^a"A_ '# +UYYu||N7SWnVn7n-ooNJJv,,n.B.B.D.N.Nq.Q.[.[\].^_ 011NN6<<d>N>NPT>TUNN677TYYs^9KL 677$00
%EE##J/9<#6v7G7G
7S#TL#/*#U 

76\+CDmT

76\9K+LM}] 1 6.//NN633T[[5R5RS 0r8   c                 .    U R                   R                  $ rg   
base_modelembed_tokens_per_layerrY   s    r9   get_per_layer_input_embeddings5Gemma3nPreTrainedModel.get_per_layer_input_embeddings  s    555r8   c                 $    XR                   l        g rg   r  rY   r  s     r9   set_per_layer_input_embeddings5Gemma3nPreTrainedModel.set_per_layer_input_embeddings  s    16.r8   Nnew_num_tokenspad_to_multiple_ofmean_resizingre   c                 J   > [         TU ]  UUUS9nU R                  XU5        U$ )N)r  r  r  )rS   resize_token_embeddings_resize_per_layer_embeddings)rY   r  r  r  inputs_embedsrZ   s        r9   r  .Gemma3nPreTrainedModel.resize_token_embeddings  s:     7)1' 8 

 	)).m\r8   c                    U R                   U R                  R                  5       l        U R                  R                  5       R                  (       a  U R                  5       nU R                  XAX#5      n[        US5      (       a  UR                  n[        XV5        UR                  UR                  R                  5        U R                  U5        g g )N_hf_hook)
vocab_sizeru   get_text_configvocab_size_per_layer_inputr  r  _get_resized_embeddingsrn  r  r)   requires_grad_rX   rR   r  )rY   r  r  r  r  new_embeddings_per_layerhooks          r9   r  3Gemma3nPreTrainedModel._resize_per_layer_embeddings  s     DH??##%@;;&&(DD%)%H%H%J"'+'C'C&8J($ -z::-66"#;B$334J4Q4Q4_4_`//0HI Er8   r.   )NNT) r/   r0   r1   r2   r&   r6   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  rj  _can_record_outputsinput_modalitiesr4   no_gradr  r  r  rm   rn   rU   	Embeddingr  r  r7   rp   rq   s   @r9   r  r  N  s   &*#23#46H"IN!"&0* 2
]]_%T %TN67
 &*)-"	d
  $J 	
 
   &*)-"	Jd
J  $JJ 	J Jr8   r  c                      ^  \ rS rSr% Sr\\S'   SrSrS\4U 4S jjr	\
\S\R                  S\R                  S\\   S	\\-  4S
 j5       5       rSrU =r$ )Gemma3nAudioEncoderi  zp
An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
ru   	audio_melr  c                 
  > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l
        U R                  5         g s  snf rg   )rS   rT   ru   r  subsample_conv_projectionrU   
ModuleListrB  conf_num_hidden_layersr  	conformer	post_init)rY   ru   r   rZ   s      r9   rT   Gemma3nAudioEncoder.__init__  si     )LV)T&9>v?\?\9]^9]A'/9]^
 	 _s   B r-   rb  re   c                 >   U R                  U5      nUR                  S   nSn[        [        U R                  R
                  5      5       H!  nX`R                  R
                  U   S   -  nM#     [        R                  " XRR                  S9U-  n[        R                  " XR                  S   S-
  S9nUR                  S:  a?  UR                  S:X  a/  UR                  S5      R                  UR                  S   S5      nOcUR                  UR                  :X  aI  UR                  S   S:X  a6  UR                  S   S:w  a#  XXR                  S   :X  a  UR                  S5      n[        R                  " USU5      n	U R                   H  n
U
" XI5      nM     U R                  R                  S:  a@  USS2SSU R                  R                  24   nU	SS2SSU R                  R                  24   n	UR!                  U	R                  S5      S5      n[#        UU	S9$ )	ad  Encodes a batch of MELs.

Args:
    audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
      mel_bins].

Returns:
    audio_encodings: a torch.Tensor of shape
        `[batch_size, self.config.audio_soft_tokens_per_image,
        self.config.audio_config.hidden_size]`
    audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
r$   r   r   )r   r]   Nr   )last_hidden_stater-   )r  r   rB  rC  ru   rn  r4   r   r   rK  r  r   rT  gatherr   conf_reduction_factormasked_fillr+   )rY   r  r-   rb  r|  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks              r9   rk   Gemma3nAudioEncoder.forward  s   " 88C  %%a($S)J)J%KLO;;#D#D_#UVW#XX  M ,,u-B-BCFYY++g+?+?+BQ+FG "w||q'8''*11.2F2Fq2I2NG7<</$$Q'1,a A%q)) ''*G||NAw?^^E#OBO $ ;;,,q0-a1UDKK4U4U1U.UVO'+Odkk.O.O+O(OPL)55l6L6LR6PRUV--'
 	
r8   )ru   r   r  )r/   r0   r1   r2   r3   r%   r6   main_input_namer  rT   r    r!   r4   ro   r5   r   r   rJ   r+   rk   r7   rp   rq   s   @r9   r  r    sz     !O1   8
8
7<7G7G8
SYZlSm8
	/	/8
   8
r8   r  c                      ^  \ rS rSr% \R
                  \S'   S\4U 4S jjr\	    SS\S-  S\
S   S\S-  S	\S-  S
\S\4   4
S jj5       r\R                   " 5       \SS j5       5       rSrU =r$ )r  i  inv_freqru   c                 f  > [         TU ]  5         UR                  U l        UR                  U l        Xl        [        [        UR                  5      5      U l        0 U l	        U R                   H  nU R
                  R                  U   nUc  M!  US   U R                  U'   U R                  nU R                  U   S:w  a  [        U R                  U      nU" U R
                  US9u  pVU R                  U S3USS9  U R                  U S3UR                  5       SS9  [        X S3U5        M     g )	Nr  r  r  r  Fr|   r  _attention_scaling)rS   rT   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenru   listsetrl  r  rope_parametersr  r   r   r   setattr)rY   ru   ro  rope_paramsr  r  curr_attention_scalingrZ   s          r9   rT   Gemma3nRotaryEmbedding.__init__  s(   "("@"@$*$B$B!F$6$6 78**J++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@Yc4d1M  J<y!9=UZ [  J</A!BMDWDWDYfk lDL(:;=ST +r8   Nr   ztorch.deviceseq_lenro  re   ztorch.Tensorc           	         U R                   U   S   n[        U SS5      =(       d    U R                  U R                  -  nSnSU[        R
                  " SUS[        R                  S9R                  U[        R                  S9U-  -  -  nXv4$ )	a  
Computes the inverse frequencies according to the original RoPE implementation
Args:
    config ([`~transformers.PreTrainedConfig`]):
        The model configuration.
    device (`torch.device`):
        The device to use for initialization of the inverse frequencies.
    seq_len (`int`, *optional*):
        The current sequence length. Unused for this type of RoPE.
    layer_type (`str`, *optional*):
        The current layer type if the model has different RoPE parameters per type.
        Should not be used unless `config.layer_types is not None`

Returns:
    Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
    post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).

rope_thetar   Nry   r   r"   r   r   )	r  rr  r   rs  r4   r   int64r   rh   )ru   r   r  ro  baserN   attention_factorr  s           r9   r  6Gemma3nRotaryEmbedding.compute_default_rope_parameters  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 ))r8   c                 H   [        X S35      n[        X S35      nUS S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        USS	9   UR                  5       UR                  5       -  R                  SS
5      n	[        R                  " X4SS9n
U
R                  5       U-  nU
R                  5       U-  nS S S 5        WR	                  UR                  S9WR	                  UR                  S94$ ! , (       d  f       N@= f)Nr  r  r   r]   r$   mpscpuF)device_typeenabledr"   r   r   )rr  rh   rT  r   r   r   r  r   strr   ra  r4   r   r   r   r   )rY   r   r  ro  r  attention_scalinginv_freq_expandedposition_ids_expandedr(  freqsembr   r   s                r9   rk   Gemma3nRotaryEmbedding.forward=  sd    4<y!9:#DL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfkUC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')//C'')//C	 D vvAGGv$cff177f&;;; DCs   +A.F
F!)ru   rl  r  r  r  NNNNrg   )r/   r0   r1   r2   r4   ro   r6   r'   rT   staticmethodr   rm   r*  rJ   rh   r  r  r   rk   r7   rp   rq   s   @r9   r  r    s    llU0 U* +/+/"!%	!*!D(!*(!* t!* $J	!*
 
~u$	%!* !*F ]]_<  <r8   r  zBThe base Gemma 3n language model without a language modeling head.c                     ^  \ rS rSr% \\S'   SrS\4U 4S jjr\\	" SS9\
       SS\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\S-  S\R                  S-  S\S-  S\\   S\4S jj5       5       5       rS\R                  S\R                  4S jr SS\R                  S	\R                  S-  S\R                  4S jjrSrU =r$ )r  iP  ru   )r  c                 Z  > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  UR                  U R                  U R                  R                  S-  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        [%        U5      U l        SU l        UR                  U l        UR*                  U l        [        UR,                  UR                  UR*                  -  U R                  UR*                  S-  S9U l        [        R0                  " U R                  UR                  UR*                  -  SS9U l        [        UR*                  UR                   S9U l        [        R                  " [        SU R                  R6                  5       Vs/ s H-  n[        R0                  " U R                  U R                  SS9PM/     sn5      U l        [        R                  " [        SU R                  R6                  5       Vs/ s H-  n[        R0                  " U R                  U R                  SS9PM/     sn5      U l        U R=                  S[>        R@                  " U R                  S-  5      SS	9  U R=                  S
[>        RB                  " [>        R@                  " S5      5      SS	9  / U l"        [G        U R                  5       HT  u  pEURH                  RJ                  (       d  M"  U RD                  RM                  S Vs/ s H  nSU SU 3PM     sn5        MV     U RO                  5         g s  snf s  snf s  snf s  snf )N      ?)r  r  Frw   r$   r  r_   r|   r  r  )r   r   r~  r  zlayers.z.self_attn.)(rS   rT   pad_token_idr  r  r  r   ru   embed_tokensrU   r  rB  rv  r  layersrL   r  rr  r  
rotary_embgradient_checkpointingr  r  r  r   per_layer_model_projectionper_layer_projection_normr)  altup_projectionsaltup_unembed_projectionsr   r4   r   rL  "_keys_to_ignore_on_load_unexpected	enumerater  rx  extendr  )rY   ru   r  r   r  layernamerZ   s          r9   rT   Gemma3nTextModel.__init__U  s    !.. ++ ;v1143C3CQUQ\Q\QhQhjmQm
 mmINvOgOgIhiIhI$V7Ihi
 #6#5#56;N;NO	08&+#!--+1+M+M(&D--$$v'I'II::C?	'
# +-))$$v'I'II+
' *88Z8Z`f`s`s)t&!#PUVWY]YdYdYuYuPvwPv1RYYt'')9)9FPvw"
 *,PUVWY]YdYdYuYuPvwPv1RYYt'')9)9FPvw*
& 	95<<HXHXZ^H^;_lqr4ekk%,,sBS6Tafg 35/!$++.HA11177>>@hi@hwqcTF3@hi / 	[ j4 x x js    N74N)4N#+N(
F)tie_last_hidden_statesNr  per_layer_inputsr[  r  rG   r  	use_cacherb  re   c           	         USL USL-  (       a  [        S5      eUb"  U R                  U5      nU R                  U5      nU R                  Xb5      nU(       a  Uc  [	        U R
                  S9nUcU  Ub  UR                  5       OSn	[        R                  " UR                  S   UR                  S9U	-   nUR                  S5      n[        U=n
[        5      (       d)  U R
                  UUUUS.n[        S0 UD6[        S0 UD6S.n
Un[        R                   " US	-  S
SS9S-  n[        R"                  " S5      nU/n[%        SU R
                  R&                  5       H  nU R(                  US-
     " U5      nUR+                  UR,                  UR                  S9n[        R                   " US	-  S
SS9n[        R.                  " [        R0                  " UUR+                  UR                  5      5      5      nUU-  U-  nUR3                  U5        M     [        R4                  " USS9n0 n[7        U R
                  R8                  5       H  nU R;                  UUU5      UU'   M     [=        5       n[?        U R@                  SU R
                  RB                   5       HZ  u  nnXR
                  R8                  U      nUSS2SS2USS24   nU" UUU R
                  R8                  U      U4UUUUS.UD6nM\     [        R                   " US   S	-  S
SS9S-  nUS   /n[%        SU R
                  R&                  5       H  nU RD                  US-
     " UU   5      nUR+                  UR,                  UR                  S9n[        R                   " US	-  S
SS9n[        R.                  " [        R0                  " UUR+                  UR                  5      5      5      nUU-  U-  nUR3                  U5        M     [        R4                  " U5      n[        R                   " USS9nU RG                  U5      n[I        UUS9$ )z
per_layer_inputs (torch.Tensor, *optional*, defaults to None):
    Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
N:You must specify exactly one of input_ids or inputs_embedsru   r   r$   r   )ru   r  r[  rG   r  )full_attentionrm  r"   r]   TrG  r5  gh㈵>r  r   )r  r[  r  rG   )r  rG   r.   )%r  r7  get_per_layer_inputsproject_per_layer_inputsr   ru   get_seq_lengthr4   r   r   r   r   r  r  r   r   ra   r   rB  r)  r=  r   r   r  maximumr  stackr  rl  r9  r   r@  r8  rv  r>  rr  r   )rY   r  rF  r[  r  rG   r  rG  rb  past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0target_magnitudeepsilon_tensortemp_hidden_statesr  
altup_projcurrent_hidden_statenew_magnituderH   r  ro  r  decoder_layercausal_maskr  altup_unemb_projs                               r9   rk   Gemma3nTextModel.forward  s1   $ -t";<YZZ  --i8M#88C88Y0*$++>OCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4L ?-FF ++!."0#2 ,K #5"C{"C%F%U%U# ( !::oq&8b$OSVVd+-.q$++667A//A6GJ#-==7L7LUeUlUl=#m !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:; 8 $6A> dkk556J.2oom\[e.f
+ 7 $: )$++6U8U8U*V WA}-kk.E.Ea.HIK.q!Qz:O)#DKK$;$;A$>?	 "2*) /	 	M	 !X  !::mA&6!&;TRVYY+A./q$++667A-1-K-KAPQE-RS`abSc-d#3#6#6_=R=R[k[r[r#6#s !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:; 8 $67

=a8		-0&++
 	
r8   c                     U R                  U5      R                  " / UR                  QU R                  R                  PU R
                  P76 $ rg   )r  r   r   ru   rv  r  )rY   r  s     r9   rL  %Gemma3nTextModel.get_per_layer_inputs   sN    **95== 
__
KK))
 ,,
 	
r8   c                    U R                  U5      nX0R                  R                  UR                  UR                  S9-  nUR
                  " / UR                  S S QU R                  R                  PU R                  P76 nU R                  U5      nUc  U$ UR                  UR                  :w  a   USS U R                  R                  2S S 24   nX2-   U R                  R                  UR                  UR                  S9-  $ )Nr  r]   .)r;  r  r   r   r   r   r   ru   rv  r  r<  r  )rY   r  rF  r  s       r9   rM  )Gemma3nTextModel.project_per_layer_inputs  s&   
 .2-L-L]-[ ? ? B B%%.B.I.I !C !
 	
  4;;  
  "% 
KK)) 
 ,, 

  $==>RS#''%%)9)?)??/5Tt{{7T7T5TVW0WX$74;U;U;X;X%%.B.I.I <Y <
 
 	
r8   )r?  r=  r>  r7  r  r:  r   r  r8  rr  r  r;  r<  r9  r  )NNNNNNNrg   )r/   r0   r1   r2   r'   r6   r  rT   r    r!   r   r4   r  ro   r   rA   rn   r   r   r   rk   rL  rM  r7   rp   rq   s   @r9   r  r  P  s[    70 7r  E2 .204.204(,26!%m
##d*m
  ,,-m
 t+	m

 &&-m
 m
 ((4/m
 $;m
 +,m
 
!m
  3  m
^
e.>.> 
5<< 
 15
||
  ,,-
 
	
 
r8   r  z?The base Gemma 3n language model with a language modeling head.c                   d  ^  \ rS rSr% SS0rSS0rSS/S/40r\\S'   S\4U 4S	 jjr	\
\        SS\R                  S
-  S\R                  S
-  S\R                  S
-  S\S
-  S\R                   S
-  S\R                  S
-  S\S
-  S\\R                  -  S\\   S\4S jj5       5       rSrU =r$ )Gemma3nForCausalLMi#  lm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrH   rF   ru   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r  )
rS   rT   r  r  r  rU   r   r   rf  r  r  s     r9   rT   Gemma3nForCausalLM.__init__*  sU     %f-
 ++yy!3!3V5F5FUS 	r8   Nr  r[  r  rG   r  labelsrG  logits_to_keeprb  re   c	           
          U R                   " SUUUUUUS.U	D6n
U
R                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bF  XR                  R                  -  n[        R                  " U5      nXR                  R                  -  nSnUb  U R                  " XU R                  40 U	D6n[        UUU
R                  U
R                  U
R                  S9$ )a$  
Example:

```python
>>> from transformers import AutoTokenizer, Gemma3nForCausalLM

>>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```)r  r[  r  rG   r  rG  N)rE   rF   rG   rH   rI   r.   )r  r  r  rm   slicerf  ru   final_logit_softcappingr4   r  loss_functionr  r   rG   rH   rI   )rY   r  r[  r  rG   r  rj  rG  rk  rb  outputsrH   slice_indicesrF   rE   s                  r9   rk   Gemma3nForCausalLM.forward3  s   @ ,0:: ,
)%+',
 ,
  118B>SV8W8W~ot4]kmA}a,?@A;;..:kkAAAFZZ'FkkAAAF%%fdooPPD%#33!//))
 	
r8   )rf  r  r  )NNNNNNNr   )r/   r0   r1   r2   _tied_weights_keys_tp_plan_pp_planr'   r6   rT   r   r   r4   r  ro   r   rA   rn   rm   r   r   r   rk   r7   rp   rq   s   @r9   rd  rd  #  s%   *,GH23H_-z:;H0   .2.204(,26*.!%-.;
##d*;
 t+;
 &&-	;

 ;
 ((4/;
   4';
 $;;
 ell*;
 +,;
 
 ;
  ;
r8   rd  c                      ^  \ rS rSrSrS\\-  S\4U 4S jjr  SS\	R                  S-  S\	R                  S-  S	\	R                  4S
 jjrSrU =r$ )Gemma3nMultimodalEmbedderis  zQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                 ^  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R                  5      U l        [        U R                  U R
                  S9U l        [        U R                  U R
                  S9U l        [        R                  " U R                  U R                  SS9U l        [        U R                  U R
                  SS9U l        g )Nr  Frw   )rO   rP   )rS   rT   r   multimodal_hidden_sizer  rO   vocab_offsetr  text_hidden_sizerU   r  	embeddingrL   hard_embedding_normsoft_embedding_normr   embedding_projectionembedding_post_projection_norm)rY   rx  ry  rZ   s      r9   rT   "Gemma3nMultimodalEmbedder.__init__v  s    
 	&7&C&C#$11-::+66 + 7 7doot7R7RS#1$2M2MSWS[S[#\ #1$2M2MSWS[S[#\ $&IId.I.I4K`K`gl$m!.<T=R=RX\X`X`mr.s+r8   Nr  r  re   c                     USL USL-  (       a  [        S5      eUb  U R                  U5      nO.U R                  XR                  -
  5      nU R	                  U5      nU R                  U5      nU R                  U5      $ )a  Embeds token ids or soft tokens for multimodal content into language model space.

Args:
    input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
        `[vocab_offset, vocab_offset + vocab_size)`.
    inputs_embeds: A torch.Tensor containing the soft tokens to embed.

Returns:
    A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
NrI  )r  r  r~  r|  r  r  r  )rY   r  r  emb_normhard_embemb_norm_projs         r9   rk   !Gemma3nMultimodalEmbedder.forward  s     -t";<YZZ$//>H~~i2C2C&CDH//9H11(;22=AAr8   )
r~  r  r  rO   r  r{  r  r}  r|  r  r  )r/   r0   r1   r2   r3   r%   r(   r'   rT   r4   r  ro   rk   r7   rp   rq   s   @r9   rw  rw  s  sq    [t-0CCt 't* .2-1B##d*B ||d*B 
	B Br8   rw  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                     ^  \ rS rSrSrS\4U 4S jjr\\" SS9S\	R                  S\\   S	\\-  4S
 j5       5       r    S S\	R                   S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  4S jjr\           S!S\	R                   S-  S\	R                  S-  S\	R                  S-  S\	R$                  S-  S\	R$                  S-  S\	R                   S-  S\S-  S\	R                   S-  S\	R                  S-  S\	R                   S-  S\S-  S\\   S	\4S jj5       rS rS r\\" SS9S\	R$                  S\	R$                  S\\   S	\\-  4S j5       5       rSrU =r$ )"Gemma3nModeli  Fru   c                   > [         TU ]  U5        [        R                  " UR                  S9U l        UR                  R                  U l        [        R                  " UR                  S9nX l        UR                  R                  U l	        [        R                  " UR                  5      U l        [        UR                  UR                  5      U l        [        UR                  UR                  5      U l        U R                  5         g )NrJ  )rS   rT   r#   from_configvision_configvision_towerry  r  language_modelr  audio_configaudio_towerrw  embed_visionembed_audior  )rY   ru   r  rZ   s      r9   rT   Gemma3nModel.__init__  s     %119M9MN ,,77"..f6H6HI,*0*<*<*W*W'$001D1DE5f6J6JFL^L^_4V5H5H&J\J\]r8   zOProjects the last hidden state from the vision model into language model space.r:   pixel_valuesrb  re   c                    U R                   " S	USSS.UD6nUR                  nUR                  UR                  S   U R                  R
                  R                  U R                  R                  5      R                  SSS5      nX@R                  R
                  R                  S-  -  nU R                  US9Ul
        U$ )
NFT)r  
do_poolingreturn_dictr   r"   r$   r5  r  r.   )r  r  r   r   ru   r  r   vision_soft_tokens_per_imager   r  pooler_output)rY   r  rb  vision_outputsr  s        r9   get_image_featuresGemma3nModel.get_image_features  s     **sQVdhslrs*<< .55##A&KK%%11KK44
 '!Q
	 	 	[[66BBCGG'+'8'8GX'8'Y$r8   Nr  r  image_featuresaudio_featuresc           	      B   Uc  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  R                  S5      nO0XR                  R                  :H  nXR                  R                  :H  nUR                  5       nUR                  S5      R                  UR                  5      nUbO  [        XrR                  S   -  UR                  5       :H  SU SUR                  S   UR                  S   -   35        UR                  5       nUR                  S5      R                  UR                  5      nUbO  [        XR                  S   -  UR                  5       :H  SU SUR                  S   UR                  S   -   35        XV4$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r  r]   z6Image features and image tokens do not match, tokens: z, features: r   r$   z6Audio features and audio tokens do not match, tokens: )get_input_embeddingsr4   r   ru   image_token_idlongr   allaudio_token_idrI  r   r   r   r   numel)	rY   r  r  r  r  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokenss	            r9   get_placeholder_mask!Gemma3nModel.get_placeholder_mask  s:    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;,,.LL!;!;5::VcVjVjk c"g  "+kk.H.H!H!*kk.H.H!H+//1/99"=@@AUAUV%"!4!4R!88N<P<P<RRHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 ,//1/99"=@@AUAUV%"!4!4R!88N<P<P<RRHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 "55r8   input_featuresr[  input_features_maskr  rG   token_type_idsrj  rG  	lm_kwargsc                 	   USL U	SL-  (       a  [        S5      eUGb\  U R                  5       " U5      n	[        R                  " US:  XR                  :  5      n[        R
                  " X[        R                  " U5      5      nU R                  R                  U5      n[        R                  " XR                  R                  :  XR                  R                  :  5      nU R                  R                  U R                  R                  -   S-
  n[        R
                  " UUU5      R                  U	R                  5      nU R                  US9nUR                  U	R                  U	R                  5      nUR!                  S5      n[        R
                  " UUU	5      n	XR                  R                  :  nU R                  R                  U R                  R                  -   S-
  n[        R
                  " UUU5      R                  U	R                  5      nU R                  US9nUR                  U	R                  U	R                  5      nUR!                  S5      n[        R
                  " UUU	5      n	OSnUbe  U R#                  USS9R$                  nUR                  U	R                  U	R                  5      nU R'                  XUS	9u  nnU	R)                  UU5      n	UGb>  UGb:  U R+                  X5) SS9nUR$                  nUR,                  n[        R.                  " U R                  S-
  //[        R0                  UR                  S
9nU R                  US9n [        R
                  " UR!                  S5      U U5      nUR2                  u  n!n"n#U R4                  R6                  U"-
  n$U R9                  U!U$U#5      n%[        R:                  " UU%4SS9nUR                  U	R                  U	R                  5      nU R'                  XUS9u  nn&U	R)                  U&U5      n	U R                  " SSUUUUU	USS.UD6n'[=        U'R>                  U(       a  U'R@                  OSU'RB                  U'RD                  Ub  WOSUb  WS9$ SS9$ )a  
input_features_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Attention mask for `input_features` where non-zero values mark valid audio frames.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

>>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```
NrI  r   r$   )r  r]   T)r  )r  r  r  r   )r  r  )r  rF  r[  r  rG   r  rG  r  )r  rG   rH   rI   r?   r@   r.   )#r  r  r4   r  r  r  
zeros_liker  rL  r  r|  r  r  r   r   r   r   r  r  r  masked_scatterget_audio_featuresr-   r   r  r   ru   audio_soft_tokens_per_imagerT  r   r=   r  rG   rH   rI   )(rY   r  r  r  r[  r  r  rG   r  r  rj  rG  r  per_layer_inputs_maskper_layer_inputs_tokensrF  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskr  r  r   audio_outputsr  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresr  rp  s(                                           r9   rk   Gemma3nModel.forward  s   ` -t";<YZZ  557	BM %*$5$5i1niRqRqFq$r!&+kk2GTYTdTdenTo&p##22GGH_`  ++..;;;YIYIYIfIf=fK %)$5$5$B$BTEVEVEaEa$ade$e!${{;	CXY\\]j]q]qr --8H-IM),,]-A-A=CVCVWM#.#8#8#< !KK(<m][M #&6&6&C&CCJ#'#3#3#@#@4CSCSC^C^#^ab#b #kk*iAUVYYZgZnZnoO++o+FL'??=+?+?ATATUL","6"6r":!KK(;\=YM# #!44\t4TbbN+..}/C/C]EXEXYN$($=$=~ %> %! *889K^\M %*=*I 33NDXfj3kM*88N&55J "'!0C/D.EUZZ`n`u`u!v!%!1!1<N!1!O"[[)=)=b)ACUWefN?M?S?S<m_#';;#J#J]#Z %7%>%>?OQegv%w""YY8N'OUVWN+..}/C/C]EXEXYN$($=$=~ %> %!A! *889K^\M%% 

-)%+'

 

 *%777@G33d!//))2>2JPT2@2L
 	
 SW
 	
r8   c                 .    U R                   R                  $ rg   r  r  r  s    r9   r  +Gemma3nModel.get_per_layer_input_embeddings  s    ""999r8   c                 $    XR                   l        g rg   r  r  s     r9   r  +Gemma3nModel.set_per_layer_input_embeddings  s    5:2r8   zPProjects the last hidden state from the audio encoder into language model space.c                 n    U R                   " X4SS0UD6nU R                  UR                  S9nXTl        U$ )a  
input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
    The tensors corresponding to the input audio.
input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
    The attention mask for the input audio.
r  Tr  )r  r  r  r  )rY   r  r  rb  r  r  s         r9   r  Gemma3nModel.get_audio_features  sN     9=8H8H9
=A9
EK9
 ''m6U6U'V&2#r8   )r  r  r  r  r  r  r  r1  )NNNNNNNNNNN)r/   r0   r1   r2   accepts_loss_kwargsr&   rT   r   r   r4   rA   r   r   rJ   r   r  r  r  ro   r   rn   r=   rk   r  r  r+   r  r7   rp   rq   s   @r9   r  r    si     }  !rs'' +, 
+	+	 t , .2263737*6##d**6 ((4/*6 ))D0	*6
 ))D0*6X  .21537.23704(,2626*.!%F
##d*F
 ''$.F
 ))D0	F

 t+F
 #\\D0F
 &&-F
 F
 ((4/F
 ((4/F
   4'F
 $;F
 ./F
 
$F
 F
P:; !st #\\ +,	
 
/	/ u r8   r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c                   .  ^  \ rS rSrSS0rSrS\4U 4S jjr\S\	R                  S\\   4S	 j5       r\\            SS\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\	R                   S
-  S\	R                   S
-  S\	R                  S
-  S\S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\S
-  S\\	R                   -  S\\   S\4S jj5       5       r            SU 4S jjrS rS rSrU =r$ )Gemma3nForConditionalGenerationi  re  z(model.language_model.embed_tokens.weightFru   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g r  )rS   rT   r  r  rU   r   ry  r   r  rf  r  r  s     r9   rT   (Gemma3nForConditionalGeneration.__init__  sS     !&)
yy!3!3!?!?ASASA^A^ejkr8   r  rb  c                 <    U R                   R                  " U40 UD6$ rg   )r  r  )rY   r  rb  s      r9   r  2Gemma3nForConditionalGeneration.get_image_features  s    zz,,\DVDDr8   Nr  r  r[  r  r  rG   r  r  rj  rG  rk  r  re   c                 \   U R                   " SUUUUUUUUU	U
USS.UD6nUR                  n[        U[        5      (       a  [	        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  5       R                  =nb   UU-  n[        R                  " U5      nUU-  nSnU
b6  U R                  " UXR                  R                  5       R                  40 UD6n[        UUUR                  UR                  UR                   UR"                  UR$                  S9$ )a<  
input_features_mask (torch.Tensor, *optional*, defaults to None):
    The attention mask for the input audio.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in
    `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenizer=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
T)r  r  r  r[  r  r  rG   r  r  rj  rG  r  N)rE   rF   rG   rH   rI   r?   r@   r.   )r  r  r  rm   rm  rf  ru   r  rn  r4   r  ro  r  rC   rG   rH   rI   r?   r@   )rY   r  r  r  r[  r  r  rG   r  r  rj  rG  rk  r  rp  rH   rq  rF   rn  rE   s                       r9   rk   'Gemma3nForConditionalGeneration.forward  sD   D ** 
%)) 3%+)'
 
   118B>SV8W8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F%%ffkk6Q6Q6S6^6^lbklD,#33!//)) ' ; ; ' ; ;
 	
r8   c                 p   > [         TU ]  " U4UUUUU
UU	US.UD6nU(       d  U
(       d  X_S'   XoS'   XS'   U$ )N)rG   r  r[  r  rG  rk  r  is_first_iterationr  r  r  )rS   prepare_inputs_for_generation)rY   r  rG   r  r  r  r  r[  r  r  rG  rk  rj  r  rb  model_inputsrZ   s                   r9   r  =Gemma3nForConditionalGeneration.prepare_inputs_for_generation!	  se    $ w<
+')%))1
 
  Y+7(-;)*2E./r8   c                 6    U R                   R                  5       $ rg   )r  r  r  s    r9   r  >Gemma3nForConditionalGeneration.get_per_layer_input_embeddingsJ	  s    zz88::r8   c                 :    U R                   R                  U5        g rg   )r  r  r  s     r9   r  >Gemma3nForConditionalGeneration.set_per_layer_input_embeddingsM	  s    

11%8r8   )rf  r  )NNNNNNNNNNNr   )NNNNNNNNTNNF)r/   r0   r1   r2   rs  r  r&   rT   r   r4   rA   r   r   r  r   r  ro   r   rn   rm   rC   rk   r  r  r  r7   rp   rq   s   @r9   r  r    s    +,VW}  Eu/@/@ EFSeLf E E  .21537.23704(,2626*.!%-.e
##d*e
 ''$.e
 ))D0	e

 t+e
 #\\D0e
 &&-e
 e
 ((4/e
 ((4/e
   4'e
 $;e
 ell*e
 ./e
 
'e
  e
T   'R;9 9r8   r  )r  rd  r  r  r  r  )r   NN)r$   )fr   collectionsr   collections.abcr   r   dataclassesr   typingr   r4   torch.nnrU   torch.nn.functionalr   r   r	   r  r  r
   cache_utilsr   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   r    utils.output_capturingr!   autor#   configuration_gemma3nr%   r&   r'   r(   accelerate.hooksr)   r+   r=   rC   ModulerL   rs   r   r>  rb  r  r  r  r  r  r  r  r  r   r$  rQ  ro   rm   rW  rh   rJ   re  rh  rj  r  r  r  r  r  rd  rw  r  r  __all__r.   r8   r9   <module>r     s	  *    . !      & ! . ) R 9 k k K F &  H 5  l l 3 
3%? 3  3 
 9!8 9 9( 
 9K 9 9:4RYY 40g)BII g)TaBII aHj,bii j,ZB7		 B7JF")) FRORYY O8Dryy D2(ryy (V 6SR\\ S;RYY ;$#5RYY #5L`'ryy `'F(	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% S[% T\% T\% 5<<%&%D.ELL .u|| .%,, ._b .,l)299 l)^I%8 I%X ^J_ ^J ^JBN
0 N
bL<RYY L<^ abO
- O
 cO
d ^_L
/ L
 `L
^/B		 /Bd v) vvr d9&<o d9d9Nr8   