
    
3j                        S r SSKrSSKJr  SSKJrJrJrJrJ	r	J
r
JrJr  SSKrSSKJr  SSKJs  Jr  SSKJrJr  SSKJrJrJrJrJr  SSKJr  SS	KJr  SS
K J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'  SS/r( " S S\RR                  5      r*S\RV                  S\RV                  4S jr, SRS\RV                  S\RV                  S\RV                  S\-S\RV                  4
S jjr.  SSS\RV                  S\RV                  S\RV                  S\-S\-S\RV                  4S jjr/ " S S\RR                  5      r0 " S S\RR                  5      r1 " S S\RR                  5      r2 " S  S!\RR                  5      r3 " S" S#\RR                  5      r4 " S$ S%\RR                  5      r5 " S& S\RR                  5      r6 " S' S\RR                  5      r7STS)\RR                  S*\8S+\9SS4S, jjr:STS-\8S+\9S\4S. jjr;S/\\8\RV                  4   S0\6S\\8\RV                  4   4S1 jr<S/\\8\RV                  4   S0SS\\8\RV                  4   4S2 jr=SUS4\8S5\9S\64S6 jjr>SUS4\8S5\9S\74S7 jjr?SVS8\8S\\8\4   4S9 jjr@\$" \@" S:S;S<S=9\@" S>S;S<S=9\@" S?S<S@9\@" SAS<S@9SB.5      rA\B" SCSDSESFSGS3S(SH9rC\B" SISJSDSKSLS(SM9rD\%SUS5\9S\74SN jj5       rE\%SUS5\9S\64SO jj5       rF\%SUS5\9S\74SP jj5       rG\%SUS5\9S\64SQ jj5       rHg)WaM  Gemma4 Vision Transformer

Vision encoder from Google's Gemma 4 multimodal model.
Custom ViT with 2D RoPE, Gated MLP, QKV normalization, and 4-norm sandwich blocks.

Paper: https://ai.google.dev/gemma/docs/core/model_card_4
Reference impl: https://github.com/huggingface/transformers (Gemma4VisionModel)

Copyright 2025 Yonghye Kwon
    N)partial)AnyCallableDictListOptionalSetTupleUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)RmsNormDropPath	to_2tupletrunc_normal_tf_use_fused_attn   )build_model_with_cfg)feature_take_indices)
checkpointnamed_apply)generate_default_cfgsregister_model)batch_patchifyGemma4VitEncoderGemma4VitClassifierc            	          ^  \ rS rSrSr   SS\S\S\SS4U 4S jjjrSS	 jrS
\	R                  S\	R                  4S jrSrU =r$ )Gemma4ClippableLinear    zLinear layer with optional input/output clamping.

Used in Gemma4 E4B variant where clamp values are finite and affect output.
When use_clipped=False, behaves as a standard nn.Linear (no buffers registered).
Nin_featuresout_featuresuse_clippedreturnc                   > XES.n[         TU ]  5         X0l        [        R                  " X4SS0UD6U l        U(       a  U R                  S[        R                  " S0 UD65        U R                  S[        R                  " S0 UD65        U R                  S[        R                  " S0 UD65        U R                  S[        R                  " S0 UD65        U R                  5         g g )	NdevicedtypebiasF	input_min	input_max
output_min
output_max) )
super__init__r"   nnLinearlinearregister_buffertorchemptyreset_parameters)selfr    r!   r"   r&   r'   dd	__class__s          P/home/wildlama/miniconda3/lib/python3.13/site-packages/timm/models/gemma4_vit.pyr/   Gemma4ClippableLinear.__init__'   s     /&iiLLL  ekk.C.CD  ekk.C.CD  u{{/D/DE  u{{/D/DE!!#     c                 L   U R                   (       a  U R                  R                  [        S5      * 5        U R                  R                  [        S5      5        U R
                  R                  [        S5      * 5        U R                  R                  [        S5      5        g g )Ninf)r"   r)   fill_floatr*   r+   r,   r7   s    r:   r6   &Gemma4ClippableLinear.reset_parameters;   sn     NN  %,/NN  u.OO!!5<-0OO!!%,/	 r<   xc                    U R                   (       a+  [        R                  " XR                  U R                  5      nU R                  U5      nU R                   (       a+  [        R                  " XR                  U R                  5      nU$ N)r"   r4   clampr)   r*   r2   r+   r,   r7   rC   s     r:   forwardGemma4ClippableLinear.forwardD   sV    A~~t~~>AKKNA@Ar<   )r2   r"   )FNNr#   N)__name__
__module____qualname____firstlineno____doc__intboolr/   r6   r4   TensorrH   __static_attributes____classcell__r9   s   @r:   r   r       sh     !&$$ $ 	$ 
$ $(0 %,,  r<   r   rC   r#   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..N   dim)shaper4   cat)rC   x1x2s      r:   rotate_halfr_   M   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r<   cossinunsqueeze_dimc                 l    UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   $ )z0Apply rotary position embedding to input tensor.)	unsqueezer_   )rC   r`   ra   rb   s       r:   apply_rotary_pos_embre   T   s6     --
&C
--
&CGA,--r<   ndimc           
      J   U R                   S   nSUSU-  -  -  nU/U-  n[        R                  " XSS9n[        R                  " XSS9n	[        R                  " X'SS9n
[        U5       Vs/ s H  n[	        X   X   X   US9PM     nn[        R
                  " USS9$ s  snf )zApply multidimensional RoPE to input tensor.

Splits input along head_dim into ndim parts, applies RoPE to each,
then concatenates back.
rW   rX   rY   )rb   )r[   r4   splitrangere   r\   )rC   r`   ra   rf   rb   num_input_channelsnum_rotated_channels_per_dimsplit_sizesx_parts	cos_parts	sin_partsky_partss                r:   apply_multidimensional_roperr   `   s     #$(:q4x(H#I /047Kkk!b1GC"5IC"5I lqqukvkvfgWZy|S`akv   99W"%%s   -B c                      ^  \ rS rSrSr   SS\S\SS4U 4S jjjrSS jrSS	 jr	SS
 jr
S\R                  S\R                  S\\R                  \R                  4   4S jrSrU =r$ )Gemma4RotaryEmbedding2Dz   z2D Rotary Position Embedding for Gemma4 vision encoder.

Computes RoPE independently for each spatial dimension (x, y),
using theta=100.0 and the partitioned head_dim.
Nhead_dim
rope_thetar#   c           	         > [         TU ]  5         Xl        X l        US-  S-  nU R	                  S[
        R                  " XS[
        R                  S9SS9  U R                  5         g )NrX   inv_freqr%   F)
persistent)	r.   r/   rv   rw   r3   r4   r5   r@   _init_buffers)r7   rv   rw   r&   r'   	num_freqsr9   s         r:   r/    Gemma4RotaryEmbedding2D.__init__   sc     	 $ ]q(	KK	D 	 	
 	r<   c           
          U R                   S-  nSU R                  [        R                  " SUS[        R                  U R
                  R                  S9U-  -  -  nU R
                  R                  U5        g)z.Compute and fill non-persistent buffer values.rX         ?r   r'   r&   N)rv   rw   r4   aranger@   ry   r&   copy_)r7   spatial_dimry   s      r:   r{   %Gemma4RotaryEmbedding2D._init_buffers   sc    mmq(OOQQekk$--J^J^_bmmo
 	H%r<   c                 $    U R                  5         g rE   r{   rA   s    r:   r6   (Gemma4RotaryEmbedding2D.reset_parameters   s    r<   c                 $    U R                  5         g)z"Initialize non-persistent buffers.Nr   rA   s    r:   init_non_persistent_buffers3Gemma4RotaryEmbedding2D.init_non_persistent_buffers   s    r<   rC   position_idsc                 P   [         R                  " 5          U R                  SSS2S4   R                  5       R	                  UR
                  S   SS5      R                  UR                  5      n/ n/ n[        S5       H  nUSS2SS2U4   nUSS2SSS24   R                  5       nUR                  5       UR                  5       -  R                  SS5      n	[         R                  " X4SS9n
UR                  U
R                  5       5        UR                  U
R                  5       5        M     [         R                  " USS9R                  UR                  S9n[         R                  " USS9R                  UR                  S9nSSS5        X4$ ! , (       d  f       WW4$ = f)z
Args:
    x: Hidden states tensor, used only for dtype/device.
    position_ids: (B, N, 2) Gemma4-internal ``(x, y)`` patch coords.

Returns:
    cos, sin: (B, N, head_dim) tensors for RoPE application.
Nr   rW   r   rX   rY   )r'   )r4   no_gradry   r@   expandr[   tor&   ri   	transposer\   appendr`   ra   r'   )r7   rC   r   inv_freq_expandedall_cosall_sinidim_posdim_pos_expandedfreqsembr`   ra   s                r:   rH   Gemma4RotaryEmbedding2D.forward   se     ]]_ $dAtm < B B D K KLL^L^_`Lacegh i l lmnmumu v*,G*,G1X&q!Qw/#*1dA:#6#<#<#> *0025E5K5K5MMXXYZ\]^iiB7swwy)swwy)  ))G,//agg/>C))G,//agg/>C   x! _  Cxs   E3F
F%rv   rw   )      Y@NNrJ   )rK   rL   rM   rN   rO   rP   r@   r/   r{   r6   r   r4   rR   r
   rH   rS   rT   rU   s   @r:   rt   rt   z   s     !&  
 ,& ||   ,,  
u||U\\)	*	   r<   rt   c                     ^  \ rS rSrSr      SS\\\\\4   4   S\S\S\SS4
U 4S	 jjjrSS
 jr	S\S\S\S\
R                  S\\
R                  \
R                  4   4
S jrS\
R                  S\
R                  S\
R                  4S jr  SS\\
R                  \\\
R                  4   4   S\\
R                     S\\
R                     S\\
R                  \
R                  \
R                  4   4S jjrSrU =r$ )Gemma4PatchEmbed   u  Linear patch embedding with a 2D position-embedding table.

Unlike the standard ViT PatchEmbed (Conv2d), Gemma4 uses a Linear projection
on flattened patches plus a separate 2D position-embedding table applied via
one-hot lookup. Inputs may be supplied as:

  * ``(B, C, H, W)`` raw images — patchified inline in P-P-C flat order.
  * ``(B, N, P*P*C)`` pre-patchified P-P-C patches (NaFlex loader output).
  * ``(B, N, Ph, Pw, C)`` pre-patchified unflattened patches.
  * ``dict`` with keys ``patches`` / ``patch_coord`` / ``patch_valid``.

All input-dispatch logic is owned by this module; the parent model simply
calls ``self.patch_embed(x, patch_coord, patch_valid)`` and unpacks the
returned tensors.
N
patch_sizein_chans	embed_dimposition_embedding_sizer#   c                 H  > XVS.n[         T
U ]  5         [        U5      U l        X0l        X@l        U R                  u  p[        R                  " X(-  U	-  U4SS0UD6U l        [        R                  " [        R                  " SXC40 UD65      U l        U R                  5         g )Nr%   r(   FrX   )r.   r/   r   r   r   r   r0   r1   
input_proj	Parameterr4   r5   position_embedding_tabler6   )r7   r   r   r   r   r&   r'   r8   phpwr9   s             r:   r/   Gemma4PatchEmbed.__init__   s     /#J/"'>$))HMB$6	TTQST(*KK2DD)
% 	r<   c                 ,    [        U R                  SS9  g )N{Gz?std)r   r   rA   s    r:   r6   !Gemma4PatchEmbed.reset_parameters   s    66DAr<   
batch_sizepHpWr&   c                 ~   [         R                  " X$S9n[         R                  " X4S9n[         R                  " XVSS9u  px[         R                  " UR	                  5       UR	                  5       /SS9n	U	R                  S5      R                  USS5      n	[         R                  " XU-  [         R                  US9n
X4$ )z:Row-major grid coords in external NaFlex ``(y, x)`` order.)r&   ij)indexingrW   rY   r   r   )	r4   r   meshgridstackflattenrd   r   onesrQ   )r7   r   r   r   r&   ysxsgrid_ygrid_xpatch_coordpatch_valids              r:   _default_patch_coord%Gemma4PatchEmbed._default_patch_coord   s     \\",\\",>kk6>>#3V^^5E"FBO!++A.55j"bIjj"WEJJvV''r<   r   padding_positionsc                 B   UR                  SS9n[        R                  " X0R                  S9nUR	                  SSSS5      R                  U R                  5      nX@R                  -  nUR                  SS9n[        R                  " UR                  S5      S	U5      nU$ )
zCompute position embeddings via one-hot matmul against the 2D table.

``position_ids`` follows the Gemma4-internal ``(x, y)`` convention.
r   minnum_classesrX   r      rY   rW           )rF   Fone_hotr   permuter   r   sumr4   whererd   )r7   r   r   clamped_positionsr   position_embeddingss         r:   _position_embeddings%Gemma4PatchEmbed._position_embeddings  s     )..1.5))-;W;WX//!Q1-001N1NO%(E(EE155!5<#kk''+S2E
 #"r<   rC   r   r   c                    [        U[        5      (       a)  UR                  SU5      nUR                  SU5      nUS   nU R                  u  pEUR                  S:X  aF  UR
                  u  pgpUc$  U R                  XhU-  X-  UR                  5      u  p#[        XU4SSS9u  pOUR                  S:X  aA  UR                  S	S
SSS5      R                  UR
                  S	   UR
                  S
   S5      nOvUR                  S:X  aM  UR
                  u  pjnXU-  -  nUR                  XjXEU5      R                  S	S
SSS5      R                  XjU5      nO[        SUR                   S35      eUc  [        S5      eUcf  US:H  R                  SS9nUR                  5       (       a  U) nO;[        R                   " UR
                  SS [        R"                  UR                  S9nSUS-
  -  nU R%                  UR'                  U R$                  R(                  R*                  5      5      nUR-                  SS9nU) nXR/                  X5      -   nXU4$ )a  Normalize inputs, patchify / project / position-embed.

Accepts external NaFlex ``patch_coord`` (y, x) / ``patch_valid`` inputs
and returns the internal (x, y) form used by the rest of the model.

Returns:
    embeddings: (B, N, embed_dim) patch embeddings with position info added.
    position_ids: (B, N, 2) Gemma4-internal (x, y) coords.
    padding_positions: (B, N) True for padding tokens.
r   r   patches   NF)padchannels_last   r   r   rX   r   rW   z&Expected input ndim in (3, 4, 5); got .z1patch_coord is required for pre-patchified input.rY   r         ?)rW   )dims)
isinstancedictgetr   rf   r[   r   r&   r   r   reshapeview
ValueErrorallanyr4   r   rQ   r   r   weightr'   flipr   )r7   rC   r   r   r   r   B_HWNPPCCsentinelr   r   s                   r:   rH   Gemma4PatchEmbed.forward  s8     a%%{;K%%{;K)A66Q;JA!"+/+D+DQRQRQXZ[ZbZb+c(!!"X5NDAqVVq[ 		!Q1a(00QWWQZLAVVq[ IA#R AqRQ'//1aA>FFqSQA8B  PQQ#r)..2.6H||~~'i#jj%%bq)KDVDV
 SMOOADD!7!7!=!=>? #''U'3(L)),JJ 111r<   )r   r   r   r   r   )   r       (  NNrJ   NN)rK   rL   rM   rN   rO   r   rP   r
   r/   r6   r4   r&   rR   r   r   r   strr   rH   rS   rT   rU   s   @r:   r   r      sj   $ 79 +0 c5c?23    	 
 &)  
   .B(( ( 	(
 LL( 
u||U\\)	*( #,,#  %||# 
	#. 3726	@2U\\4U\\(9#::;@2 "%,,/@2 "%,,/	@2
 
u||U\\5<<7	8@2 @2r<   r   c                   8  ^  \ rS rSr% Sr\R                  R                  \   \	S'            SS\
S\
S\
S\\
   S	\S
\S\S\SS4U 4S jjjr SS\R                  S\R                  S\R                  S\\R                     S\R                  4
S jjrSrU =r$ )Gemma4Attentioni\  a  Gemma4 Vision Attention with QKV normalization and 2D RoPE.

Key features:
- Separate Q, K, V projections (not fused)
- RMSNorm on Q, K (with scale) and V (without scale)
- 2D RoPE applied after normalization
- Attention scale = 1.0 (since QK are normalized)

fused_attnNrZ   	num_headsrv   num_kv_heads	attn_drop	proj_dropnorm_epsuse_clipped_linearsr#   c                 b  > XS.n[         TU ]  5         X l        X0l        U=(       d    UU l        X R                  -  U l        [        5       U l        [        XU-  4SU0UD6U l	        [        XR                  U-  4SU0UD6U l
        [        XR                  U-  4SU0UD6U l        [        X#-  U4SU0UD6U l        [        U4USS.UD6U l        [        U4USS.UD6U l        [        U4USS.UD6U l        ["        R$                  " U5      U l        ["        R$                  " U5      U l        g )Nr%   r"   TepsaffineF)r.   r/   r   rv   r   num_kv_groupsr   r   r   q_projk_projv_projo_projr   q_normk_normv_normr0   Dropoutr   r   )r7   rZ   r   rv   r   r   r   r   r   r&   r'   r8   r9   s               r:   r/   Gemma4Attention.__init__h  s2    /" (5I&*;*;;(*+CX1EmSfmjlm+C1B1BX1Mu[nurtu+C1B1BX1Mu[nurtu+I,@#mSfmjlmhHHTHRHhHHTHRHhIHUIbII.I.r<   rC   rope_cosrope_sin	attn_maskc           	         UR                   u  pVnU R                  U5      R                  XVU R                  U R                  5      nU R                  U5      R                  XVU R                  U R                  5      n	U R                  U5      R                  XVU R                  U R                  5      n
U R                  U5      nU R                  U	5      n	U R                  U
5      n
[        XUSSS9n[        XUSSS9n	UR                  SS5      nU	R                  SS5      n	U
R                  SS5      n
U R                  S:  a4  U	R                  U R                  SS9n	U
R                  U R                  SS9n
U R                  (       aA  [         R"                  " UU	U
UU R$                  (       a  U R&                  R(                  OSSS9nO?XR                  SS	5      -  nUb  X-   nUR+                  S	S9nU R'                  U5      nX-  nUR                  SS5      R-                  XVS	5      nU R/                  U5      nU R1                  U5      nU$ )
NrX   )rf   rb   r   rY   r   r   )r  	dropout_pscalerW   )r[   r  r   r   rv   r  r   r  r  r  r  rr   r   r  repeat_interleaver   r   scaled_dot_product_attentiontrainingr   psoftmaxr   r  r   )r7   rC   r  r  r  r   r   r   qrp   vattns               r:   rH   Gemma4Attention.forward  s    ''aKKNdnndmmDKKNd&7&7GKKNd&7&7G KKNKKNKKN (XAUVW'XAUVW KK1KK1KK1 !##D$6$6A#>A##D$6$6A#>A??..#.2mm$..**A {{2r**D$'<<B<'D>>$'DAKK1%%aB/KKNNN1r<   )r   r   rv   r  r  r   r  r   r  r   r  r  r  r  )	   @   Nr   r   ư>FNNrE   )rK   rL   rM   rN   rO   r4   jitFinalrQ   __annotations__rP   r   r@   r/   rR   rH   rS   rT   rU   s   @r:   r   r   \  s     		%%
  *."""(- / /  / 	 /
 #3- /  /  /  / "& / 
 /  /N 155||5 ll5 ll	5
  -5 
5 5r<   r   c                      ^  \ rS rSrSr     SS\S\S\\   S\S\	S	S4U 4S
 jjjr
S\R                  S	\R                  4S jrSrU =r$ )Gemma4GatedMlpi  zyGated MLP for Gemma4 Vision Encoder.

Uses GELUTanh activation: output = down_proj(gelu_tanh(gate_proj(x)) * up_proj(x))
Nr    hidden_features	act_layerdropr   r#   c                   > XgS.n[         T	U ]  5         [        X4SU0UD6U l        [        X4SU0UD6U l        [        X!4SU0UD6U l        Ub  U" 5       O[        R                  " SS9U l        [        R                  " U5      U l
        g )Nr%   r"   tanhapproximate)r.   r/   r   	gate_projup_proj	down_projr0   GELUactr	  r%  )
r7   r    r#  r$  r%  r   r&   r'   r8   r9   s
            r:   r/   Gemma4GatedMlp.__init__  s     /.{sYlsprs,[qWjqnpq.sYlsprs"+"79;RWWQW=XJJt$	r<   rC   c           	          U R                  U R                  U R                  U R                  U5      5      U R	                  U5      -  5      5      $ rE   )r%  r,  r.  r*  r+  rG   s     r:   rH   Gemma4GatedMlp.forward  s:    yy1B(CdllSTo(UVWWr<   )r.  r,  r%  r*  r+  )Nr   FNN)rK   rL   rM   rN   rO   rP   r   r   r@   rQ   r/   r4   rR   rH   rS   rT   rU   s   @r:   r"  r"    s     -1(-%% !%  )	%
 % "&% 
% %$X X%,, X Xr<   r"  c                     ^  \ rS rSrSr         SS\S\S\S\S\\   S	\S
\S\S\S\\   S\	SS4U 4S jjjr
 SS\R                  S\R                  S\R                  S\\R                     S\R                  4
S jjrSrU =r$ )Gemma4Blocki  zGemma4 Vision Encoder Block with 4-norm sandwich pattern.

Unlike standard ViT (pre-norm with 2 norms), Gemma4 uses:
- input_layernorm (norm1) + post_attention_layernorm (norm2)
- pre_feedforward_layernorm (norm3) + post_feedforward_layernorm (norm4)
NrZ   r   rv   intermediate_sizer   r   r   r   	drop_pathr$  r   r#   c                   > XS.n[         TU ]  5         [        U4SU0UD6U l        [	        SUUUUUUUUS.UD6U l        [        U4SU0UD6U l        [        U4SU0UD6U l        [        SUUU
US.UD6U l	        [        U4SU0UD6U l
        U	S:  a  [        U	5      U l        g [        R                  " 5       U l        g )Nr%   r   )rZ   r   rv   r   r   r   r   r   )r    r#  r$  r   r   r-   )r.   r/   r   norm1r   r  norm2norm3r"  mlpnorm4r   r0   Identityr5  )r7   rZ   r   rv   r4  r   r   r   r   r5  r$  r   r&   r'   r8   r9   s                  r:   r/   Gemma4Block.__init__  s      /S5h5"5
# 

% 3

 

	 S5h5"5
S5h5"5
! 
- 3	

 
 S5h5"5
09C),R[[]r<   rC   r  r  r  c                 $   UnU R                  U5      nU R                  XX4S9nU R                  U5      nXPR                  U5      -   nUnU R	                  U5      nU R                  U5      nU R                  U5      nXPR                  U5      -   nU$ )Nr  )r7  r  r8  r5  r9  r:  r;  )r7   rC   r  r  r  residuals         r:   rH   Gemma4Block.forward  s     JJqMIIa8IAJJqM~~a(( JJqMHHQKJJqM~~a((r<   )r  r5  r:  r7  r8  r9  r;  )	Nr  r   r   r   NFNNrE   )rK   rL   rM   rN   rO   rP   r   r@   r   rQ   r/   r4   rR   rH   rS   rT   rU   s   @r:   r3  r3    s    +/"""",0(-(S(S (S 	(S
  #(S #3-(S (S (S (S (S  )(S "&(S 
(S (S^ 15|| ll ll	
  - 
 r<   r3  c            
       4  ^  \ rS rSrSrSS\S\4U 4S jjjrS\R                  S\R                  S\	\R                  \R                  4   4S	 jr
S\R                  S\R                  S
\R                  S\	\R                  \R                  4   4S jrSrU =r$ )Gemma4VisionPooleri'  u   Spatial pooling for Gemma4 vision encoder output.

Pools patches by averaging within k×k grid cells based on position coordinates.
Output is scaled by sqrt(hidden_size).
hidden_sizepooling_kernel_sizec                 N   > [         TU ]  5         Xl        US-  U l        X l        g )Nr   )r.   r/   rD  root_hidden_sizerE  )r7   rD  rE  r9   s      r:   r/   Gemma4VisionPooler.__init__.  s'    & +S 0#6 r<   hidden_statesr   r#   c           	      T   UR                   S   nU R                  nXD-  nX5-  S:w  a  [        SU SU SU S35      eX5-  nUR                  SS9nUS   R	                  S	S
S9S   S-   n[
        R                  " XtSS9n	U	S   X-  U	S   -  -   n	[        R                  " U	R                  5       U5      R                  5       U-  n
U
R                  SS5      UR                  5       -  n[
        R                  " U
S:H  R                  SS95      nUR                  UR                  5      U4$ )u   2D spatial pooling on a ``k × k`` grid (k = ``self.pooling_kernel_size``).

``N`` patches are binned into ``k^2``-sized cells, so the pool requires
``N % k^2 == 0`` (caller ensures both grid dims divide by k upstream).
``position_ids`` follows the Gemma4-internal ``(x, y)`` convention.
r   r   zCannot pool z tokens with k=z: N must be divisible by k^2=z.. Both grid dimensions must be divisible by k.r   .r   rW   TrZ   keepdimfloor)rounding_mode.r   rX   rY   )r[   rE  r   rF   maxr4   divr   r   longr@   r   logical_notr   r   r'   )r7   rI  r   r   rp   	k_squaredoutput_lengthr   max_xkernel_idxsweightsoutputmasks                r:   _avg_pool_by_positions)Gemma4VisionPooler._avg_pool_by_positions4  sO    "$$E	=Aqc3PQZP[ \? @  (..1.5!&)--"d-CAFJii 1GL!&)UZ;v;N,NN))K,,.>DDFR""1a(=+>+>+@@  'Q,!3!3!3!:;yy,,-t33r<   r   c                     UR                  UR                  S5      S5      nU R                  X5      u  pXR                  -  nX4$ )uG  Spatial pool with ``pooling_kernel_size × pooling_kernel_size`` cells.

Args:
    hidden_states: (B, N, D) encoder output.
    position_ids: (B, N, 2) Gemma4-internal ``(x, y)`` coords.
    padding_positions: (B, N) True for padding tokens.

Returns:
    pooled hidden states (B, N // k^2, D) and validity mask (B, N // k^2).
rW   r   )masked_fillrd   r\  rG  )r7   rI  r   r   pooler_masks        r:   rH   Gemma4VisionPooler.forwardS  sM    " &112C2M2Mb2QSVW%)%@%@%]"%(=(==))r<   )rD  rE  rG  )r   )rK   rL   rM   rN   rO   rP   r/   r4   rR   r
   r\  rH   rS   rT   rU   s   @r:   rC  rC  '  s    7C 7c 7 74 <<4  ,,4 
u||U\\)	*	4>* <<*  ,,*  %||	*
 
u||U\\)	** *r<   rC  c            -       P  ^  \ rS rSrSr                       S=S\\\\\4   4   S\S\S\S\S	\S
\S\S\	\   S\S\
S\
S\S\S\S\S\
S\
S\
S\	\   S\SS4,U 4S jjjr\R                  R                   S>S\S\SS4S jj5       r\R                  R                   S\\   4S j5       r\R                  R                   S\\\4   4S j5       r\R                  R                   S?S \S\\\4   4S! jj5       r\R                  R                   S@S"\SS4S# jj5       r\R                  R                   S@S$\SS4S% jj5       rS&\R4                  SS4S' jr  SAS&\R4                  S(\R4                  S)\R4                  S*\	\\\R4                  /S4      S+\	\   S\R4                  4S, jjr  SAS&\\R4                  \\\R4                  4   4   S-\	\R4                     S.\	\R4                     S\R4                  4S/ jjrS?S&\R4                  S0\S\R4                  4S1 jjr  SAS&\\R4                  \\\R4                  4   4   S-\	\R4                     S.\	\R4                     S\R4                  4S2 jjr       SBS&\\R4                  \\\R4                  4   4   S-\	\R4                     S.\	\R4                     S3\	\\\ \   4      S4\S5\S6\S7\S\\ \R4                     \\R4                  \ \R4                     4   4   4S8 jjr!   SCS3\\\ \   4   S9\S:\S\ \   4S; jjr"S<r#U =r$$ )Dr   ij  u  Gemma4 Vision Encoder.

The pure encoder from Google's Gemma 4 multimodal model. Custom ViT with 2D
RoPE, gated MLP, QKV normalization, and 4-norm sandwich blocks.

When ``standardize=True`` (31B variant), ``std_bias/std_scale`` are applied
after the soft-token pooler (original contract). Other pool modes don't apply it.
Output shape depends on ``global_pool``:

- ``'soft'`` (default): spatial ``k×k`` pooler + ``√D`` scale + optional
  standardization. Output ``(B, num_soft_tokens, embed_dim)``
- ``'avg'``: masked mean over patch tokens; skips the pooler's ``√D`` scale
  and standardization. Output ``(B, embed_dim)``.
- ``'none'`` / ``''``: no pool — returns raw patch tokens
  ``(B, N, embed_dim)``. Useful for building custom pool heads.
Nimg_sizer   r   global_poolr   depthr   rv   r   r4  r   rw   r   rE  standardizer   proj_drop_rateattn_drop_ratedrop_path_rater$  weight_initr#   c                 t  > UUS.n[         TU ]  5         US;   d   SU< 35       eX@l        U=U l        =U l        U l        SU l        SU l        SU l        SU l	        [        U5      U l        Xl        UU l        U=(       d    [        [        R                   SS9n[#        SU R                  UUUS	.UD6U l        ['        SUUS
.UD6U l        [*        R,                  " SUU5       Vs/ s H  nUR/                  5       PM     nn[        R0                  " [3        U5       Vs/ s H  n[5        SUUUU	U
UUUUU   UUS.UD6PM     sn5      U l        [9        UUS9U l        U(       aO  U R=                  S[*        R>                  " U40 UD65        U R=                  S[*        R>                  " U40 UD65        OS U l         S U l!        [E        U R                  5      n[3        U5       Vs/ s H  n[G        UUSU 3S9PM     snU l$        US:X  a  SOUU l%        US:w  a  U RM                  SS9  g g s  snf s  snf s  snf )Nr%   )softavgnone z>global_pool must be one of 'soft', 'avg', 'none' (or ''); got r   NLCFr'  r(  )r   r   r   r   r   )rZ   r   rv   r   r4  r   r   r   r5  r$  r   )rD  rE  std_bias	std_scaleblocks.)num_chs	reductionmoduleskipresetneeds_resetr-   )'r.   r/   rd  num_featureshead_hidden_sizer   r   
output_fmtnum_prefix_tokensgrad_checkpointingr   r   rE  r   r   r0   r-  r   patch_embedrt   
rotary_embr4   linspaceitem
ModuleListri   r3  blocksrC  poolerr3   r5   rq  rr  rQ  r   feature_infoweight_init_modeinit_weights)r7   rc  r   r   rd  r   re  r   rv   r   r4  r   rw   r   rE  rf  r   rg  rh  ri  r$  rj  r&   r'   r8   rC   dprr   _redr9   s                                r:   r/   Gemma4VitEncoder.__init__|  s{   4 /99 	]L[O\	]9&ENNND1DN  !""' $J/#6 #6 Ef!E	 , 
$;	

 
 2 
!
 
 "'>5!IJ!IAqvvx!IJmm 5\%
 "  #!)"3!((a&#$7  "%
 & )! 3
   U[[-Ib-IJ  ekk).Jr.JK DM!DN
 4??#dijodpqdp_`T)tgVWUXMZdpq+6&+@k& %0 !W K%
L rs   'H+ $H0%H5moderz  c                 <   U=(       d    U R                   nUS;   d   eU R                  b)  [        R                  R	                  U R                  5        U R
                  b)  [        R                  R                  U R
                  5        [        [        XS9U 5        g)ao  Initialize model weights.

Args:
    mode: Init mode. '' applies trunc-normal-TF Linear init; 'reset'
        only calls ``reset_parameters`` on each sub-module.
    needs_reset: If True, call ``reset_parameters`` on modules that
        have one (for post-``to_empty()`` reinit). Set to False during
        ``__init__`` since modules already self-initialize there.
)ro  rx  Nry  )	r  rq  r0   initzeros_rr  ones_r   get_init_weights_gemma4_vitr7   r  rz  s      r:   r  Gemma4VitEncoder.init_weights  sq     ,t,,}$$$ ==$GGNN4==)>>%GGMM$..)/NPTUr<   c                     S1$ )Nz$patch_embed.position_embedding_tabler-   rA   s    r:   no_weight_decay Gemma4VitEncoder.no_weight_decay  s    677r<   c                     U R                   $ zHReturn the 2-tuple patch size. For NaFlex dataloader / transform wiring.)r   rA   s    r:   get_patch_sizeGemma4VitEncoder.get_patch_size  s     r<   coarsec                     [        SSS/S9$ )Nz^patch_embed|^rotary_emb)z^blocks\.(\d+)N)z^pooler|^std_)i stemr  r   r7   r  s     r:   group_matcherGemma4VitEncoder.group_matcher  s    ,-/KL
 	
r<   enablec                     Xl         g rE   )r  r7   r  s     r:   set_grad_checkpointing'Gemma4VitEncoder.set_grad_checkpointing  s    "(r<   enabledc                 l    U R                  5        H   n[        U[        5      (       d  M  Xl        M"     g)ao  Toggle the ``Gemma4ClippableLinear`` clamp ops.

Gemma4's pretrained E4B checkpoint ships finite clamp buffers on every
projection, which can saturate and stall gradient flow during classifier
fine-tuning. Setting ``enabled=False`` skips both the input and output
clamps in the forward pass (buffers are left untouched so the call is
reversible and checkpoint-safe).
N)modulesr   r   r"   )r7   r  mods      r:   set_clamp_enabled"Gemma4VitEncoder.set_clamp_enabled  s'     <<>C#455") "r<   rC   c                    UR                   S:w  d  U R                  S:w  a  gUR                  SS u  p#U R                  u  pEU R                  nXF-  XV-  pX'-  S:w  d  X8-  S:w  a  [        SU SU SU SU S	U SU S
35      eg)zWhen using the soft-token pooler, raw-image H/W must divide
by ``patch_size * pooling_kernel_size`` so the pool cell grid is integral.
Pre-patchified / NaFlex inputs are assumed to be conformant already.
r   rl  Nr  r   zImage size (z, z=) must be divisible by (patch_size * pooling_kernel_size) = (z3) when global_pool='soft'. Resize to multiples of (z#), or use global_pool='avg'/'none'.)rf   rd  r[   r   rE  r   )	r7   rC   r   r   r   r   rp   cell_hcell_ws	            r:   _assert_raw_img_conformant+Gemma4VitEncoder._assert_raw_img_conformant$  s    
 66Q;$**f4wwrs|$$:?ajAoqcA3 '99?6( K++1("VH<_a  .r<   r   r   block_callbackmax_block_indexc           	      8   UR                   SS u  pgU R                  X5      u  pSn
UR                  5       (       aR  [        R                  " USSXqR
                  UR                  S9n
U
R                  USS2SSSS24   [        S5      5        Uc  U R                  nOU R                  SUS-    n[        U5       H[  u  pU R                  (       a1  [        R                  R                  5       (       d  [        XXU
5      nOU" XXS9nUc  MS  U" X5        M]     U$ )z?RoPE + transformer-block pipeline over already-embedded tokens.NrX   r   r%   z-infr?  )r[   r  r   r4   zerosr&   r'   masked_fill_r@   r  	enumerater  r  is_scriptingr   )r7   rC   r   r   r  r  r   r   r  r  r  r  r   blks                 r:   _encodeGemma4VitEncoder._encode6  s     wwr{!__Q=,0	  ""Aq!QxxqwwOI""#4QdA5E#FfV"[[F[[!5/A"56F'FA&&uyy/E/E/G/Gsx9EXC)q$ ( r<   r   r   c                     U R                  [        U[        5      (       d  UOUS   5        U R                  XU5      u  pnU R	                  XU5      $ )z9Raw patch tokens pre-pool. Returns ``(B, N, embed_dim)``.r   )r  r   r   r  r  )r7   rC   r   r   r   r   s         r:   forward_features!Gemma4VitEncoder.forward_featuresW  sN     	''At1D1D!I,W-1-=-=ak-Z**||A->??r<   
pre_logitsc                     [        S5      e)Nz;Gemma4VitEncoder does not support classification use cases.)NotImplementedError)r7   rC   r  s      r:   forward_headGemma4VitEncoder.forward_headb  s    !"_``r<   c                 P   U R                  [        U[        5      (       d  UOUS   5        U R                  XU5      u  pnU R	                  XU5      nU R
                  S:X  a>  U R                  XU5      u  pU R                  b  XR                  -
  U R                  -  nU$ U R
                  S:X  au  UR                  5       (       aQ  UR                  UR                  S5      S5      nUR                  SS9U) R                  SSS	9R                  SS
9-  nU$ UR                  SS9nU$ )u   Encode + apply the configured pool.

Output shape depends on ``self.global_pool``:
  ``'soft'`` → ``(B, num_soft_tokens, D)``
  ``'avg'``  → ``(B, D)``
  ``'none'`` → ``(B, N, D)`` (raw patch tokens, identical to forward_features)
r   rl  rm  rW   r   r   rY   TrL  r   )r  r   r   r  r  rd  r  rq  rr  r   r_  rd   r   rF   mean)r7   rC   r   r   r   r   r   s          r:   rH   Gemma4VitEncoder.forwarde  s%    	''At1D1D!I,W-1-=-=ak-Z**LL*;<v%;;q0ABDA}}(&$..8  & $$&&MM"3"="=b"A3GEEaEL%6$6#;#;4#;#P#V#V[\#V#]]
  FFqFM r<   indicesnorm
stop_earlyr}  intermediates_onlyc	           	      x  ^^ US;   d   eUS:H  n	[        [        U R                  5      U5      u  mn
[        U[        5      (       d  UOUS   nUR
                  nU R                  U5        U R                  XU5      u  pn/ mS[        S[        R                  SS4UU4S jjnSnU(       a%  [        R                  R                  5       (       d  U
nU R                  UUUUUS	9nU	(       a  US
:w  a  [        S5      eUR                  S   n[        US   R!                  5       R#                  5       5      S-   n[        US   R!                  5       R#                  5       5      S-   nT Vs/ s H7  nUR%                  UUUS5      R'                  SSSS5      R)                  5       PM9     snmU(       a  T$ UT4$ s  snf )aI  Forward features returning intermediates.

Args:
    x: Input tensor ``(B, C, H, W)`` or NaFlex pre-patchified tensor/dict.
    patch_coord: ``(B, N, 2)`` patch coords in NaFlex external (y, x) convention.
    patch_valid: ``(B, N)`` boolean mask, True for valid tokens.
    indices: Block indices to return intermediates for.
    norm: Not used (no final norm in Gemma4 encoder).
    stop_early: Stop iterating after last needed intermediate.
    output_fmt: Output format ('NCHW' or 'NLC'). NCHW requires a fixed full grid.
    intermediates_only: Only return intermediate features.
)NCHWrp  r  r   r   yr#   Nc                 6   > U T;   a  TR                  U5        g g rE   )r   )r   r  intermediatestake_indicess     r:   _cb3Gemma4VitEncoder.forward_intermediates.<locals>._cb  s    L $$Q' !r<   )r  r  r   z:output_fmt='NCHW' requires a raw image (B, C, H, W) input.r   rK  r   rP  rW   r   rX   )r   lenr  r   r   rf   r  r  rP   r4   rR   r  r  r  r   r[   rQ  r  r   r   
contiguous)r7   rC   r   r   r  r  r  r}  r  r   	max_indexrawraw_input_ndimr   r   r  r  r   r   r   r  r  r  s                        @@r:   forward_intermediates&Gemma4VitEncoder.forward_intermediates  s   . _,,,&"6s4;;7G"Qi!!T**a)'',-1-=-=ak-Z**,.	(3 	(5<< 	(D 	( 	( eii4466'OLL+  
 " !]^^""1%A\&)--/4467!;B\&)--/4467!;B`mn`m[\QYYq"b"5==aAqITTV`mnM  - os   )>F7
prune_norm
prune_headc                 v    [        [        U R                  5      U5      u  pEU R                  SUS-    U l        U$ )zPrune layers not required for specified intermediates.

``prune_head`` is accepted for API compatibility; the encoder has no
classifier head so it's a no-op here. (The wrapping ``Gemma4VitClassifier``
handles its own head pruning.)
Nr   )r   r  r  )r7   r  r  r  r  r  s         r:   prune_intermediate_layers*Gemma4VitEncoder.prune_intermediate_layers  s8     #7s4;;7G"Qkk.9q=1r<   )r  r   r  rd  r  r|  r   r{  r~  r}  r  r   r  rE  r  rq  rr  r   r  )r   r   r   rl  r   r   r  r  N   r  r   r   r   FFr   r   r   Nro  NNro  TFTr   NNNFFr  Fr   FT)%rK   rL   rM   rN   rO   r   rP   r
   r   r   r@   rQ   r   r/   r4   r  ignorer  r	   r  r  r   r   r  r  r  rR   r  r  r  r  rH   r   r  r  rS   rT   rU   s   @r:   r   r   j  s/   & 58 % *.%)" %+0'( %(-$'$'$',0!1m1CsCx01m1 m1 	m1
 m1 m1 m1 m1 m1 #3-m1  #m1 m1 m1 &)m1 "%m1  !m1" "&#m1$ "%m1& "'m1( ")m1*  )+m1, -m12 
3m1 m1^ YYV V V V V, YY8S 8 8 YYc3h   YY
D 
T#s(^ 
 
 YY)T )T ) ) YY* * * *ELL T . MQ-1||  ,,  %||	
 %XsELL.A4.G%HI &c] 
H 3726		@U\\4U\\(9#::;	@ "%,,/	@ "%,,/		@
 
	@aell a a a 3726	 U\\4U\\(9#::;  "%,,/  "%,,/	 
 
 J 37267;$$',> U\\4U\\(9#::;>  "%,,/>  "%,,/	> 
 eCcN34>  >  >  >  !%>  
tELL!5tELL7I)I#JJ	K> D ./$#	3S	>*  	
 
c r<   c            5       0  ^  \ rS rSrSr                           S=S\\\\\4   4   S\S\S\S\S	\S
\S\S\S\S\	\   S\S\
S\
S\S\S\S\S\S\
S\
S\
S\
S\	\   S\SS44U 4S jjjr\R                  R                   S>S\S \SS4S! jj5       r\R                  R                   S\\   4S" j5       r\R                  R                   S?S#\S\\\4   4S$ jj5       r\R                  R                   S@S%\SS4S& jj5       r\R                  R                   S@S'\SS4S( jj5       r\R                  R                   S\\\4   4S) j5       r\R                  R                   S\R6                  4S* j5       rSAS\S\	\   SS4S+ jjr  SBS,\\R<                  \\\R<                  4   4   S-\	\R<                     S.\	\R<                     S\R<                  4S/ jjr  SCS,\R<                  S.\	\R<                     S0\S\R<                  4S1 jjr   SBS,\\R<                  \\\R<                  4   4   S-\	\R<                     S.\	\R<                     S\R<                  4S2 jjr!       SDS,\\R<                  \\\R<                  4   4   S-\	\R<                     S.\	\R<                     S3\	\\\"\   4      S4\S5\S6\S7\S\\"\R<                     \\R<                  \"\R<                     4   4   4S8 jjr#   SES3\\\"\   4   S9\S:\S\"\   4S; jjr$S<r%U =r&$ )Fr   i  u  Classification wrapper around ``Gemma4VitEncoder``.

Holds:
    - ``encoder``: a ``Gemma4VitEncoder``. By default constructed with
      ``global_pool=''`` so the classifier does its own masked-mean pool
      over raw patch tokens in ``forward_head``. Pass
      ``encoder_pool='soft'`` to turn on the native VLM spatial ``k×k``
      soft-token pool inside the encoder; ``forward_features`` then
      returns ``(B, num_soft_tokens, D)`` already pooled (plus the ``√D``
      scale and, for the 31B variant, ``std_bias/std_scale``).
    - ``norm``: optional ``RmsNorm`` after pooling.
    - ``head``: linear classifier.

Input/output contract matches timm convention: ``forward_features`` returns
pre-head features (pre-pool ``(B, N, D)`` patch tokens by default, or
``(B, num_soft_tokens, D)`` post-soft-pool when ``encoder_pool='soft'``);
``forward_head`` does classifier pool + norm + head; ``forward`` =
``forward_features`` + ``forward_head``.
Nrc  r   r   r   rd  encoder_poolr   re  r   rv   r   r4  r   rw   r   rE  rf  r   
final_norm	drop_raterg  rh  ri  r$  rj  r#   c                 N  > UUS.n[         TU ]  5         US;   d   SU< S35       eUS;   d   SU< S35       eX@l        XPl        X`l        [        S!0 SU_S	U_S
U_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_UD6U l        U(       a  [        U4USS.UD6O[        R                  " 5       U l
        [        R                  " U5      U l        US:  a  [        R                  " Xt40 UD6O[        R                  " 5       U l        U R                  R                  =U l        U l        U R                  R"                  U l        U R                  R$                  U l        U R                  R&                  U l        U R                  R(                  U l        US :w  av  [+        U R                  [        R                  5      (       aL  U R                  R,                  b4  [        R.                  R1                  U R                  R,                  5        g g g g )"Nr%   rm  rn  ro  BGemma4VitClassifier global_pool must be 'avg', 'none', or '' (got z>); use Gemma4VitEncoder directly for 'soft' VLM-style pooling.)ro  rn  rl  zDGemma4VitClassifier encoder_pool must be '', 'none', or 'soft' (got z).rc  r   r   rd  r   re  r   rv   r   r4  r   rw   r   rE  rf  r   rg  rh  ri  r$  rj  Fr   r   rw  r-   )r.   r/   r   rd  r  r   encoderr   r0   r<  r  r	  	head_dropr1   headr{  r|  r   r   r  r  r   r(   r  r  )r7   rc  r   r   r   rd  r  r   re  r   rv   r   r4  r   rw   r   rE  rf  r   r  r  rg  rh  ri  r$  rj  r&   r'   r8   r9   s                                r:   r/   Gemma4VitClassifier.__init__  su   < /11 	KPQ\P_ `J K	K1 33 	fRS_Rbbde	f3&&( ( 

!
 
 %	

  
 
  
 
 &
 0
 
 "
 %<
 !4
 $
  !4!
" *#
$ *%
& *'
(  )
* $-
0 MWGIH8EHRH\^\g\g\i	I.?JQBIIi;;TVT_T_Ta	 59LL4M4MMD1//,,11 LL55 $ = =& $))RYY//DIINN4Ntyy~~. 5O/ !r<   r  rz  c                     U R                   R                  XS9  [        U R                  [        R
                  5      (       aL  U R                  R                  b4  [        R                  R                  U R                  R                  5        ggg)z>Init encoder + classifier-specific pieces (head bias to zero).r  rz  N)	r  r  r   r  r0   r1   r(   r  r  r  s      r:   r   Gemma4VitClassifier.init_weights@  s\     	!!t!Edii++		0JGGNN499>>* 1K+r<   c                 h    U R                   R                  5        Vs1 s H  nSU 3iM
     sn$ s  snf )Nencoder.)r  r  )r7   rp   s     r:   r  #Gemma4VitClassifier.no_weight_decayG  s/    (,(D(D(FG(F1(1#(FGGGs   /r  c                     [        SS/S9$ )Nz*^encoder\.patch_embed|^encoder\.rotary_emb)z^encoder\.blocks\.(\d+)Nr  r  r  s     r:   r  !Gemma4VitClassifier.group_matcherK  s     >67
 	
r<   r  c                 :    U R                   R                  U5        g rE   )r  r  r  s     r:   r  *Gemma4VitClassifier.set_grad_checkpointingS  s    ++F3r<   r  c                 :    U R                   R                  U5        g rE   )r  r  )r7   r  s     r:   r  %Gemma4VitClassifier.set_clamp_enabledW  s    &&w/r<   c                 6    U R                   R                  5       $ r  )r  r  rA   s    r:   r  "Gemma4VitClassifier.get_patch_size[  s     ||**,,r<   c                     U R                   $ rE   )r  rA   s    r:   get_classifier"Gemma4VitClassifier.get_classifier`  s    yyr<   c                    Xl         Ub  US;   d   SU< S35       eX l        US:  a!  [        R                  " U R                  U5      O[        R
                  " 5       U l        [        U R                  [        R                  5      (       aL  U R                  R                  b4  [        R                  R                  U R                  R                  5        g g g )Nr  r  )r   )r   rd  r0   r1   r   r<  r  r   r(   r  r  )r7   r   rd  s      r:   reset_classifier$Gemma4VitClassifier.reset_classifierd  s    &""55 fTU`Tccdef5*>IAoBIIdnnk:SUS^S^S`	dii++		0JGGNN499>>* 1K+r<   rC   r   r   c                 v    U R                   S:X  a  U R                  XUS9$ U R                  R                  XUS9$ )u-  Pre-head features.

Shape depends on ``encoder_pool``:
  ``''`` / ``'none'``: raw patch tokens ``(B, N, embed_dim)``.
  ``'soft'``: spatial ``k×k`` soft-token pool output
      ``(B, num_soft_tokens, embed_dim)`` (with ``√D`` scale + optional
      ``std_bias/std_scale`` baked in by the encoder).
rl  r   r   )r  r  r  )r7   rC   r   r   s       r:   r  $Gemma4VitClassifier.forward_featuresn  s@     &<<<TT||,,QU`,aar<   r  c                 ~   U R                   S:X  ar  U R                  S:X  d  Uc  UR                  SS9nOOUR                  U) R	                  S5      S5      nUR                  SS9UR                  SSS9R                  SS	9-  nU R                  U5      nU R                  U5      nU(       a  U$ U R                  U5      $ )
u  Pool (if configured) → norm → head_drop → head.

Args:
    x: pre-head features from ``forward_features``.
    patch_valid: ``(B, N)`` valid mask for masked mean over raw patch
        tokens. Ignored when ``encoder_pool='soft'`` (the soft pooler
        has already collapsed padding tokens, and the new token count
        no longer aligns with ``patch_valid``).
    pre_logits: If True, return pre-classifier features.
rm  rl  r   rY   rW   r   TrL  r   )
rd  r  r  r_  rd   r   rF   r  r  r  )r7   rC   r   r  s       r:   r   Gemma4VitClassifier.forward_head  s      u$  F*k.AFFqFMMMK<":":2">DEEaEL;??q$?#G#M#MRS#M#TTIIaLNN1q0DIIaL0r<   c                    [        U[        5      (       a)  UR                  SU5      nUR                  SU5      nUS   nUc-  Ub*  US:H  R                  SS9nUR	                  5       (       a  U) nU R                  XUS9nU R                  XSS9$ )Nr   r   r   rW   rY   r	  )r   )r   r   r   r   r   r  r  )r7   rC   r   r   r   featss         r:   rH   Gemma4VitClassifier.forward  s     a%%{;K%%{;K)A;#:#r)..2.6H||~~'i%%ak%Z   @@r<   r  r  r  r}  r  c	                 B    U R                   R                  UUUUUUUUS9$ )N)r   r   r  r  r  r}  r  )r  r  )	r7   rC   r   r   r  r  r  r}  r  s	            r:   r  )Gemma4VitClassifier.forward_intermediates  s8     ||11##!!1 2 	
 		
r<   r  r  c                 j    U R                   R                  XSS9nU(       a  U R                  S5        U$ )NF)r  r  r   )r  r  r  )r7   r  r  r  r  s        r:   r  -Gemma4VitClassifier.prune_intermediate_layers  s5     ||==gin=o!!!$r<   )r   r  r  r  rd  r  r  r|  r  r   r{  r   r  )r   r   r   i  rm  ro  r   r   r  r  Nr  r  r   r   r   FFTr   r   r   r   Nro  NNr  r  r  rE   r   )NFr  r  )'rK   rL   rM   rN   rO   r   rP   r
   r   r   r@   rQ   r   r/   r4   r  r  r  r	   r  r   r   r  r  r  r  r0   Moduler  r  rR   r  r  rH   r   r  r  rS   rT   rU   s   @r:   r   r     sA   , 58 #$ " *.%)" %+0'( %(-#"$'$'$',0!9Q/CsCx01Q/ Q/ 	Q/
 Q/ Q/ Q/ Q/ Q/ Q/ Q/ #3-Q/  #Q/ Q/ Q/  &)!Q/" "%#Q/$ %Q/& "&'Q/( )Q/* +Q/, "-Q/. "/Q/0 "1Q/2  )3Q/4 5Q/: 
;Q/ Q/f YY+ + + + + YYHS H H YY
D 
T#s(^ 
 
 YY4T 4T 4 4 YY0 0 0 0 YY-c3h - - YY		  +C +hsm +W[ + 3726	bU\\4U\\(9#::;b "%,,/b "%,,/	b
 
b* 37$	1||1 "%,,/1 	1
 
1< 3726	AU\\4U\\(9#::;A "%,,/A "%,,/	A
 
A8 37267;$$',
U\\4U\\(9#::;
 "%,,/
 "%,,/	

 eCcN34
 
 
 
 !%
 
tELL!5tELL7I)I#JJ	K
0 ./$#		3S	>*	 	 		
 
c	 	r<   Trv  namerz  c                 r   [        U [        R                  5      (       aL  [        U R                  SS9  U R
                  b*  [        R                  R                  U R
                  5        gg[        U S5      (       a  U R                  5         gU(       a#  [        U S5      (       a  U R                  5         ggg)zPer-module init for Gemma4Vit (trunc-normal-TF for Linear weights).

Args:
    module: Module to initialize.
    name: Dotted module name (from ``named_apply``).
    needs_reset: If True, call ``reset_parameters`` on modules that define one.
r   r   Nr  r6   )r   r0   r1   r   r   r(   r  r  hasattrr  r6   )rv  r  rz  s      r:   init_weights_gemma4_vitr    s     &"))$$D1;;"GGNN6;;' #		(	(	);<<! =r<   r  c                     [        [        US9$ )Nry  )r   r  r  s     r:   r  r    s    *DDr<   
state_dictmodelc                    0 nSnU R                  5        H  u  pESnU H  nUR                  U5      (       d  M  Un  O   Uc  UR                  S5      (       a  XRU'   MI  U[        U5      S nSU;   a  M_  UR                  SS5      nUR                  SS5      nUR                  S	S
5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nUR                  SS5      nXRU'   M     U$ )u  Convert HuggingFace Gemma4 vision encoder weights → ``Gemma4VitEncoder`` keys.

Pure key remapping — no value transforms. The encoder consumes patches in
HF's native C-P-P flat layout (via ``batch_patchify(channels_last=False)``
in ``Gemma4PatchEmbed``), so ``input_proj.weight`` passes through unchanged.
``Gemma4ClippableLinear`` preserves HF's ``.linear.weight`` + clamp-buffer
structure, so those pass through too.
)zmodel.vision_tower.zmodel.vision_model.zvision_model.zvision_tower.N)patch_embed.rs  std_zpooler.zrotary_emb.r  zpatch_embedder.r  zencoder.layers.rs  z.input_layernorm.z.norm1.z.post_attention_layernorm.z.norm2.z.pre_feedforward_layernorm.z.norm3.z.post_feedforward_layernorm.z.norm4.z.self_attn.z.attn.)items
startswithr  replace)	r  r  out_dicthf_prefixesrp   r  matched_prefixprefixnew_ks	            r:   checkpoint_filter_fn_encoderr'    s    HbK  "!F||F##!' "
 !||YZZ#n%&' 5 /@/;19=:IF;YG<iHmX67 #: Or<   c                    SnU R                  5        VVs0 s H5  u  p4UR                  U5      (       d  UR                  S5      (       d  M3  X4_M7     nnnU R                  5        VVs0 s H  u  p4X5;  d  M  X4_M     nnn[        XaR                  5      nUR                  5        VVs0 s H  u  p4SU 3U_M     nnnUR	                  U5        U$ s  snnf s  snnf s  snnf )uF  Convert HF or timm-encoder state dict → ``Gemma4VitClassifier`` keys.

Runs the encoder filter, then prefixes every encoder-owned key with
``encoder.``. Top-level keys that belong to the classifier (``norm.*``,
``head.*``) pass through unchanged — they're absent from HF checkpoints, so
the model's own init values stand.
)znorm.zhead.r  )r  r   r'  r  update)	r  r  classifier_local_prefixesrp   r  classifier_local	to_filterencoder_dictprefixeds	            r:   checkpoint_filter_fn_classifierr/    s     !3)3)9)9); `);<<(ABBallS]F^ ); `","2"2"4R"4$!8Q"4IR/	==IL.:.@.@.BC.Bda(1#!.BHCOO$%O`R Ds   2CC(C7C(CFvariant
pretrainedc           	      j    UR                  SS5      n[        [        U U4[        [	        USS9SS.UD6$ )Nout_indicesr   getterr3  feature_clsr   )pretrained_filter_fnfeature_cfgkwargs_filter)popr   r   r'  r   r0  r1  kwargsr3  s       r:   _create_gemma4_vit_encoderr=  4  sI    **]A.K :[hG&  r<   c           	      h    UR                  SS5      n[        [        U U4[        [	        USS9S.UD6$ )Nr3  r   r4  r5  )r7  r8  )r:  r   r   r/  r   r;  s       r:   _create_gemma4_vit_classifierr?  A  sF    **]A.K =[hG  r<   urlc                 $    U SSSS SSSSSS	S
S.UE$ )Nr   )r   r   r   )r   `   rB  r   bicubicF)r   r   r   )r   r   r   zpatch_embed.input_projr  )r@  r   
input_sizemin_input_size	pool_sizecrop_pctinterpolationfixed_input_sizer  r   
first_conv
classifierr-   )r@  r<  s     r:   _cfgrL  N  s>    # &"!
  .'( ) r<   z"timm/gemma4_vit_167m.gemma4_e4b_itzencoder.patch_embed.input_projz
apache-2.0)	hf_hub_idrJ  licensez"timm/gemma4_vit_570m.gemma4_31b_itz&timm/gemma4_vit_167m_enc.gemma4_e4b_it)rM  rN  z&timm/gemma4_vit_570m_enc.gemma4_31b_it)zgemma4_vit_167m.gemma4_e4b_itzgemma4_vit_570m.gemma4_31b_itz!gemma4_vit_167m_enc.gemma4_e4b_itz!gemma4_vit_570m_enc.gemma4_31b_itr   r   r  r  r  )r   re  r   rv   r4  rf  r   i     H   i  )r   re  r   rv   r4  rf  c           	      L    [        [        SS9n[        SSU 0[        U40 UD6D6$ )zGemma4 ~167M (E2B/E4B  vision tower) classifier.

Masked mean pool over patch tokens + norm + linear classifier. Output:
``(B, num_classes)``. For the native VLM encoder interface (soft-token
output), use ``gemma4_vit_167m_enc``.
Tr  r1  )gemma4_vit_167m)r   
_167M_ARCHr?  r1  r<  
model_argss      r:   rS  rS    s0     jT2J(pzpUYZdUohnUoppr<   c           	      L    [        [        SS9n[        SSU 0[        U40 UD6D6$ )u   Gemma4 ~167M (E2B/E4B vision tower) — native VLM encoder.

``global_pool='soft'`` applies the spatial ``k×k`` soft-token pool + √D
scale; output: ``(B, num_soft_tokens, embed_dim)``. Bit-perfect with HF
``Gemma4VisionModel`` on matching weights.
rl  rd  r1  )gemma4_vit_167m_enc)r   rT  r=  rU  s      r:   rY  rY    0     jf5J%q
qVZ[eVpioVpqqr<   c           	      L    [        [        SS9n[        SSU 0[        U40 UD6D6$ )aS  Gemma4 ~570M (26B/31B vision tower) classifier.

Masked mean pool over patch tokens + norm + linear classifier. Output:
``(B, num_classes)``. The classifier uses ``encoder.forward_features`` and
does its own pool + norm; the encoder's ``std_bias/std_scale`` (which only
applies in ``'soft'`` pool mode) is not used on the classifier path.
TrR  r1  )gemma4_vit_570m)r   
_570M_ARCHr?  rU  s      r:   r\  r\    s0     jT2J(pzpUYZdUohnUoppr<   c           	      L    [        [        SS9n[        SSU 0[        U40 UD6D6$ )u   Gemma4 ~570M (26B/31B vision tower) — native VLM encoder.

``global_pool='soft'`` output ``(B, num_soft_tokens, embed_dim)`` with
``std_bias/std_scale`` standardization applied post-pool (HF-native
ordering — bit-perfect with ``Gemma4VisionModel``).
rl  rX  r1  )gemma4_vit_570m_enc)r   r]  r=  rU  s      r:   r_  r_    rZ  r<   )rX   )rX   rX   r  r  )ro  )IrO   math	functoolsr   typingr   r   r   r   r   r	   r
   r   r4   torch.nnr0   torch.nn.functional
functionalr   	timm.datar   r   timm.layersr   r   r   r   r   _builderr   	_featuresr   _manipulater   r   	_registryr   r   	naflexvitr   __all__r  r   rR   r_   rP   re   rr   rt   r   r   r"  r3  rC  r   r   r   rQ   r  r  r'  r/  r=  r?  rL  default_cfgsr   rT  r]  rS  rY  r\  r_  r-   r<   r:   <module>ro     s  	   I I I     A V V * + 0 < %4
5*BII *Z(5<< (ELL ( 		.<<	.\\	. \\	. 		.
 \\	.  &<<&\\& \\& 	&
 & \\&4Mbii M`O2ryy O2dcbii cLXRYY X8F")) FR@* @*Fkryy k\{")) {|"BII "S "D "\` "$Ec ET EX E
,ell*+,, 
#u||
,^ell*+$ 
#u||
0
 
 
Td 

3 
D 
Wj 
c T#s(^ 2 %%)63&
 &*63& *.:* *.:*#& 0 

 

 q q;N q q rD r?O r r 	q 	q;N 	q 	q rD r?O r rr<   