
    3jN                     :   S r SSKrSSKrSSKJs  Jr  SSKJr  SSKJr  SSK	J
r  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJrJrJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)J*r*J+r+  SSK,J-r-J.r.J/r/  SSK0J1r1  \" SS9\ " S S\+5      5       5       r2\" SS9\ " S S\*5      5       5       r3\" SS9\ " S S\)5      5       5       r4 " S S\/5      r5 " S  S!\'5      r6 " S" S#\&5      r7 " S$ S%\Rp                  5      r9 " S& S'\#5      r: " S( S)\-5      r; " S* S+\5      r< " S, S-\.5      r= " S. S/\Rp                  5      r>\ " S0 S1\5      5       r?\" S2S39 " S4 S5\?5      5       r@\" S6S39 " S7 S8\?5      5       rA\ " S9 S:\"5      5       rB/ S;QrCg)<z%Pytorch implementation of AIMv2 Model    N)strict)nn   )initialization)PreTrainedConfig)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )	CLIPModelCLIPTextEmbeddings_get_vector_norm)LlamaMLPLlamaRMSNorm)SiglipConfigSiglipTextConfigSiglipVisionConfig)SiglipAttentionSiglipEncoderSiglipOutput)&build_2d_sinusoidal_position_embeddingz!apple/aimv2-large-patch14-224-lit)
checkpointc                       \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\\S'   S	r
\\S
'   Sr\\\   -  \\\4   -  \S'   Sr\\S'   Sr\\-  \S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   \" 5       rSrg)Aimv2VisionConfig)   a  
use_head (`str`, *optional*, defaults to `True`):
    Whether to use Attention Pooling Head or Not.
is_native (`str`, *optional*, defaults to `False`):
    Whether to use ckpt trained for image native resolution or not.

Example:

```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel

>>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = Aimv2VisionConfig()

>>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = Aimv2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```i   hidden_sizei   intermediate_size   num_hidden_layers   num_attention_heads   
patch_sizeh㈵>rms_norm_eps        attention_dropoutFqkv_biasmlp_biassilu
hidden_act{Gz?initializer_rangeTuse_head	is_native N)__name__
__module____qualname____firstlineno____doc__r$   int__annotations__r%   r'   r)   r+   listtupler-   floatr/   r0   boolr1   r3   strr5   r6   r7   AttributeErrorlayer_norm_eps__static_attributes__r8       a/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/aimv2/modular_aimv2.pyr"   r"   )   s    * K!s!s  46Jd3i%S/16L%%(us{(HdHdJ#u#HdIt#%NrH   r"   c                       \ rS rSr% Sr\\S'   Sr\\S'   Sr\\S'   Sr	\\S	'   S
r
\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   Sr\\S'   \" 5       r\" 5       r\" 5       r\" 5       rS rSrg)Aimv2TextConfigR   i   
vocab_sizei   r$   i   r%      r'      r)   M   max_position_embeddingsr2   r3   r,   r-   Fr0   r1   r4   r5   c                 0    [         R                  " S0 UD6  g )Nr8   )r   __post_init__)selfkwargss     rI   rS   Aimv2TextConfig.__post_init__e   s    &&00rH   r8   N)r9   r:   r;   r<   rM   r>   r?   r$   r%   r'   r)   rQ   r3   rD   r-   rB   r0   rC   r1   r5   rE   bos_token_idpad_token_idrF   projection_sizerS   rG   r8   rH   rI   rK   rK   R   s     JK!s!s  #%S%JL%HdHd#u#!#L!#L#%N$&O1rH   rK   c                   D    \ rS rSr% SrSr\\S'   Sr\	\S'   Sr
\	\S'   S	rg
)Aimv2Configi   a  
max_logit_scale (`float`, *optional*, defaults to `100.0`):
    The maximum logit scale to use

Example:

```python
>>> from transformers import Aimv2Config, Aimv2Model

>>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2Config()

>>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
>>> model = Aimv2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
>>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

>>> # Initializing a AIMv2Text and AIMv2Vision configuration
>>> config_text = Aimv2TextConfig()
>>> config_vision = Aimv2VisionConfig()

>>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
```i   projection_dimg/L
F@logit_scale_init_valueg      Y@max_logit_scaler8   N)r9   r:   r;   r<   r=   r]   r>   r?   r^   rB   r_   rG   r8   rH   rI   r[   r[   i   s(    8 NC$*E*"OU"rH   r[   c                       \ rS rSrSrg)Aimv2Output   r8   Nr9   r:   r;   r<   rG   r8   rH   rI   ra   ra          rH   ra   c                       \ rS rSrSrg)Aimv2RMSNorm   r8   Nrc   r8   rH   rI   rf   rf      rd   rH   rf   c                       \ rS rSrSrg)Aimv2MLP   r8   Nrc   r8   rH   rI   ri   ri      rd   rH   ri   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Aimv2VisionEmbeddings   configc                 B  > [         TU ]  5         Xl        UR                  U l        [        R
                  " UR                  UR                  UR                  UR                  S9U l        [        UR                  UR                  5      U l        UR                  UR                  -  S-  nU R                  R                  (       d%  [        R                  " X!R                  5      U l        U R!                  S["        R$                  " U5      R'                  S5      SS9  g )N)kernel_sizestrider   position_ids   F)
persistent)super__init__rn   r+   r   Conv2dnum_channelsr$   patch_embedrf   r-   rms_norm
image_sizer7   	Embeddingposition_embeddingregister_buffertorcharangeexpand)rT   rn   num_patches	__class__s      rI   rx   Aimv2VisionEmbeddings.__init__   s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-VchirH   pixel_valuesreturnc                 P   UR                  5       u    p#nU R                  U5      R                  S5      R                  SS5      nU R	                  U5      nU R
                  R                  (       a  [        X0R                  -  X@R                  -  U R
                  R                  UR                  UR                  S9nUR                  S   S-  n[        R                  " USUS 24   USS U24   /SS9nUR                  S5      nOU R!                  U R"                  5      nXV-   nU$ )Nr   rt   )heightwidth	embed_dimdevicedtyperu   .dimr   )sizer{   flatten	transposer|   rn   r7   r   r+   r$   r   r   shaper   cat	unsqueezer   rr   )rT   r   _r   r   hidden_states	pos_embedhalfs           rI   forwardAimv2VisionEmbeddings.forward   s   *//11e((6>>qAKKAqQm4;;  >0.++11$++#))I ??2&!+D		9S$%Z#8)C$J:O"PVXYI!++A.I//0A0ABI%1rH   )rn   r{   r+   r   r|   r9   r:   r;   r<   r"   rx   r   Tensorr   rG   __classcell__r   s   @rI   rl   rl      s2    j0 jELL U\\  rH   rl   c                       \ rS rSrSrg)Aimv2TextEmbeddings   r8   Nrc   r8   rH   rI   r   r      rd   rH   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )Aimv2Attention   c                   > [         TU ]  U5        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l	        g )Nbias)
rw   rx   r   Linearr   r0   k_projv_projq_projout_projrT   rn   r   s     rI   rx   Aimv2Attention.__init__   s     iiV__UiiV__UiiV__U		$..$..vWrH   )r   r   r   r   )r9   r:   r;   r<   rx   rG   r   r   s   @rI   r   r      s    X XrH   r   c            	          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\R                  4S	 jjr
S
rU =r$ )Aimv2EncoderLayer   rn   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        UR                  UR                  5      U l	        [        UR                  UR                  5      U l
        g N)rw   rx   r   	attentionri   ffnrf   r$   r-   	rms_norm1	rms_norm2r   s     rI   rx   Aimv2EncoderLayer.__init__   sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNrH   Nr   attention_maskrU   r   c                     U R                  U5      nU R                  " SXBS.UD6u  pVX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)r   r   r8   )r   r   r   r   )rT   r   r   rU   norm_hidden_statesattn_outputr   
mlp_outputs           rI   r   Aimv2EncoderLayer.forward   sa     "^^M:r6Hrkqr%3!^^M:XX01
%2rH   )r   r   r   r   r   )r9   r:   r;   r<   r"   rx   r   r   r   r   r   rG   r   r   s   @rI   r   r      s^    O0 O /3|| t+ +,	
 
 rH   r   c                       \ rS rSrSrg)Aimv2Encoder   r8   Nrc   r8   rH   rI   r   r      rd   rH   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Aimv2AttentionPoolingHead   rn   c                   > [         TU ]  5         UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " [        R                  " SSU R                  5      5      U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   rt   T)rw   rx   r$   r)   	num_headsr   r   r0   r   r   	Parameterr   zeros	cls_tokenoutput_projr   s     rI   rx   "Aimv2AttentionPoolingHead.__init__   s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSrH   r   r   c                    UR                   u  p#nU R                  R                  USS5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nUR	                  USU R
                  X@R
                  -  5      nUR                  SSSS5      nUR                  SSSS5      nUR                  SSSS5      n[        R                  " XU5      n	U	R                  SS5      R	                  USU5      n	U	R                  SS9n	U R                  U	5      n
U
$ )Nru   rt   r   r   r   r   )r   r   r   r   reshaper   r   permuteFscaled_dot_product_attentionr   meanr   )rT   r   
batch_sizeseq_len
hidden_dimr   keyvaluequeryr   outputs              rI   r   !Aimv2AttentionPoolingHead.forward   s8   *7*=*='
ZNN))*b"=	kk-(00dnnV`drdrVrsM*22:XbftftXtu!!*a~~A]^kk!Q1%aAq)aAq)44UG!++Aq199*aT!&&1&-!!+.rH   )r   r$   r   r   r   r   r   r   s   @rI   r   r      s2    	T0 	TU\\ ell  rH   r   c                      ^  \ rS rSr% Sr\\S'   SrSrSr	/ SQr
SrSrSr\R                  " 5       U 4S j5       rS	rU =r$ )
Aimv2PreTrainedModeli  z
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models. The model is only intended for inference and doesn't support finetuning.
rn   aimv2)imageT)r   r   rl   r   c                 ^  > [         TU ]  U5        [        US5      (       a`  [        UR                  [
        R                  5      (       a6  [        R                  " UR                  [        R                  " S5      5        g g [        U[        5      (       a5  [        R                  " UR                  SU R                  R                  S9  g [        U[         5      (       a\  [        R"                  " UR$                  [&        R(                  " UR$                  R*                  S   5      R-                  S5      5        g [        U[.        5      (       a\  [        R"                  " UR$                  [&        R(                  " UR$                  R*                  S   5      R-                  S5      5        g g )Nlogit_scaleg$I$I,@r.   )r   stdru   rs   )rw   _init_weightshasattr
isinstancer   r   r   init	constant_mathlogr   normal_r   rn   r5   rl   copy_rr   r   r   r   r   r   )rT   moduler   s     rI   r   "Aimv2PreTrainedModel._init_weights"  s   f%6=))&,,bll;;v11488H3EF < 9::LL))9V9VW 566JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 344JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5rH   r8   )r9   r:   r;   r<   r=   r[   r?   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr   no_gradr   rG   r   r   s   @rI   r   r     sW    
 !&*# N
]]_
i 
irH   r   zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc                      ^  \ rS rSr% \\S'   Sr\\S.r	S\4U 4S jjr
S\R                  4S jr\\" SS	9\S
\\   S\4S j5       5       5       rSrU =r$ )Aimv2VisionModeli0  rn   r   r   
attentionsc                 >  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  (       a  [        U5      U l        U R                  5         g r   )rw   rx   rn   rl   
embeddingsr   encoderrf   r$   r-   r|   r6   r   head	post_initr   s     rI   rx   Aimv2VisionModel.__init__=  so     /7#F+$V%7%79L9LM==1&9DIrH   r   c                 .    U R                   R                  $ r   )r   r{   rT   s    rI   get_input_embeddings%Aimv2VisionModel.get_input_embeddingsK  s    ***rH   Ftie_last_hidden_statesrU   c                     U R                  U5      nU R                  " SSU0UD6nUR                  nU R                  U5      nU R                  (       a  U R                  U5      OSn[        UUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Siglip2VisionModel

>>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled features
```inputs_embedsNlast_hidden_statepooler_outputr8   )r   r   r
  r|   r6   r   r   )rT   r   rU   r   encoder_outputsr
  r  s          rI   r   Aimv2VisionModel.forwardN  sx    < 5+/<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
rH   )rn   r   r   r   r|   r6   )r9   r:   r;   r<   r"   r?   main_input_namer   r   _can_record_outputsrx   r   Moduler  r   r   r   r   r   r   r   rG   r   r   s   @rI   r   r   0  s~     $O*$
0 +bii +  E2*
 +,*
 
$	*
  3  *
rH   r   zJ
    The text model from AIMv2 without any head or projection on top.
    c            
          ^  \ rS rSrSr\\S.rS\4U 4S jjr	S\
R                  4S jrS r\\" S	S
9\ SS\R$                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )Aimv2TextModeli~  	input_idsr   rn   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  5         g r   )rw   rx   rn   r   r   r   r   rf   r$   r-   r|   eos_token_idr   r   s     rI   rx   Aimv2TextModel.__init__  s_     -f5#F+$V%7%79L9LM"//rH   r   c                 .    U R                   R                  $ r   r   token_embeddingr  s    rI   r  #Aimv2TextModel.get_input_embeddings  s    ...rH   c                 $    XR                   l        g r   r  )rT   r   s     rI   set_input_embeddings#Aimv2TextModel.set_input_embeddings  s    */'rH   Fr  Nr   rU   c                    U R                  U5      nUR                  u  pVn[        R                  " U[        R                  UR
                  S9nUR                  S5      R                  US5      nUb  [        U R                  UUUS S9nU R                  " S	UUS.UD6n	U	R                  n
U R                  U
5      n
U
[        R                  " U
R                  S   U
R
                  S9UR                  [        R                  U
R
                  S9U R                  :H  R                  5       R!                  SS94   n[#        U
US9$ )
N)r   r   r   ru   )rn   r  rr   r   past_key_values)r  r   )r   r   r	  r8   )r   r   r   r   longr   r   r   r   rn   r   r
  r|   tor>   r  argmaxr   )rT   r  r   rU   r   r   r   r   rr   r  r
  pooled_outputs               rI   r   Aimv2TextModel.forward  sG    	2!.!4!4
Q||G5::mFZFZ[#--a077
BG%/{{+)- $N ,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
rH   )rn   r   r   r  r|   r   )r9   r:   r;   r<   r  r   r   r  rK   rx   r   r  r  r  r   r   r   r   r   r   r   r   r   rG   r   r   s   @rI   r  r  ~  s     "O +$
	 	/bii /0  E2 /3&
 t+&
 +,	&

 
$&
  3  &
rH   r  c                       \ rS rSrSrS\4S jr\\   SS\	R                  S-  S\	R                  S-  S\	R                  S-  S	\\   S
\4
S jj5       5       rSrg)
Aimv2Modeli  Trn   c                    [         R                  " X5        UR                  U l        UR                  R                  U l        UR                  R                  U l        [        R                  UR                  5      U l
        [        R                  UR                  5      U l        [        R                  " U R
                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R"                  " [$        R&                  " U R(                  R*                  5      5      U l        [.        R0                  " UR2                  5      U l        U R7                  5         g )NFr   )r   rx   r]   vision_configr$   vision_embed_dimtext_configtext_embed_dimr   _from_configvision_modelr  
text_modelr   r   visual_projectiontext_projectionr   r   tensorrn   r^   r   r   r   r_   max_log_logit_scaler   )rT   rn   s     rI   rx   Aimv2Model.__init__  s      .$33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C rH   Nr  r   r   rU   r   c           	          U R                   " SSU0UD6nU R                  " SUUS.UD6nUR                  nU R                  U5      nUR                  nU R	                  U5      nU[        U5      -  nU[        U5      -  nU R                  R                  SU R                  5      R                  5       R                  UR                  5      n	X-  UR                  5       -  n
U
R                  5       n[        UU
UUUUS9$ )aD  
Examples:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, Aimv2Model

>>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```r   )r  r   r.   )logits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr8   )r-  r.  r  r/  r0  r   r   clampr2  expr!  r   tra   )rT   r  r   r   rU   vision_outputstext_outputsr8  r7  r   r6  r5  s               rI   r   Aimv2Model.forward  s   B 6:5F5F 6
%6
6

 48?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
rH   )	r   r2  r]   r+  r.  r0  r)  r-  r/  )NNN)r9   r:   r;   r<   r   r[   rx   r   r   r   
LongTensorFloatTensorr   r   r   ra   r   rG   r8   rH   rI   r&  r&    s    { $  .215.2	?
##d*?
 ''$.?
 t+	?

 +,?
 
?
  ?
rH   r&  )r[   r"   rK   r   r&  r   r  )Dr=   r   r   torch.nn.functionalr   
functionalr   huggingface_hub.dataclassesr    r   r   configuration_utilsr   masking_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   clip.modeling_clipr   r   r   llama.modeling_llamar   r   siglip.configuration_siglipr   r   r   siglip.modeling_siglipr   r   r   vit_mae.modeling_vit_maer   r"   rK   r[   ra   rf   ri   r  rl   r   r   r   r   r   r   r   r  r&  __all__r8   rH   rI   <module>rV     s   ,     .  & 3 / 9 K - & I I 7 5 P P 9 \ \ Q Q M >?$&* $&  @$&N >?1& 1  @1* >?#, #  @#D	, 		< 		x 	%BII %P	, 	X_ X2 2	= 			 D i? i iD 
F
+ F

F
R 
B
) B

B
J V
 V
 V
rrH   