
    3j-                       S r SSKJr  SSKJrJr  SSKrSSKJr  SSKJ	r	  S/r
 " S S	\R                  5      r " S
 S\R                  5      r " S S\R                  5      r " S S\R                  5      r " S S\R                  5      r " S S\R                  5      r " S S\R                  5      r/ SQrSS jrg)ae  Module that implement Vision Transformer (ViT).

Paper: https://paperswithcode.com/paper/an-image-is-worth-16x16-words-transformers-1

Based on: `https://towardsdatascience.com/implementing-visualttransformer-in-pytorch-184f9f16f632`

Added some tricks from: `https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py`
    )annotations)AnyCallableN)nn)KORNIA_CHECKVisionTransformerc                  6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )ResidualAdd'   c                .   > [         TU ]  5         Xl        g N)super__init__fn)selfr   	__class__s     K/home/wildlama/miniconda3/lib/python3.13/site-packages/kornia/models/vit.pyr   ResidualAdd.__init__(   s        c                8    UnU R                   " U40 UD6nX-  nU$ r   r   )r   xkwargsress       r   forwardResidualAdd.forward,   s&    GGA  	r   r   )r   zCallable[..., torch.Tensor]returnNone)r   torch.Tensorr   r   r   r   __name__
__module____qualname____firstlineno__r   r   __static_attributes____classcell__r   s   @r   r
   r
   '   s     r   r
   c                  0   ^  \ rS rSrSSU 4S jjjrSrU =r$ )FeedForward3   c           	        > [         TU ]  [        R                  " X5      [        R                  " 5       [        R
                  " U5      [        R                  " X#5      [        R
                  " U5      5        g r   )r   r   r   LinearGELUDropout)r   in_featureshidden_featuresout_featuresdropout_rater   s        r   r   FeedForward.__init__4   sI    IIk3GGIJJ|$IIo4JJ|$	
r    )        )
r/   intr0   r6   r1   r6   r2   floatr   r   r!   r"   r#   r$   r   r%   r&   r'   s   @r   r)   r)   3   s    
 
r   r)   c                  6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )MultiHeadAttention>   c                  > [         TU ]  5         Xl        X l        X-  nUS-  U l        U R                  U R                  -  (       a&  [        SU R                   SU R                   S35      e[        R                  " XS-  5      U l        [        R                  " U5      U l
        [        R                  " X5      U l        [        R                  " U5      U l        g )Ng      zySize of embedding inside the transformer decoder must be visible by number of headsfor correct multi-head attention Got: z embedding size and z numbers of heads   )r   r   emb_size	num_headsscale
ValueErrorr   r,   qkvr.   att_drop
projectionprojection_drop)r   r>   r?   rC   	proj_drop	head_sizer   s         r   r   MultiHeadAttention.__init__?   s     ")	_
==4>>)&:4>>:JJ[]  99X!|4

8,))H7!zz)4r   c                6   UR                   u  p#nU R                  U5      R                  X#SU R                  X@R                  -  5      R	                  SSSSS5      nUS   US   US   pn[
        R                  " SXg5      U R                  -  n	U	R                  SS9n	U R                  U	5      n	[
        R                  " S	X5      n
U
R	                  SSSS5      R                  5       R                  X#S5      n
U R                  U
5      n
U R                  U
5      n
U
$ )
Nr=      r         zbhqd, bhkd -> bhqkdimzbhal, bhlv -> bhav )shaperB   reshaper?   permutetorcheinsumr@   softmaxrC   
contiguousviewrD   rE   )r   r   BNCrB   qkvattouts              r   r   MultiHeadAttention.forwardS   s   ''a hhqk!!!4>>1;NOWWXY[\^_abdefa&#a&#a&a ll/6Ckkbk!mmC  ll0#9kk!Q1%00277bAooc"""3'
r   )rC   r>   r?   rD   rE   rB   r@   )
r>   r6   r?   r6   rC   r7   rF   r7   r   r   r   r   r   r   r    r'   s   @r   r:   r:   >   s    5( r   r:   c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )TransformerEncoderBlockh   c                p  > [         TU ]  [        [        R                  " [        R
                  " US5      [        XXC5      [        R                  " U5      5      5      [        [        R                  " [        R
                  " US5      [        XS-  XS9[        R                  " U5      5      5      5        g )Nư>rL   )r2   )	r   r   r
   r   
Sequential	LayerNormr:   r.   r)   )r   	embed_dimr?   r2   dropout_attnr   s        r   r    TransformerEncoderBlock.__init__i   s    LLD1&y\XJJ|, LLD1	q=)_JJ|,	
r   r4   )
ri   r6   r?   r6   r2   r7   rj   r7   r   r   r8   r'   s   @r   rc   rc   h   s    
 
r   rc   c                  Z   ^  \ rS rSr     S           SU 4S jjjrSS jrSrU =r$ )TransformerEncoder|   c                   >^^^^ [         TU ]  5         [        R                  " UUUU4S j[	        U5       5       6 U l        / U l        g )Nc              3  @   >#    U  H  n[        TTTT5      v   M     g 7fr   )rc   ).0_rj   r2   ri   r?   s     r   	<genexpr>.TransformerEncoder.__init__.<locals>.<genexpr>   s#     nam\]%iL,WWams   )r   r   r   rg   rangeblocksresults)r   ri   depthr?   r2   rj   r   s    ` ```r   r   TransformerEncoder.__init__}   s8     	mmnafglamn
 ,.r   c                    / U l         UnU R                  R                  5        H&  nU" U5      nU R                   R                  U5        M(     U$ r   )rw   rv   childrenappend)r   r   r_   ms       r   r   TransformerEncoder.forward   sG    %%'AC&CLL$ ( 
r   )rv   rw   )      r   r5   r5   )ri   r6   rx   r6   r?   r6   r2   r7   rj   r7   r   r   ra   r    r'   s   @r   rm   rm   |   sb     !!.. . 	.
 . . 
. . r   rm   c                  h   ^  \ rS rSrSr     S           SU 4S jjjrS	S jrS
S jrSrU =r	$ )PatchEmbedding   zJCompute the 2d image patch embedding ready to pass to transformer encoder.c                  > [         TU ]  5         Xl        X l        X0l        U=(       d    [
        R                  " XX3S9U l        Ub  U R                  XU45      u  p&X l        OXC-  S-  n[
        R                  " [        R                  " SSU5      5      U l        [
        R                  " [        R                  " US-   U5      5      U l        g )N)kernel_sizestriderJ   rK   )r   r   in_channelsout_channels
patch_sizer   Conv2dbackbone_compute_feats_dims	ParameterrS   randn	cls_token	positions)r   r   r   r   
image_sizer   	feat_sizer   s          r   r   PatchEmbedding.__init__   s     	&($ !sBIIkU_$s&*&>&>Yc?d&e#L ,#1a7Iekk!Q&EFekk)a-&NOr   c                    U R                  [        R                  " S/UQ76 5      R                  5       nUR                  S   UR                  S   UR                  S   -  4$ )NrK   rM   )r   rS   zerosdetachrP   )r   r   r_   s      r   r   "PatchEmbedding._compute_feats_dims   sO    mmEKK7J78??Ayy}ciimciim;;;r   c                   U R                  U5      nUR                  u  p#  nUR                  X#S5      R                  SSS5      nU R                  R                  USS5      n[        R                  " XQ/SS9nXR                  -  nU$ )NrM   r   rJ   rK   rN   )	r   rP   rW   rR   r   repeatrS   catr   )r   r   rX   rY   rr   
cls_tokenss         r   r   PatchEmbedding.forward   sy    MM!WW
aFF1$$Q1-^^**1a3
IIzo1-	^^r   )r   r   r   r   r   r   )r=   r         N)r   r6   r   r6   r   r6   r   r6   r   nn.Module | Noner   r   )r   ztuple[int, int, int]r   ztuple[int, int]ra   )
r!   r"   r#   r$   __doc__r   r   r   r%   r&   r'   s   @r   r   r      sr    T %)PP P 	P
 P #P 
P P0<	 	r   r   c                     ^  \ rS rSrSr         S                   S	U 4S jjjr\S
S j5       rSS jr\	SSS jj5       r
SrU =r$ )r      a  Vision transformer (ViT) module.

The module is expected to be used as operator for different vision tasks.

The method is inspired from existing implementations of the paper :cite:`dosovitskiy2020vit`.

.. warning::
    This is an experimental API subject to changes in favor of flexibility.

Args:
    image_size: the size of the input image.
    patch_size: the size of the patch to compute the embedding.
    in_channels: the number of channels for the input.
    embed_dim: the embedding dimension inside the transformer encoder.
    depth: the depth of the transformer.
    num_heads: the number of attention heads.
    dropout_rate: dropout rate.
    dropout_attn: attention dropout rate.
    backbone: an nn.Module to compute the image patches embeddings.

Example:
    >>> img = torch.rand(1, 3, 224, 224)
    >>> vit = VisionTransformer(image_size=224, patch_size=16)
    >>> vit(img).shape
    torch.Size([1, 197, 768])

c
                   > [         TU ]  5         Xl        X l        X0l        X@l        [        X4X!U	5      U l        U R                  R                  n
[        XXgU5      U l
        [        R                  " U
S5      U l        g )Nrf   )r   r   r   r   r   
embed_sizer   patch_embeddingr   rm   encoderr   rh   norm)r   r   r   r   ri   rx   r?   r2   rj   r   
hidden_dimr   s              r   r   VisionTransformer.__init__   sk     	$$&#-kj^fg))66
)*YVbcLLT2	r   c                .    U R                   R                  $ r   )r   rw   )r   s    r   encoder_results!VisionTransformer.encoder_results   s    ||###r   c           
        [        U[        R                  5      (       d  [        S[	        U5       35      eU R
                  / UR                  SS  Q7;  a\  UR                  S   U R                  :w  a?  [        SU R                   SU R
                   SU R
                   SUR                   35      eU R                  U5      nU R                  U5      nU R                  U5      nU$ )Nz)Input x type is not a torch.Tensor. Got: r   r   zInput image shape must be Bxr   z. Got: )
isinstancerS   Tensor	TypeErrortyper   rP   r   rA   r   r   r   )r   r   r_   s      r   r   VisionTransformer.forward   s    !U\\**GQyQRR??"2AGGBCL"22qwwr{dFVFV7V.t/?/?.@$//ARRSTXTcTcSddklmlslsktu  ""1%ll3iin
r   c                8   U R                  S5      u  p4[        U5      nSSSS.SSSS.SSSS.S	S
SS.SSSS.S.U   nUR                  XeS9  [        S0 UD6nU(       a;  [	        U 5      n[
        R                  R                  U5      n	UR                  U	5        U$ )a0  Build ViT model based on the given config string.

The format is ``vit_{size}/{patch_size}``.
E.g. ``vit_b/16`` means ViT-Base, patch size 16x16. If ``pretrained=True``, AugReg weights are loaded.
The weights are hosted on HuggingFace's model hub: https://huggingface.co/kornia.

.. note::
    The available weights are: ``vit_l/16``, ``vit_b/16``, ``vit_s/16``, ``vit_ti/16``,
    ``vit_b/32``, ``vit_s/32``.

Args:
    variant: ViT model variant e.g. ``vit_b/16``.
    pretrained: whether to load pre-trained AugReg weights.
    kwargs: other keyword arguments that will be passed to :func:`kornia.models.vit.VisionTransformer`.

Returns:
    The respective ViT model

Example:
    >>> from kornia.models.vit import VisionTransformer
    >>> vit_model = VisionTransformer.from_config("vit_b/16", pretrained=True)

/   r   r=   )ri   rx   r?   i     r   i      r   i       )vit_tivit_svit_bvit_lvit_h)r   r4   )	splitr6   updater   _get_weight_urlrS   hubload_state_dict_from_urlload_state_dict)
variant
pretrainedr   
model_typepatch_size_strr   model_configmodelurl
state_dicts
             r   from_configVisionTransformer.from_config  s    2 &-]]3%7"
(
 %("1E#&!D#&"E#'"2F#'"2F
  	l:!+F+!'*C;;C@J!!*-r   )r   r   r   r   r   r   r   )	r   r   r=   r   r   r   r5   r5   N)r   r6   r   r6   r   r6   ri   r6   rx   r6   r?   r6   r2   r7   rj   r7   r   r   r   r   )r   zlist[torch.Tensor]ra   )F)r   strr   boolr   r   r   r   )r!   r"   r#   r$   r   r   propertyr   r   staticmethodr   r%   r&   r'   s   @r   r   r      s    < !!%)33 3 	3
 3 3 3 3 3 #3 
3 3. $ $ + +r   )zvit_l/16zvit_b/16zvit_s/16z	vit_ti/16zvit_b/32zvit_s/32c                r    [        U [        ;   SU  S35        U R                  S5      u  pSU U SU SU S3$ )z$Return the URL of the model weights.zVariant z% does not have pre-trained checkpointr   zhttps://huggingface.co/kornia/z_augreg_i21k_r224/resolve/main/-z.pth)r   _AVAILABLE_WEIGHTSr   )r   r   r   s      r   r   r   7  sY    ..(7)Ch0ij$]]3/J+J<
|Cbcmbnnopzo{{  A  Ar   )r   r   r   r   )r   
__future__r   typingr   r   rS   r   kornia.core.checkr   __all__Moduler
   rg   r)   r:   rc   rm   r   r   r   r   r4   r   r   <module>r      s   $ #     *
	")) 	
"-- 
' 'T
bmm 
( 0(RYY (Vr		 rj _ Ar   