
    3j 4                     l   S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	J
r
  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJrJrJrJrJr  SSKJr   " S S\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r \" SS9\ " S S\5      5       5       r!\" SS9 " S S\5      5       r"/ S Qr#g)!z7PyTorch DeiT (Data-efficient Image Transformers) model.    )	dataclassN)nn   )initialization)BaseModelOutputWithPoolingMaskedImageModelingOutput)Unpack)ModelOutputTransformersKwargsauto_docstring	torch_int)can_return_tuple   )ViTEmbeddingsViTForImageClassificationViTForMaskedImageModelingViTModelViTPreTrainedModel   )
DeiTConfigc            	          ^  \ rS rSrSrSS\S\SS4U 4S jjjrS\R                  S	\
S
\
S\R                  4S jr  SS\R                  S\R                  S-  S\S\R                  4S jjrSrU =r$ )DeiTEmbeddings'   a  
Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.

Differences from ViTEmbeddings:
- Adds a distillation token (for distillation pre-training).
- Position embeddings include +2 slots (CLS + distillation) instead of +1.
- interpolate_pos_encoding handles 2 special tokens instead of 1.
- forward concatenates distillation token and handles position encoding for both.
configuse_mask_tokenreturnNc                   > [         TU ]  XS9  [        R                  " [        R
                  " SSUR                  5      5      U l        U R                  R                  n[        R                  " [        R
                  " SUS-   UR                  5      5      U l
        [        R                  " [        R
                  " SSUR                  5      5      U l        g )N)r   r   r   )super__init__r   	Parametertorchzeroshidden_size	cls_tokenpatch_embeddingsnum_patchesposition_embeddingsdistillation_token)selfr   r   r&   	__class__s       _/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/deit/modular_deit.pyr   DeiTEmbeddings.__init__2   s    ?ekk!Q8J8J&KL++77#%<<A{QPVPbPb0c#d "$,,u{{1aASAS/T"U    
embeddingsheightwidthc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and 2 class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   N      ?r   r   bicubicF)sizemodealign_cornersdim)shaper'   r!   jit
is_tracing
patch_sizer   reshapepermuter   
functionalinterpolateviewcat)r)   r.   r/   r0   r&   num_positionsclass_and_dist_pos_embedpatch_pos_embedr9   
new_height	new_widthsqrt_num_positionss               r+   interpolate_pos_encoding'DeiTEmbeddings.interpolate_pos_encoding:   sU    !&&q)A-0066q9A= yy##%%+*F6?+++#'#;#;ArrE#B 221ab59r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy2D!LLr-   pixel_valuesbool_masked_posrJ   c                    UR                   u    pEnU R                  U5      nUR                  5       u  pnUbI  U R                  R	                  XS5      n
UR                  S5      R                  U
5      nUSU-
  -  X-  -   nU R                  R	                  USS5      nU R                  R	                  USS5      n[        R                  " XU4SS9nU(       a  XpR                  XuU5      -   nOdXPR                  S   :w  d  X`R                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S	3	5      eXpR                  -   nU R                  U5      nU$ )
Nr2   g      ?r   r8   r   zInput image size (*z) doesn't match model (z).)r:   r%   r5   
mask_tokenexpand	unsqueezetype_asr$   r(   r!   rC   rJ   
image_size
ValueErrorr'   dropout)r)   rL   rM   rJ   _r/   r0   r.   
batch_size
seq_lengthmask_tokensmask
cls_tokensdistillation_tokenss                 r+   forwardDeiTEmbeddings.forwardb   sm    +001e**<8
$.OO$5!
&//00LK",,R088ED#sTz2[5GGJ^^**:r2>
"55<<ZRPYY
LRST
##&C&CJX]&^^J++u8J/J (% 9+,Adooa.@-AE  $&>&>>J\\*-
r-   )r$   r(   r'   )F)NF)__name__
__module____qualname____firstlineno____doc__r   boolr   r!   TensorintrJ   
BoolTensorr^   __static_attributes____classcell__r*   s   @r+   r   r   '   s    Vz V4 VD V V&M5<< &M &MUX &M]b]i]i &MV 48).	 ll  ))D0  #'	 
 
   r-   r   c                      ^  \ rS rSrSS/rS\R                  \R                  -  \R                  -  SS4U 4S jjr	Sr
U =r$ )	DeiTPreTrainedModel   r   	DeiTLayermoduler   Nc                 l  > [         TU ]  U5        [        U[        5      (       a  [        R
                  " UR                  5        [        R
                  " UR                  5        [        R
                  " UR                  5        UR                  b!  [        R
                  " UR                  5        ggg)zInitialize the weightsN)
r   _init_weights
isinstancer   initzeros_r$   r'   r(   rP   )r)   rp   r*   s     r+   rr   !DeiTPreTrainedModel._init_weights   s{    f%fn--KK(()KK223KK112  ,F--. -	 .r-    )r`   ra   rb   rc   _no_split_modulesr   LinearConv2d	LayerNormrr   ri   rj   rk   s   @r+   rm   rm      s<    );7/BII		$9BLL$H /T / /r-   rm   c                       \ rS rSrSrg)	DeiTModel   rw   Nr`   ra   rb   rc   ri   rw   r-   r+   r}   r}          r-   r}   c                       \ rS rSr\\    SS\R                  S-  S\R                  S-  S\	S\R                  S-  S\
\   S\4S	 jj5       5       rS
rg)DeiTForMaskedImageModeling   NrL   rM   rJ   attention_maskkwargsr   c                 R   U R                   " U4UUUS.UD6nUR                  nUSS2SS24   nUR                  u  pn
[        U	S-  5      =pUR	                  SSS5      R                  XX5      nU R                  U5      nSnUGb  U R                  R                  U R                  R                  -  nUR                  SX5      nUR                  U R                  R                  S5      R                  U R                  R                  S5      R                  S5      R                  5       n[        R                  R                  XSS	9nUU-  R!                  5       UR!                  5       S
-   -  U R                  R"                  -  n[%        UUUR&                  UR(                  S9$ )a  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Examples:
```python
>>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
>>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
>>> list(reconstructed_pixel_values.shape)
[1, 3, 224, 224]
```)rM   rJ   r   Nr   r3   r   r   r2   none)	reductiongh㈵>)lossreconstructionhidden_states
attentions)deitlast_hidden_stater:   rg   r?   r>   decoderr   rT   r=   repeat_interleaverR   
contiguousr   r@   l1_losssumnum_channelsr   r   r   )r)   rL   rM   rJ   r   r   outputssequence_outputrX   sequence_lengthr   r/   r0   reconstructed_pixel_valuesmasked_im_lossr5   r[   reconstruction_losss                     r+   r^   "DeiTForMaskedImageModeling.forward   s   L /3ii/
+%=)	/

 /
 "33 *!QR%04C4I4I1
\_c122)11!Q:BB:]ck &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7lr"7"s1D8==?488:PTCTUX\XcXcXpXppN(5!//))	
 	
r-   rw   )NNFN)r`   ra   rb   rc   r   r   r!   rf   rh   re   r	   r   r   r^   ri   rw   r-   r+   r   r      s     -137)..2J
llT)J
 ))D0J
 #'	J

 t+J
 +,J
 
#J
  J
r-   r   c                       \ rS rSrSrg)DeiTForImageClassification   rw   Nr   rw   r-   r+   r   r      r   r-   r   zC
    Output type of [`DeiTForImageClassificationWithTeacher`].
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   S	rg)
+DeiTForImageClassificationWithTeacherOutput   aF  
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Prediction scores as the average of the cls_logits and distillation logits.
cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
    class token).
distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
    distillation token).
Nlogits
cls_logitsdistillation_logitsr   r   rw   )r`   ra   rb   rc   rd   r   r!   FloatTensor__annotations__r   r   r   tupler   ri   rw   r-   r+   r   r      s}    	 (,FE$++/J!!D(/48**T1859M5**+d2926Je''(4/6r-   r   a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                      ^  \ rS rSrS\SS4U 4S jjr\\   SS\R                  S-  S\
S\R                  S-  S	\\   S\4
S
 jj5       5       rSrU =r$ )%DeiTForImageClassificationWithTeacheri  r   r   Nc                   > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l
        U R                  5         g )NF)add_pooling_layerr   )r   r   
num_labelsr}   r   r   ry   r#   Identitycls_classifierdistillation_classifier	post_init)r)   r   r*   s     r+   r   .DeiTForImageClassificationWithTeacher.__init__  s      ++f>	 AG@Q@QTU@UBIIf((&*;*;<[][f[f[h 	 AG@Q@QTU@UBIIf((&*;*;<[][f[f[h 	$
 	r-   rL   rJ   r   r   c                    U R                   " U4UUS.UD6nUR                  nU R                  US S 2SS S 24   5      nU R                  US S 2SS S 24   5      nXx-   S-  n	[	        U	UUUR
                  UR                  S9$ )N)rJ   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   r   r   )
r)   rL   rJ   r   r   r   r   r   r   r   s
             r+   r^   -DeiTForImageClassificationWithTeacher.forward!  s     /3ii/
%=)/
 	/
 "33((Aq)AB
"::?1aQR7;ST 2a7:! 3!//))
 	
r-   )r   r   r   r   )NFN)r`   ra   rb   rc   r   r   r   r   r!   rf   re   r	   r   r   r^   ri   rj   rk   s   @r+   r   r     s    z d "  -1)..2	
llT)
 #'
 t+	

 +,
 
5
  
r-   r   )r   r   r   r}   rm   )$rd   dataclassesr   r!   r    r   rt   modeling_outputsr   r   processing_utilsr	   utilsr
   r   r   r   utils.genericr   vit.modeling_vitr   r   r   r   r   configuration_deitr   r   rm   r}   r   r   r   r   __all__rw   r-   r+   <module>r      s    > !   & ' O O -  +[] [|/, /	 	M
!: M
`	!: 	 
 7+ 7 7& 
0
,? 0

0
fr-   