
    3jdn                     V   S SK JrJr  S SKJr  S SKrS SKJr  SSKJr	  SSK
Jr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSKJrJ r   SSK!J"r"  SSK#J$r$   " S S\RJ                  5      r& " S S\RJ                  5      r'  S9S\RJ                  S\RP                  S\RP                  S\RP                  S\RP                  S-  S\)S-  S\)S\\   4S jjr* " S S \RJ                  5      r+ " S! S"\RJ                  5      r, " S# S$\5      r-\ " S% S&\5      5       r. " S' S(\RJ                  5      r/\ " S) S*\.5      5       r0\" S+S,9 " S- S.\.5      5       r1\" S/S,9 " S0 S1\.5      5       r2\" S2S,9\ " S3 S4\5      5       5       r3\" S5S,9 " S6 S7\.5      5       r4/ S8Qr5g):    )CallableIterable)	dataclassN)nn   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )
DeiTConfigc                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	DeiTPatchEmbeddings(   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
configc                   > [         TU ]  5         UR                  nUR                  n[	        U[
        5      (       a  UOX"4n[	        U[
        5      (       a  UOX34nUS   US   -  US   US   -  -  U l        X l        X0l        UR                  U l        [        R                  " UR                  UR                  X3S9U l        g )Nr   r   )kernel_sizestride)super__init__
image_size
patch_size
isinstancer   num_patchesnum_channelsr   Conv2dhidden_size
projection)selfr   r$   r%   	__class__s       `/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/deit/modeling_deit.pyr#   DeiTPatchEmbeddings.__init__/   s    &&
&&
#-j(#C#CZ*Ia
#-j(#C#CZ*Ia
&qMZ]:z!}PZ[\P]?]^$$"//))F$7$79K9KYcw    pixel_valuesreturnc                     UR                   S   nX R                  :w  a  [        SU R                   SU S35      eU R                  U5      R	                  S5      R                  SS5      $ )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .   )shaper(   
ValueErrorr+   flatten	transpose)r,   r1   r(   s      r.   forwardDeiTPatchEmbeddings.forward<   ss    #))!,,,,!../yaI  |,44Q7AA!QGGr0   )r$   r(   r'   r%   r+   )__name__
__module____qualname____firstlineno____doc__r   r#   torchTensorr:   __static_attributes____classcell__r-   s   @r.   r   r   (   s:    xz xHELL HU\\ H Hr0   r   c            	          ^  \ rS rSrSrSS\S\SS4U 4S jjjrS\R                  S	\
S
\
S\R                  4S jr  SS\R                  S\R                  S-  S\S\R                  4S jjrSrU =r$ )DeiTEmbeddingsF   a  
Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.

Differences from ViTEmbeddings:
- Adds a distillation token (for distillation pre-training).
- Position embeddings include +2 slots (CLS + distillation) instead of +1.
- interpolate_pos_encoding handles 2 special tokens instead of 1.
- forward concatenates distillation token and handles position encoding for both.
r   use_mask_tokenr2   Nc                    > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        U(       a6  [        R                  " [        R
                  " SSUR                  5      5      OS U l        [        U5      U l
        U R                  R                  n[        R                  " [        R
                  " SUS-   UR                  5      5      U l        [        R                  " UR                  5      U l        UR                   U l        U R                  R"                  U l        [        R                  " [        R
                  " SSUR                  5      5      U l        g )Nr   r5   )r"   r#   r   	ParameterrA   zerosr*   	cls_token
mask_tokenr   patch_embeddingsr'   position_embeddingsDropouthidden_dropout_probdropoutr%   r$   distillation_token)r,   r   rI   r'   r-   s       r.   r#   DeiTEmbeddings.__init__Q   s    ekk!Q8J8J&KLQ_",,u{{1a9K9K'LMei 3F ;++77#%<<A{QPVPbPb0c#d zz&"<"<= ++//::"$,,u{{1aASAS/T"Ur0   
embeddingsheightwidthc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and 2 class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r5   N      ?r   r   bicubicF)sizemodealign_cornersdim)r6   rP   rA   jit
is_tracingr%   r   reshapepermuter   
functionalinterpolateviewcat)r,   rV   rW   rX   r'   num_positionsclass_and_dist_pos_embedpatch_pos_embedra   
new_height	new_widthsqrt_num_positionss               r.   interpolate_pos_encoding'DeiTEmbeddings.interpolate_pos_encoding^   sU    !&&q)A-0066q9A= yy##%%+*F6?+++#'#;#;ArrE#B 221ab59r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy2D!LLr0   r1   bool_masked_posrp   c                    UR                   u    pEnU R                  U5      nUR                  5       u  pnUbI  U R                  R	                  XS5      n
UR                  S5      R                  U
5      nUSU-
  -  X-  -   nU R                  R	                  USS5      nU R                  R	                  USS5      n[        R                  " XU4SS9nU(       a  XpR                  XuU5      -   nOdXPR                  S   :w  d  X`R                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S	3	5      eXpR                  -   nU R                  U5      nU$ )
NrZ   g      ?r   r`   r   zInput image size (*z) doesn't match model (z).)r6   rO   r]   rN   expand	unsqueezetype_asrM   rT   rA   ri   rp   r$   r7   rP   rS   )r,   r1   rr   rp   _rW   rX   rV   
batch_size
seq_lengthmask_tokensmask
cls_tokensdistillation_tokenss                 r.   r:   DeiTEmbeddings.forward   sm    +001e**<8
$.OO$5!
&//00LK",,R088ED#sTz2[5GGJ^^**:r2>
"55<<ZRPYY
LRST
##&C&CJX]&^^J++u8J/J (% 9+,Adooa.@-AE  $&>&>>J\\*-
r0   )rM   rT   rS   r$   rN   rO   r%   rP   )F)NF)r<   r=   r>   r?   r@   r   boolr#   rA   rB   intrp   
BoolTensorr:   rC   rD   rE   s   @r.   rG   rG   F   s    Vz V4 VD V V&M5<< &M &MUX &M]b]i]i &MV 48).	 ll  ))D0  #'	 
 
   r0   rG   modulequerykeyvalueattention_maskscalingrS   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  US[        R                  S9R                  UR                  5      n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrZ         r5   r   )ra   dtype)ptrainingr   )r]   rA   matmulr9   r   rf   softmaxfloat32tor   rS   r   
contiguous)
r   r   r   r   r   r   rS   r   attn_weightsattn_outputs
             r.   eager_attention_forwardr      s     **R.D( <<}}Q':;gEL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r0   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\
\R                  \R                  4   4S	 jjrS
rU =r$ )DeiTAttention   r   c                   > [         TU ]  5         Xl        UR                  U l        [	        USUR
                  UR                  -  5      U l        UR                  U l        U R                  S-  U l	        SU l
        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  SS9U l        g )Nhead_dimr   F)biasT)r"   r#   r   num_attention_headsgetattrr*   r   attention_probs_dropout_probattention_dropoutr   	is_causalr   Linearqkv_biasq_projk_projv_projo_projr,   r   r-   s     r.   r#   DeiTAttention.__init__   s*   #)#=#= 
F4F4F&JdJd4de!'!D!D}}d*ii 2 2F4N4NQUQ^Q^4^eketetuii 2 2F4N4NQUQ^Q^4^eketetuii 2 2F4N4NQUQ^Q^4^eketetuii : :T]] JFL^L^eijr0   Nhidden_statesr   r   r2   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n[        R                  " U R                  R                  [        5      n	U	" U UUUU4U R                  (       d  SOU R                  U R                  S.UD6u  pU
R                  " / UQSP76 R!                  5       n
U R#                  U
5      n
X4$ )NrZ   r   r5           )rS   r   )r6   r   r   rh   r9   r   r   r   get_interfacer   _attn_implementationr   r   r   r   rd   r   r   )r,   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesattention_interfacer   r   s               r.   r:   DeiTAttention.forward   sE    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
! "));;;;FFHkk+.((r0   )
r   r   r   r   r   r   r   r   r   r   N)r<   r=   r>   r?   r   r#   rA   rB   r   r   tupler:   rC   rD   rE   s   @r.   r   r      sk    kz k" /3)||) t+) +,	)
 
u||U\\)	*) )r0   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )DeiTMLP   r   c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )r"   r#   r   r	   
hidden_actactivation_fnr   r   r*   intermediate_sizefc1fc2r   s     r.   r#   DeiTMLP.__init__   sb    #F$5$5699V//1I1IJ99V55v7I7IJr0   r   r2   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   )r,   r   s     r.   r:   DeiTMLP.forward   s4    /**=9/r0   )r   r   r   r   r<   r=   r>   r?   r   r#   rA   rB   r:   rC   rD   rE   s   @r.   r   r      s1    Kz KU\\ ell  r0   r   c            	          ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S-  S\\	   S\R                  4S	 jjr
S
rU =r$ )	DeiTLayeri  r   c                 d  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        R
                  " UR                  UR                  S9U l	        [        U5      U l        [        R                  " UR                  5      U l        g )Neps)r"   r#   r   	attentionr   	LayerNormr*   layer_norm_epslayernorm_beforelayernorm_afterr   mlprQ   rR   rS   r   s     r.   r#   DeiTLayer.__init__  sy    &v. "V-?-?VEZEZ [!||F,>,>FDYDYZ6?zz&"<"<=r0   Nr   r   r   r2   c                     UnU R                  U5      nU R                  " X40 UD6u  pU R                  U5      nX-   nUnU R                  U5      nU R	                  U5      nU R                  U5      nX-   nU$ r   )r   r   rS   r   r   )r,   r   r   r   residualrx   s         r.   r:   DeiTLayer.forward  s     !--m<>>-R6R]3%0 !,,];/]3%0r0   )r   rS   r   r   r   r   )r<   r=   r>   r?   r   r#   rA   rB   r   r   r:   rC   rD   rE   s   @r.   r   r     s[    >z > /3|| t+ +,	
 
 r0   r   c                      ^  \ rS rSr% \\S'   SrSrSrSr	SS/r
SrSrSrSrSr\\S	.rS
r\R*                  " 5       S\R.                  \R0                  -  \R2                  -  SS4U 4S jj5       rSrU =r$ )DeiTPreTrainedModeli&  r   deitr1   )imageTrG   r   )r   
attentionsrO   r   r2   Nc                   > [         TU ]  U5        [        U[        5      (       a  UR                  b4  [
        R                  " UR                  SU R                  R                  S9  [
        R                  " UR                  SU R                  R                  S9  UR                  b   [
        R                  " UR                  5        [        U[        5      (       a  [
        R                  " UR                  5        [
        R                  " UR                  5        [
        R                  " UR                  5        UR                  b!  [
        R                  " UR                  5        ggg)zInitialize the weightsNr   )meanstd)r"   _init_weightsr&   rG   rP   inittrunc_normal_r   initializer_rangerM   rN   zeros_rT   )r,   r   r-   s     r.   r   !DeiTPreTrainedModel._init_weights9  s     	f%fn--))5""6#=#=CT[[MjMjkv//ct{{?\?\]  ,F--.fn--KK(()KK223KK112  ,F--. -	 .r0    )r<   r=   r>   r?   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_can_compile_fullgraphr   r   _can_record_outputs_input_embed_layerrA   no_gradr   r   r)   r   r   rC   rD   rE   s   @r.   r   r   &  s    $O!&*#);7N"&!"# ,
]]_/BII		$9BLL$H /T / /r0   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )
DeiTPooleriK  r   c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                     U l	        g r   )
r"   r#   r   r   r*   pooler_output_sizedenser	   
pooler_act
activationr   s     r.   r#   DeiTPooler.__init__L  s>    YYv1163L3LM
 !2!23r0   r   r2   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r,   r   first_token_tensorpooled_outputs       r.   r:   DeiTPooler.forwardQ  s6     +1a40

#566r0   )r   r   r   rE   s   @r.   r   r   K  s/    4z 4
U\\ ell  r0   r   c                      ^  \ rS rSrSS\S\S\4U 4S jjjr\\" SS9\	    SS	\
R                  S-  S
\
R                  S-  S\S-  S\
R                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )	DeiTModeliZ  Fr   add_pooling_layerrI   c                   > [         TU ]  U5        Xl        [        XS9U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l
        [
        R                  " UR                  UR                  S9U l        U(       a  [        U5      OSU l        U R#                  5         gs  snf )z
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
use_mask_token (`bool`, *optional*, defaults to `False`):
    Whether to use a mask token for masked image modeling.
)rI   r   N)r"   r#   r   rG   rV   r   
ModuleListrangenum_hidden_layersr   layersr   r*   r   	layernormr   pooler	post_init)r,   r   r   rI   rx   r-   s        r.   r#   DeiTModel.__init__\  s     	 (OmmfF^F^@_$`@_1Yv%6@_$`af&8&8f>S>ST,=j(4	 %as   C)tie_last_hidden_statesNr1   rr   rp   r   r   r2   c                    U R                   R                  R                  R                  R                  nUR                  U:w  a  UR                  U5      nU R                  XUS9n[        U R                  UUS9nUnU R                   H  n	U	" X40 UD6nM     U R                  U5      n
U R                  b  U R                  U
5      OSn[        XS9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
)rr   rp   )r   inputs_embedsr   N)last_hidden_statepooler_output)rV   rO   r+   weightr   r   r
   r   r  r  r  r   )r,   r1   rr   rp   r   r   expected_dtypeembedding_outputr   layersequence_outputr   s               r.   r:   DeiTModel.forwardl  s      99DDKKQQ/'??>:L??Tl + 
 3;;*)

 )[[E!-J6JM ! ..78<8OO4UY)Oiir0   )r   rV   r  r  r  )TFNNNN)r<   r=   r>   r?   r   r   r#   r   r   r   rA   rB   r   r   r   r   r:   rC   rD   rE   s   @r.   r   r   Z  s    z d [_     E2 -13704.2 jllT) j ))D0 j #'+	 j
 t+ j +, j 
$ j  3   jr0   r   ad  
    DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                      ^  \ rS rSrS\4U 4S jjr\\    SS\R                  S-  S\R                  S-  S\S\R                  S-  S	\\   S
\4S jj5       5       rSrU =r$ )DeiTForMaskedImageModelingi  r   c                 H  > [         TU ]  U5        [        USSS9U l        [        R
                  " [        R                  " UR                  UR                  S-  UR                  -  SS9[        R                  " UR                  5      5      U l        U R                  5         g )NFT)r   rI   r5   r   )in_channelsout_channelsr    )r"   r#   r   r   r   
Sequentialr)   r*   encoder_strider(   PixelShuffledecoderr  r   s     r.   r#   #DeiTForMaskedImageModeling.__init__  s     fdS	}}II"..#22A58K8KK
 OOF112
 	r0   Nr1   rr   rp   r   r   r2   c                 R   U R                   " U4UUUS.UD6nUR                  nUSS2SS24   nUR                  u  pn
[        U	S-  5      =pUR	                  SSS5      R                  XX5      nU R                  U5      nSnUGb  U R                  R                  U R                  R                  -  nUR                  SX5      nUR                  U R                  R                  S5      R                  U R                  R                  S5      R                  S5      R                  5       n[        R                  R                  XSS	9nUU-  R!                  5       UR!                  5       S
-   -  U R                  R"                  -  n[%        UUUR&                  UR(                  S9$ )a  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Examples:
```python
>>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
>>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
>>> list(reconstructed_pixel_values.shape)
[1, 3, 224, 224]
```)rr   rp   r   Nr5   r[   r   r   rZ   none)	reductiongh㈵>)lossreconstructionr   r   )r   r  r6   r   re   rd   r  r   r$   r%   repeat_interleaverv   r   r   rf   l1_losssumr(   r   r   r   )r,   r1   rr   rp   r   r   outputsr  ry   sequence_lengthr(   rW   rX   reconstructed_pixel_valuesmasked_im_lossr]   r|   reconstruction_losss                     r.   r:   "DeiTForMaskedImageModeling.forward  s   L /3ii/
+%=)	/

 /
 "33 *!QR%04C4I4I1
\_c122)11!Q:BB:]ck &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7lr"7"s1D8==?488:PTCTUX\XcXcXpXppN(5!//))	
 	
r0   )r  r   )NNFN)r<   r=   r>   r?   r   r#   r   r   rA   rB   r   r   r   r   r   r:   rC   rD   rE   s   @r.   r  r    s    z "  -137)..2J
llT)J
 ))D0J
 #'	J

 t+J
 +,J
 
#J
  J
r0   r  a  
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune DeiT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                      ^  \ rS rSrS\4U 4S jjr\\    SS\R                  S-  S\R                  S-  S\
S-  S\R                  S-  S	\\   S
\4S jj5       5       rSrU =r$ )DeiTForImageClassificationi  r   c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g NF)r   r   )r"   r#   
num_labelsr   r   r   r   r*   Identity
classifierr  r   s     r.   r#   #DeiTForImageClassification.__init__  ss      ++f>	 OUN_N_bcNc"))F$6$68I8IJikititiv 	r0   Nr1   labelsrp   r   r   r2   c                    U R                   " U4UUS.UD6nUR                  nUSS2SSS24   nU R                  U5      n	Sn
Ub  U R                  " X)U R                  40 UD6n
[        U
U	UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
rp   r   Nr   )r#  logitsr   r   )r   r  r4  loss_functionr   r   r   r   )r,   r1   r6  rp   r   r   r(  r  r   r9  r#  s              r.   r:   "DeiTForImageClassification.forward  s    " /3ii/
%=)/
 	/
 "33'1a0/%%fdkkLVLD$!//))	
 	
r0   )r4  r   r2  r  )r<   r=   r>   r?   r   r#   r   r   rA   rB   r   r   r   r   r:   rC   rD   rE   s   @r.   r/  r/    s    
z 
  -1&*04.2#
llT)#
 t##
 #'+	#

 t+#
 +,#
 
#
  #
r0   r/  zC
    Output type of [`DeiTForImageClassificationWithTeacher`].
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   S	rg)
+DeiTForImageClassificationWithTeacherOutputiB  aF  
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Prediction scores as the average of the cls_logits and distillation logits.
cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
    class token).
distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
    distillation token).
Nr9  
cls_logitsdistillation_logitsr   r   r   )r<   r=   r>   r?   r@   r9  rA   FloatTensorr   r>  r?  r   r   r   rC   r   r0   r.   r=  r=  B  s}    	 (,FE$++/J!!D(/48**T1859M5**+d2926Je''(4/6r0   r=  a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                      ^  \ rS rSrS\SS4U 4S jjr\\   SS\R                  S-  S\
S\R                  S-  S	\\   S\4
S
 jj5       5       rSrU =r$ )%DeiTForImageClassificationWithTeacheri[  r   r2   Nc                   > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l
        U R                  5         g r1  )r"   r#   r2  r   r   r   r   r*   r3  cls_classifierdistillation_classifierr  r   s     r.   r#   .DeiTForImageClassificationWithTeacher.__init__g  s      ++f>	 AG@Q@QTU@UBIIf((&*;*;<[][f[f[h 	 AG@Q@QTU@UBIIf((&*;*;<[][f[f[h 	$
 	r0   r1   rp   r   r   c                    U R                   " U4UUS.UD6nUR                  nU R                  US S 2SS S 24   5      nU R                  US S 2SS S 24   5      nXx-   S-  n	[	        U	UUUR
                  UR                  S9$ )Nr8  r   r   r5   )r9  r>  r?  r   r   )r   r  rD  rE  r=  r   r   )
r,   r1   rp   r   r   r(  r  r>  r?  r9  s
             r.   r:   -DeiTForImageClassificationWithTeacher.forwardx  s     /3ii/
%=)/
 	/
 "33((Aq)AB
"::?1aQR7;ST 2a7:! 3!//))
 	
r0   )rD  r   rE  r2  )NFN)r<   r=   r>   r?   r   r#   r   r   rA   rB   r   r   r   r=  r:   rC   rD   rE   s   @r.   rB  rB  [  s    z d "  -1)..2	
llT)
 #'
 t+	

 +,
 
5
  
r0   rB  )r/  rB  r  r   r   )Nr   )6collections.abcr   r   dataclassesr   rA   r    r   r   activationsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_deitr   Moduler   rG   rB   floatr   r   r   r   r   r   r   r  r/  r=  rB  __all__r   r0   r.   <module>rY     s  * / !   & ! 6 9 l l F & O O I 5 *H")) H<`RYY `R !%II%<<% 
% <<	%
 LL4'% T\% % '(%8.)BII .)bbii  * @ !// !/ !/H  4j# 4j 4jn 	^
!4 ^
^
B 2
!4 2
2
j 
 7+ 7 7& 
0
,? 0

0
fr0   