
    3jȖ                        S r SSKJr  SSKrSSKJrJr  SSKJr  SSK	J
r
Jr  SSKJr  SS	KJrJrJrJrJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJr  SSKJrJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,J-r-  SSK.J/r/  \" SS9\ " S S\5      5       5       r0 " S S\,5      r1 " S S\)5      r2 " S S\Rf                  5      r4 " S  S!\(5      r5 " S" S#\+5      r6 " S$ S%\&5      r7 " S& S'\*5      r8\ " S( S)\-5      5       r9\ " S* S+\95      5       r: " S, S-\Rf                  5      r;\" S.S9 " S/ S0\95      5       r<\" S1S9 " S2 S3\95      5       r= " S4 S5\$5      r> " S6 S7\Rf                  5      r? " S8 S9\Rf                  5      r@ " S: S;\Rf                  5      rA " S< S=\Rf                  5      rB " S> S?\Rf                  5      rC " S@ SA\Rf                  5      rD\ " SB SC\95      5       rE\" SDS9 " SE SF\
\95      5       rF/ SGQrGg)HzPyTorch BEiT model.    )	dataclassN)Tensornn   )initialization)BackboneMixinfilter_output_hidden_states)create_bidirectional_mask)BackboneOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedLMOutputSemanticSegmenterOutput)PreTrainedModel)Unpack)#compile_compatible_method_lru_cache)TransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )ResNetConvLayer)SwinDropPath)ViTAttentionViTEmbeddingsViTLayerViTMLPViTPatchEmbeddingsViTPreTrainedModel   )
BeitConfigz-
    Class for outputs of [`BeitModel`].
    )custom_introc                       \ rS rSrSrSrg)BeitModelOutputWithPooling+   a2  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
    *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
    will be returned.
 N)__name__
__module____qualname____firstlineno____doc____static_attributes__r(       _/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/beit/modular_beit.pyr&   r&   +   s    r/   r&   c                       \ rS rSrSrg)BeitPatchEmbeddings:   r(   Nr)   r*   r+   r,   r.   r(   r/   r0   r2   r2   :       r/   r2   c                       \ rS rSrS\SS4S jr S
S\R                  S\R                  S-  S\R                  4S jjr	S	r
g)BeitEmbeddings>   configreturnNc                    [         R                  R                  U 5        [         R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a6  [         R                  " [        R
                  " SSUR                  5      5      OS U l	        [        U5      U l        UR                  U l        U R                  R                  nUR                  (       a9  [         R                  " [        R
                  " SUS-   UR                  5      5      OS U l        [         R                   " UR"                  5      U l        g )Nr"   )r   Module__init__	Parametertorchzeroshidden_size	cls_tokenuse_mask_token
mask_tokenr2   patch_embeddings
patch_sizenum_patches use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probdropout)selfr9   rG   s      r0   r=   BeitEmbeddings.__init__?   s    
		4 ekk!Q8J8J&KLQWQfQf",,u{{1a9K9K'LMlp 3F ; ++++77 66 LLQa9K9KLM 	 
 zz&"<"<=r/   pixel_valuesbool_masked_posc                    UR                   u    p4nU R                  U5      nUR                  5       u  pxnUbI  U R                  R	                  XxS5      n	UR                  S5      R                  U	5      n
USU
-
  -  X-  -   nU R                  R	                  USS5      n[        R                  " X4SS9nU R                  b  X`R                  XdU5      -   nU R                  U5      nU$ Nr"   dim)shaperE   sizerD   expand	unsqueezetype_asrB   r?   catrI   interpolate_pos_encodingrL   )rM   rO   rP   _heightwidth
embeddings
batch_sizeseq_lenmask_tokensmask
cls_tokenss               r0   forwardBeitEmbeddings.forwardN   s    
 +001e**<8
!+!2
Q&//00bIK",,R088ED#q4x0;3EEJ^^**:r2>
YY
7Q?
##/#&C&CJX]&^^J\\*-
r/   )rB   rL   rD   rE   rF   rI   N)r)   r*   r+   r,   r#   r=   r?   r   
BoolTensorrf   r.   r(   r/   r0   r7   r7   >   sS    >z >d >$ 48ll ))D0 
	 r/   r7   c                      ^  \ rS rSrS\SS4U 4S jjr\\" SS9S\\	\	4   S\
R                  4S	 j5       5       rSS
\S\
R                  4S jjrSrU =r$ )BeitRelativePositionBiash   r9   r:   Nc                   > [         TU ]  5         UR                  n[        U[        [
        45      (       d  X"4nUS   UR                  -  US   UR                  -  4U l        SU R                  S   -  S-
  SU R                  S   -  S-
  -  S-   U l        [        R                  " [        R                  " U R                  UR                  5      5      U l        g Nr   r"   r   r   )superr=   
image_size
isinstancetuplelistrF   window_sizenum_relative_distancer   r>   r?   r@   num_attention_headsrelative_position_bias_table)rM   r9   rp   	__class__s      r0   r=   !BeitRelativePositionBias.__init__i   s    &&
*udm44$1J&qMV->->>
1QWQbQb@bc&'$*:*:1*=&=&Aa$JZJZ[\J]F]`aFa%bef%f",.LLKK22F4N4NO-
)r/   
   )maxsizert   c                    SU S   -  S-
  SU S   -  S-
  -  S-   nU S   U S   -  n[         R                  " [         R                  " [         R                  " [         R                  " U S   5      [         R                  " U S   5      SS95      SS9nUSS2SS2S4   USS2SSS24   -
  R                  SSS5      R                  5       nUSS2SS2S4==   U S   S-
  -  ss'   USS2SS2S4==   U S   S-
  -  ss'   USS2SS2S4==   SU S   -  S-
  -  ss'   [         R                  " US-   4S-  UR                  S	9nUR                  S
5      USS2SS24'   US-
  USSS24'   US-
  USS2S4'   US-
  US'   U$ )z
This method creates the relative position index, modified to support arbitrary window sizes,
as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
r   r   r"   r   ij)indexing)	start_dimN)rW   dtyperS   )r   r   )
r?   flattenstackmeshgridarangepermute
contiguousr@   r   sum)rt   ru   window_areacoords_flattenrelative_coordsrelative_position_indexs         r0    generate_relative_position_index9BeitRelativePositionBias.generate_relative_position_indexu   s    "#[^!3a!7AA<NQR<R SVW W!!n{1~5 KKu||KN'CU\\R]^_R`Ealpqr
 *!Q*5q$PQz8RR[[\]_`bcdooq1a KNQ$66 1a KNQ$66 1a AA$6$:: "'++K!O3E3IQ`QfQf"g*9*=*=b*AAB')>)B12&)>)BA&(=(A%&&r/   r\   c                    SU R                   S   -  S-
  nSU R                   S   -  S-
  nSUS   -  S-
  nSUS   -  S-
  nU R                  nU R                  n	Xg-  S-   n
USU	S-
   nUR                  SXTS5      R	                  SSSS5      n[
        R                  R                  U[        U5      [        U5      4SS9nUR	                  SSSS5      R                  U
S-
  S5      n[        R                  " XU	S-
  S /5      nU R                  U5      nXR                  S5         nUR                  US   US   -  S-   US   US   -  S-   S5      nUR	                  SSS5      R                  5       nU(       a?  [
        R                  R                  UR                  S5      X34SS	S
9R                  S5      nUR                  S5      $ )ze
Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
r   r   r"   r   NrS   bilinear)rW   modeFrW   r   align_corners)rt   rw   ru   reshaper   r   
functionalinterpolater   r?   r[   r   viewr   rY   squeeze)rM   rt   r\   dim_size
old_height	old_width
new_height	new_width old_relative_position_bias_tableold_num_relative_distancenew_num_relative_distanceold_sub_tablenew_sub_table new_relative_position_bias_tabler   relative_position_biass                   r0   rf    BeitRelativePositionBias.forward   s!    ))!,,q0
((++a/	Q'!+
A&*	+/+L+L($($>$>!$.$:Q$>!89X;TWX;XY%--aKSSTUWXZ[]^_11:!6	)8L MT^ 2 
 &--aAq9AAB[^_B_acd+099=VYZ=Z=\]^,
( #'"G"G"T!AB^B^_aBb!c "8!<!<N[^+a/Q+a.1PST1TVX"
 "8!?!?1a!H!S!S!U#%']]%>%>&003)#	 &? &
 gaj # &//22r/   )ru   rw   rt   )FN)r)   r*   r+   r,   r#   r=   staticmethodr   rr   intr?   r   r   boolrf   r.   __classcell__rx   s   @r0   rk   rk   h   sp    	
z 	
d 	
 (4'eCHo '%,, ' 5 '4-3T -3]b]i]i -3 -3r/   rk   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )BeitAttention   r9   c                 
  > [         TU ]  U5        [        R                  " UR                  UR
                  U R                  -  5      U l        [        R                  " UR                  UR
                  U R                  -  SS9U l        [        R                  " UR                  UR
                  U R                  -  5      U l	        [        R                  " UR
                  U R                  -  UR                  5      U l
        g )NF)bias)ro   r=   r   LinearrA   rv   head_dimq_projk_projv_projo_projrM   r9   rx   s     r0   r=   BeitAttention.__init__   s     ii 2 2F4N4NQUQ^Q^4^_ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^_ii : :T]] JFL^L^_r/   )r   r   r   r   )r)   r*   r+   r,   r#   r=   r.   r   r   s   @r0   r   r      s    `z ` `r/   r   c                       \ rS rSrSrg)BeitMLP   r(   Nr4   r(   r/   r0   r   r      r5   r/   r   c                       \ rS rSrSrg)BeitDropPath   r(   Nr4   r(   r/   r0   r   r      r5   r/   r   c                      ^  \ rS rSrSrSS\S\4U 4S jjjr   SS\R                  S\R                  S-  S	\
S
\\\4   S-  S\\   S\R                  4S jjrSrU =r$ )	BeitLayer   z?This corresponds to the Block class in the timm implementation.r9   drop_path_ratec                   > [         TU ]  5         UR                  U l        US:  a  [        U5      O[        R
                  " 5       U l        UR                  nUS:  a6  [        R                  " U[        R                  " UR                  5      -  SS9OSU l        US:  a6  [        R                  " U[        R                  " UR                  5      -  SS9OSU l        UR                  (       a  [        U5      U l        g S U l        g )N        r   T)requires_gradg      ?)ro   r=   rF   r   r   Identity	drop_pathlayer_scale_init_valuer>   r?   onesrA   lambda_1lambda_2use_relative_position_biasrk   r   )rM   r9   r   init_valuesrx   s       r0   r=   BeitLayer.__init__   s     ++9G#9Mn5SUS^S^S`33^ilm^mBLLuzz&2D2D'EEUYZsv 	 _jlm^mBLLuzz&2D2D'EEUYZsv 	 KQJkJk&>v&F#qu#r/   Nhidden_statesattention_maskr\   
resolutionkwargsr:   c                    U R                   bF  Uu  pgX`R                  -  XpR                  -  4nU R                  XUR                  S   S9n	Ub  X-   OU	nUn
U R                  U5      nU R                  " U4SU0UD6u  pU R                  U5      nU R                  U-  nU R                  U5      U
-   nUn
U R                  U5      nU R                  U5      nU R                  U5      nU R                  U-  nU R                  U5      U
-   nU$ )Nr"   )r   r   )r   rF   rV   layernorm_before	attentionrL   r   r   layernorm_aftermlpr   )rM   r   r   r\   r   r   r^   r_   rt   r   residualr]   s               r0   rf   BeitLayer.forward   s6    &&2&MF!__4e6NOK%)%@%@@S@STU@V &A &" <J;U&7[q 
 !--m<>>
)
 

 ]35}5@ !,,];/]35}5@r/   )r   r   r   rF   r   )r   NFN)r)   r*   r+   r,   r-   r#   floatr=   r?   r   r   rr   r   r   r   rf   r.   r   r   s   @r0   r   r      s    Ivz v5 v v" /3).-1&||& t+& #'	&
 #s(Od*& +,& 
& &r/   r   c                   .    \ rS rSrS/rS/rSrSrS rSr	g)BeitPreTrainedModeli  r   z.*relative_position_index.*Fc                    [         R                  " X5        [        U[        5      (       a|  [        R
                  " UR                  5        UR                  b   [        R
                  " UR                  5        UR                  b!  [        R
                  " UR                  5        gg[        U[        5      (       a!  [        R
                  " UR                  5        g[        U[        5      (       a  [        UR                  [        R                  5      (       ak  [        R                  " UR                  U R                   R"                  5        [        R                  " UR$                  U R                   R"                  5        ggg)zInitialize the weightsN)r   _init_weightsrq   r7   initzeros_rB   rD   rI   rk   rw   r   r   r   r>   	constant_r9   r   r   )rM   modules     r0   r   !BeitPreTrainedModel._init_weights  s    %%d3fn--KK(()  ,F--.))5F667 6 899KK;;<	**&//2<<88v0R0RSv0R0RS 9 +r/   r(   N)
r)   r*   r+   r,   _no_split_modules"_keys_to_ignore_on_load_unexpected_supports_flash_attn_supports_flex_attnr   r.   r(   r/   r0   r   r     s%    $*H)I& Tr/   r   c                      ^  \ rS rSrSS\S\SS4U 4S jjjr\\" SS9\	   SS	\
R                  S
\
R                  S-  S\S\
R                  S-  S\\   S\4S jj5       5       5       rSrU =r$ )	BeitModeli$  r9   add_pooling_layerr:   Nc           
        > [         TU ]  U5        Xl        [        U5      U l        UR
                  (       a  [        U5      OSU l        [        UR                  5       Vs/ s H+  o1R                  U-  [        UR                  S-
  S5      -  PM-     nn[        R                  " U Vs/ s H  n[        XS9PM     sn5      U l        UR                   (       a  [        R"                  " 5       O([        R$                  " UR&                  UR(                  S9U l        U(       a  [-        U5      OSU l        U R1                  5         gs  snf s  snf )z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
Nr"   )r   eps)ro   r=   r9   r7   r`   !use_shared_relative_position_biasrk   shared_position_biasrangenum_hidden_layersr   maxr   
ModuleListr   layersuse_mean_poolingr   	LayerNormrA   layer_norm_eps	layernorm
BeitPoolerpooler	post_init)rM   r9   r   idrop_path_ratesrrx   s         r0   r=   BeitModel.__init__&  s   
 	 (0060X0X$V,^b 	! W\\b\t\tVu
VuQR!!A%F,D,Dq,H!(LLVu 	 
 mmRa$bRaQYv%HRa$bc $44BKKM",,vGYGY_e_t_t:u 	 ->j(4 	
 %cs   !2E)EF)tie_last_hidden_statesrO   rP   r\   r   r   c                    U R                  XS9nUR                  SS n[        U R                  UUS9nU R                  bZ  Uu  pXR                  R
                  -  XR                  R
                  -  4n
U R	                  XUR                  S   S9nUb  X-   OUnUnU R                   H  nU" U4UUUS.UD6nM     U R                  U5      nU R                  b  U R                  U5      OSn[        XS9$ )	z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
)rP   r   N)r9   inputs_embedsr   r"   )r\   r   )r   r\   r   )last_hidden_statepooler_output)
r`   rV   r
   r9   r   rF   r   r   r   r&   )rM   rO   rP   r\   r   r   embedding_outputr   r^   r_   rt   shared_relative_position_biasr   layersequence_outputpooled_outputs                   r0   rf   BeitModel.forward?  s'     ??<?Y!''+
2;;*)
 $$0&MF![[%;%;;UkkF\F\=\]K,0,E,EYiYoYopqYr -F -)
 "- .>2  )[[E!-)A%	
 M ! ..78<8OO4UY)Oiir/   )r9   r`   r   r   r   r   )Tr   )r)   r*   r+   r,   r#   r   r=   r   r   r   r?   r   ri   r   r   r&   rf   r.   r   r   s   @r0   r   r   $  s    z d d  2  E2 48)..2-jll-j ))D0-j #'	-j
 t+-j +,-j 
$-j  3  -jr/   r   c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	r   ir  r9   r:   Nc                    > [         TU ]  5         UR                  (       a/  [        R                  " UR
                  UR                  S9U l        g S U l        g )Nr   )ro   r=   r   r   r   rA   r   r   r   s     r0   r=   BeitPooler.__init__s  sA    KQKbKbBLL++1F1FG 	hl 	r/   r   c                     U R                   b,  U R                  US S 2SS 2S S 24   R                  S5      5      $ US S 2S4   $ )Nr"   r   )r   meanrM   r   s     r0   rf   BeitPooler.forwardy  sD    BF..B\t~~mAqr1H5::1=>ubopqstptbuur/   )r   )r)   r*   r+   r,   r#   r=   r?   r   rf   r.   r   r   s   @r0   r   r   r  s:    
z 
d 
vU\\ vell v vr/   r   a  
    Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.
    c                      ^  \ rS rSrS\SS4U 4S jjrS r\\     SS\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\S\	R                  S-  S\\   S\\-  4S jj5       5       rSrU =r$ )BeitForMaskedImageModelingi~  r9   r:   Nc                 @  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  UR                  5      U l        U R                  5         g )NFr   r   )ro   r=   
num_labelsr   beitr   r   rA   r   r   r   
vocab_sizelm_headr   r   s     r0   r=   #BeitForMaskedImageModeling.__init__  su      ++f>	 f&8&8f>S>STyy!3!3V5F5FG 	r/   c                     g rh   r(   )rM   s    r0   get_output_embeddings0BeitForMaskedImageModeling.get_output_embeddings  s    r/   rO   rP   labelsr\   r   r   c                 (   U R                   " U4UUUS.UD6nUR                  nU R                  U5      nU R                  USS2SS24   5      n	Sn
Ub   [        R
                  " 5       nU" X   U5      n
[        U
U	UR                  UR                  S9$ )a  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
>>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, logits = outputs.loss, outputs.logits
>>> list(logits.shape)
[1, 196, 8192]
```)rP   r\   r   Nr"   losslogitsr   
attentions)	r  r  r   r  r   CrossEntropyLossr   r   r"  )rM   rO   rP   r  r\   r   r   outputsr  prediction_scoresmasked_lm_lossloss_fcts               r0   rf   "BeitForMaskedImageModeling.forward  s    X ))
+%=)	

 
 "33..9 LLAB)?@**,H%&7&H&QN$!//))	
 	
r/   )r  r   r  r  )NNNFN)r)   r*   r+   r,   r#   r=   r  r   r   r?   r   ri   r   r   r   rr   r   rf   r.   r   r   s   @r0   r  r  ~  s    z d   -137&*)..2@
llT)@
 ))D0@
 t#	@

 #'@
 t+@
 +,@
 
	@
  @
r/   r  z
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    c                      ^  \ rS rSrS\SS4U 4S jjr\\   SS\R                  S-  S\R                  S-  S\
S	\\   S\\-  4
S
 jj5       5       rSrU =r$ )BeitForImageClassificationi  r9   r:   Nc                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )NTr  r   )ro   r=   r  r   r  r   r   rA   r   
classifierr   r   s     r0   r=   #BeitForImageClassification.__init__  ss      ++f=	 OUN_N_bcNc"))F$6$68I8IJikititiv 	r/   rO   r  r\   r   c                     U R                   " U4SU0UD6nUR                  nU R                  U5      nSnUb  U R                  X'U R                  5      n[        UUUR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r\   Nr  )r  r  r,  loss_functionr9   r   r   r"  )	rM   rO   r  r\   r   r$  r  r!  r   s	            r0   rf   "BeitForImageClassification.forward  s     ))
%=
 
  --/%%fdkkBD$!//))	
 	
r/   )r  r,  r  NNF)r)   r*   r+   r,   r#   r=   r   r   r?   r   r   r   r   rr   r   rf   r.   r   r   s   @r0   r*  r*    s    
z 
d 
  -1&*).	 
llT) 
 t# 
 #'	 

 +, 
 
&	& 
   
r/   r*  c                      ^  \ rS rSr       SS\S\S\\\\4   -  S\S\\\\4   -  \-  S\S\\\\4   -  S	\S
\4U 4S jjjrSr	U =r
$ )BeitConvLayeri  in_channelsout_channelskernel_sizestridepaddingr   dilationgroups
activationc
                 b   > [         T
U ]  5         [        R                  " UUUUUUUUS9U l        g )N)r4  r5  r6  r7  r8  r9  r:  r   )ro   r=   r   Conv2dconvolution)rM   r4  r5  r6  r7  r8  r   r9  r:  r;  rx   s             r0   r=   BeitConvLayer.__init__  s9     	99#%#	
r/   )r>  )r   r"   r   Fr"   r"   relu)r)   r*   r+   r,   r   rr   strr   r=   r.   r   r   s   @r0   r3  r3    s    
 .//0*+ 

 
 5c?*	

 
 uS#X&,
 
 c3h'
 
 
 
r/   r3  c                      ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S	\\\4   S\R                  4S
 jr	Sr
U =r$ )BeitPyramidPoolingBlocki.  
pool_scaler4  channelsr:   Nc                 v   > [         TU ]  5         [        R                  " U5      U l        [        X#SS9U l        g )Nr"   r6  )ro   r=   r   AdaptiveAvgPool2dpoolingr3  conv)rM   rD  r4  rE  rx   s       r0   r=    BeitPyramidPoolingBlock.__init__/  s.    ++J7!+QG	r/   inputrW   c                     U R                  U5      nU R                  U5      n[        R                  R	                  X2SSS9nU$ )Nr   Fr   )rI  rJ  r   r   r   )rM   rL  rW   hidden_states       r0   rf   BeitPyramidPoolingBlock.forward4  s@    ||E*yy.}}00zin0or/   )rJ  rI  )r)   r*   r+   r,   r   r=   r?   r   rr   rf   r.   r   r   s   @r0   rC  rC  .  sX    H3 HS HC HD H
U\\ sCx U\\  r/   rC  c                      ^  \ rS rSrSrS\\S4   S\S\SS4U 4S	 jjrS
\R                  S\
\R                     4S jrSrU =r$ )BeitPyramidPoolingModulei;  aK  
Pyramid Pooling Module (PPM) used in PSPNet.

Args:
    pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
        Module.
    in_channels (int): Input channels.
    channels (int): Channels after modules, before conv_seg.

Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
pool_scales.r4  rE  r:   Nc                    > [         TU ]  5         Xl        X l        X0l        [
        R                  " U Vs/ s H  n[        XBUS9PM     sn5      U l        g s  snf )N)rD  r4  rE  )	ro   r=   rR  r4  rE  r   r   rC  blocks)rM   rR  r4  rE  rD  rx   s        r0   r=   !BeitPyramidPoolingModule.__init__H  sY    && mm #."-J (:aij"-
s   Ar   c                 r    UR                  5       SS  nU R                   Vs/ s H  o3" XS9PM
     sn$ s  snf )Nr   )rW   )rW   rT  )rM   r   original_sizeblocks       r0   rf    BeitPyramidPoolingModule.forwardT  s8    %**,QR0FJkkRkUm8kRRRs   4)rT  rE  r4  rR  )r)   r*   r+   r,   r-   rr   r   r=   r?   r   rs   rf   r.   r   r   s   @r0   rQ  rQ  ;  s\    


E#s(O 

# 

QT 

Y] 

SU\\ Sd5<<6H S Sr/   rQ  c                      ^  \ rS rSrSrS\SS4U 4S jjrS\\R                     S\R                  4S jr
S	\\R                     S\R                  4S
 jrSrU =r$ )BeitUperHeadiY  z
Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
[UPerNet](https://huggingface.co/papers/1807.10221).

Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
r9   r:   Nc           
        > [         TU ]  5         UR                  U l        UR                  /S-  U l        UR                  U l        [        R                  " U R
                  UR                  SS9U l	        [        U R                  U R                  S   U R
                  5      U l        [        U R                  S   [        U R                  5      U R
                  -  -   U R
                  SSS9U l        [        R                  " 5       U l        [        R                  " 5       U l        U R                  S S  Hi  nU R                   R%                  [        X R
                  SS95        U R"                  R%                  [        U R
                  U R
                  SSS95        Mk     [        [        U R                  5      U R
                  -  U R
                  SSS9U l        g )N   r"   rG  rS   r   r6  r8  )ro   r=   rR  rA   r4  rE  r   r=  r  r,  rQ  psp_modulesr3  lenpsp_bottleneckr   lateral_convs	fpn_convsappendfpn_bottleneck)rM   r9   r4  rx   s      r0   r=   BeitUperHead.__init__a  s|   !--"../!3**))DMM63D3DRST 4R MM

 ,R 3t'7'7#84==#HHMM	
  ]]_++CR0K%%mK\]&^_NN!!-t}}Z[ef"gh 1 ,  !DMM1MM	
r/   r   c                 |    US   n[         R                  " U/U R                  U5      QSS9nU R                  U5      $ rR   )r?   r[   r_  ra  )rM   r   rN  s      r0   psp_forwardBeitUperHead.psp_forward  sA    $R(yy,!P1A1A,1O!PVWX""<00r/   encoder_hidden_statesc           	      &   / n[        U R                  U5       H  u  p4UR                  U" U5      5        M     UR                  U R                  U5      5        [	        U5      n[        US-
  SS5       HF  nX&S-
     R                  SS  nX&S-
     [        R                  R                  X&   USSS9-   X&S-
  '   MH     / n[        US-
  5       H)  nUR                  U R                  U   " X&   5      5        M+     UR                  US   5        [        US-
  SS5       H7  n[        R                  R                  X   US   R                  SS  SSS9X'   M9     [        R                  " USS9nU R                  U5      n	U R                  U	5      n	U	$ )	Nr"   r   rS   r   r   Fr   rT   )ziprb  rd  rh  r`  r   rV   r   r   r   rc  r?   r[   re  r,  )
rM   rj  lateralslateral_convrN  used_backbone_levelsr   
prev_shapefpn_outsoutputs
             r0   rf   BeitUperHead.forward  s   *-d.@.@BW*X&LOOL67 +Y 	(()>?@  #8}+a/B7A!a%..qr2J&1uo0I0I*:U 1J 1 HUO 8 +a/0AOODNN1-hk:; 1 	%+a/B7A--33(1+"3"3AB"7jX] 4 HK 8 99X1-$$X.(r/   )	rE  r,  re  rc  r4  rb  rR  ra  r_  )r)   r*   r+   r,   r-   r#   r=   rs   r?   r   rh  rf   r.   r   r   s   @r0   r[  r[  Y  sa     
z  
d  
D1ell); 1 1
T%,,-? ELL  r/   r[  c                      ^  \ rS rSrSr SS\S\S\S\\\\4   -  SS4
U 4S	 jjjrS
\	\
R                     S\
R                  4S jrSrU =r$ )BeitFCNHeadi  a  
Fully Convolution Networks for Semantic Segmentation. This head is implemented of
[FCNNet](https://huggingface.co/papers/1411.4038>).

Args:
    config (BeitConfig): Configuration.
    in_channels
    kernel_size (int): The kernel size for convs in the head. Default: 3.
    dilation (int): The dilation rate for convs in the head. Default: 1.


Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
r9   in_indexr6  r9  r:   Nc                 &  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l	        X l
        US-  U-  n[        R                  " 5       U l        U R                  S:  a  U R                  R                  [        U R                  U R
                  X5US95        [!        U R                  S-
  5       H=  nU R                  R                  [        U R
                  U R
                  UUUS95        M?     U R                  (       a4  [        U R                  U R
                  -   U R
                  X3S-  S9U l        [        R$                  " U R
                  UR&                  SS9U l        g )Nr   r   )r6  r8  r9  r"   r^  rG  )ro   r=   rA   r4  auxiliary_channelsrE  auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputrv  r   r   convsrd  r3  r   conv_catr=  r  r,  )rM   r9   rv  r6  r9  conv_paddingr]   rx   s          r0   r=   BeitFCNHead.__init__  sD    	!--1133"99 #q(H4]]_
>>AJJ$$dmmmu
 4>>A-.

!!!$/ ,!) / )  4==0$--[qrbrDM ))DMM63D3DRSTr/   rj  c                     XR                      nUnU R                   H  nU" U5      nM     U R                  (       a%  U R                  [        R
                  " X#/SS95      nU R                  U5      nU$ )Nr"   rT   )rv  r}  r|  r~  r?   r[   r,  )rM   rj  r   r   rJ  s        r0   rf   BeitFCNHead.forward  se    (7 JJD /M  MM%))X4MST*UVM6r/   )rE  r,  r|  r~  r}  r4  rv  rz  )r   r   r"   )r)   r*   r+   r,   r-   r#   r   rr   r=   rs   r?   r   rf   r.   r   r   s   @r0   ru  ru    s     no!U !U,/!UBE!UUX[`adfiai[jUj!U	!U !UFT%,,-? ELL  r/   ru  c            	       ~   ^  \ rS rSrSrSS\S\S\SS4U 4S jjjrS	\R                  S\R                  4S
 jr	Sr
U =r$ )BeitFPNUpBlocki  uE   4x upsampling block: ConvTranspose → BN → GELU → ConvTranspose.rA   r6  r7  r:   Nc                    > [         TU ]  5         [        R                  " XX#S9U l        [        R
                  " U5      U l        [        R                  " 5       U l        [        R                  " XX#S9U l	        g )Nr6  r7  )
ro   r=   r   ConvTranspose2dconv_transpose1BatchNorm2dnormalizationGELUr;  conv_transpose2)rM   rA   r6  r7  rx   s       r0   r=   BeitFPNUpBlock.__init__  sX    !11+Xcs^^K8'')!11+Xcsr/   r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ rh   )r  r  r;  r  r  s     r0   rf   BeitFPNUpBlock.forward  sF    ,,];**=96,,];r/   )r;  r  r  r  )r   r   )r)   r*   r+   r,   r-   r   r=   r?   r   rf   r.   r   r   s   @r0   r  r    sS    OtC tc ts tSW t tU\\ ell  r/   r  c                      ^  \ rS rSrSrS\4U 4S jjrS\\R                  S4   S\\R                  S4   4S jr
S	rU =r$ )
BeitFPNNecki  z
4-level feature pyramid neck for BeiT. Produces x4 upsample, x2 upsample,
identity, and x2 downsample outputs from the four selected ViT feature maps.
r9   c                    > [         TU ]  5         [        UR                  5      U l        [
        R                  " UR                  UR                  SSS9U l        [
        R                  " SSS9U l	        g )Nr   r  )
ro   r=   r  rA   fpn1r   r  fpn2	MaxPool2dfpn4r   s     r0   r=   BeitFPNNeck.__init__  sX    "6#5#56	&&v'9'96;M;M[\efg	LLQq9	r/   feature_maps.r:   c                     U R                  US   5      U R                  US   5      US   U R                  US   5      4$ rn   r  r  r  )rM   r  s     r0   rf   BeitFPNNeck.forward  sC    IIl1o&IIl1o&OIIl1o&	
 	
r/   r  )r)   r*   r+   r,   r-   r#   r=   rr   r?   r   rf   r.   r   r   s   @r0   r  r    sI    
:z :
E%,,*;$< 
u||UXGXAY 
 
r/   r  c                      ^  \ rS rSrS\SS4U 4S jjr\\\   SS\	R                  S-  S\	R                  S-  S\S	\\   S\\-  4
S
 jj5       5       5       rSrU =r$ )BeitForSemanticSegmentationi  r9   r:   Nc                 f  > [         TU ]  U5        UR                  U l        [        USS9U l        [        U R                  R                  5      S:w  a  [        S5      e[        U5      U l
        [        U5      U l        UR                  (       a  [        U5      OS U l        U R!                  5         g )NFr  r]  zBeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.)ro   r=   r  r   r  r`  r9   out_indices
ValueErrorr  fpnr[  decode_headuse_auxiliary_headru  auxiliary_headr   r   s     r0   r=   $BeitForSemanticSegmentation.__init__  s      ++f>	t{{&&'1,- 
 v& (/5;5N5Nk&1TX 	r/   rO   r  r\   r   c                   ^^^^ Ub%  U R                   R                  S:X  a  [        S5      eU R                  " U4SU0UD6nUR                  mUR
                  u  mpgnXpR                   R                  -  mXR                   R                  -  m[        UUUU4S jU R                   R                   5       5      n	U R                  U	5      n	U R                  U	5      n
SnU R                  b  U R                  U	5      nSnUb;  U R                  U
UU R                   R                  UU R                   R                  S9n[        UU
UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
>>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> # logits are of shape (batch_size, num_labels, height, width)
>>> logits = outputs.logits
```Nr"   z/The number of labels should be greater than oner\   c              3      >#    U  H8  nTUS -
     SS2S S24   R                  S S5      R                  TSTT5      v   M:     g7f)r"   Nr   rS   )	transposer   ).0r   ra   rj  patch_heightpatch_widths     r0   	<genexpr>6BeitForSemanticSegmentation.forward.<locals>.<genexpr>U  sP      
, "!a%(AB/99!Q?GG
TVXdfqrr,s   A A)ignore_indexauxiliary_logitsauxiliary_loss_weightr  )r9   r  r  r  r   rV   rF   rr   r  r  r  r  r/  semantic_loss_ignore_indexr  r   r"  )rM   rO   r  r\   r   r$  r]   r^   r_   r  r!  r  r   ra   rj  r  r  s                @@@@r0   rf   #BeitForSemanticSegmentation.forward%  s`   B $++"8"8A"=NOO))
%=
 
 !( 5 5'3'9'9$
Au!7!77{{555  
[[,,
 
 xx-!!,/*#22<@%%![[CC!1&*kk&G&G & D '!//))	
 	
r/   )r  r  r  r  r  r1  )r)   r*   r+   r,   r#   r=   r   r	   r   r?   r   r   r   r   rr   r   rf   r.   r   r   s   @r0   r  r    s    z d *   -1&*).	G
llT)G
 t#G
 #'	G

 +,G
 
(	(G
  ! G
r/   r  zM
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    c            	       b   ^  \ rS rSrU 4S jr\\\S\S\	\
   S\4S j5       5       5       rSrU =r$ )BeitBackboneir  c                 F  > [         TU ]  U5        [        UR                  S-   5       Vs/ s H  o!R                  PM     snU l        [        USS9U l        UR                  (       a  [        U5      O[        R                  " 5       U l        U R                  5         g s  snf )Nr"   Fr  )ro   r=   r   r   rA   num_featuresr   r  add_fpnr  r   r   r  r   )rM   r9   r]   rx   s      r0   r=   BeitBackbone.__init__x  sx     9>v?W?WZ[?[9\]9\A//9\]f>	*0..;v&bkkm 	 ^s   BrO   r   r:   c                 (   UR                   u  p4pVXPR                  R                  -  nX`R                  R                  -  nU R                  " U40 UD6n	U	R                  n
Sn[        U R                  U
5       Hi  u  pXR                  ;   d  M  U R                  R                  (       a3  USS2SS2SS24   nUR                  SS5      nUR                  USXx5      nX4-  nMk     U R                  U5      n[        UU	R                  U	R                  S9$ )a  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
>>> model = AutoBackbone.from_pretrained(
...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 14, 14]
```r(   Nr"   r   rS   )r  r   r"  )rV   r9   rF   r  r   rl  stage_namesout_featuresreshape_hidden_statesr  r   r  r   r"  )rM   rO   r   ra   r]   r^   r_   r  r  r$  r   r  stagerN  s                 r0   rf   BeitBackbone.forward  s   @ (4'9'9$
v!7!77{{555))L3F3--#&t'7'7#GE)));;44#/12q#9L#/#9#9!Q#?L#/#7#7
B#bL/ $H xx-%!//))
 	
r/   )r  r  r  )r)   r*   r+   r,   r=   r   r	   r   r   r   r   r   rf   r.   r   r   s   @r0   r  r  r  sN      3
3
 +,3
 
	3
  ! 3
r/   r  )r*  r  r  r   r   r  )Hr-   dataclassesr   r?   r   r    r   r   backbone_utilsr   r	   masking_utilsr
   modeling_outputsr   r   r   r   r   modeling_utilsr   processing_utilsr   pytorch_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   resnet.modeling_resnetr   swin.modeling_swinr   vit.modeling_vitr   r   r   r   r    r!   configuration_beitr#   r&   r2   r7   r<   rk   r   r   r   r   r   r   r   r  r*  r3  rC  rQ  r[  ru  r  r  r  r  __all__r(   r/   r0   <module>r     sF    !   & H 6  . & @ B B I 5 4 - t t * 
 !;  	, 	'] 'TV3ryy V3r`L `	f 		< 	7 7t T, T T. Jj# Jj JjZ	v 	v S
!4 S
S
l /
!4 /
/
d
O 
4
bii 
Sryy S<N299 Nb:")) :zRYY $
")) 
* `
"5 `
 `
F 
A
="5 A

A
Hr/   