
    3j                        S SK rS SKrS SK Jr  S SKJr  S SKrS SKJr  SSKJ	r
  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSKJrJ r   SSK!J"r"J#r#  SSK$J%r%   " S S\RL                  5      r'\" SS9\ " S S\5      5       5       r(\" SS9\ " S S\5      5       5       r)\" SS9\ " S S\5      5       5       r*\" SS9\ " S S \5      5       5       r+ " S! S"\RL                  5      r, " S# S$\RL                  5      r- " S% S&\RL                  5      r. " S' S(\RL                  5      r/  SLS)\RL                  S*\R`                  S+\R`                  S,\R`                  S-\R`                  S-  S.\1S-  S/\1S0\\   4S1 jjr2 " S2 S3\RL                  5      r3 " S4 S5\RL                  5      r4S6 r5S7 r6 " S8 S9\5      r7 " S: S;\5      r8\ " S< S=\5      5       r9 " S> S?\95      r:\ " S@ SA\95      5       r;\" SBS9 " SC SD\95      5       r<\" SES9 " SF SG\95      5       r=\" SHS9 " SI SJ\\95      5       r>/ SKQr?g)M    N)Callable)	dataclass)nn   )initialization)ACT2FN)BackboneMixinfilter_output_hidden_states)GradientCheckpointingLayer)BackboneOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)OutputRecordercapture_outputs   )
SwinConfigc                      ^  \ rS rSrSrSS\SS4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )SwinDropPath*   zStochastic depth (DropPath) per sample, for residual blocks.

Identity when ``drop_prob`` is 0 or outside training. See `Deep Networks with Stochastic Depth
<https://arxiv.org/abs/1603.09382>`_.
	drop_probreturnNc                 .   > [         TU ]  5         Xl        g N)super__init__r   )selfr   	__class__s     `/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/swin/modeling_swin.pyr"   SwinDropPath.__init__1   s    "    hidden_statesc                 V   U R                   S:X  d  U R                  (       d  U$ SU R                   -
  nUR                  S   4SUR                  S-
  -  -   n[        R
                  " X1R                  UR                  S9n[        R                  " XB-   5      nUR                  U5      U-  $ )N        r   r   )r   dtypedevice)
r   trainingshapendimtorchrandr,   r-   floordiv)r#   r(   	keep_probr/   random_tensors        r%   forwardSwinDropPath.forward5   s    >>S   &	$$Q')DM4F4F4J,KK

50C0CML`L`aM$=>  +m;;r'   c                      SU R                    3$ )Nzp=r   )r#   s    r%   
extra_reprSwinDropPath.extra_repr>   s    DNN#$$r'   r:   )r*   )__name__
__module____qualname____firstlineno____doc__floatr"   r1   Tensorr7   strr;   __static_attributes____classcell__r$   s   @r%   r   r   *   sL    #% #$ # #<U\\ <ell <%C % %r'   r   zN
    Swin encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   S	rg)
SwinEncoderOutputB   a  
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlast_hidden_state.r(   
attentionsreshaped_hidden_states )r=   r>   r?   r@   rA   rL   r1   FloatTensor__annotations__r(   tuplerM   rN   rE   rO   r'   r%   rJ   rJ   B   s}     37u((4/6:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr'   rJ   zV
    Swin model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)SwinModelOutputX   a  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
    Average pooling of the last layer hidden-state.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
NrL   pooler_output.r(   rM   rN   rO   )r=   r>   r?   r@   rA   rL   r1   rP   rQ   rV   r(   rR   rM   rN   rE   rO   r'   r%   rT   rT   X   s    	 37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr'   rT   z*
    Swin masked image model outputs.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)SwinMaskedImageModelingOutputq   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
    Masked image modeling (MLM) loss.
reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Reconstructed pixel values.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlossreconstruction.r(   rM   rN   rO   )r=   r>   r?   r@   rA   rZ   r1   rP   rQ   r[   r(   rR   rM   rN   rE   rO   r'   r%   rX   rX   q   s     &*D%

d
")/3NE%%,3:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr'   rX   z0
    Swin outputs for image classification.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)SwinImageClassifierOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Classification (or regression if config.num_labels==1) scores (before SoftMax).
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
NrZ   logits.r(   rM   rN   rO   )r=   r>   r?   r@   rA   rZ   r1   rP   rQ   r_   r(   rR   rM   rN   rE   rO   r'   r%   r]   r]      s     &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr'   r]   c            
          ^  \ rS rSrSrSU 4S jjrS\R                  S\S\S\R                  4S jr	  SS
\R                  S	-  S\R                  S	-  S\S\\R                     4S jjrSrU =r$ )SwinEmbeddings   zO
Construct the patch and position embeddings. Optionally, also the mask token.
c                   > [         TU ]  5         [        U5      U l        U R                  R                  nU R                  R
                  U l        U(       a6  [        R                  " [        R                  " SSUR                  5      5      OS U l        UR                  (       a5  [        R                  " [        R                  " SX1R                  5      5      OS U l        [        R                  " UR                  5      U l        [        R"                  " UR$                  5      U l        UR(                  U l        Xl        g )Nr   )r!   r"   SwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr1   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)r#   ru   use_mask_tokenrf   r$   s       r%   r"   SwinEmbeddings.__init__   s     3F ;++77//99O]",,u{{1a9I9I'JKcg LRKiKiBLLQ5E5EFGos 	  LL!1!12	zz&"<"<= ++r'   
embeddingsheightwidthr   c                 $   UR                   S   nU R                  R                   S   n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ UR                   S   nX R
                  -  nX0R
                  -  n[        US-  5      n	U R                  R                  SXU5      n
U
R                  SSSS5      n
[        R                  R                  U
Xx4SSS	9n
U
R                  SSSS5      R                  SSU5      $ )
z
Interpolate pre-trained position encodings to support higher-resolution images at inference.
Unlike ViT, Swin has no CLS token, so position embeddings cover patch positions only.
r         ?r   r      bicubicF)sizemodealign_corners)r/   rn   r1   jit
is_tracingrt   r   reshapepermuter   
functionalinterpolateview)r#   rx   ry   rz   rf   num_positionsdim
new_height	new_widthsqrt_num_positionspatch_pos_embeds              r%   interpolate_pos_encoding'SwinEmbeddings.interpolate_pos_encoding   s   
 !&&q)0066q9 yy##%%+*F6?+++r".
__,	&}c'9:22::1>Pfij)11!Q1=--33(	 4 
 &&q!Q2772sCCr'   Npixel_valuesbool_masked_posr   c                    UR                   u  pEpgU R                  U5      u  pU R                  U5      nUR                  5       u  pnUbI  U R                  R                  XS5      nUR                  S5      R                  U5      nUSU-
  -  X-  -   nU R                  b*  U(       a  XR                  XU5      -   nOXR                  -   nU R                  U5      nX4$ )Nr|   g      ?)r/   re   rp   r   rl   expand	unsqueezetype_asrn   r   rs   )r#   r   r   r   _num_channelsry   rz   rx   output_dimensions
batch_sizeseq_lenmask_tokensmasks                 r%   r7   SwinEmbeddings.forward   s     *6););&(,(=(=l(K%
YYz*
!+!2
Q&//00bIK",,R088ED#sTz2[5GGJ##/''*G*G
\a*bb
'*B*BB
\\*-
,,r'   )ru   rs   rl   rp   re   rh   rt   rn   F)NF)r=   r>   r?   r@   rA   r"   r1   rC   intr   rP   
BoolTensorboolrR   r7   rE   rF   rG   s   @r%   ra   ra      s    "D5<< D DUX D]b]i]i DB 48).	-''$.- ))D0- #'	-
 
u||	- -r'   ra   c                      ^  \ rS rSrSrU 4S jrS rS\R                  S-  S\	\R                  \	\   4   4S jrS	rU =r$ )
rd      z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX0l        X`l
        US   US   -  US   US   -  4U l        [        R                  " XEX3S9U l        g )Nr   r   )kernel_sizestride)r!   r"   
image_sizert   r   rk   
isinstancecollectionsabcIterablerf   rg   r   Conv2d
projection)r#   ru   r   rt   r   hidden_sizerf   r$   s          r%   r"   SwinPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79I9Ik#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$&$Q-:a=8*Q-:VW=:XY))L:ir'   c                 f   X0R                   S   -  S:w  aB  SU R                   S   X0R                   S   -  -
  4n[        R                  R                  X5      nX R                   S   -  S:w  aD  SSSU R                   S   X R                   S   -  -
  4n[        R                  R                  X5      nU$ )z9Pad pixel_values to be divisible by patch_size if needed.r   r   )rt   r   r   pad)r#   r   ry   rz   
pad_valuess        r%   	maybe_padSwinPatchEmbeddings.maybe_pad  s    ??1%%*T__Q/%//!:L2LLMJ==,,\FLOOA&&!+Q4??1#5QRAS8S#STJ==,,\FLr'   r   Nr   c                     UR                   u  p#pEU R                  XU5      nU R                  U5      nUR                   u    p$nXE4nUR                  S5      R	                  SS5      nXg4$ )Nr~   r   )r/   r   r   flatten	transpose)r#   r   r   r   ry   rz   rx   r   s           r%   r7   SwinPatchEmbeddings.forward  sp    )5););&~~lEB__\2
(..1e#O''*44Q:
,,r'   )rg   rf   rt   r   )r=   r>   r?   r@   rA   r"   r   r1   rP   rR   rC   r   r7   rE   rF   rG   s   @r%   rd   rd      sK    j	-E$5$5$< 	-u||UZ[^U_G_A` 	- 	-r'   rd   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\S	\S\R                  4S
 jr	S\R                  S\
\\4   S\R                  4S jrSrU =r$ )SwinPatchMergingi!  zP
Patch Merging Layer.

Args:
    dim (`int`):
        Number of input channels.
r   r   Nc                    > [         TU ]  5         [        R                  " SU-  SU-  SS9U l        [        R
                  " SU-  5      U l        g )N   r~   Fbias)r!   r"   r   Linear	reductionro   rp   )r#   r   r$   s     r%   r"   SwinPatchMerging.__init__*  s>    1s7AG%@LLS)	r'   input_featurery   rz   c           
          US-  S:X  d	  US-  S:X  a,  [         R                  R                  USSSUS-  SUS-  45      nU$ )zPPad input feature map to be divisible by 2 in both spatial dimensions if needed.r~   r   r   )r   r   r   )r#   r   ry   rz   s       r%   r   SwinPatchMerging.maybe_pad/  sL    QJ!OaMM--maAuqyRSU[^_U_=`aMr'   input_dimensionsc                    Uu  p4UR                   u  pVnUR                  XSXG5      nU R                  XU5      n[        R                  " [        S5       VV	s/ s H(  n[        S5        H  oS S 2U	S S2US S2S S 24   PM     M*     sn	nSS9nUR                  USSU-  5      nU R                  U5      nU R                  U5      nU$ s  sn	nf )Nr~   r|   r   r   )r/   r   r   r1   catrangerp   r   )
r#   r   r   ry   rz   r   r   r   colrows
             r%   r7   SwinPatchMerging.forward5  s    ((5(;(;%
%**:uS}eD		<A!HYHSPUVWPX1cf1fcf1fa/0PX0HY_a
 &**:r1|;KL		-0}5 Zs   /C
)rp   r   )r=   r>   r?   r@   rA   r   r"   r1   rC   r   rR   r7   rE   rF   rG   s   @r%   r   r   !  sz    *C *D *
u|| S  QVQ]Q] U\\ U3PS8_ Y^YeYe  r'   r   c                      ^  \ rS rSrSrS\S\\\4   4U 4S jjrS\R                  4S jr
S\R                  4S jrS	rU =r$ )
SwinRelativePositionBiasiI  a  
Relative position bias for Swin's window-based attention, following the style of BeitRelativePositionBias.

Unlike BeiT, Swin has no CLS token, so the table covers exactly (2*ws_h-1)*(2*ws_w-1) unique
relative positions. The lookup index is purely determined by window_size (static), so it is stored
as a non-persistent buffer (recomputed from config on load, never serialised). The table values
are learned parameters and must be re-read on every forward call.
	num_headswindow_sizec                 4  > [         TU ]  5         X l        US   US   -  U l        [        R
                  " [        R                  " SUS   -  S-
  SUS   -  S-
  -  U5      5      U l        U R                  SU R                  5       R                  S5      SS9  g )Nr   r   r~   relative_position_indexr|   F)
persistent)r!   r"   r   window_arear   ri   r1   rj   relative_position_bias_tableregister_buffer_create_relative_position_indexr   )r#   r   r   r$   s      r%   r"   !SwinRelativePositionBias.__init__S  s    &&q>KN:,.LLKK[^+a/AA4F4JKYW-
)
 	%00277; 	 	
r'   r   c                    [         R                  " U R                  S   5      n[         R                  " U R                  S   5      n[         R                  " [         R                  " X/SS95      n[         R
                  " US5      nUS S 2S S 2S 4   US S 2S S S 24   -
  nUR                  SSS5      R                  5       nUS S 2S S 2S4==   U R                  S   S-
  -  ss'   US S 2S S 2S4==   U R                  S   S-
  -  ss'   US S 2S S 2S4==   SU R                  S   -  S-
  -  ss'   UR                  S5      $ )Nr   r   ij)indexingr~   r|   )	r1   aranger   stackmeshgridr   r   
contiguoussum)r#   coords_hcoords_wcoordscoords_flattenrelative_coordss         r%   r   8SwinRelativePositionBias._create_relative_position_indexb  s'   << 0 0 34<< 0 0 34U^^X,@4PQvq1(At4~aqj7QQ)11!Q:EEG 	1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? ""2&&r'   c                     U R                   U R                     nUR                  U R                  U R                  S5      nUR	                  SSS5      R                  5       R                  S5      $ )Nr|   r~   r   r   )r   r   r   r   r   r   r   )r#   relative_position_biass     r%   r7    SwinRelativePositionBias.forwards  sd    !%!B!B4C_C_!`!7!<!<T=M=MtO_O_ac!d%--aA6AACMMaPPr'   )r   r   r   )r=   r>   r?   r@   rA   r   rR   r"   r1   rC   r   r7   rE   rF   rG   s   @r%   r   r   I  sK    
# 
E#s(O 
' '"Q Q Qr'   r   modulequerykeyvalueattention_maskscalingrs   kwargsc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  nUb  X-   n[        R
                  R                  US[        R                  S9R                  UR                  5      n[        R
                  R                  XU R                  S9n[        R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr|         r~   r   )r   r,   )pr.   r   )r   r1   matmulr   r   r   softmaxfloat32tor,   rs   r.   r   )
r   r   r   r   r   r   rs   r   attn_weightsattn_outputs
             r%   eager_attention_forwardr   y  s     **R.D( <<}}Q':;gEL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r'   c                      ^  \ rS rSrS\S\S\S\4U 4S jjr SS\R                  S	\R                  S-  S
\
\   S\\R                  \R                  4   4S jjrSrU =r$ )SwinAttentioni  ru   r   num_attention_headsr   c                   > [         TU ]  5         Xl        X0l        X#-  U l        UR
                  U l        U R                  S-  U l        SU l        [        R                  " X"UR                  S9U l        [        R                  " X"UR                  S9U l        [        R                  " X"UR                  S9U l        [        R                  " X"5      U l        [!        X4U45      U l        g )Nr   Fr   )r!   r"   ru   r   head_dimattention_probs_dropout_probattention_dropoutr   	is_causalr   r   qkv_biasq_projk_projv_projo_projr   r   )r#   ru   r   r   r   r$   s        r%   r"   SwinAttention.__init__  s    #6 #:!'!D!D}}d*iivOiivOiivOii9&>?RbmTn&o#r'   Nr(   r   r   r   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  5       n	Ube  UR                   S   n
US   U
-  nUS   nUR                  S5      R                  S5      R                  USSSS5      R                  SSX5      nX-   nOU	n[        R                  " U R                  R                  [        5      nU" U UUUU4U R                   (       d  SOU R"                  U R$                  S.UD6u  nnUR                  " / UQSP76 R'                  5       nU R)                  U5      nUU4$ )Nr|   r   r~   r   r*   )rs   r   )r/   r  r  r   r   r  r  r   r   r   r   r   get_interfaceru   _attn_implementationr   r.   r  r   r   r	  )r#   r(   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesr   num_windowsr   r   combined_maskattention_interfacer   r   s                    r%   r7   SwinAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST "&!<!<!>%(..q1K$Q;6J!!nG ((+1
BB3Q1	  3CM2M(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r'   )r  ru   r  r  r  r   r	  r  r   r   r  r    )r=   r>   r?   r@   r   r   r"   r1   rC   rP   r   r   rR   r7   rE   rF   rG   s   @r%   r   r     s    pz p pRU pdg p& 482)||2) ))D02) +,	2)
 
u||U\\)	*2) 2)r'   r   c                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	SwinMLPi  ru   r   c                   > [         TU ]  5         [        UR                     U l        [
        R                  " U[        UR                  U-  5      5      U l	        [
        R                  " [        UR                  U-  5      U5      U l
        g r    )r!   r"   r   
hidden_actactivation_fnr   r   r   	mlp_ratiofc1fc2)r#   ru   r   r$   s      r%   r"   SwinMLP.__init__  se    #F$5$5699S#f&6&6&<"=>99S!1!1C!78#>r'   r(   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r    )r  r  r  )r#   r(   s     r%   r7   SwinMLP.forward  s4    /**=9/r'   )r  r  r  )r=   r>   r?   r@   r   r   r"   r1   rC   r7   rE   rF   rG   s   @r%   r  r    s6    ?z ? ?U\\ ell  r'   r  c                     U R                   u  p#pEU R                  X#U-  XU-  X5      n U R                  SS5      R                  5       R                  SXU5      nU$ )z*
Partitions the given input into windows.
r~   r   r|   r/   r   r   r   )r   r   r   ry   rz   r   windowss          r%   window_partitionr%    sg     /<.A.A+J!&&k);8LkM %%a+668==b+\hiGNr'   c                     U R                   S   nU R                  SX!-  X1-  XU5      n U R                  SS5      R                  5       R                  SX#U5      n U $ )z7
Merges windows to produce higher resolution features.
r|   r~   r   r#  )r$  r   ry   rz   r   s        r%   window_reverser'    s^     ==$Lll2v4e6JKfrsG1%00277F<XGNr'   c                     ^  \ rS rSr  SS\S\S\\\4   S\S\S\4U 4S jjjr SS	\	R                  S
\\\4   S\S\\   S\	R                  4
S jjrS\\\4   SS4S jrS\S\S\	R                   S\	R"                  S\	R                  S-  4
S jrS	\	R                  S\S\S\\	R                  \\S4   4   4S jrSS	\	R                  S\S\	R                  4S jjrSrU =r$ )	SwinLayeri   ru   r   input_resolutionr   drop_path_rate
shift_sizec                   > [         TU ]  5         [        XXAR                  S9U l        [
        R                  " X!R                  S9U l        [
        R                  " X!R                  S9U l	        [        X5      U l        [
        R                  " UR                  5      U l        UR                  U l        X`l        X0l        US:  a  [#        U5      U l        g [
        R$                  " 5       U l        g )N)r   epsr*   )r!   r"   r   r   	attentionr   ro   layer_norm_epslayernorm_beforelayernorm_afterr  mlprq   rr   rs   r,  r*  r   Identity	drop_path)r#   ru   r   r*  r   r+  r,  r$   s          r%   r"   SwinLayer.__init__  s     	&vIK]K]^ "S6K6K L!||C5J5JK6'zz&"<"<=!--$ 09G#9Mn5SUS^S^S`r'   r(   r   always_partitionr   r   c                    U(       d  U R                  U5        Uu  pVUR                  5       u  pxn	Un
U R                  U5      nUR                  XuXi5      nU R	                  XU5      u  pUR
                  u  pp[        U R                  U5      U R                  5      nUR                  SU R                  U R                  -  U	5      nU R                  XUR                  UR                  S9nU R                  " X40 UD6u  nnU R                  U5      nUR                  SU R                  U R                  U	5      nU R                  [        UU R                  X5      SS9nUS   S:  d	  US   S:  a  US S 2S U2S U2S S 24   R                  5       nUR                  XuU-  U	5      nXR!                  U5      -   nUnU R#                  U5      nU R%                  U5      nU R                  U5      U-   nUU4$ )Nr|   r+   T)reverser   r      )set_shift_and_window_sizer   r2  r   r   r/   r%  cyclic_shiftr   get_attn_maskr,   r-   r0  rs   r'  r   r6  r3  r4  )r#   r(   r   r8  r   ry   rz   r   r   channelsshortcutr   
height_pad	width_padhidden_states_windows	attn_maskattention_outputr   attention_windowsresiduals                       r%   r7   SwinLayer.forward  s     **+;<("/"4"4"6
x --m<%**:uO %)NN=%$P!&3&9&9#y 01B1B=1QSWScSc d 5 : :2t?O?ORVRbRb?bdl m&&)<)<EZEaEa ' 
	 *.8M)c\b)c&,<<(89,11"d6F6FHXHXZbc --,d.>.>
V`d . 
 a=1
1 1 1!WfWfufa2G H S S U-22:~xX >>2C#DD ,,];/]3h>l**r'   Nc                    [        U5      U R                  ::  an  [        S5      U l        [        R
                  R                  5       (       a*  [        R                   " [        R                  " U5      5      O
[        U5      U l        gg)zQClamp window and shift sizes when the window is larger than the input resolution.r   N)minr   r   r,  r1   r   r   tensor)r#   r*  s     r%   r<  #SwinLayer.set_shift_and_window_sizeD  s_     D$4$44'lDO=BYY=Q=Q=S=S		%,,'789Y\]mYn  5r'   ry   rz   r,   r-   c           	         U R                   S:  Gae  [        R                  " SXS4X4S9n[        SU R                  * 5      [        U R                  * U R                   * 5      [        U R                   * S5      4n[        SU R                  * 5      [        U R                  * U R                   * 5      [        U R                   * S5      4nSnU H  n	U H  n
XSS2XSS24'   US-  nM     M     [        XPR                  5      nUR                  SU R                  U R                  -  5      nUR                  S5      UR                  S5      -
  nUR                  US:g  S5      R                  US:H  S5      nU$ SnU$ )	z`Build the cyclic-shift attention mask for shifted-window MSA; returns None when shift_size is 0.r   r   r+   Nr|   r~   g      Yr*   )	r,  r1   rj   slicer   r%  r   r   masked_fill)r#   ry   rz   r,   r-   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windowsrD  s                r%   r>  SwinLayer.get_attn_maskL  sy   ??Q{{Ava#8UHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E -#/K@EQ1<=QJE $0 !.
 ,H6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir'   .c                     U R                   X0R                   -  -
  U R                   -  nU R                   X R                   -  -
  U R                   -  nSSSUSU4n[        R                  R                  X5      nX4$ )zHPad feature map so both spatial dimensions are divisible by window_size.r   )r   r   r   r   )r#   r(   ry   rz   	pad_right
pad_bottomr   s          r%   r   SwinLayer.maybe_padh  sy    %%0@0@(@@DDTDTT	&&2B2B)BBdFVFVV
Ay!Z8
))-D((r'   r:  c                     U R                   S:  a;  U(       a  SOSn[        R                  " UX0R                   -  X0R                   -  4SS9nU$ )zOApply a cyclic shift along the spatial dimensions for shifted-window attention.r   r   r|   )r   r~   )shiftsdims)r,  r1   roll)r#   r(   r:  	directions       r%   r=  SwinLayer.cyclic_shiftp  sI    ??Q$"I!JJ!OO3Y5PQM
 r'   )	r0  r6  rs   r*  r3  r2  r4  r,  r   )r*   r   r   )r=   r>   r?   r@   r   r   rR   rB   r"   r1   rC   r   r   r   r7   r<  r,   r-   r>  r   r=  rE   rF   rG   s   @r%   r)  r)     sy    !$aa a  S/	a
 a a a a0 "'	-+||-+  S/-+ 	-+
 +,-+ 
-+^%S/ d C  EKK QVQ]Q] bgbnbnqubu 8)u|| )S ) )QVW\WcWcejknpsksetWtQu )	%,, 	 	RWR^R^ 	 	r'   r)  c                   N  ^  \ rS rSrS\S\S\\\4   S\S\S\\   4U 4S jjr	S	\
R                  S
\
R                  S\S\S\S\
R                  4S jr  SS	\
R                  S\\\4   S\S\S\\   S\\
R                  \
R                  \
R                  S-  4   4S jjrSrU =r$ )	SwinStagei|  ru   r   r*  depthr   r6  c                   > [         T	U ]  5         Xl        [        R                  " [        U5       Vs/ s H+  n[        UUUUXh   US-  S:X  a  SOUR                  S-  S9PM-     sn5      U l        Ub  U" US9U l	        g S U l	        g s  snf )Nr~   r   )ru   r   r*  r   r+  r,  r   )
r!   r"   ru   r   
ModuleListr   r)  r   blocks
downsample)
r#   ru   r   r*  rd  r   r6  rh  ir$   s
            r%   r"   SwinStage.__init__}  s     	mm u
 &A !%5'#,<%&UaZqf6H6HA6M &

 2<1G*-T
s   2Br(   !hidden_states_before_downsamplingry   rz   (output_hidden_states_before_downsamplingr   c                     U(       a  X#UpnO"U R                   b  XS-   S-  US-   S-  pnOXUpnUR                  u  pnUR                  XX5      R                  SSSS5      R	                  5       $ )u^  
Select the spatial hidden states for this stage and reshape from (B, L, C) to (B, C, H, W).

The chosen state and its resolution depend on output_hidden_states_before_downsampling:
- True  → pre-downsampling states at (height, width) — used by the backbone.
- False → post-downsampling states at half the resolution (if a downsampler exists).
r   r~   r   r   )rh  r/   r   r   r   )r#   r(   rk  ry   rz   rl  spatial_statehwr   r   r   s               r%   get_reshaped_hidden_states$SwinStage.get_reshaped_hidden_states  s     4"CUaMa__("/1*1BUQYSTDTaMa"/aM%2%8%8"
{!!*@HHAqRST__aar'   r   r8  r   Nc                     Uu  pgS nU R                    H  n	U	" X4SU0UD6u  pM     Un
U R                  b  U R                  X5      nU R                  XXgU5      nXU4$ )Nr8  )rg  rh  rq  )r#   r(   r   r8  rl  r   ry   rz   last_attn_weightslayer_modulerk  rN   s               r%   r7   SwinStage.forward  s     )  KKL/;0BR0V\0,M, (
 -:)??& OO,M`M!%!@!@fMu"
 6GGGr'   )rg  ru   rh  )FF)r=   r>   r?   r@   r   r   rR   listrB   r"   r1   rC   r   rq  r   r   r7   rE   rF   rG   s   @r%   rc  rc  |  s*   RR R  S/	R
 R R ;R8b||b ,1<<b 	b
 b 37b 
b: "'9>H||H  S/H 	H
 37H +,H 
u||U\\5<<$+>>	?H Hr'   rc  c                      ^  \ rS rSr% \\S'   SrSrSrSr	S/r
SrSrSrSrSr\" \S	SS
9\" \SSS
9S.rSrSS/r\R,                  " 5       U 4S j5       rSrU =r$ )SwinPreTrainedModeli  ru   swinr   )imageTrc  Fr   )indexcapture_initial_hidden_stater~   )r(   rM   re   z(attention\.self\.relative_position_indexz:attention\.relative_position_bias\.relative_position_indexc                   > [         TU ]  U5        [        U[        5      (       a\  UR                  b   [
        R                  " UR                  5        UR                  b!  [
        R                  " UR                  5        gg[        U[        5      (       a_  [
        R                  " UR                  5        [
        R                  " UR                  UR                  5       R                  S5      5        gg)zInitialize the weightsNr|   )r!   _init_weightsr   ra   rl   initzeros_rn   r   r   copy_r   r   r   )r#   r   r$   s     r%   r  !SwinPreTrainedModel._init_weights  s     	f%fn--  ,F--.))5F667 6 899KK;;<JJv55v7]7]7_7d7deg7hi :r'   rO   )r=   r>   r?   r@   r   rQ   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_can_compile_fullgraphr   rc  _can_record_outputs_input_embed_layer"_keys_to_ignore_on_load_unexpectedr1   no_gradr  rE   rF   rG   s   @r%   ry  ry    s    $O!&*#$N "&! (	Y]^
 %YaV[\	 , 	4E*&
 ]]_
j 
jr'   ry  c                      ^  \ rS rSrS\S\\\4   4U 4S jjr\\	" SS9\
   SS\R                  S\\\4   S	\S
\S\S\\   S\4S jj5       5       5       rSrU =r$ )SwinEncoderi  ru   rg   c                   > [         TU ]  U5        [        UR                  5      U l        Xl        [        [        UR                  5      5       Vs/ s H4  o1R                  U-  [        [        UR                  5      S-
  S5      -  PM6     nn[        R                  " [        U R                  5       Vs/ s H  n[        U[        UR                  SU-  -  5      US   SU-  -  US   SU-  -  4UR                  U   UR                  U   U[        UR                  S U 5      [        UR                  S US-    5       XPR                  S-
  :  a  [         OS S9PM     sn5      U l        U R%                  5         g s  snf s  snf )Nr   r~   r   )ru   r   r*  rd  r   r6  rh  )r!   r"   lendepths
num_layersru   r   r   r+  maxr   rf  rc  r   rk   r   r   layers	post_init)r#   ru   rg   ri  dpr	layer_idxr$   s         r%   r"   SwinEncoder.__init__  sh    fmm,SXY\]c]j]jYkSlmSla$$q(3s6==/AA/Eq+IISlmmm "'t!7 "8I !F,,q)|;<&/lq)|&DiPQlWXZcWcFd%e --	2$..y9!#fmmJY&?"@3v}}UdW`cdWdGeCfg4=RS@S4S/Z^ "8
 	 ns   ;E<5B*FF)tie_last_hidden_statesr(   r   r8  output_hidden_statesrl  r   r   c                 ^   SnU(       aG  UR                   u  pn
UR                  " U/UQU
P76 R                  SSSS5      R                  5       nU4nU R                   HD  nU" UU4UUS.UD6u  pn	U(       a  X}4-  nUR
                  c  M.  US   S-   S-  US   S-   S-  4nMF     [        UUS9$ )a  
input_dimensions (`tuple[int, int]`):
    Spatial `(height, width)` of the patch grid entering the encoder.
always_partition (`bool`, *optional*, defaults to `False`):
    If `True`, always apply window partitioning regardless of input resolution.
output_hidden_states_before_downsampling (`bool`, *optional*, defaults to `False`):
    If `True`, `reshaped_hidden_states` contains pre-downsampling feature maps.
Nr   r   r   r~   r8  rl  )rL   rN   )r/   r   r   r   r  rh  rJ   )r#   r(   r   r8  r  rl  r   all_reshaped_hidden_statesr   r   r   stem_spatialru  reshaped_hidden_states                 r%   r7   SwinEncoder.forward  s    ( &*" *7)<)<&J;"":N0@N+NVVWXZ[]^`abmmo  +7& KKL6B 7 "29a	7
 73M! $*.FF*&&2%5a%81%<$BEUVWEX[\E\abDb#c  ( !+#=
 	
r'   )ru   r  r  )FFF)r=   r>   r?   r@   r   rR   r   r"   r   r   r   r1   rC   r   r   r   rJ   r7   rE   rF   rG   s   @r%   r  r    s    z eCHo *  E2
 "'%*9>+
||+
  S/+
 	+

 #+
 37+
 +,+
 
+
  3  +
r'   r  c                      ^  \ rS rSrSU 4S jjr\\   SS\R                  S-  S\R                  S-  S\
S\\   S\4
S	 jj5       5       rS
rU =r$ )	SwinModeli?  c                   > [         TU ]  U5        Xl        [        UR                  5      U l        [        UR                  SU R
                  S-
  -  -  5      U l        [        XS9U l
        [        XR                  R                  5      U l        [        R                  " U R                  UR                   S9U l        U(       a  [        R$                  " S5      OSU l        U R)                  5         g)z
add_pooling_layer (`bool`, *optional*, defaults to `True`):
    Whether or not to apply pooling layer.
use_mask_token (`bool`, *optional*, defaults to `False`):
    Whether or not to create and apply mask tokens in the embedding layer.
r~   r   )rv   r.  N)r!   r"   ru   r  r  r  r   rk   num_featuresra   rx   r  rh   encoderr   ro   r1  	layernormAdaptiveAvgPool1dpoolerr  )r#   ru   add_pooling_layerrv   r$   s       r%   r"   SwinModel.__init__A  s     	 fmm, 0 0119L3M MN(O"6??+E+EFd&7&7V=R=RS1Bb**1- 	r'   Nr   r   r   r   r   c                    UR                  SU R                  R                  5      nU R                  XUS9u  pgU R                  " UU4SU0UD6nUR
                  n	U R                  U	5      n	Sn
U R                  b8  U R                  U	R                  SS5      5      n
[        R                  " U
S5      n
[        U	U
UR                  UR                  UR                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
r  r   r   Nr   r~   )rL   rV   r(   rM   rN   )popru   r  rx   r  rL   r  r  r   r1   r   rT   r(   rM   rN   )r#   r   r   r   r   r  embedding_outputr   encoder_outputssequence_outputpooled_outputs              r%   r7   SwinModel.forwardV  s      &zz*@$++BbBbc-1__Tl .= .
* ,,
 "6
 	
 *;;..9;;" KK(A(A!Q(GHM!MM-;M-')77&11#2#I#I
 	
r'   )ru   rx   r  r  r  r  r  )TFNNF)r=   r>   r?   r@   r"   r   r   r1   rP   r   r   r   r   rT   r7   rE   rF   rG   s   @r%   r  r  ?  s{    *  2637).	(
''$.(
 ))D0(
 #'	(

 +,(
 
(
  (
r'   r  ad  
    Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                      ^  \ rS rSrU 4S jr\\   SS\R                  S-  S\R                  S-  S\
S\\   S\4
S	 jj5       5       rS
rU =r$ )SwinForMaskedImageModelingi  c                   > [         TU ]  U5        [        USSS9U l        [	        UR
                  SUR                  S-
  -  -  5      n[        R                  " [        R                  " X!R                  S-  UR                  -  SS9[        R                  " UR                  5      5      U l        U R                  5         g )NFT)r  rv   r~   r   )in_channelsout_channelsr   )r!   r"   r  rz  r   rk   r  r   
Sequentialr   encoder_strider   PixelShuffledecoderr  )r#   ru   r  r$   s      r%   r"   #SwinForMaskedImageModeling.__init__  s     fdS	6++aF4E4E4I.JJK}}II(7L7La7ORXReRe7est OOF112	
 	r'   Nr   r   r   r   r   c                 h   U R                   " U4UUS.UD6nUR                  nUR                  SS5      nUR                  u  pxn	[        R
                  " U	S-  5      =pUR                  XxX5      nU R                  U5      nSnUGb  U R                  R                  U R                  R                  -  nUR                  SX5      nUR                  U R                  R                  S5      R                  U R                  R                  S5      R                  S5      R                  5       n[        R                  R!                  XSS9nUU-  R#                  5       UR#                  5       S	-   -  U R                  R$                  -  n['        UUUR(                  UR*                  UR,                  S
9$ )a  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Examples:
```python
>>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
>>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")

>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
>>> list(reconstructed_pixel_values.shape)
[1, 3, 192, 192]
```r  r   r~   r}   Nr|   none)r   gh㈵>)rZ   r[   r(   rM   rN   )rz  rL   r   r/   mathr3   r   r  ru   r   rt   repeat_interleaver   r   r   r   l1_lossr   r   rX   r(   rM   rN   )r#   r   r   r   r   outputsr  r   r   sequence_lengthry   rz   reconstructed_pixel_valuesmasked_im_lossr   r   reconstruction_losss                    r%   r7   "SwinForMaskedImageModeling.forward  s   L ))
+%=
 	
 "33)33Aq94C4I4I1
/OS$899)11*FZ &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7lr"7"s1D8==?488:PTCTUX\XcXcXpXppN,5!//))#*#A#A
 	
r'   )r  rz  r  )r=   r>   r?   r@   r"   r   r   r1   rP   r   r   r   r   rX   r7   rE   rF   rG   s   @r%   r  r    s       2637).	H
''$.H
 ))D0H
 #'	H

 +,H
 
'H
  H
r'   r  a  
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune Swin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                      ^  \ rS rSrU 4S jr\\   SS\R                  S-  S\R                  S-  S\
S\\   S\4
S	 jj5       5       rS
rU =r$ )SwinForImageClassificationi  c                 D  > [         TU ]  U5        UR                  U l        [        U5      U l        UR                  S:  a5  [
        R                  " U R                  R                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )Nr   )r!   r"   
num_labelsr  rz  r   r   r  r5  
classifierr  )r#   ru   r$   s     r%   r"   #SwinForImageClassification.__init__  sx      ++f%	 EKDUDUXYDYBIIdii,,f.?.?@_a_j_j_l 	
 	r'   Nr   labelsr   r   r   c                    U R                   " U4SU0UD6nUR                  nU R                  U5      nSnUb  U R                  " X'U R                  40 UD6n[        UUUR                  UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r   N)rZ   r_   r(   rM   rN   )	rz  rV   r  loss_functionru   r]   r(   rM   rN   )	r#   r   r  r   r   r  r  r_   rZ   s	            r%   r7   "SwinForImageClassification.forward
  s     ))
%=
 
  --/%%fdkkLVLD(!//))#*#A#A
 	
r'   )r  r  rz  r  )r=   r>   r?   r@   r"   r   r   r1   rP   
LongTensorr   r   r   r]   r7   rE   rF   rG   s   @r%   r  r    s{      26*.).	!
''$.!
   4'!
 #'	!

 +,!
 
#!
  !
r'   r  zM
    Swin backbone, to be used with frameworks like DETR and MaskFormer.
    c            	          ^  \ rS rSrS/rS\4U 4S jjr\\\	S\
R                  S\\   S\4S j5       5       5       rS	rU =r$ )
SwinBackbonei0  zswin.layernorm.*ru   c           	        > [         TU ]  U5        UR                  /[        [	        UR
                  5      5       Vs/ s H  n[        UR                  SU-  -  5      PM      sn-   U l        [        USS9U l	        0 n[        U R                  U R                  5       H  u  pE[        R                  " U5      X4'   M     [        R                  " U5      U l        U R#                  5         g s  snf )Nr~   F)r  )r!   r"   rk   r   r  r  r   r  r  rz  zipout_featuresr?  r   ro   
ModuleDicthidden_states_normsr  )r#   ru   ri  r  stager   r$   s         r%   r"   SwinBackbone.__init__8  s     #--.X]^abhbobo^pXq1rXqST#f6F6FA6M2NXq1rrf>	 !#&t'8'8$--#HE)+l)C& $I#%==1D#E  	 2ss   %C*r   r   r   c                    U R                   " U4SSS.UD6nSn[        U R                  UR                  5       H  u  pVXPR                  ;   d  M  UR
                  u  pxpUR                  SSSS5      R                  5       nUR                  XyU
-  U5      nU R                  U   " U5      nUR                  XyX5      nUR                  SSSS5      R                  5       nXF4-  nM     [        UUR                  UR                  S9$ )	a  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
>>> model = AutoBackbone.from_pretrained(
...     "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 7, 7]
```
Tr  rO   r   r~   r   r   )feature_mapsr(   rM   )rz  r  stage_namesrN   r  r/   r   r   r   r  r   rM   )r#   r   r   r  r  r  hidden_stater   r   ry   rz   s              r%   r7   SwinBackbone.forwardG  s   H ))
!59
 	
 #&t'7'79W9W#XE))):F:L:L7
&+33Aq!Q?JJL+00e^\Z#77>|L+00UY+33Aq!Q?JJL/ $Y %!88))
 	
r'   )r  r  rz  )r=   r>   r?   r@   _keys_to_ignore_on_load_missingr   r"   r   r
   r   r1   rC   r   r   r   r7   rE   rF   rG   s   @r%   r  r  0  sd     (;&;#z   7
ll7
 +,7
 
	7
  ! 7
r'   r  )r  r  r  ry  r  )Nr*   )@collections.abcr   r  r   dataclassesr   r1   r    r   r  activationsr   backbone_utilsr	   r
   modeling_layersr   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   r   configuration_swinr   Moduler   rJ   rT   rX   r]   ra   rd   r   r   rC   rB   r   r   r  r%  r'  r)  rc  ry  r  r  r  r  r  __all__rO   r'   r%   <module>r     s  *   $ !   & ! H 9 . F & O O I E *%299 %0 
 H H H  
 Hk H H& 
 HK H H* 
 H H H*M-RYY M-`'-")) '-T%ryy %P-Qryy -Ql !%II%<<% 
% <<	%
 LL4'% T\% % '(%8C)BII C)Lbii 	y* yxMH* MH` (j/ (j (jVD
% D
N @
# @
 @
F 	[
!4 [
[
| 2
!4 2
2
j 
L
="5 L

L
^r'   