
    3jx                        S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSKJ	r	J
r
  SSKJr  SS	KJr  SS
KJrJrJrJrJr  SSKJr  SSKJr  \" 5       (       a	  SSKJrJr  OS rS r\" SS9\ " S S\5      5       5       r\" SS9\ " S S\5      5       5       r\" SS9\ " S S\5      5       5       r " S S\R>                  5      r  " S S\R>                  5      r! " S S \R>                  5      r" " S! S"\R>                  5      r# " S# S$\R>                  5      r$ " S% S&\R>                  5      r% " S' S(\R>                  5      r& " S) S*\R>                  5      r' " S+ S,\R>                  5      r( " S- S.\R>                  5      r) " S/ S0\R>                  5      r* " S1 S2\R>                  5      r+\ " S3 S4\5      5       r,\ " S5 S6\,5      5       r-\" S7S9 " S8 S9\,5      5       r.\" S:S9 " S; S<\	\,5      5       r// S=Qr0g)>z9PyTorch Dilated Neighborhood Attention Transformer model.    N)	dataclass)nn   )ACT2FN)BackboneMixinfilter_output_hidden_states)BackboneOutput)PreTrainedModel)ModelOutputOptionalDependencyNotAvailableauto_docstringis_natten_availablerequires_backends)can_return_tuple   )DinatConfig)
natten2davnatten2dqkrpbc                      [        5       eNr   argskwargss     b/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/dinat/modeling_dinat.pyr   r   )       ,..    c                      [        5       er   r   r   s     r   r   r   ,   r   r   zO
    Dinat encoder's outputs, with potential hidden states and attentions.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   Sr\
\R                  S4   S-  \S'   S	rg)
DinatEncoderOutput0   a  
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states )__name__
__module____qualname____firstlineno____doc__r#   torchFloatTensor__annotations__r$   tupler%   r&   __static_attributes__r'   r   r   r!   r!   0   s}     37u((4/6:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr   r!   zW
    Dinat model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)DinatModelOutputF   a  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
    Average pooling of the last layer hidden-state.
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nr#   pooler_output.r$   r%   r&   r'   )r(   r)   r*   r+   r,   r#   r-   r.   r/   r5   r$   r0   r%   r&   r1   r'   r   r   r3   r3   F   s    	 37u((4/6.2M5$$t+2:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr   r3   z1
    Dinat outputs for image classification.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)DinatImageClassifierOutput_   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Classification (or regression if config.num_labels==1) scores (before SoftMax).
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
    shape `(batch_size, hidden_size, height, width)`.

    Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
    include the spatial dimensions.
Nlosslogits.r$   r%   r&   r'   )r(   r)   r*   r+   r,   r9   r-   r.   r/   r:   r$   r0   r%   r&   r1   r'   r   r   r7   r7   _   s     &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;CGE%"3"3S"89D@Gr   r7   c                   r   ^  \ rS rSrSrU 4S jrS\R                  S-  S\\R                     4S jr
SrU =r$ )	DinatEmbeddingsz   z.
Construct the patch and position embeddings.
c                    > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  5      U l
        g r   )super__init__DinatPatchEmbeddingspatch_embeddingsr   	LayerNorm	embed_dimnormDropouthidden_dropout_probdropoutselfconfig	__class__s     r   r@   DinatEmbeddings.__init__   sG     4V <LL!1!12	zz&"<"<=r   pixel_valuesNreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rB   rE   rH   )rJ   rN   
embeddingss      r   forwardDinatEmbeddings.forward   s4    **<8
YYz*
\\*-
r   )rH   rE   rB   )r(   r)   r*   r+   r,   r@   r-   r.   r0   TensorrR   r1   __classcell__rL   s   @r   r<   r<   z   s9    >E$5$5$< u||AT  r   r<   c                   l   ^  \ rS rSrSrU 4S jrS\R                  S-  S\R                  4S jr	Sr
U =r$ )	rA      z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
Transformer.
c                 H  > [         TU ]  5         UR                  nUR                  UR                  pCX0l        US:X  a  O[        S5      e[        R                  " [        R                  " U R                  US-  SSSS9[        R                  " US-  USSSS95      U l	        g )N   z2Dinat only supports patch size of 4 at the moment.   r   r   r[   r[   r   r   )kernel_sizestridepadding)
r?   r@   
patch_sizenum_channelsrD   
ValueErrorr   
SequentialConv2d
projection)rJ   rK   rb   rc   hidden_sizerL   s        r   r@   DinatPatchEmbeddings.__init__   s    &&
$*$7$79I9Ik(? QRR--IId'')9vV\flmIIkQ&PV`fg
r   rN   NrO   c                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      nUR	                  SSSS5      nU$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r[   r   r   )shaperc   rd   rg   permute)rJ   rN   _rc   heightwidthrQ   s          r   rR   DinatPatchEmbeddings.forward   sZ    )5););&,,,w  __\2
''1a3
r   )rc   rg   )r(   r)   r*   r+   r,   r@   r-   r.   rT   rR   r1   rU   rV   s   @r   rA   rA      s4    
"	E$5$5$< 	 	 	r   rA   c                      ^  \ rS rSrSr\R                  4S\S\R                  SS4U 4S jjjr	S\
R                  S\
R                  4S	 jrS
rU =r$ )DinatDownsampler   z
Convolutional Downsampling Layer.

Args:
    dim (`int`):
        Number of input channels.
    norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
        Normalization layer class.
dim
norm_layerrO   Nc           	         > [         TU ]  5         Xl        [        R                  " USU-  SSSSS9U l        U" SU-  5      U l        g )Nr[   r\   r]   r^   F)r_   r`   ra   bias)r?   r@   rt   r   rf   	reductionrE   )rJ   rt   ru   rL   s      r   r@   DinatDownsampler.__init__   sC    3CVF\binoq3w'	r   input_featurec                     U R                  UR                  SSSS5      5      R                  SSSS5      nU R                  U5      nU$ )Nr   r   r   r[   )rx   rl   rE   )rJ   rz   s     r   rR   DinatDownsampler.forward   sJ    }'<'<Q1a'HIQQRSUVXY[\]		-0r   )rt   rE   rx   )r(   r)   r*   r+   r,   r   rC   intModuler@   r-   rT   rR   r1   rU   rV   s   @r   rr   rr      sT     :< (C (RYY ($ ( (U\\ ell  r   rr   c                   x   ^  \ rS rSrU 4S jr S	S\R                  S\S-  S\\R                     4S jjr	Sr
U =r$ )
NeighborhoodAttention   c                   > [         TU ]  5         X#-  S:w  a  [        SU SU S35      eX0l        [	        X#-  5      U l        U R                  U R
                  -  U l        X@l        XPl        [        R                  " [        R                  " USU R                  -  S-
  SU R                  -  S-
  5      5      U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R&                  " UR(                  5      U l        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r[   r   )rw   )r?   r@   rd   num_attention_headsr}   attention_head_sizeall_head_sizer_   dilationr   	Parameterr-   zerosrpbLinearqkv_biasquerykeyvaluerF   attention_probs_dropout_probrH   rJ   rK   rt   	num_headsr_   r   rL   s         r   r@   NeighborhoodAttention.__init__   s:   ?a#C5(^_h^iijk  $- #&s#7 !558P8PP&  <<ID<L<L8Lq8PTUX\XhXhThklTl noYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr   r$   output_attentionsNrO   c                 p   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU[        R                  " U R                  5      -  n[        XVU R                  U R                  U R                  5      n[        R                  R                  USS9n	U R!                  U	5      n	[#        XU R                  U R                  5      n
U
R%                  SSSSS5      R'                  5       n
U
R)                  5       S S U R*                  4-   nU
R                  U5      n
U(       a  X4nU$ U
4nU$ )	Nr   r[   )rt   r   r   rZ   )rk   r   r   view	transposer   r   mathsqrtr   r   r_   r   r   
functionalsoftmaxrH   r   rl   
contiguoussizer   )rJ   r$   r   input_shapehidden_shapequery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                r   rR   NeighborhoodAttention.forward   s   
 $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR
 "DIId.F.F$GG )4K[K[]a]j]jk --//0@b/I ,,7"?AQAQSWS`S`a%--aAq!<GGI"/"4"4"6s";t?Q?Q>S"S%**+BC6G=2 O\M]r   )
r   r   r   rH   r_   r   r   r   r   r   Fr(   r)   r*   r+   r@   r-   rT   boolr0   rR   r1   rU   rV   s   @r   r   r      sE    G2 */!||!  $;! 
u||		! !r   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )NeighborhoodAttentionOutputi  c                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g r   )r?   r@   r   r   denserF   r   rH   rJ   rK   rt   rL   s      r   r@   $NeighborhoodAttentionOutput.__init__  s4    YYs(
zz&"E"EFr   r$   input_tensorrO   c                 J    U R                  U5      nU R                  U5      nU$ r   r   rH   )rJ   r$   r   s      r   rR   #NeighborhoodAttentionOutput.forward  s$    

=1]3r   r   
r(   r)   r*   r+   r@   r-   rT   rR   r1   rU   rV   s   @r   r   r     s7    G
U\\  RWR^R^  r   r   c                   x   ^  \ rS rSrU 4S jr S	S\R                  S\S-  S\\R                     4S jjr	Sr
U =r$ )
NeighborhoodAttentionModulei  c                 f   > [         TU ]  5         [        XX4U5      U l        [	        X5      U l        g r   )r?   r@   r   rJ   r   outputr   s         r   r@   $NeighborhoodAttentionModule.__init__  s*    )&yxX	1&>r   r$   r   NrO   c                 d    U R                  X5      nU R                  US   U5      nU4USS  -   nU$ Nr   r   )rJ   r   )rJ   r$   r   self_outputsattention_outputr   s         r   rR   #NeighborhoodAttentionModule.forward  s@    
 yyB;;|AF#%QR(88r   )r   rJ   r   r   rV   s   @r   r   r     sD    ? */||  $; 
u||		 r   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DinatIntermediatei$  c                   > [         TU ]  5         [        R                  " U[	        UR
                  U-  5      5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r?   r@   r   r   r}   	mlp_ratior   
isinstance
hidden_actstrr   intermediate_act_fnr   s      r   r@   DinatIntermediate.__init__%  sd    YYsC(8(83(>$?@
f''--'-f.?.?'@D$'-'8'8D$r   r$   rO   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rJ   r$   s     r   rR   DinatIntermediate.forward-  s&    

=100?r   r   r   rV   s   @r   r   r   $  s(    9U\\ ell  r   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DinatOutputi3  c                    > [         TU ]  5         [        R                  " [	        UR
                  U-  5      U5      U l        [        R                  " UR                  5      U l	        g r   )
r?   r@   r   r   r}   r   r   rF   rG   rH   r   s      r   r@   DinatOutput.__init__4  sF    YYs6#3#3c#9:C@
zz&"<"<=r   r$   rO   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   s     r   rR   DinatOutput.forward9  s$    

=1]3r   r   r   rV   s   @r   r   r   3  s(    >
U\\ ell  r   r   c                      ^  \ rS rSrSrSS\SS4U 4S jjjrS\R                  S\R                  4S jr	S\
4S	 jrS
rU =r$ )DinatDropPathi@  zStochastic depth (DropPath) per sample, for residual blocks.

Identity when ``drop_prob`` is 0 or outside training. See `Deep Networks with Stochastic Depth
<https://arxiv.org/abs/1603.09382>`_.
	drop_probrO   Nc                 .   > [         TU ]  5         Xl        g r   )r?   r@   r   )rJ   r   rL   s     r   r@   DinatDropPath.__init__G  s    "r   r$   c                 V   U R                   S:X  d  U R                  (       d  U$ SU R                   -
  nUR                  S   4SUR                  S-
  -  -   n[        R
                  " X1R                  UR                  S9n[        R                  " XB-   5      nUR                  U5      U-  $ )N        r   r   )r   )dtypedevice)
r   trainingrk   ndimr-   randr   r   floordiv)rJ   r$   	keep_probrk   random_tensors        r   rR   DinatDropPath.forwardK  s    >>S   &	$$Q')DM4F4F4J,KK

50C0CML`L`aM$=>  +m;;r   c                      SU R                    3$ )Nzp=r   rJ   s    r   
extra_reprDinatDropPath.extra_reprT  s    DNN#$$r   r   r   )r(   r)   r*   r+   r,   floatr@   r-   rT   rR   r   r   r1   rU   rV   s   @r   r   r   @  sL    #% #$ # #<U\\ <ell <%C % %r   r   c            	          ^  \ rS rSrS
U 4S jjrS r SS\R                  S\S-  S\	\R                  \R                  4   4S jjr
S	rU =r$ )
DinatLayeriX  c                   > [         TU ]  5         UR                  U l        UR                  U l        X@l        U R                  U R                  -  U l        [        R                  " X!R                  S9U l	        [        XX0R                  U R                  S9U l        US:  a  [        U5      O[        R                  " 5       U l        [        R                  " X!R                  S9U l        [!        X5      U l        [%        X5      U l        UR(                  S:  a>  [        R*                  " UR(                  [,        R.                  " SU45      -  SS9U l        g S U l        g )Neps)r_   r   r   r   r[   T)requires_grad)r?   r@   chunk_size_feed_forwardr_   r   window_sizer   rC   layer_norm_epslayernorm_beforer   	attentionr   Identity	drop_pathlayernorm_afterr   intermediater   r   layer_scale_init_valuer   r-   oneslayer_scale_parameters)rJ   rK   rt   r   r   drop_path_raterL   s         r   r@   DinatLayer.__init__Y  s   '-'E'E$!-- ++dmm; "S6K6K L40@0@4==
 ;I3:N~6TVT_T_Ta!||C5J5JK-f:!&. ,,q0 LL66QH9MM]ab 	#  	#r   c                     U R                   nSnX$:  d  X4:  aD  S=pg[        SXC-
  5      n[        SXB-
  5      n	SSXhXy4n[        R                  R	                  X5      nX4$ )N)r   r   r   r   r   r   r   )r   maxr   r   pad)
rJ   r$   rn   ro   r   
pad_valuespad_lpad_tpad_rpad_bs
             r   	maybe_padDinatLayer.maybe_padm  sn    &&'
5#6E;./E;/0EQe;JMM--mHM((r   r$   r   NrO   c                    UR                  5       u  p4pVUnU R                  U5      nU R                  XU5      u  pUR                  u  ppU R	                  XS9nUS   nUS   S:  =(       d    US   S:  nU(       a  US S 2S U2S U2S S 24   R                  5       nU R                  b  U R                  S   U-  nXpR                  U5      -   nU R                  U5      nU R                  U R                  U5      5      nU R                  b  U R                  S   U-  nXR                  U5      -   nU(       a  XS   4nU$ U4nU$ )N)r   r   r      r   )r   r   r
  rk   r   r   r   r   r   r   r   )rJ   r$   r   
batch_sizern   ro   channelsshortcutr  rm   
height_pad	width_padattention_outputsr   
was_paddedlayer_outputlayer_outputss                    r   rR   DinatLayer.forwardx  sf   
 /<.@.@.B+
E --m<$(NN=%$P!&3&9&9#y NN=N^,Q/]Q&;*Q-!*;
/7F7FUFA0EFQQS&&2#::1=@PP >>2B#CC++M:{{4#4#4\#BC&&266q9LHL$~~l'CC@Q';< YeWfr   )r   r   r   r   r   r_   r   r   r   r   r   r   r   )r(   r)   r*   r+   r@   r
  r-   rT   r   r0   rR   r1   rU   rV   s   @r   r   r   X  sR    
(	) */$||$  $;$ 
u||U\\)	*	$ $r   r   c                   x   ^  \ rS rSrU 4S jr S	S\R                  S\S-  S\\R                     4S jjr	Sr
U =r$ )

DinatStagei  c                 $  > [         T	U ]  5         Xl        X l        [        R
                  " [        U5       Vs/ s H  n[        UUUXX   Xh   S9PM     sn5      U l        Ub  U" U[        R                  S9U l
        OS U l
        SU l        g s  snf )N)rK   rt   r   r   r   )rt   ru   F)r?   r@   rK   rt   r   
ModuleListranger   layersrC   
downsamplepointing)
rJ   rK   rt   depthr   	dilationsr   r  irL   s
            r   r@   DinatStage.__init__  s    mm u	 &A !'&\#1#4 &	
 !(SR\\JDO"DO%	s   Br$   r   NrO   c                     UR                  5       u  p4pS[        U R                  5       H  u  pgU" X5      nUS   nM     Un	U R                  b  U R                  U	5      nX4n
U(       a  U
WSS  -  n
U
$ r   )r   	enumerater  r  )rJ   r$   r   rm   rn   ro   r"  layer_moduler  !hidden_states_before_downsamplingstage_outputss              r   rR   DinatStage.forward  s    
 ,0025(5OA(JM)!,M  6 -:)??& OO,MNM&J]12..Mr   )rK   rt   r  r  r  r   r   rV   s   @r   r  r    sD    8 */||  $; 
u||		 r   r  c                      ^  \ rS rSrU 4S jr    SS\R                  S\S-  S\S-  S\S-  S\S-  S	\\	-  4S
 jjr
SrU =r$ )DinatEncoderi  c                   > [         TU ]  5         [        UR                  5      U l        Xl        [        R                  " SUR                  [        UR                  5      SS9 Vs/ s H  o"R                  5       PM     nn[        R                  " [        U R                  5       Vs/ s H  n[        U[        UR                   SU-  -  5      UR                  U   UR"                  U   UR$                  U   U[        UR                  S U 5      [        UR                  S US-    5       X@R                  S-
  :  a  [&        OS S9PM     sn5      U l        g s  snf s  snf )Nr   cpu)r   r[   r   )rK   rt   r   r   r!  r   r  )r?   r@   lendepths
num_levelsrK   r-   linspacer   sumitemr   r  r  r  r}   rD   r   r!  rr   levels)rJ   rK   xdpri_layerrL   s        r   r@   DinatEncoder.__init__  s0   fmm,!&63H3H#fmmJ\ej!kl!kAvvx!klmm  %T__5  6G !F,,q'z9: --0$..w7$..w7#&s6=='+B'Cc&--XeZadeZeJfFg#h4;ooPQ>Q4Q/X\  6
 ms   &E(B#Er$   r   Noutput_hidden_states(output_hidden_states_before_downsamplingreturn_dictrO   c                    U(       a  SOS nU(       a  SOS nU(       a  SOS nU(       a  UR                  SSSS5      n	Xa4-  nXy4-  n[        U R                  5       H  u  pU" X5      nUS   nUS   nU(       a&  U(       a  UR                  SSSS5      n	Xm4-  nXy4-  nO,U(       a%  U(       d  UR                  SSSS5      n	Xa4-  nXy4-  nU(       d  My  XSS  -  nM     U(       d  [        S XU4 5       5      $ [	        UUUUS9$ )Nr'   r   r   r   r[   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r'   ).0vs     r   	<genexpr>'DinatEncoder.forward.<locals>.<genexpr>
  s     m$[q$[s   	)r#   r$   r%   r&   )rl   r%  r4  r0   r!   )rJ   r$   r   r9  r:  r;  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsreshaped_hidden_stater"  r&  r  r'  s                 r   rR   DinatEncoder.forward  sA    #7BD+?RT"$5b4$1$9$9!Q1$E!!11&*BB&(5OA(JM)!,M0=a0@-#(P(I(Q(QRSUVXY[\(]%!%II!*.FF*%.V(5(=(=aAq(I%!%55!*.FF*  #QR'88#%  6( m]GZ$[mmm!++*#=	
 	
r   )rK   r4  r0  )FFFT)r(   r)   r*   r+   r@   r-   rT   r   r0   r!   rR   r1   rU   rV   s   @r   r+  r+    st    
. */,1@E#'.
||.
  $;.
 #Tk	.

 37+.
 D[.
 
#	#.
 .
r   r+  c                   ,    \ rS rSr% \\S'   SrSrSrSr	g)DinatPreTrainedModeli  rK   dinatrN   )imager'   N)
r(   r)   r*   r+   r   r/   base_model_prefixmain_input_nameinput_modalitiesr1   r'   r   r   rH  rH    s    $O!r   rH  c                      ^  \ rS rSrSU 4S jjrS r\    SS\R                  S-  S\	S-  S\	S-  S\	S-  S	\
\-  4
S
 jj5       rSrU =r$ )
DinatModeli  c                   > [         TU ]  U5        [        U S/5        Xl        [	        UR
                  5      U l        [        UR                  SU R                  S-
  -  -  5      U l	        [        U5      U l        [        U5      U l        [        R                  " U R                  UR                   S9U l        U(       a  [        R$                  " S5      OSU l        U R)                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
nattenr[   r   r   N)r?   r@   r   rK   r.  r/  r0  r}   rD   num_featuresr<   rQ   r+  encoderr   rC   r   	layernormAdaptiveAvgPool1dpooler	post_init)rJ   rK   add_pooling_layerrL   s      r   r@   DinatModel.__init__  s    
 	 $
+fmm, 0 0119L3M MN)&1#F+d&7&7V=R=RS1Bb**1- 	r   c                 .    U R                   R                  $ r   rQ   rB   r   s    r   get_input_embeddingsDinatModel.get_input_embeddings4      ///r   NrN   r   r9  r;  rO   c                 Z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nS n	U R                  bH  U R                  UR                  SS5      R                  SS5      5      n	[        R                  " U	S5      n	U(       d  X4USS  -   n
U
$ [        UU	UR                  UR                  UR                  S9$ )Nz You have to specify pixel_valuesr   r9  r;  r   r   r[   )r#   r5   r$   r%   r&   )rK   r   r9  r;  rd   rQ   rS  rT  rV  flattenr   r-   r3   r$   r%   r&   )rJ   rN   r   r9  r;  r   embedding_outputencoder_outputssequence_outputpooled_outputr   s              r   rR   DinatModel.forward7  s?    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY?@@??<8,,/!5#	 ' 
 *!,..9;;" KK(?(?1(E(O(OPQST(UVM!MM-;M%58KKFM-')77&11#2#I#I
 	
r   )rK   rQ   rS  rT  rR  r0  rV  )T)NNNN)r(   r)   r*   r+   r@   r\  r   r-   r.   r   r0   r3   rR   r1   rU   rV   s   @r   rO  rO    s|    ,0  26)-,0#'-
''$.-
  $;-
 #Tk	-

 D[-
 
!	!-
 -
r   rO  z
    Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    c                      ^  \ rS rSrU 4S jr\     SS\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S-  S	\
\-  4S
 jj5       rSrU =r$ )DinatForImageClassificationih  c                 ^  > [         TU ]  U5        [        U S/5        UR                  U l        [	        U5      U l        UR                  S:  a5  [        R                  " U R
                  R                  UR                  5      O[        R                  " 5       U l
        U R                  5         g )NrQ  r   )r?   r@   r   
num_labelsrO  rI  r   r   rR  r   
classifierrW  rI   s     r   r@   $DinatForImageClassification.__init__o  s     $
+ ++'
 FLEVEVYZEZBIIdjj--v/@/@A`b`k`k`m 	
 	r   NrN   labelsr   r9  r;  rO   c                 V   Ub  UOU R                   R                  nU R                  UUUUS9nUS   nU R                  U5      n	Sn
Ub  U R	                  X)U R                   5      n
U(       d  U	4USS -   nU
b  U
4U-   $ U$ [        U
U	UR                  UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr`  r   r[   )r9   r:   r$   r%   r&   )	rK   r;  rI  rk  loss_functionr7   r$   r%   r&   )rJ   rN   rm  r   r9  r;  r   r   re  r:   r9   r   s               r   rR   #DinatForImageClassification.forward  s      &1%<k$++BYBY**/!5#	  
  
/%%fdkkBDY,F)-)9TGf$EvE)!//))#*#A#A
 	
r   )rk  rI  rj  )NNNNN)r(   r)   r*   r+   r@   r   r-   r.   
LongTensorr   r0   r7   rR   r1   rU   rV   s   @r   rh  rh  h  s       26*.)-,0#'*
''$.*
   4'*
  $;	*

 #Tk*
 D[*
 
+	+*
 *
r   rh  zL
    NAT backbone, to be used with frameworks like DETR and MaskFormer.
    c                      ^  \ rS rSrU 4S jrS r\\\   SS\	R                  S\S-  S\S-  S\S-  S	\4
S
 jj5       5       5       rSrU =r$ )DinatBackbonei  c           	        > [         TU ]  U5        [        U S/5        [        U5      U l        [        U5      U l        UR                  /[        [        UR                  5      5       Vs/ s H  n[        UR                  SU-  -  5      PM      sn-   U l        0 n[        U R                  U R                  5       H  u  pE[         R"                  " U5      X4'   M     [         R$                  " U5      U l        U R)                  5         g s  snf )NrQ  r[   )r?   r@   r   r<   rQ   r+  rS  rD   r  r.  r/  r}   rR  zipout_featuresr  r   rC   
ModuleDicthidden_states_normsrW  )rJ   rK   r"  rx  stagerc   rL   s         r   r@   DinatBackbone.__init__  s     $
+)&1#F+#--.X]^abhbobo^pXq1rXqST#f6F6FA6M2NXq1rr !#&t'8'8$--#HE)+l)C& $I#%==1D#E  	 2ss   *%Dc                 .    U R                   R                  $ r   r[  r   s    r   r\  "DinatBackbone.get_input_embeddings  r^  r   NrN   r9  r   r;  rO   c                 &   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nU R                  UUSSSS9nUR                  nSn	[        U R                  U5       H  u  pXR                  ;   d  M  UR                  u  ppUR                  SSSS5      R                  5       nUR                  XU-  U5      nU R                  U
   " U5      nUR                  XX5      nUR                  SSSS5      R                  5       nX4-  n	M     U(       d  U	4nU(       a  UUR                  4-  nU$ [!        U	U(       a  UR                  OSUR"                  S	9$ )
a  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
>>> model = AutoBackbone.from_pretrained(
...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")

>>> outputs = model(**inputs)

>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 512, 7, 7]
```NT)r   r9  r:  r;  r'   r   r[   r   r   )feature_mapsr$   r%   )rK   r;  r9  r   rQ   rS  r&   ru  stage_namesrv  rk   rl   r   r   rx  r$   r	   r%   )rJ   rN   r9  r   r;  r   rb  r   r$   r~  ry  hidden_stater  rc   rn   ro   r   s                    r   rR   DinatBackbone.forward  s   L &1%<k$++BYBY$8$D $++JjJj 	 2C1N-TXT_T_TqTq??<8,,/!%59  
  66#&t'7'7#GE))):F:L:L7
&+33Aq!Q?JJL+00e^\Z#77>|L+00UY+33Aq!Q?JJL/ $H "_F#70022M%3G'//T))
 	
r   )rQ   rS  rx  rR  )NNN)r(   r)   r*   r+   r@   r\  r   r   r   r-   rT   r   r	   rR   r1   rU   rV   s   @r   rs  rs    s    $0   -1)-#'J
llJ
 #TkJ
  $;	J

 D[J
 
J
  ! J
r   rs  )rh  rO  rH  rs  )1r,   r   dataclassesr   r-   r   activationsr   backbone_utilsr   r   modeling_outputsr	   modeling_utilsr
   utilsr   r   r   r   r   utils.genericr   configuration_dinatr   natten.functionalr   r   r!   r3   r7   r~   r<   rA   rr   r   r   r   r   r   r   r   r  r+  rH  rO  rh  rs  __all__r'   r   r   <module>r     sE   @  !   ! H . -  . , ;;// 
 H H H  
 H{ H H& 
 H H H*bii ,!299 !Hryy .8BII 8v
")) 
")) "		 	")) 	%BII %0D DN, ,^C
299 C
L "? " " H
% H
 H
V <
"6 <
<
~ 
c
M#7 c

c
L ar   