
    
3j,                        S SK JrJrJr  S SKrS SKJr  S SKJr  SSK	J
r
  SSKJr  SSKJr  / S	Qr\R                   R"                  \
SS
\R$                  S\\R$                     4S jj5       5       r\R                   R"                  \
  SS\S\R$                  S\\R$                     S\S\\R$                     4
S jj5       5       r " S S\R.                  5      r " S S\R.                  5      rg)    )FinalOptionalTypeN)nn)
functional   )register_notrace_function)use_fused_attn)apply_rot_embed_cat)	AttentionAttentionRopemaybe_add_maskresolve_self_attn_maskscores	attn_maskc                     Uc  U $ X-   $ N )r   r   s     O/home/wildlama/miniconda3/lib/python3.13/site-packages/timm/layers/attention.pyr   r      s     &6>F,>>    seq_lenattn	is_causalreturnc                 4   U(       a-  UR                  X 4[        S5      5      R                  S5      nU$ Uc  S nU$ UR                  [        R
                  :X  a<  [        R                  " X!R                  S9nUR                  U) [        S5      5        U$ UnU$ )Nz-infr   )dtype)new_fullfloattriu_r   torchbool
zeros_likemasked_fill_)r   r   r   r   	attn_biass        r   r   r      s     MM7"4eFmDJJ1M	  
		  
EJJ	&$$YjjA		z5=9  	r   c                     ^  \ rS rSr% Sr\\   \S'               SS\S\S\	\   S\	\   S	\S
\S\S\S\
S\
S\	\\R                        SS4U 4S jjjr  SS\R                   S\	\R                      S\S\R                   4S jjrSrU =r$ )r   +   a  Standard Multi-head Self Attention module with QKV projection.

This module implements the standard multi-head attention mechanism used in transformers.
It supports both the fused attention implementation (scaled_dot_product_attention) for
efficiency when available, and a manual implementation otherwise. The module includes
options for QK normalization, attention dropout, and projection dropout.

fused_attnNdim	num_headsattn_head_dimdim_outqkv_biasqk_norm
scale_norm	proj_bias	attn_drop	proj_drop
norm_layerr   c                    > [         TU ]  5         XS.nU=(       d    UnUnUc  X-  S:X  d   S5       eX-  nU(       d  U(       a
  Uc   S5       eX l        Xl        X/-  U l        US-  U l        [        5       U l        [        R                  " XR                  S-  4SU0UD6U l
        U(       a	  U" U40 UD6O[        R                  " 5       U l        U(       a	  U" U40 UD6O[        R                  " 5       U l        [        R                  " U	5      U l        U(       a  U" U R                  40 UD6O[        R                  " 5       U l        [        R                  " U R                  U4SU0UD6U l        [        R                  " U
5      U l        g)	a  Initialize the Attention module.

Args:
    dim: Input dimension of the token embeddings.
    num_heads: Number of attention heads.
    attn_head_dim: Dimension of each attention head. If None, computed as dim // num_heads.
    dim_out: Output dimension. If None, same as dim.
    qkv_bias: Whether to use bias in the query, key, value projections.
    qk_norm: Whether to apply normalization to query and key vectors.
    scale_norm: Whether to apply normalization to attention output before projection.
    proj_bias: Whether to use bias in the output projection.
    attn_drop: Dropout rate applied to the attention weights.
    proj_drop: Dropout rate applied after the output projection.
    norm_layer: Normalization layer constructor for QK normalization if enabled.
devicer   Nr   $dim should be divisible by num_heads<norm_layer must be provided if qk_norm or scale_norm is True         bias)super__init__r)   head_dimattn_dimscaler
   r'   r   LinearqkvIdentityq_normk_normDropoutr0   normprojr1   )selfr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r5   r   ddr=   	__class__s                   r   r<   Attention.__init__5   sC   > 	/.S ?a'O)OO''Hj)i+ii)" !,%
(*99S--!"3I(IbI4;j0R04;j0R0I.7AJt}}33r{{}	IIdmmWK9KK	I.r   xr   r   c           	      <   UR                   u  pEnU R                  U5      R                  XESU R                  U R                  5      R                  SSSSS5      nUR                  S5      u  pn
U R                  U5      U R                  U	5      pU R                  (       a@  [        R                  " XU
UU R                  (       a  U R                  R                  OSUS9nO]XR                  -  nXR!                  SS	5      -  n[#        X[X#5      n[%        X5      nUR'                  S	S
9nU R                  U5      nX-  nUR!                  SS5      R                  XEU R(                  5      nU R+                  U5      nU R-                  U5      nU R/                  U5      nU$ )Nr9      r   r              r   	dropout_pr   r(   )shaperA   reshaper)   r=   permuteunbindrC   rD   r'   Fscaled_dot_product_attentiontrainingr0   pr?   	transposer   r   softmaxr>   rF   rG   r1   )rH   rL   r   r   BNCrA   qkvr   r$   s                r   forwardAttention.forwardl   sW    ''ahhqk!!!4>>4==IQQRSUVXY[\^_`**Q-a{{1~t{{1~1??..a#.2mm$..**#	A JJA{{2r**D.q	MI!$2D<<B<'D>>$'DAKK1%%aDMM:IIaLIIaLNN1r   )r>   r0   r'   r=   rD   rF   r)   rG   r1   rC   rA   r?   )   NNFFFTrP   rP   NNNNF)__name__
__module____qualname____firstlineno____doc__r   r!   __annotations__intr   r   r   r   Moduler<   r    Tensorrf   __static_attributes____classcell__rJ   s   @r   r   r   +   s    d
 +/%)"!$"!!485/5/ 5/ $C=	5/
 c]5/ 5/ 5/ 5/ 5/ 5/ 5/ !bii15/ 
5/ 5/t 15#	||  - 	
 
 r   r   c                   >  ^  \ rS rSr% Sr\R                  R                  \   \	S'                  SS\
S\
S\\
   S\S\S	\
S
\S\S\\
   S\\R                     S\S\S\S\4U 4S jjjr   SS\\R"                     S\\R"                     S\4S jjrSrU =r$ )r      zA Self Attention module with ROPE support.

Includes options for:
 * QK normalization option
 * Attention output (scale) normalization
 * Fused or unfused QKV projection support
r'   r(   r)   r+   r,   	qkv_fusednum_prefix_tokensr0   r1   r*   r2   r-   r.   r/   rotate_halfc                 v  > [         TU ]  5         UUS.nU=(       d    UnU	nUc  X-  S:X  d   S5       eX-  nU(       d  U(       a
  U
c   S5       eX l        UU l        UU-  U l        US-  U l        X`l        [        5       U l        Xl	        U(       aA  [        R                  " XR                  S-  4SU0UD6U l        S=U l        =U l        U l        OSU l        [        R                  " XR                  4SU0UD6U l        [        R                  " XR                  4SU0UD6U l        [        R                  " XR                  4SU0UD6U l        U(       a	  U
" U40 UD6O[        R                   " 5       U l        U(       a	  U
" U40 UD6O[        R                   " 5       U l        [        R&                  " U5      U l        U(       a  U
" U R                  40 UD6O[        R                   " 5       U l        [        R                  " U R                  U4SU0UD6U l        [        R&                  " U5      U l        g)	a3  Initialize the Attention module.

Args:
    dim: Input dimension of the token embeddings
    num_heads: Number of attention heads
    dim_out: Output dimension. If None, same as dim.
    qkv_bias: Whether to add a bias term to the query, key, and value projections
    qkv_fused: Whether to use fused QKV projection (single linear) or separate projections
    num_prefix_tokens: Number of reg/cls tokens at the beginning of the sequence that
        should not have position embeddings applied
    attn_drop: Dropout rate for attention weights
    proj_drop: Dropout rate for the output projection
    attn_head_dim: Dimension of each attention head. If None, computed as dim // num_heads.
    norm_layer: Normalization layer constructor to use for QK and scale normalization
    qk_norm: Enable normalization of query (Q) and key (K) vectors with norm_layer
    scale_norm: Enable normalization (scaling) of attention output with norm_layer
    proj_bias: Whether to use bias in the output projection
    rotate_half: Use 'half' ROPE layout instead of default 'interleaved'
r4   Nr   r6   r7   r8   r9   r:   )r;   r<   r)   r=   r>   r?   ry   r
   r'   rz   r   r@   rA   q_projk_projv_projrB   rC   rD   rE   r0   rF   rG   r1   )rH   r(   r)   r+   r,   rx   ry   r0   r1   r*   r2   r-   r.   r/   rz   r5   r   rI   r=   rJ   s                      r   r<   AttentionRope.__init__   s   L 	/.S ?a'O)OO''H)i+ii)"  9,%
!2(*&yymma&7MhM"MDH6::DK:$+DH))CLXLLDK))CLXLLDK))CLXLLDK4;j0R04;j0R0I.7AJt}}33r{{}	IIdmmWK9KK	I.r   roper   r   c                 h   UR                   u  pVnU R                  ba  U R                  U5      nUR                  XVSU R                  U R                  5      R                  SSSSS5      nUR                  S5      u  pnOU R                  U5      R                  XVU R                  U R                  5      R                  SS5      n	U R                  U5      R                  XVU R                  U R                  5      R                  SS5      n
U R                  U5      R                  XVU R                  U R                  5      R                  SS5      nU R                  U	5      U R                  U
5      pUb  U R                  n[        U SS5      n[        R                   " U	SS2SS2SU2SS24   [#        U	SS2SS2US2SS24   X-S	9/SS
9R%                  U5      n	[        R                   " U
SS2SS2SU2SS24   [#        U
SS2SS2US2SS24   X-S	9/SS
9R%                  U5      n
U R&                  (       a@  [(        R*                  " XUUU R,                  (       a  U R.                  R0                  OSUS9nO]XR2                  -  n	XR                  SS5      -  n[5        XnX45      n[7        X5      nUR9                  SS
9nU R/                  U5      nX-  nUR                  SS5      R                  XVU R:                  5      nU R=                  U5      nU R?                  U5      nU RA                  U5      nU$ )a  Forward pass for the attention module.

Args:
    x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
    rope: Rotary position embeddings tensor for position-aware attention
    attn_mask: Optional attention mask to apply during attention computation
    is_causal: If True, use causal (autoregressive) masking

Returns:
    Tensor of shape (batch_size, sequence_length, dim_out)
Nr9   rN   r   r   rO   rz   F)halfrU   rP   rQ   rS   rT   )!rV   rA   rW   r)   r=   rX   rY   r|   r^   r}   r~   rC   rD   ry   getattrr    catr   type_asr'   rZ   r[   r\   r0   r]   r?   r   r   r_   r>   rF   rG   r1   )rH   rL   r   r   r   r`   ra   rb   rA   rc   rd   re   nptr   r   r$   s                   r   rf   AttentionRope.forward   s   $ ''a88((1+C++aAt~~t}}EMMaQRTUWXZ[\CjjmGA!A&&qT^^T]]KUUVWYZ[AA&&qT^^T]]KUUVWYZ[AA&&qT^^T]]KUUVWYZ[A{{1~t{{1~1((C46D		1Q4C4]+-@1aq=AQSW-cdjklttuvwA		1Q4C4]+-@1aq=AQSW-cdjklttuvwA??..a#.2mm$..**#	A JJA{{2r**D.q	MI!$2D<<B<'D>>$'DAKK1%%aDMM:IIaLIIaLNN1r   )r>   r0   r'   r=   rD   r}   rF   r)   ry   rG   r1   rC   r|   rA   rz   r?   r~   )rh   NTTr   rP   rP   NNFFTFNN)NNF)rj   rk   rl   rm   rn   r    jitr   r!   ro   rp   r   r   r   r   rq   r<   rr   rf   rs   rt   ru   s   @r   r   r      sF    		%%
 %)!"%&!!+/*.!$" %#F/F/ F/ c]	F/
 F/ F/  #F/ F/ F/ $C=F/ RYYF/ F/ F/ F/ F/ F/V ,004#: 5<<(:  -	:
 : :r   r   r   ri   )typingr   r   r   r    r   torch.nnr   rZ   _fxr	   configr
   pos_embed_sincosr   __all__fxwraprr   r   rp   r!   r   rq   r   r   r   r   r   <module>r      s    ( (   $ * " 1 U ?5<< ?HU\\4J ?  ?  -1	ll ELL) 	
 ell  (`		 `FLBII Lr   