
    
3j                         S r SSKrSSKJrJr  SSKrSSKJr  SSKJs  J	r
  SSKJrJr  SSKJr  SSKJr   " S S	\R$                  5      rg)
zDifferential Attention

Paper: 'Differential Transformer' - https://arxiv.org/abs/2410.05258

Reference impl: https://github.com/microsoft/unilm/tree/master/Diff-Transformer

Hacked together by / Copyright 2024, Ross Wightman
    N)OptionalType   )maybe_add_maskresolve_self_attn_mask)use_fused_attn)RmsNormc                   p  ^  \ rS rSr% Sr\R                  R                  \   \	S'               SS\
S\
S\S\S	\S
\S\S\S\\\R                        S\
S\SS4U 4S jjjrS\
4S jrS rS\R&                  4S jr  SS\R&                  S\\R&                     S\S\R&                  4S jjrSrU =r$ )DiffAttention   a  Differential Attention module.

Computes attention as the difference between two softmax attention maps, which helps
cancel out noise and promotes sparse attention patterns. The module splits Q and K
into two groups, computes separate attention maps, and subtracts one from the other
scaled by a learnable lambda parameter.

The attention output is computed as:
    Attn = softmax(Q1 @ K1^T) - lambda * softmax(Q2 @ K2^T)
    Output = Attn @ V

Supports both fused (scaled_dot_product_attention) and manual implementations.

fused_attnNdim	num_headsqkv_biasqk_norm
scale_norm	proj_bias	attn_drop	proj_drop
norm_layerdepthdual_lambdareturnc                   > [         TU ]  5         XS.nX-  S:X  d   S5       eU	c  [        n	X l        X-  S-  U l        U R                  S-  U l        [        5       U l        [        R                  " XS-  4SU0UD6U l
        U(       a  U	" U R                  40 UD6O[        R                  " 5       U l        U(       a  U	" U R                  40 UD6O[        R                  " 5       U l        [        R                  " U5      U l        Xpl        U(       a	  U	" U40 UD6O[        R                  " 5       U l        [        R                  " X4SU0UD6U l        [        R                  " U5      U l        Xl        U(       a  [        R*                  " [,        R.                  " S	[,        R0                  US
95      U l        [        R*                  " [,        R.                  " S	[,        R0                  US
95      U l        S=U l        =U l        =U l        U l        GO*S=U l        U l        [        R*                  " [,        R.                  " U R                  [,        R0                  US
95      U l        [        R*                  " [,        R.                  " U R                  [,        R0                  US
95      U l        [        R*                  " [,        R.                  " U R                  [,        R0                  US
95      U l        [        R*                  " [,        R.                  " U R                  [,        R0                  US
95      U l        [        SU R                  -  4SS0UD6U l        SU l         U RC                  U
5        U RE                  5         g)ax  Initialize the DiffAttention module.

Args:
    dim: Input dimension of the token embeddings.
    num_heads: Number of attention heads.
    qkv_bias: Whether to use bias in the query, key, value projections.
    qk_norm: Whether to apply normalization to query and key vectors.
    scale_norm: Whether to apply normalization before the output projection.
    proj_bias: Whether to use bias in the output projection.
    attn_drop: Dropout rate applied to the attention weights.
    proj_drop: Dropout rate applied after the output projection.
    norm_layer: Normalization layer constructor (defaults to RmsNorm).
    depth: Block depth index, used to compute depth-dependent lambda_init.
    dual_lambda: If True, use simplified dual scalar lambda parameterization
        (2 params). If False, use the paper's original formulation with
        lambda_q/k vectors (4 * head_dim params).
)devicedtyper   z$dim should be divisible by num_headsN   g         bias )r   r   epsgh㈵>皙?)#super__init__r	   r   head_dimscaler   r   nnLinearqkvIdentityq_normk_normDropoutr   attn_drop_pnormprojr   r   	Parametertorchemptyfloat32lambda_alambda_b	lambda_q1	lambda_k1	lambda_q2	lambda_k2sub_normlambda_initset_lambda_initreset_parameters)selfr   r   r   r   r   r   r   r   r   r   r   r   r   dd	__class__s                  T/home/wildlama/miniconda3/lib/python3.13/site-packages/timm/layers/diff_attention.pyr$   DiffAttention.__init__%   so   B 	/!#K%KK# J"(A-]]d*
(*99S'??B?9@j5"5bkkm9@j5"5bkkmI.$-7Js)b)R[[]	IIc=Y="=	I.&LLRu}}U[)\]DMLLRu}}U[)\]DMPTTDNTT^Tdnt~,00DMDM\\%++dmm5==ag*hiDN\\%++dmm5==ag*hiDN\\%++dmm5==ag*hiDN\\%++dmm5==ag*hiDNDMM 1BtBrBU#    c                 L    SS[         R                  " SU-  5      -  -
  U l        g )Nr"   g333333?g333333ӿ)mathexpr<   )r?   r   s     rB   r=   DiffAttention.set_lambda_initk   s!    txxu'=!==rD   c                    U R                   (       aS  [        R                  R                  U R                  5        [        R                  R                  U R
                  5        g [        R                  R                  U R                  SSS9  [        R                  R                  U R                  SSS9  [        R                  R                  U R                  SSS9  [        R                  R                  U R                  SSS9  g )Nr   g?)meanstd)r   r'   initzeros_r5   r6   normal_r7   r8   r9   r:   )r?   s    rB   r>   DiffAttention.reset_parametersn   s    GGNN4==)GGNN4==)GGOODNNO<GGOODNNO<GGOODNNO<GGOODNNO<rD   c                    U R                   bA  [        R                  " U R                   5      n[        R                  " U R                  5      nO[        R                  " [        R                  " U R
                  U R                  -  SS9R                  5       5      n[        R                  " [        R                  " U R                  U R                  -  SS9R                  5       5      nX-
  U R                  -   $ )Nr   )r5   r2   rG   r6   sumr7   r8   floatr9   r:   r<   )r?   lambda_1lambda_2s      rB   _compute_lambdaDiffAttention._compute_lambdax   s    ==$yy/Hyy/Hyy4>>DNN+JPR!S!Y!Y![\Hyy4>>DNN+JPR!S!Y!Y![\H"T%5%555rD   x	attn_mask	is_causalc           	         UR                   u  pEnU R                  U5      R                  SSS9u  pxn	UR                  XESU R                  -  U R
                  5      R                  SS5      nUR                  XESU R                  -  U R
                  5      R                  SS5      nU	R                  XEU R                  SU R
                  -  5      R                  SS5      n	U R                  U5      U R                  U5      pU R                  5       R                  U5      n
U R                  (       a  UR                  X@R                  SXPR
                  5      nUR                  X@R                  SXPR
                  5      nUR                  S5      u  pUR                  S5      u  pU R                  (       a  U R                  OSn[        R                   " XXXS9n[        R                   " XXXS9nUU
U-  -
  nOXpR"                  -  nXxR                  SS5      -  n[%        UUX#S	9n['        UU5      nUR)                  SS9nU R+                  U5      nUR-                  X@R                  SXU5      nUS S 2S S 2S
4   U
US S 2S S 2S4   -  -
  nUU	-  nU R/                  U5      nUSU R0                  -
  -  nUR                  SS5      R                  XEU5      nU R3                  U5      nU R5                  U5      nU R7                  U5      nU$ )Nr   r   rR   r           )rZ   	dropout_pr[   rQ   )r[   r   )shaper)   chunkreshaper   r%   	transposer+   r,   rW   type_asr   unbindtrainingr.   Fscaled_dot_product_attentionr&   r   r   softmaxr   viewr;   r<   r/   r0   r   )r?   rY   rZ   r[   BNCqkvlambda_fullq1q2k1k2r^   attn1attn2attn	attn_biass                       rB   forwardDiffAttention.forward   s    ''a((1+##A1#-aIIaA.>HHANIIaA.>HHANIIaDNNA,=>HHAN{{1~t{{1~1**,44Q7??		!^^Q==AA		!^^Q==AAXXa[FBXXa[FB,0MM((sI22)ZE22)ZE e++AJJA{{2r**D.q$	WI!$	2D<<B<'D>>$'D99Q18D1a=;aAg#>>DqAMM!T%%%&KK1%%aA.IIaLIIaLNN1rD   )r   r.   r   r   r%   r,   r5   r6   r<   r8   r:   r7   r9   r/   r   r0   r   r+   r)   r&   r;   )   FFFTr]   r]   Nr   FNN)NF)__name__
__module____qualname____firstlineno____doc__r2   jitFinalbool__annotations__intrT   r   r   r'   Moduler$   r=   r>   TensorrW   rz   __static_attributes____classcell__)rA   s   @rB   r   r      sI    		%%
 "!$"!!48 %D D  D  	D 
 D  D  D  D  D  !bii1D  D  D  
D  D L>S >=6 6 15#	2||2  -2 	2
 
2 2rD   r   )r   rF   typingr   r   r2   torch.nnr'   torch.nn.functional
functionalrg   	attentionr   r   configr   r/   r	   r   r   r    rD   rB   <module>r      s9     !     = " ^BII ^rD   