
    3jX              $          S r SSKrSSKJr  SSKJrJr  SSKr\R                  " \	5      r
/ SQrS\\   S-  S\\   4S jr\" S	S
9S\S\4S j5       r " S S\5      r\R$                  R'                  S0 S9       S:S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S-  S\S\S\S\S-  S\\   S-  S\S\R(                  S-  S\R(                  S-  S\S-  S\\R(                  \R(                  \R(                  4   4S jj5       r\R0                         S:S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S-  S\S\S\S\S-  S\\   S-  S\S\R(                  S-  S\R(                  S-  S\S-  S\\R(                  \R(                  \R(                  4   4S  jj5       rSSS!SSSSS".S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S-  S\S\S#\S-  S\S-  S\\\4   S\S\R(                  S-  S\R(                  S-  S\S-  S\R(                  \\R(                  \R(                  4   -  4S$ jjr\R$                  R'                  S%S&1S9       S:S&\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S-  S\S\S\S\S-  S\\   S-  S\S\R(                  S-  S\R(                  S-  S\S-  S\R(                  4 S' jj5       r\R0                         S:S&\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S-  S\S\S\S\S-  S\\   S-  S\S\R(                  S-  S\R(                  S-  S\S-  S\R(                  4 S( jj5       rSSS!SSSSS".S&\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S\R(                  S-  S\S\S#\S-  S\S-  S\\\4   S\S\R(                  S-  S\R(                  S-  S\S-  S\R(                  \\R(                  \R(                  4   -  4 S) jjrS*\S+\\S,4   S-\SS4S. jr\R$                  R'                  S/0 S9  S;S0\R(                  S\R(                  S\R(                  S\R(                  S&\R(                  S1\R(                  S\R(                  S\R(                  S\S\S\S2\R(                  S\S-  S\\   S-  S\\R(                  \R(                  \R(                  4   4S3 jj5       r\R0                    S;S0\R(                  S\R(                  S\R(                  S\R(                  S&\R(                  S1\R(                  S\R(                  S\R(                  S\S\S\S2\R(                  S\S-  S\\   S-  S\\R(                  \R(                  \R(                  4   4S4 jj5       r S*\S0\R(                  S5\R(                  S6\R(                  S\\R(                  S-  S,4   4
S7 jr!\RE                  \!\S89  \RF                  RI                  \RJ                  RL                  RN                  5        SS9K(J)r)J*r*J+r+J,r,  \*\,\RJ                  RZ                  R.                  '   \+\,\RJ                  RZ                  R6                  '   \)\,\RJ                  RZ                  R>                  '   g)<z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuple)varlen_attnvarlen_attn_out
AuxRequestwindow_sizereturnc                 `    U c  SS/n [        U 5      S:w  a  [        S[        U 5       35      eU $ )N   z$window_size must have length 2, got )len
ValueError)r	   s    S/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/nn/attention/varlen.py_normalize_window_sizer      s=    2h
;1?K@P?QRSS       )maxsizedevice_indexc                     g)z;Cache device capability check to avoid repeated CUDA calls.F )r   s    r   _should_use_cudnnr      s     r   c                   (    \ rS rSr% SrSr\\S'   Srg)r   #   z
Request which auxiliary outputs to compute from varlen_attn.

Each field is a boolean indicating whether that auxiliary output should be computed.
Flser   N)	__name__
__module____qualname____firstlineno____doc__r   bool__annotations____static_attributes__r   r   r   r   r   #   s     Cr   r   ztorch_attn::_varlen_attn)mutates_argsFquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalscale
enable_gqa	seqused_kblock_table
num_splitsc                    [        U	5      n	U R                  =(       a    [        U R                  R                  5      nU(       a  [
        R                  S5        U
(       a  [        S5      eUb  [        S5      eU	S   S:w  d	  U	S   S:w  a  [        S5      eUc  Ub  [        S	5      e[        R                  R                  R                  U UUSUUUUS
SUSUS9nUS   US   US   nnnOW[
        R                  S5        [        R                  R                  R                  U UUUUUUSUSUU	S   U	S   UUUS9u  nnn  n[        R                  " S[        R                  U R                  S9nUUU4$ )z
Private custom op for variable-length attention.

This is the internal implementation. Users should use the public varlen_attn function instead.
#Using cuDNN backend for varlen_attnz,GQA is not supported with the cuDNN backend.Nz3num_splits is not supported with the cuDNN backend.r   r      TcuDNN backend does not support window attention. Please use Flash Attention backend.zBseqused_k/block_table is not yet supported with the cuDNN backend.T        Fr-      -Using Flash Attention backend for varlen_attn)return_debug_maskr-   window_size_leftwindow_size_rightr/   r0   r1   r   dtypedevice)r   is_cudar   r@   indexloginfoRuntimeErrortorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r%   r&   r'   r(   r)   r*   r+   r,   r-   r	   r.   r/   r0   r1   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_s                        r   _varlen_attnrT   -   s   , )5KG"3ELL4F4F"GI67MNN!TUUq>R;q>R#7f   K$; T  88 9 
  *0F1IvayYY@A/4yy~~/V/V#(^)!n#!! 0W 0
,Y1& ELLJ ;
**r   c                    [        U	5      n	[        R                  " U 5      nU R                  S5      nU R                  S5      n[        R                  " UU4[        R
                  U R                  S9n[        R                  R                  (       a  [        R                  R                  5       nU[        R                  R                  R                  :X  aE  UR                  S5      S-
  n[        R                  " UUU4[        R
                  U R                  S9n[        R                  " S[        R                  U R                  S9nUUU4$ )z
Fake implementation for meta tensor computation and tracing.

Based on the 3D varlen path from meta__flash_attention_forward:
- query shape: (total, num_heads, head_dim)
- logsumexp shape: (num_heads, total_q)
r   r4   r>   r=   )r   rF   
empty_likesizeemptyfloatr@   versionhip_C_get_rocm_fa_preferred_backend_ROCmFABackendAOTritonrL   )r%   r&   r'   r(   r)   r*   r+   r,   r-   r	   r.   r/   r0   r1   rO   total_q	num_heads	logsumexp	preferred
batch_sizerQ   s                        r   _varlen_attn_fakere      s    0 )5K e$F jjmG

1I	GEKKI }}HH;;=	//888!q)A-JY.ekk%,,I DU\\JI9i''r   )r   r   )
return_auxr-   r	   r.   r/   r0   r1   rf   c                   U R                  S5      nUb  UR                  S5      OUR                  S5      nU
(       d  X:w  a  [        SU SU S35      eU
(       a  X-  S:w  a  [        SU SU S35      eU	S	:H  n[        R                  R                  R                  U UUUUUUUU[        U	5      U
UUU5      u  nnnUb  UR                  (       a  UU4$ U$ )
a-  Compute variable-length attention using Flash Attention.

This function is similar to scaled_dot_product_attention but optimized for
variable-length sequences using cumulative sequence position tensors.

Args:
    query (Tensor): Query tensor; shape :math:`(T_q, H_q, D)`
    key (Tensor): Key tensor; shape :math:`(T_k, H_{kv}, D)`, or
        :math:`(\text{total\_pages}, \text{page\_size}, H_{kv}, D)` when ``block_table`` is provided.
    value (Tensor): Value tensor; shape :math:`(T_k, H_{kv}, D)`, or
        :math:`(\text{total\_pages}, \text{page\_size}, H_{kv}, D)` when ``block_table`` is provided.
    cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
    cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
    max_q (int): Maximum query sequence length in the batch.
    max_k (int): Maximum key/value sequence length in the batch.
    return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.
    scale (float, optional): Scaling factor for attention scores
    window_size (tuple[int, int], optional): Window size for sliding window attention as (left, right).
        Use (-1, -1) for full attention (default), (-1, 0) for causal attention,
        or (W, 0) for causal attention with sliding window of size W.
    enable_gqa (bool): If set to True, enables Grouped Query Attention (GQA)
        and allows key/value to have fewer heads than query.
        Each KV head is shared by a group of :math:`H_q / H_{kv}` query heads,
        so :math:`H_q` must be divisible by :math:`H_{kv}`.
        Default is False.
    seqused_k (Tensor, optional): Number of valid KV tokens per batch element; shape :math:`(N,)`.
        When set, only the first ``seqused_k[i]`` tokens in the key/value sequence for batch
        element *i* participate in attention. Useful for KV-cache decoding where the cache slot
        is larger than the actual sequence. Inference-only (not supported in backward).
    block_table (Tensor, optional): Block table for paged KV cache; shape
        :math:`(N, \text{max\_pages\_per\_seq})`, dtype ``int32``.
        Requires ``seqused_k``. Inference-only (not supported in backward).

        When ``block_table`` is provided, ``key`` and ``value`` are a "pool" of
        pages of tokens of KV data and the pages belong to any sequence/order.
        The ``block_table`` is what maps each sequence's logical chunks
        back to physical pages in this pool.

        ``seqused_k[i]`` tells the kernel how many tokens in sequence *i* are
        actually valid, since the last page is typically only partially filled.
    num_splits (int, optional): Number of splits for split-KV. Set to ``1``
        to disable split-KV which enables batch invariance. Split-KV
        parallelizes the key/value sequence dimension across multiple thread
        blocks and combines partial results. The split decision depends
        on ``max_k`` (the longest sequence in the batch), so different batch
        compositions can change the reduction order and produce different
        floating-point results for the same sequence. When this is disabled,
        bitwise identical outputs are guaranteed for a given sequence
        regardless of what other sequences are in the batch, at the
        cost of lower GPU utilization when there are few queries. When
        ``None`` (default), the kernel chooses automatically.

Returns:
    output (Tensor): Output tensor from attention computation; shape :math:`(T_q, H_q, D)`.

    If ``return_aux`` is not None and ``return_aux.lse`` is True:
        lse (Tensor): Log-sum-exp of attention scores; shape :math:`(T_q, H_q)`.

Shape legend:
    - :math:`N`: Batch size
    - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
    - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
    - :math:`H_q`: Number of query attention heads
    - :math:`H_{kv}`: Number of key/value attention heads (equal to :math:`H_q` unless GQA is enabled)
    - :math:`D`: Head dimension

Example::

    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
    >>> head_dim = embed_dim // num_heads
    >>> seq_lengths = []
    >>> for _ in range(batch_size):
    ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
    ...     seq_lengths.append(min(length, max_seq_len))
    >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
    >>> total_tokens = seq_lengths.sum().item()
    >>>
    >>> # Create packed query, key, value tensors
    >>> query = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>> key = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>> value = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>>
    >>> # Build cumulative sequence tensor
    >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
    >>> cu_seq[1:] = seq_lengths.cumsum(0)
    >>> max_len = seq_lengths.max().item()
    >>>
    >>> # Call varlen_attn
    >>> output = varlen_attn(
    ...     query, key, value, cu_seq, cu_seq, max_len, max_len
    ... )
r4   r   GExpect query and key/value to have the same number of heads but got Hq=	 and Hkv=&. Try setting enable_gqa=True for GQA.r   MExpect number of query heads to be a multiple of kv heads for GQA but got Hq=.r   r   )rW   r   rF   rG   
torch_attnrT   listr   )r%   r&   r'   r(   r)   r*   r+   rf   r-   r	   r.   r/   r0   r1   num_heads_qnum_heads_kr,   outr   rR   s                       r   r   r      s   j **Q-K!,!8#((1+chhqkK+4%i} =34
 	

 k/14%i}A?
 	

 w&I))&&33[KCa  *..CxJr   ztorch_attn::_varlen_attn_outrr   c                 H   [        U
5      n
UR                  =(       a    [        UR                  R                  5      nU(       a  [        S5      e[        R                  S5        [        R                  R                  R                  U UUUUUUUSUSU	U
S   U
S   UUUS9nU$ )z
Private custom op for variable-length attention with pre-allocated output.
Same as _varlen_attn but writes the attention output into the provided out tensor.
z+cuDNN backend does not support out variant.z1Using Flash Attention backend for varlen_attn_outr6   Fr   r4   )r-   r;   r<   r/   r0   r1   )r   rA   r   r@   rB   rE   rC   rD   rF   rG   rH   +_flash_attention_forward_no_dropout_inplace)rr   r%   r&   r'   r(   r)   r*   r+   r,   r-   r	   r.   r/   r0   r1   rM   rP   s                    r   _varlen_attn_outru   R  s    , )5KG"3ELL4F4F"GIHIIHH@A))..LL$Q%a.# M K( r   c                    UR                  S5      nUR                  S5      n[        R                  " UU4[        R                  UR                  S9n[        R
                  R                  (       a  [        R                  R                  5       nU[        R                  R                  R                  :X  aE  UR                  S5      S-
  n[        R                  " UUU4[        R                  UR                  S9nU$ )>
Fake implementation for meta tensor computation and tracing.
r   r4   r>   )rW   rF   rX   rY   r@   rZ   r[   r\   r]   r^   r_   )rr   r%   r&   r'   r(   r)   r*   r+   r,   r-   r	   r.   r/   r0   r1   r`   ra   rb   rc   rd   s                       r   _varlen_attn_out_fakerx     s    * jjmG

1I	GEKKI }}HH;;=	//888!q)A-JY.ekk%,,I r   c                   UR                  S5      nUb  UR                  S5      OUR                  S5      nU(       d  UU:w  a  [        SU SU S35      eU(       a  UU-  S:w  a  [        SU SU S35      eU
S	:H  n[        R                  R                  R                  U UUUUUUUUU	[        U
5      UUUU5      nUb  UR                  (       a  U U4$ U $ )
zCompute variable-length attention using Flash Attention with a pre-allocated output tensor.

Same as :func:`varlen_attn` but writes the attention output into the provided ``out`` tensor
instead of allocating a new one.

r4   r   rh   ri   rj   r   rk   rl   rm   )rW   r   rF   rG   rn   ru   ro   r   )rr   r%   r&   r'   r(   r)   r*   r+   rf   r-   r	   r.   r/   r0   r1   rp   rq   r,   r   s                      r   r   r     s   0 **Q-K!,!8#((1+chhqkK+4%i} =34
 	

 kK/14%i}A?
 	

 w&I
))


/
/[C" *..CxJr   ctxinputs.rO   c                     Uu  nnnnnnn	n
nnnnnnUu  nnnUb  [        S5      eUb  [        S5      eU R                  X4XVUUUU5        Xl        Xl        Xl        Xl        Xl        g )Nz)seqused_k is an inference-only parameter.z+block_table is an inference-only parameter.)rE   save_for_backwardr*   r+   r,   r-   r	   )rz   r{   rO   r%   r&   r'   r(   r)   r*   r+   r,   r-   r	   r.   r/   r0   r1   rr   r   rQ   s                       r   _setup_contextr~     s      	 CiFGGHII%exc9UIIMI!Or   z!torch_attn::_varlen_attn_backwardgrad_outr   rQ   c                 V   [        U5      n[        R                  " SUR                  S9nUR                  =(       a    [        UR                  R                  5      nU(       al  [        R                  S5        US   S:w  d	  US   S:w  a  [        S5      e[        R                  R                  R                  U UUUUUUUUU	SU
UUUS9u  nnnOV[        R                  S	5        [        R                  R                  R                  U UUUUUUUUU	SU
UUUUS   US   S
9u  nnnUUU4$ )Nr   )r@   r3   r   r4   r5   r6   r7   r9   )r-   r;   r<   )r   rF   rX   r@   rA   r   rB   rC   rD   rE   rG   rH   _cudnn_attention_backward_flash_attention_backward)r   r%   r&   r'   rr   r   r(   r)   r*   r+   r,   rQ   r-   r	   unusedrM   dqdkdvs                      r   _varlen_attn_backwardr     sD   " )5K[[5<<0FG"3ELL4F4F"GI67q>R;q>R#7f  YY^^== > 

B$ 	@AYY^^==(^)!n# > 

B& r2:r   c                     [        U5      n[        R                  " U5      n[        R                  " U5      n[        R                  " U5      nXU4$ )rw   )r   rF   rV   )r   r%   r&   r'   rr   r   r(   r)   r*   r+   r,   rQ   r-   r	   
grad_querygrad_key
grad_values                    r   _varlen_attn_backward_faker   Q  sI    ( )5K!!%(J$H!!%(J++r   grad_lsegrad_rngc                 *   U R                   u  pEpgppU R                  nU R                  nU R                  nU R                  nU R
                  n[        R                  R                  R                  UUUUU	U
UUUUUUUU5      u  nnnSnUUU/SU-  Q7$ )N   )N)
saved_tensorsr*   r+   r,   r-   r	   rF   rG   rn   r   )rz   r   r   r   r%   r&   r'   r(   r)   rr   r   rQ   r*   r+   r,   r-   r	   r   r   r   
num_paramss                        r   	_backwardr   n  s     BEARAR>EIIEIIEIIIE//K%%;;JBB$ JB0'J.00r   )setup_context)_varlen_attn_backward_flop_varlen_attn_forward_flop_varlen_attn_out_flopflop_registry)FNNFNNN)NN).r    logging	functoolsr   typingr   r   rF   	getLoggerr   rC   __all__ro   intr   r!   r   r   library	custom_opTensorrY   tuplerT   register_fakere   r   ru   rx   r   r~   r   r   r   register_autograd_dynamodisallow_in_graphrG   rH   rt   torch.utils.flop_counterr   r   r   r   rn   r   r   r   <module>r      sa
     "  !
:S	D(8 T#Y  1C D  
  3"E $(%)'+!V+<<V+	V+ <<V+ ll	V+
 llT!V+ V+ V+ V+ 4<V+ cT!V+ V+ ||d"V+ $V+ d
V+ 5<<u||34V+ FV+r  $(%)'+!.(<<.(	.( <<.( ll	.(
 llT!.( .( .( .( 4<.( cT!.( .( ||d".( $.( d
.( 5<<u||34.( .(t %)#+%)'+!V<<V	V <<V ll	V
 llT!V V V T!V 4<V sCxV V ||d"V $V d
V  \\E%,,455!Vr 7ugN $(%)'+!2	2<<2 
2 <<	2
 ll2 llT!2 2 2 2 4<2 cT!2 2 ||d"2 $2 d
2  \\!2 O2j  $(%)'+!"	"<<" 
" <<	"
 ll" llT!" " " " 4<" cT!" " ||d"" $" d
"  \\!"  "^ %)#+%)'+!!:	:<<: 
: <<	:
 ll: llT!: : : T!: 4<: sCx: : ||d": $:  d
!:" \\E%,,455#:z" "U38_ "c "d "B <2N $(AllA<<A 
A <<	A
 
A 
A llA llA A A A ||A 4<A cT!A 5<<u||34A OAH $$ $(,ll,<<, 
, <<	,
 
, 
, ll, ll, , , , ||, 4<, cT!, 5<<u||34, %,81	11051HM1
5<<$#$1B   y  G   	IINN>>  4Meii""// 07Leii""33 4<Veii""88 9r   