
    3j*                        S SK r SSKJr  SSKJr  \R
                  " \5      rSrSr	Sr
SqS\4S	 jr\ R                  R                  S
SS9S\ R                   S\ R                   S\ R                   S\ R                   S\ R                   S\ R                   S\S\S\S\S\S\S\S\S\ R                   4S j5       r\R(                  S 5       rS\ R                   S\SS4S  jrS! r    S(S"\ R0                  R2                  S\ R                   S#\ R                   S$\ R                   S%\ R                   S-  S\S\S-  S&\ R                   S-  S\\ R                   S4   4S' jjrg))    N   )logging   )sdpa_attention_forward)                attn_implementationc                 &   [         b  [         $ SSKJn  U R                  S5      S   nUR	                  S5      u  p#nU" X$=(       d    SU(       a  SOSSS	9nS
 H/  n[        [        XVS5      5      (       a  M  [        SU SU S35      e   Uq [         $ )ag  Load the MSA hub kernel once and verify the expected callables are present.

The ``attn_implementation`` string may carry a ``paged|`` prefix and/or an ``@<revision>`` pin
(e.g. ``kernels-staging/msa@v0``); the build currently lives on the repo's ``v0`` branch. The
loaded module is cached in a module-level global so registration happens once, not per call.
Nr   )
get_kernel|@r   T)revisionversionallow_all_kernels)sparse_atten_funcbuild_k2q_csrzThe MSA kernel loaded from `z` does not expose a callable `zK`. Make sure you request a compatible build, e.g. `kernels-staging/msa@v0`.)_MSA_KERNELhub_kernelsr   split	partitioncallablegetattrImportError)r   r   repo_id_revkernelfn_names          a/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/integrations/msa_attention.pyload_and_register_msa_kernelr$       s     '!'',R0G'',OG+stPQeijF9677.wi7UV]U^ _[ [  : K    ztransformers_msa::sparse_atten )mutates_argsqkvq2kcu_seqlens_qcu_seqlens_ktopk
block_sizetotal_kmax_seqlen_qmax_seqlen_kqheads_per_kvscalingimplreturnc                 D   [        U5      n[        R                  R                  U R                  5         UR	                  UUUUUU
U	US9u  nnUR                  U UUUUUUUU	U
USUS9nSSS5        UR                  5       $ ! , (       d  f       WR                  5       $ = f)a  Opaque wrapper around the CuTe-DSL CSR build + block-sparse kernel.

Registered as a ``torch.library`` custom op so ``torch.compile(fullgraph=True)`` treats the
whole CSR-build + attention as a single opaque node (no graph break) and ``reduce-overhead``
CUDA graphs can capture it. The internal ``build_k2q_csr`` output is data-dependent in shape,
but it never escapes this op (only the fixed-shape ``[total_q, Hq, D]`` attention output does),
so the fake/meta impl below is exact. The op is functional (no input mutation).
)r0   r2   r1   qhead_per_kvT)r,   r-   r1   r2   blk_kvcausalsoftmax_scaleN)r$   torchcudadevicer   r   
contiguous)r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   msak2q_row_ptrk2q_q_indicesattn_outputs                     r#   _msa_sparse_atten_oprD   <   s    2 't
,C 
		188	$%(%6%6%%& &7 	&
"] ++%%%%! , 
 
%4 !!##5 
%	$4 !!##s   5B
Bc                 .    [         R                  " U 5      $ N)r<   
empty_like)r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   s                 r#   _msa_sparse_atten_fakerH   u   s    $ Ar%   querydropoutc                    UR                   R                  S:w  d0  [        R                  R	                  UR                   5      S   S:w  a  [        S5      eUR                  S   [        :w  a  [        S[         S35      eU R                  R                  [        :w  a  [        S[         S35      eUS	:w  a  [        S
5      eU R                  R                  nU[        ;  a  [        S[         SU S35      eg)a  Validate kernel capability, dropout and configured topk once per attention module.

Mirrors the flash-attention integration, which checks capability/dropout at model init rather
than on every forward. The check is cached on the module so the hot path never re-runs it.

There is no SDPA fallback: a sparse layer either runs the MSA kernel or this raises. Serves both
prefill (q_len > 1) and single-token decode (q_len == 1) -- decode is just a varlen call with one
query slot, so there is no context-length threshold.
r=   r   
   zMSA block-sparse attention requires an SM100 / Blackwell CUDA device. Select a different `attn_implementation` on unsupported hardware.r   z2MSA block-sparse attention only supports head_dim .z4MSA block-sparse attention only supports block_size         zYMSA block-sparse attention does not support attention dropout; set `attention_dropout=0`.z1MSA block-sparse attention only supports topk in z, got `z0`. Set `index_topk_blocks` to a supported value.N)r>   typer<   r=   get_device_capabilityRuntimeErrorshapeMSA_SUPPORTED_HEAD_DIM
ValueErrorindexerr/   MSA_SUPPORTED_BLOCK_SIZEtopk_blocksMSA_SUPPORTED_TOPK)modulerI   rJ   r.   s       r#   _validate_msa_initrZ      s     ||F"ejj&F&Fu||&TUV&W[]&]P
 	
 {{200MNdMeefghh~~  $<<OPhOiijkll#~tuu>>%%D%%?@R?SSZ[_Z` a< <
 	
 &r%   c                   ^ UR                   u  ppUR                   S   UR                   S   pX-  nUR                   S   m[        U4S j[         5       5      nUT:w  a@  UR                  / UR                   S S QUT-
  P7S5      n[        R
                  " UU/SS9nUmUR                  SS5      R                  X-  X5      R                  5       nUR                  SS5      R                  X-  X5      R                  5       nUR                  SS5      R                  X-  X5      R                  5       n[        R                  " SUS-   U
-  U
UR                  [        R                  S9nUS:X  a|  Uby  US   S-   R                  [        R                  5      R                  S5      n[        R
                  " [        R                  " SUR                  [        R                  S9U/5      nO6[        R                  " SUS-   U-  UUR                  [        R                  S9nUR                  [        R                  5      nUR                  X-  T5      R                  S5      R                  USS5      R                  5       n[!        UUUUUUTUX-  U
UUUU R"                  R$                  5      nUR                  XX5      $ )Nr   r   r   c              3   6   >#    U  H  oT:  d  M
  Uv   M     g 7frF   r&   ).0tr.   s     r#   	<genexpr>$_sparse_attention.<locals>.<genexpr>   s     B"4QT	qq"4s   		)dimr   )r>   dtype)rR   nextrX   new_fullr<   cat	transposereshaper?   aranger>   int32tozeros	unsqueezeexpandrD   config_attn_implementation)rY   rI   keyvaluer4   block_indicesr/   cache_positionbsznum_q_headsq_lenhead_dimnum_kv_headsk_lenr3   padded_topkpadr(   r)   r*   r,   valid_kr-   r+   rC   r.   s                            @r#   _sparse_attentionr}      ss   (-%Ce))A,		!%/Mr"D B"4BBKd$$%T}':':3B'?%TtAS%TVXY		=#"6B?
 	1%%ck;ITTVAa##CKHSSUA1%%ck<JUUWA<<C!Gu#4eAHHTYT_T_`L axN.!"%)--ekk:BB1Eyy%++a"TV]!^_||Aa5'8%X]XcXcd


5;;
'C
++ck4
(
2
21
5
<
<\2r
R
]
]
_C '			**K  s;AAr%   rY   rp   rq   attention_maskrr   c           
         Uc  UR                   S   S-  nUc  [        XX#U4XVS.UD6$ [        U SS5      (       d  [        XU5        SU l        U R
                  R                  n	UR                  S5      n
[        XX#XgX5      nUS4$ )	zf
TODO: this opens a door to per-layer attn implementation which is something we might want lalter on.
Nr   g      )rJ   r4   _msa_validatedFTrs   )	rR   r   r   rZ   r   rU   r/   getr}   )rY   rI   rp   rq   r~   rJ   r4   rr   kwargsr/   rs   rC   s               r#   msa_attention_forwardr      s     ++b/T) %3~
?F
[a
 	
 6+U336'2 $**JZZ 01N#F3wWarKr%   )NrN   NN)r<   utilsr   sdpa_attentionr   
get_logger__name__loggerrX   rV   rS   r   strr$   library	custom_opTensorintfloatrD   register_fakerH   rZ   r}   nnModuletupler   r&   r%   r#   <module>r      s      2 
		H	% $   c 8 9K5$||5$||5$ ||5$ 
	5$
 ,,5$ ,,5$ 5$ 5$ 5$ 5$ 5$ 5$ 5$ 5$ \\5$ L5$p ## $(
ell 
U 
t 
:9BB +/ )- HHOO <<  
  <<	 
 LL4'    T\  <<$&  5<< r%   