
    3jA9              
         % S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	J
r
  SSKJrJr  SS	KJr  \
(       a  SS
KJr  SSKrSSKJr  S/rSqS\S'   \ " S S5      5       r\S)S j5       r S*   S+S jjr\S,S j5       rS-S jr S.         S/S jjr  S0                     S1S jjr                  S2S jrS3S jr \" S5      r!S4S jr"   S5                               S6S jjr# S7                           S8S jjr$SSSSSSSSSS.	                                     S9S jjr%SSSSSSSS .                                   S:S! jjr&SSSS".                                 S;S# jjr'   S<SS$.             S=S% jjjr(SS$.                             S>S& jjr)\RT                  " S'\S(9  g)?zUBER PROTOTYPE!!!    )annotationsN)	dataclass)cache)AnyTYPE_CHECKING)TypeVarTupleUnpack   )	_registry)
ModuleType)Libraryregister_flash_attention_fa4
str | None_FA4_MODULE_PATHc                  *    \ rS rSr% S\S'   SS jrSrg)
_FA4Handle   zLibrary | Nonelibraryc                    S U l         g Nr   )selfs    Q/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/nn/attention/_fa4.pyremove_FA4Handle.remove"   s	        r   N)returnNone)__name__
__module____qualname____firstlineno____annotations__r   __static_attributes__ r   r   r   r      s    r   r   c                H    [         R                  R                  U 5      u  pU$ r   )torchcudaget_device_capability)devicemajor_s      r   _get_device_majorr-   &   s    zz//7HELr   c                B    [        U 5      nU q[        [        5       5      $ )z
Register FA4 flash attention kernels with the PyTorch dispatcher.

Args:
    module_path: Python module path to the FA4 implementation.
)_fa4_import_moduler   r   _fa4_register_kernels)module_pathr,   s     r   r   r   ,   s#     	;'A"+-..r   c                    [         R                  " U 5      n[        US5      (       a  [        US5      (       d  [        SU  S35      eU$ )N_flash_attn_fwd_flash_attn_bwdzModule 'z' does not expose FA4 kernels)	importlibimport_modulehasattrRuntimeError)r1   modules     r   r/   r/   ;   sG    $$[1F6,--WVEV5W5WXk]2OPQQMr   c                    [        SSS5      n U R                  S[        S5        U R                  S[        S5        U R                  S[        S5        U R                  S[
        S5        U R                  S[        S5        U $ )	NatenIMPLCUDA_flash_attention_forward+_flash_attention_forward_no_dropout_inplace_flash_attention_backward#_scaled_dot_product_flash_attention,_scaled_dot_product_flash_attention_backward)r   impl!_fa4_flash_attention_forward_impl4_fa4_flash_attention_forward_no_dropout_inplace_impl"_fa4_flash_attention_backward_impl4_fa4_scaled_dot_product_flash_attention_forward_impl5_fa4_scaled_dot_product_flash_attention_backward_impl)libs    r   r0   r0   C   s~    
&&&
)CHH')JFSHH5<
 HH(*LfUHH-<
 HH6=
 Jr   c                ,   [        S U 5       5      (       d  g[        U Vs1 s H  oDR                  iM     sn5      S:w  a  gU R                  [        R
                  [        R                  4;  a  gU H*  u  pVUR                  [        R                  :w  d  M%  U S3s  $    Uc  U R                  5       S:w  a  gUb  U R                  5       S	:w  a  g
[        R                  R                  5       (       d  g[        U R                  5      S;  a  gg s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fr   )is_cuda.0ts     r   	<genexpr>,_fa4_common_support_error.<locals>.<genexpr>_   s     *'Qyy's   zinputs must be CUDA tensorsr
   inputs must share devicez'query dtype must be float16 or bfloat16z dtype must be float32   zdense query must be 4D   zragged query must be 3DzCUDA not available)	   
   z+FA4 requires compute capability 9.0 or 10.0)alllenr*   dtyper'   float16bfloat16float32dimr(   is_availabler-   )querytensors	cum_seq_qrequire_fp32rO   nametensors          r   _fa4_common_support_errorre   Y   s     *'***,
g&gHHg&'1,){{5==%..998$<<5==(V122 % UYY[A-'!1(::""$$#&g5< 's   Dc
                :   US:w  a  gU(       a  gUb  gUb1  UR                   [        R                  :w  a  gUR                  (       d  g[	        U R
                  5      n
Ub  U
S:w  a  SU
 S	3$ U	b  U	S
:  a  U
S:w  a  SU
 S	3$ [        U XU4U5      nUb	  US:X  a  gU$ g )N        dropout_p must be 0zreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDArV   z+paged KV (block_table) not supported on SM 0r
   z-SplitKV (num_splits > 1) not supported on SM rR   z(query, key, value must be on same device)rY   r'   int32rL   r-   r*   re   )r_   keyvalue	dropout_preturn_debug_maskalibi_slopes	seqused_kra   block_table
num_splitsr+   errors               r   _fa4_forward_support_errorrt   s   s     C$0+??ekk),  +ell+E5B;<UG1EE*q.Ub[>ugQGG%	UE
 ..=r   c           	     @    US:w  a  g[        UXX#XE4USU44S9nUb  U$ g )Nrg   rh   	logsumexp)rb   )re   )	grad_outr_   rk   rl   outrv   rm   ra   rs   s	            r   _fa4_backward_support_errorry      sD     C$%	#c5"I.0	E r   c                    U S:X  a  S$ U $ )z"need to convert -1 to None for FA4Nr%   )vals    r   _aten_to_fa4_window_sizer}      s    "94%#%r   Tsc                 &    [        S U  5       5      $ )Nc              3  D   #    U  H  oR                  S S5      v   M     g7f)r
      N)	transposerM   s     r   rP   #_transpose_dense.<locals>.<genexpr>   s     4GqQ""Gs    )tuple)r`   s    r   _transpose_denser      s    4G444r   c                   [         c  [        S5      e[        [         5      nUU[        U	5      [        U
5      SUUUUUb  UR	                  5       OS UU=(       d    SUS.nUR
                  " XU40 UD6u  nnUUR	                  5       4$ )NFA4 not registeredTr
   )softmax_scalecausalwindow_size_leftwindow_size_right
return_lsecu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krp   
page_tablerr   rx   )r   r8   r/   r}   
contiguousr3   )r_   rk   rl   cu_seq_qcu_seq_kmax_qmax_kscale	is_causalr   r   rp   rx   rq   rr   r9   kwargslses                     r   _fa4_run_forwardr      s    " /00 01F 45EF56GH  /8/DY))+$! oAF %%e%B6BHC   r   c                    [         c  [        S5      e[        [         5      nUR                  UUUUU UR	                  5       UU	[        U
5      [        U5      UUUS9u  pnXU4$ )Nr   )r   r   r   r   r   r   deterministic)r   r8   r/   r4   r   r}   )rw   r_   rk   rl   rx   rv   r   r   r   r   r   r   r   r9   dqdkdvs                    r   _fa4_run_backwardr      s     /00 01F''12BC23DE# ( JBB 2:r   T	r   r   r   rp   ro   rx   rq   compute_auxiliaryrr   c
       	           [        U UUUU	UUUUU5
      nUb  [        SU 35      e[        U UUUUUUU
UUUUUUU5      u  nnU(       a  [        R                  " S[        R
                  U R                  S9n[        R                  " S[        R
                  U R                  S9n[        R                  " SU R                  U R                  S9nOS nS nS nUUUUU4$ )Nz)FA4 flash_attention forward unsupported: )r   )rY   r*   r%   r   )	rt   r8   r   r'   zerosuint64r*   emptyrY   )r_   rk   rl   ra   	cum_seq_kr   r   rm   r   rn   r   r   r   rp   ro   rx   rq   r   rr   rs   r   	rng_statephilox_offset
debug_masks                           r   rD   rD   
  s    , 'E FugNOOHC" KKELLN	Bell5<<P[[%++ellK
	
Yz99r   )r   r   r   rp   ro   rq   rr   c               H    [        UUUUUUUUU	U
UUUUUU USUS9u  nn    nU$ )NFr   )rD   )rx   r_   rk   rl   ra   r   r   r   rm   r   rn   r   r   r   rp   ro   rq   rr   r,   r   s                       r   rE   rE   J  sX    * 8)+!'OAsAq!* Jr   )r   r   r   c                   [        U UUUUUU
U5      nUb  [        SU 35      e[        R                  " 5       n[	        U UUUUUUUUUUUU5      u  nnnUUU4$ )Nz*FA4 flash_attention backward unsupported: )ry   r8   r'   $are_deterministic_algorithms_enabledr   )rw   r_   rk   rl   rx   rv   ra   r   r   r   rm   r   r   unusedr   r   r   rs   r   r   r   r   s                         r   rF   rF   w  s    ( (	E GwOPP>>@M"JBB r2:r   r   c                  [        U UUUUS S S 5      nUb  [        SU 35      e[        XU5      u  pn
[        R                  " U 5      nUR                  SS5      nUR                  S5      nU	R                  S5      n[        UU	U
S S UUUUUUUS9u  nnnnnU R                  S5      nUR                  S5      nUUS S UUUUU4	$ )NzFA4 SDPA forward unsupported: r
   r   )r   rx   )rt   r8   r   r'   
empty_liker   sizerD   )r_   rk   rl   rm   r   rn   r   rs   qkvout_bhsdout_bshdmax_q_flashmax_k_flashr,   r   r   r   r   r   r   s                         r   rG   rG     s	    '	E ;E7CDDu51GA!
 &H!!!Q'H&&)K&&)K3T			40AsI}j JJqMEHHQKE
 
r   c                  [        U UUUUUU
S 5      nUb  [        SU 35      e[        XX4U 5      u  nnnnnUR                  S5      nUR                  S5      n	[	        UUUUUUS S UU	U
UUUUS9u  nnn[        UUU5      u  nnnUUU4$ )NzFA4 SDPA backward unsupported: r   r   )ry   r8   r   r   rF   )rw   r_   rk   rl   rx   rv   ra   r   r   r   rm   r   philox_seedr   r   rs   r   r   r   ogor   r   r   s                           r   rH   rH     s    $ (	E <UGDEE%e%hGNAq!QJJqMEHHQKE3
				JBB" ""b"-JBBr2:r   FA4)register_fn)r*   ztorch.devicer   int)zflash_attn.cute.interface)r1   strr   r   )r1   r   r   r   )r   r   )r%   )
r_   torch.Tensorr`   ztuple[torch.Tensor, ...]ra   torch.Tensor | Nonerb   z$tuple[tuple[str, torch.Tensor], ...]r   r   )NN)r_   r   rk   r   rl   r   rm   floatrn   boolro   r   rp   r   ra   r   rq   r   rr   
int | Noner   r   )rw   r   r_   r   rk   r   rl   r   rx   r   rv   r   rm   r   ra   r   r   r   )r|   r   r   r   )r`   z
Unpack[Ts]r   ztuple[Unpack[Ts]])NNN) r_   r   rk   r   rl   r   r   r   r   r   r   r   r   r   r   float | Noner   r   r   r   r   r   rp   r   rx   r   rq   r   rr   r   r   z!tuple[torch.Tensor, torch.Tensor])F)rw   r   r_   r   rk   r   rl   r   rx   r   rv   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])&r_   r   rk   r   rl   r   ra   r   r   r   r   r   r   r   rm   r   r   r   rn   r   r   r   r   r   r   r   rp   r   ro   r   rx   r   rq   r   r   r   rr   r   )$rx   r   r_   r   rk   r   rl   r   ra   r   r   r   r   r   r   r   rm   r   r   r   rn   r   r   r   r   r   r   r   rp   r   ro   r   rq   r   rr   r   )"rw   r   r_   r   rk   r   rl   r   rx   r   rv   r   ra   r   r   r   r   r   r   r   rm   r   r   r   r   r   r   r   r   r   r   r   r   r   )rg   FF)r_   r   rk   r   rl   r   rm   r   r   r   rn   r   r   r   )rw   r   r_   r   rk   r   rl   r   rx   r   rv   r   ra   r   r   r   r   r   r   r   rm   r   r   r   r   r   r   r   r   r   )+__doc__
__future__r   r5   dataclassesr   	functoolsr   typingr   r   typing_extensionsr   r	    r   typesr   r'   torch.libraryr   __all__r   r#   r   r-   r   r/   r0   re   rt   ry   r}   r~   r   r   r   rD   rE   rF   rG   rH   register_flash_attention_implr%   r   r   <module>r      sv    #  !  % 2     ! #
  $ * #      3///  4 :<	% # 7	
 F (,!%%	% % 	%
 % &% #% #% %% % %P 
 	
 
   # .&
 $5"  $'+!%!%!	%! %! "	%!
 "%! %! %! %! %! !%! "%! #%! 
%! %%! %!  '!%!j  !!! 
! 	!
 
! ! "! "! ! ! !! "! ! 5!` #'$(%)(,#'+"!)=:=:	=: =: #	=:
 #=: =: =: =: =: =: =: !=: "=: #=:  &!=:" 
#=:$ %%=:& '=:( )=:Z #'$(%)(,'+!'*	** 
* 	*
 #* #* * * * * * * !* "*  #!*" &#*$ %%*& '*z #'$(%000 
0 	0
 
0 0 #0 #0 0 0 0 0 0 0  !0" !#0$ "%0n #: ::	: : 	:
 : : :Z !333 
3 	3
 
3 3 #3 #3 3 3 3 3 3  3  !3l 
 ' ';W Xr   