
    3jR                       % S r SSKJr  SSKrSSKrSSKJr  \(       a  SSKJr  SSK	J
r
  SSKJr  SSKJrJr  SSKrSS	KJr  S
SKJr  S/rSqS\S'   SqS\S'   \
 " S S5      5       r\S.S j5       r S/   S0S jjrS1S jrS2S jr                S3S jr                        S4S jr                      S5S jr \" S5      r!S6S jr"S7S jr#      S8                                     S9S jjr$ S:                               S;S jjr%   S<SSSSSSSSSS .	                                           S=S! jjjr&SSSSSSSS".                                   S>S# jjr'SSSSSSSSS$.                                   S?S% jjr(SSSS&.                                 S@S' jjr)      SASS(.                   SBS) jjjr*   SCSS(.             SDS* jjjr+SS(.                             SES+ jjr,\RZ                  " S,\S-9  g)Fz
PROTOTYPE!
Flash Attention 3 implementation.
For fp8: only supports forward pass right now.
For fp16/bf16: supports forward and backward pass.
    )annotationsN)TYPE_CHECKING)Callable)	dataclass)cache)TypeVarTupleUnpack)Library   )	_registryregister_flash_attention_fa3zCallable | None_FA3_CUDA_FWD_FA3_CUDA_BWDc                  *    \ rS rSr% S\S'   SS jrSrg)
_FA3Handle&   zLibrary | Nonelibraryc                P    S U l         [        R                  R                  S5        g )NF)r   torch_C_set_sdp_use_fa3)selfs    Q/home/wildlama/miniconda3/lib/python3.13/site-packages/torch/nn/attention/_fa3.pyremove_FA3Handle.remove*   s    !!%(    )r   N)returnNone)__name__
__module____qualname____firstlineno____annotations__r   __static_attributes__ r   r   r   r   &   s    )r   r   c                H    [         R                  R                  U 5      u  pU$ N)r   cudaget_device_capability)devicemajor_s      r   _get_device_majorr-   0   s    zz//7HELr   c                |    [        U 5        [        R                  R                  S5        [	        [        5       5      $ )z
Register FA3 flash attention kernels with the PyTorch dispatcher.

Args:
    module_path: Python module path to the FA3 implementation.
T)_fa3_import_moduler   r   r   r   _fa3_register_kernelsmodule_paths    r   r   r   6   s/     {# 
HHd#+-..r   c                   [         R                  " U 5        [        [        R                  S5      (       d  [        SU  S35      e[        [        R                  R                  S5      (       d  [        SU  S35      e[        [        R                  R                  S5      (       d  [        SU  S35      e[        R                  R                  R                  q[        R                  R                  R                  q
g )Nflash_attn_3zModule 'z' does not expose FA3 kernelsfwdz%' does not expose FA3 forward kernelsbwdz&' does not expose FA3 backward kernels)	importlibimport_modulehasattrr   opsRuntimeErrorr4   r5   r   r6   r   r1   s    r   r/   r/   G   s    K(599n--Xk]2OPQQ599))511{m#HI
 	
 599))511{m#IJ
 	
 II**..MII**..Mr   c                 b   [        SSS5      n U R                  S[        S5        U R                  S[        S5        U R                  S[        S5        U R                  S[
        S5        U R                  S[        S5        U R                  S	[        S5        U R                  S
[        S5        U $ )NatenIMPLCUDAz"_flash_attention_forward.quantizedz-_scaled_dot_product_flash_attention.quantized_flash_attention_forward+_flash_attention_forward_no_dropout_inplace#_scaled_dot_product_flash_attention_flash_attention_backward,_scaled_dot_product_flash_attention_backward)	r
   impl!_fa3_flash_attention_forward_impl4_fa3_scaled_dot_product_flash_attention_forward_impl)_fa3_flash_attention_forward_impl_default4_fa3_flash_attention_forward_no_dropout_inplace_impl<_fa3_scaled_dot_product_flash_attention_forward_impl_default"_fa3_flash_attention_backward_impl5_fa3_scaled_dot_product_flash_attention_backward_impl)libs    r   r0   r0   X   s    
&&&
)CHH,.OQW HH7<
 HH"$Mv HH5<
 HH-D HH(*LfUHH6=
 Jr   c                    US:w  a  g[        S U 5       5      (       d  g[        U Vs1 s H  owR                  iM     sn5      S:w  a  gU R                  [        R
                  :X  a$  Ub  Ub  Uc  [        R                  " S[        5        Uc  U R                  5       S:w  a  g	Ub  U R                  5       S
:w  a  g[        R                  R                  5       (       d  g[        U R                  5      S:w  a  gg s  snf )N        zdropout_p must be 0c              3  8   #    U  H  oR                   v   M     g 7fr'   )is_cuda.0ts     r   	<genexpr>,_fa3_common_support_error.<locals>.<genexpr>   s     *'Qyy's   zinputs must be CUDA tensorsr   inputs must share devicezWhen using SDPA with fp8, descale tensor should always be used for accurate dequantization. Please use _scaled_dot_product_attention_quantized and provide the descale tensors.   zdense query must be 4D   zragged query must be 3DzCUDA not available	   z#FA3 requires compute capability 9.0)alllenr*   dtyper   float8_e4m3fnwarningswarnUserWarningdimr(   is_availabler-   )querytensors	dropout_p	cum_seq_q	q_descale	k_descale	v_descalerT   s           r   _fa3_common_support_errorrk   y   s     C$*'***,
g&gHHg&'1,){{e)))Y.)2C+ 	
 UYY[A-'!1(::""$$#&!+4) 's   C;c           	       ^ U(       a  gUb  gUb1  UR                   [        R                  :w  a  gUR                  (       d  g[        R                  [        R
                  [        R                  4m[        U4S jXU1 5       5      (       d  ST 3$ [        XU1 Vs1 s H  oR                   iM     sn5      S:w  a  g[        U XU4UUUU	U
5      nUb	  US	:X  a  g
U$ g s  snf )Nzreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDAc              3  @   >#    U  H  oR                   T;   v   M     g 7fr'   r]   rS   rT   supported_dtypess     r   rU   -_fa3_forward_support_error.<locals>.<genexpr>   s     H4Gqww**4G   inputs must be one of r   #all inputs must have the same dtyperW   z(query, key, value must be on same device)
r]   r   int32rQ   r^   float16bfloat16r[   r\   rk   )rd   keyvaluerf   return_debug_maskalibi_slopes	seqused_krg   rh   ri   rj   rT   errorrp   s                @r   _fa3_forward_support_errorr~      s     0+??ekk),  +++U]]ENNKHU4GHHH'(8'9::
e%010GG012a74%	UE ..= 2s   Cc
           	       ^ UR                   [        R                  :X  a   gUR                   [        R                  :w  a  g[        R                  [        R
                  4m[        U4S jXX#U1 5       5      (       d  ST 3$ [        XX#U1 V
s1 s H  oR                   iM     sn
5      S:w  a  g[        UXX#XE4UUS S S 5      nUb  U$ g s  sn
f )NzHFA3 backward does not support fp8 - use inference only (torch.no_grad())zlogsumexp dtype must be float32c              3  @   >#    U  H  oR                   T;   v   M     g 7fr'   rn   ro   s     r   rU   ._fa3_backward_support_error.<locals>.<genexpr>   s     W4Vqww**4Vrr   rs   r   rt   )	r]   r   r^   float32rv   rw   r[   r\   rk   )grad_outrd   rx   ry   out	logsumexprf   rg   window_size_leftwindow_size_rightrT   r}   rp   s               @r   _fa3_backward_support_errorr      s     {{e)))V	
 %--'0u~~6WXcRU4VWWW'(8'9::
hs3?@?GG?@AQF4%	#c5E  As   C
Tsc                 &    [        S U  5       5      $ )Nc              3  D   #    U  H  oR                  S S5      v   M     g7f)r      N)	transposerR   s     r   rU   #_transpose_dense.<locals>.<genexpr>   s     4GqQ""Gs    )tuple)re   s    r   _transpose_denser      s    4G444r   c                V    U b%  U R                  S5      S:w  a  U R                  5       $ U $ )z2Ensure tensor is contiguous in the last dimension.r   )stride
contiguous)xs    r   _maybe_contiguousr      s&    ]qxx|q/@1<<>GaGr   c                   [         c  [        S5      e[        U 5      n[        U5      nUR                  [        R
                  :X  a:  UR                  S5      S:w  a%  UR                  S5      S:w  a  UR                  5       O
[        U5      n[        U5      n[        U5      n[        U5      n[        U5      n[        / UPUPUPSPSPSPUPUPUPSPSPUPUPUPUPSPSPSPSPSPUPUPUPUPUPU	b  U	OSPU
b  U
OSPSPSPSPSPU=(       d    [        R                  " 5       (       a  SOSPSP[        R                  R                  5       =(       d    SP76 u  nnnnUUR                  5       4$ )	z>
Run the FA3 forward pass by calling the C++ kernel directly.
NFA3 not registeredr   r   r   rO   T)r   r;   r   r]   r   r^   r   r   $are_deterministic_algorithms_enabledr   _get_sm_carveout_experimental)rd   rx   ry   cu_seq_qcu_seq_kmax_qmax_kscale	is_causalr   r   r|   r   rh   ri   rj   block_table
num_splitsqkvcu_seqlens_qcu_seqlens_ksoftmax_lse	out_accumsoftmax_lse_accums                             r   _fa3_run_forwardr      sc   . /00% A#A ;;%---LL!LL! 	 u%  %X.L$X.L!),I#K0K5B $6	$6	$6 	
$6 		$6
 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6 	$6  	!$6" 	#$6$ 	%$6& 	'$6( 	)$6* 	+$6, 	-$6. 	/$60 	1$62 	3$64 -8b5$66 /:7$68 	
9$6: 	;$6< 	=$6> 	?$6@ 	 	F;;==A1C$6D 	E$6F 	..05AG$62Ci!2J &&(((r   c                j   [         c  [        S5      e[        U 5      nUR                  S5      S:w  a  UR	                  5       OUnUR                  S5      S:w  a  UR	                  5       OUnUR                  S5      S:w  a  UR	                  5       OUn[        U5      n[        U5      n[
        R                  " U5      n[
        R                  " U5      n[
        R                  " U5      n[        UUUUUUUUUUUS S UU	U
UUUSU[
        R                  R                  5       =(       d    S5        UUU4$ )Nr   r   r   rO   r   )	r   r;   r   r   r   r   
empty_liker   r   )r   rd   rx   ry   r   r   r   r   max_seqlen_qmax_seqlen_kr   r   r   r   deterministicdoutr   r   r   olsedqdkdvs                           r   _fa3_run_backwardr   L  s/   " /00 X&D#ll2.!3AJJrNa/SA#ll2.!3A#A
I
&C 
		!	B			!	B			!	B				


..05A-0 r2:r   r   T	r   r   r   r|   r{   r   r   compute_auxiliaryr   c       	           [        U UUUU	UUUU
UU5      nUb  [        SU 35      e[        U UUUUUUUUUUUUU
UUUU5      u  nnU(       a  [        R                  " S[        R
                  U R                  S9n[        R                  " S[        R
                  U R                  S9n[        R                  " SU R                  U R                  S9nOS nS nS nUUUUU4$ )Nz)FA3 flash_attention forward unsupported: )r   )r]   r*   r%   r   )	r~   r;   r   r   zerosuint64r*   emptyr]   )rd   rx   ry   rg   	cum_seq_kr   r   rf   r   rz   rh   ri   rj   r   r   r   r|   r{   r   r   r   r   r}   r   	rng_statephilox_offset
debug_masks                              r   rF   rF     s   2 'E FugNOO%HC( KKELLN	Bell5<<P[[%++ellK
	
Yz99r   )r   r   r   r|   r{   r   r   c               R    [        UUUUUUUUU	U
S S S 4UUUUUU USUS.	6u  nn    nU$ )NFr   rF   )r   rd   rx   ry   rg   r   r   r   rf   r   rz   r   r   r   r|   r{   r   r   r,   r   s                       r   rI   rI     sh    * 8 )+!-OAsAq!0 Jr   )r   r   r   r|   r{   r   r   r   c
               <    [        U UUUUUUUUU	S S S U
UUUUUUUS9$ )N)r   r   r   r|   r{   r   r   r   r   )rd   rx   ry   rg   r   r   r   rf   r   rz   r   r   r   r|   r{   r   r   r   s                     r   rH   rH     sP    * -)+!+ r   )r   r   r   c                   [        U UUUUUU
UUU5
      nUb  [        SU 35      e[        R                  " 5       n[	        U UUUUUUUUU	UUUb  UOSUb  UOSU5      u  nnnUUU4$ )z0FA3 implementation of _flash_attention_backward.z*FA3 flash_attention backward unsupported: r   )r   r;   r   r   r   )r   rd   rx   ry   r   r   rg   r   r   r   rf   r   r   unusedr   r   r   r}   r   r   r   r   s                         r   rK   rK   ,  s    * (E GwOPP>>@M",8b.:JBB" r2:r   r   c	                  [        U UUUUS S S UUU5      n
U
b  [        SU
 35      e[        XU5      u  pnU R                  [        R
                  :X  a  [        R                  OU R                  n[        R                  " XS9nUR                  SS5      nUR                  S5      nUR                  S5      n[        UUUS S UUUUUU	UUUUS9u  nnnnnU R                  S5      nUR                  S5      nUUS S UUUUU4	$ )NzFA3 SDPA forward unsupported: rn   r   r   )r   r   rh   ri   rj   )r~   r;   r   r]   r   r^   rw   r   r   sizerF   )rd   rx   ry   rh   ri   rj   rf   r   rz   r   r}   r   r   r   	out_dtypeout_bhsdout_bshdmax_q_flashmax_k_flashr,   r   r   r   r   r   r   s                             r   rG   rG   g  s;    'E ;E7CDDu51GA!
 #(++1D1D"D%++I7H!!!Q'H&&)K&&)K3T			40AsI}j" JJqMEHHQKE
 
r   c               &    [        U UUS S S UUUUS9
$ )Nr   )rG   )rd   rx   ry   rf   r   rz   r   s          r   rJ   rJ     s0     @ r   c                   [        XX#XEU
SSS5
      nUb  [        SU 35      e[        XX#U5      u  nnnnn[        UUUUUUSSUU	U
UUUUS9u  nnn[        UUU5      u  nnnUUU4$ )zCFA3 implementation of _scaled_dot_product_flash_attention_backward.NzFA3 SDPA backward unsupported: r   )r   r;   r   rK   )r   rd   rx   ry   r   r   rg   r   r   r   rf   r   philox_seedr   r   r}   
grad_out_tq_tk_tv_tout_tr   r   r   dq_outdk_outdv_outs                              r   rL   rL     s    & (SYdDE <UGDEE (8S($JS#u 4JBB& .b"b9FFF66!!r   FA3)register_fn)r*   ztorch.devicer   int)flash_attn_interface)r2   strr   r   )r2   r   r   r   )r   r
   )rd   torch.Tensorre   ztuple[torch.Tensor, ...]rf   floatrg   torch.Tensor | Nonerh   r   ri   r   rj   r   r   
str | None)rd   r   rx   r   ry   r   rf   r   rz   boolr{   r   r|   r   rg   r   rh   r   ri   r   rj   r   r   r   )r   r   rd   r   rx   r   ry   r   r   r   r   r   rf   r   rg   r   r   
int | Noner   r   r   r   )re   z
Unpack[Ts]r   ztuple[Unpack[Ts]])r   r   r   r   )NNNNNN)&rd   r   rx   r   ry   r   r   r   r   r   r   r   r   r   r   float | Noner   r   r   r   r   r   r|   r   r   r   rh   r   ri   r   rj   r   r   r   r   r   r   z!tuple[torch.Tensor, torch.Tensor])F) r   r   rd   r   rx   r   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])NNN),rd   r   rx   r   ry   r   rg   r   r   r   r   r   r   r   rf   r   r   r   rz   r   rh   r   ri   r   rj   r   r   r   r   r   r   r   r|   r   r{   r   r   r   r   r   r   r   r   r   )$r   r   rd   r   rx   r   ry   r   rg   r   r   r   r   r   r   r   rf   r   r   r   rz   r   r   r   r   r   r   r   r|   r   r{   r   r   r   r   r   )$rd   r   rx   r   ry   r   rg   r   r   r   r   r   r   r   rf   r   r   r   rz   r   r   r   r   r   r   r   r|   r   r{   r   r   r   r   r   r   r   )"r   r   rd   r   rx   r   ry   r   r   r   r   r   rg   r   r   r   r   r   r   r   rf   r   r   r   r   r   r   r   r   r   r   r   r   r   )NNNrO   FF)rd   r   rx   r   ry   r   rh   r   ri   r   rj   r   rf   r   r   r   rz   r   r   r   )rO   FF)rd   r   rx   r   ry   r   rf   r   r   r   rz   r   r   r   )r   r   rd   r   rx   r   ry   r   r   r   r   r   rg   r   r   r   r   r   r   r   rf   r   r   r   r   r   r   r   r   r   ).__doc__
__future__r   r7   r_   typingr   collections.abcr   dataclassesr   	functoolsr   typing_extensionsr   r	   r   torch.libraryr
    r   __all__r   r#   r   r   r-   r   r/   r0   rk   r~   r   r   r   r   r   r   rF   rI   rH   rK   rG   rJ   rL   register_flash_attention_implr%   r   r   <module>r      s   #     ( !  2  !  #
 "& %!% % ) ) )   .///"/"B""%" " #	"
 #" #" #" "J((	( ( 	(
 ( &( #( #( #( #( #( (V### 
# 	#
 
# # # ## !# "# #L $5H$  $%)%)%)'+!%N)N)	N) N) "	N)
 "N) N) N) N) N) !N) "N) #N) 
N) #N) #N)  #!N)" %#N)$ %N)& ''N)@  888 
8 	8
 
8 8 "8 "8 8 8 8 8 8 8 8  5!8L &*%)%)D: %)(,#'+"!/D:D:	D: D: #	D:
 #D: D: D: D: D: D: #D: #D: #D: D:  !D:" #D:$ #%D:& &'D:( 
)D:* %+D:, -D:. /D:h %)(,'+!'-	-- 
- 	-
 #- #- - - - - - - - -  #!-" &#-$ %%-& '-x %)(,'+#!'++	+ + #	+
 #+ + + + + + + + + #+  &!+" %#+$ 
%+& '+| #'$(%888 
8 	8
 
8 8 #8 #8 8 8 8 8 8 8  !8" !#8$ "%8~ &*%)%)#D DD	D D #	D
 #D #D D D D DV # 	  	
   P !2"2"2" 
2" 	2"
 
2" 2" #2" #2" 2" 2" 2" 2" 2"  2"  !2"j 
 ' ';W Xr   