
    3j                        S SK r SSKJrJr  SSKJr  \R                  " \5      r\" 5       r	S\ R                  S\ R                  R                  S\ R                  4S jr      SS\ R                  R                  S\ R                  S	\ R                  S
\ R                  S\ R                  S-  S\S\S-  S\S-  S\S-  S\S-  S\ R                  S-  S\\ R                  S4   4S jjrg)    N   )_flash_attention_forward!flash_attn_supports_top_left_mask)loggingquerymodulereturnc                    U R                   [        R                  :X  a  U R                  R                  n[        R
                  " U5      (       a  [        R                  " U5      $ [        UR                  S5      (       a  UR                  R                   $ [        S UR                  5        5       5      R                  R                   $ g)ziIf the query is in float32, return a target dtype compatible with flash attention. Return None otherwise._is_quantizedc              3   |   #    U  H2  n[        U[        R                  R                  5      (       d  M.  Uv   M4     g 7f)N)
isinstancetorchnnLinear).0layers     c/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/integrations/flash_attention.py	<genexpr>#get_target_dtype.<locals>.<genexpr>   s*     b+;%z%QVQYQYQ`Q`?a+;s   -<	<N)dtyper   float32devicetypeis_autocast_enabledget_autocast_dtypehasattrconfignextmodulesweight)r   r   device_types      r   get_target_dtyper"      s    {{emm#ll''$$[11++K88V]]O44==&&&b6>>+;bbiiooo    keyvalueattention_maskdropoutscalingsliding_windowsoftcap	is_causals_auxc                 X   UR                  SS5      (       a  [        R                  S5        UR                  S   n[	        S UR                   5       5      (       a  [        S5      eUR                  SS5      nUR                  SS5      nUR                  SS5      n[        X5      nU	b  U	OU R                  n	[        UUUU4UU	UUUU[        UU R                  R                  [        U S5      (       a  U R                  OS U
b  U
R                  UR                   5      OS S	.UD6nUS 4$ )
Noutput_attentionsFzFlash Attention does not support `output_attentions=True`. Please set your attention to `eager` if you want any of these features.r   c              3   *   #    U  H	  oS :H  v   M     g7f)r   N )r   dims     r   r   *flash_attention_forward.<locals>.<genexpr>1   s     
+{!8{s   zTensor query has shape  with a zero dimension.
FlashAttention does not support inputs with dim=0.
Please check your input shapes or use SDPA instead.   	layer_idx)query_lengthr+   r'   softmax_scaler)   r*   use_top_left_masktarget_dtypeattn_implementationr4   r,   )getloggerwarning_onceshapeany
ValueError	transposer"   r+   r   _use_top_left_maskr   _attn_implementationr   r4   tor   )r   r   r$   r%   r&   r'   r(   r)   r*   r+   r,   kwargsseq_lenr8   attn_outputs                  r   flash_attention_forwardrG      s@    zz%u--W	
 kk!nG

+u{{
+++B
 	
 OOAq!E
--1
COOAq!E $E2L '2	8H8HI*	
 %,!"MM>>&-fk&B&B&""   HHU[[!%( )K. r#   )g        NNNNN)r   modeling_flash_attention_utilsr   r   utilsr   
get_logger__name__r;   rA   Tensorr   Moduler   r"   floatintbooltuplerG   r0   r#   r   <module>rR      s1    h  
		H	%68 ELL %((// ekk (  !% !!%CHHOOC<<C 
C <<	C
 LL4'C C T\C $JC T\C d{C <<$C 5<<Cr#   