
    
3jT                        S SK r S SKrS SKJr  S SKrS SKJr  SSK	J
r
  \
R                  " \5      rSS\4S jjr\R                   R"                  R$                  SS4S\S\4S	 jjr " S
 S\R(                  5      r " S S\R(                  5      r " S S\R(                  5      r " S S\R(                  5      r " S S\R(                  5      rg)    N   )logging   key_chunk_sizec                   ^ ^^^^^^^^ TR                   SS u  nmmTR                   S   m[        TU5      mT [        R                  " T5      -  m [        R
                  " [        R                  SS9U4S j5       mUUUUU UUU4S jn[        R                  R                  U[        R                  " SUT5      S	9u  pxn	[        R                  " U	SS
S9n
[        R                  " X-
  5      nU[        R                  " USS9-  nX-  nUR                  SS9n[        R                  " US5      R                  SS9nX-  $ )zBMulti-head dot product attention with a limited number of queries.NF)prevent_csec                 F  > [         R                  " SXTS9n[         R                  " USSS9n[        R                  R                  U5      n[         R                  " X4-
  5      n[         R                  " SX%TS9n[         R                  " SU5      nXeR                  SS9U4$ )	Nz...qhd,...khd->...qhk)	precisionr	   Taxiskeepdimsz...vhf,...qhv->...qhfz...qhk->...qhr   )jnpeinsummaxjaxlaxstop_gradientexpsum)querykeyvalueattn_weights	max_scoreexp_weights
exp_valuesr   s          Y/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/attention_flax.pysummarize_chunk/_query_chunk_attention.<locals>.summarize_chunk#   s    zz"95QZ[GGLrDA	GG)))4	ggl67ZZ 7W`a
JJ	:	OOO4i@@    c           	      ^  > [         R                  R                  TS/TR                  S-
  -  U SS/-   [	        TR
                  S S 5      TTT/-   S9n[         R                  R                  T
S/T
R                  S-
  -  U SS/-   [	        T
R
                  S S 5      TTT	/-   S9nT" TX5      $ )Nr      r   operandstart_indicesslice_sizes)r   r   dynamic_slicendimlistshape)	chunk_idx	key_chunkvalue_chunk
k_featuresr   r   	num_headsr   r!   
v_featuresr   s      r    chunk_scanner-_query_chunk_attention.<locals>.chunk_scanner0   s    GG))#A.)Q1BBSYYs^,	:/VV * 
	 gg++#a0Iq!3DDU[["-..)Z1XX , 
 ui==r#   r   )fxsTr   r   )r-   minr   sqrt	functoolspartialr   
checkpointr   maparanger   r   expand_dimsr   )r   r   r   r   r   num_kvr4   chunk_valueschunk_weights	chunk_max
global_max	max_diffs
all_valuesall_weightsr1   r2   r!   r3   s   `````         @@@@r    _query_chunk_attentionrH      s   $'IIbcN!FIzRJ0NCHHZ((Es~~59
A :
A> >" .1WW[[=SZZXY[acqMr[-s*LT:J	./ICOOIB77LM!!q!)J//-488a8@K##r#   i   query_chunk_sizec           	         ^ ^^^^^^	^
^ T R                   SS u  m
m	mUUU	U
UUU UU4	S jn[        R                  R                  USS[        R
                  " T
T-  5      S9u  px[        R                  " USS9$ )a  
Flax Memory-efficient multi-head dot product attention. https://huggingface.co/papers/2112.05682v2
https://github.com/AminRezaei0x443/memory-efficient-attention

Args:
    query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
    key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
    value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
    precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
        numerical precision for computation
    query_chunk_size (`int`, *optional*, defaults to 1024):
        chunk size to divide query array value must divide query_length equally without remainder
    key_chunk_size (`int`, *optional*, defaults to 4096):
        chunk size to divide key and value array value must divide key_value_length equally without remainder

Returns:
    (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
r   Nc           	         >	 [         R                  R                  T	S/T	R                  S-
  -  U SS/-   [	        T	R
                  S S 5      [        T
T5      TT/-   S9nU T
-   [        UTTTTS94$ )Nr   r%   r   r&   )r   r   r   r   r   )r   r   r*   r+   r,   r-   r8   rH   )r.   _query_chunkr   r   r2   num_qr   
q_featuresr   rI   r   s      r    r4   5jax_memory_efficient_attention.<locals>.chunk_scannerf   s    gg++3%**q.1iA5FFU[["-.#6F2NPY[e1ff , 
 (("!s%9]k
 	
r#   r   )r6   initr7   lengthr   )r-   r   r   scanmathceilr   concatenate)r   r   r   r   rI   r   r4   rL   resr2   rN   rO   s   ``````   @@@r    jax_memory_efficient_attentionrX   O   sp    * $);;rs#3 E9j
 
 WW\\
yy!112	  FA ??3R((r#   c                       \ rS rSr% Sr\\S'   Sr\\S'   Sr\\S'   Sr	\
\S	'   S
r\\S'   S
r\\S'   \R                  r\R                   \S'   S rS rS rSS jrSrg)FlaxAttention   a  
A Flax multi-head attention module as described in: https://huggingface.co/papers/1706.03762

Parameters:
    query_dim (:obj:`int`):
        Input hidden states dimension
    heads (:obj:`int`, *optional*, defaults to 8):
        Number of heads
    dim_head (:obj:`int`, *optional*, defaults to 64):
        Hidden states dimension inside each head
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
        enable memory efficient attention https://huggingface.co/papers/2112.05682
    split_head_dim (`bool`, *optional*, defaults to `False`):
        Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
        enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`

	query_dim   heads@   dim_head        dropoutFuse_memory_efficient_attentionsplit_head_dimdtypec                    [         R                  S5        U R                  U R                  -  nU R                  S-  U l        [
        R                  " USU R                  SS9U l        [
        R                  " USU R                  SS9U l	        [
        R                  " USU R                  SS9U l
        [
        R                  " U R                  U R                  SS	9U l        [
        R                  " U R                  S
9U l        g )NFlax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.g      Fto_q)use_biasre   nameto_kto_vto_out_0)re   rj   rate)loggerwarningr`   r^   scalennDensere   r   r   r   r\   	proj_attnDropoutrb   dropout_layerself	inner_dims     r    setupFlaxAttention.setup   s    [	

 MMDJJ.	]]D(
 XXi%tzzPVW
88ITZZfUXXi%tzzPVW
$..

TZZT\\:r#   c                     UR                   u  p#nU R                  nUR                  X#XTU-  5      n[        R                  " US5      nUR                  X%-  X4U-  5      nU$ N)r   r      r%   r-   r^   reshaper   	transposery   tensor
batch_sizeseq_lendim	head_sizes         r    reshape_heads_to_batch_dim(FlaxAttention.reshape_heads_to_batch_dim   s[    #)<< 
SJJ	
Yy@PQv|4
 6	AQRr#   c                     UR                   u  p#nU R                  nUR                  X%-  XSU5      n[        R                  " US5      nUR                  X%-  X4U-  5      nU$ r~   r   r   s         r    reshape_batch_dim_to_heads(FlaxAttention.reshape_batch_dim_to_heads   sZ    #)<< 
SJJ	
 7SQv|4
 7	/Rr#   Nc                 6   Uc  UOUnU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       a  UR                  S   n[
        R                  " XGSU R                  U R                  45      n[
        R                  " XWSU R                  U R                  45      n	[
        R                  " XgSU R                  U R                  45      n
O3U R                  U5      nU R                  U5      n	U R                  U5      n
U R                  (       a  UR                  SSS5      nU	R                  SSS5      n	U
R                  SSS5      n
UR                  S   nUS-  S:X  a  [        US-  5      nO;US-  S:X  a  [        US-  5      nO#US-  S:X  a  [        US-  5      nO[        U5      n[        XXS	S
9nUR                  SSS5      nU R                  U5      nGOU R                  (       a  [
        R                  " SX5      nO[
        R                  " SX5      nXR                   -  n["        R$                  " XR                  (       a  SOSS9nU R                  (       aW  [
        R                  " SX5      nUR                  S   n[
        R                  " XSU R                  U R                  -  45      nO([
        R                  " SX5      nU R                  U5      nU R'                  U5      nU R)                  XS9$ )Nr   r	   r   r   r   r_         i @  )rI   r   zb t n h, b f n h -> b n f tzb i d, b j d->b i jr   zb n f t, b t n h -> b f n hzb i j, b j d -> b i ddeterministic)r   r   r   rd   r-   r   r   r^   r`   r   rc   r   intrX   r   r   rr   rs   softmaxru   rw   )ry   hidden_statescontextr   
query_projkey_proj
value_projbquery_states
key_statesvalue_statesflatten_latent_dimrI   attention_scoresattention_probss                  r    __call__FlaxAttention.__call__   s   #*?-ZZ.
88G$ZZ(
##A&A;;zr4::t}}3UVLX2tzz4==/QRJ;;zr4::t}}3UVL:::FL88BJ:::FL..'11!Q:L#--aA6J'11!Q:L
 ".!3!3B!7!B&!+#&'9B'>#? #b(A-#&'9B'>#? #a'1,#&'9A'=#> #&'9#: :,jrM *33Aq!<M ;;MJM ""#&::.KZ#f #&::.C\#^ /**< jj)9FYFY_`aO "" #

+H/ h!''* #Mr4::PTP]P]C];^ _ #

+BO b $ ? ? N}5!!-!MMr#   )rw   r   ru   r   rr   r   )NT)__name__
__module____qualname____firstlineno____doc__r   __annotations__r^   r`   rb   floatrc   boolrd   r   float32re   r{   r   r   r   __static_attributes__ r#   r    rZ   rZ      sg    , NE3NHcGU+0"D0 ND {{E399";"<Nr#   rZ   c                       \ rS rSr% Sr\\S'   \\S'   \\S'   Sr\\S'   Sr	\
\S	'   \R                  r\R                  \S
'   Sr\
\S'   Sr\
\S'   S rSS jrSrg)FlaxBasicTransformerBlock   a!  
A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
https://huggingface.co/papers/1706.03762


Parameters:
    dim (:obj:`int`):
        Inner hidden states dimension
    n_heads (:obj:`int`):
        Number of heads
    d_head (:obj:`int`):
        Hidden states dimension inside each head
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    only_cross_attention (`bool`, defaults to `False`):
        Whether to only apply cross attention.
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`
    use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
        enable memory efficient attention https://huggingface.co/papers/2112.05682
    split_head_dim (`bool`, *optional*, defaults to `False`):
        Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
        enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
r   n_headsd_headra   rb   Fonly_cross_attentionre   rc   rd   c           
         [         R                  S5        [        U R                  U R                  U R
                  U R                  U R                  U R                  U R                  S9U l
        [        U R                  U R                  U R
                  U R                  U R                  U R                  U R                  S9U l        [        U R                  U R                  U R                  S9U l        [        R                  " SU R                  S9U l        [        R                  " SU R                  S9U l        [        R                  " SU R                  S9U l        [        R&                  " U R                  S9U l        g )Nrg   re   )r   rb   re   h㈵>)epsilonre   rn   )rp   rq   rZ   r   r   r   rb   rc   rd   re   attn1attn2FlaxFeedForwardffrs   	LayerNormnorm1norm2norm3rv   rw   ry   s    r    r{   FlaxBasicTransformerBlock.setup!  s   [	
 #HHLLKKLL//**

 #HHLLKKLL//**

 "dhhDJJW\\$djjA
\\$djjA
\\$djjA
ZZT\\:r#   c                 `   UnU R                   (       a   U R                  U R                  U5      X#S9nOU R                  U R                  U5      US9nX-   nUnU R                  U R	                  U5      X#S9nX-   nUnU R                  U R                  U5      US9nX-   nU R                  XS9$ Nr   )r   r   r   r   r   r   r   rw   )ry   r   r   r   residuals        r    r   "FlaxBasicTransformerBlock.__call__A  s     $$ JJtzz-'@'JgM JJtzz-'@P]J^M%0 !

4::m#<g
c%0 !

= 9W%0!!-!MMr#   )r   r   rw   r   r   r   r   NT)r   r   r   r   r   r   r   rb   r   r   r   r   r   re   rc   rd   r{   r   r   r   r#   r    r   r      s`    2 
HLKGU!&$&{{E399"+0"D0 ND ;@Nr#   r   c                       \ rS rSr% Sr\\S'   \\S'   \\S'   Sr\\S'   Sr\	\S	'   S
r
\\S'   S
r\\S'   \R                  r\R                  \S'   S
r\\S'   S
r\\S'   S rSS jrSrg)FlaxTransformer2DModeliW  a  
A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
https://huggingface.co/papers/1506.02025


Parameters:
    in_channels (:obj:`int`):
        Input number of channels
    n_heads (:obj:`int`):
        Number of heads
    d_head (:obj:`int`):
        Hidden states dimension inside each head
    depth (:obj:`int`, *optional*, defaults to 1):
        Number of transformers block
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    use_linear_projection (`bool`, defaults to `False`): tbd
    only_cross_attention (`bool`, defaults to `False`): tbd
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`
    use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
        enable memory efficient attention https://huggingface.co/papers/2112.05682
    split_head_dim (`bool`, *optional*, defaults to `False`):
        Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
        enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
in_channelsr   r   r   depthra   rb   Fuse_linear_projectionr   re   rc   rd   c                 R   [         R                  S5        [        R                  " SSS9U l        U R
                  U R                  -  nU R                  (       a$  [        R                  " XR                  S9U l
        O'[        R                  " USSSU R                  S9U l
        [        U R                  5       Vs/ s HY  n[        UU R
                  U R                  U R                  U R                   U R                  U R"                  U R$                  S	9PM[     snU l        U R                  (       a$  [        R                  " XR                  S9U l        O'[        R                  " USSSU R                  S9U l        [        R*                  " U R                  S
9U l        g s  snf )Nrg       r   )
num_groupsr   r   )r   r   VALID)kernel_sizestridespaddingre   )rb   r   re   rc   rd   rn   )rp   rq   rs   	GroupNormnormr   r   r   rt   re   proj_inConvranger   r   rb   r   rc   rd   transformer_blocksproj_outrv   rw   )ry   rz   rL   s      r    r{   FlaxTransformer2DModel.setup~  sF   [	

 LLB=	LL4;;.	%%88IZZ@DL77"jjDL& 4::&#
 ' &%)%>%>jj/3/R/R#22	 '#
 %%HHYjjADMGG"jjDM  ZZT\\:3#
s   <A F$c                    UR                   u  pEpgUnU R                  U5      nU R                  (       a'  UR                  XEU-  U5      nU R	                  U5      nO&U R	                  U5      nUR                  XEU-  U5      nU R
                   H
  n	U	" XUS9nM     U R                  (       a$  U R                  U5      nUR                  XEXg5      nO#UR                  XEXg5      nU R                  U5      nX-   nU R                  XS9$ r   )r-   r   r   r   r   r   r   rw   )
ry   r   r   r   batchheightwidthchannelsr   transformer_blocks
             r    r   FlaxTransformer2DModel.__call__  s    )6)<)<&u 		-0%%)11%%RM LL7M LL7M)11%%RM!%!8!8-mTabM "9 %% MM-8M)11%QM)11%QM MM-8M%0!!-!MMr#   )rw   r   r   r   r   Nr   )r   r   r   r   r   r   r   r   rb   r   r   r   r   r   r   re   rc   rd   r{   r   r   r   r#   r    r   r   W  su    6 LKE3NGU"'4'!&$&{{E399"+0"D0 ND -;^Nr#   r   c                   x    \ rS rSr% Sr\\S'   Sr\\S'   \	R                  r\	R                  \S'   S rSS jrS	rg
)r   i  a  
Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
[`FeedForward`] class, with the following simplifications:
- The activation function is currently hardcoded to a gated linear unit from:
https://huggingface.co/papers/2002.05202
- `dim_out` is equal to `dim`.
- The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].

Parameters:
    dim (:obj:`int`):
        Inner hidden states dimension
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`
r   ra   rb   re   c                     [         R                  S5        [        U R                  U R                  U R
                  5      U l        [        R                  " U R                  U R
                  S9U l	        g )Nrg   r   )
rp   rq   	FlaxGEGLUr   rb   re   net_0rs   rt   net_2r   s    r    r{   FlaxFeedForward.setup  sL    [	
 txxtzzB
XXdhhdjj9
r#   c                 F    U R                  XS9nU R                  U5      nU$ r   r   r   )ry   r   r   s      r    r   FlaxFeedForward.__call__  s&    

=
N

=1r#   r   Nr   r   r   r   r   r   r   r   rb   r   r   r   re   r{   r   r   r   r#   r    r   r     s4    " 
HGU{{E399"	:r#   r   c                   x    \ rS rSr% Sr\\S'   Sr\\S'   \	R                  r\	R                  \S'   S rSS jrS	rg
)r   i  a  
Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
https://huggingface.co/papers/2002.05202.

Parameters:
    dim (:obj:`int`):
        Input hidden states dimension
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`
r   ra   rb   re   c                     [         R                  S5        U R                  S-  n[        R                  " US-  U R
                  S9U l        [        R                  " U R                  S9U l	        g )Nrg   r   r   r   rn   )
rp   rq   r   rs   rt   re   projrv   rb   rw   rx   s     r    r{   FlaxGEGLU.setup  sR    [	

 HHqL	HHY]$**=	ZZT\\:r#   c                     U R                  U5      n[        R                  " USSS9u  p4U R                  U[        R
                  " U5      -  US9$ )Nr   r   r   )r   r   splitrw   rs   gelu)ry   r   r   hidden_linearhidden_gelus        r    r   FlaxGEGLU.__call__	  sJ    		-0%(YY}aa%H"!!-"''+2F"FVc!ddr#   )rw   r   Nr   r   r   r#   r    r   r     s5     
HGU{{E399";er#   r   )r   )r:   rT   
flax.linenlinenrs   r   	jax.numpynumpyr   utilsr   
get_loggerr   rp   r   rH   r   	PrecisionHIGHESTrX   ModulerZ   r   r   r   r   r   r#   r    <module>r      s       
   
		H	%0$ 0$h "%!2!2!:!:TXpt-)NQ-)jm-)`|NBII |N~VN		 VNrlNRYY lN^$bii $Ne		 er#   