
    
3j                      L   S SK r S SKJr  S SKJr  S SKrS SKJr  S SKJs  J	r
  SSKJrJr  SSKJrJrJr  SSKJrJrJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJr  SSK J!r!J"r"J#r#J$r$  SSK%J&r&  SSK'J(r(  \RR                  " \*5      r+\ " S S\5      5       r, " S S5      r- " S S5      r.  SIS\R^                  S\R^                  S\R^                  S\0S\0S\-S-  S\R^                  4S jjr1S\2\R^                  S4   S \2\R^                  S4   S!\0S"\0S\2\R^                  S4   4
S# jr3S$\R^                  S%\R^                  S!\0S"\0S\R^                  4
S& jr4S'\R^                  S%\R^                  S(\0S!\0S"\0S\R^                  4S) jr5SJSKS+ jjr6SJSKS, jjr7SJSKS- jjr8 " S. S/\Rr                  5      r: " S0 S1\Rr                  5      r; " S2 S35      r< " S4 S55      r= " S6 S*\R                  Rr                  \5      r> " S7 S85      r? " S9 S:5      r@ " S; S<\R                  Rr                  \5      rA " S= S>\Rr                  5      rB " S? S@\Rr                  5      rC " SA SB\Rr                  5      rD " SC SD\Rr                  5      rE " SE SF\Rr                  5      rF " SG SH\&\\\\\\5	      rGg)L    N)	dataclass)Any   )ConfigMixinregister_to_config)FluxTransformer2DLoadersMixinFromOriginalModelMixinPeftAdapterMixin)
BaseOutputapply_lora_scalelogging   )ContextParallelInputContextParallelOutput)AttentionMixinAttentionModuleMixin)dispatch_attention_fn)
CacheMixin)TimestepEmbedding	Timestepsapply_rotary_embget_1d_rotary_pos_embed)
ModelMixin)AdaLayerNormContinuousc                   2    \ rS rSr% SrS\S'   SrS\S'   Srg)	Flux2Transformer2DModelOutput+   al  
The output of [`Flux2Transformer2DModel`].

Args:
    sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
        The hidden states output conditioned on the `encoder_hidden_states` input.
    kv_cache (`Flux2KVCache`, *optional*):
        The populated KV cache for reference image tokens. Only returned when `kv_cache_mode="extract"`.
ztorch.TensorsampleNFlux2KVCache | Nonekv_cache )__name__
__module____qualname____firstlineno____doc____annotations__r    __static_attributes__r!       i/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_flux2.pyr   r   +   s     &*H#*r)   r   c                       \ rS rSrSrS rS\R                  S\R                  4S jrS\	\R                  \R                  4   4S jr
S	 rS
rg)Flux2KVLayerCache;   zPer-layer KV cache for reference image tokens in the Flux2 Klein KV model.

Stores the K and V projections (post-RoPE) for reference tokens extracted during the first denoising step. Tensor
format: (batch_size, num_ref_tokens, num_heads, head_dim).
c                      S U l         S U l        g Nk_refv_refselfs    r*   __init__Flux2KVLayerCache.__init__B   s    *.
*.
r)   r1   r2   c                     Xl         X l        g)zStore reference token K/V.Nr0   )r4   r1   r2   s      r*   storeFlux2KVLayerCache.storeF   s    

r)   returnc                 b    U R                   c  [        S5      eU R                   U R                  4$ )z$Retrieve cached reference token K/V.z$KV cache has not been populated yet.)r1   RuntimeErrorr2   r3   s    r*   getFlux2KVLayerCache.getK   s+    ::EFFzz4::%%r)   c                      S U l         S U l        g r/   r0   r3   s    r*   clearFlux2KVLayerCache.clearQ   s    

r)   r0   N)r"   r#   r$   r%   r&   r5   torchTensorr8   tupler=   r@   r(   r!   r)   r*   r,   r,   ;   sJ    /5<<  
&U5<<56 &r)   r,   c                   T    \ rS rSrSrS\S\4S jrS\S\4S jrS\S\4S	 jr	S
 r
Srg)Flux2KVCacheV   zContainer for all layers' reference-token KV caches.

Holds separate cache lists for double-stream and single-stream transformer blocks.
num_double_layersnum_single_layersc                     [        U5       Vs/ s H  n[        5       PM     snU l        [        U5       Vs/ s H  n[        5       PM     snU l        SU l        g s  snf s  snf Nr   )ranger,   double_block_cachessingle_block_cachesnum_ref_tokens)r4   rH   rI   _s       r*   r5   Flux2KVCache.__init__\   sY    AFGXAY#ZAYA$5$7AY#Z AFGXAY#ZAYA$5$7AY#Z #$ $[#Zs
   AA 	layer_idxr:   c                      U R                   U   $ r/   )rM   r4   rR   s     r*   
get_doubleFlux2KVCache.get_doublea       ''	22r)   c                      U R                   U   $ r/   )rN   rT   s     r*   
get_singleFlux2KVCache.get_singled   rW   r)   c                     U R                    H  nUR                  5         M     U R                   H  nUR                  5         M     SU l        g rK   )rM   r@   rN   rO   )r4   caches     r*   r@   Flux2KVCache.clearg   s<    --EKKM .--EKKM .r)   )rM   rO   rN   N)r"   r#   r$   r%   r&   intr5   r,   rU   rY   r@   r(   r!   r)   r*   rF   rF   V   sG    
%# %# %
3C 3,= 33C 3,= 3 r)   rF   querykeyvaluenum_txt_tokensrO   r    r:   c                    US:X  a  Uc
  [        XX&S9$ Ubn  UR                  5       u  px[        R                  " USS2SU24   XqSS2US24   /SS9n	[        R                  " USS2SU24   XSS2US24   /SS9n
[        X	XS9$ UnX4-   nU SS2SU24   nU SS2X24   nU SS2US24   nUSS2SU24   nUSS2X24   nUSS2US24   nUSS2SU24   nUSS2X24   nUSS2US24   n[        R                  " X/SS9n[        R                  " UUU/SS9n	[        R                  " UUU/SS9n
[        UXUS9nUSS2SU24   nUSS2US24   n[        XXS9n[        R                  " UUU/SS9$ )a  Causal attention for KV caching where reference tokens only self-attend.

All tensors use the diffusers convention: (batch_size, seq_len, num_heads, head_dim).

Without cache (extract mode): sequence layout is [txt, ref, img]. txt+img tokens attend to all tokens, ref tokens
only attend to themselves. With cache (cached mode): sequence layout is [txt, img]. Cached ref K/V are injected
between txt and img.
r   Nbackend   dim)r   r=   rB   cat)r_   r`   ra   rb   rO   r    re   r1   r2   k_allv_all	ref_startref_endq_txtq_refq_imgk_txtk_imgv_txtv_img	q_txt_imgattn_txt_imgattn_txtattn_imgattn_refs                            r*   _flux2_kv_causal_attentionrz   o   s   $ x/$UHH||~		3q/>/12Eq./?Q;RSYZ[		5O^O!34e1noCU=VW]^_$U5JJ I-G!ZiZ- E!Y&&'E!WX+E:I:E9$$%E78E!ZiZ- E!Y&&'E!WX+E 		5.a0IIIueU+3EIIueU+3E(E'RLAz	zM*HAyzM*H %U5JH99h(3;;r)   
img_params.
ref_paramsnum_refseq_lenc                 j   / n[        X5       H  u  pVUR                  S:X  a"  UR                  S5      nUR                  S5      nUR                  S   nUR	                  [
        R                  " UR                  XrS5      UR                  XsS5      SS2US2SS24   /SS95        M     [        U5      $ )zSBlend modulation parameters so that the first `num_ref` positions use `ref_params`.r   rf   r   Nrg   )	zipndim	unsqueezeshapeappendrB   ri   expandrD   )r{   r|   r}   r~   blendedimrmBs           r*   _blend_mod_paramsr      s     Gj-77a<aBaBHHQKII1r*BIIa",EaSTn,UV	
 . >r)   img_modref_modc                 v   U R                   S:X  a"  U R                  S5      n UR                  S5      n[        R                  " U SSS9n[        R                  " USSS9nUSS USS 4nUSS USS 4n/ n[	        Xg5       H"  u  p[        XX#5      nUR                  U5        M$     [        R                  " USS9$ )zBlend double-block image-stream modulations for a [ref, img] sequence layout.

Takes raw modulation tensors (before `Flux2Modulation.split`) and returns a blended raw tensor that is compatible
with `Flux2Modulation.split(mod, 2)`.
r   rf      r   rg   r   r   )r   r   rB   chunkr   r   extendri   )r   r   r}   r~   
img_chunks
ref_chunksimg_modsref_mods
all_paramsimg_setref_setr   s               r*   _blend_double_block_modsr      s     ||q##A&##A&WaR0JWaR0J1QAa1H1QAa1HJ3#GgG'" 4 99ZR((r)   
single_modnum_txtc                 ^   U R                   S:X  a"  U R                  S5      n UR                  S5      n[        R                  " U SSS9n[        R                  " USSS9n/ n[	        XV5       H  u  pUR                   S:X  a"  UR                  S5      nU	R                  S5      n	UR
                  S   n
UR                  XS5      nU	R                  XS5      nUR                  [        R                  " USS2SU2SS24   XSS2X#-   S2SS24   /SS95        M     [        R                  " USS9$ )zBlend single-block modulations for a [txt, ref, img] sequence layout.

Takes raw modulation tensors and returns a blended raw tensor compatible with `Flux2Modulation.split(mod, 1)`.
r   rf   r   r   rg   r   N)	r   r   rB   r   r   r   r   r   ri   )r   r   r   r}   r~   r{   r|   r   r   r   r   im_expandedrm_expandeds                r*   _blend_single_block_modsr      s    !))!,
##A&Z3JWaR0JGj-77a<aBaBHHQKiiB/iiB/IIQ!^,kq'J[J]_`G`;ab	
 . 99W"%%r)   Flux2Attentionc                    U R                  U5      nU R                  U5      nU R                  U5      nS =n=pxUb@  U R                  b3  U R	                  U5      nU R                  U5      nU R                  U5      nX4XVXx4$ r/   )to_qto_kto_vadded_kv_proj_dim
add_q_proj
add_k_proj
add_v_proj	attnhidden_statesencoder_hidden_statesr_   r`   ra   encoder_queryencoder_keyencoder_values	            r*   _get_projectionsr      s    IIm$E
))M
"CIIm$E266M6K(T-C-C-O(=>oo&;<(=>u[GGr)   c                     U R                  U5      R                  SSS9u  p4nS=n=pxUb3  [        U S5      (       a"  U R                  U5      R                  SSS9u  pgnX4XVXx4$ )Nr   r   rg   r/   to_added_qkv)to_qkvr   hasattrr   r   s	            r*   _get_fused_projectionsr     sy    M2888CE299M9K(WT>-J-J484E4EF[4\4b4bcdjl4b4m1Mu[GGr)   c                 T    U R                   (       a  [        XU5      $ [        XU5      $ r/   )fused_projectionsr   r   )r   r   r   s      r*   _get_qkv_projectionsr     s'    %d;PQQD1FGGr)   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )Flux2SwiGLUi  z
Flux 2 uses a SwiGLU-style activation in the transformer feedforward sub-blocks, but with the linear projection
layer fused into the first linear layer of the FF sub-block. Thus, this module has no trainable parameters.
c                 V   > [         TU ]  5         [        R                  " 5       U l        g r/   )superr5   nnSiLUgate_fn)r4   	__class__s    r*   r5   Flux2SwiGLU.__init__!  s    wwyr)   xr:   c                 R    UR                  SSS9u  p#U R                  U5      U-  nU$ )Nr   r   rg   )r   r   )r4   r   x1x2s       r*   forwardFlux2SwiGLU.forward%  s.    #LLr!r)   )r   )r"   r#   r$   r%   r&   r5   rB   rC   r   r(   __classcell__r   s   @r*   r   r     s-    
! %,,  r)   r   c                      ^  \ rS rSr    SS\S\S-  S\S\S-  S\4
U 4S jjjrS	\R                  S
\R                  4S jr
SrU =r$ )Flux2FeedForwardi+  Nrh   dim_outmult	inner_dimbiasc                    > [         TU ]  5         Uc  [        X-  5      nU=(       d    Un[        R                  " XS-  US9U l        [        5       U l        [        R                  " XBUS9U l        g )Nr   r   )	r   r5   r^   r   Linear	linear_inr   act_fn
linear_out)r4   rh   r   r   r   r   r   s         r*   r5   Flux2FeedForward.__init__,  s]     	CJI.S 3ADA!m))ITBr)   r   r:   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r/   )r   r   r   )r4   r   s     r*   r   Flux2FeedForward.forward>  s0    NN1KKNOOAr)   )r   r   r   )N      @NF)r"   r#   r$   r%   r^   floatboolr5   rB   rC   r   r(   r   r   s   @r*   r   r   +  sz     # $CC tC 	C
 :C C C$ %,,  r)   r   c                       \ rS rSrSrSrS r   SSSS\R                  S\R                  S\R                  S-  S	\R                  S-  S
\R                  4S jjr	Sr
g)Flux2AttnProcessoriE  Nc                 r    [        [        S5      (       d"  [        U R                  R                   S35      eg Nscaled_dot_product_attentionz; requires PyTorch 2.0. Please upgrade your pytorch version.r   FImportErrorr   r"   r3   s    r*   r5   Flux2AttnProcessor.__init__I  3    q899!8!8 99tuvv :r)   r   r   r   r   attention_maskimage_rotary_embr:   c           	         [        XU5      u  pgppUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  U5      nUR	                  U5      nUR
                  b  U	R                  SUR                  S45      n	U
R                  SUR                  S45      n
UR                  SUR                  S45      nUR                  U	5      n	UR                  U
5      n
[        R                  " X/SS9n[        R                  " X/SS9n[        R                  " X/SS9nUb  [        XeSS9n[        XuSS9n[        UUUUU R                  U R                  S9nUR                  SS5      nUR                  UR                   5      nUbO  UR#                  UR$                  S   UR$                  S   UR$                  S   -
  /SS9u  p2UR'                  U5      nUR(                  S   " U5      nUR(                  S   " U5      nUb  X#4$ U$ )	Nr   rf   rg   sequence_dim	attn_maskre   parallel_configr   r   r   )r   	unflattenheadsnorm_qnorm_kr   norm_added_qnorm_added_krB   ri   r   r   _attention_backend_parallel_configflattentodtypesplit_with_sizesr   
to_add_outto_out)r4   r   r   r   r   r   r_   r`   ra   r   r   r   s               r*   __call__Flux2AttnProcessor.__call__M  sI    H\!6H
DE+ TZZ$45mmBR 01TZZ$45E"kk#!!-)33BR8HIM%//TZZ4DEK)33BR8HIM --m<M++K8KII}4!<E))[.A6CII}4!<E'$U1ME"3qIC-$++ 11
 &--a3%((5 ,3@3Q3Q&,,Q/1D1DQ1GJ_JeJefgJh1hiop 4R 40! %)OO4I$J!A}5A}5 , 77  r)   r!   NNNr"   r#   r$   r%   r   r   r5   rB   rC   r   r(   r!   r)   r*   r   r   E  s    w /3.204:!:! ||:!  %||	:!
 t+:!  ,,-:! 
:! :!r)   r   c                       \ rS rSrSrSrSrS r      SSSS\R                  S\R                  S	\R                  S-  S
\R                  S-  S\
S-  S\S-  S\S\R                  4S jjrSrg)Flux2KVAttnProcessori  a  
Attention processor for Flux2 double-stream blocks with KV caching support for reference image tokens.

When `kv_cache_mode` is "extract", reference token K/V are stored in the cache after RoPE and causal attention is
used (ref tokens self-attend only, txt+img attend to all). When `kv_cache_mode` is "cached", cached ref K/V are
injected during attention. When no KV args are provided, behaves identically to `Flux2AttnProcessor`.
Nc                 r    [        [        S5      (       d"  [        U R                  R                   S35      eg r   r   r3   s    r*   r5   Flux2KVAttnProcessor.__init__  r   r)   r   r   r   r   r   r   r    kv_cache_moderO   r:   c	           
         [        XU5      u  pppU	R                  SUR                  S45      n	U
R                  SUR                  S45      n
UR                  SUR                  S45      nUR                  U	5      n	UR	                  U
5      n
UR
                  b  UR                  SUR                  S45      nUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  U5      nUR                  U5      n[        R                  " X/SS9n	[        R                  " X/SS9n
[        R                  " X/SS9nUb  [        XSS9n	[        XSS9n
Ub  UR                  S   OSnUS:X  aO  UbL  US:  aF  UnX-   nUR                  U
S S 2UU24   R                  5       US S 2UU24   R                  5       5        US:X  a  US:  a  [        XXXR                  S9nOBUS:X  a  Ub  [        XXSX`R                  S	9nO"[!        U	U
UUU R                  U R"                  S
9nUR%                  SS5      nUR'                  U	R(                  5      nUbO  UR+                  UR                  S   UR                  S   UR                  S   -
  /SS9u  p2UR-                  U5      nUR.                  S   " U5      nUR.                  S   " U5      nUb  X#4$ U$ )Nr   rf   rg   r   r   extractrd   cachedr    re   r   r   r   )r   r   r   r   r   r   r   r   rB   ri   r   r   r8   clonerz   r   r   r   r   r   r   r   r   r   )r4   r   r   r   r   r   r    r  rO   r_   r`   ra   r   r   r   rb   rl   rm   s                     r*   r   Flux2KVAttnProcessor.__call__  s    H\!6H
DE+ TZZ$45mmBR 01TZZ$45E"kk#!!-)33BR8HIM%//TZZ4DEK)33BR8HIM --m<M++K8KII}4!<E))[.A6CII}4!<E'$U1ME"3qIC;P;\.44Q7bc I%(*>>TUCU&I$5GNN3q)G"334::<eAyQXGXDX>Y>_>_>ab I%.1*<6E>KbKbM h&8+?6E1xQhQhM 2(// $ 5 5M &--a3%((5 ,3@3Q3Q&,,Q/1D1DQ1GJ_JeJefgJh1hiop 4R 40! %)OO4I$J!A}5A}5 , 77  r)   r!   )NNNNNr   r"   r#   r$   r%   r&   r   r   r5   rB   rC   r,   strr^   r   r(   r!   r)   r*   r  r    s     w /3.204-1$(P!P! ||P!  %||	P!
 t+P!  ,,-P! $d*P! TzP! P! 
P! P!r)   r  c                   (  ^  \ rS rSr\r\\/r           SS\S\S\S\	S\
S\S-  S	\
S-  S
\
S\	S\S\
4U 4S jjjr   SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  4
S jjrSrU =r$ )r   i  N	query_dimr   dim_headdropoutr   r   added_proj_biasout_biasepsout_dimelementwise_affinec                   > [         TU ]  5         X0l        U
b  U
OX2-  U l        Xl        U
b  U
OUU l        U
b  X-  OUU l        XPl        X@l        X`l	        Xpl
        [        R                  R                  XR                  US9U l        [        R                  R                  XR                  US9U l        [        R                  R                  XR                  US9U l        [        R                  R#                  X9US9U l        [        R                  R#                  X9US9U l        [        R                  R)                  / 5      U l        U R*                  R-                  [        R                  R                  U R                  U R
                  US95        U R*                  R-                  [        R                  R/                  U5      5        Ub  [        R                  R#                  X9S9U l        [        R                  R#                  X9S9U l        [        R                  R                  X`R                  US9U l        [        R                  R                  X`R                  US9U l        [        R                  R                  X`R                  US9U l        [        R                  R                  U R                  XS9U l        Uc  U R=                  5       nU R?                  U5        g )Nr   r  r  )r  ) r   r5   head_dimr   r  r  r   use_biasr  r   r  rB   r   r   r   r   r   RMSNormr   r   
ModuleListr   r   Dropoutr   r   r   r   r   r   _default_processor_clsset_processor)r4   r  r   r  r  r   r   r  r  r  r  r  	processorr   s                r*   r5   Flux2Attention.__init__  s$    	 $+$7X=M"")"5w9,3,?W(U
!2.HHOOI~~DOI	HHOOI~~DOI	HHOOI~~DOI	 hh&&xM_&`hh&&xM_&`hh))"-588??4>>4<<h?WX588++G45( % 0 0 0 CD % 0 0 0 CD#hhoo.?VeofDO#hhoo.?VeofDO#hhoo.?VeofDO#hhoodnnioWDO335I9%r)   r   r   r   r   r:   c                    [        [        R                  " U R                  R                  5      R
                  R                  5       5      nUR                  5        VVs/ s H  u  pxXv;  d  M  UPM     n	nn[        U	5      S:  a:  [        R                  SU	 SU R                  R                  R                   S35        UR                  5        VV
s0 s H  u  pzXv;   d  M  Xz_M     nnn
U R                  " XX#U40 UD6$ s  snnf s  sn
nf Nr   zjoint_attention_kwargs z are not expected by z and will be ignored.setinspect	signaturer  r   
parameterskeysitemslenloggerwarningr   r"   )r4   r   r   r   r   kwargsattn_parameterskrP   unused_kwargsws              r*   r   Flux2Attention.forward&  s     g//0G0GHSSXXZ['-||~R~tq9Q~R}!NN)-8MdnnNfNfNoNoMp  qF  G $*<<>J>41Q5I$!$>J~~d3HZjuntuu S
 Ks   D .D D"D)r   r   r   r   r  r  r  r   r   r   r   r   r   r  r  r   r   r   r   r   r  )   @           FNTTh㈵>NTNr   )r"   r#   r$   r%   r   r  r  _available_processorsr^   r   r   r5   rB   rC   r   r(   r   r   s   @r*   r   r     s&   //1EF
 (,'+#'3&3& 3& 	3&
 3& 3& :3& 3& 3& 3& 3& !3& 3&p 6:.204v||v  %||d2v t+	v
  ,,-v 
v vr)   c                       \ rS rSrSrSrS r  SSSS\R                  S\R                  S-  S\R                  S-  S	\R                  4
S
 jjr	Sr
g)Flux2ParallelSelfAttnProcessori8  Nc                 r    [        [        S5      (       d"  [        U R                  R                   S35      eg r   r   r3   s    r*   r5   'Flux2ParallelSelfAttnProcessor.__init__<  r   r)   r   Flux2ParallelSelfAttentionr   r   r   r:   c           	         UR                  U5      n[        R                  " USUR                  -  UR                  UR
                  -  /SS9u  pVUR                  SSS9u  pxn	UR                  SUR                  S45      nUR                  SUR                  S45      nU	R                  SUR                  S45      n	UR                  U5      nUR                  U5      nUb  [        XtSS9n[        XSS9n[        UUU	UU R                  U R                  S9nUR                  SS5      nUR!                  UR"                  5      nUR%                  U5      n[        R&                  " X&/SS9nUR)                  U5      nU$ )Nr   r   rg   rf   r   r   r   )to_qkv_mlp_projrB   splitr   mlp_hidden_dimmlp_mult_factorr   r   r   r   r   r   r   r   r   r   r   r   
mlp_act_fnri   r   )
r4   r   r   r   r   qkvmlp_hidden_statesr_   r`   ra   s
             r*   r   'Flux2ParallelSelfAttnProcessor.__call__@  su    ,,];!&A.0C0CdFZFZ0Z[ac"

  IIaRI0ETZZ$45mmBR 01TZZ$45E"kk#'$U1ME"3qIC-$++ 11
 &--a3%((5 !OO,=> 		="D"MM2r)   r!   NNr   r!   r)   r*   r9  r9  8  sp    w /304-*- ||- t+	-
  ,,-- 
- -r)   r9  c                       \ rS rSrSrSrSrS r      SSSS\R                  S\R                  S-  S	\R                  S-  S
\
S-  S\S-  S\S\S\R                  4S jjrSrg) Flux2KVParallelSelfAttnProcessorip  ap  
Attention processor for Flux2 single-stream blocks with KV caching support for reference image tokens.

When `kv_cache_mode` is "extract", reference token K/V are stored and causal attention is used. When
`kv_cache_mode` is "cached", cached ref K/V are injected during attention. When no KV args are provided, behaves
identically to `Flux2ParallelSelfAttnProcessor`.
Nc                 r    [        [        S5      (       d"  [        U R                  R                   S35      eg r   r   r3   s    r*   r5   )Flux2KVParallelSelfAttnProcessor.__init__|  r   r)   r   r<  r   r   r   r    r  rb   rO   r:   c	           
      .   UR                  U5      n	[        R                  " U	SUR                  -  UR                  UR
                  -  /SS9u  pU
R                  SSS9u  pnUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  U5      nUR                  U5      nUb  [        XSS9n[        XSS9nUS:X  aO  UbL  US:  aF  UnXx-   nUR                  US S 2UU24   R                  5       US S 2UU24   R                  5       5        US:X  a  US:  a  [        XXXR                  S9nOBUS	:X  a  Ub  [        XXSXPR                  S
9nO"[!        UUUUU R                  U R"                  S9nUR%                  SS5      nUR'                  UR(                  5      nUR+                  U5      n[        R,                  " UU/SS9nUR/                  U5      nU$ )Nr   r   rg   rf   r   r  r   rd   r  r  r   r   )r>  rB   r?  r   r@  rA  r   r   r   r   r   r   r8   r	  rz   r   r   r   r   r   r   rB  ri   r   )r4   r   r   r   r   r    r  rb   rO   hidden_states_projrC  rD  r_   r`   ra   rl   rm   attn_outputs                     r*   r   )Flux2KVParallelSelfAttnProcessor.__call__  s,    "11-@!&T^^!3T5H5H4K_K_5_ `fh"
  IIaRI0ETZZ$45mmBR 01TZZ$45E"kk#'$U1ME"3qIC I%(*>>TUCU&I$5GNN3q)G"334::<eAyQXGXDX>Y>_>_>ab I%.1*<4E>KbKbK h&8+?4E1xQhQhK 0(// $ 5 5K "))!Q/!nnU[[1 !OO,=> 		;0A"BKM2r)   r!   )NNNNr   r   r  r!   r)   r*   rH  rH  p  s     w /304-1$(A*A ||A t+	A
  ,,-A $d*A TzA A A 
A Ar)   rH  c                     ^  \ rS rSrSr\r\\/rSr	           SS\
S\
S\
S\S	\S
\S\S\
S\S\S\
4U 4S jjjr  SS\R                  S\R                  S-  S\R                  S-  S\R                  4S jjrSrU =r$ )r<  i  a  
Flux 2 parallel self-attention for the Flux 2 single-stream transformer blocks.

This implements a parallel transformer block, where the attention QKV projections are fused to the feedforward (FF)
input projections, and the attention output projections are fused to the FF output projections. See the [ViT-22B
paper](https://arxiv.org/abs/2302.05442) for a visual depiction of this type of transformer block.
FNr  r   r  r  r   r  r  r  r  	mlp_ratiorA  c                   > [         TU ]  5         X0l        Ub  UOX2-  U l        Xl        Ub  UOUU l        Ub  X-  OUU l        XPl        X@l        Xl	        [        XR                  -  5      U l        Xl        [        R                  R                  U R                  U R                  S-  U R                  U R                  -  -   US9U l        [#        5       U l        [        R                  R'                  X7U	S9U l        [        R                  R'                  X7U	S9U l        [        R                  R                  U R                  U R                  -   U R
                  US9U l        Uc  U R/                  5       nU R1                  U5        g )Nr   r   r  )r   r5   r  r   r  r  r   r  r  rP  r^   r@  rA  rB   r   r   r>  r   rB  r  r   r   r   r  r  )r4   r  r   r  r  r   r  r  r  r  rP  rA  r  r   s                r*   r5   #Flux2ParallelSelfAttention.__init__  sQ    	 $+$7X=M"")"5w9,3,?W(U
"!)nn"<=.  %xxNNDNNQ.1D1DtG[G[1[[bf  /  
 &- hh&&xM_&`hh&&xM_&` hhoodnnt7J7J&JDLL_goh335I9%r)   r   r   r   r:   c                    [        [        R                  " U R                  R                  5      R
                  R                  5       5      nUR                  5        VVs/ s H  u  pgXe;  d  M  UPM     nnn[        U5      S:  a:  [        R                  SU SU R                  R                  R                   S35        UR                  5        VV	s0 s H  u  piXe;   d  M  Xi_M     nnn	U R                  " XX#40 UD6$ s  snnf s  sn	nf r"  r#  )
r4   r   r   r   r-  r.  r/  rP   r0  r1  s
             r*   r   "Flux2ParallelSelfAttention.forward  s     g//0G0GHSSXXZ['-||~R~tq9Q~R}!NN)-8MdnnNfNfNoNoMp  qF  G $*<<>J>41Q5I$!$>J~~d>^W]^^ S
 Ks   C?.C?D"D)r  r  r   r   rB  r@  rA  rP  r   r   r  r  r   r>  r  )r3  r4  r5  FTr6  NTg      @r   NrF  )r"   r#   r$   r%   r&   r9  r  rH  r7  _supports_qkv_fusionr^   r   r   r5   rB   rC   r   r(   r   r   s   @r*   r<  r<    s    <;=]^ 
 #' -&-& -& 	-&
 -& -& -& -& -& !-& -& -& -&d /304	_||_ t+_  ,,-	_ 
_ _r)   r<  c                   F  ^  \ rS rSr   SS\S\S\S\S\S\4U 4S jjjr    SS
\R                  S\R                  S	-  S\R                  S\
\R                  \R                  4   S	-  S\\\4   S	-  S\S\S	-  S\
\R                  \R                  4   4S jjrSrU =r$ )Flux2SingleTransformerBlocki  rh   num_attention_headsattention_head_dimrP  r  r   c                    > [         TU ]  5         [        R                  " USUS9U l        [        UUUUUUUUS[        5       S9
U l        g )NFr  r  r   )
r  r  r   r  r   r  r  rP  rA  r  )r   r5   r   	LayerNormnormr<  r9  r   r4   rh   rX  rY  rP  r  r   r   s          r*   r5   $Flux2SingleTransformerBlock.__init__  sS     	LLCH	
 /'%46
	r)   Nr   r   temb_modr   joint_attention_kwargssplit_hidden_statestext_seq_lenr:   c                    Ub%  UR                   S   n[        R                  " X!/SS9n[        R	                  US5      S   u  pn
U R                  U5      nSU	-   U-  U-   nU=(       d    0 nU R                  " SUUS.UD6nXU-  -   nUR                  [        R                  :X  a  UR                  SS5      nU(       a  US S 2S U24   US S 2US 24   pX!4$ U$ )Nrf   rg   r   )r   r       r!   )
r   rB   ri   Flux2Modulationr?  r]  r   r   float16clip)r4   r   r   r`  r   ra  rb  rc  	mod_shift	mod_scalemod_gatenorm_hidden_statesrM  s                r*   r   #Flux2SingleTransformerBlock.forward0  s    !,066q9L!II'<&LRSTM)8)>)>x)KA)N&	h!YY}5)m/AAIM!7!=2ii 
,-
 %
 &;(>>%--/)..vu=M3@M\MAQ3RTabceqerbrTs=(77  r)   )r   r]  r   ư>F)NNFNr"   r#   r$   r%   r^   r   r   r5   rB   rC   rD   dictr  r   r   r(   r   r   s   @r*   rW  rW    s    

 !
  	

 
 
 
 
D FJ8<$)#'$!||$!  %||d2$! ,,	$!
  ell :;dB$! !%S#X 5$! "$! Dj$! 
u||U\\)	*$! $!r)   rW  c                   F  ^  \ rS rSr   SS\S\S\S\S\S\4U 4S jjjr  SS
\R                  S\R                  S\R                  S\R                  S\
\R                  \R                  4   S	-  S\\\4   S	-  S\
\R                  \R                  4   4S jjrSrU =r$ )Flux2TransformerBlockiW  rh   rX  rY  rP  r  r   c                   > [         TU ]  5         [        X-  5      U l        [        R
                  " USUS9U l        [        R
                  " USUS9U l        [        UUUUUUUUU[        5       S9
U l
        [        R
                  " USUS9U l        [        XXFS9U l        [        R
                  " USUS9U l        [        XXFS9U l        g )NFr[  )
r  r   r  r   r  r   r  r  r  r  )rh   r   r   r   )r   r5   r^   r@  r   r\  norm1norm1_contextr   r   r   norm2r   ffnorm2_context
ff_contextr^  s          r*   r5   Flux2TransformerBlock.__init__X  s     	!#/2\\#%SI
\\#%SQ"!'% (*
	 \\#%SI
"siS\\#%SQ*si[r)   Nr   r   temb_mod_imgtemb_mod_txtr   ra  r:   c                    U=(       d    0 n[         R                  US5      u  u  pxn	u  pn[         R                  US5      u  u  pnu  nnnU R                  U5      nSU-   U-  U-   nU R                  U5      nSU-   U-  U-   nU R                  " SUUUS.UD6nUu  nnU	U-  nUU-   nU R                  U5      nUSU-   -  U
-   nU R                  U5      nXU-  -   nUU-  nUU-   nU R                  U5      nUSU-   -  U-   nU R                  U5      nUUU-  -   nUR                  [        R                  :X  a  UR                  SS5      nX!4$ )Nr   rf   )r   r   r   re  rf  r!   )rg  r?  rv  rw  r   rx  ry  rz  r{  r   rB   rh  ri  )r4   r   r   r}  r~  r   ra  	shift_msa	scale_msagate_msa	shift_mlp	scale_mlpgate_mlpc_shift_msac_scale_msa
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlprm  norm_encoder_hidden_statesattention_outputsrM  context_attn_output	ff_outputcontext_ff_outputs                             r*   r   Flux2TransformerBlock.forwardz  s    "8!=2 N]MbMbcoqrMsJ(x*J9YhYnYn!Z
V.:0Vk:
 "ZZ6)m/AAIM &*%7%78M%N"&'+o9S%SVa%a" !II 
,"<-
 %	
 ,=(( ,%3!ZZ6/1y=AIMGG./	%9(<< )+>> 58K K%)%7%78M%N"%?1{?%SVa%a" OO,FG 5
EV8V V &&%--7$9$>$>vu$M!$33r)   )r   ry  r{  r@  rv  rw  rx  rz  ro  rF  rq  r   s   @r*   rt  rt  W  s      \ \ ! \  	 \
  \  \  \  \P FJ8<94||94  %||94 ll	94
 ll94  ell :;dB94 !%S#X 594 
u||U\\)	*94 94r)   rt  c                   t   ^  \ rS rSrS\S\\   4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	Flux2PosEmbedi  thetaaxes_dimc                 :   > [         TU ]  5         Xl        X l        g r/   )r   r5   r  r  )r4   r  r  r   s      r*   r5   Flux2PosEmbed.__init__  s    
 r)   idsr:   c           
         / n/ nUR                  5       nUR                  R                  S:H  nUR                  R                  S:H  nU(       d  U(       a  [        R                  O[        R
                  n[        [        U R                  5      5       HQ  n[        U R                  U   USU4   U R                  SSUS9u  pUR                  U	5        UR                  U
5        MS     [        R                  " USS9R                  UR                  5      n[        R                  " USS9R                  UR                  5      nX4$ )Nmpsnpu.T)r  repeat_interleave_realuse_realfreqs_dtyper   rg   )r   devicetyperB   float32float64rL   r*  r  r   r  r   ri   r   )r4   r  cos_outsin_outposis_mpsis_npur  icossin	freqs_cos	freqs_sins                r*   r   Flux2PosEmbed.forward  s    iikE)E)(.&emmu}}s4==)*A.a CFjj'+'HC NN3NN3 + IIg2.11#**=	IIg2.11#**=	##r)   )r  r  )r"   r#   r$   r%   r^   listr5   rB   rC   r   r(   r   r   s   @r*   r  r    s:    !c !T#Y !
$5<< $ELL $ $r)   r  c            	          ^  \ rS rSr    SS\S\S\S\4U 4S jjjrS\R                  S\R                  S	\R                  4S
 jr	Sr
U =r$ )Flux2TimestepGuidanceEmbeddingsi  in_channelsembedding_dimr   guidance_embedsc                    > [         TU ]  5         [        USSS9U l        [	        XUS9U l        U(       a  [	        XUS9U l        g S U l        g )NTr   )num_channelsflip_sin_to_cosdownscale_freq_shift)r  time_embed_dimsample_proj_bias)r   r5   r   	time_projr   timestep_embedderguidance_embedder)r4   r  r  r   r  r   s        r*   r5   (Flux2TimestepGuidanceEmbeddings.__init__  sV     	"Thij!2#TX"
 %6'X\&D" &*D"r)   timestepguidancer:   c                    U R                  U5      nU R                  UR                  UR                  5      5      nUbN  U R                  bA  U R                  U5      nU R	                  UR                  UR                  5      5      nXF-   nU$ U$ r/   )r  r  r   r   r  )r4   r  r  timesteps_projtimesteps_embguidance_projguidance_embtime_guidance_embs           r*   r   'Flux2TimestepGuidanceEmbeddings.forward  s    1..~/@/@/PQD$:$:$F NN84M11-2B2B8>>2RSL - <$$  r)   )r  r  r  )   i   FT)r"   r#   r$   r%   r^   r   r5   rB   rC   r   r(   r   r   s   @r*   r  r    sl     ! $** * 	*
 * **
! 
! 
! 
! 
!r)   r  c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S jr	\
S	\R                  S\S\\\R                  \R                  \R                  4   S
4   4S j5       rSrU =r$ )rg  i  rh   mod_param_setsr   c                    > [         TU ]  5         X l        [        R                  " XS-  U R                  -  US9U l        [        R                  " 5       U l        g )Nr   r   )r   r5   r  r   r   linearr   r   )r4   rh   r  r   r   s       r*   r5   Flux2Modulation.__init__  sB    ,ii1Wt/B/B%BNggir)   tembr:   c                 J    U R                  U5      nU R                  U5      nU$ r/   )r   r  )r4   r  mods      r*   r   Flux2Modulation.forward  s#    kk$kk#
r)   r  .c                    ^ U R                   S:X  a  U R                  S5      n [        R                  " U SU-  SS9m[	        U4S j[        U5       5       5      $ )Nr   rf   r   r   rg   c              3   @   >#    U  H  nTS U-  S US-   -   v   M     g7f)r   rf   Nr!   ).0r  
mod_paramss     r*   	<genexpr>(Flux2Modulation.split.<locals>.<genexpr>  s&     T>SZAQU4>Ss   )r   r   rB   r   rD   rL   )r  r  r  s     @r*   r?  Flux2Modulation.split  sK     88q=--"C[[a.&8bA
TeN>STTTr)   )r   r  r  )r   F)r"   r#   r$   r%   r^   r   r5   rB   rC   r   staticmethodrD   r?  r(   r   r   s   @r*   rg  rg    s     C        ELL U\\ 
 U5<< U UuU\\SXS_S_afamamEm?nps?s9t U Ur)   rg  c                   (  ^  \ rS rSrSrSrSS/rSS/rSS/r\	" SS	S
S9\	" SS	S
S9\	" SS	S
S9\	" SS	S
S9S.\
" SS	S9S.r\              S0S\S\S\S-  S\S\S\S\S\S\S\S\\S4   S\S\S\4U 4S jjj5       rS /r\" S!5                 S1S"\R*                  S#\R*                  S$\R,                  S%\R*                  S&\R*                  S'\R*                  S!\\\4   S-  S(\S S)S*\S-  S+\S,\S-\R*                  \-  4S. jj5       rS/rU =r$ )2Flux2Transformer2DModeli  a  
The Transformer model introduced in Flux 2.

Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

Args:
    patch_size (`int`, defaults to `1`):
        Patch size to turn the input data into small patches.
    in_channels (`int`, defaults to `128`):
        The number of channels in the input.
    out_channels (`int`, *optional*, defaults to `None`):
        The number of channels in the output. If not specified, it defaults to `in_channels`.
    num_layers (`int`, defaults to `8`):
        The number of layers of dual stream DiT blocks to use.
    num_single_layers (`int`, defaults to `48`):
        The number of layers of single stream DiT blocks to use.
    attention_head_dim (`int`, defaults to `128`):
        The number of dimensions to use for each attention head.
    num_attention_heads (`int`, defaults to `48`):
        The number of attention heads to use.
    joint_attention_dim (`int`, defaults to `15360`):
        The number of dimensions to use for the joint attention (embedding/channel dimension of
        `encoder_hidden_states`).
    pooled_projection_dim (`int`, defaults to `768`):
        The number of dimensions to use for the pooled projection.
    guidance_embeds (`bool`, defaults to `True`):
        Whether to use guidance embeddings for guidance-distilled variant of the model.
    axes_dims_rope (`tuple[int]`, defaults to `(32, 32, 32, 32)`):
        The dimensions to use for the rotary positional embeddings.
Trt  rW  	pos_embedr]  rf   r   F)	split_dimexpected_dimssplit_output)r   r   img_idstxt_ids)
gather_dimr  ) proj_outN
patch_sizer  out_channels
num_layersrI   rY  rX  joint_attention_dimtimestep_guidance_channelsrP  axes_dims_rope.
rope_thetar  r  c                   > [         TU ]  5         U=(       d    UU l        Xv-  U l        [	        XS9U l        [        U	U R                  SUS9U l        [        U R                  SSS9U l	        [        U R                  SSS9U l
        [        U R                  SSS9U l        [        R                  " X R                  SS9U l        [        R                  " XR                  SS9U l        [        R                   " [#        U5       Vs/ s H  n[%        U R                  UUU
USS9PM     sn5      U l        [        R                   " [#        U5       Vs/ s H  n[)        U R                  UUU
USS9PM     sn5      U l        [-        U R                  U R                  SUSS	9U l        [        R                  " U R                  X-  U R                  -  SS9U l        SU l        g s  snf s  snf )
N)r  r  F)r  r  r   r  r   )r  r   rf   r   )rh   rX  rY  rP  r  r   )r  r  r   )r   r5   r  r   r  r  r  time_guidance_embedrg  double_stream_modulation_imgdouble_stream_modulation_txtsingle_stream_modulationr   r   
x_embeddercontext_embedderr  rL   rt  transformer_blocksrW  single_transformer_blocksr   norm_outr  gradient_checkpointing)r4   r  r  r  r  rI   rY  rX  r  r  rP  r  r  r  r  rP   r   s                   r*   r5    Flux2Transformer2DModel.__init__F  s   $ 	(7K,A 'ZQ $C2..+	$
  -<DNN[\ch,i),;DNN[\ch,i)(7WX_d(e% ))KeL "		*=~~TY Z #%-- z*
 +A &(;'9' +
#
 *, 01
 2A ,(;'9' 2
*
& /NNDNNu#TY
 		$..*2IDL]L]2]dij&+#E

s   "G "G%r    ra  r   r   r  r  r  r  return_dictr   r  rO   ref_fixed_timestepr:   c           
          UR                   S   nUR                  UR                  5      S-  nUb  UR                  UR                  5      S-  nU R                  X65      nU R	                  U5      nU R                  U5      nU R                  U5      nU
S:X  a  US:  a  UR                   S   n[        [        U R                  5      [        U R                  5      S9n	Xl        [        R                  " X<S-  5      nU R                  UU5      nU R	                  U5      nU R                  U5      n[        UUUU5      nU R                  U5      nU R!                  U5      nUR"                  S:X  a  US   nUR"                  S:X  a  US   nU R%                  U5      nU R%                  U5      n[        R&                  " US   US   /SS9[        R&                  " US   US   /SS94nU
S:X  a  0 U=(       d    0 ESSUS	.EnO)U
S
:X  a!  U	b  0 U=(       d    0 ESS
U	R                  S	.EnOUn[)        U R                  5       Hs  u  nnU
b  U	b  U	R+                  U5      US'   [        R,                  " 5       (       a,  U R.                  (       a  U R1                  UUUUUUU5      u  p!Mf  U" UUUUUUS9u  p!Mu     [        R&                  " X!/SS9nU
S:X  a#  US:  a  UR                   S   n[3        UWXU5      nU
b	  0 UESU0EnOUn[)        U R                  5       Hm  u  nnU
b  U	b  U	R5                  U5      US'   [        R,                  " 5       (       a)  U R.                  (       a  U R1                  UUSUUU5      nMc  U" USUUUS9nMo     U
S:X  a  US:  a  USS2X-   S2S4   nOUSS2US2S4   nU R7                  X5      nU R9                  U5      nU
S:X  a  U(       d  UU	4$ [;        UU	S9$ U(       d  U4$ [;        US9$ )a  
The [`Flux2Transformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
        Input `hidden_states`.
    encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
    timestep (`torch.LongTensor`):
        Used to indicate denoising step.
    joint_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.
    kv_cache (`Flux2KVCache`, *optional*):
        KV cache for reference image tokens. When `kv_cache_mode` is "extract", a new cache is created and
        returned. When "cached", the provided cache is used to inject ref K/V during attention.
    kv_cache_mode (`str`, *optional*):
        One of "extract" (first step with ref tokens) or "cached" (subsequent steps using cached ref K/V). When
        `None`, standard forward pass without KV caching.
    num_ref_tokens (`int`, defaults to `0`):
        Number of reference image tokens prepended to `hidden_states` (only used when
        `kv_cache_mode="extract"`).
    ref_fixed_timestep (`float`, defaults to `0.0`):
        Fixed timestep for reference token modulation (only used when `kv_cache_mode="extract"`).

Returns:
    If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
    `tuple` where the first element is the sample tensor. When `kv_cache_mode="extract"`, also returns the
    populated `Flux2KVCache`.
rf   i  Nr  r   )rH   rI   r   rg   )r    r  rO   r  r    )r   r   r}  r~  r   ra  rb   )r   r   r`  r   ra  .)r   r    )r   )r   r   r   r  r  r  r  rF   r*  r  r  rO   rB   	full_liker   r  r  r   r  ri   	enumeraterU   is_grad_enabledr  _gradient_checkpointing_funcr   rY   r  r  r   ) r4   r   r   r  r  r  r  ra  r  r    r  rO   r  rb   r  double_stream_mod_imgdouble_stream_mod_txtsingle_stream_modnum_img_tokensref_timestepref_tembref_double_mod_imgref_single_modr   text_rotary_embconcat_rotary_embkv_attn_kwargsindex_blockblocktotal_single_lenkv_attn_kwargs_singleoutputs                                    r*   r   Flux2Transformer2DModel.forward  s   d /44Q7 ;;}223d:{{=#6#67$>H''; $ A A$ G $ A A$ G 99$? I%.1*<*003N#"%d&=&=">"%d&D&D"EH '5# !??8$5NOL//hGH!%!B!B8!L!::8DN %=%'9>>%!
 6 $ 5 56K L <<1ajG<<1ajG>>'2..1IIq)+;A+>?QGIIq)+;A+>?QG
 I%)/R !*"0	N h&8+?)/R !)"*"9"9	N 4N #,D,C,C"DK(X-A-5-@-@-Mz*$$&&4+F+F7;7X7X!)))%"84%} 8="/*?!6!6%6+984%} #E2 		#8"HaP I%.1*<,2215 8!>>Sc!
 $$X~$X7G$X!$2! #,D,J,J"KK(X-A4<4G4G4T%j1$$&&4+F+F $ A A!%%)! !&"/*..%6+@! #L. I%.1*<)!^-L-NPS*STM)!^_c*ABM m:}-I%))0RR9,F;;r)   )r  r  r  r  r   r  r  r  r  r  r  r  r  r  )rf      Nr3  0   r  r  i <  r  r   )    r  r  r  i  rp  T)NNNNNNTNNr   r5  )r"   r#   r$   r%   r&    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patterns_repeated_blocksr   r   _cp_planr   r^   r   rD   r   r5   
_skip_keysr   rB   rC   
LongTensorrr  r  r   r   r   r(   r   r   s   @r*   r  r    sq   > (,$02OP(3V'<$/1NO 2AQ]bc%9AUVej%k+aqW\]+aqW\]	
 *QaHH  #'!#"%#%#(*-*: $O,O, O, Dj	O,
 O, O,  O, !O, !O, %(O, O, c3hO, O, O, O, O,b J./ /3%) $ $!%8< *.$($'J<||J<  %||J< ""	J<
 J< J< ,,J< !%S#X 5J< J< (J< TzJ< J< "J< 
5	5J< 0J<r)   r  rF  r/   )r   r   )Hr%  dataclassesr   typingr   rB   torch.nnr   torch.nn.functional
functionalr   configuration_utilsr   r   loadersr   r	   r
   utilsr   r   r   _modeling_parallelr   r   	attentionr   r   attention_dispatchr   cache_utilsr   
embeddingsr   r   r   r   modeling_utilsr   normalizationr   
get_loggerr"   r+  r   r,   rF   rC   r^   rz   rD   r   r   r   r   r   r   Moduler   r   r   r  r   r9  rH  r<  rW  rt  r  r  rg  r  r!   r)   r*   <module>r*     s    !      B ^ ^ : : L < 6 $  ( 2 
		H	% +J + + 6   > *.9<<<9<	9< <<9< 	9<
 9<  $&9< \\9<xellC'(ellC'(  	
 5<<,)\\)\\) ) 	)
 \\)4&&\\& & 	&
 & \\&DHHH"))  ryy 4B! B!J`! `!FHvUXX__&: HvV5 5pQ QhK_2F K_\B!")) B!J\4BII \4~$BII $@ !bii  !FUbii U.U<!U<r)   