
    
3j              
          S SK r S SKJr  S SKrS SKJr  S SKJs  Jr  S SK	J
r
  SSKJrJr  SSKJr  SSKJrJr  SSKJr  S	S
KJrJr  S	SKJr  S	SKJr  S	SKJr  S	SKJrJ r J!r!J"r"  S	SK#J$r$  S	SK%J&r&  S	SK'J(r(J)r)J*r*  \RV                  " \,5      r- " S S5      r. " S S\R^                  5      r0 " S S\R^                  5      r1 " S S\R^                  5      r2 " S S\R^                  5      r3\ " S S\R^                  5      5       r4 " S S\R^                  5      r5 " S  S!\R^                  5      r6 " S" S#\R^                  5      r7\ " S$ S%\R^                  5      5       r8\ " S& S'\R^                  5      5       r9 " S( S)\&\\\\
\5      r:g)*    N)Any)FromOriginalModelMixin   )ConfigMixinregister_to_config)PeftAdapterMixin)apply_lora_scalelogging)maybe_allow_in_graph   )AttentionMixinFeedForward)dispatch_attention_fn)	Attention)
CacheMixin)"CombinedTimestepTextProjEmbeddingsTimestepEmbedding	Timestepsget_1d_rotary_pos_embed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousAdaLayerNormZeroAdaLayerNormZeroSinglec                       \ rS rSrSrSrS r   SS\S\R                  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  4S
 jjr
Srg)HunyuanImageAttnProcessor.   Nc                 D    [        [        S5      (       d  [        S5      eg )Nscaled_dot_product_attentionzYHunyuanImageAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.)hasattrFImportError)selfs    p/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_hunyuanimage.py__init__"HunyuanImageAttnProcessor.__init__2   s%    q899k  :    attnhidden_statesencoder_hidden_statesattention_maskimage_rotary_embreturnc                 T   UR                   c  Ub  [        R                  " X#/SS9nUR                  U5      nUR	                  U5      nUR                  U5      nUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  SUR                  S45      nUR                  b  UR                  U5      nUR                  b  UR                  U5      nUb  SSK
Jn	  UR                   c  Ub  [        R                  " U	" US S 2S UR                  S   * 24   USS9US S 2UR                  S   * S 24   /SS9n[        R                  " U	" US S 2S UR                  S   * 24   USS9US S 2UR                  S   * S 24   /SS9nOU	" XeSS9nU	" XuSS9nUR                   Gb  UGb  UR                  U5      n
UR                  U5      nUR                  U5      nU
R                  SUR                  S45      n
UR                  SUR                  S45      nUR                  SUR                  S45      nUR                  b  UR                  U
5      n
UR                   b  UR!                  U5      n[        R                  " Xj/SS9n[        R                  " X{/SS9n[        R                  " X/SS9n[#        UUUUSSU R$                  U R&                  S	9nUR)                  SS
5      nUR+                  UR,                  5      nUb  US S 2S UR                  S   * 24   US S 2UR                  S   * S 24   p2[/        USS 5      b*  UR0                  S   " U5      nUR0                  S   " U5      n[/        USS 5      b  UR3                  U5      nX#4$ )N   dimr   )apply_rotary_emb)sequence_dim        F)	attn_mask	dropout_p	is_causalbackendparallel_configr   to_outr   
to_add_out)
add_q_projtorchcatto_qto_kto_v	unflattenheadsnorm_qnorm_k
embeddingsr3   shape
add_k_proj
add_v_projnorm_added_qnorm_added_kr   _attention_backend_parallel_configflattentodtypegetattrr;   r<   )r#   r(   r)   r*   r+   r,   querykeyvaluer3   encoder_queryencoder_keyencoder_values                r$   __call__"HunyuanImageAttnProcessor.__call__8   s    ??"'<'H!II}&LRSTM 		-(ii&		-(DJJ#34mmA

B/0DJJ#34 ;;"KK&E;;"++c"C '5&+@+L		(!!%F(=(C(CA(F'F%F"FGIYhi a"7"="=a"@!@!BBC	  ii(Q0Q3H3N3Nq3Q2Q0Q-Q)RTdstuA 5 ; ;A >>@@A  )qQ&s1M ??&+@+L OO,ABM//*?@K OO,ABM)33A

B7GHM%//DJJ3CDK)33A

B7GHM  , $ 1 1- @  ,"//<IIu4!<E))S.A6CIIu4!<E .$++ 11	
 &--a3%((5 !,a!B$9$?$?$B#B!BBCa"7"="=a"@!@!BBC 1
 tXt,8 $A} = $A} =t\40<(,8M(N%33r'    )NNN)__name__
__module____qualname____firstlineno__rM   rN   r%   r   r>   TensorrY   __static_attributes__r[   r'   r$   r   r   .   s     6:.204b4b4 ||b4  %||d2	b4
 t+b4  ,,-b4 
b4 b4r'   r   c            	          ^  \ rS rSr   SS\\\\\\\4   4   S\S\SS4U 4S jjjrS\R                  S\R                  4S	 jr	S
r
U =r$ )HunyuanImagePatchEmbed   
patch_sizein_chans	embed_dimr-   Nc                   > [         TU ]  5         Xl        [        U5      S:X  a  [        R
                  " X#XS9U l        g [        U5      S:X  a  [        R                  " X#XS9U l        g [        S[        U5       35      e)Nr   )kernel_sizestrider   1patch_size must be a tuple of length 2 or 3, got )	superr%   re   lennnConv2dprojConv3d
ValueError)r#   re   rf   rg   	__class__s       r$   r%   HunyuanImagePatchEmbed.__init__   sl     	$z?a		(:aDI_!		(:aDIPQTU_Q`Pabccr'   r)   c                 j    U R                  U5      nUR                  S5      R                  SS5      nU$ )Nr   r/   )rp   rO   	transpose)r#   r)   s     r$   forwardHunyuanImagePatchEmbed.forward   s3    		-0%--a0::1a@r'   )re   rp   ))   ry   r   i   )r\   r]   r^   r_   tupleintr%   r>   r`   rw   ra   __classcell__rs   s   @r$   rc   rc      s|     =E	d#sE#sC-$889d d 	d
 
d d"U\\ ell  r'   rc   c                   r   ^  \ rS rSrS\S\S\4U 4S jjrS\R                  S\R                  4S jrS	r	U =r
$ )
HunyuanImageByT5TextProjection   in_featureshidden_sizeout_featuresc                 .  > [         TU ]  5         [        R                  " U5      U l        [        R
                  " X5      U l        [        R
                  " X"5      U l        [        R
                  " X#5      U l        [        R                  " 5       U l
        g N)rl   r%   rn   	LayerNormnormLinearlinear_1linear_2linear_3GELUact_fn)r#   r   r   r   rs   s       r$   r%   'HunyuanImageByT5TextProjection.__init__   s[    LL-			+;		+;		+<ggir'   r*   r-   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r   r   r   r   r   )r#   r*   r)   s      r$   rw   &HunyuanImageByT5TextProjection.forward   s]    		"78m4M2m4M2m4r'   )r   r   r   r   r   )r\   r]   r^   r_   r{   r%   r>   r`   rw   ra   r|   r}   s   @r$   r   r      s=     C  c    U\\ ell  r'   r   c            	          ^  \ rS rSrS
S\S\S-  SS4U 4S jjjrS\R                  S\\R                  \R                  \R                  \R                  \R                  4   4S jr	S	r
U =r$ )HunyuanImageAdaNorm   Nr   r   r-   c                    > [         TU ]  5         U=(       d    SU-  n[        R                  " X5      U l        [        R
                  " 5       U l        g )Nr   )rl   r%   rn   r   linearSiLUnonlinearity)r#   r   r   rs   s      r$   r%   HunyuanImageAdaNorm.__init__   s:    #6q;ii:GGIr'   tembc                     U R                  U R                  U5      5      nUR                  SSS9u  p#UR                  S5      UR                  S5      p2X#4$ )Nr   r/   r0   )r   r   chunk	unsqueeze)r#   r   gate_msagate_mlps       r$   rw   HunyuanImageAdaNorm.forward   sX     {{4,,T23!ZZqZ1%//2H4F4Fq4I(!!r'   )r   r   r   )r\   r]   r^   r_   r{   r%   r>   r`   rz   rw   ra   r|   r}   s   @r$   r   r      sh    &C &sTz &T & &"LL"	u||U\\5<<u||S	T" "r'   r   c                      ^  \ rS rSr  SS\S\S\4U 4S jjjr  SS\R                  S\R                  S-  S	\R                  S-  S
\	\R                  \R                  4   4S jjr
SrU =r$ ))HunyuanImageCombinedTimeGuidanceEmbedding   embedding_dimguidance_embedsuse_meanflowc                   > [         TU ]  5         [        SSSS9U l        [	        SUS9U l        X0l        S U l        S U l        U(       a  [        SSSS9U l        [	        SUS9U l        S U l	        U(       a  [	        SUS9U l	        g g )N   Tr   )num_channelsflip_sin_to_cosdownscale_freq_shift)in_channelstime_embed_dim)
rl   r%   r   	time_projr   timestep_embedderr   time_proj_rtimestep_embedder_rguidance_embedder)r#   r   r   r   rs   s       r$   r%   2HunyuanImageCombinedTimeGuidanceEmbedding.__init__   s     	"T`ab!2sS`!a(#' (c4fghD'8SYf'gD$!%%63Wd%eD" r'   Ntimestep
timestep_rguidancer-   c                    U R                  U5      nU R                  UR                  UR                  S95      nUb@  U R	                  U5      nU R                  UR                  UR                  S95      nXW-   S-  nU R                  b?  U R                  U5      nU R                  UR                  UR                  S95      n	XY-   n
U
$ Un
U
$ )N)rQ   r   )r   r   rP   rQ   r   r   r   )r#   r   r   r   timesteps_projtimesteps_embtimesteps_proj_rtimesteps_emb_rguidance_projguidance_embconditionings              r$   rw   1HunyuanImageCombinedTimeGuidanceEmbedding.forward   s     1..~/@/@x~~/@/VW!#//
;"667G7J7JQYQ_Q_7J7`aO*<AM!!- NN84M11-2B2B2B2XYL(7L  )Lr'   )r   r   r   r   r   r   )FFNN)r\   r]   r^   r_   r{   boolr%   r>   r`   rz   rw   ra   r|   r}   s   @r$   r   r      s     !&"	ff f 	f f4 +/(,	,, LL4' ,,%	
 
u||U\\)	* r'   r   c                      ^  \ rS rSr   SS\S\S\S\S\SS4U 4S	 jjjr SS
\	R                  S\	R                  S\	R                  S-  S\	R                  4S jjrSrU =r$ )'HunyuanImageIndividualTokenRefinerBlocki
  num_attention_headsattention_head_dimmlp_width_ratiomlp_drop_rateattention_biasr-   Nc                   > [         TU ]  5         X-  n[        R                  " USSS9U l        [        US UUUS9U l        [        R                  " USSS9U l        [        XcSUS9U l	        [        USU-  5      U l        g )NTư>elementwise_affineeps)	query_dimcross_attention_dimrD   dim_headbiaszlinear-silu)multactivation_fndropoutr   )rl   r%   rn   r   norm1r   r(   norm2r   ffr   norm_out)r#   r   r   r   r   r   r   rs   s          r$   r%   0HunyuanImageIndividualTokenRefinerBlock.__init__  s     	)>\\+$DQ
! $%'
	 \\+$DQ
k}fst+K[Ir'   r)   r   r+   c                     U R                  U5      nU R                  US US9nU R                  U5      u  pgXU-  -   nU R                  U R	                  U5      5      nXU-  -   nU$ )N)r)   r*   r+   )r   r(   r   r   r   )	r#   r)   r   r+   norm_hidden_statesattn_outputr   r   	ff_outputs	            r$   rw   /HunyuanImageIndividualTokenRefinerBlock.forward&  sy     "ZZ6ii,"&)   
 "]]40%h(>>GGDJJ}56	%H(<<r'   )r(   r   r   r   r         @r5   Tr   )r\   r]   r^   r_   r{   strfloatr   r%   r>   r`   rw   ra   r|   r}   s   @r$   r   r   
  s      #"#J J  J 	J
 J J 
J J< /3	|| ll t+	
 
 r'   r   c                      ^  \ rS rSr   SS\S\S\S\S\S\SS	4U 4S
 jjjr SS\R                  S\R                  S\R                  S	-  SS	4S jjr
SrU =r$ )"HunyuanImageIndividualTokenRefineri=  r   r   
num_layersr   r   r   r-   Nc                    > [         TU ]  5         [        R                  " [	        U5       Vs/ s H  n[        UUUUUS9PM     sn5      U l        g s  snf )N)r   r   r   r   r   )rl   r%   rn   
ModuleListranger   refiner_blocks)	r#   r   r   r   r   r   r   _rs   s	           r$   r%   +HunyuanImageIndividualTokenRefiner.__init__>  s`     	 mm z*	 +A 8(;'9$3"/#1 +	
	s   Ar)   r   r+   c                 l   S nUb  UR                   S   nUR                   S   nUR                  UR                  5      nUR                  USSU5      R	                  SSUS5      nUR                  SS5      nXx-  R                  5       nSUS S 2S S 2S S 2S4'   U R                   H  n	U	" XU5      nM     U$ )Nr   r/   r   r   T)rH   rP   deviceviewrepeatrv   r   r   )
r#   r)   r   r+   self_attn_mask
batch_sizeseq_lenself_attn_mask_1self_attn_mask_2blocks
             r$   rw   *HunyuanImageIndividualTokenRefiner.forwardV  s     %'--a0J$**1-G+..}/C/CDN-22:q!WMTTUVXY[bdef/99!Q?.AGGIN)-N1aA:&((E!-~FM ) r'   )r   r   r   )r\   r]   r^   r_   r{   r   r   r%   r>   r`   rw   ra   r|   r}   s   @r$   r   r   =  s     "%"#
 
  
 	

 
 
 
 

 
8 /3	|| ll t+	
 
 r'   r   c                      ^  \ rS rSr   SS\S\S\S\S\S\S\S	S
4U 4S jjjr SS\R                  S\R                  S\R                  S
-  S	\R                  4S jjrSrU =r$ )HunyuanImageTokenRefinerim  r   r   r   r   	mlp_ratior   r   r-   Nc           	         > [         T	U ]  5         X#-  n[        XS9U l        [        R
                  " XSS9U l        [        UUUUUUS9U l        g )N)r   pooled_projection_dimT)r   )r   r   r   r   r   r   )	rl   r%   r   time_text_embedrn   r   proj_inr   token_refiner)
r#   r   r   r   r   r   r   r   r   rs   s
            r$   r%   !HunyuanImageTokenRefiner.__init__n  s[     	)>A% 
 yyE? 31!%')
r'   r)   r   r+   c                 N   Uc  UR                  SS9nO]UR                  nUR                  5       R                  S5      nX-  R	                  SS9UR	                  SS9-  nUR                  U5      nU R                  X$5      nU R                  U5      nU R                  XU5      nU$ )Nr/   r0   r2   )	meanrQ   r   r   sumrP   r   r   r   )r#   r)   r   r+   pooled_hidden_statesoriginal_dtype
mask_floatr   s           r$   rw    HunyuanImageTokenRefiner.forward  s     !#0#5#5!#5#< *00N'--/99"=J$1$>#C#C#C#JZ^^`a^Mb#b #7#:#:>#J ##HC]3**=Or'   )r   r   r   r   r   )r\   r]   r^   r_   r{   r   r   r%   r>   r`   
LongTensorrw   ra   r|   r}   s   @r$   r   r   m  s     "#

 !
  	

 
 
 
 
 

 
> 37	|| "" ((4/	
 
 r'   r   c            	          ^  \ rS rSrSS\\\   -  S\\\   -  S\SS4U 4S jjjrS\	R                  S\	R                  4S	 jrS
rU =r$ )HunyuanImageRotaryPosEmbedi  re   rope_dimthetar-   Nc                 x  > [         TU ]  5         [        U[        [        45      (       a  [        U5      S;  a  [        SU 35      e[        U[        [        45      (       a  [        U5      S;  a  [        SU 35      e[        U5      [        U5      :X  d  [        SU SU 35      eXl        X l        X0l	        g )Nr   r   z9patch_size must be a tuple or list of length 2 or 3, got z7rope_dim must be a tuple or list of length 2 or 3, got z7patch_size and rope_dim must have the same length, got z and )
rl   r%   
isinstancerz   listrm   rr   re   r	  r
  )r#   re   r	  r
  rs   s       r$   r%   #HunyuanImageRotaryPosEmbed.__init__  s    *udm44Jv8UXYcXdeff(UDM22c(m66QVW_V`abb:#h-/VWaVbbghpgqrss$ 
r'   r)   c                    UR                   S:X  a+  UR                  u    p#pEU R                  u  pgnX6-  XG-  XX-  /n	OOUR                   S:X  a'  UR                  u    p$nU R                  u  pxXG-  XX-  /n	O[        SUR                   35      e/ n
[	        [        U	5      5       HE  n[        R                  " SX   UR                  [        R                  S9nU
R                  U5        MG     [        R                  " U
SS06n[        R                  " USS9n/ n[	        [        U	5      5       HH  n[        U R                  U   X   R                  S	5      U R                   S
S9nUR                  U5        MJ     [        R"                  " U Vs/ s H  oS   PM	     snSS9n[        R"                  " U Vs/ s H  oS   PM	     snSS9nUU4$ s  snf s  snf )N      -hidden_states must be a 4D or 5D tensor, got r   )r   rQ   indexingijr0   r2   T)use_realr/   )ndimrH   re   rr   r   rm   r>   aranger   float32appendmeshgridstackr   r	  reshaper
  r?   )r#   r)   r   frameheightwidthpatch_size_framepatch_size_heightpatch_size_width
rope_sizes
axes_gridsigridfreqsfreqf	freqs_cos	freqs_sins                     r$   rw   "HunyuanImageRotaryPosEmbed.forward  s   ")6)<)<&AqDHOOA1A3V5PRWRklJ1$"/"5"5Aq%26/// 5u7PQJL]M`M`Labcc
s:'A<<:=9M9MUZUbUbcDd# ( ~~z9D9{{4Q's:'A*4==+;TW__R=PRVR\R\gklDLL ( IIU3UtU3;	IIU3UtU3;	)## 43s   G3G)re   r	  r
  )      p@)r\   r]   r^   r_   rz   r  r{   r   r%   r>   r`   rw   ra   r|   r}   s   @r$   r  r    s]    549#4 S	@Q Z_ lp   $U\\ $ell $ $r'   r  c                     ^  \ rS rSr  SS\S\S\S\SS4
U 4S jjjr  SS	\R                  S
\R                  S\R                  S\R                  S-  S\
\R                  \R                  4   S-  S\R                  4S jjrSrU =r$ )"HunyuanImageSingleTransformerBlocki  r   r   r   qk_normr-   Nc                 D  > [         TU ]  5         X-  n[        XS-  5      n[        US UUUS[	        5       USSS9
U l        [        USS9U l        [        R                  " XV5      U l
        [        R                  " SS9U l        [        R                  " XV-   U5      U l        g )NTr   )
r   r   r   rD   out_dimr   	processorr1  r   pre_only
layer_norm	norm_typetanh)approximate)rl   r%   r{   r   r   r(   r   r   rn   r   proj_mlpr   act_mlpproj_out)r#   r   r   r   r1  r   mlp_dimrs   s          r$   r%   +HunyuanImageSingleTransformerBlock.__init__  s     	)>k-.! $'%/1
	 +;,O			+7ww62		+"7Er'   r)   r*   r   r+   r,   c                    UR                   S   n[        R                  " X/SS9nUn	U R                  XS9u  pU R	                  U R                  U
5      5      nU
S S 2S U* 2S S 24   U
S S 2U* S 2S S 24   pU R                  U
UUUS9u  p[        R                  " X/SS9n[        R                  " X/SS9nUR                  S5      U R                  U5      -  nX-   nUS S 2S U* 2S S 24   US S 2U* S 2S S 24   p!X4$ )Nr/   r0   embr)   r*   r+   r,   r   )	rH   r>   r?   r   r<  r;  r(   r   r=  )r#   r)   r*   r   r+   r,   argskwargstext_seq_lengthresidualr   gatemlp_hidden_statesnorm_encoder_hidden_statesr   context_attn_outputs                   r$   rw   *HunyuanImageSingleTransformerBlock.forward  s@    055a8		="HaP  $(99]9#E  LL7I)JK q"3O#3"3Q67q?"2"3Q67 7 ,099,"<)-	 ,5 ,
( ii BJ 		;"BJq)DMM-,HH%0 !...12!o-.12 - 33r'   )r<  r(   r   r;  r=  )r   rms_normr   r\   r]   r^   r_   r{   r   r   r%   r>   r`   rz   rw   ra   r|   r}   s   @r$   r0  r0    s     !F F  F 	F
 F 
F FF /3EI*4||*4  %||*4 ll	*4
 t+*4  ell :;dB*4 
*4 *4r'   r0  c                   0  ^  \ rS rSr SS\S\S\S\SS4
U 4S jjjr  SS	\R                  S
\R                  S\R                  S\R                  S-  S\
\R                  \R                  4   S-  S\
\R                  \R                  4   4S jjrSrU =r$ )HunyuanImageTransformerBlocki  r   r   r   r1  r-   Nc                 N  > [         TU ]  5         X-  n[        USS9U l        [        USS9U l        [        US UUUUSS[        5       USS9U l        [        R                  " USSS9U l
        [        XSSS	9U l        [        R                  " USSS9U l        [        XSSS	9U l        g )
Nr6  r7  FTr   )r   r   added_kv_proj_dimr   rD   r3  context_pre_onlyr   r4  r1  r   r   zgelu-approximate)r   r   )rl   r%   r   r   norm1_contextr   r   r(   rn   r   r   r   r   norm2_context
ff_context)r#   r   r   r   r1  r   rs   s         r$   r%   %HunyuanImageTransformerBlock.__init__  s     	)>%k\J
-k\R! $)'%"/1
	 \\+%TR
kI[\\\+%UYZ%kQcdr'   r)   r*   r   r+   r,   c                 "   U R                  XS9u  ppnU R                  X#S9u  pnnnU R                  UUUUS9u  nnUUU	R                  S5      -  -   nUUUR                  S5      -  -   nU R	                  U5      nU R                  U5      nUSUS S 2S 4   -   -  U
S S 2S 4   -   nUSUS S 2S 4   -   -  US S 2S 4   -   nU R                  U5      nU R                  U5      nXR                  S5      U-  -   nUUR                  S5      U-  -   nX4$ )NrA  rC  r/   )r   rT  r(   r   r   rU  r   rV  )r#   r)   r*   r   r+   r,   rD  rE  r   r   	shift_mlp	scale_mlpr   rJ  
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpr   rK  r   context_ff_outputs                         r$   rw   $HunyuanImageTransformerBlock.forward=  s{    HLzzR_zGjDiHW[WiWi! Xj X
T"[*
 ,099,"<)-	 ,5 ,
(( &h6H6H6K(KK 58KjNbNbcdNe8e e!ZZ6%)%7%78M%N"/1yD7I3IJYWXZ^W^M__%?1{STVZSZG[C[%\_jklnrkr_s%s" GG./	 OO,FG%(:(:1(=	(II 5
8L8LQ8ORc8c c33r'   )r(   r   rV  r   rT  r   rU  )rM  r   rN  r}   s   @r$   rP  rP    s     " e  e   e 	 e
  e 
 e  eN /3EI)4||)4  %||)4 ll	)4
 t+)4  ell :;dB)4 
u||U\\)	*)4 )4r'   rP  c            $       4  ^  \ rS rSrSrSr/ SQr/ SQrSS/r\	                S(S	\
S
\
S\
S\
S\
S\
S\
S\S\\
\
4   S\S\S\
S\
S-  S\S\\
S4   S\SS4"U 4S jjj5       r\" S5            S)S\R$                  S\R&                  S\R$                  S \R$                  S!\R&                  S-  S"\R$                  S-  S#\R$                  S-  S$\R$                  S-  S\\\4   S-  S%\S\R$                  \\\R$                  4   -  4S& jj5       rS'rU =r$ )*HunyuanImageTransformer2DModelii  a  
The Transformer model used in [HunyuanImage-2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1).

Args:
    in_channels (`int`, defaults to `16`):
        The number of channels in the input.
    out_channels (`int`, defaults to `16`):
        The number of channels in the output.
    num_attention_heads (`int`, defaults to `24`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`, defaults to `128`):
        The number of channels in each head.
    num_layers (`int`, defaults to `20`):
        The number of layers of dual-stream blocks to use.
    num_single_layers (`int`, defaults to `40`):
        The number of layers of single-stream blocks to use.
    num_refiner_layers (`int`, defaults to `2`):
        The number of layers of refiner blocks to use.
    mlp_ratio (`float`, defaults to `4.0`):
        The ratio of the hidden layer size to the input size in the feedforward network.
    patch_size (`int`, defaults to `2`):
        The size of the spatial patches to use in the patch embedding layer.
    patch_size_t (`int`, defaults to `1`):
        The size of the tmeporal patches to use in the patch embedding layer.
    qk_norm (`str`, defaults to `rms_norm`):
        The normalization to use for the query and key projections in the attention layers.
    guidance_embeds (`bool`, defaults to `True`):
        Whether to use guidance embeddings in the model.
    text_embed_dim (`int`, defaults to `4096`):
        Input dimension of text embeddings from the text encoder.
    pooled_projection_dim (`int`, defaults to `768`):
        The dimension of the pooled projection of the text embeddings.
    rope_theta (`float`, defaults to `256.0`):
        The value of theta to use in the RoPE layer.
    rope_axes_dim (`tuple[int]`, defaults to `(16, 56, 56)`):
        The dimensions of the axes to use in the RoPE layer.
    image_condition_type (`str`, *optional*, defaults to `None`):
        The type of image conditioning to use. If `None`, no image conditioning is used. If `latent_concat`, the
        image is concatenated to the latent stream. If `token_replace`, the image is used to replace first-frame
        tokens in the latent stream and apply conditioning.
T)
x_embeddercontext_embedderr   )rP  r0  rc   r   rP  r0  Nr   out_channelsr   r   r   num_single_layersnum_refiner_layersr   re   r1  r   text_embed_dimtext_embed_2_dim
rope_thetarope_axes_dim.r   r-   c                   > [         TU ]  5         [        U	[        [        45      (       a  [        U	5      S;   d  [        SU	 35      eX4-  nU=(       d    Un[        XU5      U l        [        XXGS9U l
        Ub  [        USU5      U l        OS U l        [        UUU5      U l        [        XU5      U l        ["        R$                  " ['        U5       Vs/ s H  n[)        X4XS9PM     sn5      U l        ["        R$                  " ['        U5       Vs/ s H  n[-        X4XS9PM     sn5      U l        [1        UUSSS9U l        ["        R4                  " U[6        R8                  " U	5      U-  5      U l        SU l        g s  snf s  snf )	Nr  rk   )r   i   )r   r1  Fr   r   )rl   r%   r  rz   r  rm   rr   rc   rc  r   rd  r   context_embedder_2r   time_guidance_embedr  ropern   r   r   rP  transformer_blocksr0  single_transformer_blocksr   r   r   mathprodr=  gradient_checkpointing)r#   r   re  r   r   r   rf  rg  r   re   r1  r   rh  ri  rj  rk  r   	inner_dimr   rs   s                      r$   r%   'HunyuanImageTransformer2DModel.__init__  s   ( 	:t}55#j/V:SPQ[P\]^^'<	#2{ 1)T 81C!
 '&DEUW[]f&gD#&*D##LYXgiu#v  /z*U	 #%--
 z*	 +A -'y +	#
 *,
 01	 2A 3'y 2	*
& /y)X]cgh		)TYYz-B\-QR&+#-s   E=Fattention_kwargsr)   r   r*   encoder_attention_maskr   encoder_hidden_states_2encoder_attention_mask_2r   return_dictc           
         UR                   S:X  a  UR                  u  ppX4nO=UR                   S:X  a  UR                  u  pnpUX4nO[        SUR                   35      e[        S [	        XR
                  R                  5       5       5      nU R                  U5      nUR                  5       nU R                  X(US9nU R                  U5      nU R                  X2U5      nU R                  b  Ub  U R                  U5      nUR                  5       n/ n/ n[	        X4Xg5       Ht  u  nnnnUR                  [        R                  " UU   UU   UU)    UU)    /SS95        UR                  [        R                  " UU   UU   UU)    UU)    /SS95        Mv     [        R                   " U5      n[        R                   " U5      n[        R"                  R$                  R'                  XAR                  S   S4S	S
9nUR)                  S5      R)                  S5      n[        R*                  " 5       (       ad  U R,                  (       aS  U R.                   H  nU R1                  UUUUUUS9u  pM     U R2                   H  nU R1                  UUUUUUS9u  pM     O>U R.                   H  nU" UUUUUS9u  pM     U R2                   H  nU" UUUUUS9u  pM     U R5                  UU5      nU R7                  U5      nU R
                  R8                  nU/[;        U5      -   U/-   [;        U R
                  R                  5      -   nUR<                  " U6 n[?        U5      nSUS-   /n[A        U5       H  n URC                  U S-   US-   U -   /5        M!     URD                  " U6 nUU/[	        UU R
                  R                  5       V!V"s/ s H  u  n!n"U!U"-  PM     sn"n!-   n#UR<                  " U#6 nU
(       d  U4$ [G        US9$ s  sn"n!f )Nr  r  r  c              3   .   #    U  H  u  pX-  v   M     g 7fr   r[   ).0dps      r$   	<genexpr>9HunyuanImageTransformer2DModel.forward.<locals>.<genexpr>  s      W4VDA4Vs   )r   r   r   r0   r/   T)rU   r   )r+   r,   )sample)$r  rH   rr   rz   zipconfigre   ro  r   rn  rc  rd  rm  r  r>   r?   r  rn   
functionalpadr   is_grad_enabledrt  rp  _gradient_checkpointing_funcrq  r   r=  re  r  r  rm   r   extendpermuter   )$r#   r)   r   r*   rx  r   ry  rz  r   rw  r{  r   channelsr  r   sizesr  post_patch_sizesr,   r   new_encoder_hidden_statesnew_encoder_attention_masktext	text_masktext_2text_mask_2r+   r   re  reshape_dimsr  permute_patternr&  
post_patchpatch
final_dimss$                                       r$   rw   &HunyuanImageTransformer2DModel.forward  s}    "2?2E2E/J&OE1$9F9L9L6J%F*EL]M`M`Labcc  WC{{?U?U4V WW  99]3 "8!<!<!>''PZ'[6 $ 5 56KWm n"".3J3V&*&=&=>U&V#'?'D'D'F$ )+%)+&8;%?V94i *00II";/ O"K<0 ),	 
 +11II'4%i0'5%yj1	 
#9: %*KK0I$J!%*[[1K%L",,001GJ]J]^_J`bcIdlp0q'11!4>>qA  ""t'B'B007;7X7X!)#1%5 8Y 844 1 777;7X7X!)#1%5 8Y 844 8 007<!)#1%5844 1 777<!)#1%5844 8 mT:m4 {{//"|d+;&<<~MPTUYU`U`UkUkPll%--|<
 #$dQh-tA""AE4!8a<#89 %--? !,/8;<LdkkNdNd8e3
8e#4:uJ8e3
 

 &--z:!##'}==3
s   P)
rd  rm  rt  r   r=  ro  rq  rn  rp  rc  )@   r           (   r   r   )r/   r/   rM  Fi   Nr.  )r  r  F)NNNNNT)r\   r]   r^   r_   __doc__ _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modules_repeated_blocksr   r{   r   rz   r   r   r%   r	   r>   r`   r  dictr   rw   ra   r|   r}   s   @r$   rb  rb  i  sD   (T (,$'Q$ 78\] #%"%!#"#&,! %"'+!)1"#D,D, D, !	D,
  D, D, D,  D, D, #s(OD, D, D, D, *D, D,  S#X!D," #D,$ 
%D, D,L () /37;8<(,26 R>||R> ""R>  %||	R>
 !&R> $$t+R> "'!4R> #(,,"5R> ,,%R> sCx.4/R> R> 
S%,,./	/R> *R>r'   rb  );rr  typingr   r>   torch.nnrn   torch.nn.functionalr  r!   diffusers.loadersr   configuration_utilsr   r   loadersr   utilsr	   r
   utils.torch_utilsr   	attentionr   r   attention_dispatchr   attention_processorr   cache_utilsr   rG   r   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerr\   loggerr   Modulerc   r   r   r   r   r   r   r  r0  rP  rb  r[   r'   r$   <module>r     sa         4 B ' . 5 3 6 + $  8 ' \ \ 
		H	%l4 l4^RYY 0RYY &"")) ""-		 -b /bii / /d, ,`.ryy .b+$ +$\ I4 I4 I4X L4299 L4 L4^Q>^-=?UWaQ>r'   