
    
3j                     
   S SK Jr  S SKrS SKJr  S SKJs  Jr  SSKJ	r	J
r
  SSKJr  SSKJrJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJr  SSKJrJ r   \RB                  " \"5      r# " S S\RH                  5      r% " S S\RH                  5      r& " S S5      r' " S S5      r(\ " S S\RH                  5      5       r) " S S\RH                  5      r* " S S\RH                  5      r+ " S S\\	\\5      r,g)     )AnyN   )ConfigMixinregister_to_config)PeftAdapterMixin)apply_lora_scalelogging)maybe_allow_in_graph   )FeedForward)	Attention)
CacheMixin)&CogView3CombinedTimestepSizeEmbeddings)Transformer2DModelOutput)
ModelMixin)	LayerNormRMSNormc            	          ^  \ rS rSr    SS\S\S\S\4U 4S jjjrS\R                  S\R                  S	\R                  4S
 jrSr	U =r
$ )CogView4PatchEmbed%   in_channelshidden_size
patch_sizetext_hidden_sizec                    > [         TU ]  5         X0l        [        R                  " XS-  -  U5      U l        [        R                  " XB5      U l        g )Nr   )super__init__r   nnLinearproj	text_proj)selfr   r   r   r   	__class__s        l/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_cogview4.pyr   CogView4PatchEmbed.__init__&   s?     	$IIkM9;G	#3A    hidden_statesencoder_hidden_statesreturnc                 \   UR                   u  p4pVXPR                  -  nX`R                  -  nUR                  X4XpR                  XR                  5      nUR                  SSSSSS5      R	                  SS5      R	                  SS5      nU R                  U5      nU R                  U5      nX4$ )Nr   r         r      )shaper   reshapepermuteflattenr    r!   )	r"   r'   r(   
batch_sizechannelheightwidthpost_patch_heightpost_patch_widths	            r$   forwardCogView4PatchEmbed.forward3   s    -:-@-@*
V"oo5 OO3%--!2OOEUWfWf
 &--aAq!Q?GG1MUUVWYZ[		-0 $/D E33r&   )r   r    r!   )    
  r      )__name__
__module____qualname____firstlineno__intr   torchTensorr8   __static_attributes____classcell__r#   s   @r$   r   r   %   su      $BB B 	B
 B B4U\\ 4%,, 4[`[g[g 4 4r&   r   c            
          ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  S	\R                  S\\R                  \R                  4   4S
 jr	Sr
U =r$ )CogView4AdaLayerNormZeroB   embedding_dimdimr)   Nc                    > [         TU ]  5         [        R                  " USSS9U l        [        R                  " USSS9U l        [        R                  " USU-  SS9U l        g )NFh㈵>elementwise_affineeps   Tbias)r   r   r   r   normnorm_contextr   linear)r"   rJ   rK   r#   s      r$   r   !CogView4AdaLayerNormZero.__init__C   sO    LLDI	LLDQiirCxdCr&   r'   r(   tembc                    UR                   nU R                  U5      R                  US9nU R                  U5      R                  US9nU R	                  U5      nUR                  SSS9u  nn	n
nnnnnnnnnUSU
R                  S5      -   -  UR                  S5      -   nUSUR                  S5      -   -  U	R                  S5      -   nUUUUUUUUUU4
$ )NdtyperQ   r,   rK   )r[   rT   torU   rV   chunk	unsqueeze)r"   r'   r(   rX   r[   norm_hidden_statesnorm_encoder_hidden_statesemb	shift_msac_shift_msa	scale_msac_scale_msagate_msa
c_gate_msa	shift_mlpc_shift_mlp	scale_mlpc_scale_mlpgate_mlp
c_gate_mlps                       r$   r8    CogView4AdaLayerNormZero.forwardJ   s    ##!YY}588u8E%)%6%67L%M%P%PW\%P%]"kk$ IIbaI 	
 +a)2E2Ea2H.HIIL_L_`aLbb :a+BWBWXYBZ>Z [^i^s^stu^v v !
 	
r&   )rV   rT   rU   )r=   r>   r?   r@   rA   r   rB   rC   tupler8   rD   rE   rF   s   @r$   rH   rH   B   sk    Dc D D D%
"\\%
BG,,%
V[VbVb%
	u||U\\)	*%
 %
r&   rH   c                       \ rS rSrSrS r  SS\S\R                  S\R                  S\R                  S-  S	\	\R                  \R                  4   S-  S
\	\R                  \R                  4   4S jjr
Srg)CogView4AttnProcessorr   a  
Processor for implementing scaled dot-product attention for the CogView4 model. It applies a rotary embedding on
query and key vectors, but does not include spatial normalization.

The processor supports passing an attention mask for text tokens. The attention mask should have shape (batch_size,
text_seq_length) where 1 indicates a non-padded token and 0 indicates a padded token.
c                 D    [        [        S5      (       d  [        S5      eg Nscaled_dot_product_attentionzUCogView4AttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.hasattrFImportErrorr"   s    r$   r   CogView4AttnProcessor.__init__{   !    q899uvv :r&   Nattnr'   r(   attention_maskimage_rotary_embr)   c           	         UR                   nUR                  u  pxn	UR                  u  pzn	[        R                  " X2/SS9nUR	                  U5      nUR                  U5      nUR                  U5      nUR                  SUR                  S45      R                  SS5      nUR                  SUR                  S45      R                  SS5      nUR                  SUR                  S45      R                  SS5      nUR                  b  UR                  U5      R                  US9nUR                  b  UR                  U5      R                  US9nUbR  SSKJn  U" US S 2S S 2US 2S S 24   USS9US S 2S S 2US 2S S 24'   U" US S 2S S 2US 2S S 24   USS9US S 2S S 2US 2S S 24'   Ub  UnUR                  5       S:X  d   S	5       eUR!                  5       R                  UR"                  5      n[        R$                  " XxU
-   4UR"                  S
9nUUS S 2S U24'   UR'                  S5      nUUR                  SS5      -  nUS:  R'                  S5      R                  UR                   5      n[(        R*                  " XXSSS9nUR                  SS5      R-                  SS5      nUR/                  U5      nUR0                  S   " U5      nUR0                  S   " U5      nUR3                  XR5                  S5      U-
  /SS9u  p2X#4$ )Nr,   r\   r   rZ   apply_rotary_embuse_real_unbind_dimCthe shape of text_attn_mask should be (batch_size, text_seq_length)devicer           F	attn_mask	dropout_p	is_causalr   )r[   r.   rB   catto_qto_kto_v	unflattenheads	transposenorm_qr]   norm_k
embeddingsr   rK   floatr   onesr_   ry   rv   r1   type_asto_outsplitsize)r"   r~   r'   r(   r   r   r[   r2   text_seq_length	embed_dimimage_seq_lengthquerykeyvaluer   text_attn_maskmix_attn_maskattn_mask_matrixs                     r$   __call__CogView4AttnProcessor.__call__   s    &++1F1L1L.
Y2?2E2E/
i		#8"HaP 		-(ii&		-(DJJ#34>>q!DmmA

B/0::1a@DJJ#34>>q!D ;;"KK&)))6E;;"++c"%%E%2C '5/?aO,a/02BXZ0E!Q(!+, .>Aq/*A-.0@VX.C1o&)*
 %+N!%%'1,s.ss,+11366u||DN!JJ
FV4V'W`e`l`lmM1?M!-o--.)33A6M,}/F/Fq!/LL.2==a@CCEKKPN663RW
 &//15==aC%--e4 A}5A}5/</B/B003oEFA 0C 0
, 33r&    )NN)r=   r>   r?   r@   __doc__r   r   rB   rC   rp   r   rD   r   r&   r$   rr   rr   r   s    w /3EI@4@4 ||@4  %||	@4
 t+@4  ell :;dB@4 
u||U\\)	*@4 @4r&   rr   c                   r   \ rS rSrSrS r    SS\S\R                  S\R                  S\R                  S-  S	\R                  S-  S
\R                  S-  S\	\R                  \R                  4   \
\	\R                  \R                  4      -  S-  S\	\R                  \R                  4   4S jjrSrg)CogView4TrainingAttnProcessor   a  
Training Processor for implementing scaled dot-product attention for the CogView4 model. It applies a rotary
embedding on query and key vectors, but does not include spatial normalization.

This processor differs from CogView4AttnProcessor in several important ways:
1. It supports attention masking with variable sequence lengths for multi-resolution training
2. It unpacks and repacks sequences for efficient training with variable sequence lengths when batch_flag is
   provided
c                 D    [        [        S5      (       d  [        S5      eg ru   rw   r{   s    r$   r   &CogView4TrainingAttnProcessor.__init__   r}   r&   Nr~   r'   r(   latent_attn_maskr   
batch_flagr   r)   c           
         UR                   u  pnUR                   u  pnUR                  nUR                  nUn[        R                  " X?/SS9nUc%  [        R
                  " X4[        R                  US9nUc%  [        R
                  " X4[        R                  US9nUR                  5       S:X  d   S5       eUR                  [        R                  :X  d   S5       eUR                  5       S:X  d   S5       eUR                  [        R                  :X  d   S	5       e[        R
                  " XU-   4[        R                  US9nUUSS2SU
24'   UUSS2U
S24'   UR                  S5      R                  US
9nUUR                  SS5      -  nUGb  UR                  5       S:X  d   e[        R                  " U5      R                  5       S-   n[        R                  " USS9n
[        R                  " USS9nU
U-   n[        U5       Vs/ s H-  n[        R                  " UUU:H     5      R                  5       PM/     nn[        U5      U:X  d   eUR!                  SS5      nUR!                  SS5      nUUS:H     n[        R                  " U5      UR                   S   :X  d   e[        R"                  " UU5      n[        R$                  R&                  R(                  R+                  USSSS9nUR                   S   n[        R,                  " UUU4UUS9n[/        U5       H/  u  nn UUU:H     n!Sn"U! H  n#SU U"U"U#-   2U"U"U#-   24'   U"U#-  n"M     M1     UR                  [        R0                  S
9nUR                  S5      nUn$Uc  [        R                  " X2/SS9nOWnUR3                  U5      n%UR5                  U5      n&UR7                  U5      n'U%R9                  SUR:                  S45      R                  SS5      n%U&R9                  SUR:                  S45      R                  SS5      n&U'R9                  SUR:                  S45      R                  SS5      n'UR<                  b  UR=                  U%5      R                  US
9n%UR>                  b  UR?                  U&5      R                  US
9n&UGb=  SSK J!n(  UcM  U(" U%SS2SS2U
S2SS24   USS9U%SS2SS2U
S2SS24'   U(" U&SS2SS2U
S2SS24   USS9U&SS2SS2U
S2SS24'   OU%R                   S   W:X  d   eU&R                   S   U:X  d   e[        U5      U	:X  d   eSn)[        U5       H  nSn"XU:H     n*WUU:H     n+[E        U*U+5       Hw  u  n,n-U,U--   n.U(" U%USS2U"U,-   U"U.-   2SS24   UU)   SS9U%USS2U"U,-   U"U.-   2SS24'   U(" U&USS2U"U,-   U"U.-   2SS24   UU)   SS9U&USS2U"U,-   U"U.-   2SS24'   U"U.-  n"U)S-  n)My     M     [F        RH                  " U%U&U'U$SSS9nUR                  SS5      R!                  SS5      nURK                  U%5      nURL                  S   " U5      nURL                  S   " U5      nUc(  UR#                  XRO                  S5      U
-
  /SS9u  p2X#4$ [        R$                  R&                  R(                  RQ                  U[        RR                  " W5      SS9n/[        R                  " U/SS9n0[        R"                  " U0WRU                  5       5      n1[        U15      U	:X  d   e[E        U1U
W5       V2V,V-s/ s H   u  n2n,n-[        R"                  " U2U,U-/5      PM"     n1n,n2n-U1 V2s/ s H  n2U2S   PM
     n3n2U1 V2s/ s H  n2U2S   PM
     n/n2[        U	5       H%  nU3U   UU   UU   S:H  '   U/U   UU   UU   S:H  '   M'     UnX#4$ s  snf s  sn-n,n2f s  sn2f s  sn2f )a5  
Args:
    attn (`Attention`):
        The attention module.
    hidden_states (`torch.Tensor`):
        The input hidden states.
    encoder_hidden_states (`torch.Tensor`):
        The encoder hidden states for cross-attention.
    latent_attn_mask (`torch.Tensor`, *optional*):
        Mask for latent tokens where 0 indicates pad token and 1 indicates non-pad token. If None, full
        attention is used for all latent tokens. Note: the shape of latent_attn_mask is (batch_size,
        num_latent_tokens).
    text_attn_mask (`torch.Tensor`, *optional*):
        Mask for text tokens where 0 indicates pad token and 1 indicates non-pad token. If None, full attention
        is used for all text tokens.
    batch_flag (`torch.Tensor`, *optional*):
        Values from 0 to n-1 indicating which samples belong to the same batch. Samples with the same
        batch_flag are packed together. Example: [0, 1, 1, 2, 2] means sample 0 forms batch0, samples 1-2 form
        batch1, and samples 3-4 form batch2. If None, no packing is used.
    image_rotary_emb (`tuple[torch.Tensor, torch.Tensor]` or `list[tuple[torch.Tensor, torch.Tensor]]`, *optional*):
        The rotary embedding for the image part of the input.
Returns:
    `tuple[torch.Tensor, torch.Tensor]`: The processed hidden states for both image and text streams.
r,   r\   N)r[   r   r   r   z1the dtype of text_attn_mask should be torch.int32zGthe shape of latent_attn_mask should be (batch_size, num_latent_tokens)z3the dtype of latent_attn_mask should be torch.int32rZ   r   Tr   right)batch_firstpadding_valuepadding_sider   r   r   r   Fr   r   )lengthsr   )+r.   r[   r   rB   r   r   int32rK   r_   r]   r   maxitemsumrangelenr1   r   r   utilsrnnpad_sequencezeros	enumerateboolr   r   r   r   r   r   r   r   r   zipry   rv   r   r   r   unpad_sequencetensortolist)4r"   r~   r'   r(   r   r   r   r   kwargsr2   r   r   r   r[   r   latent_hidden_statesmixed_hidden_statesmixed_attn_maskmixed_attn_mask_inputr   packing_batch_sizelatent_seq_lengthmixed_seq_length	batch_idxmixed_seq_length_packedmixed_attn_mask_flattenmixed_hidden_states_flattenmixed_hidden_states_unpadmixed_hidden_states_packed!mixed_hidden_states_packed_paddedlidxmaskseq_lengthsoffsetlengthr   r   r   r   r   rope_idxtext_seq_length_bilatent_seq_length_bitlenllenmlenhidden_states_unpadhidden_states_flattenhidden_states_unpackhencoder_hidden_states_unpads4                                                       r$   r   &CogView4TrainingAttnProcessor.__call__   s   J 2G1L1L.
Y2?2E2E/
i%++&--,#ii)>(U[\] !"ZZ(EU[[aghN#$zz:*HPUP[P[djk !!#q(o*oo(##u{{2g4gg2##%*u,uu*%%4k6kk4  **+;;<EKKX^
 0>+O++,/??++, !0 9 9! < ? ?e ? L03H3R3RSTVW3XX !>>#q(((!&:!6!;!;!=!A $iiA>O %		*: B.1BB ]bbt\u'\uy		/*	*ABCHHJ\u $ ' ./3EEEE '6&=&=a&C#*=*E*Ea*K'(CD[_`D`(a%99-.2K2Q2QRS2TTTT */5NPg)h& 160B0B0O0O* !$	 1P 1- 277:A${{#Q*  ''78	T.zS/@A)FOPD&6/16FVO3KKLf$F * 9 ,..UZZ.@+55a8) !II'<&LRSTM >M 		-(ii&		-( DJJ#34>>q!DmmA

B/0::1a@DJJ#34>>q!D ;;"KK&)))6E;;"++c"%%E%2C '5!3C!Q 0!346F\^4aO,a/0 2B1o.124DZ\2Aq/*A-.
 {{1~);;;;yy|'9999+,
::: !34CF)8s9J)K&+<Z3=N+O( '**<>R&S
d#d{JZ!#q&4-&4-*G"JK,X602Kc1ftmftm&CQFG
 IYQ(Eq HI,X602ICFTMFTM$A1DE
 $ A 'T 5. 6633RW

 &//15==aC%--e4 A}5A}5 3@3F3F "4"4Q"7/"IJPQ 4G 40!H 33= #(((.."4"4"C"C%<=  #D # %*II.Aq$I!#(;;/DFVF]F]F_#` +,
:::
 &))=Pa%b$%bMAtT Ad|,%b ! $
 :N*N9MA1Q49M'*N1E"F1EA1Q41E"F Z(GbcfGg%c*>#+>!+CDH[\_H`$S)*:3*?1*DE	 ) 1M33E'`$
 +O"Fs   4_>'_-__r   NNNN)r=   r>   r?   r@   r   r   r   rB   rC   rp   listr   rD   r   r&   r$   r   r      s    w 15.2*.oss4s4 ||s4  %||	s4
  ,,-s4 t+s4 LL4's4  ell :;d5W\WcWcIcCd>eehlls4 
u||U\\)	*s4 s4r&   r   c                     ^  \ rS rSr    SS\S\S\S\SS4
U 4S jjjr    SS	\R                  S
\R                  S\R                  S-  S\\R                  \R                  4   \	\\R                  \R                  4      -  S-  S\
\\R                  4   S-  S\
\\4   S-  S\\R                  \R                  4   4S jjrSrU =r$ )CogView4TransformerBlocki  rK   num_attention_headsattention_head_dimtime_embed_dimr)   Nc                   > [         TU ]  5         [        XA5      U l        [	        UUUUSSSS[        5       S9	U l        [        R                  " USSS9U l	        [        R                  " USSS9U l
        [        XSS9U l        g )	NT
layer_normFrM   )		query_dimr   dim_headout_dimrS   qk_normrO   rP   	processorrN   zgelu-approximate)rK   dim_outactivation_fn)r   r   rH   norm1r   rr   attn1r   r   norm2norm2_contextr   ff)r"   rK   r   r   r   r#   s        r$   r   !CogView4TransformerBlock.__init__  s     	 .nB
%' $+-


 \\#%TJ
\\#%TR#BTUr&   r'   r(   rX   r   r   attention_kwargsc           
      \   U R                  XU5      u
  nnn	n
nnnnnnUc  0 nU R                  " SUUUUS.UD6u  nnUUUR                  S5      -  -   nUUUR                  S5      -  -   nU R                  U5      SU
R                  S5      -   -  U	R                  S5      -   nU R	                  U5      SUR                  S5      -   -  UR                  S5      -   nU R                  U5      nU R                  U5      nUUUR                  S5      -  -   nUUUR                  S5      -  -   nX4$ )N)r'   r(   r   r   r,   r   )r   r   r_   r   r   r   )r"   r'   r(   rX   r   r   r  r`   rg   ri   rk   rm   ra   rh   rj   rl   rn   attn_hidden_statesattn_encoder_hidden_states	ff_outputff_output_contexts                        r$   r8    CogView4TransformerBlock.forward  s   * JJ}TB	
& #!9= :
,"<-)	:

 :
66 &(:X=O=OPQ=R(RR 58RU_UiUijkUl8l l "ZZ6!i>Q>QRS>T:TUXaXkXklmXnn%)%7%78M%N%%a((&
!!!$&%" GG./	 GG$>?%	H4F4Fq4I(II 58IJL`L`abLc8c c33r&   )r   r   r   r   r   )r;   @   (      r   )r=   r>   r?   r@   rA   r   rB   rC   rp   r   dictstrr   r8   rD   rE   rF   s   @r$   r   r     s)    #%"$!VV !V  	V
 V 
V V@ %)os9=26/4||/4  %||/4 llT!	/4
  ell :;d5W\WcWcIcCd>eehll/4 S%,,./$6/4 sCx.4//4 
u||U\\)	*/4 /4r&   r   c                      ^  \ rS rSrSS\S\S\\\4   S\SS4
U 4S jjjrS	\R                  S\\R                  \R                  4   4S
 jr
SrU =r$ )CogView4RotaryPosEmbedi  rK   r   rope_axes_dimthetar)   Nc                 R   > [         TU ]  5         Xl        X l        X0l        X@l        g )N)r   r   rK   r   r  r  )r"   rK   r   r  r  r#   s        r$   r   CogView4RotaryPosEmbed.__init__  s#    $*
r&   r'   c           	         UR                   u  p#pEX@R                  -  XPR                  -  pTU R                  S-  U R                  S-  pvSU R                  [        R
                  " SUS[        R                  S9S US-   R                  5       U-  -  -  nSU R                  [        R
                  " SUS[        R                  S9S US-   R                  5       U-  -  -  n	[        R
                  " U R                  S   5      n
[        R
                  " U R                  S   5      n[        R                  " X5      n[        R                  " X5      n[        R
                  " XLR                  S9n[        R
                  " X]R                  S9nXR                  S   -  U-  nXR                  S   -  U-  nUU   nUU   nUR                  S5      nUR                  S5      nUR                  XES5      nUR                  XES5      n[        R                  " X/SS9n[        R                  " UU/SS9nUR                  XE-  S5      nUR                  5       UR!                  5       4$ )	Nr   g      ?r   rZ   r,   r   r   r\   )r.   r   rK   r  rB   arangefloat32r   r  outerr   r_   expandr   r/   cossin)r"   r'   r2   num_channelsr4   r5   dim_hdim_w
h_inv_freq
w_inv_freqh_seqw_seqfreqs_hfreqs_wh_idxw_idxinner_h_idxinner_w_idxfreqss                      r$   r8   CogView4RotaryPosEmbed.forward   s   2?2E2E/
&//15OO3Kxx1}dhh!muJJ5<<5!5==I.UVWZY__adiij

 JJ5<<5!5==I.UVWZY__adiij

 T//23T//23++e0++e0VNN;U>>:0033v=0033u<+&+& ##A&##A&..3..3 		7,"5		5%.b1fnb1		UYY[))r&   )rK   r   r  r  )     @)r=   r>   r?   r@   rA   rp   r   r   rB   rC   r8   rD   rE   rF   s   @r$   r  r    sh    C S sCx Y^ mq  $*U\\ $*eELL%,,<V6W $* $*r&   r  c                      ^  \ rS rSrSr    SS\S\S\S\S\S\4U 4S	 jjjr	S
\
R                  S\
R                  S\
R                  4S jrSrU =r$ )CogView4AdaLayerNormContinuousiG  z
CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine. Matches Megatron: **no activation** before the
Linear on conditioning embedding.
rJ   conditioning_embedding_dimrO   rP   rS   	norm_typec                    > [         TU ]  5         [        R                  " X!S-  US9U l        US:X  a  [        XX55      U l        g US:X  a  [        XU5      U l        g [        SU 35      e)Nr   rR   r   rms_normzunknown norm_type )	r   r   r   r   rV   r   rT   r   
ValueError)r"   rJ   r,  rO   rP   rS   r-  r#   s          r$   r   'CogView4AdaLayerNormContinuous.__init__M  sj     	ii :A<MTXY$!-6HODI*$4FGDI1)=>>r&   xconditioning_embeddingr)   c                     U R                  UR                  UR                  5      5      n[        R                  " USSS9u  pEU R                  U5      SU-   S S 2S S S 24   -  US S 2S S S 24   -   nU$ )Nr   r,   r\   )rV   r]   r[   rB   r^   rT   )r"   r2  r3  rb   scaleshifts         r$   r8   &CogView4AdaLayerNormContinuous.forward_  sj    kk033AGG<={{3q1IIaLAIq$z22U1dA:5FFr&   )rV   rT   )TrM   Tr   )r=   r>   r?   r@   r   rA   r   r   r  r   rB   rC   r8   rD   rE   rF   s   @r$   r+  r+  G  s     $(%?? %(? !	?
 ? ? ? ?$ u|| PUP\P\  r&   r+  c                   4  ^  \ rS rSrSrSr/ SQr/ SQr\            S!S\	S\	S\	S	\	S
\	S\	S\	S\	S\	S\	S\	S\
\	\	4   4U 4S jjj5       r\" S5          S"S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\\\4   S-  S\S\R                  S-  S\
\R                  \R                  4   \\
\R                  \R                  4      -  S-  S\
\R                     \-  4S jj5       rS rU =r$ )#CogView4Transformer2DModelig  a  
Args:
    patch_size (`int`, defaults to `2`):
        The size of the patches to use in the patch embedding layer.
    in_channels (`int`, defaults to `16`):
        The number of channels in the input.
    num_layers (`int`, defaults to `30`):
        The number of layers of Transformer blocks to use.
    attention_head_dim (`int`, defaults to `40`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `64`):
        The number of heads to use for multi-head attention.
    out_channels (`int`, defaults to `16`):
        The number of channels in the output.
    text_embed_dim (`int`, defaults to `4096`):
        Input dimension of text embeddings from the text encoder.
    time_embed_dim (`int`, defaults to `512`):
        Output dimension of timestep embeddings.
    condition_dim (`int`, defaults to `256`):
        The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size,
        crop_coords).
    pos_embed_max_size (`int`, defaults to `128`):
        The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added
        to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128
        means that the maximum supported height and width for image generation is `128 * vae_scale_factor *
        patch_size => 128 * 8 * 2 => 2048`.
    sample_size (`int`, defaults to `128`):
        The base resolution of input latents. If height/width is not provided during generation, this value is used
        to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
T)r   r   r   )patch_embedrT   proj_outr   r   out_channels
num_layersr   r   text_embed_dimr   condition_dimpos_embed_max_sizesample_sizer  c                   > [         TU ]  5         SU	-  nXe-  nUn[        XQUSS9U l        [	        X.X5      U l        [        UU	UUS9U l        [        R                  " [        U5       Vs/ s H  n[        XXX5      PM     sn5      U l        [        XSS9U l        [        R                  " XU-  U-  SS9U l        SU l        g s  snf )	N   r)  )r  )rJ   r?  pooled_projection_dimtimesteps_dimF)rO   TrR   )r   r   r  roper   r:  r   time_condition_embedr   
ModuleListr   r   transformer_blocksr+  norm_outr   r;  gradient_checkpointing)r"   r   r   r<  r=  r   r   r>  r   r?  r@  rA  r  rD  	inner_dim_r#   s                   r$   r   #CogView4Transformer2DModel.__init__  s      	 !& 5'<	# ++==`gh	 .kja$J('"7#	%
! #%-- z**A )I[l*#
 7yejk		)*-D|-SZ^_&+#s   *Cr  Nr'   r(   timesteporiginal_sizetarget_sizecrop_coordsreturn_dictr   r   r)   c           
         UR                   u  ppU
c  U R                  U5      n
U R                  R                  nX-  nX-  nU R	                  X5      u  pU R                  X4XVUR                  5      n[        R                  " U5      nU R                   HX  n[        R                  " 5       (       a,  U R                  (       a  U R                  UUUUU
U	U5      u  pMI  U" UUUU
U	U5      u  pMZ     U R                  UU5      nU R                  U5      nUR!                  UUUSX5      nUR#                  SSSSSS5      R%                  SS5      R%                  SS5      nU(       d  U4$ ['        US9$ )	Nr   r   r   r,   r+   r   r-   )sample)r.   rF  configr   r:  rG  r[   ry   silurI  rB   is_grad_enabledrK  _gradient_checkpointing_funcrJ  r;  r/   r0   r1   r   )r"   r'   r(   rO  rP  rQ  rR  r  rS  r   r   r2   r  r4   r5   pr6   r7   rX   blockoutputs                        r$   r8   "CogView4Transformer2DModel.forward  s    3@2E2E/
& ##yy7 KK"""K :/3/?/?/e,((+\i\o\opvvd| ,,E$$&&4+F+F7;7X7X!)$"$844 8=!)$"$844 -, mT:m4 &--j:KM]_acdh&&q!Q1a8@@AFNNqRST9'v66r&   )rK  rJ  r:  r;  rF  rG  rI  )r   r:   r:      r	  r  r<   r
        r`  )r_  r_  )NTNN)r=   r>   r?   r@   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   rA   rp   r   r   rB   rC   
LongTensorr  r  r   r   r   r   r8   rD   rE   rF   s   @r$   r9  r9  g  s   > (,$`'J$ "$#%"! "%)30,0, 0, 	0,
 0,  0, !0, 0, 0, 0,  0, 0, S#X0, 0,d () 37 .2os=7||=7  %||=7 ""	=7
 ||=7 \\=7 \\=7 sCx.4/=7 =7 t+=7  ell :;d5W\WcWcIcCd>eehll=7 
u||	7	7=7 *=7r&   r9  )-typingr   rB   torch.nnr   torch.nn.functional
functionalry   configuration_utilsr   r   loadersr   r   r   r	   utils.torch_utilsr
   	attentionr   attention_processorr   cache_utilsr   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerr=   loggerModuler   rH   rr   r   r   r  r+  r9  r   r&   r$   <module>ru     s         B ' . 5 # + $ ? 7 ' . 
		H	%4 4:-
ryy -
`M4 M4`B4 B4J L4ryy L4 L4^-*RYY -*`RYY @U7[:JJ U7r&   