
    
3j2                         S SK r S SKJr  SSKJrJr  SSKJr  SSKJ	r	J
r
  SSKJrJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJr  \R0                  " \5      r " S S\R6                  5      r " S S\\	\5      rg)    N   )ConfigMixinregister_to_config)logging   )AttentionMixinFeedForward)	AttentionCogVideoXAttnProcessor2_0)&CogView3CombinedTimestepSizeEmbeddingsCogView3PlusPatchEmbed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuous%CogView3PlusAdaLayerNormZeroTextImagec            
          ^  \ rS rSrSr    SS\S\S\S\4U 4S jjjrS\R                  S	\R                  S
\R                  S\	\R                  \R                  4   4S jr
SrU =r$ )CogView3PlusTransformerBlock    a  
Transformer block used in [CogView](https://github.com/THUDM/CogView3) model.

Args:
    dim (`int`):
        The number of channels in the input and output.
    num_attention_heads (`int`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`):
        The number of channels in each head.
    time_embed_dim (`int`):
        The number of channels in timestep embedding.
dimnum_attention_headsattention_head_dimtime_embed_dimc                   > [         TU ]  5         [        XAS9U l        [	        UUUUSSSS[        5       S9	U l        [        R                  " USSS9U l	        [        R                  " USSS9U l
        [        XS	S
9U l        g )N)embedding_dimr   T
layer_normFư>)		query_dimheadsdim_headout_dimbiasqk_normelementwise_affineeps	processorgh㈵>)r#   r$   zgelu-approximate)r   dim_outactivation_fn)super__init__r   norm1r
   r   attn1nn	LayerNormnorm2norm2_contextr	   ff)selfr   r   r   r   	__class__s        p/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_cogview3plus.pyr)   %CogView3PlusTransformerBlock.__init__/   s     	:a
%' $/1


 \\#%TJ
\\#%TR#BTU    hidden_statesencoder_hidden_statesembreturnc           
         UR                  S5      nU R                  XU5      u
  nnnnn	n
nnnnU R                  XZS9u  nnXR                  S5      U-  -   nX+R                  S5      U-  -   nU R	                  U5      nUSUS S 2S 4   -   -  US S 2S 4   -   nU R                  U5      n
U
SUS S 2S 4   -   -  US S 2S 4   -   n
[        R                  " X/SS9nU R                  U5      nXR                  S5      US S 2US 24   -  -   nX.R                  S5      US S 2S U24   -  -   nUR                  [        R                  :X  a  UR                  SS5      nUR                  [        R                  :X  a  UR                  SS5      nX4$ )N   )r6   r7   )r   i  i  )sizer*   r+   	unsqueezer.   r/   torchcatr0   dtypefloat16clip)r1   r6   r7   r8   text_seq_lengthnorm_hidden_statesgate_msa	shift_mlp	scale_mlpgate_mlpnorm_encoder_hidden_states
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpattn_hidden_statesattn_encoder_hidden_states	ff_outputs                     r3   forward$CogView3PlusTransformerBlock.forwardK   s    044Q7 JJ}SA	
& :>, :D :
66 &(:(:1(=@R(RR 58L8LQ8ORl8l l "ZZ6/1yD7I3IJYWXZ^W^M__%)%7%78M%N"%?1{STVZSZG[C[%\_jklnrkr_s%s" #YY(B'W]^_GG./	%(:(:1(=	!_M]J]@^(^^ 58L8LQ8OR[\]_o`o_o\oRp8p p%--/)..vu=M &&%--7$9$>$>vu$M!33r5   )r+   r0   r*   r.   r/   )i 
  @   (      )__name__
__module____qualname____firstlineno____doc__intr)   r>   TensortuplerQ   __static_attributes____classcell__r2   s   @r3   r   r       s      #%"$!VV !V  	V
 V V804||04  %||04 \\	04
 
u||U\\)	*04 04r5   r   c                   \  ^  \ rS rSrSrSrSS/rSS/r\           SS\	S	\	S
\	S\	S\	S\	S\	S\	S\	S\	S\	4U 4S jjj5       r
 SS\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\S\\R                     \-  4S jjrSrU =r$ ) CogView3PlusTransformer2DModel~   a!  
The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay
Diffusion](https://huggingface.co/papers/2403.05121).

Args:
    patch_size (`int`, defaults to `2`):
        The size of the patches to use in the patch embedding layer.
    in_channels (`int`, defaults to `16`):
        The number of channels in the input.
    num_layers (`int`, defaults to `30`):
        The number of layers of Transformer blocks to use.
    attention_head_dim (`int`, defaults to `40`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `64`):
        The number of heads to use for multi-head attention.
    out_channels (`int`, defaults to `16`):
        The number of channels in the output.
    text_embed_dim (`int`, defaults to `4096`):
        Input dimension of text embeddings from the text encoder.
    time_embed_dim (`int`, defaults to `512`):
        Output dimension of timestep embeddings.
    condition_dim (`int`, defaults to `256`):
        The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size,
        crop_coords).
    pos_embed_max_size (`int`, defaults to `128`):
        The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added
        to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128
        means that the maximum supported height and width for image generation is `128 * vae_scale_factor *
        patch_size => 128 * 8 * 2 => 2048`.
    sample_size (`int`, defaults to `128`):
        The base resolution of input latents. If height/width is not provided during generation, this value is used
        to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
Tpatch_embednormr   r   
patch_sizein_channels
num_layersr   r   out_channelstext_embed_dimr   condition_dimpos_embed_max_sizesample_sizec                   > [         TU ]  5         X`l        XT-  U l        SU	-  U l        [        UU R                  UUU
S9U l        [        UU	U R                  U R                  S9U l        [        R                  " [        U5       Vs/ s H  n[        U R                  UUUS9PM     sn5      U l        [        U R                  USSS9U l        [        R                   " U R                  X-  U R                  -  SS	9U l        SU l        g s  snf )
N   )rg   hidden_sizerf   text_hidden_sizerl   )r   rk   pooled_projection_dimtimesteps_dim)r   r   r   r   Fr   )r   conditioning_embedding_dimr#   r$   T)r!   )r(   r)   ri   	inner_dimrr   r   rd   r   time_condition_embedr,   
ModuleListranger   transformer_blocksr   norm_outLinearproj_outgradient_checkpointing)r1   rf   rg   rh   r   r   ri   rj   r   rk   rl   rm   _r2   s                r3   r)   'CogView3PlusTransformer2DModel.__init__   s    	(,A &+]%:"1#!+1
 %K('"&"<"<..	%
! #%-- z* +A -(;'9#1	 +
#
 /..'5$	
 		$..*2IDL]L]2]dhi&+#'s    D
r6   r7   timesteporiginal_sizetarget_sizecrop_coordsreturn_dictr9   c                 $   UR                   SS u  pUR                   S   n
U R                  X5      nU R                  X4XVUR                  5      nUSS2SU
24   nUSS2U
S24   n[	        U R
                  5       HR  u  p[        R                  " 5       (       a)  U R                  (       a  U R                  UUUU5      u  pMH  U" UUUS9u  pMT     U R                  X5      nU R                  U5      nU R                  R                  nX-  nX-  n	UR                  UR                   S   XU R                  X4S9n[        R                   " SU5      nUR                  UR                   S   U R                  X-  X-  4S9nU(       d  U4$ [#        US9$ )	a  
The [`CogView3PlusTransformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor`):
        Input `hidden_states` of shape `(batch size, channel, height, width)`.
    encoder_hidden_states (`torch.Tensor`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape
        `(batch_size, sequence_len, text_embed_dim)`
    timestep (`torch.LongTensor`):
        Used to indicate denoising step.
    original_size (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    target_size (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    crop_coords (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.

Returns:
    `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]:
        The denoised latents using provided inputs as conditioning.
Nr;   )r6   r7   r8   r   )shapeznhwcpq->nchpwq)sample)r   rd   rv   r@   	enumeratery   r>   is_grad_enabledr}   _gradient_checkpointing_funcrz   r|   configrf   reshaperi   einsumr   )r1   r6   r7   r   r   r   r   r   heightwidthrC   r8   index_blockblockrf   outputs                   r3   rQ   &CogView3PlusTransformer2DModel.forward   s   L &++BC0/55a8((
 ''[h[n[no -a1A/1A.A B%a)9&9:"+D,C,C"DK$$&&4+F+F7;7X7X!)	844 8="/*?844 #E m9m4 [[++
%#%-- &&q)6$:K:KZd . 
 %5}E&& &&q)4+<+<f>QSXSef ' 
 9'v66r5   )	r}   ru   rz   ri   rd   rr   r|   rv   ry   )r         rT   rS   r   i   rU         r   )T)rV   rW   rX   rY   rZ    _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modulesr   r[   r)   r>   r\   
LongTensorboolr]   r   rQ   r^   r_   r`   s   @r3   rb   rb   ~   sZ    D (,$(5v'>$79QR "$#%"! "%9,9, 9, 	9,
  9, !9, 9, 9, 9, 9,  9, 9, 9,F !S7||S7  %||S7 ""	S7
 ||S7 \\S7 \\S7 S7 
u||	7	7S7 S7r5   rb   )r>   torch.nnr,   configuration_utilsr   r   utilsr   	attentionr   r	   attention_processorr
   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerrV   loggerModuler   rb    r5   r3   <module>r      s\       B  3 F W 7 ' Y 
		H	%[4299 [4|v7Z v7r5   