
    
3j8                         S SK Jr  S SKrS SKJr  SSKJrJr  SSKJ	r	  SSK
Jr  SSKJrJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJr  \	R4                  " \5      r " S S\R:                  5      r " S S\\5      rg)    )AnyN   )ConfigMixinregister_to_config)logging   )LuminaFeedForward)	AttentionLuminaAttnProcessor2_0)&LuminaCombinedTimestepCaptionEmbeddingLuminaPatchEmbed)Transformer2DModelOutput)
ModelMixin)LuminaLayerNormContinuousLuminaRMSNormZeroRMSNormc                   *  ^  \ rS rSrSr SS\S\S\S\S\S\S	\S
\S\SS4U 4S jjjr SS\	R                  S\	R                  S\	R                  S\	R                  S\	R                  S\	R                  S\\\4   S-  S\	R                  4S jjrSrU =r$ )LuminaNextDiTBlock$   a  
A LuminaNextDiTBlock for LuminaNextDiT2DModel.

Parameters:
    dim (`int`): Embedding dimension of the input features.
    num_attention_heads (`int`): Number of attention heads.
    num_kv_heads (`int`):
        Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
    multiple_of (`int`): The number of multiple of ffn layer.
    ffn_dim_multiplier (`float`): The multiplier factor of ffn layer dimension.
    norm_eps (`float`): The eps for norm layer.
    qk_norm (`bool`): normalization for query and key.
    cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.
    norm_elementwise_affine (`bool`, *optional*, defaults to True),
dimnum_attention_headsnum_kv_headsmultiple_offfn_dim_multipliernorm_epsqk_normcross_attention_dimnorm_elementwise_affinereturnNc
                 \  > [         T
U ]  5         X-  U l        [        R                  " [
        R                  " U/5      5      U l        [        US X-  U(       a  SOS UUSSS[        5       S9
U l
        [        R                  " 5       U R                  l        [        UUX-  U(       a  SOS UUSSS[        5       S9
U l        [        U[        SU-  S-  5      UUS9U l        [#        UUU	S9U l        ['        XU	S	9U l        ['        XU	S	9U l        ['        XU	S	9U l        ['        XU	S	9U l        g )
Nlayer_norm_across_headsh㈵>F)
	query_dimr   dim_headr   headskv_headsepsbiasout_bias	processor   r   )r   	inner_dimr   r   )embedding_dimr   r   )r'   elementwise_affine)super__init__head_dimnn	Parametertorchzerosgater
   r   attn1Identityto_outattn2r	   intfeed_forwardr   norm1r   	ffn_norm1norm2	ffn_norm2norm1_context)selfr   r   r   r   r   r   r   r   r   	__class__s             h/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/lumina_nextdit2d.pyr0   LuminaNextDiTBlock.__init__5   s8    	2LL.A-B!CD	  $/18-d%!,.

 KKM

  3/18-d%!,.

 .%#+/*#1	
 '$;


 !G^_SCZ[
 G^_$%8[rs    hidden_statesattention_maskimage_rotary_embencoder_hidden_statesencoder_masktembcross_attention_kwargsc           	         UnU R                  X5      u  ppU R                  " SU	U	UUUS.UD6nU R                  U5      nU R                  " SU	UUUSS.UD6nXR                  R                  5       R                  SSSS5      -  nX-   nUR                  S5      nU R                  R                  S   " U5      nXR                  S5      R                  5       U R                  U5      -  -   nU R                  U R                  U5      SUR                  S5      -   -  5      nXR                  S5      R                  5       U R                  U5      -  -   nU$ )a  
Perform a forward pass through the LuminaNextDiTBlock.

Parameters:
    hidden_states (`torch.Tensor`): The input of hidden_states for LuminaNextDiTBlock.
    attention_mask (`torch.Tensor): The input of hidden_states corresponse attention mask.
    image_rotary_emb (`torch.Tensor`): Precomputed cosine and sine frequencies.
    encoder_hidden_states: (`torch.Tensor`): The hidden_states of text prompt are processed by Gemma encoder.
    encoder_mask (`torch.Tensor`): The hidden_states of text prompt attention mask.
    temb (`torch.Tensor`): Timestep embedding with text prompt embedding.
    cross_attention_kwargs (`dict[str, Any]`): kwargs for cross attention.
)rG   rJ   rH   query_rotary_embkey_rotary_embN   r    )r=   r7   rA   r:   r6   tanhviewflattenr9   	unsqueezer?   r<   r>   r@   )rB   rG   rH   rI   rJ   rK   rL   rM   residualnorm_hidden_statesgate_msa	scale_mlpgate_mlpself_attn_outputnorm_encoder_hidden_statescross_attn_outputmixed_attn_output
mlp_outputs                     rD   forwardLuminaNextDiTBlock.forwardv   st   , ! =AJJ}<[9i:: 
,"4)-+
 %
 &*%7%78M%N" JJ 
,"<'-
 %
 .		0@0E0EaBPQ0RR,@-55b9

))!,->? #5#5a#8#=#=#?$**]B[#[[&&t~~m'DIL_L_`aLbHb'cd
%(:(:1(=(B(B(Dt~~V`Ga(aarF   )
r7   r:   r<   r>   r@   r6   r1   r=   rA   r?   )T)N)__name__
__module____qualname____firstlineno____doc__r;   floatboolr0   r4   Tensordictstrr   rc   __static_attributes____classcell__rC   s   @rD   r   r   $   s   4 )-?t?t !?t 	?t
 ?t "?t ?t ?t !?t "&?t 
?t ?tR 9=9||9 9  ,,	9
  %||9 ll9 ll9 !%S#X 59 
9 9rF   r   c                      ^  \ rS rSrSr/ SQr\              SS\S\S-  S\S-  S\S-  S	\S-  S
\S-  S\S-  S\S-  S\S-  S\S-  S\	S-  S\	S-  S\S-  S\S-  SS4U 4S jjj5       r
  SS\R                  S\R                  S\R                  S\R                  S\R                  S\\\4   S\\R                     \-  4S jjrSrU =r$ )LuminaNextDiT2DModel   a
  
LuminaNextDiT: Diffusion model with a Transformer backbone.

Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.

Parameters:
    sample_size (`int`): The width of the latent images. This is fixed during training since
        it is used to learn a number of position embeddings.
    patch_size (`int`, *optional*, (`int`, *optional*, defaults to 2):
        The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
    in_channels (`int`, *optional*, defaults to 4):
        The number of input channels for the model. Typically, this matches the number of channels in the input
        images.
    hidden_size (`int`, *optional*, defaults to 4096):
        The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
        hidden representations.
    num_layers (`int`, *optional*, default to 32):
        The number of layers in the model. This defines the depth of the neural network.
    num_attention_heads (`int`, *optional*, defaults to 32):
        The number of attention heads in each attention layer. This parameter specifies how many separate attention
        mechanisms are used.
    num_kv_heads (`int`, *optional*, defaults to 8):
        The number of key-value heads in the attention mechanism, if different from the number of attention heads.
        If None, it defaults to num_attention_heads.
    multiple_of (`int`, *optional*, defaults to 256):
        A factor that the hidden size should be a multiple of. This can help optimize certain hardware
        configurations.
    ffn_dim_multiplier (`float`, *optional*):
        A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
        the model configuration.
    norm_eps (`float`, *optional*, defaults to 1e-5):
        A small value added to the denominator for numerical stability in normalization layers.
    learn_sigma (`bool`, *optional*, defaults to True):
        Whether the model should learn the sigma parameter, which might be related to uncertainty or variance in
        predictions.
    qk_norm (`bool`, *optional*, defaults to True):
        Indicates if the queries and keys in the attention mechanism should be normalized.
    cross_attention_dim (`int`, *optional*, defaults to 2048):
        The dimensionality of the text embeddings. This parameter defines the size of the text representations used
        in the model.
    scaling_factor (`float`, *optional*, defaults to 1.0):
        A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
        overall scale of the model's operations.
)patch_embeddernormffn_normNsample_size
patch_sizein_channelshidden_size
num_layersr   r   r   r   r   learn_sigmar   r   scaling_factorr   c                 b  > [         TU ]  5         Xl        X l        X0l        U(       a  US-  OUU l        X@l        X`l        XF-  U l        Xl	        [        X#USS9U l        [        R                  " [        R                  " U5      5      U l        [#        [%        US5      US9U l        [        R(                  " [+        U5       Vs/ s H  n[-        UUUUU	U
UU5      PM     sn5      U l        [1        U[%        US5      SSSX"-  U R
                  -  S9U l        XF-  S	-  S
:X  d   S5       eg s  snf )Nr   T)ry   rz   	embed_dimr(   i   )r{   r   Fgư>)r-   conditioning_embedding_dimr.   r'   r(   out_dim   r   z+2d rope needs head dim to be divisible by 4)r/   r0   rx   ry   rz   out_channelsr{   r   r1   r~   r   ru   r2   r3   r4   empty	pad_tokenr   mintime_caption_embed
ModuleListranger   layersr   norm_out)rB   rx   ry   rz   r{   r|   r   r   r   r   r   r}   r   r   r~   _rC   s                   rD   r0   LuminaNextDiT2DModel.__init__   sD   $ 	&$&/:K!O&#6 #:,.!kX\
 ekk+&>?"HK.DW#
 mm z* +A #' &'	 +
 2%'*;'=$+d.?.??
 2a71<k>kk<1s   D,rG   timesteprJ   rK   rI   rM   c                 J   U R                  X5      u  ppUR                  UR                  5      nU R                  X#U5      n
UR	                  5       nU R
                   H  nU" UUUUUU
US9nM     U R                  X5      nU R                  =pU	S   u  pUR                  S5      nX-  X-  -  nUSS2SU24   R                  UX-  X-  XU R                  5      nUR                  SSSSSS5      R                  SS5      R                  SS5      nU(       d  U4$ [        US	9$ )
a]  
Forward pass of LuminaNextDiT.

Parameters:
    hidden_states (torch.Tensor): Input tensor of shape (N, C, H, W).
    timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,).
    encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D).
    encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L).
)rL   rM   r   N   rQ   r   r   r   )sample)ru   todevicer   rk   r   r   ry   sizerV   r   permuterW   r   )rB   rG   r   rJ   rK   rI   rM   return_dictmaskimg_sizerL   layerheight_tokenswidth_tokensheightwidth
batch_sizesequence_lengthoutputs                      rD   rc   LuminaNextDiT2DModel.forward#  sK   & ;?:M:Mm:n7X+..}/C/CD&&xU#((*[[E! %'=M ! m: (,6 "''*
!2u7LM%a)9/)9&9:??/1Feievev
 &&q!Q1a8@@AFNNqRST9'v66rF   )r1   r{   rz   r   r   r   r   r   ru   ry   rx   r~   r   )   r   r   i 	      r   N   Nr"   TTi   g      ?)NT)re   rf   rg   rh   ri    _skip_layerwise_casting_patternsr   r;   rj   rk   r0   r4   rl   rm   rn   r   tupler   rc   ro   rp   rq   s   @rD   rs   rs      s   +Z (N$ !""#"&!#*,#'"%+/!%#'#*.'*>l>l $J>l 4Z	>l
 4Z>l $J>l !4Z>l Dj>l 4Z>l "DL>l $,>l D[>l >l !4Z>l >l  
!>l >lN 2637||37 ,,37  %||	37
 ll37  ,,37 !%S#X37 
u||	7	737 37rF   rs   ) typingr   r4   torch.nnr2   configuration_utilsr   r   utilsr   	attentionr	   attention_processorr
   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerre   loggerModuler   rs   rT   rF   rD   <module>r      sd       B  ) C 8 ' Q Q 
		H	%K K\d7:{ d7rF   