
    
3j3E              
          S SK Jr  S SKrS SKJr  SSKJrJr  SSKJ	r	  SSK
Jr  SSKJrJr  SSKJr  S	S
KJr  S	SKJrJr  S	SKJr  S	SKJrJr  S	SKJr  S	SKJr  S	SKJ r J!r!  \RD                  " \#5      r$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\RJ                  5      r(\ " S S\RJ                  5      5       r) " S S\RJ                  5      r*\ " S S\\\	\\5      5       r+g)    )AnyN   )ConfigMixinregister_to_config)PeftAdapterMixin)FromOriginalModelMixin)apply_lora_scalelogging)maybe_allow_in_graph   )FeedForward)MochiAttentionMochiAttnProcessor2_0)
CacheMixin)%MochiCombinedTimestepCaptionEmbedding
PatchEmbed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousRMSNormc                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )MochiModulatedRMSNorm&   epsc                 R   > [         TU ]  5         Xl        [        SUS5      U l        g Nr   F)super__init__r   r   norm)selfr   	__class__s     i/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_mochi.pyr   MochiModulatedRMSNorm.__init__'   s#    AsE*	    c                     UR                   nUR                  [        R                  5      nU R	                  U5      nUb  X-  nUR                  U5      nU$ N)dtypetotorchfloat32r   )r    hidden_statesscalehidden_states_dtypes       r"   forwardMochiModulatedRMSNorm.forward-   sU    +11%((7		-0)1M%(()<=r$   )r   r   r&   )	__name__
__module____qualname____firstlineno__floatr   r.   __static_attributes____classcell__r!   s   @r"   r   r   &   s    +E + r$   r   c                      ^  \ rS rSr  S
S\S\4U 4S jjjrS\R                  S\R                  S\R                  4S jrS	r	U =r
$ )MochiLayerNormContinuous;   embedding_dimconditioning_embedding_dimc                    > [         TU ]  5         [        R                  " 5       U l        [        R
                  " X!US9U l        [        US9U l        g )N)biasr   )	r   r   nnSiLUsiluLinearlinear_1r   r   )r    r;   r<   r   r>   r!   s        r"   r   !MochiLayerNormContinuous.__init__<   s?     	 GGI			"<RVW)c2	r$   xconditioning_embeddingreturnc                 0   UR                   nU R                  U R                  U5      R                  UR                   5      5      nU R	                  USUR                  S5      R                  [        R                  5      -   5      nUR                  U5      $ )N   )r'   rD   rB   r(   r   	unsqueezer)   r*   )r    rF   rG   input_dtyper,   s        r"   r.    MochiLayerNormContinuous.forwardJ   sr    
 gg dii(>?BB177KLIIa!eooa033EMMBBDttK  r$   )rD   r   rB   )h㈵>T)r0   r1   r2   r3   intr   r)   Tensorr.   r5   r6   r7   s   @r"   r9   r9   ;   sY    
 33 %(3 3!<<! !&! 
	! !r$   r9   c                      ^  \ rS rSrSr SS\S\S\S\SS4
U 4S	 jjjrS
\	R                  S\	R                  S\\	R                  \	R                  \	R                  \	R                  4   4S jrSrU =r$ )MochiRMSNormZeroX   zm
Adaptive RMS Norm used in Mochi.

Parameters:
    embedding_dim (`int`): The size of each embedding vector.
r;   
hidden_dimr   elementwise_affinerH   Nc                    > [         TU ]  5         [        R                  " 5       U l        [        R
                  " X5      U l        [        SUS5      U l        g r   )	r   r   r@   rA   rB   rC   linearr   r   )r    r;   rT   r   rU   r!   s        r"   r   MochiRMSNormZero.__init__`   s=     	GGI	ii:AsE*	r$   r+   embc                 Z   UR                   nU R                  U R                  U5      5      nUR                  SSS9u  pEpgU R	                  UR                  [        R                  5      5      SUS S 2S 4   R                  [        R                  5      -   -  nUR                  U5      nXXg4$ )N   rJ   dim)r'   rW   rB   chunkr   r(   r)   r*   )r    r+   rY   r-   	scale_msagate_msa	scale_mlpgate_mlps           r"   r.   MochiRMSNormZero.forwardi   s     ,11kk$))C.)3699QA93F0	Y		-"2"25=="ABa)TUW[T[J\J_J_`e`m`mJnFno%(()<=	;;r$   )rW   r   rB   )rN   F)r0   r1   r2   r3   __doc__rO   r4   boolr   r)   rP   tupler.   r5   r6   r7   s   @r"   rR   rR   X   s     bg+ +.1+8=+Z^+	+ +
<"\\
<05
<	u||U\\5<<E	F
< 
<r$   rR   c                   $  ^  \ rS rSrSr    SS\S\S\S\S\S\S	\S
\SS4U 4S jjjr	 SS\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S-  S\\
R                  \
R                  4   4S jjrSrU =r$ )MochiTransformerBlockv   a  
Transformer block used in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

Args:
    dim (`int`):
        The number of channels in the input and output.
    num_attention_heads (`int`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`):
        The number of channels in each head.
    qk_norm (`str`, defaults to `"rms_norm"`):
        The normalization layer to use.
    activation_fn (`str`, defaults to `"swiglu"`):
        Activation function to use in feed-forward.
    context_pre_only (`bool`, defaults to `False`):
        Whether or not to process context-related conditions with additional layers.
    eps (`float`, defaults to `1e-6`):
        Epsilon value for normalization layers.
r]   num_attention_headsattention_head_dimpooled_projection_dimqk_normactivation_fncontext_pre_onlyr   rH   Nc	                   > [         T	U ]  5         Xpl        SU-  S-  S-  U l        SU-  S-  S-  U l        [        USU-  USS9U l        U(       d  [        USU-  USS9U l        O[        UUUS9U l        [        UUUSUSUUU[        5       SS9U l        [        US	9U l        U R                  (       d	  [        US	9OS U l        [        U5      U l        U R                  (       d	  [        US	9OS U l        [#        XR                  USS
9U l        S U l        U(       d  [#        UU R                  USS
9U l        [        US	9U l        [        US	9U l        g )Nr[   r   r   F)r   rU   )r;   r<   r   rN   )	query_dimheadsdim_headr>   added_kv_proj_dimadded_proj_biasout_dimout_context_dimro   	processorr   r?   )	inner_dimrn   r>   )r   r   ro   ff_inner_dimff_context_inner_dimrR   norm1norm1_contextr9   r   r   attn1r   norm2norm2_contextnorm3norm3_contextr   ff
ff_contextnorm4norm4_context)
r    r]   rj   rk   rl   rm   rn   ro   r   r!   s
            r"   r   MochiTransformerBlock.__init__   sf    	 0Wq[Q.%&)>%>%Bq$H!%c1s7PUV
!1#q;P7PVYns!tD!93+."D $%'3!1-+-

 +s3
CGCXCX2s;^b*3/
CGCXCX2s;^bc->->mbgh)%33+	DO +s3
2s;r$   r+   encoder_hidden_statestembencoder_attention_maskimage_rotary_embc                    U R                  X5      u  pgpU R                  (       d  U R                  X#5      u  ppOU R                  X#5      n
U R                  UU
UUS9u  pXR	                  U[
        R                  " U5      R                  S5      5      -   nU R                  USUR                  S5      R                  [
        R                  5      -   5      nU R                  U5      nXR                  U[
        R                  " U	5      R                  S5      5      -   nU R                  (       d  X R                  U[
        R                  " W5      R                  S5      5      -   nU R                  USWR                  S5      R                  [
        R                  5      -   5      n
U R                  U
5      nX R!                  U[
        R                  " W5      R                  S5      5      -   nX4$ )N)r+   r   r   attention_maskrJ   )r|   ro   r}   r~   r   r)   tanhrK   r   r(   r*   r   r   r   r   r   r   )r    r+   r   r   r   r   norm_hidden_statesr`   ra   rb   norm_encoder_hidden_statesenc_gate_msaenc_scale_mlpenc_gate_mlpattn_hidden_statescontext_attn_hidden_states	ff_outputcontext_ff_outputs                     r"   r.   MochiTransformerBlock.forward   s    =AJJ}<[9i$$TXTfTf%UQ&m\ *.););<Q)X&9=,"<-1	 :D :
6 &

3EuzzRZG[GeGefgGh(ii!ZZI<O<OPQ<R<U<UV[VcVc<d8dfGG./	%

9ejj>R>\>\]^>_(``$$$9<N<N*EJJ|,D,N,Nq,Q= %! *.););%M,C,CA,F,I,I%--,X(X*& !%0J K$9<N<N!5::l#;#E#Ea#H= %! 33r$   )r~   ro   r   r   r{   rz   r|   r}   r   r   r   r   r   r   )rms_normswigluFư>r&   )r0   r1   r2   r3   rd   rO   strre   r4   r   r)   rP   rf   r.   r5   r6   r7   s   @r"   rh   rh   v   s    4 "%!&<<<< !<<  	<<
  #<< << << << << 
<< <<H 15)4||)4  %||)4 ll	)4
 !&)4  ,,-)4 
u||U\\)	*)4 )4r$   rh   c                     ^  \ rS rSrSrSS\S\SS4U 4S jjjrS\R                  4S jr	  SS	\S
\S\S\R                  S-  S\R                  S-  S\R                  4S jjrS\R                  S\R                  S\R                  4S jr  SS\R                  S	\S
\S\S\R                  S-  S\R                  S-  S\\R                  \R                  4   4S jjrSrU =r$ )	MochiRoPE   ae  
RoPE implementation used in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

Args:
    base_height (`int`, defaults to `192`):
        Base height used to compute interpolation scale for rotary positional embeddings.
    base_width (`int`, defaults to `192`):
        Base width used to compute interpolation scale for rotary positional embeddings.
base_height
base_widthrH   Nc                 4   > [         TU ]  5         X-  U l        g r&   )r   r   target_area)r    r   r   r!   s      r"   r   MochiRoPE.__init__  s    &3r$   c                 P    [         R                  " XUS-   XES9nUS S USS  -   S-  $ )NrJ   devicer'   r   )r)   linspace)r    startstopnumr   r'   edgess          r"   _centersMochiRoPE._centers  s4    uC!GFPcr
U12Y&!++r$   
num_framesheightwidthr   r'   c                 X   U R                   X#-  -  S-  n[        R                  " XUS9nU R                  U* U-  S-  X&-  S-  X$U5      nU R                  U* U-  S-  X6-  S-  X4U5      n	[        R                  " XxU	SS9u  pn[        R
                  " XU/SS9R                  SS5      nU$ )	Ng      ?r   r   ij)indexingr   r\   r   )r   r)   aranger   meshgridstackview)r    r   r   r   r   r'   r,   thwgrid_tgrid_hgrid_w	positionss                 r"   _get_positionsMochiRoPE._get_positions
  s     !!V^4<LL%@MM6'E/A-v~/A6SXYMM5&5.1,ema.?PUV!&a$!GKK 8bAFFr1M	r$   freqsposc                    [         R                  " UR                  R                  [         R                  5         [         R
                  " SUR                  [         R                  5      UR                  [         R                  5      5      nS S S 5        [         R                  " U5      n[         R                  " U5      nX44$ ! , (       d  f       N== f)Nznd,dhf->nhf)	r)   autocastr   typer*   einsumr(   cossin)r    r   r   	freqs_cos	freqs_sins        r"   _create_ropeMochiRoPE._create_rope  s    ^^ELL--u}}=LLu}}0EuxxPUP]P]G^_E > IIe$	IIe$	## >=s   AC
Cpos_frequenciesc                 T    U R                  X#XEU5      nU R                  X5      u  pX4$ r&   )r   r   )
r    r   r   r   r   r   r'   r   rope_cosrope_sins
             r"   r.   MochiRoPE.forward&  s4     !!*eUK!..D!!r$   )r   )   r   )NN)r0   r1   r2   r3   rd   rO   r   r)   rP   r   r   r'   r   r   rf   r.   r5   r6   r7   s   @r"   r   r      s8   4C 43 4 4 4
,5<< , '+$(  	
 t# {{T! 
&$%,, $U\\ $ell $ '+$("" " 	"
 " t#" {{T!" 
u||U\\)	*" "r$   r   c                   Z  ^  \ rS rSrSrSrS/rSS/r\            SS\	S	\	S
\	S\	S\	S\	S\	S-  S\
S\	S\	S\
S\	SS4U 4S jjj5       r\" S5        SS\R                  S\R                  S\R                  S\R                  S\\
\4   S-  S\S\R                  4S jj5       rSrU =r$ ) MochiTransformer3DModeli4  a  
A Transformer model for video-like data introduced in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

Args:
    patch_size (`int`, defaults to `2`):
        The size of the patches to use in the patch embedding layer.
    num_attention_heads (`int`, defaults to `24`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`, defaults to `128`):
        The number of channels in each head.
    num_layers (`int`, defaults to `48`):
        The number of layers of Transformer blocks to use.
    in_channels (`int`, defaults to `12`):
        The number of channels in the input.
    out_channels (`int`, *optional*, defaults to `None`):
        The number of channels in the output.
    qk_norm (`str`, defaults to `"rms_norm"`):
        The normalization layer to use.
    text_embed_dim (`int`, defaults to `4096`):
        Input dimension of text embeddings from the text encoder.
    time_embed_dim (`int`, defaults to `256`):
        Output dimension of timestep embeddings.
    activation_fn (`str`, defaults to `"swiglu"`):
        Activation function to use in feed-forward.
    max_sequence_length (`int`, defaults to `256`):
        The maximum sequence length of text embeddings supported.
Trh   patch_embedr   N
patch_sizerj   rk   
num_layersrl   in_channelsout_channelsrm   text_embed_dimtime_embed_dimrn   max_sequence_lengthrH   c                   > [         TU ]  5         X#-  nU=(       d    Un[        UUUS S9U l        [	        UUU	U
SS9U l        [        R                  " [        R                  " SX#S-  4S5      5      U l
        [        5       U l        [        R                  " [        U5       Vs/ s H  n[        UUUUUUXS-
  :H  S9PM     sn5      U l        [#        UUS	S
SS9U l        [        R&                  " XU-  U-  5      U l        S	U l        g s  snf )N)r   r   	embed_dimpos_embed_type   )r;   rl   r   r   rj   r   r   g        rJ   )r]   rj   rk   rl   rm   rn   ro   Fr   
layer_norm)rU   r   	norm_type)r   r   r   r   r   
time_embedr@   	Parameterr)   fullr   r   rope
ModuleListrangerh   transformer_blocksr   norm_outrC   proj_outgradient_checkpointing)r    r   rj   rk   r   rl   r   r   rm   r   r   rn   r   ry   ir!   s                  r"   r    MochiTransformer3DModel.__init__V  s+     	'<	#2{%!#	
 @#"7)) !
  "||EJJ;NfgPg7hjm,noK	"$-- z* +A &!(;'9*?#"/%&q.%8 +#
 /$"
 		)*-D|-ST&+#/s   #Dattention_kwargsr+   r   timestepr   return_dictc           	         UR                   u  pxpnU R                  R                  nX-  nX-  nU R                  UUUUR                  S9u  pUR                  SSSSS5      R                  SS5      nU R                  U5      nUR                  SUS45      R                  SS5      nU R                  U R                  U	UUUR                  [        R                  S9n[        U R                  5       HW  u  nn[        R                   " 5       (       a+  U R"                  (       a  U R%                  UUUUUU5      u  pMK  U" UUUUUS	9u  pMY     U R'                  X5      nU R)                  U5      nUR+                  XyXXS5      nUR                  SS
SSSSS5      nUR+                  USXU5      nU(       d  U4$ [-        US9$ )N)hidden_dtyper   r   rJ   r   r[   r   r   )r+   r   r   r   r         )sample)shapeconfigr   r   r'   permuteflattenr   	unflattenr   r   r   r)   r*   	enumerater   is_grad_enabledr   _gradient_checkpointing_funcr   r   reshaper   )r    r+   r   r   r   r   r   
batch_sizenum_channelsr   r   r   ppost_patch_heightpost_patch_widthr   r   r   blockoutputs                       r"   r.   MochiTransformer3DModel.forward  s    ?L>Q>Q;
*eKK"""K :&*oo!"&,,	 '6 '
# &--aAq!<DDQJ((7%//J3CDLLQPQR99   ''-- % 
 "$"9"9:HAu$$&&4+F+F7;7X7X!)*$844 8="/*?+A%5844 ;$ m:m4%--jFWklqst%--aAq!QB&&z2z5Q9'v66r$   )r   r   r   r   r   r   r   r   )r         0   i      Nr   i      r   r  )NT)r0   r1   r2   r3   rd    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   rO   r   r   r	   r)   rP   
LongTensordictr   re   r.   r5   r6   r7   s   @r"   r   r   4  sj   8 (,$01(5v'>$ #%"%%)#'!"!%#&>,>, !>,  	>,
 >,  #>, >, Dj>, >, >, >, >, !>, 
>, >,@ () 37 >7||>7  %||>7 ""	>7
 !&>7 sCx.4/>7 >7 
>7 *>7r$   r   ),typingr   r)   torch.nnr@   configuration_utilsr   r   loadersr   loaders.single_file_modelr   utilsr	   r
   utils.torch_utilsr   	attentionr   attention_processorr   r   cache_utilsr   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerr0   loggerModuler   r9   rR   rh   r   r    r$   r"   <module>r&     s        B ' ? . 5 # G $ J 7 ' ; 
		H	%BII *!ryy !:<ryy << |4BII |4 |4~;"		 ;"| a7j+7GI_ak a7 a7r$   