
    
3jM                        S SK r S SKrS SKJr  S SKJs  Jr  SSKJrJ	r	  SSK
Jr  SSKJr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJr  \R2                  " \5      r " S S\R8                  5      r " S S\R8                  5      r " S S\R8                  5      r " S S5      r  " S S\R8                  5      r! " S S\\5      r"g)    N   )ConfigMixinregister_to_config)logging   )	Attention)TimestepEmbedding	Timestepsget_2d_sincos_pos_embed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormRMSNormc                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	OmniGenFeedForward!   hidden_sizeintermediate_sizec                    > [         TU ]  5         [        R                  " USU-  SS9U l        [        R                  " X!SS9U l        [        R                  " 5       U l        g )Nr   Fbias)super__init__nnLineargate_up_proj	down_projSiLUactivation_fn)selfr   r   	__class__s      k/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_omnigen.pyr   OmniGenFeedForward.__init__"   sJ    IIk17H3HuU#4NWWY    hidden_statesreturnc                     U R                  U5      nUR                  SSS9u  p2X R                  U5      -  nU R                  U5      $ )Nr   dim)r   chunkr   r   )r    r%   	up_statesgates       r"   forwardOmniGenFeedForward.forward)   sH    %%m4	#//!/4 2 24 88	~~i((r$   )r   r   r   )__name__
__module____qualname____firstlineno__intr   torchTensorr.   __static_attributes____classcell__r!   s   @r"   r   r   !   s6    'C 'C ')U\\ )ell ) )r$   r   c                      ^  \ rS rSr       SS\S\S\S\S\S\S\4U 4S	 jjjrS
 rS\	R                  S\S\	R                  4S jr SS\	R                  S\S\	R                  S\	R                  4S jjrSrU =r$ )OmniGenPatchEmbed0   
patch_sizein_channels	embed_dimr   interpolation_scalepos_embed_max_size	base_sizec                 T  > [         T	U ]  5         [        R                  " X#X4XS9U l        [        R                  " X#X4XS9U l        Xl        XPl        X`l        [        UU R                  UU R                  SS9nU R                  SUR                  5       R                  S5      SS9  g )N)kernel_sizestrider   pt)rB   r@   output_type	pos_embedr   T)
persistent)r   r   r   Conv2doutput_image_projinput_image_projr=   r@   rA   r   register_bufferfloat	unsqueeze)
r    r=   r>   r?   r   r@   rA   rB   rH   r!   s
            r"   r   OmniGenPatchEmbed.__init__1   s     	!#0HQ["
 !#		0HQ[!
 %#6 "4+## $ 8 8
	 	[)//*;*E*Ea*HUYZr$   c                 0   U R                   c  [        S5      eXR                  -  nX R                  -  nXR                   :  a  [        SU SU R                    S35      eX R                   :  a  [        SU SU R                    S35      eU R                   U-
  S-  nU R                   U-
  S-  nU R                  R	                  SU R                   U R                   S	5      nUSS2X3U-   2XDU-   2SS24   nUR	                  SS	UR
                  S	   5      nU$ )
z2Crops positional embeddings for SD3 compatibility.Nz.`pos_embed_max_size` must be set for cropping.zHeight (z/) cannot be greater than `pos_embed_max_size`: .zWidth (r      r(   )rA   
ValueErrorr=   rH   reshapeshape)r    heightwidthtopleftspatial_pos_embeds         r"   _cropped_pos_embed$OmniGenPatchEmbed._cropped_pos_embedQ   s>   ""*MNN??*(+++6("QRVRiRiQjjkl  ***% OPTPgPgOhhij  &&/A5''%/A5 NN221d6M6MtOfOfhjk-aV|1CTSXLEXZ[.[\-55a=N=T=TUW=XY  r$   r%   is_input_imager&   c                     U(       a  U R                  U5      nOU R                  U5      nUR                  S5      R                  SS5      nU$ )Nr   rS   )rL   rK   flatten	transpose)r    r%   r^   s      r"   _patch_embeddings#OmniGenPatchEmbed._patch_embeddingsh   sH     11-@M 22=AM%--a0::1a@r$   padding_latentc                    [        U[        5      (       a  Uc  S /[        U5      -  n/ n[        X5       H  u  pVUR                  SS  u  pxU R                  XR5      nU R                  Xx5      n	XY-   nUb/  [        R                  " XVR                  UR                  5      /SS9nUR                  U5        M     U$ UR                  SS  u  pxU R                  Xx5      n	U R                  X5      nX-   nU$ )Nr)   )
isinstancelistlenziprV   rb   r\   r5   cattodeviceappend)
r    r%   r^   rd   patched_latents
sub_latentpaddingrW   rX   rH   s
             r"   r.   OmniGenPatchEmbed.forwardp   s     mT**%"&#m*<!< O'*='I#
 * 0 0 5!33JO
 33FB	'3
&!&J

:CTCT8U+V\^!_J&&z2 (J  *//4MF//>I 22=QM+7Or$   )rL   r@   rK   r=   rA   )r      i   TrS      @   )N)r0   r1   r2   r3   r4   boolrN   r   r\   r5   r6   rb   r.   r7   r8   r9   s   @r"   r;   r;   0   s     %&"%[[ [ 	[
 [ #[  [ [ [@!.u|| T V[VbVb  ae"\\;?QVQ]Q]	 r$   r;   c                   4   ^  \ rS rSr SU 4S jjrS rSrU =r$ )OmniGenSuScaledRotaryEmbedding   c           	      P  > [         TU ]  5         Xl        X l        X@l        SU R                  [
        R                  " SU R                  S[
        R                  S9R                  5       U R                  -  -  -  nU R                  SUSS9  US   U l
        US	   U l        X0l        g )
N      ?r   r   )dtypeinv_freqF)tensorrI   short_factorlong_factor)r   r   r*   max_position_embeddingsbaser5   arangeint64rN   rM   r   r    original_max_position_embeddings)r    r*   r   r   r   rope_scalingr}   r!   s          r"   r   'OmniGenSuScaledRotaryEmbedding.__init__   s     	'>$	$))Q!5;;(W(](](_bfbjbj(jklZUK(8'60P-r$   c                 X   [         R                  " U5      S-   nX0R                  :  a9  [         R                  " U R                  [         R
                  UR                  S9nO8[         R                  " U R                  [         R
                  UR                  S9n[         R                  " SU R                  S[         R                  UR                  S9R                  5       U R                  -  nSX@R                  U-  -  -  U l        U R                  S S S 2S 4   R                  5       R                  UR                  S   SS5      nUS S 2S S S 24   R                  5       nUR                  R                   n[#        U[$        5      (       a  US:w  a  UOSn[         R&                  " US	S
9   UR                  5       UR                  5       -  R)                  SS5      n	[         R*                  " X4SS9S   n
U R,                  U R                  -  nUS::  a  SnON[.        R0                  " S[.        R2                  " U5      [.        R2                  " U R                  5      -  -   5      nU
R5                  5       U-  nU
R7                  5       U-  nS S S 5        X4$ ! , (       d  f       WW4$ = f)NrS   )r|   rm   r   r   r{   r(   mpscpuF)device_typeenabledr)   )r5   maxr   r~   r   float32rm   r   r   r*   r   rN   r   r}   expandrV   typerg   strautocastra   rk   r   mathsqrtlogcossin)r    r%   position_idsseq_lenext_factorsinv_freq_shapeinv_freq_expandedposition_ids_expandedr   freqsembscalescaling_factorr   r   s                  r"   r.   &OmniGenSuScaledRotaryEmbedding.forward   s)   ))L)A-:::,,t'7'7u}}UbUiUijK,,t'8'8VcVjVjkK LLDHHau{{=CWCWX^^`cgckckk 	 {YY-FFG MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @ $**//%/S%A%AkUZFZk`e^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3A6C0043X3XXE|!$!%1txx$JoJoAp/p+p!q'')n,C'')n,C D x DC Cxs   ,C!J
J))r   r*   r}   r   r   r   r   )      '  N)r0   r1   r2   r3   r   r.   r7   r8   r9   s   @r"   rx   rx      s    swQ  r$   rx   c                       \ rS rSrSrS r  SS\S\R                  S\R                  S\R                  S-  S	\R                  S-  S
\R                  4S jjr	Sr
g)OmniGenAttnProcessor2_0   z
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
used in the OmniGen model.
c                 D    [        [        S5      (       d  [        S5      eg )Nscaled_dot_product_attentionzPAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)hasattrFImportError)r    s    r"   r    OmniGenAttnProcessor2_0.__init__   s!    q899pqq :r$   Nattnr%   encoder_hidden_statesattention_maskimage_rotary_embr&   c                    UR                   u  pgnUR                  U5      n	UR                  U5      n
UR                  U5      nU	R	                  5       u  pnU
R                   S   nXR
                  -  nUU-  nU	R                  USUR
                  U5      R                  SS5      n	U
R                  USUU5      R                  SS5      n
UR                  USUU5      R                  SS5      nUb  SSKJ	n  U" XSS9n	U" XSS9n
[        R                  " XXS9nUR                  SS5      R                  U	5      nUR                  XUR                  5      nUR                  S   " U5      nU$ )	Nr(   rS   r   )apply_rotary_embrf   )use_real_unbind_dim)	attn_maskr   )rV   to_qto_kto_vsizeheadsviewra   
embeddingsr   r   r   type_asrU   out_dimto_out)r    r   r%   r   r   r   
batch_sizesequence_length_querykeyvaluebszq_len	query_dim	inner_dimhead_dimkv_headsr   s                      r"   __call__ OmniGenAttnProcessor2_0.__call__   s_    *7)<)<&
Q 		-(ii-.		/0 %

IIIbM	

* (

:r4::x@JJ1aPhhz2x:DDQJ

:r8X>HHAN '5$URTUE"3bQC66u5c%//15==eD%--c$,,GA}5r$    )NN)r0   r1   r2   r3   __doc__r   r   r5   r6   r   r7   r   r$   r"   r   r      sw    
r /304%% ||%  %||	%
 t+%  ,,-% 
% %r$   r   c                      ^  \ rS rSrS\S\S\S\S\SS4U 4S	 jjrS
\R                  S\R                  S\R                  S\R                  4S jr	Sr
U =r$ )OmniGenBlock   r   num_attention_headsnum_key_value_headsr   rms_norm_epsr&   Nc                    > [         TU ]  5         [        XS9U l        [	        UUX-  UUSUS[        5       S9	U l        [        XS9U l        [        X5      U l	        g )NepsF)	r   cross_attention_dimdim_headr   r   r   r   out_bias	processor)
r   r   r   input_layernormr   r   	self_attnpost_attention_layernormr   mlp)r    r   r   r   r   r   r!   s         r"   r   OmniGenBlock.__init__   se     	&{E"! + 7%(-/

 )0(N%%kEr$   r%   r   r   c                     U R                  U5      nU R                  UUUUS9nX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)r%   r   r   r   )r   r   r   r   )r    r%   r   r   norm_hidden_statesattn_output	ff_outputs          r"   r.   OmniGenBlock.forward  sn     "11-@nn,"4)-	 % 
 &3 "::=IHH/0	%1r$   )r   r   r   r   )r0   r1   r2   r3   r4   rN   r   r5   r6   r.   r7   r8   r9   s   @r"   r   r      s    FF !F !	F
 F F 
F2"\\;@<<[`[g[g	 r$   r   c            (         ^  \ rS rSrSrSrS/r/ SQr\                   S'S\	S\	S	\	S
\
S\	S\	S\	S\	S\	S\	S\	S\	S\	S\S\	S\	S\S\	S\4&U 4S jjj5       rS\R                   S\\R                      S\S\R                   S-  4S jr S(S \R                   S!\	\
-  \R&                  -  S\R                   S\\R                      S\\	\\	   4   S"\R                   S#\R                   S$\S\\\R                      -  4S% jjrS&rU =r$ ))OmniGenTransformer2DModeli  a  
The Transformer model introduced in OmniGen (https://huggingface.co/papers/2409.11340).

Parameters:
    in_channels (`int`, defaults to `4`):
        The number of channels in the input.
    patch_size (`int`, defaults to `2`):
        The size of the spatial patches to use in the patch embedding layer.
    hidden_size (`int`, defaults to `3072`):
        The dimensionality of the hidden layers in the model.
    rms_norm_eps (`float`, defaults to `1e-5`):
        Eps for RMSNorm layer.
    num_attention_heads (`int`, defaults to `32`):
        The number of heads to use for multi-head attention.
    num_key_value_heads (`int`, defaults to `32`):
        The number of heads to use for keys and values in multi-head attention.
    intermediate_size (`int`, defaults to `8192`):
        Dimension of the hidden layer in FeedForward layers.
    num_layers (`int`, default to `32`):
        The number of layers of transformer blocks to use.
    pad_token_id (`int`, default to `32000`):
        The id of the padding token.
    vocab_size (`int`, default to `32064`):
        The size of the vocabulary of the embedding vocabulary.
    rope_base (`int`, default to `10000`):
        The default theta value to use when creating RoPE.
    rope_scaling (`dict`, optional):
        The scaling factors for the RoPE. Must contain `short_factor` and `long_factor`.
    pos_embed_max_size (`int`, default to `192`):
        The maximum size of the positional embeddings.
    time_step_dim (`int`, default to `256`):
        Output dimension of timestep embeddings.
    flip_sin_to_cos (`bool`, default to `True`):
        Whether to flip the sin and cos in the positional embeddings when preparing timestep embeddings.
    downscale_freq_shift (`int`, default to `0`):
        The frequency shift to use when downscaling the timestep embeddings.
    timestep_activation_fn (`str`, default to `silu`):
        The activation function to use for the timestep embeddings.
Tr   )patch_embeddingembed_tokensnormNr>   r=   r   r   r   r   r   
num_layerspad_token_id
vocab_sizer   r   	rope_baser   rA   time_step_dimflip_sin_to_cosdownscale_freq_shifttimestep_activation_fnc                 R  > [         TU ]  5         Xl        Xl        [	        UUUUS9U l        [        UUU5      U l        [        UUU5      U l	        [        UUU5      U l
        [        R                  " XU	5      U l        [        X5-  UUUUS9U l        [        R                   " [#        U5       Vs/ s H  n[%        X5XgU5      PM     sn5      U l        [)        X4S9U l        [-        USSSS9U l        [        R0                  " X2U-  U R                  -  SS	9U l        SU l        g s  snf )
N)r=   r>   r?   rA   )r   r   r   r   r   Fgư>rS   )norm_elementwise_affinenorm_eps	chunk_dimTr   )r   r   r>   out_channelsr;   r   r
   	time_projr	   
time_token
t_embedderr   	Embeddingr   rx   rope
ModuleListranger   layersr   r   r   norm_outr   proj_outgradient_checkpointing)r    r>   r=   r   r   r   r   r   r   r   r   r   r   r   r   rA   r   r   r   r   r   r!   s                        r"   r   "OmniGenTransformer2DModel.__init__H  s,   . 	&'0!#!1	 
 #=/CWX+M;H^_+M;H^_LL,O2.$;-M%
	 mm z**A [?Rgst*
 K:	$[%Z^jkl		+J/FIZIZ/Zaef&+#s   0D$	input_idsinput_img_latentsinput_image_sizesr&   c                 H   Uc  g U Vs/ s H  oDR                  U R                  5      PM     nnU R                  U5      nSnU R                  USS9nUR	                  5        H7  nX8    H,  u  pXv   R                  UR                  5      XXX24'   US-  nM.     M9     U$ s  snf )Nr   Tr^   rS   )rl   r|   r   r   keys)r    r  r  r  xcondition_tokensinput_img_inxinput_image_tokensb_inx	start_inxend_inxs              r"   _get_multimodal_embeddings4OmniGenTransformer2DModel._get_multimodal_embeddings  s     7HI7H!TT$**-7HI,,Y7!112CTX1Y&++-E&7&>"	=O=^=a=a$**> 	(9!9: " '? .   Js   $Br%   timestepr   r   return_dictc	                    UR                   u  ppU R                  R                  nX-  X-  pU R                  USS9nUR	                  S5      nU R                  U5      R                  U5      nU R                  U5      R                  S5      nU R                  U5      nU R                  X4U5      nUb  [        R                  " UUU/SS9nO[        R                  " UU/SS9nUR	                  S5      nUR                  SU5      R                  5       nUbh  UR                  5       S:X  aT  UR                   n[        R"                  " U5      R$                  nSU-
  U-  nUR                  S5      R                  U5      nU R'                  X5      nU R(                   HJ  n[        R*                  " 5       (       a&  U R,                  (       a  U R/                  UXU5      nMC  U" XUS9nML     U R1                  U5      nUS S 2U* S 24   nU R3                  UUS9nU R5                  U5      nUR7                  XXUS5      nUR9                  S	S
SSSS5      R;                  SS
5      R;                  SS5      nU(       d  U4$ [=        US9$ )NFr
  rS   r)   r(   r   )r   r   )tembr      r   rs   )sample)rV   configr=   r   r   r   r   r   rO   r   r  r5   rk   r   longr*   r|   finfominr   r  is_grad_enabledr  _gradient_checkpointing_funcr   r  r  rU   permuter`   r   )r    r%   r  r  r  r  r   r   r  r   num_channelsrW   rX   ppost_patch_heightpost_patch_widthnum_tokens_for_output_imagetimestep_projr   r  r  
seq_lengthr|   	min_dtyper   blockoutputs                              r"   r.   !OmniGenTransformer2DModel.forward  su    3@2E2E/
&KK"".4k5:+ ,,]5,Q&3&8&8&;#x088G__]3==a@
}-::9Yjk'!II'7]&SYZ[M!IIz=&AqIM"''*
#((Z8==? %.*<*<*>!*C!''EE*..I.0I=N+55a8@@ON  99]A [[E$$&&4+F+F $ A A=:J! !&meu v ! 		-0%a*E)E)F&FGm$?m4%--jM]bcegh&&q!Q1a8@@AFNNqRST9'v66r$   )r   r  r>   r  r   r  r   r   r  r   r   r   r   )rs   r   i   gh㈵>    r-  i    r-  i }  i@}  r   r   r   Nrt      Tr   silu)T)r0   r1   r2   r3   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   r4   rN   dictrv   r   r   r5   r6   rh   r  FloatTensorr   tupler.   r7   r8   r9   s   @r"   r   r     s   &P (,$'('R$ "#%#%!%!'-04!"%  $$%&,)9,9, 9, 	9,
 9, !9, !9, 9, 9, 9, 9, "%9, +.9, 9, 9,   !9," #9,$ %9,& "'9,( !$)9, 9,v  :>u||:L ae 		 8 !=7||=7 + 1 11=7 <<	=7
  -=7  T#Y/=7 =7 ll=7 =7 
"E%,,$7	7=7 =7r$   r   )#r   r5   torch.nnr   torch.nn.functional
functionalr   configuration_utilsr   r   utilsr   attention_processorr   r   r	   r
   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerr0   loggerModuler   r;   rx   r   r   r   r   r$   r"   <module>rB     s         B  + N N 7 ' 1 
		H	%) )U		 Up0RYY 0f/ /d+299 +\y7
K y7r$   