
    
3jx                        S SK Jr  S SKrS SKJr  SSKJrJr  SSKJr  SSK	J
r
Jr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  \R,                  " \5      rS\S\S\S\S\R4                  S\R6                  4S jrS\R6                  S\R6                  S\R6                  4S jr " S S5      r " S S\R>                  \5      r  " S S\R>                  5      r! " S S\R>                  5      r" " S  S!\R>                  5      r# " S" S#\R>                  5      r$ " S$ S%\R>                  5      r%S&\R6                  S\S\R6                  4S' jr&S(\R6                  S\S)\R6                  S\R6                  4S* jr' " S+ S,\\\
5      r(g)-    )AnyN)nn   )ConfigMixinregister_to_config)logging   )AttentionMixinAttentionModuleMixin)dispatch_attention_fn)get_timestep_embedding)Transformer2DModelOutput)
ModelMixin)RMSNorm
batch_sizeheightwidth
patch_sizedevicereturnc                 .   [         R                  " X-  X#-  SUS9n[         R                  " X-  US9SS2S4   US'   [         R                  " X#-  US9SSS24   US'   UR                  X-  X#-  -  S5      R	                  S5      R                  U SS5      $ )aX  
Generates 2D patch coordinate indices for a batch of images.

Args:
    batch_size (`int`):
        Number of images in the batch.
    height (`int`):
        Height of the input images (in pixels).
    width (`int`):
        Width of the input images (in pixels).
    patch_size (`int`):
        Size of the square patches that the image is divided into.
    device (`torch.device`):
        The device on which to create the tensor.

Returns:
    `torch.Tensor`:
        Tensor of shape `(batch_size, num_patches, 2)` containing the (row, col) coordinates of each patch in the
        image grid.
r	   )r   N.r   .   r   r   )torchzerosarangereshape	unsqueezerepeat)r   r   r   r   r   img_idss         g/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_prx.pyget_image_idsr#   !   s    , kk&.0CQvVGll6#7G4PGFOll5#6vFtQwOGFO??F0U5HI1MWWXYZaablnoqrss    xq	freqs_cisc                 .   U R                  5       R                  " / U R                  SS QSPSPSP76 nUR                  U R                  UR
                  S9nUS   US   -  US   US   -  -   nUR                  " U R                  6 R                  U 5      $ )a  
Applies rotary positional embeddings (RoPE) to a query tensor.

Args:
    xq (`torch.Tensor`):
        Input tensor of shape `(..., dim)` representing the queries.
    freqs_cis (`torch.Tensor`):
        Precomputed rotary frequency components of shape `(..., dim/2, 2)` containing cosine and sine pairs.

Returns:
    `torch.Tensor`:
        Tensor of the same shape as `xq` with rotary embeddings applied.
Nr   r	   r   dtyper   r   )floatr   shapetor   r*   type_as)r%   r&   xq_xq_outs       r"   
apply_roper1   =   s     ((*


6bhhsm
6R
6
6A
6CBIISYY?IvV,y/@3v;/NNF>>288$,,R00r$   c                       \ rS rSrSrSrSrS r   SSSS\R                  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  4S jjr
Srg)PRXAttnProcessor2_0R   z
Processor for implementing PRX-style attention with multi-source tokens and RoPE. Supports multiple attention
backends (Flash Attention, Sage Attention, etc.) via dispatch_attention_fn.
Nc                 l    [        [        R                  R                  S5      (       d  [	        S5      eg )Nscaled_dot_product_attentionzHPRXAttnProcessor2_0 requires PyTorch 2.0, please upgrade PyTorch to 2.0.)hasattrr   r   
functionalImportError)selfs    r"   __init__PRXAttnProcessor2_0.__init__[   s,    uxx**,JKKhii Lr$   attnPRXAttentionhidden_statesencoder_hidden_statesattention_maskimage_rotary_embr   c           	      d   Uc  [        S5      eUR                  U5      nUR                  u  pn
UR                  XSUR                  UR
                  5      nUR                  SSSSS5      nUS   US   US   pnUR                  U5      nUR                  U5      nUR                  U5      nUR                  u  pn
UR                  XSUR                  UR
                  5      nUR                  SSSSS5      nUS   US   nnUR                  U5      nUb  [        X5      n[        X5      n[        R                  " UU4SS9n[        R                  " UU4SS9nSnUGb  UR                  u  nn
nn
UR                  S   nUR                  5       S:w  a  [        S	UR                   35      eUR                  S
   U:w  a  [        SUR                  S
    SU 35      eUR                  n[        R                   " UU4[        R"                  US9nUR%                  U[        R"                  S9n[        R                  " UU/S
S9nUSS2SSSS24   R'                  S
UR                  US
5      nUR)                  SS5      nUR)                  SS5      nUR)                  SS5      n[+        UUUUU R,                  U R.                  S9nUR                  u  nn n!n"UR                  UU U!U"-  5      nUR0                  S   " U5      n[3        UR0                  5      S:  a  UR0                  S   " U5      nU$ )ae  
Apply PRX attention using PRXAttention module.

Args:
    attn: PRXAttention module containing projection layers
    hidden_states: Image tokens [B, L_img, D]
    encoder_hidden_states: Text tokens [B, L_txt, D]
    attention_mask: Boolean mask for text tokens [B, L_txt]
    image_rotary_emb: Rotary positional embeddings [B, 1, L_img, head_dim//2, 2, 2]
NzLPRXAttnProcessor2_0 requires 'encoder_hidden_states' containing text tokens.r   r	   r   r      dimz"Unsupported attention_mask shape: r(   zattention_mask last dim z must equal text length r*   r   r)   )	attn_maskbackendparallel_config)
ValueErrorimg_qkv_projr,   r   headshead_dimpermutenorm_qnorm_ktxt_kv_projnorm_added_kr1   r   catrF   r   onesboolr-   expand	transposer   _attention_backend_parallel_configto_outlen)#r:   r=   r?   r@   rA   rB   kwargsimg_qkvBL_img_img_qimg_kimg_vtxt_kvL_txttxt_ktxt_vkvattn_mask_tensorbsl_imgl_txtr   ones_img
joint_maskquerykeyvalueattn_outputr   seq_len	num_headsrN   s#                                      r"   __call__PRXAttnProcessor2_0.__call___   s,   ( !(kll ##M2mm!//!Atzz4==I//!Q1a0%aj'!*gaje E"E" !!"78ll!!TZZG1aA.ay&)u !!%( 'u7Eu7E IIuen!,IIuen!,  %#kkOB5!KKNE!!#q( #EnFZFZE[!\]]##B'50 #;N<P<PQS<T;UUmnsmt!uvv\\Fzz2u+UZZOH+..fEJJ.ONNH#=2FJ)!T4*:;BB2tzzSXZ\] 1%kk!QAq!+&++ 11
 4?3D3D0
GY!))*gy8?ST kk!n[1t{{a++a.5Kr$    NNN)__name__
__module____qualname____firstlineno____doc__rY   rZ   r;   r   Tensorrw   __static_attributes__ry   r$   r"   r3   r3   R   s    
 j 6:.204^^ ||^  %||d2	^
 t+^  ,,-^ 
^ ^r$   r3   c                      ^  \ rS rSrSr\r\/r      SS\S\S\S\	S\	S	\
4U 4S
 jjjr   SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  4
S jjrSrU =r$ )r>      z
PRX-style attention module that handles multi-source tokens and RoPE. Similar to FluxAttention but adapted for
PRX's architecture.
N	query_dimrM   dim_headbiasout_biasepsc                   > [         TU ]  5         X l        X0l        X2-  U l        Xl        [        R                  " XS-  US9U l        [        U R                  USS9U l
        [        U R                  USS9U l        [        R                  " XS-  US9U l        [        U R                  USS9U l        [        R                  " / 5      U l        U R                  R!                  [        R                  " U R                  XS95        U R                  R!                  [        R"                  " S5      5        Uc  U R%                  5       nU R'                  U5        g )Nr   r   T)r   elementwise_affiner	           )superr;   rM   rN   	inner_dimr   r   LinearrL   r   rP   rQ   rR   rS   
ModuleListr[   appendDropout_default_processor_clsset_processor)	r:   r   rM   r   r   r   r   	processor	__class__s	           r"   r;   PRXAttention.__init__   s    	
 !)"IIiQTJdmmNdmmN99YADI#DMMstTmmB'299T^^YNO2::c?+335I9%r$   r?   r@   rA   rB   r   c                 2    U R                   " U U4UUUS.UD6$ )N)r@   rA   rB   )r   )r:   r?   r@   rA   rB   r]   s         r"   forwardPRXAttention.forward   s5     ~~
 #8)-
 
 	
r$   )
rN   rM   rL   r   rS   rQ   rP   r   r[   rR   )   @   FFư>Nrz   )r{   r|   r}   r~   r   r3   r   _available_processorsintrV   r+   r;   r   r   r   r   __classcell__r   s   @r"   r>   r>      s    
 101
 && & 	&
 & & & &H 6:.204
||
  %||d2
 t+	

  ,,-
 

 
r$   r>   c                      ^  \ rS rSrSrS\S\S\\   4U 4S jjrS\R                  S\S\S\R                  4S	 jr
S
\R                  S\R                  4S jrSrU =r$ )
PRXEmbedND   a  
N-dimensional rotary positional embedding.

This module creates rotary embeddings (RoPE) across multiple axes, where each axis can have its own embedding
dimension. The embeddings are combined and returned as a single tensor

Args:
    dim (int):
    Base embedding dimension (must be even).
    theta (int):
    Scaling factor that controls the frequency spectrum of the rotary embeddings.
    axes_dim (list[int]):
    list of embedding dimensions for each axis (each must be even).
rF   thetaaxes_dimc                 F   > [         TU ]  5         Xl        X l        X0l        g N)r   r;   rF   r   r   )r:   rF   r   r   r   s       r"   r;   PRXEmbedND.__init__  s    
 r$   posr   c                    US-  S:X  d   eUR                   R                  S:H  nUR                   R                  S:H  nU(       d  U(       a  [        R                  O[        R                  n[        R
                  " SUSXaR                   S9U-  nSX7-  -  nUR                  S5      UR                  S5      -  n	[        R                  " [        R                  " U	5      [        R                  " U	5      * [        R                  " U	5      [        R                  " U	5      /SS9n	U	R                  " / U	R                  S S QSPSP76 n	U	R                  5       $ )	Nr	   r   mpsnpurG   g      ?r(   rE   )r   typer   float32float64r   r   stackcossinr   r,   r+   )
r:   r   rF   r   is_mpsis_npur*   scaleomegaouts
             r"   ropePRXEmbedND.rope  s    Qw!||E)E)"(FQQeJJG#Mu|$mmB%//!"44kk599S>EIIcN?EIIcNEIIVYN[acd kk0399Sb>010a0yy{r$   idsc                    UR                   S   n[        R                  " [        U5       Vs/ s H6  o0R	                  US S 2S S 2U4   U R
                  U   U R                  5      PM8     snSS9nUR                  S5      $ s  snf )Nr(   rE   r   )r,   r   rT   ranger   r   r   r   )r:   r   n_axesiembs        r"   r   PRXEmbedND.forward#  sp    2iiLQRXMZMqYYs1a7|T]]1%5tzzBMZ
 }}Q [s   =B)r   rF   r   )r{   r|   r}   r~   r   r   listr;   r   r   r   r   r   r   r   s   @r"   r   r      so    !C ! !tCy ! 3 s u||   5<<  ELL    r$   r   c                   r   ^  \ rS rSrSrS\S\4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MLPEmbedderi,  aF  
A simple 2-layer MLP used for embedding inputs.

Args:
    in_dim (`int`):
        Dimensionality of the input features.
    hidden_dim (`int`):
        Dimensionality of the hidden and output embedding space.

Returns:
    `torch.Tensor`:
        Tensor of shape `(..., hidden_dim)` containing the embedded representations.
in_dim
hidden_dimc                    > [         TU ]  5         [        R                  " XSS9U l        [        R
                  " 5       U l        [        R                  " X"SS9U l        g )NTr   )r   r;   r   r   in_layerSiLUsilu	out_layer)r:   r   r   r   s      r"   r;   MLPEmbedder.__init__;  s?    		&4@GGI	:Er$   xr   c                 `    U R                  U R                  U R                  U5      5      5      $ r   )r   r   r   )r:   r   s     r"   r   MLPEmbedder.forwardA  s#    ~~diia(89::r$   )r   r   r   r{   r|   r}   r~   r   r   r;   r   r   r   r   r   r   s   @r"   r   r   ,  s>    Fs F F; ;%,, ; ;r$   r   c            	          ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	\	\R                  \R                  \R                  4   \	\R                  \R                  \R                  4   4   4S jr
SrU =r$ )	
ModulationiE  a  
Modulation network that generates scale, shift, and gating parameters.

Given an input vector, the module projects it through a linear layer to produce six chunks, which are grouped into
two tuples `(shift, scale, gate)`.

Args:
    dim (`int`):
        Dimensionality of the input vector. The output will have `6 * dim` features internally.

Returns:
    ((`torch.Tensor`, `torch.Tensor`, `torch.Tensor`), (`torch.Tensor`, `torch.Tensor`, `torch.Tensor`)):
        Two tuples `(shift, scale, gate)`.
rF   c                 .  > [         TU ]  5         [        R                  " USU-  SS9U l        [        R
                  R                  U R                  R                  S5        [        R
                  R                  U R                  R                  S5        g )N   Tr   r   )	r   r;   r   r   lininit	constant_weightr   )r:   rF   r   s     r"   r;   Modulation.__init__U  s^    99S!c'5
$((//1-
$((--+r$   vecr   c                     U R                  [        R                  R                  U5      5      S S 2S S S 24   R	                  SSS9n[        US S 5      [        USS  5      4$ )Nr   r(   rE   r   )r   r   r8   r   chunktuple)r:   r   r   s      r"   r   Modulation.forward[  s[     hhr}}))#./4
;AA!ALS!W~uSW~--r$   )r   )r{   r|   r}   r~   r   r   r;   r   r   r   r   r   r   r   s   @r"   r   r   E  sr    ,C ,.<<.	uU\\5<<=>ellTYT`T`bgbnbnFn@oo	p. .r$   r   c                     ^  \ rS rSrSr  SS\S\S\S\S-  4U 4S jjjr SS	\R                  S
\R                  S\R                  S\R                  S\R                  S-  S\
\\4   S\R                  4S jjrSrU =r$ )PRXBlockib  u  
Multimodal transformer block with text–image cross-attention, modulation, and MLP.

Args:
    hidden_size (`int`):
        Dimension of the hidden representations.
    num_heads (`int`):
        Number of attention heads.
    mlp_ratio (`float`, *optional*, defaults to 4.0):
        Expansion ratio for the hidden dimension inside the MLP.
    qk_scale (`float`, *optional*):
        Scale factor for queries and keys. If not provided, defaults to ``head_dim**-0.5``.

Attributes:
    img_pre_norm (`nn.LayerNorm`):
        Pre-normalization applied to image tokens before attention.
    attention (`PRXAttention`):
        Multi-head attention module with built-in QKV projections and normalizations for cross-attention between
        image and text tokens.
    post_attention_layernorm (`nn.LayerNorm`):
        Normalization applied after attention.
    gate_proj / up_proj / down_proj (`nn.Linear`):
        Feedforward layers forming the gated MLP.
    mlp_act (`nn.GELU`):
        Nonlinear activation used in the MLP.
    modulation (`Modulation`):
        Produces scale/shift/gating parameters for modulated layers.

    Methods:
        The forward method performs cross-attention and the MLP with modulation.
Nhidden_sizerv   	mlp_ratioqk_scalec           
        > [         TU ]  5         Xl        X l        X-  U l        U=(       d    U R                  S-  U l        [        X-  5      U l        Xl        [        R                  " USSS9U l        [        UUU R                  SSS[        5       S9U l        [        R                  " USSS9U l        [        R                   " XR                  SS9U l        [        R                   " XR                  SS9U l        [        R                   " U R                  USS9U l        [        R(                  " SS9U l        [-        U5      U l        g )	Ng      Fr   r   r   )r   rM   r   r   r   r   r   r   tanh)approximate)r   r;   r   rv   rN   r   r   mlp_hidden_dimr   r   	LayerNormimg_pre_normr>   r3   	attentionpost_attention_layernormr   	gate_projup_proj	down_projGELUmlp_actr   
modulation)r:   r   rv   r   r   r   s        r"   r;   PRXBlock.__init__  s    	%"#04!4
!+"9:& LLTXY &!]])+
 )+[UZ`d(e%;0C0C%Pyy.A.AN4#6#6%Pww62$[1r$   r?   r@   tembrB   rA   r]   r   c           	      f   U R                  U5      u  pxUu  pnUu  pnSU
-   U R                  U5      -  U	-   nU R                  UUUUS9nXU-  -   nSU-   U R                  U5      -  U-   nXU R	                  U R                  U R                  U5      5      U R                  U5      -  5      -  -   nU$ )aG  
Runs modulation-gated cross-attention and MLP, with residual connections.

Args:
    hidden_states (`torch.Tensor`):
        Image tokens of shape `(B, L_img, hidden_size)`.
    encoder_hidden_states (`torch.Tensor`):
        Text tokens of shape `(B, L_txt, hidden_size)`.
    temb (`torch.Tensor`):
        Conditioning vector used by `Modulation` to produce scale/shift/gates, shape `(B, hidden_size)` (or
        broadcastable).
    image_rotary_emb (`torch.Tensor`):
        Rotary positional embeddings applied inside attention.
    attention_mask (`torch.Tensor`, *optional*):
        Boolean mask for text tokens of shape `(B, L_txt)`, where `0` marks padding.
    **kwargs:
        Additional keyword arguments for API compatibility.

Returns:
    `torch.Tensor`:
        Updated image tokens of shape `(B, L_img, hidden_size)`.
r   )r?   r@   rA   rB   )r   r   r   r   r   r   r   r   )r:   r?   r@   r   rB   rA   r]   mod_attnmod_mlp
attn_shift
attn_scale	attn_gate	mlp_shift	mlp_scalemlp_gatehidden_states_modattn_outr   s                     r"   r   PRXBlock.forward  s    @ !OOD1,4)
	)0&	h^t/@/@/OOR\\>>+"7)-	 " 
 &H(<<]d;;MJJYV%DNN4<<PTP^P^_`PaCbeieqeqrsetCt4u(vvr$   )r   r   r   rN   r   r   r   r   r   r   rv   r   r   r   )g      @Nr   )r{   r|   r}   r~   r   r   r+   r;   r   r   dictstrr   r   r   r   r   s   @r"   r   r   b  s    H !%&2&2 &2 	&2
 $,&2 &2\ /31||1  %||1 ll	1
  ,,1 t+1 sCx.1 
1 1r$   r   c                      ^  \ rS rSrSrS\S\S\4U 4S jjrS\R                  S\R                  S	\R                  4S
 jr	Sr
U =r$ )
FinalLayeri  a3  
Final projection layer with adaptive LayerNorm modulation.

This layer applies a normalized and modulated transformation to input tokens and projects them into patch-level
outputs.

Args:
    hidden_size (`int`):
        Dimensionality of the input tokens.
    patch_size (`int`):
        Size of the square image patches.
    out_channels (`int`):
        Number of output channels per pixel (e.g. RGB = 3).

Forward Inputs:
    x (`torch.Tensor`):
        Input tokens of shape `(B, L, hidden_size)`, where `L` is the number of patches.
    vec (`torch.Tensor`):
        Conditioning vector of shape `(B, hidden_size)` used to generate shift and scale parameters for adaptive
        LayerNorm.

Returns:
    `torch.Tensor`:
        Projected patch outputs of shape `(B, L, patch_size * patch_size * out_channels)`.
r   r   out_channelsc           	      $  > [         TU ]  5         [        R                  " USSS9U l        [        R
                  " XU-  U-  SS9U l        [        R                  " [        R                  " 5       [        R
                  " USU-  SS95      U l	        g )NFr   r   Tr   r	   )
r   r;   r   r   
norm_finalr   linear
Sequentialr   adaLN_modulation)r:   r   r   r  r   s       r"   r;   FinalLayer.__init__  sn    ,,{uRVWii*-D|-SZ^_ "bggi;PQT_P_fj9k lr$   r   r   r   c                     U R                  U5      R                  SSS9u  p4SUS S 2S S S 24   -   U R                  U5      -  US S 2S S S 24   -   nU R                  U5      nU$ )Nr	   r   rE   )r  r   r  r	  )r:   r   r   shiftr   s        r"   r   FinalLayer.forward   si    ,,S177q7Aq$z""dooa&885D!;LLKKNr$   )r  r	  r  r   r   s   @r"   r  r    sQ    4mC mS m m ELL U\\  r$   r  imgc                     U R                   u  p#pEUnU R                  X#XF-  XeU-  U5      n [        R                  " SU 5      n U R                  USX6-  U-  5      n U $ )a  
Flattens an image tensor into a sequence of non-overlapping patches.

Args:
    img (`torch.Tensor`):
        Input image tensor of shape `(B, C, H, W)`.
    patch_size (`int`):
        Size of each square patch. Must evenly divide both `H` and `W`.

Returns:
    `torch.Tensor`:
        Flattened patch sequence of shape `(B, L, C * patch_size * patch_size)`, where `L = (H // patch_size) * (W
        // patch_size)` is the number of patches.
znchpwq->nhwcpqr(   )r,   r   r   einsum)r  r   bchwps          r"   img2seqr    sd     JA!A ++aAFAAvq
1C ,,'
-C ++aQUQY
'CJr$   seqr,   c                    [        U[        5      (       a  USS u  p4OS[        U[        R                  5      (       a  [	        US   5      [	        US   5      pCO[        S[        U5       S35      eU R                  u  pVnUnXxU-  -  n	U R                  XSU-  XH-  XU5      n [        R                  " SU 5      n U R                  XYX45      n U $ )a>  
Reconstructs an image tensor from a sequence of patches (inverse of `img2seq`).

Args:
    seq (`torch.Tensor`):
        Patch sequence of shape `(B, L, C * patch_size * patch_size)`, where `L = (H // patch_size) * (W //
        patch_size)`.
    patch_size (`int`):
        Size of each square patch.
    shape (`tuple` or `torch.Tensor`):
        The original image spatial shape `(H, W)`. If a tensor is provided, the first two values are interpreted as
        height and width.

Returns:
    `torch.Tensor`:
        Reconstructed image tensor of shape `(B, C, H, W)`.
Nr   r   zshape type z not supportedznhwcpq->nchpwq)

isinstancer   r   r   r   NotImplementedErrorr   r,   r   r  )
r  r   r,   r  r  r  ldr  r  s
             r"   seq2imgr   %  s    $ %RSz1	E5<<	(	(E!Hs58}1!KU}N"KLLiiGA!A	!eA ++aaq
1C ,,'
-C ++aA
!CJr$   c                     ^  \ rS rSrSrSrSr\           SS\S\S\S	\S
\	S\S\S\
S\S\	S\4U 4S jjj5       rS\R                  S\R                  S\R                  4S jr   SS\R                  S\R                  S\R                  S\R                  S-  S\\\4   S-  S\S\\R                  S4   \-  4S jjrSrU =r$ ) PRXTransformer2DModeliN  a-  
Transformer-based 2D model for text to image generation.

Args:
    in_channels (`int`, *optional*, defaults to 16):
        Number of input channels in the latent image.
    patch_size (`int`, *optional*, defaults to 2):
        Size of the square patches used to flatten the input image.
    context_in_dim (`int`, *optional*, defaults to 2304):
        Dimensionality of the text conditioning input.
    hidden_size (`int`, *optional*, defaults to 1792):
        Dimension of the hidden representation.
    mlp_ratio (`float`, *optional*, defaults to 3.5):
        Expansion ratio for the hidden dimension inside MLP blocks.
    num_heads (`int`, *optional*, defaults to 28):
        Number of attention heads.
    depth (`int`, *optional*, defaults to 16):
        Number of transformer blocks.
    axes_dim (`list[int]`, *optional*):
        list of dimensions for each positional embedding axis. Defaults to `[32, 32]`.
    theta (`int`, *optional*, defaults to 10000):
        Frequency scaling factor for rotary embeddings.
    time_factor (`float`, *optional*, defaults to 1000.0):
        Scaling factor applied in timestep embeddings.
    time_max_period (`int`, *optional*, defaults to 10000):
        Maximum frequency period for timestep embeddings.

Attributes:
    pe_embedder (`EmbedND`):
        Multi-axis rotary embedding generator for positional encodings.
    img_in (`nn.Linear`):
        Projection layer for image patch tokens.
    time_in (`MLPEmbedder`):
        Embedding layer for timestep embeddings.
    txt_in (`nn.Linear`):
        Projection layer for text conditioning.
    blocks (`nn.ModuleList`):
        Stack of transformer blocks (`PRXBlock`).
    final_layer (`LastLayer`):
        Projection layer mapping hidden tokens back to patch outputs.

Methods:
    attn_processors:
        Returns a dictionary of all attention processors in the model.
    set_attn_processor(processor):
        Replaces attention processors across all attention layers.
    process_inputs(image_latent, txt):
        Converts inputs into patch tokens, encodes text, and produces positional encodings.
    compute_timestep_embedding(timestep, dtype):
        Creates a timestep embedding of dimension 256, scaled and projected.
    forward_transformers(image_latent, cross_attn_conditioning, timestep, time_embedding, attention_mask,
    **block_kwargs):
        Runs the sequence of transformer blocks over image and text tokens.
    forward(image_latent, timestep, cross_attn_conditioning, micro_conditioning, cross_attn_mask=None,
    attention_kwargs=None, return_dict=True):
        Full forward pass from latent input to reconstructed output image.

Returns:
    `Transformer2DModelOutput` if `return_dict=True` (default), otherwise a tuple containing:
        - `sample` (`torch.Tensor`): Reconstructed image of shape `(B, C, H, W)`.
zconfig.jsonTNin_channelsr   context_in_dimr   r   rv   depthr   r   time_factortime_max_periodc                 T  > [         TU ]  5         Uc  SS/nXl        X l        U R                  U R                  S-  -  U l        Xl        Xl        XF-  S:w  a  [        SU SU 35      eXF-  n[        U5      U:w  a  [        SU SU 35      eX@l	        X`l
        [        XUS9U l        [        R                  " U R                  U R                  S-  -  U R                  S	S
9U l        [!        SU R                  S9U l        [        R                  " X0R                  5      U l        [        R&                  " [)        U5       Vs/ s H"  n[+        U R                  U R                  US9PM$     sn5      U l        [/        U R                  SU R                  5      U l        SU l        g s  snf )N    r	   r   zHidden size z  must be divisible by num_heads zGot z but expected positional dim )rF   r   r   Tr      )r   r   )r   r   F)r   r;   r#  r   r  r&  r'  rK   sumr   rv   r   pe_embedderr   r   img_inr   time_intxt_inr   r   r   blocksr  final_layergradient_checkpointing)r:   r#  r   r$  r   r   rv   r%  r   r   r&  r'  pe_dimr   r   s                 r"   r;   PRXTransformer2DModel.__init__  s    	BxH '$ ,,t/AA&."a'|K=8XYbXcdee)x=F"tH:-J6(STT&"%&Qii 0 04??A3E EtGWGW^bc"#$:J:JKii0@0@Amm u &A $$NN'
 &	
 &d&6&64;L;LM&+#s   )F%timestepr*   r   c                     U R                  [        USU R                  U R                  SSS9R	                  U5      5      $ )Nr*  Tr   )	timestepsembedding_dim
max_periodr   flip_sin_to_cosdownscale_freq_shift)r.  r   r'  r&  r-   )r:   r5  r*   s      r"   _compute_timestep_embedding1PRXTransformer2DModel._compute_timestep_embedding  sD    ||""!//&& $%( bi	
 		
r$   r?   r@   rA   attention_kwargsreturn_dict.c           	      |   U R                  U5      n[        XR                  5      nU R                  U5      nUR                  u  pp[        XXR                  UR                  S9nU R                  U5      nU R                  X(R                  S9nU R                   HZ  n[        R                  " 5       (       a3  U R                  (       a"  U R                  UR                  UUUUU5      nMP  U" UUUUUS9nM\     U R!                  X5      n[#        XR                  UR                  5      nU(       d  U4$ [%        US9$ )aa  
Forward pass of the PRXTransformer2DModel.

The latent image is split into patch tokens, combined with text conditioning, and processed through a stack of
transformer blocks modulated by the timestep. The output is reconstructed into the latent image space.

Args:
    hidden_states (`torch.Tensor`):
        Input latent image tensor of shape `(B, C, H, W)`.
    timestep (`torch.Tensor`):
        Timestep tensor of shape `(B,)` or `(1,)`, used for temporal conditioning.
    encoder_hidden_states (`torch.Tensor`):
        Text conditioning tensor of shape `(B, L_txt, context_in_dim)`.
    attention_mask (`torch.Tensor`, *optional*):
        Boolean mask of shape `(B, L_txt)`, where `0` marks padding in the text sequence.
    attention_kwargs (`dict`, *optional*):
        Additional arguments passed to attention layers.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether to return a `Transformer2DModelOutput` or a tuple.

Returns:
    `Transformer2DModelOutput` if `return_dict=True`, otherwise a tuple:

        - `sample` (`torch.Tensor`): Output latent image of shape `(B, C, H, W)`.
)r   r   )r*   )r?   r@   r   rB   rA   )sample)r/  r  r   r-  r,   r#   r   r,  r<  r*   r0  r   is_grad_enabledr2  _gradient_checkpointing_funcrw   r1  r   r   )r:   r?   r5  r@   rA   r>  r?  txtr  rl   ra   r  r  r!   per   blockoutputs                     r"   r   PRXTransformer2DModel.forward  s*   F kk/0 m__5kk# $))qq__]MaMabg& ..xyy.I [[E$$&&4+F+F77NN" "%*-%'#1 !( s(oo}/B/BC9'v66r$   )r0  r1  r2  r   r-  r#  rv   r  r   r,  r&  r.  r'  r/  )   r	   i 	  i   g      @   rI  N'  g     @@rK  )NNT)r{   r|   r}   r~   r   config_name _supports_gradient_checkpointingr   r   r+   r   r;   r   r   r*   r<  r  r  r   rV   r   r   r   r   r   r   s   @r"   r"  r"  N  sy   <|  K'+$ "#$7,7, 7, 	7,
 7, 7, 7, 7, 7, 7, 7, 7, 7,r

ELL 

 

Y^YeYe 

" /326 K7||K7 ,,K7  %||	K7
 t+K7 sCx.4/K7 K7 
u||S 	!$<	<K7 K7r$   r"  ))typingr   r   r   configuration_utilsr   r   utilsr   r   r
   r   attention_dispatchr   
embeddingsr   modeling_outputsr   modeling_utilsr   normalizationr   
get_loggerr{   loggerr   r   r   r#   r1   r3   Moduler>   r   r   r   r   r  r  r   r"  ry   r$   r"   <module>rY     s|      B  < 6 / 7 ' # 
		H	%tc t3 ts t tUZUaUa tfkfrfr t815<< 1ELL 1U\\ 1*k k\9
2992 9
z,  , ^;")) ;2. .:zryy zz% %P 3 5<< <& &3 &u|| & &RS7J^ S7r$   