
    
3j<              
       <   S SK Jr  S SKrS SKJr  SSKJrJr  SSKJ	r	J
r
Jr  SSKJrJr  SSKJr  SS	KJrJrJr  SS
KJrJrJr  SSKJrJr  SSKJr  SSKJr  SSK J!r!J"r"  \RF                  " \$5      r%\ " S S\RL                  5      5       r' " S S\\\\
\	\5      r(g)    )AnyN   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixinSD3Transformer2DLoadersMixin)apply_lora_scalelogging)maybe_allow_in_graph   )AttentionMixinFeedForwardJointTransformerBlock)	AttentionFusedJointAttnProcessor2_0JointAttnProcessor2_0)"CombinedTimestepTextProjEmbeddings
PatchEmbed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousAdaLayerNormZeroc                   r   ^  \ rS rSrS\S\S\4U 4S jjrS\R                  S\R                  4S jrS	r	U =r
$ )
SD3SingleTransformerBlock&   dimnum_attention_headsattention_head_dimc           
         > [         TU ]  5         [        U5      U l        [	        UUUUS[        5       SS9U l        [        R                  " USSS9U l	        [        XSS9U l        g )NTư>)	query_dimdim_headheadsout_dimbias	processorepsFelementwise_affiner(   zgelu-approximate)r   dim_outactivation_fn)super__init__r   norm1r   r   attnnn	LayerNormnorm2r   ff)selfr   r   r   	__class__s       g/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/transformers/transformer_sd3.pyr.   "SD3SingleTransformerBlock.__init__(   se     	%c*
'%+-
	 \\#%TJ
#BTU    hidden_statestembc                 B   U R                  XS9u  p4pVnU R                  US S9nUR                  S5      U-  nX-   nU R                  U5      nUSUR                  S5      -   -  UR                  S5      -   nU R	                  U5      n	UR                  S5      U	-  n	X-   nU$ )N)emb)r:   encoder_hidden_states   )r/   r0   	unsqueezer3   r4   )
r5   r:   r;   norm_hidden_statesgate_msa	shift_mlp	scale_mlpgate_mlpattn_output	ff_outputs
             r7   forward!SD3SingleTransformerBlock.forward>   s    GKzzR_zGjDiHii.@X\i]((+k9%3 "ZZ6/1y7J7J17M3MNQZQdQdefQggGG./	&&q)I5	%1r9   )r0   r4   r/   r3   )__name__
__module____qualname____firstlineno__intr.   torchTensorrH   __static_attributes____classcell__r6   s   @r7   r   r   &   sH    VV !V  	V,U\\   r9   r   c                     ^  \ rS rSrSrSrS/rSS/r\             S(S\	S	\	S
\	S\	S\	S\	S\	S\	S\	S\	S\	S\
\	S4   S\S-  4U 4S jjj5       rS)S\	S-  S\	SS4S jjrS rS rS r\" S5             S*S\R&                  S \R&                  S!\R&                  S"\R(                  S#\S\\\4   S-  S$\S%\\	   S-  S\R&                  \-  4S& jj5       rS'rU =r$ )+SD3Transformer2DModelO   ax  
The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

Parameters:
    sample_size (`int`, defaults to `128`):
        The width/height of the latents. This is fixed during training since it is used to learn a number of
        position embeddings.
    patch_size (`int`, defaults to `2`):
        Patch size to turn the input data into small patches.
    in_channels (`int`, defaults to `16`):
        The number of latent channels in the input.
    num_layers (`int`, defaults to `18`):
        The number of layers of transformer blocks to use.
    attention_head_dim (`int`, defaults to `64`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `18`):
        The number of heads to use for multi-head attention.
    joint_attention_dim (`int`, defaults to `4096`):
        The embedding dimension to use for joint text-image attention.
    caption_projection_dim (`int`, defaults to `1152`):
        The embedding dimension of caption embeddings.
    pooled_projection_dim (`int`, defaults to `2048`):
        The embedding dimension of pooled text projections.
    out_channels (`int`, defaults to `16`):
        The number of latent channels in the output.
    pos_embed_max_size (`int`, defaults to `96`):
        The maximum latent height/width of positional embeddings.
    dual_attention_layers (`tuple[int, ...]`, defaults to `()`):
        The number of dual-stream transformer blocks to use.
    qk_norm (`str`, *optional*, defaults to `None`):
        The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
Tr   	pos_embednormNsample_size
patch_sizein_channels
num_layersr   r   joint_attention_dimcaption_projection_dimpooled_projection_dimout_channelspos_embed_max_sizedual_attention_layers.qk_normc                 f  > [         TU ]  5         U
b  U
OUU l        Xe-  U l        [	        UUUUU R                  US9U l        [        U R                  U	S9U l        [        R                  " Xx5      U l
        [        R                  " [        U5       Vs/ s H'  n[        U R                  UUXS-
  :H  UX;   a  SOSS9PM)     sn5      U l        [        U R                  U R                  SSS9U l        [        R                  " U R                  X"-  U R                  -  SS	9U l        SU l        g s  snf )
N)heightwidthrZ   r[   	embed_dimra   )embedding_dimr_   r?   TF)r   r   r   context_pre_onlyrc   use_dual_attentionr!   r)   )r&   )r-   r.   r`   	inner_dimr   rW   r   time_text_embedr1   Linearcontext_embedder
ModuleListranger   transformer_blocksr   norm_outproj_outgradient_checkpointing)r5   rY   rZ   r[   r\   r   r   r]   r^   r_   r`   ra   rb   rc   ir6   s                  r7   r.   SD3Transformer2DModel.__init__w   s+   & 	,8,DL+,A#!#nn1
  B..@U 
 !#		*= V"$-- z*
 +A &(;'9%&q.%8#/0/Itu +
#
 /t~~t~~bgmqr		$..*2IDL]L]2]dhi&+#!
s   .D.
chunk_sizer   returnc                    ^ US;  a  [        SU 35      eU=(       d    SnS[        R                  R                  S[        S[        4U4S jjmU R                  5        H  nT" X1U5        M     g)	a  
Sets the attention processor to use [feed forward
chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

Parameters:
    chunk_size (`int`, *optional*):
        The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
        over each tensor of dim=`dim`.
    dim (`int`, *optional*, defaults to `0`):
        The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
        or dim=1 (sequence length).
)r   r?   z-Make sure to set `dim` to either 0 or 1, not r?   modulerw   r   c                    > [        U S5      (       a  U R                  XS9  U R                  5        H  nT" X1U5        M     g Nset_chunk_feed_forward)rw   r   hasattrr}   childrenrz   rw   r   childfn_recursive_feed_forwards       r7   r   PSD3Transformer2DModel.enable_forward_chunking.<locals>.fn_recursive_feed_forward   =    v788---M*)%SA +r9   N)
ValueErrorrO   r1   ModulerN   r   )r5   rw   r   rz   r   s       @r7   enable_forward_chunking-SD3Transformer2DModel.enable_forward_chunking   sn     fLSERSS  _1
	Behhoo 	B3 	BUX 	B mmoF%f#> &r9   c                    ^ S[         R                  R                  S[        S[        4U4S jjmU R	                  5        H  nT" US S5        M     g )Nrz   rw   r   c                    > [        U S5      (       a  U R                  XS9  U R                  5        H  nT" X1U5        M     g r|   r~   r   s       r7   r   QSD3Transformer2DModel.disable_forward_chunking.<locals>.fn_recursive_feed_forward   r   r9   r   )rO   r1   r   rN   r   )r5   rz   r   s     @r7   disable_forward_chunking.SD3Transformer2DModel.disable_forward_chunking   sH    	Behhoo 	B3 	BUX 	B mmoF%fdA6 &r9   c                    SU l         U R                  R                  5        H3  u  pS[        UR                  R
                  5      ;   d  M*  [        S5      e   U R                  U l         U R                  5        H)  n[        U[        5      (       d  M  UR                  SS9  M+     U R                  [        5       5        g)u   
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
are fused. For cross-attention modules, key and value projection matrices are fused.

> [!WARNING] > This API is 🧪 experimental.
NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsattn_processorsitemsstrr6   rJ   r   modules
isinstancer   fuse_projectionsset_attn_processorr   )r5   _attn_processorrz   s       r7   fuse_qkv_projections*SD3Transformer2DModel.fuse_qkv_projections   s     )-%!%!5!5!;!;!=A#n66??@@ !tuu "> )-(<(<%llnF&),,''T'2 % 	 : <=r9   c                 V    U R                   b  U R                  U R                   5        gg)u^   Disables the fused QKV projection if enabled.

> [!WARNING] > This API is 🧪 experimental.

N)r   r   )r5   s    r7   unfuse_qkv_projections,SD3Transformer2DModel.unfuse_qkv_projections   s)     ((4##D$A$AB 5r9   joint_attention_kwargsr:   r>   pooled_projectionstimestepblock_controlnet_hidden_statesreturn_dictskip_layersc	                 2   UR                   SS u  pU R                  U5      nU R                  XC5      nU R                  U5      nUb9  SU;   a3  UR	                  S5      nU R                  X5      u  pUR                  XS9  [        U R                  5       H  u  nnUb  X;   a  SOSn[        R                  " 5       (       a0  U R                  (       a  U(       d  U R                  UUUUU5      u  p!OU(       d  U" UUUUS9u  p!Uc  Ms  UR                  SL d  M  [        U R                  5      [        U5      -  nX[        UU-  5         -   nM     U R!                  X5      nU R#                  U5      nU R$                  R&                  nU	U-  n	U
U-  n
UR)                  UR                   S   XUUU R*                  4S	9n[        R,                  " S
U5      nUR)                  UR                   S   U R*                  U	U-  U
U-  4S	9nU(       d  U4$ [/        US9$ )a  
The [`SD3Transformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
        Input `hidden_states`.
    encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
    pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`):
        Embeddings projected from the embeddings of input conditions.
    timestep (`torch.LongTensor`):
        Used to indicate denoising step.
    block_controlnet_hidden_states (`list` of `torch.Tensor`):
        A list of tensors that if specified are added to the residuals of transformer blocks.
    joint_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.
    skip_layers (`list` of `int`, *optional*):
        A list of layer indices to skip during the forward pass.

Returns:
    If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
    `tuple` where the first element is the sample tensor.
Nip_adapter_image_embeds)ip_hidden_statesr;   TF)r:   r>   r;   r   r   )shapeznhwpqc->nchpwq)sample)r   rW   rl   rn   pop
image_projupdate	enumeraterq   rO   is_grad_enabledrt   _gradient_checkpointing_funcri   lenrN   rr   rs   configrZ   reshaper`   einsumr   )r5   r:   r>   r   r   r   r   r   r   re   rf   r;   r   r   ip_tembindex_blockblockis_skipinterval_controlrZ   outputs                        r7   rH   SD3Transformer2DModel.forward   sE   R &++BC0}5##HA $ 5 56K L!-2KOe2e&<&@&@AZ&[#(,8O(Z%"));K)Z"+D,C,C"DK)5+:TdZ_G$$&&4+F+Fw7;7X7X!)*84%} 7<"/*?+A	84% .9e>T>TX]>]#&t'>'>#?#FdBe#e  -sS^aqSqOr0s s/ #E2 m:m4 [[++
:%#%-- &&q)6*jRVRcRcd . 
 %5}E&& &&q)4+<+<fz>QSX[eSef ' 
 9'v66r9   )
rn   rt   rk   rr   r   r`   rW   rs   rl   rq   )   r         @   r   i   i  i   r   `    N)Nr   )NNNNNTN)rJ   rK   rL   rM   __doc__ _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   rN   tupler   r.   r   r   r   r   r
   rO   rP   
LongTensorlistdictr   boolr   rH   rQ   rR   rS   s   @r7   rU   rU   O   s   B (,$01(3V'<$ "$#%#'&*%)"$ "!4,4, 4, 	4,
 4,  4, !4, !4, !$4,  #4, 4,  4,  %H 
4,  t!4, 4,n?#* ?# ?VZ ?<	7>,C ./ /3+/%)/38< (,`7||`7  %||`7 "LL	`7
 ""`7 )-`7 !%S#X 5`7 `7 #Y%`7 
0	0`7 0`7r9   rU   ))typingr   rO   torch.nnr1   configuration_utilsr   r   loadersr   r   r	   utilsr
   r   utils.torch_utilsr   	attentionr   r   r   attention_processorr   r   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerrJ   loggerr   r   rU   r   r9   r7   <module>r      s       B ] ] . 5 J J 
 H 7 ' D 
		H	% %		 % %PJ7-=?UWsJ7r9   