
    
3jbL              	       <   S SK Jr  S SKJr  S SKrS SKJr  SSKJrJ	r	  SSK
JrJr  SSKJrJr  SS	KJrJr  SS
KJrJr  SSKJrJr  SSKJr  SSKJr  SSKJr  SSKJ r J!r!  \RD                  " \#5      r$\ " S S\ 5      5       r% " S S\\\\\5      r& " S S\5      r'g)    )	dataclass)AnyN   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)apply_lora_scalelogging   )AttentionMixinJointTransformerBlock)	AttentionFusedJointAttnProcessor2_0)"CombinedTimestepTextProjEmbeddings
PatchEmbed)Transformer2DModelOutput)
ModelMixin)SD3SingleTransformerBlock   )
BaseOutputzero_modulec                   :    \ rS rSr% \\R                     \S'   Srg)SD3ControlNetOutput%   controlnet_block_samples N)	__name__
__module____qualname____firstlineno__tupletorchTensor__annotations____static_attributes__r       e/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/controlnets/controlnet_sd3.pyr   r   %   s    #ELL11r'   r   c            $         ^  \ rS rSrSrSr\                 S*S\S\S\S\S	\S
\S\S\S\S\S\S\S\\S4   S\	S-  S\	S-  S\
S\
4"U 4S jjj5       rS+S\S-  S\SS4S jjrS rS rS r\ S,S j5       r\" S 5            S-S!\R(                  S"\R(                  S#\S$\R(                  S%\R(                  S&\R,                  S \\	\4   S-  S'\
S\R(                  \-  4S( jj5       rS)rU =r$ ).SD3ControlNetModel*   a  
ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

Parameters:
    sample_size (`int`, defaults to `128`):
        The width/height of the latents. This is fixed during training since it is used to learn a number of
        position embeddings.
    patch_size (`int`, defaults to `2`):
        Patch size to turn the input data into small patches.
    in_channels (`int`, defaults to `16`):
        The number of latent channels in the input.
    num_layers (`int`, defaults to `18`):
        The number of layers of transformer blocks to use.
    attention_head_dim (`int`, defaults to `64`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `18`):
        The number of heads to use for multi-head attention.
    joint_attention_dim (`int`, defaults to `4096`):
        The embedding dimension to use for joint text-image attention.
    caption_projection_dim (`int`, defaults to `1152`):
        The embedding dimension of caption embeddings.
    pooled_projection_dim (`int`, defaults to `2048`):
        The embedding dimension of pooled text projections.
    out_channels (`int`, defaults to `16`):
        The number of latent channels in the output.
    pos_embed_max_size (`int`, defaults to `96`):
        The maximum latent height/width of positional embeddings.
    extra_conditioning_channels (`int`, defaults to `0`):
        The number of extra channels to use for conditioning for patch embedding.
    dual_attention_layers (`tuple[int, ...]`, defaults to `()`):
        The number of dual-stream transformer blocks to use.
    qk_norm (`str`, *optional*, defaults to `None`):
        The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
    pos_embed_type (`str`, defaults to `"sincos"`):
        The type of positional embedding to use. Choose between `"sincos"` and `None`.
    use_pos_embed (`bool`, defaults to `True`):
        Whether to use positional embeddings.
    force_zeros_for_pooled_projection (`bool`, defaults to `True`):
        Whether to force zeros for pooled projection embeddings. This is handled in the pipelines by reading the
        config value of the ControlNet model.
TNsample_size
patch_sizein_channels
num_layersattention_head_dimnum_attention_headsjoint_attention_dimcaption_projection_dimpooled_projection_dimout_channelspos_embed_max_sizeextra_conditioning_channelsdual_attention_layers.qk_normpos_embed_typeuse_pos_embed!force_zeros_for_pooled_projectionc                   > [         TU ]  5         UnU
b  U
OUU l        Xe-  U l        U(       a  [	        UUUUU R                  UUS9U l        OS U l        [        U R                  U	S9U l        Ubo  [        R                  " Xx5      U l
        [        R                  " [        U5       Vs/ s H#  n[        U R                  UUSUUU;   a  SOSS9PM%     sn5      U l        OOS U l
        [        R                  " [        U5       Vs/ s H  n[        U R                  UUS9PM     sn5      U l        [        R                  " / 5      U l        [        [#        U R                  5      5       HT  n[        R                  " U R                  U R                  5      n[%        U5      nU R                   R'                  U5        MV     [	        UUUX<-   U R                  S S9n[%        U5      U l        SU l        g s  snf s  snf )N)heightwidthr-   r.   	embed_dimr6   r:   )embedding_dimr4   FT)dimr1   r0   context_pre_onlyr9   use_dual_attention)rB   r1   r0   )r>   r?   r-   r.   r@   r:   )super__init__r5   	inner_dimr   	pos_embedr   time_text_embednnLinearcontext_embedder
ModuleListranger   transformer_blocksr   controlnet_blockslenr   appendpos_embed_inputgradient_checkpointing)selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   default_out_channelsi_controlnet_blockrS   	__class__s                          r(   rF   SD3ControlNetModel.__init__W   s   * 	*,8,DLJ^,A'"!%'..#5-DN "DNA..@U 
 *$&II.A$ZD! ')mm #:.
 / * NN,?+=). '348M3M4SX /
'D# %)D!&(mm #:. / . NN,?+=
 /	'D# "$r!2s42234A!yyH*+;<""))*:; 5 %!#Ann
  +?;&+#S
s   (*G1G6
chunk_sizerB   returnc                    ^ US;  a  [        SU 35      eU=(       d    SnS[        R                  R                  S[        S[        4U4S jjmU R                  5        H  nT" X1U5        M     g)	a  
Sets the attention processor to use [feed forward
chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

Parameters:
    chunk_size (`int`, *optional*):
        The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
        over each tensor of dim=`dim`.
    dim (`int`, *optional*, defaults to `0`):
        The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
        or dim=1 (sequence length).
)r   r   z-Make sure to set `dim` to either 0 or 1, not r   moduler\   rB   c                    > [        U S5      (       a  U R                  XS9  U R                  5        H  nT" X1U5        M     g )Nset_chunk_feed_forward)r\   rB   )hasattrra   children)r_   r\   rB   childfn_recursive_feed_forwards       r(   re   MSD3ControlNetModel.enable_forward_chunking.<locals>.fn_recursive_feed_forward   s=    v788---M*)%SA +r'   N)
ValueErrorr#   rJ   Moduleintrc   )rU   r\   rB   r_   re   s       @r(   enable_forward_chunking*SD3ControlNetModel.enable_forward_chunking   sn     fLSERSS  _1
	Behhoo 	B3 	BUX 	B mmoF%f#> &r'   c                    SU l         U R                  R                  5        H3  u  pS[        UR                  R
                  5      ;   d  M*  [        S5      e   U R                  U l         U R                  5        H)  n[        U[        5      (       d  M  UR                  SS9  M+     U R                  [        5       5        g)u   
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
are fused. For cross-attention modules, key and value projection matrices are fused.

> [!WARNING] > This API is 🧪 experimental.
NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsattn_processorsitemsstrrZ   r   rg   modules
isinstancer   fuse_projectionsset_attn_processorr   )rU   rX   attn_processorr_   s       r(   fuse_qkv_projections'SD3ControlNetModel.fuse_qkv_projections   s     )-%!%!5!5!;!;!=A#n66??@@ !tuu "> )-(<(<%llnF&),,''T'2 % 	 : <=r'   c                 V    U R                   b  U R                  U R                   5        gg)u^   Disables the fused QKV projection if enabled.

> [!WARNING] > This API is 🧪 experimental.

N)ro   rv   )rU   s    r(   unfuse_qkv_projections)SD3ControlNetModel.unfuse_qkv_projections   s)     ((4##D$A$AB 5r'   c           	      N   [        UR                  R                  UR                  R                  UR                  R                  UR                  R                  UR
                  UR                  R                  S9nUR                  UR                  R                  5       SS9  U$ )N)r>   r?   r-   r.   r@   r6   Tstrict)
r   configr,   r-   r.   rG   r6   load_state_dictrH   
state_dict)rU   transformerrH   s      r(   _get_pos_embed_from_transformer2SD3ControlNetModel._get_pos_embed_from_transformer   s    %%11$$00"))44#**66!++*11DD
	 	!!+"7"7"B"B"DT!Rr'   c                 P   UR                   nU=(       d    UR                  US'   X5S'   U R                  U5      nU(       a  UR                  R	                  UR                  R                  5       5        UR                  R	                  UR                  R                  5       5        UR                  R	                  UR                  R                  5       5        UR                  R	                  UR                  R                  5       SS9  [        UR                  5      Ul
        U$ )Nr/   r7   Fr~   )r   r/   from_configrH   r   r   rI   rL   rO   r   rS   )clsr   r/   num_extra_conditioning_channelsload_weights_from_transformerr   
controlnets          r(   from_transformer#SD3ControlNetModel.from_transformer   s     ##)>V->->|0O,-__V,
(  001F1F1Q1Q1ST&&66{7R7R7]7]7_`''778T8T8_8_8ab))99+:X:X:c:c:ens9t)4Z5O5O)PJ&r'   joint_attention_kwargshidden_statescontrolnet_condconditioning_scaleencoder_hidden_statespooled_projectionstimestepreturn_dictc	                    U R                   b  UR                  S:w  a  [        S5      eU R                   c  UR                  S:w  a  [        S5      eU R                  b  Uc  [        S5      eU R                  c  Ub  [        S5      eU R                   b  U R                  U5      nU R	                  Xe5      n	U R                  b  U R                  U5      nXR                  U5      -   nSn
U R                   H  n[        R                  " 5       (       aH  U R                  (       a7  U R                  b  U R                  UUUU	5      u  pAO2U R                  XU	5      nOU R                  b
  U" XU	S9u  pAOU" X5      nX4-   n
M     Sn[        XR                  5       H  u  pU" U5      nX4-   nM     U Vs/ s H  oU-  PM	     nnU(       d  U4$ [        US	9$ s  snf )
a  
The [`SD3Transformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
        Input `hidden_states`.
    controlnet_cond (`torch.Tensor`):
        The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
    conditioning_scale (`float`, defaults to `1.0`):
        The scale factor for ControlNet outputs.
    encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
    pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
        from the embeddings of input conditions.
    timestep ( `torch.LongTensor`):
        Used to indicate denoising step.
    joint_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.

Returns:
    If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
    `tuple` where the first element is the sample tensor.
   z/hidden_states must be 4D when pos_embed is usedr   z3hidden_states must be 3D when pos_embed is not usedzDencoder_hidden_states must be provided when context_embedder is usedzNencoder_hidden_states should not be provided when context_embedder is not usedr   )r   r   temb)r   )rH   ndimrg   rL   rI   rS   rO   r#   is_grad_enabledrT   _gradient_checkpointing_funcziprP   r   )rU   r   r   r   r   r   r   r   r   r   block_res_samplesblockcontrolnet_block_res_samplesblock_res_samplerY   samples                   r(   forwardSD3ControlNetModel.forward  s   P >>%-*<*<*ANOO ^^#(:(:a(?RSS  ,1F1Ncdd""*/D/Pmnn>>% NN=9M##HA  ,$($9$9:O$P! &(<(<_(MM,,E$$&&4+F+F((4;?;\;\%-	<8)= %)$E$Ee\`$aM ((4;@&3gk<8)=
 %*-$>M 14D D- -0 (*$256GI_I_2`./0@A+GJ]+]( 3a
 So'oRn1C(CRn$'o022"<XYY (ps   3G)
rL   rP   rT   rG   ro   r5   rH   rS   rI   rO   )   r         @   r   i   i  i   r   `   r   r   NsincosTT)Nr   )   r   T)g      ?NNNNT)r   r   r    r!   __doc__ _supports_gradient_checkpointingr   ri   r"   rr   boolrF   rj   rx   r{   r   classmethodr   r
   r#   r$   float
LongTensordictr   r   r   r&   __classcell__rZ   s   @r(   r*   r*   *   s3   (T (,$ "$#%#'&*%)"$+,13"%-"26%W,W, W, 	W,
 W,  W, !W, !W, !$W,  #W, W,  W, &)W,  %S#XW, tW,  d
!W," #W,$ ,0%W, W,t?#* ?# ?VZ ?<>,C
 jn $ ./
 %(.2+/%)8< eZ||eZ eZ "	eZ
  %||eZ "LLeZ ""eZ !%S#X 5eZ eZ 
0	0eZ 0eZr'   r*   c                      ^  \ rS rSrSrU 4S jr    SS\R                  S\\R                     S\\
   S\R                  S	\R                  S
\R                  S\\\4   S-  S\S\\-  4S jjrSrU =r$ )SD3MultiControlNetModeliy  a  
`SD3ControlNetModel` wrapper class for Multi-SD3ControlNet

This module is a wrapper for multiple instances of the `SD3ControlNetModel`. The `forward()` API is designed to be
compatible with `SD3ControlNetModel`.

Args:
    controlnets (`list[SD3ControlNetModel]`):
        Provides additional conditioning to the unet during the denoising process. You must set multiple
        `SD3ControlNetModel` as a list.
c                 X   > [         TU ]  5         [        R                  " U5      U l        g )N)rE   rF   rJ   rM   nets)rU   controlnetsrZ   s     r(   rF    SD3MultiControlNetModel.__init__  s    MM+.	r'   Nr   r   r   r   r   r   r   r   r]   c	                    [        [        X#U R                  5      5       HV  u  n	u  pnU" UUUUU
UUUS9nU	S:X  a  UnM   [        WS   US   5       VVs/ s H  u  nnUU-   PM     nnn[        U5      4nMX     W$ s  snnf )N)r   r   r   r   r   r   r   r   r   )	enumerater   r   r"   )rU   r   r   r   r   r   r   r   r   rW   imagescaler   block_samplescontrol_block_samplescontrol_block_sampleblock_samples                    r(   r   SD3MultiControlNetModel.forward  s     .7s?`d`i`i7j-k)A)j&+!&;#5 %#('='	M Av(5% ?BBWXYBZ\ijk\l>m)>m:,l )<7>m & ) *//D)E(G%) .l, %$)s   A<)r   )NNNT)r   r   r    r!   r   rF   r#   r$   listtensorr   r   r   rr   r   r   r   r"   r   r&   r   r   s   @r(   r   r   y  s    
/ /3%)8< !%||!% ell+!% !K	!%
 "LL!%  %||!% ""!% !%S#X 5!% !% 
u	$!% !%r'   r   )(dataclassesr   typingr   r#   torch.nnrJ   configuration_utilsr   r   loadersr   r	   utilsr
   r   	attentionr   r   attention_processorr   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   transformers.transformer_sd3r   r   r   r   
get_loggerr   loggerr   r*   r   r   r'   r(   <module>r      s     "    B ? . = G G 7 ' D / 
		H	% 2* 2 2LZ^[BRTj LZ^
2%j 2%r'   