ó
    
3jbL  ã            	       ó<  • S SK Jr  S SKJr  S SKrS SKJr  SSKJrJ	r	  SSK
JrJr  SSKJrJr  SS	KJrJr  SS
KJrJr  SSKJrJr  SSKJr  SSKJr  SSKJr  SSKJ r J!r!  \RD                  " \#5      r$\ " S S\ 5      5       r% " S S\\\\\5      r& " S S\5      r'g)é    )Ú	dataclass)ÚAnyNé   )ÚConfigMixinÚregister_to_config)ÚFromOriginalModelMixinÚPeftAdapterMixin)Úapply_lora_scaleÚloggingé   )ÚAttentionMixinÚJointTransformerBlock)Ú	AttentionÚFusedJointAttnProcessor2_0)Ú"CombinedTimestepTextProjEmbeddingsÚ
PatchEmbed)ÚTransformer2DModelOutput)Ú
ModelMixin)ÚSD3SingleTransformerBlocké   )Ú
BaseOutputÚzero_modulec                   ó:   • \ rS rSr% \\R                     \S'   Srg)ÚSD3ControlNetOutputé%   Úcontrolnet_block_samples© N)	Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__ÚtupleÚtorchÚTensorÚ__annotations__Ú__static_attributes__r   ó    Úe/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/controlnets/controlnet_sd3.pyr   r   %   s   ‡ à# E§L¡LÑ1Ö1r'   r   c            $       óè  ^ • \ rS rSrSrSr\                 S*S\S\S\S\S	\S
\S\S\S\S\S\S\S\\S4   S\	S-  S\	S-  S\
S\
4"U 4S jjj5       rS+S\S-  S\SS4S jjrS rS rS r\ S,S j5       r\" S 5            S-S!\R(                  S"\R(                  S#\S$\R(                  S%\R(                  S&\R,                  S \\	\4   S-  S'\
S\R(                  \-  4S( jj5       rS)rU =r$ ).ÚSD3ControlNetModelé*   aÁ  
ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

Parameters:
    sample_size (`int`, defaults to `128`):
        The width/height of the latents. This is fixed during training since it is used to learn a number of
        position embeddings.
    patch_size (`int`, defaults to `2`):
        Patch size to turn the input data into small patches.
    in_channels (`int`, defaults to `16`):
        The number of latent channels in the input.
    num_layers (`int`, defaults to `18`):
        The number of layers of transformer blocks to use.
    attention_head_dim (`int`, defaults to `64`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `18`):
        The number of heads to use for multi-head attention.
    joint_attention_dim (`int`, defaults to `4096`):
        The embedding dimension to use for joint text-image attention.
    caption_projection_dim (`int`, defaults to `1152`):
        The embedding dimension of caption embeddings.
    pooled_projection_dim (`int`, defaults to `2048`):
        The embedding dimension of pooled text projections.
    out_channels (`int`, defaults to `16`):
        The number of latent channels in the output.
    pos_embed_max_size (`int`, defaults to `96`):
        The maximum latent height/width of positional embeddings.
    extra_conditioning_channels (`int`, defaults to `0`):
        The number of extra channels to use for conditioning for patch embedding.
    dual_attention_layers (`tuple[int, ...]`, defaults to `()`):
        The number of dual-stream transformer blocks to use.
    qk_norm (`str`, *optional*, defaults to `None`):
        The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
    pos_embed_type (`str`, defaults to `"sincos"`):
        The type of positional embedding to use. Choose between `"sincos"` and `None`.
    use_pos_embed (`bool`, defaults to `True`):
        Whether to use positional embeddings.
    force_zeros_for_pooled_projection (`bool`, defaults to `True`):
        Whether to force zeros for pooled projection embeddings. This is handled in the pipelines by reading the
        config value of the ControlNet model.
TNÚsample_sizeÚ
patch_sizeÚin_channelsÚ
num_layersÚattention_head_dimÚnum_attention_headsÚjoint_attention_dimÚcaption_projection_dimÚpooled_projection_dimÚout_channelsÚpos_embed_max_sizeÚextra_conditioning_channelsÚdual_attention_layers.Úqk_normÚpos_embed_typeÚuse_pos_embedÚ!force_zeros_for_pooled_projectionc                 óö  >• [         TU ]  5         UnU
b  U
OUU l        Xe-  U l        U(       a  [	        UUUUU R                  UUS9U l        OS U l        [        U R                  U	S9U l        Ubo  [        R                  " Xx5      U l
        [        R                  " [        U5       Vs/ s H#  n[        U R                  UUSUUU;   a  SOSS9PM%     sn5      U l        OOS U l
        [        R                  " [        U5       Vs/ s H  n[        U R                  UUS9PM     sn5      U l        [        R                  " / 5      U l        [        [#        U R                  5      5       HT  n[        R                  " U R                  U R                  5      n[%        U5      nU R                   R'                  U5        MV     [	        UUUX<-   U R                  S S9n[%        U5      U l        SU l        g s  snf s  snf )N)ÚheightÚwidthr-   r.   Ú	embed_dimr6   r:   )Úembedding_dimr4   FT)Údimr1   r0   Úcontext_pre_onlyr9   Úuse_dual_attention)rB   r1   r0   )r>   r?   r-   r.   r@   r:   )ÚsuperÚ__init__r5   Ú	inner_dimr   Ú	pos_embedr   Útime_text_embedÚnnÚLinearÚcontext_embedderÚ
ModuleListÚranger   Útransformer_blocksr   Úcontrolnet_blocksÚlenr   ÚappendÚpos_embed_inputÚgradient_checkpointing)Úselfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   Údefault_out_channelsÚiÚ_Úcontrolnet_blockrS   Ú	__class__s                          €r(   rF   ÚSD3ControlNetModel.__init__W   sò  ø€ ô* 	‰ÑÔØ*ÐØ,8Ñ,D™LÐJ^ˆÔØ,ÑAˆŒæÜ'Ø"Ø!Ø%Ø'ØŸ.™.Ø#5Ø-ñˆDNð "ˆDŒNÜAØŸ.™.Ð@Uñ 
ˆÔð Ñ*Ü$&§I¢IÐ.AÓ$ZˆDÔ!ô ')§m¢mô # :Ô.ó
ò /˜ô *Ø ŸN™NØ,?Ø+=Ø).Ø 'Ø34Ð8MÓ3M©4ÐSXôñ /ñ
ó'ˆDÕ#ð %)ˆDÔ!Ü&(§m¢mô # :Ô.óò /˜ô .Ø ŸN™NØ,?Ø+=ôñ
 /ñó	'ˆDÔ#ô "$§¢¨rÓ!2ˆÔÜ”s˜4×2Ñ2Ó3Ö4ˆAÜ!Ÿyšy¨¯©¸¿¹ÓHÐÜ*Ð+;Ó<ÐØ×"Ñ"×)Ñ)Ð*:Ö;ñ 5ô %ØØØ!Ø#ÑAØ—n‘nØñ
ˆô  +¨?Ó;ˆÔà&+ˆÕ#ùòS
ùòs   Â(*G1ÄG6Ú
chunk_sizerB   Úreturnc                 óà   ^• US;  a  [        SU 35      eU=(       d    SnS[        R                  R                  S[        S[        4U4S jjmU R                  5        H  nT" X1U5        M     g)	a  
Sets the attention processor to use [feed forward
chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

Parameters:
    chunk_size (`int`, *optional*):
        The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
        over each tensor of dim=`dim`.
    dim (`int`, *optional*, defaults to `0`):
        The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
        or dim=1 (sequence length).
)r   r   z-Make sure to set `dim` to either 0 or 1, not r   Úmoduler\   rB   c                 ó†   >• [        U S5      (       a  U R                  XS9  U R                  5        H  nT" X1U5        M     g )NÚset_chunk_feed_forward)r\   rB   )Úhasattrra   Úchildren)r_   r\   rB   ÚchildÚfn_recursive_feed_forwards       €r(   re   ÚMSD3ControlNetModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardÅ   s=   ø€ ÜvÐ7×8Ñ8Ø×-Ñ-¸Ð-ÑMàŸ™Ö*Ù)¨%¸SÖAò +r'   N)Ú
ValueErrorr#   rJ   ÚModuleÚintrc   )rU   r\   rB   r_   re   s       @r(   Úenable_forward_chunkingÚ*SD3ControlNetModel.enable_forward_chunking²   sn   ø€ ð fÓÜÐLÈSÈEÐRÓSÐSð  —_ 1ˆ
ð	B¬e¯h©h¯o©oð 	BÌ3ð 	BÔUX÷ 	Bð —m‘m–oˆFÙ% f¸#Ö>ò &r'   c                 ó‚  • SU l         U R                  R                  5        H3  u  pS[        UR                  R
                  5      ;   d  M*  [        S5      e   U R                  U l         U R                  5        H)  n[        U[        5      (       d  M  UR                  SS9  M+     U R                  [        5       5        g)uò   
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
are fused. For cross-attention modules, key and value projection matrices are fused.

> [!WARNING] > This API is ðŸ§ª experimental.
NÚAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)Úfuse)Úoriginal_attn_processorsÚattn_processorsÚitemsÚstrrZ   r   rg   ÚmodulesÚ
isinstancer   Úfuse_projectionsÚset_attn_processorr   )rU   rX   Úattn_processorr_   s       r(   Úfuse_qkv_projectionsÚ'SD3ControlNetModel.fuse_qkv_projectionsÐ   sŸ   € ð )-ˆÔ%à!%×!5Ñ!5×!;Ñ!;Ö!=ÑˆAØœ#˜n×6Ñ6×?Ñ?Ó@Õ@Ü Ð!tÓuÐuñ ">ð )-×(<Ñ(<ˆÔ%à—l‘l–nˆFÜ˜&¤)×,Ó,Ø×'Ñ'¨TÐ'Ó2ñ %ð 	×ÑÔ :Ó <Õ=r'   c                 óV   • U R                   b  U R                  U R                   5        gg)u^   Disables the fused QKV projection if enabled.

> [!WARNING] > This API is ðŸ§ª experimental.

N)ro   rv   )rU   s    r(   Úunfuse_qkv_projectionsÚ)SD3ControlNetModel.unfuse_qkv_projectionsæ   s)   € ð ×(Ñ(Ñ4Ø×#Ñ# D×$AÑ$AÕBð 5r'   c           	      óN  • [        UR                  R                  UR                  R                  UR                  R                  UR                  R                  UR
                  UR                  R                  S9nUR                  UR                  R                  5       SS9  U$ )N)r>   r?   r-   r.   r@   r6   T©Ústrict)
r   Úconfigr,   r-   r.   rG   r6   Úload_state_dictrH   Ú
state_dict)rU   ÚtransformerrH   s      r(   Ú_get_pos_embed_from_transformerÚ2SD3ControlNetModel._get_pos_embed_from_transformerñ   sŠ   € ÜØ×%Ñ%×1Ñ1Ø×$Ñ$×0Ñ0Ø"×)Ñ)×4Ñ4Ø#×*Ñ*×6Ñ6Ø!×+Ñ+Ø*×1Ñ1×DÑDñ
ˆ	ð 	×!Ñ! +×"7Ñ"7×"BÑ"BÓ"DÈTÐ!ÑRØÐr'   c                 óP  • UR                   nU=(       d    UR                  US'   X5S'   U R                  U5      nU(       aå  UR                  R	                  UR                  R                  5       5        UR                  R	                  UR                  R                  5       5        UR                  R	                  UR                  R                  5       5        UR                  R	                  UR                  R                  5       SS9  [        UR                  5      Ul
        U$ )Nr/   r7   Fr~   )r€   r/   Úfrom_configrH   r   r‚   rI   rL   rO   r   rS   )Úclsrƒ   r/   Únum_extra_conditioning_channelsÚload_weights_from_transformerr€   Ú
controlnets          r(   Úfrom_transformerÚ#SD3ControlNetModel.from_transformerý   sé   € ð ×#Ñ#ˆØ)×>¨V×->Ñ->ˆˆ|ÑØ0OÐ,Ñ-Ø—_‘_ VÓ,ˆ
æ(Ø× Ñ ×0Ñ0°×1FÑ1F×1QÑ1QÓ1SÔTØ×&Ñ&×6Ñ6°{×7RÑ7R×7]Ñ7]Ó7_Ô`Ø×'Ñ'×7Ñ7¸×8TÑ8T×8_Ñ8_Ó8aÔbØ×)Ñ)×9Ñ9¸+×:XÑ:X×:cÑ:cÓ:eÐnsÐ9Ñtä)4°Z×5OÑ5OÓ)PˆJÔ&àÐr'   Újoint_attention_kwargsÚhidden_statesÚcontrolnet_condÚconditioning_scaleÚencoder_hidden_statesÚpooled_projectionsÚtimestepÚreturn_dictc	                 ó¶  • U R                   b  UR                  S:w  a  [        S5      eU R                   c  UR                  S:w  a  [        S5      eU R                  b  Uc  [        S5      eU R                  c  Ub  [        S5      eU R                   b  U R                  U5      nU R	                  Xe5      n	U R                  b  U R                  U5      nXR                  U5      -   nSn
U R                   H‰  n[        R                  " 5       (       aH  U R                  (       a7  U R                  b  U R                  UUUU	5      u  pAO2U R                  X±U	5      nOU R                  b
  U" XU	S9u  pAOU" X5      nX¡4-   n
M‹     Sn[        X R                  5       H  u  pÞU" U5      nXÍ4-   nM     U Vs/ s H  oÿU-  PM	     nnU(       d  U4$ [        US	9$ s  snf )
aç  
The [`SD3Transformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
        Input `hidden_states`.
    controlnet_cond (`torch.Tensor`):
        The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
    conditioning_scale (`float`, defaults to `1.0`):
        The scale factor for ControlNet outputs.
    encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
    pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
        from the embeddings of input conditions.
    timestep ( `torch.LongTensor`):
        Used to indicate denoising step.
    joint_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.

Returns:
    If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
    `tuple` where the first element is the sample tensor.
é   z/hidden_states must be 4D when pos_embed is usedr   z3hidden_states must be 3D when pos_embed is not usedzDencoder_hidden_states must be provided when context_embedder is usedzNencoder_hidden_states should not be provided when context_embedder is not usedr   )r   r’   Útemb)r   )rH   Úndimrg   rL   rI   rS   rO   r#   Úis_grad_enabledrT   Ú_gradient_checkpointing_funcÚziprP   r   )rU   r   r   r‘   r’   r“   r”   rŽ   r•   r˜   Úblock_res_samplesÚblockÚcontrolnet_block_res_samplesÚblock_res_samplerY   Úsamples                   r(   ÚforwardÚSD3ControlNetModel.forward  s  € ðP >‰>Ñ%¨-×*<Ñ*<ÀÓ*AÜÐNÓOÐOð ^‰^Ñ#¨×(:Ñ(:¸aÓ(?ÜÐRÓSÐSà× Ñ Ñ,Ð1FÑ1NÜÐcÓdÐdà×"Ñ"Ñ*Ð/DÑ/PÜÐmÓnÐnà>‰>Ñ%Ø ŸN™N¨=Ó9ˆMà×#Ñ# HÓAˆà× Ñ Ñ,Ø$(×$9Ñ$9Ð:OÓ$PÐ!ð &×(<Ñ(<¸_Ó(MÑMˆàÐà×,Ô,ˆEÜ×$Ò$×&Ñ&¨4×+F×+FØ×(Ñ(Ñ4Ø;?×;\Ñ;\ØØ%Ø-Øó	<Ñ8Ð)¨=ð %)×$EÑ$EÀeÐ\`Ó$a‘Mð ×(Ñ(Ñ4Ù;@Ø&3Ðgkñ<Ñ8Ð)¨=ñ
 %*¨-Ó$>Mà 1Ð4DÑ DÒñ- -ð0 (*Ð$Ü25Ð6G×I_ÑI_Ö2`Ñ.ÐÙ/Ð0@ÓAÐØ+GÐJ]Ñ+]Ò(ñ 3añ
 SoÓ'oÒRnÈÐ1CÔ(CÑRnÐ$Ð'oæØ0Ð2Ð2ä"Ð<XÑYÐYùò (ps   Æ3G)
rL   rP   rT   rG   ro   r5   rH   rS   rI   rO   )é€   r   é   é   é@   r¦   i   i€  i   r¥   é`   r   r   NÚsincosTT)Nr   )é   r   T)g      ð?NNNNT)r   r   r    r!   Ú__doc__Ú _supports_gradient_checkpointingr   ri   r"   rr   ÚboolrF   rj   rx   r{   r„   ÚclassmethodrŒ   r
   r#   r$   ÚfloatÚ
LongTensorÚdictr   r   r¢   r&   Ú__classcell__©rZ   s   @r(   r*   r*   *   s3  ø† ñ(ðT (,Ð$àð ØØØØ"$Ø#%Ø#'Ø&*Ø%)ØØ"$Ø+,Ø13Ø"Ø%-Ø"Ø26ñ%W,àðW,ð ðW,ð ð	W,ð
 ðW,ð  ðW,ð !ðW,ð !ðW,ð !$ðW,ð  #ðW,ð ðW,ð  ðW,ð &)ðW,ð  % S¨# X™ðW,ð t‘ðW,ð  ˜d™
ð!W,ð" ð#W,ð$ ,0÷%W,ó ðW,ñt?°#¸±*ð ?È#ð ?ÐVZõ ?ò<>ò,Cò
ð àjnóó ðñ$ Ð.Ó/ð
 %(Ø.2Ø+/Ø%)Ø8<Ø ñeZà—|‘|ðeZð Ÿ™ðeZð "ð	eZð
  %Ÿ|™|ðeZð "ŸL™LðeZð ×"Ñ"ðeZð !% S¨# X¡°Ñ 5ðeZð ðeZð 
‰Ð0Ñ	0ôeZó 0öeZr'   r*   c                   óì   ^ • \ rS rSrSrU 4S jr    SS\R                  S\\R                     S\\
   S\R                  S	\R                  S
\R                  S\\\4   S-  S\S\\-  4S jjrSrU =r$ )ÚSD3MultiControlNetModeliy  aœ  
`SD3ControlNetModel` wrapper class for Multi-SD3ControlNet

This module is a wrapper for multiple instances of the `SD3ControlNetModel`. The `forward()` API is designed to be
compatible with `SD3ControlNetModel`.

Args:
    controlnets (`list[SD3ControlNetModel]`):
        Provides additional conditioning to the unet during the denoising process. You must set multiple
        `SD3ControlNetModel` as a list.
c                 óX   >• [         TU ]  5         [        R                  " U5      U l        g )N)rE   rF   rJ   rM   Únets)rU   ÚcontrolnetsrZ   s     €r(   rF   Ú SD3MultiControlNetModel.__init__†  s   ø€ Ü‰ÑÔÜ—M’M +Ó.ˆ	r'   Nr   r   r‘   r“   r’   r”   rŽ   r•   r]   c	                 ó  • [        [        X#U R                  5      5       HV  u  n	u  p«nU" UUUUU
UUUS9nU	S:X  a  UnM   [        WS   US   5       VVs/ s H  u  nnUU-   PM     nnn[        U5      4nMX     W$ s  snnf )N)r   r”   r’   r“   r   r‘   rŽ   r•   r   )Ú	enumeraterœ   r·   r"   )rU   r   r   r‘   r“   r’   r”   rŽ   r•   rW   ÚimageÚscaler‹   Úblock_samplesÚcontrol_block_samplesÚcontrol_block_sampleÚblock_samples                    r(   r¢   ÚSD3MultiControlNetModel.forwardŠ  s¾   € ô .7´s¸?Ð`d×`iÑ`iÓ7jÖ-kÑ)ˆAÑ)˜jÙ&Ø+Ø!Ø&;Ø#5Ø %Ø#(Ø'=Ø'ñ	ˆMð A‹vØ(5Ò%ô ?BÐBWÐXYÑBZÐ\iÐjkÑ\lÔ>mô)â>mÑ:Ð,¨lð )¨<Ô7Ù>mð &ñ )ô */Ð/DÓ)EÐ(GÒ%ñ) .lð, %Ð$ùó)s   ÁA<)r·   )NNNT)r   r   r    r!   r«   rF   r#   r$   ÚlistÚtensorr¯   r°   r±   rr   r   r­   r   r"   r¢   r&   r²   r³   s   @r(   rµ   rµ   y  s®   ø† ñ
õ/ð /3Ø%)Ø8<Ø ñ!%à—|‘|ð!%ð ˜eŸl™lÑ+ð!%ð ! ™Kð	!%ð
 "ŸL™Lð!%ð  %Ÿ|™|ð!%ð ×"Ñ"ð!%ð !% S¨# X¡°Ñ 5ð!%ð ð!%ð 
˜uÑ	$÷!%ó !%r'   rµ   )(Údataclassesr   Útypingr   r#   Útorch.nnrJ   Úconfiguration_utilsr   r   Úloadersr   r	   Úutilsr
   r   Ú	attentionr   r   Úattention_processorr   r   Ú
embeddingsr   r   Úmodeling_outputsr   Úmodeling_utilsr   Útransformers.transformer_sd3r   r‹   r   r   Ú
get_loggerr   Úloggerr   r*   rµ   r   r'   r(   Ú<module>rÓ      s…   ðõ  "Ý ã Ý ç Bß ?ß .ß =ß Gß GÝ 7Ý 'Ý Dß /ð 
×	Ò	˜HÓ	%€ð ô2˜*ó 2ó ð2ôLZ˜ ^°[ÐBRÐTjô LZô^
2%˜jõ 2%r'   