
    
3j3                        S SK Jr  S SKJrJrJrJr  S SKrS SKJ	r	  SSK
JrJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJrJrJrJr  \" 5       (       a  S SKJr  \R:                  " \5      r\ " S S\5      5       r  " S S\\\5      r!g)    )	dataclass)ListOptionalTupleUnionN   )ConfigMixinregister_to_config)FromOriginalModelMixin)
BaseOutputis_torchvision_availablelogging   )
ModelMixin)CosmosEmbeddingCosmosLearnablePositionalEmbedCosmosPatchEmbedCosmosRotaryPosEmbedCosmosTransformerBlock)
transformsc                   >    \ rS rSr% Sr\\R                     \S'   Sr	g)CosmosControlNetOutput   z
Output of [`CosmosControlNetModel`].

Args:
    control_block_samples (`list[torch.Tensor]`):
        List of control block activations to be injected into transformer blocks.
control_block_samples N)
__name__
__module____qualname____firstlineno____doc__r   torchTensor__annotations____static_attributes__r       h/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/controlnets/controlnet_cosmos.pyr   r      s      --r%   r   c            &       |  ^  \ rS rSrSrSr/ SQrS/rS/r\	                  S)S\
S	\
S
\
S\
S\
S\
S\S\
S\
S\\
\
\
4   S\\
\
\
4   S\\\\4   S\S-  S\
S-  S\
S\S\
S\
4$U 4S jjj5       rS\\\   -  S\\   4S jr     S*S\R(                  S\R(                  S \R(                  S!\\\R(                     \\\R(                     \\R(                     4   4   S"\R(                  S\\\   -  S#\R(                  S-  S$\R(                  S-  S%\
S-  S&\S\\\\\R(                        4   4S' jjrS(rU =r$ )+CosmosControlNetModel'   a  
ControlNet for Cosmos Transfer2.5.

This model duplicates the shared embedding modules from the transformer (patch_embed, time_embed,
learnable_pos_embed, img_context_proj) to enable proper CPU offloading. The forward() method computes everything
internally from raw inputs.
T)patch_embedpatch_embed_base
time_embedr   learnable_pos_embedNn_controlnet_blocksin_channelslatent_channelsmodel_channelsnum_attention_headsattention_head_dim	mlp_ratiotext_embed_dimadaln_lora_dim
patch_sizemax_size
rope_scaleextra_pos_embed_typeimg_context_dim_inimg_context_dim_outuse_crossattn_projectioncrossattn_proj_in_channelsencoder_hidden_states_channelsc                   > [         TU ]  5         [        X$U
SS9U l        [        X4U
SS9U l        [        XD5      U l        S U l        US:X  a  [        UUU
S9U l        S U l	        UbH  US:  aB  [        R                  " [        R                  " XSS9[        R                  " 5       5      U l	        S U l        U(       aC  [        R                  " [        R                  " UUSS9[        R                  " 5       5      U l        [        XkXS9U l        [        R"                  " [%        U5       Vs/ s H&  n['        UUUUU	SSUS L=(       a    US:  US:H  SS	9
PM(     sn5      U l        SU l        g s  snf )
NF)bias	learnable)hidden_sizer8   r7   r   T)rC   r8   r7   r9   rms_norm)
r2   r3   cross_attention_dimr4   r6   qk_normout_biasimg_contextbefore_proj
after_proj)super__init__r   r*   r+   r   r,   r-   r   img_context_projnn
SequentialLinearGELUcrossattn_projr   rope
ModuleListranger   control_blocksgradient_checkpointing)selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   	block_idx	__class__s                       r&   rL   CosmosControlNetModel.__init__5   sv   , 	+KZ_` 0R\ch i).I#' ;.'E*!%(D$ !%).@1.D$&MM		,M	%D! ##"$--		46T[_`	#D )**
	 !mm "'':!; "<I '(;'9(6'#1&" 2$ > YCUXYCY!*a# "<
$ ',##s   ,-E-conditioning_scalereturnc                 n   [        U[        5      (       a  UnOU/[        U R                  5      -  n[        U5      [        U R                  5      :  aa  [        R                  S[        U5      [        U R                  5      5        U[        U R                  5      -  S [        U R                  5       nU$ )NzoReceived %d control scales, but control network defines %d blocks. Scales will be trimmed or repeated to match.)
isinstancelistlenrV   loggerwarning)rX   r\   scaless      r&   _expand_conditioning_scale0CosmosControlNetModel._expand_conditioning_scale   s    ($//'F()C0C0C,DDFv;T0011NN?FD''(	 s4#6#6779S3t?R?R;STFr%   controls_latentslatentstimestepencoder_hidden_statescondition_maskpadding_maskattention_maskfpsreturn_dictc                 	  ^)^*^+^, UR                   u  ppnUnU R                  R                  S-
  nUR                   S   US-
  :  aX  US-
  UR                   S   -
  n[        R                  " U[        R
                  " UUXU4UR                  UR                  S9/SS9nUb  [        R                  " UU/SS9nO4[        R                  " U[        R                  " USS2SS24   5      /SS9n[        R                  R                  U[        UR                   SS 5      [        R                  R                  S9n[        R                  " UUR                  S5      R!                  USUSS5      /SS9nUnUb  [        R                  " UU/SS9n[        R                  R                  U[        UR                   SS 5      [        R                  R                  S9n[        R                  " UUR                  S5      R!                  USUSS5      /SS9nU R#                  UU	S9nU R$                  (       a  U R%                  U5      OSnU R'                  U5      nUR)                  SS	5      nU R                  R*                  u  nnnUU-  m+UU-  m*UU-  m,U R-                  U5      nUR)                  SS	5      nUR.                  S:X  a  U R1                  UU5      u  nnOUR.                  S
:X  au  UR                   u  m)nn  nUR                   T)SUSS4:X  d   SUR                    35       eUR)                  5       nU R1                  UU5      u  nnU)U*U+U,4S jUU4 5       u  nnO[3        SUR                    35      e[5        U[6        5      (       a  Uu  n n!OUn Sn!U R8                  b  U R9                  U 5      n U!b  U R:                  b  U R;                  U!5      n!U R                  R<                  b  U R                  R<                  S:  a  U U!4n"OU n"Ub   UR                  S5      R                  S5      nU R?                  U5      n#/ n$[A        [C        U RD                  U#5      5       Hy  u  n%u  n&n'[        RF                  " 5       (       a0  U RH                  (       a  U RK                  U&UU"UUUUUSUU%5      u  nn(OU&" UU"UUUUUSUU%S9
u  nn(U$RM                  U(U'-  5        M{     U
(       d  U$4$ [O        U$S9$ )a  
Forward pass for the ControlNet.

Args:
    controls_latents: Control signal latents [B, C, T, H, W]
    latents: Base latents from the noising process [B, C, T, H, W]
    timestep: Diffusion timestep tensor
    encoder_hidden_states: Tuple of (text_context, img_context) or text_context
    condition_mask: Conditioning mask [B, 1, T, H, W]
    conditioning_scale: Scale factor(s) for control outputs
    padding_mask: Padding mask [B, 1, H, W] or None
    attention_mask: Optional attention mask or None
    fps: Frames per second for RoPE or None
    return_dict: Whether to return a CosmosControlNetOutput or a tuple

Returns:
    CosmosControlNetOutput or tuple of control tensors
   )dtypedevice)dimN)interpolationr   )rn   r      z9Expected timestep to have shape [B, 1, T, 1, 1], but got c              3      >#    U  H=  nUR                  TTS S S5      R                  SSTTS5      R                  S S5      v   M?     g7f)rq   r   N)viewexpandflatten).0x
batch_sizepost_patch_heightpost_patch_num_framespost_patch_widths     r&   	<genexpr>0CosmosControlNetModel.forward.<locals>.<genexpr>   sR      ' 3A z#8!QCB 13CRHA 3s   AAz@Expected timestep to have shape [B, 1, T, 1, 1] or [T], but got r   )
hidden_statesrj   embedded_timesteptembimage_rotary_embextra_pos_embrm   controlnet_residualrh   rY   )r   )(shapeconfigr/   r!   catzerosrr   rs   
zeros_liker   
functionalresizer`   InterpolationModeNEAREST	unsqueezerepeatrS   r-   r*   r|   r7   r+   ndimr,   
ValueErrorr_   tuplerR   rM   r;   re   	enumerateziprV   is_grad_enabledrW   _gradient_checkpointing_funcappendr   )-rX   rg   rh   ri   rj   rk   r\   rl   rm   rn   ro   BCTHWcontrol_hidden_statesvace_in_channelspad_Cpadding_mask_resizedbase_hidden_statesbase_padding_maskr   r   p_tp_hp_wr   r   _
num_framestimestep_flattext_contextrH   processed_encoder_hidden_statesrd   resultrY   blockscalecontrol_projr   r   r   r   s-                                            @@@@r&   forwardCosmosControlNetModel.forward   sY   > )..aA !1;;22Q6 &&q),<q,@@$q(+@+F+Fq+IIE$)II)KKE1+3H3N3NWlWsWs %! %$)II/Dn.U[\$]!$)II&(8(89I!RaR%9P(QRXY%!  *44;;$4::23?@PZPlPlPtPt  <  
 !&		"$8$B$B1$E$L$LQPQSTVWYZ$[\bc!

 %%!&,>+OUV!W&1188$177<=ZMiMiMqMq 9 
 #YY!2!<!<Q!?!F!Fq!QPQST!UV\]

  99%:9DKOKcKc001FGim !% 0 01F G 5 = =a C ..S# !SH8!223EF/771= ==A&*oo6H(&S#D#]]a.5mm+J:q!>>j!ZA%FF KHNNK[\F %,,.M&*oo6H-&X#D#'  12	'#D# _`h`n`n_opqq +U33(=%L+0LK *..|<L "t'<'<'H//<K ;;))5$++:X:X[\:\/;[.I+.:+ %+55a8BB1EN 001CD)23t7J7JF3S)T%I~u$$&&4+F+F6:6W6W)3%$!"&73%| 7<"7*I&7%5"/#1(,.'73%| MM,./9 *U< 9%FCCr%   )	rV   rR   rW   rM   r-   r*   r+   rS   r,   )                   g      @      )rq   r   r   )r      r   )g       @      ?r   NNr   Fr   r   )r   NNNT)r   r   r   r   r     _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modules_keep_in_fp32_modulesr
   intfloatr   strboolrL   r`   r   re   r!   r"   r   r   r   r   r$   __classcell__)rZ   s   @r&   r(   r(   '   s    (,$'X$1223 $%!"#%"%"!+4)81@+/)-#').*..2'J, J, J, 	J,
 J, !J,  J, J, J, J, #sC-(J, S#&J, %-.J, "DjJ,  $JJ,  !!J," #'#J,$ %(%J,& ),'J, J,XUT%[=P UYZ_U` . 36,0.2 kD,,kD kD ,,	kD
  %Xell%;U8ELLCY[cdidpdp[qCq=r%rskD kD "DK/kD llT)kD t+kD 4ZkD kD 
%uT%,,-?'@@	AkD kDr%   r(   )"dataclassesr   typingr   r   r   r   r!   torch.nnrN   configuration_utilsr	   r
   loadersr   utilsr   r   r   modeling_utilsr   transformers.transformer_cosmosr   r   r   r   r   torchvisionr   
get_loggerr   rb   r   r(   r   r%   r&   <module>r      s{    ! / /   B - B B '  &			H	% 	.Z 	. 	.VDJ5K VDr%   