
    
3j                      ,   S SK Jr  S SKrS SKJr  S SKJs  Jr  SSKJ	r	  SSK
Jr  SSKJr  SSKJrJrJrJrJr  SS	KJr  SS
KJrJrJrJrJrJr   " S S\R:                  5      r " S S\R:                  5      rS\R@                  S\R@                  4S jr! " S S\R:                  5      r" " S S\R:                  5      r# " S S\R:                  5      r$ " S S\R:                  5      r% " S S\R:                  5      r& " S S\R:                  5      r'g)    )partialN   )	deprecate   )get_activation)SpatialNorm)Downsample1DDownsample2DFirDownsample2DKDownsample2Ddownsample_2d)AdaGroupNorm)FirUpsample2DKUpsample2D
Upsample1D
Upsample2Dupfirdn2d_nativeupsample_2dc            "          ^  \ rS rSrSrSSSSSSSS	S
SSSSSSS.S\S\S-  S\S\S\S\S\S-  S\S\S\S\S\S-  S\S\S\S\S-  4 U 4S jjjr	S\
R                  S \
R                  S!\
R                  4S" jrS#rU =r$ )$ResnetBlockCondNorm2D+   a  
A Resnet block that use normalization layer that incorporate conditioning information.

Parameters:
    in_channels (`int`): The number of channels in the input.
    out_channels (`int`, *optional*, default to be `None`):
        The number of output channels for the first conv2d layer. If None, same as `in_channels`.
    dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
    temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
    groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
    groups_out (`int`, *optional*, default to None):
        The number of groups to use for the second normalization layer. if set to None, same as `groups`.
    eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
    time_embedding_norm (`str`, *optional*, default to `"ada_group"` ):
        The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial".
    kernel (`torch.Tensor`, optional, default to None): FIR filter, see
        [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
    output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
    use_in_shortcut (`bool`, *optional*, default to `True`):
        If `True`, add a 1x1 nn.conv2d layer for skip-connection.
    up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
    down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
    conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
        `conv_shortcut` output.
    conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
        If None, same as `out_channels`.
NF               ư>swish	ada_group      ?T)out_channelsconv_shortcutdropouttemb_channelsgroups
groups_outepsnon_linearitytime_embedding_normoutput_scale_factoruse_in_shortcutupdownconv_shortcut_biasconv_2d_out_channelsin_channelsr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   c          	      $  > [         TU ]  5         Xl        Uc  UOUnX l        X0l        Xl        Xl        Xl        Xl        Uc  UnU R                  S:X  a  [        XQXhS9U l
        O9U R                  S:X  a  [        X5      U l
        O[        SU R                   35      e[        R                  " XSSSS9U l        U R                  S:X  a  [        XRXxS9U l        O9U R                  S:X  a  [        X%5      U l        O[        SU R                   35      e["        R                  R%                  U5      U l        U=(       d    Un[        R                  " UUSSSS9U l        [+        U	5      U l        S =U l        U l        U R
                  (       a  [3        USS	9U l        O"U R                  (       a  [5        USSS
S9U l        Uc  U R                  U:g  OUU l        S U l        U R6                  (       a  [        R                  " UUSSSUS9U l        g g )Nr   )r%   spatialz" unsupported time_embedding_norm:    r   kernel_sizestridepaddingFuse_convopr7   r5   namer   r3   r4   r5   bias)super__init__r.   r   use_conv_shortcutr*   r+   r(   r'   r   norm1r   
ValueErrornnConv2dconv1norm2torchDropoutr!   conv2r   nonlinearityupsample
downsampler   r
   r)   r    )selfr.   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   	__class__s                    Q/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/models/resnet.pyr>   ResnetBlockCondNorm2D.__init__I   s   ( 	&&2&:{(!.	#6 #6 J##{2%m&RDJ%%2$[@DJA$BZBZA[\]]YY{aPQ[\]
##{2%m:WDJ%%2$\ADJA$BZBZA[\]]xx''03C|YY|-AqYZdef
*=9*..77&{UCDMYY*;PQX\]DOKZKbt//3GGhw!!#$'"D      input_tensortembreturnc                    [        U5      S:  d  UR                  SS 5      b  Sn[        SSU5        UnU R                  Xb5      nU R	                  U5      nU R
                  bV  UR                  S   S:  a   UR                  5       nUR                  5       nU R                  U5      nU R                  U5      nO/U R                  b"  U R                  U5      nU R                  U5      nU R                  U5      nU R                  Xb5      nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  b  U R                  U5      nX-   U R                  -  nU$ )Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0@   )lengetr   r@   rI   rJ   shape
contiguousrK   rD   rE   r!   rH   r    r(   )rL   rQ   rR   argskwargsdeprecation_messagehidden_statesoutput_tensors           rN   forwardResnetBlockCondNorm2D.forward   sN   t9q=FJJw5A #Ugw(;<$

=7))-8==$""1%++668 - 8 8 :==6L MM-8M__(??<8L OOM:M

=1

=7))-8]3

=1)--l;L%59Q9QQrP   )rD   rH   r    r+   rK   r!   r.   rI   r@   rE   r   r(   r'   r*   rJ   r?   r)   )__name__
__module____qualname____firstlineno____doc__intboolfloatstrr>   rF   Tensorrb   __static_attributes____classcell__rM   s   @rN   r   r   +   s3   B $(# !%$#.%('+#'+/%I I Dj	I
 I I I I $JI I I !I #I I I  !I" !#I$ "Dj%I IV%ELL % %Z_ZfZf % %rP   r   c            (       *  ^  \ rS rSrSrSSSSSSSS	S
SSSSSSSSSS.S\S\S-  S\S\S\S\S\S-  S\S\S\S\S\S\	R                  S-  S\S\S-  S\S\S\S \S-  4&U 4S! jjjrS"\	R                  S#\	R                  S$\	R                  4S% jrS&rU =r$ )'ResnetBlock2D   a  
A Resnet block.

Parameters:
    in_channels (`int`): The number of channels in the input.
    out_channels (`int`, *optional*, default to be `None`):
        The number of output channels for the first conv2d layer. If None, same as `in_channels`.
    dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
    temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
    groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
    groups_out (`int`, *optional*, default to None):
        The number of groups to use for the second normalization layer. if set to None, same as `groups`.
    eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
    time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
        By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a
        stronger conditioning with scale and shift.
    kernel (`torch.Tensor`, optional, default to None): FIR filter, see
        [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
    output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
    use_in_shortcut (`bool`, *optional*, default to `True`):
        If `True`, add a 1x1 nn.conv2d layer for skip-connection.
    up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
    down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
    conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
        `conv_shortcut` output.
    conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
        If None, same as `out_channels`.
NFr   r   r   Tr   r   defaultr   )r   r    r!   r"   r#   r$   pre_normr%   r&   skip_time_actr'   kernelr(   r)   r*   r+   r,   r-   r.   r   r    r!   r"   r#   r$   ru   r%   r&   rv   r'   rw   r(   r)   r*   r+   r,   r-   c          	        >^ [         TU ]  5         US:X  a  [        S5      eUS:X  a  [        S5      eSU l        Xl        Uc  UOUnX l        X0l        UU l        UU l        Xl	        Xl
        Xl        Uc  Un[        R                  R                  XaU	SS9U l        [        R                   " XSSSS	9U l        Ubu  U R                  S
:X  a  [        R$                  " XR5      U l        OPU R                  S:X  a   [        R$                  " USU-  5      U l        O [        SU R                   S35      eS U l        [        R                  R                  XrU	SS9U l        [        R                  R+                  U5      U l        U=(       d    Un[        R                   " UUSSSS	9U l        [1        U
5      U l        S =U l        U l        U R                  (       aI  US:X  a  SmU4S jU l        OUS:X  a  [9        [:        R<                  SSS9U l        Ok[?        USS9U l        O[U R                  (       aJ  US:X  a  SmU4S jU l        O6US:X  a  [9        [:        R@                  SSS9U l        O[C        USSSS9U l        Uc  U R                  U:g  OUU l"        S U l#        U RD                  (       a  [        R                   " UUSSSUS9U l#        g g )Nr   zkThis class cannot be used with `time_embedding_norm==ada_group`, please use `ResnetBlockCondNorm2D` insteadr0   ziThis class cannot be used with `time_embedding_norm==spatial`, please use `ResnetBlockCondNorm2D` insteadT
num_groupsnum_channelsr%   affiner1   r   r2   rt   scale_shiftr   zunknown time_embedding_norm :  fir)r   r1   r1   r   c                    > [        U TS9$ N)rw   )r   x
fir_kernels    rN   <lambda>(ResnetBlock2D.__init__.<locals>.<lambda>$  s    +a
*KrP   sde_vpg       @nearest)scale_factormodeFr6   c                    > [        U TS9$ r   )r   r   s    rN   r   r   ,  s    M!J,OrP   )r3   r4   r8   r9   r   r;   )$r=   r>   rA   ru   r.   r   r?   r*   r+   r(   r'   rv   rF   rB   	GroupNormr@   rC   rD   Lineartime_emb_projrE   rG   r!   rH   r   rI   rJ   rK   r   Finterpolater   
avg_pool2dr
   r)   r    )rL   r.   r   r    r!   r"   r#   r$   ru   r%   r&   rv   r'   rw   r(   r)   r*   r+   r,   r-   r   rM   s                       @rN   r>   ResnetBlock2D.__init__   s   . 	+-}  )+{  &&2&:{(!.	#6 #6 *JXX''6Y\ei'j
YY{aPQ[\]
$''94%'YY}%K"))]:%'YY}a,>N%O" #A$BZBZA[[\!]^^!%DXX'':^ajn'o
xx''03C|YY|-AqYZdef
*=9*..77)
 K8# 'Ci X *; GYY)
"O8#")!,,Aa"P".{UTU\`"aKZKbt//3GGhw!!#$'"D  rP   rQ   rR   rS   c                    [        U5      S:  d  UR                  SS 5      b  Sn[        SSU5        UnU R                  U5      nU R	                  U5      nU R
                  bV  UR                  S   S:  a   UR                  5       nUR                  5       nU R                  U5      nU R                  U5      nO/U R                  b"  U R                  U5      nU R                  U5      nU R                  U5      nU R                  b>  U R                  (       d  U R	                  U5      nU R                  U5      S S 2S S 2S S 4   nU R                  S:X  a  Ub  Xb-   nU R                  U5      nOqU R                  S:X  aP  Uc  [        SU R                   35      e[        R                   " US	S
S9u  pxU R                  U5      nUS
U-   -  U-   nOU R                  U5      nU R	                  U5      nU R#                  U5      nU R%                  U5      nU R&                  b2  U R(                  (       a  UR                  5       nU R'                  U5      nX-   U R*                  -  n	U	$ )Nr   rU   rV   rW   rX   rt   r}   z9 `temb` should not be None when `time_embedding_norm` is r   r   )dim)rY   rZ   r   r@   rI   rJ   r[   r\   rK   rD   r   rv   r'   rE   rA   rF   chunkr!   rH   r    trainingr(   )
rL   rQ   rR   r]   r^   r_   r`   
time_scale
time_shiftra   s
             rN   rb   ResnetBlock2D.forward?  s<   t9q=FJJw5A #Ugw(;<$

=1))-8==$""1%++668 - 8 8 :==6L MM-8M__(??<8L OOM:M

=1)%%((.%%d+Aq$,<=D##y0 - 4 JJ}5M%%6| OPTPhPhOij  &+[[qa%@"J JJ}5M)Q^<zIM JJ}5M))-8]3

=1) }}+668--l;L%59Q9QQrP   )rD   rH   r    r+   rK   r!   r.   rI   r@   rE   r   r(   ru   rv   r   r'   r*   rJ   r?   r)   )rd   re   rf   rg   rh   ri   rj   rk   rl   rF   rm   r>   rb   rn   ro   rp   s   @rN   rr   rr      sf   D $(# !%$##,&*%('+#'+/+b b Dj	b
 b b b b $Jb b b b b !b t#b  #!b" #b$ %b& 'b( !)b* "Dj+b bH:ELL : :Z_ZfZf : :rP   rr   tensorrS   c                    [        U R                  5      S:X  a  U S S 2S S 2S 4   $ [        U R                  5      S:X  a  U S S 2S S 2S S S 24   $ [        U R                  5      S:X  a  U S S 2S S 2SS S 24   $ [        S[        U 5       S35      e)Nr   r1      r   z`len(tensor)`: z has to be 2, 3 or 4.)rY   r[   rA   )r   s    rN   rearrange_dimsr   }  s    
6<<AaDj!!
6<<AaD!m$$	V\\	a	aAqj!!?3v;-7LMNNrP   c                      ^  \ rS rSrSr  SS\S\S\\\\4   -  S\S\4
U 4S jjjrS	\	R                  S
\	R                  4S jrSrU =r$ )Conv1dBlocki  ax  
Conv1d --> GroupNorm --> Mish

Parameters:
    inp_channels (`int`): Number of input channels.
    out_channels (`int`): Number of output channels.
    kernel_size (`int` or `tuple`): Size of the convolving kernel.
    n_groups (`int`, default `8`): Number of groups to separate the channels into.
    activation (`str`, defaults to `mish`): Name of the activation function.
inp_channelsr   r3   n_groups
activationc                    > [         TU ]  5         [        R                  " XX3S-  S9U l        [        R
                  " XB5      U l        [        U5      U l        g )Nr   r5   )	r=   r>   rB   Conv1dconv1dr   
group_normr   mish)rL   r   r   r3   r   r   rM   s         rN   r>   Conv1dBlock.__init__  sD     	iiK`aQab,,x>":.	rP   inputsrS   c                     U R                  U5      n[        U5      nU R                  U5      n[        U5      nU R                  U5      nU$ N)r   r   r   r   )rL   r   intermediate_reproutputs       rN   rb   Conv1dBlock.forward  sM     KK/*+<= OO,=>*+<=,-rP   )r   r   r   )   r   rd   re   rf   rg   rh   ri   tuplerl   r>   rF   rm   rb   rn   ro   rp   s   @rN   r   r     sw    	   // / 5c?*	/
 / / /ell u||  rP   r   c                      ^  \ rS rSrSr  SS\S\S\S\\\\4   -  S\4
U 4S jjjrS	\	R                  S
\	R                  S\	R                  4S jrSrU =r$ )ResidualTemporalBlock1Di  au  
Residual 1D block with temporal convolutions.

Parameters:
    inp_channels (`int`): Number of input channels.
    out_channels (`int`): Number of output channels.
    embed_dim (`int`): Embedding dimension.
    kernel_size (`int` or `tuple`): Size of the convolving kernel.
    activation (`str`, defaults `mish`): It is possible to choose the right activation function.
r   r   	embed_dimr3   r   c                 4  > [         TU ]  5         [        XU5      U l        [        X"U5      U l        [        U5      U l        [        R                  " X25      U l	        X:w  a  [        R                  " XS5      U l        g [        R                  " 5       U l        g )Nr   )r=   r>   r   conv_inconv_outr   time_emb_actrB   r   time_embr   Identityresidual_conv)rL   r   r   r   r3   r   rM   s         rN   r>    ResidualTemporalBlock1D.__init__  s{     	"<{K#LL*:6		): 9E8TBIIl!4 	Z\ZeZeZg 	rP   r   trS   c                     U R                  U5      nU R                  U5      nU R                  U5      [        U5      -   nU R	                  U5      nX0R                  U5      -   $ )z
Args:
    inputs : [ batch_size x inp_channels x horizon ]
    t : [ batch_size x embed_dim ]

returns:
    out : [ batch_size x out_channels x horizon ]
)r   r   r   r   r   r   )rL   r   r   outs       rN   rb   ResidualTemporalBlock1D.forward  s\     a MM!ll6"^A%66mmC ''///rP   )r   r   r   r   r   )   r   r   rp   s   @rN   r   r     s    	  ./ 

 
 	

 5c?*
 
 
&0ell 0u|| 0 0 0rP   r   c            	          ^  \ rS rSrSr   SS\S\S-  S\S\4U 4S jjjrSS	\R                  S
\S\R                  4S jjr
SrU =r$ )TemporalConvLayeri  a  
Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016

Parameters:
    in_dim (`int`): Number of input channels.
    out_dim (`int`): Number of output channels.
    dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
Nin_dimout_dimr!   norm_num_groupsc                 f  > [         TU ]  5         U=(       d    UnXl        X l        [        R
                  " [        R                  " XA5      [        R                  " 5       [        R                  " XSSS95      U l	        [        R
                  " [        R                  " XB5      [        R                  " 5       [        R                  " U5      [        R                  " X!SSS95      U l        [        R
                  " [        R                  " XB5      [        R                  " 5       [        R                  " U5      [        R                  " X!SSS95      U l        [        R
                  " [        R                  " XB5      [        R                  " 5       [        R                  " U5      [        R                  " X!SSS95      U l        [        R                  R                  U R                  S   R                   5        [        R                  R                  U R                  S   R"                  5        g )Nr1   r   r   )r   r   r   r   )r=   r>   r   r   rB   
Sequentialr   SiLUConv3drD   rG   rH   conv3conv4initzeros_weightr<   )rL   r   r   r!   r   rM   s        rN   r>   TemporalConvLayer.__init__  sb    	#V ]]LL1GGIIIfy)D


 ]]LL2GGIJJwIIgy)D	

 ]]LL2GGIJJwIIgy)D	

 ]]LL2GGIJJwIIgy)D	

 	tzz"~,,-
tzz"~**+rP   r`   
num_framesrS   c                    US S S 24   R                  SU4UR                  SS  -   5      R                  SSSSS5      nUnU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nX1-   nUR                  SSSSS5      R                  UR                  S   UR                  S   -  S4UR                  SS  -   5      nU$ )Nr   r   r   r   r1   r   )reshaper[   permuterD   rH   r   r   )rL   r`   r   identitys       rN   rb   TemporalConvLayer.forward  s    $'"**B
+;m>Q>QRSRT>U+UV^^_`bcefhiklm 	 !

=1

=1

=1

=1 0%--aAq!<DD  #m&9&9!&<<bAMDWDWXYXZD[[
 rP   )rD   rH   r   r   r   r   )Nr   r   )r   rd   re   rf   rg   rh   ri   rk   r>   rF   rm   rb   rn   ro   rp   s   @rN   r   r     so     #!',', t', 	',
 ', ',RU\\ s 5<<  rP   r   c            	          ^  \ rS rSrSr   SS\S\S-  S\S\4U 4S jjjrS	\R                  S
\R                  S\R                  4S jr
SrU =r$ )TemporalResnetBlocki"  a  
A Resnet block.

Parameters:
    in_channels (`int`): The number of channels in the input.
    out_channels (`int`, *optional*, default to be `None`):
        The number of output channels for the first conv2d layer. If None, same as `in_channels`.
    temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
    eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
Nr.   r   r"   r%   c                   > [         TU ]  5         Xl        Uc  UOUnX l        SnU Vs/ s H  ofS-  PM	     nn[        R
                  R                  SXSS9U l        [
        R                  " UUUSUS9U l	        Ub  [
        R                  " X25      U l        OS U l        [        R
                  R                  SX$SS9U l        [        R
                  R                  S5      U l        [
        R                  " UUUSUS9U l        [!        S	5      U l        U R                  U:g  U l        S U l        U R$                  (       a  [
        R                  " UUSSS
S9U l        g g s  snf )Nr   r   r   Try   r   r2   r   silur   )r=   r>   r.   r   rF   rB   r   r@   r   rD   r   r   rE   rG   r!   rH   r   rI   r)   r    )	rL   r.   r   r"   r%   r3   kr5   rM   s	           rN   r>   TemporalResnetBlock.__init__.  sR    	&&2&:{(#./;a6;/XX''2Kae'f
YY#

 $!#=!GD!%DXX''2Lbf'g
xx'',YY#

 +62#//<?!!#"D  A 0s   E rQ   rR   rS   c                    UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  bI  U R                  U5      nU R                  U5      S S 2S S 2S S 2S S 4   nUR	                  SSSSS5      nX2-   nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  b  U R                  U5      nX-   nU$ )Nr   r   r   r1   r   )	r@   rI   rD   r   r   rE   r!   rH   r    )rL   rQ   rR   r`   ra   s        rN   rb   TemporalResnetBlock.forwardd  s    $

=1))-8

=1)$$T*D%%d+Aq!T4,?@D<<1aA.D)0M

=1))-8]3

=1)--l;L$4rP   )rD   rH   r    r!   r.   rI   r@   rE   r   r   r)   )Nr   r   r   rp   s   @rN   r   r   "  ss    	 $( 44 Dj4 	4
 4 4lELL    rP   r   c                      ^  \ rS rSrSr       SS\S\S-  S\S\S\S-  S	\S
\4U 4S jjjr  SS\	R                  S\	R                  S-  S\	R                  S-  4S jjrSrU =r$ )SpatioTemporalResBlocki  a  
A SpatioTemporal Resnet block.

Parameters:
    in_channels (`int`): The number of channels in the input.
    out_channels (`int`, *optional*, default to be `None`):
        The number of output channels for the first conv2d layer. If None, same as `in_channels`.
    temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
    eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
    temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
    merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
    merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
        The merge strategy to use for the temporal mixing.
    switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
        If `True`, switch the spatial and temporal mixing.
Nr.   r   r"   r%   temporal_epsmerge_factorswitch_spatial_to_temporal_mixc	                    > [         T	U ]  5         [        UUUUS9U l        [	        Ub  UOUUb  UOUUUb  UOUS9U l        [        UUUS9U l        g )N)r.   r   r"   r%   )alphamerge_strategyr   )r=   r>   rr   spatial_res_blockr   temporal_res_blockAlphaBlender
time_mixer)
rL   r.   r   r"   r%   r   r   r   r   rM   s
            rN   r>   SpatioTemporalResBlock.__init__  sp     	!.#%'	"
 #6(4(@k)5)A{' , 8c	#
 ')+I
rP   r`   rR   image_only_indicatorc                    UR                   S   nU R                  X5      nUR                   u  pVpxXT-  n	US S S 24   R                  XXgU5      R                  SSSSS5      n
US S S 24   R                  XXgU5      R                  SSSSS5      nUb  UR                  XS5      nU R	                  X5      nU R                  U
UUS9nUR                  SSSSS5      R                  XVXx5      nU$ )Nr   r   r   r   r1   r   )	x_spatial
x_temporalr   )r[   r   r   r   r   r   )rL   r`   rR   r   r   batch_frameschannelsheightwidth
batch_sizehidden_states_mixs              rN   rb   SpatioTemporalResBlock.forward  s$    *//3
..}C0=0C0C-!/
 $'"**:8UZ[ccdeghjkmnpqr 	 $'"**:8UZ[ccdeghjkmnpqr 	 <<
;D//D'$!5 ( 
 &--aAq!<DD\]ckrP   )r   r   r   )Nr   r   Ng      ?learned_with_imagesF)NN)rd   re   rf   rg   rh   ri   rk   rj   r>   rF   rm   rb   rn   ro   rp   s   @rN   r   r     s    ( $( %)!,/4

 Dj
 	

 
 dl
 
 )-
 
H %)48	|| llT! $llT1	 rP   r   c            	          ^  \ rS rSrSr/ SQr  SS\S\S\4U 4S jjjr	S\
R                  S	\S
\
R                  4S jr SS\
R                  S\
R                  S\
R                  S-  S
\
R                  4S jjrSrU =r$ )r   i  a  
A module to blend spatial and temporal features.

Parameters:
    alpha (`float`): The initial value of the blending factor.
    merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
        The merge strategy to use for the temporal mixing.
    switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
        If `True`, switch the spatial and temporal mixing.
)learnedfixedr   r   r   r   c                   > [         TU ]  5         X l        X0l        X R                  ;  a  [        SU R                   35      eU R                  S:X  a(  U R                  S[        R                  " U/5      5        g U R                  S:X  d  U R                  S:X  aE  U R                  S[        R                  R                  [        R                  " U/5      5      5        g [        SU R                   35      e)Nzmerge_strategy needs to be in r   
mix_factorr   r   zUnknown merge strategy )r=   r>   r   r   
strategiesrA   register_bufferrF   rm   register_parameterrB   	Parameter)rL   r   r   r   rM   s       rN   r>   AlphaBlender.__init__  s     	,.L+0=doo=NOPP')  u||UG/DE  I-1D1DH]1]##L%((2D2DU\\SXRYEZ2[\6t7J7J6KLMMrP   r   ndimsrS   c           	      @   U R                   S:X  a  U R                  nU$ U R                   S:X  a"  [        R                  " U R                  5      nU$ U R                   S:X  a  Uc  [	        S5      e[        R
                  " UR                  5       [        R                  " SSUR                  S9[        R                  " U R                  5      S   5      nUS:X  a  US S 2S S S 2S S 4   nU$ US	:X  a  UR                  S
5      S S 2S S 4   nU$ [	        SU S35      e[        e)Nr   r   r   zMPlease provide image_only_indicator to use learned_with_images merge strategyr   )device).Nr   r1   r   zUnexpected ndims z. Dimensions should be 3 or 5)r   r   rF   sigmoidrA   whererj   onesr  r   NotImplementedError)rL   r   r  r   s       rN   	get_alphaAlphaBlender.get_alpha  s    ')OOE6 3   I-MM$//2E0 -   $99#+ !pqqKK$))+

1a(<(C(CDdoo.y9E zaq$45  !b)!T4-8  !#4UG;X!YZZ &%rP   Nr   r   c                     U R                  X1R                  5      nUR                  UR                  5      nU R                  (       a  SU-
  nXA-  SU-
  U-  -   nU$ )Nr   )r  ndimtodtyper   )rL   r   r   r   r   r   s         rN   rb   AlphaBlender.forward  sV     3^^D)..%KEu
 ::rP   )r   r   )r   Fr   )rd   re   rf   rg   rh   r  rk   rl   rj   r>   rF   rm   ri   r  rb   rn   ro   rp   s   @rN   r   r     s    	 =J
 4/4	NN N )-	N N(ell 3 5<< F 59	<< LL $llT1	
 
 rP   r   )(	functoolsr   rF   torch.nnrB   torch.nn.functional
functionalr   utilsr   activationsr   attention_processorr   downsamplingr	   r
   r   r   r   normalizationr   
upsamplingr   r   r   r   r   r   Moduler   rr   rm   r   r   r   r   r   r   r    rP   rN   <module>r      s           ' ,  ( NBII Nb}BII }BO5<< OELL O "))  H,0bii ,0^D		 DNY")) YzQRYY QhN299 NrP   