
    
3jݦ              
          S SK r S SKrS SKJrJr  S SKrS SKrS SKJrJ	r	  SSK
JrJr  SSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJrJr  SSKJ r   SSK!J"r"  SSK#J$r$  \" 5       (       a  S SK%J&s  J'r(  Sr)OSr)\RT                  " \+5      r,Sr-S r.    S S\/S-  S\0\Rb                  -  S-  S\2\/   S-  S\2\3   S-  4S jjr4 S!S\Rj                  S\Rl                  S-  S\04S jjr7 " S S\\5      r8g)"    N)AnyCallable)T5EncoderModelT5Tokenizer   )MultiPipelineCallbacksPipelineCallback)PipelineImageInput)CogVideoXLoraLoaderMixin)AutoencoderKLCogVideoXCogVideoXTransformer3DModel)get_3d_rotary_pos_embed)DiffusionPipeline)CogVideoXDDIMSchedulerCogVideoXDPMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )CogVideoXPipelineOutputTFaa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import CogVideoXImageToVideoPipeline
        >>> from diffusers.utils import export_to_video, load_image

        >>> pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")

        >>> prompt = "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
        >>> image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
        ... )
        >>> video = pipe(image, prompt, use_dynamic_cfg=True)
        >>> export_to_video(video.frames[0], "output.mp4", fps=8)
        ```
c                    UnUnU u  pVXV-  nXtU-  :  a  Un[        [        XE-  U-  5      5      n	OUn	[        [        X6-  U-  5      5      n[        [        XH-
  S-  5      5      n
[        [        X9-
  S-  5      5      nX4X-   X-   44$ )N       @)intround)src	tgt_width
tgt_heighttwthhwrresize_heightresize_widthcrop_top	crop_lefts               u/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.pyget_resize_crop_region_for_gridr*   D   s    	B	BDA	AG}5!,-E"&1*-.5",345HE2,345I 8#;Y=U"VVV    num_inference_stepsdevice	timestepssigmasc                    Ub  Ub  [        S5      eUb  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
X2S.UD6  U R                  n[        U5      nX14$ Ub  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
XBS.UD6  U R                  n[        U5      nX14$ U R                  " U4S	U0UD6  U R                  nX14$ )a  
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

Args:
    scheduler (`SchedulerMixin`):
        The scheduler to get timesteps from.
    num_inference_steps (`int`):
        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
        must be `None`.
    device (`str` or `torch.device`, *optional*):
        The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
    timesteps (`list[int]`, *optional*):
        Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
        `num_inference_steps` and `sigmas` must be `None`.
    sigmas (`list[float]`, *optional*):
        Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
        `num_inference_steps` and `timesteps` must be `None`.

Returns:
    `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
    second element is the number of inference steps.
zYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr.   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r.   r-   r/   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r/   r-   r-    )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r.   len)	schedulerr,   r-   r.   r/   kwargsaccepts_timestepsaccept_sigmass           r)   retrieve_timestepsr?   W   s}   > !3tuu'3w/@/@AXAX/Y/d/d/i/i/k+ll .y/B/B.C Da b  	M)MfM''	!)n )) 
	 C(9(9):Q:Q(R(](](b(b(d$ee.y/B/B.C D_ `  	GvGG''	!)n )) 	 3MFMfM''	))r+   encoder_output	generatorsample_modec                    [        U S5      (       a!  US:X  a  U R                  R                  U5      $ [        U S5      (       a   US:X  a  U R                  R                  5       $ [        U S5      (       a  U R                  $ [        S5      e)Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrrD   rE   moderG   AttributeError)r@   rA   rB   s      r)   retrieve_latentsrK      s}     ~}--+2I))00;;		/	/K84K))..00		+	+%%%RSSr+   c            2       D  ^  \ rS rSrSr/ rSr/ SQrS\S\	S\
S\S	\\-  4
U 4S
 jjr     SBS\\\   -  S\S\S\R&                  S-  S\R(                  S-  4
S jjr        SCS\\\   -  S\\\   -  S-  S\S\S\R.                  S-  S\R.                  S-  S\S\R&                  S-  S\R(                  S-  4S jjr         SDS\R.                  S\S\S\S\S\S\R(                  S-  S\R&                  S-  S \R2                  S-  S!\R.                  S-  4S" jjrS!\R.                  S#\R.                  4S$ jrS% rS& r   SES' jrSFS( jrSFS) jr S\S\S\S\R&                  S#\!\R.                  \R.                  4   4
S* jr"\#S+ 5       r$\#S, 5       r%\#S- 5       r&\#S. 5       r'\#S/ 5       r(\RR                  " 5       \*" \+5      SSSSS0S1SS2S3SS4SSSSS5SSSS!/S4S\,S\\\   -  S-  S\\\   -  S-  S\S-  S\S-  S\S6\S7\\   S-  S8\-S9\S\S:\-S \R2                  \\R2                     -  S-  S!\R\                  S-  S\R\                  S-  S\R\                  S-  S;\S<\S=\/\\04   S-  S>\1\\/S4   \2-  \3-  S-  S?\\   S\S#\4\!-  4.S@ jj5       5       r5SAr6U =r7$ )GCogVideoXImageToVideoPipeline   a  
Pipeline for image-to-video generation using CogVideoX.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

Args:
    vae ([`AutoencoderKL`]):
        Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
    text_encoder ([`T5EncoderModel`]):
        Frozen text-encoder. CogVideoX uses
        [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
        [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
    tokenizer (`T5Tokenizer`):
        Tokenizer of class
        [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
    transformer ([`CogVideoXTransformer3DModel`]):
        A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
    scheduler ([`SchedulerMixin`]):
        A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
ztext_encoder->transformer->vae)rG   prompt_embedsnegative_prompt_embeds	tokenizertext_encodervaetransformerr;   c                   > [         TU ]  5         U R                  UUUUUS9  [        U SS 5      (       a/  S[	        U R
                  R                  R                  5      S-
  -  OSU l        [        U SS 5      (       a   U R
                  R                  R                  OSU l
        [        U SS 5      (       a   U R
                  R                  R                  OSU l        [        U R                  S9U l        g )	N)rQ   rR   rS   rT   r;   rS      r         gffffff?)vae_scale_factor)super__init__register_modulesgetattrr:   rS   configblock_out_channelsvae_scale_factor_spatialtemporal_compression_ratiovae_scale_factor_temporalscaling_factorvae_scaling_factor_imager   video_processor)selfrQ   rR   rS   rT   r;   r9   s         r)   r[   &CogVideoXImageToVideoPipeline.__init__   s     	%# 	 	
 CJ$PUW[B\B\A#dhhoo889A=>bc 	% ;B$t:T:TDHHOO66Z[ 	& KRRVX]_cJdJd(F(Fjm%-t?\?\]r+   Nr      promptnum_videos_per_promptmax_sequence_lengthr-   dtypec           	         U=(       d    U R                   nU=(       d    U R                  R                  n[        U[        5      (       a  U/OUn[        U5      nU R                  USUSSSS9nUR                  nU R                  USSS9R                  n	U	R                  S   UR                  S   :  a]  [        R                  " X5      (       dB  U R                  R                  U	S S 2US-
  S24   5      n
[        R                  S	U S
U
 35        U R                  UR                  U5      5      S   nUR                  XTS9nUR                  u  pnUR                  SUS5      nUR!                  Xb-  US5      nU$ )N
max_lengthTpt)paddingrn   
truncationadd_special_tokensreturn_tensorslongest)rp   rs   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  z	 tokens: r   )rl   r-   )_execution_devicerR   rl   
isinstancestrr:   rQ   	input_idsshapetorchequalbatch_decodeloggerwarningtorepeatview)rf   ri   rj   rk   r-   rl   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrO   _seq_lens                 r)   _get_t5_prompt_embeds3CogVideoXImageToVideoPipeline._get_t5_prompt_embeds   s    14110**00'44&&[
nn *# % 
 %....SW.Xbb  $(<(<R(@@UcIuIu>>66qJ]`aJadfJfGf7ghLNN'(	,A
 )).*;*;F*CDQG%((u(D &++A%,,Q0EqI%**:+MwXZ[r+   Tnegative_promptdo_classifier_free_guidancerO   rP   c
                 2   U=(       d    U R                   n[        U[        5      (       a  U/OUnUb  [        U5      n
OUR                  S   n
Uc  U R                  UUUUU	S9nU(       a  Uc  U=(       d    Sn[        U[        5      (       a  X/-  OUnUb;  [        U5      [        U5      La$  [        S[        U5       S[        U5       S35      eU
[        U5      :w  a!  [        SU S[        U5       S	U SU
 S
3	5      eU R                  UUUUU	S9nXV4$ )ab  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `list[str]`, *optional*):
        prompt to be encoded
    negative_prompt (`str` or `list[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
        Whether to use classifier free guidance or not.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    device: (`torch.device`, *optional*):
        torch device
    dtype: (`torch.dtype`, *optional*):
        torch dtype
r   )ri   rj   rk   r-   rl    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)	rv   rw   rx   r:   rz   r   type	TypeErrorr2   )rf   ri   r   r   rj   rO   rP   rk   r-   rl   r   s              r)   encode_prompt+CogVideoXImageToVideoPipeline.encode_prompt  sk   L 1411'44&&VJ&,,Q/J  66&;$7 7 M '+A+I-3O@J?\_@`@`j+<<fuO!d6l$:O&OUVZ[jVkUl mV~Q(  s?33 )/)::J3K_J` ax/
| <33  &*%?%?&&;$7 &@ &" 44r+   imager   num_channels_latents
num_framesheightwidthrA   rG   c           
         [        U	[        5      (       a*  [        U	5      U:w  a  [        S[        U	5       SU S35      eUS-
  U R                  -  S-   nUUUXPR
                  -  X`R
                  -  4nU R                  R                  R                  b9  US S US   US   U R                  R                  R                  -  -   4-   USS  -   nUR                  S5      n[        U	[        5      (       aR  [        U5       Vs/ s H;  n[        U R                  R                  X   R                  S5      5      X   5      PM=     nnODU Vs/ s H7  n[        U R                  R                  UR                  S5      5      U	5      PM9     nn[        R                  " USS9R!                  U5      R#                  SSSSS	5      nU R                  R                  R$                  (       d  U R&                  U-  nOSU R&                  -  U-  nUUS-
  UXPR
                  -  X`R
                  -  4n[        R(                  " XUS
9n[        R                  " UU/SS9nU R                  R                  R                  bS  US S 2S UR+                  S5      U R                  R                  R                  -  2S4   n[        R                  " UU/SS9nU
c  [-        XXS9n
OU
R!                  U5      n
XR.                  R0                  -  n
X4$ s  snf s  snf )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.r   rV   r   dimr   rX   )r-   rl   .)rA   r-   rl   )rw   listr:   r2   rb   r`   rT   r^   patch_size_t	unsqueezerangerK   rS   encoder{   catr   permuteinvert_scale_latentsrd   zerossizer   r;   init_noise_sigma)rf   r   r   r   r   r   r   rl   r-   rA   rG   rz   iimage_latentsimgpadding_shapelatent_paddingfirst_frames                     r)   prepare_latents-CogVideoXImageToVideoPipeline.prepare_latentsX  s    i&&3y>Z+GA#i.AQ R&<'gi 
 !1n)G)GG!K
 333222
 ""//;"1IqE!Ht7G7G7N7N7[7[,[![ ]]`efgfh`iiE"i&&`efp`q`q[\ 1C1CA1F!GV`q  M hmmgl`c-dhhoocmmA>N.OQZ[glMm		-Q7::5AII!QPQSTVWXxx33 99MIM  = ==MM N 333222
 ]O		=."AqI ""//;'+i]-?-?-BTEUEUE\E\EiEi-i+ikn(noK!II{M&BJM?"5fZGjj(G NN;;;%%M ns   AL
>L	returnc                     UR                  SSSSS5      nSU R                  -  U-  nU R                  R                  U5      R                  nU$ )Nr   rV   r   r   rX   )r   rd   rS   decoderE   )rf   rG   framess      r)   decode_latents,CogVideoXImageToVideoPipeline.decode_latents  sJ    //!Q1a0d333g=)00r+   c                     [        [        X-  5      U5      n[        X-
  S5      nX&U R                  R                  -  S  nX!U-
  4$ )Nr   )minr   maxr;   order)rf   r,   r.   strengthr-   init_timestept_starts          r)   get_timesteps+CogVideoXImageToVideoPipeline.get_timesteps  sP    C 3 >?ATU)91=(<(<<>?	777r+   c                 n   S[        [        R                  " U R                  R                  5      R
                  R                  5       5      ;   n0 nU(       a  X$S'   S[        [        R                  " U R                  R                  5      R
                  R                  5       5      ;   nU(       a  XS'   U$ )NetarA   )r3   r4   r5   r;   stepr7   r8   )rf   rA   r   accepts_etaextra_step_kwargsaccepts_generators         r)   prepare_extra_step_kwargs7CogVideoXImageToVideoPipeline.prepare_extra_step_kwargs  s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  r+   c
           
        ^  [        U[        R                  5      (       dU  [        U[        R                  R                  5      (       d,  [        U[
        5      (       d  [        S[        U5       35      eUS-  S:w  d	  US-  S:w  a  [        SU SU S35      eUbW  [        U 4S jU 5       5      (       d=  [        ST R                   S	U V
s/ s H  oT R                  ;  d  M  U
PM     sn
 35      eUb  Ub  [        S
U SU S35      eUc  Uc  [        S5      eUbA  [        U[        5      (       d,  [        U[
        5      (       d  [        S[        U5       35      eUb  U	b  [        S
U SU	 S35      eUb  U	b  [        SU SU	 S35      eUbE  U	bA  UR                  U	R                  :w  a&  [        SUR                   SU	R                   S35      eg g g s  sn
f )Nz``image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `list[PIL.Image.Image]` but is rW   r   z7`height` and `width` have to be divisible by 8 but are z and r   c              3   @   >#    U  H  oTR                   ;   v   M     g 7fN)_callback_tensor_inputs).0krf   s     r)   	<genexpr>=CogVideoXImageToVideoPipeline.check_inputs.<locals>.<genexpr>  s      F
7Y!---7Ys   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z and `negative_prompt_embeds`: z'Cannot forward both `negative_prompt`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` )rw   r{   TensorPILImager   r2   r   allr   rx   rz   )rf   r   ri   r   r   r   "callback_on_step_end_tensor_inputsrG   rO   rP   r   s   `          r)   check_inputs*CogVideoXImageToVideoPipeline.check_inputs  s    5%,,//uciioo66ud++K=" 
 A:?eai1nVW]V^^cdicjjklmm-9# F
7YF
 C
 C
 DTEaEaDbbn  |^  pH  |^vw  ko  kG  kG  bGpq  |^  pH  oI  J  -";08N}o ^0 0  ^ 5w  FC)@)@TZ\`IaIaQRVW]R^Q_`aa"8"D0 9*++]_ 
 &+A+M9/9J K*++]_ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8  C *L$5 pHs   G%Gc                 F    SU l         U R                  R                  5         g)zEnables fused QKV projections.TN)fusing_transformerrT   fuse_qkv_projectionsrf   s    r)   r   2CogVideoXImageToVideoPipeline.fuse_qkv_projections  s    "&--/r+   c                     U R                   (       d  [        R                  S5        gU R                  R	                  5         SU l         g)z)Disable QKV projection fusion if enabled.zKThe Transformer was not initially fused for QKV projections. Doing nothing.FN)r   r~   r   rT   unfuse_qkv_projectionsr   s    r)   r   4CogVideoXImageToVideoPipeline.unfuse_qkv_projections  s2    &&NNhi335&+D#r+   c           
         XR                   U R                  R                  R                  -  -  nX R                   U R                  R                  R                  -  -  nU R                  R                  R                  nU R                  R                  R                  nU R                  R                  R
                  U-  n	U R                  R                  R                  U-  n
Uc>  [        XV4X5      n[        U R                  R                  R                  UXV4UUS9u  pX4$ X8-   S-
  U-  n[        U R                  R                  R                  S XV4USX4US9u  pX4$ )N)	embed_dimcrops_coords	grid_sizetemporal_sizer-   r   slice)r   r   r   r   	grid_typemax_sizer-   )
r`   rT   r^   
patch_sizer   sample_widthsample_heightr*   r   attention_head_dim)rf   r   r   r   r-   grid_height
grid_widthpp_tbase_size_widthbase_size_heightgrid_crops_coords	freqs_cos	freqs_sinbase_num_framess                  r)   %_prepare_rotary_positional_embeddingsCCogVideoXImageToVideoPipeline._prepare_rotary_positional_embeddings  sd    !>!>AQAQAXAXAcAc!cd<<t?O?O?V?V?a?aab
##..%%22**11>>!C++22@@AE; ?)?! $;**11DD.&3($ I* ##  */!3;O#:**11DD!&3-!*<$ I ##r+   c                     U R                   $ r   )_guidance_scaler   s    r)   guidance_scale,CogVideoXImageToVideoPipeline.guidance_scaleB  s    ###r+   c                     U R                   $ r   )_num_timestepsr   s    r)   num_timesteps+CogVideoXImageToVideoPipeline.num_timestepsF  s    """r+   c                     U R                   $ r   )_attention_kwargsr   s    r)   attention_kwargs.CogVideoXImageToVideoPipeline.attention_kwargsJ      %%%r+   c                     U R                   $ r   )_current_timestepr   s    r)   current_timestep.CogVideoXImageToVideoPipeline.current_timestepN  r  r+   c                     U R                   $ r   )
_interruptr   s    r)   	interrupt'CogVideoXImageToVideoPipeline.interruptR  s    r+   1   2      Fg        pilr,   r.   r   use_dynamic_cfgr   output_typereturn_dictr   callback_on_step_endr   c                    [        U[        [        45      (       a  UR                  nU=(       d-    U R                  R
                  R                  U R                  -  nU=(       d-    U R                  R
                  R                  U R                  -  nU=(       d     U R                  R
                  R                  nSnU R                  UUUUUUUUUS9	  Xl        SU l        UU l        SU l        Ub  [        U[        5      (       a  SnO3Ub!  [        U[         5      (       a  [#        U5      nOUR$                  S   nU R&                  nU	S:  nU R)                  UUUUUUUUS9u  nnU(       a  [*        R,                  " UU/SS9n[.        (       a  S	nOUn[1        U R2                  UUU5      u  p[#        U5      U l        US-
  U R6                  -  S-   nU R                  R
                  R8                  nSnUb#  UU-  S:w  a  UUU-  -
  nUUU R6                  -  -  nU R:                  R=                  XUS
9R?                  UUR@                  S9nU R                  R
                  RB                  S-  nU RE                  UUU-  UUUUUR@                  UUU5
      u  nnU RG                  X5      n U R                  R
                  RH                  (       a"  U RK                  XEURM                  S5      U5      OSn!U R                  R
                  RN                  c  SOURQ                  SSS9n"[S        [#        U5      XpR2                  RT                  -  -
  S5      n#U RW                  US9 n$Sn%[Y        U5       GH  u  n&n'U RZ                  (       a  M  U'U l        U(       a  [*        R,                  " U/S-  5      OUn(U R2                  R]                  U(U'5      n(U(       a  [*        R,                  " U/S-  5      OUn)[*        R,                  " U(U)/SS9n(U'R_                  U(R$                  S   5      n*U R                  Ra                  S5         U R	                  U(UU*U"U!USS9S   n+SSS5        W+Rc                  5       n+U
(       aO  SU	S[d        Rf                  " [d        Rh                  UU'Rk                  5       -
  U-  S-  -  5      -
  S-  -  -   U l        U(       a)  U+Rm                  S5      u  n,n-U,U Rn                  U-U,-
  -  -   n+[        U R2                  [p        5      (       d'  U R2                  Rr                  " U+U'U40 U DSS0D6S   nO6U R2                  Rr                  " U+U%U'U&S:  a  UU&S-
     OSU40 U DSS0D6u  nn%UR?                  UR@                  5      nUb\  0 n.U H  n/[u        5       U/   U.U/'   M     U" U U&U'U.5      n0U0Rw                  SU5      nU0Rw                  SU5      nU0Rw                  SU5      nU&[#        U5      S-
  :X  d)  U&S-   U#:  a0  U&S-   U R2                  RT                  -  S:X  a  U$Ry                  5         [.        (       d  GM  [z        R|                  " 5         GM     SSS5        SU l        US:X  d7  USS2US24   nU R                  U5      n1U R:                  R                  U1US9n1OUn1U R                  5         U(       d  U14$ [        U1S9$ ! , (       d  f       GN= f! , (       d  f       N= f)a  
Function invoked when calling the pipeline for generation.

Args:
    image (`PipelineImageInput`):
        The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
    prompt (`str` or `list[str]`, *optional*):
        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
        instead.
    negative_prompt (`str` or `list[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
        The height in pixels of the generated image. This is set to 480 by default for the best results.
    width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
        The width in pixels of the generated image. This is set to 720 by default for the best results.
    num_frames (`int`, defaults to `48`):
        Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
        contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
        num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
        needs to be satisfied is that of divisibility mentioned above.
    num_inference_steps (`int`, *optional*, defaults to 50):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    timesteps (`list[int]`, *optional*):
        Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
        in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
        passed will be used. Must be in descending order.
    guidance_scale (`float`, *optional*, defaults to 7.0):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        The number of videos to generate per prompt.
    generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    latents (`torch.FloatTensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will be generated by sampling using the supplied random `generator`.
    prompt_embeds (`torch.FloatTensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.FloatTensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generate image. Choose between
        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
        of a plain tuple.
    attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    callback_on_step_end (`Callable`, *optional*):
        A function that calls at the end of each denoising steps during the inference. The function is called
        with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
        callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
        `callback_on_step_end_tensor_inputs`.
    callback_on_step_end_tensor_inputs (`list`, *optional*):
        The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
        will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
        `._callback_tensor_inputs` attribute of your pipeline class.
    max_sequence_length (`int`, defaults to `226`):
        Maximum sequence length in encoded prompt. Must be consistent with
        `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.

Examples:

Returns:
    [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
    [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
    `tuple`. When returning a tuple, the first element is a list with the generated images.
r   )	r   ri   r   r   r   r   rG   rO   rP   NFr   g      ?)ri   r   r   rj   rO   rP   rk   r-   r   cpu)r   r   )rl   rV   )r   r   )
fill_value)totalcond_uncond)hidden_statesencoder_hidden_statestimestepofsimage_rotary_embr   r  g      @r  rG   rO   rP   latent)videor  )r   )Crw   r	   r   tensor_inputsrT   r^   r   r`   r   sample_framesr   r   r  r   r  rx   r   r:   rz   rv   r   r{   r   XLA_AVAILABLEr?   r;   r   rb   r   re   
preprocessr   rl   in_channelsr   r    use_rotary_positional_embeddingsr   r   ofs_embed_dimnew_fullr   r   progress_bar	enumerater	  scale_model_inputexpandcache_contextfloatmathcospiitemchunkr   r   r   localspopupdatexm	mark_stepr   postprocess_videomaybe_free_model_hooksr   )2rf   r   ri   r   r   r   r   r,   r.   r   r  rj   r   rA   rG   rO   rP   r  r  r   r  r   rk   r   r-   r   timestep_devicelatent_framesr   additional_frameslatent_channelsr   r   r  ofs_embnum_warmup_stepsr'  old_pred_original_sampler   tlatent_model_inputlatent_image_inputr  
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsr  s2                                                     r)   __call__&CogVideoXImageToVideoPipeline.__call__V  s#   Z *-=?U,VWW1E1S1S.`4++22@@4C`C``]))00==@]@]]H4#3#3#:#:#H#H
 ! 	+/Q'#9 	 
	
  .!%!1 *VS"9"9JJvt$<$<VJ&,,Q/J''
 '5s&:# 150B0B+(C"7'#9 3 1C 	1
-- '!II'=}&MSTUM =#O$O);NN/)*
&	 ")n $aD,J,JJQN ''..;;#(D(I ,}|/K K+d.L.LLLJ$$//E/RUU--- V 
 **11==B!%!5!5.."
 !::9J
 &&GG 66vgllSToW]^ 	 **11??G$WM]M]^borM]Ms s9~0CnnFZFZ0ZZ\]^%89\'+$!),1>>)*&A\UYYy1}%=bi"%)^^%E%EFXZ[%\"GbUYY/B%Chu"%*YY0BDV/W]^%_" 88$6$<$<Q$?@ %%33MB!%!1!1&8.;!)#)9)9$) "2 " "J C (--/
 #+,~TXXdgg2E2PTg1glo0o&opptuu0 ,D( /9C9I9I!9L6%!2T5H5HO^oLo5p!pJ "$..2GHH"nn11*aqL]qkpqrstG8<8K8K"0,-E	!a%(t9 ,9 %*95G5 "**]%8%89 (3&(O?-3Xa[* @';D!Q'X$.229gFG$4$8$8-$XM-=-A-ABZ\r-s*I**A9I/IqSTuX\XfXfXlXlNlpqNq '') =LLNE - :N "&h&a!2!334G''0E((::T_:`EE 	##%8O&e44K CB% :9s,   'C![+[!G([+[+
[(#[++
[9)
r   r  r   r  r   r   r`   rb   rd   re   )Nr   rh   NN)NTr   NNrh   NN)	r         <   Z   NNNN)NNN)r   N)8__name__
__module____qualname____firstlineno____doc___optional_componentsmodel_cpu_offload_seqr   r   r   r   r   r   r   r[   rx   r   r   r{   r-   rl   r   boolr   r   	Generatorr   r   r   r   r   r   r   tupler   propertyr   r   r   r  r	  no_gradr   EXAMPLE_DOC_STRINGr
   r,  FloatTensordictr   r   r	   r   r   rH  __static_attributes____classcell__)r9   s   @r)   rM   rM      s$   , <^^ %^ $	^
 1^ *,AA^< #'%&#&&*$((d3i(  #( !	(
 t#( {{T!(\ 37,0%&-16:#&&*$(O5d3iO5 tCy4/O5 &*	O5
  #O5 ||d*O5 !&t 3O5 !O5 t#O5 {{T!O5h $&$(&*,0'+I&||I& I& "	I&
 I& I& I& {{T!I& t#I& ??T)I& $I&Xell u|| 8!2 #=@0,*$*$ *$ 	*$
 *$ 
u||U\\)	**$X $ $ # # & & & &   ]]_12 *.26! #%&* ! %%&DH,026;?  26nr9B#&/o5!o5 d3i$&o5 tCy4/	o5
 d
o5 Tzo5 o5 !o5 9t#o5 o5 o5  #o5 o5 ??T%//%::TAo5 ""T)o5  ((4/!o5" !& 1 1D 8#o5$ %o5& 'o5( sCx.4/)o5* 'Sz4'78;KKNddgkk+o5, -1I-o5. !/o50 
!5	(1o5 3 o5r+   rM   )NNNN)NrE   )9r4   r-  typingr   r   r   r{   transformersr   r   	callbacksr   r	   image_processorr
   loadersr   modelsr   r   models.embeddingsr   pipelines.pipeline_utilsr   
schedulersr   r   utilsr   r   r   utils.torch_utilsr   re   r   pipeline_outputr   torch_xla.core.xla_modelcore	xla_modelr5  r!  
get_loggerrN  r~   rZ  r*   r   rx   r-   r   r,  r?   r   rV  rK   rM   r1   r+   r)   <module>ro     s        
  4 A 1 / I 8 9 G O O - - 4 ))MM			H	% *W* '+(,"&!%8*t8* %,,%8* Cy4	8*
 K$8*z `h
TLL
T-2__t-C
TY\
Tg5$57O g5r+   