
    
3j,N                         S SK r SSKJr  SSKJr  SSKJrJr  SSKJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJr  \" \5      rSr SS\ R*                  S\ R,                  S-  S\4S jjr " S S\5      rg)    N   )PipelineImageInput)AutoencoderKLLTX2Video)
get_loggerreplace_example_docstring)randn_tensor)VideoProcessor   )LTXPipelineOutput)DiffusionPipeline   )LTX2LatentUpsamplerModela  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline
        >>> from diffusers.pipelines.ltx2.export_utils import encode_video
        >>> from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
        >>> from diffusers.utils import load_image

        >>> pipe = LTX2ImageToVideoPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
        >>> pipe.enable_model_cpu_offload()

        >>> image = load_image(
        ...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
        ... )
        >>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background."
        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

        >>> frame_rate = 24.0
        >>> video, audio = pipe(
        ...     image=image,
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     width=768,
        ...     height=512,
        ...     num_frames=121,
        ...     frame_rate=frame_rate,
        ...     num_inference_steps=40,
        ...     guidance_scale=4.0,
        ...     output_type="pil",
        ...     return_dict=False,
        ... )

        >>> latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
        ...     "Lightricks/LTX-2", subfolder="latent_upsampler", torch_dtype=torch.bfloat16
        ... )
        >>> upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
        >>> upsample_pipe.vae.enable_tiling()
        >>> upsample_pipe.to(device="cuda", dtype=torch.bfloat16)

        >>> video = upsample_pipe(
        ...     video=video,
        ...     width=768,
        ...     height=512,
        ...     output_type="np",
        ...     return_dict=False,
        ... )[0]

        >>> encode_video(
        ...     video[0],
        ...     fps=frame_rate,
        ...     audio=audio[0].float().cpu(),
        ...     audio_sample_rate=pipe.vocoder.config.output_sampling_rate,  # should be 24000
        ...     output_path="video.mp4",
        ... )
        ```
encoder_output	generatorsample_modec                    [        U S5      (       a!  US:X  a  U R                  R                  U5      $ [        U S5      (       a   US:X  a  U R                  R                  5       $ [        U S5      (       a  U R                  $ [        S5      e)Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrr   r   moder   AttributeError)r   r   r   s      p/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/pipelines/ltx2/pipeline_ltx2_latent_upsample.pyretrieve_latentsr   [   s}     ~}--+2I))00;;		/	/K84K))..00		+	+%%%RSS    c            !         ^  \ rS rSrSrS\S\SS4U 4S jjr           S*S\R                  S-  S	\
S
\
S\
S\
S\
S\
S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  4S jjrS+S\R                  S\R                  S\4S jjrS\R                  S\S\R                  4S jr\ S+S\R                  S\R                  S\R                  S\S\R                  4
S jj5       r\ S,S\R                  S
\
S\
S\
S\
S\
S\R                  4S jj5       rS  r\R,                  " 5       \" \5                     S-S\\   S-  S\
S\
S
\
S\
S\
S\R                  S-  S!\S"\\\   -  S#\\\   -  S-  S$\S%\S\R                  \\R                     -  S-  S&\S-  S'\4S( jj5       5       rS)rU =r$ ).LTX2LatentUpsamplePipelineh   zvae->latent_upsamplervaelatent_upsamplerreturnNc                   > [         TU ]  5         U R                  XS9  [        U SS 5      b  U R                  R
                  OSU l        [        U SS 5      b  U R                  R                  OSU l        [        U R                  S9U l
        g )N)r    r!   r           )vae_scale_factor)super__init__register_modulesgetattrr    spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratior	   video_processor)selfr    r!   	__class__s      r   r(   #LTX2LatentUpsamplePipeline.__init__k   s    
 	#I 3:$t2L2XDHH..^` 	* 4;43M3YDHH//_` 	+  .t?a?abr   video
batch_size
num_framesheightwidthspatial_patch_sizetemporal_patch_sizedtypedevicer   r   c           
         Ubc  UR                   S:X  aD  US-
  U R                  -  S-   nX@R                  -  nXPR                  -  nU R                  XXXg5      nUR	                  XS9$ UR	                  XR
                  R                  S9n[        U
[        5      (       a|  [        U
5      U:w  a  [        S[        U
5       SU S35      e[        U5       Vs/ s H;  n[        U R
                  R                  X   R                  S5      5      X   5      PM=     nnODU Vs/ s H7  n[        U R
                  R                  UR                  S5      5      U
5      PM9     nn[        R                   " USS9R	                  U5      nU$ s  snf s  snf )	Nr   r   r;   r:   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.r   dim)ndimr.   r,   _unpack_latentstor    r:   
isinstancelistlen
ValueErrorranger   encode	unsqueezetorchcat)r0   r3   r4   r5   r6   r7   r8   r9   r:   r;   r   r   latent_num_frameslatent_heightlatent_widthiinit_latentsvids                     r   prepare_latents*LTX2LatentUpsamplePipeline.prepare_latents|   s    ||q %/!^8[8[$[^_$_! &*L*L L$(J(JJ..M_ ::V:99hhnn=i&&9~+ Ec)nEU V  *|+km  affp`q`q[\ 1C1CA1F!GV`q  L gllfk_b,TXX__S]]1=M-NPYZfkLlyy1588?  ms   AF>F
reference_latentsfactorc                 p   UR                  5       n[        UR                  S5      5       Hp  n[        UR                  S5      5       HO  n[        R                  " X%U4   SS9u  px[        R                  " XEU4   SS9u  pXEU4   U
-
  U	-  U-  U-   XEU4'   MQ     Mr     [        R
                  " XU5      nU$ )a  
Applies Adaptive Instance Normalization (AdaIN) to a latent tensor based on statistics from a reference latent
tensor.

Args:
    latent (`torch.Tensor`):
        Input latents to normalize
    reference_latents (`torch.Tensor`):
        The reference latents providing style statistics.
    factor (`float`):
        Blending factor between original and transformed latent. Range: -10.0 to 10.0, Default: 1.0

Returns:
    torch.Tensor: The transformed latent tensor
r   r   Nr>   )clonerG   sizerJ   std_meanlerp)r0   r   rT   rU   resultrO   cr_sdr_meani_sdi_means              r   adain_filter_latent.LTX2LatentUpsamplePipeline.adain_filter_latent   s      w||A'A7<<?+$~~.?1.E4P$~~fTlE!'1!6$ >$FO!t	 , ( GV4r   compressionc                     US-  n[         R                  " U5      n[         R                  " SU-  US-
  -  5      nSSU-  U-  -
  nX-  nU$ )a  
Applies a non-linear tone-mapping function to latent values to reduce their dynamic range in a perceptually
smooth way using a sigmoid-based compression.

This is useful for regularizing high-variance latents or for conditioning outputs during generation, especially
when controlling dynamic behavior with a `compression` factor.

Args:
    latents : torch.Tensor
        Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range.
    compression : float
        Compression strength in the range [0, 1].
        - 0.0: No tone-mapping (identity transform)
        - 1.0: Full compression effect

Returns:
    torch.Tensor
        The tone-mapped latent tensor of the same shape as input.
g      ?g      @      ?g?)rJ   abssigmoid)r0   r   rc   scale_factorabs_latentssigmoid_termscalesfiltereds           r   tone_map_latents+LTX2LatentUpsamplePipeline.tone_map_latents   s\    * #T)ii( }}S<%7;;L%MNs\)L88#r   latents_meanlatents_stdscaling_factorc                     UR                  SSSSS5      R                  U R                  U R                  5      nUR                  SSSSS5      R                  U R                  U R                  5      nX-  U-  U-   n U $ )Nr   )viewrB   r;   r:   )r   ro   rp   rq   s       r   _denormalize_latents/LTX2LatentUpsamplePipeline._denormalize_latents   su     $((B1a8;;GNNGMMZ!&&q"aA699'..'--X'.8<Gr   
patch_sizepatch_size_tc           
          U R                  S5      nU R                  XaX#SXTU5      n U R                  SSSSSSSS	5      R                  SS	5      R                  SS5      R                  SS5      n U $ )
Nr   rs      r      r
      r      )rX   reshapepermuteflatten)r   r5   r6   r7   rw   rx   r4   s          r   rA   *LTX2LatentUpsamplePipeline._unpack_latents   st     \\!_
//*&\gqr//!Q1aAq9AA!QGOOPQSTU]]^_abcr   c                     X R                   -  S:w  d  X0R                   -  S:w  a  [        SU SU S35      eUb  Ub  [        S5      eUc  Uc  [        S5      eSUs=::  a  S::  d  O  [        S5      eg )	Nr   z8`height` and `width` have to be divisible by 32 but are z and .z1Only one of `video` or `latents` can be provided.z/One of `video` or `latents` has to be provided.r   z8`tone_map_compression_ratio` must be in the range [0, 1])r,   rF   )r0   r3   r6   r7   r   tone_map_compression_ratios         r   check_inputs'LTX2LatentUpsamplePipeline.check_inputs   s    666!;uGiGi?imn?nWX^W__dejdkklmnn!4PQQ=W_NOO/414WXX 5r   latents_normalizeddecode_timestepdecode_noise_scaleadain_factorr   output_typereturn_dictc                 8   U R                  UUUUUS9  Ub  SnOUR                  S   nU R                  nUb  [        U5      nX@R                  -  S:w  aU  X@R                  -  U R                  -  S-   nUSU n[
        R                  SU R                   S[        U5       SU S35        U R                  R                  XUS	9nUR                  U[        R                  S
9nUSLnU R                  UUUUUUU[        R                  UUUS9nU(       a`  U(       aY  U R                  XpR                  R                  U R                  R                   U R                  R"                  R$                  5      nUR                  U R&                  R(                  5      nU R'                  U5      nUS:  a  U R+                  UX{5      nOUnUS:  a  U R-                  X|5      nUS:X  a  UnGOU R                  R"                  R.                  (       d  SnO[1        UR                  UUUR(                  S9n[3        U	[4        5      (       d  U	/U-  n	U
c  U	n
O[3        U
[4        5      (       d  U
/U-  n
[        R6                  " U	UUR(                  S
9n[        R6                  " U
UUR(                  S
9SS2SSSS4   n
SU
-
  U-  U
U-  -   nU R                  R9                  UUSS9S   nU R                  R;                  XS9nU R=                  5         U(       d  U4$ [?        US9$ )aq  
Function invoked when calling the pipeline for generation.

Args:
    video (`list[PipelineImageInput]`, *optional*)
        The video to be upsampled (such as a LTX 2.0 first stage output). If not supplied, `latents` should be
        supplied.
    height (`int`, *optional*, defaults to `512`):
        The height in pixels of the input video (not the generated video, which will have a larger resolution).
    width (`int`, *optional*, defaults to `768`):
        The width in pixels of the input video (not the generated video, which will have a larger resolution).
    num_frames (`int`, *optional*, defaults to `121`):
        The number of frames in the input video.
    spatial_patch_size (`int`, *optional*, defaults to `1`):
        The spatial patch size of the video latents. Used when `latents` is supplied if unpacking is necessary.
    temporal_patch_size (`int`, *optional*, defaults to `1`):
        The temporal patch size of the video latents. Used when `latents` is supplied if unpacking is
        necessary.
    latents (`torch.Tensor`, *optional*):
        Pre-generated video latents. This can be supplied in place of the `video` argument. Can either be a
        patch sequence of shape `(batch_size, seq_len, hidden_dim)` or a video latent of shape `(batch_size,
        latent_channels, latent_frames, latent_height, latent_width)`.
    latents_normalized (`bool`, *optional*, defaults to `False`)
        If `latents` are supplied, whether the `latents` are normalized using the VAE latent mean and std. If
        `True`, the `latents` will be denormalized before being supplied to the latent upsampler.
    decode_timestep (`float`, defaults to `0.0`):
        The timestep at which generated video is decoded.
    decode_noise_scale (`float`, defaults to `None`):
        The interpolation factor between random noise and denoised latents at the decode timestep.
    adain_factor (`float`, *optional*, defaults to `0.0`):
        Adaptive Instance Normalization (AdaIN) blending factor between the upsampled and original latents.
        Should be in [-10.0, 10.0]; supplying 0.0 (the default) means that AdaIN is not performed.
    tone_map_compression_ratio (`float`, *optional*, defaults to `0.0`):
        The compression strength for tone mapping, which will reduce the dynamic range of the latent values.
        This is useful for regularizing high-variance latents or for conditioning outputs during generation.
        Should be in [0, 1], where 0.0 (the default) means tone mapping is not applied and 1.0 corresponds to
        the full compression effect.
    generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generate image. Choose between
        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ltx.LTXPipelineOutput`] instead of a plain tuple.

Examples:

Returns:
    [`~pipelines.ltx.LTXPipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`~pipelines.ltx.LTXPipelineOutput`] is returned, otherwise a `tuple` is
        returned where the first element is the upsampled video.
)r3   r6   r7   r   r   Nr   r   z-Video length expected to be of the form `k * z + 1` but is z. Truncating to z frames.)r6   r7   r=   )r3   r4   r5   r6   r7   r8   r9   r:   r;   r   r           latent)r   r;   r:   F)r   )r   )frames) r   shape_execution_devicerE   r.   loggerwarningr/   preprocess_videorB   rJ   float32rR   ru   r    ro   rp   configrq   r!   r:   ra   rm   timestep_conditioningr   rC   rD   tensordecodepostprocess_videomaybe_free_model_hooksr   )r0   r3   r6   r7   r5   r8   r9   r   r   r   r   r   r   r   r   r   r4   r;   latents_suppliedlatents_upsampledtimestepnoises                         r   __call__#LTX2LatentUpsamplePipeline.__call__  sK   T 	'A 	 	
 J q)J''UJ???1D"E"EEHkHkknoo  kz*CDDgDgChhuvyz  wA  vB  BR  S]  R^  ^f  g ((99%V[9\EHHF%--H@E"$.&&!!1 3-- ' 
  2//..0D0DdhhooFdFdG **T22889 11':#../@'XG'G%+++GPG("E88??88$W]]iPV^e^k^kl!/488'6&7*&DO%-)8&#$6==*<)=
)J& <<gmm\%*\\2DV[b[h[h%itT4-&" 11W<?QTY?YYHHOOGX5OI!LE((::5:ZE 	##%8O ..r   )r,   r.   r/   )Nr   y         r   r   NNNN)re   )r   r   )Nr   r   r   r   r   NFr   Nr   r   NpilT) __name__
__module____qualname____firstlineno__model_cpu_offload_seqr   r   r(   rJ   Tensorintr:   r;   	GeneratorrR   floatra   rm   staticmethodru   rA   r   no_gradr   EXAMPLE_DOC_STRINGrD   r   boolstrr   __static_attributes____classcell__)r1   s   @r   r   r   h   s:   3c#c 3c 
	c& &*"##$$(&*,0'+*||d"* * 	*
 * *  * !* {{T!* t#* ??T)* $* 
*X5<< ELL bg 8 5 U\\ @  or-2\\HMfk	   st		+.	8;	DG	UX	lo			 	
Y ]]_12 26"##$'+#(/29=!,/DH"' !_/&'$._/ _/ 	_/
 _/  _/ !_/ $_/ !_/ e,_/ "DK/$6_/ _/ %*_/ ??T%//%::TA_/ 4Z_/  !_/ 3 _/r   r   )Nr   )rJ   image_processorr   modelsr   utilsr   r   utils.torch_utilsr   r/   r	   ltx.pipeline_outputr   pipeline_utilsr   r!   r   r   r   r   r   r   r   r   r    r   r   <module>r      s{      1 , : - - 3 . 6 
H	8 z `h
TLL
T-2__t-C
TY\
TA/!2 A/r   