
    
3jv@                     x   S SK r S SKrS SKJrJrJrJrJrJr  SSK	J
r
  SSKJr  SSKJr  SSKJr  SSKJr  S	S
KJrJr  S	SKJrJrJr  SSKJr  \R6                  " \5      rS rS r    SS jr SS jr! " S S\5      r" SS\RF                  S\RH                  S-  S\%4S jjr& " S S\5      r' " S S\5      r(g)    N)ByT5TokenizerQwen2_5_VLTextModelQwen2TokenizerFastSiglipImageProcessorSiglipVisionModelT5EncoderModel   )
FrozenDict)ClassifierFreeGuidance)AutoencoderKLHunyuanVideo15)HunyuanVideo15ImageProcessor)logging   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )HunyuanVideo15ModularPipelinec                 T    U  Vs/ s H  nSUS.SU(       a  UOSS./PM     sn$ s  snf )Nsystem)rolecontentuser  )promptsystem_messageps      o/home/wildlama/miniconda3/lib/python3.13/site-packages/diffusers/modular_pipelines/hunyuan_video1_5/encoders.pyformat_text_inputr"   (   s;    ououjk(~	6YZTU`c8deou  s   %c                 P   Sn[         R                  " X5      nU Vs/ s H  o3S   =(       d    US   PM     nn[        U5      S:  a  [        [        R                  U5      5      OUnU(       a+  SR                  U Vs/ s H	  nSU S3PM     sn5      S-   nU$ S nU$ s  snf s  snf )Nz\"(.*?)\"|\"(.*?)\"r   r   z. zText "")refindalllenlistdictfromkeysjoin)r   patternmatchesmatchresulttextformatted_results          r!   extract_glyph_textsr2   .   s    $Gjj)G078uAh"%("F8,/K!OT$--'(F996%J64tfA&66%JKdR    9 &Ks   B>B#c                 V   [        U[        5      (       a  U/OUn[        X&5      nUR                  USSSSXG-   SSS9nUR                  R                  US9n	UR                  R                  US9n
U " U	U
SS9R                  US-   *    nUb  US:  a  US S 2US 24   nU
S S 2US 24   n
X4$ )	NT
max_lengthpt)add_generation_prompttokenizereturn_dictpaddingr4   
truncationreturn_tensorsdevice)	input_idsattention_maskoutput_hidden_statesr   r   )
isinstancestrr"   apply_chat_templater>   tor?   hidden_states)text_encoder	tokenizerr   r=   tokenizer_max_lengthnum_hidden_layers_to_skipr   
crop_starttext_inputstext_input_idsprompt_attention_maskprompt_embedss               r!   _get_mllm_prompt_embedsrO   :   s    " $FC00fXfFv6F//"'4 0 	K !**--V-<N'66999H  ,! m/!34	6M *q.%an5 5an E//    c           
         [        U[        5      (       a  U/OUnU Vs/ s H  n[        U5      PM     nn/ n/ nU H  n	U	c\  [        R                  " SXAR
                  R                  4X1R                  S9n
[        R                  " SU4U[        R                  S9nOnU " U	SUSSSS9R                  U5      nU" UR                  UR                  R                  5       S9S   n
U
R                  US	9n
UR                  R                  US	9nUR                  U
5        UR                  U5        M     [        R                  " USS
9[        R                  " USS
94$ s  snf )Nr   r=   dtyper4   Tr5   )r9   r4   r:   add_special_tokensr;   )r>   r?   r   r<   )dim)rA   rB   r2   torchzerosconfigd_modelrS   int64rD   r>   r?   floatappendcat)rG   rF   r   r=   rH   r    glyph_textsprompt_embeds_listprompt_embeds_mask_list
glyph_textglyph_text_embedsglyph_text_embeds_mask
txt_tokenss                r!   _get_byt5_prompt_embedsre   i   sq   #FC00fXfF39:6a&q)6K: !
 %(*=*=*E*EFv]o]o! &+[[!5I1JSYafalal%m""$/#'# bj  !-$..)88>>@! ! !2 4 4F 4 C%/%>%>%A%A%A%P"!!"34&&'=>1 "4 99'Q/;RXY1ZZZ? ;s   Ec                       \ rS rSrSr\S\4S j5       r\S\\	   4S j5       r
\S\\   4S j5       r\S\\   4S j5       r\    SS	 j5       r\R$                  " 5       S
\S\S\4S j5       rSrg)HunyuanVideo15TextEncoderStep   hunyuan-video-1.5returnc                     g)NzDDual text encoder step using Qwen2.5-VL (MLLM) and ByT5 (glyph text)r   selfs    r!   description)HunyuanVideo15TextEncoderStep.description   s    UrP   c                     [        S[        5      [        S[        5      [        S[        5      [        S[        5      [        S[
        [        SS05      SS	9/$ )
NrF   rG   text_encoder_2tokenizer_2guiderguidance_scaleg      @from_configrX   default_creation_method)r   r   r   r   r   r   r
   rl   s    r!   expected_components1HunyuanVideo15TextEncoderStep.expected_components   sX     .*=>+'9:*N;-7&!#3S"9:(5	
 	
rP   c                     [         R                  " SSS9[         R                  " S5      [         R                  " SSS9/$ )Nr   Frequirednegative_promptnum_images_per_promptnum_videos_per_prompt)namer   templaterl   s    r!   inputs$HunyuanVideo15TextEncoderStep.inputs   s?     59 12 7>UV
 	
rP   c                 v   [         R                  " S5      [         R                  " S5      [         R                  " S5      [         R                  " S5      [        S[        R                  SSS9[        S	[        R                  SS
S9[        S[        R                  SSS9[        S[        R                  SSS9/$ )NrN   prompt_embeds_masknegative_prompt_embedsnegative_prompt_embeds_maskprompt_embeds_2denoiser_input_fieldszTByT5 glyph-text embeddings used as a second conditioning stream for the transformer.)	type_hintkwargs_typern   prompt_embeds_mask_2z2Attention mask for the ByT5 glyph-text embeddings.negative_prompt_embeds_2zAByT5 glyph-text negative embeddings for classifier-free guidance.negative_prompt_embeds_mask_2z;Attention mask for the ByT5 glyph-text negative embeddings.)r   r   rV   Tensorrl   s    r!   intermediate_outputs2HunyuanVideo15TextEncoderStep.intermediate_outputs   s       1  !56  !9:  !>?!,,3r	 &,,3P	 *,,3_	 /,,3Y	/
 	
rP   Nc           
      x   U=(       d    U R                   nU=(       d    U R                  R                  nUc  S/U-  n[        U[        5      (       a  U/OUn[        U R                  U R                  UUU R                  U R                  U R                  S9u  pg[        U R                  U R                  UUU R                  S9u  pUR                  u  pn
UR                  SUS5      R!                  XE-  US5      nUR                  SUS5      R!                  XE-  U5      nUR                  u  pn
UR                  SUS5      R!                  XE-  US5      nU	R                  SUS5      R!                  XE-  U5      n	UR#                  X2S9nUR#                  X2S9nUR#                  X2S9nU	R#                  X2S9n	XgX4$ )N )rG   rF   r   r=   rH   r   rJ   )rG   rF   r   r=   rH   r   )rS   r=   )_execution_devicerF   rS   rA   rB   rO   rG   rH   r    prompt_template_encode_start_idxre   rr   rq   tokenizer_2_max_lengthshaperepeatviewrD   )
componentsr   r=   rS   
batch_sizer   rN   r   r   r   _seq_len	seq_len_2s                r!   encode_prompt+HunyuanVideo15TextEncoderStep.encode_prompt   s    7:7760066>TJ&F'44&&,C **#00!+!@!@%44!BB-
) 1H ,,#22!+!B!B1
- &++A%,,Q0EqINN.
 066q:OQRSXX.
 *//a)004I1MRR.	2
  4::1>SUVW\\.	 
 &((u(D/222N),,5,H366U6R/WWrP   r   statec           	      T   U R                  U5      nUR                  nUR                  R                  nUR                  nUR
                  nUR                  nUb  [        U[        5      (       a  Sn	O&Ub!  [        U[        5      (       a  [        U5      n	OSn	U R                  UUUUU	US9u  Ul        Ul        Ul        Ul        UR                   (       a-  U R                  UUUUU	US9u  Ul        Ul        Ul        Ul        UR+                  SU	5        U R-                  X#5        X4$ )Nr   )r   r=   rS   r   r   r   )get_block_stater   transformerrS   r   r}   r   rA   rB   r(   r'   r   rN   r   r   r   requires_unconditional_embedsr   r   r   r   setset_block_state)
rm   r   r   block_stater=   rS   r   r}   r   r   s
             r!   __call__&HunyuanVideo15TextEncoderStep.__call__  sA   **51--&&,,##%55 + A A*VS"9"9JJvt$<$<VJJ !"7  
	
%*', 33 ""&%&; # 2749 			,
+U0  rP   r   )NNr   r   )__name__
__module____qualname____firstlineno__
model_namepropertyrB   rn   r(   r   rx   r   r   r   r   staticmethodr   rV   no_gradr   r   r   __static_attributes__r   rP   r!   rg   rg      s    $JVS V V 
T-%8 
 
 
Z( 
 
 
d;&7 
 
@  6X 6Xp ]]_0!#@ 0! 0![h 0! 0!rP   rg   encoder_output	generatorsample_modec                    [        U S5      (       a!  US:X  a  U R                  R                  U5      $ [        U S5      (       a   US:X  a  U R                  R                  5       $ [        U S5      (       a  U R                  $ [        S5      e)Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrr   r   moder   AttributeError)r   r   r   s      r!   retrieve_latentsr   9  s}     ~}--+2I))00;;		/	/K84K))..00		+	+%%%RSSrP   c                       \ rS rSrSr\S\4S j5       r\S\\	   4S j5       r
\S\\   4S j5       r\S\\   4S j5       r\R                   " 5       S\S	\S\4S
 j5       rSrg)HunyuanVideo15VaeEncoderStepiF  ri   rj   c                     g)Nz\VAE Encoder step that encodes an input image into latent space for image-to-video generationr   rl   s    r!   rn   (HunyuanVideo15VaeEncoderStep.descriptionI  s    mrP   c           	      X    [        S[        5      [        S[        [        SS05      SS9/$ )Nvaevideo_processorvae_scale_factor   ru   rv   )r   r   r   r
   rl   s    r!   rx   0HunyuanVideo15VaeEncoderStep.expected_componentsM  s8     %!<=!,!#5r":;(5	
 	
rP   c                     [         R                  " SSS9[         R                  " S5      [         R                  " S5      /$ )NimageTr{   heightwidthr   rl   s    r!   r   #HunyuanVideo15VaeEncoderStep.inputsY  s:     $7)(
 	
rP   c                 n    [        S[        R                  SS9[        S[        SS9[        S[        SS9/$ )Nimage_latentsz*Encoded image latents from the VAE encoderr   rn   r   z!Target height resolved from imager   z Target width resolved from image)r   rV   r   intrl   s    r!   r   1HunyuanVideo15VaeEncoderStep.intermediate_outputsa  s?     ,,H
 C=`a3<^_
 	
rP   r   r   c                    U R                  U5      nUR                  nUR                  nUR                  nUR                  nUb  UcA  UR
                  R                  UR                  S   UR                  S   UR                  S9u  pgUR
                  R                  XVUSS9nUR                  R                  nUR
                  R                  XVUS9R                  XHS9n	U	R                  S5      n	[        UR                  R!                  U	5      S	S
9n
XR                  R"                  R$                  -  n
Xl        Xcl        Xsl        UR)                  SU5        U R+                  X#5        X4$ )Nr   r   )r   r   target_sizecrop)r   r   resize_mode)r   r   rR   r   r   )r   r   )r   r   r   r   r   r   calculate_default_height_widthsizer   resizer   rS   
preprocessrD   	unsqueezer   encoderX   scaling_factorr   r   r   )rm   r   r   r   r=   r   r   r   	vae_dtypeimage_tensorr   s              r!   r   %HunyuanVideo15VaeEncoderStep.__call__m  s^   **51--!!##!!>U]&66UUzz!}EJJqMzG]G] V MF **11%eag1hNN((	!11<<UY^<_bb c 
 $--a0()>)>|)LZbc%(=(=(L(LL$1!#!		'5!U0  rP   r   Nr   r   r   r   r   r   rB   rn   r(   r   rx   r   r   r   r   rV   r   r   r   r   r   r   rP   r!   r   r   F  s    $JnS n n 	
T-%8 	
 	
 
Z( 
 
 	
d;&7 	
 	
 ]]_!#@ ! ![h ! !rP   r   c                       \ rS rSrSr\S\4S j5       r\S\\	   4S j5       r
\S\\   4S j5       r\S\\   4S j5       r\R                   " 5       S\S	\S\4S
 j5       rSrg)HunyuanVideo15ImageEncoderStepi  ri   rj   c                     g)NzRSiglip image encoder step that produces image_embeds for image-to-video generationr   rl   s    r!   rn   *HunyuanVideo15ImageEncoderStep.description  s    crP   c                 B    [        S[        5      [        S[        5      /$ )Nimage_encoderfeature_extractor)r   r   r   rl   s    r!   rx   2HunyuanVideo15ImageEncoderStep.expected_components  s%     /+<=-/CD
 	
rP   c                 .    [         R                  " SSS9/$ )Nr   Tr{   r   rl   s    r!   r   %HunyuanVideo15ImageEncoderStep.inputs  s     $7
 	
rP   c                 6    [        S[        R                  SS9/$ )Nimage_embedsz/Image embeddings from the Siglip vision encoderr   )r   rV   r   rl   s    r!   r   3HunyuanVideo15ImageEncoderStep.intermediate_outputs  s#     ,,M
 	
rP   r   r   c                 l   U R                  U5      nUR                  n[        UR                  R	                  5       5      R
                  nUR                  R                  UR                  SSSS9nUR                  XES9nUR                  " S0 UD6R                  nXsl        U R                  X#5        X4$ )NTr5   )images	do_resizer;   do_convert_rgbrR   r   )r   r   nextr   
parametersrS   r   r   r   rD   last_hidden_stater   r   )rm   r   r   r   r=   image_encoder_dtypeimage_inputsr   s           r!   r   'HunyuanVideo15ImageEncoderStep.__call__  s    **51--":#;#;#F#F#HIOO!33>>$$TZ^ ? 
 $fP!//?,?QQ#/ U0  rP   r   Nr   r   rP   r!   r   r     s    $JdS d d 
T-%8 
 
 
Z( 
 

 
d;&7 
 
 ]]_!#@ ! ![h ! !rP   r   )i  r   a  You are a helpful assistant. Describe the video by detailing the following aspects:     1. The main content and theme of the video.     2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.     3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.     4. background environment, light, style and atmosphere.     5. camera angles, movements, and transitions used in the video.l   )   )Nr   ))r%   rV   transformersr   r   r   r   r   r   configuration_utilsr
   guidersr   modelsr   *pipelines.hunyuan_video1_5.image_processorr   utilsr   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   
get_loggerr   loggerr"   r2   rO   re   rg   r   	GeneratorrB   r   r   r   r   rP   r!   <module>r	     s    
   . - 1 V  C K K ; 
		H	%	" E ,0^![Hi!$9 i!Z `h
TLL
T-2__t-C
TY\
TC!#8 C!L-!%: -!rP   