
    3jP%                         S SK rSSKJrJrJr  SSKJrJr  \R                  " \
5      r " S S\SS9r\ " S	 S
\5      5       rS
/rg)    N   )MultiModalDataProcessingKwargsProcessorMixin)auto_docstringloggingc                   *    \ rS rSrSSSS.SS0S.rSrg)	Glm46VProcessorKwargs   FT)paddingreturn_token_type_idsreturn_mm_token_type_idsreturn_metadata)text_kwargsvideos_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       f/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/glm46v/processing_glm46v.pyr
   r
      s#     %*(,

 ,T2Ir   r
   F)totalc                      ^  \ rS rSr\rSU 4S jjrS\S\S\	4S jr
S\S\S\	4S	 jrSS
 jr SS jr\U 4S j5       rS\S\\\      4S jrSS\4S jjrSrU =r$ )Glm46VProcessor*   c                   > [        US5      (       d  SOUR                  U l        [        US5      (       d  SOUR                  U l        [        USS 5      (       a  UR                  OUR                  U R                  5      U l        [        USS 5      (       a  UR                  OUR                  U R                  5      U l        [        TU ]!  XX4S9  UR                  S5      U l	        UR                  S	5      U l
        g )
Nimage_tokenz	<|image|>video_tokenz	<|video|>image_token_idvideo_token_id)chat_templatez<|begin_of_video|>z<|end_of_video|>)hasattrr    r!   getattrr"   convert_tokens_to_idsr#   super__init__video_start_idvideo_end_id)selfimage_processor	tokenizervideo_processorr$   kwargs	__class__s         r   r)   Glm46VProcessor.__init__.   s    .5i.O.O;U^UjUj.5i.O.O;U^UjUj y"2D99 $$001A1AB 	 y"2D99 $$001A1AB 	
 	_b'==>RS%;;<NOr   image_inputs	image_idxreturnc                     U R                   R                  S-  nUS   U   R                  5       U-  nU R                  U-  $ )N   image_grid_thw)r-   
merge_sizeprodr    )r,   r3   r4   merge_lengthnum_image_tokenss        r   replace_image_token#Glm46VProcessor.replace_image_token?   sI    ++669'(89)DIIK|["222r   video_inputs	video_idxc                 z   U R                   R                  S-  nUS   U   S   nUS   U   R                  5       U-  U-  nUS   U   nSnUR                  c  [        R                  S5        UR                  c  SOUR                  Ul        UR                  S S S2   n/ n	[        S[        U5      5       H  n
U	R                  X   5        M     U	S U n[        U5      U:  a.  UR                  U(       a  US   OS5        [        U5      U:  a  M.  [        U5       H  nX   nU R                  XS	9nX~-  nM     U$ )
Nr7   video_grid_thwr   video_metadata a  GLM46V requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results.   )r<   )r/   r9   r:   fpsloggerwarning_once
timestampsrangelenappendreplace_frame_token_id)r,   r?   r@   r;   
num_framesr<   metadatavideo_structurerJ   unique_timestampsidxselected_timestamps	frame_idxtimestamp_secframe_structures                  r   replace_video_token#Glm46VProcessor.replace_video_tokenD   s_   ++669!"23I>qA
'(89)DIIK|[_ii 01)<<<e
 &\\1rx||((1-
C
O,C$$Z_5 - 0<%&3&&BU':2'>[\] %&3 z*I/:M"99-9kO.O +
 r   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU Vs/ s H!  nU R                  R                  " / UQUP76 PM#     nnU V	s/ s H
  oUS-  -  PM     n
n	UR                  XS.5        Ubz  [         R                  R                  S0 5      nUR                  U5        U Vs/ s H!  nU R                  R                  " / UQUP76 PM#     nnU V	s/ s H
  oWS-  -  PM     nn	XS'   [        S0 UD6$ s  snf s  sn	f s  snf s  sn	f )	a  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
    video_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (num_frames, height, width) per each video.
Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nimages_kwargsr9   r7   )r<   num_image_patchesr   num_video_tokensr   )
r
   r   getupdater-   r9   get_number_of_image_patchesr/   get_number_of_video_patchesr   )r,   image_sizesvideo_sizesr0   vision_datar[   r9   
image_sizer\   num_patchesr<   r   
video_sizenum_video_patchesr]   s                  r   _get_num_multimodal_tokens*Glm46VProcessor._get_num_multimodal_tokensc   s    "1;;??QSTM  (&**<>a$BVBVBaBaJ #.!"-J $$@@\*\m\"-  ! SddRc;
A!=Rcd4Dmn"1;;??QSTM  ( #.!"-J $$@@\*\m\"-  ! SddRc;
A!=Rcd.>*+,,,#!  e!  es   *(EE6(E$Ec                 B    U R                   R                  " U4UUS.UD6$ )a*  
Post-process the output of the model to decode the text.

Args:
    generated_outputs (`torch.Tensor` or `np.ndarray`):
        The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
        or `(sequence_length,)`.
    skip_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
    **kwargs:
        Additional arguments to be passed to the tokenizer's `batch_decode method`.

Returns:
    `list[str]`: The decoded text.
)skip_special_tokensclean_up_tokenization_spaces)r.   batch_decode)r,   generated_outputsrl   rm   r0   s        r   post_process_image_text_to_text/Glm46VProcessor.post_process_image_text_to_text   s3    ( ~~**
 3)E
 	
 	
r   c                     > [         TU ]  S/-   $ )Nmm_token_type_ids)r(   model_input_names)r,   r1   s    r   rt   !Glm46VProcessor.model_input_names   s    w(,?+@@@r   	input_idsc                    / nU H  n[         R                  " U5      n[         R                  " U5      n[         R                  " X@R                  :H  SS9n[         R                  " X@R
                  :H  SS9nXg:  nSXTU R                  :H  U-  '   SXTU R                  :H  U) -  '   UR                  UR                  5       5        M     U$ )Nr   )axisr7      )	nparray
zeros_likecumsumr*   r+   r"   rM   tolist)	r,   rv   rs   input	array_idsmm_token_typesstartsendsis_video_modalitys	            r   create_mm_token_type_ids(Glm46VProcessor.create_mm_token_type_ids   s     EI]]51N
 YYy,?,??aHF99Y*;*;;!DD &UVN)<)<<@QQRXYN)<)<<BSASTU$$^%:%:%<=  ! r   r<   c                 0    SU R                   U-   SUS S3$ )Nz<|begin_of_image|>z<|end_of_image|>z.1fz seconds)r    )r,   rV   r<   s      r   rN   &Glm46VProcessor.replace_frame_token_id   s+    #D$4$47G$G#HHXYfgjXkksttr   )r    r"   r+   r*   r!   r#   )NNNN)NN)TF)ry   )r   r   r   r   r
   valid_processor_kwargsr)   dictintstrr=   rX   ri   rp   propertyrt   listr   rN   r   __classcell__)r1   s   @r   r   r   *   s    2P"3 3 3 3
   >$-N Y^
6 A A!$ !4S	? !*uc u ur   r   )numpyrz   processing_utilsr   r   r   utilsr   r   
get_loggerr   rH   r
   r   __all__r   r   r   <module>r      s`   ,  P P , 
		H	%,E  Sun Su Sul 
r   