
    3j:6                        S SK rSSKJr  SSKJrJr  SSKJrJ	r	J
r
Jr  SSKJrJr  SSKJrJrJr  SSKJr  SS	KJr  \" 5       (       a  S
SKJrJr  \R4                  " \5      r " S S\	SS9r\\" SS9 " S S\
5      5       5       rS/rg)    N   )
AudioInput)
ImageInputmake_nested_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringis_vision_availablelogging)requires)
VideoInput   )Gemma4ImageProcessorKwargs get_aspect_ratio_preserving_sizec                   <    \ rS rSr% \\S'   SSS.SS00 SS0S.rSrg	)
Gemma4ProcessorKwargs"   images_kwargsT)paddingreturn_mm_token_type_idsdo_convert_rgbreturn_metadata)text_kwargsr   audio_kwargsvideos_kwargs N)__name__
__module____qualname____firstlineno__r   __annotations__	_defaults__static_attributes__r        f/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/gemma4/processing_gemma4.pyr   r   "   s5    -- (,

 d
 +T2
Ir(   r   F)total)vision)backendsc                     ^  \ rS rSr\r    SS\S\S\4U 4S jjjr    SS\S-  S\	\
-  \\	   -  \\
   -  S	\S
\4U 4S jjjr    SS\\\   -  S-  S\	\
-  \\	   -  \\
   -  S	\S
\S\\   4
U 4S jjjrS\S\S\4S jrS\S\S\4S jrS\S\S\4S jrS S jrS\S\4S jr\U 4S j5       r\S\\   4S j5       rSrU =r$ )!Gemma4Processor1   Nimage_seq_lengthaudio_seq_lengthaudio_ms_per_tokenc	           	        > X`l         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  SS/05        SU l        UR                  U R                  5      U l        Xpl	        Xl
        [        USS5      U l        [        USS5      U l        [        USS5      U l        [        USS5      U l        [         T
U ]D  " S	UUUUUS.U	D6  g)
u  
image_seq_length (`int`, *optional*, defaults to 280):
    The number of soft tokens per image used for placeholder expansion.
audio_seq_length (`int`, *optional*, defaults to 750):
    The maximum number of audio soft tokens per audio segment. Serves as an
    upper-bound cap when dynamic audio token counts are computed.
audio_ms_per_token (`int`, *optional*, defaults to 40):
    Milliseconds of audio per output soft token. Used to dynamically compute
    the number of audio placeholder tokens as ``ceil(duration_ms / audio_ms_per_token)``.
    The default of 40 comes from the SSCP convolution's 4× time reduction on 10ms frames.
additional_special_tokensz	<|video|>audio_token_idNaudio_token	boa_token	eoa_token)feature_extractorimage_processor	tokenizervideo_processorchat_templater    )r0   image_token_id	boi_token	eoi_tokenimage_tokenadd_special_tokensvideo_tokenconvert_tokens_to_idsvideo_token_idr1   r2   getattrr5   r6   r7   r8   super__init__)selfr9   r:   r;   r<   r=   r0   r1   r2   kwargs	__class__s             r)   rH   Gemma4Processor.__init__6   s    . !1'66",,",,$00 	$$&AK=%QR&'==d>N>NO !1 #5%i1A4H"9mTB K> K> 	
/++'	
 	
r(   imagestextvideosaudioc           	      8  > [         TU ]  " SXX4S.UD6u  pp4Ub  [        U5      nU(       a?  U(       d8  U Vs/ s H+  nSR                  U R                  /[        U5      -  5      PM-     nnU(       a   U(       d  U R                  /[        U5      -  nXX44$ s  snf )N)rM   rN   rO   rP    r    )rG   prepare_inputs_layoutr   joinrA   lenr6   )rI   rM   rN   rO   rP   rJ   
image_listrK   s          r)   rS   %Gemma4Processor.prepare_inputs_layoutn   s     ',g&C '
V'
DJ'
#f
 /7F $U[\U[zCHHd../#j/ABU[D\$$%E
2DV**	 ]s   2BrJ   c                   > [         T
U ]  " SXS.UD6  Uc  Uc  [        S5      eUb  U R                  b  U R                  b  U R
                  c  [        S5      eUb  U Vs/ s H  ofR                  U R                  5      PM     nnUb  [        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      eU Vs/ s H  n[        U5      PM     n	nXy:w  a,  [        SU R                   SU S	U R                   S
U	 S3	5      eg Uc6  [        U5      (       a%  [        S[        U5       S	U R                   S35      eg g g s  snf s  snf )N)rM   rN   z+You must provide either `text` or `images`.zUAudio inputs were provided, but the tokenizer does not have an `audio_token` defined.z1Received inconsistently sized batches of images (z) and text (z).zThe total number of zP tokens in the prompts should be the same as the number of images passed. Found rR   z tokens and z images per sample.zFound z. tokens in the text but no images were passed.r    )rG   validate_inputs
ValueErrorr6   r7   r8   countrA   rU   anysum)rI   rM   rN   rO   rP   rJ   samplen_images_in_textsublistn_images_in_imagesrK   s             r)   rY   Gemma4Processor.validate_inputs   s    	CvCFC<FNJKK!1!1!9T^^=SW[WeWeWmtuuMQRT6T-=-= >TR!v;#d)+$KCPVK=Xdehimendooqr  CI%I&wc'l&"%I#9$.t/?/?.@ A""2!31T5E5E4FlSeRffy{  :
 C(8$9$9 S!1231T5E5E4FFtu  %: R &Js   !$E
Eimage_inputs	image_idxreturnc                 d    US   U   nU R                    U R                  U-   U R                   3$ )Nnum_soft_tokens_per_image)r?   rA   r@   )rI   rc   rd   num_soft_tokenss       r)   replace_image_token#Gemma4Processor.replace_image_token   s;    &'BCIN..!$"2"2_"D!EdnnEUVVr(   video_inputs	video_idxc           
         US   U   nUS   U   nUR                   c  [        R                  S5        UR                   c  SOUR                   Ul         UR                   Vs/ s H$  n[	        US-  5      S S[	        US-  5      S 3PM&     nnSR                  U Vs/ s H.  ow SU R                   U R                  U-   U R                   3PM0     sn5      nU$ s  snf s  snf )	Nnum_soft_tokens_per_videovideo_metadataa  Gemma4 requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results.   <   02d:rR   )	fpsloggerwarning_once
timestampsintrT   r?   rC   r@   )	rI   rk   rl   rh   metadatasecondstimestamp_strtvideo_replacements	            r)   replace_video_token#Gemma4Processor.replace_video_token   s    &'BCIN 01)<<<e
 &\\1rx|| ]e\o\op\oQXC2.s31S25Fs4KL\opHHbopbo]^s!DNN#D$4$4$F#GGWXbop
 ! 	 qps   !+C5C audio_inputs	audio_idxc                    US   U   n[        U5      n[        S5       H'  nUS-   S-
  S-  S-   nUS S S2   S U n[        U5      nM)     U R                   U R                  [	        UR                  5       5      -   U R                   3$ )Ninput_features_mask   r   r   )rU   ranger7   r6   rx   r]   r8   )rI   r   r   maskr|   _t_outs          r)   replace_audio_token#Gemma4Processor.replace_audio_token   s    129= IqAUQY1$q(E!9Ve$DD	A 
 ..!$"2"2S_"D!EdnnEUVVr(   c           	      D   [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nUR                  SS5      =(       d    U R                  R                  nUR                  SS5      =(       d    U R                  R                  nXvS-  -  n0 n	Ubd  / n
U H9  n[        US   US   UUUS	9u  pX-  nX-  nU
R                  X-  US-  -  5        M;     S/[        U5      -  nU	R                  U
US
.5        Ub`  [        U R                  SS5      nU Vs/ s H)  nU R                  [        R                  " U5      U5      PM+     nnU	R                  SU05        [!        S0 U	D6$ s  snf )a  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
    audio_lengths (`list[int]`, *optional*):
        The lengths of audio inputs in number of samples. Used to dynamically
        compute per-audio token counts.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
r   
patch_sizeNpooling_kernel_sizemax_soft_tokensr   r   r   )heightwidthr   max_patchesr   )num_image_tokensnum_image_patchessampling_ratei>  num_audio_tokensr    )r   r&   getupdater:   r   r   r   r   appendrU   rF   r9   _compute_audio_num_tokensnpzerosr   )rI   image_sizesaudio_lengthsrJ   r   r   r   r   r   vision_datar   
image_sizetarget_htarget_wpatch_heightpatch_widthr   r   lengthr   s                       r)   _get_num_multimodal_tokens*Gemma4Processor._get_num_multimodal_tokens   s     .77;;ORPV$"&&|T:]d>R>R>]>]
3T:fd>R>R>f>f 	 (++,=tDlH\H\HlHl%Q(>>"!)
%E%a=$Q-) +(;&"  (5&4 ''(BFY[\F\(\] * "#c+&6 64D[lmn$ $D$:$:OVTM^k ^kTZ..rxx/?O^k     24DEF,,, s   0Fr   c                 N   [        U5      nU R                  R                  S-   nU R                  R                  S-  nX5-   U-
  U R                  R                  -  S-   nUS::  a  gSnSu  pn
Un[	        U5       H  nUSU
-  -   U-
  U	-  S-   nM     [        XR                  5      $ )aP  Number of audio soft tokens, replicating the encoder's seq-length arithmetic.

Mirrors Gemma4AudioFeatureExtractor mel framing + the two stride-2 Conv2d
subsampling layers in Gemma4AudioSubSampleConvProjection, capped at
``audio_seq_length``. Must match ``audio_mask.sum()`` from the audio tower or
vLLM's ``_merge_multimodal_embeddings`` will raise on a length mismatch.

Args:
    audio_waveform: A 1-D array or list containing the raw audio samples.
    sampling_rate: The sampling rate of the audio waveform in Hz.

Returns:
    The number of audio soft tokens to insert as placeholders.
r   r   r   )r   r   r   )rU   r9   frame_length
hop_lengthr   minr1   )rI   audio_waveformr   num_samplesframe_size_for_unfoldpad_leftnum_mel_framessscp_num_layerssscp_kernelsscp_stridesscp_paddingr|   r   s                r)   r   )Gemma4Processor._compute_audio_num_tokens  s     .) !% 6 6 C Ca G))66!;%03HHTMcMcMnMnnqrrQ 18.,'AQ%%3CaGA (
 1++,,r(   c                     > [         TU ]  S/-   $ )Nmm_token_type_ids)rG   model_input_names)rI   rK   s    r)   r   !Gemma4Processor.model_input_names,  s    w(,?+@@@r(   c                 
    SS/$ )Nrg   rn   r    )rI   s    r)   unused_input_names"Gemma4Processor.unused_input_names0  s    +-HIIr(   )r2   r1   r6   r5   r7   r?   r8   r@   r0   rA   r>   rC   rE   )Ni  i  (   )NNNN)NN)r!   r"   r#   r$   r   valid_processor_kwargsrx   rH   r   r   r   listr   r   rS   r
   r   rY   dictstrri   r~   r   r   r   propertyr   r   r'   __classcell__)rK   s   @r)   r.   r.   1   s    3  # #"$6
 6
 6
  6
 6
t %)Z^! +T!+ ++d9o=EV@WW+ 	+
 + +4 8<Z^! !T*--4! ++d9o=EV@WW! 	!
 ! )*! !FW W W W! ! ! !&W W W W5-n&-s &-s &-P A A JDI J Jr(   r.   ) numpyr   audio_utilsr   image_utilsr   r   processing_utilsr   r   r	   r
   tokenization_utils_baser   r   utilsr   r   r   utils.import_utilsr   video_utilsr   image_processing_gemma4r   r   
get_loggerr!   ru   r   r.   __all__r    r(   r)   <module>r      s      % A X X C A A * % e 
		H	%,E  	;Jn J   JD 
r(   