
    3j9*                        S SK r S SKrSSKJr  SSKJrJr  SSKJrJ	r	J
r
JrJrJrJr  SSKJrJr  SSKJr  \" 5       (       a  SSKJr  \
" 5       (       a  S SKr\" 5       (       a  S S	KJr  O\" 5       (       a  S S	KJr  \R4                  " \5      r " S
 S\SS9rSrS\S\S\S\S\S\ \\4   4S jr!SSS\SS4S jr"SSSSS\S\ S   4S jr#\	" S\5       " S S\5      5       r$S/r%g)     N   )BatchFeature)UnpackVideosKwargs)
TensorTypeadd_start_docstringsis_torch_availableis_torchvision_availableis_torchvision_v2_availableis_vision_availablelogging)BASE_VIDEO_PROCESSOR_DOCSTRINGBaseVideoProcessor)
VideoInput)PILImageResampling)
functionalc                   8    \ rS rSr% Sr\\S'   \\S'   \\S'   Srg)Gemma4VideoProcessorKwargs0   a?  
patch_size (`int`, *optional*):
    Size of each image patch in pixels.
max_soft_tokens (`int`, *optional*):
    Maximum number of soft (vision) tokens per video frame.
    Must be one of {70, 140, 280, 560, 1120}.
pooling_kernel_size (`int`, *optional*):
    Spatial pooling kernel size applied after patchification.

patch_sizemax_soft_tokenspooling_kernel_size N)__name__
__module____qualname____firstlineno____doc__int__annotations____static_attributes__r       l/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/gemma4/video_processing_gemma4.pyr   r   0   s     Or"   r   F)total)F      i  i0  i`  heightwidthr   max_patchesr   returnc                 h   X-  nX2S-  -  n[         R                  " Xe-  5      nXp-  nXq-  n	XB-  n
[        [         R                  " X-  5      5      U
-  n[        [         R                  " X-  5      5      U
-  nUS:X  a  US:X  a  [	        SXB-   S35      eX4S-  -  U
-  nUS:X  a1  U
n[        [        [         R                  " X-  5      5      U
-  U5      nO6US:X  a0  U
n[        [        [         R                  " X-  5      5      U
-  U5      nX-  U:  a  [	        SU  SU SU SU SU S	U 35      eX4$ )
a  
Image is resized to preserve aspect ratio so it fits within the patch budget.
Target dimensions are the largest that:
1) Produce at most `max_patches` patches when patchified with `patch_size`
2) Have height and width divisible by `pooling_kernel_size * patch_size`
   r   zoAttempting to resize to a 0 x 0 image. Resized height should be divisble by `pooling_kernel_size * patch_size`=.z
Resizing [xz] to [z] but this exceeds z patches with patch_size )mathsqrtr   floor
ValueErrormin)r'   r(   r   r)   r   total_px	target_pxfactorideal_heightideal_width	side_multtarget_heighttarget_widthmax_side_lengths                 r#    get_aspect_ratio_preserving_sizer=   D   s{    ~H1}-IYYy+,F?L.K#0I 

<#;<=	IMtzz+"9:;iGL la/22E2R1SSTV
 	

 #1&<<	IO!

5>*+i7
 
	 

6>*+i7

 #i/%}oQ|n M  +},Ej\S
 	

 &&r"   videotorch.Tensorc                     U R                   u  p#pEXA-  nXQ-  nU R                  X#XaXq5      nUR                  SSSSSS5      nUR                  X&U-  S5      nU$ )z
Convert 4D tensor video of shape (num_frames, num_channels, height, width) into 3D tensor of patches of shape
(num_frames, num_patches_height * num_patches_width, patch_size * patch_size * num_channels).
r   r,      r         )shapereshapepermute)	r>   r   
num_framesnum_channelsr'   r(   num_patches_heightnum_patches_widthpatched_videos	            r#   convert_video_to_patchesrM   z   sv    
 /4kk+Jf-+MM"4BSM "))!Q1a;M!))*K\6\^`aMr"   	positionstarget_length)r?   r?   c                     U R                   S   nX#-
  nUS:  ab  SSSUSS/nSSSUSS4n[        R                  R                  R	                  XSSS9n [        R                  R                  R	                  XSSS9nX4$ )z.
Pad the video along to max number of patches
rC   r   constant)modevaluerD   )rE   torchnnr   pad)r>   rN   rO   current_lengthpadding_lengthpaddingpos_paddings          r#   pad_to_max_patchesr[      s     [[^N"3NaNAq1!Q15##''Zq'QHH''++I[]+^	r"   zbConstructs a Gemma4 video processor that samples frames from videos for use with the Gemma4 model.c                     ^  \ rS rSr\R
                  r/ SQr/ SQrSr	Sr
SrSrSrSrSrSrSrSrS	r\rS
S/rS\\   4U 4S jjrU 4S jrS\R6                  S\S\S\S\R<                  S\R6                  4S jrS\ S\\   S\!4U 4S jjr"   S%S\#S   S\$S\$SSS\$S\%S\$S\%\#\%   -  S-  S \%\#\%   -  S-  S!\&\'-  S-  S\S-  S"\S-  S\S-  S\!4S# jjr(S$r)U =r*$ )&Gemma4VideoProcessor   )        r_   r_   )      ?r`   r`   NT       r%   r   pixel_values_videosvideo_position_idskwargsc                    > [         TU ]  " S0 UD6  U R                  [        ;  a   [	        S[         SU R                   S35      eg )N!`max_soft_tokens` must be one of , got r-   r   )super__init__r   _SUPPORTED_SOFT_TOKENSr2   selfre   	__class__s     r#   rj   Gemma4VideoProcessor.__init__   sN    "6"'==@AW@XX^_c_s_s^ttuvww >r"   c                 0   > SUS'   [         TU ]  " S0 UD6  g )NF	do_resizer   )ri   _validate_preprocess_kwargsrl   s     r#   rr   0Gemma4VideoProcessor._validate_preprocess_kwargs   s    
 ${+5f5r"   r>   r   r)   r   resampler*   c                     UR                   S   UR                   S   pv[        UUUUUS9u  pX:X  a  X:X  a  U$ [        R                  " UX/USS9$ )NrD   )r'   r(   r   r)   r   T)sizeinterpolation	antialias)rE   r=   tvFresize)
rm   r>   r   r)   r   rt   r'   r(   r:   r;   s
             r#   aspect_ratio_preserving_resize3Gemma4VideoProcessor.aspect_ratio_preserving_resize   si     BR&F!# 3'
# "|'<Lzz."	
 	
r"   videosc                 &   > [         TU ]  " U40 UD6$ )N)ri   
preprocess)rm   r~   re   rn   s      r#   r   Gemma4VideoProcessor.preprocess   s    
 w!&3F33r"   r?   do_convert_rgbrq   z"tvF.InterpolationMode | int | None
do_rescalerescale_factordo_normalize
image_mean	image_stdreturn_tensorsr   c           	         U[         ;  a  [        S[          SU S35      eXS-  -  n/ n/ n/ nSnU GHq  nU(       a  U R                  U5      nU(       a  U R                  UUUUUS9nU R	                  UXVXxU	5      nUR
                  S   nUR
                  S   U-  nUR
                  S	   U-  n[        UU5      nUR                  UR
                  S   US-  -  5        UR                  n[        R                  " [        R                  " UUS
9[        R                  " UUS
9SS9n[        R                  " US	S9nUR                  UR
                  S   S5      nUS   R                  USS5      n[        UUU5      u  nnUR                  U5        UR                  U5        GMt     [        R                  " USS9n[        R                  " USS9nUUUS.n[!        UU
S9$ )Nrg   rh   r-   r,   rC   )r>   r   r)   r   rt   r   rv   rD   )devicexy)indexing)dim)N.)rc   rd   num_soft_tokens_per_video)datatensor_type)rk   r2   convert_to_rgbr|   rescale_and_normalizerE   rM   appendr   rT   meshgridarangestackrF   repeatr[   r   )rm   r~   r   rq   rt   r   r   r   r   r   r   r   r   r   re   r)   pixel_valuesposition_idsr   rH   r>   patch_heightpatch_widthpatchesr   
patch_gridstacked_gridreal_positionsrN   r   s                                 r#   _preprocess Gemma4VideoProcessor._preprocess   s   " "88@AW@XX^_n^oopqrr%Q(>>$&!
E++E2;;) +(;% <  ..ujR^luvEQJ ;;r?j8L++b/Z7K.ujAG%,,W]]1-=ATVWAW-WX\\F[8\&9J
 !;;zr:L)11'--2BAFN+I6==j!QON!3G^[!YGY(	*A F {{<Q7{{<Q7 $0".)B

 >BBr"   r   )NNN)+r   r   r   r   r   BICUBICrt   r   r   rw   default_to_squarer   rq   r   r   rH   do_sample_framesr   r   r   r   valid_kwargsmodel_input_namesr   rj   rr   rT   Tensorr   rz   InterpolationModer|   r   r   r   listboolfloatstrr   r   r!   __classcell__)rn   s   @r#   r]   r]      s   
 "))H JIDNIJLJJO-L.0DEx(B!C x6
||
 
 	

 !
 ''
 

644 344 
	4& "&&**.FC^$FC FC 	FC
 7FC FC FC FC DK'$.FC 4;&-FC j(4/FC $JFC tFC !4ZFC  
!FC FCr"   r]   )&r/   rT   image_processing_utilsr   processing_utilsr   r   utilsr   r   r	   r
   r   r   r   video_processing_utilsr   r   video_utilsr   image_utilsr   torchvision.transforms.v2r   rz   torchvision.transforms
get_loggerr   loggerr   rk   r   tupler=   rM   r[   r]   __all__r   r"   r#   <module>r      sH     2 4   Y % 1  ;8 
		H	%U   3 3'3'3' 3' 	3'
 3' 38_3'lN    &4EH
)*  h"HC- HC	HCV "
"r"   