
    3j7^                        S SK r S SKJr  S SKJrJrJr  S SKrS SK	r	SSK
Jr  SSKJrJr  SSKJrJrJrJr  SSKJrJrJr  SS	KJr  SS
KJr  \(       a  SSKJr   " S S\SS9r\" SS9\ " S S\5      5       5       rS/rg)    N)
accumulate)TYPE_CHECKINGOptionalUnion   )BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenBatchEncoding	TextInput)auto_docstring)requires)PreTokenizedInputc                   0    \ rS rSrSS0SSSS.SS0S	.rS
rg)ColModernVBertProcessorKwargs(   paddinglongestTchannels_first)return_row_col_infodata_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r#       v/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/colmodernvbert/processing_colmodernvbert.pyr   r   (   s/     y
 $(+"

 +D1
Ir*   r   F)total)torch)backendsc                   N  ^  \ rS rSr\r     S"S\S\S-  S\S-  4U 4S jjjr\	   S#S\
\\
   -  \\\
      -  S\\S	\\   \S	   4   S\S-  S
\\   S\4
S jj5       r  S$S\
S-  S\\S	\\   \S	   4   S
\\   4S jjr  S$S\
S-  S\\S	\\   \S	   4   S
\\   4U 4S jjjrS\S\S\4S jrS\S\\   S\\\      4S jrS%S jr S%S\
S-  S
\\   S\4S jjrS\\\   -  S
\\   S\4S jr   S&S\S\S   4   S\S\S   4   S\S\S   S\S\4   SS4S  jjrS!rU =r$ )'ColModernVBertProcessor6   Nimage_seq_lenvisual_prompt_prefixquery_prefixc                   > Sn[        SSSS9R                  U l        [        SSSS9R                  U l        [        SSSS9R                  U l        SU l        X@l        UR                  U R                  5      U l        UR                  U R                  5      U l	        UR                  U R
                  5      U l
        [        S	5       VV	s/ s H3  n[        S	5        H   oR                  S
US-    SU	S-    S35      PM"     M5     sn	nU l        [        R                  " S5      U l        SU R                  U R                  U R                  /0n
UR!                  U
5        UR                  U R                  5      U l        ["        TU ]H  " X4SU0UD6  U=(       d    SU R                   S3U l        U=(       d    SU l        U R                  U l        gs  sn	nf )aJ  
image_seq_len (`int`, *optional*, defaults to 64):
    The length of the image sequence i.e. the number of <image> tokens per image in the input.
visual_prompt_prefix (`str`, *optional*):
    A string that gets tokenized and prepended to the image tokens.
query_prefix (`str`, *optional*):
    A prefix to be used for the query.
Nz<fake_token_around_image>FT)
normalizedspecialz<image>z<end_of_utterance>z<global-img>   <row_   _col_>z*(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+additional_special_tokenschat_templatez<|begin_of_text|>User:z0Describe the image.<end_of_utterance>
Assistant: )r   contentfake_image_tokenimage_tokenend_of_utterance_tokenglobal_image_tagr2   convert_tokens_to_idsimage_token_idfake_image_token_idglobal_image_token_idrangerow_col_idsrecompile%_regex_to_remove_extra_special_tokensadd_special_tokenssuper__init__r3   r4   query_augmentation_token)selfimage_processor	tokenizerr>   r2   r3   r4   kwargsijtokens_to_add	__class__s              r+   rP    ColModernVBertProcessor.__init__;   s   $  *+FSXbf g o o%iE4PXX&01ERWae&f&n&n# .*'==d>N>NO#,#B#B4CXCX#Y %.%D%DTEZEZ%["SXYZS[
S[aejklem`a++eAE7%Awa,HIemIS[
 68ZZ@m5n2 (%%  ++*
 	$$]3'==d>N>NO[=[TZ[$8 %
$T%5%5$66gh 	! ).B(,(C(C%1
s   :Gimagestextr   rU   returnc                    U R                   " SXS.UD6u  pU R                  " SXS.UD6  U R                  " [        4SU R                  R
                  0UD6nUb  UOU R                  nUS   R                  SS5      nUS   R                  SS5      nUS   R                  SS5      n0 =pUGb  U R                  " U40 US	   D6u  pU	R                  S
S5        U	R                  SS5        Ub  U R                  X+S9u  p,U R                  " U40 US   D6n
U(       a  XS'   / n[        U5       Hg  u  p/ nU HI  nUS   u  nnU
R                  UU5      nU
R                  UUS-
  5      nUR                  UU-
  S-   5        MK     UR                  U5        Mi     U(       a  U R                  U
S   U5      U
S'   U R                  X*S/S9  OUb  U R                  " SSU0US   D6n
[        0 U
EU	EUS9$ )z
image_seq_len (`int`, *optional*):
    The length of the image sequence. If not provided, the default value of self.image_seq_len is used.
    image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2))
)r[   r\   tokenizer_init_kwargsNr    return_text_replacement_offsetsFreturn_mm_token_type_idsr   r!   rowscols)images_replacementstext_replacement_offsetsnew_spanr:   	input_idsmm_token_type_idsimage)
modalitiesr\   )datatensor_typer#   )prepare_inputs_layoutvalidate_inputs_merge_kwargsr   rT   init_kwargsr2   pop_process_imagesget_text_with_replacements	enumeratechar_to_tokenappendcreate_mm_token_type_ids_check_special_mm_tokensr   )rR   r[   r\   r2   rU   output_kwargsr`   ra   r   image_inputstext_inputsrd   re   batch_image_seq_lengthsbatch_idtext_replacement_offsetimage_seq_lensrk   startendstart_id_pos
end_id_poss                         r+   __call__ ColModernVBertProcessor.__call__p   sW    11UUfU@F@@**)
"&.."<"<
 
 *7)BHZHZ*7*F*J*JKlns*t'#0#?#C#CD^`e#f &}599:JDQ%''040D0DV0n}]lOm0n-L VT*VT*151P1P 2Q 2. #nnTR]=5QR2>V :;*,'9BC[9\5H%'N 7%)*%5
s'2'@'@5'Q%0%>%>xq%Q
&--j<.G!.KL !8 ,22>B :] ,7;7T7T#K02I8K 34 --dWI-V..SdSmM6RSK!@K!@<!@n]]r*   c                 p   Ub(  [        U[        5      (       a  U/nUR                  5       nUb  U R                  R	                  U5      n[        U5      (       a  U//nX4$ [        U[        [        45      (       a  [        US   5      (       a  Ub  U Vs/ s H  oDR                  U R                  5      PM     nnS/[        [        U5      5      -   n[        [        U5      5       Vs/ s H  nXU   XgS-       PM     nn[        U5      US   :  a  XUS   S  /-   nX4$ Un X4$ U/nX4$ s  snf s  snf )Nr   r:   )
isinstancestrcopyrS   fetch_imagesr
   listtuplecountrB   r   rI   len)	rR   r[   r\   rU   samplen_images_in_textcumsum_images_in_textrV   split_imagess	            r+   rm   -ColModernVBertProcessor.prepare_inputs_layout   sk    $$$v99;D))66v>Ff%%!($ |# FT5M22~fQi7P7P#UY'ZUY6T5E5E(FUY$'Z-.C$zBR7S2T,T) "'s+;'<!=$!=A Q7:OTUPU:VW!= ! $
 6{%:2%>>!-8Mb8Q8S1T0U!U |	 ". | %XF| ([$s   $D.*D3c                   > [         TU ]  " X40 UD6  Uc  Uc  [        S5      eUb  U Vs/ s H  oDR                  U R                  5      PM     nnUbM  U Vs/ s H  n[        U5      PM     nnXW:w  a,  [        SU R                   SU SU R                   SU S3	5      eg Uc6  [        U5      (       a%  [        S[        U5       SU R                   S35      eg g g s  snf s  snf )	Nz+You must provide either `text` or `images`.zThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images per sample.zFound z. tokens in the text but no images were passed.)rO   rn   
ValueErrorr   rB   r   anysum)	rR   r[   r\   rU   r   r   sublistn_images_in_imagesrY   s	           r+   rn   'ColModernVBertProcessor.validate_inputs   s    	77<FNJKKMQRT6T-=-= >TR!BH%I&wc'l&"%I#9$.t/?/?.@ A""2!31T5E5E4FlSeRffy{  :
 C(8$9$9 S!1231T5E5E4FFtu  %: R%Is   $CC!rz   	image_idxc           	         US    VVs/ s H  o3  H  oDPM     M     snnU   nUS    VVs/ s H  of  H  owPM     M     snnU   nUS:X  aJ  US:X  aD  U R                    U R                   -   U R                   U R                  -  -   U R                    -   $ Sn	[	        U5       HU  n
[	        U5       H>  nU	U R                    SU
S-    SUS-    S3-   U R                   U R                  -  -   -  n	M@     U	S	-  n	MW     U	S	U R                    3U R                   -   U R                   U R                  -  -   U R                    -   -  n	U	$ s  snnf s  snnf )
Nrb   rc   r   r?   r9   r:   r;   r<   
)rA   rD   rB   r2   rI   )rR   rz   r   row_listrow
image_rowscol_listcol
image_colstext_split_imagesn_hn_ws               r+   replace_image_token+ColModernVBertProcessor.replace_image_token   s   *6v*>S*>h(3c(c*>ST]^
*6v*>S*>h(3c(c*>ST]^
?zQ(()**+-%%&$*<*<<= **+- !#Z( ,C%001!#'%ay:;!--.$2D2DDE% - "T)! ) T**+,**+-%%&$*<*<<= **+- %$5 TSs
   EErg   r|   c                    / n[        U5       H  u  pE[        R                  " X   5      n[        R                  " U5      n[        R                  " X`R
                  :H  5      S   nSn	U H6  n
U	[        U5      :  a    O&X   nX-   nSX{U& [        R                  " X5      n	M8     UR                  UR                  5       5        M     U$ )Nr   r:   )
rt   nparray
zeros_likewhererG   r   searchsortedrv   tolist)rR   rg   r|   rh   rV   seq_lengths	array_idsmm_token_typesimage_start_positionsrW   seq_lenr   r   s                r+   rw   0ColModernVBertProcessor.create_mm_token_type_ids	  s     '(?@NA.I]]95N$&HHY:R:R-R$STU$V!A&122-0o,-S)OO$9? ' $$^%:%:%<= A ! r*   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        U Vs/ s H!  nU R                  R
                  " / UQUP76 PM#     nnU R                  S-   nU R                  S-   n/ n	/ n
U H4  u  pnX-  S-   nU	R                  X~U-  -   5        U
R                  U5        M6     UR                  XS.5        [        S0 UD6$ s  snf )a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
r!   r      r:   )num_image_tokensnum_image_patchesr#   )	r   r(   getupdaterS   get_number_of_image_patchesr2   rv   r   )rR   image_sizesrU   vision_datar!   
image_sizenum_image_row_colsbase_image_length
col_lengthr   r   num_patchesnum_rowsnum_cols
row_lengths                  r+   _get_num_multimodal_tokens2ColModernVBertProcessor._get_num_multimodal_tokens  s	    "9CCGGY[\M  ( #.""-J $$@@\*\m\"-  "
 !% 2 2Q 6++a/J! "3E/x'2Q6
 ''(9(=R(ST!((5 4F
 4Dmn,,,#"s   (Cc                    U R                   " [        4SU R                  R                  0UD6nUS   R	                  SS5      nUSLn[        U5      (       a  U/nOw[        U[        5      (       a  [        US   5      (       a  ON[        U[        5      (       a.  [        US   [        5      (       a  [        US   S   5      (       d  [        S5      eU Vs/ s H  ofR                  S5      PM     nnU R                  U R                  /[        U5      -  UUS   US   S	9nU(       a.  US
   R                  US   S:H  S5      nUR                  SU05        U$ s  snf )au  
Prepare for the model one or several image(s). Handles input validation, RGB conversion,
and prepends the `visual_prompt_prefix` to each image. Optionally computes labels from
`token_type_ids` when a `suffix` is provided in `text_kwargs`.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
        number of channels, H and W are image height and width.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
r_   r    suffixNr   zAimages must be an image, list of images or list of list of imagesRGBr!   )r\   r[   r!   r    rg   token_type_idsilabels)ro   r   rT   rp   rq   r
   r   r   r   convertr   r3   r   masked_fillr   )	rR   r[   rU   ry   r   return_token_type_idsri   	batch_docr   s	            r+   process_images&ColModernVBertProcessor.process_imagesC  sj   < **)
"&.."<"<
 
 }-11(DA &d 2 &!!XF%%.*C*CVT**z&)T/J/J~^def^ghi^jOkOk`aa 5;;F5--&F; MM++,s6{:'8%m4	 " 
	 !{+77	BR8SWX8XZ^_Fh/0 <s   Ec                    U R                   " [        4SU R                  R                  0UD6nUS   R	                  SS5      n[        U[        5      (       a  U/nO8[        U[        5      (       a  [        US   [        5      (       d  [        S5      eUc  U R                  S-  nU Vs/ s H  oPR                  U-   U-   PM     nnU R                  USUS   S	9nU$ s  snf )
a  
Prepare for the model one or several text queries. Handles input validation, prepends the
`query_prefix`, and appends query augmentation tokens (used to pad query embeddings for
better late-interaction retrieval performance).

Args:
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
r_   r    r   Nr   z*Text must be a string or a list of strings
   F)r\   r   r    )ro   r   rT   rp   rq   r   r   r   r   rQ   r4   r   )rR   r\   rU   ry   r   querytexts_querybatch_querys           r+   process_queries'ColModernVBertProcessor.process_queries  s    : **)
"&.."<"<
 
 }-11(DAdC  6DT4((ZQ-E-EIJJ >22R7F SW!WRV"3"3e";f"DRV!Wmm"'%m4 $ 
  "Xs   *Cquery_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         [        U5      S:X  a  [        S5      e[        U5      S:X  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUc  US   R                  n/ n[	        S[        U5      U5       GH  n/ n[
        R                  R                  R                  R                  XXs-    SSS9n	[	        S[        U5      U5       H}  n
[
        R                  R                  R                  R                  X*X-    SSS9nUR                  [
        R                  " SX5      R                  S	S
9S   R                  SS
95        M     UR                  [
        R                  " USS
9R                  U5      R                  U5      5        GM     [
        R                  " USS
9$ )a  
Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
image of a document page.

Because the embedding tensors are multi-vector and can thus have different shapes, they
should be fed as:
(1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
(2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
    obtained by padding the list of tensors.

Args:
    query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
    passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
    batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
    output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
        If `None`, the dtype of the input embeddings is used.
    output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

Returns:
    `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
    tensor is saved on the "cpu" device.
r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)batch_firstpadding_valuezbnd,csd->bcnsr   )dimr   r:   )r   r   devicedtyperI   r-   nnutilsrnnpad_sequencerv   einsummaxr   catto)rR   r   r   r   r   r   scoresrV   batch_scoresbatch_queriesrW   batch_passagess               r+   score_retrieval'ColModernVBertProcessor.score_retrieval  s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./<A/1L!HHNN..;; Q^4$VW < M 1c"45zB!&!3!3!@!@&1>:\] "A " ##LL-PTTYZT[\]^bbghbi	 C MM%))La8;;LILL][\ = yyQ''r*   )rM   rC   rA   rG   rD   rH   r2   rB   rF   rQ   r4   rJ   r3   )NN@   NN)NNN)NN)N)   Ncpu)r$   r%   r&   r'   r   valid_processor_kwargsintr   rP   r   r	   r   r   r   r   r   r   rm   r   rn   dictr   rw   r   r   r   r   r   r   r)   __classcell__)rY   s   @r+   r0   r0   6   s    ;
 +/#'3D
 3D "Dj3D Dj3D 3Dj  JNbf$(	>^T*--T*5E0FF>^ I2DOTJ]E^^_>^ Tz	>^
 67>^ 
>^ >^D %)bf T!  I2DOTJ]E^^_  67	 H %)bfT! I2DOTJ]E^^_ )*	 2% % % %:!$ !QUVYQZ !_cdhildm_n !*#-N %)@T!@ 67@ 
	@D7$y/)7 677 
	7z 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>( >(r*   r0   ) rK   	itertoolsr   typingr   r   r   numpyr   r-   feature_extraction_utilsr   image_utilsr	   r
   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r   r   utils.import_utilsr   r   r   r0   __all__r#   r*   r+   <module>r     s~   * 
   1 1   4 5 X X K K # * <$4E  
:D(n D(  D(N %
%r*   