
    3jl8              
       (   S r SSKrSSKJr  SSKJrJr  SSKJ	r	J
r
Jr  SSKJrJr  SSKJr   " S	 S
\	SS9rS\\   S\S\\\      4S jrS\\\\         S\\\      S\S\S\R*                  4
S jrS\S\S\S\4S jr\ " S S\
5      5       rS/rg)zProcessor class for Mllama.    N   )BatchFeature)
ImageInputmake_nested_list_of_images)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringc                        \ rS rSrSSS00rSrg)MllamaProcessorKwargs   image_kwargsmax_image_tiles    N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       f/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/mllama/processing_mllama.pyr   r      s    q
Ir   r   F)total	input_idsimage_token_idreturnc                    [        U 5       VVs/ s H  u  p#X1:X  d  M  UPM     nnn[        U5      S:X  a  / $ [        U5      S:X  a  US   S//$ [        USS USS 5       VVs/ s H  u  pVXV/PM
     nnnUR                  US   [        U 5      /5        US   S   nUSSS2    H  n	U	S   U	S   S-
  :X  a  XS'   U	S   nM     U$ s  snnf s  snnf )a  
Generate a cross-attention token mask for image tokens in the input sequence.

This function identifies the positions of image tokens in the input sequence and creates
a mask that defines which subsequent tokens each image token should attend to.

Args:
    input_ids (list[int]): A list of token ids representing the input sequence.
    image_token_id (int): The id of the token used to represent images in the sequence.

Returns:
    list[list[int]]: A list of [start, end] pairs, where each pair represents the range
    of tokens an image token should attend to.

Notes:
    - If no image tokens are present, an empty list is returned.
    - For a single image token, it attends to all subsequent tokens until the end of the sequence.
    - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
    - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
r      N)	enumeratelenzipappend)
r   r   itokenimage_token_locationsloc1loc2vision_maskslast_mask_endvision_masks
             r   get_cross_attention_token_maskr/   "   s   , 09/C_/C81uG^Q/C_
 !Q&	  !Q&&q)2.//367LSb7QShijikSl3mn3mZTTL3mLn .r2C	NCD
 !$Q'M#DbD)q>[^a//*N#A *
 / ` os   CC$Ccross_attention_token_mask	num_tilesmax_num_tileslengthc           	      p   [        U 5      n[        S U  5       5      n[        R                  " XCXR4[        R                  S9n[        [        X5      5       H[  u  nu  p[        [        X5      5       H;  u  n
u  p[        U5      S:X  d  M  Uu  p[        X5      nUS:X  a  UnSXgX2U
SU24'   M=     M]     U$ )a  
Convert the cross attention mask indices to a cross attention mask 4D array.

This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

Args:
    cross_attention_token_mask (list[list[list[int]]]): A nested list structure where:
        - The outer list represents the batch dimension.
        - The middle list represents different images within each batch item.
        - The inner list contains pairs of integers [start, end] representing token ranges for each image.
    num_tiles (list[list[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
    max_num_tiles (int): The maximum possible number of tiles.
    length (int): The total sequence length of the input.

Returns:
    np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
        The array contains `1` where attention is allowed and `0` where it is not.

Note:
    - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
c              3   8   #    U  H  n[        U5      v   M     g 7fNr$   ).0maskss     r   	<genexpr>?convert_sparse_cross_attention_mask_to_dense.<locals>.<genexpr>p   s     L1KU1K   )shapedtype   r"   r!   N)r$   maxnpzerosint64r#   r%   min)r0   r1   r2   r3   
batch_sizemax_num_imagescross_attention_mask
sample_idxsample_maskssample_num_tilesmask_idx	locationsmask_num_tilesstartends                  r   ,convert_sparse_cross_attention_mask_to_denserP   R   s    : /0JL1KLLN88>Ahh
 9B#F`Bl8m4
4\5>s<?b5c1H1y9~"&
#&"9 CYZ$Ho~o%UV 6d 9n  r   prompt	bos_tokenimage_tokenc                     X;   a  U $ SnU R                  U5      (       a+  U [        U5      S n US-  nU R                  U5      (       a  M+  X#-   U U  3$ )a  
Builds a string from the input prompt by adding `bos_token` if not already present.

Args:
    prompt (`str`):
        The input prompt string.
    bos_token (`str`):
        The beginning of sentence token to be added.
    image_token (`str`):
        The image token used to identify the start of an image sequence.

Returns:
    str: The modified prompt string with the `bos_token` added if necessary.

Examples:
    >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
    '<begin_of_text>Hello world'

    >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
    '<|image|><begin_of_text>Hello world'

    >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
    '<begin_of_text>Hello world'
r   Nr!   )
startswithr$   )rQ   rR   rS   num_image_tokens_on_starts       r   build_string_from_inputrW      so    4  !


K
(
(K(*+!Q&! 

K
(
( 56yk&JJr   c            
       \  ^  \ rS rSr\rSU 4S jjr\  SS\S-  S\	\
-  \\	   -  \\
   -  S-  S\\   S\4S jj5       r  SS\S-  S\	\
-  \\	   -  \\
   -  S-  4U 4S	 jjjr  SS\S-  S\	\
-  \\	   -  \\
   -  S-  S\\   4U 4S
 jjjrS\S\S\4S jr SS jr\S 5       rSrU =r$ )MllamaProcessor   Nc                 H  > [        US5      (       d(  SU l        UR                  U R                  5      U l        O"UR                  U l        UR                  U l        SU l        UR                  U R                  5      U l        UR                  U l        [        TU ]!  XUS9  g )NrS   z	<|image|>z<|python_tag|>)chat_template)	hasattrrS   convert_tokens_to_idsr   python_tokenpython_token_idrR   super__init__)selfimage_processor	tokenizerr\   	__class__s       r   rb   MllamaProcessor.__init__   s    y-00*D"+"A"A$BRBR"SD(44D"+":":D,(>>t?P?PQ",,=Qr   imagestextkwargsr   c           
      t   U R                  XS9u  pU R                  " SXS.UD6  U R                  " [        4SU R                  R
                  0UD6nUS   R                  SS5      n0 nUb'  U R                  " U40 US   D6nU R                  X&S/S9  0 nUb)  U R                  " U40 US   D6u  pxUR                  S	5      n	Ubb  Ub_  US
    V
s/ s H  n
[        XR                  5      PM     nn
[        UW	U R                  R                  [        S US
    5       5      S9nXS'   [        0 UEUEUS9$ s  sn
f )aO  
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
rh   ri   tokenizer_init_kwargstext_kwargsreturn_tensorsNimage)
modalitiesimages_kwargsr1   r   c              3   8   #    U  H  n[        U5      v   M     g 7fr6   r7   )r8   r   s     r   r:   +MllamaProcessor.__call__.<locals>.<genexpr>   s     T;Si3y>>;Sr<   )r1   r2   r3   rG   )datatensor_typer   )prepare_inputs_layoutvalidate_inputs_merge_kwargsr   re   init_kwargspop_check_special_mm_tokens_process_imagesr/   r   rP   rd   r   r@   r   )rc   rh   ri   rj   output_kwargsro   text_inputsimage_inputs_r1   	token_idsr0   rG   s                r   __call__MllamaProcessor.__call__   s   $ 111K@F@@**!
"&.."<"<
 

 '}599:JDQ..N}1MNK))$	)R"226\]?=[\OL$((5I $"2 "-[!9*!9I /y:M:MN!9 ' * $P*#"22BBT;{;STT	$  3G./!@K!@<!@n]]*s   D5c                    > [         TU ]  " SXS.UD6tpnUb  [        U5      nUb0  U Vs/ s H#  n[        XPR                  U R
                  5      PM%     nnX4$ s  snf )Nrl   r   )ra   rw   r   rW   rR   rS   )rc   rh   ri   rj   r   	text_itemrf   s         r   rw   %MllamaProcessor.prepare_inputs_layout   sq     !78\\U[\q /7Fjnojn]f+I~~tGWGWXjnDo| ps   *Ac                   > [         T	U ]  " X40 UD6  Ub  U Vs/ s H  oDR                  U R                  5      PM     nn[	        U5      S:  a  Uc  [        S5      eUb  [        U5      nU Vs/ s H  n[        U5      PM     nn[        S U 5       5      (       a"  [        S U 5       5      (       d  [        S5      eXu:w  a5  Sn[	        U5      [	        U5      :X  a  Xu:w  a  Sn[        SU S	U S
U 35      eg g g s  snf s  snf )Nr   z@No image were provided, but there are image tokens in the promptc              3   *   #    U  H	  oS :H  v   M     g7fr   Nr   r8   	batch_imgs     r   r:   2MllamaProcessor.validate_inputs.<locals>.<genexpr>  s     H7G)A~7G   c              3   *   #    U  H	  oS :H  v   M     g7fr   r   r   s     r   r:   r     s      U4DyN4Dr   zaIf a batch of text is provided, there should be either no images or at least one image per sample zZMake sure to pass your images as a nested list, where each sub-list holds images per batchz)The number of image tokens in each text (zA) should be the same as the number of provided images per batch (z). )
ra   rx   countrS   sum
ValueErrorr   r$   anyall)
rc   rh   ri   rj   tn_images_in_textsamplen_images_in_imagesadd_messagerf   s
            r   rx   MllamaProcessor.validate_inputs   sA    	77CGH4a(8(8 94H#$q(V^ !cdd#3F;@F%Gfc&k"%GH7GHHHQT U4DU R R %{  &9"$K-.#6F2GGL^Lr 'C$CDTCU V@@R?SSVWbVce  : $ H &Hs   $C</Dr   	image_idxc                     U R                   $ r6   )rS   )rc   r   r   s      r   replace_image_token#MllamaProcessor.replace_image_token!  s     r   c                 B    U R                   R                  " U4UUS.UD6$ )a*  
Post-process the output of the model to decode the text.

Args:
    generated_outputs (`torch.Tensor` or `np.ndarray`):
        The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
        or `(sequence_length,)`.
    skip_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
    **kwargs:
        Additional arguments to be passed to the tokenizer's `batch_decode method`.

Returns:
    `list[str]`: The decoded text.
)skip_special_tokensclean_up_tokenization_spaces)re   batch_decode)rc   generated_outputsr   r   rj   s        r   post_process_image_text_to_text/MllamaProcessor.post_process_image_text_to_text&  s3    ( ~~**
 3)E
 	
 	
r   c                     U R                   R                  nU R                  R                  nU Vs/ s H  o3S:w  d  M
  UPM     nn[        X-   S/-   5      $ s  snf )Nr1   rG   )re   model_input_namesrd   list)rc   tokenizer_input_namesimage_processor_input_namesnames       r   r   !MllamaProcessor.model_input_namesA  sb     $ @ @&*&:&:&L&L# 9T&k8S_jWjt8S#&k)GKaJbbcc 'ls
   	AA)rR   rS   r   r_   r`   r6   )NN)TF)r   r   r   r   r   valid_processor_kwargsrb   r   r   r   r
   r   r	   r   r   rw   r   rx   dictintstrr   r   propertyr   r   __classcell__)rf   s   @r   rY   rY      sb   2R  %)ae3^T!3^ ++d9o=EV@WWZ^^3^ ./	3^
 
3^ 3^n %)aeT! ++d9o=EV@WWZ^^ & %)ae T!  ++d9o=EV@WWZ^^  )*	   D        Y^
6 d dr   rY   )__doc__numpyrA   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r	   tokenization_utils_baser
   r   utilsr   r   r   r   r/   ndarrayrP   r   rW   rY   __all__r   r   r   <module>r      s    "  4 A H H C #,E -d3i - -QUVZ[^V_Q` -`-  $T$s)_ 5- DI-  -  	- 
 ZZ- `"KC "KC "Kc "Kc "KJ adn ad adH 
r   