
    3j:;                     .   S r SSKrSSKJr  SSKJr  SSKJr  SSKJrJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  \R0                  " \5      r\ " S S\5      5       r\" SS9 " S S\5      5       r\" SS9 " S S\\5      5       r/ SQrg)zPyTorch Fuyu model.    N)nn   )Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)	AutoModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )
FuyuConfigc                   F    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr/ rS/rSrg)	FuyuPreTrainedModel    configmodel)imagetextTpast_key_values N)__name__
__module____qualname____firstlineno__r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placement__static_attributes__r       `/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/fuyu/modeling_fuyu.pyr   r       s@    (&*#"&N#4"5r+   r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                   J  ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                     S\R                  S\R                  4S jr	\
\S	\R                  S
\\   S\\-  4S j5       5       rS\R$                  S\R                  S\R                  4S jr\
\        SS\R$                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R$                  S-  S\S-  S\R                  S-  S\S-  S
\\   S\\-  4S jj5       5       rSrU =r$ )	FuyuModel.   r   c                   > [         TU ]  U5        UR                  U l        UR                  R
                  U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  -  UR                  -  UR                  5      U l        SU l        U R!                  5         g )NF)super__init__pad_token_idpadding_idxtext_config
vocab_sizer
   from_configlanguage_modelr   Linear
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initselfr   	__class__s     r,   r3   FuyuModel.__init__4   s     !.. ,,77'33F4F4FG#%99 1 11F4G4GGI[I[$
  ',#r+   word_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc           
         UR                   S   [        U5      :X  d)  [        S[        U5      < SUR                   S   < 35      eUR                  5       n[	        UR                   S   5       H  n[
        R                  " X5   S:  SS9S   nX5   U   nUR                   S   X%   R                   S   :  a-  [        SX%   R                   < SUR                   < SU S	35      eX%   U   R                  UR                  5      XEU4'   M     U$ )
ay  This function places the continuous_embeddings into the word_embeddings at the locations
indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
embeddings.

Args:
    word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Tensor of word embeddings.
    continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
        [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
        indices in image_patch_input_indices for that batch element.
    image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Tensor of indices of the image patches in the input_ids tensor.
r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)rB   rE   rF   rG   output_embeddings	batch_idxdst_indicessrc_indicess           r,   gather_continuous_embeddings&FuyuModel.gather_continuous_embeddingsA   sI   (  %%a(C0E,FFJs3H/I.KKjQ`QfQfghQiPkl  ,11344Q78I  --(A(LPQ(Q\`abcdK 4>{KK  #&;&F&L&LQ&OO ^7L7W7]7]6_ `I6A6G6G5II[\e[ffgi  9N8XYd8e8h8h!((945 9  ! r+   pixel_valueskwargsc                 6    U R                  U5      n[        US9$ )z
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input images.
)last_hidden_state)r>   r   )rB   r[   r\   patch_embeddingss       r,   get_image_featuresFuyuModel.get_image_featuresm   s!      33LA)<LMMr+   	input_idsinputs_embedsimage_featuresc           	      &   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  UR                  5      n[        XRR                  S   -  UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
dtyperT   r   r   z6Image features and image tokens do not match, tokens: z, features: )get_input_embeddingsrQ   tensorr   image_token_idlongrT   allsumrL   	unsqueezerS   r   numel)rB   rb   rc   rd   special_image_maskn_image_tokensn_image_featuress          r,   get_placeholder_maskFuyuModel.get_placeholder_masky   s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=@@AUAUV00448L8L8NND^DTT`aq`rs	
 "!r+   Nimage_patchesimage_patches_indicesattention_maskposition_idsr   	use_cachec	           	      j   USL USL-  (       a  [        S5      eUc   U R                  R                  5       " U5      nUR                  S   n
Uch  Ub  UR                  OUR                  nUb  UR                  5       OSn[        R                  " XU-   [        R                  US9nUR                  S5      nUba  U R                  USS9R                  nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " S
UUUUUS	.U	D6nU$ )a  
image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
    Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
    hidden size of the model.
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Tensor of indices of the image patches in the input_ids tensor.
Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rf   T)return_dict)rc   rd   )rc   rx   ry   r   rz   r   )rN   r9   ri   rL   rT   get_seq_lengthrQ   arangerl   ro   r`   r^   rS   rg   rt   masked_scatter)rB   rb   rv   rw   rx   ry   r   rc   rz   r\   seq_lenrT   past_key_values_lengthr_   rq   outputss                   r,   forwardFuyuModel.forward   sa   , -t";<YZZ  //DDFyQM%%a()2)>Y%%MDXDXFIXId_%C%C%Ejk" <<&2H(HPUPZPZciL (11!4L$#66}RV6Wii/22=3G3GI\I\]!%!:!:GW "; " *889K^M%% 
')%+
 
 r+   )r?   r9   r5   r>   r7   )NNNNNNNN)r   r   r   r   r   r3   rQ   TensorlistrY   r   r   FloatTensorr   r   tupler   r`   
LongTensorrt   r   boolr   r   r*   __classcell__rC   s   @r,   r/   r/   .   s   z *!*!  $ELL1*! $)<<	*!
 
*!X N!--N9?@R9SN	+	+N  N"))":?:K:K"]b]n]n"0  .2-159.204(,26!%5##d*5 ||d*	5
  %||d25 t+5 &&-5 5 ((4/5 $;5 +,5 
'	'5  5r+   r/   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                     ^  \ rS rSrSS0rS\4U 4S jjr\\          SS\	R                  S-  S\	R                  S-  S	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\S-  S\	R                  S-  S\S-  S\	R                  S-  S\S-  S\\   S\\-  4S jj5       5       r      SU 4S jjrSrU =r$ )FuyuForCausalLM   zlm_head.weightz(model.language_model.embed_tokens.weightr   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NF)bias)r2   r3   r/   r   r   r:   r6   r=   r7   lm_headr@   rA   s     r,   r3   FuyuForCausalLM.__init__   sS     v&
yy!3!3!?!?ASASA^A^ejkr+   Nrb   rv   rw   rx   ry   r   rc   rz   labelslogits_to_keepr\   rH   c                    U R                   " SUUUUUUUUS.UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnU	b3  U R
                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )a  
image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
    Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
    hidden size of the model.
image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Tensor of indices of the image patches in the input_ids tensor.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Examples:

```python
>>> from transformers import FuyuProcessor, FuyuForCausalLM
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
>>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

>>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))
>>> prompt = "Generate a coco-style caption.\n"

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> outputs = model(**inputs)

>>> generated_ids = model.generate(**inputs, max_new_tokens=7)
>>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
>>> print(generation_text[0])
A blue bus parked on the side of a road.
```)rb   rv   rw   rc   rx   ry   r   rz   r   N)logitsr   r7   )lossr   r   hidden_states
attentionsr   )r   
isinstanceintslicer   loss_functionr   r6   r7   r   r   r   r   )rB   rb   rv   rw   rx   ry   r   rc   rz   r   r   r\   r   r   slice_indicesr   r   s                    r,   r   FuyuForCausalLM.forward   s    j ** 

'"7')%+

 

  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD &#33!//))
 	
r+   c           
         > [         T
U ]  " U4UUUUUUS.UD6n	U(       d!  UR                  SS5      (       a
  S U	S'   S U	S'   U	$ )N)r   rx   rc   rv   rw   is_first_iterationrz   Trw   rv   )r2   prepare_inputs_for_generationget)rB   rb   r   rx   rc   rv   rw   r   r\   model_inputsrC   s             r,   r   -FuyuForCausalLM.prepare_inputs_for_generation-  sh     w<	
+)''"71	
 	
 "fjjd&C&C48L01,0L)r+   )r   r   )
NNNNNNNNNr   )NNNNNF)r   r   r   r   _tied_weights_keysr   r3   r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r*   r   r   s   @r,   r   r      sY    +,VWz   .2-159.204(,26!%&*%&P
##d*P
 ||d*	P

  %||d2P
 t+P
 &&-P
 P
 ((4/P
 $;P
 t#P
 d
P
 +,P
 
'	'P
  P
j "  r+   r   )r   r   r/   )__doc__rQ   r   cache_utilsr   
generationr   modeling_outputsr   r   modeling_utilsr	   models.auto.modeling_autor
   processing_utilsr   utilsr   r   r   r   r   configuration_fuyur   
get_loggerr   loggerr   r/   r   __all__r   r+   r,   <module>r      s         ) R - 2 & j j * 
		H	% 
6/ 
6 
6 
U# U
Up 
z)? z
zz Br+   