
    3jL                        S r SSKJr  SSKrSSKJr  SSKJr  SSKJr  SSK	J
r
  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJrJrJrJr  SSKJ r   SSK!J"r"  \RF                  " \$5      r%\" SS9\ " S S\5      5       5       r&\" SS9\ " S S\5      5       5       r' " S S\RP                  5      r)\ " S S\5      5       r*\" SS9 " S S \*5      5       r+\" SS9 " S! S"\*\
5      5       r,/ S#Qr-g)$zPyTorch PaliGemmamodel.    )	dataclassN)nn   )Cache)PreTrainedConfig)GenerationMixin)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    custom_introc                   B    \ rS rSr% SrSr\R                  S-  \S'   Sr	g)PaligemmaModelOutputWithPast,   a  
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )
__name__
__module____qualname____firstlineno____doc__r    torchFloatTensor__annotations____static_attributes__r!       j/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   r   ,   s     59**T18r+   r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\S-  \S'   Sr\\R                     S-  \S'   Sr\\R                     S-  \S'   Sr\R                  S-  \S	'   S
rg)PaliGemmaCausalLMOutputWithPast<   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr    r!   )r"   r#   r$   r%   r&   r0   r'   r(   r)   r1   r2   r   r3   tupler4   r    r*   r!   r+   r,   r.   r.   <   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r+   r.   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )PaliGemmaMultiModalProjectorZ   configc                    > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  SS9U l        g )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr9   	__class__s     r,   r>   %PaliGemmaMultiModalProjector.__init__[   s;    ii 4 4 @ @&BVBVBeBelpqr+   c                 (    U R                  U5      nU$ NrC   )rE   image_featuresr3   s      r,   forward$PaliGemmaMultiModalProjector.forward_   s    N3r+   rJ   )	r"   r#   r$   r%   r   r>   rL   r*   __classcell__rF   s   @r,   r7   r7   Z   s    r r r+   r7   c                   L    \ rS rSr% \\S'   SrSrSrS/r	S/r
SrSrSrSrSrS	rg
)PaliGemmaPreTrainedModele   r9   model)imagetextTr7   r2   Fr!   N)r"   r#   r$   r%   r   r)   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr*   r!   r+   r,   rQ   rQ   e   sI    (&*#78#4"5"N"&r+   rQ   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c                     ^  \ rS rSrSrS\4U 4S jjr\\" SS9S\	R                  S\\   S	\\-  4S
 j5       5       rS\	R                   S\	R                  S\	R                  4S jr\\         SS\	R                   S-  S\	R                  S-  S\	R$                  S-  S\	R                   S-  S\S-  S\	R                   S-  S\	R                  S-  S\	R                   S-  S\S-  S\\   S	\\-  4S jj5       5       rSrU =r$ )PaliGemmaModelt   Fr9   c                   > [         TU ]  U5        [        R                  " UR                  S9U l        [        U5      U l        UR                  R                  U l	        [        R                  " UR                  S9nX l
        U R                  R                  5       R                  =(       d    U R                  U l        U R                  5         g )N)r9   )r=   r>   r   from_configr@   vision_towerr7   multi_modal_projectortext_config
vocab_sizelanguage_modelr9   get_text_configdtypetext_config_dtype	post_init)rE   r9   ri   rF   s      r,   r>   PaliGemmaModel.__init__}   s     %119M9MN%A&%I" ,,77"..f6H6HI,!%!<!<!>!D!D!R

r+   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   pixel_valueskwargsreturnc                 r    U R                   " U40 UD6nUR                  nU R                  U5      nXSl        U$ rI   )re   last_hidden_staterf   pooler_output)rE   ro   rp   image_outputsselected_image_featurerK   s         r,   get_image_features!PaliGemmaModel.get_image_features   sB     )),A&A!.!@!@334JK&4#r+   	input_idsinputs_embedsrK   c           	      &   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S   UR                  S   -  nUR                  S5      R                  UR                  5      n[        XRR                  S   -  UR                  5       :H  SU SU 35        U$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)rk   devicer   r   z6Image features and image tokens do not match, tokens: z, features: )get_input_embeddingsr'   tensorr9   image_token_idlongr|   allsumshape	unsqueezetor   numel)rE   ry   rz   rK   special_image_maskn_image_tokensn_image_featuress          r,   get_placeholder_mask#PaliGemmaModel.get_placeholder_mask   s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1)//2^5I5I!5LL/99"=@@AUAUV00448L8L8NND^DTT`aq`rs	
 "!r+   Nattention_maskposition_idsr2   token_type_idslabels	use_cachec
           	         USL USL-  (       a  [        S5      eUbQ  U R                  R                  U R                  :  a-  XR                  R                  :H  nUR	                  5       nSX'   OUnUc  U R                  5       " U5      nUcX  Ub  UR                  5       OSn[        R                  " UR                  S   UR                  S9U-   nUR                  S5      S-   nUbb  U R                  U5      R                  nUR                  UR                  UR                  5      nU R!                  XUS9nUR#                  X5      nU R                  R%                  5       UUUUS.nUSL =(       d    UR&                  (       + =(       d    USLnUb%  U(       a  [        R(                  " US:H  SS5      US	'   [+        S0 UD6n[-        U R                  R.                  S
S5      b  UR1                  5       nU[3        S0 UD6S.nU R4                  " SUUUUU	S.U
D6n[7        UR8                  UR:                  UR<                  UR>                  Ub  WS9$ SS9$ )
  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO
>>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

>>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read()))

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r|   )rz   rK   )r9   rz   r   r2   r   r}   block_sequence_idssliding_window)full_attentionsliding_attention)r   r   r2   rz   r   )rs   r2   r3   r4   r    r!   ) 
ValueErrorr9   r   rh   cloner~   get_seq_lengthr'   aranger   r|   r   rw   rt   r   rk   r   masked_scatterrj   is_initializedwherer	   getattrrg   copyr   ri   r   rs   r2   r3   r4   )rE   ry   ro   r   r   r2   r   rz   r   r   rp   r   llm_input_idspast_seen_tokensrK   mask_kwargsis_first_iterationcausal_masksliding_mask_kwargsoutputss                       r,   rL   PaliGemmaModel.forward   s   Z -t";<YZZ  T[[%?%?4??%R!*kk.H.H!H%OO-M01M-%M  557FMCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4q8L #!44\BPPN+..}/C/C]EXEXYN!%!:!:~ "; " *889K\M kk113*,.(
 -4vO<Z<Z8Zv^jrv^v%*<05Na<OQRTV0WK,- )7;74;;**,<dCO"-"2"2"4"-%F%]I\%]K
 %% 
&%+'
 
 ,%77#33!//))2>2J
 	

 QU
 	
r+   )ri   rf   rl   re   rh   )	NNNNNNNNN)r"   r#   r$   r%   accepts_loss_kwargsr   r>   r   r   r'   r(   r   r   r5   r   rw   
LongTensorr   Tensorr   boolr   r   rL   r*   rN   rO   s   @r,   ra   ra   t   s     
 
 n!--9?@R9S	+	+ "))":?:K:K"]b]n]n"0  .215.204(,2626*.!%l
##d*l
 ''$.l
 t+	l

 &&-l
 l
 ((4/l
 ((4/l
   4'l
 $;l
 -.l
 
-	-l
  l
r+   ra   c                     ^  \ rS rSrSS0rS\4U 4S jjr\S\R                  S\
\   4S j5       r\\          SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\\R                  -  S\
\   S\\-  4S jj5       5       r          SU 4S jjr\  SS\S\R                  S\R                  S	-  S\S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\4S jj5       rSrU =r$ )!PaliGemmaForConditionalGenerationi   zlm_head.weightz(model.language_model.embed_tokens.weightr9   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFr;   )r=   r>   ra   rS   r   r?   rg   rA   rh   lm_headrm   rD   s     r,   r>   *PaliGemmaForConditionalGeneration.__init__(  sS     #F+
yy!3!3!?!?ASASA^A^ejkr+   ro   rp   c                 <    U R                   R                  " U40 UD6$ rI   )rS   rw   )rE   ro   rp   s      r,   rw   4PaliGemmaForConditionalGeneration.get_image_features.  s    zz,,\DVDDr+   Nry   r   r   r2   r   rz   r   r   logits_to_keeprq   c                    U R                   " SUUUUUUUU	US.	UD6nUS   n[        U
[        5      (       a  [        U
* S5      OU
nU R	                  USS2USS24   5      nSnUb3  U R
                  " SXU R                  R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )r   )	ry   ro   r   r   r   r2   rz   r   r   r   N)r1   r   rh   )r0   r1   r2   r3   r4   r    r!   )rS   
isinstanceintslicer   loss_functionr9   rg   rh   r.   r2   r3   r4   r    )rE   ry   ro   r   r   r2   r   rz   r   r   r   rp   r   r3   slice_indicesr1   r0   s                    r,   rL   )PaliGemmaForConditionalGeneration.forward2  s    Z ** 
%))%+'
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% 9P9P9[9[_eD /#33!//)) ' ; ;
 	
r+   c                    > [         TU ]  " U4UUUUUU	UUS.UD6nUR                  S5      b  US   S-   US'   U(       d  U(       d  X]S'   U$ )N)r2   rz   r   r   r   r   r   r   r   r   ro   )r=   prepare_inputs_for_generationget)rE   ry   r2   rz   r   ro   r   r   r   r   r   r   rp   model_inputsrF   s                 r,   r   ?PaliGemmaForConditionalGeneration.prepare_inputs_for_generation  sz      w<
+')%))1
 
 N+7+7+G!+KL( Y+7(r+   r   c           	          [         R                  " / UR                  5       S S QSUR                  S9nUb  [         R                  " US:H  SS5      n[        U R                  5       UUUUUS9$ )Nr}   r   r   )r9   rz   r   r   r2   r   )r'   fullsizer|   r   r
   rj   )	r9   rz   r   r2   r   r   r   rp   	group_idss	            r,   r
   ;PaliGemmaForConditionalGeneration.create_masks_for_generate  sy     JJ;!3!3!5cr!:;RH\H\]	% Na$7B?I())+'()+%
 	
r+   )r   rS   )
NNNNNNNNNr   )
NNNNNNTNNF)NF)r"   r#   r$   r%   _tied_weights_keysr   r>   r   r'   r(   r   r   rw   r   r   r   r   r   r   r5   r.   rL   r   staticmethodr   dictr
   r*   rN   rO   s   @r,   r   r      s-    +,VW  Eu/@/@ EFSeLf E E  .215.204(,2626*.!%-.J
##d*J
 ''$.J
 t+	J

 &&-J
 J
 ((4/J
 ((4/J
   4'J
 $;J
 ell*J
 +,J
 
0	0J
  J
^  )V  /3*/
 
||
 t+
 	

 llT)
 t+
 !4K
 

 
r+   r   )r   rQ   ra   ).r&   dataclassesr   r'   r   cache_utilsr   configuration_utilsr   
generationr   masking_utilsr	   r
   r   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   autor   configuration_paligemmar   
get_loggerr"   loggerr   r.   Moduler7   rQ   ra   r   __all__r!   r+   r,   <module>r      s:    !     3 ) m m B S - &   4 
		H	% 
 9#: 9 9 
 9k 9 90299  ' ' ' 
d
- d

d
N 
^
(@/ ^

^
B ^r+   