
    3jI                        S r SSKrSSKJr  SSKJr  SSKJr  SSKrSSKJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJrJrJr  SSKJrJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-  SSK.J/r/J0r0  SSK1J2r2J3r3J4r4  SSK5J6r6J7r7J8r8  \*Rr                  " \:5      r;\(\ " S S\5      5       5       r<\(" SS9\ " S S\&5      5       5       r= " S S \	R|                  5      r? SNS!\	R|                  S"\R                  S#\R                  S$\R                  S%\R                  S-  S&\AS'\A4S( jjrB " S) S*\	R|                  5      rC " S+ S,\	R|                  5      rD " S- S.\5      rE\( " S/ S0\ 5      5       rF " S1 S2\	R|                  5      rG " S3 S4\F5      rH " S5 S6\	R|                  5      rI " S7 S8\	R|                  5      rJ " S9 S:\	R|                  5      rK " S; S<\	R|                  5      rL " S= S>\	R|                  5      rM " S? S@\5      rN " SA SB\	R|                  5      rO " SC SD\	R|                  5      rP " SE SF\F5      rQ\(" SGS9 " SH SI\F5      5       rR\(" SJS9 " SK SL\F\5      5       rS/ SMQrTg)OzPyTorch InstructBLIP model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GenerationMixin)create_bidirectional_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentionsCausalLMOutputWithPastSeq2SeqLMOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipConfigInstructBlipQFormerConfigInstructBlipVisionConfigc                   B    \ rS rSr% SrSr\S-  \S'   Sr\	S-  \S'   Sr
g)'BaseModelOutputWithVisionQformerOutputs3   z
vision_outputs (`BaseModelOutputWithPooling`):
    Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
    Outputs of the Q-Former (Querying Transformer).
Nvision_outputsqformer_outputs )__name__
__module____qualname____firstlineno____doc__r,   r   __annotations__r-   r   __static_attributes__r.       p/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/instructblip/modeling_instructblip.pyr*   r*   3   s)     9=N.5<KOOADHOr6   r*   zQ
    Class defining the outputs of [`InstructBlipForConditionalGeneration`].
    )custom_introc                       \ rS rSr% SrSr\\R                     S-  \	S'   Sr
\\R                     S-  \	S'   Sr\S-  \	S'   Sr\S-  \	S'   Sr\\-  S-  \	S'   S	\\   4S
 jrSrg)/InstructBlipForConditionalGenerationModelOutputA   a~  
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
    Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
    Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
    Outputs of the language model.
Nlosslogitsr,   r-   language_model_outputsreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f)r,   r-   r>   N)getattrto_tuple).0kselfs     r7   	<genexpr>KInstructBlipForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>]   sC      
 ! WW Gq!**,- !s   25)tuplekeysrG   s   `r7   rD   8InstructBlipForConditionalGenerationModelOutput.to_tuple\   s%     
 YY[	
 
 	
r6   r.   )r/   r0   r1   r2   r3   r<   rJ   torchFloatTensorr4   r=   r,   r   r-   r   r>   r   r   r   rD   r5   r.   r6   r7   r:   r:   A   s     -1D%!!
"T
)0.2FE%##$t+28<N.5<KOOADHONR2_DtKR
%* 
r6   r:   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\S\R                  4S jjrSrU =r$ )InstructBlipVisionEmbeddingsf   configc                 r  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " SSU R                  5      5      U l        [        R                  " SU R                  U R                  U R                  S9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R                  " [        R                  " SU R                  U R                  5      5      U l        g )Nr%   r   )in_channelsout_channelskernel_sizestrider!   )super__init__rS   hidden_size	embed_dim
image_size
patch_sizer   	ParameterrN   randnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingrG   rS   	__class__s     r7   rZ   %InstructBlipVisionEmbeddings.__init__g   s    ++ ++ ++!||EKK1dnn,MN!yyDOO\`\k\k 
 !OOt>1D!--1"$,,u{{1d>P>PRVR`R`/a"br6   
embeddingsheightwidthr?   c                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r%   Ng      ?r   r   r!   bicubicF)sizemodealign_cornersdim)shaperf   rN   jit
is_tracingr^   r   reshapepermuter   
functionalinterpolateviewcat)rG   rj   rk   rl   rd   re   class_pos_embedpatch_pos_embedrt   
new_height	new_widthsqrt_num_positionss               r7   interpolate_pos_encoding5InstructBlipVisionEmbeddings.interpolate_pos_encodingy   sS    !&&q)A-//55a81< yy##%%+*F6?***11!RaR%811!QR%8r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr6   pixel_valuesr   c                    UR                   u  p4pVU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      R	                  U5      n	[        R                  " X/SS9n
U(       a  U R                  XU5      nOU R                  nXS S 2S U
R                  S5      2S S 24   R	                  U5      -   n
U
$ )N)dtyper!   r%   rn   rs   )ru   rc   weightr   toflatten	transposera   expandrN   r}   r   rf   rp   )rG   r   r   
batch_size_rk   rl   target_dtypepatch_embedsclass_embedsrj   rf   s               r7   forward$InstructBlipVisionEmbeddings.forward   s    '3'9'9$
v++2288++LOO,O,OP#++A.88A>++22:q"EHHVYY;C
#!%!>!>zSX!Y!%!8!8Q8L*//!:L8La5O"P"S"ST`"aa
r6   )	ra   rS   r\   r]   rd   re   rc   r^   rf   F)r/   r0   r1   r2   r(   rZ   rN   Tensorintr   rO   boolr   r5   __classcell__rh   s   @r7   rQ   rQ   f   sr    c7 c$&D5<< &D &DUX &D]b]i]i &DPE$5$5 QU bgbnbn  r6   rQ   modulequerykeyvalueattention_maskscalingdropoutc                 `   [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  USS9n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nrn   rs   )ptrainingr%   r!   )	rN   matmulr   r   rz   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r7   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2(>L==((6??([L,,|3K''1-88:K$$r6   c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	S\R                  S	\
\R                  \R                  S
-  \
\R                     S
-  4   4S jrSrU =r$ )InstructBlipAttention   z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        SU l
        UR                  U l        [        R                  " U R                  SU R                  -  SS9U l        UR                  (       ai  [        R                   " ["        R$                  " U R                  5      5      n[        R                   " ["        R$                  " U R                  5      5      nOS nS nUbQ  ["        R&                  " U["        R(                  " USS9U45      n[        R                   " U5      U R                  l        [        R                  " U R                  U R                  5      U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr   )bias)requires_grad)rY   rZ   rS   r[   r\   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   Linearqkvqkv_biasr_   rN   zerosr}   
zeros_liker   
projection)rG   rS   q_biasv_biasr   rh   s        r7   rZ   InstructBlipAttention.__init__   ss   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!9 99T^^Q-?eL??\\%++dnn"=>F\\%++dnn"=>FFFyy&%*:*:6QV*WY_!`aHLL2DHHM))DNNDNNCr6   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr%   r!   )r|   r   r   r   r   )rG   r   r   r   s       r7   _shapeInstructBlipAttention._shape   s5    {{3GQQRSUVWbbddr6   hidden_statesr?   Nc                    UR                  5       u  p4nU R                  U5      nUR                  X4SU R                  XPR                  -  5      R	                  SSSSS5      nUS   US   US   pn[
        R                  " U R                  R                  [        5      n
U
" U UUU	4SU R                  (       d  SOU R                  U R                  S.UD6u  pUR                  X4S	5      R                  5       nU R                  U5      nX4$ )
z#Input shape: Batch x Time x Channelr   r!   r   r%      N        )r   r   r   rn   )rp   r   rx   r   ry   r   get_interfacerS   _attn_implementationr   r   r   r   r   r   )rG   r   r   r   tgt_lenr\   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   s                r7   r   InstructBlipAttention.forward   s    #0"4"4"6iHH]+	%%cAt~~yTbTbGbckkq!Q
	 2;1y|YWX\,(?(M(MKK,,.E)
 %8		%

  #}}C$2H2HJJ	%
 	%
! "))#;FFHook2((r6   )	r   rS   r\   r   r   r   r   r   r   )r/   r0   r1   r2   r3   rZ   rN   r   r   r   rJ   r   r5   r   r   s   @r7   r   r      su    GD>eU\\ eC ec e")||") 
u||U\\D0%2E2LL	M	") ")r6   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InstructBlipMLPi  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g N)rY   rZ   rS   r	   
hidden_actactivation_fnr   r   r[   intermediate_sizefc1fc2rg   s     r7   rZ   InstructBlipMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJr6   r   r?   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   rG   r   s     r7   r   InstructBlipMLP.forward  s4    /**=9/r6   )r   rS   r   r   
r/   r0   r1   r2   rZ   rN   r   r   r5   r   r   s   @r7   r   r     s)    KU\\ ell  r6   r   c                   ~   ^  \ rS rSrS\4U 4S jjr\S\R                  S\	\
   S\R                  4S j5       rSrU =r$ )	InstructBlipEncoderLayeri$  rS   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g Neps)rY   rZ   r[   r\   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rg   s     r7   rZ   !InstructBlipEncoderLayer.__init__%  sm    ++.v6<<F<Q<QR"6*<<F<Q<QRr6   r   r   r?   c                     UnU R                  U5      nU R                  " SSU0UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )Nr   r.   )r   r   r   r   )rG   r   r   residualr   s        r7   r    InstructBlipEncoderLayer.forward-  su     !((7>> 
'

 &0 ((7/%0r6   )r\   r   r   r   r   )r/   r0   r1   r2   r&   rZ   r   rN   r   r   r   rO   r   r5   r   r   s   @r7   r   r   $  sR    S1 S || +, 
			 r6   r   c                      ^  \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSr/ SQr\R                   " 5       U 4S j5       rSrU =r$ )	InstructBlipPreTrainedModeliD  rS   blip)imagetextT)InstructBlipQFormerEmbeddingsr   r   InstructBlipQFormerLayer%InstructBlipQFormerMultiHeadAttentionInstructBlipQFormerSelfOutputc                 V  > [         TU ]  U5        U R                  R                  n[	        U[
        5      (       aA  [        R                  " UR                  SUS9  [        R                  " UR                  SUS9  g[	        U[        [        45      (       a!  [        R                  " UR                  5        g[	        U[        5      (       a\  [        R                  " UR                   ["        R$                  " UR                   R&                  S   5      R)                  S5      5        gg)zInitialize the weightsr   )meanstdrn   r%   rn   N)rY   _init_weightsrS   initializer_range
isinstancerQ   inittrunc_normal_rf   ra   $InstructBlipForConditionalGenerationInstructBlipModelzeros_query_tokensr   copy_position_idsrN   arangeru   r   )rG   r   factorrh   s      r7   r   )InstructBlipPreTrainedModel._init_weightsZ  s     	f%..f:;;v88sOv55CVL!EGX YZZKK++, =>>JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh ?r6   r.   )r/   r0   r1   r2   r&   r4   base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesrN   no_gradr   r5   r   r   s   @r7   r   r   D  s\    (&*#"&N! ]]_
i 
ir6   r   c                   \   ^  \ rS rSrSrS\4U 4S jjr\S\\	   S\
\-  4S j5       rSrU =r$ )	InstructBlipEncoderii  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InstructBlipEncoderLayer`].

Args:
    config (`InstructBlipConfig`):
        The corresponding vision configuration for the `InstructBlipEncoder`.
rS   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
rY   rZ   rS   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)rG   rS   r   rh   s      r7   rZ   InstructBlipEncoder.__init__s  sT    mmuU[UmUmOn$oOn!%=f%EOn$op&+# %p   A&r   r?   c                 P    UnU R                    H  nU" U40 UD6nM     [        US9$ )Nlast_hidden_state)r  r   )rG   inputs_embedsr   r   encoder_layers        r7   r   InstructBlipEncoder.forwardy  s9     &![[M)M ) ??r6   )rS   r   r  )r/   r0   r1   r2   r3   r&   rZ   r   r   r   rJ   r   r   r5   r   r   s   @r7   r  r  i  sL    ,1 , @ +,@ 
	 	@ @r6   r  c                      ^  \ rS rSr% SrSr\\S'   \\	S.r
S\4U 4S jjr\\" SS9\  SS\R                   S	-  S
\S\\   S\\-  4S jj5       5       5       rS rSrU =r$ )InstructBlipVisionModeli  r   )r   rS   )r   
attentionsc                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r   )rY   rZ   rS   r[   rQ   rj   r  encoderr   r   r   post_layernorm	post_init)rG   rS   r\   rh   s      r7   rZ    InstructBlipVisionModel.__init__  sY     &&	6v>*62 ll9:O:OPr6   F)tie_last_hidden_statesNr   r   r?   c                     Uc  [        S5      eU R                  XS9nU R                  " SSU0UD6nUR                  nU R	                  U5      nUS S 2SS S 24   nU R	                  U5      n[        UUS9$ )Nz You have to specify pixel_values)r   r&  r   r%  pooler_outputr.   )r   rj   r-  r%  r.  r   )rG   r   r   r   r   encoder_outputsr%  pooled_outputs           r7   r   InstructBlipVisionModel.forward  s     ?@@h+/<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
r6   c                     U R                   $ r   )rj   rL   s    r7   get_input_embeddings,InstructBlipVisionModel.get_input_embeddings  s    r6   )rS   rj   r-  r.  r  )r/   r0   r1   r2   main_input_namer  r(   r4   r   r   _can_record_outputsrZ   r   r    r   rN   rO   r   r   r   rJ   r   r   r9  r5   r   r   s   @r7   r*  r*    s    $O!$$1+
	7 	  E2 26).
''$.
 #'
 +,	

 
+	+
  3  
6 r6   r*  c                   h   ^  \ rS rSrSU 4S jjrS rS rS rS rS r	   SS\
\   4S	 jjrS
rU =r$ )r   i  c                   > [         TU ]  5         Xl        UR                  UR                  -  S:w  a5  [        US5      (       d$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        U(       aa  [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        O`[        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        SU l        g )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)F)rY   rZ   rS   r[   r   hasattrr   r   attention_head_sizeall_head_sizer   r   r   encoder_hidden_sizer   r   Dropoutattention_probs_dropout_probr   save_attentionrG   rS   is_cross_attentionrh   s      r7   rZ   .InstructBlipQFormerMultiHeadAttention.__init__  sb    : ::a?PVXhHiHi^%%v'A'ABC 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
yy!;!;T=O=OPDH6#=#=t?Q?QRDJyy!3!3T5G5GHDH6#5#5t7I7IJDJzz&"E"EF#r6   c                     Xl         g r   attn_gradients)rG   rL  s     r7   save_attn_gradients9InstructBlipQFormerMultiHeadAttention.save_attn_gradients  s    ,r6   c                     U R                   $ r   rK  rL   s    r7   get_attn_gradients8InstructBlipQFormerMultiHeadAttention.get_attn_gradients  s    """r6   c                     Xl         g r   attention_map)rG   rT  s     r7   save_attention_map8InstructBlipQFormerMultiHeadAttention.save_attention_map  s    *r6   c                     U R                   $ r   rS  rL   s    r7   get_attention_map7InstructBlipQFormerMultiHeadAttention.get_attention_map  s    !!!r6   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nrn   r   r!   r%   r   )rp   r   rA  r|   ry   )rG   xnew_x_shapes      r7   transpose_for_scores:InstructBlipQFormerMultiHeadAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$r6   r   c                    US LnU(       aC  U R                  U R                  U5      5      nU R                  U R                  U5      5      nUnO@U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      n	U R                  U	5      n
[        R
                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUR                  nUb  X-   n[        R                  " SS9" U5      R                  U5      nU(       a=  U R                  (       a,  U R                  U5        UR!                  U R"                  5        U R%                  U5      n[        R
                  " X5      nUR'                  SSSS5      R)                  5       nUR+                  5       S S U R,                  4-   nUR.                  " U6 nX4$ )Nrn   r   rs   r   r!   r%   r   )r]  r   r   r   rN   r   r   mathsqrtrA  r   r   Softmaxr   rF  rU  register_hookrM  r   ry   r   rp   rB  r|   )rG   r   r   encoder_hidden_statesencoder_attention_maskr   rH  	key_layervalue_layermixed_query_layerquery_layerattention_scoresattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                    r7   r   -InstructBlipQFormerMultiHeadAttention.forward  s    3$>11$((;P2QRI33DJJ?T4UVK3N11$((=2IJI33DJJ}4MNK JJ}5//0AB !<<5H5HR5PQ+dii8P8P.QQ!1!7!7%/@ **,-=>AABXY$"5"5##O4))$*B*BC #',,"?%<J%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD--r6   )rB  rA  rT  rL  rS   r   r   r   r   rF  r   r   NNN)r/   r0   r1   r2   rZ   rM  rP  rU  rX  r]  r   r   r   r5   r   r   s   @r7   r   r     sF    $0-#+"% "#4. +,4. 4.r6   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )r   i!  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g r   )rY   rZ   r   r   r[   denser   r   rD  hidden_dropout_probr   rg   s     r7   rZ   &InstructBlipQFormerSelfOutput.__init__"  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r6   r   input_tensorr?   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rt  r   r   rG   r   rw  s      r7   r   %InstructBlipQFormerSelfOutput.forward(  5    

=1]3}'CDr6   r   rt  r   r   r   s   @r7   r   r   !  6    >U\\  RWR^R^  r6   r   c                      ^  \ rS rSrSU 4S jjr   SS\R                  S\R                  S-  S\R                  S-  S\R                  S-  S\\	   S	\R                  4S
 jjr
SrU =r$ )InstructBlipQFormerAttentioni0  c                 b   > [         TU ]  5         [        X5      U l        [	        U5      U l        g r   )rY   rZ   r   	attentionr   outputrG  s      r7   rZ   %InstructBlipQFormerAttention.__init__1  s&    >vZ3F;r6   Nr   r   rd  re  r   r?   c                 Z    U R                   " SUUUUS.UD6u  pgU R                  Xa5      nU$ )N)r   r   rd  re  r.   r  r  )	rG   r   r   rd  re  r   r   r   attention_outputs	            r7   r   $InstructBlipQFormerAttention.forward6  sF      
')"7#9	

 
  ;;{Br6   r  r   rq  )r/   r0   r1   r2   rZ   rN   r   rO   r   r   r   r5   r   r   s   @r7   r  r  0  s    < 48:>;? ||  ))D0   %0047	 
 !& 1 1D 8  +,  
   r6   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InstructBlipQFormerIntermediateiJ  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rY   rZ   r   r   r[   r   rt  r  r   strr	   intermediate_act_fnrg   s     r7   rZ   (InstructBlipQFormerIntermediate.__init__K  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r6   r   r?   c                 J    U R                  U5      nU R                  U5      nU$ r   rt  r  r   s     r7   r   'InstructBlipQFormerIntermediate.forwardS  s&    

=100?r6   r  r   r   s   @r7   r  r  J  s(    9U\\ ell  r6   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )InstructBlipQFormerOutputiZ  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rY   rZ   r   r   r   r[   rt  r   r   rD  ru  r   rg   s     r7   rZ   "InstructBlipQFormerOutput.__init__[  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r6   r   rw  r?   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   ry  rz  s      r7   r   !InstructBlipQFormerOutput.forwarda  r|  r6   r}  r   r   s   @r7   r  r  Z  r~  r6   r  c                   T   ^  \ rS rSrU 4S jr    SS\\   4S jjrS rS r	Sr
U =r$ )	r   ih  c                 ^  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        X l        X!R                  -  S:X  a  [	        USS9U l        SU l	        OSU l	        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l        g )Nr%   r   T)rH  F)rY   rZ   chunk_size_feed_forwardseq_len_dimr  r  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionr  intermediater  r  intermediate_queryoutput_queryrG   rS   r  rh   s      r7   rZ   !InstructBlipQFormerLayer.__init__i  s    '-'E'E$5f="7771<">vZ^"_D'+D$',D$;FC/7"A&"I5f=r6   r   c           
      l   U R                   " U4SU0UD6nUS:  a  US S 2S U2S S 24   nU R                  (       a%  Uc  [        S5      eU R                  " U4UUUS.UD6n[	        U R
                  U R                  U R                  U5      n	UR                  S   U:  ag  [	        U R                  U R                  U R                  US S 2US 2S S 24   5      R                  U	R                  5      n
[        R                  " X/SS9n	U	$ [	        U R                  U R                  U R                  U5      n	U	$ )Nr   r   z>encoder_hidden_states must be given for cross-attention layers)r   rd  re  r%   rs   )r  r  r   r  r   feed_forward_chunk_queryr  r  ru   feed_forward_chunkr   devicerN   r}   )rG   r   r   rd  re  query_lengthr   r  query_attention_outputlayer_outputlayer_output_texts              r7   r    InstructBlipQFormerLayer.forward}  si     >>
)
 
 !%5a,6I%J"''(0$%eff)-)<)<**#1*?+A	*
 *& 5--,,  &	L  %%a(<7$=++00$$$Qq%89	%
 "\(() "  %yy,)JPQR  5'',,   	L r6   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r  rG   r  intermediate_outputr  s       r7   r  +InstructBlipQFormerLayer.feed_forward_chunk  s)    "//0@A{{#6Ir6   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r  r  s       r7   r  1InstructBlipQFormerLayer.feed_forward_chunk_query  s+    "556FG(()<Or6   )
r  r  r  r  r  r  r  r  r  r  NNNr   )r/   r0   r1   r2   rZ   r   r   r   r  r  r5   r   r   s   @r7   r   r   h  s;    >. "#3 +,3j
 r6   r   c                   R   ^  \ rS rSrU 4S jr\    SS\\   4S jj5       rSr	U =r
$ )InstructBlipQFormerEncoderi  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf r  )
rY   rZ   rS   r   r  r  r  r   layerr   r  s      r7   rZ   #InstructBlipQFormerEncoder.__init__  sY    ]]JOPVPhPhJijJiY%f8Jij

 ',# kr"  r   c                     [        U R                  R                  5       H   nU R                  U   nU" UUU4UUS.UD6nM"     [	        US9$ )N)re  r  r$  )r  rS   r  r  r   )	rG   r   r   rd  re  r  r   ilayer_modules	            r7   r   "InstructBlipQFormerEncoder.forward  sf     t{{445A::a=L(% (>) M 6 9+
 	
r6   )rS   r   r  r  )r/   r0   r1   r2   rZ   r   r   r   r   r5   r   r   s   @r7   r  r    s:    ,  "#
 +,
 
r6   r  c                   >   ^  \ rS rSrSrU 4S jr    SS jrSrU =r$ )r   i  z;Construct the embeddings from word and position embeddings.c                 "  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R!                  S["        R$                  " UR                  5      R'                  S5      SS9  Xl        g )N)padding_idxr   r
  r   F)
persistent)rY   rZ   r   	Embedding
vocab_sizer[   pad_token_idword_embeddingsmax_position_embeddingsposition_embeddingsr   r   	layernormrD  ru  r   register_bufferrN   r  r   rS   rg   s     r7   rZ   &InstructBlipQFormerEmbeddings.__init__  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 r6   c                    Ub  UR                  5       S   nOSnUc%  U R                  S S 2XEU-   24   R                  5       nUbY  U R                  U5      nU R	                  UR                  UR                  5      5      nXg-   nUb  [        R                  " X64SS9nOUnUR                  U R                  R                  R                  5      nU R                  U5      nU R                  U5      nU$ )Nr%   r   rs   )rp   r
  cloner  r  r   r  rN   r}   r  r   r   r   )rG   	input_idsr
  query_embedspast_key_values_length
seq_lengthrj   r  s           r7   r   %InstructBlipQFormerEmbeddings.forward  s      ")!,JJ,,Q0FVlIl0l-lmssuL --i8J"&":":<??:K\K\;]"^#9J'"YY'AqI
%J]]4>>#8#8#>#>?
^^J/
\\*-
r6   )rS   r   r  r  r  r  )	r/   r0   r1   r2   r3   rZ   r   r5   r   r   s   @r7   r   r     s#    E"   r6   r   c                     ^  \ rS rSrSrSrSrSrSr\	\
" \SSS9/\
" \SSS9/S.rS	\4U 4S
 jjrS rS r\\\     SS\R*                  S\R,                  S-  S\R*                  S-  S\R.                  S-  S\R,                  S-  S\R,                  S-  S\\   S\\R,                     \-  4S jj5       5       5       rSrU =r$ )InstructBlipQFormerModeli  z
Querying Transformer (Q-Former), used in InstructBLIP. Slightly modified from BLIP-2 as it also takes the
instruction as input.
Fr%   z
.attention)index
layer_namez.crossattention)r   r+  cross_attentionsrS   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r   )rY   rZ   rS   r   rj   r  r-  r/  rg   s     r7   rZ   !InstructBlipQFormerModel.__init__)  s7     7?1&9r6   c                 .    U R                   R                  $ r   rj   r  rL   s    r7   r9  -InstructBlipQFormerModel.get_input_embeddings3  s    ...r6   c                 $    XR                   l        g r   r  )rG   r   s     r7   set_input_embeddings-InstructBlipQFormerModel.set_input_embeddings6  s    */'r6   Nr  r   r
  r  rd  re  r   r?   c                 >   Uc  Uc  [        S5      eUb  UR                  S   OSnU R                  UUUS9n	[        U R                  U	US9nUb  [        U R                  U	UUS9nU R
                  " U	4UUUUS.UD6n
U
R                  nUSS2SSS24   n[        UUS	9$ )
a  
query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
    Hidden states to be used in the attention computation. If cross-attention,
    will be used for the query (i.e., key and value will use the encoder_hidden_states).
Nz7You have to specify query_embeds when input_ids is Noner%   r   )r  r
  r  )rS   r&  r   )rS   r&  r   rd  )r   rd  re  r  r3  )r   ru   rj   r   rS   r-  r%  r   )rG   r  r   r
  r  rd  re  r   r  embedding_outputr5  sequence_outputr6  s                r7   r    InstructBlipQFormerModel.forward9  s    $ !5VWW0<0H|))!,a??%% + 
 3;;*)
 "-%>{{.5&;	&" ,0<<,
)"7#9%,
 ,
 *;;'1a0;-'
 	
r6   )rS   rj   r-  )NNNNN)r/   r0   r1   r2   r3   r  r  r  r  r   r   r   r<  r'   rZ   r9  r  r   r    r   rN   
LongTensorrO   r   r   r   rJ   r   r   r5   r   r   s   @r7   r  r    sB   
 #( N 2@Vbc
 @Vgh
8 /0   4804,0:>;?6
##6
 ))D06
 &&-	6

 llT)6
  %00476
 !& 1 1D 86
 +,6
 
u  	!$P	P6
    6
r6   r  z[
    InstructBLIP base Model consisting of language model, qformer and vision encoder.
    c                     ^  \ rS rSrSrS/rS\4U 4S jjrS rS\	R                  S\	R                  4S	 jr\\       SS\	R                  S\	R                  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\	R                  S
-  S\S\\   S\\-  4S jj5       5       rSrU =r$ )r  iu  r   r  rS   c                   > [         TU ]  U5        [        UR                  5      U l        [
        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        UR                  5      U l        [
        R                  " UR                  R                  UR                   R                  5      U l        [$        R&                  " UR                   5      U l        U R+                  5         g Nr%   )rY   rZ   r*  vision_configvision_modelr   r_   rN   r   num_query_tokensqformer_configr[   r  r  qformerr   text_configlanguage_projectionr"   from_configlanguage_modelr/  rg   s     r7   rZ   InstructBlipModel.__init__~  s     3F4H4HILLQ8O8OQWQfQfQrQr)st/0E0EF#%99V-B-B-N-NPVPbPbPnPn#o '33F4F4FG 	r6   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggz
Some pre-processing hacks to make the model `accelerate` compatible. Check
https://github.com/huggingface/transformers/pull/21707 for more details.
r%   r  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_maplenrN   cudadevice_countloggerwarningr@  r  r  io_same_devicerG   r  s     r7   _preprocess_accelerate(InstructBlipModel._preprocess_accelerate  |    
 **}!&6m&KPUPZPZPgPgPilmPmNNM 4&&
33:>D((7 4r6   r  r&  c           	      d   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  UR                  5      nU$ zJ
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
r   r  rn   
r9  rN   r   rS   image_token_idlongr  all	unsqueezer   rG   r  r&  special_image_masks       r7   get_placeholder_mask&InstructBlipModel.get_placeholder_mask       !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H/99"=@@AUAUV!!r6   Nqformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskr   r   r?   c
           	      X   U R                   " SUU	S.U
D6nUS   n[        R                  " UR                  5       SS [        R                  UR
                  S9nU R                  R                  UR                  S   SS5      n[        R                  " UR                  5       SS [        R                  UR
                  S9nUc  [        R                  " U5      nUR                  UR
                  5      n[        R                  " X/SS9nU R                  " SUUUUUS.U
D6nUS   SS2SUR                  S5      2SS24   nUc9  U R                  R                  5       " U5      nUc  [        R                  " U5      nU R                  U5      nUR                  UR
                  UR                   5      nU R#                  XHS	9nUR%                  UU5      nU R&                  R(                  (       a  U R                  " SUUS
.U
D6nOU R                  " SUUUUS.U
D6n[+        UUUS9$ )aK  
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
    to serve as text prompt, which the Q-Former model will encode.

    Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    Only relevant in case an encoder-decoder language model (like T5) is used.
)r   r   r   Nrn   r  r%   rs   )r  r   r  rd  re  r&  r&  r   )r&  r   r  r  rB   r.   )r  rN   onesrp   r  r  r  r   ru   	ones_liker   r}   r  r  r9  r  r   r
  masked_scatterrS   use_decoder_only_language_modelr:   )rG   r   r  r  r  r   r  r  r&  r   r   r,   image_embedsimage_attention_maskr  query_attention_maskquery_outputsquery_outputlanguage_model_inputsr	  outputss                        r7   r   InstructBlipModel.forward  sX   P ** 
%%=
 

 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"!7!:!:;O;V;V!W!&,@+Y_`!a 
'1%".#7
 
 %Q'+A\->->q-A+A1(DE  //DDFyQM%!&!; !% 8 8 F 5 8 89M9M}ObOb c!66y6^%445GI^_;;66)) +- G )) +-"3'=	
 G ?))#*
 	
r6   r  r  r  r  r  )NNNNNNF)r/   r0   r1   r2   r;  _keep_in_fp32_modulesr&   rZ   r  rN   r  rO   r
  r   r   r   r   r   r   rJ   r:   r   r5   r   r   s   @r7   r  r  u  sH    %O+,1 ?("e.>.> "uO`O` " 
 ;?.22659:>-1)._
''_
 !,,_
 !& 0 04 7	_

 $$t+_
 ((4/_
 !++d2_
 !& 0 04 7_
 ||d*_
 #'_
 -._
 
@	@_
  _
r6   r  a  
    InstructBLIP Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    c                     ^  \ rS rSr% \\S'   SrSrS/rS\4U 4S jjr	S r
S\R                  4S	 jrSU 4S jjrS rS r\\  SS\R(                  S\R*                  S\R*                  S
-  S\S
-  S\\   S\\-  4S jj5       5       rS\R*                  S\R(                  4S jr\\        SS\R(                  S\R(                  S\R*                  S
-  S\R(                  S
-  S\R*                  S
-  S\R*                  S
-  S\R*                  S
-  S\R(                  S
-  S\R*                  S
-  S\S\\   S\\-  4S jj5       5       r\R>                  " 5             S S\R(                  S\R*                  S
-  S\R*                  S
-  S\R*                  S
-  S\R*                  S
-  S\R(                  S
-  S\S\R*                  4S jj5       r Sr!U =r"$ )!r  i  rS   r   Tr  c                   > [         TU ]  U5        [        R                  UR                  5      U l        [        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        R                  UR                  5      U l        [        R                   " UR                  R                  UR"                  R                  5      U l        UR&                  (       a!  [(        R*                  " UR"                  5      nO [,        R*                  " UR"                  5      nX l        U R1                  5         g r  )rY   rZ   r*  _from_configr  r  r   r_   rN   r   r  r  r[   r  r  r  r   r  r  r  r#   r  r$   r  r/  )rG   rS   r  rh   s      r7   rZ   -InstructBlipForConditionalGeneration.__init__"  s     3@@AUAUVLLQ8O8OQWQfQfQrQr)st/<<V=R=RS#%99V-B-B-N-NPVPbPbPnPn#o 111==f>P>PQN2>>v?Q?QRN, 	r6   c                 :    U R                   R                  U5        g r   )r  set_output_embeddings)rG   new_embeddingss     r7   r'  :InstructBlipForConditionalGeneration.set_output_embeddings6  s    11.Ar6   r?   c                 6    U R                   R                  5       $ r   )r  get_output_embeddingsrL   s    r7   r+  :InstructBlipForConditionalGeneration.get_output_embeddings9  s    ""88::r6   Nc                 X   > Uc  U R                   R                  5       $ [        TU ]  US9$ )N)modality)r  get_encoderrY   )rG   r.  rh   s     r7   r/  0InstructBlipForConditionalGeneration.get_encoder<  s1    &&22447&&99r6   c                 6    U R                   R                  5       $ r   )r  get_decoderrL   s    r7   r2  0InstructBlipForConditionalGeneration.get_decoderB  s    ""..00r6   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggr  r  r  s     r7   r  ;InstructBlipForConditionalGeneration._preprocess_accelerateF  r  r6   r  r  r   r   c           
         U R                   " SUUSS.UD6n[        S0 UDSU0D6nUS   n[        R                  " UR	                  5       SS [        R
                  UR                  S9nU R                  R                  UR                  S   SS5      n	[        R                  " U	R	                  5       SS [        R
                  UR                  S9n
Uc  [        R                  " U5      nUR                  U
R                  5      n[        R                  " X/SS	9nU R                  " SUUU	UUSS
.UD6nXl        US   SS2SU	R	                  S5      2SS24   nU R                  U5      nXl        U$ )a  
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The tensors corresponding to the input images.
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
    to serve as text prompt, which the Q-Former model will encode.

    Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
T)r   r   return_dictr,   r   Nrn   r  r%   rs   )r  r   r  rd  re  r7  r.   )r  r*   rN   r  rp   r  r  r  r   ru   r  r   r}   r  r-   r  r4  )rG   r   r  r  r   r   r,   r  r  r  r  r-   r  image_featuress                 r7   get_image_features7InstructBlipForConditionalGeneration.get_image_featuresZ  s   > 6:5F5F 6
%%=6
 	6
 Aq>qbpq%a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"!7!:!:;O;V;V!W!&,@+Y_`!a,, 
'1%".#7
 
 *9&&q)!-C|/@/@/C-CQ*FG 11,?'5$r6   r  r&  c           	      d   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  UR                  5      nU$ r  r  r  s       r7   r
  9InstructBlipForConditionalGeneration.get_placeholder_mask  r  r6   r   r  r  labelsc           	         U R                  UUUU
SS9nUR                  nUR                  nUR                  nUc  U R	                  5       " U5      nUc  [
        R                  " U5      nUR                  UR                  UR                  5      nU R                  XHS9nUR                  UU5      nU R                  R                  (       aS  U R                  " SUUS.UD6nUS   nSnU	b3  U R                  " SUXR                  R                   R"                  S.UD6nO5SUS'   U R                  " SUUUUU	S	.UD6nUR$                  nUR&                  n[)        UUUUUS
9$ )a  
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
    to serve as text prompt, which the Q-Former model will encode.

    Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    Only relevant in case an encoder-decoder language model (like T5) is used.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
    1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
    config.vocab_size]`

Examples:

```python
>>> from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
>>> import torch
>>> from PIL import Image
>>> import httpx
>>> from io import BytesIO

>>> model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
>>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model.to(device)  # doctest: +IGNORE_RESULT

>>> url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
>>> with httpx.stream("GET", url) as response:
...     image = Image.open(BytesIO(response.read())).convert("RGB")
>>> prompt = "What is unusual about this image?"
>>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

>>> outputs = model.generate(
...     **inputs,
...     do_sample=False,
...     num_beams=5,
...     max_length=256,
...     min_length=1,
...     top_p=0.9,
...     repetition_penalty=1.5,
...     length_penalty=1.0,
...     temperature=1,
... )
>>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
>>> print(generated_text)
The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV, which is parked in the middle of a busy city street. This is an unconventional approach to ironing clothes, as it requires the man to balance himself and his ironing equipment on top of the vehicle while navigating through traffic. Additionally, the presence of taxis and other vehicles in the scene further emphasizes the unusual nature of this situation.
```Tr  r  r   r7  Nr  r  r   )r=   r=  r  r7  )r&  r   r  r  r=  )r<   r=   r,   r-   r>   r.   )r9  r4  r-   r,   r9  rN   r  r   r  r   r
  r  rS   r  r  loss_functionr  r  r<   r=   r:   )rG   r   r  r  r  r   r  r  r&  r=  r   r   r8  r  r-   r,   r	  r  r=   r<   s                       r7   r   ,InstructBlipForConditionalGeneration.forward  s   ^ CGBYBY/#9%= CZ C
 !/ < <(88'66  557	BM!"__Y7N 5 8 89M9M}ObOb c!66y6^%445GI^_;;66)) +- G
 QZFD!)) !&[[=T=T=_=_ci
 %)F=!)) +-"3'= G <<D^^F>)+#*
 	
r6   c                 T   [        U S5      (       a  U R                  5         UR                  S   n	U R                  UUUUSS9n
U
R                  nUc  Uc  U R
                  R                  /U R
                  R                  -  nXR
                  R                  R                  /-   n[        R                  " U/[        R                  UR                  S9nUR                  U	S5      nU R                  5       " U5      nUc  [        R                   " U5      nUR#                  UR                  UR$                  5      nU R'                  XFS9nUR)                  X5      nXeS.nU R*                  R
                  R,                  (       d  XOS	'   U R*                  R.                  " S
0 UDUD6nU$ )a  
Overrides `generate` function to be able to use the model as a conditional generator.

Args:
    pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
        Input images to be processed.
    qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt to be fed to the Q-Former module.
    qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt for the generation.
    attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embedded representation of the inputs. Should be float, not int tokens.
    interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the positional encoding of the image embeddings.

Returns:
    captions (list): A list of strings of length batch_size * num_captions.
r  r   Tr?  r  r%   r  r  r  r.   )r@  r  ru   r9  r4  rS   image_token_indexr  r  bos_token_idrN   r   r  r  repeatr9  r  r   r   r
  r  r  is_encoder_decodergenerate)rG   r   r  r  r  r   r&  r   generate_kwargsr   r8  r  image_tokensstart_tokensr	  inputsr  s                    r7   rG  -InstructBlipForConditionalGeneration.generate3  s   D 4))'')!''*
BFBYBY/#9%= CZ C
 !/ < <   $ = =>A]A]]+{{/F/F/S/S.TT!LL,uzzR^ReRef	%,,Z;	 557	BM!"__Y7N 5 8 89M9M}ObOb c!66y6^%445G_#0S""))<<"+;%%..KK?Kr6   r   r   r  )NNNNNNNF)NNNNNF)#r/   r0   r1   r2   r&   r4   r;  r  r!  rZ   r'  r   Moduler+  r/  r2  r  r   r   rN   rO   r  r   r   r   rJ   r*   r9  r
  r:   r   r  rG  r5   r   r   s   @r7   r  r    s    $O!+,1 (B;ryy ;:1?( 
 ;?05@''@ !++@ !& 0 04 7	@
 #'+@ +,@ 
8	8@  @D"e.>.> "uO`O` " 
 ;?.22659:>26*.).B
''B
 !,,B
 !& 0 04 7	B

 $$t+B
 ((4/B
 !++d2B
 !& 0 04 7B
 ((4/B
   4'B
 #'B
 +,B
 
@	@B
  B
H ]]_ 6::>-12626).D''D !++d2D !& 0 04 7	D
 ##d*D ((4/D ((4/D #'D 
		D Dr6   r  )r  r   r  r  r*  )r   )Ur3   r`  collections.abcr   dataclassesr   typingr   rN   r    r   r  activationsr	   
generationr
   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   r    autor"   r#   r$   configuration_instructblipr&   r'   r(   
get_loggerr/   r  r*   r:   rM  rQ   r   floatr   r   r   r   r   r  r*  r   r   r  r  r  r   r  r   r  r  r  __all__r.   r6   r7   <module>rc     s   "  $ !    & ! ) 6 B 9  G & 6 j j 7 E I I o o 
		H	% 
	P.H 	P  	P 
 
k 
 
<G299 Gd %II%<<% 
% <<	%
 LL4'% % %0G)BII G)Vbii  9 @  i/  i  iH@")) @@39 3l^.BII ^.DBII  299  4bii  		 R9 Rl!
 !
H/BII /d^
: ^
B 
U
3 U

U
p ]+F ]]@r6   