
    3j                        S r SSKrSSKJr  SSKrSSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJrJr  SSKJr  \R8                  " \5      r\\ " S S\5      5       5       r " S S\R@                  5      r! " S S\R@                  5      r" " S S\R@                  5      r# " S S\R@                  5      r$ " S S\R@                  5      r% " S S\R@                  5      r& " S S\R@                  5      r' " S  S!\5      r( " S" S#\R@                  5      r) " S$ S%\R@                  5      r*\ " S& S'\5      5       r+ " S( S)\R@                  5      r, " S* S+\R@                  5      r-\,\-S,.r.\" S-S.9 " S/ S0\+5      5       r/ " S1 S2\R@                  5      r0\" S3S.9 " S4 S5\+5      5       r1/ S6Qr2g)7zPyTorch TVP Model    N)	dataclass)nn   )initialization)ACT2FN)load_backbone)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)PreTrainedModel)auto_docstringlogging   )	TvpConfigc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   S	rg)
TvpVideoGroundingOutput$   a\  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Temporal-Distance IoU loss for video grounding.
logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
    Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
    input texts.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
Nlosslogits.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   r   tupler   __static_attributes__r       ^/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/tvp/modeling_tvp.pyr   r   $   sq    	 &*D%

d
")'+FE$+:>M5**C/047>7;Je'',-4;r%   r   c                   D   ^  \ rS rSrSrU 4S jrS rS rS rS r	Sr
U =r$ )	TvpLoss8   ab  
This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
ground-truth / prediction (supervise class and box).

Args:
    losses (`list[str]`):
        List of all the losses to be applied.
c                    > [         TU ]  5         U R                  U R                  U R                  S.U l        U H!  nX R
                  ;  d  M  [        SU S35      e   Xl        g )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr6   r   	__class__s      r&   r0   TvpLoss.__init__C   sa    ==****

 D==( 5n!=>>  r%   c                     [         R                  " XB5      [         R                  " X15      -
  n[         R                  " XB5      [         R                  " X15      -
  nSUR                  SS9U-  -
  nU$ )z&
Measure the intersection over union.
r   r   min)r    r<   maxclamp)	r7   
start_timeend_timecandidates_start_timecandidates_end_timer.   interunionr,   s	            r&   r1   TvpLoss.loss_iouP   s_     		-8599EZ;gg		-8599EZ;gg%++!+$u,,
r%   c                 P   [         R                  " [         R                  " X45      S5      n[         R                  " [         R                  " X5      S5      n[         R                  " [         R                  " Xg5      [         R                  " Xg5      -
  U5      R                  SS9nU$ )z%
Measure the distance of mid points.
g       @g?r;   )r    divaddr=   r<   r>   )	r7   r?   r@   rA   rB   r.   mid_candidatesmid_groundtruthdistance_diffs	            r&   r2   TvpLoss.loss_distanceZ   sy     599-B#XZ]^))EIIj$CSI		IIn6>9ccem

%C%. 	 r%   c                     [         R                  " XC5      n[         R                  " X!5      n[         R                  " [         R                  " [         R                  " Xg5      U5      5      nUR	                  SS9nU$ )z%
Measure the difference of duration.
g?r;   )r    subsquarerG   r>   )	r7   r?   r@   rA   rB   r.   duration_candidatesduration_groundtruthduration_diffs	            r&   r3   TvpLoss.loss_durationf   s`     $ii(;S$yy>UYYuyy9L/cem%no%+++4r%   c                    Uu  p4n[         R                  " X5      nUSS2S4   R                  5       USS2S4   R                  5       p0 n	U R                   H*  n
U	R	                  XR
                  U
   " XEXxU5      05        M,     U	$ )a5  
This performs the loss computation.

Args:
    logits (`torch.FloatTensor`):
        The output logits of head module.
    labels (`list[torch.FloatTensor]`):
        List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
Nr   r   )r    mulfloatr6   updater4   )r7   r   labelsr.   r?   r@   
candidatesrA   rB   losses_dictr   s              r&   forwardTvpLoss.forwardq   s     *0&hYYv0
5?15E5K5K5MzZ[]^Z^O_OeOeOg2KKD}}T*:AVmuvw  
 r%   )r4   r6   )r   r   r   r   r   r0   r1   r2   r3   r[   r$   __classcell__r8   s   @r&   r(   r(   8   s&    
	 r%   r(   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TvpVisionModel   c           
        > [         TU ]  5         [        U5      U l        UR                  b  UR                  R
                  S   nO[        U R                  S5      (       aI  [        U R                  R                  S5      (       a$  U R                  R                  R
                  S   nOl[        U R                  S5      (       aF  [        U R                  R                  S5      (       a!  U R                  R                  R                  nO[        S5      e[        R                  " UUR                  SSSSSS	9U l        g )
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r/   r0   r   backbonebackbone_configre   hasattrrd   rf   r5   r   Conv2dgrid_encoder_conv)r7   rd   in_channelsr8   s      r&   r0   TvpVisionModel.__init__   s    %f-!!- 00==bAKT]]H--'$--:N:NP^2_2_--..;;B?KT]]H--'$--:N:NP]2^2^--..::K899!#"
r%   c                    UR                   u  p#pEnUR                  X#-  XEU5      nU R                  U5      S   S   nU R                  U5      n[        R
                  R                  USSS9n[        R
                  R                  USS9nUR                   SS  u  pnUR                  X#XU5      nUR                  SSS	S
S5      nU$ )Nfeature_mapsr      )rg   rh   T)inplacer   r      )	shapeviewrl   rp   r   
functional
max_pool2drelupermute)r7   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widths               r&   r[   TvpVisionModel.forward   s    >J>P>P;
e#(()@,X]^ MM,7GJ%%&78}}''!A'F}}!!$!5-1ZZ_*yy)T||Aq!Q*r%   )rl   rp   r   r   r   r   r0   r[   r$   r]   r^   s   @r&   r`   r`      s    
. r%   r`   c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\S\R                  4S jr	SS	\
4S
 jjrSS	\
4S jjrSrU =r$ )TvpVisualInputEmbedding   z3
Takes input of both image and video (multi-frame)
c                 x  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l
        [        R                  " SUR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                   5      U l        UR                  U l        UR                  U l	        g )Nr   eps)r/   r0   r   	Embeddingmax_position_embeddingsrf   position_embeddings max_grid_row_position_embeddingsrow_position_embeddings max_grid_col_position_embeddingscol_position_embeddingstoken_type_embeddings	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutr7   rd   r8   s     r&   r0    TvpVisualInputEmbedding.__init__   s    #%<<0N0NPVPbPb#c ')||F4[4[]c]o]o'p$')||F4[4[]c]o]o'p$%'\\!V5G5G%H",,v'9'9v?T?TUzz&"<"<=060W0W-060W0W-r%   	embeddingr   r   returnc                    S=pEX R                   :  a  X R                   -  nX0R                  :  a  X0R                  -  nUR                  SSSS5      n[        R                  R                  UXE4SSS9nUR                  SSSS5      nU$ )z
This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
resolution images (high resolution videos).

r   r   r   ru   bicubicFscale_factormodealign_corners)r   r   r~   r   r{   interpolate)r7   r   r   r   h0w0s         r&   interpolate_pos_encoding0TvpVisualInputEmbedding.interpolate_pos_encoding   s     999???B888>>>B%%aAq1	MM--	 . 
	 %%aAq1	r%   r   c                    UR                   u  p4pV[        U R                  U5      n[        R                  " U[        R
                  UR                  S9nU R                  U5      n	S[        UR                   5      S-
  -  USU4-   n
U	R                  " U
6 n	[        U R                  U5      n[        R                  " U[        R
                  UR                  S9nU R                  U5      nUSX4nUR                  " U6 nX-   nU(       a4  X@R                  :  d  XPR                  :  a  XR                  XU5      -   nU$ X-   nU$ )a.  
Args:
    grid: (batch_size, height, width, hidden_dim)
    interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the pre-trained position encodings.
Returns:
    grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
dtypedevice)r   r   r   )ry   r<   r   r    arangelongr   r   lenrz   r   r   r   )r7   r   r   r   r   r   
hidden_dim
row_heightrow_position_idsr   	row_shape	row_widthcol_position_idsr   	col_shapepositional_embeddingss                   r&   add_2d_positional_embeddings4TvpVisualInputEmbedding.add_2d_positional_embeddings   s7    15

-
E >>G
 <<
%**T[[Y"&">">?O"PC

Oa/0J:3NN	"9">">	"J ==uE	 <<	DKKX"&">">?O"PI:	"9">">	"J 7 Q $:::eFkFk>k778MW\]]D  /Dr%   c                 x   UR                   u  p4pVnUR                  S5      nU R                  XS9nUR                  USU5      nUR                   SS n	UR                  n
[
        R                  " U	[
        R                  U
S9nU R                  U5      nX-   nU R                  U5      nU R                  U5      nU$ )a  
Args:
    grid: Array of shape (batch_size, num_frames, height, width, num_channels).
        It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
        num_frames can be 1
    interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
        Whether to interpolate the pre-trained position encodings.

Returns:
    embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

r   r   rc   Nr   )ry   meanr   rz   r   r    zerosr   r   r   r   )r7   r   r   r   r   r   r   r   visual_tokensvisual_tokens_shaper   token_type_idsr   
embeddingss                 r&   r[   TvpVisualInputEmbedding.forward  s     ?Cjj;
|yy|000i		*b,?+11#26%% %8

SYZ $ : :> J":
__Z0
\\*-
r%   )r   r   r   r   r   r   r   r   F)r   r   r   r   r   r0   r    Tensorintr   boolr   r[   r$   r]   r^   s   @r&   r   r      sY    
X%,,  TW \a\h\h .'4 'Rd  r%   r   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )TvpTextInputEmbeddingsi!  zGConstruct the embeddings from word, position and token_type embeddings.c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                   5      U l        g )N)padding_idxr   )r/   r0   r   r   
vocab_sizerf   pad_token_idword_embeddingsr   r   type_vocab_sizer   r   r   r   r   r   r   r   s     r&   r0   TvpTextInputEmbeddings.__init__$  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]",,v'9'9v?T?TUzz&"<"<=r%   c                 .   Ub  UR                  5       nOUR                  5       S S nUS   nUb  UR                  OUR                  nUcD  [        R                  " U[        R                  US9nUR                  S5      R                  U5      nUc$  [        R                  " U[        R                  US9nUc  U R                  U5      nU R                  U5      nU R                  U5      n	XH-   U	-   n
U R                  U
5      n
U R                  U
5      n
U
$ )Nrc   r   r   r   )sizer   r    r   r   	unsqueezeexpandr   r   r   r   r   r   )r7   	input_idsr   position_idsinputs_embedsinput_shape
seq_lengthr   r   r   r   s              r&   r[   TvpTextInputEmbeddings.forward,  s    #..*K',,.s3K ^
%.%:!!@T@T <<
%**VTL'11!4;;KHL!"[[EJJvVN  00;M"66|D $ : :> J"8;PP
__Z0
\\*-
r%   )r   r   r   r   r   NNNN	r   r   r   r   r   r0   r[   r$   r]   r^   s   @r&   r   r   !  s    Q> r%   r   c                   n   ^  \ rS rSrU 4S jrS\R                  S\S\4S jr  SS\	S-  4S	 jjr
S
rU =r$ )TvpAttentioniE  c                   > [         TU ]  5         UR                  UR                  -  S:w  a6  [	        US5      (       d%  [        SUR                   SUR                   35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R$                  " UR                  UR&                  S9U l        [        R                  " UR*                  5      U l        g )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r   )r/   r0   rf   num_attention_headsrn   r5   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattn_dropoutdenser   r   r   r   r   r   s     r&   r0   TvpAttention.__init__F  s{    : ::a?PVXhHiHi"6#5#5"66jkq  lF  lF  kG  H  $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
JJv'J'JKYYv1163E3EF
,,v'9'9v?T?TUzz&"<"<=r%   tensorsequence_lengthr   c                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   ru   )rz   r   r   	transpose
contiguous)r7   r   r   r   s       r&   _reshapeTvpAttention._reshapeZ  s5    KK
T5M5MtOgOghYq!_Z\	
r%   Noutput_attentionsc                 $   UR                   S S u  pEU R                  U5      nU R                  U5      nU R                  U5      nU R	                  XeU5      n	U R	                  XuU5      n
U R	                  XU5      n[
        R                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUb  X-   n[        R                  R                  USS9nU R                  U5      n[
        R                  " X5      nUR                  SS5      R                  5       nUR!                  XEU R"                  5      nU R%                  U5      nU R'                  U5      nU R)                  X-   5      nU(       a  X4nU$ U4nU$ )Nru   rc   dimr   )ry   r   r   r   r   r    matmulr   mathsqrtr   r   r{   softmaxr   r   reshaper   r   r   r   )r7   r   attention_maskr   r   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probsattn_outputoutputss                   r&   r[   TvpAttention.forwarda  sy    '4&9&9"1&=#
 JJ}5((=1 JJ}5mm$5
SMM/JO	mm$5
S !<<5H5HR5PQ+dii8P8P.QQ%/@ --//0@b/I ++O<ll?@!++Aq1<<>!))*tGYGYZjj-ll;/ook&AB4E;0 MX>r%   )
r   r   r   r   r   r   r   r   r   r   NN)r   r   r   r   r0   r    r   r   r   r   r[   r$   r]   r^   s   @r&   r   r   E  sF    >(
u|| 
c 
s 
 )-	&  $;	& &r%   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )TvpIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g N)r/   r0   r   r   rf   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r&   r0   TvpIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r%   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r  r   r  )r7   r   s     r&   r[   TvpIntermediate.forward  s&    

=100?r%   r  
r   r   r   r   r0   r    r   r[   r$   r]   r^   s   @r&   r  r    s(    9U\\ ell  r%   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )TvpOutputLayeri  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr   )r/   r0   r   r   r  rf   r   r   r   r   r   r   r   r   s     r&   r0   TvpOutputLayer.__init__  s`    YYv779K9KL
,,v'9'9v?T?TUzz&"<"<=r%   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r  r   r   r   )r7   r   r!  s      r&   r[   TvpOutputLayer.forward  s5    

=1]3(DEr%   r#  r  r^   s   @r&   r  r    s6    >U\\  RWR^R^  r%   r  c                   D   ^  \ rS rSrU 4S jr  SS\S-  4S jjrSrU =r$ )TvpEncodeLayeri  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        U5      U l        g r  )r/   r0   r   	attentionr  intermediater  outputr   s     r&   r0   TvpEncodeLayer.__init__  s3    %f-+F3$V,r%   Nr   c                     U R                  UUUS9nUS   nUSS  nU R                  U5      nU R                  Xu5      nU4U-   nU$ )N)r   r   r   r(  r)  r*  )	r7   r   r  r   self_attention_outputsattention_outputr  intermediate_outputlayer_outputs	            r&   r[   TvpEncodeLayer.forward  sl     "&/ "0 "

 2!4(,"//0@A{{#6I/G+r%   r-  r  )	r   r   r   r   r0   r   r[   r$   r]   r^   s   @r&   r&  r&    s(    - )-	  $;	 r%   r&  c            
       f   ^  \ rS rSrU 4S jr    S
S\S-  S\S-  S\S-  S\\-  4S jjrS	r	U =r
$ )
TvpEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r/   r0   rd   r   
ModuleListrangenum_hidden_layersr&  layergradient_checkpointing)r7   rd   _r8   s      r&   r0   TvpEncoder.__init__  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A&Nr   output_hidden_statesreturn_dictr   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nSnSn[	        U R
                  5       H0  u  pU(       a  Xa4-   nU	" XU5      n
U
S   nU(       d  M(  XzS   4-   nM2     U(       a  Xa4-   nU(       d  U4nU(       a  X4-   nU(       a  X4-   nU$ [        UU(       a  UOS U(       a  US9$ S S9$ )Nr   r   r   )last_hidden_stater   r   )rd   r>  r   r=  	enumerater9  r   )r7   r   r  r   r=  r>  all_hidden_statesall_attentionsilayer_modulelayer_outputsr  s               r&   r[   TvpEncoder.forward  s    &1%<k$++BYBY1B1N-TXT_T_TqTq$8$D $++JjJj 	 (4OA#$58H$H!(HYZM)!,M  !/3C2E!E  5   14D D$&G#!$88 !$55N+/C+):~
 	
 AE
 	
r%   )rd   r:  r9  r   )r   r   r   r   r0   r   r#   r   r[   r$   r]   r^   s   @r&   r4  r4    sY    , )-,0#'*
  $;	*

 #Tk*
 D[*
 
	 *
 *
r%   r4  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	TvpPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r  )r/   r0   r   r   rf   r   Tanh
activationr   s     r&   r0   TvpPooler.__init__  s9    YYv1163E3EF
'')r%   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   rL  )r7   r   first_token_tensorpooled_outputs       r&   r[   TvpPooler.forward  s6     +1a40

#566r%   )rL  r   r  r^   s   @r&   rI  rI    s(    $
U\\ ell  r%   rI  c                   v    \ rS rSr% \\S'   SrSrSr\	R                  " 5       S\R                  4S j5       rSrg	)
TvpPreTrainedModeli  rd   model)videotextTmodulec                    [        U[        R                  [        R                  45      (       a6  [        R
                  " UR                  SU R                  R                  S9  GO[        U[        R                  5      (       aA  [        R                  " UR                  5        [        R                  " UR                  5        O[        U[        R                  5      (       aO  [        R                  " UR                  SSS9  UR                  b!  [        R                  " UR                  S5        O5[        U[         5      (       a   [        R
                  " UR"                  5        [        U[        R                  5      (       a-  UR                  b   [        R                  " UR                  5        [%        US5      (       a   [        R
                  " UR&                  5        [%        US	5      (       a   [        R
                  " UR(                  5        [%        US
5      (       a   [        R
                  " UR*                  5        [%        US5      (       a!  [        R
                  " UR,                  5        gg)zInitialize the weights        )r   stdfan_outr}   )r   nonlinearityNr   pad_uppad_downpad_left	pad_right)r  r   r   r   initnormal_weightrd   initializer_ranger   zeros_rk   ones_ro   kaiming_normal_	constant_TvpModeltext_promptrn   r]  r^  r_  r`  )r7   rW  s     r&   _init_weights TvpPreTrainedModel._init_weights  su    fryy",,788LLSdkk6S6ST--KK$JJv}}%		**  YVT{{&v{{A.))LL++,fbii((V[[-DKK$68$$LL'6:&&LL)6:&&LL)6;''LL))* (r%   r   N)r   r   r   r   r   r"   base_model_prefixinput_modalitiessupports_gradient_checkpointingr    no_gradr   Modulerk  r$   r   r%   r&   rS  rS    s=    (&*#
]]_+BII + +r%   rS  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )TvpFrameDownPadPrompteri(  z6
Pad frames extracted from videos only at the bottom.
c           	        > UR                   S;  a  [        S5      e[        TU ]  5         UR                  U l        UR
                  U l        UR                  U l        UR                   U l         [        R                  " [        R                  " SUR
                  SUR                  UR                  /5      5      U l        g )NrH   replaceremove9`visual_prompter_apply` must be in (add, replace, remove)r   r   )visual_prompter_applyr5   r/   r0   visual_prompt_size	frame_nummax_img_sizer   	Parameterr    randnr^  r   s     r&   r0    TvpFrameDownPadPrompter.__init__-  s    ''/KKXYY"(";";))"//%+%A%A"KKF,,a1J1JFL_L_`a
r%   c                    U R                   S:w  ao  [        R                  " U R                  U R                  /UR                  UR
                  S9nSX R                  U R                  -
  U R                  2S S 24'   X-  nU R                   S:w  a  [        R                  " UR                  S   UR                  S   SU R                  U R                  /UR
                  S9nU R                  U R                  -
  nU R                  US S 2S S 2S S 2X@R                  2S S 24'   XR                  UR                  5      -  nU$ )	NrH   r   rY  rw  r   r   r   r   )ry  r    onesr|  r   r   rz  r   ry   r^  to)r7   r   visual_prompt_maskpromptstart_points        r&   r[   TvpFrameDownPadPrompter.forward;  s(   %%.!&""D$5$56l>P>PYeYlYl" fi0043J3JJTM^M^^`aab.L%%1[[##A&(:(:1(=q$BSBSUYUfUfg#**F ++d.E.EEKBF--F1aK*;*;;Q>?IIl&8&899Lr%   )r{  r|  r^  rz  ry  r   r^   s   @r&   rs  rs  (  s    
 r%   rs  c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\S\R                  4S jr	SS	\
4S
 jjrSrU =r$ )TvpFramePadPrompteriM  z7
Pad frames extracted from videos in the surroundings.
c           
        > UR                   S;  a  [        S5      e[        TU ]  5         UR                  U l        UR
                  U l        UR                   U l         UR
                  UR                  S-  -
  U l        [        R                  " [        R                  " SUR                  SUR                  UR
                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR                  UR
                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR
                  UR                  S-  -
  UR                  /5      5      U l        [        R                  " [        R                  " SUR                  SUR
                  UR                  S-  -
  UR                  /5      5      U l        g )Nru  rx  ru   r   r   )ry  r5   r/   r0   r   r|  rz  	base_sizer   r}  r    r~  r]  r^  r_  r`  r   s     r&   r0   TvpFramePadPrompter.__init__R  s   ''/KKXYY ++"//%+%A%A",,v/H/H1/LLllKKF--q&2K2KVM`M`ab
 KKF--q&2K2KVM`M`ab
 KK%%''&*C*Ca*GG--

 KK%%''&*C*Ca*GG--

r%   r  r   r   r   c                     X R                   -  X0R                   -  pTUR                  u  pgpn
UR                  Xg-  XU
5      n[        R                  R                  UXE4SSS9nUR                  XgXU5      nU$ )z
This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
resolution images (high resolution videos).

r   Fr   )r|  ry   r  r   r{   r   )r7   r  r   r   r   r   batchr   channelsprompt_heightprompt_widths              r&   interpolate_pad_encoding,TvpFramePadPrompter.interpolate_pad_encodingx  s     +++U5F5F-FBCI<<@8L  2H\Z**	 + 
 8UKr%   r  c           	      ^   U(       a  UR                   S   UR                   S   4OU R                  U R                  4u  p4U R                  S;  a  [        SU R                   35      eU R                  S;   a/  [        R
                  " X4/UR                  UR                  S9nX-  nU R                  S;   a  [        R                  " SU R                  S	U R                  U R                  UR                  S
9n[        R                  " U R                  X`R                  /SS9n[        R                  " U R                  XpR                  /S	S9n[        R                  " UR!                  S5      U/-  5      nU(       a  U R#                  XsU5      nXR%                  UR                  5      -   nU$ )Nr   rc   )rH   rw  rv  z$Invalid visual_prompter_apply value )rv  rw  r   )rv  rH   r   r   r  rx   r   r   )ry   r|  ry  r5   r    r  r   r   r   r   r  catr_  r`  r]  r^  r   r  r  )r7   r   r  r   r   r  baser  s           r&   r[   TvpFramePadPrompter.forward  sl    ( #\%7%7%;<##T%6%67 	
 %%-IICDD^D^C_`aa%%)>>!&VO<CUCU^j^q^q!r.L%%);;;;q$//1dnndnn]i]p]pqDYYt^^D!LFYYV]]CKFYY|003vh>?F'66vuM'))L4F4F*GGLr%   )r  r|  r   r^  r_  r`  r]  ry  r   )r   r   r   r   r   r0   r    r   r   r  r   r[   r$   r]   r^   s   @r&   r  r  M  sL    $
Lu|| S QT Y^YeYe 0d  r%   r  )framedownpadframepadzw
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    )custom_introc                      ^  \ rS rSrU 4S jrS rS r\       SS\R                  S-  S\R                  S-  S\R                  S-  S	\S-  S
\S-  S\S-  S\S\\-  4S jj5       rSrU =r$ )ri  i  c                 ,  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l
        [        U5      U l        [        R                  " [        R                   " SSUR"                  /5      5      U l        [        R&                  " UR(                  5      U l        UR,                  [.        ;  a  [1        S5      e[.        UR,                     " U5      U l        U R5                  5         g )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r/   r0   rd   r`   vision_modelr   r   r   visual_embeddingsr4  encoderrI  poolerr   r}  r    r~  rf   rj  r   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr5   visual_prompter	post_initr   s     r&   r0   TvpModel.__init__  s     *6208!8!@!&)'<<QF<N<N4O(PQzz&"<"<=&&.JJYZZ;F<W<WXY_`r%   c                 .    U R                   R                  $ r  r   r   )r7   s    r&   get_input_embeddingsTvpModel.get_input_embeddings  s    ...r%   c                 $    XR                   l        g r  r  )r7   r   s     r&   set_input_embeddingsTvpModel.set_input_embeddings  s    */'r%   Nr   r   r  r   r=  r>  r   r   c                 v   Ub  UOU R                   R                  nU R                  U R                  X'S95      nU R	                  US9n	U R                  X'S9n
Ub{  UR                  U
R                  SS 5      n[        R                  " UR                  S   S5      R                  UR                  UR                  S9n[        R                  " XU/S	S
9nU R                  R                  U	R                  S   S	S	5      n[        R                  " XU
/SS
9n[!        U R                   UUS9nU R#                  UUUUUS9nU(       a  UR$                  OUS   nU R'                  U5      nU R)                  U5      nU R)                  U5      nU(       d
  UU4USS -   $ [+        UUUR,                  UR.                  S9$ )a  
Examples:
```python
>>> import torch
>>> from transformers import AutoConfig, AutoTokenizer, TvpModel

>>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

>>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

>>> pixel_values = torch.rand(1, 1, 3, 448, 448)
>>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
>>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
```N)r  )r   r   ru   r   r  )r   r   rc   r   r   )rd   r   r  )r  r   r=  r>  )r@  pooler_outputr   r   )rd   r>  r  r  r   r  new_onesry   r    r  r  r   r   r  rj  r   r	   r  r@  r  r   r   r   r   )r7   r   r   r  r   r=  r>  r   kwargstext_embedding_outputvisual_embedding_outputvisual_attention_maskpt_maskrj  embedding_outputencoder_outputsr@  rP  s                     r&   r[   TvpModel.forward  s   4 &1%<k$++BYBY((   a
 !%) D"&"8"8 #9 #
 %$2$;$;<S<Y<YZ\[\<]$^!jj!5!5a!8"=@@%,,N4H4H A G #YYAV'W]_`N&&--.C.I.I!.LbRTU 99kJa%bhij2;;*)
 ,,)/!5# ' 
 BMO==RabcRd$56 LL):;]3%}58KKK)/')77&11	
 	
r%   )	rd   r   r   r  r  rj  r  r  r  )NNNNNNF)r   r   r   r   r0   r  r  r   r    
LongTensorr!   r   r#   r   r[   r$   r]   r^   s   @r&   ri  ri    s     /0  .21526)-,0#').I
##d*I
 ''$.I
 ((4/	I

  $;I
 #TkI
 D[I
 #'I
 
+	+I
 I
r%   ri  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TvpVideoGroundingHeadi  c                 B  > [         TU ]  5         [        R                  " UR                  UR                  S-  5      U l        [        R                  " UR                  S-  S5      U l        [        R                  " 5       U l        [        R                  " 5       U l
        g )Nru   )r/   r0   r   r   rf   layer_0layer_1ReLUactivation_0Sigmoidactivation_1r   s     r&   r0   TvpVideoGroundingHead.__init__  sj    yy!3!3V5G5G!5KLyy!3!3a!7;GGIJJLr%   c                     U R                  U R                  U5      5      nU R                  U R                  U5      5      nU$ r  )r  r  r  r  )r7   r  r   s      r&   r[   TvpVideoGroundingHead.forward  s9    ""4<<#>?""4<<#78r%   )r  r  r  r  r   r^   s   @r&   r  r    s    ) r%   r  zb
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    c                      ^  \ rS rSrU 4S jr\        SS\R                  S-  S\R                  S-  S\R                  S-  S\	\R                     S-  S\S-  S	\S-  S
\S-  S\S\	\-  4S jj5       rSrU =r$ )TvpForVideoGroundingi$  c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r  )r/   r0   rd   ri  rT  r  video_grounding_headr  r   s     r&   r0   TvpForVideoGrounding.__init__*  s8     f%
$9&$A!r%   Nr   r   r  rX   r   r=  r>  r   r   c	           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9n
U
S   nU R                  U5      nSnUbo  [	        / SQ5      nUR                  U R                  5        U" X5      nUS   U R                   R                  US   -  -   U R                   R                  US   -  -   nU(       d  U4U
SS -   n
Ub  U4U
-   n
U
$ [        UUU
R                  U
R                  S	9$ )
a  
labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
    The labels contains duration, start time, and end time of the video corresponding to the text.

Examples:
```python
>>> import torch
>>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

>>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

>>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

>>> pixel_values = torch.rand(1, 1, 3, 448, 448)
>>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
>>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
```N)r   r=  r>  r   r   r+   r,   r-   r.   ru   )r   r   r   r   )rd   r>  rT  r  r(   r  r   distance_loss_weightduration_loss_weightr   r   r   )r7   r   r   r  rX   r   r=  r>  r   r  r  r  r   r   	criterion	loss_dicts                   r&   r[   TvpForVideoGrounding.forward2  s'   < &1%<k$++BYBY**/!5#%=  
  
**=9 ?@ILL%!&1I% ++22Yz5JJK++22Yz5JJK 
 i'!"+-G'G+N&!//))	
 	
r%   )rd   rT  r  )NNNNNNNF)r   r   r   r   r0   r   r    r  r!   r#   r   r   r   r[   r$   r]   r^   s   @r&   r  r  $  s      .21526-1)-,0#').?
##d*?
 ''$.?
 ((4/	?

 ell#d*?
  $;?
 #Tk?
 D[?
 #'?
 
(	(?
 ?
r%   r  )ri  rS  r  )3r   r   dataclassesr   r    r    r   ra  activationsr   backbone_utilsr   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   utilsr   r   configuration_tvpr   
get_loggerr   loggerr   rq  r(   r`   r   r   r   r  r  r&  r4  rI  rS  rs  r  r  ri  r  r  __all__r   r%   r&   <module>r     s     !   & ! + 6 9 X X - , ( 
		H	% 
<k <  <$Mbii M`%RYY %Pnbii nb!RYY !HB299 BLbii RYY / 41
 1
j		  + + +B"bii "JW")) Wv ,#   
a
! a

a
HBII  
I
- I

I
X Er%   