
    3jr                    l   S r SSKrSSKJr  SSKrSSKJr  SSKJrJrJ	r	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJr  SSKJ r   \RB                  " \"5      r#\" SS9\ " S S\5      5       5       r$\" SS9\ " S S\5      5       5       r%\" SS9\ " S S\5      5       5       r&\" SS9\ " S S\5      5       5       r'\" SS9\ " S S\5      5       5       r(\" S S9\ " S! S"\5      5       5       r)\" S#S9\ " S$ S%\5      5       5       r*\" S&S9\ " S' S(\5      5       5       r+\" S)S9\ " S* S+\5      5       5       r,\" S,S9\ " S- S.\5      5       5       r- " S/ S0\R\                  5      r/ " S1 S2\R\                  5      r0 " S3 S4\R\                  5      r1 " S5 S6\R\                  5      r2 " S7 S8\R\                  5      r3 " S9 S:\R\                  5      r4 " S; S<\R\                  5      r5 " S= S>\5      r6 " S? S@\R\                  5      r7 " SA SB\R\                  5      r8 " SC SD\R\                  5      r9 " SE SF\R\                  5      r:\ " SG SH\5      5       r;\" SIS9 " SJ SK\;5      5       r<SL r= " SM SN\R\                  5      r>\" SOS9 " SP SQ\;5      5       r?\" SRS9 " SS ST\;5      5       r@\" SUS9 " SV SW\;5      5       rA\" SXS9 " SY SZ\;5      5       rB\" S[S9 " S\ S]\;5      5       rC\" S^S9 " S_ S`\;5      5       rD\ " Sa Sb\;5      5       rE\ " Sc Sd\;5      5       rF/ SeQrGg)fzPyTorch LUKE model.    N)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FNgelu)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)apply_chunking_to_forward)ModelOutputauto_docstringlogging   )
LukeConfigz3
    Base class for outputs of the LUKE model.
    )custom_introc                   t    \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Srg)BaseLukeModelOutputWithPooling%   aP  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Last layer hidden-state of the first token of the sequence (classification token) further processed by a
    Linear layer and a Tanh activation function.
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
    Sequence of entity hidden-states at the output of the last layer of the model.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nentity_last_hidden_state.entity_hidden_states __name__
__module____qualname____firstlineno____doc__r   torchFloatTensor__annotations__r   tuple__static_attributes__r       `/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/luke/modeling_luke.pyr   r   %   s?    
 :>e//$6=AE% 1 13 67$>Er)   r   zV
    Base class for model's outputs, with potential hidden states and attentions.
    c                   t    \ rS rSr% SrSr\R                  S-  \S'   Sr	\
\R                  S4   S-  \S'   Srg)BaseLukeModelOutput<   ah  
entity_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, entity_length, hidden_size)`):
    Sequence of entity hidden-states at the output of the last layer of the model.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nr   .r   r   r   r   r)   r*   r,   r,   <   s?     :>e//$6=AE% 1 13 67$>Er)   r,   c                   t   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\R                  S-  \S'   Sr\R                  S-  \S'   Sr\\R                     S-  \S	'   Sr\\R                  S
4   S-  \S'   Sr\\R                  S
4   S-  \S'   Srg)LukeMaskedLMOutputP   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    The sum of masked language modeling (MLM) loss and entity prediction loss.
mlm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Masked language modeling (MLM) loss.
mep_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Masked entity prediction (MEP) loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
entity_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the entity prediction head (scores for each entity vocabulary token before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nlossmlm_lossmep_losslogitsentity_logitshidden_states.r   
attentionsr   )r   r    r!   r"   r#   r1   r$   r%   r&   r2   r3   r4   r5   r6   r'   r   r7   r(   r   r)   r*   r/   r/   P   s    " &*D%

d
"))-He$&-)-He$&-'+FE$+.2M5$$t+259M5**+d29AE% 1 13 67$>E7;Je'',-4;r)   r/   z2
    Outputs of entity classification models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)EntityClassificationOutputr     
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nr1   r4   .r6   r   r7   r   r   r    r!   r"   r#   r1   r$   r%   r&   r4   r6   r'   r   r7   r(   r   r)   r*   r9   r9   r       	 &*D%

d
")'+FE$+:>M5**C/047>AE% 1 13 67$>E7;Je'',-4;r)   r9   z7
    Outputs of entity pair classification models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)EntityPairClassificationOutput   r;   Nr1   r4   .r6   r   r7   r   r<   r   r)   r*   r?   r?      r=   r)   r?   z7
    Outputs of entity span classification models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)EntitySpanClassificationOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, entity_length, config.num_labels)`):
    Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nr1   r4   .r6   r   r7   r   r<   r   r)   r*   rB   rB      r=   r)   rB   z4
    Outputs of sentence classification models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)LukeSequenceClassifierOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification (or regression if config.num_labels==1) loss.
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
    Classification (or regression if config.num_labels==1) scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nr1   r4   .r6   r   r7   r   r<   r   r)   r*   rE   rE      r=   r)   rE   z@
    Base class for outputs of token classification models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)LukeTokenClassifierOutput   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
    Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nr1   r4   .r6   r   r7   r   r<   r   r)   r*   rH   rH      r=   r)   rH   z/
    Outputs of question answering models.
    c                   (   \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\R                  S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   Sr\\R                  S4   S-  \S
'   Srg) LukeQuestionAnsweringModelOutput   ak  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nr1   start_logits
end_logits.r6   r   r7   r   )r   r    r!   r"   r#   r1   r$   r%   r&   rM   rN   r6   r'   r   r7   r(   r   r)   r*   rK   rK      s     &*D%

d
")-1L%##d*1+/J!!D(/:>M5**C/047>AE% 1 13 67$>E7;Je'',-4;r)   rK   z,
    Outputs of multiple choice models.
    c                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S'   Sr\\R                  S4   S-  \S	'   S
rg)LukeMultipleChoiceModelOutputi  a  
loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
    Classification loss.
logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
    *num_choices* is the second dimension of the input tensors. (see *input_ids* above).

    Classification scores (before SoftMax).
entity_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
    shape `(batch_size, entity_length, hidden_size)`. Entity hidden-states of the model at the output of each
    layer plus the initial entity embedding outputs.
Nr1   r4   .r6   r   r7   r   r<   r   r)   r*   rP   rP     s     &*D%

d
")'+FE$+:>M5**C/047>AE% 1 13 67$>E7;Je'',-4;r)   rP   c                   D   ^  \ rS rSrSrU 4S jr    SS jrS rSrU =r	$ )LukeEmbeddingsi"  zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        UR                  U l        [        R                  " UR                  UR
                  U R"                  S9U l	        g )Npadding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutrU   selfconfig	__class__s     r*   rY   LukeEmbeddings.__init__'  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= "..#%<<**F,>,>DL\L\$
 r)   c                    UcC  Ub/  [        XR                  5      R                  UR                  5      nOU R	                  U5      nUb  UR                  5       nOUR                  5       S S nUc8  [        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      nU R                  U5      nXF-   U-   nU R                  U5      nU R                  U5      nU$ )Ndtypedevice)"create_position_ids_from_input_idsrU   torq   &create_position_ids_from_inputs_embedssizer$   zeroslongposition_idsr^   r`   rb   rc   rg   )	ri   	input_idstoken_type_idsrx   inputs_embedsinput_shaper`   rb   
embeddingss	            r*   forwardLukeEmbeddings.forward6  s     $A)M]M]^aabkbrbrs#JJ=Y #..*K',,.s3K!"[[EJJtO`O`OgOghN  00;M"66|D $ : :> J"8;PP
^^J/
\\*-
r)   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nrn   r   ro   r   )ru   r$   arangerU   rw   rq   	unsqueezeexpand)ri   r{   r|   sequence_lengthrx   s        r*   rt   5LukeEmbeddings.create_position_ids_from_inputs_embedsW  s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r)   )rc   rg   rU   r`   rb   r^   )NNNN)
r   r    r!   r"   r#   rY   r~   rt   r(   __classcell__rk   s   @r*   rR   rR   "  s+    
" B= =r)   rR   c                      ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S\R                  S-  4S jjrS	r	U =r
$ )LukeEntityEmbeddingsii  rj   c                   > [         TU ]  5         Xl        [        R                  " UR
                  UR                  SS9U l        UR                  UR                  :w  a/  [        R                  " UR                  UR                  SS9U l
        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                   S9U l        [        R"                  " UR$                  5      U l        g )Nr   rT   FbiasrV   )rX   rY   rj   r   rZ   entity_vocab_sizeentity_emb_sizeentity_embeddingsr\   Linearentity_embedding_denser_   r`   ra   rb   rc   rd   re   rf   rg   rh   s     r*   rY   LukeEntityEmbeddings.__init__j  s    !#f.F.FH^H^lm!n!!V%7%77*,))F4J4JFL^L^ej*kD'#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=r)   N
entity_idsrx   rz   c                 <   Uc  [         R                  " U5      nU R                  U5      nU R                  R                  U R                  R
                  :w  a  U R                  U5      nU R                  UR                  SS95      nUS:g  R                  U5      R                  S5      nXV-  n[         R                  " USS9nXVR                  SS9R                  SS9-  nU R                  U5      nXE-   U-   nU R                  U5      nU R                  U5      nU$ )Nr   )minrn   dimgHz>)r$   
zeros_liker   rj   r   r\   r   r`   clamptype_asr   sumrb   rc   rg   )	ri   r   rx   rz   r   r`   position_embedding_maskrb   r}   s	            r*   r~   LukeEntityEmbeddings.forwardx  s    !"--j9N 22:>;;&&$++*A*AA $ ; ;<M N"66|7I7Ia7I7PQ#/2#5">">?R"S"]"]^`"a1K#ii(;D14O4OTV4O4W4]4]bf4]4gg $ : :> J&<?TT
^^J/
\\*-
r)   )rc   rj   rg   r   r   r`   rb   N)r   r    r!   r"   r   rY   r$   
LongTensorr~   r(   r   r   s   @r*   r   r   i  sQ    >z >$ 37	$$ && ((4/	 r)   r   c                   <   ^  \ rS rSrU 4S jrS r  SS jrSrU =r$ )LukeSelfAttentioni  c                 D  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        UR                  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        U R                  (       a  [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R$                  " UR&                  5      U l        g )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .)rX   rY   r\   num_attention_headshasattr
ValueErrorintattention_head_sizeall_head_sizeuse_entity_aware_attentionr   r   querykeyvalue	w2e_query	e2w_query	e2e_queryre   attention_probs_dropout_probrg   rh   s     r*   rY   LukeSelfAttention.__init__  s    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PP*0*K*K'YYv1143E3EF
99V//1C1CDYYv1143E3EF
**YYv'9'94;M;MNDNYYv'9'94;M;MNDNYYv'9'94;M;MNDNzz&"E"EFr)   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nrn   r      r   r   )ru   r   r   viewpermute)ri   xnew_x_shapes      r*   transpose_for_scores&LukeSelfAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$r)   c                    UR                  S5      nUc  UnO[        R                  " X/SS9nU R                  U R	                  U5      5      nU R                  U R                  U5      5      nU R                  (       Ga  UGb  U R                  U R                  U5      5      n	U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U R                  U5      5      nUS S 2S S 2S U2S S 24   nUS S 2S S 2S U2S S 24   nUS S 2S S 2US 2S S 24   nUS S 2S S 2US 2S S 24   n[        R                  " XR                  SS5      5      n[        R                  " XR                  SS5      5      n[        R                  " XR                  SS5      5      n[        R                  " UUR                  SS5      5      n[        R                  " UU/SS9n[        R                  " UU/SS9n[        R                  " UU/SS9nOGU R                  U R                  U5      5      n[        R                  " UUR                  SS5      5      nU[        R                  " U R                  5      -  nUb  UU-   n[         R"                  R%                  USS9nU R'                  U5      n[        R                  " UU5      nUR)                  SSSS5      R+                  5       nUR                  5       S S U R,                  4-   nUR.                  " U6 nUS S 2S U2S S 24   nUc  S nOUS S 2US 2S S 24   nU(       a  UUU4nU$ UU4nU$ )Nr   r   rn   r   r   r   r   )ru   r$   catr   r   r   r   r   r   r   r   matmul	transposemathsqrtr   r   
functionalsoftmaxrg   r   
contiguousr   r   )ri   word_hidden_statesr   attention_maskoutput_attentions	word_sizeconcat_hidden_states	key_layervalue_layerw2w_query_layerw2e_query_layere2w_query_layere2e_query_layerw2w_key_layere2w_key_layerw2e_key_layere2e_key_layerw2w_attention_scoresw2e_attention_scorese2w_attention_scorese2e_attention_scoresword_attention_scoresentity_attention_scoresattention_scoresquery_layerattention_probscontext_layernew_context_layer_shapeoutput_word_hidden_statesoutput_entity_hidden_statesoutputss                                  r*   r~   LukeSelfAttention.forward  s^    '++A.	'#5 #(99.@-W]^#_ --dhh7K.LM	//

;O0PQ***/C/O #77

CU8VWO"77GY8Z[O"77G[8\]O"77G[8\]O &aJYJ&9:M%aJYJ&9:M%aIJ&9:M%aIJ&9:M $)<<AXAXY[]_A`#a #(<<AXAXY[]_A`#a #(<<AXAXY[]_A`#a #(<<AXAXY[]_A`#a  %*II/CEY.Z`a$b!&+ii1EG[0\bc&d#$yy*?AX)Y_`a 33DJJ?S4TUK$||K9L9LRQS9TU+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD$1!ZiZ2B$C!'*.'*79:q8H*I'02M_G  12MNGr)   )r   r   rg   r   r   r   r   r   r   r   r   NF)	r   r    r!   r"   rY   r   r~   r(   r   r   s   @r*   r   r     s"    G0% K Kr)   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )LukeSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g NrV   )rX   rY   r   r   r\   denserc   rd   re   rf   rg   rh   s     r*   rY   LukeSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r)   r6   input_tensorreturnc                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   rg   rc   ri   r6   r   s      r*   r~   LukeSelfOutput.forward  5    

=1]3}'CDr)   rc   r   rg   
r   r    r!   r"   rY   r$   Tensorr~   r(   r   r   s   @r*   r   r     6    >U\\  RWR^R^  r)   r   c                   6   ^  \ rS rSrU 4S jr  SS jrSrU =r$ )LukeAttentioni  c                 b   > [         TU ]  5         [        U5      U l        [	        U5      U l        g r   )rX   rY   r   ri   r   outputrh   s     r*   rY   LukeAttention.__init__  s&    %f-	$V,r)   c                 :   UR                  S5      nU R                  UUUU5      nUc  US   nUnO.[        R                  " US S SS9n[        R                  " X/SS9nU R	                  Xx5      n	U	S S 2S U2S S 24   n
Uc  S nOU	S S 2US 2S S 24   nX4USS  -   nU$ )Nr   r   r   r   )ru   ri   r$   r   r   )ri   r   r   r   r   r   self_outputsconcat_self_outputsr   attention_outputword_attention_outputentity_attention_outputr   s                r*   r~   LukeAttention.forward  s     '++A.	yy 	
  '".q/#5 "'))L!,<!"D#(99.@-W]^#_ ;;':Q 0JYJ1A B'&*#&6q)*a7G&H# )B\RSRTEUUr)   )r   ri   r   r   r    r!   r"   rY   r~   r(   r   r   s   @r*   r   r     s    -    r)   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )LukeIntermediatei9  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rX   rY   r   r   r\   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnrh   s     r*   rY   LukeIntermediate.__init__:  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r)   r6   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r
  ri   r6   s     r*   r~   LukeIntermediate.forwardB  s&    

=100?r)   r  r   r   s   @r*   r  r  9  s(    9U\\ ell  r)   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )
LukeOutputiI  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rX   rY   r   r   r  r\   r   rc   rd   re   rf   rg   rh   s     r*   rY   LukeOutput.__init__J  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r)   r6   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r*   r~   LukeOutput.forwardP  r   r)   r   r   r   s   @r*   r  r  I  r   r)   r  c                   <   ^  \ rS rSrU 4S jr  SS jrS rSrU =r$ )	LukeLayeriW  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g Nr   )
rX   rY   chunk_size_feed_forwardseq_len_dimr   	attentionr  intermediater  r   rh   s     r*   rY   LukeLayer.__init__X  sI    '-'E'E$&v.,V4 (r)   c                 @   UR                  S5      nU R                  UUUUS9nUc  US   nO[        R                  " US S SS9nUSS  n[	        U R
                  U R                  U R                  U5      n	U	S S 2S U2S S 24   n
Uc  S nOU	S S 2US 2S S 24   nX4U-   nU$ )Nr   )r   r   r   r   )ru   r  r$   r   r   feed_forward_chunkr  r  )ri   r   r   r   r   r   self_attention_outputsconcat_attention_outputr   layer_outputword_layer_outputentity_layer_outputs               r*   r~   LukeLayer.forward`  s     '++A.	!% /	 "0 "
  '&<Q&?#&+ii0Fr0JPQ&R#(,0##T%A%A4CSCSUl
 )JYJ)9:'"&".q)*a/?"@$:WDr)   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r   )ri   r   intermediate_outputr#  s       r*   r   LukeLayer.feed_forward_chunk  s)    "//0@A{{#6Ir)   )r  r  r  r   r  r   )	r   r    r!   r"   rY   r~   r   r(   r   r   s   @r*   r  r  W  s     ) !F r)   r  c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )LukeEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r   )
rX   rY   rj   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)ri   rj   _rk   s      r*   rY   LukeEncoder.__init__  sR    ]]uVE]E]?^#_?^!If$5?^#_`
&+# $`s   A&c                 l   U(       a  SOS nU(       a  SOS nU(       a  SOS n	[        U R                  5       H?  u  pU(       a
  Xq4-   nX4-   nU" UUUU5      nUS   nUb  US   nU(       d  M7  XS   4-   n	MA     U(       a
  Xq4-   nX4-   nU(       d  [        S UUU	UU4 5       5      $ [        UUU	UUS9$ )Nr   r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   .0vs     r*   	<genexpr>&LukeEncoder.forward.<locals>.<genexpr>  "      
A     	)last_hidden_stater6   r7   r   r   )	enumerater0  r'   r,   )ri   r   r   r   r   output_hidden_statesreturn_dictall_word_hidden_statesall_entity_hidden_statesall_self_attentionsilayer_modulelayer_outputss                r*   r~   LukeEncoder.forward  s    (<)=24 $5b4(4OA#)?BW)W&+CF]+](("$!	M "/q!1#/'4Q'7$  &91=M<O&O#%  5(  %;>S%S"'?BY'Y$ 
 '*'(,
 
 
 #00*%9!9
 	
r)   )rj   r1  r0  )NFFTr  r   s   @r*   r+  r+    s     , "7
 7
r)   r+  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
LukePooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rX   rY   r   r   r\   r   Tanh
activationrh   s     r*   rY   LukePooler.__init__  s9    YYv1163E3EF
'')r)   r6   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   rL  )ri   r6   first_token_tensorpooled_outputs       r*   r~   LukePooler.forward  s6     +1a40

#566r)   )rL  r   r   r   s   @r*   rI  rI    s(    $
U\\ ell  r)   rI  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )EntityPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " UR
                  UR                  S9U l        g r   )rX   rY   r   r   r\   r   r   r  r  r	  r
   transform_act_fnrc   rd   rh   s     r*   rY   &EntityPredictionHeadTransform.__init__  s~    YYv1163I3IJ
f''--$*6+<+<$=D!$*$5$5D!f&<&<&BWBWXr)   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   rU  rc   r  s     r*   r~   %EntityPredictionHeadTransform.forward  s4    

=1--m<}5r)   )rc   r   rU  r  r   s   @r*   rS  rS    s    Y r)   rS  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )EntityPredictionHeadi  c                   > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  SS9U l	        [
        R                  " [        R                  " UR                  5      5      U l        g )NFr   )rX   rY   rj   rS  	transformr   r   r   r   decoder	Parameterr$   rv   r   rh   s     r*   rY   EntityPredictionHead.__init__  s_    6v>yy!7!79Q9QX]^LLV-E-E!FG	r)   c                 d    U R                  U5      nU R                  U5      U R                  -   nU$ r   )r\  r]  r   r  s     r*   r~   EntityPredictionHead.forward  s-    }5]3dii?r)   )r   rj   r]  r\  r  r   s   @r*   rZ  rZ    s    H r)   rZ  c                   z    \ rS rSr% \\S'   SrSrSS/r\	R                  " 5       S\R                  4S j5       rS	rg
)LukePreTrainedModeli  rj   lukeTr   r   modulec                    [        U[        R                  5      (       ac  [        R                  " UR
                  SU R                  R                  S9  UR                  b!  [        R                  " UR                  5        gg[        U[        R                  5      (       a  UR                  S:X  a!  [        R                  " UR
                  5        O4[        R                  " UR
                  SU R                  R                  S9  UR                  bK  [        UR
                  SS5      (       d.  [        R                  " UR
                  UR                     5        ggg[        U[        R                  5      (       aA  [        R                  " UR                  5        [        R                  " UR
                  5        gg)zInitialize the weightsg        )meanstdNr   _is_hf_initializedF)r  r   r   initnormal_weightrj   initializer_ranger   zeros_rZ   embedding_dimrU   getattrrc   ones_)ri   re  s     r*   _init_weights!LukePreTrainedModel._init_weights  s    fbii((LLSdkk6S6ST{{&FKK( '--##q(FMM*V]]$++:W:WX!!-gfmmMach6i6iFMM&*<*<=> 7j---KK$JJv}}% .r)   r   N)r   r    r!   r"   r   r&   base_model_prefixsupports_gradient_checkpointing_no_split_modulesr$   no_gradr   Modulerr  r(   r   r)   r*   rc  rc    sC    &*#(*@A
]]_&BII & &r)   rc  zt
    The bare LUKE model transformer outputting raw hidden-states for both word tokens and entities without any
    c                     ^  \ rS rSrSS\S\4U 4S jjjrS rS rS r	S r
\            SS
\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\R                  S	-  S\S	-  S\S	-  S\S	-  S\\-  4S jj5       rSrU =r$ )	LukeModeli  rj   add_pooling_layerc                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l
        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)rX   rY   rj   rR   r}   r   r   r+  encoderrI  pooler	post_init)ri   rj   r{  rk   s      r*   rY   LukeModel.__init__  sX    
 	 (0!5f!="6*,=j(4 	r)   c                 .    U R                   R                  $ r   r}   r^   ri   s    r*   get_input_embeddingsLukeModel.get_input_embeddings,  s    ...r)   c                 $    XR                   l        g r   r  ri   r   s     r*   set_input_embeddingsLukeModel.set_input_embeddings/  s    */'r)   c                 .    U R                   R                   $ r   r   r  s    r*   get_entity_embeddingsLukeModel.get_entity_embeddings2  s    %%777r)   c                 $    XR                   l         g r   r  r  s     r*   set_entity_embeddingsLukeModel.set_entity_embeddings5  s    380r)   Nry   r   rz   rx   r   entity_attention_maskentity_token_type_idsentity_position_idsr{   r   r?  r@  r   c           	         U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  U	b  [	        S5      eUb"  U R                  X5        UR                  5       nO"U	b  U	R                  5       SS nO[	        S5      eUu  nnUb  UR                  OU	R                  nUc  [        R                  " UU4US9nUc$  [        R                  " U[        R                  US9nUbT  UR                  S5      nUc  [        R                  " UU4US9nUc&  [        R                  " UU4[        R                  US9nU R                  UUUU	S9nUb  [        R                  " X&/SS	9n[        U R                   US
   R                  UR                   5      US9nUc  SnOU R#                  XXU5      nU R%                  UUUU
UUS9nUS   nU R&                  b  U R'                  U5      OSnU(       d
  UU4USS -   $ [)        UUUR*                  UR,                  UR.                  UR0                  S9$ )u
  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

Examples:

```python
>>> from transformers import AutoTokenizer, LukeModel

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-base")
>>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
# Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"

>>> text = "Beyoncé lives in Los Angeles."
>>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"

>>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
>>> outputs = model(**encoding)
>>> word_last_hidden_state = outputs.last_hidden_state
>>> entity_last_hidden_state = outputs.entity_last_hidden_state
# Input Wikipedia entities to obtain enriched contextualized representations of word tokens

>>> text = "Beyoncé lives in Los Angeles."
>>> entities = [
...     "Beyoncé",
...     "Los Angeles",
... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
>>> entity_spans = [
...     (0, 7),
...     (17, 28),
... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"

>>> encoding = tokenizer(
...     text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt"
... )
>>> outputs = model(**encoding)
>>> word_last_hidden_state = outputs.last_hidden_state
>>> entity_last_hidden_state = outputs.entity_last_hidden_state
```NzDYou cannot specify both input_ids and inputs_embeds at the same timern   z5You have to specify either input_ids or inputs_embeds)rq   ro   r   )ry   rx   rz   r{   r   ).N)rj   r{   r   )r   r   r?  r@  r   )r=  pooler_outputr6   r7   r   r   )rj   r   r?  r@  r   %warn_if_padding_and_no_attention_maskru   rq   r$   onesrv   rw   r}   r   r   rs   rp   r   r}  r~  r   r6   r7   r   r   )ri   ry   r   rz   rx   r   r  r  r  r{   r   r?  r@  kwargsr|   
batch_size
seq_lengthrq   entity_seq_lengthword_embedding_outputentity_embedding_outputencoder_outputssequence_outputrP  s                           r*   r~   LukeModel.forward8  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZZ(@PN!"[[EJJvVN! * 2$,(-

J@Q3R[a(b%$,(-ZAR4S[`[e[ent(u% !%%)'	 !0 !
 !,"YY'NTVWN2;;(3667L7R7RS)	
 &*#&*&<&<Z^s&t# ,,!#)/!5# ' 
 *!, 9=8OO4UY#]3oab6III--')77&11%4%M%M!0!E!E
 	
r)   )rj   r}   r}  r   r~  )T)NNNNNNNNNNNN)r   r    r!   r"   r   boolrY   r  r  r  r  r   r$   r   r%   r'   r   r~   r(   r   r   s   @r*   rz  rz    sm   z d  "/089  .2372604.2:>9=7;26)-,0#'X
##d*X
 ))D0X
 ((4/	X

 &&-X
 $$t+X
  %0047X
  %//$6X
 #--4X
 ((4/X
  $;X
 #TkX
 D[X
 
/	/X
 X
r)   rz  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r   r   )ner   r$   cumsumr   rw   )ry   rU   maskincremental_indicess       r*   rr   rr     sP     <<$((*D <<!4<<TBdJ##%33r)   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )
LukeLMHeadi  z*Roberta Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        g r   )rX   rY   r   r   r\   r   rc   rd   
layer_normr[   r]  r^  r$   rv   r   rh   s     r*   rY   LukeLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	r)   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r  r]  )ri   featuresr  r   s       r*   r~   LukeLMHead.forward  s;    JJx GOOA LLOr)   )r   r]  r   r  )	r   r    r!   r"   r#   rY   r~   r(   r   r   s   @r*   r  r    s    4A r)   r  z
    The LUKE model with a language modeling head and entity prediction head on top for masked language modeling and
    masked entity prediction.
    c            !         ^  \ rS rSrSSS.rU 4S jrS rS r\              SS	\	R                  S-  S
\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\	R                  S-  S\S-  S\S-  S\S-  S\\-  4S jj5       rSrU =r$ )LukeForMaskedLMi  z/luke.entity_embeddings.entity_embeddings.weightzlm_head.decoder.bias)z!entity_predictions.decoder.weightzlm_head.biasc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        [        U5      U l        [        R                  " 5       U l
        U R                  5         g r   )rX   rY   rz  rd  r  lm_headrZ  entity_predictionsr   r   loss_fnr  rh   s     r*   rY   LukeForMaskedLM.__init__  sQ     f%	!&)"6v">**, 	r)   c                 .    U R                   R                  $ r   r  r]  r  s    r*   get_output_embeddings%LukeForMaskedLM.get_output_embeddings  s    ||###r)   c                 $    XR                   l        g r   r  )ri   new_embeddingss     r*   set_output_embeddings%LukeForMaskedLM.set_output_embeddings  s    -r)   Nry   r   rz   rx   r   r  r  r  labelsentity_labelsr{   r   r?  r@  r   c                 R   Ub  UOU R                   R                  nU R                  UUUUUUUUUUUSS9nSnSnU R                  UR                  5      nU	be  U	R                  UR                  5      n	U R                  UR                  SU R                   R                  5      U	R                  S5      5      nUc  UnSnSnUR                  bn  U R                  UR                  5      nU
bP  U R                  UR                  SU R                   R                  5      U
R                  S5      5      nUc  UnOUU-   nU(       d8  [        S UUUUUUR                  UR                  UR                   4 5       5      $ [#        UUUUUUR                  UR                  UR                   S9$ )a{  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
entity_labels (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
NTry   r   rz   rx   r   r  r  r  r{   r   r?  r@  rn   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r6  s     r*   r9  *LukeForMaskedLM.forward.<locals>.<genexpr>s  s"      	A  	r<  )r1   r2   r3   r4   r5   r6   r   r7   )rj   r@  rd  r  r=  rs   rq   r  r   r[   r   r  r   r'   r6   r   r7   r/   )ri   ry   r   rz   rx   r   r  r  r  r  r  r{   r   r?  r@  r  r   r1   r2   r4   r3   r5   s                         r*   r~   LukeForMaskedLM.forward  s   b &1%<k$++BYBY))))%!"7"7 3'/!5  
 g778YYv}}-F||FKKDKK4J4J$KV[[Y[_]H|++7 33G4T4TUM(<<(:(:2t{{?\?\(]_l_q_qrt_uv<#D(?D  !))00&&	   "'!//!(!=!=))	
 		
r)   )r  r  r  rd  NNNNNNNNNNNNNN)r   r    r!   r"   _tied_weights_keysrY   r  r  r   r$   r   r%   r  r'   r/   r~   r(   r   r   s   @r*   r  r    s    ._.
$.  .2372604.29=9=7;*.1526)-,0#'p
##d*p
 ))D0p
 ((4/	p

 &&-p
 $$t+p
  %//$6p
  %//$6p
 #--4p
   4'p
 ''$.p
 ((4/p
  $;p
 #Tkp
 D[p
" 
#	##p
 p
r)   r  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity
    token) for entity classification tasks, such as Open Entity.
    c                     ^  \ rS rSrU 4S jr\             SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )LukeForEntityClassificationi  c                 0  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   rX   rY   rz  rd  
num_labelsr   re   rf   rg   r   r\   
classifierr  rh   s     r*   rY   $LukeForEntityClassification.__init__  si     f%	 ++zz&"<"<=))F$6$68I8IJ 	r)   Nry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	UUSS9nUR                  SS2SSS24   nU R	                  U5      nU R                  U5      nSnU
b  U
R                  UR                  5      n
U
R                  S:X  a!  [        R                  R                  UU
5      nOM[        R                  R                  UR                  S5      U
R                  S5      R                  U5      5      nU(       d5  [        S UUUR                   UR"                  UR$                  4 5       5      $ ['        UUUR                   UR"                  UR$                  S9$ )	u	  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
    Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
    used for the single-label classification. In this case, labels should contain the indices that should be in
    `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
    loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
    and 1 indicate false and true, respectively.

Examples:

```python
>>> from transformers import AutoTokenizer, LukeForEntityClassification

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")
>>> model = LukeForEntityClassification.from_pretrained("studio-ousia/luke-large-finetuned-open-entity")

>>> text = "Beyoncé lives in Los Angeles."
>>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
>>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> predicted_class_idx = logits.argmax(-1).item()
>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
Predicted class: person
```NTr  r   r   rn   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r6  s     r*   r9  6LukeForEntityClassification.forward.<locals>.<genexpr>         pA pr<  r1   r4   r6   r   r7   )rj   r@  rd  r   rg   r  rs   rq   ndimr   r   cross_entropy binary_cross_entropy_with_logitsr   r   r'   r6   r   r7   r9   ri   ry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r  r   feature_vectorr4   r1   s                      r*   r~   #LukeForEntityClassification.forward  sj   | &1%<k$++BYBY))))%!"7"7 3'/!5  
 !99!Q'Bn50 YYv}}-F{{a}}2266B}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   *!//!(!=!=))
 	
r)   r  rg   rd  r  NNNNNNNNNNNNN)r   r    r!   r"   rY   r   r$   r   r%   r  r'   r9   r~   r(   r   r   s   @r*   r  r    s_   
  .2372604.2:>9=7;26+/)-,0#'j
##d*j
 ))D0j
 ((4/	j

 &&-j
 $$t+j
  %0047j
  %//$6j
 #--4j
 ((4/j
 !!D(j
  $;j
 #Tkj
 D[j
  
+	+!j
 j
r)   r  z
    The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity
    tokens) for entity pair classification tasks, such as TACRED.
    c                     ^  \ rS rSrU 4S jr\             SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )LukeForEntityPairClassificationi  c                 8  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  S-  UR                  S5      U l        U R                  5         g )Nr   Fr  rh   s     r*   rY   (LukeForEntityPairClassification.__init__  sp     f%	 ++zz&"<"<=))F$6$6$:F<M<MuU 	r)   Nry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r   c                 8   Ub  UOU R                   R                  nU R                  UUUUUUUUU	UUSS9n[        R                  " UR
                  SS2SSS24   UR
                  SS2SSS24   /SS9nU R                  U5      nU R                  U5      nSnU
b  U
R                  UR                  5      n
U
R                  S:X  a!  [        R                  R                  UU
5      nOM[        R                  R                  UR                  S5      U
R                  S5      R!                  U5      5      nU(       d5  [#        S UUUR$                  UR&                  UR(                  4 5       5      $ [+        UUUR$                  UR&                  UR(                  S	9$ )
u	  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size,)` or `(batch_size, num_labels)`, *optional*):
    Labels for computing the classification loss. If the shape is `(batch_size,)`, the cross entropy loss is
    used for the single-label classification. In this case, labels should contain the indices that should be in
    `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, num_labels)`, the binary cross entropy
    loss is used for the multi-label classification. In this case, labels should only contain `[0, 1]`, where 0
    and 1 indicate false and true, respectively.

Examples:

```python
>>> from transformers import AutoTokenizer, LukeForEntityPairClassification

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
>>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")

>>> text = "Beyoncé lives in Los Angeles."
>>> entity_spans = [
...     (0, 7),
...     (17, 28),
... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
>>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> predicted_class_idx = logits.argmax(-1).item()
>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
Predicted class: per:cities_of_residence
```NTr  r   r   r   rn   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r6  s     r*   r9  :LukeForEntityPairClassification.forward.<locals>.<genexpr>  r  r<  r  )rj   r@  rd  r$   r   r   rg   r  rs   rq   r  r   r   r  r  r   r   r'   r6   r   r7   r?   r  s                      r*   r~   'LukeForEntityPairClassification.forward"  s   B &1%<k$++BYBY))))%!"7"7 3'/!5  
 --aAg68X8XYZ\]_`Y`8abhi
 n50 YYv}}-F{{a}}2266B}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   .!//!(!=!=))
 	
r)   r  r  )r   r    r!   r"   rY   r   r$   r   r%   r  r'   r?   r~   r(   r   r   s   @r*   r  r    s_   
  .2372604.2:>9=7;26*.)-,0#'o
##d*o
 ))D0o
 ((4/	o

 &&-o
 $$t+o
  %0047o
  %//$6o
 #--4o
 ((4/o
   4'o
  $;o
 #Tko
 D[o
  
/	/!o
 o
r)   r  z
    The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks
    such as named entity recognition.
    c            #         ^  \ rS rSrU 4S jr\               SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4 S jj5       rSrU =r$ )LukeForEntitySpanClassificationi  c                 6  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  S-  UR                  5      U l        U R                  5         g )Nr   r  rh   s     r*   rY   (LukeForEntitySpanClassification.__init__  sn     f%	 ++zz&"<"<=))F$6$6$:F<M<MN 	r)   Nry   r   rz   rx   r   r  r  r  entity_start_positionsentity_end_positionsr{   r  r   r?  r@  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUUUUSS9nUR                  R	                  S5      nU	R                  S5      R                  SSU5      n	U	R                  UR                  R                  :w  a%  U	R                  UR                  R                  5      n	[        R                  " UR                  SU	5      nU
R                  S5      R                  SSU5      n
U
R                  UR                  R                  :w  a%  U
R                  UR                  R                  5      n
[        R                  " UR                  SU
5      n[        R                  " UUUR                  /SS9nU R                  U5      nU R                  U5      nSnUb  UR                  UR                  5      nUR                  S:X  aJ  [         R"                  R%                  UR'                  SU R(                  5      UR'                  S5      5      nOM[         R"                  R+                  UR'                  S5      UR'                  S5      R-                  U5      5      nU(       d5  [/        S UUUR0                  UR2                  UR4                  4 5       5      $ [7        UUUR0                  UR2                  UR4                  S	9$ )
u  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
entity_start_positions (`torch.LongTensor`):
    The start positions of entities in the word token sequence.
entity_end_positions (`torch.LongTensor`):
    The end positions of entities in the word token sequence.
labels (`torch.LongTensor` of shape `(batch_size, entity_length)` or `(batch_size, entity_length, num_labels)`, *optional*):
    Labels for computing the classification loss. If the shape is `(batch_size, entity_length)`, the cross
    entropy loss is used for the single-label classification. In this case, labels should contain the indices
    that should be in `[0, ..., config.num_labels - 1]`. If the shape is `(batch_size, entity_length,
    num_labels)`, the binary cross entropy loss is used for the multi-label classification. In this case,
    labels should only contain `[0, 1]`, where 0 and 1 indicate false and true, respectively.

Examples:

```python
>>> from transformers import AutoTokenizer, LukeForEntitySpanClassification

>>> tokenizer = AutoTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
>>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")

>>> text = "Beyoncé lives in Los Angeles"
# List all possible entity spans in the text

>>> word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
>>> word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
>>> entity_spans = []
>>> for i, start_pos in enumerate(word_start_positions):
...     for end_pos in word_end_positions[i:]:
...         entity_spans.append((start_pos, end_pos))

>>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
>>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
...     if predicted_class_idx != 0:
...         print(text[span[0] : span[1]], model.config.id2label[predicted_class_idx])
Beyoncé PER
Los Angeles LOC
```NTr  rn   r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r6  s     r*   r9  :LukeForEntitySpanClassification.forward.<locals>.<genexpr>$  r  r<  r  )rj   r@  rd  r=  ru   r   r   rq   rs   r$   gatherr   r   rg   r  r  r   r   r  r   r  r  r   r'   r6   r   r7   rB   )ri   ry   r   rz   rx   r   r  r  r  r  r  r{   r  r   r?  r@  r  r   r\   start_states
end_statesr  r4   r1   s                           r*   r~   'LukeForEntitySpanClassification.forward  s   ^ &1%<k$++BYBY))))%!"7"7 3'/!5  
 //44R8!7!A!A"!E!L!LRQSU`!a!((G,E,E,L,LL%;%>%>w?X?X?_?_%`"||G$=$=rCYZ3==bAHHRQ\]&&'*C*C*J*JJ#7#:#:7;T;T;[;[#\ \\'";";RAUV
L*g>^>^#_efgn50YYv}}-F {{a}}226;;r4??3SU[U`U`acUde}}EEfkkRToW]WbWbceWfWnWnouWvw (=(=w?[?[]d]o]op   .!//!(!=!=))
 	
r)   r  )NNNNNNNNNNNNNNN)r   r    r!   r"   rY   r   r$   r   r%   r  r'   rB   r~   r(   r   r   s   @r*   r  r    s   
  .2372604.29=9=7;:>8<26*.)-,0#'!G
##d*G
 ))D0G
 ((4/	G

 &&-G
 $$t+G
  %//$6G
  %//$6G
 #--4G
 !& 0 04 7G
 $..5G
 ((4/G
   4'G
  $;G
 #TkG
  D[!G
$ 
/	/%G
 G
r)   r  z
    The LUKE Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                     ^  \ rS rSrU 4S jr\             SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )LukeForSequenceClassificationi3  c                 b  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  b  UR                  OUR                  5      U l	        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   rX   rY   r  rz  rd  r   re   classifier_dropoutrf   rg   r   r\   r  r  rh   s     r*   rY   &LukeForSequenceClassification.__init__:  s      ++f%	zz)/)B)B)NF%%TZTnTn
 ))F$6$68I8IJ 	r)   Nry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	UUSS9nUR                  nU R	                  U5      nU R                  U5      nSnU
Gb  U
R                  UR                  5      n
U R                   R                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  U
R                  [        R                  :X  d  U
R                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aJ  [        5       nU R                  S:X  a&  U" UR                  5       U
R                  5       5      nOU" UU
5      nOU R                   R                  S:X  a=  [!        5       nU" UR#                  SU R                  5      U
R#                  S5      5      nO-U R                   R                  S:X  a  [%        5       nU" UU
5      nU(       d5  ['        S	 UUUR(                  UR*                  UR,                  4 5       5      $ [/        UUUR(                  UR*                  UR,                  S
9$ )a  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NTr  r   
regressionsingle_label_classificationmulti_label_classificationrn   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r6  s     r*   r9  8LukeForSequenceClassification.forward.<locals>.<genexpr>  r  r<  r  )rj   r@  rd  r  rg   r  rs   rq   problem_typer  rp   r$   rw   r   r   squeezer   r   r   r'   r6   r   r7   rE   )ri   ry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r  r   rP  r4   r1   loss_fcts                       r*   r~   %LukeForSequenceClassification.forwardF  s    V &1%<k$++BYBY))))%!"7"7 3'/!5  
  --]3/YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./ (=(=w?[?[]d]o]op   ,!//!(!=!=))
 	
r)   r  r  )r   r    r!   r"   rY   r   r$   r   r%   r  r'   rE   r~   r(   r   r   s   @r*   r  r  3  s_   
  .2372604.2:>9=7;26+/)-,0#'f
##d*f
 ))D0f
 ((4/	f

 &&-f
 $$t+f
  %0047f
  %//$6f
 #--4f
 ((4/f
 !!D(f
  $;f
 #Tkf
 D[f
  
-	-!f
 f
r)   r  z
    The LUKE Model with a token classification head on top (a linear layer on top of the hidden-states output). To
    solve Named-Entity Recognition (NER) task using LUKE, `LukeForEntitySpanClassification` is more suitable than this
    class.
    c                     ^  \ rS rSrU 4S jr\             SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )LukeForTokenClassificationi  c                 `  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  b  UR                  OUR                  5      U l	        [
        R                  " UR                  UR                  5      U l        U R                  5         g NF)r{  r  rh   s     r*   rY   #LukeForTokenClassification.__init__  s      ++f>	zz)/)B)B)NF%%TZTnTn
 ))F$6$68I8IJ 	r)   Nry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r   c                 N   Ub  UOU R                   R                  nU R                  UUUUUUUUU	UUSS9nUR                  nU R	                  U5      nU R                  U5      nSnU
bW  U
R                  UR                  5      n
[        5       nU" UR                  SU R                  5      U
R                  S5      5      nU(       d5  [        S UUUR                  UR                  UR                  4 5       5      $ [        UUUR                  UR                  UR                  S9$ )a  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
NTr  rn   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r6  s     r*   r9  5LukeForTokenClassification.forward.<locals>.<genexpr>  r  r<  r  )rj   r@  rd  r=  rg   r  rs   rq   r   r   r  r'   r6   r   r7   rH   )ri   ry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r  r   r  r4   r1   r  s                       r*   r~   "LukeForTokenClassification.forward  s.   V &1%<k$++BYBY))))%!"7"7 3'/!5  
 "33,,71YYv}}-F')HFKKDOO<fkk"oND (=(=w?[?[]d]o]op   )!//!(!=!=))
 	
r)   r  r  )r   r    r!   r"   rY   r   r$   r   r%   r  r'   rH   r~   r(   r   r   s   @r*   r  r    s_     .2372604.2:>9=7;26+/)-,0#'T
##d*T
 ))D0T
 ((4/	T

 &&-T
 $$t+T
  %0047T
  %//$6T
 #--4T
 ((4/T
 !!D(T
  $;T
 #TkT
 D[T
  
*	*!T
 T
r)   r  c            !         ^  \ rS rSrU 4S jr\              SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )LukeForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
rX   rY   r  rz  rd  r   r   r\   
qa_outputsr  rh   s     r*   rY   !LukeForQuestionAnswering.__init__  sU      ++f>	))F$6$68I8IJ 	r)   Nry   r   rz   rx   r   r  r  r  r{   start_positionsend_positionsr   r?  r@  r   c                 Z   Ub  UOU R                   R                  nU R                  UUUUUUUUU	UUSS9nUR                  nU R	                  U5      nUR                  SSS9u  nnUR                  S5      nUR                  S5      nSnU
b  Ub  [        U
R                  5       5      S:  a  U
R                  S5      n
[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nU
R                  SU5        UR                  SU5        [        US9nU" UU
5      nU" UU5      nUU-   S	-  nU(       d6  [        S
 UUUUR                  UR                  UR                  4 5       5      $ [        UUUUR                  UR                  UR                  S9$ )a  
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
NTr  r   rn   r   r   )ignore_indexr   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r6  s     r*   r9  3LukeForQuestionAnswering.forward.<locals>.<genexpr>|  s"      A  r<  )r1   rM   rN   r6   r   r7   )rj   r@  rd  r=  r  splitr  lenru   clamp_r   r'   r6   r   r7   rK   )ri   ry   r   rz   rx   r   r  r  r  r{   r  r  r   r?  r@  r  r   r  r4   rM   rN   
total_lossignored_indexr  
start_lossend_losss                             r*   r~    LukeForQuestionAnswering.forward*  s   P &1%<k$++BYBY))))%!"7"7 3'/!5  
 "331#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J   ))00&&   0%!!//!(!=!=))
 	
r)   )rd  r  r  r  )r   r    r!   r"   rY   r   r$   r   r%   r  r'   rK   r~   r(   r   r   s   @r*   r
  r
    sx   	  .2372615.2:>9=7;263715)-,0#'e
##d*e
 ))D0e
 ((4/	e

 ''$.e
 $$t+e
  %0047e
  %//$6e
 #--4e
 ((4/e
 ))D0e
 ''$.e
  $;e
 #Tke
 D[e
" 
1	1#e
 e
r)   r
  c                     ^  \ rS rSrU 4S jr\             SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )LukeForMultipleChoicei  c                 ,  > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  b  UR                  OUR                  5      U l        [        R                  " UR                  S5      U l        U R                  5         g r  )rX   rY   rz  rd  r   re   r  rf   rg   r   r\   r  r  rh   s     r*   rY   LukeForMultipleChoice.__init__  so     f%	zz)/)B)B)NF%%TZTnTn
 ))F$6$6: 	r)   Nry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r   c                 N   Ub  UOU R                   R                  nUb  UR                  S   OU	R                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnU	b1  U	R                  SU	R	                  S5      U	R	                  S5      5      OSn	Ub!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUUU	UUSS9nUR                  nU R                  U5      nU R                  U5      nUR                  SU5      nSnU
b.  U
R                  UR                  5      n
[        5       nU" UU
5      nU(       d5  [        S UUUR                  UR                  UR                  4 5       5      $ [!        UUUR                  UR                  UR                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
entity_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`):
    Indices of entity tokens in the entity vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.
entity_attention_mask (`torch.FloatTensor` of shape `(batch_size, entity_length)`, *optional*):
    Mask to avoid performing attention on padding entity token indices. Mask values selected in `[0, 1]`:

    - 1 for entity tokens that are **not masked**,
    - 0 for entity tokens that are **masked**.
entity_token_type_ids (`torch.LongTensor` of shape `(batch_size, entity_length)`, *optional*):
    Segment token indices to indicate first and second portions of the entity token inputs. Indices are
    selected in `[0, 1]`:

    - 0 corresponds to a *portion A* entity token,
    - 1 corresponds to a *portion B* entity token.
entity_position_ids (`torch.LongTensor` of shape `(batch_size, entity_length, max_mention_length)`, *optional*):
    Indices of positions of each input entity in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   rn   r   Tr  c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   r6  s     r*   r9  0LukeForMultipleChoice.forward.<locals>.<genexpr>  r;  r<  r  )rj   r@  shaper   ru   rd  r  rg   r  rs   rq   r   r'   r6   r   r7   rP   )ri   ry   r   rz   rx   r   r  r  r  r{   r  r   r?  r@  r  num_choicesr   rP  r4   reshaped_logitsr1   r  s                         r*   r~   LukeForMultipleChoice.forward  s   F &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 BLAWZ__R)<=]a
 %0 "&&r+@+E+Eb+IJ 	 %0 "&&r+@+E+Eb+IJ 	 #.  $$R)<)A)A")EGZG_G_`bGcd 	 ))))%!"7"7 3'/!5  
  --]3/ ++b+6YY556F')HOV4D 
 #))00&&
 
 
 -"!//!(!=!=))
 	
r)   )r  rg   rd  r  )r   r    r!   r"   rY   r   r$   r   r%   r  r'   rP   r~   r(   r   r   s   @r*   r  r    s_   
  .2372604.2:>9=7;26+/)-,0#'O
##d*O
 ))D0O
 ((4/	O

 &&-O
 $$t+O
  %0047O
  %//$6O
 #--4O
 ((4/O
 !!D(O
  $;O
 #TkO
 D[O
  
.	.!O
 O
r)   r  )
r  r  r  r  r
  r  r  r  rz  rc  )Hr#   r   dataclassesr   r$   r   torch.nnr   r   r    r	   rj  activationsr
   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_luker   
get_loggerr   loggerr   r,   r/   r9   r?   rB   rE   rH   rK   rP   rx  rR   r   r   r   r   r  r  r  r+  rI  rS  rZ  rc  rz  rr   r  r  r  r  r  r  r  r
  r  __all__r   r)   r*   <module>r5     s     !   A A & ' 6 9 K - 6 9 9 * 
		H	% 
 F%? F F" 
 F/ F F 
 < < <8 
 < < <& 
 <[ < <& 
 <[ < <& 
 <; < <& 
 < < <& 
 <{ < <$ 
 <K < <*D=RYY D=N(299 (Vi		 iZRYY &BII &Tryy   /* /d>
")) >
D BII "299  &/ & &4 
w
# w

w
t4" , J
) J
J
Z x
"5 x
x
v }
&9 }
}
@ U
&9 U
U
p t
$7 t
t
n c
!4 c
c
L r
2 r
 r
j ]
/ ]
 ]
@r)   