
    3j                     6   S r SSKrSSKrSSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJr  SSKJr  SSKJr  SSKJ r J!r!  SSK"J#r#  \!RH                  " \%5      r&/ SQr'\ " SS9\ " S S\5      5       5       r( " S S\RR                  5      r* " S S\RR                  5      r+ " S S\RR                  5      r, " S S\RR                  5      r- " S S\RR                  5      r. " S  S!\RR                  5      r/ " S" S#\RR                  5      r0 " S$ S%\RR                  5      r1 " S& S'\5      r2 " S( S)\RR                  5      r3 " S* S+\RR                  5      r4 " S, S-\RR                  5      r5 " S. S/\RR                  5      r6 " S0 S1\RR                  5      r7\  " S2 S3\5      5       r8\  " S4 S5\85      5       r9\ " S6S9 " S7 S8\85      5       r:\  " S9 S:\85      5       r;\  " S; S<\85      5       r<\  " S= S>\85      5       r=/ S?Qr>g)@zPyTorch CANINE model.    N)	dataclass)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputModelOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging   )CanineConfig)   +   ;   =   I   a   g   q                           a  
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.
    )custom_introc                       \ rS rSr% SrSr\R                  S-  \S'   Sr	\R                  S-  \S'   Sr
\\R                     S-  \S'   Sr\\R                     S-  \S'   Srg)	CanineModelOutputWithPooling1   aW  
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
    shallow Transformer encoder).
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
    Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
    weights are trained from the next sentence prediction (classification) objective during pretraining.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
    encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
    config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
    initial input to each Transformer encoder. The hidden states of the shallow encoders have length
    `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
    `config.downsampling_rate`.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
    num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
    config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
    attention softmax, used to compute the weighted average in the self-attention heads.
Nlast_hidden_statepooler_outputhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r-   torchFloatTensor__annotations__r.   r/   tupler0   __static_attributes__r1       d/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/models/canine/modeling_canine.pyr+   r+   1   sh    , 37u((4/6.2M5$$t+259M5**+d2926Je''(4/6r<   r+   c                      ^  \ rS rSrSrU 4S jrS\S\4S jrS\S\S\4S jr    SS
\	R                  S	-  S\	R                  S	-  S\	R                  S	-  S\	R                  S	-  S\	R                  4
S jjrSrU =r$ )CanineEmbeddingsV   z<Construct the character, position and token_type embeddings.c           	        > [         TU ]  5         Xl        UR                  UR                  -  n[        UR                  5       H3  nSU 3n[        X[        R                  " UR                  U5      5        M5     [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                   5      U l        U R%                  S[&        R(                  " UR*                  5      R-                  S5      SS9  g )NHashBucketCodepointEmbedder_epsposition_idsr   F)
persistent)super__init__confighidden_sizenum_hash_functionsrangesetattrr   	Embeddingnum_hash_bucketschar_position_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr7   arangemax_position_embeddingsexpand)selfrK   shard_embedding_sizeiname	__class__s        r=   rJ   CanineEmbeddings.__init__Y   s     &11V5N5NNv001A1!5DDV-D-DFZ [\ 2 )+V5L5LfN`N`(a%%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
r<   
num_hashesnum_bucketsc                     U[        [        5      :  a  [        S[        [        5       35      e[        SU n/ nU H  nUS-   U-  U-  nUR                  U5        M!     U$ )aW  
Converts ids to hash bucket ids via multiple hashing.

Args:
    input_ids: The codepoints or other IDs to be hashed.
    num_hashes: The number of hash functions to use.
    num_buckets: The number of hash buckets (i.e. embeddings in each table).

Returns:
    A list of tensors, each of which is the hash bucket IDs from one hash function.
z`num_hashes` must be <= Nr   )len_PRIMES
ValueErrorappend)r^   	input_idsrd   re   primesresult_tensorsprimehasheds           r=   _hash_bucket_tensors%CanineEmbeddings._hash_bucket_tensorsn   sk     G$7G~FGG*%E 1}-<F!!&)  r<   embedding_sizec                     X#-  S:w  a  [        SU SU S35      eU R                  XUS9n/ n[        U5       H,  u  pxSU 3n	[        X	5      " U5      n
UR	                  U
5        M.     [
        R                  " USS9$ )	zDConverts IDs (e.g. codepoints) into embeddings via multiple hashing.r   zExpected `embedding_size` (z) % `num_hashes` (z) == 0)rd   re   rB   rG   dim)ri   rp   	enumerategetattrrj   r7   cat)r^   rk   rr   rd   re   hash_bucket_tensorsembedding_shardsr`   hash_bucket_idsra   shard_embeddingss              r=   _embed_hash_buckets$CanineEmbeddings._embed_hash_buckets   s    &!+:>:JJ\]g\hhnopp"77	fq7r"+,?"@A1!5D&t2?C##$45 #A
 yy)r22r<   Nrk   token_type_idsrE   inputs_embedsreturnc                 @   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUc8  [        R                  " U[        R                  U R                  R
                  S9nUcO  U R                  XR                  R                  U R                  R                  U R                  R                  5      nU R                  U5      nXG-   nU R                  U5      n	X-  nU R                  U5      nU R                  U5      nU$ )NrG   r   dtypedevice)sizerE   r7   zeroslongr   r}   rK   rL   rM   rQ   rT   rR   rU   rY   )
r^   rk   r   rE   r   input_shape
seq_lengthrT   
embeddingsposition_embeddingss
             r=   forwardCanineEmbeddings.forward   s     #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  44;;22DKK4R4RTXT_T_TpTpM !% : :> J":
";;LI)
^^J/
\\*-
r<   )rU   rR   rK   rY   rT   )NNNN)r2   r3   r4   r5   r6   rJ   intrp   r}   r7   
LongTensorr8   r   r;   __classcell__rb   s   @r=   r?   r?   V   s    F
*# C .3S 3c 3`c 3  .2260426!##d*! ((4/! &&-	!
 ((4/! 
		! !r<   r?   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )CharactersToMolecules   zeConvert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.c                 6  > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S9U l        [        UR                     U l	        [        R                  " UR                  UR                  S9U l
        g )Nin_channelsout_channelskernel_sizestriderC   )rI   rJ   r   Conv1drL   downsampling_rateconvr
   
hidden_act
activationrU   rV   r^   rK   rb   s     r=   rJ   CharactersToMolecules.__init__   st    II**++00++	
	 !!2!23f&8&8f>S>STr<   char_encodingr   c                 0   US S 2SS2S S 24   n[         R                  " USS5      nU R                  U5      n[         R                  " USS5      nU R                  U5      nUS S 2SS2S S 24   n[         R                  " X$/SS9nU R                  U5      nU$ )Nr   r      rG   rt   )r7   	transposer   r   rx   rU   )r^   r   cls_encodingdownsampleddownsampled_truncatedresults         r=   r   CharactersToMolecules.forward   s    $Q!QY/ q!<ii.ook1a8ook2 !,AqtQJ 7 L@aH'r<   )rU   r   r   r2   r3   r4   r5   r6   rJ   r7   Tensorr   r;   r   r   s   @r=   r   r      s,    oUU\\ ell  r<   r   c                      ^  \ rS rSrSrU 4S jr S
S\R                  S\R                  S-  S\R                  4S jjrS	r	U =r
$ )ConvProjection   z
Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
characters.
c                 ~  > [         TU ]  5         Xl        [        R                  " UR
                  S-  UR
                  UR                  SS9U l        [        UR                     U l
        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   r   r   rC   )rI   rJ   rK   r   r   rL   upsampling_kernel_sizer   r
   r   r   rU   rV   rW   rX   rY   r   s     r=   rJ   ConvProjection.__init__   s    II**Q.++55	
	 !!2!23f&8&8f>S>STzz&"<"<=r<   Ninputsfinal_seq_char_positionsr   c                    [         R                  " USS5      nU R                  R                  S-
  nUS-  nX4-
  n[        R
                  " XE4S5      nU R                  U" U5      5      n[         R                  " USS5      nU R                  U5      nU R                  U5      nU R                  U5      nUnUb  [        S5      eUn	U	$ )Nr   r   r   z,CanineForMaskedLM is currently not supported)r7   r   rK   r   r   ConstantPad1dr   r   rU   rY   NotImplementedError)
r^   r   r   	pad_totalpad_begpad_endpadr   final_char_seq	query_seqs
             r=   r   ConvProjection.forward   s     A.
 KK66:	q.%1153v;'A.('f%#/
 &&TUU&Ir<   )rU   r   rK   r   rY   Nr   r   s   @r=   r   r      sI    
>  9="" #(,,"5" 
	" "r<   r   c                      ^  \ rS rSrU 4S jr  SS\R                  S\R                  S\R                  S-  S\S-  S\	\R                  \R                  S-  4   4
S	 jjr
S
rU =r$ )CanineSelfAttentioni  c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        g )Nr   rr   zThe hidden size (z6) is not a multiple of the number of attention heads ())rI   rJ   rL   num_attention_headshasattrri   r   attention_head_sizeall_head_sizer   LinearquerykeyvaluerW   attention_probs_dropout_probrY   r   s     r=   rJ   CanineSelfAttention.__init__  s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr<   Nfrom_tensor	to_tensorattention_maskoutput_attentionsr   c                    UR                   u  pVnU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      n	U R                  U5      R                  USU R                  U R                  5      R                  SS5      n
[        R                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nUbg  UR                  S:X  aS  [        R                  " USS9nSUR                  5       -
  [        R                  " UR                   5      R"                  -  nX-   n[$        R&                  R)                  USS9nU R+                  U5      n[        R                  " X5      nUR-                  SSSS5      R/                  5       nUR1                  5       S S U R2                  4-   nUR                  " U6 nU(       a  X4nU$ U4nU$ )	NrG   r   r   r   rt   g      ?r   )shaper   viewr   r   r   r   r   r7   matmulmathsqrtndim	unsqueezefloatfinfor   minr   
functionalsoftmaxrY   permute
contiguousr   r   )r^   r   r   r   r   
batch_sizer   _	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                   r=   r   CanineSelfAttention.forward.  s    %0$5$5!
 HHYT*b$":":D<T<TUYq!_ 	 JJy!T*b$":":D<T<TUYq!_ 	 JJ{#T*b$":":D<T<TUYq!_ 	 !<<5H5HR5PQ+dii8P8P.QQ%""a'!&Q!G #&(<(<(>">%++N^NdNdBeBiBi!i/@ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD6G=2 O\M]r<   )r   r   rY   r   r   r   r   NF)r2   r3   r4   r5   rJ   r7   r   r8   boolr:   r   r;   r   r   s   @r=   r   r     sy    G, 48).;\\; <<; ))D0	;
  $;; 
u||U\\D00	1; ;r<   r   c                      ^  \ rS rSrU 4S jrS\\R                     S\R                  S\\R                  \R                  4   4S jrSr	U =r
$ )CanineSelfOutputil  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g NrC   )rI   rJ   r   r   rL   denserU   rV   rW   rX   rY   r   s     r=   rJ   CanineSelfOutput.__init__m  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r<   r/   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   rY   rU   r^   r/   r   s      r=   r   CanineSelfOutput.forwards  s7     

=1]3}'CDr<   rU   r   rY   r2   r3   r4   r5   rJ   r:   r7   r8   r   r;   r   r   s   @r=   r   r   l  sQ    >"5#4#45EJEVEV	u  %"3"33	4 r<   r   c                      ^  \ rS rSrSr       SS\S\S\S\S\S\4U 4S	 jjjr  SS\\	R                     S\	R                  S
-  S\S
-  S\\	R                  \	R                  S
-  4   4S jjrSrU =r$ )CanineAttentioni|  aB  
Additional arguments related to local attention:

    - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
    - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to
      attend
    to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`,
    *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all
    positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The
    width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to
    128) -- The number of elements to skip when moving to the next block in `from_tensor`. -
    **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
    *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to
    skip when moving to the next block in `to_tensor`.
always_attend_to_first_positionfirst_position_attends_to_allattend_from_chunk_widthattend_from_chunk_strideattend_to_chunk_widthattend_to_chunk_stridec	                    > [         T	U ]  5         [        U5      U l        [	        U5      U l        X l        XV:  a  [        S5      eXx:  a  [        S5      eX0l        X@l	        XPl
        X`l        Xpl        Xl        g )Nze`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped.z``attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped.)rI   rJ   r   r^   r   outputlocalri   r   r   r  r  r  r  
r^   rK   r  r   r   r  r  r  r  rb   s
            r=   rJ   CanineAttention.__init__  s     	'/	&v. 
"=w  !9r  0O,-J*'>$(@%%:"&<#r<   Nr/   r   r   r   c                    U R                   (       d  U R                  XX#5      nUS   nGOUR                  S   =pgU=p/ n
U R                  (       a  U
R	                  S5        SnOSn[        XU R                  5       H-  n[        XlU R                  -   5      nU
R	                  X45        M/     / nU R                  (       a  UR	                  SU45        [        SXpR                  5       H-  n[        X|U R                  -   5      nUR	                  X45        M/     [        U
5      [        U5      :w  a  [        SU
 SU
 S35      e/ n/ n[        X5       H  u  u  nnu  nnUS S 2UU2S S 24   nU	S S 2UU2S S 24   nUS S 2UU2UU24   nU R                  (       aJ  US S 2UU2SS24   n[        R                   " UU/SS9nU	S S 2SS2S S 24   n[        R                   " UU/SS9nU R                  UUUU5      nUR	                  US   5        U(       d  M  UR	                  US   5        M     [        R                   " USS9nU R#                  XQ5      nU4nU R                   (       d
  UWSS  -   nU$ U[%        W5      -   nU$ )	Nr   r   )r   r   z/Expected to have same number of `from_chunks` (z) and `to_chunks` (z). Check strides.r   rt   )r  r^   r   r   rj   rN   r  r   r  r  r  rg   ri   zipr   r7   rx   r  r:   )r^   r/   r   r   self_outputsattention_outputfrom_seq_lengthto_seq_lengthr   r   from_chunks
from_startchunk_start	chunk_end	to_chunksattention_output_chunksattention_probs_chunksfrom_endto_startto_endfrom_tensor_chunkto_tensor_chunkattention_mask_chunkcls_attention_maskcls_positionattention_outputs_chunkr   s                              r=   r   CanineAttention.forward  s    zz99]>eL+A.;.A.A!.DDO&33K K11""6* 

$Z$B_B_`t?[?[1[\	""K#;<  a
 I11  !]!34$Q7R7RST=W=W/WX	  +!9:  T ;3y>1 Ek] S$$/=0AC  ')#%'">A+>Y:&X(:6$/:h3F0I$J!"+Ax,A"B (6aH9LhW]o6]'^$77)7:h;NPQRSPS8S)T&+0996HJ^5_ef+g(#,Q!QY#7L&+ii0OUV&WO*.))%8LN_+' (../Fq/IJ$$*112I!2LM% ?Z(  %yy)@aH;;'7G#%zzQR 00G  &< ==Gr<   )	r   r  r  r  r  r   r  r  r^   FFF   r"  r"  r"  r   )r2   r3   r4   r5   r6   r   r   rJ   r:   r7   r8   r   r;   r   r   s   @r=   r   r   |  s    & 05.3'*(+%(&)= *.	=
 (,= "%= #&=  #= !$= =F 48).	GU../G ))D0G  $;	G
 
u  %"3"3d"::	;G Gr<   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )CanineIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rI   rJ   r   r   rL   intermediate_sizer   
isinstancer   strr
   intermediate_act_fnr   s     r=   rJ   CanineIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r<   r/   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r)  r^   r/   s     r=   r   CanineIntermediate.forward   s&    

=100?r<   r,  )
r2   r3   r4   r5   rJ   r7   r8   r   r;   r   r   s   @r=   r$  r$    s,    9U%6%6 5;L;L  r<   r$  c                      ^  \ rS rSrU 4S jrS\\R                     S\R                  S\R                  4S jrSr	U =r
$ )CanineOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rI   rJ   r   r   r&  rL   r   rU   rV   rW   rX   rY   r   s     r=   rJ   CanineOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r<   r/   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r=   r   CanineOutput.forward  s5    

=1]3}'CDr<   r   r   r   s   @r=   r0  r0    s?    >U5+<+<%= UM^M^ chctct  r<   r0  c                      ^  \ rS rSrU 4S jr  SS\\R                     S\R                  S-  S\S-  S\\R                  \R                  S-  4   4S jjr	S	 r
S
rU =r$ )CanineLayeri  c	           
         > [         T	U ]  5         UR                  U l        SU l        [	        UUUUUUUU5      U l        [        U5      U l        [        U5      U l	        g Nr   )
rI   rJ   chunk_size_feed_forwardseq_len_dimr   	attentionr$  intermediater0  r  r  s
            r=   rJ   CanineLayer.__init__  se     	'-'E'E$(+)#$!"	
 /v6"6*r<   Nr/   r   r   r   c                     U R                  UUUS9nUS   nUSS  n[        U R                  U R                  U R                  U5      nU4U-   nU$ )N)r   r   r   )r;  r   feed_forward_chunkr9  r:  )r^   r/   r   r   self_attention_outputsr  r   layer_outputs           r=   r   CanineLayer.forward0  ss     "&/ "0 "

 2!4(,0##T%A%A4CSCSUe
  /G+r<   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r<  r  )r^   r  intermediate_outputrA  s       r=   r?  CanineLayer.feed_forward_chunkF  s)    "//0@A{{#6Ir<   )r;  r9  r<  r  r:  r   )r2   r3   r4   r5   rJ   r:   r7   r8   r   r   r?  r;   r   r   s   @r=   r6  r6    sz    +< 48).	U../ ))D0  $;	
 
u  %"3"3d"::	;, r<   r6  c                      ^  \ rS rSr       SU 4S jjr    SS\\R                     S\R                  S-  S\S-  S\S-  S\S-  S	\\	-  4S
 jjr
SrU =r$ )CanineEncoderiL  c	                    > [         T
U ]  5         Xl        [        R                  " [        UR                  5       V	s/ s H  n	[        UUUUUUUU5      PM     sn	5      U l        SU l	        g s  sn	f r   )
rI   rJ   rK   r   
ModuleListrN   num_hidden_layersr6  layergradient_checkpointing)r^   rK   r  r   r   r  r  r  r  r   rb   s             r=   rJ   CanineEncoder.__init__M  s}     	]] v778 9A 31+,)*	 9

 ',#s   A-Nr/   r   r   output_hidden_statesreturn_dictr   c                 $   U(       a  SOS nU(       a  SOS n[        U R                  5       H0  u  pU(       a  Xa4-   nU	" XU5      n
U
S   nU(       d  M(  XzS   4-   nM2     U(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nr1   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r1   .0vs     r=   	<genexpr>(CanineEncoder.forward.<locals>.<genexpr>  s     m$[q$[   	)r-   r/   r0   )rv   rK  r:   r   )r^   r/   r   r   rN  rO  all_hidden_statesall_self_attentionsr`   layer_modulelayer_outputss              r=   r   CanineEncoder.forwardk  s     #7BD$5b4(4OA#$58H$H!(HYZM)!,M  &91=M<O&O#  5   14D Dm]GZ$[mmm++*
 	
r<   )rK   rL  rK  r!  )NFFT)r2   r3   r4   r5   rJ   r:   r7   r8   r   r   r   r;   r   r   s   @r=   rG  rG  L  s     (-&+ #!$!",B 48).,1#'
U../
 ))D0
  $;	

 #Tk
 D[
 
	 
 
r<   rG  c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )CaninePooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rI   rJ   r   r   rL   r   Tanhr   r   s     r=   rJ   CaninePooler.__init__  s9    YYv1163E3EF
'')r<   r/   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r   )r^   r/   first_token_tensorpooled_outputs       r=   r   CaninePooler.forward  s6     +1a40

#566r<   )r   r   r   r   s   @r=   r^  r^    s1    $
U5+<+<%= %BSBS  r<   r^  c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )CaninePredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )rI   rJ   r   r   rL   r   r'  r   r(  r
   transform_act_fnrU   rV   r   s     r=   rJ   &CaninePredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr<   r/   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   ri  rU   r-  s     r=   r   %CaninePredictionHeadTransform.forward  s4    

=1--m<}5r<   )rU   r   ri  r   r   s   @r=   rg  rg    s2    UU5+<+<%= %BSBS  r<   rg  c                   h   ^  \ rS rSrU 4S jrS\\R                     S\R                  4S jrSr	U =r
$ )CanineLMPredictionHeadi  c                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        g )NT)bias)rI   rJ   rg  	transformr   r   rL   
vocab_sizedecoder	Parameterr7   r   rp  r   s     r=   rJ   CanineLMPredictionHead.__init__  s[    6v> yy!3!3V5F5FTRLLV->->!?@	r<   r/   r   c                 J    U R                  U5      nU R                  U5      nU$ r   )rq  rs  r-  s     r=   r   CanineLMPredictionHead.forward  s$    }5]3r<   )rp  rs  rq  r   r   s   @r=   rn  rn    s2    AU5+<+<%= %BSBS  r<   rn  c                   n   ^  \ rS rSrU 4S jrS\\R                     S\\R                     4S jrSr	U =r
$ )CanineOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g r   )rI   rJ   rn  predictionsr   s     r=   rJ   CanineOnlyMLMHead.__init__  s    1&9r<   sequence_outputr   c                 (    U R                  U5      nU$ r   r{  )r^   r}  prediction_scoress      r=   r   CanineOnlyMLMHead.forward  s     !,,_=  r<   r  )r2   r3   r4   r5   rJ   r:   r7   r   r   r;   r   r   s   @r=   ry  ry    s6    :!u||,! 
u||	! !r<   ry  c                   <   ^  \ rS rSr% \\S'   SrSrU 4S jrSr	U =r
$ )CaninePreTrainedModeli  rK   canineTc                   > [         TU ]  U5        [        U[        5      (       a\  [        R
                  " UR                  [        R                  " UR                  R                  S   5      R                  S5      5        g g )NrG   rF   )rI   _init_weightsr'  r?   initcopy_rE   r7   r[   r   r]   )r^   modulerb   s     r=   r  #CaninePreTrainedModel._init_weights  s^    f%f.//JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 0r<   r1   )r2   r3   r4   r5   r   r9   base_model_prefixsupports_gradient_checkpointingr  r;   r   r   s   @r=   r  r    s!     &*#i ir<   r  c                   |  ^  \ rS rSrSU 4S jjrS rS\R                  S\4S jr	S\R                  S\S	\R                  4S
 jr
\        SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\S-  S\S-  S\S-  S	\\-  4S jj5       rSrU =r$ )CanineModeli  c                   > [         TU ]  U5        Xl        [        R                  " U5      nSUl        [        U5      U l        [        USSSUR                  UR                  UR                  UR                  S9U l
        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l        U(       a  [#        U5      OSU l        U R'                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
r   TF)r  r   r   r  r  r  r  N)rI   rJ   rK   copydeepcopyrJ  r?   char_embeddingsrG  local_transformer_strideinitial_char_encoderr   chars_to_moleculesencoderr   
projectionfinal_char_encoderr^  pooler	post_init)r^   rK   add_pooling_layershallow_configrb   s       r=   rJ   CanineModel.__init__  s    
 	 v.+,(/7$1,1*/$*$C$C%+%D%D"("A"A#)#B#B	%
! #8"?$V,(0"/"?.?l6*T 	r<   c                    UR                   S   UR                   S   pCUR                   S   n[        R                  " X#SU45      R                  5       n[        R                  " X4S4[        R
                  UR                  S9nXb-  nU$ )a  
Create 3D attention mask from a 2D tensor mask.

Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
r   r   )r   r   r   )r   r7   reshaper   onesfloat32r   )r^   r   to_maskr   r  r  broadcast_onesmasks           r=   )_create_3d_attention_mask_from_input_mask5CanineModel._create_3d_attention_mask_from_input_mask  s     '2&7&7&:K<M<Ma<POa(--a)GHNNP
 *q)IQVQ^Q^gnguguv 'r<   char_attention_maskr   c                     UR                   u  p4[        R                  " XSU45      n[        R                  R	                  X"S9" UR                  5       5      n[        R                  " USS9nU$ )z[Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.r   )r   r   rG   rt   )r   r7   r  r   	MaxPool1dr   squeeze)r^   r  r   r   char_seq_lenpoolable_char_maskpooled_molecule_maskmolecule_attention_masks           r=   _downsample_attention_mask&CanineModel._downsample_attention_mask  sp     $7#<#< 
"]]+>QP\@]^  %xx11>O1j$$& 

 #(--0D""M&&r<   	moleculeschar_seq_lengthr   c                     U R                   R                  nUSS2SS2SS24   n[        R                  " XCSS9nUSS2SS2SS24   nX#-  n[        R                  " UXs-   SS9n[        R                  " XX/SS9$ )zDRepeats molecules to make them the same length as the char sequence.Nr   r   )repeatsru   rG   rt   )rK   r   r7   repeat_interleaverx   )	r^   r  r  ratemolecules_without_extra_clsrepeatedlast_moleculeremainder_lengthremainder_repeateds	            r=   _repeat_moleculesCanineModel._repeat_molecules(  s     {{,,&/12q&9#**+FZ\] "!RS!),*1"44$+	
 yy(7R@@r<   Nrk   r   r   rE   r   r   rN  rO  c	                 6   Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOS n
U(       a  SOS nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[	        S5      eUu  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUc$  [        R                  " U[        R                  US9nU R                  X R                   R                  S9nU R                  UUUUS9nU R                  Ub  UOUU5      nU R!                  UUUUS	9nUR"                  nU R%                  U5      n['        U R                   US S 2S
S2S S 24   UR)                  S5      S9nU R+                  UUUUUS9nUS
   nU R,                  b  U R-                  U5      OS nU R/                  UUS   S9n[        R0                  " UU/SS9nU R3                  U5      n['        U R                   UUS9nU R5                  UUUUS	9nUR"                  nU(       a7  U(       a  UR6                  OUS   nU
UR6                  -   U-   UR6                  -   n
U(       a7  U(       a  UR8                  OUS   nUUR8                  -   U-   UR8                  -   nU(       d  UU4nU[;        S X4 5       5      -  nU$ [=        UUU
US9$ )Nr1   zDYou cannot specify both input_ids and inputs_embeds at the same timerG   z5You have to specify either input_ids or inputs_embeds)r   r   )r   )rk   rE   r   r   )r   r   rN  r   r   )rK   r   r   )r   r   rN  rO  )r  rt   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r1   rR  s     r=   rU  &CanineModel.forward.<locals>.<genexpr>  s     a'O!AA'OrW  )r-   r.   r/   r0   )rK   r   rN  rO  ri   %warn_if_padding_and_no_attention_maskr   r   r7   r  r   r   r  r   r  r  r  r-   r  r   r  r  r  r  rx   r  r  r/   r0   r:   r+   ) r^   rk   r   r   rE   r   r   rN  rO  kwargsrX  rY  r   r   r   r   r  input_char_embeddingsr  init_chars_encoder_outputsinput_char_encodinginit_molecule_encodingencoder_outputsmolecule_sequence_outputrd  repeated_moleculesconcatr}  final_chars_encoder_outputsdeep_encoder_hidden_statesdeep_encoder_self_attentionsr  s                                    r=   r   CanineModel.forwardA  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 #7BD$5b4%0%<k$++BYBY ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!"[[EJJvVN #'"A"Akk.K.K #B #

 !% 4 4%)'	 !5 !
 #LL".IM>
 &*%>%>!./!5	 &? &
" 9JJ  "&!8!89L!M";;;0AaC;2::1=#
 ,,"2/!5# ' 
 $31#5 AEAX$<=^b "334L^ijl^m3n /1CD"M //&12;;))
 '+&=&=)/!5	 '> '
# 6GGJU)F)F[jkl[m&!,::;,- .;;<  IT?+E+EZijlZm(#,778./ .889   %}5Fea(9'OaaaFM+-'+*	
 	
r<   )r  r  rK   r  r  r  r  r  )T)NNNNNNNN)r2   r3   r4   r5   rJ   r  r7   r   r   r  r  r   r   r8   r   r:   r+   r   r;   r   r   s   @r=   r  r    s)    D6'ell '_b '"A5<< A# ARWR^R^ A2  .237260426)-,0#'\
##d*\
 ))D0\
 ((4/	\

 &&-\
 ((4/\
  $;\
 #Tk\
 D[\
 
-	-\
 \
r<   r  z
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )CanineForSequenceClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   rI   rJ   
num_labelsr  r  r   rW   rX   rY   r   rL   
classifierr  r   s     r=   rJ   (CanineForSequenceClassification.__init__  i      ++!&)zz&"<"<=))F$6$68I8IJ 	r<   Nrk   r   r   rE   r   labelsr   rN  rO  r   c
                 P   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r   rE   r   r   rN  rO  r   
regressionsingle_label_classificationmulti_label_classificationrG   r   losslogitsr/   r0   )rK   rO  r  rY   r  problem_typer  r   r7   r   r   r   r  r   r   r   r   r/   r0   )r^   rk   r   r   rE   r   r  r   rN  rO  r  r   rd  r  r  loss_fctr  s                    r=   r   'CanineForSequenceClassification.forward  s   ( &1%<k$++BYBY++))%'/!5#  	
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r<   r  r  rY   r  	NNNNNNNNN)r2   r3   r4   r5   rJ   r   r7   r   r8   r   r:   r   r   r;   r   r   s   @r=   r  r    s    	  .237260426*.)-,0#'D
##d*D
 ))D0D
 ((4/	D

 &&-D
 ((4/D
   4'D
  $;D
 #TkD
 D[D
 
)	)D
 D
r<   r  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )CanineForMultipleChoicei;  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g r8  )rI   rJ   r  r  r   rW   rX   rY   r   rL   r  r  r   s     r=   rJ    CanineForMultipleChoice.__init__=  sV     !&)zz&"<"<=))F$6$6: 	r<   Nrk   r   r   rE   r   r  r   rN  rO  r   c
                 X   U	b  U	OU R                   R                  n	Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	S9nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   rG   r   r  r   r  )rK   rO  r   r   r   r  rY   r  r   r   r/   r0   )r^   rk   r   r   rE   r   r  r   rN  rO  r  num_choicesr   rd  r  reshaped_logitsr  r  r  s                      r=   r   CanineForMultipleChoice.forwardG  s   X &1%<k$++BYBY,5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ++))%'/!5#  	
  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r<   )r  r  rY   r  )r2   r3   r4   r5   rJ   r   r7   r   r8   r   r:   r   r   r;   r   r   s   @r=   r  r  ;  s      .237260426*.)-,0#'W
##d*W
 ))D0W
 ((4/	W

 &&-W
 ((4/W
   4'W
  $;W
 #TkW
 D[W
 
*	*W
 W
r<   r  c                   .  ^  \ rS rSrU 4S jr\         SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )CanineForTokenClassificationi  c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   r  r   s     r=   rJ   %CanineForTokenClassification.__init__  r  r<   Nrk   r   r   rE   r   r  r   rN  rO  r   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

Example:

```python
>>> from transformers import AutoTokenizer, CanineForTokenClassification
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
>>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

>>> inputs = tokenizer(
...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
... )

>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_token_class_ids = logits.argmax(-1)

>>> # Note that tokens are classified rather then input words which means that
>>> # there might be more predicted token classes than words.
>>> # Multiple token classes might account for the same word
>>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
>>> predicted_tokens_classes  # doctest: +SKIP
```

```python
>>> labels = predicted_token_class_ids
>>> loss = model(**inputs, labels=labels).loss
>>> round(loss.item(), 2)  # doctest: +SKIP
```Nr  r   rG   r   r  )rK   rO  r  rY   r  r   r   r  r   r/   r0   )r^   rk   r   r   rE   r   r  r   rN  rO  r  r   r}  r  r  r  r  s                    r=   r   $CanineForTokenClassification.forward  s    ` &1%<k$++BYBY++))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r<   r  r  )r2   r3   r4   r5   rJ   r   r7   r   r8   r   r:   r   r   r;   r   r   s   @r=   r  r    s    	  .237260426*.)-,0#'O
##d*O
 ))D0O
 ((4/	O

 &&-O
 ((4/O
   4'O
  $;O
 #TkO
 D[O
 
&	&O
 O
r<   r  c                   N  ^  \ rS rSrU 4S jr\          SS\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S\R                  S-  S	\R                  S-  S
\R                  S-  S\	S-  S\	S-  S\	S-  S\
\-  4S jj5       rSrU =r$ )CanineForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r   )
rI   rJ   r  r  r  r   r   rL   
qa_outputsr  r   s     r=   rJ   #CanineForQuestionAnswering.__init__  sS      ++!&)))F$6$68I8IJ 	r<   Nrk   r   r   rE   r   start_positionsend_positionsr   rN  rO  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUU	U
S9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      nUR                  S5      nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5        UR                  SU5        [        US9nU" X5      nU" UU5      nUU-   S-  nU
(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r   rG   rt   )ignore_indexr   )r  start_logits
end_logitsr/   r0   )rK   rO  r  r  splitr  rg   r   clamp_r   r   r/   r0   )r^   rk   r   r   rE   r   r  r  r   rN  rO  r  r   r}  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r=   r   "CanineForQuestionAnswering.forward  s    &1%<k$++BYBY++))%'/!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r<   )r  r  r  )
NNNNNNNNNN)r2   r3   r4   r5   rJ   r   r7   r   r8   r   r:   r   r   r;   r   r   s   @r=   r  r    s     .2372604263715)-,0#'=
##d*=
 ))D0=
 ((4/	=

 &&-=
 ((4/=
 ))D0=
 ''$.=
  $;=
 #Tk=
 D[=
 
-	-=
 =
r<   r  )r  r  r  r  r6  r  r  )?r6   r  r   dataclassesr   r7   r   torch.nnr   r   r    r	   r  activationsr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   configuration_caniner   
get_loggerr2   loggerrh   r+   Moduler?   r   r   r   r   r   r$  r0  r6  rG  r^  rg  rn  ry  r  r  r  r  r  r  __all__r1   r<   r=   <module>r     sE      !   A A & ! 6 9  . 6 , . 
		H	% U  7; 7 7:^ryy ^B)BII )X5RYY 5pN")) Nbryy  xbii xv 299 5, 5p=
BII =
@299 BII "RYY &
!		 
! iO i i E
' E
 E
P Q
&; Q
Q
h c
3 c
 c
L \
#8 \
 \
~ I
!6 I
 I
Xr<   