
    3j                       S r SSKrSSKrSSKrSSKJr  SSKJr  SSKJ	r	  SSK
Jr  SSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.J/r/J0r0J1r1J2r2  SSK3J4r4J5r5J6r6  \6Rn                  " \85      r9Sr:Sr;Sr<Sr=Sr>\,S-  r,\\ \!\"S.r?\:\=S.r@\5" \,5       " S S\/5      5       rA\ArBg)z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)copyfile)Any)is_offline_mode)
AddedToken
processors)Encoding)	Tokenizer)Decoder)BPEUnigram)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainer)cached_file   )SpmConverter)convert_gguf_tokenizer)load_gguf_checkpoint)INIT_TOKENIZER_DOCSTRINGBatchEncodingPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategygenerate_merges)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)r   r   	WordLevel	WordPiece)tokenizer_file
vocab_filec            )       ^  ^  \ rS rSrSr\rSrSr\	SNS j5       r
U 4S jr\S\4S j5       r\S\4S	 j5       rSOS
\S\S-  S\\   4S jjrS r\S 5       r\S 5       r\R,                  S 5       r\R,                  S 5       rS r\S\4S j5       rS\\\4   4S jr\S\\\4   4S j5       r\S\\\4   4S j5       r\S\\\4   4S j5       r\r \r!S\\\4   4S jr"S\4S jr#S\4S jr$\S\%4S j5       r&\S\'4S j5       r(       SPS\)S\S-  S \S-  S!\S"\S#\S$\S%\S\\\\*4   \+\)   4   4S& jjr,S'\S\4S( jr-S)\S\S-  4S* jr.SNS+\+\\-     S\4S, jjr/SNS-\S\4S. jjr0SNS/\\+\   -  S0\S\\+\   -  4S1 jjr1SQS2\S-\S-  S3\S\+\   4S4 jjr2S5\3S6\4S7\S8\S9\S-  S:\S-  4S; jr5SS\3Rl                  \4Rn                  SS<SSSSSSSSSSSS4S2\8\9-  \+\8   -  \+\9   -  S=\8\9-  \+\8   -  \+\9   -  S-  S3\S5\3S6\4S7\S-  S8\S>\S9\S-  S:\S-  S?\S-  S\S-  S \S-  S!\S"\S#\S$\S%\S@\S-  S\:4(SA jjr;SB\+\   S\4SC jr<  SRSD\\+\   -  S0\SE\S-  S\4SF jjr=  SSS
\\>R~                  -  SG\\SH4   SI\S-  S\S-  S\\SH4   4
SJ jjr@   STSK jrA\	       SUSL j5       rBSMrCU =rD$ )VTokenizersBackendS   a5  
Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

Handles all the shared methods for tokenization and special tokens, as well as methods for
downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
NFc                 "  ^( [        U5      nUR                  SS5      nUb_  [        R                  R	                  U5      (       a;  U [
        L d  SU R                  ;  d  U(       a  [        R                  " U5      US'   U$ UGb  [        R                  R	                  U5      (       Ga  [        USS9 n[        R                  " U5      nSSS5        WR                  S0 5      R                  S5      nUS	;  a]  [        U5      n[        US   5      n	0 U	S
'   US:X  a  / U	S'   XS'   / US'   [        R                  " [        R                  " U5      5      n
O[        R                  " U5      n
U
R                  US'   U
R                   US'   U
R"                  US'   U
R"                  b  U
R"                  US'   U
R                   b  U
R                   US'   UR                  S5      nU(       aw  UR                  SS5      S:X  a  US   nO[%        U[&        5      (       d  U/nU H=  nUR                  S5      S:X  d  M  SU;   d  M"  SSKnUR+                  US   5      US'     O   UR                  S0 5      R                  S
S5      nU R,                  c0  [%        U[&        5      (       a  ['        [/        [0        U5      5      nGO0U R,                  R2                  S:X  aV  [%        U[&        5      (       a@  U(       a9  [%        US   [&        [0        45      (       a  U Vs/ s H  n[1        U5      PM     nnOU R,                  R2                  S:X  a"  [5        U5       VVs0 s H	  u  nnUU_M     nnnOU R,                  R2                  S:X  d  U R,                  R2                  S:X  aP  [%        U[&        5      (       a;  [5        U5       VVs0 s H#  u  nn[%        U[&        5      (       a  US   OUU_M%     nnnXS
'   [7        U SS5      nSUR                  S0 5      ;   an  U(       ag  UR2                  S:X  aW  US   S   nU Vs/ s H=  n[%        U[8        5      (       a  [1        UR;                  S5      5      O
[1        U5      PM?     nnUUS'   U$ UR                  S5      nUR                  S5      nUR                  S
5      nUR                  S5      n[%        U[8        5      (       a`  UR=                  S 5      (       aJ  [        R                  R	                  U5      (       a&  S!S"KJ n  U" US#9RC                  U5      u  US
'   US'   U$ [%        U[8        5      (       Ga  [        R                  R	                  U5      (       Ga  UR=                  S$5      (       Gax   S!S%KJ"n  U" U5      nURF                  " U R,                  40 UD6n S!S&KJ$n  UR                  U R2                  5      nUb#  [K        US'5      (       a  URL                  " S?0 UD6n[K        U S+5      (       a  U RT                  " S?0 UD6nSU;  Ga  U [
        L d  SU R                  ;  Ga  UR                  S
S5      nUR                  SS5      nUR                  S,5      =(       d    0 nUb  U(       a  URW                  5        VVs0 s H	  u  nnUU_M     nnnURW                  5        H_  u  nn[Y        U5      n[9        U5      nUR                  U5      n U (       d  M6  U U:w  d  M>  UU;  d  MF  UR                  U 5      UU'   UUU'   Ma     [Z        R\                  " UR^                  UUS-9n!U!b  U!US'   UR^                  R`                  n"U"Rb                  S:  a%  URe                  S.U"Rf                  =(       d    S/5        U"Rh                  S:  a%  URe                  S0U"Rj                  =(       d    S15        U"Rl                  S:  a%  URe                  S2U"Rn                  =(       d    S35        U$ UcC  [%        U[8        5      (       a.  [        R                  R	                  U5      (       a
  UUS
'   US
   nUcC  [%        U[8        5      (       a.  [        R                  R	                  U5      (       a
  UUS'   US   nUc  U R,                  b  U R,                  R2                  S:X  a  [%        U[         5      (       ak  S:[t        [v           S;[&        [8           4U(4S< jjm(/ S=Qn%[y        5       n&U% H&  n'U'U;   d  M  U&R{                  T(" UU'   /5      5        M(     [}        UU&S>9nUUS'   U$ ! , (       d  f       GN~= fs  snf s  snnf s  snnf s  snf ! [N         a1  n[P        RS                  S(U R2                   S)U S*35         SnAGNSnAff = fs  snnf ! [N         aV  n[P        RS                  S4U S5U S635        S!S7KJ8n#  U#" UUR                  S85      S99n$U$Rs                  5       US'    SnAU$ SnAff = f)@z
Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
models, tekken.json, vocab/merges).
r$   N__init__tokenizer_objectutf-8encodingmodeltype)Nr   vocabr   mergesadded_tokenspost_processortokenizer_paddingtokenizer_truncation_json_truncation_json_padding
normalizerSequencenormalizersPrecompiledprecompiled_charsmapr   _spm_precompiled_charsmapr   r"   r#    r%   merges_fileztekken.jsonr   )MistralConverter)r%   .model)SentencePieceExtractor)SLOW_TO_FAST_CONVERTERSconvert_from_spmz,Could not reorder vocab using converter for z due to z/. Falling back to raw SentencePiece extraction.convert_from_spm_modeladded_tokens_decoder)protor1   r2   	bos_token<s>	eos_token</s>	unk_tokenz<unk>z+Could not extract SentencePiece model from z$ using sentencepiece library due to z%. Falling back to TikToken extractor.)TikTokenConverterextra_special_tokens)r%   rO   valuesreturnc                    > / nU  HV  nUc  M  [        U[        [        45      (       a  UR                  T" U5      5        M<  UR	                  [        U5      5        MX     U$ N)
isinstancelisttupleextendappendstr)rP   	collectedval_iter_special_tokenss      d/home/wildlama/miniconda3/lib/python3.13/site-packages/transformers/tokenization_utils_tokenizers.pyr\   HTokenizersBackend.convert_to_native_format.<locals>._iter_special_tokens)  sY    ')	!C{ !#e}55!(()=c)BC!((S2 " !     )		pad_tokenrM   rI   rK   	sep_token	cls_token
mask_tokenadditional_special_tokensrO   )skip_tokens )?dictpopospathisfiler'   __dict__TokenizerFast	from_fileopenjsonloadgetfrom_strdumpsr4   padding
truncationrT   rU   base64	b64decoder/   maprV   __name__	enumerategetattrrY   splitendswithconvert_slow_tokenizerrA   extract_vocab_merges_from_modelrC   extractrD   hasattrrE   	ExceptionloggerwarningrF   itemsintr   build_tokenizer_from_spm_protorH   trainer_specbos_id
setdefault	bos_pieceeos_id	eos_pieceunk_id	unk_piecerN   	convertedr   r   setupdater   ))clstrust_remote_codekwargslocal_kwargsfast_tokenizer_filetokenizer_handletokenizer_json
model_typeminimal_tokenizer_jsonminimal_modeltok_from_filenormalizer_configr9   rw   r1   itemitokenr2   merger%   r@   rA   rC   	extractorrD   converter_classerG   token_idid_to_token	new_tokencurrent_tokenr+   
proto_specrN   	converterspecial_tokens_keysre   keyr\   s)                                           @r]   convert_to_native_format*TokenizersBackend.convert_to_native_formate   sH	    F|*../?F  +233))Zs||-KO`/</F/FGZ/[L+, ,@S1T1T )G<@P!%+;!< = (++GR8<<VDJ!22)-n)=& $^G%< =)+g&&.0M(+2?w/9;&~6 - 6 6tzzBX7Y Z - 7 78K L-:-I-IL)*0=0E0EL,-3@3K3KL/0 ''33@3K3K/0$$00=0E0E_- !/ 2 2< @ $((6*D(9-(H%#$5t<<):(;%"3J!~~f->CY]gCg%DJDTDT&'=>E%@A  #4 #&&w377FEyy eT** UE!23E##y0eT**uE!HtUZm9\9\5:;UTU4[UE;##{22;E2BC2Bha2BC##u,		0B0Bk0QeT**_hin_op_oS[STV[E4)@)@U1XeQN_oEp$)! gt4J>--gr::
zObObfkOk'0:kqrkqbgZs5K5K%C 01QVW\Q]]kqr)/X&!%%l3
"&&}5  )!!(+ j#&&:+>+>}+M+MRTRYRYR`R`akRlRl@<L%=--j9 :L!<#9   j#&&277>>*+E+E*J]J]^fJgJgFIJ 3:>	(00KlK	O&=&A&A#,,&OO&2wPb7c7c'6'G'G'W,'W
 3 899#&#=#=#M#ML
 &\9,,
#,,0N(,,Wd;E)--h=F ,8+;+;<R+S+YWY((-ANSkkm&\m?5(xm&\3G3M3M3O/Hi'*8}H(+II,7OOH,EM,})1KPYafPf3899]3Ki 08AH 5 4P (4'R'R'oo#%($
 (3;K%78 &/__%A%A
%,,1(33KAUAUA^Y^_%,,1(33KAUAUA_Y_`%,,1(33KAUAUA`Y`a   =Z
C88RWW^^J=W=W$.L! )E>jc::rww~~k?Z?Z%0L"!(+F >cii3		8J8Je8SXbchjnXoXo	!Xc] 	!tCy 	!
# %(EK*,&&&';\#=N<O'PQ + %UDF%+L"[ =<x <C q sF ! NNFs||nT\]^\_  `O  P & ']6  
IA*Mqrsqt u: : F-)@P@PQg@h	 4=3F3F3H/0
Is   4eee*e:Ae%+f. ?Ae* Bf. %f(5Af. >f. f. C:f. 
e*
f%4&f f.  f%%	f. .
h8Ah		hc           	      <  > UR                  SS 5      nUR                  SS 5      nUR                  SS 5        UR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUR                  S0 5      nUR                  SS	5      n	UR                  S
5      n
UR                  S5      nUR                  S5      nS nUb  [        R                  " U5      nGOUb<  [        R
                  R                  U5      (       a  [        R                  " U5      nGOUb|  [        UR                  SS5      U40 UD6n[        U5      nUS   S   nUS   nUS   n[        UU5      u  nnUR                  U5        [        U5      S:  a  UR                  U5        GOU R                  c  Ub  UbT  [        U[         5      (       a  UO&[#        U5       VVVs0 s H  u  nu  nnUU_M     snnnn[        [%        UUSS S95      nO[        U[         5      (       a  [        [%        U/ SS S95      nOu[        U[&        5      (       aG  U(       a@  [        US   [(        [&        45      (       a"  [        [+        XR                  SS5      S95      nOU R                  c  [-        S5      eUc4  Uc1  U R                  c$  UR/                  SS5        UR/                  SS5        Ub  Xl        U R                  c  [-        S5      eUR                  SS 5      =(       d    U R                  R0                  =(       d    UnUbq  U R                  R2                  " S80 UD6  UR/                  SUS   5        UR/                  S US!   5        UR/                  S"US"   5        UR/                  S#US$   5        OU R                  R5                  5         UR                  S%S 5      =(       d    U R                  R6                  =(       d    UnUb  U R                  R8                  " S80 UD6  UR/                  S&US&   5        UR/                  S'US(   5        UR/                  S)US!   5        UR/                  SUS*   5        UR/                  S+US+   5        S,U;  a  S-US,'   S.U;   =(       d    S/U;   nUR                  S.S	5      U l        UR                  S/S	5      U l        UR                  S0S 5      =n(       a  UU R                  l        U=(       d    U R                  R>                  S L U l         [B        T&U ]  " S80 UD6  U
b  Xl#        Xl$        U RJ                  U R                  l&        U RN                   Vs1 s H  n[Q        [S        U5      5      iM     nn[U        URW                  5       S1 S29 VVs/ s H"  u  nn[Q        [S        U5      5      U;  d  M   UPM$     nnn['        U RX                  R[                  5       5      U Vs/ s H  n[]        U5      PM     sn-   n U R^                  Ra                  5        H2  n!U!c  M  []        U!5      U ;  d  M  U!U;  d  M!  URc                  U!5        M4     U Rd                   H-  n[]        U5      U ;  d  M  UU;  d  M  URc                  U5        M/     [        U5      S:  a  / n"U R^                  Ra                  5        V#s/ s H  n#U#(       d  M  []        U#5      PM     n$n#U Hp  n[        U[\        5      (       a  [g        USS39nO<[        U[f        5      (       a'  URh                  (       d  []        U5      U$;   a  SUl4        U"Rc                  U5        Mr     U"(       a  U Rk                  U"5         U R                  Rm                  5       n%U%S4:  a  [q        U R                  S5S 5      bl  UR                  SS 5        U Rr                  " U R                  U Rt                  R                  SS 5      4U Rt                  UR                  S6S 5      S7.UD6U l        U R@                  =(       d    U R                  R>                  S L U l         U R@                  (       a  U Rw                  5         g g s  snnnf s  snf s  snnf s  snf s  sn#f ! [n         a    Sn% GNf = f)9Nr7   r8   r>   r+   	gguf_filer$   rG   add_prefix_spaceFr%   r1   r2   name_or_path configr   	tokenizertokenizer_configr   T)r1   r2   fuse_unkdropoutr   )r1   r   a9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.rI   rJ   rK   rL   z3The backend tokenizer is not correctly initialized.r6   
max_lengthtruncation_side	directionstridetruncation_strategystrategyr5   r`   pad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofbackend
tokenizersadd_bos_tokenadd_eos_tokenr4   c                     U S   $ Nr   rf   )xs    r]   <lambda>,TokenizersBackend.__init__.<locals>.<lambda>  s    STUVSWr_   r   )speciali pre_tokenizerfix_mistral_regex)init_kwargsr   rf   )<rh   rr   copydeepcopyri   rj   rk   rm   rn   r   r   r   r   len
_tokenizerrT   rg   r{   r   rU   rV   r   
ValueErrorr   rv   enable_truncationno_truncationru   enable_padding_add_bos_token_add_eos_tokenr4   _should_update_post_processorsuperr*   r%   r   split_special_tokensencode_special_tokensrG   hashreprsortedr   added_tokens_encoderkeysrY   _special_tokens_maprP   rX   _extra_special_tokensr   r   
add_tokensget_vocab_sizeNotImplementedErrorr|   _patch_mistral_regexr   update_post_processor)'selfargsr   r7   r8   r+   r   r   rG   r   r%   r1   r2   fast_tokenizer	gguf_path
gguf_paramarchitecturetokenizer_dictr   additional_kwargsr   w_
vocab_dict_truncation_paddingexplicit_bos_eos_in_kwargsr4   r   added_tokens_decoder_hashindextokens_to_addencoderspecial_token_valuetokenstall_named_tokens
vocab_size	__class__s'                                         r]   r*   TokenizersBackend.__init__H  s    "::&8$?

?D9 	

.5!::&8$?JJ{D1	$jj)94@%zz*@"E!::&8%@ZZ-


7#H%'!]]+;<N ,@S1T1T*445HIN"#FJJ~r$BIXQWXI-i8J%h/=L'4N)*<=0F|Uc0d-N-MM*+$%)/0__$):!&0&=&=UZcdiZjCkZjYQPVQRTUAqDZjCk
!.sF]ako/p!qE4((!.srTXbf/g!hE4((Uz%(UTXM7Z7Z!.wU::V^`aKb/c!d__$r  &+;+CH_k51k62%,O??"RSSjj!7>p$//B\B\p`p"OO--<<lK,EF/[1IJhH(=>3[5LMOO))+::148dDOO<S<SdWdOO**6X6k8K+@A18M3JKnh{.CDlHX,>?2H=Q4RS F" ,F9%4%>%[/U[B["$jj%@$jj%@#ZZ(8$??>?-;DOO*-G-q4??KiKimqKq*"6"!(O 0040I0I-DHD]D]$^D]5T$u+%6D]!$^ !'';'A'A'C X
 XuDK (AA  X 	 

 t005578Ta;bTa5CJTa;bb $(#;#;#B#B#D"*&'w6;NVc;c$$%89	 $E //E5z(U--G$$U+ 0 }!F040H0H0O0O0QW0Q1UVA0QW&eS))&ud;Ez22 ==SZ;K-K(,e$ ' '	779J
 74??OT#R#^JJ{D)"77  $$^T: !,,"(**-@$"G	
 DO ..X$//2P2PTX2X 	* --&&( .o Dl~ %_

 <c"  X  # 	J	s<   a/'a6#a;a;5b	bb8b bbrQ   c                     g)NTrf   r   s    r]   is_fastTokenizersBackend.is_fast  s    r_   c                    SU R                   ;   ao  U R                   S   R                  S5      (       aL  [        U S5      (       a:  U R                  (       a)  [        R
                  R                  U R                  5      $ gg)z
`bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
can only be `True` if the original `"sentencepiece.model"` was not deleted.
r%   rB   FT)vocab_files_namesr~   r   r%   ri   rj   rk   r  s    r]   can_save_slow_tokenizer)TokenizersBackend.can_save_slow_tokenizer  s^     4111d6L6L\6Z6c6cdl6m6mt\**tww~~doo66r_   save_directoryfilename_prefixc                    [         R                  R                  U5      (       d  [        R	                  SU S35        g [         R                  R                  X(       a  US-   OS[        S   -   5      n[         R                  R                  U R                  5      [         R                  R                  U5      :w  a  [        U R                  U5        U4$ )NzVocabulary path (z) should be a directory-r   r%   )
ri   rj   isdirr   errorjoinVOCAB_FILES_NAMESabspathr%   r   )r   r  r  out_vocab_files       r]   save_vocabulary!TokenizersBackend.save_vocabulary  s    ww}}^,,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNT__n5  r_   c                    U R                   nU R                  nUc  U R                  (       a  SU l        U R                  nU R                  nUc  U R
                  (       a  SU l        U R                  (       a  US-   OS SU R
                  (       a  SU-   S-   OS 3nU U R                  (       a  SU-   S-   OS S	U R
                  (       a  SU-   S-   OS 3n/ nU R                  (       a  UR                  X45        U R
                  (       a  UR                  X445        [        R                  " XVUS
9U R                  l
        g)zU
Updates the underlying post processor with the current `bos_token` and `eos_token`.
NFz:0 r   z$A:0r?   z:0z:1z $B:1)singlepairspecial_tokens)rI   bos_token_idr   rK   eos_token_idr   rX   r	   TemplateProcessingr   r4   )r   bosr  eosr   r  r  r  s           r]   r   'TokenizersBackend.update_post_processor
  s.    nn((;4--!&Dnn((;4--!&D%)%7%7S5[R@[_[m[mcCiRVFVsuDvw0B0B39t+K5gkgygyRUX[R[^bRb  @B  QC  D!!3"56!!3"56)3)F)F^*
&r_   c                     [        U SS5      $ )Nr   Fr|   r  s    r]   r   TokenizersBackend.add_eos_token$      t-u55r_   c                     [        U SS5      $ )Nr   Fr&  r  s    r]   r   TokenizersBackend.add_bos_token(  r(  r_   c                 R    [         R                  U SU5        U R                  5         g )Nr   object__setattr__r   r   values     r]   r   r'  ,  !    4!159""$r_   c                 R    [         R                  U SU5        U R                  5         g )Nr   r,  r/  s     r]   r   r*  1  r1  r_   c           	         / nU R                   R                  5        Ha  nUc  M  [        U[        5      (       a  UR	                  U5        M0  [        U[
        5      (       d  MG  UR	                  [        USSS95        Mc     U R                   H\  n[        U[        5      (       a  UR	                  U5        M+  [        U[
        5      (       d  MB  UR	                  [        USSS95        M^     U(       a  U R                  USS9  [        U SS5      (       d  U R                  R                  c  U R                  5         gg)a3  
Post-initialization hook that runs after the tokenizer is fully set up.
This is called by from_pretrained() after loading the tokenizer, which allows
us to add any special tokens that may have been passed as AddedToken objects.

Child classes should call super()._post_init() if they override this method.
NTF)r   
normalized)r  r   )r   rP   rT   r   rX   rY   r   r   r|   r   r4   r   )r   r   token_valuer   s       r]   
_post_initTokenizersBackend._post_init6  s    33::<K"+z22$$[1K--$$ZTV[%\] = //E%,,$$U+E3''$$ZtPU%VW	 0 OOM$O?48$??4??CaCaCi&&( Djr_   c                 4    U R                   R                  SS9$ )z@
`int`: Size of the base vocabulary (without the added tokens).
Fwith_added_tokensr   r   r  s    r]   r  TokenizersBackend.vocab_sizeV  s    
 ---FFr_   c                 4    U R                   R                  SS9$ )NTr9  )r   	get_vocabr  s    r]   r>  TokenizersBackend.get_vocab]  s    ((4(@@r_   c                 "    U R                  5       $ rS   )r>  r  s    r]   r1   TokenizersBackend.vocab`  s    ~~r_   c                     [        U R                  R                  5       S S9 VVs0 s H  u  pUR                  U_M     snn$ s  snnf )z
Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
optimisation in `self._added_tokens_encoder` for the slow tokenizers.
c                     U S   $ r   rf   r   s    r]   r   8TokenizersBackend.added_tokens_encoder.<locals>.<lambda>j      dhijdkr_   r   r   rG   r   contentr   vks      r]   r   &TokenizersBackend.added_tokens_encoderd  s?     *00I0I0O0O0QWk)lm)l		1)lmmm   Ac                 6    U R                   R                  5       $ )z
Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

Returns:
    `dict[str, int]`: The added tokens.
)r   get_added_tokens_decoderr  s    r]   rG   &TokenizersBackend.added_tokens_decoderl  s     7799r_   c                     [        U R                  R                  5       S S9 VVs0 s H  u  pUR                  U_M     snn$ s  snnf )z
Returns the added tokens in the vocabulary as a dictionary of token to index.

Returns:
    `dict[str, int]`: The added tokens.
c                     U S   $ r   rf   rD  s    r]   r   3TokenizersBackend.get_added_vocab.<locals>.<lambda>  rF  r_   r   rG  rI  s      r]   get_added_vocab!TokenizersBackend.get_added_vocab{  s?     *00I0I0O0O0QWk)lm)l		1)lmmmrM  c                     g)z>
Returns True, to avoid expensive `assert tokenizer` gotchas.
Trf   r  s    r]   __bool__TokenizersBackend.__bool__  s     r_   c                 4    U R                   R                  SS9$ )z4
Size of the full vocabulary with the added tokens.
Tr9  r;  r  s    r]   __len__TokenizersBackend.__len__  s     ---EEr_   c                     U R                   $ )zS
`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
)r   r  s    r]   backend_tokenizer#TokenizersBackend.backend_tokenizer  s    
 r_   c                 .    U R                   R                  $ )zE
`tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
)r   decoderr  s    r]   r`  TokenizersBackend.decoder  s    
 &&&r_   Tr.   return_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosec	                    Uc  SU R                   ;   nUc  SU R                   ;   nU(       a  UR                  b  U/UR                  -   n	OU/n	[        [        5      n
U	 H  nU
S   R	                  UR
                  5        U(       a  U
S   R	                  UR                  5        U(       a  U
S   R	                  UR                  5        U(       a  U
S   R	                  UR                  5        U(       a  U
S   R	                  UR                  5        U(       d  M  U
S   R	                  [        UR
                  5      5        M     X4$ )ar  
Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
of encodings, take care of building a batch from overflowing tokens.

Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
lists (overflows) of lists (tokens).

Output shape: (overflows, sequence length)
token_type_idsattention_mask	input_idsspecial_tokens_maskoffset_mappingr   )model_input_namesoverflowingr   rU   rX   idstype_idsrk  rm  offsetsr   )r   r.   rb  rc  rd  re  rf  rg  rh  	encodingsencoding_dictr   s               r]   _convert_encoding#TokenizersBackend._convert_encoding  s   ( !($48N8N$N! ($48N8N$N!$)=)=)I!
X%9%99I!
I#D)A+&--aee4$./66qzzB$./66q7G7GH)34;;A<Q<QR%./66qyyA}h'..s155z:  ''r_   r   c                 Z    U R                   R                  U5      nUc  U R                  $ U$ rS   )r   token_to_idunk_token_id)r   r   r   s      r]   #_convert_token_to_id_with_added_voc5TokenizersBackend._convert_token_to_id_with_added_voc  s,    ++E2=$$$r_   r   c                 J    U R                   R                  [        U5      5      $ rS   )r   r   r   )r   r   s     r]   _convert_id_to_token&TokenizersBackend._convert_id_to_token  s    **3u:66r_   
new_tokensc                 |    U(       a  U R                   R                  U5      $ U R                   R                  U5      $ rS   )r   add_special_tokensr   )r   r  r  s      r]   _add_tokensTokenizersBackend._add_tokens  s/    ??55jAA))*55r_   r  c                 8    U R                   R                  U5      $ )a  
Returns the number of added tokens when encoding a sequence with special tokens.

<Tip>

This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
this inside your training loop.

</Tip>

Args:
    pair (`bool`, *optional*, defaults to `False`):
        Whether the number of added tokens should be computed in the case of a sequence pair or a single
        sequence.

Returns:
    `int`: Number of special tokens added to sequences.
)r   num_special_tokens_to_add)r   r  s     r]   r  +TokenizersBackend.num_special_tokens_to_add  s    & 88>>r_   rq  skip_special_tokensc                 @   [        U[        5      (       a  U R                  R                  U5      $ / nU(       a  [	        U R
                  5      O	[	        5       nU H?  n[        U5      nXT;   a  M  UR                  U R                  R                  U5      5        MA     U$ )a  
Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
added tokens.

Args:
    ids (`int` or `list[int]`):
        The token id (or token ids) to convert to tokens.
    skip_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to remove special tokens in the decoding.

Returns:
    `str` or `list[str]`: The decoded token(s).
)rT   r   r   r   r   all_special_idsrX   )r   rq  r  r  ids_to_skipr   s         r]   convert_ids_to_tokens'TokenizersBackend.convert_ids_to_tokens  s     c3??..s333Fc$../CEEJE#MM$//55e<=	 
 r_   textr  c                 H    U R                   " SXUS.UD6R                  5       $ )N)r  	text_pairr  rf   )_encode_plusr  )r   r  r  r  r   s        r]   tokenizeTokenizersBackend.tokenize  s(      ldOaleklssuur_   padding_strategyr   r   r   r   r   c                    U R                   R                  nU R                   R                  nU[        R                  :X  a  Ub  U R                   R                  5         OdUUUR                  U R                  S.n	Uc  Sn
O"U	 Vs0 s H  oUR                  US5      _M     n
nX:w  a  U R                   R                  " S0 U	D6  U[        R                  :X  a  Ub  U R                   R                  5         ggU[        R                  :X  a  UOSnUUb  UOU R                  U R                  U R                   U R"                  US.n	X:w  a  U R                   R$                  " S0 U	D6  ggs  snf )a  
Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
library) and restore the tokenizer settings afterwards.

The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
section.

Args:
    padding_strategy ([`~utils.PaddingStrategy`]):
        The kind of padding that will be applied to the input
    truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
        The kind of truncation that will be applied to the input
    max_length (`int`):
        The maximum size of a sequence.
    stride (`int`):
        The stride to use when handling overflow.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
        the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
    padding_side (`str`, *optional*):
        The side on which the model should have padding applied. Should be selected between ['right', 'left'].
        Default value is picked from the class attribute of the same name.
N)r   r   r   r   )r   r   pad_idr`   r   r   rf   )r   rv   ru   r   DO_NOT_TRUNCATEr   r0  r   rr   r   r   
DO_NOT_PAD
no_padding
MAX_LENGTHr   pad_token_idr`   r   r   )r   r  r   r   r   r   r   r   r   targetcurrentrK  r   s                r]   set_truncation_and_padding,TokenizersBackend.set_truncation_and_padding  sR   B oo00??**"4"D"DD&--/ ) /55!11	F "@FG1kooa66G 11;F;999#**, $ $47Q7Q#QZW[F -9-E\4K\K\++!^^#55&8F !..88 "% Hs   E'r   r  is_split_into_wordsreturn_tensorsr   c                 v   S nU" U5      (       d  [        S5      eUb  U" U5      (       d  [        S5      eU(       a@  [        U[        [        45      =(       a"    U=(       a    [        US   [        [        45      nO[        U[        [        45      nU(       ay  [        U[        5      (       a  [        S5      eUb<  [        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      eUb  [        [        X5      5      OUnOU(       a  X4/OU/n[        U[        [        45      (       d  [        S[        U5       S	35      eU R                  UUUUU	U
S
9  Uc  U R                  nU R                  R                  U:w  a  UU R                  l        U R                  R                  UUUS9nU Vs/ s H  nU R                  UUUUUUUUS9PM     nn0 nUS   S    H.  nU VVVs/ s H  u  nnUU     H  nUPM     M     n nnnU UU'   M0     U VVVs/ s H  u  nnU  H  nUPM     M     n!nnnU(       a4  / n"[        U5       H  u  n#u  n$nU"U#/[        U$S   5      -  -  n"M      U"US'   US    H  n%U R!                  U%UU5        M     [#        UU!US9n&U(       dq  Ucn  U(       dg  [#        U&R%                  5        VV's0 s H5  u  nn'U[        U'5      S:  a  [        U'S   [        5      (       a  U'S   OU'_M7     sn'nU&R&                  5      n&U&$ s  snf s  snnnf s  snnnf s  sn'nf )Nc                    [        U [        5      (       a  g[        U [        [        45      (       a  [	        U 5      S:X  a  g[        U S   [        5      (       a  g[        U S   [        [        45      (       a  [	        U S   5      S:X  d  [        U S   S   [        5      (       a  g[        U S   S   [        [        45      (       a4  [	        U S   S   5      S:H  =(       d    [        U S   S   S   [        5      $ ggg)NTr   F)rT   rY   rU   rV   r   )r  s    r]   _is_valid_text_input<TokenizersBackend._encode_plus.<locals>._is_valid_text_inputq  s    !S!!Ae}--q6Q;!c**!tUm441Q4yA~AaDGS)A)A##AaDGdE];;"1Q47|q0OJqtAwqz34OO$ r_   ztext input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs).r   zdwhen tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.zbatch length of `text`: z- does not match batch length of `text_pair`: .z:batch_text_or_text_pairs has to be a list or a tuple (got ))r  r   r   r   r   r   )r  is_pretokenized)r.   rb  rc  rd  re  rf  rg  rh  rl  overflow_to_sample_mapping)tensor_type)r   rT   rU   rV   rY   	TypeErrorr   zipr0   r  r   r   r   encode_batchrv  r{   &_eventual_warn_about_too_long_sequencer   r   rt  )(r   r  r  r  r  r   r   r   r  r   r   r  rb  rc  rd  re  rf  rg  rh  r   r   r  
is_batchedbatch_text_or_text_pairsrt  r.   tokens_and_encodingssanitized_tokensr   r   r   r   stacksanitized_encodingsr  r   toksrl  batched_outputr0  s(                                           r]   r  TokenizersBackend._encode_plusY  s   0	( $D))W 
  )=i)H)HW  #D4-8hThjQUVWQX[_afZgFhJ#D4-8J)S))  $Tc)n)D .s4yk :I'q*  FOEZtC,@'A`d$ ?H(9':dV$ 2UDMBBLTRjMkLllmn  	''- 3!1% 	( 	
  '#'#<#< ??004HH4HDOO1 OO00$1/ 1 
	$ & 
 & ""!&;&;*C+E'=+ # 	 & 	  
 '*1-C&:N&:74DIqQIQ&:EN$)S! . 1ES0DWQdqdq0DS %)+& )*> ?9D!*qcC[8I4J.JJ* !@=W9:)+6I77	:wW 7 ''79LZhi n4=V* '5&:&:&<&<
U c%j1nE!Hd9S9S%(Y^^&< ((N W 
" OS"s    L"7L'
&L.<L5
r  c                     U R                   R                  b%  U R                   R                  R                  U5      $ SR                  U5      $ )Nr?   )r]  r`  decoder  )r   r  s     r]   convert_tokens_to_string*TokenizersBackend.convert_tokens_to_string  sJ     %%--9 ""**11&9	
 &!	
r_   	token_idsclean_up_tokenization_spacesc                    UR                  SS 5        [        U[        5      (       a  U/n[        U[        5      (       a  US   nU R                  R                  XS9nUb  UOU R                  nU(       a~  [        U R                  R                  5      R                  S:X  a@  U R                  (       d/  [        R                  SU R                  R                   S35        U$ U R                  U5      nU$ )Nuse_source_tokenizerrl  )r  r   z=Ignoring clean_up_tokenization_spaces=True for BPE tokenizer aE  . The clean_up_tokenization post-processing step is designed for WordPiece tokenizers and is destructive for BPE (it strips spaces before punctuation). Set clean_up_tokenization_spaces=False to suppress this warning, or set clean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_output=True to force cleanup anyway.)rh   rT   r   rg   r   r  r  r0   r]  r/   rz   Gclean_up_tokenization_spaces_for_bpe_even_though_it_will_corrupt_outputr   warning_oncer  clean_up_tokenization)r   r  r  r  r   r  s         r]   _decodeTokenizersBackend._decode  s     	

)40i%%"Ii&&!+.I%%i%Y ,7 )22 	%
 (
 T++112;;uDdd##//0 1--  11$7r_   
file_names.legacy_formatc                     [        U5      n[        R                  R                  X(       a  US-   OS[        -   5      nU R
                  R                  U5        X%4-   nU$ )Nr  r   )rY   ri   rj   r  TOKENIZER_FILEr]  save)r   r  r  r  r  r$   s         r]   _save_pretrained"TokenizersBackend._save_pretrained%  s[     ^,o_s22Q__
 	##N3"33
r_   c           
      
   [         R                  " U R                  R                  5       5      nUR	                  S5      nUR	                  S5      n	Sn
US   S   S:X  a  0 US   S'   / US   S'   OuUS   S   S	:X  a?  US   S
   b5  US   S
   nUS   S   U   S   n
Ub	  X;   a  XZ   n
SUS   S
'   U
S//US   S'   O*US   S   S;   a	  0 US   S'   O[        SUS   S    S35      eUb%  SUS   ;   a  US   S   U;   a  XWS   S      US   S'   [        R                  " [         R                  " U5      5      n/ nU Hl  nUR	                  SS5      nUR	                  SS5      nUS   S   S	:w  a	  U(       d  M<  Ub  US   U;   a
  X^S      US'   UR                  [        S(0 UD65        Mn     Ub  UR                  U5        US   S   S:X  a  SU;  a  US   S   b  US   S   US'   US   S   S:X  a  SU;  a  US   S   b  US   S   US'   US   S   S	:X  a  U
b  XS'   US   b_  US   S   S:X  d2  US   S   S:X  aG  SUS   ;   a>  [        S US   S    5       5      (       a!  [        R                  R                  5       US'   [         US   S      nU" S(X-S.UD6nUR#                  XUS9  U	Gb@  [         R                  " UR                  5       5      nSU	;   a  U	S    H  nU	S   U   S   nUb"  U Vs/ s H  nUR%                  UU5      PM     nnUU	S   U   S'   U H"  nUR'                  U5      nUb  M  [        S 5      e   U Vs/ s H  nUR'                  U5      PM     snU	S   U   S!'   M     S" HG  nUU	;   d  M  U	U   u  nnUb  UU;   a  UU   nUR'                  U5      nUc  [        S 5      eUU/U	U'   MI     U	US'   [        R                  " [         R                  " U5      5      nU R(                  R+                  5       n[,        R.                   H  n[1        U U5      c  M  [1        U U5      nUb  UU;   a  UU   nU R2                  R%                  US5      n[5        U[        5      (       a;  [        UUR6                  UR8                  UR:                  UR<                  S#S$9UU'   M  UUU'   M     U R>                  (       a  U R>                  R+                  5       O/ nUb  UR                  U5        [A        U5      S:  a  UUS%'   XS&'    U RB                  " S(0 UD6$ s  snf s  snf ! [D         aG  nS'[G        U5      ;   a2  UR	                  S&S5        U RB                  " S(0 UD6nUUl        Us SnA$ e SnAff = f))u  
Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
as the current one.

Args:
    text_iterator (generator of `list[str]`):
        The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
        if you have everything in memory.
    vocab_size (`int`):
        The size of the vocabulary you want for your tokenizer.
    length (`int`, *optional*):
        The total number of sequences in the iterator. This is used to provide meaningful progress tracking
    new_special_tokens (list of `str` or `AddedToken`, *optional*):
        A list of new special tokens to add to the tokenizer you are training.
    special_tokens_map (`dict[str, str]`, *optional*):
        If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
        token name to new special token name in this argument.
    kwargs (`dict[str, Any]`, *optional*):
        Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

Returns:
    [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
    `text_iterator`.

r3   r4   Nr/   r0   r   r1   r2   r   r   r   g        )r"   r#   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.rM   r   idrH  continuing_subword_prefixend_of_word_suffixr   	ByteLevelr:   pretokenizersc              3   2   #    U  H  nUS    S:H  v   M     g7f)r0   r  Nrf   ).0pretokenizers     r]   	<genexpr><TokenizersBackend.train_new_from_iterator.<locals>.<genexpr>  s!      (X !(K7(Xs   initial_alphabet)r  r  )r   trainerr  r  zQAttempted to set a token in the post processor that does not exist in the mappingrq  )r   sepT)single_wordlstriprstripr4  r   rO   r+   z7multiple values for keyword argument 'tokenizer_object'rf   )$rp   loadsr   to_strrh   r   rm   rs   rt   rX   r   rW   anypre_tokenizers_fastr  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorrr   ry  r   r   r   SPECIAL_TOKENS_ATTRIBUTESr|   r   rT   r  r  r  r4  rO   r   r  r  rY   )r   text_iteratorr  r   new_special_tokensspecial_tokens_mapr   r   r3   r4   rM   r   r   r  added_tokenr   r   trainer_classr  trained_tokenizer_jsonr   r  r   r   special_tokenspecial_token_fullrO   r   new_tokenizers                                r]   train_new_from_iterator)TokenizersBackend.train_new_from_iterator6  s   D DOO$:$:$<=%)).9'++,<=	'"6*e3/1N7#G,02N7#H-G$V,	9g&x0<'0:*73G<VDQG	%1i6U 2 =I45w'15>4D3Ew'0G$V,0JJ/1N7#G,Mn]dNeflNmMn o> >  *~g66w'48JJ3EU\F]^iFj3kN7#K0!**4::n+EF	 'K!ooi6Gd+Ag&v.);G!-+i2HL^2^);	<R)SI&!!*";{";< ( )!!"45 7#F+u4+69w'(CDP2@2IJe2fF./7#F+u4$F2w'(<=I+9'+BCW+XF'('"6*i7I<Q"+;/*6/7;F!/26:jH#~o'FF (6(G(X  
 .A-J-J-S-S-U)*01H1PQ_:_X^_%%mG%T%%)ZZ	0@0@0B%C">1)*:;C+,<=cB8LF)5TZ![TZ5"4"8"8"FTZ![FLN#34S9(C!'#,#8#8#?#+", s#  "( ouCuntejIDYDYZ_D`ntCuN#34S9%@ < "0 N2-m<HE1)5%CU:U 25 9(44U;H'(o  6;H4EN=1 "0 8F"#34%..tzz:P/QRI!!&&(,FFEtU#/ 'e 4%1mGY6Y$6}$EM%)%=%=%A%A%%N"0*==$.%$6$B$B188188#5#@#@ $%F5M %2F5M% G* DHC\C\t88==?bd) ''(:;#$q(-AF)* &/!"	>>+F++{ "\ Dvj  	HCPQFR 

-t4 $ 8 8+4($$ 	s0   4T'	T,T1 1
V;;U=6V<U==Vc
                 v  ^^ SSK mSSKJn  SSKJn  SSKJnJm  U" SS9S[        S	[        4UU4S
 jj5       nU(       d  [        5       (       a  SnUGbR  U(       d  U(       GdC  U" U5      (       Ga5  U" USUUUSSUS9nSnUb  [        USS9 n[        R                  " U5      nSSS5        WR                  S5      nUR                  S5      nU(       a7  UR                  U5      UR                  S5      :  a  U(       a  Ub  US;  a  U$ O-U(       a&  UR                  U5      UR                  S5      :  a  U$ SnU(       d  U(       Gd\  U" U5      (       GaN  U(       a  SU;   a  [!        USUS   5        U	c:  [#        USS5      (       d(  [!        USS5        [$        R'                  SU S35        U$ U	SL d  [#        USS5      (       a  [!        USS5        SSKnUR*                  R-                  UR/                  S5      SS9nUR0                  n[3        UUR*                  R4                  5      (       a  UUR0                  S'   U$ [3        UUR*                  R6                  5      (       a  UR*                  R9                  SSS9nUR*                  R5                  UU/5      Ul        U$ ! , (       d  f       GN= f)a6  
Patches mistral related tokenizers with incorrect regex if detected
    1) Local file with an associated config saved next to it
        >> Model type one of the mistral models (on older versions)
    2) Remote models on the hub from official mistral models
        >> Tags including `base_model:.*mistralai`
r   N)	lru_cache)version)r   hf_api   )maxsizemodel_idrQ   c                    >  T" 5       R                  U 5      nUR                  b1  TR                  SSR	                  UR                  5      5      (       a  gg! [         a     gf = f)NFzbase_model:.*mistralair   T)
model_infor   tagssearchr  )r  r/   r  res     r]   is_base_mistral?TokenizersBackend._patch_mistral_regex.<locals>.is_base_mistral  s_    ++H5 zz%995rwwuzz7JKK  s   A 
A%$A%Tzconfig.jsonF)	cache_dirr   local_files_only%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors_commit_hashr,   r-   transformers_versionr   z5.0.0)mistralmistral3voxtral	ministralpixtralr   z$The tokenizer you are loading from 'a  ' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.z[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+isolated)patternbehavior)r   	use_regex)r  	functoolsr  	packagingr  transformers.utils.hubr   r  rY   boolr   ro   rp   rq   rr   parsesetattrr|   r   r   r   pre_tokenizersSplitRegexr   rT   r:   	Metaspacer  )r   r   pretrained_model_name_or_pathr   r  r  r  is_localr   r   r   r  r  r   r  _config_filemistral_config_detectedf_configr  transformers_model_typer   split_pretokenizercurrent_pretokenizerr  r  s                           @@r]   r   &TokenizersBackend._patch_mistral_regex  s   * 	'%>	3			c 		d 		 
 		 00H(4X/:W*X*X&-#!16;8=)	L ',#',9Q"iilG :'.{{3I'J$*1++l*C'
 (GMM:N,ORYR_R_`gRh,h 3?3   )()gmm<P.QU\UbUbcjUk.k$$*.'&xOLi<j<j#6+#EI':KH[<\] %,WYH[]b5c5cI':EBNN>?\>] ^e eH ? '$.')EXZ_2`2`I':DA%)3)B)B)H)H * 0 0 s! ",	 *I *& ,5+B+B(!"6
8Q8Q8Z8Z[[5G	//2"  &&:J<U<U<_<_``3=3L3L3V3V16% 4W 40
 3=2K2K2T2T 2 43	/ O :9s   J))
J8)r   r   r   r   r   r   r   r%   )FrS   )NNFFFFT)NF)FN)NN)NNN)NNFNFNN)Erz   
__module____qualname____firstlineno____doc__r  r  r/   r   classmethodr   r*   propertyr  r	  r  rY   rV   r  r   r   r   setterr6  r   r  rg   r>  r1   r   r   rG   _added_tokens_encoder_added_tokens_decoderrT  rW  rZ  rm   r]  DecoderFastr`  EncodingFastr   rU   rv  r{  r~  r  r  r  r  r   r   r  r  r  r   r   r   r  r  r  ri   PathLiker  r  r   __static_attributes____classcell__)r  s   @r]   r'   r'   S   s   
 *EJ` `Da)F      !c !C$J !Z_`cZd !
4 6 6 6 6 % % % %)@ GC G GA4S> A  tCH~     nd38n n n :d3
?&; : : 10nc3h n$ F F =   ' ' ' .2-1*/+0',#-(-(  $d{-(  $d{	-(
 $(-( %)-( !%-( -( -( 
tCH~tL11	2-(^  7# 7#* 76d3+;&< 6WZ 6?d ?s ?*tCy t `cfjknfo`o 4vS vd
 vt vjnorjs vI9)I9 0I9 	I9
 I9  $JI9 DjI9\ gk#',;,F,F2D2T2T!%$))-#'&*-1-1*/+0',#,0)X++d9o=EV@WWX 004	?BTJ[E\\_ccX !	X
 *X 0X $JX X "X  $JX DjX tX  $d{X  $d{X $(X  %)!X" !%#X$ %X& 'X( #Tk)X, 
-Xt
tCy 
S 
 %*48	)c?) ") '+Tk	) 
)^ &*&*bkk) #s(O d{	
 t 
sCx* CJ 
 B Br_   r'   )Cr$  r   rp   ri   collectionsr   collections.abcr   shutilr   typingr   tokenizers.pre_tokenizersr  r  huggingface_hubr   r   r   r	   r
   r+  r   rm   tokenizers.decodersr   r*  tokenizers.modelsr   r   tokenizers.trainersr   r   r   r   r  r   r   r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utils_baser   r   r   r   r   r   r   utilsr   r    r!   
get_loggerrz   r   r  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILEADDED_TOKENS_FILEr  r  r'   PreTrainedTokenizerFastrf   r_   r]   <module>rB     s   
   	 # $   7 + - / 1 6 * ^ ^ . 0 5 =   @ ? 
		H	% "3 / '  (      !!	  (6EXY  ,-j/ j .j\) , r_   