
    3j                       % S SK Jr  S SKJr  S SKrS SKrS SKrS SKrS SKJ	r	  S SK
JrJrJrJrJrJrJrJr   S SKJr   S S	KJr  S S
KJr  S SKJr  S SKJr  Sr S SKJ r   S SK!r!SSK"J#r#  \RH                  " \%5      r& " S S5      r'\ " S S\5      5       r(\ " S S\(\5      5       r) " S S\(5      r* " S S \)5      r+ " S! S"\)5      r, " S# S$\)5      r- " S% S&\.\5      r/S*S' jr0 " S( S)\)5      r1g! \ a
    SrS\S'    Nf = f! \ a!    SrSrS\S'   SrS\S'   SrS\S'   SrS\S'    Nf = f! \ a
    Sr S\S'    Nf = f)+    )annotations)EnumN)Path)AnyCallableSequenceMappingIterableProtocolClassVarruntime_checkable)SentencePieceProcessorr   r   )MistralTokenizer)
Tekkenizer)_filter_valid_tokenizer_files)SentencePieceTokenizerTFr   r   r   r   )get_one_valid_tokenizer_filer      )
GGUFWriterc                      \ rS rSr% S\S'   S\S'   S\S'   S\S	'      S       SS jjrSS jrSSS jjrSS jrSS jr	SS jr
SS jrSS jrSrg
)SpecialVocab2   	list[str]mergeszdict[str, bool]add_special_tokendict[str, int]special_token_idsz(str | Sequence[Mapping[str, str]] | Nonechat_templateNc                    0 U l         0 U l        X@l        X l        / U l        S U l        Ub  X0l        OSU l        U R                  [        U5      5        g )N)boseosunkseppadclsmask)	r   r   n_vocabload_mergesr   r   special_token_types_loadr   )selfpathr(   r)   r'   s        D/home/wildlama/miniconda3/lib/python3.13/site-packages/gguf/vocab.py__init__SpecialVocab.__init__8   sR    
 "$!#&!*':$'YD$

4:    c                    SR                  [        U R                  5      U R                  =(       d    SU R                  =(       d    S5      $ )NzG<SpecialVocab with {} merges, special tokens {}, add special tokens {}>unset)formatlenr   r   r   r+   s    r-   __repr__SpecialVocab.__repr__I   s<    X__d44?AWAWAb[b
 	
r0   c                   U R                   (       aO  U(       d,  [        R                  S[        U R                   5       S35        UR	                  U R                   5        O&U R
                  (       a  [        R                  S5        U R                  R                  5        Ha  u  p4[        USU S3S 5      nUc  [        R                  SU SU S35        M7  U(       d  [        R                  S	U S
U 35        U" U5        Mc     U R                  R                  5        Ha  u  p6[        USU S3S 5      nUc  [        R                  SU SU S35        M7  U(       d  [        R                  SU SU 35        U" U5        Mc     U R                  bE  U(       d"  [        R                  SU R                   35        UR                  U R                  5        g g )NzAdding z
 merge(s).zJAdding merges requested but no merges found, output may be non-functional.add_	_token_idz"No handler for special token type z	 with id  - skippingzSetting special token type z to add_add__tokenzNo handler for add_z_token with value zSetting add_z
_token to zSetting chat_template to )r   loggerinfor4   add_token_mergesr(   warningr   itemsgetattrr   r   add_chat_template)r+   gwquiettyptokid
id_handlervalueadd_handlers           r-   add_to_ggufSpecialVocab.add_to_ggufN   s   ;;gc$++&6%7zBC,NNgh00668JC7>rT#iCXZ^7_J!!CC5	RWQXXcde9#d5'JKu 9 00668JC9@xPSuTZE[]a9bK"!4SE9KE7R]^_l3%z%AB 9 )78J8J7KLM  !3!34 *r0   c                    U R                  U5        U R                  U5        U R                  (       a$  U R                  (       d  U R	                  U5        g g g N)_try_load_from_tokenizer_json_try_load_from_config_jsonr(   r   _try_load_merges_txt)r+   r,   s     r-   r*   SpecialVocab._loadj   sC    **40''-DKK%%d+ %0r0   c                8   US-  nUR                  5       (       d  g[        USSS9 n[        US5      R                  5       nUR	                  S5      (       d  UR                  S5        SnOS	n/ nU H  nUS	-  nUR                  5       nU(       d  M!  UR                  S S
5      n[        U5      S:w  a'  [        R                  UR                   SU S35        Mi  UR                  US    SUS	    35        M     S S S 5        WU l        g! , (       d  f       N= f)Nz
merges.txtFrutf-8encoding #r   r         z: Line z: Entry malformed, ignoring T)is_fileopennextstrip
startswithseeksplitr4   r>   rA   nameappendr   )	r+   r,   merges_filefp
first_lineline_numr   linepartss	            r-   rR   !SpecialVocab._try_load_merges_txtp   s   \)""$$+sw72b"++-J((--
FAzz|

4+u:?NNk&6&6%7wxjHc#deq
!E!H:67  8$ ' 87s   CD
Dc           	     &   [        U[        5      (       d  g US:  a  [        SU SU 35      eU R                  b  X R                  :  a  XR                  ;   a  g X R                  U'   g [
        R                  SU SU SU R                   S35        g )Nr   z%invalid value for special token type : zSpecial token type z, id z out of range, must be under r;   )
isinstanceint
ValueErrorr'   r   r>   rA   )r+   rG   tids      r-   _set_special_tokenSpecialVocab._set_special_token   s    #s##7DSEC5QRR<<3#5,,,*-""3',SEse;XY]YeYeXffqrsr0   c                R  ^' S nUS-  nUR                  5       (       Ga  [        USS9 n[        R                  " U5      nS S S 5        U R                  (       GaD  UR                  S0 5      R                  S5      n[        U[        5      (       Ga  U(       Ga  [        US   [        5      (       a  XPl	        O[        US   [        5      (       a  [        US   5      S:X  a  [        US   S   [        5      (       a  [        S U 5       5      (       a.  [        R                  S	[        [        S
5      S-   5      < 35        U VVs/ s H;  nS
R!                  U Vs/ s H  nSR!                  S U 5       5      PM     sn5      PM=     snnU l	        O[#        S5      eUR                  S0 5      nO0 nS n	US-  n
U
R                  5       (       a)  [        U
SS9 n[        R                  " U5      n	S S S 5        U(       Ga  U	=(       d    0 R                  S5      nU	=(       d    0 R                  S5      nU	=(       d    0 R                  S5      nU	=(       d    0 R                  S5      nU(       d  U(       a  U	(       a  U=U	S'   nU(       d  U(       a  U	(       a  U=U	S'   nUR                  S5      =n(       Ga  UR                  SU/5       GH  nUR                  S5      S:X  a  SU R$                  S'   SU R$                  S'   SU R$                  S'   U(       d!  U	(       a  UR                  SU/5      S   nXS'   U(       d!  U	(       a  UR                  SU/5      S   nXS'   M  UR                  S5      S:X  d  M  UR                  S/ 5      nUR                  S / 5      nS nS n[        U5      S!:  Gah  US   R                  S"0 5      R                  S#5      =n(       aB  U	(       d  UnUX4;   a  SOS$U R$                  S'   UX4;  a  [        R                  S%U< S&35        US'   R                  S"0 5      R                  S#5      =n(       a  U	(       d  UnOUU:w  a  S(U R&                  ;  a"  [)        U R&                  5      S)-   U l        XS*'   OPS+U R&                  ;  a"  [)        U R&                  5      S,-   U l        XS-'   O[        R                  S.U< S/U< S035        U=U	S'   nUU:X  a  SOS$U R$                  S'   UU:w  a  [        R                  S1U< S&35        U(       GaZ  U(       a*  US   R                  S"0 5      R                  S#5      U:X  a  S!OSnU(       a*  US'   R                  S"0 5      R                  S#5      U:X  a  S'OS nU(       a  US:X  d
  U(       a  Uc  [        R                  S25        U[+        UU5         =n(       Ga  US   R                  S30 5      R                  S#5      nUS'   R                  S30 5      R                  S#5      nUS4:w  d  US5:w  a  [        R                  S6U S7U S835        US4:X  Ga:  US5:X  Ga3  US!S' =n(       Ga&  S$nUS   R                  S"0 5      R                  S#5      =n(       a2  UX4;   a	  U(       d  SnUX4;  a  [        R                  S9U< S835        O[        R                  S:US   < S835        [        U5      S:X  as  US!   R                  S"0 5      R                  S#5      =n(       a+  UX4;   a  SnUX4;  a  [        R                  S;U< S835        O[        R                  S<US!   < S835        UU R$                  S'   U(       a  U(       d  U	(       a  XS'   GM     U	(       d  gS nUS=-  nUS>-  nUR                  5       (       a  [        USS9 nUR-                  5       nS S S 5        [        US?-  R/                  S@5      5      =n(       aN  SAUSB./nU HB  n[        USS9 n UR1                  UR2                  U R-                  5       SB.5        S S S 5        MD     OMUR                  5       (       a8  [        USS9 n[        R                  " U5      R                  SC5      nS S S 5        U	R                  SCU5      n!U!b  [        U![        [        45      (       a  U!U l        O[        R                  SDU
< SE35        U R&                   H  n"U	R                  SFU" SG35      n#[        U#[6        5      (       a  U#U R$                  U"'   U	R                  U" SG35      n$[        U$[        5      (       a  U$m'OB[        U$[8        5      (       a+  U$R                  SH5      n%[        U%[        5      (       d  M  U%m'OM  [;        U'4SI jU 5       S 5      n&U R=                  U"U&5        M     g! , (       d  f       G	N= fs  snf s  snnf ! , (       d  f       GN= f! , (       d  f       GN3= f! , (       d  f       GM  = f! , (       d  f       GN= f)JNtokenizer.jsonrV   rW   modelr   r   r\   c              3  >   #    U  H  o  H
  nS U;   v   M     M     g7f)r]   N ).0pairss      r-   	<genexpr>=SpecialVocab._try_load_from_tokenizer_json.<locals>.<genexpr>   s     IFDDqsaxDxFs   z'Spaces in merges detected, encoding as r]      rY   c              3  `   #    U  H$  nUS :X  a  [        [        U5      S-   5      OUv   M&     g7fr]   r   Nchrordr{   cs     r-   r~   r      s/      ,&15A >?#XCFSL(91(L15s   ,.zUnknown tokenizer merges formatadded_tokensztokenizer_config.json	bos_token	cls_token	eos_token	sep_tokenpost_processor
processorstypeRobertaProcessingTr    r!   r#   r%   TemplateProcessingsingler|   r   SpecialTokenidFzUnknown leading special token z in TemplateProcessing<single>eot)r   	eot_tokeneom)r   	eom_tokenzOverriding EOS token z with z without EOT/EOM fallback!zUnknown trailing special token z`TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>r   ABzUnknown sequence z...z in TemplateProcessing<pair>zUnknown separator token zUnknown middle sequence zUnknown second separator token zUnknown second middle sequence zchat_template.jsonzchat_template.jinjaadditional_chat_templatesz*.jinjadefault)re   templater   z$Bad type for chat_template field in z - ignoringr9   r=   contentc              3  r   >#    U  H,  oR                  S 5      T:X  d  M  UR                  S5      v   M.     g7f)r   r   N)get)r{   atok
tc_contents     r-   r~   r   6  s,     ^LDHHY<OS]<]$Ls   77)r^   r_   jsonloadr(   r   rp   liststrr   r4   anyr>   rA   r   r   joinrr   r   r)   tupleslicereadglobrf   stemr   booldictr`   rt   )(r+   r,   	tokenizertokenizer_filefr   r|   partr   tokenizer_configtokenizer_config_filespecial_bosspecial_clsspecial_eosspecial_sepr   	processortmpl_single	tmpl_pairspecial_firstspecial_last	seq_startseq_stoptmpl_atmpl_badd_sepspecial_entrychat_template_altchat_template_jsonchat_template_jinjaadditional_templatestemplate_pathrh   r   rG   	add_entryentryentry_contentmaybe_token_idr   s(                                          @r-   rP   *SpecialVocab._try_load_from_tokenizer_json   s	   	 00!!##n9Q IIaL	 :"w377Afd++!&)S11&,#F1It44VAY19LQ[\bcd\efg\hjmQnQn IFIII"NN-TUXY\]`YadgYgUhTk+lm )/' )/  HH 15!" 15	 %'GG ,&15,& %& 15!"	 )/' ))JKK$==<LL $'> > ((**+@A#'99Q<  A+1r66{CK+1r66{CK+1r66{CK+1r66{CK;3C>II -;3C>II -!*/?!@@~@!/!3!3L>BR!SI }}V,0CC8<..u58<..u58<..u5*/?*3--}*Ma*PK<G[9*/?*3--}*Ma*PK<G[9  !}}V,0DD&/mmHb&A$-MM&"$=	(,'+{+a/0;A0B0B>SU0V0Z0Z[_0``}`'72?KHUZeYsHsy~ 6 6u =#08R#R$*NN5STaSd  eC  4D  %E/:2/B/B>SU/V/Z/Z[_/``|`'72>K%1[%@',D4L4L'LCHIaIaCbenCn(@HS(E).d6N6N)NCHIaIaCbenCn(@HS(E(.9N{o]cdpcs  tN  8O  )PR^$^$4[$AKHTXcHcin 6 6u =#/;#>$*NN5TUaTd  eC  4D  %E$-:y|?O?OP^`b?c?g?ghl?mq~?~  EFI-9im>O>OP^`b>c>g>ghl>mq}>}r  DHH -)q.lW_Wg &  0R  !S,5eIx6P,QQyQ)21)9)9*b)I)M)Md)S)22):)::r)J)N)Nt)T#)S=FcM$*NN5FvhcRXQYYu3v$w#)S=Vs]U^_`acUdHd	Hd.3G8A!8H8HY[8\8`8`ae8f'f}'f+8[<V+V_k6:G+8@Z+Z,2NN=UVcUf  gC  <D  -E(.9QR[\]R^Qaa}7~('*9~':<EaL<L<L^]_<`<d<dei<j+j=+j/<@Z/Z:>/<[D^/^06A`an`q  rN  @O  1P,2NN=\]fgh]i\l  mI  <J  -KDKD$:$:5$A'.{GWHS(E S "TT   !$88"%::&&(()g>!$%FFH! ?'+T4O-O,U,UV_,`'aa#a.7EV%W$X!%9Mm@B)00-:L:LZ\ZaZaZc1de A@ &:  ''))(W=$(IIaL$4$4_$E! >(,,_>OP J}sDk$J$J!.DNNABWAZZefg++C(,,tC5-?@I)T**.7&&s+$((C58E%%%"
E4(( %		) 4!-55*
!^L^N ##C8' ,( C :9!"'* A@x ?>
 A@ >=sS   gg"g>gg 5g2-h &h
gg 
g/2
h
h	
h&c                ^   US-  nUR                  5       (       d  g[        USS9 n[        R                  " U5      nS S S 5        U R                   HH  nWR                  U S35      nUc  SU;   a  US   R                  U S35      nU R                  XV5        MJ     g! , (       d  f       Ng= f)Nzconfig.jsonFrV   rW   r:   text_configT)r^   r_   r   r   r)   r   rt   )r+   r,   config_filer   configrG   token_ids          r-   rQ   'SpecialVocab._try_load_from_config_json<  s    ]*""$$+'2aYYq\F 3++CzzSE"34HMV$;!-044uI5FG##C2 ,  32s   B
B,)r   r   r(   r   r'   r   r)   )FNN)r,   zstr | os.PathLike[str]r(   r   r)   zIterable[str] | Noner'   z
int | Nonereturnr   )F)rE   r   rF   r   r   None)r,   r   r   r   )r,   r   r   r   )rG   r   rs   r   r   r   )__name__
__module____qualname____firstlineno____annotations__r.   r6   rL   r*   rR   rt   rP   rQ   __static_attributes__rz   r0   r-   r   r   2   st    &&%%;; AF48"*9=1 "

58,2
teNr0   r   c                  *    \ rS rSr% S\S'   S\S'   Srg)	BaseVocabiK  zClassVar[str]tokenizer_modelre   rz   N)r   r   r   r   r   r   rz   r0   r-   r   r   K  s    ""
r0   r   c                  R    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   SS
 jrSS jrSrg)VocabiQ  rq   
vocab_sizer   added_tokens_dictr   added_tokens_listr   fname_tokenizerc                    g rO   rz   )r+   	base_paths     r-   r.   Vocab.__init__X  s    r0   c                    g rO   rz   r5   s    r-   
all_tokensVocab.all_tokensY  s    3r0   rz   Nr   r   r   z-Iterable[tuple[bytes, float, gguf.TokenType]])r   r   r   r   r   r.   r   r   rz   r0   r-   r   r   Q  s    O%%  ,Nr0   r   c                  &    \ rS rSrSrSrSS jrSrg)NoVocabi\  no_vocabc                    g)Nz3<NoVocab for a model without integrated vocabulary>rz   r5   s    r-   r6   NoVocab.__repr__`  s    Dr0   rz   Nr   )r   r   r   r   r   re   r6   r   rz   r0   r-   r   r   \  s     ODEr0   r   c                  N    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	rg
)BpeVocabid  gpt2bpec           
        0 nUS-  =nR                  5       (       a\  [        USS9 n[        R                  " U5      U l        S S S 5         [        US-  SS9 n[        R                  " U5      nS S S 5        OUS-  n[        USS9 n[        R                  " U5      nS S S 5        WS   nUS   S:w  d#  UR                  S	S
5      (       d  US   S   S:w  a  [        S5      eUS   U l        UR                  S5      =nb.  U Vs0 s H!  nUS   U R                  ;  d  M  US   US   _M#     nn[        U R                  5      n	[        [        X[        U5      -   5      5      n
[        UR                  5       5      nX:w  a1  U	[        U5      -   S-
  n[        S[        U5       SU	 SU SU 35      e[        UR                  5       S S9nX l        U VVs/ s H  u  pUPM	     snnU l        Xl        U R                   [        U R                  5      -   U l        X0l        g ! , (       d  f       GN= f! , (       d  f       GN= f! [
         a     GN$f = f! , (       d  f       GN= fs  snf s  snnf )Nz
vocab.jsonrV   rW   added_tokens.jsonrw   rx   r   BPEbyte_fallbackFdecoder	ByteLevelzCannot find GPT-2 BPE tokenizervocabr   r   r   r   zExpected the z1 added token ID(s) to be sequential in the range z - z; got c                    U S   $ Nr   rz   )text_idxs    r-   <lambda>#BpeVocab.__init__.<locals>.<lambda>  s    (1+r0   key)existsr_   r   r   r   FileNotFoundErrorr   r4   r   rangesortedvaluesrr   rB   r   r   vocab_size_baser   r   )r+   r   r   r   r   tokenizer_jsonr   addeditemr   expected_ids
actual_idsexpected_end_idrB   textidxs                   r-   r.   BpeVocab.__init__h  sl   ')(<77O??AAo8A!YYq\
 9)&99GLPQ#'99Q<L ML (*::O o8A!%1 9 /=W.EO'50O4G4GY^4_4_!),V4C'(IJJ(1DJ'++N;;H -2 F,1D#'	?$**#D !<Yd ;,1   F 4::
E*3|;L.LMNl1134
%(3z?:Q>O}S_,==n *|3.?vj\S T T |))+1MN$0=B$CUktTU$C$.$($8$83t?U?U;V$V$3[ 98
 ML$  98 F %DsY   H"I H40I II) I)I."
H14
I>I I 
II
I&c              #     #    U R                   R                  5        VVs0 s H  u  pX!_M	     nnn[        U R                   5       H&  u  pEX4   S[        R                  R
                  4v   M(     g s  snnf 7f)Ng        )r   rB   	enumerategguf	TokenTypeNORMAL)r+   encoded_tokr   reverse_vocabi_s         r-   
bpe_tokensBpeVocab.bpe_tokens  sa     @D

@P@P@RS@R_[@RSdjj)DA"C)>)>>> * Ts   A7A1A	A7c              #     #    U R                    H3  nSnUR                  S5      U[        R                  R                  4v   M5     g 7fN     @rV   )r   encoder  r  CONTROLr+   r  scores      r-   r   BpeVocab.added_tokens  s:     **DE++g&t~~/E/EEE +   AAc              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frO   )r  r   r5   s    r-   r   BpeVocab.all_tokens  s/     ??$$$$$&&& 	%&   848688c                N    SU R                    S[        U R                  5       S3$ )Nz<BpeVocab with  base tokens and  added tokens>r  r4   r   r5   s    r-   r6   BpeVocab.__repr__  s,     !5!5 66GDLbLbHcGddrssr0   )r   r   r   r   r   r  Nr   r   r   )r   r   r   r   r   re   r.   r  r   r   r6   r   rz   r0   r-   r   r   d  s(    OD24h?F
'tr0   r   c                  N    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	rg
)SentencePieceVocabi  llamaspmc           	        [         c  [        S5      e0 nUS-  =nR                  5       (       a.   [        US-  SS9 n[        R
                  " U5      nS S S 5        O/UR                  S-  =nR                  5       (       d  [        S5      e[        5       U l        U R                  R                  [        U5      5        U R                  R                  5       nUR                  5        VVs0 s H  u  pgXu:  d  M  Xv_M     nnn[        [        XU[        U5      -   5      5      n	[!        UR#                  5       5      n
X:w  a  [%        SU	 SU
 35      eX l        U
 Vs/ s H  oxU   PM	     snU l        XPl        U R*                  [        U R(                  5      -   U l        X0l        g ! , (       d  f       GN*= f! [         a     GN:f = fs  snnf s  snf )	Nzsentencepiece is not installedztokenizer.modelr   rV   rW   zCannot find tokenizer.modelzExpected new token IDs z to be sequential; got )r   RuntimeErrorr  r_   r   r   r  parentsentencepiece_tokenizerLoadFromFiler   r   rB   r   r	  r4   r
  keysrr   r   r   r  r   )r+   r   r   r   r   r   piecer   
new_tokensexpected_new_idsactual_new_idss              r-   r.   SentencePieceVocab.__init__  s   !)?@@')(+<<<ODDFF)&99GLPQ#'99Q<L ML &/%5%58I%II/QQSS#$ABB'='?$$$11#o2FG11<<>
7C7I7I7K`7K)%rO_IBI7K
`js:2N OP!*//"34-67G6HH_`n_opqq #/<J"KNbb>N"K","&"6"6T=S=S9T"T"11 ML$  a #Ls@   F9 F'F9 ,G
;G
G'
F61F9 6F9 9
GGc              #    #    U R                   n[        UR                  5       5       GH  nUR                  U5      nUR	                  S5      nUR                  U5      n[        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                  nUR                  U5      (       a  [        R                  R                   nXEU4v   GM     g 7fNrV   )r9  r	  r   	IdToPiecer$  GetScorer  r  r  	IsUnknownUNKNOWN	IsControlr%  IsUnusedUNUSEDIsByteBYTE)r+   r   r  r<  r  r'  toktypes          r-   sentencepiece_tokens'SentencePieceVocab.sentencepiece_tokens  s     00	y++-.A''*E <<0D$--a0Enn++G""1%%..00""1%%..00 !!!$$..//""..--w&&) /s   EEc              #     #    U R                    H3  nSnUR                  S5      U[        R                  R                  4v   M5     g 7fr"  )r   r$  r  r  USER_DEFINEDr&  s      r-   r   SentencePieceVocab.added_tokens  s:     **DE++g&t~~/J/JJJ +r)  c              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frO   )rM  r   r5   s    r-   r   SentencePieceVocab.all_tokens  s1     ,,...$$&&& 	/&r,  c                N    SU R                    S[        U R                  5       S3$ )Nz<SentencePieceVocab with r.  r/  r0  r5   s    r-   r6   SentencePieceVocab.__repr__  s.    *4+?+?*@@QRUVZVlVlRmQnn|}}r0   )r   r   r   r9  r   r  Nr   r   r   )r   r   r   r   r   re   r.   rM  r   r   r6   r   rz   r0   r-   r3  r3    s(    OD 2D'0K
'~r0   r3  c                  h    \ rS rSrSrSrSS jrSS jrSS jrSS jr	SS jr
S	 rSS
 jrSS jrSrg)LlamaHfVocabi  r4  hfftc                H   US-  n[        USS9 n[        R                  " U5      nS S S 5        WS   nUS   S:H  =(       a0    UR                  SS5      =(       a    UR                  S	S
5      (       + nU(       a  [	        S5      eU(       d7  US   S:w  d#  UR                  S	S5      (       a  US   S   S:w  a  [        S5      e SSKJn  UR                  UUS
S9U l
        U R                  R                  (       d   e/ U l        [        5       U l        [        5       U l        [#        U R                  R%                  5       R'                  5       S S9 Hd  u  pXR                  R(                  :  d  M   U R                  R+                  U	5        XR                  U	'   U R                   R-                  U
5        Mf     U R                  R.                   V	s0 s H   n	XR                  R1                  5       U	   _M"     sn	U l        [        U R                  R4                  5      U l        U R                  R(                  U l        U R8                  [;        U R                  5      -   U l        X l        g ! , (       d  f       GNO= f! [         a  n[        S5      UeS nAff = fs  sn	f )Nrw   rV   rW   rx   r   r   ignore_mergesFr   Tz'Llama 3 must be converted with BpeVocabr   r   zCannot find Llama BPE tokenizerr   )AutoTokenizerzsTo use LlamaHfVocab, please install the `transformers` package. You can install it with `pip install transformers`.)	cache_dirlocal_files_onlyc                    U S   $ r  rz   )xs    r-   r  'LlamaHfVocab.__init__.<locals>.<lambda>+  s    AaDr0   r  )r_   r   r   r   	TypeErrorr  transformersr[  ImportErrorfrom_pretrainedr   is_fastr   r   r   setadded_tokens_idsr
  get_added_vocabrB   r   rf   addall_special_tokens	get_vocabspecialsall_special_idsspecial_idsr  r4   r   )r+   r   r   r   r  r   	is_llama3r[  etoktokidxs              r-   r.   LlamaHfVocab.__init__  sb   #&66/G4!YYq\N 5 +9*AF#u, ?1D1D_V[1\ ?#''>> 	 EFFF#u,O4G4GY^4_4_i(0J>#$EFF	2 '66! 7 

 ~~%%%% "$!%!$ "NN**,224.
KC 222&&--c2.4&&s+%%))&1
 ~~88
8 ))+C008
 t~~==>  $~~88#33c$:P:P6QQ.{ 54(  	F 	>
s)   I/J 'J/
I>
JJJc              #  n  #    U R                   R                  5       R                  5        VVs0 s H  u  pX!_M	     nnn[        U R                  5       HV  nX@R
                  ;   a  M  X4   R                  S5      nXPR                  U5      U R                  XEU R                  5      4v   MX     g s  snnf 7frB  )
r   rk  rB   r	  r  rg  r$  get_token_scoreget_token_typern  )r+   r  r   r  r   
token_texts         r-   	hf_tokensLlamaHfVocab.hf_tokens@  s     37>>3K3K3M3S3S3U
3UBO3U 	 
 d223H000 '077@J 228<d>Q>Qd&6&6?   4	
s   ,B5B/A9B5c                    [         R                  " SU5      (       a  [        R                  R                  $ X;   a  [        R                  R
                  $ [        R                  R                  $ )Ns   <0x[0-9A-Fa-f]{2}>)re	fullmatchr  r  rK  r%  r  )r+   r   rw  rn  s       r-   rv  LlamaHfVocab.get_token_typeR  sJ    <<.
;;>>&&& *2)@t~~%%[dnnF[F[[r0   c                    g)Nr#  rz   )r+   r   s     r-   ru  LlamaHfVocab.get_token_scoreZ  s     r0   c              #  D  #    U R                    H  nXR                  ;   aI  U R                  U R                  U   SU R                  5      nU R	                  U R                  U   5      nO[
        R                  R                  nSnUR                  S5      X24v   M     g 7f)Nr0   r#  rV   )	r   rl  rv  rn  ru  r  r  rP  r$  )r+   r  rL  r'  s       r-   r   LlamaHfVocab.added_tokens_  s     **D}}$--dmmD.A3HXHXY,,T]]4-@A..55++g&66 +s   BB c                t    SU R                   R                  ;   =(       d    SU R                   R                  ;   $ )Nz<0x0A>
)r   r   r5   s    r-   has_newline_tokenLlamaHfVocab.has_newline_tokenj  s+    4>>///O44>>;O;O3OOr0   c              #  t   #    U R                  5        S h  vN   U R                  5        S h  vN   g  N N7frO   )rx  r   r5   s    r-   r   LlamaHfVocab.all_tokensm  s/     >>###$$&&& 	$&r,  c                N    SU R                    S[        U R                  5       S3$ )Nz<LlamaHfVocab with r.  r/  r0  r5   s    r-   r6   LlamaHfVocab.__repr__q  s-    $T%9%9$::KCPTPfPfLgKhhvwwr0   )	r   rg  r   r   rn  rl  r   r   r  Nr   r   )r   rq   rw  bytesrn  zset[int]r   zgguf.TokenType)r   rq   r   floatr   )r   r   r   r   r   re   r.   rx  rv  ru  r   r  r   r6   r   rz   r0   r-   rW  rW    s9    OD@/D$\
	7P'xr0   rW  c                      \ rS rSrSrSrSrg)MistralTokenizerTypeiu  r5  tekkenrz   N)r   r   r   r   r5  r  r   rz   r0   r-   r  r  u  s    
CFr0   r  c            	        [        [        [        S5      [        S5      S-   5      5      [        [        [        S5      [        S5      S-   5      5      -   [        [        [        S5      [        S5      S-   5      5      -   n U SS nS	n[        S
5       H4  nX0;  d  M
  U R                  U5        UR                  S
U-   5        US-  nM6     U Vs/ s H  n[	        U5      PM     nn[        [        X5      5      $ s  snf )a  
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
!~r      ¡   ¬   ®   ÿNr   r   )r   r	  r   rf   r   r   zip)bscsnbcs_strs        r-   bytes_to_unicoder  }  s     	U3s8SX\*+
uSYD	A.
/	0
uSYD	A.
/	0 
 
AB	A4[;IIaLIIdQhFA	 
 !!bc!fbF!B   "s   C:c                  J   \ rS rSr% SrSr0 rS\S'   / rS\S'   SS jr	\
SS j5       r\
SS	 j5       rSS
 jrSS jrSS jr\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       r\
SS j5       rSS jr\S 5       rS rSrg)MistralVocabi  mistralr   r   r   r   c                   [         (       d  [        S5      e[        c   S5       e[        c   S5       e[        c   S5       e[
        R                  SU 35        UR                  S5       Vs/ s H)  o"R                  5       (       d  M  UR                  5       PM+     nn[        b  [        U5      nOs[        U5      n[        U5      S:X  a  [        SU 35      e[        U5      S:  a3  SU;   a  SnO[        U5      S	   n[
        R                  S
U SU 35        OUS   nX-  n[        R                  " U5      R                   R"                  U l        [%        U R"                  [        5      (       a  [&        R(                  O[&        R*                  U l        U R"                  R.                  U l        X@l        SU R,                  R4                  -   S-   U R"                  R6                  -   U l        g s  snf )NzwTo use MistralVocab, please install the `mistral-common` package. You can install it with `pip install mistral-common`.mistral_common is not installedzLoading Mistral tokenizer from z**/*r   z*No tokenizer file found in the directory: r   ztekken.jsonr   z"Multiple tokenizer files found in z. Using zmistral--)_mistral_common_installedrc  r   r   r   r>   r?   r   r^   as_posixr   r4   rr   r
  rA   	from_fileinstruct_tokenizerr   rp   r  r  r5  tokenizer_typen_wordsr   r   rJ   version_name)r+   r   r   	all_filestokenizer_file_pathvalid_tokenizer_filesr   s          r-   r.   MistralVocab.__init__  s   ((H  -8[:[[8+N-NN+%H'HH%5i[AB ,5>>&+AQ+AaYY[\QZZ\+A	Q'3">y"I$A)$L!()Q. #Mi[!YZZ()A- $99%2N%+,A%B2%FN88NK[\ "7q!9"+"<.88


YY 	
 $..*55 !''%)) 	
 ..002,,222S84>>;Q;QQ 	
C Rs   ,G1G1c                    U R                   $ rO   )r  r5   s    r-   tokenizer_nameMistralVocab.tokenizer_name  s    zzr0   c                F    U R                   [        R                  :X  a  S$ S$ )Nr4  r   )r  r  r5  r5   s    r-   gguf_tokenizer_model!MistralVocab.gguf_tokenizer_model  s!    --1E1I1IIwUvUr0   c              #  "  #    [         c   S5       e[        U R                  [         5      (       d   S[        U R                  5       35       e[	        U R                  R
                  R                  5       5       GH  nU R                  R
                  R                  U5      nUR                  S5      nU R                  R
                  R                  U5      n[        R                  R                  nU R                  R
                  R                  U5      (       a  [        R                  R                  nU R                  R
                  R                  U5      (       a  [        R                  R                   nU R                  R
                  R#                  U5      (       a  [        R                  R$                  nU R                  R
                  R'                  U5      (       a  [        R                  R(                  nX4U4v   GM     g 7f)Nr  z%Expected SentencePieceTokenizer, got rV   )r   rp   r   r   r	  _modelr   rC  r$  rD  r  r  r  rE  rF  rG  r%  rH  rI  rJ  rK  )r+   r  r<  r  r'  rL  s         r-   _sentencepiece_tokens"MistralVocab._sentencepiece_tokens  sr    %1T3TT1$..*@AA 	
3D4H3IJ	
A t~~,,779:ANN))33A6E<<(D>>0099!<Enn++G~~$$..q11..00~~$$..q11..00~~$$--a00..//~~$$++A....--w&&! ;s   HHc              #  B  #    [         c   S5       e[        U R                  [         5      (       d   S[        U R                  5       35       e[	        5       n[        U R                  R                  5       HJ  nU R                  R                  U5      R                  S5      S[        R                  R                  4v   ML     U R                  R                   H@  nU R                  X15      R                  S5      S[        R                  R                  4v   MB     g 7f)Nr  Expected Tekkenizer, got rV   r   )r   rp   r   r   r  r	  num_special_tokensid_to_piecer$  r  r  r%  _tekken_token2id_nospecialtoken_bytes_to_stringr  )r+   byte_encoderr   tokens       r-   _tekken_tokensMistralVocab._tekken_tokens  s     %H'HH%$..*55 	
'T^^(<'=>	
5 ()dnn??@H**84;;GD&&  A ^^>>E**5?FFwO%%  ?s   DDc                   [         b  [        c   S5       eU R                  [        R                  :X  aF  [        U R                  [         5      (       d   eU R                  R                  R                  U5      $ U R                  [        R                  :X  a]  [        U R                  [        5      (       d   eU R                  R                  R                  U5      U R                  R                  -   $ [        SU R                   35      e)Nr  Unknown tokenizer type: )r   r   r  r  r5  rp   r   _vocabindexr  r  rr   )r+   r  s     r-   get_token_idMistralVocab.get_token_id  s    %1j6LoNooL"6":"::dnn.DEEEE>>((..u55  $8$?$??dnnj9999%%++E2T^^5V5VV 78K8K7LMNNr0   c                .    U R                   R                  $ rO   )r   bos_idr5   s    r-   r  MistralVocab.bos_id      ~~$$$r0   c                .    U R                   R                  $ rO   )r   eos_idr5   s    r-   r  MistralVocab.eos_id  r  r0   c                z    U R                   R                  S:X  a  U R                  $ U R                   R                  $ )Nr   )r   pad_idr  r5   s    r-   r  MistralVocab.pad_id  s.    >>  B&;;~~$$$r0   c                .    U R                   R                  $ rO   )r   unk_idr5   s    r-   r  MistralVocab.unk_id  r  r0   c                `    U R                   R                  U R                   R                  5      $ rO   )r   r  r  r5   s    r-   r   MistralVocab.bos_token#  !    ~~))$..*?*?@@r0   c                `    U R                   R                  U R                   R                  5      $ rO   )r   r  r  r5   s    r-   r   MistralVocab.eos_token'  r  r0   c                `    U R                   R                  U R                   R                  5      $ rO   )r   r  r  r5   s    r-   	pad_tokenMistralVocab.pad_token+  r  r0   c                `    U R                   R                  U R                   R                  5      $ rO   )r   r  r  r5   s    r-   	unk_tokenMistralVocab.unk_token/  r  r0   c              #    #    U R                   [        R                  :X  a  U R                  5        S h  vN   g U R                   [        R                  :X  a  U R                  5        S h  vN   g [        SU R                    35      e NT N7f)Nr  )r  r  r5  r  r  r  rr   r5   s    r-   r   MistralVocab.all_tokens3  sy     "6":"::11333  $8$?$??**,,, 78K8K7LMNN 4 -s!   2BB	6B+B,BBc           	         SR                  U R                  S5       Vs/ s H  o![        U5         PM     sn5      $ s  snf )NrY   latin-1)r   decoder   )r  r  chars      r-   r  "MistralVocab.token_bytes_to_string=  s5    wwAHHY<OP<ODSY/<OPQQPs   =c                   ^ [         b  [        U R                  [         5      (       d   S[        U R                  5       35       eU R                  R                  R
                  mTR                  5        VVs0 s H  u  pX!_M	     nnn/ n[        SU R                  U R                  R                  -
  5       H  nX5   n/ n[        S[        U5      5       H9  nUS U n	XhS  n
U	T;   d  M  U
T;   d  M  X-   T;   d  M&  UR                  XU45        M;     U(       d   [        SU SUR                  S5       35      e[        UU4S jSS	9nUR                  U5        M     [        US
 SS	9n[!        5       nU Vs/ s H-  nU R#                  US   U5      U R#                  US   U5      /PM/     nnU VVs/ s H;  nSR%                  U Vs/ s H  nSR%                  S U 5       5      PM     sn5      PM=     nnnU$ s  snnf s  snf s  snf s  snnf )Nr  r   r   z-Could not find valid merge for token at rank ro   r  c                $   > TU S      TU S      4$ )Nr   r   rz   )r_  mergeable_rankss    r-   r  >MistralVocab.extract_vocab_merges_from_model.<locals>.<lambda>`  s    qt4oad6KLr0   F)r  reversec                    U S   $ )Nr\   rz   )vals    r-   r  r  d  s    #a&r0   r   r]   rY   c              3  ^   #    U  H#  oS :X  a  [        [        U5      S-   5      OUv   M%     g7fr   r   r   s     r-   r~   ?MistralVocab.extract_vocab_merges_from_model.<locals>.<genexpr>t  s&     O$QcCA-q@$s   +-)r   rp   r   r   r  _mergeable_ranksrB   r	  r   r  r4   rf   rr   r  r
  extendr  r  r   )r+   token_bytesranktoken_bytes_mapmerge_pairsr  merged_tokenlocaljleftrightr  r  decoded_merge_pairsr|   r   r   r  s                    @r-   extract_vocab_merges_from_model,MistralVocab.extract_vocab_merges_from_modelA  sH    %*T^^Z*P*P 	
'T^^(<'=>	
P ..//@@7F7L7L7N
7N"3+D7N 	 
  sDOOdnn.O.OOPA*-LE1c,/0#BQ'$R(O+0/9LL$q!12 1  CA3bI\I\]fIgHhi  LE
 u%+ Q, [.@%P') #

 # **3q6<@**3q6<@ # 	 
  ,	
 , HH !% !% GGO$OO $ , 	 	
 g
@
	
s$   8G:84H 3H
"H*H
H
)r  r   r   r  r   Nr   r   r   )r  r   r   rq   )r   rq   )r   r   r   r   r   re   r   r   r   r.   propertyr  r  r  r  r  r  r  r  r  r   r   r  r  r   staticmethodr  r  r   rz   r0   r-   r  r    s+   OD(*~*#%y%0
d   V V'0(O % % % % % %
 % % A A A A A A A AO R R:r0   r  )r   zdict[int, str])2
__future__r   enumr   r{  loggingr   ospathlibr   typingr   r   r   r	   r
   r   r   r   sentencepiecer   rc  r   (mistral_common.tokens.tokenizers.mistralr   'mistral_common.tokens.tokenizers.tekkenr   &mistral_common.tokens.tokenizers.utilsr   .mistral_common.tokens.tokenizers.sentencepiecer   r  r   r  gguf_writerr   	getLoggerr   r>   r   r   r   r   r   r3  rW  r   r  r  r  rz   r0   r-   <module>r     s   "  	   	  d d d'4%IB !%-  #			8	$V Vr   
 OIx O OEi EHtu HtVH~ H~Vxx5 xxv3 !4d5 dU  '"&C&'  . % c J"&C&)-!3-.  -(, #,-s5   C1  D D. 1D D$D+*D+.D>=D>