o
    h.                     @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ eeZdd
dZdedefddZdd ZG dd dZG dd deZ dedefddZ!G dd dZ"G dd de"Z#G dd de"Z$G dd  d e"Z%G d!d" d"e"Z&G d#d$ d$e"Z'G d%d& d&e"Z(G d'd( d(e"Z)G d)d* d*e"Z*G d+d, d,e"Z+G d-d. d.e"Z,G d/d0 d0e"Z-G d1d2 d2e"Z.G d3d4 d4e.Z/G d5d6 d6e.Z0G d7d8 d8e.Z1G d9d: d:e.Z2G d;d< d<e.Z3G d=d> d>e.Z4G d?d@ d@e.Z5G dAdB dBe.Z6G dCdD dDe.Z7G dEdF dFe.Z8G dGdH dHe.Z9G dIdJ dJe.Z:G dKdL dLe.Z;G dMdN dNe.Z<G dOdP dPe.Z=G dQdR dRe.Z>G dSdT dTe"Z?G dUdV dVe.Z@G dWdX dXe"ZAG dYdZ dZe"ZBG d[d\ d\e"ZCG d]d^ d^e.ZDG d_d` d`e.ZEG dadb dbe.ZFG dcdd dde"ZGG dedf dfe.ZHG dgdh dhe.ZIdidj ZJG dkdl dlZKi dme/dne+doe0dpe#dqe@dreCdse1dteAdue(dve#dwe-dxe2dye#dze#d{e#d|e#d}e#i d~e/de%de(de)de#de#de+de7de+de+de#deGde3de4de&de#de+i de5de'de<de*de#de9de:de#de+de,de6de#de=de>de?de7de8e$eDeFeFeEeFdZLddefddZMdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR c                 C   sj   t  rddlm} |S t r.dd l}t|jjtdk r&ddl	m} |S ddl	m
} |S tt| )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   google r    w/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf"   s   r"   add_prefix_spacereturnc                 C   s$   | rd}t |ddsd}|S d}|S )NalwayslegacyTfirstnever)getattr)r#   original_tokenizerprepend_schemer    r    r!   _get_prepend_scheme3   s   r,   c           
         s   |d u}|r
t |n }g }| D ]<\}}g }tdt|D ]}|d | ||d  }}	| v r>|	 v r>|||	|f qt| fddd}|| qt|dd |d}dd |D }|S )	Nr   c                        | d   | d  fS Nr   r   r    xvocabr    r!   <lambda>H       z!generate_merges.<locals>.<lambda>keyc                 S   s   | d t | d t | d fS )N   r   r   )lenvalr    r    r!   r3   K   s    r6   reversec                 S   s   g | ]
}|d  |d fqS r   r   r    .0r:   r    r    r!   
<listcomp>L       z#generate_merges.<locals>.<listcomp>)dictitemsranger8   appendsortedextend)
r2   vocab_scoresr<   mergesmergepiece_scorelocalindexpiece_lpiece_rr    r1   r!   generate_merges=   s   rP   c                   @   sB   e Zd ZdZdefddZd	deeeef e	e f fddZ
dS )
SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 C   s.   t | d ddlm} | | _| j| d S )Nr   r   )SentencePieceProcessor)r   r   rS   spLoad)selfrR   rS   r    r    r!   __init__U   s   
zSentencePieceExtractor.__init__Nr$   c                    s2   | j   fddt  D }t||}||fS )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                       i | ]}  ||qS r    id_to_piecer?   rM   rT   r    r!   
<dictcomp>b   r4   z2SentencePieceExtractor.extract.<locals>.<dictcomp>)rT   rD   GetPieceSizerP   rV   rH   r2   rI   r    r]   r!   extract\   s   
zSentencePieceExtractor.extractN)__name__
__module____qualname____doc__strrW   tuplerB   intlistra   r    r    r    r!   rQ   P   s    (rQ   c                   @   s0   e Zd Zddeeeef ee f fddZdS )GemmaSentencePieceExtractorNr$   c                    sH   | j   fddt  D }d|vr|d|d< t||}||fS )rX   c                    rY   r    rZ   r\   r]   r    r!   r^   p   r4   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>	<0x09>)rT   rD   r_   getrP   r`   r    r]   r!   ra   j   s   
z#GemmaSentencePieceExtractor.extractrb   )	rc   rd   re   rh   rB   rg   ri   rj   ra   r    r    r    r!   rk   i   s    (rk   piecec                 C   s&   t | dk p| d dkp| d   S )Nr7   ,)r8   isdigit)ro   r    r    r!   check_number_commaz   s   &rt   c                   @   s"   e Zd Zdd ZdefddZdS )	Converterc                 C   s
   || _ d S rb   )r*   )rV   r*   r    r    r!   rW      s   
zConverter.__init__r$   c                 C   s   t  rb   )NotImplementedErrorrV   r    r    r!   	converted   s   zConverter.convertedN)rc   rd   re   rW   r   rx   r    r    r    r!   ru   ~   s    ru   c                   @      e Zd ZdefddZdS )BertConverterr$   c           
      C      | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr*   r2   r   r   rg   r}   hasattrr~   tokenize_chinese_charsr   do_lower_caser   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr	   TemplateProcessingpost_processorr   decoder
rV   r2   	tokenizerr   r   r   clssepr   r   r    r    r!   rx      :   



zBertConverter.convertedNrc   rd   re   r   rx   r    r    r    r!   rz          rz   c                   @   ry   )SplinterConverterr$   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkrx| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )Nr|   Fr~   Tr   .rightr    r   r   r   r   r   r   )r*   r2   r   r   rg   r}   r   r~   r   r   r   r   r   r   r   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider	   r   r   r   r   )rV   r2   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   r    r    r!   rx      sL   



$"
zSplinterConverter.convertedNr   r    r    r    r!   r      r   r   c                   @   ry   )FunnelConverterr$   c           
      C   r{   )Nr|   Fr~   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   r    r    r!   rx      r   zFunnelConverter.convertedNr   r    r    r    r!   r      r   r   c                   @   ry   )MPNetConverterr$   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )Nr|   Fr~   Tr   r   r   z:0 r   r   r   r   r   r   r   r    r    r!   rx     s:   



zMPNetConverter.convertedNr   r    r    r    r!   r     r   r   c                   @   ry   )OpenAIGPTConverterr$   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d ur/|
t|g tjdd|_t |_tjdd|_|S )N</w>F)r2   rI   dropoutr}   end_of_word_suffixfuse_unkT)r   suffix)r*   encoderrj   	bpe_rankskeysr}   r   r
   rg   token_to_idadd_special_tokensr   r   r   r   r   r   r   
BPEDecoderr   rV   r2   rI   r}   r   r    r    r!   rx   .  s&   
zOpenAIGPTConverter.convertedNr   r    r    r    r!   r   -  r   r   c                   @   8   e Zd Zddeeef deeeef  defddZ	dS )GPT2ConverterNr2   rI   r$   c              	   C   s   |s| j j}|st| j j}tt||d dddd}t| j dd}tj|d|_	t
 |_t| j ddrP| j j}| j j}tj| d| d||fgd	|_|S tjdd
|_|S )Nr   Fr2   rI   r   continuing_subword_prefixr   r   r#   r#   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r*   r   rj   r   r   r
   r)   r   	ByteLevelr   r   r   	bos_tokenbos_token_idr	   r   r   )rV   r2   rI   r   r#   bosr   r    r    r!   rx   I  s:   
zGPT2Converter.convertedNN
rc   rd   re   rB   rg   ri   rj   rh   r   rx   r    r    r    r!   r   H      0r   c                   @   ry   )HerbertConverterr$   c                 C   s   d}d}| j j}t| j j }||d d v r|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r   r   r   )r   r}   r   F)r   r   r   )r   r   )r*   r   rj   r   r   r   r
   r}   r   r   r   r   r   r   r   r   r   r	   BertProcessingr   r   r   r   r   )rV   tokenizer_info_strtoken_suffixr2   rI   r   r    r    r!   rx   o  s.   

zHerbertConverter.convertedNr   r    r    r    r!   r   n  r   r   c                   @   r   )Qwen2ConverterNr2   rI   r$   c                 C   s   |s| j j}|st| j j }tt||d d ddddd}t |_	t
t
jtddddt
jt| j ddddg|_t |_tjdd	|_|S )
Nr   F)r2   rI   r   r}   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr#   r#   	use_regexr   )r*   r   rj   r   r   r   r
   r   NFCr   r   SequenceSplitr   r   r)   r   r   r   r	   r   )rV   r2   rI   r   r    r    r!   rx     sD   

zQwen2Converter.convertedr   r   r    r    r    r!   r     r   r   c                   @   ry   )RobertaConverterr$   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )Nr   Fr   r   Tr   r   r#   r   )r*   r   rj   r   r   r   r
   r   r   r#   r   r   r   r	   RobertaProcessingr   r   r   r   r   rV   otr2   rI   r   r    r    r!   rx     s,   


zRobertaConverter.convertedNr   r    r    r    r!   r     r   r   c                   @   ry   )RoFormerConverterr$   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdr*| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerr|   Fr~   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr   r*   r2   r   r   rg   r}   r   r~   r   r   r   r   r   r   PreTokenizercustomr   r   r   r   r   r	   r   r   r   r   )
rV   r   r2   r   r   r   r   r   r   r   r    r    r!   rx     s8   

zRoFormerConverter.convertedNr   r    r    r    r!   r     r   r   c                   @   ry   )DebertaConverterr$   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r*   r   rj   r   r   r   r
   r   r   r#   r   r   r   r	   r   r   r   r   r    r    r!   rx     s.   
	zDebertaConverter.convertedNr   r    r    r    r!   r     r   r   c                       sn   e Zd ZdZeZi Z fddZdd Zdd Z	dd	 Z
d
d Zdd Zdd Zdd ZdefddZ  ZS )SpmConverterFc                    s   t | d t j|  t }| }t| jjd}||	  W d    n1 s+w   Y  || _
| j
jjrB| jsDtd d S d S d S )Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrW   r"   
ModelProtoopenr*   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rV   args	model_pb2mf	__class__r    r!   rW   !  s   
zSpmConverter.__init__c                 C      dd |j D S )Nc                 S      g | ]}|j |jfqS r    ro   scorer?   ro   r    r    r!   r@   7  r4   z&SpmConverter.vocab.<locals>.<listcomp>piecesrV   r   r    r    r!   r2   6     zSpmConverter.vocabc                 C   s   |j jS rb   )r   unk_idr  r    r    r!   r	  9     zSpmConverter.unk_idc           	   	      s   |j j} |}|dkrtt| | jd}n-|dkrD  jj	
|\}}dd t|D }tt|||j jd jd d}ntd fd	d
t|jD }|dd
 t|dd dD  |S )Nr   r	  r   r7   c                 S   s   i | ]	\}\}}||qS r    r    )r?   iwordr  r    r    r!   r^   K      z*SpmConverter.tokenizer.<locals>.<dictcomp>T)r}   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                    8   g | ]\}}|j d v r||j|j dkp|j jv fqS )      r  typero   r   r?   idprw   r    r!   r@   `  
    
z*SpmConverter.tokenizer.<locals>.<listcomp>c                 S   s    g | ]\}}}t |d |dqS )F
normalizedspecialr   r?   r  tokenr  r    r    r!   r@   f  s    c                 S      | d S Nr   r    r/   r    r    r!   r3   h      z(SpmConverter.tokenizer.<locals>.<lambda>r5   )r   
model_typer2   r   r   r	  r   SpmExtractorr*   r   ra   	enumerater
   	unk_piece	Exceptionr  
add_tokensrF   )	rV   r   r"  rH   r   _rI   	bpe_vocabspm_added_tokensr    rw   r!   r   <  sF   

zSpmConverter.tokenizerc                 C   sJ   |j j}tjdddttddg}|st|S tt|g| S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr   StripReplacer   r   PrecompiledrV   r   r/  _normalizersr    r    r!   r   n  s   
zSpmConverter.normalizerc                 C      t || j}tj||dS Nreplacementr+   )r,   r*   r   	MetaspacerV   r8  r#   r+   r    r    r!   r   y     zSpmConverter.pre_tokenizerc                 C      d S rb   r    rw   r    r    r!   r   }     zSpmConverter.post_processorc                 C   r5  r6  )r,   r*   r   r9  r:  r    r    r!   r     r;  zSpmConverter.decoderr$   c                 C   s   |  | j}| | j}|d ur||_d}d}t| jdr!| jj}| ||}|d ur.||_| |||_|  }|r>||_|S )Nr-  Tr#   )	r   r   r   r   r*   r#   r   r   r   )rV   r   r   r8  r#   r   r   r    r    r!   rx     s    zSpmConverter.converted)rc   rd   re   r   rQ   r#  r   rW   r2   r	  r   r   r   r   r   r   rx   __classcell__r    r    r   r!   r     s    2r   c                   @   $   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   r   )Nc                 S   2   g | ]}t |jr|j|jfn|j|jd  fqS d   rt   ro   r  r  r    r    r!   r@         $z)AlbertConverter.vocab.<locals>.<listcomp>r  r  r    r    r!   r2        zAlbertConverter.vocabc                 C      t ddt ddg}| jjs|t   |t   | jjr)|t   |j	j
}|r7|t | |t tdd t |S Nz``"z''r,  r   r   r1  r*   keep_accentsrE   NFKDStripAccentsr   	Lowercaser.  r/  r2  r   r   rV   r   list_normalizersr/  r    r    r!   r        


zAlbertConverter.normalizerc                 C   ,   t jddd| jdfd| jdfgdS Nr   r   r   r   r   r	   r   r*   r   rw   r    r    r!   r        zAlbertConverter.post_processorNrc   rd   re   r2   r   r   r    r    r    r!   r@        r@  c                   @      e Zd Zdd Zdd ZdS )BarthezConverterc                 C      d}|S Nr  r    rV   r   r	  r    r    r!   r	       zBarthezConverter.unk_idc                 C   rR  Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   rT  rw   r    r    r!   r     rU  zBarthezConverter.post_processorN)rc   rd   re   r	  r   r    r    r    r!   rY    s    rY  c                   @   r?  )CamembertConverterc                 C   2   g d}|dd |j dd  D 7 }|dg7 }|S )N))z
<s>NOTUSED        <pad>rc  )z</s>NOTUSEDrc  z<unk>rc  )z<unk>NOTUSEDic                 S   r  r    r  r  r    r    r!   r@     r4   z,CamembertConverter.vocab.<locals>.<listcomp>r   z<mask>rc  r  rV   r   r2   r    r    r!   r2     s   
zCamembertConverter.vocabc                 C      dS r[  r    r  r    r    r!   r	       zCamembertConverter.unk_idc                 C   rR  r^  rT  rw   r    r    r!   r     rU  z!CamembertConverter.post_processorNrc   rd   re   r2   r	  r   r    r    r    r!   ra    s    ra  c                   @   r?  )DebertaV2Converterc                 C   sH   g }| j jr|tjdd t|| j }|tj||d t|S )Nr   )r   r7  )r*   split_by_punctrE   r   Punctuationr,   r9  r   )rV   r8  r#   list_pretokenizersr+   r    r    r!   r     s   
z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|r"|t| |t	t
dd t|S )Nr,  r   )r*   r   rE   r   rN  r0  r.  r/  r2  r1  r   r   rO  r    r    r!   r     s   
zDebertaV2Converter.normalizerc                 C   rR  rS  rT  rw   r    r    r!   r     rU  z!DebertaV2Converter.post_processorN)rc   rd   re   r   r   r   r    r    r    r!   rl    s    rl  c                   @   r?  )MBartConverterc                 C   >   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )Nr_  rc  rd  r`  rc  rf  c                 S   r  r    r  r  r    r    r!   r@     r4   z(MBartConverter.vocab.<locals>.<listcomp>r  )ar_ARrc  cs_CZrc  de_DErc  en_XXrc  es_XXrc  et_EErc  fi_FIrc  fr_XXrc  gu_INrc  hi_INrc  it_ITrc  ja_XXrc  kk_KZrc  ko_KRrc  lt_LTrc  lv_LVrc  my_MMrc  ne_NPrc  nl_XXrc  ro_ROrc  ru_RUrc  si_LKrc  tr_TRrc  vi_VNrc  zh_CNrc  rg  r  rh  r    r    r!   r2     s
   
zMBartConverter.vocabc                 C   ri  r[  r    r  r    r    r!   r	  7  r=  zMBartConverter.unk_idc                 C   rR  )Nz$A </s> en_XXz$A $B </s> en_XXr|  r`  r   rT  rw   r    r    r!   r   :  rU  zMBartConverter.post_processorNrk  r    r    r    r!   rp    s    &rp  c                   @   r?  )MBart50Converterc                 C   rq  )Nrr  c                 S   r  r    r  r  r    r    r!   r@   M  r4   z*MBart50Converter.vocab.<locals>.<listcomp>r  )4ru  rw  ry  r{  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZArc  )az_AZrc  )bn_INrc  )fa_IRrc  )he_ILrc  )hr_HRrc  )id_IDrc  )ka_GErc  )km_KHrc  )mk_MKrc  )ml_INrc  )mn_MNrc  )mr_INrc  )pl_PLrc  )ps_AFrc  )pt_XXrc  )sv_SErc  )sw_KErc  )ta_INrc  )te_INrc  )th_THrc  )tl_XXrc  )uk_UArc  )ur_PKrc  )xh_ZArc  )gl_ESrc  )sl_SIrc  rg  r  rh  r    r    r!   r2   F  s
   
zMBart50Converter.vocabc                 C   ri  r[  r    r  r    r    r!   r	  R  r=  zMBart50Converter.unk_idc                 C   rR  )Nzen_XX $A </s>zen_XX $A $B </s>r|  r`  r   rT  rw   r    r    r!   r   U  rU  zMBart50Converter.post_processorNrk  r    r    r    r!   r  E  s    r  c                   @   r?  )NllbConverterc                 C   (   g d}|dd |j dd  D 7 }|S )Nrr  c                 S   r  r    r  r  r    r    r!   r@   h  r4   z'NllbConverter.vocab.<locals>.<listcomp>r  r  rh  r    r    r!   r2   a     zNllbConverter.vocabc                 C   ri  r[  r    r  r    r    r!   r	  k  r=  zNllbConverter.unk_idc                 C   rR  )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr`  r   rT  rw   r    r    r!   r   n  rU  zNllbConverter.post_processorNrk  r    r    r    r!   r  `      
r  c                   @   r?  )SeamlessM4TConverterc                 C   r  )N)rd  rf  rs  rt  c                 S   r  r    r  r  r    r    r!   r@     r4   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>r  r  rh  r    r    r!   r2   z  r  zSeamlessM4TConverter.vocabc                 C   s   | j jS rb   )r*   unk_token_idr  r    r    r!   r	    r
  zSeamlessM4TConverter.unk_idc                 C   rR  )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r`  r   rT  rw   r    r    r!   r     rU  z#SeamlessM4TConverter.post_processorNrk  r    r    r    r!   r  y  r  r  c                   @   r?  )XLMRobertaConverterc                 C   rb  )Nrr  c                 S   r  r    r  r  r    r    r!   r@     r4   z-XLMRobertaConverter.vocab.<locals>.<listcomp>r  rg  r  rh  r    r    r!   r2     s   
zXLMRobertaConverter.vocabc                 C   rZ  r[  r    r\  r    r    r!   r	    r]  zXLMRobertaConverter.unk_idc                 C   rR  r^  rT  rw   r    r    r!   r     rU  z"XLMRobertaConverter.post_processorNrk  r    r    r    r!   r        r  c                   @   r?  )XLNetConverterc                 C   r   )Nc                 S   rA  rB  rD  r  r    r    r!   r@     rE  z(XLNetConverter.vocab.<locals>.<listcomp>r  r  r    r    r!   r2     rF  zXLNetConverter.vocabc                 C   rG  rH  rJ  rO  r    r    r!   r     rQ  zXLNetConverter.normalizerc                 C   rR  )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   rT  rw   r    r    r!   r     rU  zXLNetConverter.post_processorNrV  r    r    r    r!   r    rW  r  c                   @      e Zd ZdS )ReformerConverterNrc   rd   re   r    r    r    r!   r        r  c                   @   rX  )RemBertConverterc                 C   s   t ddt ddt tddg}| jjs%|t   |t   | jjr0|t 	  |j
j}|r>|t | t |S rH  )r   r1  r   r*   rK  rE   rL  rM  r   rN  r.  r/  r2  r   rO  r    r    r!   r     s   


zRemBertConverter.normalizerc                 C   rR  rS  rT  rw   r    r    r!   r     rU  zRemBertConverter.post_processorN)rc   rd   re   r   r   r    r    r    r!   r    s    r  c                   @   r  )BertGenerationConverterNr  r    r    r    r!   r    r  r  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd ur|| j jdfg7 }| j jd ur2| j j| j jk r2|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )Nrc  c                 S      g | ]
}d | ddfqS )z<unk_>g      Yr    r?   r  r    r    r!   r@   
  rA   z*PegasusConverter.vocab.<locals>.<listcomp>r7   c                 S   r  r    r  r  r    r    r!   r@     r4   )	r*   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrD   r  rh  r    r    r!   r2     s   

zPegasusConverter.vocabc                 C   s   |j j| jj S rb   )r   r	  r*   r  r  r    r    r!   r	    r  zPegasusConverter.unk_idc                 C   s(   t || j}tt tj||dgS r6  )r,   r*   r   r   WhitespaceSplitr9  r:  r    r    r!   r     s   zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Br   )r*   r  eos_token_idr	   r   )rV   eosr   r    r    r!   r     s   
zPegasusConverter.post_processorN)rc   rd   re   r2   r	  r   r   r    r    r    r!   r    s
    	r  c                   @   rX  )T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   r  r    r  r  r    r    r!   r@   %  r4   z%T5Converter.vocab.<locals>.<listcomp>c                 S   r  )z
<extra_id_r  rc  r    r  r    r    r!   r@   &  rA   r   rp   )r*   
_extra_idsr  rD   )rV   r   num_extra_idsr2   r    r    r!   r2   #  s   zT5Converter.vocabc                 C   &   t jddgg dd| jdfgdS Nr  r`  )r  r`  r  r`  r   rT  rw   r    r    r!   r   )     zT5Converter.post_processorN)rc   rd   re   r2   r   r    r    r    r!   r  "  s    r  c                   @      e Zd Zdd ZdS )UdopConverterc                 C   r  r  rT  rw   r    r    r!   r   4  r  zUdopConverter.post_processorNrc   rd   re   r   r    r    r    r!   r  3      r  c                   @   ry   )WhisperConverterr$   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fgt||d|_|S )Nr   Fr   r   r   c                 S   s   g | ]}| d qS )r   r    r?   r  r    r    r!   r@   U  s    z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 r   z $A:0 $B:1 r   r   )r*   r   rj   r   r   r   r
   r   r   r#   r   r   r   prefix_tokensconvert_ids_to_tokensr  r  joinr	   r   zipr   )	rV   r2   rI   r   prefix_token_idsprefixesr  r  prefix_templater    r    r!   rx   ?  s8   
	zWhisperConverter.convertedNr   r    r    r    r!   r  >  r   r  c                   @   r  )BigBirdConverterc                 C   rR  rS  rT  rw   r    r    r!   r   c  rU  zBigBirdConverter.post_processorNr  r    r    r    r!   r  b  r  r  c                   @   ry   )CLIPConverterr$   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )Nr   r   Fr2   rI   r   r   r   r   r}   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r*   r   rj   r   r   r}   r   r
   rg   r   r   r   r1  r   rN  r   r   r   r   r   r   r   r	   r   r  r  r   r   r   r   r    r    r!   rx   o  sD   


zCLIPConverter.convertedNr   r    r    r    r!   r  n  r   r  c                   @   ry   )LayoutLMv2Converterr$   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nr|   FTr~   r   r   r   r   r   r   r   r   r   r   r    r    r!   rx     r   zLayoutLMv2Converter.convertedNr   r    r    r    r!   r    r   r  c                   @   ry   )BlenderbotConverterr$   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )Nr   Fr   r   z$A:0 r   )r   r   )r*   r   rj   r   r   r   r
   r   r   r#   r   r   r   r	   r   r  r  r   r   r    r    r!   rx     s*   

zBlenderbotConverter.convertedNr   r    r    r    r!   r    r   r  c                   @   r?  )XGLMConverterc                 C   s4   g d}|dd |j dd  D 7 }|g d7 }|S )Nrr  c                 S   r  r    r  r  r    r    r!   r@     r4   z'XGLMConverter.vocab.<locals>.<listcomp>r  ))z<madeupword0>rc  )z<madeupword1>rc  )z<madeupword2>rc  )z<madeupword3>rc  )z<madeupword4>rc  )z<madeupword5>rc  )z<madeupword6>rc  r  rh  r    r    r!   r2     s   zXGLMConverter.vocabc                 C   rZ  r[  r    r\  r    r    r!   r	    r]  zXGLMConverter.unk_idc                 C   rR  )Nz</s> $Az</s> $A </s> </s> $Br_  r`  r   rT  rw   r    r    r!   r     rU  zXGLMConverter.post_processorNrk  r    r    r    r!   r    r  r  c                   @   sF   e Zd ZdZeZddhZ	 dd Zdd Zdd	 Z	d
d Z
dd ZdS )GemmaConverterTz<start_of_turn>z<end_of_turn>c                 C      t ddS Nr   r-  )r   r1  r  r    r    r!   r        zGemmaConverter.normalizerc                 C   s|   | j jdf| j jdf| j jdfg}|dd |jdd  D 7 }tdd |D s<tdd t|D d }|d ur<d||< |S )	Nrc  c                 S   r  r    r  r  r    r    r!   r@     r4   z(GemmaConverter.vocab.<locals>.<listcomp>r  c                 s   s    | ]	}|d  dkV  qdS )r   rl   Nr    )r?   r0   r    r    r!   	<genexpr>  s    z'GemmaConverter.vocab.<locals>.<genexpr>c                 s   s$    | ]\}}|d  dkr|V  qdS )r   rm   Nr    )r?   r  r0   r    r    r!   r    s   " )rl   rc  )r*   r  r  r   r  anynextr$  )rV   r   r2   override_indexr    r    r!   r2     s   


zGemmaConverter.vocabc                 C   r  )Nr   merged_with_previous)r   r   rV   r8  r#   r    r    r!   r     r   zGemmaConverter.pre_tokenizerc                 C   rZ  r[  r    r\  r    r    r!   r	    r]  zGemmaConverter.unk_idc                 C   s    t t ddt  t  gS )Nr-  r   )r   r   r1  ByteFallbackFuser  r    r    r!   r   "  s   
zGemmaConverter.decoderN)rc   rd   re   r   rk   r#  r   r   r2   r   r	  r   r    r    r    r!   r    s    
r  c                   @   s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )LlamaConverterTc                 C   sN   | j ddf| j ddf| j ddfg}|dd |jdd  D 7 }|S )Nr   rc  r   r7   c                 S   r  r    r  r  r    r    r!   r@   5  r4   z(LlamaConverter.vocab.<locals>.<listcomp>r  )r*   r  r  rh  r    r    r!   r2   /  s   zLlamaConverter.vocabc                 C   rZ  r   r    r\  r    r    r!   r	  8  r]  zLlamaConverter.unk_idc                 C   <   t ddt  t  g}|r|t jdddg7 }t |S Nr-  r   r   )contentr+  r   r1  r  r  r0  r   rV   r8  r#   sequencer    r    r!   r   <     

zLlamaConverter.decoderc                 C   sT   t | jddr(g }t | jddr|tjddg7 }|tjdddg7 }t|S d S )Nr&   Tr#   r-  )prependr   )patternr  )r)   r*   r   Prependr1  r   )rV   r   r  r    r    r!   r   F  s   
zLlamaConverter.normalizerc                 C   s.   t | jddst|| j}tj||ddS d S )Nr&   TFr8  r+   split)r)   r*   r,   r   r9  r:  r    r    r!   r   O  s   zLlamaConverter.pre_tokenizerc                 C   r<  rb   r    rw   r    r    r!   r   U  rj  zLlamaConverter.post_processorN)
rc   rd   re   r   r2   r	  r   r   r   r   r    r    r    r!   r	  ,  s    	
	r	  c                   @   ry   )MarkupLMConverterr$   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )Nr   Fr  r   z $A z $B r   )r*   r   rj   r   r   r   r
   r}   r   r   r#   r   r   r   rg   r   r   r   r   r	   r   r   )	rV   r   r2   rI   r   r   r   r   r   r    r    r!   rx   [  s8   
	zMarkupLMConverter.convertedNr   r    r    r    r!   r  Z  r   r  c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )MoshiConverterTNc                 K   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S Nr   r   	r   ru   rW   r"   r   r   r   r   r   )rV   r   model_max_lengthkwargsr   r   r   r    r    r!   rW     s   

zMoshiConverter.__init__c                 C   s:   |j j}tddg}|st|S tt|g| S r  )r.  r/  r   r1  r   r2  r3  r    r    r!   r     s   

zMoshiConverter.normalizerc                 C   r
  r  r  r  r    r    r!   r     r  zMoshiConverter.decoderc                 C   s   d}t j||ddS )Nr'   Fr  )r   r9  r:  r    r    r!   r     s   zMoshiConverter.pre_tokenizerrb   )rc   rd   re   r   rW   r   r   r   r    r    r    r!   r    s    


r  c                   @   sR   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )HeliumConverterTNc                 G   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S r  r  )rV   r   r   r   r   r   r    r    r!   rW     s   

zHeliumConverter.__init__c                    s     |}tt| | jd} fddt|jD }|dd t|dd dD  |t	dd	d	d
g |j
ddd |S )Nr  c                    r  r  r  r  rw   r    r!   r@     r  z-HeliumConverter.tokenizer.<locals>.<listcomp>c                 S   s"   g | ]\}}}t |d |ddqS )FT)r  r  single_wordr  r  r    r    r!   r@     s    c                 S   r  r   r    r/   r    r    r!   r3     r!  z+HeliumConverter.tokenizer.<locals>.<lambda>r5   
Fr  re  r  )r  pad_id)r2   r   r   r	  r   r$  r  r'  rF   r   enable_padding)rV   r   rH   r   r*  r    rw   r!   r     s&   

zHeliumConverter.tokenizerc                 C   sB   g }|j D ]}|jdkr|d|jfg7 }q||j|jfg7 }q|S )Nz<0x0A>r  )r  ro   r  )rV   r   r2   ro   r    r    r!   r2     s   

zHeliumConverter.vocabc                 C   rZ  r   r    r\  r    r    r!   r	    r]  zHeliumConverter.unk_idc                 C   s8   t ddt  t  g}|t jdddg7 }t |S r  r  r  r    r    r!   r     s   

zHeliumConverter.decoderc                 C   s   t t dt ddgS r  )r   r   r  r1  r  r    r    r!   r     s   zHeliumConverter.normalizerc                 C   s   t t ddgS )Nr  
contiguous)r   r   r   r  r    r    r!   r     s   zHeliumConverter.pre_tokenizerc                 C   s   t jddgg ddgdS )Nr_  r  )r_  r  r_  r  )r_  r   r   )r	   r   rw   r    r    r!   r     s   zHeliumConverter.post_processorrb   )rc   rd   re   r   rW   r   r2   r	  r   r   r   r   r    r    r    r!   r    s    
		r  c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r    )chr)r?   nr    r    r!   r@     s    z$bytes_to_unicode.<locals>.<listcomp>)rj   rD   ordrE   rB   r  )bscsr*  br    r    r!   bytes_to_unicode  s   L
r/  c                       sN   e Zd ZdZ				d fdd	Zdefdd	Zd
d ZdefddZ	  Z
S )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                    sB   t  j|  || _|| _|| _t|tu r| | _d S || _d S rb   )	r   rW   r   r  r#   r  rB   r   additional_special_tokens)rV   r   r  r#   r2  r   r  r   r    r!   rW      s   	zTikTokenConverter.__init__tiktoken_urlc                    s   zddl m} W n ty   tdw || t fddg }i }  D ]P\}}|||< t|dkr:q)g }tdt|D ]%}|d | ||d  }	}
|	 v rh|
 v rh|	|
  v rh||	|
|f qCt	| fddd	d
}|
| q)t	|dd d	d
}fdd|D }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                    s   d  fdd| dD S )Nr   c                    s   g | ]} t | qS r    )r+  )r?   charbyte_encoderr    r!   r@   =  r4   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)r  decode)r.  r6  r    r!   token_bytes_to_string<  s   zPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringr   c                    r-   r.   r    r/   )r   r    r!   r3   J  r4   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Fr;   c                 S   r  )Nr7   r    r9   r    r    r!   r3   L  r!  c                    s$   g | ]} |d   |d fqS r=   r    r>   )r9  r    r!   r@   M  s   $ zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)tiktoken.loadr4  r&  
ValueErrorr/  rC   r8   rD   rE   rF   rG   )rV   r3  r4  rI   r2   r  rankrL   rM   rN   rO   r    )r   r7  r9  r!   extract_vocab_merges_from_model1  s6   z1TikTokenConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdrd|j_|S )NF)r   ignore_mergesT)r=  r   r   r
   r   rR   r>  )rV   rH   rI   r   r    r    r!   r   P  s
   zTikTokenConverter.tokenizerr$   c                 C   sh   |   }ttjt| jdddtj| jddg|_t	 |_
|dd | jD  tjdd|_|S )Nr   Fr   r   c                 S   s   g | ]	}t |d ddqS )FTr  r  r  r    r    r!   r@   b  r  z/TikTokenConverter.converted.<locals>.<listcomp>r   )r   r   r   r   r   r  r   r#   r   r   r   r   r2  r	   r   )rV   r   r    r    r!   rx   W  s   
zTikTokenConverter.converted)Nr1  FN)rc   rd   re   rf   rW   rg   r=  r   r   rx   r>  r    r    r   r!   r0    s    r0  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3TokenizerFc                 C   sn   | j j}|tv r|st| }||  S ztd t| j| jd W S  t	y6   t
dtt  w )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r   r2  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r   rc   SLOW_TO_FAST_CONVERTERSrx   loggerinfor0  r   r2  r&  r;  rj   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classr    r    r!   convert_slow_tokenizer  s&   

r  )r   )F)Nrf   r   	packagingr   
tokenizersr   r   r   r   r   r   r	   tokenizers.modelsr
   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerrc   ry  r"   boolrg   r,   rP   rQ   rk   rt   ru   rz   r   r   r   r   r   r   r   r   r   r   r   r@  rY  ra  rl  rp  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r/  r0  rx  r  r    r    r    r!   <module>   sP  $


'2''&,' %!5% ($+'4.&)ZO	
 !"#$%&'()*+,-./01234=