o
    h                     @   s`   d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ G dd dZdefdd	Zd
S )    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)LlamaTokenizerFast)bytes_to_unicodec                       sN   e Zd ZdZ				d fdd	Zdefdd	Zd
d ZdefddZ	  Z
S )MistralConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                    s(   t  j|  || _|| _|| _|| _d S )N)super__init__vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   argskwargs	__class__ u/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/integrations/mistral.pyr      s
   	
zMistralConverter.__init__r   c           
         s  | t  fddg }i }t  D ]\\}\}}|| jvrm|||< t|dkr-qg }tdt|D ]%}|d | ||d  }}	| v r[|	 v r[||	  v r[|||	|f q6t| fdddd}|| q|||< qt|dd dd}fd	d
|D }||fS )Nc                    s   d  fdd| dD S )N c                    s   g | ]} t | qS r   )ord).0charbyte_encoderr   r   
<listcomp>!       zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)joindecode)br   r   r   token_bytes_to_string    s   zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string   c                    s    | d   | d  fS )Nr   r%   r   )x)	bpe_ranksr   r   <lambda>/   r    zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>F)keyreversec                 S   s   | d S )N   r   )valr   r   r   r(   3   s    c                    s$   g | ]} |d   |d fqS )r   r%   r   )r   r,   )r$   r   r   r   4   s   $ zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)	r	   	enumerateitemsr   lenrangeappendsortedextend)
r   r   mergesidxtokenranklocalindexpiece_lpiece_rr   )r'   r   r$   r   extract_vocab_merges_from_model   s,   

z0MistralConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdrd|j_|S )NF)fuse_unkignore_mergesT)r<   r   r   r   hasattrmodelr>   )r   vocab_scoresr4   	tokenizerr   r   r   rB   7   s
   zMistralConverter.tokenizerreturnc                 C   s^   |   }ttjt| jdddtj| jddg|_t	 |_
|| j tjdd|_|S )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)rB   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   rB   r   r   r   	converted>   s   
zMistralConverter.converted)Nr   FN)__name__
__module____qualname____doc__r   strr<   rB   r   rP   __classcell__r   r   r   r   r
      s    r
   tokenizer_filec                    s|   ddl m} || }|jjj}dd |jjjD   fdd D }|| |}tt	| d
 d}|d	 i |S )
z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )MistralTokenizerc                 S   s    g | ]}t |d r|jn|qS )value)r?   rY   r   r6   r   r   r   r   Y   s    z,convert_tekken_tokenizer.<locals>.<listcomp>c                    s   i | ]}|  |qS r   )r9   rZ   all_specialr   r   
<dictcomp>]   r    z,convert_tekken_tokenizer.<locals>.<dictcomp>)r   r   )tokenizer_objectr   )(mistral_common.tokens.tokenizers.mistralrX   	from_fileinstruct_tokenizerrB   _tekken_token2id_nospecial_all_special_tokensupdater   r
   rP   rN   )rW   rX   mistral_tokenizerr   specials_tokensrB   r   r[   r   convert_tekken_tokenizerN   s   


rg   N)
tokenizersr   r   r   r   r   tokenizers.modelsr   transformersr   #transformers.convert_slow_tokenizerr	   r
   rU   rg   r   r   r   r   <module>   s    F