o
    
h                     @   s|   d dl Z d dlZd dlmZmZmZmZmZ d dlm	Z	m
Z
mZmZmZmZmZ d dlmZ ddlmZ G dd deZdS )	    N)IteratorListOptionalUnionTuple)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizerstrainers)Unigram   )BaseTokenizerc                       s  e Zd ZdZ			ddeeeeef   dede	f fdd	Z
	
				ddeeee f dede	deeeeef   deee  dee fddZ	
					ddeee eee  f dede	deeeeef   deee  dee dee fddZedefddZ  ZS )SentencePieceUnigramTokenizerzzSentencePiece Unigram Tokenizer

    Represents the Unigram algorithm, with the pretokenization used by SentencePiece
    N   ▁Tvocabreplacementadd_prefix_spacec                    s   |d urt t|}nt t }tt t ttddg|_|r'dnd}t	j
||d|_tj
||d|_d||d}t || d S )N {2,} alwaysneverr   prepend_schemeSentencePieceUnigram)modelr   r   )r	   r   r   SequenceNmtNFKCReplacer   
normalizerr   	Metaspacepre_tokenizerr
   decodersuper__init__)selfr   r   r   	tokenizerr   
parameters	__class__ /var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/tokenizers/implementations/sentencepiece_unigram.pyr'      s   
z&SentencePieceUnigramTokenizer.__init__@  files
vocab_sizeshow_progressspecial_tokensinitial_alphabet	unk_tokenc                 C   sP   |du rg }|du rg }t j|||||d}t|tr|g}| jj||d dS )a  
        Train the model using the given files

        Args:
            files (:obj:`List[str]`):
                A list of path to the files that we should use for training
            vocab_size (:obj:`int`):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
            initial_alphabet (:obj:`List[str]`, `optional`):
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If the strings contain more than one character, only the first one
                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
        Nr1   r3   r2   r4   r5   )trainer)r   UnigramTrainer
isinstancestr
_tokenizertrain)r(   r0   r1   r2   r3   r4   r5   r7   r-   r-   r.   r<   ,   s   
z#SentencePieceUnigramTokenizer.trainiteratorlengthc           	      C   sB   |du rg }|du rg }t j|||||d}| jj|||d dS )a  
        Train the model using the given iterator

        Args:
            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
                Any iterator over strings or list of strings
            vocab_size (:obj:`int`):
                The size of the final vocabulary, including all tokens and alphabet.
            show_progress (:obj:`bool`):
                Whether to show progress bars while training.
            special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
                A list of special tokens the model should know of.
            initial_alphabet (:obj:`List[str]`, `optional`):
                A list of characters to include in the initial alphabet, even
                if not seen in the training dataset.
                If the strings contain more than one character, only the first one
                is kept.
            unk_token (:obj:`str`, `optional`):
                The unknown token to be used by the model.
            length (:obj:`int`, `optional`):
                The total number of sequences in the iterator. This is used to
                provide meaningful progress tracking
        Nr6   )r7   r>   )r   r8   r;   train_from_iterator)	r(   r=   r1   r2   r3   r4   r5   r>   r7   r-   r-   r.   r?   \   s    "
z1SentencePieceUnigramTokenizer.train_from_iteratorfilenamec                 C   sB  zdd l }|jd dd l}W n ty   tdw | }|t| d  |j	j
}dd |jD }|jj}|jj}|jj}|dkrJtdd	}	d
}
tt|||}|rjtt|ttddg|_ntttddg|_|
r{dnd}tj|	|d|_tj|	|d|_ddi}tt||}t ||| |S )Nr   .a\  You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required.rbc                 S   s   g | ]}|j |jfqS r-   )piecescore).0rC   r-   r-   r.   
<listcomp>   s    z:SentencePieceUnigramTokenizer.from_spm.<locals>.<listcomp>r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmr   Tr   r   r   r   r   r   r   )!syspathappendsentencepiece_model_pb2	Exception
ModelProtoParseFromStringopenreadnormalizer_specprecompiled_charsmappiecestrainer_specunk_id
model_typebyte_fallbackr	   r   r   r   Precompiledr!   r   r"   r   r#   r$   r
   r%   r   __new__r   r'   )r@   rG   r   mrQ   r   rT   rU   rV   r   r   r)   r   r*   objr-   r-   r.   from_spm   sL   z&SentencePieceUnigramTokenizer.from_spm)Nr   T)r/   TNNN)r/   TNNNN)__name__
__module____qualname____doc__r   r   r   r:   floatboolr'   r   intr   r<   r   r?   staticmethodr[   __classcell__r-   r-   r+   r.   r      sj    

3

6r   )jsonostypingr   r   r   r   r   
tokenizersr   r   r	   r
   r   r   r   tokenizers.modelsr   base_tokenizerr   r   r-   r-   r-   r.   <module>   s    $