from typing import Dict, Iterator, List, Optional, Tuple, Union

from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer


class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
    Sennrich subword-nmt implementation by the following options, which you can deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespaces by the classic one.
            * handling Chinese characters by putting spaces around them.
            * stripping all accents.
        - splitting on punctuation in addition to whitespaces (deactivate it with
          `split_on_whitespace_only=True`)
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    unk_token=str(unk_token),
                    end_of_word_suffix=suffix,
                )
            )
        else:
            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer,
            "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )