o
    
h                     @   s   d dl mZmZmZmZmZmZ d dlmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZmZmZ ddlmZ G dd deZd	S )
    )DictIteratorListOptionalTupleUnion)
AddedToken	Tokenizerdecoderspre_tokenizers
processorstrainers)BPE)	LowercaseSequenceunicode_normalizer_from_str   )BaseTokenizerc                       sJ  e Zd ZdZ									d!deeeeeef f  deeeee	eef e	eef f f  de
de
dee d	ee d
ee dee de
f fddZededefddZdddg fdeeee f dedede
deeeef  f
ddZdddg dfdeee eee  f dedede
deeeef  dee fdd Z  ZS )"ByteLevelBPETokenizerzjByteLevelBPETokenizer

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    NFvocabmergesadd_prefix_space	lowercasedropoutunicode_normalizercontinuing_subword_prefixend_of_word_suffixtrim_offsetsc
              	      s   |d ur|d urt t||||pd|pdd}
nt t }
g }|r(|t|g7 }|r0|t g7 }t|dkrGt|dkrBt||
_n|d |
_tj|d|
_	t
 |
_tj|	d|
_d|||||||	d}t |
| d S )	N )r   r   r   r   r   )r   )r   ByteLevelBPE)modelr   r   r   r   r   r   r   )r	   r   r   r   lenr   
normalizerr   	ByteLevelpre_tokenizerr
   decoderr   post_processorsuper__init__)selfr   r   r   r   r   r   r   r   r   	tokenizernormalizers
parameters	__class__ }/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/tokenizers/implementations/byte_level_bpe.pyr(      sB   



zByteLevelBPETokenizer.__init__vocab_filenamemerges_filenamec                 K   s"   t | |\}}t||fi |S )N)r   	read_filer   )r1   r2   kwargsr   r   r/   r/   r0   	from_fileJ   s   zByteLevelBPETokenizer.from_filei0u     Tfiles
vocab_sizemin_frequencyshow_progressspecial_tokensc                 C   s>   t j||||tj d}t|tr|g}| jj||d dS )z%Train the model using the given filesr8   r9   r:   r;   initial_alphabet)trainerN)	r   
BpeTrainerr   r#   alphabet
isinstancestr
_tokenizertrain)r)   r7   r8   r9   r:   r;   r>   r/   r/   r0   rD   O   s   

zByteLevelBPETokenizer.trainiteratorlengthc                 C   s0   t j||||tj d}| jj|||d dS )z(Train the model using the given iteratorr<   )r>   rF   N)r   r?   r   r#   r@   rC   train_from_iterator)r)   rE   r8   r9   r:   r;   rF   r>   r/   r/   r0   rG   d   s   
z)ByteLevelBPETokenizer.train_from_iterator)	NNFFNNNNF)__name__
__module____qualname____doc__r   r   rB   r   intr   boolfloatr(   staticmethodr5   r   r   rD   r   rG   __classcell__r/   r/   r-   r0   r   
   s    &	
:
r   N)typingr   r   r   r   r   r   
tokenizersr   r	   r
   r   r   r   tokenizers.modelsr   tokenizers.normalizersr   r   r   base_tokenizerr   r   r/   r/   r/   r0   <module>   s      