from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        fuse_unk: Optional[bool] = False,
    ):
        # Build the BPE model from an existing vocab/merges pair, or start empty.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
        else:
            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))

        # Register the unk token as special if it is already part of the vocabulary.
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # SentencePiece-style pipeline: NFKC normalization, then Metaspace
        # pre-tokenization/decoding with the "▁" replacement character.
        tokenizer.normalizer = NFKC()
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return SentencePieceBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)
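

# Usage sketch (an addition, not part of the original module): a minimal example
# of training this tokenizer from an in-memory corpus and encoding a sentence.
# The corpus, vocab_size, and sample sentence below are hypothetical, chosen
# only for illustration.
if __name__ == "__main__":
    tokenizer = SentencePieceBPETokenizer()
    # train_from_iterator accepts any iterator of strings (or of batches of strings).
    corpus = [
        "SentencePiece BPE tokenization example",
        "BPE merges frequent symbol pairs into subword units",
    ]
    tokenizer.train_from_iterator(corpus, vocab_size=100, show_progress=False)
    encoding = tokenizer.encode("BPE example")
    # Subword tokens; word-initial pieces carry the Metaspace "▁" prefix.
    print(encoding.tokens)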