from typing import Dict, Iterator, List, Optional, Union

from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing

from .base_tokenizer import BaseTokenizer


class BertWordPieceTokenizer(BaseTokenizer):
    """Bert WordPiece Tokenizer"""

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        for token in (unk_token, sep_token, cls_token, pad_token, mask_token):
            if tokenizer.token_to_id(str(token)) is not None:
                tokenizer.add_special_tokens([str(token)])

        # BERT-style normalization and pre-tokenization
        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        # Post-processing ([CLS] ... [SEP]) requires the special tokens to be in the vocab
        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")
            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
            )
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab: str, **kwargs):
        vocab = WordPiece.read_file(vocab)
        return BertWordPieceTokenizer(vocab, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Train the model using the given files"""
        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size, min_frequency=min_frequency,
            limit_alphabet=limit_alphabet, initial_alphabet=initial_alphabet,
            special_tokens=special_tokens, show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""
        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size, min_frequency=min_frequency,
            limit_alphabet=limit_alphabet, initial_alphabet=initial_alphabet,
            special_tokens=special_tokens, show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)
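

# Usage sketch (illustrative only, not part of the module above): train a
# WordPiece vocabulary from a plain-text file and encode a sentence. The path
# "corpus.txt" is a hypothetical placeholder, and `encode` is assumed to be
# inherited from BaseTokenizer.
#
#     tokenizer = BertWordPieceTokenizer(lowercase=True)
#     tokenizer.train(files="corpus.txt", vocab_size=30000)
#     output = tokenizer.encode("Hello, WordPiece tokenization!")
#     print(output.tokens)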