"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""

import copy
import json
import os
from collections import defaultdict
from collections.abc import Iterable
from typing import Any, Optional, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from .convert_slow_tokenizer import convert_slow_tokenizer
from .integrations.ggml import convert_gguf_tokenizer
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging


logger = logging.get_logger(__name__)

# File names used when serializing a fast tokenizer and its configuration.
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
TIKTOKEN_VOCAB_FILE = "tokenizer.model"

# Slow tokenizers keep their added tokens in an additional file.
ADDED_TOKENS_FILE = "added_tokens.json"

INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)BPEUnigram	WordLevel	WordPiece)tokenizer_file
vocab_filec                )       sj  e Zd ZU dZeZdZeed<  fddZ	e
defddZe
defd	d
Ze
defddZdeeef fddZe
deeef fddZe
deeef fddZe
deeef fddZdeeef fddZdefddZe
defddZe
defddZ							d`ded ee d!ee d"ed#ed$ed%ed&edeeee f e!e f fd'd(Z"d)e#ee$e f de#ee!e f fd*d+Z%d,edefd-d.Z&d/edee fd0d1Z'dad2e!e#eef  defd3d4Z(dad5edefd6d7Z)	dad8e#ee!e f d9ede#ee!e f fd:d;Z*dbd<ed5ee d=ede!e fd>d?Z+d@e,dAe-dBedCedDee dEee fdFdGZ.de,j/e-j0ddHddddddddddddfdIe#e!e1 e!e2 e!e3 e!e4 f d=ed@e,dAe-dBee dCedJedDee dEee dKee d ee d!ee d"ed#ed$ed%ed&edLede5f&dMdNZ6dde,j/e-j0ddHddddddddddddfd<e#e1e3f dOee#e1e3f  d=ed@e,dAe-dBee dCedJedDee dEee dKee d ee d!ee d"ed#ed$ed%ed&edLede5f(dPdQZ7d)e!e defdRdSZ8		dcdTe#ee!e f d9edUee defdVdWZ9		dddXe#ee:j;f dYee dZee d[ee dee f
d\d]Z<			ded^d_Z=  Z>S )fPreTrainedTokenizerFastaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        gguf_file = kwargs.pop("gguf_file", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
        self.add_prefix_space = kwargs.get("add_prefix_space", False)

        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
            raise ValueError(
                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
                "have sentencepiece installed."
            )

        if tokenizer_object is not None:
            fast_tokenizer = copy.deepcopy(tokenizer_object)
        elif fast_tokenizer_file is not None and not from_slow:
            # We have a serialization from the `tokenizers` library which we can load directly
            fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
        elif slow_tokenizer:
            # We need to convert a slow tokenizer to build the backend
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif gguf_file is not None:
            # We build the backend tokenizer from a GGUF checkpoint
            gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
            architecture = gguf_param["config"]["model_type"]
            tokenizer_dict = gguf_param["tokenizer"]
            tokenizer_config = gguf_param["tokenizer_config"]
            fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
            kwargs.update(tokenizer_config)
            if len(additional_kwargs) > 0:
                kwargs.update(additional_kwargs)
        elif self.slow_tokenizer_class is not None and slow_tokenizer is not False:
            # We need to create and convert a slow tokenizer to build the backend
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif not slow_tokenizer:
            # We try to load with tiktoken
            self.vocab_file = kwargs.get("vocab_file", None)
            self.additional_special_tokens = kwargs.get("additional_special_tokens", [])
            fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
            slow_tokenizer = None
        else:
            raise ValueError(
                "Couldn't instantiate the backend tokenizer from one of: \n"
                "(1) a `tokenizers` library serialization file, \n"
                "(2) a slow tokenizer instance to convert or \n"
                "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
            )

        self._tokenizer = fast_tokenizer

        if slow_tokenizer is not None:
            kwargs.update(slow_tokenizer.init_kwargs)

        self._decode_use_source_tokenizer = False

        # Restore the truncation/padding state of the backend tokenizer into the Python-level kwargs
        _truncation = self._tokenizer.truncation
        if _truncation is not None:
            self._tokenizer.enable_truncation(**_truncation)
            kwargs.setdefault("model_max_length", _truncation["max_length"])
            kwargs.setdefault("truncation_side", _truncation["direction"])
            kwargs.setdefault("stride", _truncation["stride"])
            kwargs.setdefault("truncation_strategy", _truncation["strategy"])
        else:
            self._tokenizer.no_truncation()

        _padding = self._tokenizer.padding
        if _padding is not None:
            self._tokenizer.enable_padding(**_padding)
            kwargs.setdefault("pad_token", _padding["pad_token"])
            kwargs.setdefault("pad_token_type_id", _padding["pad_type_id"])
            kwargs.setdefault("padding_side", _padding["direction"])
            kwargs.setdefault("max_length", _padding["length"])
            kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"])

        # We call this after having initialized the backend tokenizer because we update it
        super().__init__(**kwargs)
        self._tokenizer.encode_special_tokens = self.split_special_tokens

        # Re-add the tokens passed through `added_tokens_decoder` (and any missing special tokens) to the backend
        added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
        tokens_to_add = [
            token
            for index, token in sorted(added_tokens_decoder.items(), key=lambda item: item[0])
            if hash(repr(token)) not in added_tokens_decoder_hash
        ]
        encoder = [str(token) for token in tokens_to_add]
        tokens_to_add += [
            token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
        ]

        if len(tokens_to_add) > 0:
            tokens = []
            special_tokens = self.all_special_tokens
            for token in tokens_to_add:
                is_special = (
                    (token.special or str(token) in special_tokens)
                    if isinstance(token, AddedToken)
                    else str(token) in special_tokens
                )
                if isinstance(token, str):
                    token = AddedToken(token, special=is_special)
                else:
                    token.special = is_special
                tokens.append(token)
            if tokens:
                self.add_tokens(tokens)

        try:
            pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
            if pre_tok_state.get("add_prefix_space", self.add_prefix_space) != self.add_prefix_space:
                pre_tok_class = getattr(pre_tokenizers_fast, pre_tok_state.pop("type"))
                pre_tok_state["add_prefix_space"] = self.add_prefix_space
                self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        except Exception:
            # The pre-tokenizer cannot be rebuilt with a different `add_prefix_space`; leave it untouched.
            pass
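    # Usage sketch: the constructor accepts either a serialized `tokenizer.json` (via
    # `tokenizer_file=...`), a slow tokenizer to convert, or an in-memory
    # `tokenizers.Tokenizer` (via `tokenizer_object=...`). The snippet below is only
    # illustrative; the BPE model and "[UNK]" token are arbitrary choices:
    #
    #     from tokenizers import Tokenizer
    #     from tokenizers.models import BPE
    #     from transformers import PreTrainedTokenizerFast
    #
    #     raw = Tokenizer(BPE(unk_token="[UNK]"))
    #     fast = PreTrainedTokenizerFast(tokenizer_object=raw, unk_token="[UNK]")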
    @property
    def is_fast(self) -> bool:
        return True

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        """
        return True

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> dict[str, int]:
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[int, AddedToken]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()

    def get_added_vocab(self) -> dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder
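    # Usage sketch: inspecting the vocabulary of a loaded fast tokenizer. The checkpoint
    # name below is only an example; any fast tokenizer exposes the same accessors:
    #
    #     from transformers import AutoTokenizer
    #
    #     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    #     tok.vocab_size          # base vocabulary, without added tokens
    #     len(tok)                # full vocabulary, including added tokens
    #     tok.get_added_vocab()   # {token_string: index} for the added tokens only
    #     tok.backend_tokenizer   # the underlying `tokenizers.Tokenizer` instance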
    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> tuple[dict[str, Any], list[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)

            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def convert_tokens_to_ids(self, tokens: Union[str, Iterable[str]]) -> Union[int, list[int]]:
        """
        Converts a token string (or a sequence of tokens) into a single integer id (or an Iterable of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: list[Union[str, AddedToken]], special_tokens: bool = False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    def convert_ids_to_tokens(
        self, ids: Union[int, list[int]], skip_special_tokens: bool = False
    ) -> Union[str, list[str]]:
        """
        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
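    # Usage sketch: round-tripping between tokens and ids. The checkpoint name is
    # illustrative; the exact tokens produced depend on the tokenizer you load:
    #
    #     from transformers import AutoTokenizer
    #
    #     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    #     tokens = tok.tokenize("Hello world")      # e.g. ["hello", "world"]
    #     ids = tok.convert_tokens_to_ids(tokens)
    #     tok.convert_ids_to_tokens(ids)            # back to the token strings
    #     tok.decode(ids)                           # detokenized text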
    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
        padding_side: Optional[str],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        # Set truncation on the backend tokenizer only when the requested target differs from the current state
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }

            # `_truncation` may contain more keys than the ones managed here; compare only on the target keys so this
            # keeps working across `tokenizers` versions.
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}

            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": padding_side if padding_side is not None else self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)
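    # Usage sketch: padding and truncation are normally driven through `__call__`, which
    # invokes `set_truncation_and_padding` on the backend internally. Checkpoint name and
    # lengths below are illustrative:
    #
    #     from transformers import AutoTokenizer
    #
    #     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    #     batch = tok(
    #         ["short text", "a somewhat longer text"],
    #         padding="max_length", truncation=True, max_length=16,
    #     )
    #     batch["input_ids"]  # both rows padded/truncated to length 16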
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
    ) -> BatchEncoding:
        if not isinstance(batch_text_or_text_pairs, (tuple, list)):
            raise TypeError(
                f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
            )

        # Set the truncation and padding strategy and restore the initial configuration
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
        )

        if self._tokenizer.encode_special_tokens != split_special_tokens:
            self._tokenizer.encode_special_tokens = split_special_tokens

        encodings = self._tokenizer.encode_batch(
            batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            is_pretokenized=is_split_into_words,
        )

        # Convert each encoding to a dict plus its list of (possibly overflowing) `EncodingFast` objects
        tokens_and_encodings = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Flatten the nested structure (batch index, overflow index, token index) into (overflows, tokens)
        sanitized_tokens = {}
        for key in tokens_and_encodings[0][0].keys():
            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
            sanitized_tokens[key] = stack
        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

        # Keep track of which sample each overflowing sequence came from
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, (toks, _) in enumerate(tokens_and_encodings):
                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        for input_ids in sanitized_tokens["input_ids"]:
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_split_into_words=is_split_into_words,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            split_special_tokens=split_special_tokens,
            **kwargs,
        )

        # If no tensor type was requested, strip the leading batch axis; overflowing tokens are kept as a batch
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return (
            self.backend_tokenizer.decoder.decode(tokens)
            if self.backend_tokenizer.decoder is not None
            else " ".join(tokens)
        )

    def _decode(
        self,
        token_ids: Union[int, list[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
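    # Usage sketch: decoding ids back to text. `skip_special_tokens` drops markers such as
    # [CLS]/[SEP]; the checkpoint name is illustrative:
    #
    #     from transformers import AutoTokenizer
    #
    #     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    #     ids = tok("Hello world")["input_ids"]
    #     tok.decode(ids)                            # includes special tokens
    #     tok.decode(ids, skip_special_tokens=True)  # plain text only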
    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        """
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
                " might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        save_slow = (
            (legacy_format is None or legacy_format is True)
            and self.slow_tokenizer_class is not None
            and self.can_save_slow_tokenizer
        )
        save_fast = legacy_format is None or legacy_format is False

        if save_slow:
            added_tokens_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
            )
            # Only serialize the tokens that were added on top of the base vocabulary
            added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
            if added_vocab:
                with open(added_tokens_file, "w", encoding="utf-8") as f:
                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                    f.write(out_str)

            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
            file_names = file_names + vocab_files + (added_tokens_file,)

        if save_fast:
            tokenizer_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
            )
            self.backend_tokenizer.save(tokenizer_file)
            file_names = file_names + (tokenizer_file,)

        return file_names
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`Dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        added_tokenspost_processorNmodelrX   r    r   mergesr!   unk_idr   g        )r"   r#   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.	unk_tokenrW   idr   continuing_subword_prefixend_of_word_suffixr|   	ByteLevelSequencepretokenizersc                 s   s    | ]	}|d  dkV  qdS )rX   r2  Nr@   )rE   pretokenizerr@   r@   rG   	<genexpr>B  s
    

zBPreTrainedTokenizerFast.train_new_from_iterator.<locals>.<genexpr>initial_alphabet)r   r   )r>   trainerr   r   c                    s   g | ]}  ||qS r@   r   rD   )special_tokens_mapr@   rG   rL   T  rI   zCPreTrainedTokenizerFast.train_new_from_iterator.<locals>.<listcomp>zQAttempted to set a token in the post processor that does not exist in the mappingc                    r   r@   )r   rD   )r0   r@   rG   rL   ]  r   r   )clssepr2   T)single_wordlstriprstrip
normalizedrW   r(   r@   )#ry   rz   rb   to_strrY   r[   r^   from_strr  rw   r   extendanyr   r2  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorr   rc   r\   r   SPECIAL_TOKENS_ATTRIBUTESremover~   _special_tokens_maprZ   rv   r<  r=  r>  r?  r2   ra   r   )r   text_iteratorr   r>   new_special_tokensr9  r   tokenizer_jsonr)  r*  r.  r-  r   added_tokenrW   r   trainer_classr8  trained_tokenizer_jsonrS   r   rF   token_idspecial_tokenspecial_tokens_listspecial_token_fullr2   r@   )r9  r0   rG   train_new_from_iterator  s   "










"






	

z/PreTrainedTokenizerFast.train_new_from_iterator)NNFFFFT)F)NF)FN)NN)NNN)?__name__
__module____qualname____doc__VOCAB_FILES_NAMESvocab_files_namesr'   r   __annotations__rl   propertyboolr   r   r   r   dictrT   r   r   rr   r   r,   r   r   r^   r{   DecoderFastr   EncodingFastr   r   r   rq   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r
  r  PathLiker(  rT  __classcell__r@   r@   r   rG   r&   Q   s  
 |				

*/ 
$
P	

^
	

=


5r&   )=rX  r\   ry   r  collectionsr   collections.abcr   typingr   r   r   tokenizers.pre_tokenizerspre_tokenizersr   
tokenizersr   r`  r   r^   tokenizers.decodersr	   r_  tokenizers.trainersr
   r   r   r   r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utilsr   tokenization_utils_baser   r   r   r   r   r   r   r   r   r   utilsr   r   r   
get_loggerrU  loggerr  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILEr  rE  rY  r&   r@   r@   r@   rG   <module>   sB   0


