o
    h                     @   s*  d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZmZmZ ee Z!dZ"d	Z#d
Z$G dd dZ%G dd de%Z&dd Z'dd Z(dd Z)dd Z*dd Z+de,e- de-fddZ.eeG dd deZ/dS )z
Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
tokenization_utils_fast.py
    N)OrderedDict)AnyOptionalUnionoverload   )ENCODE_KWARGS_DOCSTRING'ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRINGINIT_TOKENIZER_DOCSTRING
AddedTokenBatchEncodingEncodedInputEncodedInputPairPreTokenizedInputPreTokenizedInputPairPreTrainedTokenizerBase	TextInputTextInputPairTruncationStrategy)PaddingStrategy
TensorTypeadd_end_docstringsloggingzspecial_tokens_map.jsonzadded_tokens.jsonztokenizer_config.jsonc                   @   sL   e Zd ZdZdd Zdd ZdefddZd	ed
ee fddZ	dd Z
dS )Triez
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass
    Loose reference https://en.wikipedia.org/wiki/Trie
    c                 G   s"   i | _ t | _d| _| j|  d S )N )dataset_tokens_termination_charupdateselfargs r#   s/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/tokenization_utils.py__init__:   s   zTrie.__init__c                 G   s   t | D ]}| | qdS )z
        Updates the Trie with new tokens provided as arguments.

        Args:
            *args: Variable number of words to be added to the Trie.
        N)tupleadd)r!   r"   tokenr#   r#   r$   r   @   s   zTrie.updatewordc                 C   sJ   |sdS | j | | j}|D ]}||i ||< || }qd|| j< dS )u  
        Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.
        The special key `""` in `self._termination_char` is used to represent termination.

        This function is idempotent, adding twice the same word will leave the trie unchanged

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        Nr   )r   r'   r   
setdefaultr   )r!   r)   refcharr#   r#   r$   r'   J   s   
zTrie.addtextreturnc                 C   s  t  }dg}d}t|D ]\}}|r||k rqt }d}| D ]\}	}
d|
v r| D ]V\}}||	kr6 nM||	k rC|d }|d }n|}|}|t|k rQ|| nd}d|v r]|}	|}|}||v r|| }|d7 }d|v rs|}	|}|}|t|krzn|| }||v saq,||	 || d} n||
v r|
| }
|
||	< q ||	 q |ri }n|D ]}	||	= q||kr|| jv r| j| ||< q| D ]\}	}
d|
v rt|}||	 ||  nq| ||S )a\  
        Will look for the words added to the trie within `text`. Output is the original string splitted along the
        boundaries of the words found.

        This trie will match the longest possible word first !

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS] This is a extra_id_100"]

        >>> trie.add("[CLS]")
        >>> trie.add("extra_id_1")
        >>> trie.add("extra_id_100")
        >>> trie.split("[CLS] This is a extra_id_100")
        ["[CLS]", " This is a ", "extra_id_100"]
        ```
        r   Fr   r   NT)	r   	enumerater   itemslenappendr'   r   cut_text)r!   r-   statesoffsetsskipcurrentcurrent_char	to_removeresetstarttrie_pointer	lookstartlooktrie_pointerlookahead_indexend	next_charr#   r#   r$   spliti   st   !	






z
Trie.splitc                 C   sX   | t| g }d}|D ]}||krtd q||krq| |||  |}q|S )Nr   zbThere was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it anyway.)r2   r1   loggererror)r!   r-   r5   tokensr;   r@   r#   r#   r$   r3     s   zTrie.cut_textN)__name__
__module____qualname____doc__r%   r   strr'   listrB   r3   r#   r#   r#   r$   r   4   s    
 r   c                       sN   e Zd Z fddZdefddZdedefdd	Zd
edefddZ	  Z
S )ExtensionsTriec                    s   t  j|  d S N)superr%   r    	__class__r#   r$   r%     s   zExtensionsTrie.__init__prefixc                    s&   |   }| |} fdd|D S )aC  
        Generates all extensions of a given prefix token in the Trie.

        Example:

        ```python
        >>> trie = Trie()
        >>> trie.add("apple")
        >>> trie.add("app")
        >>> trie.add("application")
        >>> trie.extensions("app")
        ['app', 'apple', 'application']
        ```
        c                       g | ]} | qS r#   r#   .0r(   rQ   r#   r$   
<listcomp>0      z-ExtensionsTrie.extensions.<locals>.<listcomp>)	_get_node_collect_tokens)r!   rQ   prefix_noderetr#   rU   r$   
extensions  s   

zExtensionsTrie.extensionsr(   r.   c                 C   s*   | j }|D ]}||vr |S || }q|S )a  
        Retrieves the node corresponding to the given token in the Trie.

        Args:
            token (str): The token for which the corresponding node needs to be retrieved.

        Returns:
            dict: The node in the Trie corresponding to the given token.
        )r   )r!   r(   noder,   r#   r#   r$   rX   2  s   

zExtensionsTrie._get_noder]   c                    sX   | j |v r	| j gng }| D ]\ } | j kr)| |}| fdd|D  q|S )a  
        Generates all tokens in the Trie starting from a given node.

        Args:
            node (dict): The node in the Trie from which tokens need to be generated.

        Returns:
            list: List of tokens generated from the given node.
        c                    rR   r#   r#   )rT   subtokenr(   r#   r$   rV   R  rW   z2ExtensionsTrie._collect_tokens.<locals>.<listcomp>)r   r0   rY   extend)r!   r]   rE   subtrie_head	subtokensr#   r_   r$   rY   D  s   


zExtensionsTrie._collect_tokens)rF   rG   rH   r%   rJ   r\   dictrX   rK   rY   __classcell__r#   r#   rO   r$   rL     s
    rL   c                 C   s>   | dks| dks| dks| dkrdS t | }|dkrdS dS )z0Checks whether `char` is a whitespace character. 	
TZsF)unicodedatacategoryr,   catr#   r#   r$   _is_whitespaceV  s    
rn   c                 C   s8   | dks| dks| dkrdS t | }|drdS dS )z-Checks whether `char` is a control character.rf   rg   rh   FCT)rj   rk   
startswithrl   r#   r#   r$   _is_controlb  s   

rq   c                 C   sh   t | }|dkr|dks$|dkr|dks$|dkr|dks$|dkr&|dkr&d	S t| }|d
r2d	S dS )z1Checks whether `char` is a punctuation character.!   /   :   @   [   `   {   ~   TPF)ordrj   rk   rp   )r,   cprm   r#   r#   r$   _is_punctuationn  s   @

r}   c                 C   $   | d }t t|t|B t|B S )zcChecks whether the last character in text is one of a punctuation, control or whitespace character.boolrq   r}   rn   )r-   	last_charr#   r#   r$   _is_end_of_word}     r   c                 C   r~   )zdChecks whether the first character in text is one of a punctuation, control or whitespace character.r   r   )r-   
first_charr#   r#   r$   _is_start_of_word  r   r   
token_list	new_tokenc                 C   s8   t | |}|t| k r| | |krdS | || dS )zm
    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
    N)bisectbisect_leftr1   insert)r   r   insertion_idxr#   r#   r$   !_insert_one_token_to_ordered_list  s   r   c                '       s  e Zd ZdZ fddZedefddZedefddZ	ede
eef fd	d
Zede
eef fddZejde
eeeef f de
eef fddZde
eef fddZdd Zdd Zd`deee ee f dedefddZg fdee fddZd`dedefddZd edee fd!d"Zd#d$ Zd%eeee f deeee f fd&d'Zd(d) Zd*d+ Zd,d-ej e!j"d,d.dd,d,d,d,d,ddddd-fd eee#e$f d/eeee#e$f  d0ed1ed2e!d3ee d4ed5ed6ee d7ee d8eeee%f  d9ee d:ee d;ed<ed=ed>ed?ede&f&d@dAZ'd-ej e!j"d,d.dd,d,d,d,d,ddddd-dfdBeee ee( ee# ee) ee$ ee* f d0ed1ed2e!d3ee d4ed5ed6ee d7ee d8eeee%f  d9ee d:ee d;ed<ed=ed>ed?edCede&f&dDdEZ+e,e-e.d-ej e!j"d,d.d,d,d,d,d,dddd-dfdFeee)e/ee d,f f  d0ed1ed2e!d3ee d4ed6ee d7ee d8ee d9ee d:ee d;ed<ed>ed?edCede&f"dGdHZ0	d`d ed5ede/ee
ee1f f fdIdJZ2	dadKedLee dMedee f fdNdOZ3e4d`dPedQedefdRdSZ5e4d`dPee dQedee fdTdSZ5	d`dPeeee f dQedeeee f fdUdSZ5dVedefdWdXZ6d%ee defdYdZZ7		,	-dbd[eeee f dQed\ee d]edef
d^d_Z8  Z9S )cPreTrainedTokenizera  
    Base class for all slow tokenizers.

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
    pretrained tokenizers as well as adding tokens to the vocabulary.

    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    c                    s|   t   _t dsi  _ j|di  dd  j D  _t j	d
i |  j
 fdd jD dd d	 _d S )N_added_tokens_decoderadded_tokens_decoderc                 S      i | ]\}}|j |qS r#   contentrT   vkr#   r#   r$   
<dictcomp>      z0PreTrainedTokenizer.__init__.<locals>.<dictcomp>c                    s   g | ]	}| j vr|qS r#   _added_tokens_encoderrS   r!   r#   r$   rV     s    z0PreTrainedTokenizer.__init__.<locals>.<listcomp>T)special_tokensFr#   )r   tokens_triehasattrr   r   popr0   r   rN   r%   _add_tokensall_special_tokens_extended_decode_use_source_tokenizer)r!   kwargsrO   r   r$   r%     s   

zPreTrainedTokenizer.__init__r.   c                 C   s   dS NFr#   r   r#   r#   r$   is_fast  s   zPreTrainedTokenizer.is_fastc                 C      t )zP
        `int`: Size of the base vocabulary (without the added tokens).
        NotImplementedErrorr   r#   r#   r$   
vocab_size  s   zPreTrainedTokenizer.vocab_sizec                 C   s    dd t | j dd dD S )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                 S   r   r#   r   r   r#   r#   r$   r     r   z<PreTrainedTokenizer.added_tokens_encoder.<locals>.<dictcomp>c                 S      | d S Nr   r#   itemr#   r#   r$   <lambda>      z:PreTrainedTokenizer.added_tokens_encoder.<locals>.<lambda>key)sortedr   r0   r   r#   r#   r$   added_tokens_encoder  s    z(PreTrainedTokenizer.added_tokens_encoderc                 C   s   t t| j dd dS )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[str, int]`: The added tokens.
        c                 S   r   r   r#   r   r#   r#   r$   r     r   z:PreTrainedTokenizer.added_tokens_decoder.<locals>.<lambda>r   )rc   r   r   r0   r   r#   r#   r$   r     s   z(PreTrainedTokenizer.added_tokens_decodervaluec              	   C   s   |  D ]9\}}t|ttfrt|ts(td|j|jf dttttf f t|tr1t|n|| j|< || j	t|< q| 
  d S )Nz;The provided `added_tokens_decoder` has an element of type z, should be a dict of )r0   
isinstancerJ   r   int	TypeErrorrP   r   r   r   _update_total_vocab_size)r!   r   indexr(   r#   r#   r$   r     s   "c                 C      | j S )aX  
        Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
        the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
        something we should change.

        Returns:
            `Dict[str, int]`: The added tokens.
        r   r   r#   r#   r$   get_added_vocab  s   	z#PreTrainedTokenizer.get_added_vocabc                 C   r   )zD
        Size of the full vocabulary with the added tokens.
        )total_vocab_sizer   r#   r#   r$   __len__  s   zPreTrainedTokenizer.__len__c                 C   s   t |  | _dS )a!  
        Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because
        otherwise if there is a hole in the vocab, we will add tokenizers at a wrong index. This operation is slow and
        is only updated when adding tokens.
        N)r1   	get_vocabr   r   r#   r#   r$   r     s   z,PreTrainedTokenizer._update_total_vocab_sizeF
new_tokensr   c           	      C   sz  d}|du r|S |    }t|}|D ]}t|ttfs*td| dt| dt|dkr1qt|trN|| jv r<q|| j	v pB|}t|dd| |d}n|rY|
d	|jd
 || jv r_q|jsq|jrqt| ddrq|j |_|j|vr|| }|||j< |d7 }n||j }|jrt|| j	vr| jd | || j|< || j|j< | jrtd| d q|   |   |S )a  
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the
        vocab which is why they have to be handled specifically.

        Args:
            new_tokens (`List[str]`or `List[tokenizers.AddedToken]`):
                Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary
                (tested by checking if the tokenizer assign the index of the `unk_token` to them). If a token is part
                of the vocabulary then we simply mark this token as an `AddedToken` which allows to control the
                stripping and normalization of this token. This is NOT possible in `tokenizers`.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            `int`: The number of tokens actually added to the vocabulary.

        Examples:

        ```python
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        model = BertModel.from_pretrained("google-bert/bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```r   NzToken z is not a string but a .r   F)rstriplstrip
normalizedspecialT)r   r   do_lower_caser   additional_special_tokenszAdding z to the vocabulary)r   copyr1   r   rJ   r   r   typer   all_special_tokens__setstate__r   r   r   getattrr   lower_special_tokens_mapr2   verboserC   info_update_trier   )	r!   r   r   added_tokenscurrent_vocabnew_idxr(   
is_specialtoken_indexr#   r#   r$   r      sL   







zPreTrainedTokenizer._add_tokensunique_no_split_tokensc                 C   sP   | j  D ]}|| jjvr| j|j q|D ]}|| jjvr%| j| qd S rM   )r   valuesr   r   r'   r   )r!   r   r(   r#   r#   r$   r   N  s   z PreTrainedTokenizer._update_triepairc                 C   s$   g }g }t | ||r|S dS )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        N)r1    build_inputs_with_special_tokens)r!   r   token_ids_0token_ids_1r#   r#   r$   num_special_tokens_to_addV  s   z-PreTrainedTokenizer.num_special_tokens_to_addr-   c                 K   s$  | d| j}| j|fi |\}}|rtd| d t| drM| jrMdd | jD }|dd | j	 D 7 }dd	
| d
 d }t|dd |}|rUg }|g}n| j }| j|}t|D ]\}}	|	|v r| j| j|	 d}
|dkr||d  nd}|t|d k r||d  nd}t|
tr|
jr|r| ||d < |
jr|r| ||d < |
jr|r|d dkr||d   |	7  < d||< qd|
jr|r|d dkr|	||d   ||d < d||< qdt|
 dt|
 qdg }|D ]}	|	sq|	|v r||	 q|| |	 q|S )a$  
        Converts a string into a sequence of tokens, using the tokenizer.

        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
        (BPE/SentencePieces/WordPieces). Takes care of added tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.

        Returns:
            `List[str]`: The list of tokens.
        split_special_tokenszKeyword arguments z not recognized.r   c                 S   s   g | ]}t |qS r#   )reescaperT   s_tokr#   r#   r$   rV     s    z0PreTrainedTokenizer.tokenize.<locals>.<listcomp>c                 S   s$   g | ]}|j s|jrt|jqS r#   )r   r   r   r   r   r   r#   r#   r$   rV     s    
(|z)|z(.+?)c                 S   s   |   d p|   d  S )Nr   r   )groupsr   )mr#   r#   r$   r     s    z.PreTrainedTokenizer.tokenize.<locals>.<lambda>Nr   r   r   re   r   zy cannot be tokenized because it was not properly added to the tokenizer. This means that it is not an `AddedToken` but a )r   r   prepare_for_tokenizationrC   warningr   r   r   r   r   joinr   subr   keysr   rB   r/   getr1   r   r   r   r   single_word
ValueErrorr   r2   r`   	_tokenize)r!   r-   r   r   escaped_special_tokspatternno_split_tokenrE   ir(   tok_extendedleftrighttokenized_textr#   r#   r$   tokenizem  s^   
 




zPreTrainedTokenizer.tokenizec                 K   r   )a  
        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        r   )r!   r-   r   r#   r#   r$   r     s   zPreTrainedTokenizer._tokenizerE   c                 C   sB   |du rdS t |tr| |S g }|D ]
}|| | q|S )aT  
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        N)r   rJ   #_convert_token_to_id_with_added_vocr2   )r!   rE   idsr(   r#   r#   r$   convert_tokens_to_ids  s   

z)PreTrainedTokenizer.convert_tokens_to_idsc                 C   s*   |d u rd S || j v r| j | S | |S rM   )r   _convert_token_to_idr!   r(   r#   r#   r$   r     s
   


z7PreTrainedTokenizer._convert_token_to_id_with_added_vocc                 C   r   rM   r   r   r#   r#   r$   r        z(PreTrainedTokenizer._convert_token_to_idNTr   	text_pairadd_special_tokenspadding_strategytruncation_strategy
max_lengthstrideis_split_into_wordspad_to_multiple_ofpadding_sidereturn_tensorsreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthr   c                    s    fdd}|rt d||}|d ur||nd }j|fi d|d|d|jd|jd|d	|d
|	d|
d|ddd|d|d|d|d|d|S )Nc                    s   t | trj| fi }|S t | ttfrBt| dkrBt | d trB r=ttjfdd| D  }|S | S t | ttfrXt| dkrXt | d t	rX| S  rbt
d|  dt
d|  d)Nr   c                 3   &    | ]}j |fd di V  qdS r  TNr   rT   tr   r!   r#   r$   	<genexpr>     $ zJPreTrainedTokenizer._encode_plus.<locals>.get_input_ids.<locals>.<genexpr>zInput z] is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`.zW is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.r   rJ   r   r   rK   r&   r1   	itertoolschainr   r   r-   rE   r  r   r!   r#   r$   get_input_ids  s&   

(

(

z7PreTrainedTokenizer._encode_plus.<locals>.get_input_idsa  return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674pair_idsr   padding
truncationr  r  r  r  r  prepend_batch_axisTr	  r  r
  r  r  r   )r   prepare_for_modelr   )r!   r-   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r   r   r  	first_ids
second_idsr#   r  r$   _encode_plus  sT   	
z PreTrainedTokenizer._encode_plusbatch_text_or_text_pairsr   c                    s    fdd}|rt dg }|D ]9}t|ttfs!|d }}n r2t|d ttfs2|d }}n|\}}||}|d urB||nd }|||f qj|f|||||||	||||||
||d}t|S )Nc                    s   t | trj| fi }|S t | ttfrBt| dkrBt | d trB r=ttjfdd| D  }|S | S t | ttfrXt| dkrXt | d t	rX| S t
d)Nr   c                 3   r  r  r  r  r  r#   r$   r  [  r  zPPreTrainedTokenizer._batch_encode_plus.<locals>.get_input_ids.<locals>.<genexpr>z\Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.r  r  r  r#   r$   r  T  s   

(

(z=PreTrainedTokenizer._batch_encode_plus.<locals>.get_input_idszreturn_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast.r   )r   r   r  r  r  r  r  r	  r  r
  r  r  r  r   r   )r   r   rK   r&   r2   _batch_prepare_for_modelr   )r!   r$  r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r  r   r   r   r  	input_idsids_or_pair_idsr   r  r!  r"  batch_outputsr#   r  r$   _batch_encode_plus7  sF   z&PreTrainedTokenizer._batch_encode_plusbatch_ids_pairsc                 C   s   i }|D ]W\}}| j ||fi d|dtjjd|jd|d|ddddd	d
d|
d|d|d|dddd
d|d|}| D ]\}}||vrSg ||< || | qGq| j||j||||d}t||	d}|S )a  
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        r   r  r  r  r  r  Nr  r	  Fr  r
  r  r  r  r  r   r   )r  r  r  r  r	  )tensor_type)r   r   
DO_NOT_PADr   r0   r2   padr   )r!   r*  r   r   r  r  r  r  r  r  r  r	  r
  r  r  r   r   r(  r!  r"  outputsr   r   r#   r#   r$   r%    sj   	
	z,PreTrainedTokenizer._batch_prepare_for_modelc                 K   s   ||fS )a  
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        r#   )r!   r-   r  r   r#   r#   r$   r     s   z,PreTrainedTokenizer.prepare_for_tokenizationr   r   already_has_special_tokensc                    sD   |r|dur
t dt j||ddS dg|rt|ndt|  S )a  
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        NzYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.T)r   r   r/  r   )r   rN   get_special_tokens_maskr1   )r!   r   r   r/  rO   r#   r$   r0    s   z+PreTrainedTokenizer.get_special_tokens_maskr   skip_special_tokensc                 C      d S rM   r#   r!   r   r1  r#   r#   r$   convert_ids_to_tokens  r   z)PreTrainedTokenizer.convert_ids_to_tokensc                 C   r2  rM   r#   r3  r#   r#   r$   r4    r   c                 C   s   t |tr|| jv r| j| jS | |S g }|D ]%}t|}|r'|| jv r'q|| jv r6|| j| j q|| | q|S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        )r   r   r   r   _convert_id_to_tokenall_special_idsr2   )r!   r   r1  rE   r   r#   r#   r$   r4    s   



r   c                 C   r   rM   r   )r!   r   r#   r#   r$   r5  2  r   z(PreTrainedTokenizer._convert_id_to_tokenc                 C   s
   d |S )Nre   )r   )r!   rE   r#   r#   r$   convert_tokens_to_string5  s   
z,PreTrainedTokenizer.convert_tokens_to_string	token_idsclean_up_tokenization_spacesspaces_between_special_tokensc                    s  | dd _ j||d}t|tr|g}t j t j  fdd j	D B }g }g }	|D ]-}
|r<|
 jv r<q2|
|v rZ|	rT 
|	}t|dkrR|| g }	||
 q2|	|
 q2|	rj| 
|	 |rrd|}nd|}|d ur}|n j}|r |}|S |S )	Nuse_source_tokenizerF)r1  c                    s    h | ]}  | jkr|qS r#   )r   r   rS   r   r#   r$   	<setcomp>G  s    z.PreTrainedTokenizer._decode.<locals>.<setcomp>r   re   r   )r   r   r4  r   rJ   r   r   r   r   r   r7  r1   r2   r   r9  clean_up_tokenization)r!   r8  r1  r9  r:  r   filtered_tokenslegacy_added_tokens	sub_textscurrent_sub_textr(   stringr-   
clean_textr#   r   r$   _decode8  sB   
 



zPreTrainedTokenizer._decode)Fr   )FNT):rF   rG   rH   rI   r%   propertyr   r   r   r   rc   rJ   r   r   r   setterr   r   r   r   rK   r   r   r   r   r   r   r   r   r   r   r   r,  r   DO_NOT_TRUNCATEr   r   r   r   r#  r   r   r   r)  r   r   r	   r&   r%  r   r   r0  r   r4  r5  r7  rD  rd   r#   r#   rO   r$   r     s   	,(NP*		

Z

Y	
F
"
 r   )0rI   r   r  r   rj   collectionsr   typingr   r   r   r   tokenization_utils_baser   r	   r
   r   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   
get_loggerrF   rC   SPECIAL_TOKENS_MAP_FILEADDED_TOKENS_FILETOKENIZER_CONFIG_FILEr   rL   rn   rq   r}   r   r   rK   rJ   r   r   r#   r#   r#   r$   <module>   s0   <
 h;