o
    
h 9                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmZmZ ejeZejedZeeZe ZW d   n1 sLw   Y  G dd dZe	e Ze	ee  ZG dd	 d	e
ZG d
d dZG dd dZG dd dZ efde	e! de!fddZ"dS )    N)Template)AnyCallableDictList
NamedTupleOptionalTuple)Encoding	Tokenizerzvisualizer-styles.cssc                   @   s<   e Zd ZU eed< eed< eed< dededefddZdS )
Annotationstartendlabelc                 C   s   || _ || _|| _d S N)r   r   r   )selfr   r   r    r   o/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/tokenizers/tools/visualizer.py__init__   s   
zAnnotation.__init__N)__name__
__module____qualname__int__annotations__strr   r   r   r   r   r      s
   
 r   c                   @   s&   e Zd ZU ee ed< ee ed< dS )CharStateKeytoken_ixanno_ixN)r   r   r   r   r   r   r   r   r   r   r      s   
 r   c                   @   sH   e Zd ZU ee ed< dd Zedd Zedd Z	de
fd	d
ZdS )	CharStatechar_ixc                 C   s   || _ d | _g | _d S r   )r   r   tokens)r   r   r   r   r   r   '   s   
zCharState.__init__c                 C   s   t | jdkr| jd S d S )Nr   lenr    r   r   r   r   r   -   s   zCharState.token_ixc                 C   s   t | jdkS )zJ
        BPE tokenizers can output more than one token for a char
           r!   r#   r   r   r   is_multitoken1   s   zCharState.is_multitokenreturnc                 C   s   t | j| jdS )N)r   r   )r   r   r   r#   r   r   r   partition_key8   s   zCharState.partition_keyN)r   r   r   r   r   r   r   propertyr   r%   r   r'   r   r   r   r   r   $   s   
 

r   c                   @   s   e Zd ZdS )AlignedN)r   r   r   r   r   r   r   r)   ?   s    r)   c                
   @   s  e Zd ZdZejdejdZ		ddede	de
eegef  fd	d
Zg dfdedede
e	 de
e fddZededeeef fddZedee dedefddZededededefddZedededefddZededededee fddZdS )EncodingVisualizera  
    Build an EncodingVisualizer

    Args:

         tokenizer (:class:`~tokenizers.Tokenizer`):
            A tokenizer instance

         default_to_notebook (:obj:`bool`):
            Whether to render html output in a notebook by default

         annotation_converter (:obj:`Callable`, `optional`):
            An optional (lambda) function that takes an annotation in any format and returns
            an Annotation object
    z(.{1})?(unk|oov)(.{1})?)flagsTN	tokenizerdefault_to_notebookannotation_converterc                 C   sF   |rz
ddl m}m} W n ty   tdw || _|| _|| _d S )Nr   HTMLdisplayzWe couldn't import IPython utils for html display.
                        Are you running in a notebook?
                        You can also pass `default_to_notebook=False` to get back raw HTML
                    )IPython.core.displayr0   r1   ImportError	Exceptionr,   r-   annotation_coverter)r   r,   r-   r.   r0   r1   r   r   r   r   V   s   zEncodingVisualizer.__init__textannotationsr&   c           	      C   s   | j }|dur	|}|r!z
ddlm}m} W n ty    tdw | jdur.tt| j|}| j	
|}t|||}|rE||| dS |S )a  
        Build a visualization of the given text

        Args:
            text (:obj:`str`):
                The text to tokenize

            annotations (:obj:`List[Annotation]`, `optional`):
                An optional list of annotations of the text. The can either be an annotation class
                or anything else if you instantiated the visualizer with a converter function

            default_to_notebook (:obj:`bool`, `optional`, defaults to `False`):
                If True, will render the html in a notebook. Otherwise returns an html string.

        Returns:
            The HTML string if default_to_notebook is False, otherwise (default) returns None and
            renders the HTML in the notebook

        Nr   r/   zeWe couldn't import IPython utils for html display.
                    Are you running in a notebook?)r-   r2   r0   r1   r3   r4   r5   listmapr,   encoder*   _EncodingVisualizer__make_html)	r   r6   r7   r-   final_default_to_notebookr0   r1   encodinghtmlr   r   r   __call__l   s$   
zEncodingVisualizer.__call__c           	      C   s   t | dkri S ttdd | }t |}td| }|dk r!d}d}d}d}i }t|D ]}d	| d
| d| d||< ||7 }q-|S )a  
        Generates a color palette for all the labels in a given set of annotations

        Args:
          annotations (:obj:`Annotation`):
            A list of annotations

        Returns:
            :obj:`dict`: A dictionary mapping labels to colors in HSL format
        r   c                 S   s   | j S r   )r   )xr   r   r   <lambda>   s    z;EncodingVisualizer.calculate_label_colors.<locals>.<lambda>          @   
   zhsl(,z%,%)r"   setr9   r   sorted)	r7   labels
num_labelsh_stepslhcolorsr   r   r   r   calculate_label_colors   s   
z)EncodingVisualizer.calculate_label_colorsconsecutive_chars_listr=   c                 C   s.  | d }|j du r|j|j }d| dS | d }|j }|j d }||| }g }	i }
|jdurg|	d |jr=|	d |jd	 rH|	d
 n|	d tj|j|j durf|	d |j|j |
d< n|	d dd|	 d}d}|
	 D ]\}}|d| d| d7 }q{d| d| d| dS )a  
        Converts a list of "consecutive chars" into a single HTML element.
        Chars are consecutive if they fall under the same word, token and annotation.
        The CharState class is a named tuple with a "partition_key" method that makes it easy to
        compare if two chars are consecutive.

        Args:
            consecutive_chars_list (:obj:`List[CharState]`):
                A list of CharStates that have been grouped together

            text (:obj:`str`):
                The original text being processed

            encoding (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`str`: The HTML span for a set of consecutive chars
        r   Nz(<span class="special-token" data-stoken=z></span>r$   tokenzmulti-token   z	odd-tokenz
even-tokenzspecial-tokenstokz	non-tokenzclass=" " z data-z="z<span z ></span>)
r   r    r   appendr%   r*   unk_token_regexsearchjoinitems)rS   r6   r=   firststokenlastr   r   	span_textcss_classes
data_itemscssdatakeyvalr   r   r   consecutive_chars_to_html   s6   








z,EncodingVisualizer.consecutive_chars_to_htmlc                 C   sV  t | ||}|d g}|d j}g }t |}|d j}|d ur8|| }	|	j}
||
 }|d| d|
 d |dd  D ][}|j}||kry|t j|| |d |g}|d ur^|d |d ury|| }	|	j}
||
 }|d| d|
 d |}| |d  kr|| q>|t j|| |d |g}q>|t j|| |d t|}|S )Nr   z&<span class="annotation" style="color:z" data-label="z">r$   )r6   r=   r[   )	r*   %_EncodingVisualizer__make_char_statesr   rR   r   r\   rk   r'   HTMLBody)r6   r=   r7   char_statescurrent_consecutive_charsprev_anno_ixspanslabel_colors_dictcur_anno_ixannor   colorcsresr   r   r   __make_html   sb   




zEncodingVisualizer.__make_htmlc                 C   s@   dgt |  }t|D ]\}}t|j|jD ]}|||< qq|S )a  
        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`AnnotationList`):
                A (possibly empty) list of annotations

        Returns:
            A list of  length len(text) whose entry at index i is None if there is no annotation on
            character i or k, the index of the annotation that covers index i where k is with
            respect to the list of annotations
        N)r"   	enumerateranger   r   )r6   r7   annotation_mapr   air   r   r   __make_anno_map<  s   
z"EncodingVisualizer.__make_anno_mapc                 C   s   t | |}dd tt| D }t|jD ]!\}}||}|dur7|\}}	t||	D ]
}
||
 j| q,qt|D ]	\}}||| _q<|S )a  
        For each character in the original text, we emit a tuple representing it's "state":

            * which token_ix it corresponds to
            * which word_ix it corresponds to
            * which annotation_ix it corresponds to

        Args:
            text (:obj:`str`):
                The raw text we want to align to

            annotations (:obj:`List[Annotation]`):
                A (possibly empty) list of annotations

            encoding: (:class:`~tokenizers.Encoding`):
                The encoding returned from the tokenizer

        Returns:
            :obj:`List[CharState]`: A list of CharStates, indicating for each char in the text what
            it's state is
        c                 S   s   g | ]}t |qS r   )r   ).0r   r   r   r   
<listcomp>j  s    z9EncodingVisualizer.__make_char_states.<locals>.<listcomp>N)	r*   "_EncodingVisualizer__make_anno_maprz   r"   ry   r    token_to_charsr\   r   )r6   r=   r7   r{   rn   r   rU   offsetsr   r   r}   r   r   r   r   r   __make_char_statesQ  s   
z%EncodingVisualizer.__make_char_states)TN)r   r   r   __doc__recompile
IGNORECASEr]   r   boolr   r   r   r   r   r   AnnotationListr?   staticmethodr   rR   r   r   r
   rk   r;   PartialIntListr   rl   r   r   r   r   r*   C   sN    

-CA$r*   childrenr&   c                 C   s   d | }d| d| dS )a[  
    Generates the full html with css from a list of html spans

    Args:
        children (:obj:`List[str]`):
            A list of strings, assumed to be html elements

        css_styles (:obj:`str`, `optional`):
            Optional alternative implementation of the css

    Returns:
        :obj:`str`: An HTML string with style markup
    rZ   z?
    <html>
        <head>
            <style>
                zs
            </style>
        </head>
        <body>
            <div class="tokenized-text" dir=auto>
            z4
            </div>
        </body>
    </html>
    )r_   )r   
css_styleschildren_textr   r   r   rm   w  s   
	rm   )#	itertoolsosr   stringr   typingr   r   r   r   r   r   r	   
tokenizersr
   r   pathdirname__file__r_   css_filenameopenfreadrg   r   r   r   r   r   r   r)   r*   r   rm   r   r   r   r   <module>   s*    $

  6