o
    hv                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ d	d
lmZmZm Z  e!e"Z#erpddl$m%Z% ddl&m'Z' e rpd dl(Z(e rd dl)Z*ddl+m,Z, dZ-e rd dl.Z.d dl/m-Z- ddl0m1Z1 dej2dej2de3de3dej2de
fddZ4					d#ddZ5G dd deZ6ee dd G d!d" d"eZ7dS )$    N)Iterable)TYPE_CHECKINGDictListOptionalTupleUnion   )SquadExampleSquadFeatures"squad_convert_examples_to_features)	ModelCard)PreTrainedTokenizer)PaddingStrategyadd_end_docstringsis_tf_availableis_tokenizers_availableis_torch_availablelogging   )ArgumentHandlerChunkPipelinebuild_pipeline_init_args)TFPreTrainedModel)PreTrainedModel)-TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES)Dataset)*MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMESstartendtopkmax_answer_lenundesired_tokensreturnc                 C   s  | j dkr	| d } |j dkr|d }tt| dt|d}tt||d }| }|dkr:t|g}n"t||k rGt	| }nt
| |d| }	|	t	||	   }t||jdd \}
}t|
| t|| @ }|
| }
|| }|d|
|f }|
||fS )aG  
    Take the output of any `ModelForQuestionAnswering` and will generate probabilities for each span to be the actual
    answer.

    In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
    answer end position being before the starting position. The method supports output the k-best answer through the
    topk argument.

    Args:
        start (`np.ndarray`): Individual start probabilities for each token.
        end (`np.ndarray`): Individual end probabilities for each token.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
        undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
    r   Nr   )ndimnpmatmulexpand_dimstriltriuflattenargmaxlenargsortargpartitionunravel_indexshapeisinnonzero)r   r   r    r!   r"   outer
candidatesscores_flatidx_sortidxstartsendsdesired_spansscores r=   }/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/pipelines/question_answering.pydecode_spans.   s&   

 
r?   @B F   c                 C   s   t t |d }|dur||@ }|dk}	t |	d| } t |	d|}t | | jddd } | |   } t ||jddd }||  }|rWt|| d |d   }d | d< |d< t	| ||||\}
}}|
|||fS )	ai  
    Takes the raw output of any `ModelForQuestionAnswering` and first normalizes its outputs and then uses
    `decode_spans()` to generate probabilities for each span to be the actual answer.

    Args:
        start (`np.ndarray`): Individual start logits for each token.
        end (`np.ndarray`): Individual end logits for each token.
        p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer
        attention_mask (`np.ndarray`): The attention mask generated by the tokenizer
        min_null_score(`float`): The minimum null (empty) answer score seen so far.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        handle_impossible_answer(`bool`): Whether to allow null (empty) answers
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
    r   Ng        g     r$   T)axiskeepdims)r   r   )
r&   absarraywhereexpmaxsumminitemr?   )r   r   p_maskattention_maskmin_null_scoretop_khandle_impossible_answerr!   r"   undesired_tokens_maskr9   r:   r<   r=   r=   r>   select_starts_ends`   s   rR   c                   @   s    e Zd ZdZdd Zdd ZdS ) QuestionAnsweringArgumentHandlera&  
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
    internal [`SquadExample`].

    QuestionAnsweringArgumentHandler manages all the possible to create a [`SquadExample`] from the command-line
    supplied arguments.
    c                 C   s   t |tr|S t |trFdD ]/}||vrtd|| d u r&td| dt || tr=t|| dkr=td| dqtjdi |S t| d)	NquestioncontextzFYou need to provide a dictionary with keys {question:..., context:...}`z` cannot be Noner   z` cannot be emptyz2 argument needs to be of type (SquadExample, dict)r=   )	
isinstancer
   dictKeyError
ValueErrorstrr-   QuestionAnsweringPipelinecreate_sample)selfrK   kr=   r=   r>   	normalize   s   

z*QuestionAnsweringArgumentHandler.normalizec                    s  |d ur5t |dkr5t |dkr|d }nt |dkr0dd |D thkr0|d |d dg}nt|}nd v rDtdt  d }nd	 v rStd
t  d	 }nsd v rd v rt d trut d tru fdd d D }nQt d trt d trt  d t  d krtddd t d  d D }n$t d trt d tr d  d dg}ntdtd  t	d urt
jt	fnt
jf}t||r|S t|tr|g}nt|trt|}ntd  t|D ]\}}| |||< q|S )Nr   r   r	   c                 S   s   h | ]}t |qS r=   )type).0elr=   r=   r>   	<setcomp>   s    z<QuestionAnsweringArgumentHandler.__call__.<locals>.<setcomp>rT   XzPassing the `X` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.datazPassing the `data` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.rU   rV   c                    s   g | ]	}| d  dqS )rV   rT   r=   )rc   Qkwargsr=   r>   
<listcomp>       z=QuestionAnsweringArgumentHandler.__call__.<locals>.<listcomp>z2Questions and contexts don't have the same lengthsc                 S   s   g | ]	\}}||d qS )rT   r=   )rc   rh   Cr=   r=   r>   rk      rl   zArguments can't be understoodzUnknown arguments zInvalid arguments )r-   r\   listwarningswarnFutureWarningrX   r[   zipr   typesGeneratorTyperY   r   	enumeratera   )r_   argsrj   inputsgenerator_typesirK   r=   ri   r>   __call__   sP   
 






z)QuestionAnsweringArgumentHandler.__call__N)__name__
__module____qualname____doc__ra   rz   r=   r=   r=   r>   rS      s    rS   T)has_tokenizerc                       s0  e Zd ZdZdZdZ			d/ded ded	ee	 d
ee
 de
f
 fddZedee
ee
 f dee
ee
 f deeee f fddZ									d0ddZ fddZd1ddZdd Z				d2d d!Zd"d#d$ed%ed&ed'edeeef fd(d)Zd*e
d+ed,edee
ee
ef f fd-d.Z  ZS )3r]   a  
    Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering
    examples](../task_summary#question-answering) for more information.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="deepset/roberta-base-squad2")
    >>> oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin")
    {'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This question answering pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=question-answering).
    zquestion,contextFN model)r   r   	tokenizer	modelcard	frameworktaskc                    sH   t  jd|||||d| t | _| | jdkrt d S t d S )N)r   r   r   r   r   tfr=   )super__init__rS   _args_parsercheck_model_typer   r   r   )r_   r   r   r   r   r   rj   	__class__r=   r>   r     s    		
z"QuestionAnsweringPipeline.__init__rU   rV   r#   c                 C   s0   t | trdd t| |D S td| |dddS )aC  
        QuestionAnsweringPipeline leverages the [`SquadExample`] internally. This helper method encapsulate all the
        logic for converting question(s) and context(s) to [`SquadExample`].

        We currently support extractive question answering.

        Arguments:
            question (`str` or `List[str]`): The question(s) asked.
            context (`str` or `List[str]`): The context(s) in which we will look for the answer.

        Returns:
            One or a list of [`SquadExample`]: The corresponding [`SquadExample`] grouping question and context.
        c              	   S   s"   g | ]\}}t d ||d d d qS )N)r
   )rc   qcr=   r=   r>   rk   0  s   " z;QuestionAnsweringPipeline.create_sample.<locals>.<listcomp>N)rX   rn   rr   r
   rT   r=   r=   r>   r^     s   
z'QuestionAnsweringPipeline.create_samplec
                 K   s   i }|d ur
||d< |d ur||d< |d ur||d< |d ur"||d< i }|d ur4|d u r4t dt |}|d urH|dk rDtd| d||d	< |d ur[|dk rWtd
| ||d< |d urc||d< |	d urk|	|d< |i |fS )Npadding
doc_stridemax_question_lenmax_seq_lenz/topk parameter is deprecated, use top_k insteadr   z$top_k parameter should be >= 1 (got )rO   z-max_answer_len parameter should be >= 1 (got r!   rP   align_to_words)ro   rp   UserWarningr[   )r_   r   r    rO   r   r!   r   r   rP   r   rj   preprocess_paramspostprocess_paramsr=   r=   r>   _sanitize_parameters4  s4   
z.QuestionAnsweringPipeline._sanitize_parametersc                    sf   |rt dt | j|i |}t|ttfr)t|dkr)t j	|d fi |S t j	|fi |S )a	  
        Answer the question(s) given as inputs by using the context(s).

        Args:
            question (`str` or `List[str]`):
                One or several question(s) (must be used in conjunction with the `context` argument).
            context (`str` or `List[str]`):
                One or several context(s) associated with the question(s) (must be used in conjunction with the
                `question` argument).
            top_k (`int`, *optional*, defaults to 1):
                The number of answers to return (will be chosen by order of likelihood). Note that we return less than
                top_k answers if there are not enough options available within the context.
            doc_stride (`int`, *optional*, defaults to 128):
                If the context is too long to fit with the question for the model, it will be split in several chunks
                with some overlap. This argument controls the size of that overlap.
            max_answer_len (`int`, *optional*, defaults to 15):
                The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
            max_seq_len (`int`, *optional*, defaults to 384):
                The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
                model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
            max_question_len (`int`, *optional*, defaults to 64):
                The maximum length of the question after tokenization. It will be truncated if needed.
            handle_impossible_answer (`bool`, *optional*, defaults to `False`):
                Whether or not we accept impossible as an answer.
            align_to_words (`bool`, *optional*, defaults to `True`):
                Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on
                non-space-separated languages (like Japanese or Chinese)

        Return:
            A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:

            - **score** (`float`) -- The probability associated to the answer.
            - **start** (`int`) -- The character start index of the answer (in the tokenized version of the input).
            - **end** (`int`) -- The character end index of the answer (in the tokenized version of the input).
            - **answer** (`str`) -- The answer to the question.
        zPassing a list of SQuAD examples to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.r   r   )
ro   rp   rq   r   rX   rn   tupler-   r   rz   )r_   rv   rj   examplesr   r=   r>   rz   ^  s   'z"QuestionAnsweringPipeline.__call__
do_not_pad@   c                 #   s   t |trtd |d |d d d d }|d u rt| jjd}|d u r)t|d d}||kr8td| d| d| jjsLt|g| j|||t	j
d	d	d
}n| jjdk| jrY|jn|jr`|jn|j|rgdnd||ddddd
 t d } fddt|D }g }t|D ]}}	 d |	 }
d v r d |	 nd }d v r d |	 nd }| jjd urtt|
| jjkd }|D ]}d||	 |< q||	 }|td'i d|
d|d|d|d |	 dd di dddddddddg ddd dd!d	d"d  qt|D ]y\}}i }i }| jjddg }|j D ]P\}}||v rk| jd#krLt|}|jtjkrBt|tj }t!|d||< q | jd$krit"#|}|jt"j krb|$ }|%d||< q |||< q |t|d% k}||d&||V  qd S )(NrU   rV   i  r	      z`doc_stride` (z ) is larger than `max_seq_len` (r   F)r   r   max_seq_lengthr   max_query_lengthpadding_strategyis_trainingtqdm_enabledrightonly_second
only_firstT)
text	text_pairr   
truncation
max_lengthstridereturn_token_type_idsreturn_overflowing_tokensreturn_offsets_mappingreturn_special_tokens_mask	input_idsc                    s$   g | ]}fd d  |D qS )c                    s   g | ]
} r
|d kndqS )r   r   r=   )rc   tok)question_firstr=   r>   rk     s    zCQuestionAnsweringPipeline.preprocess.<locals>.<listcomp>.<listcomp>)sequence_ids)rc   span_idencoded_inputsr   r=   r>   rk     s    z8QuestionAnsweringPipeline.preprocess.<locals>.<listcomp>rM   token_type_idsr   rL   encoding	cls_indextoken_to_orig_mapexample_index	unique_idparagraph_lentoken_is_max_contexttokensstart_positionend_positionis_impossibleqas_idr   ptr   )exampleis_lastr=   )&rX   rY   r
   rJ   r   model_max_lengthr[   is_fastr   r   
MAX_LENGTHpadding_sidequestion_textcontext_textr-   rangecls_token_idr&   r3   rE   appendr   ru   model_input_names__dict__itemsr   r   constantdtypeint64castint32r(   torchtensorlong	unsqueeze)r_   r   r   r   r   r   features	num_spansrL   span_idxinput_ids_span_idxattention_mask_span_idxtoken_type_ids_span_idxcls_indicesr   submaskry   featurefw_argsothersr   r`   vr   r   r=   r   r>   
preprocess  s   

	



z$QuestionAnsweringPipeline.preprocessc                    s    d } fdd| j jD }| jdkr| jjn| jj}dt|j	 v r*d|d< | jdi |}t
|trC|d |d |d	 S |d d
 \}}|||d	 S )Nr   c                    s   i | ]}| | qS r=   r=   )rc   r`   rw   r=   r>   
<dictcomp>  s    z6QuestionAnsweringPipeline._forward.<locals>.<dictcomp>r   	use_cacheFstart_logits
end_logits)r   r   r   r	   r=   )r   r   r   r   forwardcallinspect	signature
parameterskeysrX   rY   )r_   rw   r   model_inputsmodel_forwardoutputr   r   r=   r   r>   _forward  s   
z"QuestionAnsweringPipeline._forwardr   rA   Tc                 C   sF  d}g }|D ]}| j dkr&|d jtjkr&|d tj}	|d tj}
n|d }	|d }
|d }|d }|dd d urD|d  nd }t|	|
||||||\}}}}| j	j
st|j}t|||D ]=\}}}|d }|| t||| kd	 d	  t||| kd	 d
  d|j|| || d  d qeqt| j	jdk}|d }| j	jdkr|d | j	jk  }nd	}|rdnd	}t|||D ])\}}}|| }|| }| |||||\}}|| |||j|| d qq|r	||d	d	dd t|dd ddd | }t|dkr!|d	 S |S )Nr@   r   r   r   r   rL   rM   r   r   r$    r   )scorer   r   answerr   r   leftr   r   c                 S   s   | d S )Nr   r=   )xr=   r=   r>   <lambda>g  s    z7QuestionAnsweringPipeline.postprocess.<locals>.<lambda>T)keyreverse)r   r   r   bfloat16tofloat32getnumpyrR   r   r   r&   rE   char_to_word_offsetrr   r   rK   rF   join
doc_tokensboolr   pad_token_idrI   get_indicesr   sortedr-   )r_   model_outputsrO   rP   r!   r   rN   answersr   start_end_r   rL   rM   r9   r:   r<   char_to_wordser   r   r   encoffsetsequence_indexstart_index	end_indexr=   r=   r>   postprocess  sf   z%QuestionAnsweringPipeline.postprocessr  ztokenizers.Encodingr  r  r  r   c           
      C   s   |r>z!| |}| |}|j||dd }|j||dd }	W ||	fS  ty=   |j| d }|j| d }	Y ||	fS w |j| d }|j| d }	||	fS )N)r  r   r   )token_to_wordword_to_chars	Exceptionoffsets)
r_   r  r  r  r  r   
start_wordend_wordr  r  r=   r=   r>   r	  l  s   

z%QuestionAnsweringPipeline.get_indicesr   r   r   c                 C   s   g }d } } }}t |dD ]?\}	}
| j|
}||  kr%|kr<n n||kr-|}||kr7|t|
 }||
g7 }||krB n|t|7 }|t|
d 7 }qd|td|tt||dS )a  
        When decoding from token probabilities, this method maps token indexes to actual word in the initial context.

        Args:
            text (`str`): The actual context to extract the answer from.
            start (`int`): The answer starting token index.
            end (`int`): The answer end token index.

        Returns:
            Dictionary like `{'answer': str, 'start': int, 'end': int}`
        r   r   r   )r   r   r   )ru   splitr   tokenizer-   r  rH   rJ   )r_   r   r   r   words	token_idxchar_start_idxchar_end_idx	chars_idxry   wordtokenr=   r=   r>   span_to_answer~  s$   
z(QuestionAnsweringPipeline.span_to_answer)NNr   )	NNNNNNNNN)r   Nr   N)r   FrA   T)r{   r|   r}   r~   default_input_namesrP   r   r   r   r   r\   r   staticmethodr   r
   r^   r   rz   r   r   r  intr  r   r	  r   r'  __classcell__r=   r=   r   r>   r]      st    
*
2u
Y

2r]   )r@   r   FrA   )8r   rs   ro   collections.abcr   typingr   r   r   r   r   r   r  r&   rg   r
   r   r   r   r   tokenization_utilsr   utilsr   r   r   r   r   r   baser   r   r   
get_loggerr{   loggermodeling_tf_utilsr   modeling_utilsr   
tokenizers
tensorflowr   models.auto.modeling_tf_autor   r   r   torch.utils.datamodels.auto.modeling_autor   ndarrayr*  r?   rR   rS   r]   r=   r=   r=   r>   <module>   s\      

7
6R