o
    h$                     @   s   d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ eeZee  Z!e"dd e!D Z#eG dd dZ$G dd deZ%G dd deZ&dS )    N)	dataclassfield)Enum)DictListOptionalUnion)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)logging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc                 c   s    | ]}|j V  qd S N)
model_type).0conf r   t/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/data/datasets/squad.py	<genexpr>"   s    r   c                   @   s<  e Zd ZU dZedddde idZee	d< edddidZ
ee	d	< ed
ddidZee	d< ed
ddidZee	d< edddidZee	d< edddidZee	d< edddidZee	d< edddidZee	d< edddidZee	d< edddidZee	d< eddd idZee	d!< ed"dd#idZee	d$< dS )%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r   r!   intr"   r$   r&   r'   boolr(   r)   floatr+   r,   r.   r   r   r   r   r   %   sb   
 				r   c                   @   s   e Zd ZdZdZdS )SplittraindevN)r/   r0   r1   r;   r<   r   r   r   r   r:   h   s    r:   c                   @   s   e Zd ZU dZeed< ee ed< eed< e	ed< dej
dddfded	ed
ee deeef dee	 dee dee fddZdd Zdeeejf fddZdS )SquadDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                 C   s   || _ || _|jrt nt | _t|tr'zt| }W n t	y&   t	dw || _
|jr/dnd}tj|d ur:|n|jd|j d|jj d|j d| }	|	d }
t|
 tj|	r|jst }tj|	dd| _| jd	 | _| jd
d | _| jdd | _td|	 dt |  | jd u s| jd u rt d|	 d nX|tj!kr| j"|j| _n| j#|j| _t$| j||j|j%|j&|tj'k|j(|d\| _| _t }t)| j| j| jd|	 td|	 dt | dd W d    d S W d    d S 1 s	w   Y  d S )Nzmode is not a valid split namev2v1cached__z.lockT)weights_onlyr?   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rM   rC   r!   r"   r$   is_trainingr.   return_dataset)r?   rL   rM   z!Saving features into cached file z [took z.3fz s])*r>   rA   r(   r   r   	processor
isinstancer5   r:   KeyErrorr@   ospathr3   r   value	__class__r/   r!   r	   existsr'   timetorchloadold_featuresr?   getrL   rM   loggerinfowarningr<   get_dev_examplesget_train_examplesr   r"   r$   r;   r.   save)selfr>   rC   rD   r@   rA   rE   rF   version_tagcached_features_file	lock_pathstartr   r   r   __init__w   sn   

"


$ zSquadDataset.__init__c                 C   s
   t | jS r   )lenr?   )rc   r   r   r   __len__   s   
zSquadDataset.__len__returnc                 C   s4  | j | }tj|jtjd}tj|jtjd}tj|jtjd}tj|jtjd}tj|jtj	d}tj|j
tj	d}|||d}	| jjdv rJ|	d= | jjdv rx|	||d | jjrc|	d|i | jrx|	dtj|jtjd| jj i | jtjkrtj|jtjd}
tj|jtjd}|	|
|d	 |	S )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertro   )xlnetrp   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)r?   rY   tensorrm   longrn   ro   ru   rv   r9   rw   r>   r   updater(   rA   onesshapeint64r,   r@   r:   r;   start_positionend_position)rc   ifeaturerm   rn   ro   ru   rv   rw   inputsry   rz   r   r   r   __getitem__   s0   
$zSquadDataset.__getitem__)r/   r0   r1   r2   r   r6   r   r   r:   r8   r;   r   r   r7   r   r5   rh   rj   r   rY   Tensorr   r   r   r   r   r=   m   s8   
 

Kr=   )'rS   rX   dataclassesr   r   enumr   typingr   r   r   r   rY   filelockr	   torch.utils.datar
   models.auto.modeling_autor   tokenization_utilsr   utilsr   processors.squadr   r   r   r   
get_loggerr/   r]   listkeysMODEL_CONFIG_CLASSEStupler4   r   r:   r=   r   r   r   r   <module>   s&   
B