o
    h                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlmZ ddlmZ dd	lmZ d
dlmZmZmZ d
dlmZ eeZeG dd dZG dd deZG dd deZ dS )    N)	dataclassfield)Enum)ListOptionalUnion)FileLock)Dataset   )PreTrainedTokenizerBase)logging   )!glue_convert_examples_to_featuresglue_output_modesglue_processors)InputFeaturesc                   @   s   e Zd ZU dZeddde  idZe	e
d< eddidZe	e
d< ed	dd
idZee
d< edddidZee
d< dd ZdS )GlueDataTrainingArgumentsz
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
    line.
    helpz"The name of the task to train on: z, )metadata	task_namezUThe input data dir. Should contain the .tsv files (or other data files) for the task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.)defaultr   max_seq_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachec                 C   s   | j  | _ d S N)r   lowerself r   s/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/data/datasets/glue.py__post_init__=   s   z'GlueDataTrainingArguments.__post_init__N)__name__
__module____qualname____doc__r   joinr   keysr   str__annotations__r   r   intr   boolr!   r   r   r   r    r   #   s    
 $	r   c                   @   s   e Zd ZdZdZdZdS )SplittraindevtestN)r"   r#   r$   r-   r.   r/   r   r   r   r    r,   A   s    r,   c                   @   s   e Zd ZU dZeed< eed< ee ed< de	j
dfdededee deee	f d	ee f
d
dZdd ZdefddZdd ZdS )GlueDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsoutput_modefeaturesN	tokenizerlimit_lengthmode	cache_dirc                 C   s  t dt || _t|j  | _t|j | _t	|t
r-zt| }W n ty,   tdw tj|d ur6|n|jd|j d|jj d|j d|j }| j }|jdv ri|jjdv ri|d |d |d< |d< || _|d	 }t| tj|r|jst }	t|| _td
| dt |	  ndtd|j  |tj kr| j!|j}
n|tj"kr| j#|j}
n| j$|j}
|d ur|
d | }
t%|
||j|| jd| _t }	t&| j| td| dt |	 dd W d    d S W d    d S 1 sw   Y  d S )Nu  This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.pyzmode is not a valid split namecached__)mnlizmnli-mm)RobertaTokenizerRobertaTokenizerFastXLMRobertaTokenizerBartTokenizerBartTokenizerFastr      z.lockz"Loading features from cached file z [took %.3f s]z'Creating features from dataset file at )
max_length
label_listr2   z!Saving features into cached file z [took z.3fz s])'warningswarnFutureWarningr1   r   r   	processorr   r2   
isinstancer(   r,   KeyErrorospathr&   r   value	__class__r"   r   
get_labelsrB   r   existsr   timetorchloadr3   loggerinfor.   get_dev_examplesr/   get_test_examplesget_train_examplesr   save)r   r1   r4   r5   r6   r7   cached_features_filerB   	lock_pathstartexamplesr   r   r    __init__P   sh   
$



$zGlueDataset.__init__c                 C   s
   t | jS r   )lenr3   r   r   r   r    __len__      
zGlueDataset.__len__returnc                 C   s
   | j | S r   )r3   )r   ir   r   r    __getitem__   r_   zGlueDataset.__getitem__c                 C   s   | j S r   )rB   r   r   r   r    rM      s   zGlueDataset.get_labels)r"   r#   r$   r%   r   r)   r(   r   r   r,   r-   r   r   r*   r   r\   r^   rb   rM   r   r   r   r    r0   G   s,   
 

Jr0   )!rI   rO   rC   dataclassesr   r   enumr   typingr   r   r   rP   filelockr   torch.utils.datar	   tokenization_utils_baser   utilsr   processors.gluer   r   r   processors.utilsr   
get_loggerr"   rR   r   r,   r0   r   r   r   r    <module>   s$   
