o
    hd                     @   sp  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlZd dlZd dlmZ dd	lmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z# dd
l$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* e rd dl+Z,d dl-Z,e.e.e,j/j0e.dkre,j1j2Z3ne,j1Z3e rd dl4mZ5 d dl6m7Z7 e3j8e7j8e3j9e7j9e3j:e7j:e3j;e7j;e3j<e7j<e3j=e7j=iZ>ere rd dl?Z?e!@eAZBedejCdeDd eDejC eDd f ZEeeDd ddeDd eDd eDeDd  eDeDd  eDeDd  f ZFG dd deZGG dd deZHG dd deZIeG dd dZJeKeLeeMeLeDeK f f ZNdd ZOG dd deZPdd ZQdd  ZRd!eDfd"d#ZSd$d% ZTd&d' ZUd(ejCd)eVfd*d+ZWdd-eMd)eDeE fd.d/ZXd!eeDeE eEf d)eEfd0d1ZYd!eeDeE eEf d)eEfd2d3ZZd)eFfd4d5Z[d)ejCfd6d7Z\	dd(ejCd8eeeMe]eMd9f f  d)eGfd:d;Z^	dd(ejCd<eeeGeLf  d)eMfd=d>Z_dd(ejCd?eGd)e]eMeMf fd@dAZ`dBe]eMeMf dCeMdDeMd)e]eMeMf fdEdFZadGeKeLeeDe]f f d)eVfdHdIZbdGeKeLeeDe]f f d)eVfdJdKZcdLeeKeLeeDe]f f  d)eVfdMdNZddLeeKeLeeDe]f f  d)eVfdOdPZedd(eeLdf dQeef d)dfdRdSZgddTeJfdUdVZhdWeLdXefdYdZZi	ddWeLdXee fd[d\ZjdWeLdXefd]d^ZkdWeLdXefd_d`ZlejeiekeldaZm			b	ddceeLddf deeeM dfeeM dgeLdXee d)ejnfdhdiZo	dd!eeDe]eLdf dQeef d)edeDd eDeDd  f fdjdkZp												ddleeV dmeef dneeV doeeefeDef f  dpeeefeDef f  dqeeV dreeM dseeV dteeKeLeMf  dueeV dveeKeLeMf  dwedx fdydzZqG d{d| d|Zrd}eHd~e]eHd9f dLeDeK d)dfddZsdeDeL deDeL fddZteddG dd dZudS )    N)Iterable)redirect_stdout)	dataclass)BytesIO)TYPE_CHECKINGCallableOptionalUnion)version   )ExplicitEnumis_av_availableis_cv2_availableis_decord_availableis_jax_tensoris_numpy_arrayis_tf_tensoris_torch_availableis_torch_tensoris_torchvision_availableis_vision_availableis_yt_dlp_availableloggingrequires_backendsto_numpy)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDIMAGENET_STANDARD_MEANIMAGENET_STANDARD_STDOPENAI_CLIP_MEANOPENAI_CLIP_STDz9.1.0)io)InterpolationModezPIL.Image.Imageztorch.Tensorz
np.ndarrayc                   @      e Zd ZdZdZdS )ChannelDimensionchannels_firstchannels_lastN)__name__
__module____qualname__FIRSTLAST r,   r,   l/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/image_utils.pyr$   f       r$   c                   @   r#   )AnnotationFormatcoco_detectioncoco_panopticN)r'   r(   r)   COCO_DETECTIONCOCO_PANOPTICr,   r,   r,   r-   r/   k   r.   r/   c                   @   s   e Zd ZejjZejjZdS )AnnotionFormatN)r'   r(   r)   r/   r2   valuer3   r,   r,   r,   r-   r4   p   s    r4   c                   @   s.   e Zd ZU eed< eed< eed< eed< dS )VideoMetadatatotal_num_framesfpsdurationvideo_backendN)r'   r(   r)   int__annotations__floatstrr,   r,   r,   r-   r6   u   s
   
 r6   c                 C   s   t  o	t| tjjS N)r   
isinstancePILImageimgr,   r,   r-   is_pil_image   s   rE   c                   @   s    e Zd ZdZdZdZdZdZdS )	ImageTypepillowtorchnumpy
tensorflowjaxN)r'   r(   r)   rA   TORCHNUMPY
TENSORFLOWJAXr,   r,   r,   r-   rF      s    rF   c                 C   sX   t | rtjS t| rtjS t| rtjS t| rtjS t	| r#tj
S tdt|  )NzUnrecognised image type )rE   rF   rA   r   rL   r   rM   r   rN   r   rO   
ValueErrortypeimager,   r,   r-   get_image_type   s   rT   c                 C   s(   t | pt| pt| pt| pt| S r?   )rE   r   r   r   r   rC   r,   r,   r-   is_valid_image   s   (rU   imagesc                 C   s   | o
t dd | D S )Nc                 s       | ]}t |V  qd S r?   )rU   .0rS   r,   r,   r-   	<genexpr>       z*is_valid_list_of_images.<locals>.<genexpr>allrV   r,   r,   r-   is_valid_list_of_images      r_   c                 C   s:   t | ttfr| D ]	}t|s dS q	dS t| sdS dS )NFT)r@   listtuplevalid_imagesrU   )imgsrD   r,   r,   r-   rc      s   rc   c                 C   s   t | ttfrt| d S dS )Nr   F)r@   ra   rb   rU   rC   r,   r,   r-   
is_batched   s   re   rS   returnc                 C   s,   | j tjkrdS t| dkot| dkS )zV
    Checks to see whether the pixel values have already been rescaled to [0, 1].
    Fr   r   )dtypenpuint8minmaxrR   r,   r,   r-   is_scaled_image   s   rl      expected_ndimsc                 C   s   t | r| S t| r| gS t| r9| j|d krt| } | S | j|kr(| g} | S td|d  d| d| j dtdt|  d)a  
    Ensure that the output is a list of images. If the input is a single image, it is converted to a list of length 1.
    If the input is a batch of images, it is converted to a list of images.

    Args:
        images (`ImageInput`):
            Image of images to turn into a list of images.
        expected_ndims (`int`, *optional*, defaults to 3):
            Expected number of dimensions for a single input image. If the input image has a different number of
            dimensions, an error is raised.
    r   z%Invalid image shape. Expected either z or z dimensions, but got z dimensions.ztInvalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray, but got .)re   rE   rU   ndimra   rP   rQ   )rV   rn   r,   r,   r-   make_list_of_images   s*   	
rq   c                 C   s   t | ttfr tdd | D r tdd | D r dd | D S t | ttfrHt| rHt| d s8| d jdkr:| S | d jdkrHd	d | D S t| rat| sU| jdkrX| gS | jdkrat| S td
|  )a|  
    Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
    If the input is a nested list of images, it is converted to a flat list of images.
    Args:
        images (`Union[List[ImageInput], ImageInput]`):
            The input image.
    Returns:
        list: A list of images or a 4d array of images.
    c                 s       | ]
}t |ttfV  qd S r?   r@   ra   rb   rY   images_ir,   r,   r-   rZ          z+make_flat_list_of_images.<locals>.<genexpr>c                 s   rW   r?   r_   rt   r,   r,   r-   rZ      r[   c                 S      g | ]	}|D ]}|qqS r,   r,   rY   img_listrD   r,   r,   r-   
<listcomp>       z,make_flat_list_of_images.<locals>.<listcomp>r   rm      c                 S   rx   r,   r,   ry   r,   r,   r-   r{      r|   z*Could not make a flat list of images from 	r@   ra   rb   r]   r_   rE   rp   rU   rP   r^   r,   r,   r-   make_flat_list_of_images   s$   
r   c                 C   s   t | ttfrtdd | D rtdd | D r| S t | ttfrDt| rDt| d s3| d jdkr6| gS | d jdkrDdd | D S t| r_t| sQ| jdkrU| ggS | jdkr_t| gS td	)
z
    Ensure that the output is a nested list of images.
    Args:
        images (`Union[List[ImageInput], ImageInput]`):
            The input image.
    Returns:
        list: A list of list of images or a list of 4d array of images.
    c                 s   rr   r?   rs   rt   r,   r,   r-   rZ     rv   z-make_nested_list_of_images.<locals>.<genexpr>c                 s   rW   r?   rw   rt   r,   r,   r-   rZ     r[   r   rm   r}   c                 S      g | ]}t |qS r,   ra   rX   r,   r,   r-   r{   !      z.make_nested_list_of_images.<locals>.<listcomp>z]Invalid input type. Must be a single image, a list of images, or a list of batches of images.r~   r^   r,   r,   r-   make_nested_list_of_images	  s$   

r   c                 C   s   t | ttfr2t | d ttfr2t| d d r2t| d d s0| d d jdkr0dd | D } | S t | ttfr^t| d r^t| d sL| d jdkrO| gS | d jdkr]dd | D S nt| ryt| sk| jdkro| ggS | jdkryt| gS td|  )z
    Ensure that the input is a list of videos.
    Args:
        videos (`VideoInput`):
            Video or videos to turn into a list of videos.
    Returns:
        list: A list of videos.
    r   r}   c                 S   s   g | ]	}d d |D qS )c                 S   rx   r,   r,   )rY   
batch_listvideor,   r,   r-   r{   9  r|   z2make_batched_videos.<locals>.<listcomp>.<listcomp>r,   )rY   batched_videosr,   r,   r-   r{   9  r|   z'make_batched_videos.<locals>.<listcomp>rm   c                 S   r   r,   r   )rY   r   r,   r,   r-   r{   A  r   z"Could not make batched video from )r@   ra   rb   rU   rE   rp   rP   )videosr,   r,   r-   make_batched_videos-  s    0	"

r   c                 C   s@   t | stdt|  t rt| tjjrt| S t	| S )NzInvalid image type: )
rU   rP   rQ   r   r@   rA   rB   rh   arrayr   rC   r,   r,   r-   to_numpy_arrayL  s
   
r   num_channels.c                 C   s   |dur|nd}t |tr|fn|}| jdkrd\}}n| jdkr&d\}}ntd| j | j| |v rI| j| |v rItd| j d tjS | j| |v rStjS | j| |v r]tj	S td	)
a[  
    Infers the channel dimension format of `image`.

    Args:
        image (`np.ndarray`):
            The image to infer the channel dimension of.
        num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
            The number of channels of the image.

    Returns:
        The channel dimension of the image.
    Nr   rm   rm   )r      r}   z(Unsupported number of image dimensions: z4The channel dimension is ambiguous. Got image shape z,. Assuming channels are the first dimension.z(Unable to infer channel dimension format)
r@   r;   rp   rP   shapeloggerwarningr$   r*   r+   )rS   r   	first_dimlast_dimr,   r,   r-   infer_channel_dimension_formatU  s"   



r   input_data_formatc                 C   sF   |du rt | }|tjkr| jd S |tjkr| jd S td| )a  
    Returns the channel dimension axis of the image.

    Args:
        image (`np.ndarray`):
            The image to get the channel dimension axis of.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the image. If `None`, will infer the channel dimension from the image.

    Returns:
        The channel dimension axis of the image.
    Nrm   r   Unsupported data format: )r   r$   r*   rp   r+   rP   )rS   r   r,   r,   r-   get_channel_dimension_axisz  s   



r   channel_dimc                 C   sZ   |du rt | }|tjkr| jd | jd fS |tjkr&| jd | jd fS td| )a  
    Returns the (height, width) dimensions of the image.

    Args:
        image (`np.ndarray`):
            The image to get the dimensions of.
        channel_dim (`ChannelDimension`, *optional*):
            Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image.

    Returns:
        A tuple of the image's height and width.
    Nr   )r   r$   r*   r   r+   rP   )rS   r   r,   r,   r-   get_image_size  s   

r   
image_size
max_height	max_widthc           
      C   sB   | \}}|| }|| }t ||}t|| }t|| }	||	fS )a  
    Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
    Important, even if image_height < max_height and image_width < max_width, the image will be resized
    to at least one of the edges be equal to max_height or max_width.

    For example:
        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)

    Args:
        image_size (`Tuple[int, int]`):
            The image to resize.
        max_height (`int`):
            The maximum allowed height.
        max_width (`int`):
            The maximum allowed width.
    )rj   r;   )
r   r   r   heightwidthheight_scalewidth_scale	min_scale
new_height	new_widthr,   r,   r-   #get_image_size_for_max_height_width  s   
r   
annotationc                 C   sV   t | tr)d| v r)d| v r)t | d ttfr)t| d dks't | d d tr)dS dS )Nimage_idannotationsr   TFr@   dictra   rb   lenr   r,   r,   r-   "is_valid_annotation_coco_detection  s   "r   c                 C   s^   t | tr-d| v r-d| v r-d| v r-t | d ttfr-t| d dks+t | d d tr-dS dS )Nr   segments_info	file_namer   TFr   r   r,   r,   r-   !is_valid_annotation_coco_panoptic  s   "r   r   c                 C      t dd | D S )Nc                 s   rW   r?   )r   rY   annr,   r,   r-   rZ     r[   z3valid_coco_detection_annotations.<locals>.<genexpr>r\   r   r,   r,   r-    valid_coco_detection_annotations     r   c                 C   r   )Nc                 s   rW   r?   )r   r   r,   r,   r-   rZ     r[   z2valid_coco_panoptic_annotations.<locals>.<genexpr>r\   r   r,   r,   r-   valid_coco_panoptic_annotations  r   r   timeoutc              
   C   s   t tdg t| tre| ds| dr$tjtt	j
| |dj} nOtj| r1tj| } nB| dr=| dd } zt|  }tjt|} W n$ tyd } z
td|  d	| d
}~ww t| tjjro| } ntdtj| } | d} | S )a3  
    Loads `image` to a PIL Image.

    Args:
        image (`str` or `PIL.Image.Image`):
            The image to convert to the PIL Image format.
        timeout (`float`, *optional*):
            The timeout value in seconds for the URL request.

    Returns:
        `PIL.Image.Image`: A PIL Image.
    visionhttp://https://r   zdata:image/,r   zIncorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got z. Failed with NzuIncorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image.RGB)r   
load_imager@   r>   
startswithrA   rB   openr   requestsgetcontentospathisfilesplitbase64decodebytesencode	ExceptionrP   	TypeErrorImageOpsexif_transposeconvert)rS   r   b64er,   r,   r-   r     s2   


r   metadatac                 K   s   | j }| j}|du r(|dur(t|| | }||kr(td| d| d| d|dur9tjd||| td}|S tjd|td}|S )a`  
    A default sampling function that replicates the logic used in get_uniform_frame_indices,
    while optionally handling `fps` if `num_frames` is not provided.

    Args:
        metadata (`VideoMetadata`):
            `VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly.
        fps (`int`, *optional*):
            Desired frames per second. Takes priority over num_frames if both are provided.

    Returns:
        `np.ndarray`: Array of frame indices to sample.
    Nz When loading the video with fps=z, we computed num_frames=z  which exceeds total_num_frames=z. Check fps or video metadata.r   )rg   )r7   r8   r;   rP   rh   arange)r   
num_framesr8   kwargsr7   	video_fpsindicesr,   r,   r-   default_sample_indices_fn  s   r   
video_pathsample_indices_fnc                 K   s  t tdg ddl}|| }t||j}||j}|r#|| nd}tt|t	|t	|dd}|dd|i|}	d}
g }|
 r}| \}}|sMn0|
|	v rn|j\}}}|||j}||d|d|d|f  |rt|
d7 }
|
|kryn|
 sD|  |	|_t||fS )	av  
    Decode a video using the OpenCV backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    cv2r   Nopencvr7   r8   r9   r:   r   r   r,   )r   read_video_opencvr   VideoCapturer;   r   CAP_PROP_FRAME_COUNTCAP_PROP_FPSr6   r=   isOpenedreadr   cvtColorCOLOR_BGR2RGBappendreleaseframes_indicesrh   stack)r   r   r   r   r   r7   r   r9   r   r   indexframessuccessframer   r   channelr,   r,   r-   r   ?  s8   
 r   c                 K   s   t tdg ddlm}m} || |dd}| }t|}|r$|| nd}tt|t	|t	|dd}	|dd|	i|}
|
|
 }|
|	_||	fS )	a  
    Decode a video using the Decord backend.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    decordr   )VideoReadercpu)urictxr   r   Nr,   )r   read_video_decordr   r   r   get_avg_fpsr   r6   r;   r=   	get_batchasnumpyr   )r   r   r   r   r   vrr   r7   r9   r   r   r   r,   r,   r-   r   x  s   r   c                 K   s   t tdg ddl}|| }|jjd j}|jjd j}|r#|| nd}tt	|t
|t
|dd}|dd|i|}	g }
|d |	d }t|jddD ]\}}||krY n|dkrf||	v rf|
| qOtd	d
 |
D }|	|_||fS )a}  
    Decode the video with PyAV decoder.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    avr   Npyavr   r   r   )r   c                 S   s   g | ]}|j d dqS )rgb24)format)
to_ndarray)rY   xr,   r,   r-   r{         z#read_video_pyav.<locals>.<listcomp>r,   )r   read_video_pyavr   r   streamsr   r   average_rater6   r;   r=   seek	enumeratedecoder   rh   r   r   )r   r   r   r   	containerr7   r   r9   r   r   r   	end_indexir   r   r,   r,   r-   r    s,   


r  c                 K   s   t j| ddddd\}}}|d }|d}|r|| nd}tt|t|t|dd	}	|dd
|	i|}
||
   }|
|	_||	fS )a  
    Decode the video with torchvision decoder.

    Args:
        video_path (`str`):
            Path to the video file.
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniform sampling with fps is performed.
            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, `VideoMetadata`]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - `VideoMetadata` object.
    g        NsecTHWC)	start_ptsend_ptspts_unitoutput_formatr   r   torchvisionr   r   r,   )	torchvision_io
read_videosizer6   r;   r=   
contiguousrI   r   )r   r   r   r   _infor   r7   r9   r   r   r,   r,   r-   read_video_torchvision  s(   
r  )r   r   r   r  r   r   
VideoInputr   r8   backendc              	      s   durdur|du rt d|du r fdd}|}| ds'| drst s.tdttdg d	d
lm} t }t	|! | }	|	
| g W d   n1 sVw   Y  W d   n1 sew   Y  | }
t|
}n4| ds}| drtt| j}n!tj| r| }nt| st| ttfrt| d	 rd}ntd| dp| d}|r|dv rt d|du r| S t s|dkst s|dkst s|dkst s|dkrtd| d| dt| }|||fi |\} }| |fS )a  
    Loads `video` to a numpy array.

    Args:
        video (`str` or `VideoInput`):
            The video to convert to the numpy array format. Can be a link to video or local path.
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly. If not passed, the whole video is loaded.
        fps (`int`, *optional*):
            Number of frames to sample per second. Should be passed only when `num_frames=None`.
            If not specified and `num_frames==None`, all frames are sampled.
        backend (`str`, *optional*, defaults to `"opencv"`):
            The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".
        sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniformt sampling with fps is performed, otherwise `sample_indices_fn` has priority over other args.
            The function expects at input the all args along with all kwargs passed to `load_video` and should output valid
            indices at which the video should be sampled. For example:

            Example:
            def sample_indices_fn(metadata, **kwargs):
                return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int)

    Returns:
        Tuple[`np.array`, Dict]: A tuple containing:
            - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
            - Metadata dictionary.
    Nzc`num_frames`, `fps`, and `sample_indices_fn` are mutually exclusive arguments, please use only one!c                    s   t | f d|S )N)r   r8   )r   )r   	fn_kwargsr8   r   r,   r-   sample_indices_fn_func=  r`   z*load_video.<locals>.sample_indices_fn_funczhttps://www.youtube.comzhttp://www.youtube.comzETo load a video from YouTube url you have  to install `yt_dlp` first.yt_dlpr   )	YoutubeDLr   r   zVIncorrect format used for video. Should be an url linking to an video or a local path.)r   r  zlIf you are trying to load a video from URL, you can decode the video only with `pyav` or `decord` as backendr   r   r   r  zYou chose backend=zf for loading the video but the required library is not found in your environment Make sure to install z before loading the video.)rP   r   r   ImportErrorr   
load_videor  r  r   r   downloadgetvaluer   r   r   r   r   r   rU   r@   ra   rb   r   r   r   r   r   VIDEO_DECODERS)r   r   r8   r  r   r   r  r  bufferf	bytes_objfile_objvideo_is_urlvideo_decoderr   r,   r  r-   r!    sl   ' 
"r!  c                    sX   t | ttfr&t| rt | d ttfr fdd| D S  fdd| D S t|  dS )a  Loads images, handling different levels of nesting.

    Args:
      images: A single image, a list of images, or a list of lists of images to load.
      timeout: Timeout for loading images.

    Returns:
      A single image, a list of images, a list of lists of images.
    r   c                    s   g | ]} fd d|D qS )c                       g | ]}t | d qS r   r   rX   r   r,   r-   r{     r  z*load_images.<locals>.<listcomp>.<listcomp>r,   )rY   image_groupr   r,   r-   r{     s    zload_images.<locals>.<listcomp>c                    r+  r,  r-  rX   r   r,   r-   r{     r  r   )r@   ra   rb   r   r   )rV   r   r,   r   r-   load_imagesr  s
   r/  
do_rescalerescale_factordo_normalize
image_mean	image_stddo_padsize_divisibilitydo_center_crop	crop_size	do_resizer  resamplePILImageResamplingc                 C   s|   | r
|du r
t d|r|du rt d|r"|du s|du r"t d|r,|du r,t d|	r:|
du s6|du r<t ddS dS )a  
    Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method.
    Raises `ValueError` if arguments incompatibility is caught.
    Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`,
    sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow
    existing arguments when possible.

    Nz=`rescale_factor` must be specified if `do_rescale` is `True`.zzDepending on the model, `size_divisibility`, `size_divisor`, `pad_size` or `size` must be specified if `do_pad` is `True`.zP`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.z<`crop_size` must be specified if `do_center_crop` is `True`.zA`size` and `resample` must be specified if `do_resize` is `True`.)rP   )r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r  r:  r,   r,   r-   validate_preprocess_arguments  s   r<  c                   @   s   e Zd ZdZdd ZdddZdd Zd	ejd
e	e
ef dejfddZd ddZdd Zd!ddZd"ddZdd Zdd Zd#ddZdS )$ImageFeatureExtractionMixinzD
    Mixin that contain utilities for preparing image features.
    c                 C   s8   t |tjjtjfst|stdt| dd S d S )Nz	Got type zS which is not supported, only `PIL.Image.Image`, `np.array` and `torch.Tensor` are.)r@   rA   rB   rh   ndarrayr   rP   rQ   selfrS   r,   r,   r-   _ensure_format_supported  s
   z4ImageFeatureExtractionMixin._ensure_format_supportedNc                 C   s   |  | t|r| }t|tjrE|du r t|jd tj}|jdkr3|j	d dv r3|
ddd}|r9|d }|tj}tj|S |S )a"  
        Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
        needed.

        Args:
            image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
                The image to convert to the PIL Image format.
            rescale (`bool`, *optional*):
                Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
                default to `True` if the image type is a floating type, `False` otherwise.
        Nr   rm   r   r   r      )rA  r   rI   r@   rh   r>  flatfloatingrp   r   	transposeastyperi   rA   rB   	fromarray)r@  rS   rescaler,   r,   r-   to_pil_image  s   
z(ImageFeatureExtractionMixin.to_pil_imagec                 C   s&   |  | t|tjjs|S |dS )z
        Converts `PIL.Image.Image` to RGB format.

        Args:
            image (`PIL.Image.Image`):
                The image to convert.
        r   )rA  r@   rA   rB   r   r?  r,   r,   r-   convert_rgb  s   

z'ImageFeatureExtractionMixin.convert_rgbrS   scalerf   c                 C   s   |  | || S )z7
        Rescale a numpy image by scale amount
        )rA  )r@  rS   rK  r,   r,   r-   rH    s   
z#ImageFeatureExtractionMixin.rescaleTc                 C   s   |  | t|tjjrt|}t|r| }|du r&t|jd tj	n|}|r4| 
|tjd}|rB|jdkrB|ddd}|S )a  
        Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
        dimension.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to convert to a NumPy array.
            rescale (`bool`, *optional*):
                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
                default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
            channel_first (`bool`, *optional*, defaults to `True`):
                Whether or not to permute the dimensions of the image to put the channel dimension first.
        Nr   p?rm   r   r   )rA  r@   rA   rB   rh   r   r   rI   rC  integerrH  rF  float32rp   rE  )r@  rS   rH  channel_firstr,   r,   r-   r     s   

z*ImageFeatureExtractionMixin.to_numpy_arrayc                 C   sD   |  | t|tjjr|S t|r|d}|S tj|dd}|S )z
        Expands 2-dimensional `image` to 3 dimensions.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to expand.
        r   )axis)rA  r@   rA   rB   r   	unsqueezerh   expand_dimsr?  r,   r,   r-   rR    s   

z'ImageFeatureExtractionMixin.expand_dimsFc                 C   sh  |  | t|tjjr| j|dd}n|r3t|tjr'| |tj	d}nt
|r3| | d}t|tjrXt|tjsHt||j}t|tjsWt||j}n6t
|rddl}t||jswt|tjrr||}n||}t||jst|tjr||}n||}|jdkr|jd dv r||ddddf  |ddddf  S || | S )a  
        Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
        if it's a PIL Image.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to normalize.
            mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
                The mean (per channel) to use for normalization.
            std (`List[float]` or `np.ndarray` or `torch.Tensor`):
                The standard deviation (per channel) to use for normalization.
            rescale (`bool`, *optional*, defaults to `False`):
                Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
                happen automatically.
        T)rH  rL  r   Nrm   r   )rA  r@   rA   rB   r   rh   r>  rH  rF  rN  r   r=   r   rg   rH   Tensor
from_numpytensorrp   r   )r@  rS   meanstdrH  rH   r,   r,   r-   	normalize$  s6   


(z%ImageFeatureExtractionMixin.normalizec                 C   sJ  |dur|nt j}| | t|tjjs| |}t|tr#t|}t|t	s.t
|dkr|rBt|t	r9||fn|d |d f}n\|j\}}||krO||fn||f\}}	t|t	r\|n|d }
||
krf|S |
t	|
|	 | }}|dur||
krtd| d| ||krt	|| | |}}||kr||fn||f}|j||dS )a  
        Resizes `image`. Enforces conversion of input to PIL.Image.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to resize.
            size (`int` or `Tuple[int, int]`):
                The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
                matched to this.

                If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
                `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
                this number. i.e, if height > width, then image will be rescaled to (size * height / width, size).
            resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                The filter to user for resampling.
            default_to_square (`bool`, *optional*, defaults to `True`):
                How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a
                square (`size`,`size`). If set to `False`, will replicate
                [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
                with support for resizing only the smallest edge and providing an optional `max_size`.
            max_size (`int`, *optional*, defaults to `None`):
                The maximum allowed for the longer edge of the resized image: if the longer edge of the image is
                greater than `max_size` after being resized according to `size`, then the image is resized again so
                that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller
                edge may be shorter than `size`. Only used if `default_to_square` is `False`.

        Returns:
            image: A resized `PIL.Image.Image`.
        Nr   r   zmax_size = zN must be strictly greater than the requested size for the smaller edge size = )r:  )r;  BILINEARrA  r@   rA   rB   rI  ra   rb   r;   r   r  rP   resize)r@  rS   r  r:  default_to_squaremax_sizer   r   shortlongrequested_new_short	new_shortnew_longr,   r,   r-   rZ  X  s4   


$
z"ImageFeatureExtractionMixin.resizec                 C   s  |  | t|ts||f}t|st|tjr8|jdkr"| |}|jd dv r0|jdd n|jdd }n
|j	d |j	d f}|d |d  d }||d  }|d |d  d }||d  }t|t
jjrr|||||fS |jd dv r{dnd}|st|tjr|ddd}t|r|ddd}|dkr||d kr|dkr||d kr|d||||f S |jdd	 t|d |d t|d |d f }	t|tjrtj||	d
}
n	t|r||	}
|	d	 |d  d }||d  }|	d |d  d }||d  }||
d||||f< ||7 }||7 }||7 }||7 }|
dtd|t|
jd	 |td|t|
jd |f }
|
S )a  
        Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
        size given, it will be padded (so the returned result has the size asked).

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
                The image to resize.
            size (`int` or `Tuple[int, int]`):
                The size to which crop the image.

        Returns:
            new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels,
            height, width).
        r   r   r   r   NTF.r   )r   r   )rA  r@   rb   r   rh   r>  rp   rR  r   r  rA   rB   croprE  permuterk   
zeros_like	new_zerosrj   )r@  rS   r  image_shapetopbottomleftrightrO  	new_shape	new_imagetop_pad
bottom_padleft_pad	right_padr,   r,   r-   center_crop  sP   



,(2
4z'ImageFeatureExtractionMixin.center_cropc                 C   s>   |  | t|tjjr| |}|dddddddf S )a  
        Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of
        `image` to a NumPy array if it's a PIL Image.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should
                be first.
        Nr   )rA  r@   rA   rB   r   r?  r,   r,   r-   flip_channel_order  s   


z.ImageFeatureExtractionMixin.flip_channel_orderr   c                 C   sL   |dur|nt jj}| | t|t jjs| |}|j||||||dS )a  
        Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees
        counter clockwise around its centre.

        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before
                rotating.

        Returns:
            image: A rotated `PIL.Image.Image`.
        N)r:  expandcenter	translate	fillcolor)rA   rB   NEARESTrA  r@   rI  rotate)r@  rS   angler:  rs  rt  ru  rv  r,   r,   r-   rx    s   

z"ImageFeatureExtractionMixin.rotater?   )NT)F)NTN)Nr   NNN)r'   r(   r)   __doc__rA  rI  rJ  rh   r>  r	   r=   r;   rH  r   rR  rX  rZ  rq  rr  rx  r,   r,   r,   r-   r=    s    
"
 

4CKr=  annotation_formatsupported_annotation_formatsc                 C   sX   | |vrt dt d| | tju rt|st d| tju r(t|s*t dd S d S )NzUnsupported annotation format: z must be one of zInvalid COCO detection annotations. Annotations must a dict (single image) or list of dicts (batch of images) with the following keys: `image_id` and `annotations`, with the latter being a list of annotations in the COCO format.zInvalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts (batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with the latter being a list of annotations in the COCO format.)rP   r   r/   r2   r   r3   r   )r{  r|  r   r,   r,   r-   validate_annotations  s   

r}  valid_processor_keyscaptured_kwargsc                 C   s:   t |t | }|rd|}td| d d S d S )Nz, zUnused or unrecognized kwargs: ro   )set
differencejoinr   r   )r~  r  unused_keysunused_key_strr,   r,   r-   validate_kwargs)  s
   
r  T)frozenc                   @   sz   e Zd ZU dZdZee ed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed< dZee ed< d	d
 ZdS )SizeDictz>
    Hashable dictionary to store image size information.
    Nr   r   longest_edgeshortest_edger   r   c                 C   s$   t | |r
t| |S td| d)NzKey z not found in SizeDict.)hasattrgetattrKeyError)r@  keyr,   r,   r-   __getitem__>  s   

zSizeDict.__getitem__)r'   r(   r)   rz  r   r   r;   r<   r   r  r  r   r   r  r,   r,   r,   r-   r  1  s   
 r  )rm   r?   )NN)NNr   N)NNNNNNNNNNNN)vr   r   collections.abcr   
contextlibr   dataclassesr   r!   r   typingr   r   r   r	   rI   rh   r   	packagingr
   utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   utils.constantsr   r   r   r   r   r    	PIL.ImagerA   PIL.ImageOpsparse__version__base_versionrB   
Resamplingr;  r  r  torchvision.transformsr"   rw  BOXrY  HAMMINGBICUBICLANCZOSpil_torch_interpolation_mappingrH   
get_loggerr'   r   r>  ra   
ImageInputr  r$   r/   r4   r6   r   r>   r;   AnnotationTyperE   rF   rT   rU   r_   rc   re   boolrl   rq   r   r   r   r   rb   r   r   r   r   r   r   r   r   r=   r   r   r   r   r  r  r$  r   r!  r/  r<  r=  r}  r  r  r,   r,   r,   r-   <module>   s  D 






'
#
$

&
"


""&&$,#
;
+
3
1


e
	

*  a

