o
    h                 
   @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddlm
Z
mZmZmZmZmZmZ ddlZddlZddlmZ ddlmZ ddlmZmZmZmZmZmZmZ e redd	lm Z  dd
l!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 e34e5Z6e/e	e7j8Z9ddddZ:ej;dkrej<Z<nej<Z<G dd deddZ=G dd deddZ>G dd deddZ?G dd deddZ@G dd deddZAG dd de=e>e?e@eAddZBG dd  d eddZCG d!d" d"eddZDG d#d$ d$eDeCddZEG d%d& d&e=e>e?e@eAeEZFG d'd( d(e)ZGd)d* ZHe.eGjIeG_IeGjIj dur@eGjIj jJd+d,d-d.eGjI_ dS dS )/z8
Processing saving/loading class for common processors.
    N)Path)AnyCallableDictListOptional	TypedDictUnion   )
load_audio)custom_object_save)ChannelDimension
ImageInput
VideoInputis_valid_imageis_vision_available
load_image
load_video)PILImageResampling)PaddingStrategyPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategy)PROCESSOR_NAMEPushToHubMixin
TensorTypeadd_model_info_to_auto_map"add_model_info_to_custom_pipelinescached_file	copy_funcdirect_transformers_importdownload_urlis_offline_modeis_remote_urlloggingr   FeatureExtractionMixinImageProcessingMixin)AutoTokenizerAutoFeatureExtractorAutoImageProcessor)      c                   @   s.  e Zd ZU dZeeeeee ee f  e	d< eeeee ee f e	d< eeeeee ee f  e	d< ee
 e	d< ee
eef e	d< ee
eef e	d< ee e	d< ee e	d	< ee
 e	d
< ee e	d< ee
 e	d< ee
 e	d< ee
 e	d< ee
 e	d< ee
 e	d< ee
 e	d< ee
 e	d< ee e	d< dS )
TextKwargsa  
    Keyword arguments for text processing. For extended documentation, check out tokenization_utils_base methods and
    docstrings associated.

    Attributes:
        add_special_tokens (`bool`, *optional*)
            Whether or not to add special tokens when encoding the sequences.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*)
            Activates and controls padding.
        truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*):
            Activates and controls truncation.
        max_length (`int`, *optional*):
            Controls the maximum length to use by one of the truncation/padding parameters.
        stride (`int`, *optional*):
            If set, the overflowing tokens will contain some tokens from the end of the truncated sequence.
        is_split_into_words (`bool`, *optional*):
            Whether or not the input is already pre-tokenized.
        pad_to_multiple_of (`int`, *optional*):
            If set, will pad the sequence to a multiple of the provided value.
        return_token_type_ids (`bool`, *optional*):
            Whether to return token type IDs.
        return_attention_mask (`bool`, *optional*):
            Whether to return the attention mask.
        return_overflowing_tokens (`bool`, *optional*):
            Whether or not to return overflowing token sequences.
        return_special_tokens_mask (`bool`, *optional*):
            Whether or not to return special tokens mask information.
        return_offsets_mapping (`bool`, *optional*):
            Whether or not to return `(char_start, char_end)` for each token.
        return_length (`bool`, *optional*):
            Whether or not to return the lengths of the encoded inputs.
        verbose (`bool`, *optional*):
            Whether or not to print more information and warnings.
        padding_side (`str`, *optional*):
            The side on which padding will be applied.
    	text_pairtext_targettext_pair_targetadd_special_tokenspadding
truncation
max_lengthstrideis_split_into_wordspad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosepadding_sideN)__name__
__module____qualname____doc__r   r	   r   r   list__annotations__boolstrr   r   int rI   rI   q/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/processing_utils.pyr-   X   s(   
  % r-   F)totalc                   @   s  e Zd ZU dZee ed< eeee	f  ed< ee	 ed< eeee	f  ed< ee
de	f  ed< ee ed< ee ed	< ee ed
< ee
eee f  ed< ee
eee f  ed< ee ed< eeee	f  ed< ee ed< ee ed< ee
eef  ed< ee ed< dS )ImagesKwargsa  
    Keyword arguments for image processing. For extended documentation, check the appropriate ImageProcessor
    class methods and docstrings.

    Attributes:
        do_resize (`bool`, *optional*):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*):
            Resize the shorter side of the input to `size["shortest_edge"]`.
        size_divisor (`int`, *optional*):
            The size by which to make sure both the height and width can be divided.
        crop_size (`Dict[str, int]`, *optional*):
            Desired output size when applying center-cropping.
        resample (`PILImageResampling`, *optional*):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*):
            Mean to use if normalizing the image.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation to use if normalizing the image.
        do_pad (`bool`, *optional*):
            Whether to pad the image to the `(max_height, max_width)` of the images in the batch.
        pad_size (`Dict[str, int]`, *optional*):
            The size `{"height": int, "width" int}` to pad the images to.
        do_center_crop (`bool`, *optional*):
            Whether to center crop the image.
        data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the output image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image.
        device (`str`, *optional*):
            The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing.
    	do_resizesizesize_divisor	crop_sizer   resample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_padpad_sizedo_center_cropdata_formatinput_data_formatdeviceN)r@   rA   rB   rC   r   rF   rE   dictrG   rH   r	   floatrD   r   rI   rI   rI   rJ   rL      s$   
 'rL   c                   @   s   e Zd ZU dZee ed< eeee	f  ed< ee	 ed< ed ed< ee ed< ee
 ed< ee ed	< eee
ee
 f  ed
< eee
ee
 f  ed< ee ed< ee ed< ee ed< eeeef  ed< dS )VideosKwargsa  
    Keyword arguments for video processing.

    Attributes:
        do_resize (`bool`):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*):
            Resize the shorter side of the input to `size["shortest_edge"]`.
        size_divisor (`int`, *optional*):
            The size by which to make sure both the height and width can be divided.
        resample (`PILImageResampling`, *optional*):
            Resampling filter to use if resizing the image.
        do_rescale (`bool`, *optional*):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*):
            Mean to use if normalizing the image.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation to use if normalizing the image.
        do_pad (`bool`, *optional*):
            Whether to pad the image to the `(max_height, max_width)` of the images in the batch.
        do_center_crop (`bool`, *optional*):
            Whether to center crop the image.
        data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the output image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image.
    rM   rN   rO   r   rQ   rR   rS   rT   rU   rV   rW   rY   rZ   r[   N)r@   rA   rB   rC   r   rF   rE   r]   rG   rH   r^   r	   rD   r   rI   rI   rI   rJ   r_      s   
  r_   c                   @   s   e Zd ZU dZee ed< eedee	 ed eee	  f  ed< eee
eef  ed< ee ed< ee
 ed< ee ed< ee
 ed	< d
S )AudioKwargsa  
    Keyword arguments for audio processing.

    Attributes:
        sampling_rate (`int`, *optional*):
            The sampling rate at which the `raw_speech` input was sampled.
        raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
            The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
            values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
            stereo, i.e. single float per timestep.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding
            index) among:

            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                sequence if provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        truncation (`bool`, *optional*):
            Activates truncation to cut input sequences longer than *max_length* to *max_length*.
        pad_to_multiple_of (`int`, *optional*):
            If set, will pad the sequence to a multiple of the provided value.
        return_attention_mask (`bool`, *optional*):
            Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`.
    sampling_ratez
np.ndarray
raw_speechr2   r4   r3   r7   r9   N)r@   rA   rB   rC   r   rH   rE   r	   rD   r^   rF   rG   r   rI   rI   rI   rJ   r`      s   
 (r`   c                   @   s"   e Zd ZU eeeef  ed< dS )CommonKwargsreturn_tensorsN)r@   rA   rB   r   r	   rG   r   rE   rI   rI   rI   rJ   rc   #  s   
 rc   c                   @   sl   e Zd ZU dZi ejZeed< i ejZeed< i e	jZ
e	ed< i ejZeed< i ejZeed< dS )ProcessingKwargsa'  
    Base class for kwargs passing to processors.
    A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide:
        1) Additional typed keys and that this model requires to process inputs.
        2) Default values for existing keys under a `_defaults` attribute.
    New keys have to be defined as follows to ensure type hinting is done correctly.

    ```python
    # adding a new image kwarg for this model
    class ModelImagesKwargs(ImagesKwargs, total=False):
        new_image_kwarg: Optional[bool]

    class ModelProcessorKwargs(ProcessingKwargs, total=False):
        images_kwargs: ModelImagesKwargs
        _defaults = {
            "images_kwargs: {
                "new_image_kwarg": False,
            }
            "text_kwargs": {
                "padding": "max_length",
            },
        }

    ```

    For Python 3.8 compatibility, when inheriting from this class and overriding one of the kwargs,
    you need to manually update the __annotations__ dictionary. This can be done as follows:

    ```python
    class CustomProcessorKwargs(ProcessingKwargs, total=False):
        images_kwargs: CustomImagesKwargs

    CustomProcessorKwargs.__annotations__["images_kwargs"] = CustomImagesKwargs  # python 3.8 compatibility
    ```python

    common_kwargstext_kwargsimages_kwargsvideos_kwargsaudio_kwargsN)r@   rA   rB   rC   rc   rE   rf   r-   rg   rL   rh   r_   ri   r`   rj   rI   rI   rI   rJ   re   '  s"   
 %re   c                   @   sr   e Zd ZU dZdZeee  ed< dZ	eeee
e
f   ed< dZee ed< dZee ed< dZee ed< dS )	TokenizerChatTemplateKwargsaU	  
    Keyword arguments for tokenizer's `apply_chat_template`, when it is called from within a processor.

    tools (`List[Dict]`, *optional*):
        A list of tools (callable functions) that will be accessible to the model. If the template does not
        support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
        giving the name, description and argument types for the tool. See our
        [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
        for more information.
    documents (`List[Dict[str, str]]`, *optional*):
        A list of dicts representing documents that will be accessible to the model if it is performing RAG
        (retrieval-augmented generation). If the template does not support RAG, this argument will have no
        effect. We recommend that each document should be a dict containing "title" and "text" keys. Please
        see the RAG section of the [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#arguments-for-RAG)
        for examples of passing documents with chat templates.
    add_generation_prompt (bool, *optional*):
        If this is set, a prompt with the token(s) that indicate
        the start of an assistant message will be appended to the formatted output. This is useful when you want to generate a response from the model.
        Note that this argument will be passed to the chat template, and so it must be supported in the
        template for this argument to have any effect.
    continue_final_message (bool, *optional*):
        If this is set, the chat will be formatted so that the final
        message in the chat is open-ended, without any EOS tokens. The model will continue this message
        rather than starting a new one. This allows you to "prefill" part of
        the model's response for it. Cannot be used at the same time as `add_generation_prompt`.
    return_assistant_tokens_mask (`bool`, defaults to `False`):
        Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant,
        the mask will contain 1. For user and system tokens, the mask will contain 0.
        This functionality is only available for chat templates that support it via the `{% generation %}` keyword.
    Ntools	documentsFadd_generation_promptcontinue_final_messagereturn_assistant_tokens_mask)r@   rA   rB   rC   rl   r   rD   r]   rE   rm   rG   rn   rF   ro   rp   rI   rI   rI   rJ   rk   ^  s   
 rk   c                   @   sr   e Zd ZU dZdZee ed< dZee	 ed< dZ
ee ed< dZee ed< dZee ed	< d
Zee ed< dS )ChatTemplateLoadKwargsaZ  
    Keyword arguments used to load multimodal data in processor chat templates.

    num_frames (`int`, *optional*):
        Number of frames to sample uniformly. If not passed, the whole video is loaded.
    video_load_backend (`str`, *optional*, defaults to `"pyav"`):
        The backend to use when loading the video which will be used only when there are videos in the conversation.
        Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav" because it is the only backend
        that supports all types of sources to load from.
    video_fps (`int`, *optional*):
        Number of frames to sample per second. Should be passed only when `num_frames=None`.
        If not specified and `num_frames==None`, all frames are sampled.
    sample_indices_fn (`Callable`, *optional*):
            A callable function that will return indices at which the video should be sampled. If the video has to be loaded using
            by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`.
            If not provided, simple uniformt sampling with fps is performed, otherwise `sample_indices_fn` has priority over other args.
            The function expects at input the all args along with all kwargs passed to `load_video` and should output valid
            indices at which the video should be sampled. For example:

            def sample_indices_fn(num_frames, fps, metadata, **kwargs):
                # add you sampling logic here ...
                return np.linspace(start_idx, end_idx, num_frames, dtype=int)
    N
num_framespyavvideo_load_backend	video_fpsi>  ra   sample_indices_fnFload_audio_from_video)r@   rA   rB   rC   rr   r   rH   rE   rt   rG   ru   ra   rv   r   rw   rF   rI   rI   rI   rJ   rq     s   
 rq   c                   @   s2   e Zd ZU dZdZee ed< dZee ed< dS )ProcessorChatTemplateKwargsa:  
    Keyword arguments for processor's `apply_chat_template`.

    tokenize (`bool`, *optional*, defaults to `False`):
        Whether to tokenize the output or not.
    return_dict (`bool`, defaults to `False`):
        Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
    Ftokenizereturn_dictN)	r@   rA   rB   rC   ry   r   rF   rE   rz   rI   rI   rI   rJ   rx     s   
 	rx   c                   @   s   e Zd ZdS )AllKwargsForChatTemplateN)r@   rA   rB   rI   rI   rI   rJ   r{     s    r{   c                   @   sZ  e Zd ZU dZddgZdgZg Zee e	d< dZ
dZdZg Zee e	d< dd	 Zd
eeef fddZd
efddZdeeejf fddZdd ZdDdefddZedeeejf d
eeeef eeef f fddZedeeef fddZ	dEdedee d
eeef fd d!Z e					"dFdeeejf d#eeeejf  d$ed%ed&eeeef  d'efd(d)Z!edGd+d,Z"ed-d. Z#e$d/d0 Z%e&d1d2 Z'e$d3d4 Z(d5d6 Z)d7e*e*e+eef   d8e*e, d9e*e- d:e*e*e+ee.f   d;e/e0 f
d<d=Z1	dEd7eeeeef  eeeeef   f dee d>e/e2 d
efd?d@Z3dHdBdCZ4dS )IProcessorMixinza
    This is a mixin used to provide saving/loading functionality for all processor classes.
    feature_extractor	tokenizerchat_templateoptional_call_argsNvalid_kwargsc           	   	      sP   j D ]}t |||d  q|D ]}| jvr!td| dqt| jD ]\}}||v r8td| d|||< q(t|t jkr]tdt j dd j dt| d|	 D ]D\}}t
 | d	}t||}t|trt fd
d|D }n |}t||stdt|j d| d| dt || qad S )NzUnexpected keyword argument .z!Got multiple values for argument zThis processor requires z arguments: z, z. Got z arguments instead._classc                 3   s"    | ]}|d ur  |V  qd S Nget_possibly_dynamic_module.0nselfrI   rJ   	<genexpr>  s     z*ProcessorMixin.__init__.<locals>.<genexpr>zReceived a z for argument z, but a z was expected.)optional_attributessetattrpop
attributes	TypeErrorziplen
ValueErrorjoinitemsgetattrAUTO_TO_BASE_CLASS_MAPPINGget
isinstancetupler   typer@   )	r   argskwargsoptional_attributekeyargattribute_name
class_nameproper_classrI   r   rJ   __init__  s8   





zProcessorMixin.__init__returnc                    s   t j}tj}|j fdd D   dg7   fdd| D }jj	|d< d|v r5|d= d|v r<|d= d	|v rC|d	= d
|v rJ|d
= dd | D }|S )z
        Serializes this instance to a Python dictionary.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
        c                    s   g | ]
}| j jvr|qS rI   )	__class__r   )r   xr   rI   rJ   
<listcomp>      z*ProcessorMixin.to_dict.<locals>.<listcomp>auto_mapc                    s   i | ]\}}| v r||qS rI   rI   r   kv)attrs_to_saverI   rJ   
<dictcomp>   s    z*ProcessorMixin.to_dict.<locals>.<dictcomp>processor_classr~   image_processorr}   r   c                 S   s,   i | ]\}}t |ts|jjd ks||qS )BeamSearchDecoderCTC)r   r   r   r@   r   rI   rI   rJ   r     s    )
copydeepcopy__dict__inspect	signaturer   
parametersr   r   r@   )r   outputsigrI   )r   r   rJ   to_dict  s&   
zProcessorMixin.to_dictc                 C   s   |   }tj|dddd S )z
        Serializes this instance to a JSON string.

        Returns:
            `str`: String containing all the attributes that make up this feature_extractor instance in JSON format.
           Tindent	sort_keys
)r   jsondumps)r   
dictionaryrI   rI   rJ   to_json_string  s   zProcessorMixin.to_json_stringjson_file_pathc                 C   sB   t |ddd}||   W d   dS 1 sw   Y  dS )z
        Save this instance to a JSON file.

        Args:
            json_file_path (`str` or `os.PathLike`):
                Path to the JSON file in which this processor instance's parameters will be saved.
        wutf-8encodingN)openwriter   )r   r   writerrI   rI   rJ   to_json_file!  s   "zProcessorMixin.to_json_filec                    s:    fdd j D }d|} jj d| d   S )Nc              	      s&   g | ]}d | dt t | qS )z- z: )reprr   )r   namer   rI   rJ   r   -  s   & z+ProcessorMixin.__repr__.<locals>.<listcomp>r   z:
z

)r   r   r   r@   r   )r   attributes_reprrI   r   rJ   __repr__,  s   
zProcessorMixin.__repr__Fpush_to_hubc                    s  | dd}|dur tdt |dddurtd||d< tj|dd |rJ| dd}| d	|tj	j
d
 } j|fi |} |} jdurl fdd jD }dd |D }	|	  t ||	d  jD ]}
t |
}t|dr| jj || qo jdur jD ]}
t |
}t|tr|jd= qtj	|t}tj	|d}tj	|d}  } jdur|ddrt|ddd}| j W d   n1 sw   Y  t !d|  n2t"j#d jidddd }t|ddd}|| W d   n	1 sw   Y  t !d|  t$|% dhkr5 &| t !d|  |rE j'|||||dd t$|% dhkrQg S |gS ) a  
        Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
        can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.

        <Tip>

        This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
        [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]. Please refer to the docstrings of the
        methods above for more information.

        </Tip>

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        use_auth_tokenNrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.tokenV`token` and `use_auth_token` are both specified. Please set only the argument `token`.T)exist_okcommit_messagerepo_idc                    s   g | ]}t  |qS rI   )r   )r   r   r   rI   rJ   r   `      z2ProcessorMixin.save_pretrained.<locals>.<listcomp>c                 S   s    g | ]}t |tr|jn|qS rI   )r   r   init_kwargs)r   arI   rI   rJ   r   a  s     )config_set_processor_classr   chat_template.jinjachat_template.jsonsave_raw_chat_templateFr   r   r   zchat template saved in r   r   r   r   r   zprocessor saved in )r   r   )(r   warningswarnFutureWarningr   r   osmakedirssplitpathsep_create_repo_get_files_timestamps_auto_classr   appendr   r   hasattrr   r   r@   save_pretrainedr   r   r   r   r   r   r   r   r   loggerinfor   r   setkeysr   _upload_modified_files)r   save_directoryr   r   r   r   r   files_timestampsattrsconfigsr   	attributeoutput_processor_fileoutput_raw_chat_template_fileoutput_chat_template_fileprocessor_dictr   chat_template_json_stringrI   r   rJ   r   1  s~   










zProcessorMixin.save_pretrainedpretrained_model_name_or_pathc                 K   s  | dd}| dd}| dd}| dd}| dd}| dd}| d	d}	| d
d}
| dd}| dd}d|d}|durI||d< t rU|sUtd d}t|}tj|}tj|rltj|t	}tj
|r{|}d}d}d}ndt|r|}t|}d}d}nUt	}d}d}z2t||||||||||	|
dd}t||||||||||	|
dd}t||||||||||	|
dd}W n ty     ty   td| d| dt	 dw |durt|dd}| }W d   n1 sw   Y  ||d< n+|dur.t|dd}| }W d   n	1 sw   Y  t|d }||d< |du rEi }d|v rAd| di}||fS z"t|dd}| }W d   n	1 s\w   Y  t|}W n tjyx   td| dw |rtd|  ntd| d |  d|v r|d durtd! d|v r| d|d< |sd"|v rt|d" ||d"< d#|v rt|d# ||d#< ||fS )$a  
        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
        processor of type [`~processing_utils.ProcessingMixin`] using `from_args_and_dict`.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.

        Returns:
            `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object.
        	cache_dirNforce_downloadFresume_downloadproxiesr   local_files_onlyrevision	subfolder _from_pipeline
_from_auto	processor)	file_typefrom_auto_classusing_pipelinez+Offline mode: forcing local_files_only=TrueTr   r   )
r  r  r  r  r  r   
user_agentr  r  %_raise_exceptions_for_missing_entrieszCan't load processor for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z2' is the correct path to a directory containing a z filer   r   r   z"It looks like the config file at 'z' is not a valid JSON file.zloading configuration file z from cache at zChat templates should be in a 'chat_template.jinja' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.r   custom_pipelines)r   r#   r   r   rG   r   r   isdirr   r   isfiler$   r"   r   OSError	Exceptionr   readr   loadsJSONDecodeErrorwarning_oncer   r   )clsr  r   r  r  r  r  r   r  r  r  from_pipeliner  r  is_localprocessor_fileresolved_processor_fileresolved_chat_template_fileresolved_raw_chat_template_filechat_template_fileraw_chat_template_filereaderr   textr   rI   rI   rJ   get_processor_dict  s  















z!ProcessorMixin.get_processor_dictr   c                 K   s   |  }|dd}d|v r|d= d|v r|d= | j|| jd}| |i |}t| D ]}t||r=t|||| q-|| t	
d|  |rQ||fS |S )a  
        Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.

        Args:
            processor_dict (`Dict[str, Any]`):
                Dictionary that will be used to instantiate the processor object. Such a dictionary can be
                retrieved from a pretrained checkpoint by leveraging the
                [`~processing_utils.ProcessingMixin.to_dict`] method.
            kwargs (`Dict[str, Any]`):
                Additional parameters from which to initialize the processor object.

        Returns:
            [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
            parameters.
        return_unused_kwargsFr   r   )processor_configr   z
Processor )r   r   validate_init_kwargsr   r   r   r   r   updater   r   )r  r   r   r   r'  unused_kwargsr  r   rI   rI   rJ   from_args_and_dictO  s"   

z!ProcessorMixin.from_args_and_dictModelProcessorKwargstokenizer_init_kwargsc                    s.  i i i i i d}i i i i i d h d}t  } D ]2}|j|i   |< |j| j D ]}||v rJt| j|r@t| j|n|| }	|	 | |< q.q|	  t |t | }
|D ]N}|j| j D ]C}||v r|| 
|d}|dkr||
v rtd| d| dn||v r||d}nd}t|tr|dkr||| |< || qeq[t fdd|D r| D ] \}}| v r| D ]\}}||vr||| |< || qqn+|D ](}||vr||jd	 j v r|| |d	 |< q||vrtd
| d q|D ]}|| 	|d	  q|S )a  
        Method to merge dictionaries of kwargs cleanly separated by modality within a Processor instance.
        The order of operations is as follows:
            1) kwargs passed as before have highest priority to preserve BC.
                ```python
                high_priority_kwargs = {"crop_size" = {"height": 222, "width": 222}, "padding" = "max_length"}
                processor(..., **high_priority_kwargs)
                ```
            2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API.
                ```python
                processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": {"height": 222, "width": 222}}})
                ```
            3) kwargs passed during instantiation of a modality processor have fourth priority.
                ```python
                tokenizer = tokenizer_class(..., {"padding": "max_length"})
                image_processor = image_processor_class(...)
                processor(tokenizer, image_processor) # will pass max_length unless overridden by kwargs at call
                ```
            4) defaults kwargs specified at processor level have lowest priority.
                ```python
                class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False):
                    _defaults = {
                        "text_kwargs": {
                            "padding": "max_length",
                            "max_length": 64,
                        },
                    }
                ```
        Args:
            ModelProcessorKwargs (`ProcessingKwargs`):
                Typed dictionary of kwargs specifically required by the model passed.
            tokenizer_init_kwargs (`Dict`, *optional*):
                Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults.

        Returns:
            output_kwargs (`Dict`):
                Dictionary of per-modality kwargs to be passed to each modality-specific processor.

        )rg   rh   rj   ri   rf   >   r%  audioimagesvideos	__empty__zKeyword argument z+ was passed two times:
in a dictionary for z and as a **kwarg.c                 3   s    | ]}| v V  qd S r   rI   )r   r   default_kwargsrI   rJ   r     s    z/ProcessorMixin._merge_kwargs.<locals>.<genexpr>rf   zKeyword argument `zA` is not a valid argument for this processor and will be ignored.)r   	_defaultsr   r   rE   r   r   r~   r   r*  r   r   r   rG   addanyr   r   r  )r   r-  r.  r   output_kwargspossible_modality_keywords	used_keysmodalitymodality_keyvaluenon_modality_kwargskwarg_valuesubdictsubkeysubvaluer   rI   r3  rJ   _merge_kwargsz  s   /	






zProcessorMixin._merge_kwargsmainr  r  r  r   r  c           
         s   ||d< ||d< ||d< ||d< | dd}|dur*tdt |dur(td|}|dur2||d	< | j|fi |}	| j|fi |\ }  fd
d| D  | j	|	 fi |S )a[  
        Instantiate a processor associated with a pretrained model.

        <Tip>

        This class method is simply calling the feature extractor
        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor
        [`~image_processing_utils.ImageProcessingMixin`] and the tokenizer
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
        methods above for more information.

        </Tip>

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing a feature extractor file saved using the
                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
                - a path or url to a saved feature extractor JSON *file*, e.g.,
                  `./my_model_directory/preprocessor_config.json`.
            **kwargs
                Additional keyword arguments passed along to both
                [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
        r  r  r  r  r   Nr   r   r   c                    s"   i | ]\}}|   v r||qS rI   )r   r   r   rI   rJ   r   9  s   " z2ProcessorMixin.from_pretrained.<locals>.<dictcomp>)
r   r   r   r   r   _get_arguments_from_pretrainedr&  r*  r   r,  )
r  r  r  r  r  r   r  r   r   r   rI   rE  rJ   from_pretrained  s*   'zProcessorMixin.from_pretrainedAutoProcessorc                 C   sD   t |ts|j}ddlm  m} t||st| d|| _dS )a  
        Register this class with a given auto class. This should only be used for custom feature extractors as the ones
        in the library are already mapped with `AutoProcessor`.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`):
                The auto class to register this new feature extractor with.
        r   Nz is not a valid auto class.)	r   rG   r@   transformers.models.automodelsautor   r   r   )r  
auto_classauto_modulerI   rI   rJ   register_for_auto_class<  s   


z&ProcessorMixin.register_for_auto_classc           	         s   g } j D ]W}t | d}t|trKt fdd|D }|dkr3|dd}|du r2td n|dd}|rF|d	 durF|d	 }n
|d
 }n |}||j	|fi | q|S )a  
        Identify and instantiate the subcomponents of Processor classes, like image processors and
        tokenizers. This method uses the Processor attributes like `tokenizer_class` to figure out what class those
        subcomponents should be. Note that any subcomponents must either be library classes that are accessible in
        the `transformers` root, or they must be custom code that has been registered with the relevant autoclass,
        via methods like `AutoTokenizer.register()`. If neither of these conditions are fulfilled, this method
        will be unable to find the relevant subcomponent class and will raise an error.
        r   c                 3   s&    | ]}|d ur  |nd V  qd S r   r   r   r  rI   rJ   r   d  s   $ z@ProcessorMixin._get_arguments_from_pretrained.<locals>.<genexpr>r   use_fastNaC  Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.Tr
   r   )
r   r   r   r   r   r   r  r   r   rG  )	r  r  r   r   r   r   classesrP  attribute_classrI   rO  rJ   rF  V  s&   





z-ProcessorMixin._get_arguments_from_pretrainedc                 C   s   t t| r
tt| S tjtjtjg}|D ]5}|j D ]-}t|t	r9|D ]}|d ur7|j
| kr7|      S q$q|d urH|j
| krH|    S qqtd|  d)NzCould not find module z in `transformers`. If this is a custom class, it should be registered using the relevant `AutoClass.register()` function so that other functions can find it!)r   transformers_moduler   IMAGE_PROCESSOR_MAPPINGTOKENIZER_MAPPINGFEATURE_EXTRACTOR_MAPPING_extra_contentvaluesr   r   r@   r   )module_namelookup_locationslookup_locationcustom_classcustom_subclassrI   rI   rJ   r   z  s*   



z*ProcessorMixin.get_possibly_dynamic_modulec                 C   s   t | | jd }t |dd S )Nr   model_input_names)r   r   )r   first_attributerI   rI   rJ   r^    s   z ProcessorMixin.model_input_namesc                    s6      }i }t|t| }|r fdd|D }|S )Nc                    s   i | ]}| | qS rI   rI   )r   r   r(  rI   rJ   r     r   z7ProcessorMixin.validate_init_kwargs.<locals>.<dictcomp>)r   r   )r(  r   kwargs_from_configr+  unused_keysrI   r`  rJ   r)    s   z#ProcessorMixin.validate_init_kwargsc                 G   sh   t |r	td t |t | jkr)tdt | j dd| j dt | ddd t|| jD S )	a  
        Matches optional positional arguments to their corresponding names in `optional_call_args`
        in the processor class in the order they are passed to the processor call.

        Note that this should only be used in the `__call__` method of the processors with special
        arguments. Special arguments are arguments that aren't `text`, `images`, `audio`, nor `videos`
        but also aren't passed to the tokenizer, image processor, etc. Examples of such processors are:
            - `CLIPSegProcessor`
            - `LayoutLMv2Processor`
            - `OwlViTProcessor`

        Also note that passing by position to the processor call is now deprecated and will be disallowed
        in future versions. We only have this for backward compatibility.

        Example:
            Suppose that the processor class has `optional_call_args = ["arg_name_1", "arg_name_2"]`.
            And we define the call method as:
            ```python
            def __call__(
                self,
                text: str,
                images: Optional[ImageInput] = None,
                *arg,
                audio=None,
                videos=None,
            )
            ```

            Then, if we call the processor as:
            ```python
            images = [...]
            processor("What is common in these images?", images, arg_value_1, arg_value_2)
            ```

            Then, this method will return:
            ```python
            {
                "arg_name_1": arg_value_1,
                "arg_name_2": arg_value_2,
            }
            ```
            which we could then pass as kwargs to `self._merge_kwargs`
        zPassing positional arguments to the processor call is now deprecated and will be disallowed in v4.47. Please pass all arguments as keyword arguments.zExpected *at most* zK optional positional arguments in processor callwhich will be matched with  z+ in the order they are passed.However, got z positional arguments instead.Please pass all arguments as keyword arguments instead (e.g. `processor(arg_name_1=..., arg_name_2=...))`.c                 S   s   i | ]\}}||qS rI   rI   )r   	arg_valuearg_namerI   rI   rJ   r     r   zJProcessorMixin.prepare_and_validate_optional_call_args.<locals>.<dictcomp>)r   r   r   r   r   r   r   )r   r   rI   rI   rJ   'prepare_and_validate_optional_call_args  s   ,
z6ProcessorMixin.prepare_and_validate_optional_call_argsconversationbatch_imagesbatch_videosbatch_video_metadatamm_load_kwargsc                 K   s   |S )a)  
        Used within `apply_chat_template` when a model has a special way to process conversation history. For example,
        video models might want to specify in the prompt the duration of video or which frame indices at which timestamps
        were sampled. This information cannot be accessed before the video is loaded.

        For most models it is a no-op, and must be overridden by model processors which require special processing.

        Args:
            conversation (`List[Dict, str, str]`):
                The conversation to process. Always comes in batched format.
            batch_images (`List[List[ImageInput]]`):
                Batch of images that were loaded from url/path defined in the conversation. The images
                are ordered in the same way as in the conversation. Comes in nested list format, one list of `PIL` images
                per batch.
            batch_videos (`List[List[ImageInput]]`):
                Batch of videos that were loaded from url/path defined in the conversation. The videos
                are ordered in the samm way as in the conversation. Comes in nested list format, one list of 4D video arrays
                per batch.
            batch_video_metadata (`List[List[Dict[[str, any]]]]`):
                Batch of metadata returned from loading videos. That includes video fps, duration and total number of framer in original video.
                Metadata are ordered in the same way as `batch_videos`. Comes in nested list format, one list of 4D video arrays
                per batch.

        rI   )r   rg  rh  ri  rj  rk  rI   rI   rJ   #_process_messages_for_chat_template  s    z2ProcessorMixin._process_messages_for_chat_templater   c               
   K   sD  |du r| j dur| j }ntdi }tj D ]}tt|d}|||}|||< qi }tj D ]}	tt|	d}||	|}|||	< q2t|t	t
frat|d t	t
fs\t|d drad}
|}nd}
|g}|dd}|dd}|rHg g }}g }g }|D ]}g g }}g }|D ]}d	d
 |d D }dd
 |d D }dd
 |D }dd
 |D }|D ]	}|t| q|d s|D ]}|t||d d qn|D ]}|t||d d q|D ]C}t|t	t
frt|d trdd
 |D }t|}d}td nt||d |d |d |d d\}}|| || qq|r-|| |r:|| || q| j|f|||d|}| jj|f|ddd|}|
s]|d }|r|
rg|d n|}| jjdur||| jjr|d|d< | d||r|nd|r|nd|r|ndd|}|r|S |d S |S )a  
        Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
        conversations to turn them into a single tokenizable string.

        The input is expected to be in the following format, where each message content is a list consisting of text and
        optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form
        `pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text.

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
                    {"type": "text", "text": "Please describe this image in detail."},
                ],
            },
        ]

        Args:
            conversation (`Union[List[Dict, [str, str]], List[List[Dict[str, str]]]]`):
                The conversation to format.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
                chat template is used.
        NzNo chat template is set for this processor. Please either set the `chat_template` attribute, or provide a chat template as an argument. See https://huggingface.co/docs/transformers/main/en/chat_templating for more information.r   contentTFry   rz   c                 S   s   g | ]
}|d  dv r|qS )r   )imagevideorI   )r   rm  rI   rI   rJ   r   H  r   z6ProcessorMixin.apply_chat_template.<locals>.<listcomp>c                 S   2   g | ]}d D ]}||v r|d dkr|| qqS ))r/  urlr   r   r/  rI   )r   rm  r   rI   rI   rJ   r   I      c                 S   rp  ))rn  rq  r   base64r   rn  rI   r   vision_infor   rI   rI   rJ   r   O  rr  c                 S   rp  ))ro  rq  r   r   ro  rI   rt  rI   rI   rJ   r   U  rr  rw   ra   )ra   c                 S   s   g | ]
}t t|jqS rI   )nparrayr   T)r   image_fnamerI   rI   rJ   r   i  r   zWhen loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. If your model uses this metadata during processing, please load the whole video and let the model sample frames instead.rr   ru   rt   rv   )rr   fpsbackendrv   )rh  ri  rj  )r   ry   rz   r1   )r%  r0  r1  r/  	input_idsrI   )r   r   rk   rE   r   r   r   rq   r   rD   r   r   r   r   r   rG   rv  stackr   warningr   rl  r~   apply_chat_template	bos_token
startswith) r   rg  r   r   tokenizer_template_kwargstokenizer_keydefault_valuer=  rk  mm_load_key
is_batchedconversationsry   rz   rh  ri  batch_audiosrj  r0  r1  video_metadatamessagevisualsaudio_fnamesimage_fnamesvideo_fnamesfnamero  metadatapromptsingle_promptoutrI   rI   rJ   r    s    




 





z"ProcessorMixin.apply_chat_templateTc                 K   s   | j j|fd|i|S )a  
        Post-process the output of a vlm to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `List[str]`: The decoded text.
        skip_special_tokens)r~   batch_decode)r   generated_outputsr  r   rI   rI   rJ   post_process_image_text_to_text  s   z.ProcessorMixin.post_process_image_text_to_text)Fr   )NFFNrD  )rH  )T)5r@   rA   rB   rC   r   r   r   rD   rG   rE   feature_extractor_classtokenizer_classr   r   r   r]   r   r   r   r	   r   PathLiker   r   rF   r   classmethodr   r&  r,  re   r   rC  rG  rN  rF  staticmethodr   propertyr^  r)  rf  r   r   r   r   r7  Unpackrq   rl  r{   r  r  rI   rI   rI   rJ   r|     s   
 &(l 2-

 ?
#


:
%&
 5r|   c                    s   dt fdd fdd fdddd	 }||  }| }||} |}|r2|r2| |fS | d
u r8|sB|d
u r>|sB|rK|rKtd || fS td)a  
    For backward compatibility: reverse the order of `images` and `text` inputs if they are swapped.
    This method should only be called for processors where `images` and `text` have been swapped for uniformization purposes.
    Note that this method assumes that two `None` inputs are valid inputs. If this is not the case, it should be handled
    in the processor's `__call__` method before calling this method.
    r   c                 S   s   t | to	| dS )Nhttp)r   rG   r  )valrI   rI   rJ   is_url  s   z1_validate_images_text_input_order.<locals>.is_urlc                    sB   t | ttfr| D ]	} |s dS q	dS t| s| sdS dS )NFT)r   rD   r   r   )imgsimg)$_is_valid_images_input_for_processorr  rI   rJ   r    s   zO_validate_images_text_input_order.<locals>._is_valid_images_input_for_processorc                    sD   t | trdS t | ttfr t| dkrdS | D ]} |  S dS )NTr   F)r   rG   rD   r   r   )tt_s)"_is_valid_text_input_for_processorrI   rJ   r    s   
zM_validate_images_text_input_order.<locals>._is_valid_text_input_for_processorc                 S   s   || p| d u S r   rI   )input	validatorrI   rI   rJ   	_is_valid  s   z4_validate_images_text_input_order.<locals>._is_validNzYou may have used the wrong order for inputs. `images` should be passed before `text`. The `images` and `text` inputs will be swapped. This behavior will be deprecated in transformers v4.47.zGInvalid input type. Check that `images` and/or `text` are valid inputs.)rF   r   r  r   )r0  r%  r  images_is_validimages_is_texttext_is_validtext_is_imagesrI   )r  r  r  rJ   !_validate_images_text_input_order  s    

 r  r  rH  zprocessor files)objectobject_classobject_files)KrC   r   r   r   r   systypingr   pathlibr   r   r   r   r   r   r   r	   numpyrv  typing_extensionsaudio_utilsr   dynamic_module_utilsr   image_utilsr   r   r   r   r   r   r   r   tokenization_utils_baser   r   r   r   r   utilsr   r   r   r   r   r   r    r!   r"   r#   r$   r%   
get_loggerr@   r   __file__parentrS  r   version_infor  r-   rL   r_   r`   rc   re   rk   rq   rx   r{   r|   r  r   formatrI   rI   rI   rJ   <module>   sp   $$8

::0'7'!
        :