o
    hc                 	   @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlZddlZddlZddlmZ dd	lmZm Z  dd
l!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD ddlEmFZFmGZG e@ rddlHmIZI ddlJmKZL erddlmMZM eBNeOZPdejQvrdejQd< nejQd dkrePRd zddlSZTddlSmUZV W n# eWeXfy   ddlTZTddlTmUZV eeTjYjZdkre[dY nw eN Z\eeej] eej^ ee_ej]f ee_ej^f ej]ej^f Z`dd ZaG dd dZbdd ZcG d d! d!ZdG d"d# d#ZeG d$d% d%ZfG d&d' d'ZgG d(d) d)ZhG d*d+ d+edZiG d,d- d-Zjd.d/ Zkd0d1 Zld2d3 ZmdYd4d5Znd6e4fdZd9d:Zod[d<d=Zpd\d>d?Zq	d[d@dAZrd\dBdCZsd\dDdEZtd\dFdGZudHdI ZvG dJdK dKeTjwebe)e9ZxG dLdM dMeTjyjzZ{G dNdO dOeTjyjzZ|G dPdQ dQeTjyjzZ}d]d^dWdXZ~dS )_zTF general model utils.    )annotationsN)Mapping)Path)TYPE_CHECKINGAnyCallableDictListOptionalUnion)parse   )DataCollatorWithPaddingDefaultDataCollator)get_tf_activation)PretrainedConfig)custom_object_save)GenerationConfigTFGenerationMixin)convert_batch_encoding	expand_1dload_attributes_from_hdf5_groupsave_attributes_to_hdf5_group
shape_list)SAFE_WEIGHTS_INDEX_NAMESAFE_WEIGHTS_NAMETF2_WEIGHTS_INDEX_NAMETF2_WEIGHTS_NAMETF_WEIGHTS_NAMEWEIGHTS_INDEX_NAMEWEIGHTS_NAMEModelOutputPushToHubMixincached_filedownload_urlfind_labelshas_fileis_offline_modeis_remote_urlis_safetensors_availableis_tf_symbolic_tensorloggingrequires_backendsworking_or_temp_dir)convert_file_size_to_intget_checkpoint_shard_files)	safe_open)	save_file)PreTrainedTokenizerBaseTF_USE_LEGACY_KERAS1zTransformers is only compatible with Keras 2, but you have explicitly set `TF_USE_LEGACY_KERAS` to `0`. This may result in unexpected behaviour or errors if Keras 3 objects are passed to Transformers models.)backend   zYour currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.c                 C  s0   |j jdkr|S ttd|j j}tj||dS )Nr   axis)shaperanklistrangetfreduce_mean)y_truey_predreduction_axes rB   r/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/modeling_tf_utils.py
dummy_lossv   s   rD   c                   @  s   e Zd ZdZd
dddZd	S )TFModelUtilsMixinzC
    A few utilities for `keras.Model`, to be used as a mixin.
    Fonly_trainableboolreturnintc                 C  s$   |rt tdd | jD S |  S )a9  
        Get the number of (optionally, trainable) parameters in the model.

        Args:
            only_trainable (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of trainable parameters

        Returns:
            `int`: The number of parameters.
        c                 s  s     | ]}t |j V  qd S N)npprodr9   as_list.0wrB   rB   rC   	<genexpr>   s    z3TFModelUtilsMixin.num_parameters.<locals>.<genexpr>)rI   sumtrainable_variablescount_params)selfrF   rB   rB   rC   num_parameters   s   z TFModelUtilsMixin.num_parametersN)F)rF   rG   rH   rI   )__name__
__module____qualname____doc__rV   rB   rB   rB   rC   rE   ~   s    rE   c                   s    j t dddu rtdtfdd}| _ t ds)tdt jdr8 fd	d
}| _d _tt	j
drHt	j
    S )a  
    Decorate a Keras Layer class to support Keras serialization.

    This is done by:

    1. Adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at
       serialization time.
    2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and
       convert it to a config object for the actual layer initializer.
    3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not
       need to be supplied in `custom_objects` in the call to `keras.models.load_model`.

    Args:
        cls (a `keras.layers.Layers subclass`):
            Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to its
            initializer.

    Returns:
        The same class object, with modifications for Keras deserialization.
    config_classNz2Must set `config_class` to use @keras_serializablec                   s   |rt |d tr|d n|dd }t |tr* |}| |g|R i | n(t |trNt|dkrA| g|R i | n| |g|R i | ntd|| _|| _d S )Nr   configz?Must pass either `config` (PretrainedConfig) or `config` (dict))	
isinstancer   popdict	from_dictlen
ValueError_config_kwargs)rU   argskwargsr\   )r[   initializerrB   rC   wrapped_init   s   &



z(keras_serializable.<locals>.wrapped_init
get_configz=Only use @keras_serializable on keras.layers.Layer subclasses_is_defaultc                   s,   t  |  }| j |d< || j |S )Nr\   )superri   rc   to_dictupdaterd   )rU   cfgclsrB   rC   ri      s   z&keras_serializable.<locals>.get_configTregister_keras_serializable)__init__getattrAttributeError	functoolswrapshasattr	TypeErrorri   _keras_serializablekerasutilsrq   )rp   rh   ri   rB   )rp   r[   rg   rC   keras_serializable   s    
r|   c                   @     e Zd ZdZdd ZdS )TFCausalLanguageModelingLossz
    Loss function suitable for causal language modeling (CLM), that is, the task of guessing the next token.

    <Tip>

    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.

    </Tip>
    c           
      C  s   t jjdt jjjd}| jjr8tt	|dd}t
t	|dt|d f|}t
t	|d|}|||S |tj||}tj|dk|jd}|| }t|t| }	t	|	dS )	NTfrom_logits	reductionr   r6   dtyper   )rz   lossesSparseCategoricalCrossentropy	ReductionNONEr\   tf_legacy_lossr=   	not_equalreshapeboolean_maskr   nnrelucastr   
reduce_sum
rU   labelslogitsloss_fnactive_lossreduced_logitsunmasked_loss	loss_maskmasked_lossreduced_masked_lossrB   rB   rC   hf_compute_loss   s    
z,TFCausalLanguageModelingLoss.hf_compute_lossNrW   rX   rY   rZ   r   rB   rB   rB   rC   r~          
r~   c                   @  r}   )TFQuestionAnsweringLossz8
    Loss function suitable for question answering.
    c                 C  sF   t jjdt jjjd}||d |d }||d |d }|| d S )NTr   start_positionr   end_positionr   g       @rz   r   r   r   r   )rU   r   r   r   
start_lossend_lossrB   rB   rC   r      s   z'TFQuestionAnsweringLoss.hf_compute_lossNr   rB   rB   rB   rC   r          r   c                   @  r}   )TFTokenClassificationLossz
    Loss function suitable for token classification.

    <Tip>

    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.

    </Tip>
    c           
      C  s  t jjdt jjjd}t rtj|dkrt	d | j
jr]tj|dkr6t	d t|ddk}nt|ddk}tt|dt|d f|}tt|d|}|||S |tj||}tj|dk|jd	}|| }t|t| }	t|	d
S )NTr   r   zSUsing `-1` to mask the loss for the token is deprecated. Please use `-100` instead.r   r   r6   r   r   r   )rz   r   r   r   r   r=   executing_eagerlymath
reduce_anyprintr\   r   r   r   r   r   r   r   r   r   r   rB   rB   rC   r   
  s"   

 
z)TFTokenClassificationLoss.hf_compute_lossNr   rB   rB   rB   rC   r      r   r   c                   @  r}   )TFSequenceClassificationLossz=
    Loss function suitable for sequence classification.
    c                 C  sj   |j jdks|j d dkr%tjjtjjjd}|j jdkr$tj|dd}ntjj	dtjjjd}|||S )Nr   )r   r   r7   Tr   )
r9   r:   rz   r   MeanSquaredErrorr   r   r=   expand_dimsr   rU   r   r   r   rB   rB   rC   r   .  s   

z,TFSequenceClassificationLoss.hf_compute_lossNr   rB   rB   rB   rC   r   )  r   r   c                   @  r}   )TFMultipleChoiceLossz1Loss function suitable for multiple choice tasks.c                 C  s    t jjdt jjjd}|||S )NTr   r   r   rB   rB   rC   r   ?  s   
z$TFMultipleChoiceLoss.hf_compute_lossNr   rB   rB   rB   rC   r   <  s    r   c                   @  s   e Zd ZdZdS )TFMaskedLanguageModelingLossz
    Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.

    <Tip>

    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.

    </Tip>
    N)rW   rX   rY   rZ   rB   rB   rB   rC   r   D  s    r   c                   @  r}   )TFNextSentencePredictionLossz
    Loss function suitable for next sentence prediction (NSP), that is, the task of guessing the next sentence.

    <Tip>

    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.

    </Tip>
    c           
      C  s   t jjdt jjjd}| jjr2tt	|dd}t
t	|d|}t
t	|d|}|||S |tj||d}tj|dk|jd}|| }	|	S )NTr   r   r   )r   r6   )r?   r@   r   )rz   r   r   r   r   r\   r   r=   r   r   r   r   r   r   r   )
rU   r   r   r   next_sentence_active_lossnext_sentence_reduced_logitsnext_sentence_labelunmasked_ns_lossns_loss_maskmasked_ns_lossrB   rB   rC   r   [  s   
z,TFNextSentencePredictionLoss.hf_compute_lossNr   rB   rB   rB   rC   r   P  r   r   c                 K  s   i }d|v r|d dur|d n| j |d< |d dur|d n| j|d< |d dur.|d n| j|d< d|v rI|d durA|d nt| dd|d< |S )a  
    Process the input booleans of each model.

    Args:
        config ([`PretrainedConfig`]):
            The config of the running model.
        **kwargs:
            The boolean parameters

    Returns:
        A dictionary with the proper values for each boolean
    output_attentionsNoutput_hidden_statesreturn_dict	use_cache)r   r   r   rs   )r\   rf   final_booleansrB   rB   rC   booleans_processingr  s   r   c                   s,   t  t  fdd}|_|S )a  
    Decorator that processes the inputs to a Keras layer, passing them to the layer as keyword arguments. This enables
    downstream use of the inputs by their variable name, even if they arrive packed as a dictionary in the first input
    (common case in Keras).

    Args:
        func (`callable`):
            The callable function of the TensorFlow model.


    Returns:
        A callable that wraps the original `func` with the behavior described above.
    c                   s   fdd|  D   fdd|  D }|d i |ttjjdd  | d| jjv r6d }n| j}t	|fi |}| fi |S )Nc                   s$   i | ]\}}|t  jvr||qS rB   )r_   
parametersrO   keyval)original_signaturerB   rC   
<dictcomp>  s   $ zHunpack_inputs.<locals>.run_call_with_unpacked_inputs.<locals>.<dictcomp>c                   s   i | ]\}}| vr||qS rB   rB   r   kwargs_callrB   rC   r         r   r   EncoderDecoder)
itemsrm   r_   zip__code__co_varnames	__class__rW   r\   input_processing)rU   re   rf   fn_args_and_kwargsr\   unpacked_inputsfuncr   r   rC   run_call_with_unpacked_inputs  s    z4unpack_inputs.<locals>.run_call_with_unpacked_inputs)inspect	signatureru   rv   __signature__)r   r   rB   r   rC   unpack_inputs  s
   
r   c              	   K  s  t t| j}t|dd}|dd t| }|d }||d}i }tj	tt
tttt tjf}	d|d v rItdt |d d|d< d	|d v r^td
t |d d	|d< d|d v rxd|v rxtdt |d d|d< nd|d v rd|v r|d d|d< |r|di |d< nt|d dkrtdt|d   d|d | D ]&\}
}t||	st|s|du r|||
< qtdt| d|	 d|
 dt|ttfr/t|D ]E\}}t|r
|jdd }||v r|||< q|||| < qt||	s|du r|||| < qtdt| d|	 d||  dnt|trd|v rGtdt |d|d< d	|v rYtd
t |d	|d< t | D ]<\}
}t||	sn|du rt|||
< q_|
|vrd|vrtd|
 d| d q_tdt| d|	 d|
 dn t|s|du r|||< ntdt| d|	 d| d|D ]}|t| vr|dkr|||| j||< qd|v r	|d dur t|d r |d jdd }|d ||< n|d |d< |d= d|v r|d= i }| D ]:\}}t|tj	r4|j tj!kr4t"|tj#||< qt|tjrL|j tj!krL|$tj#||< q|||< q|}~|durodd | D }|%t&dd|i| |S )a  
    Process the input of each TensorFlow model including the booleans. In case of a list of symbolic inputs, each input
    has to be named accordingly to the parameters name, i.e. `input_ids = keras.Input(shape=(128,), dtype='int32',
    name="input_ids")` otherwise the order of the tensors will not be guaranteed during the training.

    Args:
        func (`callable`):
            The callable function of the TensorFlow model.
        config ([`PretrainedConfig`]):
            The config of the running model.
        **kwargs:
            The inputs of the model.

    Returns:
        Two lists, one for the missing layers, and another one for the unexpected layers.
    rf   NrU   r   inputsr   zeThe `inputs` argument is deprecated and will be removed in a future version, use `input_ids` instead.	input_idsdecoder_cached_stateszzThe `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.past_key_valuespastziThe `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.zAThe following keyword arguments are not supported by this model: .zData of type z is not allowed only z is accepted for :re   zThe parameter z( does not belongs to the parameter list z and will be ignored.c                 S  s   i | ]\}}|d v r||qS ))r   r   r   r   rB   )rO   kvrB   rB   rC   r   \  s
    z$input_processing.<locals>.<dictcomp>r\   rB   )'r_   r   r   r   rG   r^   r;   keysr=   TensorrI   r!   tuplerK   ndarraywarningswarnFutureWarningra   rb   r   r]   	is_tensortype	enumerater*   namesplitr   loggerwarningdefaultr   int64r   int32astyperm   r   )r   r\   rf   r   
has_kwargsparameter_namesmain_input_name
main_inputoutputallowed_typesr   r   iinputtensor_namer   cast_outputr   r   boolean_dictrB   rB   rC   r     s   

 




 



r   c                 C  sn   |d ur|  |r| t|d  } |  dr| dd  } d| vr5t| ddkr5d| ddd  } | S )N/r   model.)
startswithra   r   join)r   _prefixrB   rB   rC   strip_model_name_and_prefixl  s   
r  10GBweights_namestrc              	   C  s2  t |}g }g }d}d}| D ]'}| j|jj }|| |kr(|| g }d}|| ||7 }||7 }q|| t|dkrI||d idfS i }	i }
t|D ]:\}}|dd|d ddt|dd}|dd|d ddt|dd}||
|< |D ]	}|j}||	|< qqQd	|i}||	d
}|
|fS )a?  
    Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
    given size.

    The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
    optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
    limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
    [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].

    <Tip warning={true}>

    If one of the model's weight is bigger that `max_shard_size`, it will end up in its own sub-checkpoint which will
    have a size greater than `max_shard_size`.

    </Tip>

    Args:
        weights (`Dict[str, tf.RessourceVariable]`): The list of tf.RessourceVariable of a model to save.
        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
            The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
            (like `"5MB"`).
    r   r   Nz.h5-05dz-of-.safetensors
total_size)metadata
weight_map)	r.   numpysizer   appendra   r   replacer   )weightsmax_shard_sizer  sharded_state_dictscurrent_blockcurrent_block_sizer  itemweight_sizer  shardsidxshard
shard_fileweightweight_namer  indexrB   rB   rC   tf_shard_checkpointv  s@   



&

r   Fc                 C  s  t  }t  }t  }t  }i }	t| jD ]?\}
}|j}|dur0||r0|t|d }|d}d|v sIt|ddksId|ddd }|	| |
|	|< q|D ]!}t
| |	|||d\}}}|| || || t  qU|| }|rt|dkst|dkrd| jj }t|dkrdd	d
 |D }|d| d7 }t|dkrddd
 |D }|d| d7 }t||||fS )a  
    This is the same as `load_tf_weights` but for a sharded checkpoint. Detect missing and unexpected layers and load
    the TF weights from the shard file accordingly to their names and shapes.

    This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being
    loaded in the model.

    Args:
        model (`keras.models.Model`): The model in which to load the checkpoint.
        shard_files (`str` or `os.PathLike`): A list containing the sharded checkpoint names.
        ignore_mismatched_sizes`bool`, *optional`, defaults to `True`):
            Whether or not to ignore the mismatch between the sizes
        strict (`bool`, *optional*, defaults to `True`):
            Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint.

    Returns:
        Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the
        mismatched layers.
    Nr   r   r   ignore_mismatched_sizesr  r   #Error(s) in loading state_dict for ,c                 S     g | ]}d | d qS "rB   rO   r   rB   rB   rC   
<listcomp>      z+load_tf_sharded_weights.<locals>.<listcomp>
Missing key(s): r   c                 S  r%  r&  rB   r(  rB   rB   rC   r)    r*  )setr   r  r   r  ra   lstripr   r  addload_tf_shardrm   gccollectr   rW   RuntimeError)modelshard_filesr"  strictr  unexpected_keys
saved_keysmismatched_keys
model_keysmodel_layer_mapr   r   
layer_namer  saved_weight_names_setunexpected_keys_setmismatched_keys_setmissing_keyserror_messagestr_missing_keysstr_unexpected_keysrB   rB   rC   load_tf_sharded_weights  sJ   







rC  c                 C  s  t  }i }t  }t  }zt|d~}	t t|	d}
g }|
D ]j}|	| }t|||< || ||vr:|| q| j||  }|| }|durt	||j
krzt|t	|}W n$ ty~ } z|rx|||j
t	|f W Y d}~q|d}~ww |}|||f qW d   n1 sw   Y  t| |||fW S  ty } z?z#t|}| drtdtd| d|1 sw   Y  W n ttfy   td| d	| d
w W Y d}~dS d}~ww )a  
    Loads a shard from a sharded checkpoint file. Can be either H5 or Safetensors.
    Handles missing keys and unexpected keys.

    Args:
        model (`keras.models.Model`): Model in which the weights are loaded
        model_layer_map (`Dict`): A dictionary mapping the layer name to the index of the layer in the model.
        resolved_archive_file (`str`): Path to the checkpoint file from which the weights will be loaded
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys

    Returns:
        `keras.models.Model`: Three lists, one for the layers that were found and successfully restored (from the
        shard file), one for the mismatched layers, and another one for the unexpected layers.
    rlayer_namesNversionYou seem to have cloned a repository without having git-lfs installed. Please install git-lfs and run `git lfs install` followed by `git lfs pull` in the folder you cloned.zUnable to locate the file z_ which is necessary to load this pretrained model. Make sure you have saved the model properly.z4Unable to load weights from TF checkpoint file for 'z' at 'z'. If you tried to load a TF model from a sharded checkpoint, you should try converting the model by loading it in pytorch and saving it locally. A convertion script should be released soon.)r,  h5pyFiler   rK   asarrayr.  r  K	int_shaper9   r   rb   r  batch_set_value	Exceptionopenreadr  OSErrorUnicodeDecodeError)r3  r:  resolved_archive_filer"  r  r<  saved_weightsr8  r6  sharded_checkpoint_filesaved_h5_model_layers_nameweight_value_tuplesr;  h5_layer_objectsymbolic_weightsaved_weight_valuearrayefrB   rB   rC   r/    st   
	
)

r/  c                 C  s   t  }g }t  }|D ]"}t| |||d\}	}
}|t |	 ||
 || t  q
t j| }|r{t|dks@t|dkr{d| jj	 }t|dkr_d
dd |D }|d| d7 }t|dkrwd
d	d |D }|d| d7 }t||||fS )
a  
    This is the same as `load_tf_weights_from_safetensors` but for a sharded TF-format safetensors checkpoint.
    Detect missing and unexpected layers and load the TF weights from the shard file accordingly to their names and
    shapes.

    This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being
    loaded in the model.

    Args:
        model (`keras.models.Model`): The model in which to load the checkpoint.
        shard_files (`str` or `os.PathLike`): A list containing the sharded checkpoint names.
        ignore_mismatched_sizes`bool`, *optional`, defaults to `True`):
            Whether or not to ignore the mismatch between the sizes
        strict (`bool`, *optional*, defaults to `True`):
            Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint.

    Returns:
        Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the
        mismatched layers.
    r!  r   r#  r$  c                 S  r%  r&  rB   r(  rB   rB   rC   r)    r*  z<load_tf_sharded_weights_from_safetensors.<locals>.<listcomp>r+  r   c                 S  r%  r&  rB   r(  rB   rB   rC   r)    r*  )r,   load_tf_weights_from_safetensorsr  rm   r0  r1  intersectionra   r   rW   r  r2  )r3  r4  r"  r5  r  r6  all_missing_keysr8  r  missing_layersunexpected_layersmismatched_layersr?  r@  rA  rB  rB   rB   rC   (load_tf_sharded_weights_from_safetensorsW  s2   




rd  c                 C  s$   | drt}nt}|| |||dS )a  
    Detect missing and unexpected layers and load the TF weights from the shard file accordingly to their names and
    shapes.

    Args:
        model (`keras.models.Model`):
            The model to load the weights into.
        resolved_archive_file (`str`):
            The location of the H5 file.
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
            Whether or not to ignore weights with shapes that don't match between the checkpoint of the model.

    Returns:
        Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the
        mismatched layers.
    r
  r!  )endswithr^  load_tf_weights_from_h5)r3  rS  r"  r  load_functionrB   rB   rC   load_tf_weights  s   
rh  c                 C  s  g }t |d}tt|d}tdd | jD | }t|dd | jD  }t }	t }
g }| jD ]}|j|v r||j }|j|j }i }t|dD ]&}d	|
ddd  }|d urf|d | }t|| ||< |	| qN|D ]}|d urt|
d}d	|j
dd | |j
d|d d   }nd	|j
ddd  }||d }|d u r|d	r|d d
 d }||d }|
| |d urt||jkrzt|t|}W n& ty
 } z|r|||jt|f W Y d }~qw|d }~ww |}|||f qwq4W d    n	1 s!w   Y  t| |t|
|	  |t|	|
  |||fS )NrD  rE  c                 S     h | ]}|j qS rB   r   rO   layerrB   rB   rC   	<setcomp>      z*load_tf_weights_from_h5.<locals>.<setcomp>c                 S  ri  rB   rj  rk  rB   rB   rC   rm    rn  weight_namesr   r   zembeddings:0izweight:0)rH  rI  r,  r   r;   layersr   trainable_weightsnon_trainable_weightsr  r   rK   rJ  r.  ra   getre  rK  rL  r9   r   rb   r  rM  extend)r3  rS  r"  r  rc  rU  rV  ra  rb  r<  symbolic_weights_namesrW  rl  rX  symbolic_weightsrT  r  r   rY  	delimetersymbolic_weight_namerZ  r[  r\  rB   rB   rC   rf    sn   



	
W
rf  c                   s.  t |dd}g } fdd| jD }t| }tt|t| }tt|t| }	| jD ]R}
t|
j d}||v r||}t	|
|j
kr|zt|t	|
}W n& ttjjfy{ } z|ru|||j
t	|
f W Y d }~q0|d }~ww t|
| q0W d    n1 sw   Y  ||	|fS )Nr=   	frameworkc                   s   g | ]	}t |j d qS )r  )r  r   rN   r{  rB   rC   r)    s    z4load_tf_weights_from_safetensors.<locals>.<listcomp>r{  )r0   r  r;   r   r,  r  r   
get_tensorrK  rL  r9   r=   r   rb   errorsInvalidArgumentErrorr  	set_value)r3  rS  r"  r  safetensors_archiverc  ro  loaded_weight_namesra  rb  r  r  weight_valuer\  rB   r{  rC   r^    s2   


r^  c                 C  s   t | \}}|| }tj|drItj|  td|gddggdd}t||}tt|dgd}tj|td|gddggdd}||fS t	|  tddgt||g}tt|dgd}||fS )aV  
    This function aims to reduce the embeddings in case new_num_tokens < old_num_tokens or to pad with -1 in case
    new_num_tokens > old_num_tokens. A mask is also computed in order to know which weight in the embeddings should be
    kept or not. Example:

        - if new_num_tokens=5 and old_num_tokens=4 and old_embeddings=[w1,w2,w3,w4]

            -  mask=[True,True,True,True,False] and current_weights=[w1,w2,w3,w4,-1]
        - if new_num_tokens=4 and old_num_tokens=5 and old_embeddings=[w1,w2,w3,w4,w5]

            - mask=[True,True,True,True] and current_weights=[w1,w2,w3,w4]
    r   r   constant_valuesr   TF)
r   r=   r   greaterpadvalueconvert_to_tensorminfillslice)old_embeddingsnew_num_tokensold_num_tokensold_embedding_dim	size_diffcurrent_weightsnum_tokens_to_copymaskrB   rB   rC   init_copy_embeddings-  s"   
"
r  c                
      s  e Zd ZdZdZdZdZdZdZdZ	dZ
dZdZeddd	Zd
d ZedddZdddZ fddZdd Zeejj fddZeejj fddZeejj fddZeejj fddZeejj fddZeejj fdd Zed!d" Zed#d$ Z dd*d+Z!d,d- Z"e#j$d.d/ Z%edd1d2Z&d3d4 Z'edd6d7Z(dd9d:Z)d;d< Z*	=	>					>dddLdMZ+	N	O					dŇ fdPdQ	Z, fdRdSZ-dTdU Z.dVdW Z/dXdY Z0								dddddeZ1dfdg Z2ddidjZ3dkdl Z4ddmdnZ5ddpdqZ6ddsdtZ7dudv Z8ddwdxZ9	ddd|d}Z:ddddZ;dd Z<dd Z=dd Z>dd Z?dddZ@dd ZAddddZBdddZCdd ZD								ddddZEeddddddddddddZF							ddddZGedddZH  ZIS )TFPreTrainedModela  
    Base class for all TF models.

    [`TFPreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
    downloading and saving models as well as a few methods common to all models to:

        - resize the input embeddings,
        - prune heads in the self-attention heads.

    Class attributes (overridden by derived classes):

        - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
          for this model architecture.
        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
          classes of the same architecture adding modules on top of the base model.
        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
          models, `pixel_values` for vision models and `input_values` for speech models).
    N r   FrH   Dict[str, tf.Tensor]c                 C  s   i }| j  D ].\}}dd |jD }|jd du rd|d< tj||jd||< |dkr5t|| ||< q| jjrbdt	
| jjv rbd|vrb| jd	kr^tjdd
| jjftjdd|d< |S td|S )z|
        Dummy inputs to build the network.

        Returns:
            `Dict[str, tf.Tensor]`: The dummy inputs.
        c                 S  s   g | ]
}|d ur
|ndqS )Nr6   rB   )rO   dimrB   rB   rC   r)        z2TFPreTrainedModel.dummy_inputs.<locals>.<listcomp>r   Nr   )r9   r   token_type_idsencoder_hidden_statesr   r6   r9   r   r   zModel has cross-attention but we couldn't infer the shape for the encoder hidden states. Please manually override dummy_inputs!)input_signaturer   r9   r=   onesr   
zeros_liker\   add_cross_attentionr   r   callr   r   hidden_sizefloat32NotImplementedError)rU   dummiesr   specdummy_shaperB   rB   rC   dummy_inputsx  s(   

zTFPreTrainedModel.dummy_inputsc                 C  s>   t | j | jd d W d    d S 1 sw   Y  d S )N)input_shape)r=   
name_scoper   buildrU   rB   rB   rC   build_in_name_scope  s   "z%TFPreTrainedModel.build_in_name_scoper  c                 C     dS )zC
        :str: Identifies that this is a TensorFlow model.
        r=   rB   r  rB   rB   rC   rz    s   zTFPreTrainedModel.frameworkc                 C  s   d S rJ   rB   rU   r  rB   rB   rC   r    s   zTFPreTrainedModel.buildc                   sp   t  j|i | t|tstd| jj d| jj d|| _|j| _| 	 r-t
|nd | _| | j d S )NzParameter config in `zt(config)` should be an instance of class `PretrainedConfig`. To create a model from a pretrained model use `model = z(.from_pretrained(PRETRAINED_MODEL_NAME)`)rk   rr   r]   r   rx   r   rW   r\   name_or_pathcan_generater   from_model_configgeneration_config_set_save_specr  )rU   r\   r   rf   r   rB   rC   rr     s   
zTFPreTrainedModel.__init__c                 C  s
   | j  S rJ   )r\   rl   r  rB   rB   rC   ri     s   
zTFPreTrainedModel.get_configc                   $   t |i |\}}t j|i |S rJ   )r   rk   fitrU   re   rf   r  rB   rC   r       zTFPreTrainedModel.fitc                   r  rJ   )r   rk   train_on_batchr  r  rB   rC   r    r  z TFPreTrainedModel.train_on_batchc                   r  rJ   )r   rk   test_on_batchr  r  rB   rC   r    r  zTFPreTrainedModel.test_on_batchc                   r  rJ   )r   rk   predict_on_batchr  r  rB   rC   r    r  z"TFPreTrainedModel.predict_on_batchc                   r  rJ   )r   rk   predictr  r  rB   rC   r    r  zTFPreTrainedModel.predictc                   r  rJ   )r   rk   evaluater  r  rB   rC   r    r  zTFPreTrainedModel.evaluatec                 K  s6   t |tr| j|fi |S | | jj|fi |S rJ   )r]   r   _from_configr[   r`   rp   r\   rf   rB   rB   rC   from_config  s   
zTFPreTrainedModel.from_configc                 K  s   | |fi |S )zZ
        All context managers that the model should be initialized under go here.
        rB   r  rB   rB   rC   r    s   zTFPreTrainedModel._from_config	head_masktf.Tensor | Nonenum_hidden_layersrI   	tf.Tensorc                 C  s&   |dur|  ||}|S dg| }|S )a$  
        Prepare the head mask if needed.

        Args:
            head_mask (`tf.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
                The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
            num_hidden_layers (`int`):
                The number of hidden layers in the model.

        Returns:
            `tf.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with
            `[None]` for each layer.
        N)_convert_head_mask_to_5drU   r  r  rB   rB   rC   get_head_mask  s
   
zTFPreTrainedModel.get_head_maskc                 C  s   |j jdkr|ddddddf }tj||dd}n|j jdkr-|dddddddf }|j jdks<J d|  t|tj}|S )zD-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]r   Nr   )repeatsr8   r6      zhead_mask.dim != 5, instead )r9   r:   r=   repeatr  r   r  r  rB   rB   rC   r    s   z*TFPreTrainedModel._convert_head_mask_to_5dc                 C  s   |  |}| |S )a6  
        Args:
        Method used for serving the model. Does not have a specific signature, but will be specialized as concrete
        functions when saving with `save_pretrained`.
            inputs (`Dict[str, tf.Tensor]`):
                The input of the saved model as a dictionary of tensors.
        )r  serving_output)rU   r   r   rB   rB   rC   serving  s   
	
zTFPreTrainedModel.servingDict[str, tf.TensorSpec]c                 C  s  t t| jj}i }d|v r2| jjdrd}nd}dD ]}||v r1tj	dg| tj
|d||< qd|v rg d	}t| jd
rE| jj}n| j}t|drS|j|d< ntdt|drf|j |d< |d< nt|dru|j |d< |d< ntdtj	|tjdd|d< d|v rtd|S )z
        This property should return a dict mapping input names to tf.TensorSpec objects, representing the expected
        shape and dtype for model inputs. It is used for both serving and for generating dummy inputs.
        r   ForMultipleChoice   r6   )r   attention_maskr  decoder_input_idsdecoder_attention_maskNrj  pixel_values)NNNNvision_confignum_channelsr   zhCould not infer number of channels from config, please override input_signature to specify input shapes.
image_size
input_sizezgCould not infer input image shape from config, please override input_signature to specify input shapes.input_featuresz4Audio models need a manually defined input_signature)r;   r   r   r  r   r   rW   re  r=   
TensorSpecr   rw   r\   r  r  r  r  r  r  )rU   model_inputssig	text_dims
input_namepixel_values_shaper  rB   rB   rC   r    s>   



z!TFPreTrainedModel.input_signaturec              
   C  s   t |ts|S |D ]l}|drt| jddsd||< n7|dr-t| jdds-d||< n&|dkr=t| jdds=d||< n|d	krSt| jddrOt| jd
dsSd||< t || ttfruzt|| ||< W q	 t	tj
jfyt   Y q	w q	|S )zz
        Prepare the output of the saved model. Can be overridden if specific serving modifications are required.
        hidden_statesr   FN
attentionsr   r   r   cross_attentionsr  )r]   r!   re  rs   r\   r   r;   r=   r  rb   r}  r~  )rU   r   r   rB   rB   rC   r  =  s.   



z TFPreTrainedModel.serving_outputrG   c                 C  s$   dt | jv rdt | jv rdS dS )z
        Returns whether this model can generate sequences with `.generate()`.

        Returns:
            `bool`: Whether this model can generate sequences with `.generate()`.
        GenerationMixinFT)r  prepare_inputs_for_generationgeneratero   rB   rB   rC   r  U  s   
zTFPreTrainedModel.can_generatekeras.layers.Layerc                 C  s"   t | | j| }|| ur| S t)z
        Returns the model's input embeddings layer.

        Returns:
            `tf.Variable`: The embeddings layer mapping vocabulary to hidden states.
        )rs   base_model_prefixget_input_embeddingsr  )rU   
main_layerrB   rB   rC   r  c  s   z&TFPreTrainedModel.get_input_embeddingsc                 C  s   t j|st | t j|d}| | || j d}t j|d}t|d}t	
|| W d    d S 1 s=w   Y  d S )Nz
weights.h5)epochoptimizer_statezextra_data.picklewb)ospathisdirmkdirr  save_weights	optimizerget_weightsrO  pickledump)rU   checkpoint_dirr  weights_path
extra_dataextra_data_pathr]  rB   rB   rC   _save_checkpointq  s   

"z"TFPreTrainedModel._save_checkpoint   Tdataset'datasets.Dataset'
batch_sizeshuffle	tokenizer#Optional['PreTrainedTokenizerBase']
collate_fnOptional[Callable]collate_fn_argsOptional[Dict[str, Any]]drop_remainderOptional[bool]prefetchc	              
     sl  t | dg ddl}	|du r|du rtdd}nt|dd}|du r$i }t||	js.tdtt	| j
j t| jdtt	|jj v rV|j|d|| d	\}
}n fd
d|jD }||}|j|d||d\}
}t|
 } fdd|D }fdd|D }t|dkr|d n|}t|dkr|d n|}|du r|}|j||||||||d}|S )a  
        Wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset` with collation and batching. This method is
        designed to create a "ready-to-use" dataset that can be passed directly to Keras methods like `fit()` without
        further modification. The method will drop columns from the dataset if they don't match input names for the
        model. If you want to specify the column names to return rather than using the names that match this model, we
        recommend using `Dataset.to_tf_dataset()` instead.

        Args:
            dataset (`Any`):
                A [~`datasets.Dataset`] to be wrapped as a `tf.data.Dataset`.
            batch_size (`int`, *optional*, defaults to 8):
                The size of batches to return.
            shuffle (`bool`, defaults to `True`):
                Whether to return samples from the dataset in random order. Usually `True` for training datasets and
                `False` for validation/test datasets.
            tokenizer ([`PreTrainedTokenizerBase`], *optional*):
                A `PreTrainedTokenizer` that will be used to pad samples to create batches. Has no effect if a specific
                `collate_fn` is passed instead.
            collate_fn (`Callable`, *optional*):
                A function that collates samples from the dataset into a single batch. Defaults to
                `DefaultDataCollator` if no `tokenizer` is supplied or `DataCollatorWithPadding` if a `tokenizer` is
                passed.
            collate_fn_args (`Dict[str, Any]`, *optional*):
                A dict of arguments to pass to the `collate_fn` alongside the list of samples.
            drop_remainder (`bool`, *optional*):
                Whether to drop the final batch, if the batch_size does not evenly divide the dataset length. Defaults
                to the same setting as `shuffle`.
            prefetch (`bool`, defaults to `True`):
                Whether to add prefetching to the end of the `tf.data` pipeline. This is almost always beneficial for
                performance, but can be disabled in edge cases.


        Returns:
            `Dataset`: A `tf.data.Dataset` which is ready to pass to the Keras API.
        datasetsr   NrK   )return_tensors)r   r
  z.Dataset argument should be a datasets.Dataset!cols_to_retain)r  r  r  r  c                   s    g | ]}| vr|d vr|qS ))	label_idslabelrB   )rO   feature)r  rB   rC   r)    s
    z8TFPreTrainedModel.prepare_tf_dataset.<locals>.<listcomp>)r  r  r  c                   s    g | ]}| v r|vr|qS rB   rB   rO   colr  model_labelsrB   rC   r)         c                   s   g | ]}| v r|qS rB   rB   r  )r  rB   rC   r)    r*  r   )columns
label_colsr  r  r  r  r  r  )r,   r	  r   r   r]   Datasetrx   r;   r   r   r  r   r%   r   _get_output_signaturer   featuresremove_columnsra   to_tf_dataset)rU   r  r  r  r   r  r  r  r  r	  output_signature_unwanted_columnsoutput_columnsfeature_colsr  
tf_datasetrB   r  rC   prepare_tf_dataset~  sZ   .





z$TFPreTrainedModel.prepare_tf_datasetrmspropauto_with_warningc           
   
     s   |dv rt d d}|dkrt}d| _nd| _tttjj	j
 }	d|	v r;t j	d
|||||||d| d	S t j	d
|||||||d| d	S )z
        This is a thin wrapper that sets the model's loss output head as the loss if the user does not specify a loss
        function themselves.
        )r#  passthrougha  No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss. You can also specify `loss='auto'` to get the internal loss without printing this info string.autoTFsteps_per_execution)r  lossmetricsloss_weightsweighted_metricsrun_eagerlyr&  )r  r'  r(  r)  r*  r+   experimental_steps_per_executionNrB   )r   inforD   _using_dummy_lossr;   r   r   rz   Modelcompiler   r   rk   )
rU   r  r'  r(  r)  r*  r+  r&  rf   parent_argsr  rB   rC   r0    sB   

zTFPreTrainedModel.compilec                   s:   t tjdrt j|i |S tdt | j|i |S )Ncompute_lossa0  The old compute_loss method is deprecated as it conflicts with the Keras compute_loss method added in TF 2.8. If you want the original HF compute_loss, please call hf_compute_loss() instead. From TF versions >= 2.8, or Transformers versions >= 5, calling compute_loss() will get the Keras method instead.)	rw   rz   r/  rk   r2  r   r   r   r   r  r  rB   rC   r2    s   zTFPreTrainedModel.compute_lossc                 C  sn   t t| jj}| jd ur| jS d|v rdddS d|v r#dddS d	|v r,dd
dS d|v r5dddS i S )Nstart_positionsstart_logits
end_logits)r3  end_positionssentence_order_labelprediction_logits
sop_logits)r   r7  r   seq_relationship_logits)r   r   	mc_labelsr   	mc_logits)r   r;  )r;   r   r   r  r   _label_to_output_map)rU   	arg_namesrB   rB   rC    get_label_to_output_name_mapping+  s   




z2TFPreTrainedModel.get_label_to_output_name_mappingc                   s  t t| jj}t| j |  dd  D }| j	s+t
tjt
dk r+t|}tj|\}}t|tr=| }ttrF | j	rdurt dkrtttjrtt|tjre|d |i}tt }||vrs||< n9ttrt|tjr|d |i} D ]#\}}||v r||vr|||< q||d|v r||vr|||| < qdu rŇ fdd| D s| j	stdttrՇfd	d D t }	| j	rd
|v r| |ddd}
n| |dd}
| j	r| j|
j|
j|| jd}nd}ttrAtdkrAt  d |
 v r&|
t  d  }
nt |
 d dkr6|
d }
n|
d }
 \}nSttrSfdd|
 D }
nAtt s_tt rt |
 d dkrs|
! dd }
n|
! }
|
dt }
nt |
 d dkr|
d }
n|
d }
|du r| j|
|| jd}W d   n	1 sw   Y  | j"j#|| j$|	d | j%&|
| i }| j'D ]}|( }t|tr|)| q|||j*< q|S )  
        A modification of Keras's default `train_step` that correctly handles matching outputs to labels for our models
        and supports directly training on the loss output head. In addition, it ensures input keys are copied to the
        labels where appropriate. It will also copy label keys into the input dict when using the dummy loss, to ensure
        that they are available to the model during the forward pass.
        c                 S     i | ]\}}||qS rB   rB   r   rB   rB   rC   r   F      z0TFPreTrainedModel.train_step.<locals>.<dictcomp>2.11.0Nr   r   c                      i | ]\}}| v r||qS rB   rB   r   label_kwargsrB   rC   r   g  r   RCould not find label column(s) in input dict and no separate labels were provided!c                      i | ]\}}  |||qS rB   rs  r   label_to_outputrB   rC   r   m  r   return_lossT)trainingrL  rM  regularization_lossesr'  c                   rD  rB   rB   r   yrB   rC   r     r   )tape)+r;   r   r   r  r   r%   r   r?  r   r.  r   r=   __version__r   rz   r{   unpack_x_y_sample_weightr]   r_   copyra   r   nextiterrs  rb   GradientTapecompiled_lossr'  r   r   popitemr   to_tupler  minimizerS   compiled_metricsupdate_stater(  resultrm   r   )rU   datar>  output_to_labelxsample_weightlabel_kwargr   r   rS  r@   r'  r  return_metricsmetricr`  rB   rF  rK  rR  rC   
train_step:  s   	










)
zTFPreTrainedModel.train_stepc                   s  t t| jj}t| j |  dd  D }| j	s+t
tjt
dk r+t|}tj|\}}t|tr=| }ttrF | j	rdurt t| jj}t dkr}ttjr}t|tjrn|d |i}tt }||vr|||< n9ttrt|tjr|d |i} D ]#\}}||v r||vr|||< q||d|v r||vr|||| < qdu r· fdd| D s| j	stdttrއfd	d D | j	rd
|v r| |ddd}	n| |dd}	| j	r| j|	j|	j|| jd}
nd}
ttrFtdkrFt  d |	 v r+|	t  d  }	nt |	 d dkr;|	d }	n|	d }	 \}nSttrXfdd|	 D }	nAttsdtt rt |	 d dkrx|	  dd }	n|	  }	|	dt }	nt |	 d dkr|	d }	n|	d }	|
du r| j|	|| jd}
| j!"|	| i }| j#D ]}|$ }t|tr|%| q|||j&< q|S )r@  c                 S  rA  rB   rB   r   rB   rB   rC   r     rB  z/TFPreTrainedModel.test_step.<locals>.<dictcomp>rC  Nr   r   c                   rD  rB   rB   r   rE  rB   rC   r     r   rG  c                   rH  rB   rI  r   rJ  rB   rC   r     r   rL  TF)rL  rM  rN  rO  r'  c                   rD  rB   rB   r   rQ  rB   rC   r     r   )'r;   r   r   r  r   r%   r   r?  r   r.  r   r=   rT  r   rz   r{   rU  r]   r_   rV  ra   r   rW  rX  rs  rb   rZ  r'  r   r   r[  r   r\  r^  r_  r(  r`  rm   r   )rU   ra  r>  rb  rc  rd  re  r   r   r@   r'  r  rf  rg  r`  rB   rh  rC   	test_step  s   










zTFPreTrainedModel.test_step
model_namelanguageOptional[str]licensetagsfinetuned_fromtasksdataset_tagsOptional[Union[str, List[str]]]dataset_argsc                 C  sz   ddl m} |j| | j||||||||	|
d}| }ttj|dd}|	| W d   dS 1 s6w   Y  dS )at  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            output_dir (`str` or `os.PathLike`):
                The folder in which to create the model card.
            model_name (`str`, *optional*):
                The name of the model.
            language (`str`, *optional*):
                The language of the model (if applicable)
            license (`str`, *optional*):
                The license of the model. Will default to the license of the pretrained model used, if the original
                model given to the `Trainer` comes from a repo on the Hub.
            tags (`str` or `List[str]`, *optional*):
                Some tags to be included in the metadata of the model card.
            finetuned_from (`str`, *optional*):
                The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
                of the original model given to the `Trainer` (if it comes from the Hub).
            tasks (`str` or `List[str]`, *optional*):
                One or several task identifiers, to be included in the metadata of the model card.
            dataset_tags (`str` or `List[str]`, *optional*):
                One or several dataset tags, to be included in the metadata of the model card.
            dataset (`str` or `List[str]`, *optional*):
                One or several dataset identifiers, to be included in the metadata of the model card.
            dataset_args (`str` or `List[str]`, *optional*):
               One or several dataset arguments, to be included in the metadata of the model card.
        r   )TrainingSummary)
keras_historyrl  rn  ro  rk  rp  rq  rr  r  rt  z	README.mdrP   N)
	modelcardru  
from_kerashistoryto_model_cardrO  r  r  r  write)rU   
output_dirrk  rl  rn  ro  rp  rq  rr  r  rt  ru  training_summary
model_cardr]  rB   rB   rC   create_model_card  s$   )"z#TFPreTrainedModel.create_model_cardc                 C  s^   t | | j}|du rtdz|| W dS  ty.   td |   || Y dS w )z
        Set model's input embeddings

        Args:
            value (`tf.Variable`):
                The new weights mapping hidden states to vocabulary.
        Nz>The model does not implements the base_model_prefix attribute.Building the model)rs   r  r  set_input_embeddingsrt   r   r-  r  )rU   r  r  rB   rB   rC   r  J  s   
z&TFPreTrainedModel.set_input_embeddingsUnion[None, keras.layers.Layer]c                 C  sR   |   dur'|   }z| W S  ty&   td |   |   Y S w dS )z
        Returns the model's output embeddings

        Returns:
            `tf.Variable`: The new weights mapping vocabulary to hidden states.
        Nr  )get_lm_headget_output_embeddingsrt   r   r-  r  rU   lm_headrB   rB   rC   r  ^  s   

z'TFPreTrainedModel.get_output_embeddingsc                 C  sZ   |   dur+|   }z|| W dS  ty*   td |   || Y dS w dS )z
        Set model's output embeddings

        Args:
            value (`tf.Variable`):
                The new weights mapping hidden states to vocabulary.
        Nr  )r  set_output_embeddingsrt   r   r-  r  rU   r  r  rB   rB   rC   r  r  s   
z'TFPreTrainedModel.set_output_embeddingsc                 C  s   t dt |  S )z
        Get the layer that handles a bias attribute in case the model has an LM head with weights tied to the
        embeddings

        Return:
            `keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
        zVThe method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.)r   r   r   r  r  rB   rB   rC   get_output_layer_with_bias  s   z,TFPreTrainedModel.get_output_layer_with_biasUnion[None, str]c                 C  s   t dt dS )z
        Get the concatenated _prefix name of the bias from the model name to the parent layer

        Return:
            `str`: The _prefix name of the bias.
        zMThe method get_prefix_bias_name is deprecated. Please use `get_bias` instead.N)r   r   r   r  rB   rB   rC   get_prefix_bias_name  s   z&TFPreTrainedModel.get_prefix_bias_name#Union[None, Dict[str, tf.Variable]]c                 C  sF   |   dur!|   }z| W S  ty    |   |  Y S w dS )z
        Dict of bias attached to an LM head. The key represents the name of the bias attribute.

        Return:
            `tf.Variable`: The weights representing the bias, None if not an LM model.
        N)r  get_biasrt   r  r  rB   rB   rC   r    s   
zTFPreTrainedModel.get_biasc                 C  sP   |   dur&|   }z|| W dS  ty%   |   || Y dS w dS )z
        Set all the bias in the LM head.

        Args:
            value (`Dict[tf.Variable]`):
                All the new bias attached to an LM head.
        N)r  set_biasrt   r  r  rB   rB   rC   r    s   zTFPreTrainedModel.set_biasc                 C  r  )z
        The LM Head layer. This method must be overwritten by all the models that have a lm head.

        Return:
            `keras.layers.Layer`: The LM head layer if the model has one, None if not.
        NrB   r  rB   rB   rC   r    s   zTFPreTrainedModel.get_lm_headr  Optional[int]*Union[keras.layers.Embedding, tf.Variable]c                 C  sT   t |  tjjr| |S |du s|| jjkr| |  S | 	|}|| j_|S )a  
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens without doing anything.

        Return:
            `tf.Variable` or `keras.layers.Embedding`: Pointer to the input tokens of the model.
        N)
r]   r  rz   rp  	Embedding_v2_resized_token_embeddingsr\   
vocab_size_get_word_embedding_weight_resize_token_embeddingsrU   r  model_embedsrB   rB   rC   resize_token_embeddings  s   

z)TFPreTrainedModel.resize_token_embeddingskeras.layers.Embeddingc                 C  s2   |du s
|| j jkr|  S | |}|| j _|S )aJ  
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens without doing anything.

        Return:
            `keras.layers.Embedding`: Pointer to the input tokens of the model.
        N)r\   r  r  _v2_resize_token_embeddingsr  rB   rB   rC   r    s
   
z.TFPreTrainedModel._v2_resized_token_embeddingsc                 C  s|   t |tjr|S t|dd }|d ur|S t|dd }|d ur |S |   t|dd }|d ur0|S t|dd }|d ur<|S d S )Nr  decoder)r]   r=   r   rs   r  )r3  embedding_layerembedsrB   rB   rC   r    s    z,TFPreTrainedModel._get_word_embedding_weightc                 C  s   |  |  }| ||}|  d ur"|  }| ||}| | |  d ur:|  |  }| ||}| | | 	| |  S rJ   )
r  r  _get_resized_embeddingsr  _get_resized_lm_head_biasr  r  _get_resized_lm_head_decoderr  r  )rU   r  r  new_embeddingsold_lm_head_biasnew_lm_head_biasold_lm_head_decodernew_lm_head_decoderrB   rB   rC   r    s   


z*TFPreTrainedModel._resize_token_embeddingsc           	      C  s   |   }| ||}| | |  d ur$|  }| ||}| | |   |  k}|  d urF|sF| |  }| ||}| 	| |   S rJ   )
r  _v2_get_resized_embeddingsr  r  _v2_get_resized_lm_head_biasr  r  r  r  r  )	rU   r  r  r  r  r  tied_weightsr  r  rB   rB   rC   r  2  s   


z-TFPreTrainedModel._v2_resize_token_embeddingsc                 C  s~  i }|  D ]\}}t|dkrdt|d fnt|\}}|| }|du r*|gn||g}	tj|drx|du r>d|ggnddgd|gg}
tj| t|
dd}t	||}|du r_|gnd|g}t
t|d}tj|t|
dd}n#|du rdgnddg}t| t|t|	}t
t|	d}| j|	dd|jd	d d
}t||| }|| |||< q|S )a  
        Build a resized bias from the old ones. Increasing the size will add newly initialized vectors at the end.
        Reducing the size will remove vectors from the end

        Args:
            old_lm_head_bias (`tf.Variable`):
                Old lm head bias to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the linear matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns None

        Return:
            `tf.Variable`: Pointer to the resized bias.
        r   Nr   r   r  TFzerosr   r9   rg   	trainabler   )r   r=   r:   r   r   r  r  r  r  r  r  r  
add_weightr   r   whereassign)rU   r  r  r  attrr  	first_dimr  r  final_shapepadding_shapecurrent_biasr  
mask_shape	bias_mask
slice_fromnew_bias	init_biasrB   rB   rC   r  G  s6   *"


z+TFPreTrainedModel._get_resized_lm_head_biasr  Dict[str, tf.Variable]c                 C  s   i }|  D ]L\}}t|dkrdt|d fnt|\}}|| }||kr2| dd|f }	n|du r;d|ggnddgd|gg}
t| t|
}	|	||< q|S )a\  
        Build a resized bias from the old ones. Increasing the size will add newly initialized vectors at the end.
        Reducing the size will remove vectors from the end

        Args:
            old_lm_head_bias (`Dict[str, tf.Variable]`):
                Old lm head bias to be resized.
            new_num_tokens (`int`):
                New number of tokens in the linear matrix. Increasing the size will add newly initialized vectors at
                the end. Reducing the size will remove vectors from the end.

        Return:
            `tf.Tensor`: Values for the resized bias.
        r   Nr   .)r   r=   r:   r   r  r  r  )rU   r  r  r  r  r  r  r  r  r  r  rB   rB   rC   r  |  s   *"
z.TFPreTrainedModel._v2_get_resized_lm_head_biasc           	      C  s   |}t | |  |k}|dur@|s@t|d }t||\}}| j||fdd|jdd d}t 	|||
 }|| |S )a  
        Build a resized decoder from the old ones. Increasing the size will add newly initialized vectors at the end.
        Reducing the size will remove vectors from the end

        Args:
            old_lm_head_decoder (`tf.Variable`):
                Old lm head decoder to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the linear matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns None

        Return:
            `tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the input
            ones.
        Nr   r  Tr   r   r  )r=   r   r  r  r   r  r  r   r   r  r  r  )	rU   r  r  r  is_input_output_equalsr  decoder_maskcurrent_decoderinit_decoderrB   rB   rC   r    s    
z.TFPreTrainedModel._get_resized_lm_head_decodertf.Variablec           	      C  sp   t |d }t| jdd}t||\}}| j|jdd ||gt|tj	d}t
||| }|| |S )as  
        Build a resized Embedding weights from a provided token Embedding weights. Increasing the size will add newly
        initialized vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_embeddings (`tf.Variable`):
                Old embeddings to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the embedding matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `tf.Variable` module of the model without doing anything.

        Return:
            `tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if `new_num_tokens` is
            `None`
        r   initializer_range{Gz?r   r   )r   r9   rg   r   )r   rs   r\   r  r  r   r   get_initializerr=   r  r  r  r  )	rU   r  r  r  
init_rangeembeddings_maskcurrent_embeddingsr  init_embeddingsrB   rB   rC   r    s   
z)TFPreTrainedModel._get_resized_embeddingsr  c                 C  s   d}g d}|D ]}t | j|rt| j|}qtjj||jtjj|d|j	j
dd d}|tdgg |j|krC|j	d| }ntj|j	|j	|jd gdd}|j	| |S )	a  
        Build a resized Embedding layer from a provided Embedding layer. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end.

        Args:
            old_embeddings (`keras.layers.Embedding`):
                Old embeddings to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the embedding matrix.

        Return:
            `keras.layers.Embedding`: Resized Embedding layer.
        r  )r  initializer_factorinit_stdstddevNi)	input_dim
output_dimembeddings_initializerr   r   r7   )rw   r\   rs   rz   rp  r  r  initializersTruncatedNormal
embeddingsr   r=   constantr  concatr  )rU   r  r  r  'potential_initialization_variable_namesvar_namer  r  rB   rB   rC   r    s(   
z,TFPreTrainedModel._v2_get_resized_embeddingsc                 C  s   t )a  
        Prunes heads of the base model.

        Arguments:
            heads_to_prune (`Dict[int, List[int]]`):
                Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads
                to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on
                layer 1 and heads 2 and 3 on layer 2.
        )r  )rU   heads_to_prunerB   rB   rC   prune_heads	  s   
zTFPreTrainedModel.prune_headsr   5GBr  Union[int, str]	create_prsafe_serializationtokenOptional[Union[str, bool]]c
           %   	   K  s  |
 dd}|durtdt |	durtd|}	|	dur"|	|
d< tj|r3t	d| d dS tj
|dd	 |r]|
 d
d}|
 d|tjjd }| j|fi |
}| |}|rt| jdddur|t| jjts|t| jjdd | j_|du r| j| j}tdd | j D rdd | j D }| j|}||d}n|}tj|dt|}| j|d|d td|  | jjdd g| j_| j durt!| || jd | j"| | # r| j$"| |rt%nt&}tj||}t'| j(||d\}}t)|D ]-}tj||}|*dd*dd}|+|r5tj|r5||, vr5t-| q	|du r_|rQdd | j(D }t.||d d!id" n| /| td#|  n|rdt0nt1}tj||}t2|d$d%d&}t3j4|ddd'd( }|5| W d   n	1 sw   Y  td)| d*t6| d+| d | D ]\}}|rd,d |D } t.| tj||d d!id" qt7j8tj||d$d-\}g }!t9|d.d/ d0D ]C}"d1|"j:v st6|"j:d2dkr|"j:}#nd2|"j:d2dd }#|j;|#|"< j=|"< j>d3}$|"< |$dd< |!?|#@d4 qtA|d5|! W d   n	1 s3w   Y  q|rI| jB|||||	d6 dS dS )7a<
  
        Save a model and its configuration file to a directory, so that it can be re-loaded using the
        [`~TFPreTrainedModel.from_pretrained`] class method.

        Arguments:
            save_directory (`str`):
                Directory to which to save. Will be created if it doesn't exist.
            saved_model (`bool`, *optional*, defaults to `False`):
                If the model has to be saved in saved model format as well or not.
            version (`int`, *optional*, defaults to 1):
                The version of the saved model. A saved model needs to be versioned in order to be properly loaded by
                TensorFlow Serving as detailed in the official documentation
                https://www.tensorflow.org/tfx/serving/serving_basic
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            signatures (`dict` or `tf.function`, *optional*):
                Model's signature used for serving. This will be passed to the `signatures` argument of model.save().
            max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).

                <Tip warning={true}>

                If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
                which will be bigger than `max_shard_size`.

                </Tip>

            create_pr (`bool`, *optional*, defaults to `False`):
                Whether or not to create a PR with the uploaded files or directly commit.
            safe_serialization (`bool`, *optional*, defaults to `False`):
                Whether to save the model using `safetensors` or the traditional TensorFlow way (that uses `h5`).
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        use_auth_tokenNrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.V`token` and `use_auth_token` are both specified. Please set only the argument `token`.r  zProvided path (z#) should be a directory, not a fileT)exist_okcommit_messagerepo_idr   torch_dtyper   r   c                 s  s    | ]	}|j tjkV  qd S rJ   )r   r=   r   )rO   r  rB   rB   rC   rQ   z	  s    z4TFPreTrainedModel.save_pretrained.<locals>.<genexpr>c                 S  s:   i | ]\}}|t j|j|jt jkrt jn|j|jd qS )r  )r=   r  r9   r   r   r   r   )rO   r   r  rB   rB   rC   r   {	  s    z5TFPreTrainedModel.save_pretrained.<locals>.<dictcomp>)serving_defaultint64_servingsaved_modelF)include_optimizer
signatureszSaved model created in r6   )r\   )r  z.binr  r
  c                 S     i | ]
}t |j| qS rB   r  r   r  rN   rB   rB   rC   r   	  r  formatr=   )r  zModel weights saved in rP   zutf-8)encoding)indent	sort_keys
z:The model is bigger than the maximum size per checkpoint (z) and is going to be split in z^ checkpoint shards. You can find where each parameters has been saved in the index located at c                 S  r  rB   r  rN   rB   rB   rC   r   	  r  )modec                 S  s   | j S rJ   rj  )rc  rB   rB   rC   <lambda>	  s    z3TFPreTrainedModel.save_pretrained.<locals>.<lambda>)r   r   r   r   utf8rE  )r  r  )Cr^   r   r   r   rb   r  r  isfiler   errormakedirsr   sep_create_repo_get_files_timestampsrs   r\   r]   r  r  r  get_concrete_functionr  anyvaluesr   r  saver-  r   rW   architectures_auto_classr   save_pretrainedr  r  r   r   r   r  listdirr  r  r   removesafe_save_filer  r   r   rO  jsondumpsr{  ra   rH  rI  sortedr   create_datasetr  r9   r   r  encoder   _upload_modified_files)%rU   save_directoryr  rF  push_to_hubr  r  r  r  r  rf   r  r  r  files_timestampsr  
int64_specr  saved_model_dirr  output_model_filer  r  filenamefull_filenameweights_no_suffix
state_dictsave_index_file
index_filecontentr  r  shard_state_dictrp  rl  r;  
param_dsetrB   rB   rC   r  "	  s   5
 





"
z!TFPreTrainedModel.save_pretrainedmain)r\   	cache_dirr"  force_downloadlocal_files_onlyr  revisionuse_safetensorspretrained_model_name_or_path!Optional[Union[str, os.PathLike]]r\   3Optional[Union[PretrainedConfig, str, os.PathLike]]r  r"  r  r  r  r  c          3        s  | dd}| dd}| dd}| dd}| dd}| dd}| d	d}| d
d}| dd}| dd}| dd}| dd}| dd}| dd}|durhtdt |durftd|}|du rqtd dd|d}|dur||d< t r|std d}|	du rt	 sd}	t
|ts|dur|n|}| jj|f|d|||||||||d|\}}n|}|du rt|dd}d}|durt|}tj|}|r|rtjtj|trtj|t}n|r
tjtj|tr
tj|t}d}n|	dur$tjtj|tr$tj|t}n|	dur@tjtj|tr@tj|t}d}nptjtj|trUtj|t}n[tjtj|trltj|t}d}nD|	r}tdt dt d| dtjtj|tstjtj|trtdt dt d| d tdt d!t dt d| d"	tj|r|}d}ntj|d# r|d# }d}nt|r|}t|} n|rt}n
|	durt}nt}z|||||||||dd|d$}!t||fi |!} | du r|tkrt}t|tfi |!} | du r4|tkr4t|tfi |!} | dur4d}| du rN|tkrNt|tfi |!} | durNd}| du r|||||d%}"t|tfi |"rhd}n#t|tfi |"r|t| d&t d t| d&t d!t dt  W n# ty     t!y   td'| d(| d)t d!t dt  
w |rtd*|  |} | "tjj#d+ }ntd*| d,|   nd} |rt$|| |||||||||d-\} }#d}$|tkr(t%| d.d/}%|%& }&W d   n	1 sw   Y  |&du s|&'d0d1vr t(d2|  d3|&'d0d4k}$n>|tkrft%| d5 d.d/}%|%& }&W d   n	1 sEw   Y  |&du sW|&'d0d1vr_t(d2|  d3|&'d0d4k}$||_)| j*r|'d6dur|d7 |'d6 |d
< | |g|
R i |}'|du rt+|'dr|'j,}|rd8d9l-m.}( |(|'| d|||d:S |durt/j0j12| |'3  W d   n	1 sw   Y  n|'3  |$r|sd8d;l-m4}) t%| d.d/}*|)|'|*dd||||d<W  d   S 1 sw   Y  n|$rd8d=l-m5}+ |+|'| dd||||d<S z>|rI| D ]},tj|,d>|, f q|tkr=t6|'| ||d?\}-}.}/nt7|'| ||d?\}-}.}/nt8|'| ||d?\}-}.}/W nA t(y }0 z4zt9| }%|%: ;d@rpt(dAt|01 sww   Y  W n t<tfy   t(dBw W Y d}0~0nd}0~0ww | j=dur| j=D ]  fdCdD|-D }-q| j>dur| j>D ]  fdEdD|.D }.qt?|.d5krtdF| dG|'j@jA dH|. dI|'j@jA dJ|'j@jA dK ntdL|'j@jA dM t?|-d5kr
tdN|'j@jA dO| dP|- dQ nt?|/d5kr$tdR|'j@jA dS| dT|'j@jA dU t?|/d5krFdVdWdD |/D }1tdX|'j@jA dO| dY|1 dQ |'B rrztCj|f||||||||||dZ
||'_DW n t(yq   td[ Y nw |r|-|.|/d\}2|'|2fS |'S )]aS!  
        Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.

        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
        task.

        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
        weights are discarded.

        Parameters:
            pretrained_model_name_or_path (`str`, *optional*):
                Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
                      case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
                      argument. This loading path is slower than converting the PyTorch model in a TensorFlow model
                      using the provided conversion scripts and loading the TensorFlow model afterwards.
                    - `None` if you are both providing the configuration and state dictionary (resp. with keyword
                      arguments `config` and `state_dict`).
            model_args (sequence of positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.
            config (`Union[PretrainedConfig, str]`, *optional*):
                Can be either:

                    - an instance of a class derived from [`PretrainedConfig`],
                    - a string valid as input to [`~PretrainedConfig.from_pretrained`].

                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                be automatically loaded when:

                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                      model).
                    - The model was saved using [`~TFPreTrainedModel.save_pretrained`] and is reloaded by supplying the
                      save directory.
                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                      configuration JSON file named *config.json* is found in the directory.
            from_pt (`bool`, *optional*, defaults to `False`):
                Load the model weights from a PyTorch state_dict save file (see docstring of
                `pretrained_model_name_or_path` argument).
            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
                as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
                checkpoint with 3 labels).
            cache_dir (`str`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies:
                (`Dict[str, str], `optional`): A dictionary of proxy servers to use by protocol or endpoint, e.g.,
                `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
                output_loading_info(`bool`, *optional*, defaults to `False`): Whether ot not to also return a
                dictionary containing missing keys, unexpected keys and error messages.
            local_files_only(`bool`, *optional*, defaults to `False`):
                Whether or not to only look at local files (e.g., not try downloading the model).
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.


                <Tip>

                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

                </Tip>

            mirror (`str`, *optional*):
                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
                Please refer to the mirror site for more information.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            tf_to_pt_weight_rename (`Callable`, *optional*):
                A function that is called to transform the names of weights during the PyTorch to TensorFlow
                crossloading process. This is not necessary for most models, but is useful to allow composite models to
                be crossloaded correctly.
            use_safetensors (`bool`, *optional*, defaults to `None`):
                Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors`
                is not installed, it will be set to `False`.
            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                automatically loaded:

                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                      already been done)
                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
                      corresponds to a configuration attribute will be used to override said attribute with the
                      supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
                      will be passed to the underlying model's `__init__` function.

        Examples:

        ```python
        >>> from transformers import BertConfig, TFBertModel

        >>> # Download model and configuration from huggingface.co and cache.
        >>> model = TFBertModel.from_pretrained("google-bert/bert-base-uncased")
        >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
        >>> model = TFBertModel.from_pretrained("./test/saved_model/")
        >>> # Update configuration during loading.
        >>> model = TFBertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
        >>> assert model.config.output_attentions == True
        >>> # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable).
        >>> config = BertConfig.from_json_file("./pt_model/my_pt_model_config.json")
        >>> model = TFBertModel.from_pretrained("./pt_model/my_pytorch_model.bin", from_pt=True, config=config)
        ```from_ptFresume_downloadNproxiesoutput_loading_infor  trust_remote_codemirrorload_weight_prefix_from_pipeline
_from_auto	subfolderr  _commit_hashtf_to_pt_weight_renameadapter_kwargsr  r  TzgThe argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.r3  
tensorflow)	file_typerz  from_auto_classusing_pipelinez+Offline mode: forcing local_files_only=True)r  return_unused_kwargsr  r!  r"  r  r  r  r(  r'  r*  zError no file named z or z found in directory zu. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`.zc but there is a file for PyTorch weights. Use `from_pt=True` to load this model from those weights.z, r   z.index)r  r  r"  r!  r  r  
user_agentr  r)   _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entriesr*  )r  r"  r  r  r  z& does not appear to have a file named zCan't load the model for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z=' is the correct path to a directory containing a file named zloading weights file r   z from cache at )	r  r  r"  r!  r  r  r2  r  r*  r=   ry  r  )ptr=   flaxmlxz"The safetensors archive passed at zf does not contain the valid metadata. Make sure you save your model with the `save_pretrained` method.r5  r   r   r   r   )$load_pytorch_checkpoint_in_tf2_model)allow_missing_keysr#  r  r+  )$load_pytorch_state_dict_in_tf2_model)	tf_inputsr9  r#  r  r"  r+  )-load_sharded_pytorch_safetensors_in_tf2_modelzError retrieving files r!  rF  rG  z}Unable to load weights from h5 file. If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. c                       g | ]}t  |d u r|qS rJ   researchr(  patrB   rC   r)    r  z5TFPreTrainedModel.from_pretrained.<locals>.<listcomp>c                   r=  rJ   r>  r(  rA  rB   rC   r)    r  z)Some layers from the model checkpoint at z! were not used when initializing z: z,
- This IS expected if you are initializing z from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing z from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).z8All model checkpoint layers were used when initializing z.
zSome layers of z3 were not initialized from the model checkpoint at z and are newly initialized: zo
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.zAll the layers of z/ were initialized from the model checkpoint at zf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use z* for predictions without further training.r  c              	   S  s*   g | ]\}}}d | d| d| dqS )z- z: found shape z in the checkpoint and z in the model instantiatedrB   )rO   r   shape1shape2rB   rB   rC   r)    s    zSome weights of z= and are newly initialized because the shapes did not match:
)
r  r  r!  r"  r  r  r  r)  r(  r'  zZGeneration config file not found, using a generation config created from the model config.)r?  r6  r8  )Er^   r   r   r   rb   r   r   r'   r-  r)   r]   r   r[   from_pretrainedrs   r  r  r  r  r  r  r    r   r   r   r   r   EnvironmentErrorr(   r$   r#   r&   r   rN  r   r  r/   r0   r  rs  rQ  r  _requires_load_weight_prefixrw   r+  modeling_tf_pytorch_utilsr8  r=   compatv1variable_scoper  r:  r<  rd  rC  rh  rO  rP  r  rR  _keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpectedra   r   rW   r  r   r  )3rp   r  r\   r  r"  r  r  r  r  r  
model_argsrf   r   r!  r"  r#  r  r$  r  r&  from_pipeliner/  r)  commit_hashr+  r2  config_pathmodel_kwargs
is_shardedis_localarchive_filer  rS  cached_file_kwargshas_file_kwargssharded_metadatasafetensors_from_ptr]  safetensors_metadatar3  r8  r:  r  r<  filer?  r6  r8  r\  mismatched_warningloading_inforB   rA  rC   rE  	  s   










	









$

	



z!TFPreTrainedModel.from_pretrainedr  r  use_temp_dirr  privateOptional[Union[int, str]]Optional[Union[bool, str]]r  c	              	   K  sV  |durt dt |durtd|}d|	v r"t d |	d}|	dd}
|	dd}tj|r@|}|tjj	d }n|d	d }| j
||||
|d
}|du r\tj| }t||d?}| |}| j||d t| drt| dr|t|jd}	|	|	 | jdi |	 | j||||||d W d   dS 1 sw   Y  dS )u  
        Upload the model files to the 🤗 Model Hub while synchronizing a local clone of the repo in `repo_path_or_name`.

        Parameters:
            repo_id (`str`):
                The name of the repository you want to push your model to. It should contain your organization name
                when pushing to a given organization.
            use_temp_dir (`bool`, *optional*):
                Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub.
                Will default to `True` if there is no directory named like `repo_id`, `False` otherwise.
            commit_message (`str`, *optional*):
                Message to commit while pushing. Will default to `"Upload model"`.
            private (`bool`, *optional*):
                Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
            token (`bool` or `str`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
                when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`
                is not specified.
            max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
                Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard
                will then be each of size lower than this size. If expressed as a string, needs to be digits followed
                by a unit (like `"5MB"`).
            create_pr (`bool`, *optional*, defaults to `False`):
                Whether or not to create a PR with the uploaded files or directly commit.

        Examples:

        ```python
        from transformers import TFAutoModel

        model = TFAutoModel.from_pretrained("google-bert/bert-base-cased")

        # Push the model to your namespace with the name "my-finetuned-bert".
        model.push_to_hub("my-finetuned-bert")

        # Push the model to an organization with the name "my-finetuned-bert".
        model.push_to_hub("huggingface/my-finetuned-bert")
        ```
        Nr  r  repo_path_or_namezpThe `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use `repo_id` instead.repo_urlorganizationr   r   )r_  r  rc  rd  )working_dirr^  )r  ry  r  )r|  rk  )r  r  r  rB   )r   r   r   rb   r^   r  r  r  r   r  r  r-   r  r  rw   r   r   rm   r  r  )rU   r  r^  r  r_  r  r  r  r  base_model_card_argsrc  rd  re  work_dirr
  rB   rB   rC   r	  -  sX   4



"zTFPreTrainedModel.push_to_hubTFAutoModelc                 C  sD   t |ts|j}ddlm  m} t||st| d|| _dS )a  
        Register this class with a given auto class. This should only be used for custom models as the ones in the
        library are already mapped with an auto class.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"TFAutoModel"`):
                The auto class to register this new model with.
        r   Nz is not a valid auto class.)	r]   r  rW   transformers.models.automodelsr%  rw   rb   r  )rp   
auto_classauto_modulerB   rB   rC   register_for_auto_class  s   


z)TFPreTrainedModel.register_for_auto_class)rH   r  )rH   r  rJ   )r  r  r  rI   rH   r  )rH   r  )rH   rG   )rH   r  )r  TNNNNT)r  r  r  rI   r  rG   r   r  r  r  r  r  r  r  r  rG   )r"  r#  NNNNN)NNNNNNNN)rk  r  rl  rm  rn  rm  ro  rm  rp  rm  rq  rm  rr  rs  r  rs  rt  rs  )rH   r  )rH   r  )rH   r  )r  r  rH   r  )r  r  rH   r  )r  r  r  rI   rH   r  )rH   r  )r  r  r  rI   rH   r  )Fr   FNr  FFN)r  r  r  rG   r  rG   r  r  )r  r  r\   r  r  r  r"  rG   r  rG   r  rG   r  r  r  r  r  r  )NNNr  NNF)r  r  r^  r  r  rm  r_  r  r  r`  r  ra  r  ra  r  rG   rH   r  )rh  )JrW   rX   rY   rZ   r[   r  r   r  r.  r=  rL  rM  rG  propertyr  r  rz  r  rr   ri   ru   rv   rz   r/  r  r  r  r  r  r  classmethodr  r  r  r  r=   functionr  r  r  r  r  r  r!  r0  r2  r?  ri  rj  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rE  r	  rm  __classcell__rB   rB   r  rC   r  U  s   




.
k6ll<






!
5"&
#/ 8    Zmr  c                      s2   e Zd ZdZd	 fdd	Zdd Zdd Z  ZS )
TFConv1DaO  
    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).

    Basically works like a linear layer but the weights are transposed.

    Args:
        nf (`int`):
            The number of output features.
        nx (`int`):
            The number of input features.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation to use to initialize the weights.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
    r  c                   s(   t  jdi | || _|| _|| _d S )NrB   )rk   rr   nfnxr  )rU   rs  rt  r  rf   r  rB   rC   rr     s   
zTFConv1D.__init__c                 C  sP   | j rd S d| _ | jd| j| jgt| jd| _| jdd| jgt d| _	d S )NTr  r9   rg   biasr   )
builtr  rt  rs  r  r  r  r=   zeros_initializerrv  r  rB   rB   rC   r    s    zTFConv1D.buildc                 C  sR   t |d d \}}t|d| jg}t|| j| j }t|||| jg}|S )Nr6   r   )r   r=   r   rt  matmulr  rv  rs  )rU   rc  bzslrB   rB   rC   r    s
   zTFConv1D.callr  )rW   rX   rY   rZ   rr   r  r  rq  rB   rB   r  rC   rr    s
    	rr  c                      sX   e Zd ZdZdd fdd	Z fd
dZ fddZddddZdd Zdd Z	  Z
S )TFSharedEmbeddingsa  
    Construct shared token embeddings.

    The weights of the embedding layer is usually shared with the weights of the linear decoder when doing language
    modeling.

    Args:
        vocab_size (`int`):
            The size of the vocabulary, e.g., the number of unique tokens.
        hidden_size (`int`):
            The size of the embedding vectors.
        initializer_range (`float`, *optional*):
            The standard deviation to use when initializing the weights. If no value is provided, it will default to
            \\(1/\sqrt{hidden\_size}\\).
        kwargs (`Dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
    Nr  rI   r  r  Optional[float]c                   sD   t  jdi | || _|| _|d u r|d n|| _tdt d S )Ng      z^`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `keras.layers.Embedding` instead.rB   )rk   rr   r  r  r  r   r   DeprecationWarning)rU   r  r  r  rf   r  rB   rC   rr     s   zTFSharedEmbeddings.__init__c                   s0   | j d| j| jgt| jd| _t | dS )z
        Build shared token embedding layer Shared weights logic adapted from
        https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        r  ru  N)r  r  r  r  r  r  rk   r  r  r  rB   rC   r    s   zTFSharedEmbeddings.buildc                   s8   | j | j| jd}t  }tt| t|  S )N)r  r  r  )r  r  r  rk   ri   r_   r;   r   )rU   r\   base_configr  rB   rC   ri   	  s   
zTFSharedEmbeddings.get_config	embeddingr   r  r  r  rH   c                 C  s4   |dkr	|  |S |dkr| |S td| d)am  
        Get token embeddings of inputs or decode final hidden state.

        Args:
            inputs (`tf.Tensor`):
                In embedding mode, should be an int64 tensor with shape `[batch_size, length]`.

                In linear mode, should be a float tensor with shape `[batch_size, length, hidden_size]`.
            mode (`str`, defaults to `"embedding"`):
               A valid value is either `"embedding"` or `"linear"`, the first one indicates that the layer should be
               used as an embedding layer, the second one that the layer should be used as a linear decoder.

        Returns:
            `tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape `[batch_size, length,
            embedding_size]`.

            In linear mode, the output is a float32 with shape `[batch_size, length, vocab_size]`.

        Raises:
            ValueError: if `mode` is not valid.

        Shared weights logic is adapted from
        [here](https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24).
        r  linearzmode z is not valid.)
_embedding_linearrb   )rU   r   r  rB   rB   rC   r    s
   

zTFSharedEmbeddings.callc                 C  s   t | j|S )z)Applies embedding based on inputs tensor.)r=   gatherr  )rU   r   rB   rB   rC   r  3  s   zTFSharedEmbeddings._embeddingc                 C  sH   t |dd }t|d| jg}tj|| jdd}t||| jg S )z
        Computes logits by running inputs through a linear layer.

        Args:
            inputs: A float32 tensor with shape [..., hidden_size]

        Returns:
            float32 tensor with shape [..., vocab_size].
        Nr   T)transpose_b)r   r=   r   r  ry  r  r  )rU   r   
first_dimsrc  r   rB   rB   rC   r  7  s   
zTFSharedEmbeddings._linearrJ   )r  rI   r  rI   r  r~  )r  )r   r  r  r  rH   r  )rW   rX   rY   rZ   rr   r  ri   r  r  r  rq  rB   rB   r  rC   r}    s    


 r}  c                      s6   e Zd ZdZdd fddZdddZdd Z  ZS )TFSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.

        initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation to use to initialize the weights.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
    r  r\   r   r  floatc                   s  t  jdi | t|dr|jnd| _| jdkrtt|do"|j| _| jrGt|dr8|jr8|jdkr8|j}n|j	}t
jj|t|dd| _d| _t|d	d }|d ur\d
| _t|| _t|doe|jdk| _| jrrt
j|j| _t|do{|jdk| _| jrt
j|j| _|j	| _	d S )Nsummary_use_projlastattnsummary_proj_to_labelsr   summary)kernel_initializerr   Fsummary_activationTsummary_first_dropoutsummary_last_dropoutrB   )rk   rr   rw   summary_typer  r  has_summaryr  
num_labelsr  rz   rp  Denser  r  has_activationrs   r   
activationr  has_first_dropoutDropoutfirst_dropoutr  has_last_dropoutlast_dropout)rU   r\   r  rf   num_classesactivation_stringr  rB   rC   rr   f  s0   


zTFSequenceSummary.__init__NFc                 C  s  t |tttfs|}n-t |ttfr-|d }t|dkr |d nd }t|dks,J dn|d}|dd }| jdkrF|d d df }nj| jd	krT|d d df }n\| jd
kratj|dd}nO| jdkrt	|}|d u r|t
|d d |d d }t	|}t|t|d krtj|dd}tj||t|d d}tj|t|d d}n| jdkrt| jr| j||d}| jr| |}| jr| |}| jr| j||d}|S )Nr   r   r6   zToo many inputs.r  	cls_indexr  r   firstmeanr7   )
batch_dimsr  rN  )r]   r_   r   r;   ra   rs  r  r=   r>   r   r  r   r  squeezer  r  r  r  r  r  r  r  r  )rU   r   r  rM  r  r   hidden_shape	cls_shaperB   rB   rC   r    sL   







zTFSequenceSummary.callc                 C  sb   | j rd S d| _ t| dd d ur/td | j| j W d    d S 1 s(w   Y  d S d S )NTr  )rw  rs   r=   r  r  r  r  r  rB   rB   rC   r    s   "zTFSequenceSummary.buildr|  )r\   r   r  r  )NF)rW   rX   rY   rZ   rr   r  r  rq  rB   rB   r  rC   r  H  s
    
#3r  r  r  r  rH   "keras.initializers.TruncatedNormalc                 C  s   t jj| dS )a  
    Creates a `keras.initializers.TruncatedNormal` with the given range.

    Args:
        initializer_range (*float*, defaults to 0.02): Standard deviation of the initializer range.

    Returns:
        `keras.initializers.TruncatedNormal`: The truncated normal initializer.
    r  )rz   r  r  )r  rB   rB   rC   r    s   
r  rJ   )r  r  )FFN)FNr|  )r  r  rH   r  )rZ   
__future__r   ru   r0  r   r  r  r  r?  r   collections.abcr   pathlibr   typingr   r   r   r   r	   r
   r   rH  r  rK   r-  r=   packaging.versionr   r  r   r   activations_tfr   configuration_utilsr   dynamic_module_utilsr   
generationr   r   tf_utilsr   r   r   r   r   r{   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   	utils.hubr.   r/   safetensorsr0   safetensors.tensorflowr1   r  r2   
get_loggerrW   r   environr   tf_kerasrz   r5   rK  ModuleNotFoundErrorImportErrorrT  majorrb   	tf_loggerr   r   r  TFModelInputTyperD   rE   r|   r~   r   r   r   r   r   r   r   r   r   r  r   rC  r/  rd  rh  rf  r^  r  r/  r  rp  Layerrr  r}  r  r  rB   rB   rB   rC   <module>   s   $X


A*"!, 
.

E
CZ

7

d (                p+h}