import collections
import copy
import functools
import gc
import importlib.metadata
import inspect
import itertools
import json
import os
import re
import shutil
import tempfile
import warnings
from collections import defaultdict
from collections.abc import MutableMapping
from contextlib import contextmanager
from dataclasses import dataclass
from enum import Enum
from functools import partial, wraps
from threading import Thread
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, TypeVar, Union
from zipfile import is_zipfile

import torch
from huggingface_hub import split_torch_state_dict_into_shards
from packaging import version
from torch import Tensor, nn
from torch.distributions import constraints
from torch.nn import CrossEntropyLoss, Identity
from torch.utils.checkpoint import checkpoint

from .activations import get_activation
from .configuration_utils import PretrainedConfig
from .dynamic_module_utils import custom_object_save
from .generation import CompileConfig, GenerationConfig, GenerationMixin
from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled
from .integrations.accelerate import find_tied_parameters, init_empty_weights
from .integrations.deepspeed import _load_state_dict_into_zero3_model, is_deepspeed_available
from .integrations.flash_attention import flash_attention_forward
from .integrations.flex_attention import flex_attention_forward
from .integrations.sdpa_attention import sdpa_attention_forward
from .integrations.tensor_parallel import SUPPORTED_TP_STYLES, shard_and_distribute_module
from .loss.loss_utils import LOSS_MAPPING
from .pytorch_utils import (
    Conv1D,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    id_tensor_storage,
    prune_conv1d_layer,
    prune_layer,
    prune_linear_layer,
)
from .quantizers import AutoHfQuantizer, HfQuantizer
from .quantizers.quantizers_utils import get_module_from_name
from .safetensors_conversion import auto_conversion
from .utils import (
    ADAPTER_SAFE_WEIGHTS_NAME,
    ADAPTER_WEIGHTS_NAME,
    CONFIG_NAME,
    DUMMY_INPUTS,
    FLAX_WEIGHTS_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    TF2_WEIGHTS_NAME,
    TF_WEIGHTS_NAME,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    ContextManagers,
    ModelOutput,
    PushToHubMixin,
    cached_file,
    copy_func,
    download_url,
    extract_commit_hash,
    has_file,
    is_accelerate_available,
    is_bitsandbytes_available,
    is_flash_attn_2_available,
    is_offline_mode,
    is_optimum_available,
    is_peft_available,
    is_remote_url,
    is_safetensors_available,
    is_torch_flex_attn_available,
    is_torch_greater_or_equal,
    is_torch_mlu_available,
    is_torch_npu_available,
    is_torch_sdpa_available,
    is_torch_xla_available,
    is_torchao_available,
    logging,
    replace_return_docstrings,
    strtobool,
)
from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files
from .utils.import_utils import (
    ENV_VARS_TRUE_VALUES,
    is_sagemaker_mp_enabled,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
)
from .utils.quantization_config import BitsAndBytesConfig, QuantizationMethod


if is_torchao_available():
    from torchao.quantization import Int4WeightOnlyConfig

XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()

if is_accelerate_available():
    from accelerate import dispatch_model, infer_auto_device_map
    from accelerate.hooks import add_hook_to_module
    from accelerate.utils import (
        check_tied_parameters_on_same_device,
        extract_model_from_parallel,
        get_balanced_memory,
        get_max_memory,
        load_offloaded_weights,
        offload_weight,
        save_offload_index,
    )

    accelerate_version = version.parse(importlib.metadata.version("accelerate"))
    if accelerate_version >= version.parse("0.31"):
        from accelerate.utils.modeling import get_state_dict_from_offload

if is_safetensors_available():
    from safetensors import safe_open
    from safetensors.torch import load_file as safe_load_file
    from safetensors.torch import save_file as safe_save_file


logger = logging.get_logger(__name__)

_init_weights = True
_is_quantized = False
_is_ds_init_called = False
_torch_distributed_available = torch.distributed.is_available()


def is_fsdp_enabled():
    return (
        torch.distributed.is_available()
        and torch.distributed.is_initialized()
        and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1
        and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1
    )


def is_local_dist_rank_0():
    return (
        torch.distributed.is_available()
        and torch.distributed.is_initialized()
        and int(os.environ.get("LOCAL_RANK", -1)) == 0
    )


if is_sagemaker_mp_enabled():
    import smdistributed.modelparallel.torch as smp
    from smdistributed.modelparallel import __version__ as SMP_VERSION

    IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10")
else:
    IS_SAGEMAKER_MP_POST_1_10 = False

if is_peft_available():
    from .utils import find_adapter_config_file

SpecificPreTrainedModelType = TypeVar("SpecificPreTrainedModelType", bound="PreTrainedModel")

TORCH_INIT_FUNCTIONS = {
    "uniform_": nn.init.uniform_,
    "normal_": nn.init.normal_,
    "trunc_normal_": nn.init.trunc_normal_,
    "constant_": nn.init.constant_,
    "xavier_uniform_": nn.init.xavier_uniform_,
    "xavier_normal_": nn.init.xavier_normal_,
    "kaiming_uniform_": nn.init.kaiming_uniform_,
    "kaiming_normal_": nn.init.kaiming_normal_,
    "uniform": nn.init.uniform,
    "normal": nn.init.normal,
    "xavier_uniform": nn.init.xavier_uniform,
    "xavier_normal": nn.init.xavier_normal,
    "kaiming_uniform": nn.init.kaiming_uniform,
    "kaiming_normal": nn.init.kaiming_normal,
}
@contextmanager
def no_init_weights():
    """
    Context manager to globally disable weight initialization to speed up loading large models.
    """
    global _init_weights
    old_init_weights = _init_weights

    _init_weights = False

    def _skip_init(*args, **kwargs):
        pass

    # Replace the initialization functions with no-ops for the duration of the context.
    for name, init_func in TORCH_INIT_FUNCTIONS.items():
        setattr(torch.nn.init, name, _skip_init)
    try:
        yield
    finally:
        _init_weights = old_init_weights
        # Restore the original initialization functions.
        for name, init_func in TORCH_INIT_FUNCTIONS.items():
            setattr(torch.nn.init, name, init_func)


@contextmanager
def set_quantized_state():
    global _is_quantized
    _is_quantized = True
    try:
        yield
    finally:
        _is_quantized = False


@contextmanager
def set_zero3_state():
    global _is_ds_init_called
    _is_ds_init_called = True
    try:
        yield
    finally:
        _is_ds_init_called = False
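# Illustrative sketch, not part of the upstream module: how `no_init_weights` behaves.
# Inside the context manager the `torch.nn.init` functions listed in TORCH_INIT_FUNCTIONS
# are replaced with no-ops, so module creation skips the (slow) weight initialization; the
# real weights are expected to be loaded from a checkpoint right afterwards. The helper
# name `_example_no_init_weights` is hypothetical and is not called anywhere in the library.
def _example_no_init_weights():
    layer = nn.Linear(4, 4)
    with no_init_weights():
        nn.init.normal_(layer.weight)  # no-op while the context manager is active
    nn.init.normal_(layer.weight)  # the original init function is restored on exit
    return layer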
def restore_default_torch_dtype(func: Callable) -> Callable:
    """
    Decorator to restore the default torch dtype
    at the end of the function. Serves
    as a backup in case calling the function raises
    an error after the function has changed the default dtype but before it could restore it.
    """

    @wraps(func)
    def _wrapper(*args, **kwargs):
        old_dtype = torch.get_default_dtype()
        try:
            return func(*args, **kwargs)
        finally:
            torch.set_default_dtype(old_dtype)

    return _wrapper


def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
    try:
        return next(parameter.parameters()).device
    except StopIteration:
        # For nn.DataParallel compatibility in PyTorch 1.5

        def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
            tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
            return tuples

        gen = parameter._named_members(get_members_fn=find_tensor_attributes)
        first_tuple = next(gen)
        return first_tuple[1].device
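# Illustrative sketch, not part of the upstream module: `restore_default_torch_dtype` puts
# the global default dtype back even when the decorated function changes it (as
# `from_pretrained` does when a `torch_dtype` is requested) or raises midway. The helper
# name `_example_restore_default_torch_dtype` is hypothetical.
def _example_restore_default_torch_dtype():
    previous = torch.get_default_dtype()

    @restore_default_torch_dtype
    def build_in_half_precision():
        torch.set_default_dtype(torch.float16)
        return nn.Linear(2, 2)  # created in float16 because of the temporary default

    layer = build_in_half_precision()
    assert torch.get_default_dtype() == previous  # default dtype restored on exit
    return layer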
def get_first_parameter_dtype(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
    """
    Returns the first parameter dtype (can be non-floating) or asserts if none were found.
    """
    try:
        return next(parameter.parameters()).dtype
    except StopIteration:
        # For nn.DataParallel compatibility in PyTorch > 1.5

        def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
            tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
            return tuples

        gen = parameter._named_members(get_members_fn=find_tensor_attributes)
        first_tuple = next(gen)
        return first_tuple[1].dtype


def get_parameter_dtype(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
    """
    Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found.
    """
    last_dtype = None
    for t in parameter.parameters():
        last_dtype = t.dtype
        if t.is_floating_point():
            # XLA downcasting: honor the XLA_USE_BF16 / XLA_DOWNCAST_BF16 environment variables.
            if XLA_USE_BF16 in ENV_VARS_TRUE_VALUES and is_torch_xla_available():
                return torch.bfloat16
            if XLA_DOWNCAST_BF16 in ENV_VARS_TRUE_VALUES and is_torch_xla_available():
                if t.dtype == torch.float:
                    return torch.bfloat16
                if t.dtype == torch.double:
                    return torch.float32
            return t.dtype

    if last_dtype is not None:
        # if no floating dtype was found return whatever the first dtype is
        return last_dtype

    # For nn.DataParallel compatibility in PyTorch > 1.5
    def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
        tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
        return tuples

    gen = parameter._named_members(get_members_fn=find_tensor_attributes)
    last_tuple = None
    for tuple in gen:
        last_tuple = tuple
        if tuple[1].is_floating_point():
            return tuple[1].dtype

    if last_tuple is not None:
        # fallback to the last dtype
        return last_tuple[1].dtype

    # fallback to buffer dtype
    for t in parameter.buffers():
        last_dtype = t.dtype
        if t.is_floating_point():
            return t.dtype
    return last_dtype


def get_state_dict_float_dtype(state_dict):
    """
    Returns the first found floating dtype in `state_dict` or asserts if none were found.
    """
    for t in state_dict.values():
        if t.is_floating_point():
            return t.dtype

    raise ValueError("couldn't find any floating point dtypes in state_dict")


def get_state_dict_dtype(state_dict):
    """
    Returns the first found floating dtype in `state_dict` if there is one, otherwise returns the first dtype.
    """
    for t in state_dict.values():
        if t.is_floating_point():
            return t.dtype

    # if no floating dtype was found return whatever the first dtype is
    return next(iter(state_dict.values())).dtype
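# Illustrative sketch, not part of the upstream module: the dtype/device helpers above give
# floating-point entries precedence, so a bfloat16 module with an integer buffer still
# reports bfloat16. The helper name `_example_dtype_helpers` is hypothetical.
def _example_dtype_helpers():
    module = nn.Linear(2, 2).to(torch.bfloat16)
    module.register_buffer("step_count", torch.zeros(1, dtype=torch.int64))
    assert get_parameter_dtype(module) == torch.bfloat16
    assert get_state_dict_dtype(module.state_dict()) == torch.bfloat16
    assert get_parameter_device(module) == torch.device("cpu")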
   
def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=False):
    """
    This is the same as
    [`torch.nn.Module.load_state_dict`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict)
    but for a sharded checkpoint.

    This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being
    loaded in the model.

    Args:
        model (`torch.nn.Module`): The model in which to load the checkpoint.
        folder (`str` or `os.PathLike`): A path to a folder containing the sharded checkpoint.
        strict (`bool`, *optional*, defaults to `True`):
            Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint.
        prefer_safe (`bool`, *optional*, defaults to `False`):
            If both safetensors and PyTorch save files are present in checkpoint and `prefer_safe` is True, the
            safetensors files will be loaded. Otherwise, PyTorch files are always loaded when possible.

    Returns:
        `NamedTuple`: A named tuple with `missing_keys` and `unexpected_keys` fields
            - `missing_keys` is a list of str containing the missing keys
            - `unexpected_keys` is a list of str containing the unexpected keys
    """
    # Load the index
    index_file = os.path.join(folder, WEIGHTS_INDEX_NAME)
    safe_index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME)

    index_present = os.path.isfile(index_file)
    safe_index_present = os.path.isfile(safe_index_file)

    if not index_present and not (safe_index_present and is_safetensors_available()):
        filenames = (
            (WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME) if is_safetensors_available() else (WEIGHTS_INDEX_NAME,)
        )
        raise ValueError(f"Can't find a checkpoint index ({' or '.join(filenames)}) in {folder}.")

    load_safe = False
    if safe_index_present:
        if prefer_safe:
            if is_safetensors_available():
                load_safe = True  # load safe due to preference
            else:
                logger.warning(
                    f"Cannot load sharded checkpoint at {folder} safely since safetensors is not installed!"
                )
        elif not index_present:
            load_safe = True  # load safe since we have no other choice

    load_index = safe_index_file if load_safe else index_file

    with open(load_index, "r", encoding="utf-8") as f:
        index = json.load(f)

    shard_files = list(set(index["weight_map"].values()))

    # If strict=True, error before loading any of the state dicts.
    loaded_keys = index["weight_map"].keys()
    model_keys = model.state_dict().keys()
    missing_keys = [key for key in model_keys if key not in loaded_keys]
    unexpected_keys = [key for key in loaded_keys if key not in model_keys]
    if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0):
        error_message = f"Error(s) in loading state_dict for {model.__class__.__name__}"
        if len(missing_keys) > 0:
            str_missing_keys = ",".join([f'"{k}"' for k in missing_keys])
            error_message += f"\nMissing key(s): {str_missing_keys}."
        if len(unexpected_keys) > 0:
            str_unexpected_keys = ",".join([f'"{k}"' for k in unexpected_keys])
            error_message += f"\nUnexpected key(s): {str_unexpected_keys}."
        raise RuntimeError(error_message)

    loader = safe_load_file if load_safe else partial(torch.load, map_location="cpu", weights_only=True)

    for shard_file in shard_files:
        state_dict = loader(os.path.join(folder, shard_file))
        model.load_state_dict(state_dict, strict=False)

        # Make sure memory is freed before we load the next state dict.
        del state_dict
        gc.collect()

    # Return the same thing as PyTorch load_state_dict function.
    return torch.nn.modules.module._IncompatibleKeys(missing_keys, unexpected_keys)


str_to_torch_dtype = {
    "BOOL": torch.bool,
    "U8": torch.uint8,
    "I8": torch.int8,
    "I16": torch.int16,
    "F16": torch.float16,
    "BF16": torch.bfloat16,
    "I32": torch.int32,
    "F32": torch.float32,
    "F64": torch.float64,
    "I64": torch.int64,
}

if is_torch_greater_or_equal("2.1.0"):
    str_to_torch_dtype["F8_E4M3"] = torch.float8_e4m3fn

if is_torch_greater_or_equal("2.3.0"):
    str_to_torch_dtype["U16"] = torch.uint16
    str_to_torch_dtype["U32"] = torch.uint32
    str_to_torch_dtype["U64"] = torch.uint64

if is_torch_greater_or_equal("2.1.0"):
    str_to_torch_dtype["F8_E5M2"] = torch.float8_e5m2
def load_state_dict(
    checkpoint_file: Union[str, os.PathLike],
    is_quantized: bool = False,
    map_location: Optional[Union[str, torch.device]] = None,
    weights_only: bool = True,
):
    """
    Reads a `safetensor` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default.
    """
    if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
        with safe_open(checkpoint_file, framework="pt") as f:
            metadata = f.metadata()

            if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]:
                raise OSError(
                    f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make "
                    "sure you save your model with the `save_pretrained` method."
                )
            state_dict = {}
            for k in f.keys():
                k_dtype = f.get_slice(k).get_dtype()
                if k_dtype in str_to_torch_dtype:
                    dtype = str_to_torch_dtype[k_dtype]
                else:
                    raise ValueError(f"Cannot load safetensors of unknown dtype {k_dtype}")
                if map_location == "meta":
                    state_dict[k] = torch.empty(size=f.get_slice(k).get_shape(), dtype=dtype, device="meta")
                else:
                    state_dict[k] = f.get_tensor(k)
            return state_dict

    try:
        if map_location is None:
            if (
                (
                    is_deepspeed_zero3_enabled()
                    and torch.distributed.is_initialized()
                    and torch.distributed.get_rank() > 0
                )
                or (is_fsdp_enabled() and not is_local_dist_rank_0())
            ) and not is_quantized:
                map_location = "meta"
            else:
                map_location = "cpu"
        extra_args = {}
        # mmap can only be used with files serialized with zipfile-based format.
        if (
            isinstance(checkpoint_file, str)
            and map_location != "meta"
            and version.parse(torch.__version__) >= version.parse("2.1.0")
            and is_zipfile(checkpoint_file)
        ):
            extra_args = {"mmap": True}
        return torch.load(
            checkpoint_file,
            map_location=map_location,
            weights_only=weights_only,
            **extra_args,
        )
    except Exception as e:
        try:
            with open(checkpoint_file) as f:
                if f.read(7) == "version":
                    raise OSError(
                        "You seem to have cloned a repository without having git-lfs installed. Please install "
                        "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder you cloned."
                    )
                else:
                    raise ValueError(
                        f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
                        "model. Make sure you have saved the model properly."
                    ) from e
        except (UnicodeDecodeError, ValueError):
            raise OSError(
                f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' at "
                f"'{checkpoint_file}'. If you tried to load a PyTorch model from a TF 2.0 checkpoint, "
                "please set from_tf=True."
            )
|| < q
|S )z
    Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
    dict.
     c                    s   h | ]	}  d | qS r   r   r  module_namer   r   	<setcomp>T      z-set_initialized_submodules.<locals>.<setcomp>T)r  named_modulesr   issubset_is_hf_initialized)r  state_dict_keysnot_initialized_submodulesr   module_keysr   rf  r   set_initialized_submodulesH  s   

rp  tensorr   c                 C   s2   |   r| dd  |   }|S |  }|S )Nr   )nelementviewdata_ptrelement_size)rq  stopr   r   r   _end_ptr\  s
   rw  rd  r   c                    s   g }t | dd d ur fdd| jD }|| t | dd d ur0 fdd| jD }|| |  D ]\}} rA  d| n|}|t||d q4|S )N_tied_weights_keysc                    "   g | ]} r  d | n|qS re  r   r  prefixr   r   r   h  r   z)_get_tied_weight_keys.<locals>.<listcomp>_dynamic_tied_weights_keysc                    ry  re  r   r  rz  r   r   r   k  r   r   rz  )getattrrx  extendr|  named_children_get_tied_weight_keys)r   r{  tied_weight_keysnamesr   	submodulelocal_prefixr   rz  r   r  e  s   

r  tensorsr   c                 C   s   g }| D ]V}t |dk r|| qg }|D ]}|| }|| t||f q|  |d \}}}	||	h |dd  D ]\}
}}|
|krP||h n|d | |}q@qg }g }|D ]} t | dkrq||   qa||  qa||fS )N   r   r   r   )r  appendrt  rw  sortaddpop)r  r   filtered_tensorssharedareasr   rq  _	last_stop	last_namestartrv  disjoint_tensorsshared_tensorsr   r   r   _find_disjoints  s2   
r  c           	      C   s   g }g }| D ]9}t |dk rqtt}|D ]}|| }|j| t|f}|| | qt |dkr:|| q|| q||fS )Nr  r   )	r  collectionsr   r  r   rt  rw  r  r  )	r  r   r  	identicalr  r  r   rq  arear   r   r   _find_identical  s   
r  r  
param_nameempty_paramkeep_in_fp32_regexhf_quantizerc           
   
   C   s   z|  |}W n  ty' } z|d ur!|jjtjkr!W Y d }~dS |d }~ww ttd}d }|o6|jtj	k}	|jj
rV|	sV|d urJ||rJtj}n|d urS| jj}n|j}|d uo]| |fS )N)TNfloat8_e4m3fn)get_parameter_or_bufferr^  quantization_configquant_methodri   HQQhasattrr   r   r  r   searchr   config_pre_quantization_dtypeis_contiguous)
r  r  r  r  r  	old_paramrc  is_torch_e4m3fn_availablecasting_dtypeis_param_float8_e4m3fnr   r   r   _infer_parameter_dtype  s$   

r  c                 C   s&   t | |\}}|j||iddd dS )zKCast a single parameter `param_name` into the `model`, with value `tensor`.FT)r	  assignN)r<   r  )r  r  rq  r   
param_typer   r   r   _load_parameter_into_model  s   r  r2  expected_keysreverse_renaming_mapping
device_mapdisk_offload_folderdisk_offload_indexcpu_offload_foldercpu_offload_indexis_safetensorsr-  device_mesh(torch.distributed.device_mesh.DeviceMeshc           "      C   s  d}|dur*| dddur*|d dtdfvr*t|d tjr&|d jn|d }|dur>ddd t| ddD }|
du}|oM|
jj	t
jt
jfv }|d	oU| }d}|rat|d
|d}| D ]\}}||vroqe|r{|| }||}n||}t| ||||
\}}|durt| |||||ttjd | qe|d }|dur||}|r| }|du rd}nt||}|st| d||  }|dkr|st||||}qe|dkr|	durt||||	}	qe|r|
jr|
j| |||||dst r	t  rdnd}t!| ||| qe|
"| ||||| t s%t# rht$| |\}}t%||}d} t r=t  s=d} i }!t&|drQ|j'j(j)dkrQd|!d< t*||j+| fi |!|j,}t-||| qe|duru|.ddd ||	fS )a  Load parameters from `meta_state_dict` into the model. The parameters of the `meta_state_dict` are on the meta
    device in order to easily infer the shapes and dtypes that they will have. Then proper parameters are then loaded
    from `shard_file`, which is the actual state dict file on disk.
    This function takes care of correctly casting dtypes, devices, and sharding tensors in case of tensor parallelism.
    r  Nrd  |c                 S      g | ]}t |qS r   reescaper  r   r   r   r         z4_load_state_dict_into_meta_model.<locals>.<listcomp>T)reverserF  rG  )rI  r   RANK.z doesn't have any device set.disk)param_devicer  rN  weight
Int8ParamsFrequires_grad)/r   r   r   r\  r*  r  sortedr  r  r  ri   r  BITS_AND_BYTESrR  rz   r   rU  tor  r1   r   r   r   
contiguousr  r  r   groupru    requires_parameters_quantizationcheck_quantized_paramr   r   r  create_quantized_paramr(   r<   r}  r  r  r  r  typedatar   r   __exit__)"r  r   r2  r  r  r  r  r  r  r  r  r  r  r-  r  tensor_devicedevice_map_regexrE  is_hqq_or_bnbis_meta_state_dictfile_pointerr  r  serialized_param_nameparamto_contiguousr  r  module_layerr   r  valueparam_to
val_kwargsr   r   r    _load_state_dict_into_meta_model  s   " 



"
r  weights_namevariantc                 C   s0   |d ur|  dd\}}| d| d| } | S )Nr   r   )rsplit)r  r  r
  r   r   r   r   _add_variantS  s   r  pretrained_model_name_or_path	subfolder	gguf_filefrom_tf	from_flaxuse_safetensors	cache_dirforce_downloadproxieslocal_files_onlytoken
user_agentrevisioncommit_hashc                 C   sZ  d}| dura|du rat | } tj| }|r`|r5tjtj| |td r5tj| |td }n|rMtjtj| |trMtj| |t}n|retjtj| |tretj| |t}n|durtjtj| |t	t
|rtj| |t	t
|}n|durtjtj| |t	t|rtj| |t	t|}d}n|stjtj| |t	t|rtj| |t	t|}n|stjtj| |t	t|rtj| |t	t|}d}nb|stjtj| |td stjtj| |trtdt	t| d|  d|s0tjtj| |tr0tdt	t| d|  d|rAtdt	t
| d|  d	tdt	t| d
t	t
| d
t d
td  dt d|  d	tjtj|| rr| }d}ntjtj|| d r|std| d  dtj|| d }d}nt| r| }t| }n|rt}n|rt}n|durt	t
|}nt	t|}zR|||	|
||||dd|d}t| |fi |}|du r;|t	t
|kr;t| t	t|fi |}|durd}n>|r-|dkrt| fi |\}}}||d< |du r,t|  dt	t
| dt	t| dnt	t|}t| |fi |}|du r[|t	t|kr[t| t	t|fi |}|dur[d}|
st s|dur|ttfv r|rstnt
}||	|||
d}|||
||dd|d|}t| |fi |stt| fddi|dd  nq||	|||
d}t| tfi |rt|  dt	t| dt| tfi |rt|  dt	t| d|durt| tfi |rt|  dt	t| d| dt|  dt	t| d
t	t
| d
t d
t dt d	W n0 ty     tyF } ztd|  d|  dt	t| d
t d
t dt d	|d}~ww |rUtd|  |}n0td| d|  n$|rtj|rn|}n|||	|
||||dd|d}t| |fi |}d}|rt| ||||	|
|||||d\}}||fS | dur|gnd}||fS ) zGet all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the
    checkpoints are sharded.
    This function will download the data if necessary.
    FN.indexTzError no file named z found in directory zf but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.zb but there is a file for Flax weights. Use `from_flax=True` to load this model from those weights.r   z, r   z$We found a TensorFlow checkpoint at z:, please set from_tf to True to load from this checkpoint.)r  r  r  r  r  r  r  r   _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries_commit_hashmainr  z& does not appear to have a file named z and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`.)r  r  r  r  r  )r  r  r  r  r  r  r  r  ignore_errors_during_conversionzThread-auto_conversion)targetr   r   r   z) but there is a file without the variant z;. Use `variant=None` to load this model from those weights.zCan't load the model for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z=' is the correct path to a directory containing a file named zloading weights file z from cache at )	r  r  r  r  r  r  r  r  r  )r   r   r
  isdirr  r  rF   rE   rB   r  rD   rC   rH   rG   EnvironmentErrorr   rW   rN   rL   r=   rT   rP   r	   r  r^  r  inforc   )r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  
is_shardedis_localarchive_filefilenameresolved_archive_filecached_file_kwargssafe_weights_namehas_file_kwargsrc  sharded_metadatacheckpoint_filesr   r   r   _get_resolved_checkpoint_filesZ  s6  










	

	


r  torch_dtyper  r  r  c                 C   s  d}|du}|durt |tro|dkrQt|dr*|jdur*|j}td| d n|r5d|v r5|d }n|dur>t|}nt|d d|d	}t|}td
 ntt|rnt	t|}||_|j
 D ]
}	t	||	}
||
_qcndt |tjr||_|j
 D ]
}	t	||	}
||
_q}nJt |tr| D ]\}}t||rt	||}t |ts|nt	t|}||_q|d}t |ts|nt	t|}||_|du rtj}ntd| | |}nt }||_|j
 D ]
}t	||}||_q|||fS )a  Find the correct `torch_dtype` to use based on provided arguments. Also update the `config` based on the
    inferred dtype. We do the following:
    1. If torch_dtype is not None, we use that dtype
    2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first
        weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype
    we also may have config.torch_dtype available, but we won't rely on it till v5
    Nautor  zWill use torch_dtype=z$ as defined in model's config objectr   r   rN  r  zSince the `torch_dtype` attribute can't be found in model's config object, will use torch_dtype={torch_dtype} as derived from model's weightsrd  z`torch_dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `torch_dtype` for each sub-config in composite configs, but received )r\  r   r  r  r  r  r   r  r   r}  sub_configsr  r   dictr   r   r   r   _set_default_torch_dtyper   )clsr  r  r  r  r   r  
dtype_origr  sub_config_key
sub_configr   
curr_dtyper  default_dtyper   r   r   _get_torch_dtype  sr   













r  
max_memoryc                    s6  t |tri }|dur||| |  dur&| fdd|  D  |}|dur1||}| |}d|i}	dtt	j
v rG||	d< nt|dkrRtd |dkret| f||d	k|d
|	}nt|}|durr||}||	d< t	| fd|i|	}|dur|j|d |S |durt| }
t|
| |S )zCompute the final `device_map` to use if we passed a value in ['auto', 'balanced', 'balanced_low_0', 'sequential'].
    Otherwise, we check for any device inconsistencies in the device_map.
    Nc                    s"   i | ]\}}  |r|tjqS r   )r  r   r   r   r   r  r  r   r   
<dictcomp>  r   z#_get_device_map.<locals>.<dictcomp>no_split_module_classesspecial_dtypesr   zThis model has some weights that should be kept in higher precision, you need to upgrade `accelerate` to properly deal with them (`pip install --upgrade accelerate`).
sequentialbalanced_low_0)r   low_zeror  r  r   )r  )r\  r   updateget_special_dtypes_updatenamed_parametersadjust_target_dtype_get_no_split_modulesinspect	signaturern   r   r  r  r  rr   rs   adjust_max_memoryvalidate_environmentr)   rp   )r  r  r  r  r  r  r  target_dtypeno_split_modulesdevice_map_kwargstied_paramsr   r  r   _get_device_map  sR   





r'  original_checkpoint_keyscheckpoint_keys'loading_base_model_from_task_state_dictc                    s  |j t|  }|dur||||}tt|t| }t|t| }	|r8fdd|D }
|	|
 dd | D }t|	| }	t	dd |D }|rYdd |	D }	t
|}|D ]"  fd	d|D td
krtt k rfdd|D }q_|dur|||}|||	}	| jdur| jD ]fdd|D }q| jdur| jD ]fdd|	D }	q||	fS )zFind missing keys (keys that are part of the model parameters but were NOT found in the loaded state dict keys) and unexpected keys
    (keys found in the loaded state dict keys, but that are NOT part of the model parameters)
    Nc                    s    g | ]}|   d s|qS re  
startswithr  rz  r   r   r   :       z5_find_missing_and_unexpected_keys.<locals>.<listcomp>c                 S   s   h | ]\}}|qS r   r   )r   nr  r   r   r   rh  ?  s    z4_find_missing_and_unexpected_keys.<locals>.<setcomp>c                 s   s    | ]}| d V  qdS )rotary_emb.inv_freqN)rR  )r   bufferr   r   r   	<genexpr>D      z4_find_missing_and_unexpected_keys.<locals>.<genexpr>c                 S   s   g | ]}d |vr|qS )r/  r   r  r   r   r   r   F  r   c                    s   g | ]}| v r|qS r   r   r  )r  r   r   r   J  r   r   c                    r   r   r   r  )missing_in_groupr   r   r   L  r   c                        g | ]}t  |d u r|qS r   r  r  r  patternr   r   r   U  r-  c                    r4  r   r5  r  r6  r   r   r   Y  r-  )base_model_prefixr  r   r  update_expected_keysr  r  r  named_buffersanyr)   r  update_missing_keysupdate_unexpected_keys_keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpected)r
  r  r(  r)  r*  r  r  r  r,  r-  task_specific_keysmodel_buffershas_inv_freq_buffersr&  r   )r  r3  r7  r{  r   !_find_missing_and_unexpected_keys"  s<   




rC  ignore_mismatched_sizeskeys_to_rename_mappingc                    s   |sg g fS |durdg}|   }g }g }	|D ]W}
|
dkr%t|
|d|d} fdd| D }| D ]9}||v rm|| j|| jkrm|| jd dkr[||  d	 ||  ksm|| |	|| j|| jf q4q||	fS )
a   
    Find potential shape mismatch between the different state dicts and the model parameters, but only if `ignore_mismatched_sizes`
    is True. Otherwise, return immediately and any shape mismatch that may exist will be raised later on. This avoids checking
    every parameter in advance, as shape mismatch are extremely rare in practice. If we want to ignore them however, we do
    need to check in advance as we need to know which parameters we need to move back from meta to cpu, and initialize
    correctly. Indeed, as our model initialization takes place at the module level, and not the weight level, in the
    case of a sharded checkpoint we cannot correctly initialize the weights according to `model._init_weights()` if we perform
    this check on each state dict at loading time (after the first loaded checkpoint, there is no way to initialize only the
    mismatched weights if any, without overwriting the previously loaded weights as well because all the module will be
    initialized, not only the weights that are mismatched).
    Nrd  rN  rE  r  r  c                    "   i | ]\}}| v r | |qS r   r   r   rE  r   r   r    r   z)_find_mismatched_keys.<locals>.<dictcomp>r   r   r  )r   r  r   r  shapenumelr  )r  r   r  rD  rE  rE  r  model_state_dictmismatched_keysmismatched_shapesr2  new_state_dictr   r   rH  r   _find_mismatched_keys^  s,   
rO  c                   @   s   e Zd ZU ded< ded< dS )PipelineParallelr   inputsr   outputsN)r  
__module____qualname____annotations__r   r   r   r   rP    s   
 rP  c                   @   s<  e Zd ZdZedd Zedd Zdd Zdd	 Ze	d
e
jfddZe	d
e
jfddZded
efddZed-ddZ	d.dedee de
jde
jd
ef
ddZ	d/dee deded
efdd Zd!d" Zd0d#ed$ed
efd%d&Zd'eeee
jef f d
efd(d)Z	*d1d'eeee
jef f d$ed
efd+d,ZdS )2r   zH
    A few utilities for `torch.nn.Modules`, to be used as a mixin.
    c                 O   sF   zdd l }W n ty   tdw |t }| }|j| _d S )Nr   FYou need to install psutil (pip install psutil) to use memory tracing.)psutilImportErrorProcessr   getpidmemory_inforssmem_rss_pre_forward)r   r   r   rW  processmemr   r   r   _hook_rss_memory_pre_forward  s   z-ModuleUtilsMixin._hook_rss_memory_pre_forwardc                 O   sl   zdd l }W n ty   tdw |t }| }|j| _| j| j }|t	| dr0| j
nd | _
d S )Nr   rV  mem_rss_diff)rW  rX  rY  r   rZ  r[  r\  mem_rss_post_forwardr]  r  ra  )r   r   r   rW  r^  r_  ra  r   r   r   _hook_rss_memory_post_forward  s   z.ModuleUtilsMixin._hook_rss_memory_post_forwardc                 C   s2   |   D ]}|| j || j q|   dS )a%  
        Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.

        Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero
        with `model.reset_memory_hooks_state()`.
        N)r  register_forward_pre_hookr`  register_forward_hookrc  reset_memory_hooks_stateselfr   r   r   r   add_memory_hooks  s   z!ModuleUtilsMixin.add_memory_hooksc                 C   s$   |   D ]}d|_d|_d|_qdS )z
        Reset the `mem_rss_diff` attribute of each module (see [`~modeling_utils.ModuleUtilsMixin.add_memory_hooks`]).
        r   N)r  ra  rb  r]  rg  r   r   r   rf    s
   z)ModuleUtilsMixin.reset_memory_hooks_stater   c                 C      t | S )z
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        )r   rh  r   r   r   r        zModuleUtilsMixin.devicec                 C   rj  )zw
        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
        )r   rk  r   r   r   r     s   zModuleUtilsMixin.dtypeencoder_attention_maskc                 C   st   |  dkr|dddddddf }|  dkr&|ddddddf }|j| jd}d| t| jj }|S )z
        Invert an attention mask (e.g., switches 0. and 1.).

        Args:
            encoder_attention_mask (`torch.Tensor`): An attention mask.

        Returns:
            `torch.Tensor`: The inverted attention mask.
           Nr  r         ?)dimr  r   r   finfomin)rh  rm  encoder_extended_attention_maskr   r   r   invert_attention_mask  s   
z&ModuleUtilsMixin.invert_attention_maskNc           	      C   s   |d urt dt n|j}| \}}tj||d}|d d d d f ||d|d d d d f k}||j}|j	d |j	d k r]|j	d |j	d  }tj
tj|||f||jd|gdd}|d d d d d d d f |d d d d d d f  }|S )NNThe `device` argument is deprecated and will be removed in v5 of Transformers.)r   r   r   r   r   axis)warningswarnFutureWarningr   r   arangerepeatr  r   rI  catones)	input_shapeattention_maskr   
batch_size
seq_lengthseq_idscausal_maskprefix_seq_lenextended_attention_maskr   r   r   *create_extended_attention_mask_for_decoder  s&   .4z;ModuleUtilsMixin.create_extended_attention_mask_for_decoderr  r  r   r   c                 C   s   |du r| j }| dkr| jjs|durtdt | dkr0|dddddddf }n+| dkrO| jjrBt|||}n|ddddddf }nt	d| d|j
 d|j|d}d	| t|j }|S )
a  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nr  rv  rn  z!Wrong shape for input_ids (shape z) or attention_mask (shape )ro  rp  )r   rq  r  
is_decoderrz  r{  r|  r   r  r   rI  r  r   rr  rs  )rh  r  r  r   r   r  r   r   r   get_extended_attention_mask  s*   	z,ModuleUtilsMixin.get_extended_attention_maskF	head_masknum_hidden_layersis_attention_chunkedc                 C   s8   |dur|  ||}|du r|d}|S dg| }|S )a  
        Prepare the head mask if needed.

        Args:
            head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
                The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
            num_hidden_layers (`int`):
                The number of hidden layers in the model.
            is_attention_chunked (`bool`, *optional*, defaults to `False`):
                Whether or not the attentions scores are computed by chunks or not.

        Returns:
            `torch.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with
            `[None]` for each layer.
        NTr   )_convert_head_mask_to_5d	unsqueeze)rh  r  r  r  r   r   r   get_head_maskF  s   

zModuleUtilsMixin.get_head_maskc                 C   s   |  dkr|dddd}||dddd}n|  dkr/|ddd}|  dks>J d|   |j| jd}|S )zD-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]r   r   r   r     zhead_mask.dim != 5, instead ro  )rq  r  expandr  r   )rh  r  r  r   r   r   r  a  s   z)ModuleUtilsMixin._convert_head_mask_to_5donly_trainableexclude_embeddingsc           	         s   |rdd |   D   fdd|  D }nt|  }g }t| dd}|r3t r/ddl}ntd|D ]9}|js<|sn|rgt	||j
jrgt|d	rO| }nt|d
rY|jj}nd}|| d |  q5||  q5t|S )a  
        Get number of (optionally, trainable or non-embeddings) parameters in the module.

        Args:
            only_trainable (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of trainable parameters

            exclude_embeddings (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of non-embeddings parameters

        Returns:
            `int`: The number of parameters.
        c                 S   s&   g | ]\}}t |tjr| d qS ).weight)r\  r   	Embedding)r   r   module_typer   r   r   r   |  s
    z3ModuleUtilsMixin.num_parameters.<locals>.<listcomp>c                    s   g | ]
\}}| vr|qS r   r   )r   r   r   embedding_param_namesr   r   r     s    is_loaded_in_4bitFr   Nzbitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. ru  quant_storager   r  )rj  r  r  r   r}  rR   bitsandbytesr   r  r\  r   
Params4bitr  ru  r  itemsizer  rJ  sum)	rh  r  r  total_parameterstotal_numelr  bnbr  	num_bytesr   r  r   num_parametersl  s8   






zModuleUtilsMixin.num_parameters
input_dictc                 C   sJ   t | dsi | _| j|v r|| j  S d| jvr#td d| jd< dS )z
        Helper function to estimate the total number of tokens from the model inputs.

        Args:
            inputs (`dict`): The model inputs.

        Returns:
            `int`: The total number of tokens.
        warnings_issuedestimate_tokenszdCould not estimate the number of tokens of the input, floating-point operations will not be computedTr   )r  r  main_input_namerJ  r  r  )rh  r  r   r   r   r    s   




z ModuleUtilsMixin.estimate_tokensTc                 C   s   d|  | | j|d S )a  
        Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
        batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
        tokens (valid if `12 * d_model << sequence_length`) as laid out in [this
        paper](https://arxiv.org/pdf/2001.08361.pdf) section 2.1. Should be overridden for transformers with parameter
        re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.

        Args:
            batch_size (`int`):
                The batch size for the forward pass.

            sequence_length (`int`):
                The number of tokens in each line of the batch.

            exclude_embeddings (`bool`, *optional*, defaults to `True`):
                Whether or not to count embedding and softmax operations.

        Returns:
            `int`: The number of floating-point operations.
           )r  )r  r  )rh  r  r  r   r   r   floating_point_ops  s   z#ModuleUtilsMixin.floating_point_opsr   NNFFFT)r  rS  rT  __doc__staticmethodr`  rc  ri  rf  propertyr   r   r   r   ru  r  r   r   r   r  r   boolr  r  r  r   r   r   r
   r  r  r   r   r   r   r     s`    

	
5
$6c                !       sX  e Zd ZdZdZdZdZdZdZdZ	dZ
dZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZedeeej f fddZ!edefd	d
Z"de#f fddZ$dd Z%dd Z&dd Z'de(e)e ef ddfddZ*e+e,dd Z-e+				dde.de/ej0 de/e(eeee1f f  de.fddZ2e+d ej0dej0fd!d"Z3ede4j5fd#d$Z6e+de.fd%d&Z7e+				dde/ej0 de/e(eeee1f f  de.d'e.de#f
d(d)Z8e+dd'e.de#fd*d+Z9e+dd'e.de#fd,d-Z:d.d/ Z;d0d1 Z<de4j5fd2d3Z=d4e4j5fd5d6Z>de4j5fd7d8Z?d9d: Z@d;d< ZAd=d> ZBeCd?e4j5d@e4j5dAedBefdCdDZDdEdF ZEdefdGdHZF			ddIe/e1 dJe/e1 dKe.de4jGfdLdMZHddNdOZI			ddPe4jGdIe/e1 dJe/e1 dKe.de4jGf
dQdRZJ			ddSe4jKdIe/e1 dTe/e. dKe.de4jKf
dUdVZLdWdX ZM	ddYdZZNd[d\ ZOd]d^ ZPd_e1fd`daZQde(e4jGeRe4jG f fdbdcZSddde ZTdfee1e)e1 f fdgdhZUddidjZVdeWfdke.dleXfdmdnZYdodp ZZede.fdqdrZ[ddej\ddsddddf	dte(ee]j^f due.dve/e_ dweXdxe.dye(e1ef dze.d{e/e d|e/e(ee.f  d}e.fd~dZ`eaebjc fddZcdddZdeaej4j5je fddZeeaej4j5jf fddZf fddZg fddZhe+de.de.fddZie+e,dddddddddd	dejek de/e(ee]j^f  de/e(e#ee]j^f  de/e(ee]j^f  de.de.de.d|e/e(ee.f  dede/e. de.dekfddZleCdedeRee.f fddZm			dde)e de/eeef  de.de.fddZneCdeRee.f fddZodd Zpe+											ddd dve/e de/e)e  de/e de.de/e de/e de/e de/e. d e/ej0 de/eq de/erjs de/d de/eeef  de.fddZte+dd Zue+dd ZvdddZwe+dddZxdddZydd ZzddÄ Z{eddń Z|eddǄ Z}eddɄ Z~e~jddɄ Z~defdd̈́Ze+ddτ Zde)e de)e d e/ej0 de/eq dd f
ddӄZde)e de.de.dd fddքZdefddلZ  ZS )r   a  
    Base class for all models.

    [`PreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
    downloading and saving models as well as a few methods common to all models to:

        - resize the input embeddings,
        - prune heads in the self-attention heads.

    Class attributes (overridden by derived classes):

        - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
          for this model architecture.
        - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model,
          taking as arguments:

            - **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the TensorFlow checkpoint.
            - **config** ([`PreTrainedConfig`]) -- An instance of the configuration associated to the model.
            - **path** (`str`) -- A path to the TensorFlow checkpoint.

        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
          classes of the same architecture adding modules on top of the base model.
        - **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization.
        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
          models, `pixel_values` for vision models and `input_values` for speech models).
    Nrd  	input_idsFr   c                 C   s   dt tiS )z^
        `Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
        r  )r   rq  rA   rk  r   r   r   dummy_inputs3     zPreTrainedModel.dummy_inputsc                 C      dS )z@
        :str: Identifies that this is a PyTorch model.
        rG  r   rk  r   r   r   rI  :  s   zPreTrainedModel.frameworkr  c                    s  t    t|tstd| jj d| jj dt|dds3t|dr'|j	nt
 }| j||dd}|| _| jj}|tvr\dd	t d
}t|| jj}t|dkrZ|d }nd }|| _|j| _i | _|  rot|nd | _t| jj| _| jp~g | _d S )NzParameter config in `zt(config)` should be an instance of class `PretrainedConfig`. To create a model from a pretrained model use `model = z(.from_pretrained(PRETRAINED_MODEL_NAME)`_attn_implementation_autosetFr  )r  check_device_map(r  r  r   )super__init__r\  r!   r   r  r  r}  r  r  r   r   _autoset_attn_implementationr  r2   r  r  findallr  	loss_typename_or_pathr  can_generater$   from_model_configgeneration_configcopy_keep_in_fp32_modules_no_split_modules)rh  r  rQ  r   r   r  loss_groupsr  r   r   r  A  s0   


zPreTrainedModel.__init__c                    sN  |    |   | jdur@dd |  D }t }|D ]}|dd |dD  q| jD ]}||vr?t| d| jj	 q.| j
jdurL| j
j nd| _| j
jdur[| j
j ni | _|  D ]\ }t|dd }r| j fd	d
|  D  qb| jdurtdr| j D ]\}}|tvrtd| dt qdS dS dS )z
        A method executed at the end of each Transformer model initialization, to execute code that needs the model's
        modules properly initialized (such as weight initialization).
        Nc                 S   s    h | ]\}}t |d kr|qS r   r  r  r   r   r   rh  n  r-  z,PreTrainedModel.post_init.<locals>.<setcomp>c                 S   s    g | ]}|  s|d vr|qS ))r  bias)	isnumericr   r   r   r   r   r   s  r-  z-PreTrainedModel.post_init.<locals>.<listcomp>r   zV was specified in the `_keep_in_fp32_modules` list, but is not part of the modules in _tp_planc                        i | ]\}}  d | |qS re  r   r   r   r   r   r    r-  z-PreTrainedModel.post_init.<locals>.<dictcomp>z2.3z"Unsupported tensor parallel style z. Supported styles are )init_weights._backward_compatibility_gradient_checkpointingr  r  r  r  splitr   r  r  r  base_model_pp_planr  _pp_planbase_model_tp_planr  r  r}  r   rZ   r0   )rh  all_parametersunique_module_namesr  r   planr  r   r   r  r   	post_initd  s@   

"zPreTrainedModel.post_initc                 C   s&   t | dd}|du rtd|| S )z
        Potentially dequantize the model in case it has been quantized by a quantization method that support
        dequantization.
        r  Nz?You need to first quantize your model in order to dequantize it)r}  r   
dequantize)rh  r  r   r   r   r    s   
zPreTrainedModel.dequantizec                 C   s4   | j rt| jddr|   t| jd d S d S d S )Ngradient_checkpointingF)supports_gradient_checkpointingr}  r  gradient_checkpointing_enabledelattrrk  r   r   r   r    s   z>PreTrainedModel._backward_compatibility_gradient_checkpointingtagsc                 C   sD   t |tr|g}| jdu rg | _|D ]}|| jvr| j| qdS )a\  
        Add custom tags into the model that gets pushed to the Hugging Face Hub. Will
        not overwrite existing tags in the model.

        Args:
            tags (`Union[List[str], str]`):
                The desired tags to inject in the model

        Examples:

        ```python
        from transformers import AutoModel

        model = AutoModel.from_pretrained("google-bert/bert-base-cased")

        model.add_model_tags(["custom", "custom-bert"])

        # Push the model to your namespace with the name "my-custom-bert".
        model.push_to_hub("my-custom-bert")
        ```
        N)r\  r   
model_tagsr  )rh  r  tagr   r   r   add_model_tags  s   


zPreTrainedModel.add_model_tagsc           	      K   s  | d|j}t|trtt|}| dd}d}|dur"| |}t|}|j	dur0|j	}nd}| d||_
t|ddsH| j||d|d}t r|ts|ts|td tjjt d	t g}t| | |fi |}W d   n1 svw   Y  n| |fi |}|durt| |S )
z
        All context managers that the model should be initialized under go here.

        Args:
            torch_dtype (`torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model under this dtype.
        r  use_flash_attention_2FNattn_implementationr  )r  r  r  @Detected DeepSpeed ZeRO-3: activating zero.init() for this modelconfig_dict_or_path)r  r  r\  r   r}  r   r	  r  deepcopy_attn_implementation_internal_attn_implementationr  r(   r   r   r  r  	deepspeedzeroInitr'   r   rI   r   )	r
  r  r   r  r  r  r  init_contextsr  r   r   r   _from_config  s<   







zPreTrainedModel._from_configTr  r  r  r  c                 C   s  d}t |drQ|jdurQ|jdkr|rtd|j dt|jtsN|jdgt  vrNd|j d}| jr:|d	7 }| j	rA|d
7 }| j
rH|d7 }t|d |j}|j D ]}t||}	t|tsd|n||d}
|	durq|
|	_qV|r|td d|_|jdkr| j|||d|d n`|dkr| j|dd}nT|dv rt s| j||du rdndd}tjjdur|jdkrtj dkrttjtdk rtd tjjd n|t v r||_nt|trd|_nd|_d|_|S )az  
        Automatically checks and dispatches to a default attention implementation. In order of priority:
            1. An implementation specified in `config._attn_implementation` (due for example to the argument attn_implementation="sdpa" in from_pretrained).
            2. DEPRECATED: if use_flash_attention_2 is set to `True` and `flash_attn` is available, flash attention. (`LlamaFlashAttention` for example)
            3. SDPA implementation, if available and supported by the model type. (`LlamaSdpaAttention` for example)
            4. The default model's implementation otherwise (`LlamaAttention` for example) .
        Nr  flash_attention_2zBoth attn_implementation="z" and `use_flash_attention_2=True` were used when loading the model, which are not compatible. We recommend to just use `attn_implementation="flash_attention_2"` when loading the model.eagerz Specified `attn_implementation="zt"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)zT, `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)zf, `"attn_implementation=sdpa"` (implementation using torch.nn.functional.scaled_dot_product_attention)zV, `"attn_implementation=flex_attention"` (implementation using torch's flex_attention)r   zThe model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.F)r  r  hard_check_onlyr  flex_attentionT)r  )Nsdpar  r   z2.4.1zUsing the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.)r  r  r  r   r\  r  ALL_ATTENTION_FUNCTIONS
valid_keys_supports_flash_attn_2_supports_sdpa_supports_flex_attnr  r  r}  r   r  warning_once_check_and_enable_flash_attn_2_check_and_enable_flex_attnr^   _check_and_enable_sdpar   r   hipcudadevice_countr]  r   backendsenable_flash_sdpr  )r
  r  r  r  r  r  requested_attn_implementationmessager   r  curr_attn_implementationr   r   r   r    s   





z,PreTrainedModel._autoset_attn_implementationr   c                 C   sN   |j std| j d| dtd| j d| d t }t| |S )a  
        Change the default dtype and return the previous one. This is needed when wanting to instantiate the model
        under specific dtype.

        Args:
            dtype (`torch.dtype`):
                a floating dtype to set to.

        Returns:
            `torch.dtype`: the original `dtype` that can be used to restore `torch.set_default_dtype(dtype)` if it was
            modified. If it wasn't, returns `None`.

        Note `set_default_dtype` currently only works with floating-point types and asserts if for example,
        `torch.int64` is passed. So if a non-float `dtype` is passed this functions will throw an exception.
        zCan't instantiate z model under dtype=z' since it is not a floating point dtypezInstantiating z model under default dtype r   )r   r   r  r  r  r   r   r   )r
  r   r  r   r   r   r	  a  s   
z(PreTrainedModel._set_default_torch_dtypec                 C   s   t | | j| S )z@
        `torch.nn.Module`: The main body of the model.
        )r}  r8  rk  r   r   r   
base_model|  r  zPreTrainedModel.base_modelc                 C   sl   dt | jv r	dS | jD ]}t|dsqdt |vr!| r! dS qdt | jvr4t| j d dS dS )a  
        Returns whether this model can generate sequences with `.generate()` from the `GenerationMixin`.

        Under the hood, on classes where this function returns True, some generation-specific changes are triggered:
        for instance, the model instance will have a populated `generation_config` attribute.

        Returns:
            `bool`: Whether this model can generate sequences with `.generate()`.
        r%   Tr  r   u:   has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.F)r   	__bases__r  r  prepare_inputs_for_generationr  r  r  )r
  baser   r   r   r    s   


zPreTrainedModel.can_generater  c           
      C   s  | j st| j d|j dt sd}d}tjddu r6t r-|s&d|_	t
d |S t| d	| ttjd}tjjrk|td
k rVt| d| d| tj sbt| dt| d| tjjr|tdk rt| d| d| t| d| t| dd}	|	rtd|du rt
d n|dur|tjtjfvrt
d| j d| d |r|du rtdjjdvrtj rt
d n(t rt
d ntd|r|durt|trd| v sd| v rtd|sd|_	|S ) a9  
        Checks the availability of Flash Attention 2 and compatibility with the current model.

        If all checks pass and `hard_check_only` is False, the method will set the config attribute `attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module.
        z does not support Flash Attention 2.0 yet. Please request to add support where the model is hosted, on its model hub page: https://huggingface.co/zk/discussions/new or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/newzVFlashAttention2 has been toggled on, but it cannot be used due to the following error:zPlease refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
flash_attnNr  z+Detect using FlashAttention2 on Ascend NPU.z3 the package flash_attn seems to be not installed. r?  zY you need flash_attn package version to be greater or equal than 2.1.0. Detected version z. z\ Flash Attention 2 is not available on CPU. Please make sure torch can access a CUDA device.z% Flash Attention 2 is not available. z2.0.4z you need flash_attn package version to be greater or equal than 2.0.4. Make sure to have that version installed - detected version use_bettertransformerFzFlash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing model.reverse_bettertransformer()zwYou are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviourzcFlash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in z is aG  . You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`r   )r  mluzYou are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.zYou are attempting to use Flash Attention 2.0 with a model not initialized on MLU. Make sure to move the model to MLU after initializing it on CPU with `model.to('mlu')`.a-  You are attempting to use Flash Attention 2.0 with a model not initialized on GPU and with no GPU available. This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then moving it to GPU.r  r  zYou are attempting to use Flash Attention 2.0 with a model dispatched on CPU or disk. This is not supported. Please make sure to initialise the model on a GPU by passing a device_map that contains only GPU devices as keys.)r  r   r  _name_or_pathrS   	importlibutil	find_specr\   r  r  r  rX  r   r]  rS  r   r  r   r   r}  r  float16r   rX  r   r  r[   r\  r  r   )
r
  r  r  r  r  r  prefaceinstall_messageflash_attention_version_is_bettertransformerr   r   r   r    s   


z.PreTrainedModel._check_and_enable_flash_attn_2c                 C   sZ   |r| j st| j dt stdt r| j s|S t| dd}|r&|S |s+d|_|S )a	  
        Checks the availability of SDPA for a given model.

        If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "sdpa" so that the model can initialize the correct attention module.
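        Example (illustrative only; assumes torch>=2.1.1 and the `openai/whisper-tiny` checkpoint):

        ```python
        from transformers import AutoModel

        # Use torch.nn.functional.scaled_dot_product_attention under the hood.
        model_sdpa = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="sdpa")

        # Fall back to the manual attention implementation instead.
        model_eager = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")
        ```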
        a   does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet. Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`zSPyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1.r  Fr  )r  r   r  r]   rX  r}  r  )r
  r  r  r  r   r   r   r  	  s"   
z&PreTrainedModel._check_and_enable_sdpac                 C   sF   |r| j st| j dt stdt r| j s|S |s!d|_|S )a  
        Checks the availability of Flex Attention for a given model.

        If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flex_attention" so that the model can initialize the correct attention module.
        a   does not support an attention implementation through torch's flex_attention. Please request the support for this architecture: https://github.com/huggingface/transformers/issues/34809. If you believe this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`z]PyTorch Flex Attention requirements in Transformers are not met. Please install torch>=2.5.0.r  )r  r   r  rY   rX  r  )r
  r  r  r   r   r   r  3	  s   
z+PreTrainedModel._check_and_enable_flex_attnc                 C   s   dd }|   || _dS )z
        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
        the model weights fixed.
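        Example (a sketch of the typical adapter fine-tuning setup; the `gpt2` checkpoint is only an assumption):

        ```python
        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained("gpt2")

        # Freeze the base weights, e.g. before attaching adapter layers.
        for param in model.parameters():
            param.requires_grad = False

        # Gradient checkpointing needs gradients flowing from the inputs, so re-enable
        # them on the input embeddings even though the embedding weights stay frozen.
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()

        # ... train the adapter parameters ...

        model.disable_input_require_grads()
        ```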
        c                 S   s   | d d S NT)requires_grad_)r   inputoutputr   r   r   make_inputs_require_gradsV	  s   zMPreTrainedModel.enable_input_require_grads.<locals>.make_inputs_require_gradsN)get_input_embeddingsre  _require_grads_hook)rh  r  r   r   r   enable_input_require_gradsP	  s   z*PreTrainedModel.enable_input_require_gradsc                 C   s   | j   dS )z4
        Removes the `_require_grads_hook`.
        N)r  removerk  r   r   r   disable_input_require_grads[	  s   z+PreTrainedModel.disable_input_require_gradsc                 C   s"   t | | j| }|| ur| S t)z
        Returns the model's input embeddings.

        Returns:
            `nn.Module`: A torch module mapping vocabulary to hidden states.
        )r}  r8  r  NotImplementedError)rh  r  r   r   r   r  a	  s   z$PreTrainedModel.get_input_embeddingsr  c                 C   s(   t | | j| }|| ur|| dS t)z
        Set model's input embeddings.

        Args:
            value (`nn.Module`): A module mapping vocabulary to hidden states.
        N)r}  r8  set_input_embeddingsr"  )rh  r  r  r   r   r   r#  n	  s   z$PreTrainedModel.set_input_embeddingsc                 C   r  )z
        Returns the model's output embeddings.

        Returns:
            `nn.Module`: A torch module mapping hidden states to vocabulary.
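        Example (illustrative; assumes the `bert-base-uncased` checkpoint):

        ```python
        import torch.nn as nn
        from transformers import AutoModel

        model = AutoModel.from_pretrained("bert-base-uncased")

        embeddings = model.get_input_embeddings()  # an nn.Embedding of shape (vocab_size, hidden_size)
        print(embeddings.weight.shape)

        # Swap in a new, identically sized embedding table.
        model.set_input_embeddings(nn.Embedding(embeddings.num_embeddings, embeddings.embedding_dim))

        # Encoder-only models have no LM head, so this returns None here.
        output_embeddings = model.get_output_embeddings()
        ```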
        Nr   rk  r   r   r   get_output_embeddings{	     z%PreTrainedModel.get_output_embeddingsc                 C   r  )a]  
        Initialize the weights. This method should be overridden by the derived class and is
        the only initialization method that will be called when loading a checkpoint
        using `from_pretrained`. Any attempt to initialize outside of this function
        will be useless, as the torch.nn.init functions are all replaced with no-ops during loading.
        Nr   rg  r   r   r   r   	  r%  zPreTrainedModel._init_weightsc                 C   s$   t |ddrdS | | d|_dS )zM
        Initialize the weights if they are not already initialized.
        rl  FNT)r}  r   rl  rg  r   r   r   _initialize_weights	  s   

z#PreTrainedModel._initialize_weightsc                 C   s   t | jjddddr|  }|dur| ||   t | jddrCt | jddrCt| | jr5t | | j} | | j	| j
| jd}|| _|  D ]}t|d	rR|  qGdS )
z
        Tie the weights between the input embeddings and the output embeddings.

        If the `torchscript` flag is set in the configuration, TorchScript can't handle parameter sharing, so we clone the
        weights instead.
        Tdecodertie_word_embeddingsNis_encoder_decoderFtie_encoder_decoderencoder_tie_weights)r}  r  get_text_configr$  _tie_or_clone_weightsr  r  r8  _tie_encoder_decoder_weightsr,  r(  r|  r  r-  )rh  output_embeddingstied_weightsr   r   r   r   tie_weights	  s    
zPreTrainedModel.tie_weightsr,  r(  r8  base_encoder_namec                    s   g }g |j | j krt|j  d| j  d 			ddtjdtjdtdtd	tt f
 fd
d  || ||| t|dkrGtd|  S )N and zZ are not equal. In this case make sure that all encoder weights are correctly initialized.r   rd  decoder_pointerencoder_pointerrg  r4  uninitialized_encoder_weightsc                    s  t | tjrt |tjsJ |  d| dt| drLt|ds"J | j|_| | d t| drJt|ds<J | | d | j|_d S |j}| j}	t|	dkrt|dkshJ d| d	|   fd
d|	 D }
d}|	
 D ]e\}}| rtt|| }|}t |	| t|| st|t|	kr|d8 }qyn||vrqy|dkrtd| }}|	| ||  d | |||d | d| | d| d |
 d |  qy|t|
7 }d S d S )Nr5  z have to be of type nn.Moduler  r  r  z.biasr   zEncoder module z does not match decoder module c                    s   h | ]} d  | qS )/r   )r   sub_namerf  r   r   rh  	  r   zkPreTrainedModel._tie_encoder_decoder_weights.<locals>.tie_encoder_to_decoder_recursively.<locals>.<setcomp>r   i  zMax depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model.r9  r   )depthtotal_encoder_nametotal_decoder_name)r\  r   r   r  r  r  r  _modulesr  r  r   isdigitr   r   r  r   r   r  )r6  r7  rg  r4  r8  r;  r=  r<  encoder_modulesdecoder_modulesall_encoder_weightsencoder_layer_posr   r   encoder_namedecoder_name"tie_encoder_to_decoder_recursivelyr2  rf  r   rG  	  sh   




zXPreTrainedModel._tie_encoder_decoder_weights.<locals>.tie_encoder_to_decoder_recursivelyz;The following encoder weights were not tied to the decoder )r   rd  rd  )	r  r  r  r   r   r   r   r  r  )r,  r(  r8  r4  r8  r   rF  r   r0  	  s8   D
z,PreTrainedModel._tie_encoder_decoder_weightsc                 C   s   | j jrt|j |_n|j|_t|dddur3tj|j	j
d|jjd |j	jd  fdd|j	_
t|drCt|drE|j|_dS dS dS )zPTie or clone module weights depending of whether we are using TorchScript or notr  Nr   constantout_featuresnum_embeddings)r  torchscriptr   	Parameterr  cloner}  
functionalpadr  r  rI  r  rJ  rI  )rh  r1  input_embeddingsr   r   r   r/  
  s   	z%PreTrainedModel._tie_or_clone_weightsc                 C   s   t  }| g}t|dkrB|d}|jj|vr<t|tr4|jdu r-t|jj d| d|t |jB }|t	|
 7 }t|dkst	|S )a  
        Get the modules of the model that should not be split when using device_map. We iterate through the modules to
        get the underlying `_no_split_modules`.

        Args:
            device_map (`str`):
                The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"]

        Returns:
            `List[str]`: List of modules that should not be split
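        Example (illustrative; requires the `accelerate` package and assumes the `gpt2` checkpoint):

        ```python
        from transformers import AutoModelForCausalLM

        # Modules listed in `_no_split_modules` are kept on a single device when the map is computed.
        model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto")
        print(model.hf_device_map)
        ```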
        r   r   Nz does not support `device_map='z_'`. To implement support, the model class needs to implement the `_no_split_modules` attribute.)r  r  r  r  r  r\  r   r  r   r  children)rh  r  r  modules_to_checkr   r   r   r   r  
  s   


z%PreTrainedModel._get_no_split_modulesnew_num_tokenspad_to_multiple_ofmean_resizingc                 C   s   |  |||}|du r|du r|S t| do| jdu}t r@|s@tjj|jdd |jjd }W d   n1 s:w   Y  n|jjd }|| j	
 _|| _|   |S )a$	  
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set, will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                `None`, will just pad the embedding to a multiple of `pad_to_multiple_of`.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

        Return:
            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
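        Example (a sketch; the added token and the `gpt2` checkpoint are assumptions):

        ```python
        from transformers import AutoModelForCausalLM, AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        model = AutoModelForCausalLM.from_pretrained("gpt2")

        tokenizer.add_tokens(["<custom_token>"])
        embeddings = model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
        print(embeddings.num_embeddings)  # vocabulary size padded up to a multiple of 64
        ```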
        Nr  modifier_rankr   )_resize_token_embeddingsr  r  r(   r  r  GatheredParametersr  rI  r  r.  
vocab_sizer3  )rh  rS  rT  rU  model_embedsrE  rZ  r   r   r   resize_token_embeddings<
  s   $
z'PreTrainedModel.resize_token_embeddingsc                 C   sR  |   }| ||||}t|dr|j}t|| |jj}|| | | t| do0| j	d u}|d ur`t
 rZ|sZtjj|jd d |jjd }W d    n1 sTw   Y  n|jjd }|  d ur| jjddjs|  }	t|	tjjr| j|	||d}
n| j|	||d}
t|	dr|	j}t|
| |	jj}|
| | |
 |   S )N_hf_hookr  rV  r   Tr'  )rU  )r  _get_resized_embeddingsr  r]  ro   r  r  r  r#  r  r(   r  r  rY  rI  r$  r  r.  r)  r\  r   r   r  _get_resized_lm_headset_output_embeddings)rh  rS  rT  rU  old_embeddingsnew_embeddingshookold_embeddings_requires_gradrE  old_lm_headnew_lm_headold_lm_head_requires_gradr   r   r   rX  u
  s@   








z(PreTrainedModel._resize_token_embeddingsra  c                 C   sl  |dur&t |tstd| d|du r|jjd }|| d | | }n	td| d |du r5|S t| do>| jdu}t	 re|set
jj|jdd	 |j \}}W d   n1 s_w   Y  n|j \}}||krut	 su|S t |tjstd
t| dtj dtj dtj|||jj|jjd}||kr|s| | nA||kr|rtd || }	t	 r|st
jj|jgdd	 | |||||	 W d   n1 sw   Y  n	| |||||	 t||}
t	 r)|s)|j|jg}t
jj|dd	 |jjd|
ddf |jjd|
ddf< W d   n	1 s#w   Y  n|jjd|
ddf |jjd|
ddf< t	 r|s|j|jg}t
jj|dd	6 |j|_|jjjd |_|jdurz|d |jk rd|_W d   |S W d   |S W d   |S 1 sw   Y  |S |jj|j_|jjjd |_|jdur|d |jk rd|_|S )a	  
        Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
        initialized vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_embeddings (`torch.nn.Embedding`):
                Old embeddings to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the embedding matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set, will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                `None`, will just pad the embedding to a multiple of `pad_to_multiple_of`.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html


        Return:
            `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
            `new_num_tokens` is `None`
        Nz5Asking to pad the embedding matrix to a multiple of `z@`, which is not and integer. Please make sure to pass an integerr   r   zYou are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be a.  . This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tcr  rV  zOld embeddings are of type , which is not an instance of zj. You should either use a different resize function or make sure that `old_embeddings` are an instance of r   rw  zThe new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`)r\  r   r   r  rI  r  r  r  r  r(   r  r  rY  rO  r   r  	TypeErrorr  r   r   r   r  (_init_added_embeddings_weights_with_meanrs  r  rJ  padding_idx)rh  ra  rS  rT  rU  rE  old_num_tokensold_embedding_dimrb  added_num_tokensr.  paramsr   r   r   r^  
  s   +






.,


z'PreTrainedModel._get_resized_embeddingsre  
transposedc              	   C   s|  |du r|S t | do| jdu}t r?|s?tjj|jdd |s&|j n|j  \}}W d   n1 s9w   Y  n|sF|j n|j  \}}||krXt sX|S t	|t
jsptdt| dt
j dt
j d|sv||fn||f}|jdu}	t
j||	|jj|jjd}
||kr|s| |
 n_||kr|rtd	 || }t r|s|jg}|	r||jg7 }tjj|dd | ||
|||| |	r| ||
| W d   n1 sw   Y  n| ||
|||| |	r| ||
| t||}t r3|s3|j|j|
j|
jg}tjj|d
d | |
||||	 W d   |
S 1 s,w   Y  |
S | |
||||	 |
S )a  
        Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_lm_head (`torch.nn.Linear`):
                Old lm head linear layer to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the linear matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `torch.nn.Linear` module of the model without doing anything.
            transposed (`bool`, *optional*, defaults to `False`):
                Whether `old_lm_head` is transposed or not. If `True`, `old_lm_head.size()` is `lm_head_dim,
                vocab_size`, else `vocab_size, lm_head_dim`.
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

        Return:
            `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is
            `None`
        Nr  rV  z#Old language model head is of type rh  zg. You should either use a different resize function or make sure that `old_lm_head` are an instance of r   )r  r   r   a  The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`r   )r  r  r(   r  r  rY  r  rO  r   r\  r   Linearri  r  r  r   r   r   r  r  %_init_added_lm_head_weights_with_mean"_init_added_lm_head_bias_with_meanrs  !_copy_lm_head_original_to_resized)rh  re  rS  rp  rU  rE  rl  old_lm_head_dimnew_lm_head_shapehas_new_lm_head_biasrf  rn  ro  num_tokens_to_copyr   r   r   r_  3  s   #





		
z$PreTrainedModel._get_resized_lm_headc                 C   s   |j jtj}tj|dd}|| }|j| | }	d}
tj	|
|	 
 }|rMtjjj||
|	 d}|j|fd|j j|j jd| d d d f< d S |d d d f |d|j j|j jd| d d d f< d S )Nr   rx  &.>)covariance_matrix)sample_shaper   r   )r  r  r  r   r   meanTr   positive_definitecheckalldistributionsmultivariate_normalMultivariateNormalsampler   r~  )rh  ra  rb  rm  rl  rn  old_embeddings_weightmean_embeddingsold_centered_embeddings
covarianceepsilonis_covariance_psddistributionr   r   r   rj    s"   
 z8PreTrainedModel._init_added_embeddings_weights_with_meanc                 C   sZ   |r|j jj|j _|j jj|j _| ||||| |r+|j jj|j _|j jj|j _d S d S r   )r  r  r}  rj  )rh  re  rf  ru  rl  rn  rp  r   r   r   rr    s   	
z5PreTrainedModel._init_added_lm_head_weights_with_meanc                 C   sV   t j|jjdt jd}t j|jjddt j}|jjd| d  j|d| d d S )Nr   )ry  r   rx  r   ry  )r|  std)r   r|  r  r  r   r  r  r   )rh  re  rf  rn  	bias_meanbias_stdr   r   r   rs    s   &z2PreTrainedModel._init_added_lm_head_bias_with_meanc                 C   s   |s|j jd |d d f |j jd |d d f< n|j jd d d |f |j jd d d |f< |rA|jjd | |jjd |< d S d S r   )r  r  r  )rh  rf  re  rx  rp  rw  r   r   r   rt    s   ., z1PreTrainedModel._copy_lm_head_original_to_resizednew_num_position_embeddingsc                 C   $   t d| j d| j d| jj d)Nz4`resize_position_embeddings` is not implemented for B`. To implement it, you should overwrite this method in the class  in `modeling_.py`r"  r  rS  )rh  r  r   r   r   resize_position_embeddings     
z*PreTrainedModel.resize_position_embeddingsc                 C   r  )Nz1`get_position_embeddings` is not implemented for r  r  r  r  rk  r   r   r   get_position_embeddings  r  z'PreTrainedModel.get_position_embeddingsc                 C   s6   | j jr| | j j tr| | j |   dS dS )z
        If needed, prunes and maybe initializes weights. If using a custom `PreTrainedModel`, you need to implement any
        initialization logic in `_init_weights`.
        N)r  pruned_headsprune_headsr   applyr&  r3  rk  r   r   r   r    s   zPreTrainedModel.init_weightsheads_to_prunec                 C   sN   |  D ]\}}t| jj|g t|B }t|| jj|< q| j| dS )a  
        Prunes heads of the base model.

        Arguments:
            heads_to_prune (`Dict[int, List[int]]`):
                Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads
                to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on
                layer 1 and heads 2 and 3 on layer 2.
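        Example (illustrative; assumes the `bert-base-uncased` checkpoint):

        ```python
        from transformers import AutoModel

        model = AutoModel.from_pretrained("bert-base-uncased")
        # Prune heads 0 and 2 of layer 1, and heads 2 and 3 of layer 2.
        model.prune_heads({1: [0, 2], 2: [2, 3]})
        print(model.config.pruned_heads)
        ```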
        N)r   r  r  r  r   r  r  _prune_heads)rh  r  layerheadsunion_headsr   r   r   r    s   zPreTrainedModel.prune_headsc                 C   s   | j st| jj d|du rddi}tjtfi |}dt| j	j
v }|s0| j	d|d n| t| j	dd td t| d	d
rK|   dS dS )az  
        Activates gradient checkpointing for the current model.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".

        We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
        the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2

        Args:
            gradient_checkpointing_kwargs (dict, *optional*):
                Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
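        Example (a minimal sketch; the `gpt2` checkpoint is an assumption):

        ```python
        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained("gpt2")
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
        print(model.is_gradient_checkpointing)  # True
        ```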
        z) does not support gradient checkpointing.Nuse_reentrantTr  )enablegradient_checkpointing_funcr  V  You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model._hf_peft_config_loadedF)r  r   r  r  	functoolsr   r   r  r   _set_gradient_checkpointingr   r  r  r  r}  r  )rh  gradient_checkpointing_kwargsr  _is_using_old_formatr   r   r   r    s   z-PreTrainedModel.gradient_checkpointing_enabler  r  c                 C   s`   d}t | dr|| _|| _d}|  D ]}t |dr"||_||_d}q|s.t| jj dd S )NFr  Tz is not compatible with gradient checkpointing. Make sure all the architecture support it by setting a boolean attribute `gradient_checkpointing` to modules of the model that uses checkpointing.)r  _gradient_checkpointing_funcr  r  r   r  r  )rh  r  r  is_gradient_checkpointing_setr   r   r   r   r  H  s    

z+PreTrainedModel._set_gradient_checkpointingc                 C   sd   | j r$dt| jjv }|s| jdd ntd | t| jdd t	| ddr0| 
  dS dS )z
        Deactivates gradient checkpointing for the current model.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".
        r  F)r  r  r  r  N)r  r  r   r  r   r  r  r  r   r}  r!  )rh  r  r   r   r   gradient_checkpointing_disable^  s   z.PreTrainedModel.gradient_checkpointing_disablec                 C   s   t dd |  D S )z
        Whether gradient checkpointing is activated for this model or not.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".
        c                 s   s     | ]}t |d o|jV  qdS )r  N)r  r  )r   mr   r   r   r1  }  s    z<PreTrainedModel.is_gradient_checkpointing.<locals>.<genexpr>)r;  r  rk  r   r   r   is_gradient_checkpointingu  s   z)PreTrainedModel.is_gradient_checkpointing5GBsave_directoryis_main_processr   save_functionpush_to_hubmax_shard_sizesafe_serializationr  r  save_peft_formatc           H   	      s  | dd}| dd}|dur tdt |	durtd|}	|	dur(|	|d< t| dd}t| d	d}|duoBt|toB|j|d
}|durU|sU|sUtd|j	j
 dd|v rctd | d}|rlt sltdtj|r}td| d dS tj|dd |r| dd}| d|tjjd }| j|fi |}| |}t| }t|}t|dd |j_|jjg|j_d|j_| j durt!| || jd |rc|s|j" }| # r
t$|dkr
td| dt% |& D ]\}}t'|j(|| t'|j|d q|j)| | # r|j()| |rct*d |j+|d}|
rFt*d i }|& D ]\}}||d | < q7|}| , }t$|dkrUtd!|d }| j-| }|)| i }|du rt.| d"rt$t/| j01 dkrd#| j01 v sd$| j01 v rtd% |2 D ]\ } d&krq|3 } | D ]}|| d|  < qq|3 }t4rt5j6j7j8D ]	\}!}"|!|}q| j9dur| j9D ]}#|#|: v r||#= q| ;|}|rt<=t>}$|& D ] \ }%t|%t?j@r	|$tA|% B  q|$tC|% B  qt.| d"r4tD| }&|&r1|&d fd'd(|$& D }'ni }'n	d)d( |$& D }'tE| }(g })t/ }*|'1 D ]5}+|(dur~d},tF|+D ]& tG fd*d+|(D }-|-r| |v r||,d7 },|,t$|+k r||*H  qWqJtI|'1 |\}.}/|/D ] |  J | < qtK|.|\}.}0|0D ]"}1|1L|*}2|2D ] | = q|1M|*}3t$|3dkr|)B|3 q|.r|)Bt/|. t$|)dkrtNd,|) d-|s|rtOntP}4tQ|4|}4n|rtRntS}4|4Td.d/Td0d1}5tU||5|d2}6d}7|6jVr|6jW|6jXd3}7tY|D ]H}8tjZ||8}9|4Td.d&Td0d&}:|8Td.d&Td0d&};t[\d4}<|8]|:r]tj|9r]|8|6j^: vr]|r]|<_|;dur]t`|9 q|6j^& }=|rntajb|=d5d6}=|=D ]o\}>}?i }@|?D ]}%||% c |@|%< ||%= qx|rtdtefd7k rtd8td d9tgh|@d&}A|@D ]}B|Ai|Bd&krq||B }tj||B|A}Aq|A}@~Atkl  |rtm|@tjZ||>d:d;id< qp||@tjZ||> qp~|7du rtjZ||4}Ct*d=|C  nJ|rtnnto}DtjZ|tQ|D|}Dtp|Dd>d?d@}Etqjr|7dAddBdC }F|Es|F W d   n	1 s)w   Y  t*dD| dEt$|6j^ dF|D d |rbtt|| ju|	|dG}G|GvtjZ|dH | jw|||||	dI dS dS )Ja  
        Save a model and its configuration file to a directory, so that it can be re-loaded using the
        [`~PreTrainedModel.from_pretrained`] class method.

        Arguments:
            save_directory (`str` or `os.PathLike`):
                Directory to which to save. Will be created if it doesn't exist.
            is_main_process (`bool`, *optional*, defaults to `True`):
                Whether the process calling this is the main process or not. Useful when in distributed training like
                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
                the main process to avoid race conditions.
            state_dict (nested dictionary of `torch.Tensor`):
                The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
                save parts of the model or if special precautions need to be taken when recovering the state dictionary
                of a model (like when using model parallelism).
            save_function (`Callable`):
                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
                needs to replace `torch.save` with another method.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            max_shard_size (`int` or `str`, *optional*, defaults to `"5GB"`):
                The maximum size for a checkpoint before being sharded. Checkpoint shards will then each be of a size
                lower than this one. If expressed as a string, it needs to be digits followed by a unit (like `"5MB"`).
                We default it to 5GB in order for models to be able to run easily on free-tier google colab instances
                without CPU OOM issues.

                <Tip warning={true}>

                If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
                which will be bigger than `max_shard_size`.

                </Tip>

            safe_serialization (`bool`, *optional*, defaults to `True`):
                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
            variant (`str`, *optional*):
                If specified, weights are saved in the format pytorch_model.<variant>.bin.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            save_peft_format (`bool`, *optional*, defaults to `True`):
                For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
                keys of the state dict of adapters need to be pre-pended with `base_model.model`. Advanced users can
                disable this behaviour by setting `save_peft_format` to `False`.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
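        Example (illustrative; the checkpoint name and target directory are assumptions):

        ```python
        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained("gpt2")
        model.save_pretrained(
            "./my_model",
            max_shard_size="2GB",       # split the weights into shards of at most ~2GB
            safe_serialization=True,    # write `safetensors` files instead of pickle-based `.bin` files
        )

        reloaded = AutoModelForCausalLM.from_pretrained("./my_model")
        ```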
        use_auth_tokenNignore_metadata_errorsFrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.V`token` and `use_auth_token` are both specified. Please set only the argument `token`.r  r  r  )r  zThe model is quantized with z and is not serializable - check out the warnings from the logger on the traceback to understand the reason why the quantized model is not serializable.save_configze`save_config` is deprecated and will be removed in v5 of Transformers. Use `is_main_process` instead.zR`safe_serialization` requires the `safetensors library: `pip install safetensors`.zProvided path (z#) should be a directory, not a fileTexist_okcommit_messagerepo_idr   r   r   r  r   zHMoving the following attributes in the config to the generation config: z. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.zhDetected adapters on the model, saving the model in the PEFT format, only adapter weights will be saved.)r   zTo match the expected format of the PEFT library, all keys of the state dict of adapters will be pre-pended with `base_model.model`.zbase_model.model.zMultiple active adapters detected, saving multiple active adapters is not supported yet. You can save adapters separately one by one by iteratively calling `model.set_adapter(adapter_name)` then `model.save_pretrained(...)`hf_device_mapr  r  z|Attempting to save a model with offloaded modules. Ensure that unallocated cpu memory exceeds the `shard_size` (5GB default)rd  c                    s,   i | ]\}}t  fd d|D r||qS )c                 3   s    | ]}| v V  qd S r   r   r  
tied_namesr   r   r1  n  s    z=PreTrainedModel.save_pretrained.<locals>.<dictcomp>.<genexpr>)r;  r   ptrr  r  r   r   r  m  s
    z3PreTrainedModel.save_pretrained.<locals>.<dictcomp>c                 S   s"   i | ]\}}t |d kr||qS )r   r  r  r   r   r   r  s  r   c                 3   s    | ]	}t | V  qd S r   r5  )r   patr  r   r   r1    s    z2PreTrainedModel.save_pretrained.<locals>.<genexpr>z8The weights trying to be saved contained shared tensors z that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing.z.binz{suffix}.binrF  z{suffix}.safetensors)filename_patternr  )rS  r   z(.*?)-\d{5}-of-\d{5}zSaving checkpoint shardsdescrx   zxYou need accelerate version to be greater or equal than 0.31 to save models with offloaded parameters. Detected version z<. Please upgrade accelerate with `pip install -U accelerate`rJ  rG  )rS  zModel weights saved in wr   r   r  )indent	sort_keys
z:The model is bigger than the maximum size per checkpoint (z) and is going to be split in z^ checkpoint shards. You can find where each parameters has been saved in the index located at )r  r  z	README.md)r  r  )xr  rz  r{  r|  r   r}  r\  r;   is_serializabler  r  rX   rX  r   r
  r  r  errormakedirsr  sep_create_repo_get_files_timestampsunwrap_modelr   r   r  r  r  r  architecturesr  _auto_classr"   &_get_non_default_generation_parametersr  r  UserWarningr   r   r  save_pretrainedr  get_adapter_state_dictactive_adapterspeft_configr  r  r  r   rj  r   IS_SAGEMAKER_MP_POST_1_10smpstatemodule_managertranslate_functions_keys_to_ignore_on_saver  _fix_state_dict_keys_on_saver  r   r  r   r   r6   r  idr)   r  r  r;  r  r  rM  r  intersection
differencer  rD   rH   r  r>   r?   replacer   r  rS  tensor_to_filenamelistdirr  r  compiler,  filename_to_tensors	fullmatchr   r_   tqdmr  accelerate_versionr   r]  r  fromkeysr   ry   r  r  safe_save_filerC   rG   r  r  dumpswriterb   r  save_upload_modified_files)Hrh  r  r  r   r  r  r  r  r  r  r  r   r  r  r  r  quantization_serializabler  r  files_timestampsmodel_to_saver   misplaced_generation_parametersr  param_valuepeft_state_dictr   r  active_adaptercurrent_peft_config
module_mapr   module_state_dict	smp_to_hfr  
ignore_keyptrsrq  r&  shared_ptrsrx  error_namesto_delete_namesr  foundmatches_patternshared_namesdisjoint_namesidentical_namesinamesknownunknownr  r  state_dict_splitr*  r  full_filenameweights_no_suffixfilename_no_suffixregr  r2  r  shardshard_state_dictrg  path_to_weightssave_index_filer)  content
model_cardr   )r   r  r   r    s  ?









 

















 


zPreTrainedModel.save_pretrainedc                    sj   | j d ur| j ng }|dg }t|tr|g}|D ]}||vr%|| q|r,||d< t j|i |S )Nr  )r  r   r\  r   r  r  r  )rh  r   r   r  tags_kwargsr  r  r   r   r    s   

zPreTrainedModel.push_to_hubc                 C   s<   t dd |  D }|rt dd |  D }|| }|S )a  
        Get the memory footprint of a model. This will return the memory footprint of the current model in bytes.
        Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the
        PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2

        Arguments:
            return_buffers (`bool`, *optional*, defaults to `True`):
                Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers
                are tensors that do not require gradients and are not registered as parameters, e.g. the running mean and std in batch
                norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
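        Example (illustrative; assumes the `bert-base-uncased` checkpoint):

        ```python
        from transformers import AutoModel

        model = AutoModel.from_pretrained("bert-base-uncased")
        print(f"{model.get_memory_footprint() / 1e9:.2f} GB")                      # parameters + buffers
        print(f"{model.get_memory_footprint(return_buffers=False) / 1e9:.2f} GB")  # parameters only
        ```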
        c                 S      g | ]
}|  |  qS r   rr  ru  )r   r  r   r   r   r   /      z8PreTrainedModel.get_memory_footprint.<locals>.<listcomp>c                 S   r  r   r  )r   bufr   r   r   r   1  r  )r  r   r   )rh  return_buffersr_  mem_bufsr   r   r   get_memory_footprint#  s
   z$PreTrainedModel.get_memory_footprintc                    s   t | dd tjkrtdt | dd tjkr9t | ddr tdttjdtdk r7td| j	 d	d S t
 j|i |S )
Nquantization_methodz2`.cuda` is not supported for HQQ-quantized models.is_loaded_in_8bitFzCalling `cuda()` is not supported for `8-bit` quantized models.  Please use the model as it is, since the model has already been set to the correct devices.r  0.43.2zCalling `cuda()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. The current device is `L`. If you intended to move the model, please install bitsandbytes >= 0.43.2.)r}  ri   r  r   r  r   r]  r  rS  r   r  r  )rh  r   r   r  r   r   r  5  s   zPreTrainedModel.cudac                    s   d|v }|s|D ]}t |tjrd} nqt| dd tjkr"td|r1t| dd tjkr1tdt| dd tjkrb|r@tdt| ddrJtd	t	
tj	d
t	
dk ratd| j dnt| dd tjkrq|rqtdt j|i |S )Nr   Tr   z0`.to` is not supported for HQQ-quantized models.zBCasting a Quark quantized model to a new `dtype` is not supported.zYou cannot cast a bitsandbytes model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired `dtype` by passing the correct `torch_dtype` argument.r!  Fz`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.r  r"  zCalling `to()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. The current device is `r#  zYou cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired `dtype` by passing the correct `torch_dtype` argument.)r\  r   r   r}  ri   r  r   QUARKr  r   r]  r  rS  r   GPTQr  r  )rh  r   r   dtype_present_in_argsargr  r   r   r  H  sB   zPreTrainedModel.toc                        t | ddr
tdt j| S )NrE  Fz`.half()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)r}  r   r  halfrh  r   r  r   r   r)  t  
   zPreTrainedModel.halfc                    r(  )NrE  Fz`.float()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)r}  r   r  r   r*  r  r   r   r   ~  r+  zPreTrainedModel.floatrE  r   c                 C   sj   t  r-t g}|s |s td |tjjt dt	 g |S |r+|t
 t g |S t t
 g}|S )Nr  r  )r(   r   r  r  r~  r  r  r  r'   r   r*   r   )r
  rE  r   r  r   r   r   get_init_context  s   
z PreTrainedModel.get_init_contextr  )	r  r  rD  r  r  r  r  r  r  r
  r  r  rD  r  r  r  r  r  c       	   P      O   sb  | dd}| dd}| dd}| dd}| dd}| dd}| d	d}| d
d}| dd}| dd}| dd}| dd}| dd}| dd}| dd}| dd}| dd}| dd}| dd}| dd} | di }!| dd}"| dd}#| dd}$| dd}%| dd}&| dd}'| d d}(| d!d}(| d"d}(| d#d$}(| d%d}(|dur|dus|%durtd&|&dur|&d'krtd(|& d)|&dur|durtd*|d'kr|&du rttjd+d,rd'}&d}d})|&durtd-std.tj	
 j}*tj szzGttjd/ }+ttjd+ },|*d0krJtjjd1|+|,d2d3 tjttjd4  n|*d5krfttjd6d,r[d7nd8}-tjj|-|+|,d9 W n tyy }. ztd:|.d}.~.ww |*d5krdntj }/t|*|/}0|/dur|/d,krd,dl}1ttjd;|1_ttjd;|1_|0}tj },tj|0j|,f})|durtd<t |durtd=|}|dur|!durd>|!vr||!d>< |	du rt sd}	|%durt std?|du r t |t!st"|t#|||||||dddd@}2t$|2|}nt%|dd}t& rk|! dAd}3|3du r=t'|f|||||dB|!}3|3durjtj()|3rjt|3dCdDdE}4|}3t*+|4dF }W d   n	1 sew   Y  nd}3t |tjryd|i}n8t |t,r|dGvrz	dt|i}W n# t-y   tdH| d)w t |tr|d,k rtdId|i}|durt. rtdJt stdK|s|r|durtdLdMdN |/ D }5i |5||dO}5t0j1d|5d$dP|\}}t23dQ ||B  }6dRdS|dT}7|dur||7dU< t4 r|st25dV d$}t |t!sM|dur(|n|}8| j6j7|8f|d$|||||||%||dW|\}}9d|9v rL|9 d nt89|}| dXd}:|:dur`|:|_:|}9t;|d};|;rst<=|j>ssd};|;s{|dur|;rt<?|j>||_>n||_>t<j@|j>|;dY}<nd}<|<dur|<jA|||||
dZ |<B|}|<C|}|<D|}t;|<j>jEd[r|<j>jEjF|7d\< n|<j>jE|7d\< |%dur|<durtd]|%r|durt |tGrd^|H v sd^|v rt-d_tI||| |%|||	||||||7||d`\}=}>|>du}?|<du}@|dup|%du}At r|Ar|?s|=d, JdartK|=d, dbdc}4|4L }BW d   n	1 sEw   Y  |Bdu rPn<|BdddbkrYn3|Bdddekrid$}t25df n#|Bdddgkryd$}t25dh n|Bdddikrn
tdj|Bdd ||B  }6|6r|%rdkdllMmN}C tdm | |}DW d   n	1 sw   Y  |C|=d, d$|Ddndo }tO| ||=||>||
\}}}E||_P| Q|@tR}Ft89|}t%|dpds| jS||#||dq}tT|F | |g|R i |9}GW d   n	1 sw   Y  |GU  |)dur,|GjVs,|jWdu r,|X jWdu r,tYdr|GjZ}d}H|Gj[durR|tj\ksDt%|<dsdrRt]^dt_dudv |Gj[D }H|<durm|<j`|G||Gj[|dw |durh|nta |_b|dur{tc|G|||<||H}|r| d|G||=\}G}In1|r| e|G|=}Gn'|6r|Edurtf|E | jg|G||=|||>|||||<|H|)|'|
dx\}G}J}K}L}M}N|GU  |Gh  |Gi r|$durt25dy |Gjj1|$k |G_jn0|Gi r
|dur
ztlj7|f|||||||||dz	||G_jW n tmy	   t25d{ Y nw |durx|)du rx|||M|d|}Od}tnotpjqv r)|Gjr|Od}< d~tnotpjqv rC|<durC|<j>jEtsjtkrCd$|Od~< |<durh|<j>jEtsjukrht |tGrhd5|H v sdd^|H v rhd$|Od< tv sxt. sxtp|Gfi |O |<dur|<jw|G|d |<|G_x|3dur|Gjy|3|"||!d |r|6r|J|K|L|Nd}I|G|IfS |rd}I|G|IfS |GS )a;  
        Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, you should first set it back in training mode with `model.train()`.

        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
        task.

        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
        weights are discarded.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
                Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g., `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                    - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g.,
                      `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to
                      `True`.
                    - `None` if you are both providing the configuration and state dictionary (resp. with keyword
                      arguments `config` and `state_dict`).
            model_args (sequence of positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.
            config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*):
                Can be either:

                    - an instance of a class derived from [`PretrainedConfig`],
                    - a string or path valid as input to [`~PretrainedConfig.from_pretrained`].

                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                be automatically loaded when:

                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                      model).
                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
                      save directory.
                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                      configuration JSON file named *config.json* is found in the directory.
            state_dict (`Dict[str, torch.Tensor]`, *optional*):
                A state dictionary to use instead of a state dictionary loaded from saved weights file.

                This option can be used if you want to create a model from a pretrained configuration but load your own
                weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and
                [`~PreTrainedModel.from_pretrained`] is not a simpler option.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            from_tf (`bool`, *optional*, defaults to `False`):
                Load the model weights from a TensorFlow checkpoint save file (see docstring of
                `pretrained_model_name_or_path` argument).
            from_flax (`bool`, *optional*, defaults to `False`):
                Load the model weights from a Flax checkpoint save file (see docstring of
                `pretrained_model_name_or_path` argument).
            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
                as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
                checkpoint with 3 labels).
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            output_loading_info(`bool`, *optional*, defaults to `False`):
                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
            local_files_only(`bool`, *optional*, defaults to `False`):
                Whether or not to only look at local files (i.e., do not try to download the model).
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.

                <Tip>

                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

                </Tip>
            attn_implementation (`str`, *optional*):
                The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), or `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.

            > Parameters for big model inference

            torch_dtype (`str` or `torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model under a specific `dtype`. The different options
                are:

                1. `torch.float16` or `torch.bfloat16` or `torch.float`: load in a specified
                  `dtype`, ignoring the model's `config.torch_dtype` if one exists. If not specified
                  - the model will get loaded in `torch.float` (fp32).

                2. `"auto"` - A `torch_dtype` entry in the `config.json` file of the model will be
                  attempted to be used. If this entry isn't found then next check the `dtype` of the first weight in
                  the checkpoint that's of a floating point type and use that as `dtype`. This will load the model
                  using the `dtype` it was saved in at the end of the training. It can't be used as an indicator of how
                  the model was trained, since it could have been trained in one of the half-precision dtypes but saved in fp32.

                3. A string that is a valid `torch.dtype`. E.g. "float32" loads the model in `torch.float32`, "float16" loads in `torch.float16` etc.

                <Tip>

                For some models the `dtype` they were trained in is unknown - you may try to check the model's paper or
                reach out to the authors and ask them to add this information to the model's card and to insert the
                `torch_dtype` entry in `config.json` on the hub.

                </Tip>
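
                A short illustrative sketch (the `gpt2` checkpoint is only an assumption):

                ```python
                import torch
                from transformers import AutoModelForCausalLM

                # Reuse the dtype recorded in the checkpoint's configuration, if any.
                model_auto = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype="auto")

                # Or force a specific dtype explicitly.
                model_fp16 = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16)
                ```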

            device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
                A map that specifies where each submodule should go. It doesn't need to be refined to each
                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
                like `1`) on which the model will be allocated, the device map will map the entire model to this
                device. Passing `device_map = 0` means put the whole model on GPU 0.

                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
                more information about each option see [designing a device
                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
            max_memory (`Dict`, *optional*):
                A dictionary mapping device identifiers to their maximum memory, used together with `device_map`. Will
                default to the maximum memory available for each GPU and the available CPU RAM if unset.
            tp_plan (`str`, *optional*):
                A torch tensor parallel plan, see [here](https://pytorch.org/tutorials/intermediate/TP_tutorial.html). Currently, it only accepts
                `tp_plan="auto"` to use predefined plan based on the model. Note that if you use it, you should launch your script accordingly with
                `torchrun [args] script.py`. This will be much faster than using a `device_map`, but has limitations.
            offload_folder (`str` or `os.PathLike`, *optional*):
                If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
            offload_state_dict (`bool`, *optional*):
                If `True`, will temporarily offload the CPU state dict to the hard drive to avoid running out of CPU
                RAM when the CPU state dict plus the biggest shard of the checkpoint does not fit in memory. Defaults
                to `True` when there is some disk offload.
            offload_buffers (`bool`, *optional*):
                Whether or not to offload the buffers with the model parameters.
            quantization_config (`Union[QuantizationConfigMixin,Dict]`, *optional*):
                A dictionary of configuration parameters or a QuantizationConfigMixin object for quantization (e.g.,
                bitsandbytes, gptq). There may be other quantization-related kwargs, including `load_in_4bit` and
                `load_in_8bit`, which are parsed by QuantizationConfigParser. These are supported only for bitsandbytes
                quantization and are not preferred; consider passing all such arguments through `quantization_config`
                instead.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            variant (`str`, *optional*):
                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
                ignored when using `from_tf` or `from_flax`.
            use_safetensors (`bool`, *optional*, defaults to `None`):
                Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors`
                is not installed, it will be set to `False`.
            weights_only (`bool`, *optional*, defaults to `True`):
                Indicates whether the unpickler should be restricted to loading only tensors, primitive types,
                dictionaries and any types added via `torch.serialization.add_safe_globals()`.
                When set to `False`, wrapper tensor subclass weights can be loaded.
            key_mapping (`Dict[str, str]`, *optional*):
                A potential mapping of the weight names if using a model on the Hub which is compatible with a
                Transformers architecture but was not converted accordingly.
            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                automatically loaded:

                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                      already been done)
                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
                      corresponds to a configuration attribute will be used to override said attribute with the
                      supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
                      will be passed to the underlying model's `__init__` function.

        <Tip>

        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
        use this method in a firewalled environment.

        </Tip>

        Examples:

        ```python
        >>> from transformers import BertConfig, BertModel

        >>> # Download model and configuration from huggingface.co and cache.
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased")
        >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
        >>> model = BertModel.from_pretrained("./test/saved_model/")
        >>> # Update configuration during loading.
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
        >>> assert model.config.output_attentions == True
        >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
        >>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json")
        >>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config)
        >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", from_flax=True)
        ```
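
        The big-model-inference parameters documented above can be combined with the same entry point. A minimal
        sketch (requires the `accelerate` package for `device_map="auto"`; the actual device placement depends on
        your hardware, so this is shown for example purposes):

        ```python
        >>> # Let Accelerate spread the model over the available devices and pick the dtype saved in the checkpoint.
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", device_map="auto", torch_dtype="auto")
        >>> # Explicitly request the SDPA attention implementation (if it is supported by the model).
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", attn_implementation="sdpa")
        ```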
        r   Nr  Fr  r  output_loading_infor  _from_pipeline
_from_autor  r  r  offload_folderoffload_state_dictoffload_buffersload_in_8bitload_in_4bitr  r  rd  r  r  adapter_kwargsadapter_namedefaultr  r  r  tp_plankey_mappingresume_downloadtrust_remote_codemirror
_fast_initTlow_cpu_mem_usagezq`state_dict` cannot be passed together with a model name or a `gguf_file`. Use one of the two loading strategies.r  z-tp_plan supports 'auto' only for now but got r   zY`tp_plan` and `device_map` are mutually exclusive. Choose either one for parallelization.
WORLD_SIZEr   z2.5z3tensor parallel is only supported for `torch>=2.5`.r  r  ncclzenv://)rank
world_sizeinit_methodr   r  CCL_WORKER_COUNTcclgloo)rA  rB  zWe tried to initialize torch.distributed for you, but it failed, makesure you init torch distributed in your script to use `tp_plan='auto'`r  r  r  r  zIaccelerate is required when loading a GGUF file `pip install accelerate`.)
r  r  r  r  r  r  r  r  r  '_raise_exceptions_for_connection_errors_adapter_model_path)r  r  r  r  r  r   r   r   base_model_name_or_path)r  balancedr  r  zWhen passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or 'auto', 'balanced', 'balanced_low_0', 'sequential' but found znYou can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' z?DeepSpeed Zero-3 is not compatible with passing a `device_map`.ziUsing a `device_map` or `tp_plan` requires `accelerate`. You can install it with `pip install accelerate`zwYou can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time.c                 S   s&   i | ]\}}|t tjv r||qS r   )r  r   rh   r   r   r   r   r   r  8  s   & z3PreTrainedModel.from_pretrained.<locals>.<dictcomp>)r4  r3  )config_dictreturn_unused_kwargszThe `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.r  pytorch)	file_typerI  from_auto_classusing_pipelinez+Offline mode: forcing local_files_only=True)r  rL  r  r  r  r  r  r  r  r/  r.  r  )pre_quantized)r  r  r  r  r  r  quantzYou cannot combine Quantization and loading a model from a GGUF file, try again by making sure you did not passed a `quantization_config` or that you did not load a quantized model from the Hub.r  zxOne or more modules is configured to be mapped to disk. Disk offload is not supported for models loaded from GGUF files.)r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rF  rG  rH  rJ  rK  zAA TensorFlow safetensors file is being loaded in a PyTorch model.rL  z;A Flax safetensors file is being loaded in a PyTorch model.rM  zTIncompatible safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but r   )load_gguf_checkpointrN  )return_tensorsmodel_to_loadr  r  )r  r  r  z0This model does not have a tensor parallel plan.use_keep_in_fp32_modulesr  c                 S   s   g | ]}d | dqS )z((^|\.)z($|\.))r   )r   r   r   r   r   r     r   z3PreTrainedModel.from_pretrained.<locals>.<listcomp>)r  r  keep_in_fp32_modulesr  )rD  r  r  r  r1  r   r  r  r  r9  r  z\The user-defined `generation_config` will be used to override the default generation config.)	r  r  r  r  r  r  r  r/  r.  zZGeneration config file not found, using a generation config created from the model config.)r  offload_diroffload_indexr2  	skip_keysforce_hooksr  )r6  r  r5  )r,  r-  rL  
error_msgsr   )zr  r   r   r   r   r   rZ   r  r   _C_get_acceleratorr  r   r   init_process_groupr  
set_devicer^  current_devicer   sysr  devnullstdoutstderrget_world_sizeinit_device_meshrz  r{  r|  rX   rQ   r\  r!   rL   r@   rO   r}  rV   r   r
  r  r  r  r   r  r(   r   rh   	from_dictr  r  rT   r  config_classfrom_pretrainedr  r  r  r  r:   supports_quant_methodr  merge_quantization_configsfrom_configr"  update_torch_dtypeupdate_device_mapupdate_tp_planr  r  r  r   r  rR  rz   rS  modeling_gguf_pytorch_utilsrS  r  r  r,  r   r  rI   r3  supports_tp_planr  r.  r"  r  r  r  r  r  r  preprocess_modelr   r  r'  _load_from_tf_load_from_flaxr   _load_pretrained_modelevalr  r  to_dictr$   rT  r  r   rm   r   _skip_keys_device_placementri   r  
FBGEMM_FP8r   postprocess_modelr  load_adapter)Pr
  r  r  r  rD  r  r  r  r  r  r  
model_argsr   r   r  r  r  r-  r  from_pipelinerO  r  r  r  r0  r1  r2  r3  r4  r  r  r  r  r5  r6  r  r  r  r8  r9  r  r  device_typerA  rB  cpu_backendrc  r*  	tp_devicerb  resolved_config_filerH  r)  rK  from_ptr  config_pathmodel_kwargskwarg_attn_imprQ  r  r  r  r  rE  is_from_filerS  rS  dummy_modelr  model_init_contextr  r  loading_infor,  r-  rL  rY  r\  r%  r   r   r   rj    sn   a(








	











	



















	





zPreTrainedModel.from_pretrainedr   c                 C   s   |  dr| dddfS |  dr| dddfS ttjjdr?|  dr.| dddfS |  d	r;| d	d
dfS | dfS |  drL| dddfS |  d
rY| d
d	dfS | dfS )zaReplace legacy parameter names with their modern equivalents. E.g. beta -> bias, gamma -> weight.LayerNorm.betazLayerNorm.biasTLayerNorm.gammazLayerNorm.weightweight_normweight_gz!parametrizations.weight.original0weight_vz!parametrizations.weight.original1F)rR  r  r  r   utilsparametrizationsr   r   r   r   _fix_state_dict_key_on_load  s   





z+PreTrainedModel._fix_state_dict_key_on_loadr)  r9  r*  'loading_task_model_from_base_state_dictc                 C   s4  | j }| d}i }i }|D ]^}	| |	\}
}|dur5| D ]\}}t|||
\}
}|dkr4d} nq|r?d||
g}
n|rO|
|sGq|
t|d }
|
||	< |rl|	dra|	|
f|d< q|	drl|	|
f|d< q|rd| j	j
 d}|d	7 }| D ]\}}
|d
| d|
 d7 }q|d7 }t| |S )a&  
        Compute a mapping between the serialized keys on disk `checkpoint_keys` and the keys expected by the model
        being loaded. This is the single entry point for key renaming that will be used during
        loading.
        Log if any parameters have been renamed.
        r   Nr   Tr  r  zA pretrained model of type `z` zrcontains parameters that have been renamed internally (a few are listed below but more are present in the model):
z* `z` -> `z`
znIf you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users.)r8  r  r   r  subnr  r,  r  rR  r  r  r   r  	info_once)rh  r)  r9  r*  r  r{  _prefixrenamed_keyskey_renaming_mappingr   new_keyhas_changedr7  replacement	n_replacewarning_msgold_keyr   r   r   _get_key_renaming_mapping  sD   




z)PreTrainedModel._get_key_renaming_mappingc                 C   s   | dfS )z
        Similar to `_fix_state_dict_key_on_load`, allows defining a hook for state dict key renaming on model save.
        Do nothing by default, but can be overridden in particular models.
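
        A minimal sketch of an override in a subclass (the model class and renaming rule below are purely
        illustrative):

        ```python
        class MyModel(PreTrainedModel):  # hypothetical model that serializes LayerNorm weights under a legacy name
            def _fix_state_dict_key_on_save(self, key):
                # Return the (possibly renamed) key and whether it was changed.
                if key.endswith("LayerNorm.weight"):
                    return key.replace("LayerNorm.weight", "LayerNorm.gamma"), True
                return key, False
        ```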
        Fr   r  r   r   r   _fix_state_dict_key_on_save  rl  z+PreTrainedModel._fix_state_dict_key_on_savec                    s    fdd|  D S )z
        Similar to `_fix_state_dict_keys_on_load`, allows defining a hook for state dict key renaming on model save.
        Apply `_fix_state_dict_key_on_save` to all keys in `state_dict`.
        c                    s    i | ]\}}  |d  |qS r  )r  r   r   r  rk  r   r   r    r-  z@PreTrainedModel._fix_state_dict_keys_on_save.<locals>.<dictcomp>)r   )rh  r   r   rk  r   r    s   z,PreTrainedModel._fix_state_dict_keys_on_saver  r  r  r  r1  r  r  r  r  c           4         sr  |d u}|o|j jtjk}|o|j jtjtjfv }|d ur"|d }n|d ur-t| }ntt|d d|d }|j d t	dkrSt
fdd|D nd}t	dkr`t|nd}| of|}|ok| }|||||t }t| ||||||\}}t||||||\}fd	d
 D t }|| ||
| |||| |d ur| D ]\}}||r|jtj|_q|}|rt|} fdd
 D t }|d ur fdd
| D } fdd|  D 	t|  t
	fdd|D rtddd
  D d}d } g }!|d urd| v r|	d u r@d}	|d urLtj|dd |d uoW|d d}|d u rd|sdtd|rt|||
d uryt |
!ddnd|d u rt"#||d }"n/tj$j%&|d 'tj$j%d d fdd
|d  D }"fdd
|" D }"t(||"}!fdd
|" D } ni } d }#d }$|	rt)* }#i }$|d urt	|dkrt+j,|d d!}n|d urdg}t|  }%|d ur|-||%|}%|d ur |s t||%}&t.||&|d u rd"nd#d$ g }'|D ]}(|(|!v r-q$d%})|(drB|sBt/ r?|rBd})n/|d urq|d urq|j jtj0krq|j j1d&v sct2|j j1t3rqt4d'd | D d })|(dkr~t|(||)|d(}fd)d
| D }t/ r|s|'t5||7 }'n t6 rt7 s|rt8|||(|%||| |#|$|||||d*\} }$~q$| d urt	| dkr|r| j|s| D ]}*t9:tj$&||* d+tj$&| d|* d+ q҇fd,d
|  D } |st;| | d } |	rt<||$|# t9=|# |d ur|>||}|d urqt| d }+|? D ]},|,j4|+kr=|,|+|,_q/|rqfd-d
| D }-|- D ]\}}t@||||\}.}/tA|||+|||/|.tjBd. | qQt	|'dkrd/&|'}0d0|0v r|0d17 }0tCd2|jDjE d3|0 t	|dkr|jFjGd u rg n|jFjG}1|jDjE|1v rtHjIntHjJ}2|2d4| d5|jDjE d6| d7|jDjE d8|jDjE d9 ntHJd:|jDjE d; t	|dkrtHId<|jDjE d=| d>| d? nt	dkrtHJd@|jDjE dA| dB|jDjE dC t	dkr1dD&dEd tK|D }3tHId<|jDjE d=| dF|3 d? |||| |'fS )GNall_checkpoint_keysr   rN  r  r   c                 3   s    | ]}|  V  qd S r   r+  r   srz  r   r   r1  $  r2  z9PreTrainedModel._load_pretrained_model.<locals>.<genexpr>Fc                    s   i | ]\}}| vr||qS r   r   r   )rL  r   r   r  I  s    z:PreTrainedModel._load_pretrained_model.<locals>.<dictcomp>c                    s"   i | ]\}}||t  d  qS r   r  r   r  r   r   r  a  r   c                    s0   i | ]\}}|  r|t d  n||qS r   )r,  r  r   r  r   r   r  e  s   0 c                    s   g | ]	}|  s|qS r   r+  r  r  r   r   r   g  ri  z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>c                 3   s     | ]}|v o| vV  qd S r   r   r   )base_model_expected_keystask_specific_expected_keysr   r   r1  i  s    
zjThe state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?c                 S   s   i | ]\}}||qS r   r   r   r   r   r   r  r  r  r  Tr  rF  zThe current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.ztorch.rd  r   r   c                    rG  r   r   r   r  r   r   r    s
    r   c                    s    i | ]\}}|t j |qS r   )r   r
  r  r   )r   r   r   r    r-  c                    s.   i | ]\}} | d kr||| dqS )r  )safetensors_fileweight_namer   r   )r   r   file)param_device_mapreverse_key_renaming_mapping	str_dtyper   r   r    s    r   zLoading checkpoint shardsr  r     )factorr  )int4_weight_only	autoquantc                 S   s   g | ]}|d vr|qS )r  r  r   )r   dr   r   r   r     r   rF  c                    rG  r   r   r   r  r   r   r    r   )
r  r  r  r  r  r  r  r  r-  r  z.datc                    r  re  r   r  rz  r   r   r    r-  c                    s    i | ]\}}|  s||qS r   r+  )r   r   r  rz  r   r   r    s
    r  z
	zsize mismatchz_
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.r   z:
	z(Some weights of the model checkpoint at z! were not used when initializing z: z,
- This IS expected if you are initializing z from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing z from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).z9All model checkpoint weights were used when initializing z.
zSome weights of z3 were not initialized from the model checkpoint at z and are newly initialized: zo
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.zAll the weights of z/ were initialized from the model checkpoint at zf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use z* for predictions without further training.r  c              	   S   s,   g | ]\}\}}d | d| d| dqS )z- z: found shape z in the checkpoint and z in the model instantiatedr   )r   r   shape1shape2r   r   r   r   R  s    
z= and are newly initialized because the shapes did not match:
)Lr  r  ri   r  r  r  r  r  r8  r  r;  r  r  r   rC  rO  r   #_move_missing_keys_from_meta_to_cpu_initialize_missing_keysr  r  r  r  r   r   r}  r   r   r   r  rR  expand_device_mapr   r  r  r  r
  r  r  r  get_disk_only_shard_filestempfilemkdtempr_   r  r9  caching_allocator_warmupr(   TORCHAO
quant_typer\  r   r   r+   r   r   r  shutilmoverv   rt   rmtree!update_missing_keys_after_loadingr   r  r1   r   r  r  r  r  r  r  r  r  zip)4r
  r  r   r  r  rD  r  r  r  r1  r   r  r  r  r9  r  rE  is_hqqr  r(  has_prefix_moduleexpects_prefix_moduler  r*  r)  r,  r-  rM  r   r  rU  is_offloaded_safetensorsr  disk_only_shard_filesr   r  r  r  expanded_device_mapr\  r2  r  r  r  r0  parameters_to_initializer  r  	error_msgarchswarnermismatched_warningr   )
r  r  r   r  rL  r  r{  r  r  r  r   rv    s  


&








$



















z&PreTrainedModel._load_pretrained_modelc                 C   s~   |d  dr| |||d d d }d }||fS zddlm} |||d ddd\}}W ||fS  ty>   td  w )	Nr   r  ir   )$load_tf2_checkpoint_in_pytorch_modelT)allow_missing_keysr-  zLoading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.)rR  load_tf_weightsmodeling_tf_pytorch_utilsr  rX  r  r  )r
  r  r  r  r  r  r   r   r   rt  `  s    
zPreTrainedModel._load_from_tfc                 C   s<   zddl m} |||d }W |S  ty   td  w )Nr   )%load_flax_checkpoint_in_pytorch_modelr   zLoading a Flax model in PyTorch, requires both PyTorch and Flax to be installed. Please see https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation instructions.)modeling_flax_pytorch_utilsr  rX  r  r  )r
  r  r  r  r   r   r   ru  w  s   zPreTrainedModel._load_from_flaxc           	      C   s   dd |D }| dd |D }g }|  D ]8\}}|r3| j d}||r0|t|d  n|}n|rFt|dkrCd| j|gn| j}||v rO|| q|S )Nc                 S   s$   h | ]}d  |d dd qS )r   Nr   )r  r  r   r   r   r   rh    s   $ z>PreTrainedModel.retrieve_modules_from_names.<locals>.<setcomp>c                 S   s<   h | ]}t |d kr|d  rd|ddd qS )r   r   r   N)r  r?  r  r  r   r   r   r   rh    s   < r   r   )unionrj  r8  r,  r  r  r  )	rh  r  
add_prefixremove_prefixro  retrieved_modulesr   r   r  r   r   r   retrieve_modules_from_names  s    "
z+PreTrainedModel.retrieve_modules_from_names	AutoModelc                 C   sD   t |ts|j}ddlm  m} t||st| d|| _dS )a  
        Register this class with a given auto class. This should only be used for custom models, as the ones in the
        library are already mapped to an auto class.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoModel"`):
                The auto class to register this new model with.
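
        A minimal usage sketch (the custom model class here is hypothetical):

        ```python
        >>> class MyCustomModel(PreTrainedModel):  # a user-defined model living in its own module
        ...     pass

        >>> MyCustomModel.register_for_auto_class("AutoModel")
        ```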
        r   Nz is not a valid auto class.)	r\  r   r  transformers.models.automodelsr  r  r   r  )r
  
auto_classauto_moduler   r   r   register_for_auto_class  s   


z'PreTrainedModel.register_for_auto_classc                 C   T   t  stdddlm} t|tdk rtd| dddlm} || S )a(  
        Converts the model to use [PyTorch's native attention
        implementation](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html), integrated to
        Transformers through [Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview). Only a
        subset of all Transformers models are supported.

        PyTorch's attention fastpath allows speeding up inference through kernel fusions and the use of [nested
        tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog
        post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2).

        Returns:
            [`PreTrainedModel`]: The model converted to BetterTransformer.
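
        A minimal usage sketch (requires the `optimum` package; shown for illustration):

        ```python
        >>> from transformers import BertModel

        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased")
        >>> model = model.to_bettertransformer()  # enable the PyTorch attention fastpath
        >>> model = model.reverse_bettertransformer()  # convert back, e.g. before saving
        ```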
        <The package `optimum` is required to use Better Transformer.r   r   1.7.0EPlease install optimum>=1.7.0 to use Better Transformer. The version  was found.BetterTransformer)	rU   rX  optimum.versionr   r   r]  optimum.bettertransformerr  	transformrh  optimum_versionr  r   r   r   to_bettertransformer  s   

z$PreTrainedModel.to_bettertransformerc                 C   r  )a  
        Reverts the transformation from [`~PreTrainedModel.to_bettertransformer`] so that the original modeling is
        used, for example in order to save the model.

        Returns:
            [`PreTrainedModel`]: The model converted back to the original modeling.
        r  r   r   r  r  r  r  )	rU   rX  r  r   r   r]  r  r  r  r  r   r   r   reverse_bettertransformer  s   

z)PreTrainedModel.reverse_bettertransformerc              
   C   s   t |stj st rdS |dus| jjdu rdS | jj|ddddgf v rtd}| jjdur8| jj| jjksT| jjdurF| jj| jjksT| jj	durm| jj	| jjkrm|d| jj d| jj d| jj d| jj	 d		7 }t
| dS dS )
zv
        Shows a one-time warning if the input_ids appear to contain padding and no attention mask was given.
        Nr   r   zWe strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.z5
You may ignore this warning if your `pad_token_id` (z&) is identical to the `bos_token_id` (z), `eos_token_id` (z), or the `sep_token_id` (z ), and your input is not padded.)rf   r   jit
is_tracingrg   r  pad_token_idbos_token_ideos_token_idsep_token_idr  r  )rh  r  r  warn_stringr   r   r   %warn_if_padding_and_no_attention_mask  s*   	z5PreTrainedModel.warn_if_padding_and_no_attention_maskc                 C   s(   | j durdS t| jdddurdS dS )zJ
        Returns whether the model has a tensor parallelism plan.
        NTr  F)r  r}  r  rk  r   r   r   rr    s
   
z PreTrainedModel.supports_tp_planc                 C   s(   | j d urdS t| jdd d urdS dS )NTr  F)r  r}  r  rk  r   r   r   supports_pp_plan  s
   
z PreTrainedModel.supports_pp_planc                 C   sJ   t | dr| jS t| dd }|d u s|tvr!td| d d}t| S )N_loss_functionr  z`loss_type=zY` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.ForCausalLM)r  r  r}  r2   r  r  )rh  r  r   r   r   loss_function!  s   

zPreTrainedModel.loss_functionc                 C   s
   || _ d S r   )r  )rh  r  r   r   r   r  0  s   
compile_configc                 C   sb   d| j jv r	| jS t| jdt }t| drt| d||kr.|| _tj	| jfi |
 | _| jS )a  Return a `torch.compile`'d version of `self.__call__`. This is useful to dynamically choose between
        non-compiled/compiled `forward` during inference, especially to switch between prefill (where we don't
        want to use compiled version to avoid recomputing the graph with new shapes) and iterative decoding
        (where we want the speed-ups of compiled version with static shapes).llama4r  _compiled_call_last_compile_config)r  
model_type__call__r}  r  r#   r  r   r   r  rx  r  )rh  r  default_configr   r   r   get_compiled_call4  s   z!PreTrainedModel.get_compiled_callc                 C   s   | j S r   )_supports_attention_backend)r
  r   r   r   is_backend_compatibleE  s   z%PreTrainedModel.is_backend_compatibler,  r-  c           
   	   C   s   |du}t  r%t s%|s%|  D ]\}}tj||dd}t| || qdS |  }	|D ]8}|	| }|jtdkrctj||dd}|rRt|ddsR|j	| ||i dsYt| || q+|
| ||d|	| q+dS )zMove the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts) back
        from meta device to cpu.
        Nr  )r   r   rN  r  F)r  r  r   )r   r   r  r   
empty_liker  r   r   r}  r  r  )
rh  r,  r-  r   r  rE  r   r  r  rK  r   r   r   r  I  s*   

z3PreTrainedModel._move_missing_keys_from_meta_to_cpur   c                 C   s   |s/t | |}t| jjdddr.| jjddjr.|  }|dur.t|dr+|jdu r.d|_nt| 	 }t
 rl|sltttjdd | D }tjj|dd	 | | j W d   dS 1 sew   Y  dS | | j dS )
a  Initialize the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts), according to
        `_initialize_weights`. Indeed, since the corresponding weights are missing from the state dict, they will not be replaced and need to
        be initialized correctly (i.e., drawn from the model's weight initialization distribution).
        Also take care of setting the `_is_hf_initialized` flag for keys that are not missing.
        Tr'  r)  Nr  c                 s   s    | ]	}|j d dV  qdS )F)recurseN)r   )r   r  r   r   r   r1    s    
z;PreTrainedModel._initialize_missing_keys.<locals>.<genexpr>r   rV  )rp  r  r  r.  r)  r$  r  rl  r  rj  r(   r  r  	itertoolschainfrom_iterabler   r  r  rY  r  r&  )rh  r   rD  rE  rn  r1  not_initialized_parametersr   r   r   r  l  s0   

"z(PreTrainedModel._initialize_missing_keysr  c                 C   sP   z|  |W S  ty   Y nw z| |W S  ty   Y nw td| d)a  
        Return the parameter or buffer given by `target` if it exists, otherwise raise an error. This combines
        `get_parameter()` and `get_buffer()` in a single handy function. Note that it only works if `target` is a
        leaf of the model.
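
        A minimal usage sketch (the parameter name below depends on the actual model architecture):

        ```python
        >>> from transformers import BertModel

        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased")
        >>> weight = model.get_parameter_or_buffer("embeddings.word_embeddings.weight")
        ```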
        `z&` is neither a parameter nor a buffer.)get_parameterAttributeError
get_buffer)rh  r  r   r   r   r    s   z'PreTrainedModel.get_parameter_or_buffer)FNNT)NNTFr  )NNTr  )NFTr   r  )NFF)FNNNNNNNNNTr  )r  )r   r   )r  rS  rT  r  ri  r8  r  r  r  r  ry  r  r>  r?  r  rx  is_parallelizabler  _is_statefulr  r  r  _supports_cache_class_supports_static_cache_supports_quantized_cacher  r  r  r  r   r   r   r   r  rI  r!   r  r  r  r  r   r   r  classmethodr   r  r  r   r   r   r  r	  r   r   r  r  r  r  r  r  r!  r  r#  r$  r   r&  r3  r  r0  r/  r  r  r\  rX  r^  rq  r_  rj  rr  rs  rt  r  r   r  r  r  r  r   r   r  r  r  r  r   PathLiker  r  r   rK   r  r  r  r  r)  r   r,  r   r   rj  r  r  r  r  r;   r  Patternrv  rt  ru  r  r  r  r  r  rr  r  r  setterr#   r  r  r  r  r  __classcell__r   r   r  r   r     s   	#' 9h&j			Y

9+
 
z 

*
	

   
+

	
      
<
	
  e



#





#
'r  z
model file)objectobject_classobject_filesc                       sJ   e Zd ZdZdef fddZ	ddejdeej dejfd	d
Z	  Z
S )PoolerStartLogitsz
    Compute SQuAD start logits from sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    r  c                    s   t    t|jd| _d S )Nr   )r  r  r   rq  hidden_sizedenserh  r  r  r   r   r    s   
zPoolerStartLogits.__init__Nhidden_statesp_maskr   c                 C   sV   |  |d}|dur)t| tjkr|d|  d|  }|S |d|  d|  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        Returns:
            `torch.FloatTensor`: The start logits for SQuAD.
        r   Nr     ꌠ9Y>)F)r   squeezer   r   r  )rh  r"  r#  xr   r   r   forward  s   zPoolerStartLogits.forwardr   )r  rS  rT  r  r!   r  r   FloatTensorr   r(  r  r   r   r  r   r    s    r  c                       sb   e Zd ZdZdef fddZ			ddejdeej deej	 d	eej d
ejf
ddZ
  ZS )PoolerEndLogitsz
    Compute SQuAD end logits from sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    r  c                    sR   t    t|jd |j| _t | _tj|j|j	d| _t|jd| _
d S )Nr  )epsr   )r  r  r   rq  r  dense_0Tanh
activation	LayerNormlayer_norm_epsdense_1r!  r  r   r   r    s
   

zPoolerEndLogits.__init__Nr"  start_statesstart_positionsr#  r   c                 C   s   |dus|dusJ d|dur4|j dd \}}|ddddf dd|}|d|}|d|d}| tj||gdd}| |}| |}| |	d}|durst
| tjkri|d|  d|  }|S |d|  d|  }|S )	a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The end logits for SQuAD.
        N7One of start_states, start_positions should be not Noner  r   rq  r   r$  r%  )rI  r  gatherr,  r   r  r.  r/  r1  r&  r   r  )rh  r"  r2  r3  r#  slenhszr'  r   r   r   r(    s$   

zPoolerEndLogits.forwardNNNr  rS  rT  r  r!   r  r   r)  r   
LongTensorr(  r  r   r   r  r   r*    s"    	
r*  c                       s\   e Zd ZdZ fddZ			ddejdeej deej deej d	ejf
d
dZ	  Z
S )PoolerAnswerClassz
    Compute SQuAD 2.0 answer class from classification and start tokens hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    c                    sB   t    t|jd |j| _t | _tj|jddd| _d S )Nr  r   F)r  )	r  r  r   rq  r  r,  r-  r.  r1  r!  r  r   r   r    s   

zPoolerAnswerClass.__init__Nr"  r2  r3  	cls_indexr   c                 C   s   |j d }|dus|dusJ d|dur,|ddddf dd|}|d|d}|durH|ddddf dd|}|d|d}n|dddddf }| tj||gdd}| |}| |d}|S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The SQuAD 2.0 answer class.
        r   Nr4  r  r5  )	rI  r  r6  r&  r,  r   r  r.  r1  )rh  r"  r2  r3  r=  r8  cls_token_stater'  r   r   r   r(  %  s   

zPoolerAnswerClass.forwardr9  )r  rS  rT  r  r  r   r)  r   r;  r(  r  r   r   r  r   r<    s"    	r<  c                   @   s~   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeej ed< dS )	SquadHeadOutputa  
    Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
            (beam-search).
        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the `is_impossible` label of the answers.

    Nlossstart_top_log_probsstart_top_indexend_top_log_probsend_top_index
cls_logits)r  rS  rT  r  r@  r   r   r)  rU  rA  rB  r;  rC  rD  rE  r   r   r   r   r?  W  s   
 r?  c                       s   e Zd ZdZ fddZeeed						ddej	de
ej d	e
ej d
e
ej de
ej de
ej	 dedeeeej	 f fddZ  ZS )	SQuADHeadz
    A SQuAD head inspired by XLNet.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
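
    A minimal usage sketch (the config values and shapes below are illustrative):

    ```python
    >>> import torch
    >>> from transformers import PretrainedConfig
    >>> from transformers.modeling_utils import SQuADHead

    >>> config = PretrainedConfig(hidden_size=64, layer_norm_eps=1e-12, start_n_top=5, end_n_top=5)
    >>> head = SQuADHead(config)
    >>> hidden_states = torch.randn(2, 16, 64)  # (batch_size, seq_len, hidden_size)
    >>> outputs = head(hidden_states, return_dict=True)  # beam-search style top-k start/end candidates
    ```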
    c                    s<   t    |j| _|j| _t|| _t|| _t|| _	d S r   )
r  r  start_n_top	end_n_topr  start_logitsr*  
end_logitsr<  answer_classr!  r  r   r   r    s   


zSQuADHead.__init__)output_typeri  NFr"  r3  end_positionsr=  is_impossibler#  return_dictr   c                 C   s  | j ||d}|durk|durk||||fD ]}	|	dur&|	 dkr&|	d q| j|||d}
t }|||}||
|}|| d }|dura|dura| j|||d}t }|||}||d 7 }|rht|d	S |fS |	 \}}}tj
j|dd
}tj|| jdd
\}}|ddd|}t|d|}|dd|dd}|d|}|dur|dnd}| j|||d}
tj
j|
dd
}tj|| jdd
\}}|d| j| j }|d| j| j }td||}| j|||d}|s|||||fS t|||||dS )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                Final hidden states of the model on the sequence tokens.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Positions of the first token for the labeled span.
            end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Positions of the last token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
            is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Whether the question has a possible answer in the paragraph or not.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.
            return_dict (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:
        )r#  Nr   r   )r3  r#  r  )r3  r=  g      ?)r@  r5  r  )r2  r#  z
blh,bl->bh)r2  r=  )rA  rB  rC  rD  rE  )rI  rq  squeeze_rJ  r   rK  r   BCEWithLogitsLossr?  rO  rN  softmaxr   topkrG  r  r  r6  	expand_asrH  rs  einsum)rh  r"  r3  rM  r=  rN  r#  rO  rI  r'  rJ  loss_fct
start_lossend_loss
total_lossrE  loss_fct_clscls_lossbszr7  r8  start_log_probsrA  rB  start_top_index_expr2  hidden_states_expandedend_log_probsrC  rD  r   r   r   r(    s^   






zSQuADHead.forward)NNNNNF)r  rS  rT  r  r  r`   r?  r!   r   r)  r   r;  r  r   r   r(  r  r   r   r  r   rF  v  s6    	
		rF  c                       sJ   e Zd ZdZdef fddZ	ddejdeej	 dejfd	d
Z
  ZS )SequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
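
    A minimal usage sketch (the config below only sets the attributes listed above and is purely illustrative):

    ```python
    >>> import torch
    >>> from transformers import PretrainedConfig
    >>> from transformers.modeling_utils import SequenceSummary

    >>> config = PretrainedConfig(hidden_size=64, summary_type="last", summary_use_proj=True,
    ...                           summary_proj_to_labels=False, summary_activation="tanh",
    ...                           summary_first_dropout=0.0, summary_last_dropout=0.0)
    >>> summary = SequenceSummary(config)
    >>> hidden_states = torch.randn(2, 16, 64)  # (batch_size, seq_len, hidden_size)
    >>> pooled = summary(hidden_states)  # (batch_size, hidden_size)
    ```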
    r  c                    s   t    t|dd| _| jdkrtt | _t|dr;|jr;t|dr0|j	r0|j
dkr0|j
}n|j}t|j|| _t|dd }|rGt|nt | _t | _t|dr`|jdkr`t|j| _t | _t|d	rw|jdkryt|j| _d S d S d S )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r  r  r}  rb  r"  r   summaryr  re  rf  
num_labelsr  r   rq  r    r.  first_dropoutrh  Dropoutlast_dropoutri  )rh  r  num_classesactivation_stringr  r   r   r    s&   

zSequenceSummary.__init__Nr"  r=  r   c                 C   s  | j dkr|dddf }ne| j dkr|dddf }nW| j dkr(|jdd}nK| j d	krl|du rItj|d
ddddf |jd d tjd}n|dd}|d| d  |	df }|
d|d}n| j dkrst| |}| |}| |}| |}|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        rc  Nr   firstr   r|  r   r5  r=  .r  ro  )r   rd  )rb  r|  r   	full_likerI  longr  r  rq  rO  r6  r&  r"  rl  rj  r.  rn  )rh  r"  r=  r  r   r   r   r(  "  s.   



"




zSequenceSummary.forwardr   r:  r   r   r  r   ra    s    ra  	recursivec                 C   sN   t  ri }|rt dstd||d< t| fi |S t| dr%t| jS | S )a  
    Recursively unwraps a model from potential containers (as used in distributed training).

    Args:
        model (`torch.nn.Module`): The model to unwrap.
        recursive (`bool`, *optional*, defaults to `False`):
            Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
            recursively, not just the top-level distributed containers.
    z0.29.0zsSetting `recursive=True` to `unwrap_model` requires `accelerate` v0.29.0. Please upgrade your version of acceleratert  r   )rQ   r  rq   r  r  r   )r  rt  r   r   r   r   r  N  s   

r  c                    s4   i }|   D ]\ | fdd|D  q|S )zT
    Expand a device map to return the mapping from parameter name to device.
    c                    s2   i | ]}|ks|  d sdkr| qS )r   rd  r+  )r   pr   r   r   r   r  s  s   2 z%expand_device_map.<locals>.<dictcomp>)r   r  )r  param_namesnew_device_mapr   rv  r   r  l  s   r  r  r  c                 C   s>  dd |  D }t|sdS tr$tj r$tddd | j	D nd}t
dd }|  D ]4\}}| |}t|j|  }	|dur\td	d
|}
|	||
rYtj nd }	||  |	7  < q0|  D ]3\}}|jdkr|jdurz|jntj }tj|d }t|td| }tj|| tj|dd}qidS )aI  This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
    device. It allows making one large call to malloc, instead of recursively calling it later when loading
    the model, which is actually the loading speed bottleneck.
    Calling this function cuts the model loading time by a very large margin.

    A few facts related to loading speed (taking into account the use of this function):
    - When loading a model the first time, it is usually slower than the subsequent times, because the OS is very likely
    to cache the different state dicts (if enough resources/RAM are available)
    - Trying to force the OS to cache the files in advance (by e.g. accessing a small portion of them) is really hard,
    and not a good idea in general, as these are low-level OS optimizations that depend on resource usage anyway
    - As of 18/03/2025, loading a Llama 70B model with TP takes ~1 min without file cache, and ~13s with full file cache.
    The baseline, i.e. only loading the tensor shards on device and adjusting dtype (i.e. copying them) is ~5s with full cache.
    These numbers are reported for TP on 4 H100 GPUs.
    - It is useless to pre-allocate more than the model size in this function (i.e. using an `allocation_factor` > 1) as
    cudaMalloc is not a bottleneck at all anymore
    - Loading speed bottleneck is now almost only tensor copy (i.e. changing the dtype) and moving the tensors to the devices.
    However, we cannot really improve on those aspects obviously, as the data needs to be moved/copied in the end.
    c                 S   s$   i | ]\}}|d vr|t |qS r  )r   r   )r   r  r   r   r   r   r    s    z,caching_allocator_warmup.<locals>.<dictcomp>Nr  c                 S   r  r   r  )r   r  r   r   r   r     r  z,caching_allocator_warmup.<locals>.<listcomp>c                   S   r  )Nr   r   r   r   r   r   <lambda>  s    z*caching_allocator_warmup.<locals>.<lambda>z\.\d+\.z.*.r   r  r   gffffff?F)r   r   r  )r   r  _torch_distributed_availabler   r   r   r  r  r  r  r   r  mathprodrI  ru  subr  rf  r  r*  r  ra  mem_get_infors  r   rX  r  )r  r  r  accelerator_device_maptp_plan_regextotal_byte_countr  r   r  param_byte_countgeneric_name
byte_countr*  device_memoryr  r   r   r   r  x  s4   

r  c                 C   s   t t}| D ]-\}}t|dkr-|| vr-d|ddd }t|dkr-|| vs|| | |  q	dd | D S )zT
    Returns the list of shard files containing only weights offloaded to disk.
    r   r   Nr   c                 S   s"   g | ]\}}t |d hkr|qS )r  )r  )r   fnamedevicesr   r   r   r     r   z-get_disk_only_shard_files.<locals>.<listcomp>)r  r   r  r   r  r  r  r  )r  r   files_contentr  r  r   r   r   r    s   
r  c                   @   st   e Zd ZdZeeedZdd Zdd Z	dd Z
d	d
 Zdd Zdd ZededefddZdee fddZdS )AttentionInterfacea_  
    Dict-like object keeping track of allowed attention functions. You can easily add a new attention function
    with a call to `register()`. If a model needs to locally overwrite an existing attention function, say `sdpa`,
    it needs to declare a new instance of this class inside the `modeling_<model>.py`, and declare it on that instance.
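
    A minimal sketch of registering a custom attention function (the function body is a placeholder, and the calling
    convention is assumed to match the built-in implementations):

    ```python
    def my_attention(module, query, key, value, attention_mask, **kwargs):
        ...  # hypothetical attention implementation

    AttentionInterface.register("my_attention", my_attention)
    ```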
    )r  r  r  c                 C   s
   i | _ d S r   _local_mappingrk  r   r   r   r    s   
zAttentionInterface.__init__c                 C   s   || j v r
| j | S | j| S r   )r  _global_mappingrh  r   r   r   r   __getitem__  s   


zAttentionInterface.__getitem__c                 C      | j ||i d S r   )r  r  )rh  r   r  r   r   r   __setitem__     zAttentionInterface.__setitem__c                 C   s   | j |= d S r   r  r  r   r   r   __delitem__     zAttentionInterface.__delitem__c                 C   s   t i | j| jS r   )iterr  r  rk  r   r   r   __iter__  r  zAttentionInterface.__iter__c                 C   s   t | j | j B S r   )r  r  r  r  rk  r   r   r   __len__  s   zAttentionInterface.__len__r   r  c                 C   r  r   )r  r  )r
  r   r  r   r   r   register  r  zAttentionInterface.registerr   c                 C   s   t |  S r   )r  r  rk  r   r   r   r    r  zAttentionInterface.valid_keysN)r  rS  rT  r  r-   r.   r/   r  r  r  r  r  r  r  r  r   r   r  r   r  r   r   r   r   r    s    	r  r  )TT)Fr  T)rd  r  )
NNNNNNFNNNr   r  )r  ("  r  r  r  r  importlib.metadatar  r  r	  r  r{  r   r  r  r  rz  r   collections.abcr   
contextlibr   dataclassesr   enumr   r   r   	threadingr	   typingr
   r   r   r   r   r   r   r   r   r   zipfiler   r   torch.distributed.tensorhuggingface_hubr   	packagingr   r   r   torch.distributionsr   torch.nnr   r   torch.utils.checkpointr   transformers.utilsr   torchao.quantizationr   activationsr    configuration_utilsr!   dynamic_module_utilsr"   
generationr#   r$   r%   integrationsr&   r'   r(   integrations.accelerater)   r*   integrations.deepspeedr+   r,   integrations.flash_attentionr-   integrations.flex_attentionr.   integrations.sdpa_attentionr/   integrations.tensor_parallelr0   r1   loss.loss_utilsr2   pytorch_utilsr3   r4   r5   r6   r7   r8   r9   
quantizersr:   r;   quantizers.quantizers_utilsr<   safetensors_conversionr=   r  r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   	utils.hubrb   rc   utils.import_utilsrd   re   rf   rg   utils.quantization_configrh   ri   r   r   upperrj   rl   rw   rm   rn   accelerate.hooksro   accelerate.utilsrp   rq   rr   rs   rt   ru   rv   r]  rS  r  accelerate.utils.modelingry   safetensorsrz   safetensors.torchr{   r  r|   r  r  
get_loggerr  r  r   r   r   r   r   rz  r   r   !smdistributed.modelparallel.torchmodelparallelr  smdistributed.modelparallelr   SMP_VERSIONr  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r3  r  uint8int8int16r  r   int32r   float64int64r  rW  uint16uint32uint64float8_e5m2r   r  r   r  rp  r   rw  r  r  r  r  r   r  r  no_gradr  r  r  r  r  r'  rC  rO  rP  r   r   r  r  rJ  r  r*  r<  r?  rF  ra  r  r  r  r  r  r  rU  r   r   r   r   <module>   s  
0$	&$


	


	1
U









Q	>>

	


 	
  ;
P

?<

8  :                           l&EAuc

6/