"""
Torch utilities for the Trainer class.
"""

import copy
import datetime
import io
import json
import math
import os
import sys
import warnings
from collections.abc import Iterator, Mapping
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import chain
from logging import StreamHandler
from typing import Any, Optional, Union

import numpy as np
import torch
import torch.distributed as dist
from torch import nn
from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler
from torch.utils.data.distributed import DistributedSampler

from .integrations.deepspeed import is_deepspeed_zero3_enabled
from .tokenization_utils_base import BatchEncoding
from .utils import (
    is_sagemaker_mp_enabled,
    is_torch_available,
    is_torch_xla_available,
    is_training_run_on_sagemaker,
    logging,
)


if is_training_run_on_sagemaker():
    logging.add_handler(StreamHandler(sys.stdout))

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

if is_torch_available():
    from torch.optim.lr_scheduler import LRScheduler


logger = logging.get_logger(__name__)


def get_dataloader_sampler(dataloader):
    if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None:
        return get_dataloader_sampler(dataloader.batch_sampler)
    elif hasattr(dataloader, "sampler"):
        return dataloader.sampler


def atleast_1d(tensor_or_array: Union[torch.Tensor, np.ndarray]):
    if isinstance(tensor_or_array, torch.Tensor):
        if hasattr(torch, "atleast_1d"):
            tensor_or_array = torch.atleast_1d(tensor_or_array)
        elif tensor_or_array.ndim < 1:
            tensor_or_array = tensor_or_array[None]
    else:
        tensor_or_array = np.atleast_1d(tensor_or_array)
    return tensor_or_array


def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
    """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary."""
    tensor1 = atleast_1d(tensor1)
    tensor2 = atleast_1d(tensor2)

    if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
        return torch.cat((tensor1, tensor2), dim=0)

    # Let's figure out the new shape
    new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:]

    # Now let's fill the result tensor
    result = tensor1.new_full(new_shape, padding_index)
    result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1
    result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2
    return result


def numpy_pad_and_concatenate(array1, array2, padding_index=-100):
    """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary."""
    array1 = atleast_1d(array1)
    array2 = atleast_1d(array2)

    if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]:
        return np.concatenate((array1, array2), axis=0)

    # Let's figure out the new shape
    new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:]

    # Now let's fill the result array
    result = np.full_like(array1, padding_index, shape=new_shape)
    result[: array1.shape[0], : array1.shape[1]] = array1
    result[array1.shape[0] :, : array2.shape[1]] = array2
    return result
def nested_concat(tensors, new_tensors, padding_index=-100):
    """
    Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or
    nested list/tuples/dict of tensors.
    """
    if not (isinstance(tensors, torch.Tensor) and isinstance(new_tensors, torch.Tensor)):
        assert type(tensors) is type(new_tensors), (
            f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
        )
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
    elif isinstance(tensors, torch.Tensor):
        return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
    elif isinstance(tensors, Mapping):
        return type(tensors)(
            {k: nested_concat(t, new_tensors[k], padding_index=padding_index) for k, t in tensors.items()}
        )
    elif isinstance(tensors, np.ndarray):
        return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
    else:
        raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}")
def find_batch_size(tensors):
    """
    Find the first dimension of a tensor in a nested list/tuple/dict of tensors.
    """
    if isinstance(tensors, (list, tuple)):
        for t in tensors:
            result = find_batch_size(t)
            if result is not None:
                return result
    elif isinstance(tensors, Mapping):
        for key, value in tensors.items():
            result = find_batch_size(value)
            if result is not None:
                return result
    elif isinstance(tensors, torch.Tensor):
        return tensors.shape[0] if len(tensors.shape) >= 1 else None
    elif isinstance(tensors, np.ndarray):
        return tensors.shape[0] if len(tensors.shape) >= 1 else None


def nested_numpify(tensors):
    """Numpify `tensors` (even if it's a nested list/tuple/dict of tensors)."""
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_numpify(t) for t in tensors)
    if isinstance(tensors, Mapping):
        return type(tensors)({k: nested_numpify(t) for k, t in tensors.items()})

    t = tensors.cpu()
    if t.dtype == torch.bfloat16:
        # NumPy does not support bfloat16, so upcast to float32 before converting.
        t = t.to(torch.float32)
    return t.numpy()


def nested_detach(tensors):
    """Detach `tensors` (even if it's a nested list/tuple/dict of tensors)."""
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_detach(t) for t in tensors)
    elif isinstance(tensors, Mapping):
        return type(tensors)({k: nested_detach(t) for k, t in tensors.items()})
    return tensors.detach() if isinstance(tensors, torch.Tensor) else tensors


def nested_xla_mesh_reduce(tensors, name):
    if is_torch_xla_available():
        import torch_xla.core.xla_model as xm

        if isinstance(tensors, (list, tuple)):
            return type(tensors)(nested_xla_mesh_reduce(t, f"{name}_{i}") for i, t in enumerate(tensors))
        if isinstance(tensors, Mapping):
            return type(tensors)(
                {k: nested_xla_mesh_reduce(t, f"{name}_{i}_{k}") for i, (k, t) in enumerate(tensors.items())}
            )

        tensors = atleast_1d(tensors)
        return xm.mesh_reduce(name, tensors, torch.cat)
    else:
        raise ImportError("Torch xla must be installed to use `nested_xla_mesh_reduce`")


def distributed_concat(tensor: Any, num_total_examples: Optional[int] = None) -> Any:
    try:
        if isinstance(tensor, (tuple, list)):
            return type(tensor)(distributed_concat(t, num_total_examples) for t in tensor)
        if isinstance(tensor, Mapping):
            return type(tensor)({k: distributed_concat(t, num_total_examples) for k, t in tensor.items()})
        tensor = atleast_1d(tensor).contiguous()
        output_tensors = [tensor.clone() for _ in range(dist.get_world_size())]
        dist.all_gather(output_tensors, tensor)
        concat = torch.cat(output_tensors, dim=0)

        # truncate the dummy elements added by SequentialDistributedSampler
        if num_total_examples is not None:
            concat = concat[:num_total_examples]
        return concat
    except AssertionError:
        raise AssertionError("Not currently using distributed training")


def distributed_broadcast_scalars(
    scalars: list[Union[int, float]],
    num_total_examples: Optional[int] = None,
    device: Optional[torch.device] = torch.device("cuda"),
) -> torch.Tensor:
    try:
        tensorized_scalar = torch.tensor(scalars, device=device)
        output_tensors = [tensorized_scalar.clone() for _ in range(dist.get_world_size())]
        dist.all_gather(output_tensors, tensorized_scalar)
        concat = torch.cat(output_tensors, dim=0)

        # truncate the dummy elements added by SequentialDistributedSampler
        if num_total_examples is not None:
            concat = concat[:num_total_examples]
        return concat
    except AssertionError:
        raise AssertionError("Not currently using distributed training")


def reissue_pt_warnings(caught_warnings):
    # Reissue warnings that are not UserWarning
    if len(caught_warnings) > 1:
        for w in caught_warnings:
            if w.category is not UserWarning:
                warnings.warn(w.message, w.category)
@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """
    Decorator to make all processes in distributed training wait for each local_master to do something.

    Args:
        local_rank (`int`): The rank of the local process.
    """
    if local_rank not in [-1, 0]:
        dist.barrier()
    yield
    if local_rank == 0:
        dist.barrier()
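# Example (illustrative): only the local main process does the expensive preprocessing/caching while the other
# ranks wait at the barrier and then read the cached result. `build_or_load_cache` is a hypothetical helper.
#
#     with torch_distributed_zero_first(local_rank):
#         dataset = build_or_load_cache(data_args)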
class DistributedSamplerWithLoop(DistributedSampler):
    """
    Like a `torch.utils.data.distributed.DistributedSampler` but loops at the end back to the beginning of the shuffled
    samples to make each process have a round multiple of batch_size samples.

    Args:
        dataset (`torch.utils.data.Dataset`):
            Dataset used for sampling.
        batch_size (`int`):
            The batch size used with this sampler
        kwargs (`Dict[str, Any]`, *optional*):
            All other keyword arguments passed to `DistributedSampler`.
    """

    def __init__(self, dataset, batch_size, **kwargs):
        super().__init__(dataset, **kwargs)
        self.batch_size = batch_size

    def __iter__(self):
        indices = list(super().__iter__())
        remainder = 0 if len(indices) % self.batch_size == 0 else self.batch_size - len(indices) % self.batch_size
        # DistributedSampler already added samples from the beginning to make the number of samples a round multiple
        # of the world size, so we skip those.
        start_remainder = 1 if self.rank < len(self.dataset) % self.num_replicas else 0
        indices += indices[start_remainder : start_remainder + remainder]
        return iter(indices)
class EvalLoopContainer:
    """
    Container to store intermediate results of evaluation loop.

    Args:
        do_nested_concat (`bool`, *optional*, defaults to `True`):
            If set to `True`, each iteration will recursively concatenate a new object containing tensors to
            the existing stored tensors, provided that the structure of the existing object and the new one
            are identical. If set to `False`, all newly added tensors will be stored in a list.
        padding_index (`int`, *optional*, defaults to -100):
            Value used to pad tensors of different shapes when `do_nested_concat=True`.
    """

    def __init__(self, do_nested_concat: bool = True, padding_index: int = -100):
        self.do_nested_concat = do_nested_concat
        self.padding_index = padding_index
        self.tensors = None
        self.arrays = None

    def add(self, tensors) -> None:
        """Add tensors to the stored objects. If `do_nested_concat=True`, the tensors will be concatenated recursively."""
        if self.tensors is None:
            self.tensors = tensors if self.do_nested_concat else [tensors]
        elif self.do_nested_concat:
            self.tensors = nested_concat(self.tensors, tensors, padding_index=self.padding_index)
        else:
            self.tensors.append(tensors)

    def to_cpu_and_numpy(self) -> None:
        """Move tensors in stored objects to CPU and convert them to numpy arrays."""

        # Check if we have something to add, if not just return
        if self.tensors is None:
            return

        new_arrays = nested_numpify(self.tensors)
        if self.arrays is None:
            self.arrays = new_arrays
        elif self.do_nested_concat:
            self.arrays = nested_concat(self.arrays, new_arrays, padding_index=self.padding_index)
        else:
            self.arrays.extend(new_arrays)

        # Reset the stored device tensors after they have been offloaded
        self.tensors = None

    def get_arrays(self):
        """Returns the numpified and moved to CPU stored objects."""
        self.to_cpu_and_numpy()
        return self.arrays
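# Example (illustrative) of how a prediction loop can use the container above: logits are accumulated on
# device, periodically offloaded to CPU/NumPy, and retrieved once at the end. Assumes `model` returns an
# object with a `logits` field and `dataloader` yields dict batches.
#
#     all_preds = EvalLoopContainer(do_nested_concat=True, padding_index=-100)
#     for step, batch in enumerate(dataloader):
#         with torch.no_grad():
#             logits = model(**batch).logits
#         all_preds.add(logits)
#         if step % 100 == 0:
#             all_preds.to_cpu_and_numpy()  # free accelerator memory regularly
#     predictions = all_preds.get_arrays()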
class SequentialDistributedSampler(Sampler):
    """
    Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end.

    Even though we only use this sampler for eval and predict (no training), which means that the model params won't
    have to be synced (i.e. will not hang for synchronization even if varied number of forward passes), we still add
    extra samples to the sampler to make it evenly divisible (like in `DistributedSampler`) to make it easy to `gather`
    or `reduce` resulting tensors at the end of the loop.
    """

    def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
        warnings.warn(
            "SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        num_samples = len(self.dataset)
        # Add extra samples to make num_samples a multiple of batch_size if passed
        if batch_size is not None:
            self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
        else:
            self.num_samples = int(math.ceil(num_samples / num_replicas))
        self.total_size = self.num_samples * self.num_replicas
        self.batch_size = batch_size

    def __iter__(self):
        indices = list(range(len(self.dataset)))

        # add extra samples to make it evenly divisible
        indices += indices[: (self.total_size - len(indices))]
        assert (
            len(indices) == self.total_size
        ), f"Indices length {len(indices)} and total size {self.total_size} mismatched"

        # subsample
        indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples]
        assert (
            len(indices) == self.num_samples
        ), f"Indices length {len(indices)} and sample number {self.num_samples} mismatched"

        return iter(indices)

    def __len__(self):
        return self.num_samples


def get_tpu_sampler(dataset: torch.utils.data.Dataset, batch_size: int):
    if xm.xrt_world_size() <= 1:
        return RandomSampler(dataset)
    return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal())


def nested_new_like(arrays, num_samples, padding_index=-100):
    """Create the same nested structure as `arrays` with a first dimension always at `num_samples`."""
    if isinstance(arrays, (list, tuple)):
        return type(arrays)(nested_new_like(x, num_samples) for x in arrays)
    return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:]))


def expand_like(arrays, new_seq_length, padding_index=-100):
    """Expand the `arrays` so that the second dimension grows to `new_seq_length`. Uses `padding_index` for padding."""
    result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:])
    result[:, : arrays.shape[1]] = arrays
    return result


def nested_truncate(tensors, limit):
    """Truncate `tensors` at `limit` (even if it's a nested list/tuple/dict of tensors)."""
    if isinstance(tensors, (list, tuple)):
        return type(tensors)(nested_truncate(t, limit) for t in tensors)
    if isinstance(tensors, Mapping):
        return type(tensors)({k: nested_truncate(t, limit) for k, t in tensors.items()})
    return tensors[:limit]
class DistributedTensorGatherer:
    """
    A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks.

    If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on CPU at every
    step, our sampler will generate the following indices:

        `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]`

    to get something of size a multiple of 3 (so that each process gets the same dataset length). Then process 0, 1 and
    2 will be responsible of making predictions for the following samples:

        - P0: `[0, 1, 2, 3, 4, 5]`
        - P1: `[6, 7, 8, 9, 10, 11]`
        - P2: `[12, 13, 14, 15, 0, 1]`

    The first batch treated on each process will be:

        - P0: `[0, 1]`
        - P1: `[6, 7]`
        - P2: `[12, 13]`

    So if we gather at the end of the first batch, we will get a tensor (nested list/tuple of tensor) corresponding to
    the following indices:

        `[0, 1, 6, 7, 12, 13]`

    If we directly concatenate our results without taking any precautions, the user will then get the predictions for
    the indices in this order at the end of the prediction loop:

        `[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]`

    For some reason, that's not going to roll their boat. This class is there to solve that problem.

    Args:
        world_size (`int`):
            The number of processes used in the distributed training.
        num_samples (`int`):
            The number of samples in our dataset.
        make_multiple_of (`int`, *optional*):
            If passed, the class assumes the datasets passed to each process are made to be a multiple of this argument
            (by adding samples).
        padding_index (`int`, *optional*, defaults to -100):
            The padding index to use if the arrays don't all have the same sequence length.
    """

    def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
        warnings.warn(
            "DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.world_size = world_size
        self.num_samples = num_samples
        total_size = world_size if make_multiple_of is None else world_size * make_multiple_of
        self.total_samples = int(np.ceil(num_samples / total_size)) * total_size
        self.process_length = self.total_samples // world_size
        self._storage = None
        self._offsets = None
        self.padding_index = padding_index

    def add_arrays(self, arrays):
        """
        Add `arrays` to the internal storage, Will initialize the storage to the full size at the first arrays passed
        so that if we're bound to get an OOM, it happens at the beginning.
        """
        if arrays is None:
            return
        if self._storage is None:
            self._storage = nested_new_like(arrays, self.total_samples, padding_index=self.padding_index)
            self._offsets = list(range(0, self.total_samples, self.process_length))

        slice_len, self._storage = self._nested_set_tensors(self._storage, arrays)
        for i in range(self.world_size):
            self._offsets[i] += slice_len

    def _nested_set_tensors(self, storage, arrays):
        if isinstance(arrays, (list, tuple)):
            result = [self._nested_set_tensors(x, y) for x, y in zip(storage, arrays)]
            return result[0][0], type(arrays)(r[1] for r in result)
        assert (
            arrays.shape[0] % self.world_size == 0
        ), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}."

        slice_len = arrays.shape[0] // self.world_size
        for i in range(self.world_size):
            if len(arrays.shape) == 1:
                storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len]
            else:
                # Expand the storage on the second dimension if the new arrays are bigger.
                if len(storage.shape) > 1 and storage.shape[1] < arrays.shape[1]:
                    storage = expand_like(storage, arrays.shape[1], padding_index=self.padding_index)
                storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[
                    i * slice_len : (i + 1) * slice_len
                ]
        return slice_len, storage

    def finalize(self):
        """
        Return the properly gathered arrays and truncate to the number of samples (since the sampler added some extras
        to get each process a dataset of the same length).
        """
        if self._storage is None:
            return
        if self._offsets[0] != self.process_length:
            logger.warning("Not all data has been set. Are you sure you passed all values?")
        return nested_truncate(self._storage, self.num_samples)
@dataclass
class LabelSmoother:
    """
    Adds label-smoothing on a pre-computed output from a Transformers model.

    Args:
        epsilon (`float`, *optional*, defaults to 0.1):
            The label smoothing factor.
        ignore_index (`int`, *optional*, defaults to -100):
            The index in the labels to ignore when computing the loss.
    """

    epsilon: float = 0.1
    ignore_index: int = -100

    def __call__(self, model_output, labels, shift_labels=False):
        logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0]
        if shift_labels:
            logits = logits[..., :-1, :].contiguous()
            labels = labels[..., 1:].contiguous()

        log_probs = -nn.functional.log_softmax(logits, dim=-1)
        if labels.dim() == log_probs.dim() - 1:
            labels = labels.unsqueeze(-1)

        padding_mask = labels.eq(self.ignore_index)
        # In case the ignore_index is -100, the gather will fail, so we replace labels by 0. The padding_mask
        # will ignore them in any case.
        labels = torch.clamp(labels, min=0)
        nll_loss = log_probs.gather(dim=-1, index=labels)
        # works for fp16 input tensor too, by internally upcasting it to fp32
        smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)

        nll_loss.masked_fill_(padding_mask, 0.0)
        smoothed_loss.masked_fill_(padding_mask, 0.0)

        # Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded):
        num_active_elements = padding_mask.numel() - padding_mask.long().sum()
        nll_loss = nll_loss.sum() / num_active_elements
        smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
        return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss
def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None):
    """
    Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of similar
    lengths. To do this, the indices are:

    - randomly permuted
    - grouped in mega-batches of size `mega_batch_mult * batch_size`
    - sorted by length in each mega-batch

    The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of
    maximum length placed first, so that an OOM happens sooner rather than later.
    """
    # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller.
    if mega_batch_mult is None:
        mega_batch_mult = min(len(lengths) // (batch_size * 4), 50)
        # Just in case, for tiny datasets
        if mega_batch_mult == 0:
            mega_batch_mult = 1

    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    indices = torch.randperm(len(lengths), generator=generator)
    megabatch_size = mega_batch_mult * batch_size
    megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
    megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]

    # The rest is to get the biggest batch first.
    # Since each megabatch is sorted by descending length, the longest element is the first.
    megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches]
    max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item()
    # Switch to put the longest element in first position
    megabatches[0][0], megabatches[max_idx][0] = megabatches[max_idx][0], megabatches[0][0]

    return [i for megabatch in megabatches for i in megabatch]
class LengthGroupedSampler(Sampler):
    """
    Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
    keeping a bit of randomness.
    """

    def __init__(
        self,
        batch_size: int,
        dataset: Optional[Dataset] = None,
        lengths: Optional[list[int]] = None,
        model_input_name: Optional[str] = None,
        generator=None,
    ):
        if dataset is None and lengths is None:
            raise ValueError("One of dataset and lengths must be provided.")

        self.batch_size = batch_size
        if lengths is None:
            model_input_name = model_input_name if model_input_name is not None else "input_ids"
            if (
                not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
                or model_input_name not in dataset[0]
            ):
                raise ValueError(
                    "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                    f"'{model_input_name}' key."
                )
            lengths = [len(feature[model_input_name]) for feature in dataset]
        elif isinstance(lengths, torch.Tensor):
            logger.info(
                "If lengths is a torch.Tensor, LengthGroupedSampler will be slow. Converting lengths to List[int]..."
            )
            lengths = lengths.tolist()

        self.lengths = lengths
        self.generator = generator

    def __len__(self):
        return len(self.lengths)

    def __iter__(self):
        indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=self.generator)
        return iter(indices)
class DistributedLengthGroupedSampler(DistributedSampler):
    """
    Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
    length while keeping a bit of randomness.
    """

    # Copied and adapted from PyTorch DistributedSampler.
    def __init__(
        self,
        batch_size: int,
        dataset: Optional[Dataset] = None,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        seed: int = 0,
        drop_last: bool = False,
        lengths: Optional[list[int]] = None,
        model_input_name: Optional[str] = None,
    ):
        if dataset is None and lengths is None:
            raise ValueError("One of dataset and lengths must be provided.")
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()

        self.batch_size = batch_size
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.drop_last = drop_last

        if lengths is None:
            model_input_name = model_input_name if model_input_name is not None else "input_ids"
            if (
                not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding))
                or model_input_name not in dataset[0]
            ):
                raise ValueError(
                    "Can only automatically infer lengths for datasets whose items are dictionaries with an "
                    f"'{model_input_name}' key."
                )
            lengths = [len(feature[model_input_name]) for feature in dataset]
        elif isinstance(lengths, torch.Tensor):
            logger.info(
                "If lengths is a torch.Tensor, DistributedLengthGroupedSampler will be slow. Converting lengths to"
                " List[int]..."
            )
            lengths = lengths.tolist()

        self.lengths = lengths

        # If the dataset length is not evenly divisible by the number of replicas, either drop the tail or add
        # extra samples so that each rank receives the same amount of data.
        if self.drop_last and len(self.lengths) % self.num_replicas != 0:
            self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas)
        else:
            self.num_samples = math.ceil(len(self.lengths) / self.num_replicas)
        self.total_size = self.num_samples * self.num_replicas
        self.seed = seed

    def __iter__(self) -> Iterator:
        # Deterministically shuffle based on epoch and seed
        g = torch.Generator()
        g.manual_seed(self.seed + self.epoch)
        indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)

        if not self.drop_last:
            # add extra samples to make it evenly divisible
            indices += indices[: (self.total_size - len(indices))]
        else:
            # remove tail of data to make it evenly divisible
            indices = indices[: self.total_size]
        assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank : self.total_size : self.num_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)
class ShardSampler(Sampler):
    """
    Sampler that shards batches between several processes. Dispatches indices batch by batch: on 2 processes with batch
    size 4, the first two batches are `[0, 1, 2, 3, 4, 5, 6, 7]` and `[8, 9, 10, 11, 12, 13, 14, 15]`, which shard into
    `[0, 1, 2, 3]` and `[8, 9, 10, 11]` for GPU-0 and `[4, 5, 6, 7]` and `[12, 13, 14, 15]` for GPU-1.

    The sampler thus yields `[0, 1, 2, 3, 8, 9, 10, 11]` on GPU-0 and `[4, 5, 6, 7, 12, 13, 14, 15]` on GPU-1.
    """

    def __init__(
        self,
        dataset: Dataset,
        batch_size: int = 1,
        drop_last: bool = False,
        num_processes: int = 1,
        process_index: int = 0,
    ):
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.num_processes = num_processes
        self.process_index = process_index

        self.total_batch_size = total_batch_size = batch_size * num_processes

        num_batches = len(dataset) // total_batch_size if drop_last else math.ceil(len(dataset) / total_batch_size)
        self.total_num_samples = num_batches * total_batch_size

    def __iter__(self):
        indices = list(range(len(self.dataset)))

        # Add extra samples to make it evenly divisible. While loop is there in the edge case we have a tiny dataset
        # and it needs to be done several times.
        while len(indices) < self.total_num_samples:
            indices += indices[: (self.total_num_samples - len(indices))]

        result = []
        for batch_start in range(self.batch_size * self.process_index, self.total_num_samples, self.total_batch_size):
            result += indices[batch_start : batch_start + self.batch_size]

        return iter(result)

    def __len__(self):
        # Each shard only sees a fraction of total_num_samples.
        return self.total_num_samples // self.num_processes
class IterableDatasetShard(IterableDataset):
    """
    Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class will
    always yield a number of samples that is a round multiple of the actual batch size (which is `batch_size x
    num_processes`). Depending on the value of the `drop_last` attribute, it will either stop the iteration at the
    first batch that would be too small or loop with indices from the beginning.

    On two processes with an iterable dataset yielding of `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` with a batch size of
    2:

    - the shard on process 0 will yield `[0, 1, 4, 5, 8, 9]` so will see batches `[0, 1]`, `[4, 5]`, `[8, 9]`
    - the shard on process 1 will yield `[2, 3, 6, 7, 10, 11]` so will see batches `[2, 3]`, `[6, 7]`, `[10, 11]`

    <Tip warning={true}>

        If your IterableDataset implements some randomization that needs to be applied the same way on all processes
        (for instance, a shuffling), you should use a `torch.Generator` in a `generator` attribute of the `dataset` to
        generate your random numbers and call the [`~trainer_pt_utils.IterableDatasetShard.set_epoch`] method of this
        object. It will set the seed of this `generator` to `seed + epoch` on all processes before starting the
        iteration. Alternatively, you can also implement a `set_epoch()` method in your iterable dataset to deal with
        this.

    </Tip>

    Args:
        dataset (`torch.utils.data.IterableDataset`):
            The batch sampler to split in several shards.
        batch_size (`int`, *optional*, defaults to 1):
            The size of the batches per shard.
        drop_last (`bool`, *optional*, defaults to `False`):
            Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the
            beginning.
        num_processes (`int`, *optional*, defaults to 1):
            The number of processes running concurrently.
        process_index (`int`, *optional*, defaults to 0):
            The index of the current process.
        seed (`int`, *optional*, defaults to 0):
            A random seed that will be used for the random number generation in
            [`~trainer_pt_utils.IterableDatasetShard.set_epoch`].
    """

    def __init__(
        self,
        dataset: IterableDataset,
        batch_size: int = 1,
        drop_last: bool = False,
        num_processes: int = 1,
        process_index: int = 0,
        seed: int = 0,
    ):
        self.dataset = dataset
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.num_processes = num_processes
        self.process_index = process_index
        self.seed = seed
        self.epoch = 0
        self.num_examples = 0

    def set_epoch(self, epoch):
        self.epoch = epoch
        if hasattr(self.dataset, "set_epoch"):
            self.dataset.set_epoch(epoch)

    def __iter__(self):
        self.num_examples = 0
        if (
            not hasattr(self.dataset, "set_epoch")
            and hasattr(self.dataset, "generator")
            and isinstance(self.dataset.generator, torch.Generator)
        ):
            self.dataset.generator.manual_seed(self.seed + self.epoch)
        real_batch_size = self.batch_size * self.num_processes
        process_slice = range(self.process_index * self.batch_size, (self.process_index + 1) * self.batch_size)

        first_batch = None
        current_batch = []
        for element in self.dataset:
            self.num_examples += 1
            current_batch.append(element)
            # Wait to have a full batch before yielding elements.
            if len(current_batch) == real_batch_size:
                for i in process_slice:
                    yield current_batch[i]
                if first_batch is None:
                    first_batch = current_batch.copy()
                current_batch = []

        # Finished if drop_last is True, otherwise complete the last batch with elements from the beginning.
        if not self.drop_last and len(current_batch) > 0:
            if first_batch is None:
                first_batch = current_batch.copy()
            while len(current_batch) < real_batch_size:
                current_batch += first_batch
            for i in process_slice:
                yield current_batch[i]

    def __len__(self):
        # Will raise an error if the underlying dataset is not sized.
        if self.drop_last:
            return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size
        else:
            return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size


def _get_learning_rate(self):
    if self.is_deepspeed_enabled:
        # With DeepSpeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler may not have stepped yet
        # during the first few iterations, in which case `get_last_lr` raises an assertion error.
        try:
            last_lr = self.lr_scheduler.get_last_lr()[0]
        except AssertionError as e:
            if "need to call step" in str(e):
                logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0")
                last_lr = 0
            else:
                raise
    else:
        if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            last_lr = self.optimizer.param_groups[0]["lr"]
        else:
            last_lr = self.lr_scheduler.get_last_lr()[0]
        if torch.is_tensor(last_lr):
            last_lr = last_lr.item()
    return last_lr
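# Example (illustrative) of the IterableDatasetShard behavior documented above, on a toy iterable of 12
# integers split across two processes with a per-process batch size of 2. `my_iterable_dataset` is a
# hypothetical IterableDataset yielding 0..11 in order.
#
#     shard0 = IterableDatasetShard(my_iterable_dataset, batch_size=2, num_processes=2, process_index=0)
#     shard1 = IterableDatasetShard(my_iterable_dataset, batch_size=2, num_processes=2, process_index=1)
#     # list(shard0) == [0, 1, 4, 5, 8, 9] and list(shard1) == [2, 3, 6, 7, 10, 11]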
def _secs2timedelta(secs):
    """
    Convert seconds to hh:mm:ss.msec, msecs rounded to 2 decimal places.
    """
    msec = int(abs(secs - int(secs)) * 100)
    return f"{datetime.timedelta(seconds=int(secs))}.{msec:02d}"
def metrics_format(self, metrics: dict[str, float]) -> dict[str, float]:
    """
    Reformat Trainer metrics values to a human-readable format.

    Args:
        metrics (`Dict[str, float]`):
            The metrics returned from train/evaluate/predict

    Returns:
        metrics (`Dict[str, float]`): The reformatted metrics
    """

    metrics_copy = metrics.copy()
    for k, v in metrics_copy.items():
        if "_mem_" in k:
            metrics_copy[k] = f"{ v >> 20 }MB"
        elif "_runtime" in k:
            metrics_copy[k] = _secs2timedelta(v)
        elif k == "total_flos":
            metrics_copy[k] = f"{ int(v) >> 30 }GF"
        elif isinstance(metrics_copy[k], float):
            metrics_copy[k] = round(v, 4)

    return metrics_copy
def log_metrics(self, split, metrics):
    """
    Log metrics in a specially formatted way.

    Under distributed environment this is done only for a process with rank 0.

    Args:
        split (`str`):
            Mode/split name: one of `train`, `eval`, `test`
        metrics (`Dict[str, float]`):
            The metrics returned from train/evaluate/predictmetrics: metrics dict

    Notes on memory reports:

    In order to get memory usage report you need to install `psutil`. You can do that with `pip install psutil`.

    Now when this method is run, you will see a report that will include:

    ```
    init_mem_cpu_alloc_delta   =     1301MB
    init_mem_cpu_peaked_delta  =      154MB
    init_mem_gpu_alloc_delta   =      230MB
    init_mem_gpu_peaked_delta  =        0MB
    train_mem_cpu_alloc_delta  =     1345MB
    train_mem_cpu_peaked_delta =        0MB
    train_mem_gpu_alloc_delta  =      693MB
    train_mem_gpu_peaked_delta =        7MB
    ```

    **Understanding the reports:**

    - the first segment, e.g., `train__`, tells you which stage the metrics are for. Reports starting with `init_`
        will be added to the first stage that gets run. So that if only evaluation is run, the memory usage for the
        `__init__` will be reported along with the `eval_` metrics.
    - the third segment, is either `cpu` or `gpu`, tells you whether it's the general RAM or the gpu0 memory
        metric.
    - `*_alloc_delta` - is the difference in the used/allocated memory counter between the end and the start of the
        stage - it can be negative if a function released more memory than it allocated.
    - `*_peaked_delta` - is any extra memory that was consumed and then freed - relative to the current allocated
        memory counter - it is never negative. When you look at the metrics of any stage you add up `alloc_delta` +
        `peaked_delta` and you know how much memory was needed to complete that stage.

    The reporting happens only for process of rank 0 and gpu 0 (if there is a gpu). Typically this is enough since the
    main process does the bulk of work, but it could be not quite so if model parallel is used and then other GPUs may
    use a different amount of gpu memory. This is also not the same under DataParallel where gpu0 may require much more
    memory than the rest since it stores the gradient and optimizer states for all participating GPUs. Perhaps in the
    future these reports will evolve to measure those too.

    The CPU RAM metric measures RSS (Resident Set Size) includes both the memory which is unique to the process and the
    memory shared with other processes. It is important to note that it does not include swapped out memory, so the
    reports could be imprecise.

    The CPU peak memory is measured using a sampling thread. Due to python's GIL it may miss some of the peak memory if
    that thread didn't get a chance to run when the highest memory was used. Therefore this report can be less than
    reality. Using `tracemalloc` would have reported the exact peak memory, but it doesn't report memory allocations
    outside of python. So if some C++ CUDA extension allocated its own memory it won't be reported. And therefore it
    was dropped in favor of the memory sampling approach, which reads the current process memory usage.

    The GPU allocated and peak memory reporting is done with `torch.cuda.memory_allocated()` and
    `torch.cuda.max_memory_allocated()`. This metric reports only "deltas" for pytorch-specific allocations, as
    `torch.cuda` memory management system doesn't track any memory allocated outside of pytorch. For example, the very
    first cuda call typically loads CUDA kernels, which may take from 0.5 to 2GB of GPU memory.

    Note that this tracker doesn't account for memory allocations outside of [`Trainer`]'s `__init__`, `train`,
    `evaluate` and `predict` calls.

    Because `evaluation` calls may happen during `train`, we can't handle nested invocations because
    `torch.cuda.max_memory_allocated` is a single counter, so if it gets reset by a nested eval call, `train`'s tracker
    will report incorrect info. If this [pytorch issue](https://github.com/pytorch/pytorch/issues/16266) gets resolved
    it will be possible to change this class to be re-entrant. Until then we will only track the outer level of
    `train`, `evaluate` and `predict` methods. Which means that if `eval` is called during `train`, it's the latter
    that will account for its memory usage and that of the former.

    This also means that if any other tool that is used along the [`Trainer`] calls
    `torch.cuda.reset_peak_memory_stats`, the gpu peak memory stats could be invalid. And the [`Trainer`] will disrupt
    the normal behavior of any such tools that rely on calling `torch.cuda.reset_peak_memory_stats` themselves.

    For best performance you may want to consider turning the memory profiling off for production runs.
    """
    if not self.is_world_process_zero():
        return

    print(f"***** {split} metrics *****")
    metrics_formatted = self.metrics_format(metrics)
    k_width = max(len(str(x)) for x in metrics_formatted.keys())
    v_width = max(len(str(x)) for x in metrics_formatted.values())
    for key in sorted(metrics_formatted.keys()):
        print(f"  {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}")
def save_metrics(self, split, metrics, combined=True):
    """
    Save metrics into a json file for that split, e.g. `train_results.json`.

    Under distributed environment this is done only for a process with rank 0.

    Args:
        split (`str`):
            Mode/split name: one of `train`, `eval`, `test`, `all`
        metrics (`Dict[str, float]`):
            The metrics returned from train/evaluate/predict
        combined (`bool`, *optional*, defaults to `True`):
            Creates combined metrics by updating `all_results.json` with metrics of this call

    To understand the metrics please read the docstring of [`~Trainer.log_metrics`]. The only difference is that raw
    unformatted numbers are saved in the current method.

    """
    if not self.is_world_process_zero():
        return

    path = os.path.join(self.args.output_dir, f"{split}_results.json")
    with open(path, "w") as f:
        json.dump(metrics, f, indent=4, sort_keys=True)

    if combined:
        path = os.path.join(self.args.output_dir, "all_results.json")
        if os.path.exists(path):
            with open(path) as f:
                all_metrics = json.load(f)
        else:
            all_metrics = {}

        all_metrics.update(metrics)
        with open(path, "w") as f:
            json.dump(all_metrics, f, indent=4, sort_keys=True)
def save_state(self):
    """
    Saves the Trainer state, since Trainer.save_model saves only the tokenizer with the model.

    Under distributed environment this is done only for a process with rank 0.
    """
    if not self.is_world_process_zero():
        return

    path = os.path.join(self.args.output_dir, "trainer_state.json")
    self.state.save_to_json(path)
def get_model_param_count(model, trainable_only=False):
    """
    Calculate model's total param count. If trainable_only is True then count only those requiring grads.
    """
    if is_deepspeed_zero3_enabled():

        def numel(p):
            return p.ds_numel if hasattr(p, "ds_numel") else p.numel()

    else:

        def numel(p):
            return p.numel()

    return sum(numel(p) for p in model.parameters() if not trainable_only or p.requires_grad)
def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None):
    """
    Returns the names of the model parameters that are not inside a forbidden layer.
    """
    if forbidden_layer_names is None:
        forbidden_layer_names = []
    result = []
    for name, child in model.named_children():
        child_params = get_parameter_names(child, forbidden_layer_types, forbidden_layer_names)
        result += [
            f"{name}.{n}"
            for n in child_params
            if not isinstance(child, tuple(forbidden_layer_types))
            and not any(forbidden in f"{name}.{n}".lower() for forbidden in forbidden_layer_names)
        ]
    # Add model specific parameters (defined with nn.Parameter) since they are not in any child
    result += [
        k for k in model._parameters.keys() if not any(forbidden in k.lower() for forbidden in forbidden_layer_names)
    ]
    return result
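# Example (illustrative): the typical weight-decay grouping pattern built on top of the helper above,
# excluding LayerNorm/RMSNorm weights and biases from decay. The decay value 0.01 is just a placeholder.
#
#     decay_parameters = get_parameter_names(model, [nn.LayerNorm], ["bias", "layernorm", "rmsnorm"])
#     optimizer_grouped_parameters = [
#         {"params": [p for n, p in model.named_parameters() if n in decay_parameters], "weight_decay": 0.01},
#         {"params": [p for n, p in model.named_parameters() if n not in decay_parameters], "weight_decay": 0.0},
#     ]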
def get_module_class_from_name(module, name):
    """
    Gets a class from a module by its name.

    Args:
        module (`torch.nn.Module`): The module to get the class from.
        name (`str`): The name of the class.
    """
    modules_children = list(module.children())
    if module.__class__.__name__ == name:
        return module.__class__
    elif len(modules_children) == 0:
        return
    else:
        for child_module in modules_children:
            module_class = get_module_class_from_name(child_module, name)
            if module_class is not None:
                return module_class


def remove_dummy_checkpoint(is_main_process, output_dir, filenames):
    if is_main_process:
        for filename in filenames:
            file = os.path.join(output_dir, filename)
            if os.path.isfile(file):
                os.remove(file)


if is_sagemaker_mp_enabled():
    import smdistributed.modelparallel.torch as smp

    @smp.step()
    def smp_forward_backward(model, inputs, gradient_accumulation_steps=1):
        outputs = model(**inputs)
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        loss /= gradient_accumulation_steps
        model.backward(loss)
        return loss

    @smp.step()
    def smp_forward_only(model, inputs):
        return model(**inputs)

    def smp_gather(tensor):
        if isinstance(tensor, (list, tuple)):
            return type(tensor)(smp_gather(t) for t in tensor)
        elif isinstance(tensor, dict):
            return type(tensor)({k: smp_gather(v) for k, v in tensor.items()})
        elif not isinstance(tensor, torch.Tensor):
            raise TypeError(
                f"Can't gather the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors."
            )
        all_tensors = smp.allgather(tensor, smp.CommGroup.DP_GROUP)
        all_tensors = [atleast_1d(t) for t in all_tensors]
        return torch.cat([t.cpu() for t in all_tensors], dim=0)

    def smp_nested_concat(tensor):
        if isinstance(tensor, (list, tuple)):
            return type(tensor)(smp_nested_concat(t) for t in tensor)
        elif isinstance(tensor, dict):
            return type(tensor)({k: smp_nested_concat(v) for k, v in tensor.items()})
        # It doesn't seem possible to check here if `tensor` is a StepOutput because StepOutput lives in `smp.step`
        # which is also the name of the decorator so Python is confused.
        return tensor.concat().detach().cpu()
@dataclass
class AcceleratorConfig:
    """
    A subset of arguments relating to the underlying [`accelerate.Accelerator`]
    implementation utilized in the `Trainer` that can be customized.
    Mostly relating to data.

    Parameters:
        split_batches (`bool`, *optional*, defaults to `False`):
            Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If
            `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a
            round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set
            in your script multiplied by the number of processes.
        dispatch_batches (`bool`, *optional*):
            If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process
            and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose
            underlying dataset is an `IterableDataset`, `False` otherwise.
        even_batches (`bool`, *optional*, defaults to `True`):
            If set to `True`, in cases where the total batch size across all processes does not exactly divide the
            dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among
            all workers.
        use_seedable_sampler (`bool`, *optional*, defaults to `True`):
            Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
            training results are fully reproducible using a different sampling technique. While seed-to-seed results
            may differ, on average the differences are negligible when using multiple different seeds to compare. Should
            also be ran with [`~utils.set_seed`] for the best results.
        gradient_accumulation_kwargs (`dict`, *optional*):
            Additional kwargs to configure gradient accumulation, see [`accelerate.utils.GradientAccumulationPlugin`].
            Any of the following (optional) keys are acceptable:
              num_steps (`int`): Will take precedence over [`~.TrainingArguments.gradient_accumulation_steps`] if
                the latter is set to 1, otherwise an exception will be raised.
              adjust_scheduler (`bool`): Whether to adjust the scheduler steps to account for [`~.TrainingArguments.gradient_accumulation_steps`].
                The [`accelerate.utils.GradientAccumulationPlugin`] default is `True`.
              sync_each_batch (`bool`): Whether to synchronize the gradients at each data batch.
                The [`accelerate.utils.GradientAccumulationPlugin`] default is `False`.
        non_blocking (`bool`, *optional*, defaults to `False`):
            Whether to use non-blocking CUDA calls to help minimize synchronization during
            distributed training with prepared `DataLoader` inputs being moved to device.
            Best if used with `pin_memory=True` in the `TrainingArguments`.
        use_configured_state (`bool`, *optional*, defaults to `False`):
            Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined
            before calling `TrainingArguments`. If `True`, an `Accelerator` or `PartialState`
            must be initialized. May lead to issues using sweeps or hyperparameter tuning.
    """

    split_batches: bool = field(
        default=False,
        metadata={
            "help": "Whether or not the accelerator should split the batches yielded by the dataloaders across the"
            " devices. If `True` the actual batch size used will be the same on any kind of distributed processes, but"
            " it must be a round multiple of the `num_processes` you are using. If `False`, actual batch size used will"
            " be the one set in your script multiplied by the number of processes."
        },
    )
    dispatch_batches: Optional[bool] = field(
        default=None,
        metadata={
            "help": "If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main"
            " process and then the batches are split and broadcast to each process. Will default to `True` for"
            " `DataLoader` whose underlying dataset is an `IterableDataset`, `False` otherwise."
        },
    )
    even_batches: bool = field(
        default=True,
        metadata={
            "help": "If set to `True`, in cases where the total batch size across all processes does not exactly divide"
            " the dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally"
            " among all workers."
        },
    )
    use_seedable_sampler: bool = field(
        default=True,
        metadata={
            "help": "Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`])."
            "Ensures training results are fully reproducible using a different sampling technique. "
            "While seed-to-seed results may differ, on average the differences are negligible when using"
            "multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
        },
    )
    non_blocking: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Whether to use non-blocking CUDA calls to help minimize synchronization during "
            "distributed training with prepared `DataLoader` inputs being moved to device. "
            "Best if used with `pin_memory=True` in the `TrainingArguments`. Requires accelerate v0.30.0."
        },
    )
    gradient_accumulation_kwargs: Optional[dict] = field(
        default=None,
        metadata={
            "help": "Additional kwargs to configure gradient accumulation, see [`accelerate.utils.GradientAccumulationPlugin`]. "
            "Any of the following (optional) keys are acceptable: "
            "  num_steps (`int`): Will take precedence over [`~.TrainingArguments.gradient_accumulation_steps`] if "
            "    the latter is set to 1, otherwise an exception will be raised. "
            "  adjust_scheduler (`bool`): Whether to adjust the scheduler steps to account for [`~.TrainingArguments.gradient_accumulation_steps`]. "
            "    The [`accelerate.utils.GradientAccumulationPlugin`] default is `True`. "
            "  sync_each_batch (`bool`): Whether to synchronize the gradients at each data batch. "
            "    The [`accelerate.utils.GradientAccumulationPlugin`] default is `False`."
        },
    )
    use_configured_state: bool = field(
        default=False,
        metadata={
            "help": "Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling"
            " `TrainingArguments`. If `True`, an `Accelerator` or `PartialState` must be initialized. May lead to issues"
            " using sweeps or hyperparameter tuning."
        },
    )

    @classmethod
    def from_json_file(cls, json_file):
        # Check if exists
        open_file = io.open if os.path.exists(json_file) else open
        with open_file(json_file, "r", encoding="utf-8") as f:
            config_dict = json.load(f)
        # Check for keys and load sensible defaults
        extra_keys = sorted(key for key in config_dict.keys() if key not in cls.__dataclass_fields__.keys())
        if len(extra_keys) > 0:
            raise ValueError(
                f"The config file at {json_file} had unknown keys ({extra_keys}), please try upgrading your"
                " `transformers` version or fix (and potentially remove these keys) from your config file."
            )
        return cls(**config_dict)

    def to_dict(self):
        return copy.deepcopy(self.__dict__)

    def pop(self, key, default=None):
        return self.__dict__.pop(key, default)
class LayerWiseDummyOptimizer(torch.optim.Optimizer):
    """
    For Layer-wise optimizers such as GaLoRE optimizer, the optimization
    step is already done through the post gradient hooks. Therefore
    the trick is to create a dummy optimizer that can take arbitrary
    args and kwargs and return a no-op during training.

    Initial idea from @hiyouga in LLaMA-Factory:
    https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e#diff-ebe08ab14496dfb9e06075f0fdd36799ef6d1535cc4dd4715b74c4e3e06fe3ba
    """

    def __init__(self, optimizer_dict=None, *args, **kwargs):
        dummy_tensor = torch.randn(1, 1)
        self.optimizer_dict = optimizer_dict
        super().__init__([dummy_tensor], {"lr": kwargs.get("lr", 1e-03)})

    def zero_grad(self, set_to_none: bool = True) -> None:
        pass

    def step(self, closure=None) -> Optional[float]:
        pass
class LayerWiseDummyScheduler(LRScheduler):
    """
    For Layer-wise optimizers such as GaLoRE optimizer, the optimization and scheduling step
    are already done through the post gradient hooks. Therefore
    the trick is to create a dummy scheduler that can take arbitrary
    args and kwargs and return a no-op during training.
    """

    def __init__(self, *args, **kwargs):
        self.default_lr = kwargs["lr"]
        optimizer = LayerWiseDummyOptimizer(**kwargs)
        last_epoch = -1
        verbose = False
        super().__init__(optimizer, last_epoch, verbose)

    def get_lr(self):
        # default value
        lrs = [self.default_lr]

        # we take each lr in the param groups if they exist, assuming the optimizer is a `LayerWiseDummyOptimizer`
        if self.optimizer is not None:
            param_wise_lrs = [
                [group["lr"] for group in optim.param_groups] for optim in self.optimizer.optimizer_dict.values()
            ]
            lrs = list(chain(*param_wise_lrs))

        return lrs

    def _get_closed_form_lr(self):
        return self.base_lrs


def set_rng_state_for_device(device_name, device_module, checkpoint_rng_state, is_distributed):
    """Helper to set RNG state for a specific device type (CUDA, NPU, MLU, MUSA)"""
    device_state_key = device_name.lower()
    err_template = (
        "Didn't manage to set back the RNG states of the {backend} because of the following error:\n {exception}\n"
        "This won't yield the same results as if the training had not been interrupted."
    )
    try:
        if is_distributed:
            device_module.random.set_rng_state_all(checkpoint_rng_state[device_state_key])
        else:
            device_module.random.set_rng_state(checkpoint_rng_state[device_state_key])
    except Exception as e:
        logger.error(err_template.format(backend=device_name, exception=e))