import copy
import inspect
import os
import warnings
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from packaging import version
from torch import nn
from torch.nn import functional as F

from ..cache_utils import (
    Cache,
    DynamicCache,
    EncoderDecoderCache,
    HybridChunkedCache,
    OffloadedCache,
    QuantizedCacheConfig,
    StaticCache,
)
from ..configuration_utils import PretrainedConfig
from ..integrations.deepspeed import is_deepspeed_zero3_enabled
from ..integrations.fsdp import is_fsdp_managed_module
from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
from ..pytorch_utils import isin_mps_friendly
from ..tokenization_utils import ExtensionsTrie
from ..utils import (
    ModelOutput,
    is_accelerate_available,
    is_hqq_available,
    is_optimum_quanto_available,
    is_torchdynamo_exporting,
    logging,
)
from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint
from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
from .candidate_generator import (
    AssistantVocabTranslatorCache,
    AssistedCandidateGenerator,
    AssistedCandidateGeneratorDifferentTokenizers,
    CandidateGenerator,
    EarlyExitCandidateGenerator,
    PromptLookupCandidateGenerator,
    UniversalSpeculativeDecodingGenerator,
    _crop_past_key_values,
    _prepare_attention_mask,
    _prepare_token_type_ids,
)
from .configuration_utils import (
    NEED_SETUP_CACHE_CLASSES_MAPPING,
    QUANT_BACKEND_CLASSES_MAPPING,
    GenerationConfig,
    GenerationMode,
)
from .logits_process import (
    EncoderNoRepeatNGramLogitsProcessor,
    EncoderRepetitionPenaltyLogitsProcessor,
    EpsilonLogitsWarper,
    EtaLogitsWarper,
    ExponentialDecayLengthPenalty,
    ForcedBOSTokenLogitsProcessor,
    ForcedEOSTokenLogitsProcessor,
    HammingDiversityLogitsProcessor,
    InfNanRemoveLogitsProcessor,
    LogitNormalization,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    MinNewTokensLengthLogitsProcessor,
    MinPLogitsWarper,
    NoBadWordsLogitsProcessor,
    NoRepeatNGramLogitsProcessor,
    PrefixConstrainedLogitsProcessor,
    RepetitionPenaltyLogitsProcessor,
    SequenceBiasLogitsProcessor,
    SuppressTokensAtBeginLogitsProcessor,
    SuppressTokensLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    TypicalLogitsWarper,
    UnbatchedClassifierFreeGuidanceLogitsProcessor,
)
from .stopping_criteria import (
    ConfidenceCriteria,
    EosTokenCriteria,
    MaxLengthCriteria,
    MaxTimeCriteria,
    StoppingCriteria,
    StoppingCriteriaList,
    StopStringCriteria,
)

if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel
    from ..tokenization_utils_base import PreTrainedTokenizerBase
    from .streamers import BaseStreamer

logger = logging.get_logger(__name__)

if is_accelerate_available():
    from accelerate.hooks import AlignDevicesHook, add_hook_to_module

ALL_CACHE_NAMES = [
    "past_key_values",  # default
    "cache_params",  # mamba-based models
    "state",  # rwkv
    "mems",  # xlnet
    "past_buckets_states",  # reformer
]
  ed< dZee	ej
  ed< dZee	e	ej
   ed< dZee	e	ej
   ed< dZee	e	e	ej
    ed< dS )	GenerateDecoderOnlyOutputaw  
    Outputs of decoder-only generation models, when using non-beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

    sequences: torch.LongTensor = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None

"rb   c                   @   s   e Zd ZU dZejed< dZee	ej
  ed< dZee	ej
  ed< dZee	ej
  ed< dZee	ej
  ed< dZee	e	ej
   ed< dZee	e	ej
   ed	< dZee	e	ej
   ed
< dZee	e	e	ej
    ed< dS )GenerateEncoderDecoderOutputaI  
    Outputs of encoder-decoder generation models, when using non-beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

    sequences: torch.LongTensor = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None

#"rr   c                   @   s   e Zd ZU dZejed< dZeej	 ed< dZ
eeej	  ed< dZeeej	  ed< dZeej ed< dZeeeej	   ed< dZeeeej	   ed	< dZeeeeej	    ed
< dS )GenerateBeamDecoderOnlyOutputa
  
    Outputs of decoder-only generation models, when using beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

    sequences: torch.LongTensor = None
    sequences_scores: Optional[torch.FloatTensor] = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    beam_indices: Optional[torch.LongTensor] = None
    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None

 "rx   c                   @   s  e Zd ZU dZejed< dZeej	 ed< dZ
eeej	  ed< dZeeej	  ed< dZeej ed< dZeeej	  ed< dZeeej	  ed	< dZeeeej	   ed
< dZeeeej	   ed< dZeeeej	   ed< dZeeeeej	    ed< dS ) GenerateBeamEncoderDecoderOutputa  
    Outputs of encoder-decoder generation models, when using beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
            sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
        past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

    sequences: torch.LongTensor = None
    sequences_scores: Optional[torch.FloatTensor] = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    beam_indices: Optional[torch.LongTensor] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None

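# Equivalence classes kept for backward compatibility: the per-strategy output types below (greedy
# search, sampling, contrastive search, beam search) all point to the four unified `Generate*Output`
# classes defined above.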
*"r{   c                "   @   s  e Zd ZdZdejdeej deej deejejf fddZ	dejdeej deej deejejf fdd	Z
	
	
	
	
ddejdee deej deej deej f
ddZ	
	
	
ddeej deej deeeejf  deejee eeejf f fddZ	
	
	
ddeej deej deeeejf  dejfddZdejdedeeef dejfddZdejdee dedeeef fddZ	
ddededeeejf dejdeej deejeeejf f fd d!Ze	"	#	
dd$ed%edeej deejeeef f fd&d'Z	#	"dd(edeeef d%ed)edeeef f
d*d+Zd,d- Zdedejdejd.d/d0ed1d2d3d2dede fd4d5Z!	
	
	
	
dded6ed7ejd8e"eejge#e f d0ee dee deeeef  d9eej d:eej defd;d<Z$	
dded=ee% d>ed2 de%fd?d@Z&dAe'ee%f dBe'ee%f de'ee%f fdCdDZ(	
	#ddEejdFeej dGeej dHedejf
dIdJZ)dKdL Z*dMdN Z+deeef fdOdPZ,dQdR Z-dSdT Z.	
ddee dUee dVedeeef fdWdXZ/dYdZ Z0deeee'eef f  fd[d\Z1d]eded^edejdef
d_d`Z2defdadbZ3deded.d/dedcedejdefdddeZ4defdfdgZ5	
	
ddedhee dee'ejef  fdidjZ6e7 	
	
	
	
	
	
	
	
	
	
	
ddeej dee d0ee d=ee% d8ee"eejge#e f  dkee d.ed/ dledm d9eej d:eej dUee de'e8ejf fdndoZ9dpedkedejdefdqdrZ:	
ddejd>ed2 dejfdsdtZ;dejdue'ee#e f d0ed=e%dedkedldmde'e<ejf fdvdwZ=e7 dejd0ed=e%dedkedledm de'e<ejf fdxdyZ>dejd0ed=e%dedkedledm de'e<ejf fdzd{Z?d|d} Z@ed~ejdejfddZAed~ejdededejfddZBed~ejdGejdejfddZCedejdejdejdejdededede'eef deDfddZEdejdejdejdedededededededeejejejf fddZFdejdejdejdejdedeejejejf fddZGdEejdejdejdejdGejdejdejdejdejdedededeDde'eef deejejejejf fddZHdejd0ed=e%dedkede'eIejf fddZJdejdeKd0ed=e%dedkefddZLdejdeMd0ed=e%dedkede'eIejf fddZNdejde d0ed=e%dedkedledm de'e<ejf fddZOdejdefddZPd
S )GenerationMixina	  
    A class containing all functions for auto-regressive text generation, to be used as a mixin in model classes.
    Inheriting from this class causes the model to have special generation-related behavior, such as loading a
    `GenerationConfig` at initialization time or ensuring `generate`-related tests are run in `transformers` CI.

    A model class should inherit from `GenerationMixin` to enable calling methods like `generate`, or when it
    has defined a custom `generate` method that relies on `GenerationMixin`, directly or indirectly, which
    approximately shares the same interface to public methods like `generate`. Three examples:
        - `LlamaForCausalLM` should inherit from `GenerationMixin` to enable calling `generate` and other public
            methods in the mixin;
        - `BlipForQuestionAnswering` has a custom `generate` method that approximately shares the same interface as
           `GenerationMixin.generate` (it has a few extra arguments, and the same output). That function also calls
           `GenerationMixin.generate` indirectly, through an inner model. As such, `BlipForQuestionAnswering` should
           inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
        - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
            However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
            `BarkModel` shoud NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.

    The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
        - *greedy decoding* if `num_beams=1` and `do_sample=False`
        - *contrastive search* if `penalty_alpha>0` and `top_k>1`
        - *multinomial sampling* if `num_beams=1` and `do_sample=True`
        - *beam-search decoding* if `num_beams>1` and `do_sample=False`
        - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
        - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1`
        - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None`
        - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`

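    Example (illustrative only -- the checkpoint name and generation flags below are placeholders, not part
    of this module):

    ```python
    >>> from transformers import AutoModelForCausalLM, AutoTokenizer

    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    >>> inputs = tokenizer(["Today is"], return_tensors="pt")

    >>> # `num_beams=1` and `do_sample=False` (the defaults) select greedy decoding
    >>> outputs = model.generate(**inputs, max_new_tokens=5)
    >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
    ```
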
    To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
    """

    def _cache_dependant_input_preparation(
        self,
        input_ids: torch.LongTensor,
        inputs_embeds: Optional[torch.FloatTensor],
        cache_position: torch.LongTensor,
    ) -> Tuple[torch.LongTensor, torch.FloatTensor]:
        """
        Generic cache-dependent input preparation
        The code is put in a separate function to allow granular unit testing
        as it needs a different implementation to be exportable.

        If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        - Exception 1: when passing input_embeds, input_ids may be missing entries
        - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
        - Exception 4: if `inputs_embeds` are passed, slice them through `cache_position` to keep only the
          unprocessed tokens and generate the first token for each sequence. The generated input ids are then
          used for continuation.

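        Example of the default case (illustrative shapes only): with `input_ids` of shape `(1, 10)` and a
        `cache_position` of `torch.arange(8, 10)`, only the two unprocessed tokens `input_ids[:, 8:10]` are
        kept for the next forward pass.
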
        The current implementation does not rely on ``self`` and could be
        a class method. It is left as a standard method to be easily rewritten.
        """
        if is_torchdynamo_exporting():
            return self._cache_dependant_input_preparation_exporting(input_ids, inputs_embeds, cache_position)
        if inputs_embeds is not None and input_ids.shape[1] == 0:  # Exception 4
            inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :]
        elif inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:  # Exceptions 1 and 3
            input_ids = input_ids[:, -cache_position.shape[0] :]
        elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no-op, is Exception 2)
            input_ids = input_ids[:, cache_position]
        return input_ids, inputs_embeds

    def _cache_dependant_input_preparation_exporting(
        self,
        input_ids: torch.LongTensor,
        inputs_embeds: Optional[torch.FloatTensor],
        cache_position: torch.LongTensor,
    ) -> Tuple[torch.LongTensor, torch.FloatTensor]:
        """
        This method implements method ``_cache_dependant_input_preparation``
        with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
        The code is put in a separate function to allow granular unit testing.
        """
        if inputs_embeds is None:
            input_ids = input_ids[:, cache_position]
        else:
            # Reproduce the if/elif cascade of the eager implementation with `torch.cond`, so that every
            # branch stays traceable by `torch.export.export`.
            def branch_1(inputs_embeds, cache_position):
                return inputs_embeds[:, -cache_position.shape[0] :]

            def branch_2(input_ids, cache_position):
                return input_ids[:, -cache_position.shape[0] :]

            def branch_3(input_ids, cache_position):
                return input_ids[:, cache_position]

            inputs_embeds, input_ids = torch.cond(
                input_ids.shape[1] == 0,
                (
                    lambda input_ids, inputs_embeds, cache_position: (
                        branch_1(inputs_embeds, cache_position),
                        input_ids,
                    )
                ),
                (
                    lambda input_ids, inputs_embeds, cache_position: (
                        inputs_embeds,
                        torch.cond(
                            cache_position[-1] >= input_ids.shape[1],
                            branch_2,
                            lambda input_ids, cache_position: (
                                torch.cond(
                                    input_ids.shape[1] != cache_position.shape[0],
                                    branch_3,
                                    (lambda input_ids, cache_position: input_ids),
                                    [input_ids, cache_position],
                                )
                            ),
                            [input_ids, cache_position],
                        ),
                    )
                ),
                [input_ids, inputs_embeds, cache_position],
            )
        return input_ids, inputs_embeds

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        """
        Prepare the model inputs for generation. It includes operations like computing the 4D attention mask or
        slicing inputs given the existing cache.
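        For a decoder-only model mid-generation, the returned dict typically holds `input_ids` (sliced down to
        the unprocessed tokens), `attention_mask`, `position_ids`, `past_key_values` and `cache_position`; the
        exact keys depend on the model's `forward` signature.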

        See the forward pass in the model documentation for expected arguments (different models might have different
        requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
        """
        # 1. Handle BC:
        model_inputs = {}
        # - some models don't have `Cache` support (which implies they don't expect `cache_position` in `forward`)
        if self._supports_cache_class:
            model_inputs["cache_position"] = cache_position
        # - `cache_position` was not a mandatory input for those models, and this function may be called outside
        #   of `generate`. Handle most use cases by creating `cache_position` on the fly.
        elif cache_position is None:
            past_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
            cache_position = torch.arange(past_length, input_ids.shape[1], dtype=torch.long, device=input_ids.device)

        # 2. Generic cache-dependent input preparation
        if past_key_values is not None:
            model_inputs["past_key_values"] = past_key_values
            input_ids, inputs_embeds = self._cache_dependant_input_preparation(
                input_ids, inputs_embeds, cache_position
            )

        # 3. Prepare base model inputs
        input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step for every prompt
        if not self.config.is_encoder_decoder:
            if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]:
                model_inputs[input_ids_key] = None
                model_inputs["inputs_embeds"] = inputs_embeds
            else:
                # `clone` calls in this function ensure a consistent stride
                model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)
                model_inputs["inputs_embeds"] = None
        else:
            model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)

        # 4. Create missing `position_ids` on the fly
        encoder_attention_mask = attention_mask if self.config.is_encoder_decoder else None
        attention_mask = (
            kwargs.pop("decoder_attention_mask", None) if self.config.is_encoder_decoder else attention_mask
        )
        attention_mask_key = "decoder_attention_mask" if self.config.is_encoder_decoder else "attention_mask"
        position_ids_key = "decoder_position_ids" if self.config.is_encoder_decoder else "position_ids"
        if (
            attention_mask is not None
            and kwargs.get(position_ids_key) is None
            and position_ids_key in set(inspect.signature(self.forward).parameters.keys())
        ):
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            kwargs[position_ids_key] = position_ids  # placed in kwargs for further processing (see below)

        # 5. Slice model inputs that should have the same length as `input_ids`
        for model_input_name in ["position_ids", "token_type_ids", "decoder_position_ids"]:
            model_input = kwargs.get(model_input_name)
            if model_input is not None:
                if past_key_values is not None:
                    current_input_length = (
                        model_inputs["inputs_embeds"].shape[1]
                        if model_inputs.get("inputs_embeds") is not None
                        else model_inputs[input_ids_key].shape[1]
                    )
                    model_input = model_input[:, -current_input_length:]
                    model_input = model_input.clone(memory_format=torch.contiguous_format)
                model_inputs[model_input_name] = model_input

        # 6. Create 4D attention mask if we are using a `StaticCache` (important for performant compiled forward)
        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
            if model_inputs["inputs_embeds"] is not None:
                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
                device = model_inputs["inputs_embeds"].device
            else:
                batch_size, sequence_length = model_inputs[input_ids_key].shape
                device = model_inputs[input_ids_key].device

            # If the function to create the 4D causal mask exists, it should be present in the base model
            # (XXXModel class).
            base_model = getattr(self, self.base_model_prefix, None)
            if base_model is None:
                causal_mask_creation_function = getattr(
                    self, "_prepare_4d_causal_attention_mask_with_cache_position", None
                )
            else:
                causal_mask_creation_function = getattr(
                    base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None
                )
            if causal_mask_creation_function is None:
                logger.warning_once(
                    f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` "
                    "method defined in its base modeling class. Compiled forward passes will be sub-optimal. If "
                    "you're writing code, see Llama for an example implementation. If you're a user, please "
                    "report this issue on GitHub."
                )
            else:
                attention_mask = causal_mask_creation_function(
                    attention_mask,
                    sequence_length=sequence_length,
                    target_length=past_key_values.get_max_cache_shape(),
                    dtype=self.dtype,
                    device=device,
                    cache_position=cache_position,
                    batch_size=batch_size,
                    config=self.config,
                    past_key_values=past_key_values,
                )
        if attention_mask is not None:
            model_inputs[attention_mask_key] = attention_mask

        if encoder_attention_mask is not None:
            model_inputs["attention_mask"] = encoder_attention_mask

        # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
        for key, value in kwargs.items():
            if key not in model_inputs:
                model_inputs[key] = value

        # 8. Remove unexpected `generate` inputs
        model_inputs.pop("labels", None)
        return model_inputs

    def _prepare_model_inputs(
        self,
        inputs: Optional[torch.Tensor] = None,
        bos_token_id: Optional[torch.Tensor] = None,
        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]:
        """
        This function extracts the model-specific `inputs` for generation.
        """
        # 1. retrieve all kwargs that are non-None or non-model input related.
        # some encoder-decoder models have different names for model and encoder
        if (
            self.config.is_encoder_decoder
            and hasattr(self, "encoder")
            and self.encoder.main_input_name != self.main_input_name
        ):
            input_name = self.encoder.main_input_name
        else:
            input_name = self.main_input_name

        model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None or k != input_name}

        # 2. check whether model_input_name is passed as kwarg; if yes and `inputs` is None, use the kwarg
        inputs_kwarg = model_kwargs.pop(input_name, None)
        if inputs_kwarg is not None and inputs is not None:
            raise ValueError(
                f"`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. "
                f"Make sure to either pass {inputs} or {input_name}=..."
            )
        elif inputs_kwarg is not None:
            inputs = inputs_kwarg

        # 3. In the presence of `inputs_embeds` for text models:
        # - decoder-only models should complain if `inputs_embeds` is passed but the model doesn't have its
        #   forwarding implemented; otherwise `inputs_embeds` is kept and can coexist with `input_ids`
        #   (`inputs_embeds` will be used in the 1st generation step, as opposed to `input_ids`)
        # - encoder-decoder models should complain if both `inputs_embeds` and `input_ids` are passed, and pull
        #   the former to inputs (used in place of `input_ids` to get the encoder hidden states)
        if input_name == "input_ids" and "inputs_embeds" in model_kwargs:
            if not self.config.is_encoder_decoder:
                has_inputs_embeds_forwarding = "inputs_embeds" in set(
                    inspect.signature(self.prepare_inputs_for_generation).parameters.keys()
                )
                if not has_inputs_embeds_forwarding:
                    raise ValueError(
                        f"You passed `inputs_embeds` to `.generate()`, but the model class "
                        f"{self.__class__.__name__} doesn't have its forwarding implemented. See the GPT2 "
                        "implementation for an example (https://github.com/huggingface/transformers/pull/21405), "
                        "and feel free to open a PR with it!"
                    )
                # `input_ids` is moved to `model_kwargs`, so a few automations (like the creation of the
                # attention mask) can rely on the actual model input
                model_kwargs["input_ids"] = self._maybe_initialize_input_ids_for_generation(
                    inputs, bos_token_id, model_kwargs=model_kwargs
                )
            else:
                if inputs is not None:
                    raise ValueError("You passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.")
            inputs, input_name = model_kwargs.pop("inputs_embeds"), "inputs_embeds"

        # 4. if `inputs` is still None, try to create `input_ids` from BOS token
        inputs = self._maybe_initialize_input_ids_for_generation(inputs, bos_token_id, model_kwargs)
        return inputs, input_name, model_kwargs

    def _maybe_initialize_input_ids_for_generation(
        self,
        inputs: Optional[torch.Tensor] = None,
        bos_token_id: Optional[torch.Tensor] = None,
        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ) -> torch.LongTensor:
        """Initializes input ids for generation, if necessary."""
        if inputs is not None:
            return inputs

        encoder_outputs = model_kwargs.get("encoder_outputs")
        if self.config.is_encoder_decoder and encoder_outputs is not None:
            # make dummy input_ids with value -100, as a sanity check ensuring they won't be used for encoding
            shape = encoder_outputs.last_hidden_state.size()[:-1]
            return torch.ones(shape, dtype=torch.long, device=self.device) * -100

        # If there is some tensor in `model_kwargs`, we can infer the batch size from it. This is helpful with
        # soft-prompting or in multimodal implementations built on top of decoder-only language models.
        batch_size = 1
        for value in model_kwargs.values():
            if isinstance(value, torch.Tensor):
                batch_size = value.shape[0]
                break

        if "inputs_embeds" in model_kwargs:
            return torch.ones((batch_size, 0), dtype=torch.long, device=self.device)

        if bos_token_id is None:
            raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")

        return torch.ones((batch_size, 1), dtype=torch.long, device=self.device) * bos_token_id

    def _prepare_attention_mask_for_generation(
        self,
        inputs_tensor: torch.Tensor,
        generation_config: GenerationConfig,
        model_kwargs: Dict[str, Any],
    ) -> torch.LongTensor:
        pad_token_id = generation_config._pad_token_tensor
        eos_token_id = generation_config._eos_token_tensor

        # `input_ids` may be present in the model kwargs, instead of being the main input (e.g. multimodal model)
        if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0:
            inputs_tensor = model_kwargs["input_ids"]

        # No information for attention mask inference -> return default attention mask
        default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device)
        if pad_token_id is None:
            return default_attention_mask

        is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long]
        if not is_input_ids:
            return default_attention_mask

        is_pad_token_in_inputs = (pad_token_id is not None) and (
            isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any()
        )
        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~(
            isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any()
        )
        can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id
        attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long()

        attention_mask = (
            attention_mask_from_padding * can_infer_attention_mask
            + default_attention_mask * ~can_infer_attention_mask
        )
        return attention_mask

    def _prepare_encoder_decoder_kwargs_for_generation(
        self,
        inputs_tensor: torch.Tensor,
        model_kwargs: Dict[str, Any],
        model_input_name: Optional[str],
        generation_config: GenerationConfig,
    ) -> Dict[str, Any]:
        # 1. get encoder
        encoder = self.get_encoder()
        # Compatibility with Accelerate big model inference: we need the encoder to output stuff on the same
        # device as the inputs.
        if hasattr(self, "hf_device_map"):
            if hasattr(encoder, "_hf_hook"):
                encoder._hf_hook.io_same_device = True
            else:
                add_hook_to_module(encoder, AlignDevicesHook(io_same_device=True))

        # 2. Prepare encoder args and encoder kwargs from model kwargs and generation config
        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
        encoder_kwargs = {
            argument: value
            for argument, value in model_kwargs.items()
            if not any(argument.startswith(p) for p in irrelevant_prefix)
        }
        encoder_signature = set(inspect.signature(encoder.forward).parameters)
        encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
        if not encoder_accepts_wildcard:
            encoder_kwargs = {
                argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
            }
        encoder_kwargs["output_attentions"] = generation_config.output_attentions
        encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states

        # 3. make sure that encoder returns `ModelOutput`
        model_input_name = model_input_name if model_input_name is not None else self.main_input_name
        encoder_kwargs["return_dict"] = True
        encoder_kwargs[model_input_name] = inputs_tensor
        model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs)
        return model_kwargs

    def _prepare_decoder_input_ids_for_generation(
        self,
        batch_size: int,
        model_input_name: str,
        model_kwargs: Dict[str, torch.Tensor],
        decoder_start_token_id: torch.Tensor,
        device: torch.device = None,
    ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
        # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input
        # naming, we also allow the user to pass it under `input_ids`, if the encoder does not use it as the
        # main input.
        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
            decoder_input_ids = model_kwargs.pop("decoder_input_ids")
        elif "input_ids" in model_kwargs and model_input_name != "input_ids":
            decoder_input_ids = model_kwargs.pop("input_ids")
        else:
            decoder_input_ids = None

        # 2. `decoder_start_token_id` must have shape (batch_size, 1)
        if device is None:
            device = self.device
        if decoder_start_token_id.ndim == 1:
            if decoder_start_token_id.shape[0] != batch_size:
                raise ValueError(
                    f"`decoder_start_token_id` expected to have length {batch_size} but got "
                    f"{decoder_start_token_id.shape[0]}"
                )
            decoder_start_token_id = decoder_start_token_id.view(-1, 1)
        else:
            decoder_start_token_id = (
                torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id
            )

        # 3. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Ensure that.
        if decoder_input_ids is None:
            # no user input -> use decoder_start_token_id as decoder_input_ids
            decoder_input_ids = decoder_start_token_id
        elif "donut" in self.__class__.__name__.lower() or (
            self.config.model_type == "vision-encoder-decoder" and "donut" in self.config.encoder.model_type.lower()
        ):
            # exception: Donut checkpoints have task-specific decoder starts and don't expect a BOS token
            pass
        elif self.config.model_type in ["whisper"]:
            pass
        elif (decoder_input_ids[:, 0] != decoder_start_token_id[:, 0]).all().item():
            # user input that doesn't start with decoder_start_token_id -> prepend it (and adjust
            # decoder_attention_mask if provided)
            decoder_input_ids = torch.cat([decoder_start_token_id, decoder_input_ids], dim=-1)
            if "decoder_attention_mask" in model_kwargs:
                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
                decoder_attention_mask = torch.cat(
                    (torch.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
                    dim=-1,
                )
                model_kwargs["decoder_attention_mask"] = decoder_attention_mask

        return decoder_input_ids, model_kwargs

    @staticmethod
    def _expand_inputs_for_generation(
        expand_size: int = 1,
        is_encoder_decoder: bool = False,
        input_ids: Optional[torch.LongTensor] = None,
        **model_kwargs,
    ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
        """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
        # Do not call torch.repeat_interleave if expand_size is 1 because it clones the input tensor and thus
        # requires more memory although no change is applied
        if expand_size == 1:
            return input_ids, model_kwargs

        def _expand_dict_for_generation(dict_to_expand):
            for key in dict_to_expand:
                if (
                    key != "cache_position"
                    and dict_to_expand[key] is not None
                    and isinstance(dict_to_expand[key], torch.Tensor)
                ):
                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
            return dict_to_expand

        if input_ids is not None:
            input_ids = input_ids.repeat_interleave(expand_size, dim=0)

        model_kwargs = _expand_dict_for_generation(model_kwargs)

        if is_encoder_decoder:
            if model_kwargs.get("encoder_outputs") is None:
                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])

        return input_ids, model_kwargs

    def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: Dict[str, Any],
        is_encoder_decoder: bool = False,
        num_new_tokens: int = 1,
    ) -> Dict[str, Any]:
        # update past_key_values, keeping its naming used in model code
        for possible_cache_name in ALL_CACHE_NAMES:
            if possible_cache_name in outputs:
                if possible_cache_name in ("past_buckets_states", "mems"):
                    cache_name = "past_key_values"
                else:
                    cache_name = possible_cache_name
                model_kwargs[cache_name] = getattr(outputs, possible_cache_name)
                break

        # update token_type_ids with last value
        if "token_type_ids" in model_kwargs:
            token_type_ids = model_kwargs["token_type_ids"]
            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)

        if not is_encoder_decoder:
            # update attention mask
            if "attention_mask" in model_kwargs:
                attention_mask = model_kwargs["attention_mask"]
                model_kwargs["attention_mask"] = torch.cat(
                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
                )
        else:
            # update decoder attention mask
            if "decoder_attention_mask" in model_kwargs:
                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
                model_kwargs["decoder_attention_mask"] = torch.cat(
                    [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
                    dim=-1,
                )

        if model_kwargs.get("use_cache", True):
            model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
        else:
            past_positions = model_kwargs.pop("cache_position")
            new_positions = torch.arange(
                past_positions[-1] + 1, past_positions[-1] + num_new_tokens + 1, dtype=past_positions.dtype
            ).to(past_positions.device)
            model_kwargs["cache_position"] = torch.cat((past_positions, new_positions))
        return model_kwargs

    def _reorder_cache(self, past_key_values, beam_idx):
        raise NotImplementedError(
            f"Make sure that a `_reorder_cache` function is correctly implemented in {self.__class__.__module__} to"
            f" enable beam search for {self.__class__}"
        )

    def _get_candidate_generator(
        self,
        generation_config: GenerationConfig,
        input_ids: torch.LongTensor,
        inputs_tensor: torch.Tensor,
        assistant_model: "PreTrainedModel",
        logits_processor: LogitsProcessorList,
        target_tokenizer: "PreTrainedTokenizerBase",
        assistant_tokenizer: "PreTrainedTokenizerBase",
        model_kwargs: Dict,
    ) -> CandidateGenerator:
        """
        Returns the candidate generator to be used in `assisted_generation`
        """
        different_tokenizers = all(v is not None for v in (assistant_model, target_tokenizer, assistant_tokenizer))

        if generation_config.assistant_early_exit is not None:
            candidate_generator = EarlyExitCandidateGenerator(
                input_ids=input_ids,
                assistant_model=self,
                generation_config=generation_config,
                model_kwargs=model_kwargs,
                inputs_tensor=inputs_tensor,
                logits_processor=logits_processor,
            )
        elif generation_config.prompt_lookup_num_tokens is not None:
            candidate_generator = PromptLookupCandidateGenerator(
                eos_token_id=generation_config._eos_token_tensor,
                num_output_tokens=generation_config.prompt_lookup_num_tokens,
                max_matching_ngram_size=generation_config.max_matching_ngram_size,
                max_length=generation_config.max_length,
            )
        elif different_tokenizers:
            if generation_config.do_sample is True:
                atm_translator = AssistantVocabTranslatorCache.get_translator(
                    target_tokenizer, assistant_tokenizer, self.config.vocab_size, assistant_model.device
                )
                candidate_generator = UniversalSpeculativeDecodingGenerator(
                    input_ids=input_ids,
                    assistant_model=assistant_model,
                    generation_config=generation_config,
                    model_kwargs=model_kwargs,
                    inputs_tensor=inputs_tensor,
                    logits_processor=logits_processor,
                    target_tokenizer=target_tokenizer,
                    assistant_tokenizer=assistant_tokenizer,
                    atm_translator=atm_translator,
                )
            elif generation_config.do_sample is False:
                candidate_generator = AssistedCandidateGeneratorDifferentTokenizers(
                    input_ids=input_ids,
                    assistant_model=assistant_model,
                    generation_config=generation_config,
                    model_kwargs=model_kwargs,
                    inputs_tensor=inputs_tensor,
                    logits_processor=logits_processor,
                    target_tokenizer=target_tokenizer,
                    assistant_tokenizer=assistant_tokenizer,
                )
            else:
                raise ValueError(
                    f"Invalid value for `do_sample`: expected a boolean, got {type(generation_config.do_sample).__name__}"
                )
        else:
            candidate_generator = AssistedCandidateGenerator(
                input_ids=input_ids,
                assistant_model=assistant_model,
                generation_config=generation_config,
                model_kwargs=model_kwargs,
                inputs_tensor=inputs_tensor,
                logits_processor=logits_processor,
            )
        return candidate_generator

    def _get_logits_processor(
        self,
        generation_config: GenerationConfig,
        input_ids_seq_length: int,
        encoder_input_ids: torch.LongTensor,
        prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]],
        logits_processor: Optional[LogitsProcessorList],
        device: Optional[str] = None,
        model_kwargs: Optional[Dict[str, Any]] = None,
        negative_prompt_ids: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
    ) -> LogitsProcessorList:
        """
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
        instances used to modify the scores of the language model head.
        """
        # instantiate processors list
        processors = LogitsProcessorList()

        if generation_config.guidance_scale is not None and generation_config.guidance_scale != 1:
            processors.append(
                UnbatchedClassifierFreeGuidanceLogitsProcessor(
                    generation_config.guidance_scale,
                    self,
                    unconditional_ids=negative_prompt_ids,
                    unconditional_attention_mask=negative_prompt_attention_mask,
                    use_cache=generation_config.use_cache,
                )
            )
        if generation_config.sequence_bias is not None:
            processors.append(SequenceBiasLogitsProcessor(sequence_bias=generation_config.sequence_bias))

        if generation_config.diversity_penalty is not None and generation_config.diversity_penalty > 0.0:
            processors.append(
                HammingDiversityLogitsProcessor(
                    diversity_penalty=generation_config.diversity_penalty,
                    num_beams=generation_config.num_beams,
                    num_beam_groups=generation_config.num_beam_groups,
                )
            )
        if (
            generation_config.encoder_repetition_penalty is not None
            and generation_config.encoder_repetition_penalty != 1.0
        ):
            if len(encoder_input_ids.shape) == 2:
                processors.append(
                    EncoderRepetitionPenaltyLogitsProcessor(
                        penalty=generation_config.encoder_repetition_penalty,
                        encoder_input_ids=encoder_input_ids,
                    )
                )
            else:
                warnings.warn(
                    "Passing `encoder_repetition_penalty` requires some form of `input_ids` to be passed to "
                    "`generate`, ignoring the argument.",
                    UserWarning,
                )
        if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0:
            processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty))
        if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0:
            processors.append(NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size))
        if (
            generation_config.encoder_no_repeat_ngram_size is not None
            and generation_config.encoder_no_repeat_ngram_size > 0
        ):
            if len(encoder_input_ids.shape) == 2:
                processors.append(
                    EncoderNoRepeatNGramLogitsProcessor(
                        generation_config.encoder_no_repeat_ngram_size, encoder_input_ids
                    )
                )
            else:
                warnings.warn(
                    "Passing `encoder_no_repeat_ngram_size` requires some form of `input_ids` to be passed to "
                    "`generate`, ignoring the argument.",
                    UserWarning,
                )
        if generation_config.bad_words_ids is not None:
            processors.append(
                NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config._eos_token_tensor)
            )
        if (
            generation_config.min_length is not None
            and generation_config._eos_token_tensor is not None
            and generation_config.min_length > 0
        ):
            processors.append(
                MinLengthLogitsProcessor(
                    generation_config.min_length, generation_config._eos_token_tensor, device=device
                )
            )
        if (
            generation_config.min_new_tokens is not None
            and generation_config._eos_token_tensor is not None
            and generation_config.min_new_tokens > 0
        ):
            processors.append(
                MinNewTokensLengthLogitsProcessor(
                    input_ids_seq_length,
                    generation_config.min_new_tokens,
                    generation_config._eos_token_tensor,
                    device=device,
                )
            )
        if prefix_allowed_tokens_fn is not None:
            processors.append(
                PrefixConstrainedLogitsProcessor(
                    prefix_allowed_tokens_fn, generation_config.num_beams // generation_config.num_beam_groups
                )
            )
        if generation_config.forced_bos_token_id is not None:
            processors.append(ForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id))
        if generation_config.forced_eos_token_id is not None:
            processors.append(
                ForcedEOSTokenLogitsProcessor(
                    generation_config.max_length, generation_config.forced_eos_token_id, device=device
                )
            )
        if generation_config.remove_invalid_values is True:
            processors.append(InfNanRemoveLogitsProcessor())
        if generation_config.exponential_decay_length_penalty is not None:
            processors.append(
                ExponentialDecayLengthPenalty(
                    generation_config.exponential_decay_length_penalty,
                    generation_config._eos_token_tensor,
                    input_ids_seq_length,
                )
            )
        if generation_config.suppress_tokens is not None:
            processors.append(SuppressTokensLogitsProcessor(generation_config.suppress_tokens, device=device))
        if generation_config.begin_suppress_tokens is not None:
            begin_index = input_ids_seq_length
            begin_index = (
                begin_index
                if (input_ids_seq_length > 1 or generation_config.forced_bos_token_id is None)
                else begin_index + 1
            )
            processors.append(
                SuppressTokensAtBeginLogitsProcessor(
                    generation_config.begin_suppress_tokens, begin_index, device=device
                )
            )
        if generation_config.forced_decoder_ids is not None:
            raise ValueError(
                "You have explicitly specified `forced_decoder_ids`. Please remove the `forced_decoder_ids` "
                "argument in favour of `input_ids` or `decoder_input_ids` respectively."
            )

        processors = self._merge_criteria_processor_list(processors, logits_processor)

        # Processors previously known as `LogitsWarpers`, only applied with sampling strategies
        if generation_config.do_sample:
            # In beam methods, we need to keep at least one non-eos token to explore continuations that might
            # have a better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
            if generation_config.num_beams > 1:
                if isinstance(generation_config._eos_token_tensor, list):
                    min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
                elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
                    min_tokens_to_keep = generation_config._eos_token_tensor.shape[0] + 1
                else:
                    min_tokens_to_keep = 2
            else:
                min_tokens_to_keep = 1

            if generation_config.temperature is not None and generation_config.temperature != 1.0:
                processors.append(TemperatureLogitsWarper(generation_config.temperature))
            if generation_config.top_k is not None and generation_config.top_k != 0:
                processors.append(
                    TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep)
                )
            if generation_config.top_p is not None and generation_config.top_p < 1.0:
                processors.append(
                    TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep)
                )
            if generation_config.min_p is not None:
                # Applied after temperature scaling
                processors.append(
                    MinPLogitsWarper(min_p=generation_config.min_p, min_tokens_to_keep=min_tokens_to_keep)
                )
            if generation_config.typical_p is not None and generation_config.typical_p < 1.0:
                processors.append(
                    TypicalLogitsWarper(mass=generation_config.typical_p, min_tokens_to_keep=min_tokens_to_keep)
                )
            if generation_config.epsilon_cutoff is not None and 0.0 < generation_config.epsilon_cutoff < 1.0:
                processors.append(
                    EpsilonLogitsWarper(
                        epsilon=generation_config.epsilon_cutoff, min_tokens_to_keep=min_tokens_to_keep
                    )
                )
            if generation_config.eta_cutoff is not None and 0.0 < generation_config.eta_cutoff < 1.0:
                processors.append(
                    EtaLogitsWarper(
                        epsilon=generation_config.eta_cutoff, min_tokens_to_keep=min_tokens_to_keep, device=device
                    )
                )

        # Watermarking should be after all logits processing is finished
        if generation_config.watermarking_config is not None:
            processors.append(
                generation_config.watermarking_config.construct_processor(self.config.vocab_size, device)
            )

        # `LogitNormalization` should always be the last logit processor, when present
        if generation_config.renormalize_logits is True:
            processors.append(LogitNormalization())
        return processors

    def _get_stopping_criteria(
        self,
        generation_config: GenerationConfig,
        stopping_criteria: Optional[StoppingCriteriaList],
        tokenizer: Optional["PreTrainedTokenizerBase"] = None,
        **kwargs,
    ) -> StoppingCriteriaList:
        criteria = StoppingCriteriaList()
        if generation_config.max_length is not None:
            max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
            criteria.append(
                MaxLengthCriteria(
                    max_length=generation_config.max_length,
                    max_position_embeddings=max_position_embeddings,
                )
            )
        if generation_config.max_time is not None:
            criteria.append(MaxTimeCriteria(max_time=generation_config.max_time))
        if generation_config.stop_strings is not None:
            if tokenizer is None:
                raise ValueError(
                    "There are one or more stop strings, either in the arguments to `generate` or in the"
                    " model's generation config, but we could not locate a tokenizer. When generating with"
                    " stop strings, you must pass the model's tokenizer to the `tokenizer` argument of `generate`."
                )
            criteria.append(StopStringCriteria(stop_strings=generation_config.stop_strings, tokenizer=tokenizer))
        if generation_config._eos_token_tensor is not None:
            criteria.append(EosTokenCriteria(eos_token_id=generation_config._eos_token_tensor))
        if (
            generation_config.is_assistant
            and generation_config.assistant_confidence_threshold is not None
            and generation_config.assistant_confidence_threshold > 0
        ):
            criteria.append(
                ConfidenceCriteria(assistant_confidence_threshold=generation_config.assistant_confidence_threshold)
            )
        criteria = self._merge_criteria_processor_list(criteria, stopping_criteria)
        return criteria

    def _merge_criteria_processor_list(
        self,
        default_list: Union[LogitsProcessorList, StoppingCriteriaList],
        custom_list: Union[LogitsProcessorList, StoppingCriteriaList],
    ) -> Union[LogitsProcessorList, StoppingCriteriaList]:
        """
        Merge user-defined processors/criteria with the ones instantiated inside `generate`. In case the same
        processor/criteria is present on both lists, use the user-defined one.
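
        For example, if `generate` instantiates a `TemperatureLogitsWarper` internally (because `temperature`
        was set) and the user also passes a custom `TemperatureLogitsWarper` in `logits_processor`, the user's
        instance is the one kept in the merged list.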

        (Note: up to v4.49.0, this function threw an exception if the same logit processor was found twice.)
        """
        if len(custom_list) == 0:
            return default_list

        final_list = type(default_list)()
        for default in default_list:
            using_custom = False
            for custom in custom_list:
                if type(custom) is type(default):
                    object_type = "stopping criteria" if isinstance(custom, StoppingCriteria) else "logits processor"
                    logger.warning_once(
                        f"A custom {object_type} of type {type(custom)} has been passed to `.generate()`, but it "
                        f"was also created in `.generate()`, given its parameterization. The custom {type(custom)} "
                        f"will take precedence. Please check the docstring of {type(custom)} to see related "
                        "`.generate()` flags."
                    )
                    final_list.append(custom)
                    using_custom = True
                    break
            if not using_custom:
                final_list.append(default)

        for custom in custom_list:
            if custom not in final_list:
                final_list.append(custom)
        return final_list

    def compute_transition_scores(
        self,
        sequences: torch.Tensor,
        scores: Tuple[torch.Tensor],
        beam_indices: Optional[torch.Tensor] = None,
        normalize_logits: bool = False,
    ) -> torch.Tensor:
        """
        Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
        used). This is a convenient method to quickly obtain the scores of the selected tokens at generation time.

        Parameters:
            sequences (`torch.LongTensor`):
                The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
                shorter if all batches finished early due to the `eos_token_id`.
            scores (`tuple(torch.FloatTensor)`):
                Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
                of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
                Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
                with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
            beam_indices (`torch.LongTensor`, *optional*):
                Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
                generate-time.
            normalize_logits (`bool`, *optional*, defaults to `False`):
                Whether to normalize the logits (which, for legacy reasons, may be unnormalized).

        Return:
            `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
                the transition scores (logits)

        Examples:

        ```python
        >>> from transformers import GPT2Tokenizer, AutoModelForCausalLM
        >>> import numpy as np

        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> tokenizer.pad_token_id = tokenizer.eos_token_id
        >>> inputs = tokenizer(["Today is"], return_tensors="pt")

        >>> # Example 1: Print the scores for each token generated with Greedy Search
        >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, normalize_logits=True
        ... )
        >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
        >>> # encoder-decoder models, like BART or T5.
        >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
        >>> generated_tokens = outputs.sequences[:, input_length:]
        >>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
        ...     # | token | token string | log probability | probability
        ...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
        |   262 |  the     | -1.414 | 24.33%
        |  1110 |  day     | -2.609 | 7.36%
        |   618 |  when    | -2.010 | 13.40%
        |   356 |  we      | -1.859 | 15.58%
        |   460 |  can     | -2.508 | 8.14%

        >>> # Example 2: Reconstruct the sequence scores from Beam Search
        >>> outputs = model.generate(
        ...     **inputs,
        ...     max_new_tokens=5,
        ...     num_beams=4,
        ...     num_return_sequences=4,
        ...     return_dict_in_generate=True,
        ...     output_scores=True,
        ... )
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
        ... )
        >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
        >>> # Tip 1: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
        >>> # use case, you might want to recompute it with `normalize_logits=True`.
        >>> # Tip 2: the output length does NOT include the input length
        >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
        >>> length_penalty = model.generation_config.length_penalty
        >>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)
        >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
        True
        ```"""
        # 1. In absence of `beam_indices`, we can assume that we come from e.g. greedy search, which is
        # equivalent to a beam search approach where the first (and only) beam is always selected
        if beam_indices is None:
            beam_indices = torch.arange(scores[0].shape[0]).view(-1, 1).to(sequences.device)
            beam_indices = beam_indices.expand(-1, len(scores))

        # 2. reshape scores as [batch_size*vocab_size, # generation steps], with # generation steps being
        # seq_len - input_length
        scores = torch.stack(scores).reshape(len(scores), -1).transpose(0, 1)

        # 3. Optionally normalize the logits (across the vocab dimension)
        if normalize_logits:
            scores = scores.reshape(-1, self.config.vocab_size, scores.shape[-1])
            scores = torch.nn.functional.log_softmax(scores, dim=1)
            scores = scores.reshape(-1, scores.shape[-1])

        # 4. cut beam_indices to the longest beam length
        beam_indices_mask = beam_indices < 0
        max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max()
        beam_indices = beam_indices.clone()[:, :max_beam_length]
        beam_indices_mask = beam_indices_mask[:, :max_beam_length]

        # 5. Set indices of beams that finished early to 0; such indices will be masked correctly afterwards
        beam_indices[beam_indices_mask] = 0

        # 6. multiply beam_indices with vocab size to gather correctly from scores
        beam_sequence_indices = beam_indices * self.config.vocab_size

        # 7. Define which indices contributed to scores
        cut_idx = sequences.shape[-1] - max_beam_length
        indices = sequences[:, cut_idx:] + beam_sequence_indices

        # 8. Compute scores
        transition_scores = scores.gather(0, indices)

        # 9. Mask out transition_scores of beams that stopped early
        transition_scores[beam_indices_mask] = 0

        return transition_scores

    def _validate_model_class(self):
        """
        Confirms that the model class is compatible with generation. If not, raises an exception that points to the
        right class to use.
        """
        if not self.can_generate():
            terminations_with_generation_support = [
                "ForCausalLM",
                "ForConditionalGeneration",
                "ForSpeechSeq2Seq",
                "ForVision2Seq",
            ]
            raise TypeError(
                f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
                "it doesn't have a language model head. Classes that support generation often end in one of these "
                f"names: {terminations_with_generation_support}."
            )

    def _validate_assistant(self, assistant_model, tokenizer, assistant_tokenizer):
        if assistant_model is None:
            return

        if self.config.is_encoder_decoder and not assistant_model.config.is_encoder_decoder:
            attributes_to_check = ["encoder_attention_heads", "encoder_ffn_dim", "encoder_layers"]
            attributes_to_check = [attr for attr in dir(assistant_model.config) if attr in attributes_to_check]
            are_equal = all(
                getattr(self.config, attr) == getattr(assistant_model.config, attr) for attr in attributes_to_check
            )
            if not are_equal:
                raise ValueError(
                    "The main model and the assistant don't have compatible encoder-dependent input shapes. "
                    "Ensure you load the assistant with the correct encoder-decoder class, e.g. "
                    "`AutoModelForSpeechSeq2Seq` for Whisper."
                )

        doc_reference = (
            "(see https://huggingface.co/docs/transformers/en/generation_strategies#universal-assisted-decoding)"
        )
        if self.config.get_text_config().vocab_size == assistant_model.config.get_text_config().vocab_size:
            if assistant_tokenizer is not None:
                raise ValueError(
                    "`assistant_tokenizer` is not required when the main and assistant models use the same "
                    f"tokenizer. Please omit `assistant_tokenizer` from `generate()` {doc_reference}."
                )
        else:
            if tokenizer is None or assistant_tokenizer is None:
                raise ValueError(
                    "The main and assistant models have different tokenizers. Please provide `tokenizer` and "
                    f"`assistant_tokenizer` to `generate()` {doc_reference}."
                )

    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        """Validates model kwargs for generation. Generate argument typos will also be caught here."""
        # If a `Cache` instance is passed, check whether the model is compatible with it
        if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class:
            raise ValueError(
                f"{self.__class__.__name__} does not support an instance of `Cache` as `past_key_values`. Please "
                "check the model documentation for supported cache formats."
            )

        # Excludes arguments that are handled before calling any model function
        if self.config.is_encoder_decoder:
            for key in ["decoder_input_ids"]:
                model_kwargs.pop(key, None)

        unused_model_args = []
        model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
        # `kwargs`/`model_kwargs` is often used to handle optional forward pass inputs like `attention_mask`. If
        # `prepare_inputs_for_generation` doesn't accept them, then a stricter check can be made ;)
        if "kwargs" in model_args or "model_kwargs" in model_args:
            model_args |= set(inspect.signature(self.forward).parameters)

        # Encoder-Decoder models may also need Encoder arguments from `model_kwargs`
        if self.config.is_encoder_decoder:
            base_model = getattr(self, self.base_model_prefix, None)

            # allow encoder kwargs
            encoder = getattr(self, "encoder", None)
            if encoder is None and base_model is not None:
                encoder = getattr(base_model, "encoder", None)

            if encoder is not None:
                encoder_model_args = set(inspect.signature(encoder.forward).parameters)
                model_args |= encoder_model_args

            # allow decoder kwargs
            decoder = getattr(self, "decoder", None)
            if decoder is None and base_model is not None:
                decoder = getattr(base_model, "decoder", None)

            if decoder is not None:
                decoder_model_args = set(inspect.signature(decoder.forward).parameters)
                model_args |= {f"decoder_{x}" for x in decoder_model_args}

        for key, value in model_kwargs.items():
            if value is not None and key not in model_args:
                unused_model_args.append(key)

        if unused_model_args:
            raise ValueError(
                f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
                " generate arguments will also show up in this list)"
            )

    def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length):
        """Performs validation related to the resulting generated length"""

        # Max length warnings related to poor parameterization
        if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
            # 20 is the default max_length of the generation config
            warnings.warn(
                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) to control the "
                "generation length. We recommend setting `max_new_tokens` to control the maximum length of the "
                "generation.",
                UserWarning,
            )
        if input_ids_length >= generation_config.max_length:
            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
            raise ValueError(
                f"Input length of {input_ids_string} is {input_ids_length}, but `max_length` is set to"
                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
                " increasing `max_length` or, better yet, setting `max_new_tokens`."
            )

        # Min length warnings due to unfeasible parameter combinations
        min_length_error_suffix = (
            " Generation will stop at the defined maximum length. You should decrease the minimum length and/or "
            "increase the maximum length."
        )
        if has_default_max_length:
            min_length_error_suffix += (
                f" Note that `max_length` is set to {generation_config.max_length}, its default value."
            )
        if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
            warnings.warn(
                f"Unfeasible length constraints: `min_length` ({generation_config.min_length}) is larger than"
                f" the maximum possible length ({generation_config.max_length})." + min_length_error_suffix,
                UserWarning,
            )
        if generation_config.min_new_tokens is not None:
            min_length = generation_config.min_new_tokens + input_ids_length
            if min_length > generation_config.max_length:
                warnings.warn(
                    f"Unfeasible length constraints: `min_new_tokens` ({generation_config.min_new_tokens}), when "
                    f"added to the prompt length ({input_ids_length}), is larger than the maximum possible length "
                    f"({generation_config.max_length})." + min_length_error_suffix,
                    UserWarning,
                )

    def _prepare_generated_length(
        self,
        generation_config,
        has_default_max_length,
        has_default_min_length,
        model_input_name,
        input_ids_length,
        inputs_tensor,
    ):
        """Prepares max and min length in generation configs to avoid clashes between similar attributes"""

        if generation_config.max_new_tokens is not None:
            if not has_default_max_length and generation_config.max_length is not None:
                logger.warning(
                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
                )
            generation_config.max_length = generation_config.max_new_tokens + input_ids_length

        # if both `inputs_embeds` and `input_ids` are passed, we do not correct the length; otherwise the total
        # length [inputs-embeds-len + new-tokens-len] must not go beyond the indicated `max_length`
        elif (
            model_input_name == "inputs_embeds"
            and input_ids_length != inputs_tensor.shape[1]
            and not self.config.is_encoder_decoder
        ):
            generation_config.max_length -= inputs_tensor.shape[1]
        elif has_default_max_length:  # by default let's always generate 20 new tokens
            if generation_config.max_length == GenerationConfig().max_length:
                generation_config.max_length = generation_config.max_length + input_ids_length
                max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
                if max_position_embeddings is not None:
                    generation_config.max_length = min(generation_config.max_length, max_position_embeddings)

        # same for min length
        if generation_config.min_new_tokens is not None:
            if not has_default_min_length:
                logger.warning(
                    f"Both `min_new_tokens` (={generation_config.min_new_tokens}) and `min_length`(="
                    f"{generation_config.min_length}) seem to have been set. `min_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
                )
            generation_config.min_length = generation_config.min_new_tokens + input_ids_length

        elif (
            model_input_name == "inputs_embeds"
            and input_ids_length != inputs_tensor.shape[1]
            and not self.config.is_encoder_decoder
        ):
            generation_config.min_length = max(generation_config.min_length - inputs_tensor.shape[1], 0)

        return generation_config

    def _prepare_generation_config(
        self,
        generation_config: Optional[GenerationConfig],
        use_model_defaults: Optional[bool] = None,
        **kwargs: Dict,
    ) -> Tuple[GenerationConfig, Dict]:
        """
        Prepares the base generation config, then applies any generation configuration options from kwargs. This
        function handles retrocompatibility with respect to configuration files.
        """
        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
        using_model_generation_config = False
        if generation_config is None:
            # legacy: users may modify the model configuration to control generation. To trigger this legacy
            # behavior, the following conditions must be met:
            # 1) the generation config must have been created from the model config (`_from_model_config` field);
            # 2) the generation config must have seen no modification since its creation (the hash is the same);
            # 3) there are non-default generation parameters in the model config.
            if (
                self.generation_config._from_model_config  # 1)
                and self.generation_config._original_object_hash == hash(self.generation_config)  # 2)
                and len(self.config._get_non_default_generation_parameters()) > 0  # 3)
            ):
                new_generation_config = GenerationConfig.from_model_config(self.config)
                if new_generation_config != self.generation_config:
                    warnings.warn(
                        "You have modified the pretrained model configuration to control generation. This is a"
                        " deprecated strategy to control generation and will be removed in v5."
                        " Please use and modify the model generation configuration (see"
                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
                        UserWarning,
                    )
                    self.generation_config = new_generation_config

            generation_config = self.generation_config
            using_model_generation_config = True

        generation_config = copy.deepcopy(generation_config)

        if not using_model_generation_config:
            # If `generation_config` is provided:
            # - `use_model_defaults`: fall back ALL default values to the model's generation config
            # - otherwise: legacy behavior, just make sure the special tokens are defined
            model_base_version = version.parse(
                version.parse(self.generation_config.transformers_version).base_version
            )
            if use_model_defaults is True or (
                use_model_defaults is None and model_base_version >= version.parse("4.50.0")
            ):
                modified_values = {}
                default_generation_config = GenerationConfig()
                for key, default_value in default_generation_config.__dict__.items():
                    if key.startswith("_") or key == "transformers_version":  # metadata
                        continue
                    custom_gen_config_value = getattr(generation_config, key)
                    model_gen_config_value = getattr(self.generation_config, key)
                    if custom_gen_config_value == default_value and model_gen_config_value != default_value:
                        modified_values[key] = model_gen_config_value
                        setattr(generation_config, key, model_gen_config_value)
                if len(modified_values) > 0:
                    logger.warning_once(
                        "`generation_config` default values have been modified to match model-specific defaults: "
                        f"{modified_values}. If this is not desired, please set these values explicitly."
                    )
            else:
                if generation_config.bos_token_id is None:
                    generation_config.bos_token_id = self.generation_config.bos_token_id
                if generation_config.eos_token_id is None:
                    generation_config.eos_token_id = self.generation_config.eos_token_id
                if generation_config.pad_token_id is None:
                    generation_config.pad_token_id = self.generation_config.pad_token_id
                if generation_config.decoder_start_token_id is None:
                    generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id

        # Finally, apply any passed kwargs
        model_kwargs = generation_config.update(**kwargs)

        return generation_config, model_kwargs

    def _get_initial_cache_position(self, input_ids, model_kwargs):
        """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length"""
        # `torch.compile`-friendly `torch.arange` from a shape -- the lines below are equivalent to `torch.arange`
        if "inputs_embeds" in model_kwargs and not self.config.is_encoder_decoder:
            cache_position = torch.ones_like(model_kwargs["inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1
        elif "decoder_inputs_embeds" in model_kwargs and self.config.is_encoder_decoder:
            cache_position = (
                torch.ones_like(model_kwargs["decoder_inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1
            )
        else:
            cache_position = torch.ones_like(input_ids[0, :], dtype=torch.int64).cumsum(0) - 1

        past_length = 0
        if model_kwargs.get("past_key_values") is not None:
            cache = model_kwargs["past_key_values"]
            past_length = 0
            if not isinstance(cache, Cache):
                past_length = cache[0][0].shape[2]
            elif hasattr(cache, "get_seq_length") and cache.get_seq_length() is not None:
                past_length = cache.get_seq_length()
            cache_position = cache_position[past_length:]

        model_kwargs["cache_position"] = cache_position
        return model_kwargs

    def _get_layer_device_map_for_cache_init(self):
        """
        Returns the device map for each decoder layer, to allocate the cache on the right device.
        Inspired from `dispatch_model` in accelerate.
        """
        execution_device_map = None

        if hasattr(self, "hf_device_map"):
            if set(self.hf_device_map.values()) == {"cpu"} or set(self.hf_device_map.values()) == {"cpu", "disk"}:
                main_device = "cpu"
            else:
                main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
            execution_device_map = {
                name: main_device if device in ["cpu", "disk"] else device
                for name, device in self.hf_device_map.items()
            }

        if execution_device_map is None:
            return None

        num_hidden_layers = self.config.get_text_config().num_hidden_layers
        if len(execution_device_map) == 1 and "" in execution_device_map:
            return dict.fromkeys(range(num_hidden_layers), execution_device_map[""])

        layer_device_map = {}
        decoder_name = None
        if hasattr(self, "get_decoder"):
            for name, module in self.named_modules():
                if module is self.get_decoder():
                    decoder_name = name
                    break
            if decoder_name is None:
                raise RuntimeError(
                    "`model.get_decoder()` is not returning a named module of the model. This is unexpected, "
                    "please open an issue on GitHub."
                )
            decoder_mapped_modules = [
                module_name for module_name in execution_device_map.keys() if decoder_name in module_name
            ]
            if len(decoder_mapped_modules) > 0:
                # The decoder layers are mapped individually
                for idx in range(num_hidden_layers):
                    for module_name in decoder_mapped_modules:
                        if f".{idx}." in f"{module_name}.":
                            layer_device_map[idx] = execution_device_map[module_name]
                            break
            else:
                # The whole decoder (or one of its parents) is mapped to a single device
                while True:
                    if decoder_name in execution_device_map:
                        layer_device_map = dict.fromkeys(
                            range(num_hidden_layers), execution_device_map[decoder_name]
                        )
                        break
                    elif "." in decoder_name:
                        decoder_name = decoder_name.rsplit(".", 1)[0]
                    else:
                        raise RuntimeError(f"Decoder name {decoder_name} not found in execution device map")
        else:
            for layer in execution_device_map:
                for idx in range(num_hidden_layers):
                    if f".{idx}." in f"{layer}.":
                        layer_device_map[idx] = execution_device_map[layer]
                        break

        for idx in range(num_hidden_layers):
            if idx not in layer_device_map:
                raise RuntimeError(f"layer {idx} has not been mapped to a device.")
        return layer_device_map

    def _get_cache(
        self, cache_implementation: str, batch_size: int, max_cache_len: int, device: torch.device, model_kwargs
    ) -> Cache:
        """
        Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized if a
        new `generate` call requires a larger cache or uses a different batch size.

        Returns the resulting cache object.
        hybridllama4r  r  hybrid_chunkedr   N_cachesliding_windowmambar   r$   _pre_quantization_dtype)r   max_batch_sizer  r   r   r  r  rp   )r   r   r3   r   r   r   r  self_attention_cacher  r  r   r  r   r  cross_attention_cacher   r   r   r  r  r  r   reset)r   r  r   r  r   r   	cache_clsrequires_cross_attention_cachecache_to_checkneed_new_cachecache_dtyper  cache_kwargsr  rp   rp   rq   
_get_cache!  sT   	



zGenerationMixin._get_cachec                 C   s6   | j od| jj vod| jj vod| jj vS )a  
        Return `True` if current model can use a `DynamicCache` instance when initializing the `past_key_values`.
        This is mostly the same as the `_supports_cache_class` attribute, but adds an exception for the `Jamba` model,
        which uses its own `HybridMambaAttentionDynamicCache` and does not need to initialize the Cache in advance in
        order to save memory (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
        for `HybridMambaAttentionDynamicCache`).
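# A hedged sketch of the check above: support is keyed off the class name (the real method also
# folds in the model's `_supports_cache_class` flag).
def supports_default_dynamic_cache(model) -> bool:
    name = model.__class__.__name__.lower()
    return all(family not in name for family in ("jamba", "zamba", "bamba"))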
        jambazambabamba)r   r   rh   r  r   rp   rp   rq   _supports_default_dynamic_cache_  s   	z/GenerationMixin._supports_default_dynamic_cachemax_cache_lengthc                 C   s6  d| j j vr
dnd}| jjp|ddu}||}	|	durF|jdur-td| dt|	t	rD| 
 rD|s=t|	nt|	||< dS |jdu rMdS | 
 sc|jduratd	|j d
t dS |dury|jdurytd|j d d|_|jpt| j dd|_|jdur
|jtv r|jdkr| jstd| j|jt|j|j| |||d||< dS |jdkr| jstd|jdur|jnt }
t|
j  }|
j dkrt! st"d|
j dkrt# st"d||
||< dS |jdkrt$ ||< dS |jdkrt ||< dS dS |st ntt t ||< dS )z
        Prepares the cache for generation (if applicable), given `generate`'s parameterization. If a cache is
        instantiated, writes it to `model_kwargs`, under the name expected by the model.
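# A hedged, simplified sketch of the dispatch below (the real method also validates model
# support, assistant models, and legacy cache formats). The mappings are the module-level ones
# imported at the top of this file.
def pick_cache_class(generation_config):
    impl = generation_config.cache_implementation
    if impl in NEED_SETUP_CACHE_CLASSES_MAPPING:       # e.g. "static", "sliding_window", "hybrid"
        return NEED_SETUP_CACHE_CLASSES_MAPPING[impl]
    if impl == "quantized":                            # backend-dependent quantized cache
        cache_config = generation_config.cache_config or QuantizedCacheConfig()
        return QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend]
    if impl == "offloaded":
        return OffloadedCache
    return DynamicCache                                # default: dynamically growing cache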
        r  r]   r^   r   NzMPassing both `cache_implementation` (used to initialize certain caches) and `zB` (a Cache object) is unsupported. Please use only one of the two.FzThis model does not support `Cache` instances, it only supports the legacy cache format (tuple of tuples). `cache_implementation` (set to z) will be ignored.zRAn assistant model is provided, using a dynamic cache instead of a cache of type='z'.r  staticzThis model does not support `cache_implementation='static'`. Please check the following issue: https://github.com/huggingface/transformers/issues/28981)r  r   r  r   r   	quantizedzThis model does not support the quantized cache. If you want your model to support quantized cache, please open an issue and tag @zucchini-nlp.quantozYou need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. Please install it via  with `pip install optimum-quanto`HQQzYou need to install `HQQ` in order to use KV cache quantization with HQQ backend. Please install it via  with `pip install hqq`	offloadeddynamic)%r   rh   r  r   r   r   r  r   r   tupler  r   from_legacy_cacher   r   r]  r^  r_  r   r   r   r  r3   _supports_static_cacher  r  rO  num_return_sequences_supports_quantized_cachecache_configr   r4   backendr!   ImportErrorr    r   )r   r   r   r2  r   r  r   r+  r  user_defined_cacher  cache_classrp   rp   rq   _prepare_cache_for_generationn  s   











z-GenerationMixin._prepare_cache_for_generationc                 C   s   dt t| jj v S )z
        Return True if the current model supports the keyword argument `logits_to_keep` in forward()
        to save memory. Checking it in this way lets us avoid using a new model attribute.
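# A hedged sketch of the probe described above: the `forward` signature is inspected instead of
# introducing another class attribute.
import inspect

def supports_logits_to_keep(model) -> bool:
    return "logits_to_keep" in set(inspect.signature(model.forward).parameters.keys())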
        logits_to_keep)r   r   r   r   r   r   r  rp   rp   rq   _supports_logits_to_keep  s   z(GenerationMixin._supports_logits_to_keepkwargs_has_attention_maskc           	         sL  d fdd	}||j |d}||j|d}||j|d}||j|d} jjr/|dur-|n|}|dur=|jdkr=|d}|du r]|dur]|durP|sPt	d |d }t	d| d  jjri|du rit
d	|durt||d
 r|dur|std |durt|s|dk  rt	d| d ||_||_||_||_dS )a  
        Prepares the special tokens for generation, overwriting the generation config with their processed versions
        converted to tensors.

        Note that `generation_config` is changed in place and stops being serializable after this method is called.
        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
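# A hedged sketch of the local `_tensor_or_none` helper used below: token ids (ints or tensors)
# are normalized to long tensors on the target device, and `None` passes through untouched.
import torch

def tensor_or_none(token, device="cpu"):
    if token is None:
        return token
    if isinstance(token, torch.Tensor):
        return token.to(device)
    return torch.tensor(token, device=device, dtype=torch.long)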
        Nc                    sF   | d u r| S |d ur|n j }t| tjr| |S tj| |tjdS )Nr   r   )r   r   rl   r   r)  tensorr   )tokenr   r  rp   rq   _tensor_or_none  s   
z@GenerationMixin._prepare_special_tokens.<locals>._tensor_or_nonerS  r   zThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z)Setting `pad_token_id` to `eos_token_id`:z for open-end generation.z\`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.r   zThe attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z;`eos_token_id` should consist of positive integers, but is zq. Your generation will not stop until the maximum length is reached. Depending on other flags, it may even crash.r   )r   r   r   r  r   r   r   r'  r   r  r   r   r   r   rl   is_floating_point_bos_token_tensorr   r   _decoder_start_token_tensor)	r   r   r%  r   r)  bos_token_tensoreos_token_tensorpad_token_tensordecoder_start_token_tensorrp   r  rq   _prepare_special_tokens  sP   	


	
z'GenerationMixin._prepare_special_tokenssynced_gpusstreamerrZ   c           '         s  |    |dd}|dd}| j |fi |\ }| |  | ||| |du r;t s5t| o:t	 dk}|durA|nt
 }|durJ|nt }dtt| jj v }d|v}|dddu}| | j|\}}}|jd }|j}| j ||d | jjs jdur|dkrt|jd	krt|ddd
f  jkdkrtd | jjs|dkrd _ |s|r|r| !| ||d< n|r|dkrt|d jd	krt"d| jjrd|vr| #||| }| jjr| j$||| j%|jd\}}n|dkr|n|d} j&r| '||}|dur|(|)  |jd
 }|ddu o. j*du}|ddu o; j+du}| j, |||||d | - rUd|vrUd|d< | . ||  j*d }|jd |krz|dkrz| jjsz||jd 7 }| / |||||  0|}|dur j1dkrt"d| jj2|jj2krt34d|jj2 d| jj2 d| jj2 dt5 | j6 |||||j||	|
d	}| j7d8 ||d|} j |d< |t8j9kr4 j:dkrt"d j: d|dkrt"d|d st"d  j;d!v rt"d"| j<rt"d#| j=j> | j? |||||||d$}| j@|f||| ||d%|} n|t8jAkrZ| j<rGt"d&| j=j> | jB|f jC|| ||d'|} n|t8jDkr|d sit"d(| j<rvt"d)| j=j> | jE|f|| ||d*|} nS|t8jFt8jGfv r| jHd8| j:| jjd+|\}}| jI|f|| ||d*|} n(|t8jJt8jKfv r| jHd8| j1| jjd+|\}}| jL|f|| |d,|} n|t8jMkrtN| j1|j jO jP j: jQ j*d-}!| jHd8| j1| jjd+|\}}| jR||!f|| |d,|} n|t8jSkrg }" jTdur& jT}" jUdur fd.d/}#tV jUtWrAt jUdkrD|#   jUD ]`}$tV|$d tWrtV|$tWr^t|$dkra|#  tXd0d1 |$D rn|#  tXd2d1 |$D r{|#  tY|$}%n!tV|$tWrt|$dkr|#  tXd3d1 |$D r|#  tZ|$}%|"[|% qGt\|"| j1|j jO jP j: j*d4}&| jHd8| j1| jjd+|\}}| j]|f|&|| |d5|}  j^du rt_| d6rt`| jad7dur| jab | _a| S )9a  

        Generates sequences of token ids for models with a language modeling head.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
            generation_config ([`~generation.GenerationConfig`], *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logits processor is passed that is already created with the arguments or a
                generation config, an error is thrown. This feature is intended for advanced users.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complement the default stopping criteria built from arguments and a
                generation config. If a stopping criterion is passed that is already created with the arguments or a
                generation config, an error is thrown. If your stopping criteria depend on the `scores` input, make
                sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
                intended for advanced users.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
                If provided, this function constrains the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
                on the batch ID `batch_id` and the previously generated tokens `input_ids`. This argument is useful
                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://arxiv.org/abs/2010.00904).
            synced_gpus (`bool`, *optional*):
                Whether to continue running the while loop until max_length. Unless overridden, this flag will be set
                to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid
                deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`.
            assistant_model (`PreTrainedModel`, *optional*):
                An assistant model that can be used to accelerate generation. The assistant model must have the exact
                same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
                is much faster than running generation with the model you're calling generate from. As such, the
                assistant model should be much smaller.
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The negative prompt needed for some processors such as CFG. The batch size must match the input batch
                size. This is an experimental feature, subject to breaking API changes in future versions.
            negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Attention_mask for `negative_prompt_ids`.
            use_model_defaults (`bool`, *optional*):
                When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
                generation configuration (`model.generation_config`), as opposed to the global defaults
                (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
                `True`.
            kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.

                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateDecoderOnlyOutput`],
                    - [`~generation.GenerateBeamDecoderOnlyOutput`]

                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
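
        Example (an illustrative sketch; the checkpoint is a placeholder, any causal LM plus its tokenizer works):

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> inputs = tokenizer("Hello, my dog is cute and", return_tensors="pt")
        >>> outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True, top_p=0.9)
        >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
        ```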
        rz  Nr5  r$   r   r   r   rS  r   r   zA decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.r~   Tr}   z1`attention_mask` passed to `generate` must be 2D.)r   r   r   r  r   r9  rd  )r   r  r  r   r   r  r#  zZ`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1.z~You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on z, whereas the model is on z. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('z ') before running `.generate()`.)	r   rD  rE  rF  r3  r   r   rG  rH  )r   ry  rz  r   zFnum_return_sequences has to be 1 when doing assisted generate, but is r  z6assisted generate is only supported for batch_size = 1z+assisted generate requires `use_cache=True`)r  r  r  z=assisted generate is not supported with Static cache classes`zCassisted generation is not supported with stateful models, such as )r   r}   r   r2  r3  r4  r5  r   )rB  r3  ry  r   r2  r3  z=dola decoding is not supported with stateful models, such as )dola_layersr3  ry  r   r2  r3  z,Contrastive search requires `use_cache=True`zBcontrastive search is not supported with stateful models, such as )r3  ry  r   r2  r3  r}   r  r   )r3  ry  r   r2  )r   rO  r   length_penaltydo_early_stoppingnum_beam_hyps_to_keeprP  r9  c                      s   t d j d)Nzo`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` of positive integers, but is r  )r   force_words_idsrp   r   rp   rq   	typeerror	  s
   z+GenerationMixin.generate.<locals>.typeerrorc                 s   s    | ]	}t |t V  qd S r   )r   rn  r   	token_idsrp   rp   rq   r  	  s    z+GenerationMixin.generate.<locals>.<genexpr>c                 s   s"    | ]}t d d |D V  qdS )c                 s   $    | ]}t |t p|d k V  qdS r   Nr   r   r   token_idrp   rp   rq   r  	     " z5GenerationMixin.generate.<locals>.<genexpr>.<genexpr>Nr  r<  rp   rp   rq   r  	  s
    
c                 s   r>  r?  r@  rA  rp   rp   rq   r  	  rC  )constraintsr   rO  r   r6  r7  r8  r9  )constrained_beam_scorerr3  ry  r   r2  r]   to_legacy_cacherp   )cr  r   r  r  r  r  r   r   distget_world_sizerA   rV   r   r   r   r   r   r   r   r   r   r   r   r1  r   r   r   r   rl   r  r   r  r   r   r   r  r  r,  token_healingheal_tokensputr  r9  rd  r  r$  r  r"  get_generation_moderO  r@  r]  r^  r_  rx  r  r6   ASSISTED_GENERATIONr  r  _is_statefulr   rh   rC  _assisted_decodingDOLA_GENERATION_dola_decodingr4  CONTRASTIVE_SEARCH_contrastive_searchSAMPLEGREEDY_SEARCHr"  _sampleBEAM_SAMPLEBEAM_SEARCH_beam_searchGROUP_BEAM_SEARCHr(   r6  early_stoppingrP  _group_beam_searchCONSTRAINED_BEAM_SEARCHrD  r9  r   rn  r   r%   r&   r[  r)   _constrained_beam_searchreturn_legacy_cacher   r   r]   rF  )'r   r   r   r3  ry  rF  r2  r2  r3  rG  rH  r  r   rz  r5  r   accepts_attention_maskrequires_attention_maskr%  r   r   r   r   r}   r  r  r  r  generation_modeprepared_logits_processorprepared_stopping_criteriarB  resultbeam_scorerfinal_constraintsr;  word_ids
constraintrE  rp   r:  rq   generate@  st  j



 


















	






zGenerationMixin.generatethis_peer_finishedc                 C   sL   |r t j|rdnd|d}tj|tjjd | dkrdS dS |r$dS dS )z
        Returns whether there are still unfinished sequences on the device. The existence of unfinished sequences is
        fed through `this_peer_finished`. ZeRO stage 3-friendly.
        """
        if synced_gpus:
            # Under DeepSpeed ZeRO-3 / FSDP, every rank must keep stepping until all ranks are done,
            # so the local "finished" flag is summed across processes before deciding to stop.
            this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0, device=device)
            dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
            if this_peer_finished_flag.item() == 0.0:
                return False
        elif this_peer_finished:
            return False
        return True
j|j	}t
||k||}	 | dkrE|S |dddf  }dd   fdd|D }	tt||	D ]`\}
\}}||
 }t
||k r~qj	 fdd|j|dD }t|dkrqj||f  d7  < |j|d |dd }	 | dkrqjt|||k dkr||d< | j|d|d||
< qj|S )a  
        Heals the tail token of each sequence: the last token is replaced with the best-scoring token that extends
        its string prefix, so generation does not resume from an awkwardly split token.
        Parameters:
            input_ids (`torch.LongTensor`): The sequence used as a prompt for the generation.
            tokenizer (`PreTrainedTokenizerBase`, *optional*): The tokenizer used to decode the input ids.
        Return:
            `torch.LongTensor` where each sequence has its tail token replaced with its appropriate extension.
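
        Example (an illustrative sketch; the checkpoint is a placeholder):

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> # A prompt ending in a partial token such as "http:" benefits from healing its tail token.
        >>> input_ids = tokenizer(["An example url: http:"], return_tensors="pt").input_ids
        >>> healed_ids = model.heal_tokens(input_ids, tokenizer=tokenizer)
        >>> print(tokenizer.batch_decode(healed_ids, skip_special_tokens=True)[0])
        ```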
        Nzs When generating with token healing, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.r$   )r  r   c                 S   s   g | ]}|  qS rp   )stripr   rp   rp   rq   r  S
      z/GenerationMixin.heal_tokens.<locals>.<listcomp>T)skip_special_tokenspt)return_tensorspaddingr   r    c                 3   s"    | ]} |d  V  qdS )rx  N)decodereplace)r   t	space_tokrz  rp   rq   r  i
  s     z.GenerationMixin.heal_tokens.<locals>.<genexpr>c                    s   i | ]	}  |fd qS )g      $@)convert_tokens_to_ids)r   alt_tok)rz  rp   rq   r   u
  s    z/GenerationMixin.heal_tokens.<locals>.<dictcomp>)prefixrQ  rK  r:  )r   r   r   r   	get_vocabr5   batch_decoder}   r)  r   rl   wherenumeltolistconvert_ids_to_tokensr~  	enumeratezipr  r  
extensionsr   r  rj  r'  )r   r}   rz  r   r   
vocab_trier   promptstail_ids	tail_toks	batch_idxtail_idtail_tok	batch_idsseq_biastrimmed_idsrp   r|  rq   rJ  <
  sT   


zGenerationMixin.heal_tokensr4  c           &   	      sP  | j jrtd|j}	|j}
|j}|j}|j}|j}t	dd |D }|j
}|r,|r,dnd}|r4|r4dnd}|r<|
r<dnd}|rD|
rDdnd}|rL|rLdnd}|jd }tj|tj|jd}| ||}d}| j  j | j jsrd}n d	kryd	}n	 d	krd
}nd}t|tr|dkr| d	 kr|g}nL dkrtt| d	 d	ntt|dd	}n5t|tr|dkrΈ dkrtt d	  d	n	tt d  d	}nt|tr݇ fdd|D }ntd|  }|du rtd| j|||jdr| j|fi |}| di |d|
dd}|jdddddf  jdtjd}|jdddddf  }i } |D ]}!||j |! dddddf |j| |!< q3| j!||| j jd}|r^|r^qt"|| |}"|"|j}"|||"}#|r|rz||#f7 }|r||f7 }|
r|| j jr|j#fn|j$f7 }| j jr||j%f7 }|r|| j jr|j&fn|j f7 }|rt'j(j)|#dd}$tj*|$d
d+d
}%ntj,|#dd}%|r|%| |	d
|   }%tj-||%dddf gdd}|dur|.|%/  |||| @ }|0 dk}| j|||jds|dur|1  |r&t2||||||3ddS |S )a
  
        Generates sequences of token ids for models with a language modeling head using **dola decoding** and can be
        used for decoder-only text models.
        The method is based on the paper "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language
        Models" (https://arxiv.org/abs/2309.03883) in ICLR 2024.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            dola_layers (`Union[str, List[int]]`):
                The candidate layers used in contrasting layers of DoLa. It can be either 1) 'low' or 'high', which
                means the lower part or higher part of the model layers, respectively, or 2) a list of layer indices
                to be used for candidate layers. The 0-th layer is the word embedding layer of the model.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
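
        Example (an illustrative sketch; DoLa is reached through `generate` by passing `dola_layers`,
        and the checkpoint is a placeholder):

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> inputs = tokenizer("What is the capital of France?", return_tensors="pt")
        >>> out = model.generate(**inputs, max_new_tokens=20, dola_layers="high", repetition_penalty=1.2)
        >>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
        ```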
        z8DoLa decoding is only available for decoder-only models.c                 s       | ]}t |d V  qdS r   Nr   r   r  rp   rp   rq   r  
  r  z1GenerationMixin._dola_decoding.<locals>.<genexpr>rp   Nr   r   Fr   r$   low(   r  highc                    s   g | ]}| k r|qS rp   rp   r   ifinal_layerrp   rq   r    r  z2GenerationMixin._dola_decoding.<locals>.<listcomp>z?dola_layers must be either 'low', 'high' or a list of integers.zCDoLa is not supported for models that don't have output embeddings.rS  T)r  r	  r
  r   )r  r   r   r  num_samplesr]   rc   rd   re   rf   rg   r]   )4r   r   r   r   r	  r
  output_scoresoutput_logitsreturn_dict_in_generater   r=  r   rl   r   r   r   r  r  r  tie_word_embeddingsr   strrn  r  get_output_embeddingsrq  r   re   detachr)  float32floatrg   r.  _dola_select_contrastru   rf   rv   rw   r   r   softmaxmultinomialsqueezeargmaxr  rK  r  r  endrb   r   )&r   r}   r4  r3  ry  r   r2  r3  r   r   r	  r
  r  r  r  has_eos_stopping_criteriar=  rd   
raw_logitsru   rv   rw   r   unfinished_sequencesrk  start_layercandidate_premature_layerslm_headr   r#  final_layer_next_token_logitsfinal_logitscandidate_premature_logitscandidate_premature_layernext_token_logitsnext_token_scoresprobsnext_tokensrp   r  rq   rQ  
  s   2


(






L	zGenerationMixin._dola_decodingc           >         s 
  t dd |D }|j |j}	|j}
|j}|j}|j}|j}|j}|j	}|r*|r*dnd}|r2|r2dnd}|r:|r:dnd}|rB|rBdnd}|rJ|rJdnd}|rh| j
jrh|r[|d dnd}|rf|d dnd}|jd }tj|tj|jd	}| ||}tj|tjd
}| j
jrd|v r|d dur|d }n|d }|j dd}d}| j|||jdrz|ddu st|d ttfrV|d  dkrVd|d< | j|fi |}| di |dd|d}| j
jr|jd }n|jd }|jdddddf jdtj|jd}| j ||| j
jd}|s#| j!d| | j
jd|\} }|d}!|!du r6t"| j#j$ dt|!d t%tj&frM|!d d jd |krVt"| j#j$ d|||}"t'j(j)|"dd}#tj*|#d d\}$}%|r|rx||f7 }|r||"f7 }|r|| j
jr|j+fn|j,f7 }| j
jr||j-f7 }|r|| j
jr|jfn|jf7 }~|s|d }&t|&t.st|&trt|&j/t.r|&0  n%g }'|&D ]}(g })|(D ]}*|)1|*j dd q|'1t%|) qt%|'}&|&|d< |rXg }+t2 D ]L},| j|%dd|,f 3ddfi |}-| di |-dd|d}t|d t.s=t|d trHt|d j/t.rHd|d< |d 4d |+1| qt5|+| j
6 }n| j|%3ddfi |}-| di |-dd|d}~-| j
jr|jd }.|j}/n|jd }.|j}/|jdddddf 7 }0|j dd}1t8|1|.|$||	 }2tj9||:|jd dfgdd}|2d}2t; fddt<|2D }3|%t2t=|%|2f }4t>t?|.j@dd }.|.t2||2ddf }.tj9||.Adgdd}d}5|/D ]}(t>t?|( t2||2ddf }(|5|(f7 }5q|r@| j|%dd|2f 3ddfi |}6| di |6dddd}7|7d }8nNd}8tBD ]}9|8pNtC||9d}8qDt|8t.sdt|8trjt|8j/t.rj|8D|3 n$g }'|8D ]}(g })|(D ]}*|)1|*|3df  qt|'1t%|) qnt%|'}8t>t?|0 t2||2ddf }||j}| j
jrd}:d};|r|j-D ]}(t>tj?|( ddt2||2df }(|:|(f7 }:q|j+D ]}(t>tj?|( ddt2||2df }(|;|(f7 };qtE|8|5|;pd|:pdd }n-d}<|r |j,D ]}(t>tj?|( ddt2||2df }(|<|(f7 }<qtF|8|5|<p'dd!}| j ||| j
jd}|r;|r;q|rH|4| |
d|   }4tj9||4dddf gdd}|durc|G|4H  |||| @ }|I dk}| j|||jds|dur|J  |r|ddurt|d t.st|d trt|d j/t.r|d 4d n-g }!|d D ] }(g }=|(D ]}*|=1|*dddddf  q|!1t%|= qt%|!|d< | j
jrtK|||||||||dd"	S tL||||||dd#S |S )$a  
        Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
        be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`]
            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
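
        Example (an illustrative sketch; contrastive search is reached through `generate` with
        `penalty_alpha` > 0 and `top_k` > 1, and the checkpoint is a placeholder):

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> inputs = tokenizer("DeepMind Company is", return_tensors="pt")
        >>> out = model.generate(**inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=64)
        >>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
        ```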
        c                 s   r  r  r  r  rp   rp   rq   r    r  z6GenerationMixin._contrastive_search.<locals>.<genexpr>rp   Nr   rf   rg   r   r   r%  r   r   r  FrS  r]   Tr   )r  r
  r	  r   r  r   r   r  r5  zQ does not support caching and therefore **can't** be used for contrastive search.z| does not have a standard cache format and therefore **can't** be used for contrastive search without further modifications.)r  r   r$   r  c                    s   g | ]
\}}||   qS rp   rp   )r   r  r  rT  rp   rq   r  n      z7GenerationMixin._contrastive_search.<locals>.<listcomp>.)r]   rw   ru   rv   )r]   rg   rf   	rc   rd   re   rs   rt   ru   rv   rw   r]   r  )Mr   rT  penalty_alphar   r	  r
  r  r  r  
low_memoryr   r   r   r   rl   r   r   r   r  r  r  rq  r   r   r   r  r   rw   rg   re   r)  r  r.  r"  r   r   rh   r  r   r   r   r  topkru   rf   rv   r   r  batch_repeat_interleaver[  r  r  cropstack_model_outputsr  r  _ranking_fastr  r(  r'  r  r   r  splitr  r'  r&  r   batch_select_indicesr   r   rK  r  r  r  rr   rb   )>r   r}   r3  ry  r   r2  r3  r   r  r  r   r	  r
  r  r  r  
sequentialr  rd   ru   rv   rw   rs   rt   r   r  cosine_matrix_maskrk  r   r#  last_hidden_stateslogit_for_next_stepr   r]   processed_logit_for_next_step
next_probstop_k_probs	top_k_idspastnew_key_valuesr  r   r  all_outputsr  next_model_inputsnext_hiddenfull_hidden_statesre   context_hiddenselected_idxaugmented_idxr  next_decoder_hidden_statesnext_model_inputselected_outputsnext_past_key_valuesr*  next_step_cross_attentionsnext_step_decoder_attentionsnext_step_attentionslayer_past_key_valuesrp   r  rq   rS  e  s(  ,











&

	



$


$

$
$
$
    
 
	z#GenerationMixin._contrastive_searchc           $      K   s  |j }|j}	|j}
|j}|j}|j}tdd |D }|j}|r$|r$dnd}|r,|r,dnd}|r4|	r4dnd}|r<|	r<dnd}|rD|
rDdnd}|rb| jj	rb|	rU|d 
dnd}|
r`|d 
dnd}|j\}}d}tj|tj|jd	}| ||}| j}t|
d
tr|d
 jo| j}t| dddur|| jjM }|o|j }|r| jjdks|jjrdtjd< | |j}|jdur| j ||fi |}d}nd}| j!|||jdr| j"|fi |}|#|	rd|	ini  |#|
rd|
ini  |r| di |ddi}d}n|di |ddi}| j$||| jj	d}|r!|r!q|j%dddddf j&dtj'|jd} ||| }!|r||rE||!f7 }|rM|| f7 }|	rj|| jj	rZ|j(fn|j)f7 }| jj	rj||j*f7 }|
r||| jj	rw|j+fn|j,f7 }|rt-j.j/|!dd}"tj0|"dd1d}#ntj2|!dd}#|r|#| |d|   }#tj3||#dddf gdd}|dur|4|#5  |||| @ }|6 dk}|d7 }~| j!|||jds|dur|7  |r| jj	rt8|||||||||
d
d	S t9||||||
d
dS |S )a  
        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        c                 s   r  r  r  r  rp   rp   rq   r  5  r  z*GenerationMixin._sample.<locals>.<genexpr>rp   Nr   rf   rg   Fr   r]   hf_quantizercuda0TOKENIZERS_PARALLELISMTrS  r	  r
  r  r  r   r  r  r$   r  r   r  r  ):r   r	  r
  r  r  r  r   r=  r   r   r   r   rl   r   r   r   r  __call__r   r   is_compileabler  r   r  disable_compiler@  compile_config_compile_all_devicesosenvironget_compiled_callprefill_chunk_size_prefill_chunkingrq  r   r  r.  re   r)  r  ru   rf   rv   rw   rg   r   r   r  r  r  r  r  rK  r  r  r  rr   rb   )$r   r}   r3  ry  r   r2  r3  r   r   r	  r
  r  r  r  r  r=  rd   r  ru   rv   rw   rs   rt   r   cur_lenrk  r  model_forwardr  
is_prefillr   r#  r  r  r  r  rp   rp   rq   rV    s   +


(





K
	zGenerationMixin._samplec                 C   sx   | j j }t|ttfr| ||}|S d|v r5t|ttfs(t	d| d| ||}t
|}|S || |S )aC  
        Temporary function to handle the different types of cache reordering processes while we roll out `Cache`.

        TODO: standardize cache formats and make all models compatible with `Cache`. It would remove the need
        for this function, with `Cache.reorder_cache` being the sole remaining code path.
        
gptbigcodez'Using an unsupported cache format with zG. Currently, it only supports the legacy tuple format or `DynamicCache`)r   rh   r  r   r  rn  r1  r   r   r   r  reorder_cache)r   r]   r0  model_classrp   rp   rq   _temporary_reorder_cache  s   


    @staticmethod
    def _flatten_beam_dim(tensor: torch.Tensor) -> torch.Tensor:
        """[batch_size, num_beams, ...] -> [batch_size * num_beams, ...]"""
        shape = list(tensor.shape)
        return torch.reshape(tensor, [shape[0] * shape[1]] + shape[2:])

    @staticmethod
    def _unflatten_beam_dim(tensor: torch.Tensor, batch_size: int, num_beams: int) -> torch.Tensor:
        """[batch_size * num_beams, ...] -> [batch_size, num_beams, ...]"""
        shape = list(tensor.shape)
        return torch.reshape(tensor, [batch_size, num_beams] + shape[1:])

    @staticmethod
    def _gather_beams(tensor: torch.Tensor, beam_indices: torch.Tensor) -> torch.Tensor:
        """
        Gathers the beam slices indexed by beam_indices into new beam array.

        Args:
            tensor (`torch.Tensor`): A tensor containing data to be gathered. The tensor is a 2D or a 3D tensor
                with the two first dimensions depicting the batch and the beam dimensions.
            beam_indices (`torch.Tensor` of shape `(batch_size, num_beams_to_select)`): The indices of the beams to
                select.

        Returns:
            A tensor with the selected beams.
        """
        # `take_along_dim` requires the index tensor to have the same rank as the data, so trailing
        # singleton dimensions are appended to `beam_indices` until the shapes line up.
        while len(beam_indices.shape) < len(tensor.shape):
            beam_indices = beam_indices.unsqueeze(-1)
        gathered_tensor = torch.take_along_dim(input=tensor, indices=beam_indices, dim=1)
        return gathered_tensor
zGenerationMixin._gather_beamsrunning_beam_scoresbeam_scoresis_sent_finished!next_token_hits_stopping_criteriar  r9  decoder_prompt_lenr[  r6  c	                 C   s   |dkr|dkr|| }	n|| }	| ddddf |	|  }
t |t j|dddd d}t |
|k}t ||du @  }t | }||@ |@ S )	zv
        Beam Search stopping condition -- halts the generation loop if any of these conditions becomes False
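# A hedged numeric sketch of the test above, assuming `length_penalty` > 0: a live beam survives
# only while its most optimistic length-normalized score can still beat the worst finished
# hypothesis. The helper name and arguments are illustrative simplifications.
def can_still_improve(best_running_log_prob, worst_finished_score, max_new_tokens, length_penalty=1.0):
    # Log-probs are <= 0, so normalizing by the largest reachable length is the best case.
    best_possible = best_running_log_prob / (max_new_tokens ** length_penalty)
    return best_possible > worst_finished_score

print(can_still_improve(-5.0, -0.9, 20))  # True: -0.25 > -0.9, so the loop keeps running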
        neverrM  Nr$   Tr  keepdimr       e)rl   r  r  r   r  )r  r  r  r  r  r9  r  r[  r6  best_hypothetical_lengthbest_possible_running_scoreworst_finished_scoreimprovement_possibleexists_open_beamvalid_continuationsrp   rp   rq   %_beam_search_has_unfinished_sequences   s   
z5GenerationMixin._beam_search_has_unfinished_sequencesaccumulated_log_probsrunning_sequencesrunning_beam_indicesr=  beams_to_keepr?  c                 C   s   |rt jtjj|dd|d}t j|d|d}n	t j||d\}}||	 }| ||}| ||}||	 }||dddd|f< t j|
|j	d
dd| }|| }||dddd|| f< |||fS )	a'  
        Get top-K continuations given the accumulated log probs on the next token.

        A few notes to understand what's going on:
        1. Each item in the batch has `num_beams` * `vocab_size` candidate continuations. For each item, get the
        top K [K = (number of EOS tokens + 1) * `num_beams`] candidates with the highest accumulated
        log-probabilities, or sample them without replacement using the accumulated scores (see the numeric
        sketch after this docstring).
        2. We gather the top K (as opposed to `num_beams`, or any number lower than K) here so that we have at
        least `num_beams` sequences remaining to continue the live beam search.
        3. Note that other stopping criteria might result in impossible-to-continue beams, i.e. all continuations
        selected in this step hit the stopping criteria.
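# A minimal numeric sketch of note 1, with illustrative sizes (`num_beams=3`, one EOS token,
# `vocab_size=50`): div/mod over the flattened candidate axis recover the source beam and token.
import torch

vocab_size, num_beams, n_eos = 50, 3, 1
log_probs = torch.randn(2, num_beams * vocab_size)   # (batch, num_beams * vocab_size)
beams_to_keep = (n_eos + 1) * num_beams              # K = 6 candidates kept per batch item
topk_log_probs, topk_indices = torch.topk(log_probs, k=beams_to_keep, dim=-1)
topk_ids = topk_indices % vocab_size                 # token id within the vocab
topk_beam_indices = topk_indices // vocab_size       # beam each continuation came from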
        r   r  r  r$   )r  r  indexr   NrS  )rl   r  r   r   r  r  r  r  r   r   r  )r   r  r  r  r  r  r=  r  rO  r?  r   topk_indicestopk_log_probstopk_current_beam_indicestopk_running_beam_indicestopk_running_sequencestopk_idsbatch_offsetbatch_modified_indicesrp   rp   rq   _get_top_k_continuations(  s   
z(GenerationMixin._get_top_k_continuationsr  r  r  c                 C   sT   || tjd  }tj||dd }| ||}| ||}	| ||}
||	|
fS )z
        Given the top-K continuations, their scores, and whether they hit a stopping criterion, select the
        best non-finished beams to continue beam search in the next iteration.
        r
  r  r$   )r)  rl   r  r  r  )r   r  r  r  r  rO  topk_running_log_probsnext_topk_indicesr  r  r  rp   rp   rq   %_get_running_beams_for_next_iteration^  s   
z5GenerationMixin._get_running_beams_for_next_iterationtop_num_beam_maskc                 C   s   ||	dddf @ }||d | |  }t j|ddd|du @ }||t jd 7 }|| d 7 }t j||fdd}t j||fdd}t j||fdd}t j||fdd}t j||
dd }| ||}| ||}| ||}| ||}||||fS )	z
        Updates the finished beams if (and only if) there are new completed sequences that have a higher score than
        the current finished sequences.
        Nr$   r   T)axiskeepdimsr
  r  r  )rl   r  r)  r  r  r  r  )r   rc   r  r  r  rz   r  r  r  r$  rO  r  r  r6  r[  did_top_num_beams_just_finishedbeams_in_batch_are_fullmerged_sequencesmerged_scoresmerged_beam_indicesmerged_is_sent_finishedtopk_merged_indicesrp   rp   rq   _update_finished_beamst  s   z&GenerationMixin._update_finished_beamsc           8      K   s  |j }|j}|j}	|j}
|j}|j}|j}|j}|j}|j	}|j
}|j}|j}|j\}}|| }| jjdkr;| jj}n| jjdkrG|  j}n| j j}|}d}|durZ|jd nd}tdd| | }tjtj|tjdtj|| tjdfdd	|j}| ||}|j}|rt d
|r|rdnd}|r|rdnd}|r|rdnd} |r|	rdnd}!|r|	rdnd}"|r|
rdnd}#|r| jj!r|	r|d "dnd}$|
r|d "dnd}%|dur|p|d nd}&tj#|||f|&tj$|jd}'| %||||'ddddd|f< |'& ' }(tj||ftj(|jd})d|)ddddf< tj#||fdtj(|jd}*tj||ftj|jd}+tj||ftj|jd},tj#|||| fdtj)|jd}-|-& ' } | j*|||jdr| +|'ddddd|f }.| j,|.fi |}/|/-|	rd|	ini  |/-|
rd|
ini  | di |/ddi}0| j.|0|| jj!d}|r|rq^|0j/dddddf jdtj0|jd}1t1j2j3|1dd	}2||.|2}2|r |r||1' f7 }|r|r||2' f7 }|	r|!| jj!r|0j4fn|0j5f7 }!| jj!r|"|0j6f7 }"|
r |#| jj!r|0j7fn|0j8f7 }#~0| %|2||}2|2|)dddddf  }2t9|2||| f}2| j:|2|'|-|||||||d
\}3}4}5|| +|4ddddd|d f |},| %|,||},| j;|3|4|5|,|d\}'})}-| j<|(|4|*|3| |5|+|,||||||d\}(}*} }+|"dddur| j=|d | +|-d|| f d|d< |d }| >|)|*|+|,|||||	 }| j*|||jdsh| +|(ddd|ddf }(| +|*ddd|f }*| +| ddd|ddf } | d  j?dd	 }6||6 }7|(ddd|7f }(| ddd|6f } |rI|s#d}*| jj!r:t@|(|*||| |$|%|!|"|#|"dd S tA|(|*||| |!|#|"dd!S |(S )"a	  
        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        If it's the first time you're diving into Beam Search, we recommend you read the following blog post:
        https://huggingface.co/blog/how-to-generate (especially the beam search section).

        You can recompute the sequence scores from the individual scores using the `compute_transition_scores` function
        (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.compute_transition_scores)

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
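
        Example (an illustrative sketch; beam search is reached through `generate` with `num_beams` > 1
        and `do_sample=False`, and the checkpoint is a placeholder):

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> inputs = tokenizer("The full name of the United Nations is", return_tensors="pt")
        >>> out = model.generate(**inputs, num_beams=4, max_new_tokens=16, early_stopping=True)
        >>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
        ```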
        MoshiDepthDecoderImageGPTForCausalImageModelingFNr   r   r$   r%  r  z`low_memory=True` is not supported after the beam search refactor. Please check the discussion in #35802 *after the PR got merged*, and add a comment there if your questions are not yet answered.rp   r   rf   rg   r   )
fill_valuer   r   r   r
  rS  r	  r
  r  Tr  r  )
r  r  r  r  r  r=  r  rO  r?  r   )r  r  r  r  rO  )rc   r  r  r  rz   r  r  r  r$  rO  r  r  r6  r[  r]   .)r]   r0  rc   ry   rd   re   rz   rs   rt   ru   rv   rw   r]   rc   ry   rd   re   rz   rf   rg   r]   )Br   r   r	  r
  r  r  r  r=  r[  r6  r9  rO  r  r   r   rh   r   audio_vocab_sizer  out_featuresr  r?  r  rl   r  r   boolzerosr)  r   r  r  r   r   r   fullr  r  r  r   r  int32rq  r  r   r  r.  re   r  r   r   r  ru   rf   rv   rw   rg   r  r   r#  r.  r  r  r  r{   rx   )8r   r}   r3  ry  r   r2  r   r   r   r	  r
  r  r  r  r=  r[  r6  r9  rO  r  batch_size_unflattenedr  r   r?  r  rk  n_eos_tokensr  r$  r  
all_scoresr  rz   ru   rv   rw   rs   rt   output_fill_valuer  rc   r  r  r  r  r  flat_running_sequencesr   model_outputsre   	log_probsr  r  r  max_generated_lengthoutput_lengthrp   rp   rq   rY    s  .

" $ (

"	
   
zGenerationMixin._beam_searchrf  c           4         s|  |j }|j}	|j}
|j}|j}|j}|j}|j|j}| t	|j
| |j}|j\}}| ||}|rE|rEfddt|D nd |krZtd  d| d|r`|r`dnd}|rh|rhdnd}|rp|
rpdnd}|rx|
rxdnd}|r|rdnd}|r| jjr|
r|d d	nd}|r|d d
nd}tjfdtj|d}d|ddddf< | f}d}|jd }| j|||jdrtj |j|d}tj tj|d}| j|fi |}||
rd|
ini  ||rd|ini  | di |ddi} | j| || jjd}|r$|r$|d }q|r6t| jdddddf }!|rK| jdddddf j d|jd}"t|D ] }#t!|# }$|$|# }%g }&tD ] |&" fddt|#|$D  qg||& }'| j|&dddf j tj#|jd}(t$j%j&|(dd})|)jd }*||'|)|d}+|+||& 'd })|)(|+})|r|+|!|&< |)|%|* })|	dur|	jd nd},tj)|)t*dd|, |% dddd\})}-tj+|-|*dd}.|-|* }-durt,dnd}/|j-|'|)|-|.||	|/|d	}0|0d  ||&< |0d! }1|0d" |r0|r0t.fd#d$tt	d D < |' ||&< tj/|'ddf |1'dgdd}'|'dddf ||&< tj+|%dd |# |%  ||&< qO|r|rq||!f7 }|ry||"f7 }|
r|| jjr| j0fn| j1f7 }| jjr|| j2f7 }|r|| jjr| j3fn| j4f7 }tj/||'dgdd}~ |d%ddur| 5|d% ||d%< |d }|j6st7|||rd}| j|||jdsΈdurt,dnd}2|j8|||-|.||	|j9|2|d&	}3|r:|sd|3d'< | jjr%t:|3d( |3d' |||3d) ||||||d%d*S t;|3d( |3d' |||3d) |||d%d+S |3d( S ),a	  
        Generates sequences of token ids for models with a language modeling head using **diverse beam search
        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            beam_scorer (`BeamScorer`):
                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
                model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
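
        Example (an illustrative sketch; diverse beam search is reached through `generate` with
        `num_beams` > 1, `num_beam_groups` > 1 and a `diversity_penalty`, and the checkpoint is a placeholder):

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> inputs = tokenizer("The capital of France is", return_tensors="pt")
        >>> out = model.generate(**inputs, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, max_new_tokens=10)
        >>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
        ```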
        """
        # init values
        num_beams = beam_scorer.num_beams
        num_beam_groups = beam_scorer.num_beam_groups
        num_sub_beams = num_beams // num_beam_groups
        batch_size = len(beam_scorer._beam_hyps) // num_beam_groups

        batch_beam_size, cur_len = input_ids.shape
        if num_beams * batch_size != batch_beam_size:
            raise ValueError(
                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
            )
        ...  # remainder of the compiled body is not recoverable from this dump

    def _constrained_beam_search(
        self,
        input_ids: torch.LongTensor,
        constrained_beam_scorer: ConstrainedBeamSearchScorer,
        logits_processor: LogitsProcessorList,
        stopping_criteria: StoppingCriteriaList,
        generation_config: GenerationConfig,
        synced_gpus: bool,
        **model_kwargs,
    ) -> Union[GenerateBeamOutput, torch.LongTensor]:
        """
        Generates sequences of token ids for models with a language modeling head using **constrained beam search
        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            constrained_beam_scorer (`ConstrainedBeamSearchScorer`):
                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation, while satisfying a list of positive constraints. For more information, the
                documentation of [`ConstrainedBeamSearchScorer`] should be read.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
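
        Example (illustrative; constrained beam search is reached through the public `generate` API, assuming a
        GPT-2 checkpoint is available):

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
        >>> inputs = tokenizer("The weather today is", return_tensors="pt")
        >>> # Force the word "sunny" to appear somewhere in every returned hypothesis
        >>> force_words_ids = tokenizer(["sunny"], add_special_tokens=False).input_ids
        >>> outputs = model.generate(**inputs, num_beams=4, force_words_ids=force_words_ids, max_new_tokens=10)
        >>> text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ```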
        """
        # init values
        batch_size = len(constrained_beam_scorer._beam_hyps)
        num_beams = constrained_beam_scorer.num_beams

        batch_beam_size, cur_len = input_ids.shape
        if num_beams * batch_size != batch_beam_size:
            raise ValueError(
                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
            )
        ...  # remainder of the compiled body is not recoverable from this dump

    def _assisted_decoding(
        self,
        input_ids: torch.LongTensor,
        candidate_generator: CandidateGenerator,
        logits_processor: LogitsProcessorList,
        stopping_criteria: StoppingCriteriaList,
        generation_config: GenerationConfig,
        synced_gpus: bool,
        streamer: Optional["BaseStreamer"],
        **model_kwargs,
    ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
        """
        Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
        **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a
        candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
        models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            candidate_generator (`CandidateGenerator`):
                A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For
                more information, the documentation of [`CandidateGenerator`] should be read.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
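
        Example (illustrative; assisted decoding is reached through the public `generate` API, assuming GPT-2
        checkpoints are available):

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
        >>> # The assistant drafts candidate tokens cheaply; the main model verifies them in one forward pass
        >>> assistant = AutoModelForCausalLM.from_pretrained("distilgpt2")
        >>> inputs = tokenizer("The capital of France is", return_tensors="pt")
        >>> outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=20)
        >>> text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ```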
        """
        # The candidate generator drafts `candidate_length` tokens, the target model scores them in a single forward
        # pass, and the matching prefix (via `_speculative_sampling` when sampling, or an argmax comparison
        # otherwise) is appended to `input_ids`.
        ...  # remainder of the compiled body is not recoverable from this dump

    def _prefill_chunking(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, **model_kwargs):
        # Even if we are not compiling the forward pass, flex attention is always compiled when used. With chunked
        # prefill, we may end up needing a few more graphs than the default cache size limit
        torch._dynamo.config.cache_size_limit = 64

        chunk_size = generation_config.prefill_chunk_size
        # Only chunk up to the second-to-last token: the last prompt token must be processed by the regular decoding
        # step so that it produces the first generated token
        input_chunks = torch.split(input_ids[:, :-1], chunk_size, dim=-1)

        if "past_key_values" not in model_kwargs:
            raise ValueError("Cannot use prefill chunking without a cache")

        model_forward = self.get_compiled_call(generation_config.compile_config)
        attention_mask = model_kwargs.pop("attention_mask", None)

        past_length = 0
        for input_chunk in input_chunks:
            current_length = past_length + input_chunk.shape[-1]
            # Prepare inputs for the chunk
            if attention_mask is not None:
                model_kwargs["attention_mask"] = attention_mask[:, :current_length]
            model_kwargs["cache_position"] = torch.arange(
                past_length, current_length, dtype=torch.long, device=input_chunk.device
            )
            model_kwargs["position_ids"] = model_kwargs["cache_position"].unsqueeze(0)
            model_inputs = self.prepare_inputs_for_generation(input_chunk, **model_kwargs)

            outputs = model_forward(**model_inputs, return_dict=True)

            model_kwargs["past_key_values"] = outputs.past_key_values
            past_length = current_length

        model_kwargs["attention_mask"] = attention_mask
        model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
        _ = model_kwargs.pop("position_ids", None)

        return model_kwargs


def _speculative_sampling(
    candidate_input_ids,
    candidate_logits,
    candidate_length,
    new_logits,
    is_done_candidate,
):
    """
    Applies sampling as in the speculative decoding paper (https://arxiv.org/pdf/2211.17192.pdf, algorithm 1). Returns
    the selected tokens, as well as the number of candidate matches.

    NOTE: Unless otherwise stated, the variable names match those in the paper.
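
    Example (illustrative toy check; tensor values chosen so that acceptance is near-certain):

    ```python
    >>> import torch

    >>> vocab_size = 10
    >>> logits = torch.full((1, 2, vocab_size), -10.0)
    >>> logits[..., 7] = 10.0  # assistant and target model both put ~all mass on token 7
    >>> candidate_ids = torch.tensor([[3, 7]])  # one prompt token (3) + one candidate token (7)
    >>> valid_tokens, n_matches = _speculative_sampling(
    ...     candidate_input_ids=candidate_ids,
    ...     candidate_logits=logits[:, :1],  # assistant scores for the candidate position
    ...     candidate_length=1,
    ...     new_logits=logits,  # target-model scores for the candidate + bonus positions
    ...     is_done_candidate=False,
    ... )
    >>> # p_i / q_i == 1 here, so the candidate is accepted (n_matches == 1) with near-certainty
    ```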
    """
    new_candidate_input_ids = candidate_input_ids[:, -candidate_length:]
    # q_i and p_i are the assistant and target-model probabilities of the assistant-selected tokens
    q = candidate_logits.softmax(dim=-1)
    q_i = q[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1)
    p = new_logits.softmax(dim=-1)
    p_i = p[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1)
    probability_ratio = p_i / q_i

    # Keep a token with probability min(1, p_i / q_i); keep all tokens until the first rejection
    r_i = torch.rand_like(probability_ratio)
    is_accepted = r_i <= probability_ratio
    n_matches = ((~is_accepted).cumsum(dim=-1) < 1).sum()

    # Ensure we don't generate beyond max_len or an EOS token
    if is_done_candidate and n_matches == candidate_length:
        n_matches -= 1
        valid_tokens = new_candidate_input_ids[:, : n_matches + 1]
    else:
        # If there was a rejection, sample the next token from the adjusted distribution max(0, p - q)
        gamma = candidate_logits.shape[1]
        p_n_plus_1 = p[:, n_matches, :]
        if n_matches < gamma:
            q_n_plus_1 = q[:, n_matches, :]
            p_prime = torch.clamp((p_n_plus_1 - q_n_plus_1), min=0)
            p_prime.div_(p_prime.sum())
        else:
            p_prime = p_n_plus_1
        t = torch.multinomial(p_prime, num_samples=1).squeeze(1)[None, :]

        # The selected tokens include the matches (if any) plus the next sampled token
        if n_matches > 0:
            valid_tokens = torch.cat((new_candidate_input_ids[:, :n_matches], t), dim=-1)
        else:
            valid_tokens = t

    return valid_tokens, n_matches


def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_attention=False):
    """
    Given the (decoder/cross attentions)/(decoder hidden states) for multiple generated tokens, splits it into a tuple
    where each member corresponds to a single generated token.
    """
    # Retrocompatibility: in our generation functions, the first iteration includes the attention/hidden states for
    # the whole prompt, so the first member of the tuple covers `cur_len` positions
    if len(outputs) == 0:
        new_tuple = ()
        for layer in new_outputs:
            last_dim_size = cur_len if is_decoder_attention else layer.shape[-1]
            new_tuple += (layer[..., :cur_len, :last_dim_size],)
        outputs += (new_tuple,)
        # The first iteration contains the prompt + 1 generated token; update the length variables accordingly
        cur_len += 1
        added_len -= cur_len

    for i in range(added_len):
        new_tuple = ()
        for layer in new_outputs:
            last_dim_size = cur_len + i if is_decoder_attention else layer.shape[-1]
            new_tuple += (layer[..., i : i + 1, :last_dim_size],)
        outputs += (new_tuple,)
    return outputs


def _ranking_fast(
    context_hidden: torch.FloatTensor,
    next_hidden: torch.FloatTensor,
    next_top_k_probs: torch.FloatTensor,
    cosine_matrix_mask: torch.LongTensor,
    alpha: float,
    beam_width: int,
) -> torch.FloatTensor:
    """
    Reranks the top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described
    in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each
    row in the batch.
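
    Example (illustrative; contrastive search is reached through the public `generate` API, assuming a GPT-2
    checkpoint is available):

    ```python
    >>> from transformers import AutoModelForCausalLM, AutoTokenizer

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
    >>> inputs = tokenizer("The capital of France is", return_tensors="pt")
    >>> # `top_k` candidates are re-ranked by `(1 - penalty_alpha) * p - penalty_alpha * max_cosine_similarity`
    >>> outputs = model.generate(**inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=20)
    ```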
    """
    norm_context_hidden = context_hidden / context_hidden.norm(dim=2, keepdim=True)
    norm_next_hidden = next_hidden / next_hidden.norm(dim=2, keepdim=True)
    cosine_matrix = torch.matmul(norm_context_hidden, norm_next_hidden.transpose(1, 2)).squeeze(-1)

    # Mask out padded context positions so they cannot dominate the degeneration penalty
    cosine_matrix_mask = cosine_matrix_mask.to(dtype=cosine_matrix.dtype)
    cosine_matrix_mask = (1 - cosine_matrix_mask) * torch.finfo(cosine_matrix.dtype).min
    cosine_matrix = cosine_matrix + cosine_matrix_mask

    degeneration_penalty, _ = torch.max(cosine_matrix, dim=-1)
    next_top_k_probs = next_top_k_probs.view(-1)
    contrastive_score = (1.0 - alpha) * next_top_k_probs - alpha * degeneration_penalty
    contrastive_score = torch.stack(torch.split(contrastive_score, beam_width))
    _, selected_idx = contrastive_score.max(dim=-1)
    return selected_idx


def _split(data, full_batch_size: int, split_size: Optional[int] = None):
    """
    Takes care of three cases:
    1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim
    2. data is a tuple: e.g. hidden_states, attentions etc. Keep the tuple as it is and split each tensor in it and
       return a list of tuples
    3. data is a tuple of tuples, e.g. past_key_values. Keep the tuple as it is and split each tuple in it and
       return a list of tuples of tuples
    (see documentation of ModelOutput)
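
    Example (illustrative; the tensor case):

    ```python
    >>> import torch

    >>> chunks = _split(torch.ones(8, 5), full_batch_size=8, split_size=4)
    >>> [tuple(chunk.shape) for chunk in chunks]
    [(4, 5), (4, 5)]
    ```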
    if data is None:
        return [None] * (full_batch_size // split_size)
    if isinstance(data, torch.Tensor):
        return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
    # New cache format: delegate to the cache's own batch_split
    elif isinstance(data, DynamicCache) or (
        isinstance(data, EncoderDecoderCache) and isinstance(data.self_attention_cache, DynamicCache)
    ):
        return data.batch_split(full_batch_size, split_size)
    elif isinstance(data, tuple):
        # The elements of the tuple may themselves be tuples (e.g. past_key_values in the legacy format)
        if isinstance(data[0], tuple):
            return [
                tuple(tuple(tensor[i : i + split_size] for tensor in inner_tuple) for inner_tuple in data)
                for i in range(0, full_batch_size, split_size)
            ]
        else:
            return [
                tuple(sub_tensor[i : i + split_size] for sub_tensor in data)
                for i in range(0, full_batch_size, split_size)
            ]
    else:
        raise TypeError(f"Unexpected attribute type: {type(data)}")


def _split_model_inputs(
    model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int, config: PretrainedConfig
) -> List[Union[ModelOutput, Dict]]:
    """
    Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
    size. The input object is a dict when it was prepared for the forward pass, and a ModelOutput when it was
    returned from the previous forward pass.
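
    Example (illustrative; a plain dict of prepared inputs is split into two half-batches — `config` is only
    consulted when the input contains `encoder_outputs`, so `None` suffices here):

    ```python
    >>> import torch

    >>> prepared = {"input_ids": torch.ones(4, 7, dtype=torch.long), "use_cache": True}
    >>> halves = _split_model_inputs(prepared, split_size=2, full_batch_size=4, config=None)
    >>> [half["input_ids"].shape[0] for half in halves]
    [2, 2]
    ```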
    # Edge case: if model_input is None, return a list of Nones (this happens e.g. with Whisper, where
    # encoder_outputs can be None)
    if model_input is None:
        return [model_input] * (full_batch_size // split_size)
    # Infer the class from the object
    model_output_cls = type(model_input)
    if (full_batch_size % split_size) != 0:
        raise ValueError("`full_batch_size` must be divisible by `split_size`")
    if split_size > full_batch_size:
        raise ValueError("`split_size` must be smaller or equal to `full_batch_size`")

    # Find all the dataclass fields (e.g. last_hidden_state, pooler_output, ...) present in the input
    keys = (
        model_input.__dataclass_fields__.keys() if hasattr(model_input, "__dataclass_fields__") else model_input.keys()
    )
    keys = [k for k in keys if k in model_input]
    # Booleans, `encoder_outputs` and `logits_to_keep` are replicated across splits; everything else is split
    bool_keys = [k for k in keys if isinstance(model_input[k], bool) or k == "cache_position"]
    keys_to_ignore = ["cache_position", "encoder_outputs", "logits_to_keep"]
    non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore]

    # Split the tensors and tuples of tensors
    data_split_list = [
        {k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys}
        for i in range(full_batch_size // split_size)
    ]
    # Bool values are the same and replicated for each split
    bool_data = {k: model_input[k] for k in bool_keys}
    # `encoder_outputs` is a ModelOutput and is split recursively
    if "encoder_outputs" in model_input:
        encoder_outputs_split = _split_model_inputs(
            model_input["encoder_outputs"], split_size, full_batch_size, config.get_text_config()
        )
        data_split_list = [
            {**data_split, "encoder_outputs": encoder_outputs_split[i]} for i, data_split in enumerate(data_split_list)
        ]
    # `logits_to_keep` is replicated for each split
    if "logits_to_keep" in model_input:
        data_split_list = [
            {**data_split, "logits_to_keep": model_input["logits_to_keep"]} for data_split in data_split_list
        ]

    # Convert each dictionary in the list to an object of the inferred class
    return [model_output_cls(**data_split, **bool_data) for data_split in data_split_list]


def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConfig) -> ModelOutput:
    """
    Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
    specific ModelOutput subclass from the list provided.
    if not model_outputs:
        raise ValueError("Input list is empty.")

    # Infer the class from the first object in the list
    model_output_cls = type(model_outputs[0])

    # Ensure all objects are of the same type
    if not all(isinstance(obj, model_output_cls) for obj in model_outputs):
        raise ValueError("All elements in the list should be of the same type.")

    def _concat(data):
        """
        Reverse of `_split`: concatenates matching attributes back along the batch dimension.
        """
        if any(attr is None for attr in data):
            return None
        if isinstance(data[0], torch.Tensor):
            return torch.cat(data, dim=0)
        elif isinstance(data[0], DynamicCache):
            return DynamicCache.from_batch_splits(data)
        elif isinstance(data[0], EncoderDecoderCache):
            return EncoderDecoderCache.from_batch_splits(data)
        elif isinstance(data[0], tuple):
            # The elements of the tuple may themselves be tuples (e.g. past_key_values in the legacy format)
            if isinstance(data[0][0], tuple):
                return tuple(
                    tuple(torch.cat([attr[i][j] for attr in data], dim=0) for j in range(len(data[0][0])))
                    for i in range(len(data[0]))
                )
            else:
                return tuple(torch.cat([attr[i] for attr in data], dim=0) for i in range(len(data[0])))
        elif isinstance(data[0], (int, float)):
            # If the elements are integers or floats, return a tensor
            return torch.tensor(data)
        else:
            raise TypeError(f"Unexpected attribute type: {type(data[0])}")

    # Gather attributes from all objects and concatenate them
    concatenated_data = {
        k: _concat([getattr(model_output, k) for model_output in model_outputs])
        for k in model_output_cls.__dataclass_fields__.keys()
    }

    # Return a new object of the inferred class with the concatenated attributes
    return model_output_cls(**concatenated_data)


def _relative_top_filter(
    scores: torch.FloatTensor,
    baseline_scores: torch.FloatTensor,
    relative_top: float = 0.1,
    filter_value: float = -float("Inf"),
    base_filter_value=-1e-3,
    min_tokens_to_keep: int = 1,
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
    """
    Reference: https://github.com/XiangLi1999/ContrastiveDecoding/blob/170e9142e92159c1237d731e240f5eb14aabf428/transformers/src/transformers/generation_logits_process.py#L235
    Apply filtering to only keep tokens with a probability above a certain threshold. The threshold is defined as `relative_top` * max probability in the distribution.
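
    Example (illustrative; with `relative_top=0.1`, tokens below one tenth of the maximum probability are filtered):

    ```python
    >>> import torch

    >>> scores = torch.log(torch.tensor([[0.5, 0.3, 0.01]]))
    >>> baseline = torch.log(torch.tensor([[0.2, 0.2, 0.6]]))
    >>> filtered, base_filtered = _relative_top_filter(scores, baseline, relative_top=0.1)
    >>> # log-threshold = max log-prob + log(0.1); only the 0.01-probability token falls below it
    >>> torch.isinf(filtered).tolist()
    [[False, False, True]]
    ```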
    """
    scores_normalized = scores.log_softmax(dim=-1)
    baseline_scores_normalized = baseline_scores.log_softmax(dim=-1)
    sorted_logits, sorted_indices = torch.sort(scores_normalized, descending=True)
    min_thresh = sorted_logits[..., min_tokens_to_keep - 1]
    probs_max = torch.max(scores_normalized, dim=-1).values
    probs_thresh = probs_max + np.log(relative_top)
    probs_thresh = torch.min(min_thresh, probs_thresh)
    probs_thresh = probs_thresh.unsqueeze(-1)
    baseline_scores_normalized[scores_normalized < probs_thresh] = base_filter_value
    scores_normalized[scores_normalized < probs_thresh] = filter_value
    return scores_normalized, baseline_scores_normalized


def _dola_select_contrast(
    candidate_premature_layers: List[int],
    candidate_premature_logits: Dict[int, torch.FloatTensor],
    final_logits: torch.FloatTensor,
) -> torch.FloatTensor:
    if len(candidate_premature_layers) == 1:
        base_logits = candidate_premature_logits[candidate_premature_layers[0]]
        final_logits, base_logits = _relative_top_filter(final_logits, base_logits)
        logits = final_logits - base_logits
        return logits

    # 1. Stack all premature layers' logits into a new dimension
    stacked_premature_layers = torch.stack(
        [candidate_premature_logits[i] for i in candidate_premature_layers], dim=0
    )

    # 2. Softmax values for the mature layer and all premature layers
    softmax_mature_layer = F.softmax(final_logits, dim=-1)
    softmax_premature_layers = F.softmax(stacked_premature_layers, dim=-1)

    # 3. M, the average distribution
    avg_dist = 0.5 * (softmax_mature_layer[None, :, :] + softmax_premature_layers)

    # 4. Log-softmax for the KL divergence
    log_softmax_mature_layer = F.log_softmax(final_logits, dim=-1)
    log_softmax_premature_layers = F.log_softmax(stacked_premature_layers, dim=-1)

    # 5. KL divergences and then the JS divergences
    kl1 = F.kl_div(log_softmax_mature_layer[None, :, :], avg_dist, reduction="none").mean(-1)
    kl2 = F.kl_div(log_softmax_premature_layers, avg_dist, reduction="none").mean(-1)
    js_divs = 0.5 * (kl1 + kl2)

    # 6. Reduce over the batch and pick the premature layer that diverges most from the mature layer
    js_divs = js_divs.mean(-1)
    premature_layer = candidate_premature_layers[int(js_divs.argmax().cpu().item())]

    base_logits = candidate_premature_logits[premature_layer]
    final_logits, base_logits = _relative_top_filter(final_logits, base_logits)
    logits = final_logits - base_logits
    return logits