import copy
import inspect
import warnings
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
import tensorflow as tf
from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice

from ..modeling_tf_outputs import TFCausalLMOutputWithPast, TFSeq2SeqLMOutput
from ..models.auto import (
    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
    TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
    TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
    TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
)
from ..tf_utils import shape_list, stable_softmax
from ..utils import ModelOutput, logging
from .configuration_utils import GenerationConfig
from .tf_logits_process import (
    TFForcedBOSTokenLogitsProcessor,
    TFForcedEOSTokenLogitsProcessor,
    TFForceTokensLogitsProcessor,
    TFLogitsProcessorList,
    TFMinLengthLogitsProcessor,
    TFNoBadWordsLogitsProcessor,
    TFNoRepeatNGramLogitsProcessor,
    TFRepetitionPenaltyLogitsProcessor,
    TFSuppressTokensAtBeginLogitsProcessor,
    TFSuppressTokensLogitsProcessor,
    TFTemperatureLogitsWarper,
    TFTopKLogitsWarper,
    TFTopPLogitsWarper,
)


logger = logging.get_logger(__name__)


@dataclass
class TFGreedySearchDecoderOnlyOutput(ModelOutput):
    """
    Base class for outputs of decoder-only generation models using greedy search.


    Args:
        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
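
    Example (an illustrative sketch only -- the checkpoint and prompt below are arbitrary):

    ```python
    >>> from transformers import AutoTokenizer, TFAutoModelForCausalLM

    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    >>> inputs = tokenizer(["Today is"], return_tensors="tf")
    >>> outputs = model.generate(**inputs, max_new_tokens=3, return_dict_in_generate=True, output_scores=True)
    >>> outputs.sequences.shape  # (batch_size, sequence_length)
    >>> len(outputs.scores)  # one tensor of shape (batch_size, config.vocab_size) per generated token
    ```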
    """

    sequences: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFGreedySearchEncoderDecoderOutput(ModelOutput):
    """
    Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention
    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)


    Args:
        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
    """

    sequences: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFSampleDecoderOnlyOutput(ModelOutput):
    """
    Base class for outputs of decoder-only generation models using sampling.


    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
            generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`.
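
    Example (an illustrative sketch; the checkpoint is arbitrary, and the two-integer `seed` follows the stateless
    `tf.random` convention described in `generate`):

    ```python
    >>> from transformers import AutoTokenizer, TFAutoModelForCausalLM

    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    >>> inputs = tokenizer(["Today is"], return_tensors="tf")
    >>> outputs = model.generate(
    ...     **inputs, do_sample=True, seed=[42, 0], max_new_tokens=3, return_dict_in_generate=True, output_scores=True
    ... )
    >>> outputs.sequences.shape  # (batch_size*num_return_sequences, sequence_length)
    ```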
    """

    sequences: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFSampleEncoderDecoderOutput(ModelOutput):
    """
    Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of
    the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states
    attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)


    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
            generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size*num_return_sequences,
            num_heads, sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size*num_return_sequences, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`.
    """

    sequences: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFBeamSearchDecoderOnlyOutput(ModelOutput):
    """
    Base class for outputs of decoder-only generation models using beam search.

    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
            beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
    """

    sequences: Optional[tf.Tensor] = None
    sequences_scores: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    beam_indices: Optional[tf.Tensor] = None
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFBeamSearchEncoderDecoderOutput(ModelOutput):
    """
    Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights
    of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states
    attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)

    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
            beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
            sequence_length)`.
        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
    """

    sequences: Optional[tf.Tensor] = None
    sequences_scores: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    beam_indices: Optional[tf.Tensor] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFBeamSampleDecoderOnlyOutput(ModelOutput):
    """
    Base class for outputs of decoder-only generation models using beam sample.

    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
            beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
    """

    sequences: Optional[tf.Tensor] = None
    sequences_scores: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    beam_indices: Optional[tf.Tensor] = None
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFBeamSampleEncoderDecoderOutput(ModelOutput):
    """
    Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention
    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)

    Args:
        sequences (`tf.Tensor` of shape `(batch_size*num_beams, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
            softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this
            beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`tf.Tensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `tf.Tensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size*num_beams, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
    """

    sequences: Optional[tf.Tensor] = None
    sequences_scores: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    beam_indices: Optional[tf.Tensor] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFContrastiveSearchDecoderOnlyOutput(ModelOutput):
    """
    Base class for outputs of decoder-only generation models using contrastive search.

    Args:
        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
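
    Example (an illustrative sketch; `penalty_alpha > 0` together with `top_k > 1` is what routes `generate` to
    contrastive search, and the specific values below are arbitrary):

    ```python
    >>> from transformers import AutoTokenizer, TFAutoModelForCausalLM

    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    >>> inputs = tokenizer(["Today is"], return_tensors="tf")
    >>> outputs = model.generate(
    ...     **inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=3, return_dict_in_generate=True, output_scores=True
    ... )
    >>> outputs.sequences.shape  # (batch_size, sequence_length)
    ```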
    """

    sequences: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


@dataclass
class TFContrastiveSearchEncoderDecoderOutput(ModelOutput):
    """
    Base class for outputs of encoder-decoder generation models using contrastive search. Hidden states and attention
    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)

    Args:
        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each
            generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
    """

    sequences: Optional[tf.Tensor] = None
    scores: Optional[Tuple[tf.Tensor]] = None
    encoder_attentions: Optional[Tuple[tf.Tensor]] = None
    encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
    decoder_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    cross_attentions: Optional[Tuple[Tuple[tf.Tensor]]] = None
    decoder_hidden_states: Optional[Tuple[Tuple[tf.Tensor]]] = None


TFGreedySearchOutput = Union[TFGreedySearchEncoderDecoderOutput, TFGreedySearchDecoderOnlyOutput]
TFSampleOutput = Union[TFSampleEncoderDecoderOutput, TFSampleDecoderOnlyOutput]
TFBeamSearchOutput = Union[TFBeamSearchEncoderDecoderOutput, TFBeamSearchDecoderOnlyOutput]
TFBeamSampleOutput = Union[TFBeamSampleEncoderDecoderOutput, TFBeamSampleDecoderOnlyOutput]
TFContrastiveSearchOutput = Union[TFContrastiveSearchEncoderDecoderOutput, TFContrastiveSearchDecoderOnlyOutput]
TFGenerateOutput = Union[
    TFGreedySearchOutput, TFSampleOutput, TFBeamSearchOutput, TFBeamSampleOutput, TFContrastiveSearchOutput
]


class TFGenerationMixin:
    """
    A class containing all of the functions supporting generation, to be used as a mixin in [`TFPreTrainedModel`].

    The class exposes [`~generation.TFGenerationMixin.generate`], which can be used for:
        - *greedy decoding* by calling [`~generation.TFGenerationMixin.greedy_search`] if `num_beams=1` and
          `do_sample=False`
        - *contrastive search* by calling [`~generation.TFGenerationMixin.contrastive_search`] if `penalty_alpha>0` and
          `top_k>1`
        - *multinomial sampling* by calling [`~generation.TFGenerationMixin.sample`] if `num_beams=1` and
          `do_sample=True`
        - *beam-search decoding* by calling [`~generation.TFGenerationMixin.beam_search`] if `num_beams>1`

    You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
    learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
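
    A minimal sketch of how these strategies are selected through `generate` arguments (the checkpoint and parameter
    values below are illustrative only):

    ```python
    >>> from transformers import AutoTokenizer, TFAutoModelForCausalLM

    >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    >>> inputs = tokenizer(["Today is"], return_tensors="tf")

    >>> greedy = model.generate(**inputs, max_new_tokens=5)  # num_beams=1, do_sample=False -> greedy_search
    >>> sampled = model.generate(**inputs, max_new_tokens=5, do_sample=True)  # num_beams=1 -> sample
    >>> beams = model.generate(**inputs, max_new_tokens=5, num_beams=4)  # num_beams>1, do_sample=False -> beam_search
    ```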
    """

    _seed_generator = None

    @property
    def seed_generator(self):
        warnings.warn("`seed_generator` is deprecated and will be removed in a future version.", UserWarning)
        if self._seed_generator is None:
            self._seed_generator = tf.random.Generator.from_non_deterministic_state()
        return self._seed_generator

    supports_xla_generation = True

    def prepare_inputs_for_generation(self, *args, **kwargs):
        raise NotImplementedError(
            "A model class needs to define a `prepare_inputs_for_generation` method in order to use `generate`."
        )

    def compute_transition_scores(
        self,
        sequences: tf.Tensor,
        scores: Tuple[tf.Tensor],
        beam_indices: Optional[tf.Tensor] = None,
        normalize_logits: bool = False,
    ) -> tf.Tensor:
        """
        Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
        used). This is a convenient method to quickly obtain the scores of the selected tokens at generation time.

        Parameters:
            sequences (`tf.Tensor`):
                The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
                shorter if all batches finished early due to the `eos_token_id`.
            scores (`tuple(tf.Tensor)`):
                Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
                of log probabilities of tokens conditioned on log softmax of previously generated tokens. Tuple of
                `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token), with each
                tensor of shape `(batch_size*num_beams, config.vocab_size)`.
            beam_indices (`tf.Tensor`, *optional*):
                Beam indices of generated token id at each generation step. `tf.Tensor` of shape
                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
                generate-time.
            normalize_logits (`bool`, *optional*, defaults to `False`):
                Whether to normalize the logits (which, for legacy reasons, may be unnormalized).

        Return:
            `tf.Tensor`: A `tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
                the transition scores (logits)

        Examples:

        ```python
        >>> from transformers import GPT2Tokenizer, TFAutoModelForCausalLM
        >>> import numpy as np

        >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> tokenizer.pad_token_id = tokenizer.eos_token_id
        >>> inputs = tokenizer(["Today is"], return_tensors="tf")

        >>> # Example 1: Print the scores for each token generated with Greedy Search
        >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, normalize_logits=True
        ... )
        >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
        >>> # encoder-decoder models, like BART or T5.
        >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
        >>> generated_tokens = outputs.sequences[:, input_length:]
        >>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
        ...     # | token | token string | logits | probability
        ...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
        |   262 |  the     | -1.414 | 24.33%
        |  1110 |  day     | -2.609 | 7.36%
        |   618 |  when    | -2.010 | 13.40%
        |   356 |  we      | -1.859 | 15.58%
        |   460 |  can     | -2.508 | 8.14%

        >>> # Example 2: Reconstruct the sequence scores from Beam Search
        >>> outputs = model.generate(
        ...     **inputs,
        ...     max_new_tokens=5,
        ...     num_beams=4,
        ...     num_return_sequences=4,
        ...     return_dict_in_generate=True,
        ...     output_scores=True,
        ... )
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
        ... )
        >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
        >>> # Tip: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
        >>> # use case, you might want to recompute it with `normalize_logits=True`.
        >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
        >>> length_penalty = model.generation_config.length_penalty
        >>> reconstructed_scores = np.sum(transition_scores, axis=1) / (output_length**length_penalty)
        >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
        True
        ```"""
        # 1. In absence of `beam_indices`, we can assume that we come from e.g. greedy search, which is equivalent
        # to a beam search approach where the first (and only) beam is always selected
        if beam_indices is None:
            beam_indices = tf.tile(tf.expand_dims(tf.range(scores[0].shape[0]), axis=1), [1, len(scores)])

        # 2. reshape scores as [batch_size, vocab_size, # generation steps] with # generation steps being
        # seq_len - input_length
        scores = tf.transpose(tf.reshape(tf.stack(scores), (len(scores), -1)))
        scores = tf.reshape(scores, (-1, self.config.vocab_size, scores.shape[-1]))

        # 3. Optionally normalize the logits (across the vocab dimension)
        if normalize_logits:
            scores = tf.nn.log_softmax(scores, axis=1)

        # 4. cut beam_indices to longest beam length
        beam_indices_mask = beam_indices < 0
        max_beam_length = tf.math.reduce_max(
            tf.math.reduce_sum((1 - tf.cast(beam_indices_mask, dtype=tf.int32)), axis=-1)
        )
        beam_indices = beam_indices[:, -max_beam_length:]
        beam_indices_mask = beam_indices_mask[:, -max_beam_length:]

        # 5. Set indices of beams that finished early to 0; such indices will be masked correctly afterwards
        beam_indices = tf.where(beam_indices_mask, 0, beam_indices)

        # 6. Define which indices contributed to scores
        cut_idx = sequences.shape[-1] - max_beam_length
        token_indices = sequences[:, cut_idx:]
        gen_step_idx = tf.broadcast_to(tf.range(scores.shape[-1]), token_indices.shape)
        indices = tf.stack([beam_indices, token_indices, gen_step_idx], axis=-1)

        # 7. Compute scores
        transition_scores = tf.gather_nd(scores, indices)

        # 8. Mask out transition_scores of beams that stopped early
        transition_scores = tf.where(beam_indices_mask, 0, transition_scores)

        return transition_scores

    def _validate_model_class(self):
        """
        Confirms that the model class is compatible with generation. If not, raises an exception that points to the
        right class to use.
        """
        if not self.can_generate():
            generate_compatible_mappings = [
                TF_MODEL_FOR_CAUSAL_LM_MAPPING,
                TF_MODEL_FOR_VISION_2_SEQ_MAPPING,
                TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
                TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
            ]
            generate_compatible_classes = set()
            for model_mapping in generate_compatible_mappings:
                supported_models = model_mapping.get(type(self.config), default=None)
                if supported_models is not None:
                    generate_compatible_classes.add(supported_models.__name__)
            exception_message = (
                f"The current model class ({self.__class__.__name__}) is not compatible with `.generate()`, as "
                "it doesn't have a language model head."
            )
            if generate_compatible_classes:
                exception_message += f" Please use one of the following classes instead: {generate_compatible_classes}"
            raise TypeError(exception_message)

    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        """Validates model kwargs for generation. Generate argument typos will also be caught here."""
        # Excludes arguments that are handled before calling any model function
        if self.config.is_encoder_decoder:
            for key in ["decoder_input_ids"]:
                model_kwargs.pop(key, None)

        unused_model_args = []
        model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
        if "kwargs" in model_args or "model_kwargs" in model_args:
            model_args |= set(inspect.signature(self.call).parameters)
        for key, value in model_kwargs.items():
            if value is not None and key not in model_args:
                unused_model_args.append(key)

        if unused_model_args:
            raise ValueError(
                f"The following `model_kwargs` are not used by the model: {unused_model_args} (note: typos in the"
                " generate arguments will also show up in this list)"
            )

    def generate(
        self,
        inputs: Optional[tf.Tensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        logits_processor: Optional[TFLogitsProcessorList] = None,
        seed=None,
        **kwargs,
    ) -> Union[TFGenerateOutput, tf.Tensor]:
        r"""
        Generates sequences of token ids for models with a language modeling head.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate, e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            inputs (`tf.Tensor` of varying shape depending on the modality, *optional*):
                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logit processor is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            seed (`List[int]`, *optional*):
                Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
                `seed` argument from stateless functions in `tf.random`.
            kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`] or `tf.Tensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when
            `config.return_dict_in_generate=True`) or a `tf.Tensor`.

                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.TFGreedySearchDecoderOnlyOutput`],
                    - [`~generation.TFSampleDecoderOnlyOutput`],
                    - [`~generation.TFBeamSearchDecoderOnlyOutput`],
                    - [`~generation.TFBeamSampleDecoderOnlyOutput`]

                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.TFGreedySearchEncoderDecoderOutput`],
                    - [`~generation.TFSampleEncoderDecoderOutput`],
                    - [`~generation.TFBeamSearchEncoderDecoderOutput`],
                    - [`~generation.TFBeamSampleEncoderDecoderOutput`]
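
        Examples (an illustrative sketch; the seq2seq checkpoint and argument values below are arbitrary):

        ```python
        >>> from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
        >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
        >>> inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="tf")

        >>> # any `generation_config` attribute can be overridden ad hoc through **kwargs
        >>> outputs = model.generate(**inputs, num_beams=4, max_new_tokens=20)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```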

        """
        # 1. Set generation parameters if not already defined
        self._validate_model_class()

        if generation_config is None:
            # legacy: users may modify the model configuration to control generation -- update the generation config
            # model attribute accordingly, if it was created from the model config
            if self.generation_config._from_model_config and self.generation_config._original_object_hash == hash(
                self.generation_config
            ):
                new_generation_config = GenerationConfig.from_model_config(self.config)
                if new_generation_config != self.generation_config:
                    warnings.warn(
                        "You have modified the pretrained model configuration to control generation. This is a"
                        " deprecated strategy to control generation and will be removed soon, in a future version."
                        " Please use and modify the model generation configuration (see"
                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )"
                    )
                    self.generation_config = new_generation_config
            generation_config = self.generation_config

        generation_config = copy.deepcopy(generation_config)
        model_kwargs = generation_config.update(**kwargs)  # All unused kwargs must be model kwargs
        self._validate_model_kwargs(model_kwargs.copy())

        # 2. Cast input dtypes to tf.int32 unless they're floats (which happens for e.g. `inputs_embeds`)
        if inputs is not None:
            if isinstance(inputs, tf.Tensor) and inputs.dtype.is_floating:
                pass
            elif isinstance(inputs, np.ndarray) and np.issubdtype(inputs.dtype, np.floating):
                pass
            else:
                inputs = tf.cast(inputs, tf.int32)
        if model_kwargs.get("attention_mask") is not None:
            model_kwargs["attention_mask"] = tf.cast(model_kwargs["attention_mask"], tf.int32)
        if "decoder_input_ids" in model_kwargs:
            if (
                isinstance(model_kwargs["decoder_input_ids"], tf.Tensor)
                and model_kwargs["decoder_input_ids"].dtype.is_floating
            ):
                pass
            elif isinstance(model_kwargs["decoder_input_ids"], np.ndarray) and np.issubdtype(
                model_kwargs["decoder_input_ids"].dtype, np.floating
            ):
                pass
            else:
                model_kwargs["decoder_input_ids"] = tf.cast(model_kwargs["decoder_input_ids"], tf.int32)

        # 3. Set generation parameters if not already defined
        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()

        if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
            if model_kwargs.get("attention_mask") is None:
                logger.warning(
                    "The attention mask and the pad token id were not set. As a consequence, you may observe "
                    "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
                )
            eos_token_id = generation_config.eos_token_id
            if isinstance(eos_token_id, list):
                eos_token_id = eos_token_id[0]
            generation_config.pad_token_id = eos_token_id

        use_xla = not tf.executing_eagerly()
        if use_xla and not self.supports_xla_generation:
            raise ValueError(
                "The selected model does not support Graph mode nor XLA generation (e.g. from tf.function())"
            )

        # 4. Define model inputs
        inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
            inputs, generation_config.bos_token_id, model_kwargs
        )
        # inputs_ids now has to be defined and cannot be None anymore
        batch_size = shape_list(inputs_tensor)[0]

        # 5. Prepare other model kwargs
        model_kwargs["output_attentions"] = generation_config.output_attentions
        model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
        model_kwargs["use_cache"] = generation_config.use_cache

        accepts_attention_mask = "attention_mask" in set(inspect.signature(self.call).parameters.keys())
        requires_attention_mask = "encoder_outputs" not in model_kwargs

        if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask:
            model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
                inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id
            )

        # decoder-only models should use left-padding for generation
        if not self.config.is_encoder_decoder:
            if generation_config.pad_token_id is not None and tf.math.reduce_any(
                inputs_tensor[:, -1] == generation_config.pad_token_id
            ):
                logger.warning(
                    "A decoder-only architecture is being used, but right-padding was detected! For correct "
                    "generation results, please set `padding_side='left'` when initializing the tokenizer."
                )
        if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
            # if model is encoder decoder, encoder_outputs are created and added to `model_kwargs`
            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
                inputs_tensor, model_kwargs, model_input_name
            )

        # 6. Prepare model inputs which will be used for auto-regressive generation
        if self.config.is_encoder_decoder:
            input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
                batch_size=batch_size,
                model_input_name=model_input_name,
                model_kwargs=model_kwargs,
                decoder_start_token_id=generation_config.decoder_start_token_id,
                bos_token_id=generation_config.bos_token_id,
            )
        else:
            input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids")

        # 7. Prepare `max_length` depending on other stopping criteria.
        input_ids_seq_length = shape_list(input_ids)[-1]
        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
        if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20:
            # 20 is the default max_length of the generation config
            warnings.warn(
                f"Using the model-agnostic default `max_length` (={generation_config.max_length}) to control the "
                "generation length. We recommend setting `max_new_tokens` to control the maximum length of the "
                "generation.",
                UserWarning,
            )
        elif generation_config.max_new_tokens is not None:
            if not has_default_max_length and generation_config.max_length is not None:
                logger.warning(
                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
                    "Please refer to the documentation for more information. "
                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
                )
            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length

        # If the input length is a tensor (i.e. dynamic length), skip length checks
        if not isinstance(input_ids_seq_length, tf.Tensor):
            if (
                generation_config.min_length is not None
                and generation_config.min_length > generation_config.max_length
            ):
                raise ValueError(
                    f"Unfeasable length constraints: the minimum length ({generation_config.min_length}) is larger"
                    f" than the maximum length ({generation_config.max_length})"
                )
            if input_ids_seq_length >= generation_config.max_length:
                input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
                logger.warning(
                    f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
                    f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
                    " increasing `max_new_tokens`."
                )

        # 8. determine generation mode
        is_contrastive_search_gen_mode = (
            generation_config.top_k is not None
            and generation_config.top_k > 1
            and generation_config.do_sample is False
            and generation_config.penalty_alpha is not None
            and generation_config.penalty_alpha > 0
        )
        is_greedy_gen_mode = (
            not is_contrastive_search_gen_mode
            and (generation_config.num_beams == 1)
            and generation_config.do_sample is False
        )
        is_beam_gen_mode = (
            not is_contrastive_search_gen_mode
            and (generation_config.num_beams > 1)
            and generation_config.do_sample is False
        )
        is_sample_gen_mode = (generation_config.num_beams == 1) and generation_config.do_sample is True
        is_beam_sample_gen_mode = (generation_config.num_beams > 1) and generation_config.do_sample is True

        # 9. prepare distribution pre_processing samplers
        logits_processor = self._get_logits_processor(
            generation_config=generation_config,
            input_ids_seq_length=input_ids_seq_length,
            logits_processor=logits_processor,
        )

        # 10. go into different generation modes
        if is_greedy_gen_mode:
            if generation_config.num_return_sequences > 1:
                raise ValueError(
                    f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
                    " greedy search."
                )
            # 11. run greedy search
            return self.greedy_search(
                input_ids,
                max_length=generation_config.max_length,
                pad_token_id=generation_config.pad_token_id,
                eos_token_id=generation_config.eos_token_id,
                logits_processor=logits_processor,
                output_scores=generation_config.output_scores,
                return_dict_in_generate=generation_config.return_dict_in_generate,
                **model_kwargs,
            )
        elif is_contrastive_search_gen_mode:
            if generation_config.num_return_sequences > 1:
                raise ValueError(
                    f"num_return_sequences has to be 1, but is {generation_config.num_return_sequences} when doing"
                    " contrastive search."
                )
            # 11. run contrastive search
            return self.contrastive_search(
                input_ids,
                top_k=generation_config.top_k,
                penalty_alpha=generation_config.penalty_alpha,
                logits_processor=logits_processor,
                max_length=generation_config.max_length,
                pad_token_id=generation_config.pad_token_id,
                eos_token_id=generation_config.eos_token_id,
                output_scores=generation_config.output_scores,
                return_dict_in_generate=generation_config.return_dict_in_generate,
                **model_kwargs,
            )
        elif is_sample_gen_mode:
            # 11. prepare logits warper
            logits_warper = self._get_logits_warper(generation_config=generation_config)

            # 12. expand input_ids with `num_return_sequences` additional sequences per batch
            input_ids, model_kwargs = self._expand_inputs_for_generation(
                input_ids=input_ids,
                expand_size=generation_config.num_return_sequences,
                is_encoder_decoder=self.config.is_encoder_decoder,
                **model_kwargs,
            )

            # 13. run sample
            return self.sample(
                input_ids,
                logits_processor=logits_processor,
                logits_warper=logits_warper,
                max_length=generation_config.max_length,
                pad_token_id=generation_config.pad_token_id,
                eos_token_id=generation_config.eos_token_id,
                seed=seed,
                output_scores=generation_config.output_scores,
                return_dict_in_generate=generation_config.return_dict_in_generate,
                **model_kwargs,
            )
        elif is_beam_gen_mode:
            if generation_config.num_beams < generation_config.num_return_sequences:
                raise ValueError(
                    "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
                    f" num_return_sequences, got {generation_config.num_beams} and"
                    f" {generation_config.num_return_sequences} (respectively)"
                )

            # 11. broadcast inputs to the desired number of beams
            input_ids, model_kwargs = self._expand_inputs_for_generation(
                input_ids=input_ids,
                expand_size=generation_config.num_beams,
                is_encoder_decoder=self.config.is_encoder_decoder,
                expand_in_new_axis=True,
                **model_kwargs,
            )

            # 12. run beam search
            return self.beam_search(
                input_ids,
                max_length=generation_config.max_length,
                pad_token_id=generation_config.pad_token_id,
                eos_token_id=generation_config.eos_token_id,
                length_penalty=generation_config.length_penalty,
                early_stopping=generation_config.early_stopping,
                logits_processor=logits_processor,
                output_scores=generation_config.output_scores,
                return_dict_in_generate=generation_config.return_dict_in_generate,
                num_return_sequences=generation_config.num_return_sequences,
                **model_kwargs,
            )
        elif is_beam_sample_gen_mode:
            if generation_config.num_beams < generation_config.num_return_sequences:
                raise ValueError(
                    "Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
                    f" num_return_sequences, got {generation_config.num_beams} and"
                    f" {generation_config.num_return_sequences} (respectively)"
                )

            # 11. prepare logits warper
            logits_warper = self._get_logits_warper(generation_config=generation_config)

            # 12. broadcast inputs to the desired number of beams
            input_ids, model_kwargs = self._expand_inputs_for_generation(
                input_ids=input_ids,
                expand_size=generation_config.num_beams,
                is_encoder_decoder=self.config.is_encoder_decoder,
                expand_in_new_axis=True,
                **model_kwargs,
            )

            # 13. run beam sample (beam search with sampling)
            return self.beam_search(
                input_ids,
                do_sample=True,
                max_length=generation_config.max_length,
                pad_token_id=generation_config.pad_token_id,
                eos_token_id=generation_config.eos_token_id,
                length_penalty=generation_config.length_penalty,
                early_stopping=generation_config.early_stopping,
                logits_processor=logits_processor,
                logits_warper=logits_warper,
                output_scores=generation_config.output_scores,
                return_dict_in_generate=generation_config.return_dict_in_generate,
                num_return_sequences=generation_config.num_return_sequences,
                **model_kwargs,
            )

    def _prepare_attention_mask_for_generation(
        self,
        inputs: tf.Tensor,
        pad_token_id: Optional[int],
        eos_token_id: Optional[int],
    ) -> tf.Tensor:
        is_input_ids = len(inputs.shape) == 2 and inputs.dtype in (tf.int32, tf.int64)
        is_pad_token_in_inputs = (pad_token_id is not None) and tf.math.reduce_any(inputs == pad_token_id)
        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id != eos_token_id)

        # Check if input is input_ids and padded -> only then is attention_mask defined
        if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
            return tf.cast(tf.math.not_equal(inputs, pad_token_id), dtype=tf.int32)
        else:
            return tf.ones(inputs.shape[:2], dtype=tf.int32)

    def _prepare_encoder_decoder_kwargs_for_generation(
        self, inputs_tensor: tf.Tensor, model_kwargs, model_input_name: Optional[str] = None
    ) -> Dict[str, Any]:
        # 1. get encoder and store encoder outputs
        encoder = self.get_encoder()

        # 2. prepare encoder args and encoder kwargs from model kwargs
        irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"]
        encoder_kwargs = {
            argument: value
            for argument, value in model_kwargs.items()
            if not any(argument.startswith(p) for p in irrelevant_prefix)
        }
        encoder_signature = set(inspect.signature(encoder.call).parameters)
        encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature
        if not encoder_accepts_wildcard:
            encoder_kwargs = {
                argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature
            }

        # 3. make sure that encoder returns `ModelOutput`
        encoder_kwargs["return_dict"] = True
        encoder_kwargs[model_input_name] = inputs_tensor
        if model_input_name != self.main_input_name:  # in this case, input_ids is moved to encoder_kwargs
            encoder_kwargs[self.main_input_name] = None
        encoder_outputs = encoder(**encoder_kwargs)
        model_kwargs["encoder_outputs"] = encoder_outputs

        return model_kwargs

    def _prepare_decoder_input_ids_for_generation(
        self,
        batch_size: int,
        model_input_name: str,
        model_kwargs: Dict[str, tf.Tensor],
        decoder_start_token_id: int = None,
        bos_token_id: int = None,
    ) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]:
        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
        # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,
        # we also allow the user to pass it under `input_ids`, if the encoder does not use it as the main input.
        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
            decoder_input_ids = model_kwargs.pop("decoder_input_ids")
        elif "input_ids" in model_kwargs and model_input_name != "input_ids":
            decoder_input_ids = model_kwargs.pop("input_ids")
        else:
            decoder_input_ids = None

        # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that.
        decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
        decoder_input_ids_start = tf.ones((batch_size, 1), dtype=tf.int32) * decoder_start_token_id

        # no user input -> use decoder_start_token_id as decoder_input_ids
        if decoder_input_ids is None:
            decoder_input_ids = decoder_input_ids_start
        # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust
        # decoder_attention_mask if provided)
        elif tf.reduce_all(decoder_input_ids[:, 0] != decoder_start_token_id):
            decoder_input_ids = tf.concat([decoder_input_ids_start, decoder_input_ids], axis=-1)
            if "decoder_attention_mask" in model_kwargs:
                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
                decoder_attention_mask = tf.concat(
                    (tf.ones_like(decoder_attention_mask)[:, :1], decoder_attention_mask),
                    axis=-1,
                )
                model_kwargs["decoder_attention_mask"] = decoder_attention_mask

        return decoder_input_ids, model_kwargs

    def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int:
        # retrieve decoder_start_token_id for encoder-decoder models, falling back to bos_token_id if necessary
        decoder_start_token_id = (
            decoder_start_token_id
            if decoder_start_token_id is not None
            else self.generation_config.decoder_start_token_id
        )
        bos_token_id = bos_token_id if bos_token_id is not None else self.generation_config.bos_token_id

        if decoder_start_token_id is not None:
            return decoder_start_token_id
        elif bos_token_id is not None:
            return bos_token_id
        raise ValueError(
            "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation."
        )

    @staticmethod
    def _expand_inputs_for_generation(
        expand_size: int = 1,
        is_encoder_decoder: bool = False,
        input_ids: Optional[tf.Tensor] = None,
        expand_in_new_axis: bool = False,
        **model_kwargs,
    ) -> Tuple[tf.Tensor, Dict[str, Any]]:
        """
        Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...] or [batch_size, expand_size, ...],
        depending on `expand_in_new_axis`. Beam-based approaches expect this function to be used with
        `expand_in_new_axis=True`
        tensorc                    sL    rt | }t| d d d f |d ft|dd   S tj| ddS )Nr   r   r\   )r   r.   rs   tuplerepeat)r  rd   )r   r   r1   r2   _expand_tensor|  s   0zGTFGenerationMixin._expand_inputs_for_generation.<locals>._expand_tensorc                    s:   | D ]}| | d urt | | tjr | | | |< q| S r   )r   r.   r/   )dict_to_expandr   )r  r1   r2   _expand_dict_for_generation  s
   zTTFGenerationMixin._expand_inputs_for_generation.<locals>._expand_dict_for_generationNr   zMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)r.   r/   r   r   )r   r   r   r   r   r  r1   )r  r   r   r2   r   n  s   z/TFGenerationMixin._expand_inputs_for_generationc              
      s2  | j jrt| drt| jdr| jj| jkr| jj n| j  fdd| D }| d}|durH|durHtd| d  d| d	  d
	|durN|} dkrd|v r| j js~dtt	
| jj v }|sstd| jj d| j|||d|d< n|durtd|d d} | |||}| |fS )zT
        This function extracts the model-specific `inputs` for generation.
        r   r   c                    s&   i | ]\}}|d us| kr||qS r   r1   )r   kv
input_namer1   r2   r     s   & z;TFGenerationMixin._prepare_model_inputs.<locals>.<dictcomp>Nz
`inputs`: z` were passed alongside z0 which is not allowed. Make sure to either pass z or z=...r   inputs_embedszAYou passed `inputs_embeds` to `.generate()`, but the model class z doesn't have its forwarding implemented. See the GPT2 implementation for an example (https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!)r   zMYou passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.)ri   r   hasattrr   r   r   r   r   r   r   r   rY   r   r   r   r*   *_maybe_initialize_input_ids_for_generation)rT   r   r   r   inputs_kwarghas_inputs_embeds_forwardingr1   r  r2   r     sL   


z'TFGenerationMixin._prepare_model_inputsc                 C   s   |dur|S | d}| jjr%|dur%|jjdd }tj|tjdd S |du r-tdd}|	 D ]}t
|tjrB|jd } nq3tj|dftjd| S )	z3Initializes input ids for generation, if necessary.Nr   r^   r_   izB`bos_token_id` has to be defined when no `input_ids` are provided.r   r   )r   ri   r   last_hidden_staterd   r.   r   rq   r   valuesr   r/   )rT   r   r   r   r   rd   r   r   r1   r1   r2   r    s   

z<TFGenerationMixin._maybe_initialize_input_ids_for_generationoutputsc                 C   s:   d }d| v r| j }|S d| v r| j}|S d| v r| j}|S )Npast_key_valuesmemspast_buckets_states)r  r  r  )r  r  r1   r1   r2   _extract_past_from_model_output  s   z1TFGenerationMixin._extract_past_from_model_outputc                 C   sT   |  ||d< |s(d|v r(|d }tj|tjt|d dftjdgdd|d< |S )Nr  r   r   r   r_   r^   r\   )r  r.   r  r   r   rq   )rT   r  r   r   r   r1   r1   r2   #_update_model_kwargs_for_generation  s    
z5TFGenerationMixin._update_model_kwargs_for_generationr   model_outputscur_lenr   
batch_axisc                    s    fdd} fdd}	dd }
dd }|  |}|d u r)td	t|  d
|dd d u}|sF|| d }||||}|
|||}n|d }|	|||}||||}|| t||d< |S )Nc                    s   |r*t jt j dft jdt j |ft jdt j dft jdgdd}d|i}|S | d}t j|t j |f|jdt j df|jdgdd}d|i}|S )zainitializes the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`r   r_   r\   r  r   )r.   r  r   rq   zerosr   r`   )r   num_padding_valuesr   r  maskr   r   r1   r2   _initialize_attention  s(   
zXTFGenerationMixin._update_model_kwargs_for_xla_generation.<locals>._initialize_attentionc           	         s   t jddgt jd| }|r)| d}t j df|jd}t|||}d|i}|S | d}t j df|jd}t|||}d|i}|S )z]updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`r   r   r_   r  r   )r.   constantrq   r   r   r`   r   )	r   new_past_indexr   update_startr  #decoder_attention_mask_update_slicer$  r   attention_mask_update_slicer%  r1   r2   _update_attention4  s   

zTTFGenerationMixin._update_model_kwargs_for_xla_generation.<locals>._update_attentionc                 S   s   |dkrCt jddgddgd|gddggt jd}d}| D ]$}t|}tt|dd D ]}t || |||< q,|t|f7 }q|S t jddgg|gdd	}t| }tt| D ]}t | | |||< qY|S )
zNinitialize past_key_values with zeros -- the structure depends on `batch_axis`r   r_   r1   Nr	      r   )   r	   )rz   updatesrd   )	r.   r'  rq   r   rc   re   padr	  
scatter_nd)r  r#  r!  padding_valuesnew_past
past_layernew_past_layerir1   r1   r2   _initialize_pastE  s   (zSTFGenerationMixin._update_model_kwargs_for_xla_generation.<locals>._initialize_pastc           	      S   s6  |dkrRt g d}d}| D ]@}t|}tt|d d D ](}|| d d d d dd f }t|| d d d d d df ||| ||< q|t|f7 }q|S t g d}dd tt| D }tt| D ].}| | d d d d d d dd f }t| | d d d d d d d df ||| ||< qj|S )	Nr   )r   r   r   r   r1   r	   r^   )r   r   r   r   r   c                 S   s   g | ]}d qS r   r1   )r   _r1   r1   r2   
<listcomp>f      zcTFGenerationMixin._update_model_kwargs_for_xla_generation.<locals>._update_past.<locals>.<listcomp>)r.   r'  r   rc   re   r   r	  )	r  r(  r!  slice_start_baser3  r4  r5  r6  update_slicer1   r1   r2   _update_pastV  s(   $
$*
zOTFGenerationMixin._update_model_kwargs_for_xla_generation.<locals>._update_pastzPNo known `past_key_values variable` found in model outputs (model outputs keys: r   r  r   r	   )r  r   r   r   r   r   r	  )rT   r  r   r   r   r   r   r!  r&  r,  r7  r=  r  is_past_initializedr#  r$  r3  r(  r1   r%  r2   '_update_model_kwargs_for_xla_generation  s,   



    def _get_logits_warper(self, generation_config: GenerationConfig) -> TFLogitsProcessorList:
        """
        This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsWarper`]
        instances used for multinomial sampling.
        """
        warpers = TFLogitsProcessorList()

        # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
        # better score (i.e. keep len(generation_config.eos_token_id) + 1)
        if generation_config.num_beams > 1:
            if isinstance(generation_config.eos_token_id, list):
                min_tokens_to_keep = len(generation_config.eos_token_id) + 1
            else:
                min_tokens_to_keep = 2
        else:
            min_tokens_to_keep = 1

        if generation_config.temperature is not None and generation_config.temperature != 1.0:
            warpers.append(TFTemperatureLogitsWarper(generation_config.temperature))
        if generation_config.top_k is not None and generation_config.top_k != 0:
            warpers.append(TFTopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
        if generation_config.top_p is not None and generation_config.top_p < 1.0:
            warpers.append(TFTopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
        return warpers
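    # A hedged sketch of how the warpers compose (`_demo_warper_pipeline` and its
    # toy logits are illustrative, not part of the API). Each warper is applied in
    # the order it was appended: temperature scaling first, then top-k filtering:
    @staticmethod
    def _demo_warper_pipeline():
        warpers = TFLogitsProcessorList(
            [TFTemperatureLogitsWarper(0.7), TFTopKLogitsWarper(top_k=2, min_tokens_to_keep=1)]
        )
        input_ids = tf.constant([[0]])
        scores = tf.constant([[1.0, 2.0, 3.0, 4.0]])
        # all but the 2 highest-scoring tokens end up at -inf
        return warpers(input_ids, scores, cur_len=1)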

    def _get_logits_processor(
        self,
        generation_config: GenerationConfig,
        input_ids_seq_length: int,
        logits_processor: Optional[TFLogitsProcessorList],
    ) -> TFLogitsProcessorList:
        """
        This class returns a [`TFLogitsProcessorList`] list object that contains all relevant [`TFLogitsProcessor`]
        instances used to modify the scores of the language model head.
        """
        processors = TFLogitsProcessorList()

        # instantiate processors list
        if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0:
            processors.append(TFRepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty))
        if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0:
            processors.append(TFNoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size))
        if generation_config.bad_words_ids is not None:
            processors.append(
                TFNoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
            )
        if (
            generation_config.min_length is not None
            and generation_config.eos_token_id is not None
            and generation_config.min_length > 0
        ):
            processors.append(
                TFMinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id)
            )
        if generation_config.forced_bos_token_id is not None:
            processors.append(TFForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id))
        if generation_config.forced_eos_token_id is not None:
            processors.append(
                TFForcedEOSTokenLogitsProcessor(generation_config.max_length, generation_config.forced_eos_token_id)
            )
        if generation_config.suppress_tokens is not None:
            processors.append(TFSuppressTokensLogitsProcessor(generation_config.suppress_tokens))
        if generation_config.begin_suppress_tokens is not None:
            begin_index = input_ids_seq_length
            begin_index = (
                begin_index
                if (input_ids_seq_length > 1 or generation_config.forced_bos_token_id is None)
                else begin_index + 1
            )
            if generation_config.forced_decoder_ids is not None:
                begin_index += generation_config.forced_decoder_ids[-1][0]
            processors.append(
                TFSuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index)
            )
        if generation_config.forced_decoder_ids is not None:
            processors.append(TFForceTokensLogitsProcessor(generation_config.forced_decoder_ids))

        processors = self._merge_criteria_processor_list(processors, logits_processor)
        return processors
    def _merge_criteria_processor_list(
        self,
        default_list: TFLogitsProcessorList,
        custom_list: TFLogitsProcessorList,
    ) -> TFLogitsProcessorList:
        if len(custom_list) == 0:
            return default_list
        for default in default_list:
            for custom in custom_list:
                if type(custom) is type(default):
                    object_type = "logits processor"
                    raise ValueError(
                        f"A custom {object_type} of type {type(custom)} with values {custom} has been passed to"
                        f" `generate`, but it has already been created with the values {default}. {default} has been"
                        " created by passing the corresponding arguments to generate or by the model's config"
                        f" default values. If you just want to change the default values of {object_type} consider"
                        f" passing them as arguments to `generate` instead of using a custom {object_type}."
                    )
        default_list.extend(custom_list)
        return default_list

    def greedy_search(
        self,
        input_ids: tf.Tensor,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        logits_processor: Optional[TFLogitsProcessorList] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        **model_kwargs,
    ) -> Union[TFGreedySearchOutput, tf.Tensor]:
        r"""
        Generates sequences for models with a language modeling head using greedy decoding.

        Parameters:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`TFLogitsProcessorList`, *optional*):
                An instance of [`TFLogitsProcessorList`]. A list of instances of classes derived from [`TFLogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            max_length (`int`, *optional*, defaults to 20):
                The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `call` function of the model. If
                model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.TFGreedySearchDecoderOnlyOutput`], [`~generation.TFGreedySearchEncoderDecoderOutput`] or
            `tf.Tensor`: A `tf.Tensor` containing the generated tokens (default behaviour) or a
            [`~generation.TFGreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.TFGreedySearchEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     TFAutoModelForCausalLM,
        ...     TFLogitsProcessorList,
        ...     TFMinLengthLogitsProcessor,
        ... )

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id

        >>> input_prompt = "Today is a beautiful day, and"
        >>> input_ids = tokenizer(input_prompt, return_tensors="tf").input_ids

        >>> # instantiate logits processors
        >>> logits_processor = TFLogitsProcessorList(
        ...     [
        ...         TFMinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
        ...     ]
        ... )

        >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor)
        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ["Today is a beautiful day, and I'm so happy to be here. I'm so happy to"]
        ```"""
        # 1. init greedy_search values
        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
        max_length = max_length if max_length is not None else self.generation_config.max_length
        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
        output_attentions = (
            output_attentions if output_attentions is not None else self.generation_config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
        )
        return_dict_in_generate = (
            return_dict_in_generate
            if return_dict_in_generate is not None
            else self.generation_config.return_dict_in_generate
        )
        use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
        use_xla = not tf.executing_eagerly()
        # GPT2 and other models have a slightly different cache structure, with a different batch axis
        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
        cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
        # some models, like XLNet, need more than the last token in the presence of past_key_values
        needs_full_input = "use_mems" in set(inspect.signature(self.prepare_inputs_for_generation).parameters.keys())

        # 2. init `attentions`, `hidden_states`, and `scores` tuples
        scores = [] if (return_dict_in_generate and output_scores) else None
        decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None
        cross_attentions = [] if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None

        # 3. init tensors to use for "xla-compileable" generate function
        batch_size, cur_len = shape_list(input_ids)

        # initialize `generated` (`input_ids` padded with `pad_token_id`) and `finished_sequences`
        input_ids_padding = tf.ones((batch_size, max_length - cur_len), dtype=tf.int32) * (pad_token_id or 0)
        generated = tf.concat([input_ids, input_ids_padding], axis=-1)
        finished_sequences = tf.zeros((batch_size,), dtype=tf.bool)

        # 4. define "xla-compile-able" stop-condition and auto-regressive functions
        def greedy_search_cond_fn(generated, finished_sequences, cur_len, model_kwargs):
            """state termination condition fn."""
            return ~tf.reduce_all(finished_sequences)

        def greedy_search_body_fn(generated, finished_sequences, cur_len, model_kwargs):
            """state update fn."""
            if model_kwargs.get("past_key_values") is None or needs_full_input:
                input_ids = generated[:, :cur_len]
            else:
                input_ids = tf.expand_dims(generated[:, cur_len - 1], -1)
            model_inputs = self.prepare_inputs_for_generation(input_ids, use_cache=use_cache, **model_kwargs)
            # forward pass to get next token logits
            model_outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
            next_token_logits = model_outputs.logits[:, -1]

            # pre-process distribution
            next_tokens_scores = logits_processor(generated, next_token_logits, cur_len)

            # Store scores, attentions and hidden_states when required
            if not use_xla and return_dict_in_generate:
                if output_scores:
                    scores.append(next_tokens_scores)
                if output_attentions and self.config.is_encoder_decoder:
                    decoder_attentions.append(model_outputs.decoder_attentions)
                    cross_attentions.append(model_outputs.cross_attentions)
                elif output_attentions and not self.config.is_encoder_decoder:
                    decoder_attentions.append(model_outputs.attentions)
                if output_hidden_states and self.config.is_encoder_decoder:
                    decoder_hidden_states.append(model_outputs.decoder_hidden_states)
                elif output_hidden_states and not self.config.is_encoder_decoder:
                    decoder_hidden_states.append(model_outputs.hidden_states)

            # argmax
            next_tokens = tf.argmax(next_tokens_scores, axis=-1, output_type=tf.int32)

            if eos_token_id is not None:
                if pad_token_id is None:
                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
                unfinished_seq = 1 - tf.cast(finished_sequences, tf.int32)
                next_tokens = next_tokens * unfinished_seq + pad_token_id * (1 - unfinished_seq)
                next_token_is_eos = tf.math.reduce_any(
                    tf.equal(
                        tf.broadcast_to(next_tokens, (len(eos_token_id), batch_size)),
                        tf.expand_dims(eos_token_id, -1),
                    ),
                    axis=0,
                )
                finished_sequences = finished_sequences | next_token_is_eos

            # update `generated` and `cur_len`
            update_indices = tf.stack([tf.range(batch_size), tf.broadcast_to(cur_len, [batch_size])], axis=-1)
            generated = tf.tensor_scatter_nd_update(tensor=generated, indices=update_indices, updates=next_tokens)
            cur_len += 1

            # update model_kwargs
            if use_xla:
                model_kwargs = self._update_model_kwargs_for_xla_generation(
                    model_outputs=model_outputs,
                    model_kwargs=model_kwargs,
                    cur_len=cur_len,
                    max_length=max_length,
                    batch_size=batch_size,
                    is_encoder_decoder=self.config.is_encoder_decoder,
                    batch_axis=cache_batch_axis,
                )
            else:
                model_kwargs = self._update_model_kwargs_for_generation(
                    model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
                )
                # if we don't cache past_key_values we need the whole input
                if model_kwargs.get("past_key_values", None) is None:
                    # let's throw out `past_key_values` since we don't want `None` tensors
                    model_kwargs.pop("past_key_values", None)

            return generated, finished_sequences, cur_len, model_kwargs

        # 5. run generation
        # 1st generation step has to be run before to initialize `past_key_values`
        generated, finished_sequences, cur_len, model_kwargs = greedy_search_body_fn(
            generated, finished_sequences, cur_len, model_kwargs
        )

        # 2-to-n generation steps can then be run in autoregressive fashion (only in case 1st generation step does
        # NOT yield EOS token though)
        maximum_iterations = max_length - cur_len
        generated, _, cur_len, _ = tf.while_loop(
            greedy_search_cond_fn,
            greedy_search_body_fn,
            (generated, finished_sequences, cur_len, model_kwargs),
            maximum_iterations=maximum_iterations,
        )

        # 6. prepare outputs
        if not use_xla:
            # cut for backward compatibility
            generated = generated[:, :cur_len]

        if return_dict_in_generate:
            if self.config.is_encoder_decoder:
                # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
                encoder_hidden_states = (
                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
                )

                scores = tuple(scores) if scores is not None else None
                decoder_attentions = tuple(decoder_attentions) if decoder_attentions is not None else None
                cross_attentions = tuple(cross_attentions) if cross_attentions is not None else None
                decoder_hidden_states = tuple(decoder_hidden_states) if decoder_hidden_states is not None else None

                return TFGreedySearchEncoderDecoderOutput(
                    sequences=generated,
                    scores=scores,
                    encoder_attentions=encoder_attentions,
                    encoder_hidden_states=encoder_hidden_states,
                    decoder_attentions=decoder_attentions,
                    cross_attentions=cross_attentions,
                    decoder_hidden_states=decoder_hidden_states,
                )
            else:
                return TFGreedySearchDecoderOnlyOutput(
                    sequences=generated,
                    scores=scores,
                    attentions=decoder_attentions,
                    hidden_states=decoder_hidden_states,
                )
        else:
            return generated
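    # The body above is written as a pair of pure `tf.while_loop` functions so the
    # whole decoding pass can be traced once and compiled. A hedged usage sketch
    # (model/tokenizer as in the docstring example; the `max_new_tokens` value is
    # illustrative):
    #
    #     xla_generate = tf.function(model.generate, jit_compile=True)
    #     outputs = xla_generate(input_ids, max_new_tokens=20)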


zTFGenerationMixin.greedy_searchr   r   c                    s  durnt  durnt  durnjjdur"njjdur,njjttr8gdur>njjdurHnjjdurRnjj	dur\njj
|djjt  dtv rxtjnt	t	fdddD rdndd	ttjj v 
rrg ndrrg ndrrg ndrrg ndt|\ }tj | ftjd
pd }tj||gdd}tj ftjd
}dd } 
fdd}|||||\}}}}| }tj||||||f|d\}}}}s*|ddd|f }rjj rr<|d !dnd}rH|d !dnd}durSt"nddur^t"nddurit"nddurtt"ndt#|||dS t$|dS |S )ar  
        Generates sequences for models with a language modeling head using multinomial sampling.

        Parameters:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`TFLogitsProcessorList`, *optional*):
                An instance of [`TFLogitsProcessorList`]. A list of instances of classes derived from [`TFLogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            logits_warper (`TFLogitsProcessorList`, *optional*):
                An instance of [`TFLogitsProcessorList`]. A list of instances of classes derived from [`TFLogitsWarper`]
                used to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            max_length (`int`, *optional*, defaults to 20):
                The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            seed (`List[int]`, *optional*):
                Random seed to control sampling, containing two integers, used when `do_sample` is `True`. See the
                `seed` argument from stateless functions in `tf.random`.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `call` function of the model. If model is an
                encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.TFSampleDecoderOnlyOutput`], [`~generation.TFSampleEncoderDecoderOutput`] or `tf.Tensor`: A
            `tf.Tensor` containing the generated tokens (default behaviour) or a
            [`~generation.TFSampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.TFSampleEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> import tensorflow as tf
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     TFAutoModelForCausalLM,
        ...     TFLogitsProcessorList,
        ...     TFMinLengthLogitsProcessor,
        ...     TFTopKLogitsWarper,
        ...     TFTemperatureLogitsWarper,
        ... )

        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        >>> model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")

        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id

        >>> input_prompt = "Today is a beautiful day, and"
        >>> input_ids = tokenizer(input_prompt, return_tensors="tf").input_ids

        >>> # instantiate logits processors
        >>> logits_processor = TFLogitsProcessorList(
        ...     [
        ...         TFMinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
        ...     ]
        ... )
        >>> # instantiate logits warpers
        >>> logits_warper = TFLogitsProcessorList(
        ...     [
        ...         TFTopKLogitsWarper(50),
        ...         TFTemperatureLogitsWarper(0.7),
        ...     ]
        ... )

        >>> tf.random.set_seed(0)
        >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper)

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Today is a beautiful day, and I love my country. But when I look at Donald Trump,']
        ```"""
        # 1. init sample values
        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
        logits_warper = logits_warper if logits_warper is not None else TFLogitsProcessorList()
        max_length = max_length if max_length is not None else self.generation_config.max_length
        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
        output_attentions = (
            output_attentions if output_attentions is not None else self.generation_config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
        )
        return_dict_in_generate = (
            return_dict_in_generate
            if return_dict_in_generate is not None
            else self.generation_config.return_dict_in_generate
        )
        use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
        use_xla = not tf.executing_eagerly()
        # GPT2 and other models have a slightly different cache structure, with a different batch axis
        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
        cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
        # some models, like XLNet, need more than the last token in the presence of past_key_values
        needs_full_input = "use_mems" in set(inspect.signature(self.prepare_inputs_for_generation).parameters.keys())

        # 2. init `attentions`, `hidden_states`, and `scores` tuples
        scores = [] if (return_dict_in_generate and output_scores) else None
        decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None
        cross_attentions = [] if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None

        # 3. init tensors to use for "xla-compileable" generate function
        batch_size, cur_len = shape_list(input_ids)

        # initialize `generated` (`input_ids` padded with `pad_token_id`) and `finished_sequences`
        input_ids_padding = tf.ones((batch_size, max_length - cur_len), dtype=tf.int32) * (pad_token_id or 0)
        generated = tf.concat([input_ids, input_ids_padding], axis=-1)
        finished_sequences = tf.zeros((batch_size,), dtype=tf.bool)

        # 4. define "xla-compile-able" stop-condition and auto-regressive functions
        def sample_cond_fn(generated, finished_sequences, cur_len, model_kwargs):
            return ~tf.reduce_all(finished_sequences)

        def sample_body_fn(generated, finished_sequences, cur_len, model_kwargs):
            if model_kwargs.get("past_key_values") is None or needs_full_input:
                input_ids = generated[:, :cur_len]
            else:
                input_ids = tf.expand_dims(generated[:, cur_len - 1], -1)
            model_inputs = self.prepare_inputs_for_generation(input_ids, use_cache=use_cache, **model_kwargs)
            # forward pass to get next token logits
            model_outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
            next_token_logits = model_outputs.logits[:, -1]

            # pre-process distribution
            next_tokens_scores = logits_processor(generated, next_token_logits, cur_len)
            next_tokens_scores = logits_warper(generated, next_tokens_scores, cur_len)

            # Store scores, attentions and hidden_states when required
            if not use_xla and return_dict_in_generate:
                if output_scores:
                    scores.append(next_tokens_scores)
                if output_attentions and self.config.is_encoder_decoder:
                    decoder_attentions.append(model_outputs.decoder_attentions)
                    cross_attentions.append(model_outputs.cross_attentions)
                elif output_attentions and not self.config.is_encoder_decoder:
                    decoder_attentions.append(model_outputs.attentions)
                if output_hidden_states and self.config.is_encoder_decoder:
                    decoder_hidden_states.append(model_outputs.decoder_hidden_states)
                elif output_hidden_states and not self.config.is_encoder_decoder:
                    decoder_hidden_states.append(model_outputs.hidden_states)

            # sample
            if seed is not None:
                sample_seed = seed
            else:
                sample_seed = tf.experimental.numpy.random.randint(
                    tf.int32.min, tf.int32.max, (2,), dtype=tf.int32
                )
            next_tokens = tf.squeeze(
                tf.random.stateless_categorical(
                    logits=next_tokens_scores, num_samples=1, seed=sample_seed, dtype=tf.int32
                ),
                axis=1,
            )

            if eos_token_id is not None:
                if pad_token_id is None:
                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
                unfinished_seq = 1 - tf.cast(finished_sequences, tf.int32)
                next_tokens = next_tokens * unfinished_seq + pad_token_id * (1 - unfinished_seq)
                next_token_is_eos = tf.math.reduce_any(
                    tf.equal(
                        tf.broadcast_to(next_tokens, (len(eos_token_id), batch_size)),
                        tf.expand_dims(eos_token_id, -1),
                    ),
                    axis=0,
                )
                finished_sequences = finished_sequences | next_token_is_eos

            # update `generated` and `cur_len`
            update_indices = tf.stack([tf.range(batch_size), tf.broadcast_to(cur_len, [batch_size])], axis=-1)
            generated = tf.tensor_scatter_nd_update(tensor=generated, indices=update_indices, updates=next_tokens)
            cur_len += 1

            # update model_kwargs
            if use_xla:
                model_kwargs = self._update_model_kwargs_for_xla_generation(
                    model_outputs=model_outputs,
                    model_kwargs=model_kwargs,
                    cur_len=cur_len,
                    max_length=max_length,
                    batch_size=batch_size,
                    is_encoder_decoder=self.config.is_encoder_decoder,
                    batch_axis=cache_batch_axis,
                )
            else:
                model_kwargs = self._update_model_kwargs_for_generation(
                    model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
                )
                # if we don't cache past_key_values we need the whole input
                if model_kwargs.get("past_key_values", None) is None:
                    # let's throw out `past_key_values` since we don't want `None` tensors
                    model_kwargs.pop("past_key_values", None)

            return generated, finished_sequences, cur_len, model_kwargs

        # 5. run generation
        # 1st generation step has to be run before to initialize `past_key_values`
        generated, finished_sequences, cur_len, model_kwargs = sample_body_fn(
            generated, finished_sequences, cur_len, model_kwargs
        )

        # 2-to-n generation steps can then be run in autoregressive fashion (only in case 1st generation step does
        # NOT yield EOS token though)
        maximum_iterations = max_length - cur_len
        generated, _, cur_len, _ = tf.while_loop(
            sample_cond_fn,
            sample_body_fn,
            (generated, finished_sequences, cur_len, model_kwargs),
            maximum_iterations=maximum_iterations,
        )

        # 6. prepare outputs
        if not use_xla:
            # cut for backward compatibility
            generated = generated[:, :cur_len]

        if return_dict_in_generate:
            if self.config.is_encoder_decoder:
                # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
                encoder_hidden_states = (
                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
                )

                scores = tuple(scores) if scores is not None else None
                decoder_attentions = tuple(decoder_attentions) if decoder_attentions is not None else None
                cross_attentions = tuple(cross_attentions) if cross_attentions is not None else None
                decoder_hidden_states = tuple(decoder_hidden_states) if decoder_hidden_states is not None else None

                return TFSampleEncoderDecoderOutput(
                    sequences=generated,
                    scores=scores,
                    encoder_attentions=encoder_attentions,
                    encoder_hidden_states=encoder_hidden_states,
                    decoder_attentions=decoder_attentions,
                    cross_attentions=cross_attentions,
                    decoder_hidden_states=decoder_hidden_states,
                )
            else:
                return TFSampleDecoderOnlyOutput(
                    sequences=generated,
                    scores=scores,
                    attentions=decoder_attentions,
                    hidden_states=decoder_hidden_states,
                )
        else:
            return generated
zTFGenerationMixin.samplec                        fdd}t j|| S )zDGathers the beam slices indexed by beam_indices into new beam array.c                    s    dkr!t jt t |  d  t  fdd}t j| |d} t j| ddd} dkrQt jt t |  d  t  fdd}t j|}t j||d}|S )Nr   r\   )permr   )paramsrz   r]   
batch_dims)r.   r  rc   rankrf   gatherrm   invert_permutation)r  r  gathered_tensorr!  rB   r1   r2   	gather_fn#  s   ,,z2TFGenerationMixin._gather_beams.<locals>.gather_fnr.   nestmap_structure)nestedrB   r!  r  r1   r  r2   _gather_beams  s   zTFGenerationMixin._gather_beamsr   r   r   r   c           "         s@  ddd	ddddurnt  durnt  dur"njj|dur,|njj}dur6njjttrBg|
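    # A small illustration of the gather above (`_demo_gather_beams` is a
    # hypothetical helper, not part of the API): with `batch_dims=1`, each batch
    # item selects its own beams independently.
    @staticmethod
    def _demo_gather_beams():
        # (batch=1, num_beams=2, seq_len=3): swap the two beams of the single batch item
        sequences = tf.constant([[[1, 2, 3], [4, 5, 6]]])
        beam_indices = tf.constant([[1, 0]])
        return tf.gather(sequences, beam_indices, axis=1, batch_dims=1)  # [[[4, 5, 6], [1, 2, 3]]]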
durH|
njj}
durRnjjdur\njj	durfnjj
durpnjj
durz
njj
durnjj|djjt  dtv rtjnttfd	d
dD rdnddttjj v rƈrg nd rΈrg ndrֈrg ndrވrg ndt|\}|}tj| ftjd|pd }tj||gdd}tjftjd|pd }tjftj d}t!tj"t#dgdgd   dddg}tfd }tj| ftjdd }tj| ftjdd }d|v rm	|d d |d d< d|v rz	|d |d< 
fdd} 	
fdd}|||||||||||
\
}}}}}}}}}}| }tj$||||||||||||f
|d\
}}}}}}}}}}tj%j&|dd}t'|ddddf ||}t'|ddddf ||}|tj(|| tj)d
  }t'|dddf ||}	|ddd|
ddf }	|ddd|
f }	|ddd|
ddf }sY|ddd|f }|ddd|| f }rj*j+rrk|d ,dnd}rw|d ,dnd} r~t-nt.}!|!|| ||| d	S rt/nt0}!|!|| |dS |S ) a  
        Generates sequences for models with a language modeling head using beam search. If `do_sample` is `False`, uses
        a greedy approach, otherwise does multinomial sampling without replacement.

        Parameters:
            input_ids (`tf.Tensor` of shape `(batch_size, num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            do_sample (`bool`, *optional*, defaults to `False`):
                Whether or not to use sampling ; use greedy decoding otherwise.
            max_length (`int`, *optional*, defaults to 20):
                The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            length_penalty (`float`, *optional*, defaults to 1.0):
                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
                while `length_penalty` < 0.0 encourages shorter sequences.
            early_stopping (`bool` or `str`, *optional*, defaults to `False`):
                Controls the stopping condition for beam-based methods, like beam-search. It accepts the following
                values: `True`, where the generation stops as soon as there are `num_beams` complete candidates;
                `False`, where an heuristic is applied and the generation stops when is it very unlikely to find better
                candidates; `"never"`, where the beam search procedure only stops when there cannot be better
                candidates (canonical beam search algorithm).
            logits_processor (`[TFLogitsProcessorList]`, *optional*):
                An instance of [`TFLogitsProcessorList`]. A list of instances of classes derived from [`TFLogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            logits_warper (`TFLogitsProcessorList`, *optional*):
                An instance of [`TFLogitsProcessorList`]. A list of instances of classes derived from [`TFLogitsWarper`]
                used to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            num_return_sequences (`int`, *optional*, defaults to 1):
                The number of independently computed returned sequences for each element in the batch.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `call` function of the model. If model is an
                encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.TFBeamSearchDecoderOnlyOutput`], [`~generation.TFBeamSearchEncoderDecoderOutput`] or
            `tf.Tensor`: A `tf.Tensor` containing the generated tokens (default behaviour) or a
            [`~generation.TFBeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.TFBeamSearchEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     TFAutoModelForSeq2SeqLM,
        ...     TFLogitsProcessorList,
        ...     TFMinLengthLogitsProcessor,
        ... )
        >>> import tensorflow as tf

        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
        >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

        >>> encoder_input_str = "translate English to German: How old are you?"
        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="tf").input_ids

        >>> # lets run beam search using 3 beams
        >>> num_beams = 3
        >>> # define decoder start token ids
        >>> input_ids = tf.ones((1, num_beams, 1), dtype=tf.int32)
        >>> input_ids = input_ids * model.generation_config.decoder_start_token_id

        >>> # add encoder_outputs to model keyword arguments
        >>> encoder_outputs = model.get_encoder()(encoder_input_ids, return_dict=True)
        >>> encoder_outputs.last_hidden_state = tf.repeat(
        ...     tf.expand_dims(encoder_outputs.last_hidden_state, axis=0), num_beams, axis=1
        ... )
        >>> model_kwargs = {"encoder_outputs": encoder_outputs}

        >>> # instantiate logits processors
        >>> logits_processor = TFLogitsProcessorList(
        ...     [TFMinLengthLogitsProcessor(5, eos_token_id=model.generation_config.eos_token_id)]
        ... )

        >>> outputs = model.beam_search(input_ids, logits_processor=logits_processor, **model_kwargs)
        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Wie alt bist du?']
        ```"""

        def flatten_beam_dim(tensor, batch_axis=0):
            """Flattens the first two dimensions of a non-scalar array."""
            shape = shape_list(tensor)
            return tf.reshape(
                tensor,
                shape[:batch_axis] + [shape[batch_axis] * shape[batch_axis + 1]] + shape[batch_axis + 2 :],
            )

        def unflatten_beam_dim(tensor, num_beams, batch_axis=0):
            """Unflattens the first, flat batch*beam dimension of a non-scalar array."""
            shape = shape_list(tensor)
            return tf.reshape(tensor, shape[:batch_axis] + [-1, num_beams] + shape[batch_axis + 1 :])

        # 1. init beam_search values
        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
        logits_warper = logits_warper if logits_warper is not None else TFLogitsProcessorList()
        max_length = max_length if max_length is not None else self.generation_config.max_length
        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        num_return_sequences = (
            num_return_sequences if num_return_sequences is not None else self.generation_config.num_return_sequences
        )
        length_penalty = length_penalty if length_penalty is not None else self.generation_config.length_penalty
        early_stopping = early_stopping if early_stopping is not None else self.generation_config.early_stopping
        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
        output_attentions = (
            output_attentions if output_attentions is not None else self.generation_config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
        )
        return_dict_in_generate = (
            return_dict_in_generate
            if return_dict_in_generate is not None
            else self.generation_config.return_dict_in_generate
        )
        use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
        use_xla = not tf.executing_eagerly()
        # GPT2 and other models have a slightly different cache structure, with a different batch axis
        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
        cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
        # some models, like XLNet, need more than the last token in the presence of past_key_values
        needs_full_input = "use_mems" in set(inspect.signature(self.prepare_inputs_for_generation).parameters.keys())

        # 2. init `attentions`, `hidden_states`, and `scores` tuples
        all_scores = [] if (return_dict_in_generate and output_scores) else None
        decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None
        cross_attentions = [] if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None

        # 3. init tensors to use for "xla-compileable" generate function
        batch_size, num_beams, cur_len = shape_list(input_ids)
        # store the prompt length of the decoder
        decoder_prompt_len = cur_len

        # per batch, beam-item holding current token in loop and completed sequences
        input_ids_padding = tf.ones((batch_size, num_beams, max_length - cur_len), dtype=tf.int32) * (
            pad_token_id or 0
        )
        running_sequences = tf.concat([input_ids, input_ids_padding], axis=-1)
        sequences = tf.ones((batch_size, num_beams, max_length), dtype=tf.int32) * (pad_token_id or 0)

        # per batch, beam-item state bit indicating if the sentence has finished
        is_sent_finished = tf.zeros((batch_size, num_beams), dtype=tf.bool)

        # per batch, beam-item score, logprobs
        running_scores = tf.tile(
            tf.expand_dims(tf.convert_to_tensor([0.0] + [-1.0e9] * (num_beams - 1)), axis=0), [batch_size, 1]
        )
        scores = tf.ones((batch_size, num_beams)) * -1.0e9

        # per batch beam indices
        running_beam_indices = tf.ones((batch_size, num_beams, max_length - decoder_prompt_len), dtype=tf.int32) * -1
        beam_indices = tf.ones((batch_size, num_beams, max_length - decoder_prompt_len), dtype=tf.int32) * -1

        # flatten the beam dimension on the model kwargs
        if "encoder_outputs" in model_kwargs:
            model_kwargs["encoder_outputs"]["last_hidden_state"] = flatten_beam_dim(
                model_kwargs["encoder_outputs"]["last_hidden_state"]
            )
        if "attention_mask" in model_kwargs:
            model_kwargs["attention_mask"] = flatten_beam_dim(model_kwargs["attention_mask"])

        # 4. define "xla-compile-able" stop-condition and auto-regressive functions
        def beam_search_cond_fn(
            cur_len,
            running_sequences,
            running_scores,
            running_beam_indices,
            sequences,
            scores,
            beam_indices,
            is_sent_finished,
            decoder_prompt_len,
            model_kwargs,
        ):
            """
            Beam Search termination condition function -- halts the generation loop if any of these conditions becomes
            False
            """
            # 1. is less than max length?
            not_max_length_yet = cur_len < max_length

            # 2. can the new beams still improve?
            if early_stopping == "never" and length_penalty > 0.0:
                best_running_score = running_scores[:, :1] / ((max_length - decoder_prompt_len) ** length_penalty)
            else:
                best_running_score = running_scores[:, :1] / (
                    tf.cast(cur_len - decoder_prompt_len, dtype=tf.float32) ** length_penalty
                )
            worst_finished_score = tf.where(
                is_sent_finished, tf.math.reduce_min(scores, axis=1, keepdims=True), -1.0e9
            )
            improvement_still_possible = tf.math.reduce_any(best_running_score > worst_finished_score)

            # 3. is there still a beam left to be filled?
            still_open_beam = ~(tf.math.reduce_all(is_sent_finished) & (early_stopping is True))

            return not_max_length_yet & still_open_beam & improvement_still_possible

        def beam_search_body_fn(
            cur_len,
            running_sequences,
            running_scores,
            running_beam_indices,
            sequences,
            scores,
            beam_indices,
            is_sent_finished,
            decoder_prompt_len,
            model_kwargs,
        ):
            """
            Beam Search iterative update function -- each iteration adds a new token and updates the best sequences
            seen so far
            """
            # 1. Forward current tokens
            if model_kwargs.get("past_key_values") is None or needs_full_input:
                input_ids = running_sequences[:, :, :cur_len]
            else:
                input_ids = tf.expand_dims(running_sequences[:, :, cur_len - 1], -1)
            model_inputs = self.prepare_inputs_for_generation(
                flatten_beam_dim(input_ids), use_cache=use_cache, **model_kwargs
            )
            model_outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
            logits = unflatten_beam_dim(model_outputs.logits[:, -1], num_beams)

            # 2. Compute log probs: process logits and add them to the running logprobs scores
            log_probs = tf.nn.log_softmax(logits)
            log_probs = logits_processor(flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), cur_len)
            log_probs = unflatten_beam_dim(log_probs, num_beams)
            if do_sample:
                log_probs = logits_warper(flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), cur_len)
                log_probs = unflatten_beam_dim(log_probs, num_beams)
            log_probs_processed = log_probs
            log_probs = log_probs + tf.expand_dims(running_scores, axis=2)
            vocab_size = log_probs.shape[2]
            log_probs = tf.reshape(log_probs, (batch_size, num_beams * vocab_size))

            # Store scores, attentions and hidden_states when required
            if not use_xla and return_dict_in_generate:
                if output_scores:
                    all_scores.append(log_probs_processed)
                if output_attentions and self.config.is_encoder_decoder:
                    decoder_attentions.append(model_outputs.decoder_attentions)
                    cross_attentions.append(model_outputs.cross_attentions)
                elif output_attentions and not self.config.is_encoder_decoder:
                    decoder_attentions.append(model_outputs.attentions)
                if output_hidden_states and self.config.is_encoder_decoder:
                    decoder_hidden_states.append(model_outputs.decoder_hidden_states)
                elif output_hidden_states and not self.config.is_encoder_decoder:
                    decoder_hidden_states.append(model_outputs.hidden_states)

            # 3. Retrieve the top 2*num_beams candidates (sampled without replacement when do_sample is True)
            beams_to_keep = 2 * num_beams
            if do_sample:
                topk_indices = sample_without_replacement(log_probs, beams_to_keep)
                topk_log_probs = tf.gather(log_probs, topk_indices, axis=1, batch_dims=1)
            else:
                topk_log_probs, topk_indices = tf.math.top_k(log_probs, k=beams_to_keep)
            topk_current_beam_indices = topk_indices // vocab_size
            topk_running_beam_indices = self._gather_beams(running_beam_indices, topk_current_beam_indices)
            topk_running_sequences = self._gather_beams(running_sequences, topk_current_beam_indices)
            topk_ids = topk_indices % vocab_size

            # writes the new token
            indices_batch = tf.repeat(tf.range(batch_size), [beams_to_keep])
            indices_beam = tf.tile(tf.range(beams_to_keep), [batch_size])
            update_indices = tf.stack(
                [indices_batch, indices_beam, tf.broadcast_to(cur_len, [batch_size * beams_to_keep])], axis=-1
            )
            topk_sequences = tf.tensor_scatter_nd_update(
                tensor=topk_running_sequences,
                indices=update_indices,
                updates=tf.reshape(topk_ids, [batch_size * beams_to_keep]),
            )

            # we want to store the beam indices with batch information -> real beam index = beam index % num beams
            batch_modified_indices = topk_current_beam_indices + tf.broadcast_to(
                tf.expand_dims(tf.range(batch_size) * num_beams, axis=1), topk_current_beam_indices.shape
            )
            update_indices = tf.stack(
                [
                    indices_batch,
                    indices_beam,
                    tf.broadcast_to(cur_len - decoder_prompt_len, [batch_size * beams_to_keep]),
                ],
                axis=-1,
            )
            topk_beam_indices = tf.tensor_scatter_nd_update(
                tensor=topk_running_beam_indices,
                indices=update_indices,
                updates=tf.reshape(batch_modified_indices, [batch_size * beams_to_keep]),
            )

            # 4. Check which sequences have ended
            if eos_token_id is None:
                eos_in_next_token = tf.zeros(topk_sequences[:, :, cur_len].shape, dtype=tf.bool)
            else:
                eos_in_next_token = tf.math.reduce_any(
                    tf.equal(
                        tf.broadcast_to(
                            topk_sequences[:, :, cur_len],
                            [len(eos_token_id)] + topk_sequences[:, :, cur_len].shape,
                        ),
                        tf.expand_dims(tf.expand_dims(eos_token_id, -1), -1),
                    ),
                    axis=0,
                )
            did_topk_just_finished = eos_in_next_token & tf.broadcast_to(
                tf.concat((tf.ones((num_beams,), dtype=tf.bool), tf.zeros((num_beams,), dtype=tf.bool)), axis=0),
                shape_list(eos_in_next_token),
            )

            # non-top num_beams eos tokens can't finish a beam, and the others can't be used to continue one either
            running_topk_log_probs = topk_log_probs + tf.cast(eos_in_next_token, tf.float32) * -1.0e9

            # 5. Get running sequences scores for the next step
            next_topk_indices = tf.math.top_k(running_topk_log_probs, k=num_beams)[1]
            next_running_sequences, next_running_scores, next_running_beam_indices = self._gather_beams(
                [topk_sequences, running_topk_log_probs, topk_beam_indices], next_topk_indices
            )

            # 6. Process topk logits: add length penalty; block scores when the beam is already full
            topk_log_probs = topk_log_probs / (
                tf.cast(cur_len + 1 - decoder_prompt_len, dtype=tf.float32) ** length_penalty
            )
            beams_in_batch_are_full = tf.broadcast_to(
                tf.math.reduce_all(is_sent_finished, axis=-1, keepdims=True), shape_list(did_topk_just_finished)
            ) & (early_stopping is True)
            add_penalty = ~did_topk_just_finished | beams_in_batch_are_full
            topk_log_probs += tf.cast(add_penalty, tf.float32) * -1.0e9

            # 7. Merge new finished candidates with the existing ones and keep the best num_beams
            merged_sequences = tf.concat([sequences, topk_sequences], axis=1)
            merged_scores = tf.concat([scores, topk_log_probs], axis=1)
            merged_beams = tf.concat([beam_indices, topk_beam_indices], axis=1)
            merged_is_sent_finished = tf.concat([is_sent_finished, did_topk_just_finished], axis=1)
            topk_merged_indices = tf.math.top_k(merged_scores, k=num_beams)[1]
            next_sequences, next_scores, next_beam_indices, next_is_sent_finished = self._gather_beams(
                [merged_sequences, merged_scores, merged_beams, merged_is_sent_finished], topk_merged_indices
            )

            # 8. Prepare data for the next iteration, including reordering the cache
            cur_len = cur_len + 1
            if "past_key_values" in model_outputs:
                cache = tf.nest.map_structure(
                    lambda tensor: unflatten_beam_dim(tensor, num_beams, batch_axis=cache_batch_axis),
                    model_outputs.past_key_values,
                )
                next_running_indices = self._gather_beams(topk_current_beam_indices, next_topk_indices)
                next_cache = self._gather_beams(cache, next_running_indices, batch_axis=cache_batch_axis)
                model_outputs["past_key_values"] = tf.nest.map_structure(
                    lambda tensor: flatten_beam_dim(tensor, batch_axis=cache_batch_axis), next_cache
                )
            if use_xla:
                next_model_kwargs = self._update_model_kwargs_for_xla_generation(
                    model_outputs=model_outputs,
                    model_kwargs=model_kwargs,
                    cur_len=cur_len,
                    max_length=max_length,
                    batch_size=(batch_size * num_beams),
                    is_encoder_decoder=self.config.is_encoder_decoder,
                    batch_axis=cache_batch_axis,
                )
            else:
                next_model_kwargs = self._update_model_kwargs_for_generation(
                    model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
                )
                # if we don't cache past_key_values we need the whole input
                if model_kwargs.get("past_key_values", None) is None:
                    # let's throw out `past_key_values` since we don't want `None` tensors
                    model_kwargs.pop("past_key_values", None)

            return (
                cur_len,
                next_running_sequences,
                next_running_scores,
                next_running_beam_indices,
                next_sequences,
                next_scores,
                next_beam_indices,
                next_is_sent_finished,
                decoder_prompt_len,
                next_model_kwargs,
            )

        # 5. run generation
        # 1st generation step has to be run before to initialize `past_key_values`
        (
            cur_len,
            running_sequences,
            running_scores,
            running_beam_indices,
            sequences,
            scores,
            beam_indices,
            is_sent_finished,
            decoder_prompt_len,
            model_kwargs,
        ) = beam_search_body_fn(
            cur_len,
            running_sequences,
            running_scores,
            running_beam_indices,
            sequences,
            scores,
            beam_indices,
            is_sent_finished,
            decoder_prompt_len,
            model_kwargs,
        )

        # 2-to-n generation steps can then be run in autoregressive fashion (only in case 1st generation step does
        # NOT yield EOS token though)
        maximum_iterations = max_length - cur_len
        (
            cur_len,
            running_sequences,
            running_scores,
            running_beam_indices,
            sequences,
            scores,
            beam_indices,
            is_sent_finished,
            decoder_prompt_len,
            _,
        ) = tf.while_loop(
            beam_search_cond_fn,
            beam_search_body_fn,
            (
                cur_len,
                running_sequences,
                running_scores,
                running_beam_indices,
                sequences,
                scores,
                beam_indices,
                is_sent_finished,
                decoder_prompt_len,
                model_kwargs,
            ),
            maximum_iterations=maximum_iterations,
        )

        # 6. prepare outputs
        # Account for the edge-case where there are no finished sequences for a particular batch item. If so, return
        # the running sequences for that batch item.
        none_finished = tf.math.reduce_any(is_sent_finished, axis=1)
        sequences = tf.where(none_finished[:, None, None], sequences, running_sequences)
        beam_indices = tf.where(none_finished[:, None, None], beam_indices, running_beam_indices)
        scores = tf.where(none_finished[:, None], scores, running_scores)

        # Take best beams for each batch (the score is sorted in descending order)
        sequences = flatten_beam_dim(sequences[:, :num_return_sequences, :])
        scores = flatten_beam_dim(scores[:, :num_return_sequences])
        beam_indices = flatten_beam_dim(beam_indices[:, :num_return_sequences, :])

        if not use_xla:
            # Cut for backward compatibility
            sequences = sequences[:, :cur_len]
            beam_indices = beam_indices[:, : cur_len - decoder_prompt_len]

        if return_dict_in_generate:
            if self.config.is_encoder_decoder:
                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
                encoder_hidden_states = (
                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
                )
                output_cls = TFBeamSampleEncoderDecoderOutput if do_sample else TFBeamSearchEncoderDecoderOutput
                return output_cls(
                    sequences=sequences,
                    sequences_scores=scores,
                    scores=all_scores,
                    beam_indices=beam_indices,
                    encoder_attentions=encoder_attentions,
                    encoder_hidden_states=encoder_hidden_states,
                    decoder_attentions=decoder_attentions,
                    cross_attentions=cross_attentions,
                    decoder_hidden_states=decoder_hidden_states,
                )
            else:
                output_cls = TFBeamSampleDecoderOnlyOutput if do_sample else TFBeamSearchDecoderOnlyOutput
                return output_cls(
                    sequences=sequences,
                    sequences_scores=scores,
                    scores=all_scores,
                    beam_indices=beam_indices,
                    attentions=decoder_attentions,
                    hidden_states=decoder_hidden_states,
                )
        else:
            return sequences
    def contrastive_search(
        self,
        input_ids: tf.Tensor,
        top_k: Optional[int] = 1,
        penalty_alpha: Optional[float] = 0,
        logits_processor: Optional[TFLogitsProcessorList] = None,
        logits_warper: Optional[TFLogitsProcessorList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        **model_kwargs,
    ) -> Union[TFContrastiveSearchOutput, tf.Tensor]:
        r"""
        Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
        be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            top_k (`int`, *optional*, defaults to 1):
                The size of the candidate set that is used to re-rank for contrastive search
            penalty_alpha (`float`, *optional*, defaults to 0):
                The degeneration penalty for contrastive search; activate when it is larger than 0
            logits_processor (`TFLogitsProcessorList`, *optional*):
                An instance of [`TFLogitsProcessorList`]. A list of instances of classes derived from [`TFLogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            logits_warper (`TFLogitsProcessorList`, *optional*):
                An instance of [`TFLogitsProcessorList`]. A list of instances of classes derived from [`TFLogitsWarper`]
                used to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            max_length (`int`, *optional*, defaults to 20):
                The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `call` function of the model. If
                model is an encoder-decoder model the kwargs should include `encoder_outputs`.
        Return:
            [`~generation.TFContrastiveSearchDecoderOnlyOutput`],
            [`~generation.TFContrastiveSearchEncoderDecoderOutput`] or `tf.Tensor`: A `tf.Tensor` containing the
            generated tokens (default behaviour) or a [`~generation.TFContrastiveSearchDecoderOnlyOutput`] if
            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
            [`~generation.TFContrastiveSearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`.
        Examples:
        ```python
        >>> from transformers import AutoTokenizer, TFAutoModelForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
        >>> model = TFAutoModelForCausalLM.from_pretrained("facebook/opt-125m")
        >>> # set pad_token_id to eos_token_id because OPT does not have a PAD token
        >>> model.config.pad_token_id = model.config.eos_token_id
        >>> input_prompt = "DeepMind Company is"
        >>> input_ids = tokenizer(input_prompt, return_tensors="tf")
        >>> outputs = model.contrastive_search(**input_ids, penalty_alpha=0.6, top_k=4, max_length=64)
        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['DeepMind Company is a company that focuses on the development and commercialization of artificial intelligence (AI). DeepMind’s mission is to help people understand and solve problems that are difficult to solve in the world today.\n\nIn this post, we talk about the benefits of deep learning in business and how it']
        ```"""

        def gather_best_candidate(nested, selected_idx_stacked, batch_axis=0):
            """Gathers the slices indexed by selected_idx_stacked from a potentially nested structure of tensors."""

            def gather_fn(tensor):
                gathered_tensor = tf.gather(params=tensor, indices=selected_idx_stacked, axis=batch_axis)
                return gathered_tensor

            return tf.nest.map_structure(gather_fn, nested)

        # 1. init contrastive_search values
        logits_processor = logits_processor if logits_processor is not None else TFLogitsProcessorList()
        logits_warper = logits_warper if logits_warper is not None else TFLogitsProcessorList()
        max_length = max_length if max_length is not None else self.generation_config.max_length
        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
        output_attentions = (
            output_attentions if output_attentions is not None else self.generation_config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
        )
        return_dict_in_generate = (
            return_dict_in_generate
            if return_dict_in_generate is not None
            else self.generation_config.return_dict_in_generate
        )
        use_cache = True  # In contrastive search, we always use cache
        model_kwargs.pop("use_cache", None)

        use_xla = not tf.executing_eagerly()
        # GPT2 and other models have a slightly different cache structure, with a different batch axis
        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
        cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0

        # 2. init `attentions`, `hidden_states`, and `scores` tuples
        scores = [] if (return_dict_in_generate and output_scores) else None
        decoder_attentions = [] if (return_dict_in_generate and output_attentions) else None
        cross_attentions = [] if (return_dict_in_generate and output_attentions) else None
        decoder_hidden_states = [] if (return_dict_in_generate and output_hidden_states) else None

        # 3. init tensors to use for "xla-compileable" generate function
        batch_size, cur_len = shape_list(input_ids)

        # initialize `generated` (`input_ids` padded with `pad_token_id`) and `finished_sequences`
        input_ids_padding = tf.ones((batch_size, max_length - cur_len), dtype=tf.int32) * (pad_token_id or 0)
        generated = tf.concat([input_ids, input_ids_padding], axis=-1)
        finished_sequences = tf.zeros((batch_size,), dtype=tf.bool)

        # 4. define "xla-compile-able" stop-condition and auto-regressive functions
        def contrastive_search_cond_fn(
            generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables
        ):
            """state termination condition fn."""
            return ~tf.reduce_all(finished_sequences)

        def contrastive_search_body_fn(
            generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables
        ):
            """state update fn."""
            # if this is the first step in the loop, encode all the prefix and obtain: (1) past_key_values;
            # (2) last_hidden_states; (3) logit_for_next_step; (4) update model kwargs for the next step
            if model_kwargs.get("past_key_values") is None:
                input_ids = generated[:, :cur_len]
                model_inputs = self.prepare_inputs_for_generation(input_ids, use_cache=use_cache, **model_kwargs)
                outputs = self(
                    **model_inputs, return_dict=True, output_hidden_states=True, output_attentions=output_attentions
                )
                # last decoder hidden states will be used to compute the degeneration penalty (cosine similarity
                # with previous tokens)
                if self.config.is_encoder_decoder:
                    last_hidden_states = outputs.decoder_hidden_states[-1]
                else:
                    last_hidden_states = outputs.hidden_states[-1]
                # XLA: last_hidden_states normally grows at each step, but in XLA it is padded so as to be used
                # across iterations (with fixed shapes)
                if use_xla:
                    last_hidden_states = tf.pad(last_hidden_states, [[0, 0], [0, max_length - cur_len], [0, 0]])
                # next logit for contrastive search to select the top-k candidate tokens
                logit_for_next_step = outputs.logits[:, -1, :]

                if use_xla:
                    model_kwargs = self._update_model_kwargs_for_xla_generation(
                        model_outputs=outputs,
                        model_kwargs=model_kwargs,
                        cur_len=cur_len,
                        max_length=max_length,
                        batch_size=batch_size,
                        is_encoder_decoder=self.config.is_encoder_decoder,
                        batch_axis=cache_batch_axis,
                    )
                else:
                    model_kwargs = self._update_model_kwargs_for_generation(
                        outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
                    )

                # sanity check: the model must support the standard cache format
                past_key_values = model_kwargs.get("past_key_values")
                if past_key_values is None:
                    raise ValueError(
                        f"{self.__class__.__name__} does not support caching and therefore **can't** be used"
                        " for contrastive search."
                    )
                elif (
                    not isinstance(past_key_values[0], (tuple, tf.Tensor))
                    or past_key_values[0][0].shape[0] != batch_size
                ):
                    raise ValueError(
                        f"{self.__class__.__name__} does not have a standard cache format and therefore **can't**"
                        " be used for contrastive search without further modifications."
                    )
            else:
                logit_for_next_step = next_step_cached_variables["logit_for_next_step"]
                last_hidden_states = next_step_cached_variables["last_hidden_states"]
                outputs = next_step_cached_variables["outputs"]

            # contrastive search decoding consists of two steps: (1) candidate recall; (2) candidate re-rank by the
            # degeneration penalty
            logit_for_next_step = logits_processor(generated, logit_for_next_step, cur_len)
            logit_for_next_step = logits_warper(generated, logit_for_next_step, cur_len)
            next_probs = stable_softmax(logit_for_next_step, axis=-1)
            top_k_probs, top_k_ids = tf.math.top_k(next_probs, k=top_k)

            # Store scores, attentions and hidden_states when required
            if not use_xla and return_dict_in_generate:
                if output_scores:
                    scores.append(logit_for_next_step)
                if output_attentions and self.config.is_encoder_decoder:
                    decoder_attentions.append(outputs.decoder_attentions)
                    cross_attentions.append(outputs.cross_attentions)
                elif output_attentions and not self.config.is_encoder_decoder:
                    decoder_attentions.append(outputs.attentions)
                if output_hidden_states and self.config.is_encoder_decoder:
                    decoder_hidden_states.append(outputs.decoder_hidden_states)
                elif output_hidden_states and not self.config.is_encoder_decoder:
                    decoder_hidden_states.append(outputs.hidden_states)

            # Replicates the past_key_values `top_k` times, for batched forward passes (akin to beam search)
            model_kwargs["past_key_values"] = tf.nest.map_structure(
                lambda tensor: tf.repeat(tensor, top_k, axis=cache_batch_axis), model_kwargs["past_key_values"]
            )

            # compute the candidate tokens by the language model and collect their hidden_states
            next_model_inputs = self.prepare_inputs_for_generation(
                tf.reshape(top_k_ids, [-1, 1]), use_cache=use_cache, **model_kwargs
            )
            outputs = self(
                **next_model_inputs, return_dict=True, output_hidden_states=True, output_attentions=output_attentions
            )
            next_past_key_values = self._extract_past_from_model_output(outputs)
            logits = outputs.logits[:, -1, :]
            if self.config.is_encoder_decoder:
                next_hidden = outputs.decoder_hidden_states[-1]
                full_hidden_states = outputs.decoder_hidden_states
            else:
                next_hidden = outputs.hidden_states[-1]
                full_hidden_states = outputs.hidden_states
            context_hidden = tf.repeat(last_hidden_states[:, :cur_len, :], top_k, axis=0)

            # compute the degeneration penalty and re-rank the candidates based on it and the model confidence
            selected_idx = _ranking_fast(context_hidden, next_hidden, top_k_probs, penalty_alpha, top_k)

            # converts indices in the top_k dimension to indices in the stacked top_k * batch_size dimension
            selected_idx_stacked = selected_idx + tf.range(selected_idx.shape[0], dtype=tf.int64) * top_k
            next_tokens = tf.gather(top_k_ids, selected_idx, axis=1, batch_dims=1)
            next_hidden = gather_best_candidate(next_hidden, selected_idx_stacked)

            # XLA: the hidden states are written in place; eagerly, they are concatenated
            if use_xla:
                last_hidden_states = dynamic_update_slice(last_hidden_states, next_hidden, [0, cur_len, 0])
            else:
                last_hidden_states = tf.concat([last_hidden_states, next_hidden], axis=1)

            next_decoder_hidden_states = gather_best_candidate(full_hidden_states, selected_idx_stacked)
            next_past_key_values = gather_best_candidate(
                next_past_key_values, selected_idx_stacked, batch_axis=cache_batch_axis
            )
            logit_for_next_step = gather_best_candidate(logits, selected_idx_stacked)

            # Rebuilds the relevant parts of the model output for the selected token
            if self.config.is_encoder_decoder:
                next_step_cross_attentions = ()
                next_step_decoder_attentions = ()
                if output_attentions:
                    next_step_cross_attentions = gather_best_candidate(outputs.cross_attentions, selected_idx_stacked)
                    next_step_decoder_attentions = gather_best_candidate(
                        outputs.decoder_attentions, selected_idx_stacked
                    )
                outputs = TFSeq2SeqLMOutput(
                    past_key_values=next_past_key_values,
                    decoder_hidden_states=next_decoder_hidden_states,
                    decoder_attentions=next_step_decoder_attentions or None,
                    cross_attentions=next_step_cross_attentions or None,
                )
            else:
                next_step_attentions = ()
                if output_attentions:
                    next_step_attentions = gather_best_candidate(outputs.attentions, selected_idx_stacked)
                outputs = TFCausalLMOutputWithPast(
                    past_key_values=next_past_key_values,
                    hidden_states=next_decoder_hidden_states,
                    attentions=next_step_attentions or None,
                )

            if eos_token_id is not None:
                if pad_token_id is None:
                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
                unfinished_seq = 1 - tf.cast(finished_sequences, tf.int32)
                next_tokens = next_tokens * unfinished_seq + pad_token_id * (1 - unfinished_seq)
                next_token_is_eos = tf.math.reduce_any(
                    tf.equal(
                        tf.broadcast_to(next_tokens, (len(eos_token_id), batch_size)),
                        tf.expand_dims(eos_token_id, -1),
                    ),
                    axis=0,
                )
                finished_sequences = finished_sequences | next_token_is_eos

            # update `generated` and `cur_len`
            update_indices = tf.stack([tf.range(batch_size), tf.broadcast_to(cur_len, [batch_size])], axis=-1)
            generated = tf.tensor_scatter_nd_update(tensor=generated, indices=update_indices, updates=next_tokens)
            cur_len += 1

            if use_xla:
                model_kwargs = self._update_model_kwargs_for_xla_generation(
                    model_outputs=outputs,
                    model_kwargs=model_kwargs,
                    cur_len=cur_len,
                    max_length=max_length,
                    batch_size=batch_size * top_k,
                    is_encoder_decoder=self.config.is_encoder_decoder,
                    batch_axis=cache_batch_axis,
                )
            else:
                model_kwargs = self._update_model_kwargs_for_generation(
                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
                )

            next_step_cached_variables = {
                "logit_for_next_step": logit_for_next_step,
                "last_hidden_states": last_hidden_states,
                "outputs": outputs,
            }
            return generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables

        # 5. run generation
        # 1st generation step has to be run before to initialize `past_key_values`
        generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables = contrastive_search_body_fn(
            generated, finished_sequences, cur_len, model_kwargs, None
        )

        # 2-to-n generation steps can then be run in autoregressive fashion
        maximum_iterations = max_length - cur_len
        generated, _, cur_len, _, _ = tf.while_loop(
            contrastive_search_cond_fn,
            contrastive_search_body_fn,
            (generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables),
            maximum_iterations=maximum_iterations,
        )

        # 6. prepare outputs
        if not use_xla:
            # cut for backward compatibility
            generated = generated[:, :cur_len]

        if return_dict_in_generate:
            if self.config.is_encoder_decoder:
                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
                encoder_hidden_states = (
                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
                )

                scores = tuple(scores) if scores is not None else None
                decoder_attentions = tuple(decoder_attentions) if decoder_attentions is not None else None
                cross_attentions = tuple(cross_attentions) if cross_attentions is not None else None
                decoder_hidden_states = tuple(decoder_hidden_states) if decoder_hidden_states is not None else None

                return TFContrastiveSearchEncoderDecoderOutput(
                    sequences=generated,
                    scores=scores,
                    encoder_attentions=encoder_attentions,
                    encoder_hidden_states=encoder_hidden_states,
                    decoder_attentions=decoder_attentions,
                    cross_attentions=cross_attentions,
                    decoder_hidden_states=decoder_hidden_states,
                )
            else:
                return TFContrastiveSearchDecoderOnlyOutput(
                    sequences=generated,
                    scores=scores,
                    attentions=decoder_attentions,
                    hidden_states=decoder_hidden_states,
                )
        else:
            return generated
def scatter_values_on_batch_indices(values, batch_indices):
    shape = shape_list(batch_indices)
    # broadcast batch dim to shape
    broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1])
    # transform batch_indices to pair_indices
    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
    # scatter values to pair indices
    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape)
  c              	   C   sB   t jt jt jt| dd  }t j| | |\}}|S )z
    categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see
    https://github.com/tensorflow/tensorflow/issues/9260 for more info
    r   r   )r.   rm   logrQ   uniformr   rk   r   )rp  r  zr8  rz   r1   r1   r2   r    s   (r  r  r  next_top_k_probsalpha
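# A quick check of the Gumbel-max trick above (`_demo_sample_without_replacement`
# and its toy distribution are illustrative, not part of the API): perturbing the
# logits with independent Gumbel noise and keeping the top-k yields k *distinct*
# samples from the original categorical distribution.
def _demo_sample_without_replacement():
    logits = tf.math.log(tf.constant([[0.5, 0.3, 0.2]]))
    indices = sample_without_replacement(logits, num_samples=2)
    return indices  # two distinct token ids per batch row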
def _ranking_fast(
    context_hidden: tf.Tensor,
    next_hidden: tf.Tensor,
    next_top_k_probs: tf.Tensor,
    alpha: float,
    beam_width: int,
) -> tf.Tensor:
    """
    Reranks the top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described
    in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each
    row in the batch.
    """
    norm_context_hidden = context_hidden / tf.norm(context_hidden, axis=2, keepdims=True)
    norm_next_hidden = next_hidden / tf.norm(next_hidden, axis=2, keepdims=True)
    cosine_matrix = tf.squeeze(tf.linalg.matmul(norm_context_hidden, norm_next_hidden, transpose_b=True), axis=-1)
    degeneration_penalty = tf.reduce_max(cosine_matrix, axis=-1)
    next_top_k_probs = tf.reshape(next_top_k_probs, shape=[-1])
    contrastive_score = (1.0 - alpha) * next_top_k_probs - alpha * degeneration_penalty
    contrastive_score = tf.reshape(contrastive_score, shape=[-1, beam_width])
    selected_idx = tf.argmax(contrastive_score, axis=-1)
    return selected_idx
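# A worked example of the re-ranking above (`_demo_ranking_fast` and its toy
# tensors are illustrative, not part of the API). Candidate 0 duplicates the
# context direction (cosine similarity 1.0) while candidate 1 is orthogonal
# (similarity 0.0), so with alpha=0.5 the degeneration penalty outweighs
# candidate 0's higher model probability:
def _demo_ranking_fast():
    context_hidden = tf.constant([[[1.0, 0.0]], [[1.0, 0.0]]])  # (batch*beam, ctx_len, dim)
    next_hidden = tf.constant([[[1.0, 0.0]], [[0.0, 1.0]]])  # (batch*beam, 1, dim)
    next_top_k_probs = tf.constant([[0.6, 0.4]])  # (batch, beam_width)
    # scores: (1 - 0.5) * [0.6, 0.4] - 0.5 * [1.0, 0.0] = [-0.2, 0.2] -> argmax picks 1
    return _ranking_fast(context_hidden, next_hidden, next_top_k_probs, alpha=0.5, beam_width=2)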