"""
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
"""
    )OptionalTupleUnionN)version   )is_torch_flex_attn_available)_torch_version)	BlockMaskflex_attention)create_block_maskc                       sJ   e Zd ZdZdZdZdZ fddZej	j
dddd Zd	d
 Z  ZS )WrappedFlexAttentionzh
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    NFc                    s   | j d u rt | | _ | j S N)	_instancesuper__new__)clsargskwargs	__class__ |/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/transformers/integrations/flex_attention.pyr   6   s   
zWrappedFlexAttention.__new__	recursivec                 C   sX   | j r|| jkr*|| _ttjdkr|rtjtddd| _	ntt| _	d| _ dS dS )z>
        Initialize or update the singleton instance.
        z2.6.0Fzmax-autotune-no-cudagraphs)dynamicmodeTN)
_is_flex_compiledtrainingr   parser   base_versiontorchcompiler
   _compiled_flex_attention)selfr   r   r   r   __init__<   s   

zWrappedFlexAttention.__init__c                 C   s   | j S r   )r"   )r#   r   r   r   __call__N   s   zWrappedFlexAttention.__call__)__name__
__module____qualname____doc__r   r   r"   r   r    compilerdisabler$   r%   __classcell__r   r   r   r   r   -   s    
r   attention_mask_2dattention_chunk_sizeoffsetsreturnr	   c           	   	      s    j \}}|s	|}|s|}tjjj dd|fd  j}  |dur0ddd |  fdd|durL|d |d fdd	}n}t	||d|||d
dS )a  
    Create a block causal document mask for a batch of sequences, both packed and unpacked.
    Create Block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full block causal
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
    r   )valuepadN   c                    s@   ||k}| |f | |f k} | |f dk}||@ |@ }|S )z
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.

        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        r   r   )	batch_idxhead_idxq_idxkv_idxcausal_maskdocument_maskpadding_mask
final_mask)r-   document_idsr   r   causal_mask_mod   s
   z4make_flex_block_causal_mask.<locals>.causal_mask_modc                    s   | }| } | |||S r   r   )r5   r6   r7   r8   offset_q	offset_kv)r>   	kv_offsetq_offsetr   r   mask_mod   s   z-make_flex_block_causal_mask.<locals>.mask_modT)rC   BHQ_LENKV_LENdevice_compile)
shaper    nn
functionalr2   rH   clonefill_cumsumcreate_block_causal_mask_flex)	r-   r.   query_length
key_lengthr/   
batch_sizetotal_seq_lenrH   rC   r   )r-   r>   r=   rA   rB   r   make_flex_block_causal_maskU   s2   
rU   Fr   querykeyr1   c                 K   s   t | }|| ||fi |S r   )r   )rV   rW   r1   r   r   flex_attention_compiledr   r   r   compile_friendly_flex_attention   s   
	rY   hidden_statesn_repc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r3   N)rJ   expandreshape)rZ   r[   batchnum_key_value_headsslenhead_dimr   r   r   	repeat_kv   s
   0rb   moduleattention_maskscalingsoftcap	head_maskc                    s  d }	d  t |tr|}	n|  d ur% d d d d d d d |jd f   fdd}
d}|jd }||d @ dksXt||jd |jd  }t||jd |jd  }d}|dd }t||||
|	|||d| jd	
\}}||j}|	dd

 }||fS )Nc                    s^   d urt |   }  d ur|  | d | |  } d ur-| | | d d  } | S )Nr   )r    tanh)scorer5   r6   r7   r8   r9   rg   rf   r   r   	score_mod   s   z)flex_attention_forward.<locals>.score_modTr3   r   Fkernel_options)rl   
block_mask
enable_gqascalerm   
return_lser   r   )
isinstancer	   rJ   rb   getrY   r   todtype	transpose
contiguous)rc   rV   rW   r1   rd   re   rf   rg   r   rn   rl   ro   num_local_query_headsrm   attn_outputattention_weightsr   rk   r   flex_attention_forward   s<   
&	

r{   )NNNN)F)NNN)r)   typingr   r   r   r    	packagingr   utilsr   utils.import_utilsr   !torch.nn.attention.flex_attentionr	   r
   r   rP   r   TensorintOffsetrU   r*   r+   rY   rb   rK   Modulefloatr{   r   r   r   r   <module>   st    %
R
