import torch
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from .module_tracker import ModuleTracker
from typing import Any, Optional, Union, TypeVar, Callable
from collections.abc import Iterator
from typing_extensions import ParamSpec
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from math import prod
from functools import wraps
import warnings

__all__ = ["FlopCounterMode", "register_flop_formula"]

_T = TypeVar("_T")
_P = ParamSpec("_P")

aten = torch.ops.aten


def get_shape(i):
    if isinstance(i, torch.Tensor):
        return i.shape
    return i


flop_registry: dict[Any, Any] = {}


def shape_wrapper(f):
    @wraps(f)
    def nf(*args, out_val=None, **kwargs):
        args, kwargs, out_shape = tree_map(get_shape, (args, kwargs, out_val))
        return f(*args, out_shape=out_shape, **kwargs)
    return nf


def register_flop_formula(targets, get_raw=False) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
    def register_fun(flop_formula: Callable[_P, _T]) -> Callable[_P, _T]:
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)

        def register(target):
            if not isinstance(target, torch._ops.OpOverloadPacket):
                raise ValueError(
                    "register_flop_formula(targets): expected each target to be "
                    f"OpOverloadPacket (i.e. torch.ops.mylib.foo), got {target} "
                    f"which is of type {type(target)}")
            if target in flop_registry:
                raise RuntimeError(f"duplicate registrations for {target}")
            flop_registry[target] = flop_formula

        # Allow registering several aten ops at once.
        torch.utils._pytree.tree_map_(register, targets)
        return flop_formula
    return register_fun


@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    # Inputs contain the shapes of the two matrices.
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    # One multiply and one add per inner-product element.
    return m * n * 2 * k


@register_flop_formula(aten.addmm)
def addmm_flop(self_shape, a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for addmm."""
    return mm_flop(a_shape, b_shape)


@register_flop_formula(aten.bmm)
def bmm_flop(a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for the bmm operation."""
    b, m, k = a_shape
    b2, k2, n = b_shape
    assert b == b2
    assert k == k2
    flop = b * m * n * 2 * k
    return flop


@register_flop_formula(aten.baddbmm)
def baddbmm_flop(self_shape, a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for the baddbmm operation."""
    return bmm_flop(a_shape, b_shape)


@register_flop_formula(aten._scaled_mm)
def _scaled_mm_flop(
    a_shape,
    b_shape,
    scale_a_shape,
    scale_b_shape,
    bias_shape=None,
    scale_result_shape=None,
    out_dtype=None,
    use_fast_accum=False,
    *args,
    out_shape=None,
    **kwargs,
) -> int:
    """Count flops for _scaled_mm."""
    return mm_flop(a_shape, b_shape)


def conv_flop_count(
    x_shape: list[int],
    w_shape: list[int],
    out_shape: list[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note that only multiplications are counted; computation for bias is ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).

    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed

    Returns:
        int: the number of flops
    """
    batch_size = x_shape[0]
    # For a regular conv the filter is applied once per output spatial location;
    # for a transposed conv it is applied once per input spatial location.
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape
    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop


@register_flop_formula([aten.convolution, aten._convolution])
def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed,
              *args, out_shape=None, **kwargs) -> int:
    """Count flops for convolution."""
    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)


@register_flop_formula(aten.convolution_backward)
def conv_backward_flop(
    grad_out_shape,
    x_shape,
    w_shape,
    _bias,
    _stride,
    _padding,
    _dilation,
    transposed,
    _output_padding,
    _groups,
    output_mask,
    out_shape,
) -> int:
    def t(shape):
        # Swap the first two dims (in/out channels) of a weight-like shape.
        return [shape[1], shape[0]] + list(shape[2:])

    flop_count = 0
    # grad_input is computed with a convolution of the "opposite" transposedness.
    if output_mask[0]:
        grad_input_shape = get_shape(out_shape[0])
        flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not transposed)
    # grad_weight is always computed with a regular (non-transposed) convolution.
    if output_mask[1]:
        grad_weight_shape = get_shape(out_shape[1])
        if transposed:
            flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)
        else:
            flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)

    return flop_count


def sdpa_flop_count(query_shape, key_shape, value_shape):
    """
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    """
    b, h, s_q, d_q = query_shape
    _b2, _h2, s_k, _d2 = key_shape
    _b3, _h3, _s3, d_v = value_shape
    assert b == _b2 == _b3 and h == _h2 == _h3 and d_q == _d2 and s_k == _s3
    total_flops = 0
    # q @ k.T: [b, h, s_q, d_q] x [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))
    # scores @ v: [b, h, s_q, s_k] x [b, h, s_k, d_v] -> out: [b, h, s_q, d_v]
    total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_v))
    return total_flops


@register_flop_formula([
    aten._scaled_dot_product_efficient_attention,
    aten._scaled_dot_product_flash_attention,
    aten._scaled_dot_product_cudnn_attention,
])
def sdpa_flop(query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention."""
    return sdpa_flop_count(query_shape, key_shape, value_shape)


def _offsets_to_lengths(offsets, max_len):
    """
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    """
    from torch._subclasses.fake_tensor import FakeTensor
    from torch._subclasses.functional_tensor import FunctionalTensor
    if not isinstance(offsets, (FakeTensor, FunctionalTensor)) and offsets.device.type != "meta":
        return offsets.diff().tolist()
    return [max_len] * (offsets.size(0) - 1)


def _unpack_flash_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
) -> Iterator[tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], Optional[tuple[int, ...]]]]:
    """
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cum_seq_q is not None:
        # Jagged inputs have shape (sum(seq_len), heads, dim); treat each batch
        # element as its own (1, heads, seq_len, dim) dense attention problem.
        assert len(key.shape) == 3
        assert len(value.shape) == 3
        assert grad_out is None or grad_out.shape == query.shape
        _, h_q, d_q = query.shape
        _, h_k, d_k = key.shape
        _, h_v, d_v = value.shape
        assert cum_seq_q is not None
        assert cum_seq_k is not None
        assert cum_seq_q.shape == cum_seq_k.shape
        seq_q_lengths = _offsets_to_lengths(cum_seq_q, max_q)
        seq_k_lengths = _offsets_to_lengths(cum_seq_k, max_k)
        for seq_q_len, seq_k_len in zip(seq_q_lengths, seq_k_lengths):
            new_query_shape = (1, h_q, seq_q_len, d_q)
            new_key_shape = (1, h_k, seq_k_len, d_k)
            new_value_shape = (1, h_v, seq_k_len, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


def _unpack_efficient_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
) -> Iterator[tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], Optional[tuple[int, ...]]]]:
    """
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cu_seqlens_q is not None:
        assert len(key.shape) == 4
        assert len(value.shape) == 4
        assert grad_out is None or grad_out.shape == query.shape
        _, _, h_q, d_q = query.shape
        _, _, h_k, d_k = key.shape
        _, _, h_v, d_v = value.shape
        assert cu_seqlens_q is not None
        assert cu_seqlens_k is not None
        assert cu_seqlens_q.shape == cu_seqlens_k.shape
        seqlens_q = _offsets_to_lengths(cu_seqlens_q, max_seqlen_q)
        seqlens_k = _offsets_to_lengths(cu_seqlens_k, max_seqlen_k)
        for len_q, len_k in zip(seqlens_q, seqlens_k):
            new_query_shape = (1, h_q, len_q, d_q)
            new_key_shape = (1, h_k, len_k, d_k)
            new_value_shape = (1, h_v, len_k, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


@register_flop_formula(aten._flash_attention_forward, get_raw=True)
def _flash_attention_forward_flop(
    query,
    key,
    value,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    out_shape=None,
    **kwargs,
) -> int:
    """Count flops for self-attention."""
    sizes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_flop_count(query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, _ in sizes
    )


@register_flop_formula(aten._efficient_attention_forward, get_raw=True)
def _efficient_attention_forward_flop(
    query,
    key,
    value,
    bias,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    """Count flops for self-attention."""
    sizes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_flop_count(query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, _ in sizes
    )


def sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape):
    total_flops = 0
    b, h, s_q, d_q = query_shape
    _b2, _h2, s_k, _d2 = key_shape
    _b3, _h3, _s3, d_v = value_shape
    _b4, _h4, _s4, _d4 = grad_out_shape
    assert b == _b2 == _b3 == _b4 and h == _h2 == _h3 == _h4 and d_q == _d2
    assert d_v == _d4 and s_k == _s3 and s_q == _s4
    # Step 1: recompute the scores matrix.
    # q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))
    # Step 2: propagate the gradients through scores @ v.
    # grad_out: [b, h, s_q, d_v] @ v: [b, h, d_v, s_k] -> grad_scores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_v), (b * h, d_v, s_k))
    # scores: [b, h, s_k, s_q] @ grad_out: [b, h, s_q, d_v] -> grad_v: [b, h, s_k, d_v]
    total_flops += bmm_flop((b * h, s_k, s_q), (b * h, s_q, d_v))
    # Step 3: propagate the gradients through q @ k.T.
    # grad_scores: [b, h, s_q, s_k] @ k: [b, h, s_k, d_q] -> grad_q: [b, h, s_q, d_q]
    total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_q))
    # q: [b, h, d_q, s_q] @ grad_scores: [b, h, s_q, s_k] -> grad_k: [b, h, d_q, s_k]
    total_flops += bmm_flop((b * h, d_q, s_q), (b * h, s_q, s_k))
    return total_flops


@register_flop_formula([
    aten._scaled_dot_product_efficient_attention_backward,
    aten._scaled_dot_product_flash_attention_backward,
    aten._scaled_dot_product_cudnn_attention_backward,
])
def sdpa_backward_flop(grad_out_shape, query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention backward."""
    return sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)


@register_flop_formula(aten._flash_attention_backward, get_raw=True)
def _flash_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    out,
    logsumexp,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


@register_flop_formula(aten._efficient_attention_backward, get_raw=True)
def _efficient_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    bias,
    out,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


flop_registry = {
    aten.mm: mm_flop,
    aten.addmm: addmm_flop,
    aten.bmm: bmm_flop,
    aten.baddbmm: baddbmm_flop,
    aten._scaled_mm: _scaled_mm_flop,
    aten.convolution: conv_flop,
    aten._convolution: conv_flop,
    aten.convolution_backward: conv_backward_flop,
    aten._scaled_dot_product_efficient_attention: sdpa_flop,
    aten._scaled_dot_product_flash_attention: sdpa_flop,
    aten._scaled_dot_product_cudnn_attention: sdpa_flop,
    aten._scaled_dot_product_efficient_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_flash_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_cudnn_attention_backward: sdpa_backward_flop,
    aten._flash_attention_forward: _flash_attention_forward_flop,
    aten._efficient_attention_forward: _efficient_attention_forward_flop,
    aten._flash_attention_backward: _flash_attention_backward_flop,
    aten._efficient_attention_backward: _efficient_attention_backward_flop,
}


def normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# Suffixes for different orders of magnitude of flops.
suffixes = ["", "K", "M", "B", "T"]


def get_suffix_str(number):
    # Pick the suffix index based on the number of digits.
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]


def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000^index and format it to three decimal places.
    value = f"{number / 1000 ** index:.3f}"
    return value + suffixes[index]


def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"


def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf


class FlopCounterMode:
    """
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    """

    def __init__(
            self,
            mods: Optional[Union[torch.nn.Module, list[torch.nn.Module]]] = None,
            depth: int = 2,
            display: bool = True,
            custom_mapping: Optional[dict[Any, Any]] = None):
        super().__init__()
        self.flop_counts: dict[str, dict[Any, int]] = defaultdict(lambda: defaultdict(int))
        self.depth = depth
        self.display = display
        self.mode: Optional[_FlopCounterMode] = None
        if custom_mapping is None:
            custom_mapping = {}
        if mods is not None:
            warnings.warn("mods argument is not needed anymore, you can stop passing it", stacklevel=2)
        self.flop_registry = {
            **flop_registry,
            **{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}
        }
        self.mod_tracker = ModuleTracker()

    def get_total_flops(self) -> int:
        return sum(self.flop_counts['Global'].values())

    def get_flop_counts(self) -> dict[str, dict[Any, int]]:
        """Return the flop counts as a dictionary of dictionaries.

        The outer dictionary is keyed by module name, and the inner dictionary
        is keyed by operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        """
        return {k: dict(v) for k, v in self.flop_counts.items()}

    def get_table(self, depth=None):
        if depth is None:
            depth = self.depth
        if depth is None:
            depth = 999999

        import tabulate
        tabulate.PRESERVE_WHITESPACE = True
        header = ["Module", "FLOP", "% Total"]
        values = []
        global_flops = self.get_total_flops()
        global_suffix = get_suffix_str(global_flops)
        is_global_subsumed = False

        def process_mod(mod_name, depth):
            nonlocal is_global_subsumed

            total_flops = sum(self.flop_counts[mod_name].values())

            is_global_subsumed |= total_flops >= global_flops

            padding = " " * depth
            values = []
            values.append([
                padding + mod_name,
                convert_num_with_suffix(total_flops, global_suffix),
                convert_to_percent_str(total_flops, global_flops)
            ])
            for k, v in self.flop_counts[mod_name].items():
                values.append([
                    padding + " - " + str(k),
                    convert_num_with_suffix(v, global_suffix),
                    convert_to_percent_str(v, global_flops)
                ])
            return values

        for mod in sorted(self.flop_counts.keys()):
            if mod == 'Global':
                continue
            mod_depth = mod.count(".") + 1
            if mod_depth > depth:
                continue

            cur_values = process_mod(mod, mod_depth - 1)
            values.extend(cur_values)

        # Only output the "Global" row if there are flops that are not already
        # fully attributed to some module.
        if 'Global' in self.flop_counts and not is_global_subsumed:
            for value in values:
                value[0] = " " + value[0]

            values = process_mod('Global', 0) + values

        if len(values) == 0:
            values = [["Global", "0", "0%"]]

        return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))

    def __enter__(self):
        self.flop_counts.clear()
        self.mod_tracker.__enter__()
        self.mode = _FlopCounterMode(self)
        self.mode.__enter__()
        return self

    def __exit__(self, *args):
        assert self.mode is not None
        b = self.mode.__exit__(*args)
        self.mode = None  # break cycles
        self.mod_tracker.__exit__()
        if self.display:
            print(self.get_table(self.depth))
        return b

    def _count_flops(self, func_packet, out, args, kwargs):
        if func_packet in self.flop_registry:
            flop_count_func = self.flop_registry[func_packet]
            flop_count = flop_count_func(*args, **kwargs, out_val=out)
            for par in set(self.mod_tracker.parents):
                self.flop_counts[par][func_packet] += flop_count

        return out


class _FlopCounterMode(TorchDispatchMode):
    def __init__(self, counter: FlopCounterMode):
        self.counter = counter

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs if kwargs else {}

        # Skip metadata queries from non-standard dispatch_sizes_strides_policy
        # subclasses (e.g. NestedTensor).
        if func in {torch.ops.aten.is_contiguous.default,
                    torch.ops.aten.is_contiguous.memory_format,
                    torch.ops.aten.is_strides_like_format.default,
                    torch.ops.aten.is_non_overlapping_and_dense.default,
                    torch.ops.aten.size.default,
                    torch.ops.aten.sym_size.default,
                    torch.ops.aten.stride.default,
                    torch.ops.aten.sym_stride.default,
                    torch.ops.aten.storage_offset.default,
                    torch.ops.aten.sym_storage_offset.default,
                    torch.ops.aten.numel.default,
                    torch.ops.aten.sym_numel.default,
                    torch.ops.aten.dim.default,
                    torch.ops.prim.layout.default}:

            return NotImplemented

        # If func has no registered formula, see if it decomposes into ops that do.
        if func not in self.counter.flop_registry and func is not torch.ops.aten._unsafe_view.default:
            with self:
                out = func.decompose(*args, **kwargs)
                if out is not NotImplemented:
                    return out

        # No further decomposition; execute the op and count its flops.
        out = func(*args, **kwargs)
        return self.counter._count_flops(func._overloadpacket, out, args, kwargs)
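

# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the module itself: it exercises the API
# defined above the way user code would after
# `from torch.utils.flop_counter import FlopCounterMode`. The tiny model and
# the choice of aten.tanh for the custom formula are illustrative assumptions.
# Passing display=True (or calling get_table()) additionally prints a
# per-module table, which relies on the third-party `tabulate` package.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, 10))
    inp = torch.randn(32, 64)

    # Count forward + backward flops of the model under the mode.
    with FlopCounterMode(display=False) as flop_counter:
        model(inp).sum().backward()
    print("total flops:", flop_counter.get_total_flops())
    print("per-module counts:", flop_counter.get_flop_counts())

    # Hypothetical custom formula: count aten.tanh as zero flops. Shape-based
    # formulas receive argument shapes plus an `out_shape` keyword argument.
    def tanh_flop(x_shape, *args, out_shape=None, **kwargs) -> int:
        return 0

    with FlopCounterMode(custom_mapping={torch.ops.aten.tanh: tanh_flop}, display=False) as fc:
        torch.randn(8, 8).tanh()
    print("tanh flops:", fc.get_total_flops())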