o
    hDN                     @  sR  d dl mZ d dlmZ d dlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG d	d
 d
eZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%eddd/d"d#Z&ed$d	&d0d1d,d-Z'd.S )2    )annotations)	lru_cache)	getLogger   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   @  s<   e Zd ZdZdddZdd	d
ZdddZedddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterstrreturnboolc                 C     t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr    r$   i/var/www/html/construction_image-detection-poc/venv/lib/python3.10/site-packages/charset_normalizer/md.pyeligible&      zMessDetectorPlugin.eligibleNonec                 C  r   )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r    r"   r$   r$   r%   feed,   s   zMessDetectorPlugin.feedc                 C  r   )zB
        Permit to reset the plugin to the initial state.
        r    r#   r$   r$   r%   reset3   r'   zMessDetectorPlugin.resetfloatc                 C  r   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r    r*   r$   r$   r%   ratio9   s   zMessDetectorPlugin.ratioNr   r   r   r   r   r   r   r(   r   r(   r   r,   )	__name__
__module____qualname____doc__r&   r)   r+   propertyr-   r$   r$   r$   r%   r       s    


r   c                   @  B   e Zd ZdddZddd	Zdd
dZdddZedddZdS ) TooManySymbolOrPunctuationPluginr   r(   c                 C  s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr*   r$   r$   r%   __init__C   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r   r   r   c                 C     |  S Nisprintabler"   r$   r$   r%   r&   K      z)TooManySymbolOrPunctuationPlugin.eligiblec                 C  sp   |  j d7  _ || jkr3|tvr3t|r|  jd7  _n| du r3t|r3t|du r3|  jd7  _|| _d S )Nr   F   )	r;   r<   r   r   r9   isdigitr   r   r:   r"   r$   r$   r%   r)   N   s   

z%TooManySymbolOrPunctuationPlugin.feedc                 C  s   d| _ d| _d| _d S Nr   )r9   r;   r:   r*   r$   r$   r%   r+   `      
z&TooManySymbolOrPunctuationPlugin.resetr,   c                 C  s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           333333?)r;   r9   r:   )r#   ratio_of_punctuationr$   r$   r%   r-   e   s   

z&TooManySymbolOrPunctuationPlugin.ratioNr0   r.   r/   r1   	r2   r3   r4   r>   r&   r)   r+   r6   r-   r$   r$   r$   r%   r8   B   s    



r8   c                   @  r7   )TooManyAccentuatedPluginr   r(   c                 C     d| _ d| _d S rF   r;   _accentuated_countr*   r$   r$   r%   r>   r      
z!TooManyAccentuatedPlugin.__init__r   r   r   c                 C  r?   r@   )isalphar"   r$   r$   r%   r&   v   rC   z!TooManyAccentuatedPlugin.eligiblec                 C  ,   |  j d7  _ t|r|  jd7  _d S d S Nr   )r;   r	   rO   r"   r$   r$   r%   r)   y      zTooManyAccentuatedPlugin.feedc                 C  rM   rF   rN   r*   r$   r$   r%   r+      rP   zTooManyAccentuatedPlugin.resetr,   c                 C  s*   | j dk rdS | j| j  }|dkr|S dS )N   rH   gffffff?rN   )r#   ratio_of_accentuationr$   r$   r%   r-      s   
zTooManyAccentuatedPlugin.ratioNr0   r.   r/   r1   rK   r$   r$   r$   r%   rL   q   s    



rL   c                   @  r7   )UnprintablePluginr   r(   c                 C  rM   rF   )_unprintable_countr;   r*   r$   r$   r%   r>      rP   zUnprintablePlugin.__init__r   r   r   c                 C     dS NTr$   r"   r$   r$   r%   r&         zUnprintablePlugin.eligiblec                 C  s(   t |r|  jd7  _|  jd7  _d S rS   )r   rX   r;   r"   r$   r$   r%   r)      s   zUnprintablePlugin.feedc                 C  s
   d| _ d S rF   )rX   r*   r$   r$   r%   r+      s   
zUnprintablePlugin.resetr,   c                 C     | j dkrdS | jd | j  S )Nr   rH   rU   )r;   rX   r*   r$   r$   r%   r-         
zUnprintablePlugin.ratioNr0   r.   r/   r1   rK   r$   r$   r$   r%   rW      s    



rW   c                   @  r7   )SuspiciousDuplicateAccentPluginr   r(   c                 C     d| _ d| _d | _d S rF   _successive_countr;   _last_latin_characterr*   r$   r$   r%   r>      s   
z(SuspiciousDuplicateAccentPlugin.__init__r   r   r   c                 C  s   |  ot|S r@   )rQ   r   r"   r$   r$   r%   r&      s   z(SuspiciousDuplicateAccentPlugin.eligiblec                 C  st   |  j d7  _ | jd ur5t|r5t| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _d S rS   )r;   rb   r	   isupperra   r   r"   r$   r$   r%   r)      s   

z$SuspiciousDuplicateAccentPlugin.feedc                 C  r_   rF   r`   r*   r$   r$   r%   r+      rG   z%SuspiciousDuplicateAccentPlugin.resetr,   c                 C  r\   )Nr   rH   rD   )r;   ra   r*   r$   r$   r%   r-      r]   z%SuspiciousDuplicateAccentPlugin.ratioNr0   r.   r/   r1   rK   r$   r$   r$   r%   r^      s    



r^   c                   @  r7   )SuspiciousRanger   r(   c                 C  r_   rF   )"_suspicious_successive_range_countr;   _last_printable_seenr*   r$   r$   r%   r>      rG   zSuspiciousRange.__init__r   r   r   c                 C  r?   r@   rA   r"   r$   r$   r%   r&      rC   zSuspiciousRange.eligiblec                 C  sx   |  j d7  _ | st|s|tv rd | _d S | jd u r"|| _d S t| j}t|}t||r7|  jd7  _|| _d S rS   )r;   isspacer   r   rf   r    is_suspiciously_successive_rangere   )r#   r   unicode_range_aunicode_range_br$   r$   r%   r)      s    



zSuspiciousRange.feedc                 C  r_   rF   )r;   re   rf   r*   r$   r$   r%   r+      rG   zSuspiciousRange.resetr,   c                 C  s"   | j dkrdS | jd | j  }|S )N   rH   rD   )r;   re   )r#   ratio_of_suspicious_range_usager$   r$   r%   r-      s   
zSuspiciousRange.ratioNr0   r.   r/   r1   rK   r$   r$   r$   r%   rd      s    



rd   c                   @  r7   )SuperWeirdWordPluginr   r(   c                 C  s@   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d| _	d S )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr;   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr*   r$   r$   r%   r>      s   
zSuperWeirdWordPlugin.__init__r   r   r   c                 C  rY   rZ   r$   r"   r$   r$   r%   r&   
  r[   zSuperWeirdWordPlugin.eligiblec                 C  s  |  rc|  j|7  _t|r|  jd7  _| jdu rFt|du s%t|rFt|du rFt|du rFt|du rFt	|du rFt
|du rFd| _t|sZt|sZt|sZt	|sZt
|ra|  jd7  _d S | jshd S | sut|sut|r$| jr$|  jd7  _t| j}|  j|7  _|dkr| j| dkrd| _n4t| jd r| jd  rtdd | jD du r|  jd7  _d| _n| jdkrd| _|  jd7  _|d	kr| jrd
d t| jtd|D }d}|rt|| dkrd}|s|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d| _d| _d S |dvrA| du rCt|rEd| _|  j|7  _d S d S d S d S )Nr   FT   g      ?c                 s  s    | ]}|  V  qd S r@   rc   ).0_r$   r$   r%   	<genexpr>7  s    z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>   c                 S  s   g | ]
\}}|  r|qS r$   rz   )r{   cir$   r$   r%   
<listcomp>?  s    z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>r   rI   rn   >   r|   -<=>|~)rQ   ru   r	   rv   rs   r   r   r   r   r   r   rw   rg   r   r   ro   lenr;   rr   rc   allrq   ziprangerp   rt   rE   r   )r#   r   buffer_lengthcamel_case_dstprobable_camel_casedr$   r$   r%   r)     s   




zSuperWeirdWordPlugin.feedc                 C  s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nrn   Fr   )ru   rr   rs   rp   ro   r;   rt   rq   r*   r$   r$   r%   r+   ^  s   
zSuperWeirdWordPlugin.resetr,   c                 C  s$   | j dkr| jdkrdS | j| j S )N
   r   rH   )ro   rq   rt   r;   r*   r$   r$   r%   r-   h  s   zSuperWeirdWordPlugin.ratioNr0   r.   r/   r1   rK   r$   r$   r$   r%   rm      s    



Q
rm   c                   @  sF   e Zd ZdZdddZdd	d
ZdddZdddZedddZ	dS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   r(   c                 C  rM   rF   _wrong_stop_count_cjk_character_countr*   r$   r$   r%   r>   v  rP   zCjkInvalidStopPlugin.__init__r   r   r   c                 C  rY   rZ   r$   r"   r$   r$   r%   r&   z  r[   zCjkInvalidStopPlugin.eligiblec                 C  s8   |dv r|  j d7  _ d S t|r|  jd7  _d S d S )N>      丄   丅r   )r   r   r   r"   r$   r$   r%   r)   }  s   zCjkInvalidStopPlugin.feedc                 C  rM   rF   r   r*   r$   r$   r%   r+     rP   zCjkInvalidStopPlugin.resetr,   c                 C  s   | j dk rdS | j| j  S )N   rH   )r   r   r*   r$   r$   r%   r-     s   
zCjkInvalidStopPlugin.ratioNr0   r.   r/   r1   )
r2   r3   r4   r5   r>   r&   r)   r+   r6   r-   r$   r$   r$   r%   r   p  s    



r   c                   @  r7   )ArchaicUpperLowerPluginr   r(   c                 C  s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr;   _last_alpha_seen_current_ascii_onlyr*   r$   r$   r%   r>     s   
z ArchaicUpperLowerPlugin.__init__r   r   r   c                 C  rY   rZ   r$   r"   r$   r$   r%   r&     r[   z ArchaicUpperLowerPlugin.eligiblec                 C  s$  |  ot|}|du }|rC| jdkrC| jdkr+| du r+| jdu r+|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rQ|
 du rQd| _| jd ur| r_| j sh| r|| j r|| jdu rx|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   TrD   )rQ   r   r   rE   r   r   r   r   r   r;   isasciirc   islower)r#   r   is_concerned	chunk_sepr$   r$   r%   r)     s@   




zArchaicUpperLowerPlugin.feedc                 C  s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r;   r   r   r   r   r   r   r*   r$   r$   r%   r+     s   
zArchaicUpperLowerPlugin.resetr,   c                 C  s   | j dkrdS | j| j  S )Nr   rH   )r;   r   r*   r$   r$   r%   r-     s   
zArchaicUpperLowerPlugin.ratioNr0   r.   r/   r1   rK   r$   r$   r$   r%   r     s    



*	r   c                   @  sB   e Zd ZdddZdddZdd
dZdddZedddZdS )ArabicIsolatedFormPluginr   r(   c                 C  rM   rF   r;   _isolated_form_countr*   r$   r$   r%   r>     rP   z!ArabicIsolatedFormPlugin.__init__c                 C  rM   rF   r   r*   r$   r$   r%   r+     rP   zArabicIsolatedFormPlugin.resetr   r   r   c                 C  s   t |S r@   )r
   r"   r$   r$   r%   r&     rC   z!ArabicIsolatedFormPlugin.eligiblec                 C  rR   rS   )r;   r   r   r"   r$   r$   r%   r)     rT   zArabicIsolatedFormPlugin.feedr,   c                 C  s   | j dk rdS | j| j  }|S )NrU   rH   r   )r#   isolated_form_usager$   r$   r%   r-     s   
zArabicIsolatedFormPlugin.ratioNr0   r.   r/   r1   )	r2   r3   r4   r>   r+   r&   r)   r6   r-   r$   r$   r$   r%   r     s    



r      )maxsizeri   
str | Nonerj   r   r   c                 C  sv  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS | dks|dkrdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr   )ri   rj   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charsr$   r$   r%   rh     sZ   rh   i   皙?Fdecoded_sequencer   maximum_thresholdr,   debugc              	   C  sR  dd t  D }t| d }d}|dk rd}n	|dkrd}nd	}t| d
 t|D ]2\}}|D ]}	|	|r<|	| q0|dkrG|| dksM||d kr\tdd |D }||kr\ nq*|rtd}
|
	t
d| d| d|  t| dkr|
	t
d| dd   |
	t
d| dd   |D ]}|
	t
|j d|j  qt|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S  s   g | ]}| qS r$   r$   )r{   md_classr$   r$   r%   r   I  s    zmess_ratio.<locals>.<listcomp>r   rH   i       r   r      
r   c                 s  s    | ]}|j V  qd S r@   )r-   )r{   dtr$   r$   r%   r}   `  s    zmess_ratio.<locals>.<genexpr>charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r&   r)   sumr   logr   	__class__r-   round)r   r   r   	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   r$   r$   r%   
mess_ratioA  sN   


r   N)ri   r   rj   r   r   r   )r   F)r   r   r   r,   r   r   r   r,   )(
__future__r   	functoolsr   loggingr   constantr   r   r   utilsr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r8   rL   rW   r^   rd   rm   r   r   r   rh   r   r$   r$   r$   r%   <module>   s(    L"/%1vLI