a
    [XhLb                     @   s  d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	 d dl
Z
ddlmZmZmZ edd eD Zed	d eD Zed
d e	D ZeeddgB ZedZh dZedZi Zdd ZG dd dZG dd deZG dd deZG dd dZG dd dZdd Z dS )    N)BytesIOStringIO)Path)ascii_lettersascii_uppercase   )EOFReparseErrorspace_charactersc                 c   s   | ]}|  V  qd S Nencode.0item r   Z/var/www/viveiro_mudafortebrasil/venv/lib/python3.9/site-packages/tinyhtml5/inputstream.py	<genexpr>       r   c                 c   s   | ]}|  V  qd S r   r   r   r   r   r   r      r   c                 c   s   | ]}|  V  qd S r   r   r   r   r   r   r      r      >   <u   [---﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿-]>    i
 i i i i i i i i i i i i i	 i i i	 i i i i i i i i i i i
 i i i i z[	- -/:-@\[-`{-~]c                 K   s   t | tr:t| dk r:t|  r:tt|  fi |S t | trXt|  fi |S t t| drn| dn| trt| fi |S t	| fi |S d S )N   readr   )

isinstancestrlenr   is_fileHTMLUnicodeInputStream	read_texthasattrr   HTMLBinaryInputStream)sourcekwargsr   r   r   HTMLInputStream&   s    "
r#   c                   @   sZ   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dddZdd ZdS )r   zProvides a Unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    c                 C   s.   dg| _ tddf| _| || _|   dS )a  Initialise the HTMLInputStream.

        Create a normalized stream from source for use by tinyhtml5.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element).

        r   utf-8certainN)	new_lineslookup_encodingencodingopen_streamstreamresetselfr!   r   r   r   __init__9   s    zHTMLUnicodeInputStream.__init__c                 C   s.   d| _ d| _d| _g | _d| _d| _d | _d S )N r   )chunk
chunk_sizechunk_offseterrorsprevious_number_linesprevious_number_columns_buffered_characterr-   r   r   r   r+   N   s    zHTMLUnicodeInputStream.resetc                 C   s   t |dr|S t|S )zuProduce a file object from source.

        source can be either a file object, local filename or a string.

        r   )r   r   r,   r   r   r   r)   \   s    z"HTMLUnicodeInputStream.open_streamc                 C   sT   | j }|dd|}| j| }|dd|}|dkr@| j| }n||d  }||fS )N
r   r   )r0   countr4   rfindr5   )r-   offsetr0   Znumber_linesZposition_lineZlast_line_positionZposition_columnr   r   r   	_positiond   s    
z HTMLUnicodeInputStream._positionc                 C   s   |  | j\}}|d |fS )z9Return (line, col) of the current position in the stream.r   )r=   r2   )r-   linecolumnr   r   r   positiono   s    zHTMLUnicodeInputStream.positionc                 C   s6   | j | jkr|  stS | j }| j| }|d | _ |S )zlRead one character from the stream or queue if available.

        Return EOF when EOF is reached.

        r   )r2   r1   
read_chunkr   r0   )r-   r2   	characterr   r   r   rB   t   s    

z HTMLUnicodeInputStream.characterc                 C   s   |  | j\| _| _d| _d| _d| _| jd}| jrJ| j| }d | _n|sRdS t	|dkrt
|d }|dksd|  krd	krn n|d | _|d d }tt	t|D ]}| jd
 q|dd}|dd}|| _t	|| _dS )Nr/   r   i (  Fr   r9      i   i  zinvalid-codepointz
r8   T)r=   r1   r4   r5   r0   r2   r*   r   r6   r   ordrangeinvalid_unicode_refindallr3   appendreplace)r-   datalast_r   r   r   rA      s0    
 

z!HTMLUnicodeInputStream.read_chunkFc                 C   s   zt ||f }W nV tyf   ddd |D }|s@d| }td| d}| }t ||f< Y n0 g }|| j| j}|du r| j| jkrqn0|	 }|| jkr|
| j| j|  || _q|
| j| jd  |  slqqld|S )a   Return a string of characters from the stream.

        String goes up to but does not include any character in 'characters' or
        EOF. 'characters' must be a container that supports the 'in' method and
        iteration over its characters.

        r/   c                 S   s   g | ]}d t |dqS )z\xZ02x)rE   )r   rB   r   r   r   
<listcomp>   r   z6HTMLUnicodeInputStream.chars_until.<locals>.<listcomp>^[z]+N)characters_until_regexKeyErrorjoinrecompilematchr0   r2   r1   endrI   rA   )r-   
charactersoppositeregexresultrV   rW   r   r   r   chars_until   s,    


z"HTMLUnicodeInputStream.chars_untilc                 C   sT   |t urP| jdkr.|| j | _|  jd7  _n"|  jd8  _| j| j |ksPJ d S )Nr   r   )r   r2   r0   r1   )r-   charr   r   r   unget   s    
zHTMLUnicodeInputStream.ungetN)F)__name__
__module____qualname____doc__r.   r+   r)   r=   r@   rB   rA   r\   r^   r   r   r   r   r   1   s   %
.r   c                       sR   e Zd ZdZdddZ fddZdd	 Zd
d Zdd Zdd Z	dd Z
  ZS )r    zProvide a binary stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    Nwindows-1252c                 C   sX   |  || _d| _|| _|| _|| _|| _|| _|  | _	| j	d d usLJ | 
  d S )Ni   r   )r)   
raw_streamnumber_bytes_metaoverride_encodingtransport_encodingsame_origin_parent_encodinglikely_encodingdefault_encodingdetermine_encodingr(   r+   )r-   r!   rf   rg   rh   ri   rj   r   r   r   r.      s    
zHTMLBinaryInputStream.__init__c                    s*   | j d jj}|| jd| _t   d S )Nr   rJ   )r(   
codec_infostreamreaderrd   r*   superr+   )r-   rm   	__class__r   r   r+   	  s    zHTMLBinaryInputStream.resetc                 C   s0   t |dr(t |dr | r |S | }t|S )Nr   seekable)r   rq   r   r   r,   r   r   r   r)     s
    
z!HTMLBinaryInputStream.open_streamc                 C   s   |   df}|d d ur|S t| jdf}|d d ur:|S t| jdf}|d d urX|S |  df}|d d urt|S t| jdf}|d d ur|d jds|S t| jdf}|d d ur|S t| j	df}|d d ur|S tddfS )Nr%   r   	tentativezutf-16rc   )

detect_bomr'   rf   rg   detect_encoding_metarh   name
startswithri   rj   )r-   r(   r   r   r   rk     s,    z(HTMLBinaryInputStream.determine_encodingc                 C   s   | j d dksJ t| }d u r&d S |jdv rFtd}|d usJ nX|| j d krf| j d df| _ n8| jd |df| _ |   td| j d  d| d S )Nr   r%   utf-16beutf-16ler$   r   zEncoding changed from z to )r(   r'   ru   rd   seekr+   r	   )r-   Znew_encodingr   r   r   change_encoding=  s    

z%HTMLBinaryInputStream.change_encodingc              
   C   s   t jdt jdt jdt jdt jdi}| jd}t|t	s<J dD ]2}|
|d|  }r@| j| t|  S q@| jd	 dS )
zAttempt to detect at BOM at the start of the stream.

        If an encoding can be determined from the BOM return the name of the
        encoding otherwise return None.

        r$   ry   rx   zutf-32lezutf-32be   )   r|      Nr   )codecsBOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BErd   r   r   bytesgetrz   r'   )r-   Zbomsstringrz   r(   r   r   r   rs   M  s    	z HTMLBinaryInputStream.detect_bomc                 C   sV   | j | j}t|tsJ t|}| j d | }|durR|jdv rRt	d}|S )z1Report the encoding declared by the meta element.r   Nrw   r$   )
rd   r   re   r   r   EncodingParserrz   get_encodingru   r'   )r-   bufferparserr(   r   r   r   rt   j  s    z*HTMLBinaryInputStream.detect_encoding_meta)NNNNrc   )r_   r`   ra   rb   r.   r+   r)   rk   r{   rs   rt   __classcell__r   r   ro   r   r       s      
(r    c                   @   sz   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
e	eZe
dd ZefddZdd Zdd Zdd ZdS )EncodingByteszBytes-like object with an associated position and various extra methods.

    If the position is ever greater than the string length then an exception is
    raised.

    c                 C   s   t |tsJ t| | S r   )r   r   __new__lower)clsvaluer   r   r   r     s    zEncodingBytes.__new__c                 C   s
   d| _ d S )Nr9   r=   )r-   r   r   r   r   r.     s    zEncodingBytes.__init__c                 C   s0   | j d  }| _ |t| kr t| ||d  S Nr   r=   r   StopIterationr-   r@   r   r   r   __next__  s    zEncodingBytes.__next__c                 C   s    | j d  | _ }| ||d  S r   r   r   r   r   r   previous  s    zEncodingBytes.previousc                 C   s"   | j t| krttd|| _ d S Nr   )r=   r   r   maxr   r   r   r   set_position  s    zEncodingBytes.set_positionc                 C   s&   | j t| krt| j dkr"| j S d S r   r   r7   r   r   r   get_position  s    
zEncodingBytes.get_positionc                 C   s   | | j | j d  S r   )r@   r7   r   r   r   current_byte  s    zEncodingBytes.current_bytec                 C   sH   | j }|t| k r>| ||d  }||vr4|| _|S |d7 }q|| _dS )zSkip past a list of characters.r   Nr@   r   r=   r-   rX   r@   rB   r   r   r   skip  s    
zEncodingBytes.skipc                 C   sH   | j }|t| k r>| ||d  }||v r4|| _|S |d7 }q|| _d S r   r   r   r   r   r   
skip_until  s    
zEncodingBytes.skip_untilc                 C   s(   |  || j }r$|  jt|7  _|S )zLook for a sequence of bytes at the start of a string.

        If the bytes are found return True and advance the position to the byte
        after the match. Otherwise return False and leave the position alone.

        )rv   r@   r   )r-   r   r[   r   r   r   match_bytes  s    zEncodingBytes.match_bytesc                 C   s<   z |  || jt| d | _W n ty6   tY n0 dS )zLook for the next sequence of bytes matching a given sequence.

        If a match is found advance the position to the last byte of the match.

        r   T)indexr@   r   r=   
ValueErrorr   )r-   r   r   r   r   jump_to  s
     
zEncodingBytes.jump_toN)r_   r`   ra   rb   r   r.   r   r   r   r   propertyr@   r   space_characters_bytesr   r   r   r   r   r   r   r   r   x  s   

r   c                   @   sX   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd ZdS )r   z@Mini parser for detecting character encoding from meta elements.c                 C   s   t || _d | _d S r   )r   rK   r(   r-   rK   r   r   r   r.     s    
zEncodingParser.__init__c              
   C   s   d| j vrd S | j| j| j| j| j| jd}| j D ]}d}z| j d W n tyb   Y  qY n0 | D ]B\}}| j 	|rlz| }W  qW ql ty   d}Y  qY ql0 ql|s2 qq2| j
S )N   <meta)s   <!--r   s   </s   <!s   <?r   Tr   F)rK   handle_commenthandle_metahandle_possible_end_taghandle_otherhandle_possible_start_tagr   r   itemsr   r(   )r-   Zmethod_dispatchrM   Zkeep_parsingkeymethodr   r   r   r     s4    


zEncodingParser.get_encodingc                 C   s   | j dS )zSkip over comments.s   -->rK   r   r7   r   r   r   r     s    zEncodingParser.handle_commentc                 C   s   | j jtvrdS d}d }|   }d u r,dS |d dkr\|d dk}|r|d ur|| _dS q|d dkr|d }t|}|d ur|| _dS q|d dkrtt|d }|  }d urt|}|d ur|r|| _dS |}qd S )	NTFr   s
   http-equivr   s   content-type   charsets   content)	rK   r   r   get_attributer(   r'   ContentAttributeParserr   parse)r-   Z
has_pragmaZpending_encoding	attributeZtentative_encodingcodecZcontent_parserr   r   r   r     s4    zEncodingParser.handle_metac                 C   s   | j ddS )NFend_tag)handle_possible_tagr7   r   r   r   r     s    z(EncodingParser.handle_possible_start_tagc                 C   s   t | j | jddS )NTr   )nextrK   r   r7   r   r   r   r     s    
z&EncodingParser.handle_possible_end_tagc                 C   sX   | j }|jtvr(|r$|  |   dS |t}|dkrD|  n|  d u rDqTqDdS )NTr   )rK   r   ascii_letters_bytesr   r   r   spaces_angle_bracketsr   )r-   r   rK   rB   r   r   r   r   #  s    


z"EncodingParser.handle_possible_tagc                 C   s   | j dS )Nr   r   r7   r   r   r   r   :  s    zEncodingParser.handle_otherc                 C   s  | j }|ttdgB }|du s2t|dks2J |dv r>dS g }g }|dkrV|rVqnX|tv rj| }qnD|dv rd|dfS |tv r||  n|du rdS || t	|}qF|dkr|
  d|dfS t	| | }| }dv rJt	|}||kr"t	| d|d|fS |tv r<||  q|| qnJ|d	krbd|dfS |tv r|||  n|du rdS || t	|}|tv rd|d|fS |tv r||  n|du rdS || qdS )
z{Return a (name, value) pair for the next attribute in the stream.

        If no attribute is found, return None.

           /Nr   )r   N   =)r   r   r   )   '   "r   )rK   r   r   	frozensetr   rS   ascii_uppercase_bytesrI   r   r   r   r   )r-   rK   rB   Zattribute_nameZattribute_valuequoter   r   r   r   =  s`    










zEncodingParser.get_attributeN)r_   r`   ra   rb   r.   r   r   r   r   r   r   r   r   r   r   r   r   r     s   !r   c                   @   s   e Zd Zdd Zdd ZdS )r   c                 C   s   t |tsJ || _d S r   )r   r   rK   r   r   r   r   r.     s    zContentAttributeParser.__init__c                 C   s   z| j d | j  jd7  _| j   | j jdks<W d S | j  jd7  _| j   | j jdv r| j j}| j  jd7  _| j j}| j |r| j || j j W S W d S nP| j j}z"| j t | j || j j W W S  t y   | j |d   Y W S 0 W n ty   Y d S 0 d S )Nr   r   r   )r   r   )rK   r   r@   r   r   r   r   r   )r-   r   Zold_positionr   r   r   r     s.    

zContentAttributeParser.parseN)r_   r`   ra   r.   r   r   r   r   r   r     s   r   c                 C   s\   t | tr.z| d} W n ty,   Y dS 0 | durXzt| W S  tyV   Y dS 0 dS )zReturn the Python codec name corresponding to an encoding.

    Return None if the string doesn't correspond to a valid encoding.

    asciiN)r   r   decodeUnicodeDecodeErrorwebencodingslookupAttributeError)r(   r   r   r   r'     s    
r'   )!r   rT   ior   r   pathlibr   r   r   r   r   	constantsr   r	   r
   r   r   r   r   r   rU   rG   non_bmp_invalid_codepointsascii_punctuation_rerQ   r#   r   r    r   r   r   r   r'   r   r   r   r   <module>   s8    8 Y <&