a
    [Xh                     @   s   d dl mZ d dlmZ d dlmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZ ddlmZ eeeZdd Zd	d
 ZG dd dZdS )    )bisect_left)deque)html5   )	EOFTokenascii_lettersascii_upper_to_lowerdigits	hexdigitsreplacement_charactersspace_characterstag_token_types)HTMLInputStreamc                 C   s4   | t v rdS tt|  }tt kr&dS t| | S )NTF)entitiesr   entity_keyslen
startswithprefixi r   X/var/www/viveiro_mudafortebrasil/venv/lib/python3.9/site-packages/tinyhtml5/tokenizer.pyhas_keys_with_prefix   s
    r   c                 C   sT   | t v r| S tdt| d D ](}| d |  t v r| d |    S qt| d S )Nr   )r   ranger   KeyErrorr   r   r   r   longest_prefix   s    r   c                   @   sl  e Zd ZdZdddZdd Zdd Zd	d
 Zdd ZdddZ	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Zd:d; Zd<d= Z d>d? Z!d@dA Z"dBdC Z#dDdE Z$dFdG Z%dHdI Z&dJdK Z'dLdM Z(dNdO Z)dPdQ Z*dRdS Z+dTdU Z,dVdW Z-dXdY Z.dZd[ Z/d\d] Z0d^d_ Z1d`da Z2dbdc Z3ddde Z4dfdg Z5dhdi Z6djdk Z7dldm Z8dndo Z9dpdq Z:drds Z;dtdu Z<dvdw Z=dxdy Z>dzd{ Z?d|d} Z@d~d ZAdd ZBdd ZCdd ZDdd ZEdd ZFdd ZGdd ZHdd ZIdd ZJdd ZKdd ZLdd ZMdd ZNdS )HTMLTokenizerzHTML tokenizer.Nc                 K   s*   t |fi || _|| _| j| _d | _d S )N)r   streamparser
data_statestatecurrent_token)selfr   r   kwargsr   r   r   __init__)   s    zHTMLTokenizer.__init__c                 c   sN   t g | _|  rJ| jjr4tj| jjddV  q| jr
| j V  q4q
dS )zThis is where the magic happens.

        We do our usually processing through the states and when we have a token
        to return we yield the token which pauses processing until the next token
        is requested.

        r   typedataN)	r   token_queuer!   r   errorsr   PARSE_ERRORpoppopleftr#   r   r   r   __iter__1   s    

zHTMLTokenizer.__iter__c                 K   s(   t j|d}|r||d< | j| dS )z%Add a parse error to the token queue.r&   datavarsN)r   r+   r)   append)r#   _datar0   tokenr   r   r   parse_errorE   s    zHTMLTokenizer.parse_errorc                 C   s   | j tj|d dS )z+Add a characters string to the token queue.r&   N)r)   r1   r   
CHARACTERS)r#   r2   r   r   r   
charactersL   s    zHTMLTokenizer.charactersc                 C   sT  |rt nt}|rdnd}g }| j }||v rD|| | j }q&td||}|tv rtt| }| jd|d nd|  krdksn |dkrd	}| jd|d nd
|  krdksn d|  krdksn d|  krdksn d|  krdksn |t	g dv r(| jd|d t
|}|dkrP| d | j| |S )zReturn either U+FFFD or the character based on the representation.

        It also discards ";" if present. If not present self.parse_error is
        invoked.

           
    z$illegal-codepoint-for-numeric-entity)integeri   i      �r                  i  i  )#   i  i  i i i i i i i i i i i i i i i i i	 i	 i
 i
 i i i i i i i i i i i r;   ;z numeric-entity-without-semicolon)r   r
   r   	characterr1   intjoinr   r4   	frozensetchrunget)r#   Zis_hexallowedradixstackrD   r:   replacementr   r   r   consume_number_entityP   sJ    







z#HTMLTokenizer.consume_number_entityFc                 C   s  d}| j  g}|d tddgtR v p:|d uo:||d k}|rT| j |d  n|d dkrd}|| j   |d dv rd}|| j   |d |rtntv r| j |d  | |}n*| 	d	 | j |
  dd
| }n@|d tur&td
|sq&|| j   qztd
|d d }W n> ty~   | 	d | j |
  dd
| }Y n0 |d dkr| 	d t|}|| tv p|| tv p|| dk}	|d dkr|r|	r| j |
  dd
| }n.| j |
  t|  d
||d   }|rT| jd d d  |7  < n(|tv rbdnd}
| jt|
 |d d S )N&r   <#F)xXTzexpected-numeric-entityr9   zexpected-named-entityrC   znamed-entity-without-semicolon=r(   r   SPACE_CHARACTERSr5   r&   )r   rD   r   r   rI   r1   r   r
   rN   r4   r,   rF   r   r   r   r   r   r   r"   r)   r   )r#   rJ   from_attributeoutputrL   rI   hexentity_nameZentity_lengthZallowed_characterr'   r   r   r   consume_entity   s\    




zHTMLTokenizer.consume_entityc                 C   s   | j |dd dS )z5Replace the need for entity_in_attribute_value_state.T)rJ   rW   N)r[   )r#   rJ   r   r   r   process_entity_in_attribute   s    z)HTMLTokenizer.process_entity_in_attributec                 C   s   | j }|d tv r|d t|d< |d tjkrn|d }t|}t|t|krf||ddd  ||d< |d tj	kr|d r| 
d |d r| 
d | j| | j| _dS )	zThis method is a generic handler for emitting the tags.

        It also sets the state to "data" because that's what's needed after a
        token has been emitted.

        r'   namer(   NrR   zattributes-in-end-tagselfClosingzself-closing-flag-on-end-tag)r"   r   	translater	   r   	START_TAGdictr   updateEND_TAGr4   r)   r1   r    r!   )r#   r3   rawr(   r   r   r   emit_current_token   s     

z HTMLTokenizer.emit_current_tokenc                 C   s   | j  }|dkr| j| _n|dkr.| j| _np|dkrL| d | d nR|tu rXdS |tv r| j	
tj|| j td d n| j d}| ||  dS )	NrO   rP    invalid-codepointFTr&   rO   rP   rf   )r   rD   entity_data_stater!   tag_open_stater4   r6   r   r   r)   r1   r   rV   chars_untilr#   r(   r6   r   r   r   r       s$    




zHTMLTokenizer.data_statec                 C   s   |    | j| _dS NT)r[   r    r!   r.   r   r   r   ri     s    zHTMLTokenizer.entity_data_statec                 C   s   | j  }|dkr| j| _n|dkr.| j| _np|tu r:dS |dkrX| d | d nF|tv r| j	
tj|| j td d n| j d	}| ||  dS )
NrO   rP   Frf   rg   r<   Tr&   rh   )r   rD   $character_reference_in_rc_data_stater!   rcdata_less_than_sign_stater   r4   r6   r   r)   r1   r   rV   rk   )r#   r(   charsr   r   r   rcdata_state  s$    




zHTMLTokenizer.rcdata_statec                 C   s   |    | j| _dS rm   )r[   rq   r!   r.   r   r   r   rn   (  s    z2HTMLTokenizer.character_reference_in_rc_data_statec                 C   sd   | j  }|dkr| j| _nD|dkr:| d | d n&|tu rFdS | j d}| ||  dS NrP   rf   rg   r<   F)rP   rf   T)r   rD   rawtext_less_than_sign_stater!   r4   r6   r   rk   rl   r   r   r   rawtext_state-  s    


zHTMLTokenizer.rawtext_statec                 C   sd   | j  }|dkr| j| _nD|dkr:| d | d n&|tu rFdS | j d}| ||  dS rr   )r   rD    script_data_less_than_sign_stater!   r4   r6   r   rk   rl   r   r   r   script_data_state;  s    


zHTMLTokenizer.script_data_statec                 C   sN   | j  }|tu rdS |dkr4| d | d n| || j d  dS )NFrf   rg   r<   T)r   rD   r   r4   r6   rk   r#   r(   r   r   r   plaintext_stateI  s    

zHTMLTokenizer.plaintext_statec                 C   s   | j  }|dkr| j| _n|dkr.| j| _n|tv rTtj|g ddd| _| j	| _nv|dkrz| 
d | d | j| _nP|dkr| 
d	 | j | | j| _n(| 
d
 | d | j | | j| _dS )N!/F)r'   r]   r(   r^   selfClosingAcknowledged>z'expected-tag-name-but-got-right-bracketz<>?z'expected-tag-name-but-got-question-markzexpected-tag-namerP   T)r   rD   markup_declaration_open_stater!   close_tag_open_stater   r   r`   r"   tag_name_stater4   r6   r    rI   bogus_comment_staterw   r   r   r   rj   T  s4    










zHTMLTokenizer.tag_open_statec                 C   s   | j  }|tv r.tj|g dd| _| j| _nd|dkrJ| d | j	| _nH|t
u rp| d | d | j	| _n"| jd|d | j | | j| _d	S )
NFr'   r]   r(   r^   r|   z*expected-closing-tag-but-got-right-bracketz expected-closing-tag-but-got-eof</z!expected-closing-tag-but-got-charr(   T)r   rD   r   r   rc   r"   r   r!   r4   r    r   r6   rI   r   rw   r   r   r   r   w  s&    






z"HTMLTokenizer.close_tag_open_statec                 C   s   | j  }|tv r| j| _nx|dkr.|   nf|tu rJ| d | j| _nJ|dkr\| j	| _n8|dkr| d | j
d  d7  < n| j
d  |7  < dS )	Nr|   zeof-in-tag-namerz   rf   rg   r]   r<   T)r   rD   r   before_attribute_name_stater!   re   r   r4   r    self_closing_start_tag_stater"   rw   r   r   r   r     s    






zHTMLTokenizer.tag_name_statec                 C   sD   | j  }|dkr"d| _| j| _n| d | j | | j| _dS Nrz   r9   rP   T)r   rD   temporary_bufferrcdata_end_tag_open_stater!   r6   rI   rq   rw   r   r   r   ro     s    


z)HTMLTokenizer.rcdata_less_than_sign_statec                 C   sL   | j  }|tv r*|  j|7  _| j| _n| d | j | | j| _dS Nr   T)	r   rD   r   r   rcdata_end_tag_name_stater!   r6   rI   rq   rw   r   r   r   r     s    


z'HTMLTokenizer.rcdata_end_tag_open_statec                 C   s   | j o| j d  | j k}| j }|tv rR|rRtj| jg dd| _ | j| _	n|dkr||r|tj| jg dd| _ | j
| _	np|dkr|rtj| jg dd| _ |   | j| _	n>|tv r|  j|7  _n&| d| j  | j| | j| _	dS Nr]   Fr   rz   r|   r   T)r"   lowerr   r   rD   r   r   rc   r   r!   r   re   r    r   r6   rI   rq   r#   appropriater(   r   r   r   r     s@    



z'HTMLTokenizer.rcdata_end_tag_name_statec                 C   sD   | j  }|dkr"d| _| j| _n| d | j | | j| _dS r   )r   rD   r   rawtext_end_tag_open_stater!   r6   rI   rt   rw   r   r   r   rs     s    


z*HTMLTokenizer.rawtext_less_than_sign_statec                 C   sL   | j  }|tv r*|  j|7  _| j| _n| d | j | | j| _dS r   )	r   rD   r   r   rawtext_end_tag_name_stater!   r6   rI   rt   rw   r   r   r   r     s    


z(HTMLTokenizer.rawtext_end_tag_open_statec                 C   s   | j o| j d  | j k}| j }|tv rR|rRtj| jg dd| _ | j| _	n|dkr||r|tj| jg dd| _ | j
| _	np|dkr|rtj| jg dd| _ |   | j| _	n>|tv r|  j|7  _n&| d| j  | j| | j| _	dS r   )r"   r   r   r   rD   r   r   rc   r   r!   r   re   r    r   r6   rI   rt   r   r   r   r   r     s@    



z(HTMLTokenizer.rawtext_end_tag_name_statec                 C   s`   | j  }|dkr"d| _| j| _n:|dkr>| d | j| _n| d | j | | j| _dS )Nrz   r9   ry   z<!rP   T)	r   rD   r   script_data_end_tag_open_stater!   r6   script_data_escape_start_staterI   rv   rw   r   r   r   ru     s    




z.HTMLTokenizer.script_data_less_than_sign_statec                 C   sL   | j  }|tv r*|  j|7  _| j| _n| d | j | | j| _dS r   )	r   rD   r   r   script_data_end_tag_name_stater!   r6   rI   rv   rw   r   r   r   r   )  s    


z,HTMLTokenizer.script_data_end_tag_open_statec                 C   s   | j o| j d  | j k}| j }|tv rR|rRtj| jg dd| _ | j| _	n|dkr||r|tj| jg dd| _ | j
| _	np|dkr|rtj| jg dd| _ |   | j| _	n>|tv r|  j|7  _n&| d| j  | j| | j| _	dS r   )r"   r   r   r   rD   r   r   rc   r   r!   r   re   r    r   r6   rI   rv   r   r   r   r   r   4  s@    



z,HTMLTokenizer.script_data_end_tag_name_statec                 C   s>   | j  }|dkr&| d | j| _n| j | | j| _dS N-T)r   rD   r6   #script_data_escape_start_dash_stater!   rI   rv   rw   r   r   r   r   Z  s    


z,HTMLTokenizer.script_data_escape_start_statec                 C   s>   | j  }|dkr&| d | j| _n| j | | j| _dS r   )r   rD   r6   #script_data_escaped_dash_dash_stater!   rI   rv   rw   r   r   r   r   d  s    


z1HTMLTokenizer.script_data_escape_start_dash_statec                 C   s   | j  }|dkr&| d | j| _nX|dkr8| j| _nF|dkrV| d | d n(|tu rh| j| _n| || j 	d  dS )Nr   rP   rf   rg   r<   )rP   r   rf   T)
r   rD   r6   script_data_escaped_dash_stater!   (script_data_escaped_less_than_sign_stater4   r   r    rk   rw   r   r   r   script_data_escaped_staten  s    





z'HTMLTokenizer.script_data_escaped_statec                 C   s   | j  }|dkr&| d | j| _n\|dkr8| j| _nJ|dkr^| d | d | j| _n$|tu rp| j	| _n| | | j| _dS )Nr   rP   rf   rg   r<   T)
r   rD   r6   r   r!   r   r4   r   r   r    rw   r   r   r   r   ~  s    








z,HTMLTokenizer.script_data_escaped_dash_statec                 C   s   | j  }|dkr| d nx|dkr0| j| _nf|dkrL| d | j| _nJ|dkrr| d | d | j| _n$|tu r| j	| _n| | | j| _dS )Nr   rP   r|   rf   rg   r<   T)
r   rD   r6   r   r!   rv   r4   r   r   r    rw   r   r   r   r     s"    








z1HTMLTokenizer.script_data_escaped_dash_dash_statec                 C   sl   | j  }|dkr"d| _| j| _nF|tv rJ| d|  || _| j| _n| d | j | | j	| _dS r   )
r   rD   r   &script_data_escaped_end_tag_open_stater!   r   r6   %script_data_double_escape_start_staterI   r   rw   r   r   r   r     s    



z6HTMLTokenizer.script_data_escaped_less_than_sign_statec                 C   sD   | j  }|tv r"|| _| j| _n| d | j | | j| _dS r   )	r   rD   r   r   &script_data_escaped_end_tag_name_stater!   r6   rI   r   rw   r   r   r   r     s    


z4HTMLTokenizer.script_data_escaped_end_tag_open_statec                 C   s   | j o| j d  | j k}| j }|tv rR|rRtj| jg dd| _ | j| _	n|dkr||r|tj| jg dd| _ | j
| _	np|dkr|rtj| jg dd| _ |   | j| _	n>|tv r|  j|7  _n&| d| j  | j| | j| _	dS r   )r"   r   r   r   rD   r   r   rc   r   r!   r   re   r    r   r6   rI   r   r   r   r   r   r     s@    



z4HTMLTokenizer.script_data_escaped_end_tag_name_statec                 C   s   | j  }|ttdB v rF| | | j dkr<| j| _q|| j	| _n6|t
v rh| | |  j|7  _n| j | | j	| _dS N)rz   r|   scriptT)r   rD   r   rG   r6   r   r    script_data_double_escaped_stater!   r   r   rI   rw   r   r   r   r     s    




z3HTMLTokenizer.script_data_double_escape_start_statec                 C   s   | j  }|dkr&| d | j| _n`|dkrB| d | j| _nD|dkr`| d | d n&|tu r|| d | j| _n
| | dS Nr   rP   rf   rg   r<   eof-in-script-in-scriptT)	r   rD   r6   %script_data_double_escaped_dash_stater!   /script_data_double_escaped_less_than_sign_stater4   r   r    rw   r   r   r   r     s    








z.HTMLTokenizer.script_data_double_escaped_statec                 C   s   | j  }|dkr&| d | j| _np|dkrB| d | j| _nT|dkrh| d | d | j| _n.|tu r| d | j	| _n| | | j| _dS r   )
r   rD   r6   *script_data_double_escaped_dash_dash_stater!   r   r4   r   r   r    rw   r   r   r   r     s"    










z3HTMLTokenizer.script_data_double_escaped_dash_statec                 C   s   | j  }|dkr| d n|dkr:| d | j| _np|dkrV| d | j| _nT|dkr|| d | d | j| _n.|tu r| d | j	| _n| | | j| _dS )	Nr   rP   r|   rf   rg   r<   r   T)
r   rD   r6   r   r!   rv   r4   r   r   r    rw   r   r   r   r     s&    










z8HTMLTokenizer.script_data_double_escaped_dash_dash_statec                 C   sD   | j  }|dkr,| d d| _| j| _n| j | | j| _dS )Nrz   r9   T)r   rD   r6   r   #script_data_double_escape_end_stater!   rI   r   rw   r   r   r   r   0  s    


z=HTMLTokenizer.script_data_double_escaped_less_than_sign_statec                 C   s   | j  }|ttdB v rF| | | j dkr<| j| _q|| j	| _n6|t
v rh| | |  j|7  _n| j | | j	| _dS r   )r   rD   r   rG   r6   r   r   r   r!   r   r   rI   rw   r   r   r   r   ;  s    




z1HTMLTokenizer.script_data_double_escape_end_statec                 C   s  | j  }|tv r"| j td n|tv rH| jd |dg | j| _n|dkrZ| 	  n|dkrl| j
| _n|dv r| d | jd |dg | j| _nh|dkr| d	 | jd d
dg | j| _n8|tu r| d | j| _n| jd |dg | j| _dS )NTr(   r9   r|   rz   )'"rU   rP   #invalid-character-in-attribute-namerf   rg   r<   z#expected-attribute-name-but-got-eof)r   rD   r   rk   r   r"   r1   attribute_name_stater!   re   r   r4   r   r    rw   r   r   r   r   K  s0    









z)HTMLTokenizer.before_attribute_name_statec                 C   s  | j  }d}d}|dkr&| j| _n|tv r\| jd d d  || j td 7  < d}n|dkrjd}n|tv r|| j| _n|dkr| j	| _n|d	kr| 
d
 | jd d d  d7  < d}nn|dv r| 
d | jd d d  |7  < d}n<|tu r| 
d | j| _n| jd d d  |7  < d}|r| jd d d t| jd d d< | jd d d D ]2\}}| jd d d |krl| 
d  qql|r|   dS )NTFrU   r(   rR   r   r|   rz   rf   rg   r<   r   r   rP   r   zeof-in-attribute-namezduplicate-attribute)r   rD   before_attribute_value_stater!   r   r"   rk   r   after_attribute_name_stater   r4   r   r    r_   r	   re   )r#   r(   Zleaving_this_stateZ
emit_tokenr]   _r   r   r   r   f  sN    









z"HTMLTokenizer.attribute_name_statec                 C   s  | j  }|tv r"| j td n|dkr4| j| _n|dkrF|   n|tv rl| jd 	|dg | j
| _n|dkr~| j| _n|dkr| d | jd 	d	dg | j
| _nh|d
v r| d | jd 	|dg | j
| _n8|tu r| d | j| _n| jd 	|dg | j
| _dS )NTrU   r|   r(   r9   rz   rf   rg   r<   r   z&invalid-character-after-attribute-namezexpected-end-of-tag-but-got-eof)r   rD   r   rk   r   r!   re   r   r"   r1   r   r   r4   r   r    rw   r   r   r   r     s4    










z(HTMLTokenizer.after_attribute_name_statec                 C   s2  | j  }|tv r$| j td n
|dkr6| j| _n|dkrT| j| _| j | n|dkrf| j| _n|dkr| 	d | 
  n|dkr| 	d | jd	 d
 d  d7  < | j| _nv|dv r| 	d | jd	 d
 d  |7  < | j| _n@|tu r| 	d | j| _n"| jd	 d
 d  |7  < | j| _dS )NTr   rO   r   r|   z.expected-attribute-value-but-got-right-bracketrf   rg   r(   rR   r   r<   )rU   rP   `z"equals-in-unquoted-attribute-valuez$expected-attribute-value-but-got-eof)r   rD   r   rk   #attribute_value_double_quoted_stater!   attribute_value_unquoted_staterI   #attribute_value_single_quoted_stater4   re   r"   r   r    rw   r   r   r   r     s6    











z*HTMLTokenizer.before_attribute_value_statec                 C   s   | j  }|dkr| j| _n|dkr0| d np|dkr^| d | jd d d  d7  < nB|tu rz| d	 | j| _n&| jd d d  || j 	d
 7  < dS )Nr   rO   rf   rg   r(   rR   r   r<   z#eof-in-attribute-value-double-quote)r   rO   rf   T
r   rD   after_attribute_value_stater!   r\   r4   r"   r   r    rk   rw   r   r   r   r     s    




z1HTMLTokenizer.attribute_value_double_quoted_statec                 C   s   | j  }|dkr| j| _n|dkr0| d np|dkr^| d | jd d d  d7  < nB|tu rz| d	 | j| _n&| jd d d  || j 	d
 7  < dS )Nr   rO   rf   rg   r(   rR   r   r<   z#eof-in-attribute-value-single-quote)r   rO   rf   Tr   rw   r   r   r   r     s    




z1HTMLTokenizer.attribute_value_single_quoted_statec                 C   s   | j  }|tv r| j| _n|dkr0| d n|dkrB|   n|dv rp| d | jd d d  |7  < nx|dkr| d	 | jd d d  d
7  < nJ|t	u r| d | j
| _n.| jd d d  || j tdtB  7  < dS )NrO   r|   )r   r   rU   rP   r   z0unexpected-character-in-unquoted-attribute-valuer(   rR   r   rf   rg   r<   z eof-in-attribute-value-no-quotes)rO   r|   r   r   rU   rP   r   rf   T)r   rD   r   r   r!   r\   re   r4   r"   r   r    rk   rG   rw   r   r   r   r     s0    






z,HTMLTokenizer.attribute_value_unquoted_statec                 C   s   | j  }|tv r| j| _nj|dkr.|   nX|dkr@| j| _nF|tu rh| d | j 	| | j
| _n| d | j 	| | j| _dS )Nr|   rz   z$unexpected-eof-after-attribute-valuez*unexpected-character-after-attribute-valueT)r   rD   r   r   r!   re   r   r   r4   rI   r    rw   r   r   r   r     s    






z)HTMLTokenizer.after_attribute_value_statec                 C   sp   | j  }|dkr&d| jd< |   nF|tu rN| d | j | | j| _n| d | j | | j	| _dS )Nr|   Tr^   z#unexpected-eof-after-solidus-in-tagz)unexpected-character-after-solidus-in-tag)
r   rD   r"   re   r   r4   rI   r    r!   r   rw   r   r   r   r     s    





z*HTMLTokenizer.self_closing_start_tag_statec                 C   sB   | j d}|dd}| jtj|d | j   | j| _	dS )Nr|   rf   r<   r&   T)
r   rk   replacer)   r1   r   COMMENTrD   r    r!   rw   r   r   r   r   -  s    
z!HTMLTokenizer.bogus_comment_statec                 C   s  | j  g}|d dkrR|| j   |d dkrNtjdd| _| j| _dS n |d r|d dv rd}dD ]0}|| j   |d r|d |vrnd} qqn|rtjdd d dd	| _| j	| _dS n|d d
krR| j
d urR| j
jjrR| j
jjd j| j
jjkrRd}dD ].}|| j   |d |krd} q@q|rR| j| _dS | d |rv| j |  q\| j| _dS )NrR   r   r9   r&   TZdD)ZoOcCtTyYpPeEF)r'   r]   publicIdsystemIdcorrect[zCDATA[zexpected-dashes-or-doctype)r   rD   r1   r   r   r"   comment_start_stater!   ZDOCTYPEdoctype_stater   treeZopen_elements	namespacedefault_namespacecdata_section_stater4   rI   r,   r   )r#   rL   matchedexpectedr   r   r   r~   ;  s\    

z+HTMLTokenizer.markup_declaration_open_statec                 C   s   | j  }|dkr| j| _n|dkrB| d | jd  d7  < nn|dkrl| d | j| j | j| _nD|t	u r| d | j| j | j| _n| jd  |7  < | j
| _d	S )
Nr   rf   rg   r(   r<   r|   incorrect-commenteof-in-commentT)r   rD   comment_start_dash_stater!   r4   r"   r)   r1   r    r   comment_staterw   r   r   r   r   j  s"    






z!HTMLTokenizer.comment_start_statec                 C   s   | j  }|dkr| j| _n|dkrB| d | jd  d7  < nt|dkrl| d | j| j | j| _nJ|t	u r| d | j| j | j| _n | jd  d| 7  < | j
| _d	S )
Nr   rf   rg   r(      -�r|   r   r   T)r   rD   comment_end_stater!   r4   r"   r)   r1   r    r   r   rw   r   r   r   r   ~  s"    






z&HTMLTokenizer.comment_start_dash_statec                 C   s   | j  }|dkr| j| _nn|dkrB| d | jd  d7  < nH|tu rl| d | j| j | j	| _n| jd  || j 
d 7  < dS )	Nr   rf   rg   r(   r<   r   )r   rf   T)r   rD   comment_end_dash_stater!   r4   r"   r   r)   r1   r    rk   rw   r   r   r   r     s    





zHTMLTokenizer.comment_statec                 C   s   | j  }|dkr| j| _nx|dkrJ| d | jd  d7  < | j| _nJ|tu rt| d | j	| j | j
| _n | jd  d| 7  < | j| _dS )Nr   rf   rg   r(   r   zeof-in-comment-end-dashT)r   rD   r   r!   r4   r"   r   r   r)   r1   r    rw   r   r   r   r     s    





z$HTMLTokenizer.comment_end_dash_statec                 C   s   | j  }|dkr*| j| j | j| _n|dkrX| d | jd  d7  < | j| _n|dkrt| d | j	| _nz|dkr| d	 | jd  |7  < nT|t
u r| d
 | j| j | j| _n*| d | jd  d| 7  < | j| _dS )Nr|   rf   rg   r(   u   --�ry   z,unexpected-bang-after-double-dash-in-commentr   z,unexpected-dash-after-double-dash-in-commentzeof-in-comment-double-dashzunexpected-char-in-commentz--T)r   rD   r)   r1   r"   r    r!   r4   r   comment_end_bang_stater   rw   r   r   r   r     s,    









zHTMLTokenizer.comment_end_statec                 C   s   | j  }|dkr*| j| j | j| _n|dkrN| jd  d7  < | j| _nx|dkr|| d | jd  d7  < | j	| _nJ|t
u r| d | j| j | j| _n | jd  d| 7  < | j	| _d	S )
Nr|   r   r(   z--!rf   rg   u   --!�zeof-in-comment-end-bang-stateT)r   rD   r)   r1   r"   r    r!   r   r4   r   r   rw   r   r   r   r     s$    






z$HTMLTokenizer.comment_end_bang_statec                 C   sr   | j  }|tv r| j| _nR|tu rP| d d| jd< | j	| j | j
| _n| d | j | | j| _dS )N!expected-doctype-name-but-got-eofFr   zneed-space-after-doctypeT)r   rD   r   before_doctype_name_stater!   r   r4   r"   r)   r1   r    rI   rw   r   r   r   r     s    





zHTMLTokenizer.doctype_statec                 C   s   | j  }|tv rn|dkrH| d d| jd< | j| j | j| _nl|dkrn| d d| jd< | j	| _nF|t
u r| d	 d| jd< | j| j | j| _n|| jd< | j	| _d
S )Nr|   z+expected-doctype-name-but-got-right-bracketFr   rf   rg   r<   r]   r   T)r   rD   r   r4   r"   r)   r1   r    r!   doctype_name_stater   rw   r   r   r   r     s(    










z'HTMLTokenizer.before_doctype_name_statec                 C   s   | j  }|tv r2| jd t| jd< | j| _n|dkrh| jd t| jd< | j	| j | j
| _n|dkr| d | jd  d7  < | j| _n\|tu r| d d| jd< | jd t| jd< | j	| j | j
| _n| jd  |7  < d	S )
Nr]   r|   rf   rg   r<   zeof-in-doctype-nameFr   T)r   rD   r   r"   r_   r	   after_doctype_name_stater!   r)   r1   r    r4   r   r   rw   r   r   r   r     s.    







z HTMLTokenizer.doctype_name_statec                 C   s@  | j  }|tv rn&|dkr8| j| j | j| _n|tu rxd| jd< | j 	| | 
d | j| j | j| _n|r|dv rd}dD ]"}| j  }|r||vrd} qq|r| j| _dS nN|r|dv rd}d	D ]"}| j  }|r||vrd} qq|r| j| _dS | j 	| | j
d
|d d| jd< | j| _dS )Nr|   Fr   eof-in-doctyper   T)ZuUZbBZlLZiIr   sS)r   r   r   r   ZmMz*expected-space-or-right-bracket-in-doctyper   )r   rD   r   r)   r1   r"   r    r!   r   rI   r4   "after_doctype_public_keyword_state"after_doctype_system_keyword_statebogus_doctype_state)r#   r(   r   r   r   r   r   r   "  sJ    






z&HTMLTokenizer.after_doctype_name_statec                 C   s   | j  }|tv r| j| _np|dv rD| d | j | | j| _nH|tu rx| d d| jd< | j	
| j | j| _n| j | | j| _dS N)r   r   unexpected-char-in-doctyper   Fr   T)r   rD   r   &before_doctype_public_identifier_stater!   r4   rI   r   r"   r)   r1   r    rw   r   r   r   r   O  s    






z0HTMLTokenizer.after_doctype_public_keyword_statec                 C   s   | j  }|tv rn|dkr0d| jd< | j| _n|dkrLd| jd< | j| _n|dkr| d d| jd< | j	| j | j
| _nP|tu r| d	 d| jd< | j	| j | j
| _n| d
 d| jd< | j| _dS )Nr   r9   r   r   r|   unexpected-end-of-doctypeFr   r   r   T)r   rD   r   r"   -doctype_public_identifier_double_quoted_stater!   -doctype_public_identifier_single_quoted_stater4   r)   r1   r    r   r   rw   r   r   r   r   a  s.    












z4HTMLTokenizer.before_doctype_public_identifier_statec                 C   s   | j  }|dkr| j| _n|dkrB| d | jd  d7  < nz|dkrv| d d| jd	< | j| j | j| _nF|t	u r| d
 d| jd	< | j| j | j| _n| jd  |7  < dS )Nr   rf   rg   r   r<   r|   r   Fr   r   T
r   rD   %after_doctype_public_identifier_stater!   r4   r"   r)   r1   r    r   rw   r   r   r   r   {  s$    








z;HTMLTokenizer.doctype_public_identifier_double_quoted_statec                 C   s   | j  }|dkr| j| _n|dkrB| d | jd  d7  < nz|dkrv| d d| jd	< | j| j | j| _nF|t	u r| d
 d| jd	< | j| j | j| _n| jd  |7  < dS )Nr   rf   rg   r   r<   r|   r   Fr   r   Tr   rw   r   r   r   r     s$    








z;HTMLTokenizer.doctype_public_identifier_single_quoted_statec                 C   s   | j  }|tv r| j| _n|dkr<| j| j | j| _n|dkrb| 	d d| jd< | j
| _nv|dkr| 	d d| jd< | j| _nP|tu r| 	d d| jd	< | j| j | j| _n| 	d d| jd	< | j| _d
S )Nr|   r   r   r9   r   r   r   Fr   T)r   rD   r   3between_doctype_public_and_system_identifiers_stater!   r)   r1   r"   r    r4   -doctype_system_identifier_double_quoted_state-doctype_system_identifier_single_quoted_stater   r   rw   r   r   r   r     s.    













z3HTMLTokenizer.after_doctype_public_identifier_statec                 C   s   | j  }|tv rn|dkr4| j| j | j| _n|dkrPd| jd< | j| _nl|dkrld| jd< | j	| _nP|t
u r| d d| jd< | j| j | j| _n| d	 d| jd< | j| _d
S )Nr|   r   r9   r   r   r   Fr   r   T)r   rD   r   r)   r1   r"   r    r!   r   r   r   r4   r   rw   r   r   r   r     s*    










zAHTMLTokenizer.between_doctype_public_and_system_identifiers_statec                 C   s   | j  }|tv r| j| _np|dv rD| d | j | | j| _nH|tu rx| d d| jd< | j	
| j | j| _n| j | | j| _dS r   )r   rD   r   &before_doctype_system_identifier_stater!   r4   rI   r   r"   r)   r1   r    rw   r   r   r   r     s    






z0HTMLTokenizer.after_doctype_system_keyword_statec                 C   s   | j  }|tv rn|dkr0d| jd< | j| _n|dkrLd| jd< | j| _n|dkr| d d| jd< | j	| j | j
| _nP|tu r| d	 d| jd< | j	| j | j
| _n| d d| jd< | j| _d
S )Nr   r9   r   r   r|   r   Fr   r   T)r   rD   r   r"   r   r!   r   r4   r)   r1   r    r   r   rw   r   r   r   r     s.    












z4HTMLTokenizer.before_doctype_system_identifier_statec                 C   s   | j  }|dkr| j| _n|dkrB| d | jd  d7  < nz|dkrv| d d| jd	< | j| j | j| _nF|t	u r| d
 d| jd	< | j| j | j| _n| jd  |7  < dS )Nr   rf   rg   r   r<   r|   r   Fr   r   T
r   rD   %after_doctype_system_identifier_stater!   r4   r"   r)   r1   r    r   rw   r   r   r   r     s$    








z;HTMLTokenizer.doctype_system_identifier_double_quoted_statec                 C   s   | j  }|dkr| j| _n|dkrB| d | jd  d7  < nz|dkrv| d d| jd	< | j| j | j| _nF|t	u r| d
 d| jd	< | j| j | j| _n| jd  |7  < dS )Nr   rf   rg   r   r<   r|   r   Fr   r   Tr   rw   r   r   r   r     s$    








z;HTMLTokenizer.doctype_system_identifier_single_quoted_statec                 C   s~   | j  }|tv rnf|dkr4| j| j | j| _nF|tu rh| 	d d| jd< | j| j | j| _n| 	d | j
| _dS )Nr|   r   Fr   r   T)r   rD   r   r)   r1   r"   r    r!   r   r4   r   rw   r   r   r   r   -  s    





z3HTMLTokenizer.after_doctype_system_identifier_statec                 C   sZ   | j  }|dkr*| j| j | j| _n,|tu rV| j | | j| j | j| _n dS )Nr|   T)	r   rD   r)   r1   r"   r    r!   r   rI   rw   r   r   r   r   >  s    


z!HTMLTokenizer.bogus_doctype_statec                 C   s   g }| | jd | | jd | j }|tu r>qq|dksJJ |d dd  dkrv|d d d |d< qq| | qd|}|d }dkrt|D ]}| d	 q|	dd
}|r| 
| | j| _dS )N]r|   rR   z]]r9   rf   r   rg   r<   T)r1   r   rk   rD   r   rF   countr   r4   r   r6   r    r!   )r#   r(   charZ
null_countr   r   r   r   r   L  s(    


z!HTMLTokenizer.cdata_section_state)N)NF)O__name__
__module____qualname____doc__r%   r/   r4   r6   rN   r[   r\   re   r    ri   rq   rn   rt   rv   rx   rj   r   r   ro   r   r   rs   r   r   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   &   s   
5
G#&&&

&1/-r   N)bisectr   collectionsr   html.entitiesr   r   	constantsr   r   r   r	   r
   r   r   r   r   Zinputstreamr   tuplesortedr   r   r   r   r   r   r   r   <module>   s   ,	