
R\w              1   @   s  d  d l  m Z m Z m Z d  d l m Z d  d l Z d  d l Z d d l m	 Z	 m
 Z
 m Z m Z d d l m Z m Z d d l m Z d  d l m Z y d  d	 l m Z Wn e k
 r e Z Yn Xy d  d
 l m Z Wn( e k
 r	Gd d   d e  Z Yn Xe d d   e
 D  Z e d d   e D  Z e d d   e D  Z e e d d g  BZ e j d  Z e d d d d d d d d d d d d d  d! d" d# d$ d% d& d' d( d) d* d+ d, d- d. d/ d0 d1 d2 d3 g   Z e j d4  Z  i  Z! Gd5 d6   d6  Z" d d7 d7 d8 d9  Z# Gd: d;   d;  Z$ Gd< d=   d= e$  Z% Gd> d?   d? e&  Z' Gd@ dA   dA e  Z( GdB dC   dC e  Z) dD dE   Z* d S)F    )absolute_importdivisionunicode_literals)	text_typeN   )EOFspaceCharactersasciiLettersasciiUppercase)	encodingsReparseException)utils)StringIO)BytesIO)BufferedIOBasec               @   s   e  Z d  Z d S)r   N)__name__
__module____qualname__ r   r   C/var/www/dbchiro/venv/build/pip/pip/_vendor/html5lib/inputstream.pyr      s   r   c             C   s   g  |  ] } | j  d    q S)ascii)encode).0itemr   r   r   
<listcomp>   s   	 r   c             C   s   g  |  ] } | j  d    q S)r   )r   )r   r   r   r   r   r      s   	 c             C   s   g  |  ] } | j  d    q S)r   )r   )r   r   r   r   r   r      s   	    >   <u   [----﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿]i i i i i i i i i i i i i i i i i	 i	 i
 i
 i i i i i i i i i i i i z[	- -/:-@[-`{-~]c               @   sj   e  Z d  Z d Z d d   Z d d   Z d d   Z d d	   Z d
 d   Z d d   Z	 d d   Z
 d S)BufferedStreamzBuffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    c             C   s%   | |  _  g  |  _ d d g |  _ d  S)Nr   r   )streambufferposition)selfr   r   r   r   __init__5   s    		zBufferedStream.__init__c             C   sP   d } x2 |  j  d  |  j d  D] } | t |  7} q! W| |  j d 7} | S)Nr   r   )r    r!   len)r"   poschunkr   r   r   tell:   s
    !zBufferedStream.tellc             C   sk   | |  j    k  s t  | } d } x1 t |  j |  | k  rW | | 8} | d 7} q' W| | g |  _ d  S)Nr   r   )_bufferedBytesAssertionErrorr$   r    r!   )r"   r%   offsetir   r   r   seekA   s    
zBufferedStream.seekc             C   sp   |  j  s |  j |  S|  j d t |  j   k r_ |  j d t |  j  d  k r_ |  j |  S|  j |  Sd  S)Nr   r   r   )r    _readStreamr!   r$   _readFromBuffer)r"   bytesr   r   r   readJ   s    	 zBufferedStream.readc             C   s   t  d d   |  j D  S)Nc             S   s   g  |  ] } t  |   q Sr   )r$   )r   r   r   r   r   r   T   s   	 z1BufferedStream._bufferedBytes.<locals>.<listcomp>)sumr    )r"   r   r   r   r(   S   s    zBufferedStream._bufferedBytesc             C   sJ   |  j  j |  } |  j j |  |  j d d 7<t |  |  j d <| S)Nr   r   )r   r0   r    appendr!   r$   )r"   r/   datar   r   r   r-   V   s
    zBufferedStream._readStreamc             C   s%  | } g  } |  j  d } |  j  d } x | t |  j  k  r | d k r | d k s\ t  |  j | } | t |  | k r | } | | | g |  _  n/ t |  | } | t |  g |  _  | d 7} | j | | | |   | | 8} d } q) W| r| j |  j |   n  d j |  S)Nr   r    )r!   r$   r    r)   r2   r-   join)r"   r/   remainingBytesrvbufferIndexbufferOffsetbufferedDatabytesToReadr   r   r   r.   ]   s&    $


zBufferedStream._readFromBufferN)r   r   r   __doc__r#   r'   r,   r0   r(   r-   r.   r   r   r   r   r   .   s   		r   Tc             C   s{   t  |  d  r* t |  j d  t  } n t |  t  } | rd | d  k	 rZ t d   n  t |   St |  | | |  Sd  S)Nr0   r   z7Cannot explicitly set an encoding with a unicode string)hasattr
isinstancer0   r   	TypeErrorHTMLUnicodeInputStreamHTMLBinaryInputStream)sourceencoding	parseMetachardet	isUnicoder   r   r   HTMLInputStreamx   s    
rG   c               @   s   e  Z d  Z d Z d Z d d   Z d d   Z d d   Z d	 d
   Z d d   Z	 d d   Z
 d d d  Z d d   Z d d   Z d d d  Z d d   Z d S)r@   zProvides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    i (  c             C   s   t  d  d k r3 |  j |  _ t j d  |  _ n |  j |  _ t j d  |  _ d g |  _ d	 |  _ |  j	 |  |  _
 |  j   d S)
a  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        u   􏿿r   u	   [-]u0   ([-](?![-])|(?<![-])[-])r   utf-8certainN)rH   zcertain)r$   characterErrorsUCS4reportCharacterErrorsrecompilereplaceCharactersRegexpcharacterErrorsUCS2newLinescharEncoding
openStream
dataStreamreset)r"   rB   r   r   r   r#      s    	zHTMLUnicodeInputStream.__init__c             C   sC   d |  _  d |  _ d |  _ g  |  _ d |  _ d |  _ d  |  _ d  S)Nr4   r   )r&   	chunkSizechunkOffseterrorsprevNumLinesprevNumCols_bufferedCharacter)r"   r   r   r   rT      s    						zHTMLUnicodeInputStream.resetc             C   s(   t  | d  r | } n t |  } | S)zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r0   )r=   r   )r"   rB   r   r   r   r   rR      s    	z!HTMLUnicodeInputStream.openStreamc             C   st   |  j  } | j d d |  } |  j | } | j d d |  } | d k r\ |  j | } n | | d } | | f S)N
r   r   r   )r&   countrX   rfindrY   )r"   r*   r&   nLinespositionLinelastLinePospositionColumnr   r   r   	_position   s    	z HTMLUnicodeInputStream._positionc             C   s&   |  j  |  j  \ } } | d | f S)z:Returns (line, col) of the current position in the stream.r   )rb   rV   )r"   linecolr   r   r   r!      s    zHTMLUnicodeInputStream.positionc             C   sL   |  j  |  j k r% |  j   s% t Sn  |  j  } |  j | } | d |  _  | S)zo Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        r   )rV   rU   	readChunkr   r&   )r"   rV   charr   r   r   rf      s    	zHTMLUnicodeInputStream.charNc             C   s^  | d  k r |  j  } n  |  j |  j  \ |  _ |  _ d |  _ d |  _ d |  _ |  j j |  } |  j	 r |  j	 | } d  |  _	 n
 | s d St
 |  d k r t | d  } | d k s d | k o d k n r | d |  _	 | d  d  } q n  |  j |  |  j j d |  } | j d	 d
  } | j d d
  } | |  _ t
 |  |  _ d S)Nr4   r   Fr      i   i  u   �z
r[   Tr   r   r   )_defaultChunkSizerb   rU   rX   rY   r&   rV   rS   r0   rZ   r$   ordrK   rN   subreplace)r"   rU   r3   lastvr   r   r   re      s0    				(	z HTMLUnicodeInputStream.readChunkc             C   s:   x3 t  t t j |    D] } |  j j d  q Wd  S)Nzinvalid-codepoint)ranger$   invalid_unicode_refindallrW   r2   )r"   r3   r+   r   r   r   rJ     s    "z*HTMLUnicodeInputStream.characterErrorsUCS4c             C   s  d } x t  j |  D] } | r( q n  t | j    } | j   } t j | | | d   r t j | | | d   } | t k r |  j	 j
 d  n  d } q | d k r | d k r | t |  d k r |  j	 j
 d  q d } |  j	 j
 d  q Wd  S)NF   zinvalid-codepointTi   i  r   )ro   finditerrj   groupstartr   isSurrogatePairsurrogatePairToCodepointnon_bmp_invalid_codepointsrW   r2   r$   )r"   r3   skipmatch	codepointr%   char_valr   r   r   rO     s     	z*HTMLUnicodeInputStream.characterErrorsUCS2Fc       
      C   sq  y t  | | f } Wn t k
 r x& | D] } t |  d k  s+ t  q+ Wd j d d   | D  } | s| d | } n  t j d |  } t  | | f <Yn Xg  } x | j |  j |  j	  } | d k r |  j	 |  j
 k r-Pq-nE | j   } | |  j
 k r-| j |  j |  j	 |   | |  _	 Pn  | j |  j |  j	 d   |  j   s Pq q Wd j |  }	 |	 S)z Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
           r4   c             S   s    g  |  ] } d  t  |   q S)z\x%02x)rj   )r   cr   r   r   r   >  s   	 z5HTMLUnicodeInputStream.charsUntil.<locals>.<listcomp>z^%sz[%s]+N)charsUntilRegExKeyErrorrj   r)   r5   rL   rM   ry   r&   rV   rU   endr2   re   )
r"   
charactersoppositecharsr}   regexr7   mr   rr   r   r   
charsUntil0  s2    &	z!HTMLUnicodeInputStream.charsUntilc             C   so   | d  k	 rk |  j  d k r= | |  j |  _ |  j d 7_ qk |  j  d 8_  |  j |  j  | k sk t  n  d  S)Nr   r   )rV   r&   rU   r)   )r"   rf   r   r   r   unget_  s    zHTMLUnicodeInputStream.unget)r   r   r   r<   ri   r#   rT   rR   rb   r!   rf   re   rJ   rO   r   r   r   r   r   r   r@      s   !(/r@   c               @   sy   e  Z d  Z d Z d d d d d  Z d d   Z d d	   Z d d d
 d  Z d d   Z d d   Z	 d d   Z
 d S)rA   zProvides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    NTc             C   s   |  j  |  |  _ t j |  |  j  t |  d f |  _ d |  _ d |  _ d |  _ |  j d d k r |  j	 | |  |  _ n  |  j
   d S)a  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        parseMeta - Look for a <meta> element containing encoding information

        rI   i   d   zwindows-1252r   N)rR   	rawStreamr@   r#   	codecNamerQ   numBytesMetanumBytesChardetdefaultEncodingdetectEncodingrT   )r"   rB   rC   rD   rE   r   r   r   r#   x  s    			zHTMLBinaryInputStream.__init__c             C   s6   t  j |  j d  |  j d  |  _ t j |   d  S)Nr   rl   )codecs	getreaderrQ   r   rS   r@   rT   )r"   r   r   r   rT     s    zHTMLBinaryInputStream.resetc          	   C   sV   t  | d  r | } n t |  } y | j | j    Wn t |  } Yn X| S)zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        r0   )r=   r   r,   r'   r   )r"   rB   r   r   r   r   rR     s    	z HTMLBinaryInputStream.openStreamc       
      C   s  |  j    } d } | d  k r9 | r9 |  j   } d } n  | d  k r:| r:d } y y d d l m } Wn" t k
 r d d l m } Yn Xg  } |   } x[ | j s |  j j |  j	  } t
 | t  s t  | s Pn  | j |  | j |  q W| j   | j d } |  j j d  Wq:t k
 r6Yq:Xn  | d  k rXd } |  j } n  i d d 6}	 | j   |	 k r|	 | j   } n  | | f S)NrI   	tentativer   )UniversalDetectorrC   zwindows-1252z
iso-8859-1)	detectBOMdetectEncodingMetacharade.universaldetectorr   ImportErrorchardet.universaldetectordoner   r0   r   r>   r/   r)   r2   feedcloseresultr,   r   lower)
r"   rD   rE   rC   
confidencer   buffersdetectorr    encodingSubr   r   r   r     sB    		
z$HTMLBinaryInputStream.detectEncodingc             C   s   |  j  d d k s t  t |  } | d	 k r: d } n  | d  k rJ d  S| |  j  d k rv |  j  d d f |  _  nF |  j j d  |  j   | d f |  _  t d |  j  d | f   d  S)
Nr   rI   utf-16	utf-16-be	utf-16-lezutf-8r   zEncoding changed from %s to %s)r   r   r   )rQ   r)   r   r   r,   rT   r   )r"   newEncodingr   r   r   changeEncoding  s    	
z$HTMLBinaryInputStream.changeEncodingc             C   s   i d t  j 6d t  j 6d t  j 6d t  j 6d t  j 6} |  j j d  } t | t	  s_ t
  | j | d d   } d } | s | j |  } d } | s | j | d d	   } d	 } q n  |  j j | r | p d
  | S)zAttempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return Nonezutf-8z	utf-16-lez	utf-16-bez	utf-32-lez	utf-32-be   N   rq   r   )r   BOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BEr   r0   r>   r/   r)   getr,   )r"   bomDictstringrC   r,   r   r   r   r     s     
zHTMLBinaryInputStream.detectBOMc             C   sk   |  j  j |  j  } t | t  s* t  t |  } |  j  j d  | j   } | d k rg d } n  | S)z9Report the encoding declared by the meta element
        r   utf-16	utf-16-be	utf-16-lezutf-8)r   r   r   )	r   r0   r   r>   r/   r)   EncodingParserr,   getEncoding)r"   r    parserrC   r   r   r   r     s    	z(HTMLBinaryInputStream.detectEncodingMeta)r   r   r   r<   r#   rT   rR   r   r   r   r   r   r   r   r   rA   p  s   (-rA   c               @   s   e  Z d  Z d Z d d   Z d d   Z d d   Z d d	   Z d
 d   Z d d   Z	 d d   Z
 d d   Z e e e
  Z d d   Z e e  Z e d d  Z d d   Z d d   Z d d   Z d S)EncodingByteszString-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raisedc             C   s+   t  | t  s t  t j |  | j    S)N)r>   r/   r)   __new__r   )r"   valuer   r   r   r   &  s    zEncodingBytes.__new__c             C   s   d |  _  d  S)Nr   r   )rb   )r"   r   r   r   r   r#   *  s    zEncodingBytes.__init__c             C   s   |  S)Nr   )r"   r   r   r   __iter__-  s    zEncodingBytes.__iter__c             C   sV   |  j  d } |  _  | t |   k r/ t  n | d k  rD t  n  |  | | d  S)Nr   r   )rb   r$   StopIterationr?   )r"   pr   r   r   __next__0  s    		zEncodingBytes.__next__c             C   s
   |  j    S)N)r   )r"   r   r   r   next8  s    zEncodingBytes.nextc             C   s\   |  j  } | t |   k r$ t  n | d k  r9 t  n  | d |  _  } |  | | d  S)Nr   r   )rb   r$   r   r?   )r"   r   r   r   r   previous<  s    			zEncodingBytes.previousc             C   s+   |  j  t |   k r t  n  | |  _  d  S)N)rb   r$   r   )r"   r!   r   r   r   setPositionE  s    	zEncodingBytes.setPositionc             C   s<   |  j  t |   k r t  n  |  j  d k r4 |  j  Sd  Sd  S)Nr   )rb   r$   r   )r"   r   r   r   getPositionJ  s
    	zEncodingBytes.getPositionc             C   s   |  |  j  |  j  d  S)Nr   )r!   )r"   r   r   r   getCurrentByteT  s    zEncodingBytes.getCurrentBytec             C   sf   |  j  } xM | t |   k  rX |  | | d  } | | k rK | |  _ | S| d 7} q W| |  _ d S)zSkip past a list of charactersr   N)r!   r$   rb   )r"   r   r   r}   r   r   r   rx   Y  s    			zEncodingBytes.skipc             C   sf   |  j  } xM | t |   k  rX |  | | d  } | | k rK | |  _ | S| d 7} q W| |  _ d  S)Nr   )r!   r$   rb   )r"   r   r   r}   r   r   r   	skipUntile  s    			zEncodingBytes.skipUntilc             C   sT   |  j  } |  | | t |   } | j |  } | rP |  j  t |  7_  n  | S)zLook for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone)r!   r$   
startswith)r"   r/   r   r3   r7   r   r   r   
matchBytesp  s    	zEncodingBytes.matchBytesc             C   sn   |  |  j  d  j |  } | d k rd |  j d k rC d |  _ n  |  j | t |  d 7_ d St  d S)zLook for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the matchNr   r   Tr   r   )r!   findrb   r$   r   )r"   r/   newPositionr   r   r   jumpTo{  s    zEncodingBytes.jumpToN)r   r   r   r<   r   r#   r   r   r   r   r   r   propertyr!   r   currentBytespaceCharactersBytesrx   r   r   r   r   r   r   r   r   "  s    	r   c               @   s   e  Z d  Z d Z d d   Z d d   Z d d   Z d d	   Z d
 d   Z d d   Z	 d d   Z
 d d   Z d d   Z d S)r   z?Mini parser for detecting character encoding from meta elementsc             C   s   t  |  |  _ d |  _ d S)z3string - the data to work on for encoding detectionN)r   r3   rC   )r"   r3   r   r   r   r#     s    zEncodingParser.__init__c             C   s   d |  j  f d |  j f d |  j f d |  j f d |  j f d |  j f f } xw |  j D]l } d } xS | D]K \ } } |  j j |  rk y |   } PWq t k
 r d } PYq Xqk qk W| sX PqX qX W|  j S)	Ns   <!--s   <metas   </s   <!s   <?r   TF)	handleComment
handleMetahandlePossibleEndTaghandleOtherhandlePossibleStartTagr3   r   r   rC   )r"   methodDispatchbytekeepParsingkeymethodr   r   r   r     s&    	zEncodingParser.getEncodingc             C   s   |  j  j d  S)zSkip over commentss   -->)r3   r   )r"   r   r   r   r     s    zEncodingParser.handleCommentc             C   sE  |  j  j t k r d Sd } d  } x|  j   } | d  k rA d S| d d k r | d d k } | r=| d  k	 r=| |  _ d Sq% | d d k r | d } t |  } | d  k	 r=| |  _ d Sq% | d d k r% t t | d   } | j   } | d  k	 r=t |  } | d  k	 r:| r.| |  _ d S| } q:q=q% q% Wd  S)	NTFr   s
   http-equivr   s   content-types   charsets   content)	r3   r   r   getAttributerC   r   ContentAttrParserr   parse)r"   	hasPragmapendingEncodingattrtentativeEncodingcodeccontentParserr   r   r   r     s:    	
		zEncodingParser.handleMetac             C   s   |  j  d  S)NF)handlePossibleTag)r"   r   r   r   r     s    z%EncodingParser.handlePossibleStartTagc             C   s   t  |  j  |  j d  S)NT)r   r3   r   )r"   r   r   r   r     s    z#EncodingParser.handlePossibleEndTagc             C   s   |  j  } | j t k r9 | r5 | j   |  j   n  d S| j t  } | d k ra | j   n+ |  j   } x | d  k	 r |  j   } qp Wd S)NTr   )r3   r   asciiLettersBytesr   r   r   spacesAngleBracketsr   )r"   endTagr3   r}   r   r   r   r   r     s    	
z EncodingParser.handlePossibleTagc             C   s   |  j  j d  S)Nr   )r3   r   )r"   r   r   r   r     s    zEncodingParser.handleOtherc             C   s  |  j  } | j t t d g  B } | d k sI t |  d k sI t  | d	 k rY d Sg  } g  } x | d k r~ | r~ Pnz | t k r | j   } Pn^ | d
 k r d j |  d f S| t k r | j | j	    n | d k r d S| j |  t
 |  } qh W| d k r1| j   d j |  d f St
 |  | j   } | d k r| } x t
 |  } | | k rt
 |  d j |  d j |  f S| t k r| j | j	    q\| j |  q\Wn^ | d k rd j |  d f S| t k r| j | j	    n | d k r!d S| j |  xw t
 |  } | t k red j |  d j |  f S| t k r| j | j	    q1| d k rd S| j |  q1Wd S)z_Return a name,value pair for the next attribute in the stream,
        if one is found, or None   /Nr   r      =       '   ")r   N)r   r   )r   r   )r3   rx   r   	frozensetr$   r)   r5   asciiUppercaseBytesr2   r   r   r   r   )r"   r3   r}   attrName	attrValue	quoteCharr   r   r   r     sh    	$


zEncodingParser.getAttributeN)r   r   r   r<   r#   r   r   r   r   r   r   r   r   r   r   r   r   r     s   $r   c               @   s(   e  Z d  Z d d   Z d d   Z d S)r   c             C   s"   t  | t  s t  | |  _ d  S)N)r>   r/   r)   r3   )r"   r3   r   r   r   r#   ?  s    zContentAttrParser.__init__c             C   sN  y1|  j  j d  |  j  j d 7_ |  j  j   |  j  j d k sH d  S|  j  j d 7_ |  j  j   |  j  j d k r |  j  j } |  j  j d 7_ |  j  j } |  j  j |  r |  j  | |  j  j  Sd  Sn] |  j  j } y+ |  j  j t  |  j  | |  j  j  SWn# t k
 r/|  j  | d   SYn XWn t k
 rId  SYn Xd  S)Ns   charsetr   r   r   r   )r   r   )r3   r   r!   rx   r   r   r   r   )r"   	quoteMarkoldPositionr   r   r   r   C  s.    zContentAttrParser.parseN)r   r   r   r#   r   r   r   r   r   r   >  s   r   c             C   st   t  |  t  r> y |  j d  }  Wq> t k
 r: d SYq> Xn  |  rl t j d |   j   } t j | d  Sd Sd S)z{Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.r   Nr4   )	r>   r/   decodeUnicodeDecodeErrorascii_punctuation_rerk   r   r   r   )rC   canonicalNamer   r   r   r   e  s    r   )+
__future__r   r   r   Zpip._vendor.sixr   r   rL   	constantsr   r   r	   r
   r   r   r4   r   ior   r   r   r   objectr   r   r   r   r   rM   ro   setrw   r   r~   r   rG   r@   rA   r/   r   r   r   r   r   r   r   r   <module>   sJ   "Jg'