try: import chardet CHARDET_OK = True except: CHARDET_OK = False import json import os import re from hydrus.core import HydrusExceptions re_newlines = re.compile( '[\r\n]+' ) re_multiple_spaces = re.compile( r'\s+' ) # want to keep the 'leading space' part here, despite tag.strip() elsewhere, in case of some crazy '- test' tag re_leading_space_or_garbage = re.compile( r'^(\s|-|system:)+' ) re_leading_single_colon = re.compile( '^:(?!:)' ) re_leading_byte_order_mark = re.compile( '^\ufeff' ) # unicode .txt files prepend with this, wew def HexFilter( text ): text = text.lower() text = re.sub( '[^0123456789abcdef]', '', text ) return text def DeserialiseNewlinedTexts( text ): texts = text.splitlines() texts = [ StripIOInputLine( line ) for line in texts ] texts = [ line for line in texts if line != '' ] return texts def ElideText( text, max_length, elide_center = False ): if len( text ) > max_length: if elide_center: CENTER_END_CHARS = max( 2, max_length // 8 ) text = '{}\u2026{}'.format( text[ : max_length - ( 1 + CENTER_END_CHARS ) ], text[ - CENTER_END_CHARS : ] ) else: text = '{}\u2026'.format( text[ : max_length - 1 ] ) return text def LooksLikeHTML( file_data ): # this will false-positive if it is json that contains html, ha ha if isinstance( file_data, bytes ): search_elements = ( b' confidence chardet_errors_is_better = error_count is None or chardet_error_count < error_count chardet_is_better = chardet_confidence_is_better and chardet_errors_is_better if chardet_is_better: text = chardet_text encoding = chardet_encoding else: if text is None: try: ( default_text, default_encoding, default_error_count ) = DefaultDecode( data ) text = default_text encoding = default_encoding except: text = 'Could not decode the page--problem with given encoding "{}" and no chardet library available.'.format( encoding ) encoding = 'utf-8' if text is None: raise Exception() except Exception as e: text = 'Unfortunately, could not decode the page with given encoding "{}".'.format( encoding ) encoding = 'utf-8' if NULL_CHARACTER in text: # I guess this is valid in unicode for some reason # funnily enough, it is not replaced by 'replace' # nor does it raise an error in normal str creation text = text.replace( NULL_CHARACTER, '' ) return ( text, encoding ) def RemoveNewlines( text ): text = re.sub( r'\r|\n', '', text ) return text def SortStringsIgnoringCase( list_of_strings ): list_of_strings.sort( key = lambda s: s.lower() ) def StripIOInputLine( t ): t = re_leading_byte_order_mark.sub( '', t ) t = t.strip() return t