hydrus/hydrus/core/HydrusText.py

try:
    
    import chardet
    
    CHARDET_OK = True
    
except:
    
    CHARDET_OK = False
    
import json
import os
import re

from hydrus.core import HydrusExceptions

# Pre-compiled patterns shared by the text helpers in this module (and importable by other modules).

# one or more consecutive whitespace characters
re_one_or_more_whitespace = re.compile( r'\s+' ) # this does \t and friends too
# want to keep the 'leading space' part here, despite tag.strip() elsewhere, in case of some crazy '- test' tag
# NOTE(review): the pattern below has no whitespace component, so the 'leading space' remark above looks stale -- confirm
# matches one or more leading repetitions of '-' or 'system:'
re_leading_garbage = re.compile( r'^(-|system:)+' )
# matches exactly one leading colon, i.e. a colon that is not followed by a second colon
re_leading_single_colon = re.compile( '^:(?!:)' )
# matches exactly two leading colons, i.e. '::' that is not followed by a third colon
re_leading_double_colon = re.compile( '^::(?!:)' )
# matches a U+FEFF byte order mark at the very start of the string
re_leading_byte_order_mark = re.compile( '^\ufeff' ) # unicode .txt files prepend with this, wew

# the single newline sequence that CleanNoteText joins note lines with
HYDRUS_NOTE_NEWLINE = '\n'

def CleanNoteText( t: str ):
    """
    Tidy up note text.
    
    The text is trimmed as a whole, every line is trimmed on its own, all line
    endings become HYDRUS_NOTE_NEWLINE, and any run of three or more newlines
    is squeezed down to two (so at most one blank line in a row survives).
    """
    
    # trim the whole text, split on any kind of line boundary, then trim every line individually
    
    trimmed_lines = [ raw_line.strip() for raw_line in t.strip().splitlines() ]
    
    # rejoining with the note newline is what normalises the mixed line endings
    
    cleaned = HYDRUS_NOTE_NEWLINE.join( trimmed_lines )
    
    # squeeze big vertical gaps; repeat because one pass of replace can leave a triple behind
    
    too_many_newlines = HYDRUS_NOTE_NEWLINE * 3
    most_newlines_allowed = HYDRUS_NOTE_NEWLINE * 2
    
    while too_many_newlines in cleaned:
        
        cleaned = cleaned.replace( too_many_newlines, most_newlines_allowed )
        
    
    return cleaned
    

def HexFilter( text ):
    """
    Return the lowercased text with every character that is not a hex digit (0-9, a-f) removed.
    """
    
    lowercased = text.lower()
    
    return re.sub( '[^0123456789abcdef]', '', lowercased )
    
def DeserialiseNewlinedTexts( text ):
    """
    Split text into lines, clean each one with StripIOInputLine, and return the non-empty results as a list.
    """
    
    # lazily clean every line; the list below only keeps the ones with something left in them
    cleaned_lines = ( StripIOInputLine( raw_line ) for raw_line in text.splitlines() )
    
    return [ cleaned_line for cleaned_line in cleaned_lines if cleaned_line != '' ]
    
def ElideText( text, max_length, elide_center = False ):
    """
    Shorten text to at most max_length characters, marking the cut with a single '\u2026' character.
    
    text: the string to shorten. It is returned untouched if it already fits.
    max_length: the maximum number of characters in the result. Zero or less gives an empty string.
    elide_center: if True, the head of the text and a short tail (max_length // 8 characters, at least 2 when
        there is room) are kept with the ellipsis between them. If False, only the head is kept and the
        ellipsis goes on the end.
    
    Returns the possibly shortened string.
    """
    
    if len( text ) <= max_length:
        
        return text
        
    
    if max_length <= 0:
        
        # there is no room even for the ellipsis. without this guard the slices below get a negative end index
        # and return nearly the whole text
        
        return ''
        
    
    if elide_center:
        
        # keep a short tail, but never more than fits next to the ellipsis
        num_end_chars = min( max( 2, max_length // 8 ), max_length - 1 )
        
        num_start_chars = max_length - ( 1 + num_end_chars )
        
        # the tail is sliced from an explicit start index because text[ -0 : ] would be the whole string
        return '{}\u2026{}'.format( text[ : num_start_chars ], text[ len( text ) - num_end_chars : ] )
        
    
    return '{}\u2026'.format( text[ : max_length - 1 ] )
    
def LooksLikeHTML( file_data ):
    """
    Return True if file_data (bytes or str) contains an opening html or title tag, in all-lower or all-upper case.
    """
    # a json document that embeds html will also come back True from this
    
    markers = ( b'<html', b'<HTML', b'<title', b'<TITLE' ) if isinstance( file_data, bytes ) else ( '<html', '<HTML', '<title', '<TITLE' )
    
    return any( marker in file_data for marker in markers )
    
def LooksLikeJSON( file_data ):
    """
    Return True if file_data parses as JSON, otherwise False.
    
    file_data: str, or bytes that are decoded as utf-8 before parsing.
    
    Any ordinary failure (bad utf-8, invalid JSON, wrong argument type) means 'not JSON'.
    """
    
    try:
        
        if isinstance( file_data, bytes ):
            
            file_data = str( file_data, 'utf-8' )
            
        
        json.loads( file_data )
        
    except Exception:
        
        # deliberately not a bare except: KeyboardInterrupt and SystemExit should not be reported as 'not JSON'
        
        return False
        
    
    return True
    

# U+FFFD. the decode helpers below count occurrences of this after decoding with errors = 'replace'
UNICODE_REPLACEMENT_CHARACTER = u'\ufffd'
# U+0000. NonFailingUnicodeDecode removes this from its decoded text
NULL_CHARACTER = '\x00'

def ChardetDecode( data ):
    """
    Decode data using the encoding that chardet detects for it.
    
    Returns ( text, encoding, confidence, error_count ). encoding and confidence come straight from
    chardet.detect, and error_count is the number of replacement characters in the decoded text.
    
    Undecodable bytes are replaced rather than raised on. Whatever str() raises for an encoding it cannot
    use is left to propagate to the caller.
    """
    
    detection = chardet.detect( data )
    
    chardet_confidence = detection[ 'confidence' ]
    chardet_encoding = detection[ 'encoding' ]
    
    decoded = str( data, chardet_encoding, errors = 'replace' )
    
    return ( decoded, chardet_encoding, chardet_confidence, decoded.count( UNICODE_REPLACEMENT_CHARACTER ) )

def DefaultDecode( data ):
    """
    Decode data as windows-1252, replacing anything that does not decode.
    
    Returns ( text, encoding, error_count ), where error_count is the number of replacement characters in the text.
    """
    
    fallback_encoding = 'windows-1252'
    
    decoded = str( data, fallback_encoding, errors = 'replace' )
    
    return ( decoded, fallback_encoding, decoded.count( UNICODE_REPLACEMENT_CHARACTER ) )
    
def NonFailingUnicodeDecode( data, encoding ):
    """
    Decode data to text as well as possible, without raising on undecodable content.
    
    data: the bytes to decode.
    encoding: the encoding the source claimed, or None. 'ISO-8859-1', 'Windows-1252' and None are not trusted
        and go straight to detection.
    
    Returns ( text, encoding ), where encoding is the one that was actually used. If nothing at all works,
    text is a human-readable failure message and encoding is 'utf-8'. Null characters are removed from the text.
    
    Order of attempts:
        1. a strict decode with the given encoding
        2. if that hit bad bytes, the same encoding with errors replaced, scored against chardet's guess
        3. if there is still no text (no chardet, or chardet could not help), windows-1252 with errors replaced
    """
    
    text = None
    
    try:
        
        if encoding in ( 'ISO-8859-1', 'Windows-1252', None ):
            
            # ok, the site delivered one of these non-utf-8 'default' encodings. this is probably actually requests filling this in as default
            # we don't want to trust these because they are very permissive sets and'll usually decode garbage without errors
            # we want chardet to have a proper look
            
            raise LookupError()
            
        
        text = str( data, encoding )
        
    except ( UnicodeDecodeError, LookupError ) as e:
        
        try:
            
            if isinstance( e, UnicodeDecodeError ):
                
                # the encoding exists but some bytes were bad. keep a lossy decode as the candidate to beat
                
                text = str( data, encoding, errors = 'replace' )
                
                confidence = 0.7
                error_count = text.count( UNICODE_REPLACEMENT_CHARACTER )
                
            else:
                
                # unknown or untrusted encoding, so there is no candidate yet
                
                confidence = None
                error_count = None
                
            
            chardet_result = None
            
            if CHARDET_OK:
                
                try:
                    
                    chardet_result = ChardetDecode( data )
                    
                except Exception:
                    
                    # chardet gave nothing usable, e.g. an encoding of None or a codec name python does not know
                    # this must not throw away a candidate we already have, nor block the default fallback below
                    
                    chardet_result = None
                    
                
            
            if chardet_result is not None:
                
                ( chardet_text, chardet_encoding, chardet_confidence, chardet_error_count ) = chardet_result
                
                if chardet_error_count == 0:
                    
                    chardet_is_better = True
                    
                else:
                    
                    chardet_confidence_is_better = confidence is None or chardet_confidence > confidence
                    chardet_errors_is_better = error_count is None or chardet_error_count < error_count
                    
                    chardet_is_better = chardet_confidence_is_better and chardet_errors_is_better
                    
                
                if chardet_is_better:
                    
                    text = chardet_text
                    encoding = chardet_encoding
                    
                
            
            if text is None:
                
                # still nothing: either chardet is not installed or it could not help
                
                try:
                    
                    ( default_text, default_encoding, default_error_count ) = DefaultDecode( data )
                    
                    text = default_text
                    encoding = default_encoding
                    
                except Exception:
                    
                    if CHARDET_OK:
                        
                        # let the generic failure message below handle it
                        
                        raise
                        
                    
                    text = 'Could not decode the page--problem with given encoding "{}" and no chardet library available.'.format( encoding )
                    encoding = 'utf-8'
                    
                
            
            if text is None:
                
                # safety net, should not be reachable
                
                raise Exception()
                
            
        except Exception:
            
            text = 'Unfortunately, could not decode the page with given encoding "{}".'.format( encoding )
            encoding = 'utf-8'
            
        
    
    if NULL_CHARACTER in text:
        
        # I guess this is valid in unicode for some reason
        # funnily enough, it is not replaced by 'replace'
        # nor does it raise an error in normal str creation
        
        text = text.replace( NULL_CHARACTER, '' )
        
    
    return ( text, encoding )
    
def RemoveNewlines( text: str ) -> str:
    """
    Return text with every line boundary removed, so the lines are glued together with nothing in between.
    """
    
    pieces = text.splitlines()
    
    return ''.join( pieces )
    
def SortStringsIgnoringCase( list_of_strings ):
    """
    Sort list_of_strings in place, ordering by each item's lowercased form. Nothing is returned.
    """
    
    def lowercased( s ):
        
        return s.lower()
        
    
    list_of_strings.sort( key = lowercased )
    
def StripIOInputLine( t ):
    """
    Clean one line of imported text: drop a leading byte order mark, then trim surrounding whitespace.
    """
    
    without_bom = re_leading_byte_order_mark.sub( '', t )
    
    return without_bom.strip()
Version 341 2019-02-27 23:03:30 +00:00			`try:`

			`import chardet`

			`CHARDET_OK = True`

			`except:`

			`CHARDET_OK = False`

Version 325 2018-10-03 21:00:15 +00:00			`import json`
Version 419 2020-11-25 22:22:47 +00:00			`import os`
Version 286 2017-12-13 22:33:07 +00:00			`import re`

Version 445 2021-06-30 21:27:35 +00:00			`from hydrus.core import HydrusExceptions`

Version 509 2022-12-07 22:41:53 +00:00			`re_one_or_more_whitespace = re.compile( r'\s+' ) # this does \t and friends too`
Version 387 2020-03-04 22:12:53 +00:00			`# want to keep the 'leading space' part here, despite tag.strip() elsewhere, in case of some crazy '- test' tag`
Version 509 2022-12-07 22:41:53 +00:00			`re_leading_garbage = re.compile( r'^(-\|system:)+' )`
Version 335 2019-01-09 22:59:03 +00:00			`re_leading_single_colon = re.compile( '^:(?!:)' )`
Version 514 2023-01-25 22:59:39 +00:00			`re_leading_double_colon = re.compile( '^::(?!:)' )`
Version 335 2019-01-09 22:59:03 +00:00			`re_leading_byte_order_mark = re.compile( '^\ufeff' ) # unicode .txt files prepend with this, wew`
Version 286 2017-12-13 22:33:07 +00:00
Version 497 2022-08-24 21:06:25 +00:00			`HYDRUS_NOTE_NEWLINE = '\n'`

Version 496 2022-08-17 20:54:59 +00:00			`def CleanNoteText( t: str ):`

Version 497 2022-08-24 21:06:25 +00:00			`# trim leading and trailing whitespace`

Version 496 2022-08-17 20:54:59 +00:00			`t = t.strip()`

Version 497 2022-08-24 21:06:25 +00:00			`# wash all newlines to be os.linesep`

			`lines = t.splitlines()`

			`# now trim each line`

			`lines = [ line.strip() for line in lines ]`

			`t = HYDRUS_NOTE_NEWLINE.join( lines )`

			`# now replace big gaps with reasonable ones`

			`double_newline = HYDRUS_NOTE_NEWLINE * 2`
			`triple_newline = HYDRUS_NOTE_NEWLINE * 3`

			`while triple_newline in t:`

			`t = t.replace( triple_newline, double_newline )`


Version 496 2022-08-17 20:54:59 +00:00			`return t`


Version 339 2019-02-06 22:41:35 +00:00			`def HexFilter( text ):`

			`text = text.lower()`

			`text = re.sub( '[^0123456789abcdef]', '', text )`

			`return text`

Version 286 2017-12-13 22:33:07 +00:00			`def DeserialiseNewlinedTexts( text ):`

Version 340 2019-02-13 22:26:43 +00:00			`texts = text.splitlines()`
Version 286 2017-12-13 22:33:07 +00:00
Version 341 2019-02-27 23:03:30 +00:00			`texts = [ StripIOInputLine( line ) for line in texts ]`
Version 286 2017-12-13 22:33:07 +00:00
			`texts = [ line for line in texts if line != '' ]`

			`return texts`

Version 388 2020-03-11 21:52:11 +00:00			`def ElideText( text, max_length, elide_center = False ):`
Version 379 2020-01-02 03:05:35 +00:00
			`if len( text ) > max_length:`

Version 388 2020-03-11 21:52:11 +00:00			`if elide_center:`

			`CENTER_END_CHARS = max( 2, max_length // 8 )`

			`text = '{}\u2026{}'.format( text[ : max_length - ( 1 + CENTER_END_CHARS ) ], text[ - CENTER_END_CHARS : ] )`

			`else:`

			`text = '{}\u2026'.format( text[ : max_length - 1 ] )`

Version 379 2020-01-02 03:05:35 +00:00

			`return text`

Version 321 2018-09-05 20:52:32 +00:00			`def LooksLikeHTML( file_data ):`
Version 325 2018-10-03 21:00:15 +00:00			`# this will false-positive if it is json that contains html, ha ha`
Version 321 2018-09-05 20:52:32 +00:00
Version 335 2019-01-09 22:59:03 +00:00			`if isinstance( file_data, bytes ):`

Version 459 closes #447, closes #982, closes #875, closes #989, closes #986, closes #858, closes #855, closes #807, closes #790 2021-10-27 21:12:33 +00:00			`search_elements = ( b'<html', b'<HTML', b'<title', b'<TITLE' )`
Version 335 2019-01-09 22:59:03 +00:00
			`else:`

Version 459 closes #447, closes #982, closes #875, closes #989, closes #986, closes #858, closes #855, closes #807, closes #790 2021-10-27 21:12:33 +00:00			`search_elements = ( '<html', '<HTML', '<title', '<TITLE' )`
Version 335 2019-01-09 22:59:03 +00:00

			`for s_e in search_elements:`

			`if s_e in file_data:`

			`return True`



			`return False`
Version 321 2018-09-05 20:52:32 +00:00
Version 325 2018-10-03 21:00:15 +00:00			`def LooksLikeJSON( file_data ):`

			`try:`

Version 335 2019-01-09 22:59:03 +00:00			`if isinstance( file_data, bytes ):`

			`file_data = str( file_data, 'utf-8' )`


Version 325 2018-10-03 21:00:15 +00:00			`json.loads( file_data )`

			`return True`

			`except:`

			`return False`


Version 423 2020-12-23 23:07:58 +00:00
			`UNICODE_REPLACEMENT_CHARACTER = u'\ufffd'`
			`NULL_CHARACTER = '\x00'`

Version 445 2021-06-30 21:27:35 +00:00			`def ChardetDecode( data ):`

			`chardet_result = chardet.detect( data )`

			`chardet_confidence = chardet_result[ 'confidence' ]`

			`chardet_encoding = chardet_result[ 'encoding' ]`

			`chardet_text = str( data, chardet_encoding, errors = 'replace' )`

			`chardet_error_count = chardet_text.count( UNICODE_REPLACEMENT_CHARACTER )`

			`return ( chardet_text, chardet_encoding, chardet_confidence, chardet_error_count )`

			`def DefaultDecode( data ):`

			`default_encoding = 'windows-1252'`

			`default_text = str( data, default_encoding, errors = 'replace' )`

			`default_error_count = default_text.count( UNICODE_REPLACEMENT_CHARACTER )`

			`return ( default_text, default_encoding, default_error_count )`

Version 340 2019-02-13 22:26:43 +00:00			`def NonFailingUnicodeDecode( data, encoding ):`

Version 445 2021-06-30 21:27:35 +00:00			`text = None`

Version 340 2019-02-13 22:26:43 +00:00			`try:`

Version 445 2021-06-30 21:27:35 +00:00			`if encoding in ( 'ISO-8859-1', 'Windows-1252', None ):`

			`# ok, the site delivered one of these non-utf-8 'default' encodings. this is probably actually requests filling this in as default`
			`# we don't want to trust these because they are very permissive sets and'll usually decode garbage without errors`
			`# we want chardet to have a proper look`

			`raise LookupError()`

Version 340 2019-02-13 22:26:43 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`text = str( data, encoding )`
Version 340 2019-02-13 22:26:43 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`except ( UnicodeDecodeError, LookupError ) as e:`
Version 340 2019-02-13 22:26:43 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`try:`
Version 340 2019-02-13 22:26:43 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`if isinstance( e, UnicodeDecodeError ):`

			`text = str( data, encoding, errors = 'replace' )`

			`confidence = 0.7`
			`error_count = text.count( UNICODE_REPLACEMENT_CHARACTER )`

			`else:`

			`confidence = None`
			`error_count = None`

Version 340 2019-02-13 22:26:43 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`if CHARDET_OK:`
Version 341 2019-02-27 23:03:30 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`( chardet_text, chardet_encoding, chardet_confidence, chardet_error_count ) = ChardetDecode( data )`
Version 341 2019-02-27 23:03:30 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`if chardet_error_count == 0:`

			`chardet_is_better = True`

			`else:`

			`chardet_confidence_is_better = confidence is None or chardet_confidence > confidence`
			`chardet_errors_is_better = error_count is None or chardet_error_count < error_count`

			`chardet_is_better = chardet_confidence_is_better and chardet_errors_is_better`

Version 340 2019-02-13 22:26:43 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`if chardet_is_better:`

			`text = chardet_text`
			`encoding = chardet_encoding`

Version 341 2019-02-27 23:03:30 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`else:`

			`if text is None:`
Version 341 2019-02-27 23:03:30 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`try:`
Version 410 2020-09-02 21:10:41 +00:00
Version 445 2021-06-30 21:27:35 +00:00			`( default_text, default_encoding, default_error_count ) = DefaultDecode( data )`

			`text = default_text`
			`encoding = default_encoding`

			`except:`

			`text = 'Could not decode the page--problem with given encoding "{}" and no chardet library available.'.format( encoding )`
			`encoding = 'utf-8'`
Version 410 2020-09-02 21:10:41 +00:00
Version 341 2019-02-27 23:03:30 +00:00
Version 340 2019-02-13 22:26:43 +00:00

Version 445 2021-06-30 21:27:35 +00:00			`if text is None:`

			`raise Exception()`


			`except Exception as e:`

			`text = 'Unfortunately, could not decode the page with given encoding "{}".'.format( encoding )`
			`encoding = 'utf-8'`

Version 340 2019-02-13 22:26:43 +00:00

Version 423 2020-12-23 23:07:58 +00:00			`if NULL_CHARACTER in text:`

			`# I guess this is valid in unicode for some reason`
			`# funnily enough, it is not replaced by 'replace'`
			`# nor does it raise an error in normal str creation`

			`text = text.replace( NULL_CHARACTER, '' )`


Version 340 2019-02-13 22:26:43 +00:00			`return ( text, encoding )`

Version 509 2022-12-07 22:41:53 +00:00			`def RemoveNewlines( text: str ) -> str:`
Version 324 2018-09-26 19:05:12 +00:00
Version 509 2022-12-07 22:41:53 +00:00			`text = ''.join( text.splitlines() )`
Version 324 2018-09-26 19:05:12 +00:00
			`return text`

Version 309 2018-05-30 20:13:21 +00:00			`def SortStringsIgnoringCase( list_of_strings ):`

			`list_of_strings.sort( key = lambda s: s.lower() )`

Version 341 2019-02-27 23:03:30 +00:00			`def StripIOInputLine( t ):`
Version 286 2017-12-13 22:33:07 +00:00
Version 290 2018-01-17 22:52:10 +00:00			`t = re_leading_byte_order_mark.sub( '', t )`

Version 341 2019-02-27 23:03:30 +00:00			`t = t.strip()`
Version 286 2017-12-13 22:33:07 +00:00
			`return t`