hydrus/include/HydrusText.py

# chardet is an optional third-party dependency. CHARDET_OK records whether it could be
# imported, so code that wants encoding detection can check the flag first.
try:
    
    import chardet
    
    CHARDET_OK = True
    
except Exception:
    
    # deliberately broad: any failure while importing the optional module just disables it.
    # unlike a bare except, this does not swallow KeyboardInterrupt or SystemExit
    CHARDET_OK = False
    
    
import json
import re

# shared, precompiled patterns for text cleanup

# one or more consecutive carriage return / line feed characters
re_newlines = re.compile( '[\r\n]+' )
# one or more consecutive whitespace characters (note: this also matches a single one)
re_multiple_spaces = re.compile( r'\s+' )
# a leading run made of whitespace characters, hyphens, and/or the literal 'system:'
re_leading_space_or_garbage = re.compile( r'^(\s|-|system:)+' )
# a single colon at the very start that is not immediately followed by a second colon
re_leading_single_colon = re.compile( '^:(?!:)' )
# a U+FEFF byte order mark at the very start of the text
re_leading_byte_order_mark = re.compile( '^\ufeff' ) # unicode .txt files prepend with this, wew

def HexFilter( text ):
    """
    Return text lowercased with every character that is not a hex digit (0-9, a-f) removed.
    """
    
    lowercased = text.lower()
    
    return re.sub( '[^0123456789abcdef]', '', lowercased )
    
def DeserialiseNewlinedTexts( text ):
    """
    Split text into lines, clean each one with StripIOInputLine, and return the non-empty results.
    
    The returned list keeps the original line order.
    """
    
    cleaned_lines = ( StripIOInputLine( raw_line ) for raw_line in text.splitlines() )
    
    return [ cleaned_line for cleaned_line in cleaned_lines if cleaned_line != '' ]
    
def ElideText( text, max_length ):
    """
    Shorten text so it is no longer than max_length characters.
    
    Text that already fits is returned unchanged. Otherwise the text is cut to
    max_length - 1 characters and a single ellipsis character (U+2026) is appended,
    so the result is exactly max_length characters long.
    
    If max_length is less than 1 and the text does not fit, there is no room even for
    the ellipsis, so an empty string is returned. (Previously the negative slice index
    made the result longer than the limit, e.g. 'hello' with 0 gave 'hell' + ellipsis.)
    """
    
    if len( text ) <= max_length:
        
        return text
        
    
    if max_length < 1:
        
        # text[ : max_length - 1 ] would count from the end of the string here
        return ''
        
    
    return '{}\u2026'.format( text[ : max_length - 1 ] )
    
def LooksLikeHTML( file_data ):
    """
    Return True if file_data contains '<html' or '<HTML' anywhere, otherwise False.
    
    Bytes input is searched with bytes markers, anything else with str markers.
    Only the all-lowercase and all-uppercase spellings are checked.
    """
    # this will false-positive if it is json that contains html, ha ha
    
    html_markers = ( b'<html', b'<HTML' ) if isinstance( file_data, bytes ) else ( '<html', '<HTML' )
    
    return any( marker in file_data for marker in html_markers )
    
def LooksLikeJSON( file_data ):
    """
    Return True if file_data parses as JSON, otherwise False.
    
    Bytes input is decoded as utf-8 before parsing. Any ordinary failure (bad utf-8,
    invalid JSON, input that is not text at all) counts as 'not JSON'.
    """
    
    try:
        
        if isinstance( file_data, bytes ):
            
            file_data = str( file_data, 'utf-8' )
            
        
        json.loads( file_data )
        
    except Exception:
        
        # still a catch-all for parse and decode problems, but unlike a bare except
        # it lets KeyboardInterrupt and SystemExit propagate
        return False
        
    
    return True
    
def NonFailingUnicodeDecode( data, encoding ):
    """
    Decode data to str without raising on undecodable bytes.
    
    Strict decoding with the given encoding is tried first. If that raises
    UnicodeDecodeError, the data is decoded again with errors = 'replace'. When chardet
    is available and reports a confidence above 0.85, the data is also decoded with
    chardet's guessed encoding, and that version is used if it contains fewer U+FFFD
    replacement characters.
    
    Returns a tuple ( text, encoding ), where encoding is the one actually used.
    
    An unknown 'encoding' argument still raises LookupError, as before. Only chardet's
    guess is protected against being unusable.
    """
    
    try:
        
        text = str( data, encoding )
        
    except UnicodeDecodeError:
        
        unicode_replacement_character = u'\ufffd'
        
        text = str( data, encoding, errors = 'replace' )
        
        error_count = text.count( unicode_replacement_character )
        
        if CHARDET_OK:
            
            chardet_result = chardet.detect( data )
            
            if chardet_result[ 'confidence' ] > 0.85:
                
                chardet_encoding = chardet_result[ 'encoding' ]
                
                try:
                    
                    chardet_text = str( data, chardet_encoding, errors = 'replace' )
                    
                except ( LookupError, TypeError ):
                    
                    # chardet may name an encoding python has no codec for (LookupError),
                    # or give no name at all (TypeError). keep the replace-decoded text
                    chardet_text = None
                    
                
                if chardet_text is not None:
                    
                    chardet_error_count = chardet_text.count( unicode_replacement_character )
                    
                    if chardet_error_count < error_count:
                        
                        return ( chardet_text, chardet_encoding )
                        
                    
                
            
        
    
    return ( text, encoding )
    
def RemoveNewlines( text ):
    """
    Return text with every carriage return and line feed character deleted.
    
    Nothing is inserted in their place, so adjacent lines are joined directly.
    """
    
    return re.sub( r'\r|\n', '', text )
    
def SortStringsIgnoringCase( list_of_strings ):
    """
    Sort list_of_strings in place, ordering items by their lowercased form.
    
    The list is modified directly and nothing is returned.
    """
    
    def lowercase_key( item ):
        
        return item.lower()
        
    
    list_of_strings.sort( key = lowercase_key )
    
def StripIOInputLine( t ):
    """
    Clean up one line of input text.
    
    A leading U+FEFF byte order mark is removed first, then leading and trailing
    whitespace is stripped.
    """
    
    without_bom = re_leading_byte_order_mark.sub( '', t )
    
    return without_bom.strip()
Version 341 2019-02-27 23:03:30 +00:00			`try:`

			`import chardet`

			`CHARDET_OK = True`

			`except:`

			`CHARDET_OK = False`

Version 325 2018-10-03 21:00:15 +00:00			`import json`
Version 286 2017-12-13 22:33:07 +00:00			`import re`

Version 335 2019-01-09 22:59:03 +00:00			`re_newlines = re.compile( '[\r\n]+' )`
Version 363 2019-08-07 22:59:53 +00:00			`re_multiple_spaces = re.compile( r'\s+' )`
			`re_leading_space_or_garbage = re.compile( r'^(\s\|-\|system:)+' )`
Version 335 2019-01-09 22:59:03 +00:00			`re_leading_single_colon = re.compile( '^:(?!:)' )`
			`re_leading_byte_order_mark = re.compile( '^\ufeff' ) # unicode .txt files prepend with this, wew`
Version 286 2017-12-13 22:33:07 +00:00
Version 339 2019-02-06 22:41:35 +00:00			`def HexFilter( text ):`

			`text = text.lower()`

			`text = re.sub( '[^0123456789abcdef]', '', text )`

			`return text`

Version 286 2017-12-13 22:33:07 +00:00			`def DeserialiseNewlinedTexts( text ):`

Version 340 2019-02-13 22:26:43 +00:00			`texts = text.splitlines()`
Version 286 2017-12-13 22:33:07 +00:00
Version 341 2019-02-27 23:03:30 +00:00			`texts = [ StripIOInputLine( line ) for line in texts ]`
Version 286 2017-12-13 22:33:07 +00:00
			`texts = [ line for line in texts if line != '' ]`

			`return texts`

Version 379 2020-01-02 03:05:35 +00:00			`def ElideText( text, max_length ):`

			`if len( text ) > max_length:`

			`text = '{}\u2026'.format( text[:max_length - 1] )`


			`return text`

Version 321 2018-09-05 20:52:32 +00:00			`def LooksLikeHTML( file_data ):`
Version 325 2018-10-03 21:00:15 +00:00			`# this will false-positive if it is json that contains html, ha ha`
Version 321 2018-09-05 20:52:32 +00:00
Version 335 2019-01-09 22:59:03 +00:00			`if isinstance( file_data, bytes ):`

			`search_elements = ( b'<html', b'<HTML' )`

			`else:`

			`search_elements = ( '<html', '<HTML' )`


			`for s_e in search_elements:`

			`if s_e in file_data:`

			`return True`



			`return False`
Version 321 2018-09-05 20:52:32 +00:00
Version 325 2018-10-03 21:00:15 +00:00			`def LooksLikeJSON( file_data ):`

			`try:`

Version 335 2019-01-09 22:59:03 +00:00			`if isinstance( file_data, bytes ):`

			`file_data = str( file_data, 'utf-8' )`


Version 325 2018-10-03 21:00:15 +00:00			`json.loads( file_data )`

			`return True`

			`except:`

			`return False`


Version 340 2019-02-13 22:26:43 +00:00			`def NonFailingUnicodeDecode( data, encoding ):`

			`try:`

			`text = str( data, encoding )`

			`except UnicodeDecodeError:`

			`unicode_replacement_character = u'\ufffd'`

			`text = str( data, encoding, errors = 'replace' )`

			`error_count = text.count( unicode_replacement_character )`

Version 341 2019-02-27 23:03:30 +00:00			`if CHARDET_OK:`
Version 340 2019-02-13 22:26:43 +00:00
Version 341 2019-02-27 23:03:30 +00:00			`chardet_result = chardet.detect( data )`
Version 340 2019-02-13 22:26:43 +00:00
Version 341 2019-02-27 23:03:30 +00:00			`if chardet_result[ 'confidence' ] > 0.85:`

			`chardet_encoding = chardet_result[ 'encoding' ]`

			`chardet_text = str( data, chardet_encoding, errors = 'replace' )`
Version 340 2019-02-13 22:26:43 +00:00
Version 341 2019-02-27 23:03:30 +00:00			`chardet_error_count = chardet_text.count( unicode_replacement_character )`

			`if chardet_error_count < error_count:`

			`return ( chardet_text, chardet_encoding )`

Version 340 2019-02-13 22:26:43 +00:00



			`return ( text, encoding )`

Version 324 2018-09-26 19:05:12 +00:00			`def RemoveNewlines( text ):`

			`text = re.sub( r'\r\|\n', '', text )`

			`return text`

Version 309 2018-05-30 20:13:21 +00:00			`def SortStringsIgnoringCase( list_of_strings ):`

			`list_of_strings.sort( key = lambda s: s.lower() )`

Version 341 2019-02-27 23:03:30 +00:00			`def StripIOInputLine( t ):`
Version 286 2017-12-13 22:33:07 +00:00
Version 290 2018-01-17 22:52:10 +00:00			`t = re_leading_byte_order_mark.sub( '', t )`

Version 341 2019-02-27 23:03:30 +00:00			`t = t.strip()`
Version 286 2017-12-13 22:33:07 +00:00
			`return t`