hydrus/include/HydrusText.py

142 lines
3.1 KiB
Python

# chardet is an optional dependency: record whether it could be imported so
# callers can skip encoding detection when it is not available.
try:
    
    import chardet
    
    CHARDET_OK = True
    
except Exception:
    
    # Any ordinary import failure simply disables the feature. Catching
    # Exception rather than using a bare except lets KeyboardInterrupt and
    # SystemExit propagate instead of being silently swallowed at import time.
    CHARDET_OK = False
    
import json
import re
# one or more consecutive carriage-return / line-feed characters
re_newlines = re.compile( '[\r\n]+' )
# one or more consecutive whitespace characters of any kind
re_multiple_spaces = re.compile( r'\s+' )
# want to keep the 'leading space' part here, despite tag.strip() elsewhere, in case of some crazy '- test' tag
# matches a leading run made up of whitespace, hyphens and/or literal 'system:' prefixes
re_leading_space_or_garbage = re.compile( r'^(\s|-|system:)+' )
# a colon at the very start that is not immediately followed by a second colon
re_leading_single_colon = re.compile( '^:(?!:)' )
# U+FEFF at the very start of the text
re_leading_byte_order_mark = re.compile( '^\ufeff' ) # unicode .txt files prepend with this, wew
def HexFilter( text ):
    """
    Return `text` lowercased with every character that is not a hex digit
    (0-9, a-f) removed.
    """
    
    lowercased = text.lower()
    
    return re.sub( '[^0123456789abcdef]', '', lowercased )
    
def DeserialiseNewlinedTexts( text ):
    """
    Split `text` into lines, clean each one with StripIOInputLine, and return
    the list of cleaned lines that are not empty.
    """
    
    cleaned_lines = ( StripIOInputLine( raw_line ) for raw_line in text.splitlines() )
    
    return [ cleaned for cleaned in cleaned_lines if cleaned != '' ]
    
def ElideText( text, max_length ):
    """
    Shorten `text` so it is no longer than `max_length` characters.
    
    Text that already fits is returned unchanged. Longer text is cut to
    `max_length - 1` characters and a horizontal ellipsis (U+2026) is appended,
    giving a result of exactly `max_length` characters.
    
    A `max_length` of zero or less cannot hold even the ellipsis, so an empty
    string is returned. Without this guard the slice end would be negative and
    count from the end of the string, returning nearly all of the text.
    """
    
    if len( text ) <= max_length:
        
        return text
        
    
    if max_length < 1:
        
        return ''
        
    
    return text[ : max_length - 1 ] + '\u2026'
    
def LooksLikeHTML( file_data ):
    """
    Return True if `file_data` contains an opening '<html' or '<HTML' marker.
    
    bytes input is searched with bytes markers; anything else is searched with
    str markers.
    """
    
    # this will false-positive if it is json that contains html, ha ha
    
    markers = ( b'<html', b'<HTML' ) if isinstance( file_data, bytes ) else ( '<html', '<HTML' )
    
    return any( marker in file_data for marker in markers )
    
def LooksLikeJSON( file_data ):
    """
    Return True if `file_data` parses as JSON, otherwise False.
    
    bytes input is decoded as utf-8 before parsing. Any ordinary failure
    (undecodable bytes, invalid JSON, an unsupported input type) yields False.
    """
    
    try:
        
        if isinstance( file_data, bytes ):
            
            file_data = str( file_data, 'utf-8' )
            
        
        json.loads( file_data )
        
    except Exception:
        
        # Exception rather than a bare except: a failed parse means "not JSON",
        # but KeyboardInterrupt and SystemExit must still propagate.
        return False
        
    
    return True
    
def NonFailingUnicodeDecode( data, encoding ):
    """
    Decode `data` to text, tolerating byte sequences that are invalid in
    `encoding`.
    
    Returns a ( text, encoding_used ) tuple. If a strict decode with `encoding`
    succeeds, that result is returned. Otherwise the data is decoded with
    errors = 'replace'. When chardet is available and reports an encoding with
    confidence above 0.85, that encoding is tried as well, and it is preferred
    only if it yields fewer U+FFFD replacement characters.
    
    An unknown caller-supplied `encoding` still raises LookupError, as before.
    """
    
    try:
        
        text = str( data, encoding )
        
    except UnicodeDecodeError:
        
        unicode_replacement_character = '\ufffd'
        
        text = str( data, encoding, errors = 'replace' )
        
        error_count = text.count( unicode_replacement_character )
        
        if CHARDET_OK:
            
            chardet_result = chardet.detect( data )
            
            chardet_encoding = chardet_result[ 'encoding' ]
            
            # chardet may report no encoding at all
            if chardet_encoding is not None and chardet_result[ 'confidence' ] > 0.85:
                
                try:
                    
                    chardet_text = str( data, chardet_encoding, errors = 'replace' )
                    
                except LookupError:
                    
                    # chardet named an encoding this Python has no codec for;
                    # fall back to the caller's encoding rather than raising
                    chardet_text = None
                    
                
                if chardet_text is not None:
                    
                    chardet_error_count = chardet_text.count( unicode_replacement_character )
                    
                    if chardet_error_count < error_count:
                        
                        return ( chardet_text, chardet_encoding )
                        
                    
                
            
        
    
    return ( text, encoding )
    
def RemoveNewlines( text ):
    """
    Return `text` with every carriage-return and line-feed character deleted.
    """
    
    return re.sub( r'\r|\n', '', text )
    
def SortStringsIgnoringCase( list_of_strings ):
    """
    Sort `list_of_strings` in place, comparing items by their lowercased form.
    
    Nothing is returned; the list passed in is the one that gets reordered.
    """
    
    def _lowercased( item ):
        
        return item.lower()
        
    
    list_of_strings.sort( key = _lowercased )
    
def StripIOInputLine( t ):
    """
    Clean one line of imported text: drop a leading byte order mark, then trim
    surrounding whitespace.
    """
    
    without_bom = re_leading_byte_order_mark.sub( '', t )
    
    return without_bom.strip()
    