hydrus/include/HydrusText.py

119 lines
2.7 KiB
Python
Raw Normal View History

2018-10-03 21:00:15 +00:00
import json
2017-12-13 22:33:07 +00:00
import re
2019-01-09 22:59:03 +00:00
# one or more consecutive carriage-return / line-feed characters
re_newlines = re.compile( '[\r\n]+' )
# any run of whitespace
re_multiple_spaces = re.compile( r'\s+' )
# whitespace at the very end of the text
re_trailing_space = re.compile( r'\s+$' )
# whitespace at the very start of the text
re_leading_space = re.compile( r'^\s+' )
# any leading mix of whitespace, hyphens and 'system:' prefixes
re_leading_space_or_garbage = re.compile( r'^(\s|-|system:)+' )
# a single leading colon that is not immediately followed by a second colon
re_leading_single_colon = re.compile( r'^:(?!:)' )
# unicode .txt files prepend with this, wew
re_leading_byte_order_mark = re.compile( '^\ufeff' )
2017-12-13 22:33:07 +00:00
2019-02-06 22:41:35 +00:00
def HexFilter( text ):
    """
    Return text lowercased, with every character that is not a hex digit removed.
    
    :param text: the string to filter.
    :return: a string made only of the characters 0-9 and a-f.
    """
    
    lowercased = text.lower()
    
    # lowercase first so that A-F survive the filter as a-f
    return re.sub( '[^0123456789abcdef]', '', lowercased )
    
2017-12-13 22:33:07 +00:00
def DeserialiseNewlinedTexts( text ):
    """
    Split text into lines, strip each one, and drop the lines that end up empty.
    
    :param text: a string holding zero or more newline-separated entries.
    :return: a list of the non-empty, stripped lines in their original order.
    """
    
    stripped_lines = ( StripTrailingAndLeadingSpaces( raw_line ) for raw_line in text.splitlines() )
    
    return [ stripped_line for stripped_line in stripped_lines if stripped_line != '' ]
    
2018-09-05 20:52:32 +00:00
def LooksLikeHTML( file_data ):
    """
    Report whether file_data contains an opening html tag in all-lower or all-upper case.
    
    :param file_data: the content to inspect, as either bytes or str.
    :return: True if '<html' or '<HTML' occurs anywhere in file_data, otherwise False.
    """
    
    # this will false-positive if it is json that contains html, ha ha
    
    # the needles must be the same type as the haystack for the 'in' test to work
    if isinstance( file_data, bytes ):
        
        markers = ( b'<html', b'<HTML' )
        
    else:
        
        markers = ( '<html', '<HTML' )
        
    
    return any( marker in file_data for marker in markers )
    
2018-09-05 20:52:32 +00:00
2018-10-03 21:00:15 +00:00
def LooksLikeJSON( file_data ):
    """
    Report whether file_data parses as JSON.
    
    :param file_data: the content to inspect; bytes are decoded as utf-8 before parsing.
    :return: True if the content decodes and parses cleanly, otherwise False.
    """
    
    try:
        
        if isinstance( file_data, bytes ):
            
            file_data = str( file_data, 'utf-8' )
            
        
        json.loads( file_data )
        
    except Exception:
        
        # any ordinary failure (bad utf-8, bad json, wrong type) just means 'not json'
        # KeyboardInterrupt and SystemExit are not Exception subclasses, so they are no longer swallowed here
        return False
        
    
    return True
    
2019-02-13 22:26:43 +00:00
def NonFailingUnicodeDecode( data, encoding ):
    """
    Decode data to text without raising on undecodable bytes or on an unknown codec name.
    
    The given encoding is tried strictly first. If that fails, the data is decoded again with
    replacement characters, and utf-8 is tried as an alternative; whichever produces fewer
    replacement characters wins, with ties going to the given encoding.
    
    :param data: the bytes-like object to decode.
    :param encoding: the name of the encoding to try first.
    :return: a ( text, encoding ) tuple, where encoding names the codec that produced text.
    """
    
    try:
        
        return ( str( data, encoding ), encoding )
        
    except LookupError:
        
        # python does not know this codec at all, so there is nothing to compare against
        # lossy utf-8 is the best remaining guess
        return ( str( data, 'utf-8', errors = 'replace' ), 'utf-8' )
        
    except UnicodeDecodeError:
        
        # strict decode failed, so fall through to the lossy comparison below
        pass
        
    
    unicode_replacement_character = '\ufffd'
    
    text = str( data, encoding, errors = 'replace' )
    
    # lower() so that spellings such as 'Utf-8' are also recognised and not pointlessly retried
    if encoding.lower() not in ( 'utf-8', 'utf8' ):
        
        error_count = text.count( unicode_replacement_character )
        
        utf8_text = str( data, 'utf-8', errors = 'replace' )
        
        utf8_error_count = utf8_text.count( unicode_replacement_character )
        
        # only switch to utf-8 if it is strictly better than what the caller asked for
        if utf8_error_count < error_count:
            
            return ( utf8_text, 'utf-8' )
            
        
    
    return ( text, encoding )
    
2018-09-26 19:05:12 +00:00
def RemoveNewlines( text ):
    """
    Return text with every carriage return and line feed character deleted.
    
    :param text: the string to clean.
    :return: the same string with no '\\r' or '\\n' characters in it.
    """
    
    newline_pattern = re.compile( r'\r|\n' )
    
    return newline_pattern.sub( '', text )
    
2018-05-30 20:13:21 +00:00
def SortStringsIgnoringCase( list_of_strings ):
    """
    Sort list_of_strings in place, comparing items by their lowercased form.
    
    :param list_of_strings: the list to reorder; it is mutated and nothing is returned.
    """
    
    def lowercased( item ):
        
        return item.lower()
        
    
    list_of_strings.sort( key = lowercased )
    
2017-12-13 22:33:07 +00:00
def StripTrailingAndLeadingSpaces( t ):
    """
    Remove a leading byte order mark, then trailing whitespace, then leading whitespace from t.
    
    :param t: the string to clean.
    :return: the cleaned string.
    """
    
    # order matters: the byte order mark is only removed if it is the very first character
    cleanup_patterns = ( re_leading_byte_order_mark, re_trailing_space, re_leading_space )
    
    for cleanup_pattern in cleanup_patterns:
        
        t = cleanup_pattern.sub( '', t )
        
    
    return t
    