2019-02-27 23:03:30 +00:00
|
|
|
# chardet is an optional dependency. CHARDET_OK records whether it imported so
# that NonFailingUnicodeDecode can skip encoding detection when it is missing.
try:
    
    import chardet
    
    CHARDET_OK = True
    
except Exception:
    
    # Any ordinary import failure (module missing, broken install) disables
    # detection. KeyboardInterrupt and SystemExit are deliberately not caught.
    CHARDET_OK = False
    
|
|
|
|
|
2018-10-03 21:00:15 +00:00
|
|
|
import json
|
2017-12-13 22:33:07 +00:00
|
|
|
import re
|
|
|
|
|
2019-01-09 22:59:03 +00:00
|
|
|
# Matches a run of one or more carriage-return / line-feed characters.
re_newlines = re.compile( '[\r\n]+' )
|
2019-08-07 22:59:53 +00:00
|
|
|
# Matches a run of one or more whitespace characters. Despite the name, a single
# whitespace character matches too.
re_multiple_spaces = re.compile( r'\s+' )
|
|
|
|
# Matches a leading run made of whitespace, hyphens and/or the literal text 'system:'.
re_leading_space_or_garbage = re.compile( r'^(\s|-|system:)+' )
|
2019-01-09 22:59:03 +00:00
|
|
|
# Matches one colon at the start of the string, but only when it is not
# immediately followed by a second colon.
re_leading_single_colon = re.compile( '^:(?!:)' )
|
|
|
|
# Matches a byte order mark (U+FEFF) at the very start of the string; unicode
# .txt files are prepended with this character.
re_leading_byte_order_mark = re.compile( '^\ufeff' )
|
2017-12-13 22:33:07 +00:00
|
|
|
|
2019-02-06 22:41:35 +00:00
|
|
|
def HexFilter( text ):
    """Return text lowercased, with every character that is not 0-9 or a-f removed."""
    
    hex_digits = '0123456789abcdef'
    
    kept_characters = [ character for character in text.lower() if character in hex_digits ]
    
    return ''.join( kept_characters )
    
|
|
|
|
|
2017-12-13 22:33:07 +00:00
|
|
|
def DeserialiseNewlinedTexts( text ):
    """Split text into lines, clean each with StripIOInputLine, and drop the empty ones.
    
    Returns a list of the non-empty cleaned lines in their original order.
    """
    
    cleaned_lines = ( StripIOInputLine( raw_line ) for raw_line in text.splitlines() )
    
    return [ cleaned_line for cleaned_line in cleaned_lines if cleaned_line != '' ]
    
|
|
|
|
|
2020-01-02 03:05:35 +00:00
|
|
|
def ElideText( text, max_length ):
    """Shorten text to at most max_length characters, marking any cut with an ellipsis.
    
    Text that already fits is returned unchanged. Otherwise the result is the
    first max_length - 1 characters followed by a single U+2026 ellipsis. When
    max_length is zero or negative there is no room even for the ellipsis, so
    an empty string is returned.
    """
    
    if len( text ) <= max_length:
        
        return text
        
    
    if max_length <= 0:
        
        # Without this guard the slice end below would be negative and would
        # keep almost the whole text instead of truncating it.
        return ''
        
    
    return '{}\u2026'.format( text[ : max_length - 1 ] )
    
|
|
|
|
|
2018-09-05 20:52:32 +00:00
|
|
|
def LooksLikeHTML( file_data ):
    """Return True if file_data contains an opening '<html' or '<HTML' tag.
    
    Accepts bytes or str; the markers are chosen to match the type of the
    input. Only the all-lowercase and all-uppercase spellings are checked.
    """
    
    # Known limitation: JSON that embeds HTML markup is also reported as HTML.
    
    html_markers = ( b'<html', b'<HTML' ) if isinstance( file_data, bytes ) else ( '<html', '<HTML' )
    
    return any( marker in file_data for marker in html_markers )
    
|
2018-09-05 20:52:32 +00:00
|
|
|
|
2018-10-03 21:00:15 +00:00
|
|
|
def LooksLikeJSON( file_data ):
    """Return True if file_data parses as JSON, False otherwise.
    
    Bytes input is decoded as UTF-8 before parsing. Any ordinary failure (bad
    encoding, invalid JSON, unsupported input type) yields False rather than
    an exception.
    """
    
    try:
        
        if isinstance( file_data, bytes ):
            
            file_data = str( file_data, 'utf-8' )
            
        
        json.loads( file_data )
        
    except Exception:
        
        # Best-effort sniffing: any normal error means "not JSON". Exception is
        # caught rather than everything so that KeyboardInterrupt and
        # SystemExit still propagate.
        return False
        
    
    return True
    
|
|
|
|
|
|
|
|
|
2019-02-13 22:26:43 +00:00
|
|
|
def NonFailingUnicodeDecode( data, encoding ):
    """Decode data with encoding, falling back to a lossy decode on UnicodeDecodeError.
    
    Returns a ( text, encoding_used ) tuple. If the strict decode succeeds it
    is returned with the requested encoding. Otherwise the data is decoded
    with errors = 'replace'. If chardet is available and reports an encoding
    with confidence above 0.85, the data is also decoded that way, and that
    result is returned instead when it contains fewer U+FFFD replacement
    characters.
    """
    
    try:
        
        return ( str( data, encoding ), encoding )
        
    except UnicodeDecodeError:
        
        replacement_marker = '\ufffd'
        
        lossy_text = str( data, encoding, errors = 'replace' )
        
        if CHARDET_OK:
            
            detection = chardet.detect( data )
            
            if detection[ 'confidence' ] > 0.85:
                
                detected_encoding = detection[ 'encoding' ]
                
                detected_text = str( data, detected_encoding, errors = 'replace' )
                
                # Prefer the detected encoding only when it loses strictly fewer characters.
                if detected_text.count( replacement_marker ) < lossy_text.count( replacement_marker ):
                    
                    return ( detected_text, detected_encoding )
                    
                
            
        
        return ( lossy_text, encoding )
        
    
|
|
|
|
|
2018-09-26 19:05:12 +00:00
|
|
|
def RemoveNewlines( text ):
    """Return text with every carriage-return and line-feed character deleted."""
    
    without_carriage_returns = text.replace( '\r', '' )
    
    return without_carriage_returns.replace( '\n', '' )
    
|
|
|
|
|
2018-05-30 20:13:21 +00:00
|
|
|
def SortStringsIgnoringCase( list_of_strings ):
    """Sort list_of_strings in place, comparing items by their lowercased form.
    
    The list is mutated and nothing is returned.
    """
    
    def lowercased( item ):
        
        return item.lower()
        
    
    list_of_strings.sort( key = lowercased )
    
|
|
|
|
|
2019-02-27 23:03:30 +00:00
|
|
|
def StripIOInputLine( t ):
    """Return t without a leading byte order mark and without surrounding whitespace."""
    
    # The byte order mark is removed first, then the remainder is stripped.
    return re_leading_byte_order_mark.sub( '', t ).strip()
    
|
|
|
|
|