302 lines
7.9 KiB
Python
302 lines
7.9 KiB
Python
import typing
|
|
|
|
try:
|
|
|
|
import chardet
|
|
|
|
CHARDET_OK = True
|
|
|
|
except:
|
|
|
|
CHARDET_OK = False
|
|
|
|
|
|
import json
|
|
import re
|
|
|
|
from hydrus.core import HydrusExceptions
|
|
|
|
# any run of whitespace. \s covers \t and friends too
re_one_or_more_whitespace = re.compile( r'\s+' )

# one or more leading '-' or 'system:' prefixes
# original note: want to keep the 'leading space' part here, despite tag.strip() elsewhere, in case of some crazy '- test' tag
# NOTE(review): the pattern as written matches no whitespace, so that note and the pattern disagree--confirm which is intended
re_leading_garbage = re.compile( r'^(-|system:)+' )

# a leading colon that is not followed by another colon
re_leading_single_colon = re.compile( r'^:(?!:)' )

# a leading colon where the rest of the text is non-empty and has no further colons
re_leading_single_colon_and_no_more_colons = re.compile( r'^:(?=[^:]+$)' )

# a leading colon where the rest of the text has exactly one more colon, with text on both sides of it
re_leading_single_colon_and_later_colon = re.compile( r'^:(?=[^:]+:[^:]+$)' )

# exactly two leading colons, not followed by a third
re_leading_double_colon = re.compile( r'^::(?!:)' )

# the whole run of leading colons
re_leading_colons = re.compile( r'^:+' )

# unicode .txt files prepend with this, wew
re_leading_byte_order_mark = re.compile( '^\ufeff' )

# the newline that CleanNoteText normalises every line break to
HYDRUS_NOTE_NEWLINE = '\n'
|
|
|
|
def CleanNoteText( t: str ):
    """
    Tidy up note text.
    
    Outer whitespace is trimmed, every line is trimmed individually, all line breaks recognised by
    str.splitlines become HYDRUS_NOTE_NEWLINE, and any run of three or more newlines is collapsed to two,
    so at most one blank line separates paragraphs.
    """
    
    # trim the whole text, then each of its lines, and rejoin with the one canonical newline
    trimmed_lines = ( line.strip() for line in t.strip().splitlines() )
    
    cleaned = HYDRUS_NOTE_NEWLINE.join( trimmed_lines )
    
    # now replace big gaps with reasonable ones. repeat until no triple newline survives
    too_big_a_gap = HYDRUS_NOTE_NEWLINE * 3
    reasonable_gap = HYDRUS_NOTE_NEWLINE * 2
    
    while too_big_a_gap in cleaned:
        
        cleaned = cleaned.replace( too_big_a_gap, reasonable_gap )
        
    
    return cleaned
|
|
|
|
|
|
def HexFilter( text ):
    """
    Lowercase the text and throw away every character that is not a hex digit (0-9, a-f).
    """
    
    hex_digits = '0123456789abcdef'
    
    return ''.join( character for character in text.lower() if character in hex_digits )
|
|
|
|
def DeserialiseNewlinedTexts( text ):
    """
    Split a block of text into lines, clean each one with StripIOInputLine, and return the non-empty results in their original order.
    """
    
    cleaned_lines = ( StripIOInputLine( raw_line ) for raw_line in text.splitlines() )
    
    # lines that were blank, or only whitespace or a byte order mark, are dropped
    return [ cleaned_line for cleaned_line in cleaned_lines if cleaned_line != '' ]
|
|
|
|
def ElideText( text, max_length, elide_center = False ):
    """
    Shorten text to at most max_length characters, marking the cut with a single ellipsis character.
    
    :param text: the string to shorten. it is returned untouched if it already fits
    :param max_length: the maximum length of the result, ellipsis included
    :param elide_center: if True, cut out of the middle and keep the last few characters. otherwise cut off the end
    :return: the original text, or a shortened copy that is never longer than max_length
    """
    
    if len( text ) <= max_length:
        
        return text
        
    
    if max_length <= 0:
        
        # no room even for the ellipsis
        # slicing with a negative end would count back from the end of the text and return nearly all of it
        return ''
        
    
    if elide_center:
        
        # keep a short tail of at least two characters, growing slowly with the available room
        center_end_chars = max( 2, max_length // 8 )
        
        head_length = max_length - ( 1 + center_end_chars )
        
        if head_length >= 0:
            
            return '{}\u2026{}'.format( text[ : head_length ], text[ - center_end_chars : ] )
            
        
        # not enough room for ellipsis plus tail, so fall through and elide the end instead
        
    
    return '{}\u2026'.format( text[ : max_length - 1 ] )
|
|
|
|
def LooksLikeHTML( file_data: typing.Union[ str, bytes ] ):
    """
    Cheaply guess whether the given str or bytes is HTML, by looking for an html tag or html doctype anywhere in it.
    
    The check is case-sensitive beyond the all-lower and all-upper spellings listed below.
    """
    
    # this will false-positive if it is json that contains html, ha ha
    
    # the markers have to be the same type as the data being searched
    html_markers = (
        ( b'<html', b'<HTML', b'<!DOCTYPE html', b'<!DOCTYPE HTML' )
        if isinstance( file_data, bytes )
        else ( '<html', '<HTML', '<!DOCTYPE html', '<!DOCTYPE HTML' )
    )
    
    return any( marker in file_data for marker in html_markers )
|
|
|
|
def LooksLikeSVG( file_data ):
    """
    Cheaply guess whether the given str or bytes is SVG, by looking for an svg tag or svg doctype anywhere in it.
    
    The check is case-sensitive beyond the all-lower and all-upper spellings listed below.
    """
    
    # the markers have to be the same type as the data being searched
    svg_markers = (
        ( b'<svg', b'<SVG', b'<!DOCTYPE svg', b'<!DOCTYPE SVG' )
        if isinstance( file_data, bytes )
        else ( '<svg', '<SVG', '<!DOCTYPE svg', '<!DOCTYPE SVG' )
    )
    
    return any( marker in file_data for marker in svg_markers )
|
|
|
|
|
|
def LooksLikeJSON( file_data: typing.Union[ str, bytes ] ) -> bool:
    """
    Report whether the given str or bytes parses as a JSON document.
    
    bytes are decoded as utf-8 first. Anything that fails to decode or parse counts as 'not JSON'.
    
    :param file_data: the candidate document
    :return: True if json.loads accepts it, otherwise False
    """
    
    try:
        
        if isinstance( file_data, bytes ):
            
            file_data = str( file_data, 'utf-8' )
            
        
        json.loads( file_data )
        
    except Exception:
        
        # deliberately broad: bad utf-8, malformed json, the wrong input type and so on all just mean 'no'
        # it is not a bare except, though, since that would also swallow KeyboardInterrupt and SystemExit
        return False
        
    
    return True
|
|
|
|
|
|
|
|
# U+FFFD. the decode helpers in this file count it to score how badly a decode with errors = 'replace' went
UNICODE_REPLACEMENT_CHARACTER = '\ufffd'

# U+0000. NonFailingUnicodeDecode removes it from decoded text
NULL_CHARACTER = '\x00'
|
|
|
|
def ChardetDecode( data ):
    """
    Decode data with whatever encoding chardet detects for it, substituting U+FFFD for anything undecodable.
    
    :return: ( text, encoding, confidence, error_count ). encoding and confidence are taken from chardet's result, and error_count is how many replacement characters are in text
    """
    
    detection = chardet.detect( data )
    
    ( confidence, encoding ) = ( detection[ 'confidence' ], detection[ 'encoding' ] )
    
    text = str( data, encoding, errors = 'replace' )
    
    return ( text, encoding, confidence, text.count( UNICODE_REPLACEMENT_CHARACTER ) )
|
|
|
|
def DefaultDecode( data ):
    """
    Decode data as windows-1252, substituting U+FFFD for anything undecodable.
    
    :return: ( text, encoding, error_count ). encoding is always 'windows-1252', and error_count is how many replacement characters are in text
    """
    
    fallback_encoding = 'windows-1252'
    
    text = str( data, fallback_encoding, errors = 'replace' )
    
    return ( text, fallback_encoding, text.count( UNICODE_REPLACEMENT_CHARACTER ) )
|
|
|
|
def NonFailingUnicodeDecode( data, encoding ):
    """
    Decode data to text without letting a bad or missing encoding raise, second-guessing the stated encoding where needed.
    
    The stated encoding is used as-is only when it is not one of the permissive defaults and it decodes cleanly.
    If it hits decode errors, a lossy decode with replacement characters is kept as the candidate. chardet, when
    available, is then consulted and its result replaces the candidate if it looks better. Without chardet, a
    missing, untrusted or unknown encoding falls back to DefaultDecode. If all of that fails, the returned text is
    an error message rather than the decoded content.
    
    :param data: the raw bytes to decode
    :param encoding: the encoding name that came with the data, or None
    :return: ( text, encoding ). encoding is the one actually used, or 'utf-8' when text is an error message. NULL characters are removed from text
    """
    
    text = None
    
    try:
        
        if encoding in ( 'ISO-8859-1', 'Windows-1252', None ):
            
            # ok, the site delivered one of these non-utf-8 'default' encodings. this is probably actually requests filling this in as default
            # we don't want to trust these because they are very permissive sets and'll usually decode garbage without errors
            # we want chardet to have a proper look
            
            raise LookupError()
            
        
        # an unknown encoding name raises LookupError here too, and is handled the same way
        text = str( data, encoding )
        
    except ( UnicodeDecodeError, LookupError ) as e:
        
        try:
            
            if isinstance( e, UnicodeDecodeError ):
                
                # the stated encoding exists but does not fit the data. keep a lossy decode as the candidate to beat
                text = str( data, encoding, errors = 'replace' )
                
                # nominal confidence for the stated encoding, compared against chardet's below
                confidence = 0.7
                error_count = text.count( UNICODE_REPLACEMENT_CHARACTER )
                
            else:
                
                # no candidate at all, so any chardet result will count as better
                confidence = None
                error_count = None
                
            
            if CHARDET_OK:
                
                ( chardet_text, chardet_encoding, chardet_confidence, chardet_error_count ) = ChardetDecode( data )
                
                if chardet_error_count == 0:
                    
                    # a clean decode always wins
                    chardet_is_better = True
                    
                else:
                    
                    # otherwise chardet has to beat the candidate on both confidence and error count
                    chardet_confidence_is_better = confidence is None or chardet_confidence > confidence
                    chardet_errors_is_better = error_count is None or chardet_error_count < error_count
                    
                    chardet_is_better = chardet_confidence_is_better and chardet_errors_is_better
                    
                
                if chardet_is_better:
                    
                    text = chardet_text
                    encoding = chardet_encoding
                    
                
            else:
                
                if text is None:
                    
                    try:
                        
                        ( default_text, default_encoding, default_error_count ) = DefaultDecode( data )
                        
                        text = default_text
                        encoding = default_encoding
                        
                    except Exception:
                        
                        # was a bare except, which would also have swallowed KeyboardInterrupt and SystemExit
                        text = 'Could not decode the page--problem with given encoding "{}" and no chardet library available.'.format( encoding )
                        encoding = 'utf-8'
                        
                    
                
            
            if text is None:
                
                # nothing produced any text. jump to the generic failure message below
                raise Exception()
                
            
        except Exception:
            
            # this also catches any failure inside chardet's detection or decode
            text = 'Unfortunately, could not decode the page with given encoding "{}".'.format( encoding )
            encoding = 'utf-8'
            
        
    
    if NULL_CHARACTER in text:
        
        # I guess this is valid in unicode for some reason
        # funnily enough, it is not replaced by 'replace'
        # nor does it raise an error in normal str creation
        
        text = text.replace( NULL_CHARACTER, '' )
        
    
    return ( text, encoding )
|
|
|
|
def RemoveNewlines( text: str ) -> str:
    """
    Return text with every line break that str.splitlines recognises removed, so the lines run together end to end.
    """
    
    lines_without_breaks = text.splitlines()
    
    return ''.join( lines_without_breaks )
|
|
|
|
def SortStringsIgnoringCase( list_of_strings ):
    """
    Sort the given list in place, ordering its items by their lowercased form. Nothing is returned.
    """
    
    def lowercased( s ):
        
        return s.lower()
        
    
    list_of_strings.sort( key = lowercased )
|
|
|
|
def StripIOInputLine( t ):
    """
    Clean one line of imported text: remove a leading byte order mark if there is one, then trim surrounding whitespace.
    """
    
    # the byte order mark has to go first. str.strip does not treat it as whitespace
    without_bom = re_leading_byte_order_mark.sub( '', t )
    
    return without_bom.strip()
|
|
|