hydrus/hydrus/core/HydrusText.py

275 lines
7.2 KiB
Python
Raw Normal View History

2019-02-27 23:03:30 +00:00
# chardet is an optional dependency; CHARDET_OK records whether it could be imported
try:
    
    import chardet
    
    CHARDET_OK = True
    
except Exception:
    
    # deliberately broader than ImportError so a broken install also just disables chardet,
    # but no longer a bare except, so KeyboardInterrupt/SystemExit are not swallowed at import time
    CHARDET_OK = False
    
2018-10-03 21:00:15 +00:00
import json
2020-11-25 22:22:47 +00:00
import os
2017-12-13 22:33:07 +00:00
import re
2021-06-30 21:27:35 +00:00
from hydrus.core import HydrusExceptions
2022-12-07 22:41:53 +00:00
# any run of whitespace; \s covers \t and friends too
re_one_or_more_whitespace = re.compile( r'\s+' )

# one or more leading '-' and/or 'system:' prefixes
# NOTE(review): an older comment here talked about keeping a 'leading space' part, but this pattern does not match
# whitespace, so something like ' - test' is left untouched by it--confirm that is still the intent
re_leading_garbage = re.compile( r'^(-|system:)+' )

# exactly one leading colon, i.e. a colon at the start that is not followed by another colon
re_leading_single_colon = re.compile( r'^:(?!:)' )

# exactly two leading colons, i.e. '::' at the start that is not followed by a third colon
re_leading_double_colon = re.compile( r'^::(?!:)' )

# unicode .txt files prepend with this, wew
# not a raw string on purpose: python resolves the escape, so the pattern holds the literal BOM character
re_leading_byte_order_mark = re.compile( '^\ufeff' )
2017-12-13 22:33:07 +00:00
2022-08-24 21:06:25 +00:00
# the one newline sequence used inside cleaned note text
HYDRUS_NOTE_NEWLINE = '\n'

def CleanNoteText( t: str ):
    """
    Normalise note text.
    
    The whole text and then every individual line are stripped of leading/trailing whitespace, all line breaks are
    rewritten as HYDRUS_NOTE_NEWLINE, and any gap of three or more consecutive newlines is reduced to exactly two.
    Returns the cleaned string.
    """
    
    # splitlines understands every flavour of line break, so rejoining with our newline washes them all to one kind
    trimmed_lines = [ raw_line.strip() for raw_line in t.strip().splitlines() ]
    
    joined = HYDRUS_NOTE_NEWLINE.join( trimmed_lines )
    
    # now replace big gaps with reasonable ones: three-or-more newlines in a row become two
    big_gap = re.compile( '(?:{}){{3,}}'.format( re.escape( HYDRUS_NOTE_NEWLINE ) ) )
    
    return big_gap.sub( HYDRUS_NOTE_NEWLINE * 2, joined )
    
2019-02-06 22:41:35 +00:00
def HexFilter( text ):
    """
    Return text lowercased, with every character that is not a hexadecimal digit (0-9, a-f) removed.
    """
    
    hex_digits = '0123456789abcdef'
    
    return ''.join( character for character in text.lower() if character in hex_digits )
    
2017-12-13 22:33:07 +00:00
def DeserialiseNewlinedTexts( text ):
    """
    Split text into lines, clean each one with StripIOInputLine, and return the non-empty results as a list,
    in their original order.
    """
    
    cleaned_lines = ( StripIOInputLine( raw_line ) for raw_line in text.splitlines() )
    
    # lines that clean down to nothing are dropped
    return [ cleaned_line for cleaned_line in cleaned_lines if cleaned_line != '' ]
    
2020-03-11 21:52:11 +00:00
def ElideText( text, max_length, elide_center = False ):
    """
    Shorten text to at most max_length characters, marking the cut with a single ellipsis character (U+2026).
    
    text: the string to shorten. It is returned unchanged if it already fits.
    max_length: the maximum number of characters in the result, ellipsis included.
    elide_center: if True, keep a short tail of the text (at least two characters) after the ellipsis instead of
    cutting everything after the head. If max_length is too small to fit the ellipsis plus that tail, the text is
    elided at the end instead.
    
    Returns the possibly shortened string. A max_length of zero or less gives the empty string.
    """
    
    if len( text ) <= max_length:
        
        return text
        
    
    if max_length <= 0:
        
        # there is no room even for the ellipsis. without this guard the slice bound below goes negative,
        # which would keep almost the entire text rather than shortening it
        return ''
        
    
    if elide_center:
        
        CENTER_END_CHARS = max( 2, max_length // 8 )
        
        head_length = max_length - ( 1 + CENTER_END_CHARS )
        
        if head_length >= 0:
            
            return '{}\u2026{}'.format( text[ : head_length ], text[ - CENTER_END_CHARS : ] )
            
        
        # not enough room for ellipsis + tail, so fall through to a plain end elide
        
    
    return '{}\u2026'.format( text[ : max_length - 1 ] )
    
2018-09-05 20:52:32 +00:00
def LooksLikeHTML( file_data ):
    """
    Cheap sniff test: return True if file_data contains an opening html or title tag, written either all-lowercase
    or all-uppercase, and False otherwise. bytes input is searched with bytes markers, anything else with str markers.
    """
    
    # this will false-positive if it is json that contains html, ha ha
    
    str_markers = ( '<html', '<HTML', '<title', '<TITLE' )
    bytes_markers = ( b'<html', b'<HTML', b'<title', b'<TITLE' )
    
    markers = bytes_markers if isinstance( file_data, bytes ) else str_markers
    
    return any( marker in file_data for marker in markers )
    
2018-09-05 20:52:32 +00:00
2018-10-03 21:00:15 +00:00
def LooksLikeJSON( file_data ):
    """
    Return True if file_data parses as JSON, False otherwise.
    
    bytes input is decoded as utf-8 before parsing. Any failure to decode or parse counts as 'not JSON',
    so this never raises for bad content.
    """
    
    try:
        
        if isinstance( file_data, bytes ):
            
            file_data = str( file_data, 'utf-8' )
            
        
        json.loads( file_data )
        
        return True
        
    except Exception:
        
        # best-effort check, so every ordinary error means 'no'. this is 'except Exception' rather than a bare
        # except so that KeyboardInterrupt and SystemExit still propagate
        return False
        
    
2020-12-23 23:07:58 +00:00
# U+FFFD, the character python substitutes for undecodable bytes when decoding with errors = 'replace'
UNICODE_REPLACEMENT_CHARACTER = '\ufffd'

# U+0000
NULL_CHARACTER = '\x00'
2021-06-30 21:27:35 +00:00
def ChardetDecode( data ):
    """
    Let chardet guess the encoding of data and decode with that guess, replacing undecodable bytes.
    
    Returns ( text, encoding, confidence, error_count ), where encoding and confidence are what chardet reported
    and error_count is the number of replacement characters in the decoded text.
    
    NOTE(review): chardet can presumably report an encoding of None when it has no guess, in which case the
    str() call below raises TypeError--confirm callers are ready for that.
    """
    
    detection = chardet.detect( data )
    
    guess_confidence = detection[ 'confidence' ]
    guess_encoding = detection[ 'encoding' ]
    
    decoded = str( data, guess_encoding, errors = 'replace' )
    
    replacement_count = decoded.count( UNICODE_REPLACEMENT_CHARACTER )
    
    return ( decoded, guess_encoding, guess_confidence, replacement_count )
    
def DefaultDecode( data ):
    """
    Decode data as windows-1252, replacing undecodable bytes.
    
    Returns ( text, encoding, error_count ), where encoding is always 'windows-1252' and error_count is the number
    of replacement characters in the decoded text.
    """
    
    fallback_encoding = 'windows-1252'
    
    decoded = str( data, fallback_encoding, errors = 'replace' )
    
    return ( decoded, fallback_encoding, decoded.count( UNICODE_REPLACEMENT_CHARACTER ) )
    
2019-02-13 22:26:43 +00:00
def NonFailingUnicodeDecode( data, encoding ):
    """
    Decode data to text, falling back through progressively more forgiving strategies rather than raising on
    undecodable content.
    
    data: the bytes to decode.
    encoding: the encoding the source claimed, or None. 'ISO-8859-1', 'Windows-1252' and None are not trusted
    and always go to the fallback path.
    
    Strategy:
    1. strict decode with the given encoding.
    2. if that fails on bad bytes, decode again with errors = 'replace' and, if chardet is available, compare
       that result with chardet's guess and keep the better one.
    3. if nothing has been decoded yet (untrusted/unknown encoding and no usable chardet result), decode as
       windows-1252 via DefaultDecode.
    4. if all of that fails, the returned text is an error message and the encoding is 'utf-8'.
    
    Null characters are removed from the final text.
    
    Returns ( text, encoding ), where encoding is the one actually used.
    """
    
    text = None
    
    try:
        
        if encoding in ( 'ISO-8859-1', 'Windows-1252', None ):
            
            # ok, the site delivered one of these non-utf-8 'default' encodings. this is probably actually requests filling this in as default
            # we don't want to trust these because they are very permissive sets and'll usually decode garbage without errors
            # we want chardet to have a proper look
            raise LookupError()
            
        
        text = str( data, encoding )
        
    except ( UnicodeDecodeError, LookupError ) as e:
        
        try:
            
            if isinstance( e, UnicodeDecodeError ):
                
                # the encoding exists but the data does not fit it cleanly. decode anyway and count the damage
                text = str( data, encoding, errors = 'replace' )
                
                # fixed confidence given to the declared encoding when it is weighed against chardet's guess below
                confidence = 0.7
                error_count = text.count( UNICODE_REPLACEMENT_CHARACTER )
                
            else:
                
                # untrusted or unknown encoding: we have nothing of our own to compare chardet against
                confidence = None
                error_count = None
                
            
            chardet_result = None
            
            if CHARDET_OK:
                
                try:
                    
                    chardet_result = ChardetDecode( data )
                    
                except Exception:
                    
                    # chardet had no usable answer for this data. carry on as if it were not installed
                    # rather than throwing away what we already decoded
                    chardet_result = None
                    
                
            
            if chardet_result is not None:
                
                ( chardet_text, chardet_encoding, chardet_confidence, chardet_error_count ) = chardet_result
                
                if chardet_error_count == 0:
                    
                    chardet_is_better = True
                    
                else:
                    
                    chardet_confidence_is_better = confidence is None or chardet_confidence > confidence
                    chardet_errors_is_better = error_count is None or chardet_error_count < error_count
                    
                    chardet_is_better = chardet_confidence_is_better and chardet_errors_is_better
                    
                
                if chardet_is_better:
                    
                    text = chardet_text
                    encoding = chardet_encoding
                    
                
            
            if text is None:
                
                # nothing has produced text yet, so fall back to the permissive default
                try:
                    
                    ( default_text, default_encoding, default_error_count ) = DefaultDecode( data )
                    
                    text = default_text
                    encoding = default_encoding
                    
                except Exception:
                    
                    if CHARDET_OK:
                        
                        # the message below would be wrong when chardet is installed, so use the generic failure path
                        raise
                        
                    
                    text = 'Could not decode the page--problem with given encoding "{}" and no chardet library available.'.format( encoding )
                    encoding = 'utf-8'
                    
                
            
            if text is None:
                
                raise Exception()
                
            
        except Exception:
            
            text = 'Unfortunately, could not decode the page with given encoding "{}".'.format( encoding )
            encoding = 'utf-8'
            
        
    
    if NULL_CHARACTER in text:
        
        # I guess this is valid in unicode for some reason
        # funnily enough, it is not replaced by 'replace'
        # nor does it raise an error in normal str creation
        
        text = text.replace( NULL_CHARACTER, '' )
        
    
    return ( text, encoding )
    
2022-12-07 22:41:53 +00:00
def RemoveNewlines( text: str ) -> str:
    """
    Return text with every line break removed, so the pieces either side of each break are glued directly
    together with nothing inserted between them.
    """
    
    line_fragments = text.splitlines()
    
    return ''.join( line_fragments )
    
2018-05-30 20:13:21 +00:00
def SortStringsIgnoringCase( list_of_strings ):
    """
    Sort list_of_strings in place, ordering items by their lowercased form. Returns nothing.
    """
    
    def lowercased( s ):
        
        return s.lower()
        
    
    list_of_strings.sort( key = lowercased )
    
2019-02-27 23:03:30 +00:00
def StripIOInputLine( t ):
    """
    Clean one line of imported text: remove whatever re_leading_byte_order_mark matches (a byte order mark at
    the very start), then strip leading and trailing whitespace. Returns the cleaned string.
    """
    
    without_bom = re_leading_byte_order_mark.sub( '', t )
    
    return without_bom.strip()
    