hydrus/include/HydrusText.py

41 lines
1.1 KiB
Python
Raw Normal View History

2017-12-13 22:33:07 +00:00
import re
# Pre-compiled patterns shared by the text helpers in this module.

# One or more consecutive carriage-return / line-feed characters.
re_newlines = re.compile( '[\r\n]+', flags = re.UNICODE )

# Any run of whitespace.
re_multiple_spaces = re.compile( r'\s+', flags = re.UNICODE )

# Whitespace anchored at the end of the text.
re_trailing_space = re.compile( r'\s+$', flags = re.UNICODE )

# Whitespace anchored at the start of the text.
re_leading_space = re.compile( r'^\s+', flags = re.UNICODE )

# Any leading mix of whitespace, hyphens and literal 'system:' prefixes.
re_leading_space_or_garbage = re.compile( r'^(\s|-|system:)+', flags = re.UNICODE )

# A single leading colon that is not immediately followed by a second colon.
re_leading_single_colon = re.compile( r'^:(?!:)', flags = re.UNICODE )

# A byte order mark at the very start of the text, as prepended to unicode .txt files.
re_leading_byte_order_mark = re.compile( u'^\ufeff', flags = re.UNICODE )
2017-12-13 22:33:07 +00:00
def DeserialiseNewlinedTexts( text ):
    """Split newline-separated text into a list of cleaned, non-empty lines.

    The text is broken on any run of carriage-return and/or line-feed
    characters, each piece is passed through StripTrailingAndLeadingSpaces,
    and pieces that end up empty are discarded.

    text -- the raw text block to split.

    Returns a list of the remaining lines, in their original order.
    """
    
    # Splitting on the CR/LF pattern (rather than deleting every '\r' and then
    # splitting on '\n') means a lone carriage return still separates two
    # lines instead of silently gluing them together.
    raw_lines = re_newlines.split( text )
    
    cleaned_lines = ( StripTrailingAndLeadingSpaces( raw_line ) for raw_line in raw_lines )
    
    # Consecutive or leading/trailing newlines yield empty pieces; drop them.
    return [ line for line in cleaned_lines if line != '' ]
2018-09-05 20:52:32 +00:00
def LooksLikeHTML( file_data ):
    """Report whether file_data contains an opening '<html' tag.

    This is a plain substring heuristic, not a parse: it only checks that
    '<html' appears somewhere in the data, in any letter case.

    file_data -- the content to inspect, as text or as bytes/bytearray.

    Returns True if the tag opener is present, otherwise False.
    """
    
    # A text pattern cannot be searched in binary data (and vice versa) on
    # Python 3, so the needle type has to match the input type.
    if isinstance( file_data, ( bytes, bytearray ) ):
        
        tag_opener = b'<html'
        
    else:
        
        tag_opener = '<html'
        
    
    # HTML tag names are case-insensitive, so '<Html' counts as well as
    # '<html' and '<HTML'.
    return re.search( tag_opener, file_data, re.IGNORECASE ) is not None
2018-05-30 20:13:21 +00:00
def SortStringsIgnoringCase( list_of_strings ):
    """Sort list_of_strings in place, comparing items by their lowercased form.

    list_of_strings -- a mutable list whose items provide a .lower() method.

    The list itself is reordered; nothing is returned.
    """
    
    def lowercased( item ):
        
        return item.lower()
        
    
    list_of_strings.sort( key = lowercased )
2017-12-13 22:33:07 +00:00
def StripTrailingAndLeadingSpaces( t ):
    """Return t without a leading byte order mark or surrounding whitespace.

    The byte order mark is removed first, then trailing whitespace, then
    leading whitespace.

    t -- the text to clean.
    """
    
    # Order matters: the byte order mark pattern is anchored to the start of
    # the text, so it has to run before the leading-whitespace pass.
    cleanup_patterns = ( re_leading_byte_order_mark, re_trailing_space, re_leading_space )
    
    for cleanup_pattern in cleanup_patterns:
        
        t = cleanup_pattern.sub( '', t )
        
    
    return t