hydrus/include/HydrusText.py

import json
import re

# Pre-compiled patterns for text clean-up. Several are not referenced in the visible part
# of this file and are presumably used by other modules -- verify before removing any.

# one or more consecutive carriage return / line feed characters
re_newlines = re.compile( '[\r\n]+', re.UNICODE )
# a run of one or more whitespace characters (note: also matches a single whitespace character)
re_multiple_spaces = re.compile( '\\s+', re.UNICODE )
# whitespace anchored at the end of the string
re_trailing_space = re.compile( '\\s+$', re.UNICODE )
# whitespace anchored at the start of the string
re_leading_space = re.compile( '^\\s+', re.UNICODE )
# a leading run made of any mix of whitespace, hyphens and literal 'system:' prefixes
re_leading_space_or_garbage = re.compile( '^(\\s|-|system:)+', re.UNICODE )
# a single leading colon that is not immediately followed by a second colon
re_leading_single_colon = re.compile( '^:(?!:)', re.UNICODE )
# a U+FEFF byte order mark at the very start of the string
re_leading_byte_order_mark = re.compile( u'^\ufeff', re.UNICODE ) # unicode .txt files prepend with this, wew

def DeserialiseNewlinedTexts( text ):
    # Breaks a newline-separated blob of text into a list of cleaned, non-empty lines.
    # Carriage returns are deleted before splitting, so '\r\n' input splits the same as '\n' input.
    # (A lone '\r' is simply removed; it does not act as a line break.)
    
    raw_lines = text.replace( '\r', '' ).split( '\n' )
    
    stripped_lines = ( StripTrailingAndLeadingSpaces( raw_line ) for raw_line in raw_lines )
    
    # anything that is empty once stripped is dropped from the result
    return [ stripped_line for stripped_line in stripped_lines if stripped_line != '' ]
    
def LooksLikeHTML( file_data ):
    # Cheap sniff test: returns True if file_data contains an opening '<html' tag anywhere.
    # The match is case-insensitive, so '<html', '<HTML' and mixed-case spellings like '<Html' all count.
    # file_data may be text or bytes; the search pattern is chosen to match its type.
    # this will false-positive if it is json that contains html, ha ha
    
    if isinstance( file_data, bytes ):
        
        html_tag_pattern = b'<html'
        
    else:
        
        html_tag_pattern = '<html'
        
    
    # re.search returns a match object or None, so compare explicitly to hand back a real bool
    return re.search( html_tag_pattern, file_data, re.IGNORECASE ) is not None
    
def LooksLikeJSON( file_data ):
    # Returns True if file_data can be parsed by json.loads, otherwise False.
    # The parsed value itself is thrown away--this is only a sniff test.
    
    try:
        
        json.loads( file_data )
        
    except Exception:
        
        # Any failure to parse (bad syntax, wrong input type, etc...) means 'not JSON'.
        # This is 'except Exception' rather than a bare 'except' so that KeyboardInterrupt
        # and SystemExit still propagate instead of being reported as a False result.
        
        return False
        
    
    return True
        
    
def RemoveNewlines( text ):
    # Returns a copy of text with every carriage return and line feed character deleted.
    # Nothing is inserted in their place, so the neighbouring characters end up adjacent.
    
    return re.sub( r'\r|\n', '', text )
    
def SortStringsIgnoringCase( list_of_strings ):
    # Sorts list_of_strings in place, ordering by the lowercased form of each string.
    # Nothing is returned; the caller's list object is the thing that gets reordered.
    
    def lowercase_key( s ):
        
        return s.lower()
        
    
    list_of_strings.sort( key = lowercase_key )
    
def StripTrailingAndLeadingSpaces( t ):
    # Returns t with a leading byte order mark removed and whitespace trimmed from both ends.
    # The patterns are applied in a fixed order: the byte order mark goes first, then
    # trailing whitespace, then leading whitespace.
    
    cleaning_patterns = ( re_leading_byte_order_mark, re_trailing_space, re_leading_space )
    
    for cleaning_pattern in cleaning_patterns:
        
        t = cleaning_pattern.sub( '', t )
        
    
    return t
Version 325 2018-10-03 21:00:15 +00:00			`import json`
Version 286 2017-12-13 22:33:07 +00:00			`import re`

			`re_newlines = re.compile( '[\r\n]+', re.UNICODE )`
			`re_multiple_spaces = re.compile( '\\s+', re.UNICODE )`
			`re_trailing_space = re.compile( '\\s+$', re.UNICODE )`
			`re_leading_space = re.compile( '^\\s+', re.UNICODE )`
			`re_leading_space_or_garbage = re.compile( '^(\\s\|-\|system:)+', re.UNICODE )`
			`re_leading_single_colon = re.compile( '^:(?!:)', re.UNICODE )`
Version 290 2018-01-17 22:52:10 +00:00			`re_leading_byte_order_mark = re.compile( u'^\ufeff', re.UNICODE ) # unicode .txt files prepend with this, wew`
Version 286 2017-12-13 22:33:07 +00:00
			`def DeserialiseNewlinedTexts( text ):`

			`text = text.replace( '\r', '' )`

			`texts = text.split( '\n' )`

			`texts = [ StripTrailingAndLeadingSpaces( line ) for line in texts ]`

			`texts = [ line for line in texts if line != '' ]`

			`return texts`

Version 321 2018-09-05 20:52:32 +00:00			`def LooksLikeHTML( file_data ):`
Version 325 2018-10-03 21:00:15 +00:00			`# this will false-positive if it is json that contains html, ha ha`
Version 321 2018-09-05 20:52:32 +00:00
			`return '<html' in file_data or '<HTML' in file_data`

Version 325 2018-10-03 21:00:15 +00:00			`def LooksLikeJSON( file_data ):`

			`try:`

			`json.loads( file_data )`

			`return True`

			`except:`

			`return False`


Version 324 2018-09-26 19:05:12 +00:00			`def RemoveNewlines( text ):`

			`text = re.sub( r'\r\|\n', '', text )`

			`return text`

Version 309 2018-05-30 20:13:21 +00:00			`def SortStringsIgnoringCase( list_of_strings ):`

			`list_of_strings.sort( key = lambda s: s.lower() )`

Version 286 2017-12-13 22:33:07 +00:00			`def StripTrailingAndLeadingSpaces( t ):`

Version 290 2018-01-17 22:52:10 +00:00			`t = re_leading_byte_order_mark.sub( '', t )`

Version 286 2017-12-13 22:33:07 +00:00			`t = re_trailing_space.sub( '', t )`

			`t = re_leading_space.sub( '', t )`

			`return t`