hydrus/hydrus/client/ClientStrings.py

1635 lines
50 KiB
Python
Raw Normal View History

import base64
2023-04-19 20:38:13 +00:00
import datetime
import hashlib
import html
2024-03-27 21:47:50 +00:00
import random
import re
import typing
import urllib.parse
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.core import HydrusExceptions
from hydrus.core import HydrusSerialisable
2022-07-13 21:35:17 +00:00
from hydrus.core import HydrusTags
2024-02-07 21:22:05 +00:00
from hydrus.core import HydrusText
2023-04-19 20:38:13 +00:00
from hydrus.core import HydrusTime
from hydrus.client import ClientTime
# Identifiers for the individual conversions a StringConverter can apply.
# These integers are written out as part of each ( conversion_type, data ) pair when a
# StringConverter is serialised, so existing values must not be renumbered.
STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING = 0
STRING_CONVERSION_REMOVE_TEXT_FROM_END = 1
STRING_CONVERSION_PREPEND_TEXT = 2
STRING_CONVERSION_APPEND_TEXT = 3
STRING_CONVERSION_ENCODE = 4
STRING_CONVERSION_DECODE = 5
STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING = 6
STRING_CONVERSION_CLIP_TEXT_FROM_END = 7
STRING_CONVERSION_REVERSE = 8
STRING_CONVERSION_REGEX_SUB = 9
STRING_CONVERSION_DATE_DECODE = 10
STRING_CONVERSION_INTEGER_ADDITION = 11
STRING_CONVERSION_DATE_ENCODE = 12
STRING_CONVERSION_HASH_FUNCTION = 13
STRING_CONVERSION_DATEPARSER_DECODE = 14
STRING_CONVERSION_APPEND_RANDOM = 15

# Human-readable label for each conversion type.
# The entry order is kept exactly as it was originally inserted, in case anything iterates this dict.
conversion_type_str_lookup = {
    STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING : 'remove text from beginning of string',
    STRING_CONVERSION_REMOVE_TEXT_FROM_END : 'remove text from end of string',
    STRING_CONVERSION_PREPEND_TEXT : 'prepend text',
    STRING_CONVERSION_APPEND_TEXT : 'append text',
    STRING_CONVERSION_APPEND_RANDOM : 'append random text',
    STRING_CONVERSION_ENCODE : 'encode',
    STRING_CONVERSION_DECODE : 'decode',
    STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING : 'take the start of the string',
    STRING_CONVERSION_CLIP_TEXT_FROM_END : 'take the end of the string',
    STRING_CONVERSION_REVERSE : 'reverse text',
    STRING_CONVERSION_REGEX_SUB : 'regex substitution',
    STRING_CONVERSION_DATE_DECODE : 'datestring to timestamp (advanced)',
    STRING_CONVERSION_DATEPARSER_DECODE : 'datestring to timestamp (easy)',
    STRING_CONVERSION_INTEGER_ADDITION : 'integer addition',
    STRING_CONVERSION_DATE_ENCODE : 'timestamp to datestring',
    STRING_CONVERSION_HASH_FUNCTION : 'get hash of string'
}
class StringProcessingStep( HydrusSerialisable.SerialisableBase ):
    """Abstract base for one step in a string processing pipeline.
    
    Every method here raises NotImplementedError; concrete steps override all of them.
    """
    
    def _GetSerialisableInfo( self ):
        """Return this step's state in serialisable form."""
        
        raise NotImplementedError()
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore this step's state from what _GetSerialisableInfo produced."""
        
        raise NotImplementedError()
        
    
    def MakesChanges( self ) -> bool:
        """Report whether this step can alter or filter its input."""
        
        raise NotImplementedError()
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Return a human-readable description of this step."""
        
        raise NotImplementedError()
        
    
2023-08-30 16:25:24 +00:00
class StringConverter( StringProcessingStep ):
    """Applies an ordered list of conversions to a single string.
    
    Each conversion is a ( conversion_type, data ) pair, where conversion_type is one of the
    STRING_CONVERSION_* constants and data holds that conversion's parameter(s).
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_CONVERTER
    SERIALISABLE_NAME = 'String Converter'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, conversions = None, example_string = None ):
        """Store the conversion list and example string, defaulting to [] and 'example string'."""
        
        if conversions is None:
            conversions = []
        
        if example_string is None:
            example_string = 'example string'
        
        StringProcessingStep.__init__( self )
        
        self.conversions = conversions
        self.example_string = example_string
        
    
    def _GetSerialisableInfo( self ):
        return ( self.conversions, self.example_string )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Load conversions, turning list-typed data back into tuples.
        
        Loading is deliberately best-effort: a malformed conversion list stops the load at that
        point (keeping whatever was read so far) rather than raising.
        """
        
        ( serialisable_conversions, self.example_string ) = serialisable_info
        
        self.conversions = []
        
        try: # I initialised this bad one time and broke a dialog on subsequent loads, fugg
            
            for ( conversion_type, data ) in serialisable_conversions:
                
                if isinstance( data, list ):
                    data = tuple( data ) # convert from list to tuple thing
                
                self.conversions.append( ( conversion_type, data ) )
                
            
        except Exception:
            
            pass
            
        
    
    @staticmethod
    def _ApplyConversion( conversion, s ):
        """Apply a single ( conversion_type, data ) pair to s and return the new string.
        
        An unrecognised conversion type returns s unchanged. Any failure propagates to the caller.
        """
        
        ( conversion_type, data ) = conversion
        
        if conversion_type == STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING:
            
            return s[ data : ]
            
        elif conversion_type == STRING_CONVERSION_REMOVE_TEXT_FROM_END:
            
            # s[ : -0 ] is s[ : 0 ], which would wipe the string, so removing zero chars is a no-op
            if data != 0:
                return s[ : - data ]
            
            return s
            
        elif conversion_type == STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING:
            
            return s[ : data ]
            
        elif conversion_type == STRING_CONVERSION_CLIP_TEXT_FROM_END:
            
            # s[ -0 : ] is the whole string, so taking the last zero chars needs a special case
            if data != 0:
                return s[ - data : ]
            
            return ''
            
        elif conversion_type == STRING_CONVERSION_PREPEND_TEXT:
            
            return data + s
            
        elif conversion_type == STRING_CONVERSION_APPEND_TEXT:
            
            return s + data
            
        elif conversion_type == STRING_CONVERSION_APPEND_RANDOM:
            
            ( population_text, num_chars ) = data
            
            return s + ''.join( random.choices( population_text, k = num_chars ) )
            
        elif conversion_type == STRING_CONVERSION_ENCODE:
            
            encode_type = data
            
            if encode_type == 'url percent encoding':
                
                return urllib.parse.quote( s, safe = '' )
                
            elif encode_type == 'unicode escape characters':
                
                return s.encode( 'unicode-escape' ).decode( 'utf-8' )
                
            elif encode_type == 'html entities':
                
                return html.escape( s )
                
            
            # due to py3, this is now a bit of a pain
            # _for now_, let's convert to bytes if not already and then spit out a str
            s_bytes = bytes( s, 'utf-8' ) if isinstance( s, str ) else s
            
            if encode_type == 'hex':
                
                return s_bytes.hex()
                
            elif encode_type == 'base64':
                
                return str( base64.b64encode( s_bytes ), 'utf-8' )
                
            
            return s
            
        elif conversion_type == STRING_CONVERSION_DECODE:
            
            encode_type = data
            
            if encode_type == 'url percent encoding':
                
                return urllib.parse.unquote( s )
                
            elif encode_type == 'unicode escape characters':
                
                return s.encode( 'utf-8' ).decode( 'unicode-escape' )
                
            elif encode_type == 'html entities':
                
                return html.unescape( s )
                
            
            # the old 'hex' and 'base64' are now deprecated, no-ops
            return s
            
        elif conversion_type == STRING_CONVERSION_REVERSE:
            
            return s[::-1]
            
        elif conversion_type == STRING_CONVERSION_REGEX_SUB:
            
            ( pattern, repl ) = data
            
            return re.sub( pattern, repl, s )
            
        elif conversion_type == STRING_CONVERSION_DATE_DECODE:
            
            ( phrase, timezone, timezone_offset ) = data
            
            dt = datetime.datetime.strptime( s, phrase )
            
            if timezone in ( HC.TIMEZONE_UTC, HC.TIMEZONE_OFFSET ):
                
                # rebuild as an aware UTC datetime from the parsed fields, down to the second
                dt = datetime.datetime(
                    dt.year,
                    dt.month,
                    dt.day,
                    dt.hour,
                    dt.minute,
                    dt.second,
                    tzinfo = datetime.timezone.utc
                )
                
                if timezone == HC.TIMEZONE_OFFSET:
                    
                    dt = dt - datetime.timedelta( seconds = timezone_offset )
                    
                
            
            return str( HydrusTime.DateTimeToTimestamp( dt ) )
            
        elif conversion_type == STRING_CONVERSION_DATEPARSER_DECODE:
            
            return str( ClientTime.ParseDate( s ) )
            
        elif conversion_type == STRING_CONVERSION_DATE_ENCODE:
            
            ( phrase, timezone ) = data
            
            try:
                
                timestamp = int( s )
                
            except ( TypeError, ValueError ):
                
                raise Exception( '"{}" was not an integer!'.format( s ) )
                
            
            dt = HydrusTime.TimestampToDateTime( timestamp, timezone )
            
            return dt.strftime( phrase )
            
        elif conversion_type == STRING_CONVERSION_INTEGER_ADDITION:
            
            return str( int( s ) + int( data ) )
            
        elif conversion_type == STRING_CONVERSION_HASH_FUNCTION:
            
            hash_function = data
            
            if hash_function not in ( 'md5', 'sha1', 'sha256', 'sha512' ):
                
                raise Exception( f'Unknown hash function "{hash_function}"!' )
                
            
            # the permitted names above are all hashlib constructor names
            return getattr( hashlib, hash_function )( s.encode( 'utf-8' ) ).hexdigest()
            
        
        return s
        
    
    def Convert( self, s, max_steps_allowed = None ):
        """Run s through the conversions in order and return the result.
        
        If max_steps_allowed is not None, only that many leading conversions are applied.
        
        Raises HydrusExceptions.StringConvertException if any conversion fails; the message names
        the failing conversion and the string as it was just before that conversion.
        """
        
        for ( i, conversion ) in enumerate( self.conversions ):
            
            if max_steps_allowed is not None and i >= max_steps_allowed:
                
                return s
                
            
            try:
                
                s = self._ApplyConversion( conversion, s )
                
            except Exception as e:
                
                raise HydrusExceptions.StringConvertException( 'ERROR: Could not apply "{}" to string "{}": {}'.format( self.ConversionToString( conversion ), s, e ) ) from e
                
            
        
        return s
        
    
    def GetConversions( self ):
        """Return a shallow copy of the conversion list."""
        
        return list( self.conversions )
        
    
    def GetConversionStrings( self ):
        """Return one human-readable description per conversion, in order."""
        
        return [ self.ConversionToString( conversion ) for conversion in self.conversions ]
        
    
    def MakesChanges( self ):
        return len( self.conversions ) > 0
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Describe the converter; simple gives a count, otherwise every conversion is listed."""
        
        num_rules = len( self.conversions )
        
        if num_rules == 0:
            
            label = 'no changes' if simple else 'no string conversions'
            
        elif simple:
            
            label = '{} changes'.format( HydrusData.ToHumanInt( num_rules ) )
            
        else:
            
            label = ', '.join( self.GetConversionStrings() )
            
        
        if with_type:
            
            label = 'CONVERT: {}'.format( label )
            
        
        return label
        
    
    @staticmethod
    def ConversionToString( conversion ):
        """Return a human-readable description of one ( conversion_type, data ) pair."""
        
        ( conversion_type, data ) = conversion
        
        if conversion_type == STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING:
            
            return 'remove the first ' + HydrusData.ToHumanInt( data ) + ' characters'
            
        elif conversion_type == STRING_CONVERSION_REMOVE_TEXT_FROM_END:
            
            return 'remove the last ' + HydrusData.ToHumanInt( data ) + ' characters'
            
        elif conversion_type == STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING:
            
            return 'take the first ' + HydrusData.ToHumanInt( data ) + ' characters'
            
        elif conversion_type == STRING_CONVERSION_CLIP_TEXT_FROM_END:
            
            return 'take the last ' + HydrusData.ToHumanInt( data ) + ' characters'
            
        elif conversion_type == STRING_CONVERSION_PREPEND_TEXT:
            
            return 'prepend with "' + data + '"'
            
        elif conversion_type == STRING_CONVERSION_APPEND_TEXT:
            
            return 'append with "' + data + '"'
            
        elif conversion_type == STRING_CONVERSION_APPEND_RANDOM:
            
            ( population_text, num_chars ) = data
            
            return f'append with {HydrusData.ToHumanInt( num_chars )} random characters, from "{population_text}"'
            
        elif conversion_type == STRING_CONVERSION_ENCODE:
            
            return 'encode to ' + data
            
        elif conversion_type == STRING_CONVERSION_DECODE:
            
            if data in ( 'hex', 'base64' ):
                
                return 'deprecated {} decode, now a no-op, can be deleted'.format( data )
                
            
            return 'decode from ' + data
            
        elif conversion_type == STRING_CONVERSION_REVERSE:
            
            return conversion_type_str_lookup[ STRING_CONVERSION_REVERSE ]
            
        elif conversion_type == STRING_CONVERSION_REGEX_SUB:
            
            return 'regex substitution: ' + str( data )
            
        elif conversion_type == STRING_CONVERSION_DATE_DECODE:
            
            return 'datestring to timestamp: ' + repr( data )
            
        elif conversion_type == STRING_CONVERSION_DATEPARSER_DECODE:
            
            return 'datestring to timestamp: automatic'
            
        elif conversion_type == STRING_CONVERSION_DATE_ENCODE:
            
            return 'timestamp to datestring: ' + repr( data )
            
        elif conversion_type == STRING_CONVERSION_INTEGER_ADDITION:
            
            return 'integer addition: add ' + str( data )
            
        elif conversion_type == STRING_CONVERSION_HASH_FUNCTION:
            
            return 'hash string by ' + str( data )
            
        else:
            
            return 'unknown conversion'
            
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_CONVERTER ] = StringConverter
2023-08-30 16:25:24 +00:00
class StringJoiner( StringProcessingStep ):
    """Joins a collection of strings into one string, or into one string per fixed-size group.
    
    The stored joiner is written with backslash escapes (e.g. a literal backslash-n for newline)
    and is unescaped each time Join runs.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_JOINER
    SERIALISABLE_NAME = 'String Concatenator'
    SERIALISABLE_VERSION = 2
    
    def __init__( self, joiner: str = '', join_tuple_size: typing.Optional[ int ] = None ):
        """joiner is the escaped separator; join_tuple_size of None means join everything."""
        
        StringProcessingStep.__init__( self )
        
        self._joiner = joiner
        self._join_tuple_size = join_tuple_size
        
    
    def _GetSerialisableInfo( self ):
        return ( self._joiner, self._join_tuple_size )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( self._joiner, self._join_tuple_size ) = serialisable_info
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Upgrade version 1 data: double the backslashes so old joiners stay literal."""
        
        if version == 1:
            
            ( joiner, join_tuple_size ) = old_serialisable_info
            
            joiner = joiner.replace( '\\', '\\\\' )
            
            # carry over the stored tuple size, not whatever this instance currently holds
            new_serialisable_info = ( joiner, join_tuple_size )
            
            return ( 2, new_serialisable_info )
            
        
    
    def GetJoiner( self ):
        return self._joiner
        
    
    def GetJoinTupleSize( self ):
        return self._join_tuple_size
        
    
    def MakesChanges( self ) -> bool:
        return True
        
    
    def Join( self, texts: typing.Collection[ str ] ) -> typing.List[ str ]:
        """Join texts with the unescaped joiner.
        
        With no tuple size, the result is a single-item list holding everything joined. With a
        tuple size, texts are joined in consecutive groups of that size, and a trailing group that
        is not full-size is dropped.
        
        Raises HydrusExceptions.StringJoinerException on bytes input, an unescapable joiner, or
        any failure while joining.
        """
        
        if any( isinstance( text, bytes ) for text in texts ):
            
            raise HydrusExceptions.StringJoinerException( 'Got a bytes value in a string joiner!' )
            
        
        try:
            
            # \\n -> \n
            joiner = self._joiner.encode( 'latin-1', 'backslashreplace' ).decode( 'unicode-escape' )
            
        except Exception as e:
            
            raise HydrusExceptions.StringJoinerException( 'Could not escape the joiner string. Wrong number of backslashes?' ) from e
            
        
        try:
            
            if self._join_tuple_size is None:
                
                joined_texts = [ joiner.join( texts ) ]
                
            else:
                
                joined_texts = [
                    joiner.join( chunk_of_texts )
                    for chunk_of_texts in HydrusData.SplitIteratorIntoChunks( texts, self._join_tuple_size )
                    if len( chunk_of_texts ) == self._join_tuple_size
                ]
                
            
        except Exception as e:
            
            raise HydrusExceptions.StringJoinerException( 'Problem when joining text: {}'.format( e ) ) from e
            
        
        return joined_texts
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Describe the joiner; simple returns just 'joiner'."""
        
        if simple:
            
            return 'joiner'
            
        
        if self._join_tuple_size is None:
            
            result = f'joining all strings using "{self._joiner}"'
            
        else:
            
            result = f'joining every {self._join_tuple_size} strings using "{self._joiner}"'
            
        
        if with_type:
            
            result = 'JOIN: {}'.format( result )
            
        
        return result
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_JOINER ] = StringJoiner
# StringMatch match types
( STRING_MATCH_FIXED, STRING_MATCH_FLEXIBLE, STRING_MATCH_REGEX, STRING_MATCH_ANY ) = ( 0, 1, 2, 3 )

# character classes used as the match value of a STRING_MATCH_FLEXIBLE match
( ALPHA, ALPHANUMERIC, NUMERIC ) = ( 0, 1, 2 )
class StringMatch( StringProcessingStep ):
    """Tests a string against optional length limits and a fixed, flexible, regex or any match."""
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_MATCH
    SERIALISABLE_NAME = 'String Match'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, match_type = STRING_MATCH_ANY, match_value = '', min_chars = None, max_chars = None, example_string = 'example string' ):
        """Store the match rule; min_chars/max_chars of None mean no limit on that side."""
        
        StringProcessingStep.__init__( self )
        
        self._match_type = match_type
        self._match_value = match_value
        self._min_chars = min_chars
        self._max_chars = max_chars
        self._example_string = example_string
        
    
    def _GetSerialisableInfo( self ):
        return ( self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string ) = serialisable_info
        
    
    def GetExampleString( self ):
        return self._example_string
        
    
    def MakesChanges( self ) -> bool:
        """True when there is a length limit or the match type is anything other than 'any'."""
        
        has_length_limit = self._min_chars is not None or self._max_chars is not None
        
        return has_length_limit or self._match_type != STRING_MATCH_ANY
        
    
    def Matches( self, text ):
        """Return True if Test accepts text, False if it raises StringMatchException."""
        
        try:
            
            self.Test( text )
            
        except HydrusExceptions.StringMatchException:
            
            return False
            
        
        return True
        
    
    def SetMaxChars( self, max_chars ):
        self._max_chars = max_chars
        
    
    def SetMinChars( self, min_chars ):
        self._min_chars = min_chars
        
    
    def Test( self, text ):
        """Raise HydrusExceptions.StringMatchException with a reason if text does not match.
        
        Length limits are checked against the text as given. For flexible and regex matches, the
        pattern is searched in the text with its line breaks removed and its ends stripped.
        """
        
        if isinstance( text, bytes ):
            
            raise HydrusExceptions.StringMatchException( 'Got a bytes value in a string match!' )
            
        
        num_chars = len( text )
        
        presentation_text = '"{}"'.format( text )
        
        if self._min_chars is not None and num_chars < self._min_chars:
            
            raise HydrusExceptions.StringMatchException( presentation_text + ' had fewer than ' + HydrusData.ToHumanInt( self._min_chars ) + ' characters' )
            
        
        if self._max_chars is not None and num_chars > self._max_chars:
            
            raise HydrusExceptions.StringMatchException( presentation_text + ' had more than ' + HydrusData.ToHumanInt( self._max_chars ) + ' characters' )
            
        
        if self._match_type == STRING_MATCH_FIXED:
            
            if text != self._match_value:
                
                raise HydrusExceptions.StringMatchException( presentation_text + ' did not exactly match "' + self._match_value + '"' )
                
            
            return
            
        
        if self._match_type not in ( STRING_MATCH_FLEXIBLE, STRING_MATCH_REGEX ):
            
            # STRING_MATCH_ANY, or an unrecognised type: nothing further to check
            return
            
        
        if self._match_type == STRING_MATCH_FLEXIBLE:
            
            if self._match_value == ALPHA:
                
                r = '^[a-zA-Z]+$'
                fail_reason = ' had non-alpha characters'
                
            elif self._match_value == ALPHANUMERIC:
                
                r = '^[a-zA-Z\\d]+$'
                fail_reason = ' had non-alphanumeric characters'
                
            elif self._match_value == NUMERIC:
                
                r = '^\\d+$'
                fail_reason = ' had non-numeric characters'
                
            
        else:
            
            r = self._match_value
            fail_reason = ' did not match "' + r + '"'
            
        
        try:
            
            text_to_test = ''.join( text.splitlines() ).strip()
            
            result = re.search( r, text_to_test )
            
        except Exception as e:
            
            raise HydrusExceptions.StringMatchException( 'That regex did not work! ' + str( e ) )
            
        
        if result is None:
            
            raise HydrusExceptions.StringMatchException( presentation_text + fail_reason )
            
        
    
    def ToTuple( self ):
        return ( self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string )
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Describe the match rule; simple returns just 'filter'."""
        
        if simple:
            
            return 'filter'
            
        
        if self._min_chars is None and self._max_chars is None:
            
            result = 'any number of '
            
        elif self._min_chars is None:
            
            result = 'at most ' + str( self._max_chars ) + ' '
            
        elif self._max_chars is None:
            
            result = 'at least ' + str( self._min_chars ) + ' '
            
        else:
            
            result = 'between ' + str( self._min_chars ) + ' and ' + str( self._max_chars ) + ' '
            
        
        # 'any' and 'fixed' descriptions do not get the trailing example
        show_example = self._match_type not in ( STRING_MATCH_ANY, STRING_MATCH_FIXED )
        
        if self._match_type == STRING_MATCH_ANY:
            
            result += 'characters'
            
        elif self._match_type == STRING_MATCH_FIXED:
            
            # a fixed match is described by its value alone, discarding the length prefix
            result = self._match_value
            
        elif self._match_type == STRING_MATCH_FLEXIBLE:
            
            if self._match_value == ALPHA:
                
                result += 'alphabetical characters'
                
            elif self._match_value == ALPHANUMERIC:
                
                result += 'alphanumeric characters'
                
            elif self._match_value == NUMERIC:
                
                result += 'numeric characters'
                
            
        elif self._match_type == STRING_MATCH_REGEX:
            
            result += 'characters, matching regex "' + self._match_value + '"'
            
        
        if show_example:
            
            result += ', such as "' + self._example_string + '"'
            
        
        if with_type:
            
            result = 'MATCH: {}'.format( result )
            
        
        return result
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_MATCH ] = StringMatch
class StringSlicer( StringProcessingStep ):
    """Selects a slice of a list of strings using Python-style start/end indices."""
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_SLICER
    SERIALISABLE_NAME = 'String Selector/Slicer'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, index_start: typing.Optional[ int ] = None, index_end: typing.Optional[ int ] = None ):
        """index_start/index_end are slice bounds; None leaves that side open."""
        
        StringProcessingStep.__init__( self )
        
        self._index_start = index_start
        self._index_end = index_end
        
    
    def _GetSerialisableInfo( self ):
        return ( self._index_start, self._index_end )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( self._index_start, self._index_end ) = serialisable_info
        
    
    def _IndicesHaveSameSign( self ) -> bool:
        """True when both indices are set and are both >= 0 or both < 0."""
        
        if self._index_start is None or self._index_end is None:
            
            return False
            
        
        both_positive = self._index_start >= 0 and self._index_end >= 0
        both_negative = self._index_start < 0 and self._index_end < 0
        
        return both_positive or both_negative
        
    
    def GetIndexStartEnd( self ) -> typing.Tuple[ typing.Optional[ int ], typing.Optional[ int ] ]:
        return ( self._index_start, self._index_end )
        
    
    def MakesChanges( self ) -> bool:
        return self._index_start is not None or self._index_end is not None
        
    
    def SelectsNothingEver( self ) -> bool:
        """True when the indices can never select anything, whatever the input length."""
        
        if self._index_end == 0:
            
            return True
            
        
        # with mixed signs the answer depends on input length, so only same-sign pairs are decidable
        return self._IndicesHaveSameSign() and self._index_start >= self._index_end
        
    
    def SelectsOne( self ) -> bool:
        """True when the indices describe a single position (e.g. 2:3, -3:-2, or -1:None)."""
        
        if self.SelectsNothingEver():
            
            return False
            
        
        if self._index_start == -1 and self._index_end is None:
            
            return True
            
        
        return self._IndicesHaveSameSign() and self._index_start == self._index_end - 1
        
    
    def Slice( self, texts: typing.Sequence[ str ] ) -> typing.List[ str ]:
        """Return the selected items as a new list, whatever sequence type was passed in."""
        
        try:
            
            # a None bound leaves that side of the slice open, so this covers all four cases
            return list( texts[ self._index_start : self._index_end ] )
            
        except IndexError:
            
            return []
            
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Describe the selection; simple returns just 'selector/slicer'."""
        
        if simple:
            
            return 'selector/slicer'
            
        
        selects_one = self.SelectsOne()
        
        if self.SelectsNothingEver():
            
            result = 'selecting nothing'
            
        elif selects_one:
            
            result = 'selecting the {} string'.format( HydrusData.ConvertIndexToPrettyOrdinalString( self._index_start ) )
            
        elif self._index_start is None and self._index_end is None:
            
            result = 'selecting everything'
            
        elif self._index_end is None:
            
            result = 'selecting the {} string and onwards'.format( HydrusData.ConvertIndexToPrettyOrdinalString( self._index_start ) )
            
        elif self._index_start is None:
            
            result = 'selecting up to and including the {} string'.format( HydrusData.ConvertIndexToPrettyOrdinalString( self._index_end - 1 ) )
            
        else:
            
            result = 'selecting the {} string up to and including the {} string'.format( HydrusData.ConvertIndexToPrettyOrdinalString( self._index_start ), HydrusData.ConvertIndexToPrettyOrdinalString( self._index_end - 1 ) )
            
        
        if with_type:
            
            prefix = 'SELECT' if selects_one else 'SLICE'
            
            result = '{}: {}'.format( prefix, result )
            
        
        return result
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_SLICER ] = StringSlicer
# StringSorter sort types
(
    CONTENT_PARSER_SORT_TYPE_NONE,
    CONTENT_PARSER_SORT_TYPE_LEXICOGRAPHIC,
    CONTENT_PARSER_SORT_TYPE_HUMAN_SORT,
    CONTENT_PARSER_SORT_TYPE_REVERSE
) = ( 0, 1, 2, 3 )

# human-readable label for each sort type
sort_str_enum = dict( (
    ( CONTENT_PARSER_SORT_TYPE_NONE, 'no sorting' ),
    ( CONTENT_PARSER_SORT_TYPE_LEXICOGRAPHIC, 'strict lexicographic' ),
    ( CONTENT_PARSER_SORT_TYPE_HUMAN_SORT, 'human sort' ),
    ( CONTENT_PARSER_SORT_TYPE_REVERSE, 'reverse' )
) )
class StringSorter( StringProcessingStep ):
    """Sorts a list of strings, optionally keyed on the part of each string a regex matches."""
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_SORTER
    SERIALISABLE_NAME = 'String Sorter'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, sort_type: int = CONTENT_PARSER_SORT_TYPE_HUMAN_SORT, asc: bool = False, regex: typing.Optional[ str ] = None ):
        """sort_type is a CONTENT_PARSER_SORT_TYPE_* value; regex of None sorts on the whole string."""
        
        StringProcessingStep.__init__( self )
        
        self._sort_type = sort_type
        self._asc = asc
        self._regex = regex
        
    
    def _GetSerialisableInfo( self ):
        return ( self._sort_type, self._asc, self._regex )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( self._sort_type, self._asc, self._regex ) = serialisable_info
        
    
    def GetAscending( self ) -> bool:
        return self._asc
        
    
    def GetRegex( self ) -> typing.Optional[ str ]:
        return self._regex
        
    
    def GetSortType( self ) -> int:
        return self._sort_type
        
    
    def MakesChanges( self ) -> bool:
        return True
        
    
    def Sort( self, texts: typing.Sequence[ str ] ) -> typing.List[ str ]:
        """Return a sorted copy of texts.
        
        The reverse sort type simply reverses the input order. Otherwise texts are sorted by the
        configured key and direction; when a regex is set, the key is the regex's match within each
        text, and texts with no (or an empty) match are sorted separately and placed at the end.
        
        Raises HydrusExceptions.StringSortException wrapping any failure.
        """
        
        try:
            
            sorted_texts = list( texts )
            
            if self._sort_type == CONTENT_PARSER_SORT_TYPE_REVERSE:
                
                sorted_texts.reverse()
                
                return sorted_texts
                
            
            if self._sort_type == CONTENT_PARSER_SORT_TYPE_HUMAN_SORT:
                
                sort_convert = HydrusData.HumanTextSortKey
                
            else:
                
                sort_convert = lambda s: s
                
            
            unmatched_texts = []
            
            if self._regex is None:
                
                sort_key = sort_convert
                
            else:
                
                re_job = re.compile( self._regex )
                
                def extract( text ):
                    
                    match = re_job.search( text )
                    
                    return '' if match is None else match.group()
                    
                
                # split off the texts the regex gives us nothing to sort on
                matched_texts = []
                
                for text in sorted_texts:
                    
                    if extract( text ) == '':
                        
                        unmatched_texts.append( text )
                        
                    else:
                        
                        matched_texts.append( text )
                        
                    
                
                sorted_texts = matched_texts
                
                sort_key = lambda text: sort_convert( extract( text ) )
                
            
            descending = not self._asc
            
            sorted_texts.sort( key = sort_key, reverse = descending )
            
            unmatched_texts.sort( key = sort_convert, reverse = descending )
            
            sorted_texts.extend( unmatched_texts )
            
            return sorted_texts
            
        except Exception as e:
            
            raise HydrusExceptions.StringSortException( e )
            
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Describe the sort; simple returns just 'sorter'."""
        
        if simple:
            
            return 'sorter'
            
        
        direction = 'ascending' if self._asc else 'descending'
        
        result = 'sorting {} ({})'.format( sort_str_enum[ self._sort_type ], direction )
        
        if self._regex is not None:
            
            result = '{} (with regex)'.format( result )
            
        
        if with_type:
            
            result = 'SORT: {}'.format( result )
            
        
        return result
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_SORTER ] = StringSorter
class StringSplitter( StringProcessingStep ):
    """Splits one string into several on a separator, discarding empty pieces.
    
    The stored separator is written with backslash escapes and is unescaped each time Split runs.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_SPLITTER
    SERIALISABLE_NAME = 'String Splitter'
    SERIALISABLE_VERSION = 2
    
    def __init__( self, separator: str = ',', max_splits: typing.Optional[ int ] = None ):
        """separator is the escaped split text; max_splits of None means no limit."""
        
        StringProcessingStep.__init__( self )
        
        self._separator = separator
        self._max_splits = max_splits
        
    
    def _GetSerialisableInfo( self ):
        return ( self._separator, self._max_splits )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( self._separator, self._max_splits ) = serialisable_info
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Upgrade version 1 data: double the backslashes so old separators stay literal."""
        
        if version == 1:
            
            ( separator, max_splits ) = old_serialisable_info
            
            separator = separator.replace( '\\', '\\\\' )
            
            new_serialisable_info = ( separator, max_splits )
            
            return ( 2, new_serialisable_info )
            
        
    
    def GetMaxSplits( self ):
        return self._max_splits
        
    
    def GetSeparator( self ):
        return self._separator
        
    
    def MakesChanges( self ) -> bool:
        return True
        
    
    def Split( self, text: str ) -> typing.List[ str ]:
        """Split text on the unescaped separator and return the non-empty pieces.
        
        Raises HydrusExceptions.StringSplitterException on bytes input, an unescapable separator,
        or any failure while splitting.
        """
        
        if isinstance( text, bytes ):
            
            raise HydrusExceptions.StringSplitterException( 'Got a bytes value in a string splitter!' )
            
        
        try:
            
            # \\n -> \n
            separator = self._separator.encode( 'latin-1', 'backslashreplace' ).decode( 'unicode-escape' )
            
        except Exception as e:
            
            raise HydrusExceptions.StringSplitterException( 'Could not escape the splitter string. Wrong number of backslashes?' ) from e
            
        
        try:
            
            if self._max_splits is None:
                
                results = text.split( separator )
                
            else:
                
                results = text.split( separator, self._max_splits )
                
            
        except Exception as e:
            
            raise HydrusExceptions.StringSplitterException( 'Problem when splitting text: {}'.format( e ) ) from e
            
        
        return [ result for result in results if result != '' ]
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Describe the splitter; simple returns just 'splitter'."""
        
        if simple:
            
            return 'splitter'
            
        
        result = 'splitting by "{}"'.format( self._separator )
        
        if self._max_splits is not None:
            
            result = '{}, at most {} times'.format( result, HydrusData.ToHumanInt( self._max_splits ) )
            
        
        if with_type:
            
            result = 'SPLIT: {}'.format( result )
            
        
        return result
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_SPLITTER ] = StringSplitter
2022-07-13 21:35:17 +00:00
class StringTagFilter( StringProcessingStep ):
    """Cleans strings as tags and keeps only those that pass a tag filter."""
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_TAG_FILTER
    SERIALISABLE_NAME = 'String Tag Filter'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, tag_filter = None, example_string = 'blue eyes' ):
        """tag_filter of None uses a fresh HydrusTags.TagFilter()."""
        
        StringProcessingStep.__init__( self )
        
        self._tag_filter = HydrusTags.TagFilter() if tag_filter is None else tag_filter
        self._example_string = example_string
        
    
    def _GetSerialisableInfo( self ):
        return ( self._tag_filter.GetSerialisableTuple(), self._example_string )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_tag_filter, self._example_string ) = serialisable_info
        
        self._tag_filter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_filter )
        
    
    def ConvertAndFilter( self, tag_texts ):
        """Clean tag_texts, apply the tag filter, and return the survivors sorted."""
        
        cleaned_tags = HydrusTags.CleanTags( tag_texts )
        
        permitted_tags = self._tag_filter.Filter( cleaned_tags, apply_unnamespaced_rules_to_namespaced_tags = True )
        
        return sorted( permitted_tags, key = HydrusTags.ConvertTagToSortable )
        
    
    def GetExampleString( self ) -> str:
        return self._example_string
        
    
    def GetTagFilter( self ) -> HydrusTags.TagFilter:
        return self._tag_filter
        
    
    def MakesChanges( self ) -> bool:
        
        # it always scans for valid tags
        return True
        
    
    def Matches( self, text ):
        """Return True if Test accepts text, False if it raises StringMatchException."""
        
        try:
            
            self.Test( text )
            
        except HydrusExceptions.StringMatchException:
            
            return False
            
        
        return True
        
    
    def Test( self, text ):
        """Raise HydrusExceptions.StringMatchException if text is not a valid tag or fails the filter."""
        
        if isinstance( text, bytes ):
            
            raise HydrusExceptions.StringMatchException( 'Got a bytes value in a string match!' )
            
        
        presentation_text = '"{}"'.format( text )
        
        try:
            
            tags = HydrusTags.CleanTags( [ text ] )
            
            if len( tags ) == 0:
                
                # nothing survived cleaning; handled with every other failure just below
                raise Exception()
                
            
            tag = list( tags )[0]
            
        except:
            
            raise HydrusExceptions.StringMatchException( '{} was not a valid tag!'.format( presentation_text ) )
            
        
        if not self._tag_filter.TagOK( tag, apply_unnamespaced_rules_to_namespaced_tags = True ):
            
            raise HydrusExceptions.StringMatchException( '{} did not pass the tag filter!'.format( presentation_text ) )
            
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Describe the filter; simple returns just 'tag filter'."""
        
        if simple:
            
            return 'tag filter'
            
        
        result = '{}, such as {}'.format( self._tag_filter.ToPermittedString(), self._example_string )
        
        if with_type:
            
            result = 'TAG FILTER: {}'.format( result )
            
        
        return result
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_TAG_FILTER ] = StringTagFilter
class StringProcessor( StringProcessingStep ):
    """Runs a list of strings through an ordered sequence of processing steps."""
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_PROCESSOR
    SERIALISABLE_NAME = 'String Processor'
    SERIALISABLE_VERSION = 1
    
    def __init__( self ):
        
        StringProcessingStep.__init__( self )
        
        self._processing_steps = []
        
    
    def _GetSerialisableInfo( self ):
        return HydrusSerialisable.SerialisableList( self._processing_steps ).GetSerialisableTuple()
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        serialisable_processing_steps = serialisable_info
        
        self._processing_steps = list( HydrusSerialisable.CreateFromSerialisableTuple( serialisable_processing_steps ) )
        
    
    @staticmethod
    def _ApplyStep( processing_step, current_strings, no_slicing ):
        """Apply one step to current_strings and return the resulting list.
        
        Whole-list steps (sort, slice, tag filter, join) fall back to the unchanged input if they
        fail. Per-string steps (convert, match, split) drop any string that fails or is rejected.
        A step of any other type produces an empty list.
        """
        
        if isinstance( processing_step, StringSorter ):
            
            try:
                
                return processing_step.Sort( current_strings )
                
            except HydrusExceptions.StringSortException:
                
                return current_strings
                
            
        
        if isinstance( processing_step, StringSlicer ):
            
            if no_slicing:
                
                return current_strings
                
            
            try:
                
                return processing_step.Slice( current_strings )
                
            except Exception:
                
                return current_strings
                
            
        
        if isinstance( processing_step, StringTagFilter ):
            
            try:
                
                return processing_step.ConvertAndFilter( current_strings )
                
            except Exception:
                
                return current_strings
                
            
        
        if isinstance( processing_step, StringJoiner ):
            
            try:
                
                return processing_step.Join( current_strings )
                
            except Exception:
                
                return current_strings
                
            
        
        next_strings = []
        
        for current_string in current_strings:
            
            if isinstance( processing_step, StringConverter ):
                
                if isinstance( current_string, bytes ):
                    
                    continue
                    
                
                try:
                    
                    next_strings.append( processing_step.Convert( current_string ) )
                    
                except HydrusExceptions.StringConvertException:
                    
                    continue
                    
                
            elif isinstance( processing_step, StringMatch ):
                
                try:
                    
                    if processing_step.Matches( current_string ):
                        
                        next_strings.append( current_string )
                        
                    
                except HydrusExceptions.StringMatchException:
                    
                    continue
                    
                
            elif isinstance( processing_step, StringSplitter ):
                
                if isinstance( current_string, bytes ):
                    
                    continue
                    
                
                try:
                    
                    next_strings.extend( processing_step.Split( current_string ) )
                    
                except HydrusExceptions.StringSplitterException:
                    
                    continue
                    
                
            
        
        return next_strings
        
    
    def GetProcessingSteps( self ):
        """Return a shallow copy of the step list."""
        
        return list( self._processing_steps )
        
    
    def GetProcessingStrings( self ):
        """Return descriptions of the steps, with each converter expanded into one line per conversion."""
        
        proc_strings = []
        
        for processing_step in self._processing_steps:
            
            if isinstance( processing_step, StringConverter ):
                
                proc_strings.extend( processing_step.GetConversionStrings() )
                
            else:
                
                proc_strings.append( processing_step.ToString() )
                
            
        
        return proc_strings
        
    
    def MakesChanges( self ) -> bool:
        return any( step.MakesChanges() for step in self._processing_steps )
        
    
    def ProcessStrings( self, starting_strings: typing.Iterable[ str ], max_steps_allowed = None, no_slicing = False ) -> typing.List[ str ]:
        """Run starting_strings through the steps in order and return the final list.
        
        If max_steps_allowed is not None, only that many leading steps are applied.
        If no_slicing is True, StringSlicer steps are passed over without changing the strings.
        """
        
        current_strings = list( starting_strings )
        
        for ( i, processing_step ) in enumerate( self._processing_steps ):
            
            if max_steps_allowed is not None and i >= max_steps_allowed:
                
                break
                
            
            current_strings = self._ApplyStep( processing_step, current_strings, no_slicing )
            
        
        return current_strings
        
    
    def SetProcessingSteps( self, processing_steps: typing.List[ StringProcessingStep ] ):
        self._processing_steps = list( processing_steps )
        
    
    def ToString( self, simple = False, with_type = False ) -> str:
        """Summarise which kinds of step are present, e.g. 'some conversion, sorting'."""
        
        if len( self._processing_steps ) == 0:
            
            return 'no string processing'
            
        
        # StringTagFilter is listed so that a processor holding only tag filters does not
        # produce a bare 'some ' label
        step_types_and_labels = (
            ( StringConverter, 'conversion' ),
            ( StringJoiner, 'joining' ),
            ( StringMatch, 'filtering' ),
            ( StringTagFilter, 'tag filtering' ),
            ( StringSplitter, 'splitting' ),
            ( StringSorter, 'sorting' ),
            ( StringSlicer, 'selecting/slicing' )
        )
        
        components = [
            label
            for ( step_type, label ) in step_types_and_labels
            if any( isinstance( ps, step_type ) for ps in self._processing_steps )
        ]
        
        return 'some {}'.format( ', '.join( components ) )
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_PROCESSOR ] = StringProcessor