# made by prkc for Hydrus Network
# Licensed under the same terms as Hydrus Network
# hydev has changed a couple things here and there
# The basic idea here is to take a system predicate written as text and parse it into a (predicate type, operator, value, unit)
# tuple. The exact structure of the operator, value and unit members depend on the type of the predicate.
# For example, system:width < 500 would become (Predicate.WIDTH, '<', 500).
# The parsers recognize multiple forms for various units and operators, but always normalize to a single canonical form,
# which is given in the comments beside the various enums below.
# Some or all of them can be None, depending on the predicate.
# The "parsing" is done with regex, which is hacky but good enough for this usecase.
# To extend the parser with additional predicates, first extend the Predicate, Value, Operators, Units enums if the
# already present options are not sufficient, then implement parsing for them in the corresponding parse_{unit,value,operator} functions.
# Finally, add a new entry to the SYSTEM_PREDICATES dict describing the new predicate.
# Initially everything below is independent from other Hydrus code so there is some redundancy.
# It might be better to switch to already established Hydrus enums and constants where possible.
# Errors are handled by throwing ValueErrors. The main function to call is parse_system_predicate.
# If this file is run by itself it will parse and print all the included examples. There are examples for each supported predicate type.
import dateparser
import math
import re
import datetime
from enum import Enum, auto
# sort according to longest thing first to rid ourselves of ambiguity
# ( accepted spelling, canonical operator ) pairs, ordered longest spelling first so that
# prefix/suffix tests try 'is not' before 'is', '==' before '=', and so on.
operator_strings_and_results = sorted(
    [
        ( '=', '=' ),
        ( '==', '=' ),
        ( 'is', '=' ),
        ( 'is not', UNICODE_NOT_EQUAL ),
        ( 'isn\'t', UNICODE_NOT_EQUAL ),
        ( '<', '<' ),
        ( 'less than', '<' ),
        ( '>', '>' ),
        ( 'more than', '>' ),
        ( 'is about', UNICODE_APPROX_EQUAL ),
    ],
    key = lambda a: -len( a[0] )
)

# Direct lookup from an accepted spelling to its canonical operator.
operator_strings_to_results = dict( operator_strings_and_results )
# Note this needs to be initialised here with all types that Hydrus supports.
def InitialiseFiletypes( str_to_enum ):
    """
    Register the file types the parser should recognise.
    
    :param str_to_enum: mapping of filetype name -> either a single int enum value or an
        iterable of them.
    
    Every entry is stored in the module-level FILETYPES dict as a tuple, so a lone int
    becomes a 1-tuple and any other iterable is converted with tuple().
    """
    
    for ( filetype_string, enum ) in str_to_enum.items():
        
        if isinstance( enum, int ):
            
            # a bare int is not iterable, so it must be wrapped rather than passed to tuple()
            enum_tuple = ( enum, )
            
        else:
            
            enum_tuple = tuple( enum )
            
        
        FILETYPES[ filetype_string ] = enum_tuple
# This enum lists all the recognized predicate types.
class Predicate( Enum ):
    """Every predicate type the parser can produce (first element of the parsed tuple)."""
    
    INBOX = auto()
    ARCHIVE = auto()
    NO_DURATION = auto()
    HAS_AUDIO = auto()
    NO_AUDIO = auto()
    HAS_EXIF = auto()
    NO_EXIF = auto()
    HAS_TAGS = auto()
    UNTAGGED = auto()
    NUM_OF_TAGS = auto()
    NUM_OF_WORDS = auto()
    HEIGHT = auto()
    WIDTH = auto()
    FILESIZE = auto()
    LIMIT = auto()
    FILETYPE = auto()
    HASH = auto()
    MOD_DATE = auto()
    DURATION = auto()
    FRAMERATE = auto()
    NUM_OF_FRAMES = auto()
    NUM_FILE_RELS = auto()
    RATIO = auto()
    NUM_PIXELS = auto()
    MEDIA_VIEWS = auto()
    ALL_VIEWS = auto()
    URL_REGEX = auto()
    NO_URL_REGEX = auto()
    URL = auto()
    NO_URL = auto()
    DOMAIN = auto()
    NO_DOMAIN = auto()
    URL_CLASS = auto()
    NO_URL_CLASS = auto()
    TAG_AS_NUMBER = auto()
    HAS_NOTES = auto()
    NO_NOTES = auto()
    NUM_NOTES = auto()
    HAS_NOTE_NAME = auto()
    NO_NOTE_NAME = auto()
    HAS_RATING = auto()
    NO_RATING = auto()
    
    # The members below are referenced by the predicate table in this file but were not
    # declared. They are appended (rather than interleaved) so that the auto() values of
    # the members above do not shift.
    EVERYTHING = auto()
    HAS_DURATION = auto()
    BEST_QUALITY_OF_GROUP = auto()
    NOT_BEST_QUALITY_OF_GROUP = auto()
    HAS_TRANSPARENCY = auto()
    NO_TRANSPARENCY = auto()
    HAS_HUMAN_READABLE_EMBEDDED_METADATA = auto()
    NO_HUMAN_READABLE_EMBEDDED_METADATA = auto()
    HAS_ICC_PROFILE = auto()
    NO_ICC_PROFILE = auto()
    NUM_OF_TAGS_WITH_NAMESPACE = auto()
    SIMILAR_TO_FILES = auto()
    SIMILAR_TO_DATA = auto()
    ARCHIVED_DATE = auto()
    LAST_VIEWED_TIME = auto()
    TIME_IMPORTED = auto()
    FILE_SERVICE = auto()
    RATIO_SPECIAL = auto()
    PREVIEW_VIEWS = auto()
    MEDIA_VIEWTIME = auto()
    PREVIEW_VIEWTIME = auto()
    ALL_VIEWTIME = auto()
    RATING_SPECIFIC_LIKE_DISLIKE = auto()
    RATING_SPECIFIC_INCDEC = auto()
# This enum lists the possible value formats a predicate can have (if it has a value).
# Parsing for each of these options is implemented in parse_value
class Value( Enum ):
    """Value formats a predicate can carry; each one is parsed by parse_value."""
    
    NATURAL = auto() # An int that holds a non-negative value
    SHA256_HASHLIST_WITH_DISTANCE = auto() # A 2-tuple, where the first part is a set of potential hashes (as strings), the second part is a non-negative integer
    SIMILAR_TO_HASHLIST_WITH_DISTANCE = auto() # A 3-tuple, where the first two parts are sets of potential pixel and perceptual hashes (as strings), the third part is a non-negative integer
    HASHLIST_WITH_ALGORITHM = auto() # A 2-tuple, where the first part is a set of potential hashes (as strings), the second part is one of 'sha256', 'md5', 'sha1', 'sha512'
    FILETYPE_LIST = auto() # A set of file types using the enum set in InitialiseFiletypes as defined in FILETYPES
    TIME_SEC_MSEC = auto() # A tuple of two non-negative integers: (seconds, milliseconds) where the latter is <1000
    ANY_STRING = auto() # A string (accepts any string so can't use units after this since it consumes the entire remaining part of the input)
    TIME_INTERVAL = auto() # A tuple of 4 non-negative integers: (days, hours, minutes, seconds) where hours < 24, minutes < 60, seconds < 60
    INTEGER = auto() # An integer
    RATIO = auto() # A tuple of 2 ints, both non-negative
    RATIO_SPECIAL = auto() # 1:1
    RATING_SERVICE_NAME_AND_NUMERICAL_VALUE = auto() # my favourites 3/5
    RATING_SERVICE_NAME_AND_LIKE_DISLIKE = auto() # my favourites like
    RATING_SERVICE_NAME_AND_INCDEC = auto() # my counter 123
    
    # The two members below are used by the predicate table and by parse_value but were
    # not declared. They are appended so the auto() values of the members above do not shift.
    
    # Either a tuple of 4 non-negative integers: (years, months, days, hours) OR
    # a datetime.datetime object.
    DATE_OR_TIME_INTERVAL = auto()
    NAMESPACE_AND_NUM_TAGS = auto() # A 3-tuple of (namespace, relational operator, non-negative integer), e.g. 'character tags > 4'
# Possible operator formats
# Implemented in parse_operator
class Operators( Enum ):
    """Operator formats a predicate can use; each one is parsed by parse_operator."""
    
    RELATIONAL = auto() # One of '=', '<', '>', UNICODE_APPROX_EQUAL ('≈') (takes '~=' too)
    RELATIONAL_EXACT = auto() # Like RELATIONAL but without the approximately equal operator
    RELATIONAL_TIME = auto() # One of '=', '<', '>', UNICODE_APPROX_EQUAL ('≈') (takes '~=' too), and the various 'since', 'before', 'the day of', 'the month of' time-based analogues
    RELATIONAL_FOR_RATING_SERVICE = auto() # RELATIONAL, but in the middle of a 'service_name = 4/5' kind of thing
    EQUAL = auto() # One of '=' or '!='
    FILESERVICE_STATUS = auto() # One of 'is not currently in', 'is currently in', 'is not pending to', 'is pending to'
    TAG_RELATIONAL = auto() # A tuple of a string (a potential tag name) and a relational operator (as a string)
    ONLY_EQUAL = auto() # Always '=' (the only accepted operator; written as '=', '==' or 'is')
    RATIO_OPERATORS = auto() # One of '=', 'wider than','taller than', UNICODE_APPROX_EQUAL ('≈') (takes '~=' too)
    RATIO_OPERATORS_SPECIAL = auto() # Written as 'square', 'portrait' or 'landscape'; parsed to '=', 'taller than' or 'wider than'
# Possible unit formats
# Implemented in parse_unit
class Units( Enum ):
    """Unit formats a predicate can use; each one is parsed by parse_unit."""
    
    FILESIZE = auto() # One of 'B', 'KB', 'MB', 'GB'
    FILE_RELATIONSHIP_TYPE = auto() # One of 'not related/false positive', 'duplicates', 'alternates', 'potential duplicates'
    PIXELS_OR_NONE = auto() # Always None (meaning pixels)
    PIXELS = auto() # One of 'pixels', 'kilopixels', 'megapixels'
    FPS_OR_NONE = auto() # Always None (an optional trailing 'fps' is accepted and discarded)
# All system predicates
# A predicate is described by a 4-tuple of (predicate type, operator format, value format, unit format) (use None if some are not applicable)
# The keys are regexes matching the predicate names as written by the user.
# The parser will also automatically accept _ instead of space in the predicate names, always use space in this dict.
# Insertion order matters: parse_system_predicate tries these regexes in order and takes
# the first one that matches the start of the input.
SYSTEM_PREDICATES = {
    'everything': (Predicate.EVERYTHING, None, None, None),
    'inbox': (Predicate.INBOX, None, None, None),
    'archived?$': (Predicate.ARCHIVE, None, None, None), # $ so as not to clash with system:archive(d) date
    'has duration': (Predicate.HAS_DURATION, None, None, None),
    'no duration': (Predicate.NO_DURATION, None, None, None),
    '(is the )?best quality( file)? of( its)?( duplicate)? group': (Predicate.BEST_QUALITY_OF_GROUP, None, None, None),
    '(((is )?not)|(isn\'t))( the)? best quality( file)? of( its)?( duplicate)? group': (Predicate.NOT_BEST_QUALITY_OF_GROUP, None, None, None),
    'has audio': (Predicate.HAS_AUDIO, None, None, None),
    'no audio': (Predicate.NO_AUDIO, None, None, None),
    'has (transparency|alpha)': (Predicate.HAS_TRANSPARENCY, None, None, None),
    'no (transparency|alpha)': (Predicate.NO_TRANSPARENCY, None, None, None),
    'has exif': (Predicate.HAS_EXIF, None, None, None),
    'no exif': (Predicate.NO_EXIF, None, None, None),
    'has.*embedded.*metadata': (Predicate.HAS_HUMAN_READABLE_EMBEDDED_METADATA, None, None, None),
    'no.*embedded.*metadata': (Predicate.NO_HUMAN_READABLE_EMBEDDED_METADATA, None, None, None),
    'has icc profile': (Predicate.HAS_ICC_PROFILE, None, None, None),
    'no icc profile': (Predicate.NO_ICC_PROFILE, None, None, None),
    'has tags': (Predicate.HAS_TAGS, None, None, None),
    'untagged|no tags': (Predicate.UNTAGGED, None, None, None),
    'num(ber)?( of)? tags': (Predicate.NUM_OF_TAGS, Operators.RELATIONAL, Value.NATURAL, None),
    'num(ber)?( of)? (?=[^\\s].* tags)': (Predicate.NUM_OF_TAGS_WITH_NAMESPACE, None, Value.NAMESPACE_AND_NUM_TAGS, None),
    'num(ber)?( of)? words': (Predicate.NUM_OF_WORDS, Operators.RELATIONAL, Value.NATURAL, None),
    'height': (Predicate.HEIGHT, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS_OR_NONE),
    'width': (Predicate.WIDTH, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS_OR_NONE),
    'file ?size': (Predicate.FILESIZE, Operators.RELATIONAL, Value.NATURAL, Units.FILESIZE),
    'similar to(?! data)( files)?': (Predicate.SIMILAR_TO_FILES, None, Value.SHA256_HASHLIST_WITH_DISTANCE, None),
    'similar to data': (Predicate.SIMILAR_TO_DATA, None, Value.SIMILAR_TO_HASHLIST_WITH_DISTANCE, None),
    'limit': (Predicate.LIMIT, Operators.ONLY_EQUAL, Value.NATURAL, None),
    'file ?type': (Predicate.FILETYPE, Operators.ONLY_EQUAL, Value.FILETYPE_LIST, None),
    'hash': (Predicate.HASH, Operators.EQUAL, Value.HASHLIST_WITH_ALGORITHM, None),
    'archived? (date|time)|(date|time) archived|archived.': (Predicate.ARCHIVED_DATE, Operators.RELATIONAL_TIME, Value.DATE_OR_TIME_INTERVAL, None),
    'modified (date|time)|(date|time) modified|modified': (Predicate.MOD_DATE, Operators.RELATIONAL_TIME, Value.DATE_OR_TIME_INTERVAL, None),
    'last view(ed)? (date|time)|(date|time) last viewed|last viewed': (Predicate.LAST_VIEWED_TIME, Operators.RELATIONAL_TIME, Value.DATE_OR_TIME_INTERVAL, None),
    'import(ed)? (date|time)|(date|time) imported|imported': (Predicate.TIME_IMPORTED, Operators.RELATIONAL_TIME, Value.DATE_OR_TIME_INTERVAL, None),
    'duration': (Predicate.DURATION, Operators.RELATIONAL, Value.TIME_SEC_MSEC, None),
    'framerate': (Predicate.FRAMERATE, Operators.RELATIONAL_EXACT, Value.NATURAL, Units.FPS_OR_NONE),
    'num(ber)?( of)? frames': (Predicate.NUM_OF_FRAMES, Operators.RELATIONAL, Value.NATURAL, None),
    'file service': (Predicate.FILE_SERVICE, Operators.FILESERVICE_STATUS, Value.ANY_STRING, None),
    'num(ber)?( of)? file relationships': (Predicate.NUM_FILE_RELS, Operators.RELATIONAL, Value.NATURAL, Units.FILE_RELATIONSHIP_TYPE),
    # the lookaheads split 'ratio 16:9'-style input (has a digit) from 'ratio square'-style input (no digit)
    r'ratio(?=.*\d)': (Predicate.RATIO, Operators.RATIO_OPERATORS, Value.RATIO, None),
    r'ratio(?!.*\d)': (Predicate.RATIO_SPECIAL, Operators.RATIO_OPERATORS_SPECIAL, Value.RATIO_SPECIAL, None),
    'num pixels': (Predicate.NUM_PIXELS, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS),
    'media views': (Predicate.MEDIA_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'preview views': (Predicate.PREVIEW_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'all views': (Predicate.ALL_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'media viewtime': (Predicate.MEDIA_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'preview viewtime': (Predicate.PREVIEW_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'all viewtime': (Predicate.ALL_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'has (a )?url matching regex': (Predicate.URL_REGEX, None, Value.ANY_STRING, None),
    '(does not|doesn\'t) have (a )?url matching regex': (Predicate.NO_URL_REGEX, None, Value.ANY_STRING, None),
    'has url': (Predicate.URL, None, Value.ANY_STRING, None),
    '(does not|doesn\'t) have url': (Predicate.NO_URL, None, Value.ANY_STRING, None),
    'has (a )?(url with )?domain': (Predicate.DOMAIN, None, Value.ANY_STRING, None),
    '(does not|doesn\'t) have (a )?(url with )?domain': (Predicate.NO_DOMAIN, None, Value.ANY_STRING, None),
    'has (a )?url with (url )?class': (Predicate.URL_CLASS, None, Value.ANY_STRING, None),
    '(does not|doesn\'t) have (a )?url with (url )?class': (Predicate.NO_URL_CLASS, None, Value.ANY_STRING, None),
    'tag as number': (Predicate.TAG_AS_NUMBER, Operators.TAG_RELATIONAL, Value.INTEGER, None),
    'has notes?$': (Predicate.HAS_NOTES, None, None, None),
    '((has )?no|does not have( a)?|doesn\'t have) notes?$': (Predicate.NO_NOTES, None, None, None),
    'num(ber)?( of)? notes?': (Predicate.NUM_NOTES, Operators.RELATIONAL_EXACT, Value.NATURAL, None),
    '(has (a )?)?note (with name|named)': (Predicate.HAS_NOTE_NAME, None, Value.ANY_STRING, None),
    '((has )?no|does not have( a)?|doesn\'t have( a)?) note (with name|named)': (Predicate.NO_NOTE_NAME, None, Value.ANY_STRING, None),
    'has( a)? rating( for)?': (Predicate.HAS_RATING, None, Value.ANY_STRING, None ),
    '((has )?no|does not have( a)?|doesn\'t have( a)?) rating( for)?': (Predicate.NO_RATING, None, Value.ANY_STRING, None ),
    r'rating( for)?(?=.+?(like|dislike)$)': (Predicate.RATING_SPECIFIC_LIKE_DISLIKE, None, Value.RATING_SERVICE_NAME_AND_LIKE_DISLIKE, None ),
    r'rating( for)?(?=.+?[^/]\d+$)': (Predicate.RATING_SPECIFIC_INCDEC, Operators.RELATIONAL_FOR_RATING_SERVICE, Value.RATING_SERVICE_NAME_AND_INCDEC, None ),
}
def string_looks_like_date( string ):
    """
    Guess whether the text is an absolute date rather than a relative time delta.
    
    Returns True when none of the time-delta words ('year', 'month', 'day', 'hour',
    'second', 'ago') occur anywhere in the string, and False as soon as one does.
    """
    
    # this sucks but it will do for now
    test_words = [ 'year', 'month', 'day', 'hour', 'second', 'ago' ]
    
    return not any( word in string for word in test_words )
# Parsing is just finding a matching predicate name,
# then trying to parse it by consuming the input string.
# The parse_* functions consume some of the string and return a (remaining part of the string, parsed value) tuple.
def parse_system_predicate( string: str ):
    """
    Parse a textual system predicate into ( predicate type, operator, value, unit ).
    
    The input is lowercased, stripped, and has every '_' replaced by a space before
    parsing. The SYSTEM_PREDICATES regexes are tried in order against the text after
    the system prefix; the first match decides which operator/value/unit formats are
    consumed from the remainder.
    
    :raises ValueError: if the text starts with '-', lacks the system prefix, matches no
        known predicate, fails operator/value/unit parsing, or has text left over.
    """
    
    string = string.lower().strip()
    string = string.replace( '_', ' ' )
    
    if string.startswith( "-" ):
        
        raise ValueError( "System predicate can't start with negation" )
        
    
    if not string.startswith( SYSTEM_PREDICATE_PREFIX ):
        
        raise ValueError( "Not a system predicate!" )
        
    
    string = string[ len( SYSTEM_PREDICATE_PREFIX ): ]
    
    for ( pred_regex, pred ) in SYSTEM_PREDICATES.items():
        
        # table keys are written with plain spaces; accept runs of space/underscore, and an optional trailing colon
        match = re.match( pred_regex.replace( ' ', '([_ ]+)' ) + ":?", string )
        
        if match:
            
            ( pred_type, operator_spec, value_spec, unit_spec ) = pred
            
            # each parse_* call consumes its part and hands back what is left
            remainder = string[ len( match[ 0 ] ): ]
            
            remainder, operator = parse_operator( remainder, operator_spec )
            remainder, value = parse_value( remainder, value_spec )
            remainder, unit = parse_unit( remainder, unit_spec )
            
            if remainder:
                
                raise ValueError( "Unrecognized characters at the end of the predicate: " + remainder )
                
            
            return pred_type, operator, value, unit
            
        
    
    raise ValueError( "Unknown system predicate!" )
def parse_unit( string: str, spec ):
    """
    Consume a unit from the start of the (stripped) string.
    
    :param string: remaining predicate text
    :param spec: a Units member, or None when the predicate takes no unit
    :return: ( remaining text, canonical unit ); the unit is None for spec None,
        Units.PIXELS_OR_NONE and Units.FPS_OR_NONE
    :raises ValueError: if the text does not start with a unit valid for spec, or spec is unknown
    """
    
    string = string.strip()
    
    if spec is None:
        
        return string, None
        
    elif spec == Units.FILESIZE:
        
        # regex alternation takes the first alternative that matches, so longer spellings
        # must come first or 'bytes' would be consumed as just 'b'
        filesize_patterns = (
            ( 'bytes|byte|b', 'B' ),
            ( 'kilobytes|kilobyte|kb', 'KB' ),
            ( 'megabytes|megabyte|mb', 'MB' ),
            ( 'gigabytes|gigabyte|gb', 'GB' ),
        )
        
        for ( pattern, canonical ) in filesize_patterns:
            
            match = re.match( pattern, string )
            
            if match: return string[ len( match[ 0 ] ): ], canonical
            
        
        raise ValueError( "Invalid unit, expected a filesize" )
        
    elif spec == Units.FILE_RELATIONSHIP_TYPE:
        
        relationship_patterns = (
            ( 'duplicates', 'duplicates' ),
            ( 'alternates', 'alternates' ),
            ( '(not related/false positives?)|not related|(false positives?)', 'not related/false positive' ),
            ( 'potential duplicates', 'potential duplicates' ),
        )
        
        for ( pattern, canonical ) in relationship_patterns:
            
            match = re.match( pattern, string )
            
            if match: return string[ len( match[ 0 ] ): ], canonical
            
        
        raise ValueError( "Invalid unit, expected a file relationship" )
        
    elif spec == Units.PIXELS_OR_NONE:
        
        if not string:
            
            return string, None
            
        
        match = re.match( '(pixels?)|px', string )
        
        if match: return string[ len( match[ 0 ] ): ], None
        
        raise ValueError( "Invalid unit, expected no unit or pixels" )
        
    elif spec == Units.PIXELS:
        
        pixel_patterns = (
            ( 'pixels|pixel|px', 'pixels' ),
            ( 'kilopixels|kilopixel|kpx', 'kilopixels' ),
            ( 'megapixels|megapixel|mpx', 'megapixels' ),
        )
        
        for ( pattern, canonical ) in pixel_patterns:
            
            match = re.match( pattern, string )
            
            if match: return string[ len( match[ 0 ] ): ], canonical
            
        
        raise ValueError( "Invalid unit, expected pixels" )
        
    elif spec == Units.FPS_OR_NONE:
        
        if not string:
            
            return string, None
            
        
        match = re.match( 'fps', string )
        
        if match: return string[ len( match[ 0 ] ): ], None
        
        raise ValueError( "Invalid unit, expected no unit or fps" )
        
    
    raise ValueError( "Invalid unit specification" )
def _split_hash_list( text ):
    # Hashes may be separated by commas and/or whitespace; empty pieces are dropped.
    return set( text.replace( ',', ' ' ).split() )


def parse_value( string: str, spec ):
    """
    Consume a value of the given format from the start of the (stripped) string.
    
    :param string: remaining predicate text
    :param spec: a Value member, or None when the predicate takes no value
    :return: ( remaining text, parsed value ); the shape of the value depends on spec
    :raises ValueError: if the text is not a valid value for spec, or spec is unknown
    """
    
    string = string.strip()
    
    if spec is None:
        
        return string, None
        
    elif spec in ( Value.NATURAL, Value.INTEGER ):
        
        # commas are accepted as thousands separators and thrown away
        match = re.match( '-?[0-9,]+', string )
        
        if match:
            
            rest_of_string = string[ len( match[ 0 ] ): ]
            
            value = int( match[ 0 ].replace( ',', '' ) )
            
            if spec == Value.NATURAL and value < 0:
                
                raise ValueError( "Invalid value, expected a positive integer!" )
                
            
            return ( rest_of_string, value )
            
        
        if spec == Value.NATURAL:
            
            raise ValueError( "Invalid value, expected a natural number" )
            
        
        raise ValueError( "Invalid value, expected an integer" )
        
    elif spec in ( Value.SHA256_HASHLIST_WITH_DISTANCE, Value.SIMILAR_TO_HASHLIST_WITH_DISTANCE ):
        
        match = re.match( r'(?P<hashes>([0-9a-f]+(\s|,)+)+)(with\s+)?distance\s+(?P<distance>0|([1-9][0-9]*))', string )
        
        if match:
            
            hashes = _split_hash_list( match[ 'hashes' ] )
            distance = int( match[ 'distance' ] )
            rest_of_string = string[ len( match[ 0 ] ): ]
            
            if spec == Value.SHA256_HASHLIST_WITH_DISTANCE:
                
                return rest_of_string, (hashes, distance)
                
            
            # hashes are sorted into pixel/perceptual purely by hex length; other lengths are dropped
            pixel_hashes = { hsh for hsh in hashes if len( hsh ) == 64 }
            perceptual_hashes = { hsh for hsh in hashes if len( hsh ) == 16 }
            
            return rest_of_string, (pixel_hashes, perceptual_hashes, distance)
            
        
        raise ValueError( "Invalid value, expected a list of hashes with distance" )
        
    elif spec == Value.HASHLIST_WITH_ALGORITHM:
        
        match = re.match( r'(?P<hashes>([0-9a-f]+(\s|,)*)+)((with\s+)?algorithm)?\s*(?P<algorithm>sha256|sha512|md5|sha1|)', string )
        
        if match:
            
            hashes = _split_hash_list( match[ 'hashes' ] )
            
            # no algorithm given means sha256
            algorithm = match[ 'algorithm' ] if len( match[ 'algorithm' ] ) > 0 else 'sha256'
            
            return string[ len( match[ 0 ] ): ], (hashes, algorithm)
            
        
        raise ValueError( "Invalid value, expected a list of hashes with algorithm" )
        
    elif spec == Value.FILETYPE_LIST:
        
        # longest names first so a short filetype name cannot shadow a longer one in the alternation
        valid_values = sorted( FILETYPES.keys(), key = lambda k: len( k ), reverse = True )
        ftype_regex = '(' + '|'.join( [ '(' + val + ')' for val in valid_values ] ) + ')'
        
        match = re.match( '(' + ftype_regex + r'(\s|,)+)*' + ftype_regex, string )
        
        if match:
            
            found_ftypes_all = re.sub( r'\s', ' ', match[ 0 ].replace( ',', '|' ) ).split( '|' )
            found_ftypes_good = [ ]
            
            for ftype in found_ftypes_all:
                
                ftype = ftype.strip()
                
                if len( ftype ) > 0 and ftype in FILETYPES:
                    
                    found_ftypes_good.extend( FILETYPES[ ftype ] )
                    
                
            
            return string[ len( match[ 0 ] ): ], set( found_ftypes_good )
            
        
        raise ValueError( "Invalid value, expected a list of file types" )
        
    elif spec == Value.DATE_OR_TIME_INTERVAL:
        
        dt = dateparser.parse( string )
        
        # only trust dateparser when it actually understood the text; otherwise fall
        # through to the regex parsers below
        if dt is not None:
            
            if string_looks_like_date( string ):
                
                return ( '', dt )
                
            
            # a time delta
            
            now = dateparser.parse( 'now' ) # lol, that's how you get around cross-library timezone headaches
            
            time_delta = now - dt
            
            # this sucked a lot, and then I decided to eventually switch the whole system to days/seconds, just like datetime's time_delta
            # if a user wants to put in 365 days, knowing what inaccuracy that implies, then they can. we just can't reliably deliver leap-year accuracy on long durations
            days = time_delta.days
            hours = round( time_delta.seconds / 3600 )
            
            if days + hours == 0:
                
                return ( '', dt )
                
            
            # years and months are always reported as zero on this path
            return ( '', ( 0, 0, days, hours ) )
            
        
        match = re.match( r'((?P<year>0|([1-9][0-9]*))\s*(years|year))?\s*((?P<month>0|([1-9][0-9]*))\s*(months|month))?\s*((?P<day>0|([1-9][0-9]*))\s*(days|day))?\s*((?P<hour>0|([1-9][0-9]*))\s*(hours|hour|h))?', string )
        
        if match and (match.group( 'year' ) or match.group( 'month' ) or match.group( 'day' ) or match.group( 'hour' )):
            
            years = int( match.group( 'year' ) ) if match.group( 'year' ) else 0
            months = int( match.group( 'month' ) ) if match.group( 'month' ) else 0
            days = int( match.group( 'day' ) ) if match.group( 'day' ) else 0
            hours = int( match.group( 'hour' ) ) if match.group( 'hour' ) else 0
            
            string_result = string[ len( match[ 0 ] ): ]
            
            # a trailing 'ago' is allowed and carries no information
            if string_result == 'ago':
                
                string_result = ''
                
            
            return string_result, (years, months, days, hours)
            
        
        match = re.match( '(?P<year>[0-9][0-9][0-9][0-9])-(?P<month>[0-9][0-9]?)-(?P<day>[0-9][0-9]?)', string )
        
        if match:
            
            # good expansion here would be to parse a full date with 08:20am kind of thing, but we'll wait for better datetime parsing library for that I think!
            return string[ len( match[ 0 ] ): ], datetime.datetime( int( match.group( 'year' ) ), int( match.group( 'month' ) ), int( match.group( 'day' ) ) )
            
        
        raise ValueError( "Invalid value, expected a date or a time interval" )
        
    elif spec == Value.TIME_SEC_MSEC:
        
        match = re.match( r'((?P<sec>0|([1-9][0-9]*))\s*(seconds|second|secs|sec|s))?\s*((?P<msec>0|([1-9][0-9]*))\s*(milliseconds|millisecond|msecs|msec|ms))?', string )
        
        if match and (match.group( 'sec' ) or match.group( 'msec' )):
            
            seconds = int( match.group( 'sec' ) ) if match.group( 'sec' ) else 0
            mseconds = int( match.group( 'msec' ) ) if match.group( 'msec' ) else 0
            
            # carry whole seconds out of the millisecond part
            ( carried_seconds, mseconds ) = divmod( mseconds, 1000 )
            seconds += carried_seconds
            
            return string[ len( match[ 0 ] ): ], (seconds, mseconds)
            
        
        raise ValueError( "Invalid value, expected a duration" )
        
    elif spec == Value.ANY_STRING:
        
        return "", string
        
    elif spec == Value.TIME_INTERVAL:
        
        match = re.match( r'((?P<day>0|([1-9][0-9]*))\s*(days|day))?\s*((?P<hour>0|([1-9][0-9]*))\s*(hours|hour|h))?\s*((?P<minute>0|([1-9][0-9]*))\s*(minutes|minute|mins|min))?\s*((?P<second>0|([1-9][0-9]*))\s*(seconds|second|secs|sec|s))?', string )
        
        if match and (match.group( 'day' ) or match.group( 'hour' ) or match.group( 'minute' ) or match.group( 'second' )):
            
            days = int( match.group( 'day' ) ) if match.group( 'day' ) else 0
            hours = int( match.group( 'hour' ) ) if match.group( 'hour' ) else 0
            minutes = int( match.group( 'minute' ) ) if match.group( 'minute' ) else 0
            seconds = int( match.group( 'second' ) ) if match.group( 'second' ) else 0
            
            # normalise upwards: seconds -> minutes -> hours -> days
            ( carry, seconds ) = divmod( seconds, 60 )
            minutes += carry
            ( carry, minutes ) = divmod( minutes, 60 )
            hours += carry
            ( carry, hours ) = divmod( hours, 24 )
            days += carry
            
            return string[ len( match[ 0 ] ): ], (days, hours, minutes, seconds)
            
        
        raise ValueError( "Invalid value, expected a time interval" )
        
    elif spec == Value.RATIO:
        
        match = re.match( '(?P<first>0|([1-9][0-9]*)):(?P<second>0|([1-9][0-9]*))', string )
        
        if match: return string[ len( match[ 0 ] ): ], (int( match[ 'first' ] ), int( match[ 'second' ] ))
        
        raise ValueError( "Invalid value, expected a ratio" )
        
    elif spec == Value.RATIO_SPECIAL:
        
        # the operator carries the meaning here (=, taller than, wider than); the value is always 1:1
        if string in ( 'square', 'landscape', 'portrait' ):
            
            return ( '', ( 1, 1 ) )
            
        
    elif spec == Value.RATING_SERVICE_NAME_AND_NUMERICAL_VALUE:
        
        # 'my favourites 3/5' (no operator here)
        
        match = re.match( r'(?P<name>.+?)\s+(?P<num>\d+)/(?P<den>\d+)$', string )
        
        if match:
            
            service_name = match[ 'name' ]
            numerator = int( match[ 'num' ] )
            denominator = int( match[ 'den' ] )
            
            if numerator < 0 or numerator > denominator:
                
                raise ValueError( 'Invalid value, rating value was out of bounds')
                
            
            return ( '', ( numerator, service_name ) )
            
        
        raise ValueError( "Invalid value, expected a numerical rating" )
        
    elif spec == Value.RATING_SERVICE_NAME_AND_LIKE_DISLIKE:
        
        # 'tag this later = like' (maybe operator here)
        # 'tag this later like'
        
        # check dislike first lol
        if string.endswith( 'dislike' ):
            
            value = 0.0
            string = string[ : -len( 'dislike' ) ]
            
        elif string.endswith( 'like' ):
            
            value = 1.0
            string = string[ : -len( 'like' ) ]
            
        else:
            
            raise ValueError( 'Invalid value, expected like/dislike' )
            
        
        string = string.strip()
        
        for ( operator_string, result ) in operator_strings_and_results:
            
            if string.endswith( operator_string ):
                
                string = string[ : -len( operator_string ) ]
                
                # strip at most one operator, otherwise a later, shorter spelling could eat the tail of the service name
                break
                
            
        
        service_name = string.strip()
        
        return ( '', ( value, service_name ) )
        
    elif spec == Value.RATING_SERVICE_NAME_AND_INCDEC:
        
        # 'I'm cooooollecting counter 123' (no operator here)
        
        match = re.match( r'(?P<name>.+?)\s+(?P<num>\d+)$', string )
        
        if match:
            
            service_name = match[ 'name' ]
            value = int( match[ 'num' ] )
            
            return ( '', ( value, service_name ) )
            
        
        raise ValueError( "Invalid value, expected an inc/dec rating" )
        
    elif spec == Value.NAMESPACE_AND_NUM_TAGS:
        
        # 'character tags > 4'
        
        match = re.match( r'(?P<namespace>.+) tags (?P<operator>.+?)\s?(?P<num>\d+)\s*$', string )
        
        if match:
            
            namespace = match[ 'namespace' ]
            operator_string = match[ 'operator' ]
            num = int( match[ 'num' ] )
            
            if namespace == 'unnamespaced':
                
                namespace = ''
                
            
            ( gubbins, operator ) = parse_operator( operator_string, Operators.RELATIONAL )
            
            return ( '', ( namespace, operator, num ) )
            
        
    
    raise ValueError( "Invalid value specification" )
def parse_operator( string: str, spec ):
    """
    Consume an operator of the given format from the start of the string.
    
    Leading spaces and colons are discarded first, whatever the spec.
    
    :param string: remaining predicate text
    :param spec: an Operators member, or None when the predicate takes no operator
    :return: ( remaining text, canonical operator ); the operator is None for spec None
        and a ( namespace, operator ) tuple for Operators.TAG_RELATIONAL
    :raises ValueError: if no valid operator for spec is found, or spec is unknown
    """
    
    while string.startswith( ':' ) or string.startswith( ' ' ):
        
        string = string.strip()
        
        if string.startswith( ':' ):
            
            string = string[ 1 : ]
            
        
    
    if spec is None:
        
        return string, None
        
    elif spec in ( Operators.RELATIONAL, Operators.RELATIONAL_EXACT, Operators.RELATIONAL_TIME ):
        
        exact = spec == Operators.RELATIONAL_EXACT
        
        ops = [ '=', '<', '>' ]
        
        if spec == Operators.RELATIONAL_TIME:
            
            # everything before the first digit is treated as the (wordy) operator, the rest as the value
            re_result = re.search( r'\d.*', string )
            
            if re_result:
                
                op_string = string[ : re_result.start() ]
                string_result = re_result.group()
                
                looks_like_date = string_looks_like_date( string_result )
                
                # for a time delta the operators flip: 'since 7 days ago' is returned as '<'
                invert_ops = not looks_like_date
                
                if 'month' in op_string and looks_like_date:
                    
                    return ( string_result, UNICODE_APPROX_EQUAL )
                    
                elif 'around' in op_string and not looks_like_date:
                    
                    return ( string_result, UNICODE_APPROX_EQUAL )
                    
                elif 'day' in op_string and looks_like_date:
                    
                    return ( string_result, '=' )
                    
                elif 'since' in op_string:
                    
                    return ( string_result, '<' if invert_ops else '>' )
                    
                elif 'before' in op_string:
                    
                    return ( string_result, '>' if invert_ops else '<' )
                    
                
            
        
        if not exact:
            
            # the literal approximately-equal character is only valid for the non-exact formats
            ops.append( UNICODE_APPROX_EQUAL )
            
        
        if string.startswith( '==' ): return string[ 2: ], '='
        
        if not exact:
            
            if string.startswith( '!=' ): return string[ 2: ], UNICODE_NOT_EQUAL
            if string.startswith( 'is not' ): return string[ 6: ], UNICODE_NOT_EQUAL
            if string.startswith( 'isn\'t' ): return string[ 5: ], UNICODE_NOT_EQUAL
            if string.startswith( '~=' ): return string[ 2: ], UNICODE_APPROX_EQUAL
            
        
        for op in ops:
            
            if string.startswith( op ): return string[ len( op ): ], op
            
        
        if string.startswith( 'is' ): return string[ 2: ], '='
        
        raise ValueError( "Invalid relational operator" )
        
    elif spec == Operators.RELATIONAL_FOR_RATING_SERVICE:
        
        # "favourites service name > 3/5"
        # since service name can be all sorts of gubbins, we'll work backwards and KISS
        
        match = re.match( r'(?P<first>.*?)(?P<second>(dislike|like|\d+/\d+|\d+))$', string )
        
        if match:
            
            without_value_string = match[ 'first' ].strip()
            
            for ( operator_string, possible_operator ) in operator_strings_and_results:
                
                if without_value_string.endswith( operator_string ):
                    
                    if possible_operator == UNICODE_NOT_EQUAL:
                        
                        raise ValueError( 'Invalid rating operator--cannot select "is not"' )
                        
                    
                    service_name = without_value_string[ : -len( operator_string) ]
                    value = match[ 'second' ]
                    
                    # hand 'service name value' (operator removed) on to the value parser
                    parsing_string = f'{service_name} {value}'
                    
                    return ( parsing_string, possible_operator )
                    
                
            
        
        raise ValueError( "Invalid rating operator" )
        
    elif spec == Operators.EQUAL:
        
        if string.startswith( '==' ): return string[ 2: ], '='
        if string.startswith( '=' ): return string[ 1: ], '='
        if string.startswith( UNICODE_NOT_EQUAL ): return string[ 1: ], '!='
        if string.startswith( '!=' ): return string[ 2: ], '!='
        if string.startswith( 'is not' ): return string[ 6: ], '!='
        # "isn't" must be tested before 'is', which is a prefix of it
        if string.startswith( 'isn\'t' ): return string[ 5: ], '!='
        if string.startswith( 'is' ): return string[ 2: ], '='
        
        raise ValueError( "Invalid equality operator" )
        
    elif spec == Operators.FILESERVICE_STATUS:
        
        match = re.match( '(is )?currently in', string )
        
        if match: return string[ len( match[ 0 ] ): ], 'is currently in'
        
        match = re.match( '((is )?not currently in)|isn\'t currently in', string )
        
        if match: return string[ len( match[ 0 ] ): ], 'is not currently in'
        
        match = re.match( '(is )?pending to', string )
        
        if match: return string[ len( match[ 0 ] ): ], 'is pending to'
        
        match = re.match( '((is )?not pending to)|isn\'t pending to', string )
        
        if match: return string[ len( match[ 0 ] ): ], 'is not pending to'
        
        raise ValueError( "Invalid operator, expected a file service relationship" )
        
    elif spec == Operators.TAG_RELATIONAL:
        
        # note this is in the correct order, also, to eliminate = vs == ambiguity
        all_operators_piped = '|'.join( ( s_r[0] for s_r in operator_strings_and_results ) )
        
        match = re.match( rf'(?P<namespace>.*)\s+(?P<op>({all_operators_piped}))', string )
        
        if match:
            
            namespace = match[ 'namespace' ]
            
            if namespace == 'any namespace':
                
                namespace = '*'
                
            
            if namespace == 'unnamespaced':
                
                namespace = ''
                
            
            op_string = match[ 'op' ]
            
            op = operator_strings_to_results.get( op_string, UNICODE_APPROX_EQUAL )
            
            # only <, > and approximately-equal are produced here; anything else is coerced to approximately-equal
            if op not in ( '<', '>', UNICODE_APPROX_EQUAL ):
                
                op = UNICODE_APPROX_EQUAL
                
            
            return string[ len( match[ 0 ] ): ], (namespace, op)
            
        
        raise ValueError( "Invalid operator, expected a tag followed by a relational operator" )
        
    elif spec == Operators.ONLY_EQUAL:
        
        if string.startswith( '==' ): return string[ 2: ], '='
        if string.startswith( '=' ): return string[ 1: ], '='
        if string.startswith( 'is' ): return string[ 2: ], '='
        
        raise ValueError( "Invalid equality operator" )
        
    elif spec == Operators.RATIO_OPERATORS:
        
        if string.startswith( 'wider than' ): return string[ 10: ], 'wider than'
        if string.startswith( 'taller than' ): return string[ 11: ], 'taller than'
        if string.startswith( 'is wider than' ): return string[ 13: ], 'wider than'
        if string.startswith( 'is taller than' ): return string[ 14: ], 'taller than'
        if string.startswith( '==' ): return string[ 2: ], '='
        if string.startswith( '=' ): return string[ 1: ], '='
        if string.startswith( 'is' ): return string[ 2: ], '='
        if string.startswith( '~=' ): return string[ 2: ], UNICODE_APPROX_EQUAL
        if string.startswith( UNICODE_APPROX_EQUAL ): return string[ 1: ], UNICODE_APPROX_EQUAL
        
        raise ValueError( "Invalid ratio operator" )
        
    elif spec == Operators.RATIO_OPERATORS_SPECIAL:
        
        # the keyword is handed back as the remaining text so the value parser can consume it
        if 'square' in string: return 'square', '='
        if 'portrait' in string: return 'portrait', 'taller than'
        if 'landscape' in string: return 'landscape', 'wider than'
        
    
    raise ValueError( "Invalid operator specification" )