hydrus/hydrus/external/SystemPredicateParser.py

# made by prkc for Hydrus Network
# Licensed under the same terms as Hydrus Network
# hydev has changed a couple things here and there

# The basic idea here is to take a system predicate written as text and parse it into a (predicate type, operator, value, unit)
# tuple. The exact structure of the operator, value and unit members depend on the type of the predicate.
# For example, system:width < 500 would become (Predicate.WIDTH, '<', 500, None).
# The parsers recognize multiple forms for various units and operators, but always normalize to a single canonical form,
# which is given in the comments beside the various enums below.
# Some or all of them can be None, depending on the predicate.
# The "parsing" is done with regex, which is hacky but good enough for this usecase.
# To extend the parser with additional predicates, first extend the Predicate, Value, Operators, Units enums if the
# already present options are not sufficient, then implement parsing for them in the corresponding parse_{unit,value,operator} functions.
# Finally, add a new entry to the SYSTEM_PREDICATES dict describing the new predicate.
# Initially everything below is independent from other Hydrus code so there is some redundancy.
# It might be better to switch to already established Hydrus enums and constants where possible.
# Errors are handled by throwing ValueErrors. The main function to call is parse_system_predicate.
# If this file is run by itself it will parse and print all the included examples. There are examples for each supported predicate type.

# dateparser is an optional dependency. DATEPARSER_OK records whether it could be imported so the
# date parsing code can fall back to its own regexes when it is missing.
try:
    
    import dateparser
    
    DATEPARSER_OK = True
    
except Exception:
    
    # any failure while importing the library (not just a missing module) means we do without it,
    # but we no longer swallow KeyboardInterrupt/SystemExit like a bare 'except:' would
    DATEPARSER_OK = False
    


import math
import re
import datetime
from enum import Enum, auto

UNICODE_APPROX_EQUAL = '\u2248'
UNICODE_NOT_EQUAL = '\u2260'

# every accepted spelling of a relational operator, paired with the canonical form it normalises to
operator_strings_and_results = [
    ( '=', '=' ),
    ( '==', '=' ),
    ( 'is', '=' ),
    ( UNICODE_NOT_EQUAL, UNICODE_NOT_EQUAL ),
    ( '!=', UNICODE_NOT_EQUAL ),
    ( 'is not', UNICODE_NOT_EQUAL ),
    ( 'isn\'t', UNICODE_NOT_EQUAL ),
    ( '<', '<' ),
    ( 'less than', '<' ),
    ( '>', '>' ),
    ( 'more than', '>' ),
    ( UNICODE_APPROX_EQUAL, UNICODE_APPROX_EQUAL ),
    ( '~=', UNICODE_APPROX_EQUAL ),
    ( 'about', UNICODE_APPROX_EQUAL ),
    ( 'is about', UNICODE_APPROX_EQUAL ),
]

# longest spelling first, so that e.g. 'is not' is tried before 'is' and there is no ambiguity
# the sort is stable, so spellings of equal length keep the order they were written in above
operator_strings_and_results.sort( key = lambda pair: len( pair[0] ), reverse = True )

# same pairs as a lookup from spelling to canonical operator
operator_strings_to_results = dict( operator_strings_and_results )

# Note this needs to be initialised here with all types that Hydrus supports.
# Maps a filetype string to a tuple of filetype enum values, filled in by InitialiseFiletypes.
FILETYPES = { }

def InitialiseFiletypes( str_to_enum ):
    """
    Register filetype names in the module-level FILETYPES lookup.
    
    str_to_enum maps a filetype string to either a single int enum value or an iterable of them.
    Every entry is stored as a tuple, so a lone int becomes a 1-tuple. Existing entries with
    other names are left in place; entries with the same name are overwritten.
    """
    for ( name, enum_or_enums ) in str_to_enum.items():
        
        FILETYPES[ name ] = ( enum_or_enums, ) if isinstance( enum_or_enums, int ) else tuple( enum_or_enums )
        
    


# the character between a namespace and the rest of the text, and the resulting prefix every system predicate starts with
NAMESPACE_SEPARATOR = ':'
SYSTEM_PREDICATE_PREFIX = f'system{NAMESPACE_SEPARATOR}'


# This enum lists all the recognized predicate types.
class Predicate( Enum ):
    """
    Every predicate type the parser can recognise.
    
    A member of this enum is the first element of each SYSTEM_PREDICATES entry and the first element
    of the tuple returned by parse_system_predicate.
    """
    # These take no operator, value or unit in SYSTEM_PREDICATES
    EVERYTHING = auto()
    INBOX = auto()
    ARCHIVE = auto()
    HAS_DURATION = auto()
    NO_DURATION = auto()
    BEST_QUALITY_OF_GROUP = auto()
    NOT_BEST_QUALITY_OF_GROUP = auto()
    HAS_AUDIO = auto()
    NO_AUDIO = auto()
    HAS_TRANSPARENCY = auto()
    NO_TRANSPARENCY = auto()
    HAS_EXIF = auto()
    NO_EXIF = auto()
    HAS_HUMAN_READABLE_EMBEDDED_METADATA = auto()
    NO_HUMAN_READABLE_EMBEDDED_METADATA = auto()
    HAS_ICC_PROFILE = auto()
    NO_ICC_PROFILE = auto()
    HAS_FORCED_FILETYPE = auto()
    NO_FORCED_FILETYPE = auto()
    HAS_TAGS = auto()
    UNTAGGED = auto()
    # Counts and dimensions
    NUM_OF_TAGS = auto()
    NUM_OF_TAGS_WITH_NAMESPACE = auto()
    NUM_OF_URLS = auto()
    NUM_OF_WORDS = auto()
    HEIGHT = auto()
    WIDTH = auto()
    FILESIZE = auto()
    # Hash-based lookups
    SIMILAR_TO_FILES = auto()
    SIMILAR_TO_DATA = auto()
    LIMIT = auto()
    FILETYPE = auto()
    HASH = auto()
    # These use Operators.RELATIONAL_TIME with Value.DATE_OR_TIME_INTERVAL
    MOD_DATE = auto()
    ARCHIVED_DATE = auto()
    LAST_VIEWED_TIME = auto()
    TIME_IMPORTED = auto()
    DURATION = auto()
    FRAMERATE = auto()
    NUM_OF_FRAMES = auto()
    FILE_SERVICE = auto()
    NUM_FILE_RELS = auto()
    # RATIO is chosen when the text after 'ratio' contains a digit, RATIO_SPECIAL when it does not
    RATIO = auto()
    RATIO_SPECIAL = auto()
    NUM_PIXELS = auto()
    MEDIA_VIEWS = auto()
    PREVIEW_VIEWS = auto()
    ALL_VIEWS = auto()
    MEDIA_VIEWTIME = auto()
    PREVIEW_VIEWTIME = auto()
    ALL_VIEWTIME = auto()
    # URL predicates; the value is the rest of the text taken verbatim (Value.ANY_STRING)
    URL_REGEX = auto()
    NO_URL_REGEX = auto()
    URL = auto()
    NO_URL = auto()
    DOMAIN = auto()
    NO_DOMAIN = auto()
    URL_CLASS = auto()
    NO_URL_CLASS = auto()
    TAG_AS_NUMBER = auto()
    # Notes
    HAS_NOTES = auto()
    NO_NOTES = auto()
    NUM_NOTES = auto()
    HAS_NOTE_NAME = auto()
    NO_NOTE_NAME = auto()
    # Ratings
    RATING_SPECIFIC_NUMERICAL = auto()
    RATING_SPECIFIC_LIKE_DISLIKE = auto()
    RATING_SPECIFIC_INCDEC = auto()
    HAS_RATING = auto()
    NO_RATING = auto()


# This enum lists the possible value formats a predicate can have (if it has a value).
# Parsing for each of these options is implemented in parse_value
class Value( Enum ):
    """
    The value formats a predicate can have; the third element of each SYSTEM_PREDICATES entry.
    
    parse_value implements the parsing for every member. The comments describe the parsed result.
    """
    NATURAL = auto()  # An int that holds a non-negative value (text starting with 'has' or 'no' parses as 0)
    SHA256_HASHLIST_WITH_DISTANCE = auto()  # A 2-tuple, where the first part is a set of potential hashes (as strings), the second part is a non-negative integer (4 if no distance is given)
    SIMILAR_TO_HASHLIST_WITH_DISTANCE = auto()  # A 3-tuple, where the first two parts are sets of potential pixel (64 characters) and perceptual (16 characters) hashes (as strings), the third part is a non-negative integer (8 if no distance is given)
    HASHLIST_WITH_ALGORITHM = auto()  # A 2-tuple, where the first part is a set of potential hashes (as strings), the second part is one of 'sha256', 'md5', 'sha1', 'sha512'
    FILETYPE_LIST = auto()  # A set of file types using the enum set in InitialiseFiletypes as defined in FILETYPES
    # Either a tuple of 4 non-negative integers: (years, months, days, hours) OR a datetime.datetime object.
    # If the optional dateparser library was imported it does the parsing (and years and months are then always 0).
    # Without it, only 'N years N months N days N hours' intervals and the YYYY-MM-DD date format are accepted.
    DATE_OR_TIME_INTERVAL = auto()
    TIME_SEC_MSEC = auto()  # A tuple of two non-negative integers: (seconds, milliseconds) where the latter is <1000
    ANY_STRING = auto()  # A string (accepts any string so can't use units after this since it consumes the entire remaining part of the input)
    TIME_INTERVAL = auto()  # A tuple of 4 non-negative integers: (days, hours, minutes, seconds) where hours < 24, minutes < 60, seconds < 60
    INTEGER = auto()  # An integer
    RATIO = auto()  # A tuple of 2 ints, both non-negative
    RATIO_SPECIAL = auto() # 'square', 'landscape' or 'portrait'; the parsed value is always ( 1, 1 )
    RATING_SERVICE_NAME_AND_NUMERICAL_VALUE = auto() # 'my favourites 3/5' becomes the 2-tuple ( 3, 'my favourites' )
    RATING_SERVICE_NAME_AND_LIKE_DISLIKE = auto() # 'my favourites like' becomes ( 1.0, 'my favourites' ); dislike gives 0.0
    RATING_SERVICE_NAME_AND_INCDEC = auto() # 'my counter 123' becomes ( 123, 'my counter' )
    NAMESPACE_AND_NUM_TAGS = auto() # 'character tags > 4' becomes ( 'character', '>', 4 ); the namespace 'unnamespaced' becomes ''


# Possible operator formats
# Implemented in parse_operator
class Operators( Enum ):
    """
    The operator formats a predicate can have; the second element of each SYSTEM_PREDICATES entry.
    
    parse_operator implements the parsing for these members.
    """
    RELATIONAL = auto()  # One of '=', UNICODE_NOT_EQUAL ('≠') (takes '!=', 'is not', 'isn't' too), '<', '>', UNICODE_APPROX_EQUAL ('≈') (takes '~=' too)
    RELATIONAL_EXACT = auto() # Like RELATIONAL but without the approximately equal operator (NOTE(review): parse_operator also appears to skip the not-equal spellings for this one - confirm)
    RELATIONAL_TIME = auto()  # One of '=', '<', '>', UNICODE_APPROX_EQUAL ('≈') (takes '~=' too), and the various 'since', 'before', 'the day of', 'the month of' time-based analogues
    RELATIONAL_FOR_RATING_SERVICE = auto()  # RELATIONAL, but in the middle of a 'service_name = 4/5' kind of thing
    EQUAL = auto()  # One of '=' or '!='
    EQUAL_NOT_CONSUMING = auto()  # One of '=' or '!=', doesn't consume this text so later things can look at it
    FILESERVICE_STATUS = auto()  # One of 'is not currently in', 'is currently in', 'is not pending to', 'is pending to'
    TAG_RELATIONAL = auto()  # A tuple of a string (a potential tag name) and a relational operator (as a string)
    ONLY_EQUAL = auto()  # None (meaning =, since thats the only accepted operator)
    RATIO_OPERATORS = auto()  # One of '=', 'wider than','taller than', UNICODE_APPROX_EQUAL ('≈') (takes '~=' too)
    RATIO_OPERATORS_SPECIAL = auto() # 'square', 'portrait', 'landscape'


# Possible unit formats
# Implemented in parse_unit
class Units( Enum ):
    """
    The unit formats a predicate can have; the fourth element of each SYSTEM_PREDICATES entry.
    
    parse_unit implements the parsing for every member. The comments give the canonical parsed result.
    """
    FILESIZE = auto()  # One of 'B', 'KB', 'MB', 'GB'
    FILE_RELATIONSHIP_TYPE = auto()  # One of 'not related/false positive', 'duplicates', 'alternates', 'potential duplicates'
    PIXELS_OR_NONE = auto()  # Always None (meaning pixels); the text may be empty or 'pixel', 'pixels' or 'px'
    PIXELS = auto()  # One of 'pixels', 'kilopixels', 'megapixels'
    FPS_OR_NONE = auto() # Always None; the text may be empty or 'fps'


# All system predicates
# A predicate is described by a 4-tuple of (predicate type, operator format, value format, unit format) (use None if some are not applicable)
# The keys are regexes matching the predicate names as written by the user.
# The parser will also automatically accept _ instead of space in the predicate names, always use space in this dict.
SYSTEM_PREDICATES = {
    # NOTE: parse_system_predicate tries these regexes in the order written here and commits to the first one
    # that matches the start of the text, so the order of the entries matters.
    # Any regex containing a backslash class such as \d is written as a raw string so that Python does not
    # treat it as an (invalid) string escape sequence.
    'everything': (Predicate.EVERYTHING, None, None, None),
    'inbox': (Predicate.INBOX, None, None, None),
    'archived?$': (Predicate.ARCHIVE, None, None, None), # $ so as not to clash with system:archive(d) date
    'has duration': (Predicate.HAS_DURATION, None, None, None),
    'no duration': (Predicate.NO_DURATION, None, None, None),
    '(is the )?best quality( file)? of( its)?( duplicate)? group': (Predicate.BEST_QUALITY_OF_GROUP, None, None, None),
    '(((is )?not)|(isn\'t))( the)? best quality( file)? of( its)?( duplicate)? group': (Predicate.NOT_BEST_QUALITY_OF_GROUP, None, None, None),
    'has audio': (Predicate.HAS_AUDIO, None, None, None),
    'no audio': (Predicate.NO_AUDIO, None, None, None),
    'has (transparency|alpha)': (Predicate.HAS_TRANSPARENCY, None, None, None),
    'no (transparency|alpha)': (Predicate.NO_TRANSPARENCY, None, None, None),
    'has exif': (Predicate.HAS_EXIF, None, None, None),
    'no exif': (Predicate.NO_EXIF, None, None, None),
    'has.*embedded.*metadata': (Predicate.HAS_HUMAN_READABLE_EMBEDDED_METADATA, None, None, None),
    'no.*embedded.*metadata': (Predicate.NO_HUMAN_READABLE_EMBEDDED_METADATA, None, None, None),
    'has icc profile': (Predicate.HAS_ICC_PROFILE, None, None, None),
    'no icc profile': (Predicate.NO_ICC_PROFILE, None, None, None),
    'has forced filetype': (Predicate.HAS_FORCED_FILETYPE, None, None, None),
    'no forced filetype': (Predicate.NO_FORCED_FILETYPE, None, None, None),
    'has tags': (Predicate.HAS_TAGS, None, None, None),
    'untagged|no tags': (Predicate.UNTAGGED, None, None, None),
    'num(ber)?( of)? tags': (Predicate.NUM_OF_TAGS, Operators.RELATIONAL, Value.NATURAL, None),
    'num(ber)?( of)? (?=[^\\s].* tags)': (Predicate.NUM_OF_TAGS_WITH_NAMESPACE, None, Value.NAMESPACE_AND_NUM_TAGS, None),
    'num(ber)?( of)? urls': (Predicate.NUM_OF_URLS, Operators.RELATIONAL, Value.NATURAL, None),
    'num(ber)?( of)? words': (Predicate.NUM_OF_WORDS, Operators.RELATIONAL_EXACT, Value.NATURAL, None),
    'height': (Predicate.HEIGHT, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS_OR_NONE),
    'width': (Predicate.WIDTH, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS_OR_NONE),
    'file ?size': (Predicate.FILESIZE, Operators.RELATIONAL, Value.NATURAL, Units.FILESIZE),
    'similar to(?! data)( files)?': (Predicate.SIMILAR_TO_FILES, None, Value.SHA256_HASHLIST_WITH_DISTANCE, None),
    'similar to data': (Predicate.SIMILAR_TO_DATA, None, Value.SIMILAR_TO_HASHLIST_WITH_DISTANCE, None),
    'limit': (Predicate.LIMIT, Operators.ONLY_EQUAL, Value.NATURAL, None),
    'file ?type': (Predicate.FILETYPE, Operators.ONLY_EQUAL, Value.FILETYPE_LIST, None),
    'hash': (Predicate.HASH, Operators.EQUAL_NOT_CONSUMING, Value.HASHLIST_WITH_ALGORITHM, None),
    'archived? (date|time)|(date|time) archived|archived.': (Predicate.ARCHIVED_DATE, Operators.RELATIONAL_TIME, Value.DATE_OR_TIME_INTERVAL, None),
    'modified (date|time)|(date|time) modified|modified': (Predicate.MOD_DATE, Operators.RELATIONAL_TIME, Value.DATE_OR_TIME_INTERVAL, None),
    'last view(ed)? (date|time)|(date|time) last viewed|last viewed': (Predicate.LAST_VIEWED_TIME, Operators.RELATIONAL_TIME, Value.DATE_OR_TIME_INTERVAL, None),
    'import(ed)? (date|time)|(date|time) imported|imported': (Predicate.TIME_IMPORTED, Operators.RELATIONAL_TIME, Value.DATE_OR_TIME_INTERVAL, None),
    'duration': (Predicate.DURATION, Operators.RELATIONAL, Value.TIME_SEC_MSEC, None),
    'framerate': (Predicate.FRAMERATE, Operators.RELATIONAL_EXACT, Value.NATURAL, Units.FPS_OR_NONE),
    'num(ber)?( of)? frames': (Predicate.NUM_OF_FRAMES, Operators.RELATIONAL, Value.NATURAL, None),
    'file service': (Predicate.FILE_SERVICE, Operators.FILESERVICE_STATUS, Value.ANY_STRING, None),
    'num(ber)?( of)? file relationships': (Predicate.NUM_FILE_RELS, Operators.RELATIONAL, Value.NATURAL, Units.FILE_RELATIONSHIP_TYPE),
    # the lookaheads pick the numerical ratio predicate when a digit follows and the special one otherwise
    r'ratio(?=.*\d)': (Predicate.RATIO, Operators.RATIO_OPERATORS, Value.RATIO, None),
    r'ratio(?!.*\d)': (Predicate.RATIO_SPECIAL, Operators.RATIO_OPERATORS_SPECIAL, Value.RATIO_SPECIAL, None),
    'num pixels': (Predicate.NUM_PIXELS, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS),
    'media views': (Predicate.MEDIA_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'preview views': (Predicate.PREVIEW_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'all views': (Predicate.ALL_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'media viewtime': (Predicate.MEDIA_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'preview viewtime': (Predicate.PREVIEW_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'all viewtime': (Predicate.ALL_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'has (a )?url matching regex': (Predicate.URL_REGEX, None, Value.ANY_STRING, None),
    '(does not|doesn\'t) have (a )?url matching regex': (Predicate.NO_URL_REGEX, None, Value.ANY_STRING, None),
    'has url:? (?=http)': (Predicate.URL, None, Value.ANY_STRING, None),
    '(does not|doesn\'t) have url:? (?=http)': (Predicate.NO_URL, None, Value.ANY_STRING, None),
    'has (an? )?(url with )?domain': (Predicate.DOMAIN, None, Value.ANY_STRING, None),
    '(does not|doesn\'t) have (an? )?(url with )?domain': (Predicate.NO_DOMAIN, None, Value.ANY_STRING, None),
    'has (an? )?url with (url )?class': (Predicate.URL_CLASS, None, Value.ANY_STRING, None),
    '(does not|doesn\'t) have (an? )?url with (url )?class': (Predicate.NO_URL_CLASS, None, Value.ANY_STRING, None),
    'tag as number': (Predicate.TAG_AS_NUMBER, Operators.TAG_RELATIONAL, Value.INTEGER, None),
    'has notes?$': (Predicate.HAS_NOTES, None, None, None),
    '((has )?no|does not have( a)?|doesn\'t have) notes?$': (Predicate.NO_NOTES, None, None, None),
    'num(ber)?( of)? notes?': (Predicate.NUM_NOTES, Operators.RELATIONAL_EXACT, Value.NATURAL, None),
    '(has (a )?)?note (with name|named)': (Predicate.HAS_NOTE_NAME, None, Value.ANY_STRING, None),
    '((has )?no|does not have( a)?|doesn\'t have( a)?) note (with name|named)': (Predicate.NO_NOTE_NAME, None, Value.ANY_STRING, None),
    'has( a)? rating( for)?': (Predicate.HAS_RATING, None, Value.ANY_STRING, None ),
    '((has )?no|does not have( a)?|doesn\'t have( a)?) rating( for)?': (Predicate.NO_RATING, None, Value.ANY_STRING, None ),
    # the three 'rating' forms are told apart by how the text ends: 'N/M', 'like'/'dislike', or a bare number
    r'rating( for)?(?=.+?\d+/\d+$)': (Predicate.RATING_SPECIFIC_NUMERICAL, Operators.RELATIONAL_FOR_RATING_SERVICE, Value.RATING_SERVICE_NAME_AND_NUMERICAL_VALUE, None ),
    'rating( for)?(?=.+?(like|dislike)$)': (Predicate.RATING_SPECIFIC_LIKE_DISLIKE, None, Value.RATING_SERVICE_NAME_AND_LIKE_DISLIKE, None ),
    r'rating( for)?(?=.+?[^/]\d+$)': (Predicate.RATING_SPECIFIC_INCDEC, Operators.RELATIONAL_FOR_RATING_SERVICE, Value.RATING_SERVICE_NAME_AND_INCDEC, None ),
}

def string_looks_like_date( string ):
    """
    Crude guess at whether the text is a date rather than a time interval.
    
    Returns True when none of the interval-ish words ('year', 'month', 'day', 'hour', 'second', 'ago')
    appear anywhere in the string, and False as soon as one of them does.
    """
    
    # this sucks but it will do for now
    
    interval_words = ( 'year', 'month', 'day', 'hour', 'second', 'ago' )
    
    return not any( interval_word in string for interval_word in interval_words )
    


# Parsing is just finding a matching predicate name,
# then trying to parse it by consuming the input string.
# The parse_* functions consume some of the string and return a (remaining part of the string, parsed value) tuple.
def parse_system_predicate( string: str ):
    """
    Parse one system predicate given as text, e.g. 'system:width < 500'.
    
    Returns a 4-tuple of ( Predicate member, operator, value, unit ), where the last three are whatever
    parse_operator, parse_value and parse_unit produce for the matched SYSTEM_PREDICATES entry.
    
    Raises ValueError if the text starts with '-', does not start with the system prefix, matches no
    known predicate name, cannot be parsed by the matched entry, or has text left over afterwards.
    """
    
    # TODO: (hydev): rework this thing into passing around a 'parse result object' that the operator parser can set a value for and say 'yeah value is sorted' for things like 'has words' = '> 0' in one swoop
    
    text = string.strip()
    
    # hack for system:url has regex (blah) and matching url in general: anything mentioning 'url' keeps its case
    if 'url' not in text:
        
        text = text.lower()
        
    
    # NOTE(review): this swaps underscores in the whole text, values included, not just in the predicate name
    text = text.replace( '_', ' ' )
    
    if text.startswith( "-" ):
        
        raise ValueError( "System predicate can't start with negation" )
        
    
    if not text.startswith( SYSTEM_PREDICATE_PREFIX ):
        
        raise ValueError( "Not a system predicate!" )
        
    
    text = text[ len( SYSTEM_PREDICATE_PREFIX ): ]
    
    for ( name_regex, ( predicate_type, operator_spec, value_spec, unit_spec ) ) in SYSTEM_PREDICATES.items():
        
        # a space in the table's regex stands for any run of spaces/underscores, and a trailing colon is optional
        name_match = re.match( name_regex.replace( ' ', '([_ ]+)' ) + ":?", text )
        
        if name_match is None:
            
            continue
            
        
        # first matching name wins; each parser eats its part from the front and hands back what is left
        remainder = text[ name_match.end(): ]
        
        ( remainder, operator ) = parse_operator( remainder, operator_spec )
        ( remainder, value ) = parse_value( remainder, value_spec )
        ( remainder, unit ) = parse_unit( remainder, unit_spec )
        
        if remainder:
            
            raise ValueError( "Unrecognized characters at the end of the predicate: " + remainder )
            
        
        return predicate_type, operator, value, unit
        
    
    raise ValueError( "Unknown system predicate!" )
    


def parse_unit( string: str, spec ):
    """
    Consume a unit of the format `spec` (a Units member, or None) from the start of `string`.
    
    Returns a ( remaining string, canonical unit ) tuple. The input is stripped first. With spec None
    nothing is consumed and the unit is None. The canonical units are listed beside the Units members;
    for the *_OR_NONE formats the unit text is optional and the result is always None.
    
    Raises ValueError if the text does not start with a unit of the requested format, or if spec is
    not a known format.
    """
    string = string.strip()
    
    if spec is None:
        
        return string, None
        
    
    def consume_first( candidates ):
        
        # try each ( regex, canonical unit ) pair in order and consume the first one that matches the start of the text
        for ( pattern, canonical_unit ) in candidates:
            
            match = re.match( pattern, string )
            
            if match:
                
                return string[ match.end(): ], canonical_unit
                
            
        
        return None
        
    
    if spec == Units.FILESIZE:
        
        candidates = (
            # regex alternation takes the first alternative that matches, so the bare 'b' has to come last
            # or 'bytes' would be consumed as 'b' and leave 'ytes' behind
            ( 'bytes|byte|b', 'B' ),
            ( 'kb|kilobytes|kilobyte', 'KB' ),
            ( 'mb|megabytes|megabyte', 'MB' ),
            ( 'gb|gigabytes|gigabyte', 'GB' ),
        )
        
        error_text = "Invalid unit, expected a filesize"
        
    elif spec == Units.FILE_RELATIONSHIP_TYPE:
        
        candidates = (
            ( 'duplicates', 'duplicates' ),
            ( 'alternates', 'alternates' ),
            ( '(not related/false positives?)|not related|(false positives?)', 'not related/false positive' ),
            ( 'potential duplicates', 'potential duplicates' ),
        )
        
        error_text = "Invalid unit, expected a file relationship"
        
    elif spec == Units.PIXELS_OR_NONE:
        
        # the unit is optional here and the result is None either way
        if not string:
            
            return string, None
            
        
        candidates = ( ( '(pixels?)|px', None ), )
        
        error_text = "Invalid unit, expected no unit or pixels"
        
    elif spec == Units.PIXELS:
        
        candidates = (
            ( 'px|pixels|pixel', 'pixels' ),
            ( 'kpx|kilopixels|kilopixel', 'kilopixels' ),
            ( 'mpx|megapixels|megapixel', 'megapixels' ),
        )
        
        error_text = "Invalid unit, expected pixels"
        
    elif spec == Units.FPS_OR_NONE:
        
        # the unit is optional here and the result is None either way
        if not string:
            
            return string, None
            
        
        candidates = ( ( 'fps', None ), )
        
        error_text = "Invalid unit, expected no unit or fps"
        
    else:
        
        raise ValueError( "Invalid unit specification" )
        
    
    result = consume_first( candidates )
    
    if result is None:
        
        raise ValueError( error_text )
        
    
    return result
    


# hex hashes separated by whitespace and/or commas, optionally followed by '[with] [distance] [of] N'
_HASHLIST_WITH_DISTANCE_REGEX = r'(?P<hashes>([0-9a-f]{4}[0-9a-f]+(\s|,)*)+)(with\s+)?(distance\s+)?(of\s+)?(?P<distance>0|([1-9][0-9]*))?'


def _split_hash_list( hashes_text: str ):
    
    # break a whitespace/comma separated run of hashes into a set of hash strings
    return set( hsh.strip() for hsh in re.sub( r'\s', ' ', hashes_text.replace( ',', ' ' ) ).split( ' ' ) if len( hsh ) > 0 )
    


def _parse_hashlist_with_distance( string: str, default_distance: int ):
    
    # consume 'hash, hash ... [with distance N]' and return ( remaining string, set of hashes, distance )
    match = re.match( _HASHLIST_WITH_DISTANCE_REGEX, string )
    
    if not match:
        
        raise ValueError( "Invalid value, expected a list of hashes with distance" )
        
    
    hashes = _split_hash_list( match[ 'hashes' ] )
    
    distance = default_distance if match[ 'distance' ] is None else int( match[ 'distance' ] )
    
    return string[ match.end(): ], hashes, distance
    


def parse_value( string: str, spec ):
    """
    Consume a value of the format `spec` (a Value member, or None) from the start of `string`.
    
    Returns a ( remaining string, parsed value ) tuple. The input is stripped first. With spec None
    nothing is consumed and the value is None. The shape of the parsed value for each format is
    described beside the Value members. Several formats (ANY_STRING, the rating and namespace
    formats, the hash-with-algorithm format and dateparser-backed dates) consume the whole rest
    of the text.
    
    Raises ValueError if the text cannot be parsed as the requested format, or if spec is not a
    known format.
    """
    
    string = string.strip()
    
    if spec is None:
        
        return string, None
        
    
    if spec in ( Value.NATURAL, Value.INTEGER ):
        
        # 'has urls', 'has words'
        if string.startswith( ( 'has', 'no' ) ):
            
            return '', 0
            
        
        match = re.match( '-?[0-9,]+', string )
        
        if match:
            
            rest_of_string = string[ match.end(): ]
            
            # commas are accepted as thousands separators and simply dropped
            value = int( match[ 0 ].replace( ',', '' ) )
            
            if spec == Value.NATURAL and value < 0:
                
                raise ValueError( "Invalid value, expected a positive integer!" )
                
            
            return ( rest_of_string, value )
            
        
        if spec == Value.NATURAL:
            
            raise ValueError( "Invalid value, expected a natural number" )
            
        else:
            
            raise ValueError( "Invalid value, expected an integer" )
            
        
    
    if spec == Value.SHA256_HASHLIST_WITH_DISTANCE:
        
        ( rest_of_string, hashes, distance ) = _parse_hashlist_with_distance( string, 4 )
        
        return rest_of_string, ( hashes, distance )
        
    
    if spec == Value.SIMILAR_TO_HASHLIST_WITH_DISTANCE:
        
        ( rest_of_string, hashes, distance ) = _parse_hashlist_with_distance( string, 8 )
        
        # the two kinds of hash are told apart purely by their length; anything else is dropped
        pixel_hashes = { hsh for hsh in hashes if len( hsh ) == 64 }
        perceptual_hashes = { hsh for hsh in hashes if len( hsh ) == 16 }
        
        return rest_of_string, ( pixel_hashes, perceptual_hashes, distance )
        
    
    if spec == Value.HASHLIST_WITH_ALGORITHM:
        
        # hydev KISS hijack here, instead of clever regex to capture algorithm in all sorts of situations, let's just grab the hex we see and scan the rest for non-hex phrases mate
        
        algorithm = 'sha256'
        
        for possible_algorithm in ( 'md5', 'sha1', 'sha512' ):
            
            if possible_algorithm in string:
                
                algorithm = possible_algorithm
                
                break
                
            
        
        # {8} here to make sure we are looking at proper hash hex and not some short 'a' or 'de' word
        match = re.search( r'(?P<hashes>([0-9a-f]{8}[0-9a-f]+(\s|,)*)+)', string )
        
        if match:
            
            hashes = _split_hash_list( match[ 'hashes' ] )
            
            # match.endpos is the end of the searched string, not of the match, so the remainder is always empty
            # presumably deliberate: the algorithm text around the hashes has already been looked at above
            return string[ match.endpos : ], ( hashes, algorithm )
            
        
        raise ValueError( "Invalid value, expected a list of hashes and perhaps an algorithm" )
        
    
    if spec == Value.FILETYPE_LIST:
        
        # longest names first so that a name that is a prefix of another does not win
        valid_values = sorted( FILETYPES.keys(), key = lambda k: len( k ), reverse = True )
        
        # the names are literal text, so escape them: an unescaped 'image/svg+xml' would treat the + as a regex operator
        ftype_regex = '(' + '|'.join( [ '(' + re.escape( val ) + ')' for val in valid_values ] ) + ')'
        
        match = re.match( '(' + ftype_regex + r'(\s|,)+)*' + ftype_regex, string )
        
        if match:
            
            # only commas split the list, since a filetype name may itself contain spaces
            found_ftypes_all = re.sub( r'\s', ' ', match[ 0 ].replace( ',', '|' ) ).split( '|' )
            found_ftypes_good = [ ]
            
            for ftype in found_ftypes_all:
                
                ftype = ftype.strip()
                
                if len( ftype ) > 0 and ftype in FILETYPES:
                    
                    found_ftypes_good.extend( FILETYPES[ ftype ] )
                    
                
            
            return string[ match.end(): ], set( found_ftypes_good )
            
        
        raise ValueError( "Invalid value, expected a list of file types" )
        
    
    if spec == Value.DATE_OR_TIME_INTERVAL:
        
        if DATEPARSER_OK:
            
            dt = dateparser.parse( string )
            
            if dt is None:
                
                # dateparser gives None when it cannot make sense of the text; report that as a normal parse error
                raise ValueError( "Invalid value, expected a date or a time interval" )
                
            
            if not string_looks_like_date( string ):
                
                # a time delta
                now = dateparser.parse( 'now' ) # lol, that's how you get around cross-library timezone headaches
                
                time_delta = now - dt
                
                # this sucked a lot, and then I decided to eventually switch the whole system to days/seconds, just like datetime's time_delta
                # if a user wants to put in 365 days, knowing what inaccuracy that implies, then they can. we just can't reliably deliver leap-year accuracy on long durations
                
                years = 0
                months = 0
                days = time_delta.days
                
                hours = round( time_delta.seconds / 3600 )
                
                if years + months + days + hours == 0:
                    
                    return ( '', dt )
                    
                
                return ( '', ( years, months, days, hours ) )
                
            else:
                
                return ( '', dt )
                
            
        else:
            
            match = re.match( r'((?P<year>0|([1-9][0-9]*))\s*(years|year))?\s*((?P<month>0|([1-9][0-9]*))\s*(months|month))?\s*((?P<day>0|([1-9][0-9]*))\s*(days|day))?\s*((?P<hour>0|([1-9][0-9]*))\s*(hours|hour|h))?', string )
            
            # every part of the pattern is optional, so only accept it if at least one number was captured
            if match and ( match.group( 'year' ) or match.group( 'month' ) or match.group( 'day' ) or match.group( 'hour' ) ):
                
                years = int( match.group( 'year' ) ) if match.group( 'year' ) else 0
                months = int( match.group( 'month' ) ) if match.group( 'month' ) else 0
                days = int( match.group( 'day' ) ) if match.group( 'day' ) else 0
                hours = int( match.group( 'hour' ) ) if match.group( 'hour' ) else 0
                
                string_result = string[ match.end(): ]
                
                # a trailing 'ago' is allowed and ignored
                if string_result == 'ago':
                    
                    string_result = ''
                    
                
                return string_result, ( years, months, days, hours )
                
            
            match = re.match( '(?P<year>[0-9][0-9][0-9][0-9])-(?P<month>[0-9][0-9]?)-(?P<day>[0-9][0-9]?)', string )
            
            if match:
                
                # good expansion here would be to parse a full date with 08:20am kind of thing, but we'll wait for better datetime parsing library for that I think!
                return string[ match.end(): ], datetime.datetime( int( match.group( 'year' ) ), int( match.group( 'month' ) ), int( match.group( 'day' ) ) )
                
            
            raise ValueError( "Invalid value, expected a date or a time interval" )
            
        
    
    if spec == Value.TIME_SEC_MSEC:
        
        match = re.match( r'((?P<sec>0|([1-9][0-9]*))\s*(seconds|second|secs|sec|s))?\s*((?P<msec>0|([1-9][0-9]*))\s*(milliseconds|millisecond|msecs|msec|ms))?', string )
        
        if match and ( match.group( 'sec' ) or match.group( 'msec' ) ):
            
            seconds = int( match.group( 'sec' ) ) if match.group( 'sec' ) else 0
            mseconds = int( match.group( 'msec' ) ) if match.group( 'msec' ) else 0
            
            # carry whole seconds out of the milliseconds
            ( extra_seconds, mseconds ) = divmod( mseconds, 1000 )
            seconds += extra_seconds
            
            return string[ match.end(): ], ( seconds, mseconds )
            
        
        raise ValueError( "Invalid value, expected a duration" )
        
    
    if spec == Value.ANY_STRING:
        
        return "", string
        
    
    if spec == Value.TIME_INTERVAL:
        
        match = re.match( r'((?P<day>0|([1-9][0-9]*))\s*(days|day))?\s*((?P<hour>0|([1-9][0-9]*))\s*(hours|hour|h))?\s*((?P<minute>0|([1-9][0-9]*))\s*(minutes|minute|mins|min))?\s*((?P<second>0|([1-9][0-9]*))\s*(seconds|second|secs|sec|s))?', string )
        
        if match and ( match.group( 'day' ) or match.group( 'hour' ) or match.group( 'minute' ) or match.group( 'second' ) ):
            
            days = int( match.group( 'day' ) ) if match.group( 'day' ) else 0
            hours = int( match.group( 'hour' ) ) if match.group( 'hour' ) else 0
            minutes = int( match.group( 'minute' ) ) if match.group( 'minute' ) else 0
            seconds = int( match.group( 'second' ) ) if match.group( 'second' ) else 0
            
            # normalise upwards so that seconds < 60, minutes < 60 and hours < 24
            ( extra_minutes, seconds ) = divmod( seconds, 60 )
            minutes += extra_minutes
            ( extra_hours, minutes ) = divmod( minutes, 60 )
            hours += extra_hours
            ( extra_days, hours ) = divmod( hours, 24 )
            days += extra_days
            
            return string[ match.end(): ], ( days, hours, minutes, seconds )
            
        
        raise ValueError( "Invalid value, expected a time interval" )
        
    
    if spec == Value.RATIO:
        
        match = re.match( '(?P<first>0|([1-9][0-9]*)):(?P<second>0|([1-9][0-9]*))', string )
        
        if match:
            
            return string[ match.end(): ], ( int( match[ 'first' ] ), int( match[ 'second' ] ) )
            
        
        raise ValueError( "Invalid value, expected a ratio" )
        
    
    if spec == Value.RATIO_SPECIAL:
        
        # the value is the same for all three words
        if string in ( 'square', 'landscape', 'portrait' ):
            
            return ( '', ( 1, 1 ) )
            
        
        raise ValueError( "Invalid value, expected square, landscape or portrait" )
        
    
    if spec == Value.RATING_SERVICE_NAME_AND_NUMERICAL_VALUE:
        
        # 'my favourites 3/5' (no operator here)
        
        match = re.match( r'(?P<name>.+?)\s+(?P<num>\d+)/(?P<den>\d+)$', string )
        
        if match:
            
            service_name = match[ 'name' ]
            numerator = int( match[ 'num' ] )
            denominator = int( match[ 'den' ] )
            
            if numerator < 0 or numerator > denominator:
                
                raise ValueError( 'Invalid value, rating value was out of bounds')
                
            
            return ( '', ( numerator, service_name ) )
            
        
        raise ValueError( "Invalid value, expected a numerical rating" )
        
    
    if spec == Value.RATING_SERVICE_NAME_AND_LIKE_DISLIKE:
        
        # 'tag this later = like' (maybe operator here)
        # 'tag this later like'
        
        # check dislike first lol
        if string.endswith( 'dislike' ):
            
            value = 0.0
            
            string = string[ : -len( 'dislike' ) ]
            
        elif string.endswith( 'like' ):
            
            value = 1.0
            
            string = string[ : -len( 'like' ) ]
            
        else:
            
            raise ValueError( 'Invalid value, expected like/dislike' )
            
        
        string = string.strip()
        
        # drop one trailing operator spelling, if there is one; longest spellings are tried first
        for ( operator_string, result ) in operator_strings_and_results:
            
            if string.endswith( operator_string ):
                
                string = string[ : -len( operator_string ) ]
                
                string = string.strip()
                
                break
                
            
        
        service_name = string
        
        return ( '', ( value, service_name ) )
        
    
    if spec == Value.RATING_SERVICE_NAME_AND_INCDEC:
        
        # 'I'm cooooollecting counter 123' (no operator here)
        
        match = re.match( r'(?P<name>.+?)\s+(?P<num>\d+)$', string )
        
        if match:
            
            service_name = match[ 'name' ]
            value = int( match[ 'num' ] )
            
            return ( '', ( value, service_name ) )
            
        
        raise ValueError( "Invalid value, expected an inc/dec rating" )
        
    
    if spec == Value.NAMESPACE_AND_NUM_TAGS:
        
        # 'character tags > 4'
        match = re.match( r'(?P<namespace>.+) tags (?P<operator>.+?)\s?(?P<num>\d+)\s*$', string )
        
        if match:
            
            namespace = match[ 'namespace' ]
            operator_string = match[ 'operator' ]
            num = int( match[ 'num' ] )
            
            if namespace == 'unnamespaced':
                
                namespace = ''
                
            
            ( gubbins, operator ) = parse_operator( operator_string, Operators.RELATIONAL )
            
            return ( '', ( namespace, operator, num ) )
            
        
        raise ValueError( "Invalid value, expected a namespace and a number of tags" )
        
    
    raise ValueError( "Invalid value specification" )
    


def parse_operator( string: str, spec ):
    """
    Parse the operator part of a system predicate according to `spec`.
    
    Any mix of leading spaces and colons is removed from `string` first. The rest of the work depends on `spec`:
    
    - None: nothing is parsed, the result is ( string, None ).
    - Operators.RELATIONAL / RELATIONAL_EXACT / RELATIONAL_TIME: the operator is read from the start of the string and
      normalised to one of '=', '<', '>' (plus UNICODE_NOT_EQUAL and UNICODE_APPROX_EQUAL when not RELATIONAL_EXACT).
      A leading 'has' or 'no' is reported as '>' or '=' without being consumed. RELATIONAL_TIME additionally
      understands words such as 'since', 'before', 'around', 'month' and 'day' placed before the first digit.
    - Operators.RELATIONAL_FOR_RATING_SERVICE: the operator sits between a service name and a trailing rating value;
      it is cut out and '<service name> <value>' is returned as the remaining string.
    - Operators.EQUAL / ONLY_EQUAL: equality style operators read from the start of the string.
    - Operators.EQUAL_NOT_CONSUMING: the operator is looked for anywhere in the string and the string is returned unchanged.
    - Operators.FILESERVICE_STATUS: one of the four canonical '(is|is not) (currently in|pending to)' phrases.
    - Operators.TAG_RELATIONAL: the returned operator is a ( namespace, op ) tuple, op being '<', '>' or UNICODE_APPROX_EQUAL.
    - Operators.RATIO_OPERATORS / RATIO_OPERATORS_SPECIAL: 'wider than', 'taller than', '=' or UNICODE_APPROX_EQUAL.
    
    :param string: the text to parse, starting at (or just before) the operator
    :param spec: a member of the Operators enum, or None if the predicate takes no operator
    :return: a ( remaining string, operator ) tuple
    :raises ValueError: if no operator acceptable for `spec` can be found, or if `spec` is not handled here
    """
    
    # note that strip() also drops trailing whitespace, but only if the string started with a space or colon
    while string.startswith( ':' ) or string.startswith( ' ' ):

        string = string.strip()

        if string.startswith( ':' ):

            string = string[ 1 : ]


    if spec is None:
        return string, None
    elif spec in ( Operators.RELATIONAL, Operators.RELATIONAL_EXACT, Operators.RELATIONAL_TIME ):
        exact = spec == Operators.RELATIONAL_EXACT
        ops = [ '=', '<', '>' ]

        if spec == Operators.RELATIONAL_TIME:

            re_result = re.search( r'\d.*', string )

            if re_result:

                # everything before the first digit is treated as operator text, the rest as the value
                op_string = string[ : re_result.start() ]
                string_result = re_result.group()

                looks_like_date = string_looks_like_date( string_result )

                # when the value is not a date, 'since' and 'before' map to the opposite comparison
                invert_ops = not looks_like_date

                if 'month' in op_string and looks_like_date:

                    return ( string_result, UNICODE_APPROX_EQUAL )

                elif 'around' in op_string and not looks_like_date:

                    return ( string_result, UNICODE_APPROX_EQUAL )

                elif 'day' in op_string and looks_like_date:

                    return ( string_result, '=' )

                elif 'since' in op_string:

                    return ( string_result, '<' if invert_ops else '>' )

                elif 'before' in op_string:

                    return ( string_result, '>' if invert_ops else '<' )


            # no keyword recognised, so fall through to the symbolic operators below

        if not exact:
            ops = ops + [ UNICODE_NOT_EQUAL, UNICODE_APPROX_EQUAL ]
        # multi-character forms are tested before the single-character ops so '==' is not read as '=' plus a stray '='
        if string.startswith( '==' ): return string[ 2: ], '='
        if not exact:
            if string.startswith( '!=' ): return string[ 2: ], UNICODE_NOT_EQUAL
            if string.startswith( 'is not' ): return string[ 6: ], UNICODE_NOT_EQUAL
            if string.startswith( 'isn\'t' ): return string[ 5: ], UNICODE_NOT_EQUAL
            if string.startswith( '~=' ): return string[ 2: ], UNICODE_APPROX_EQUAL
        for op in ops:
            if string.startswith( op ): return string[ len( op ): ], op
        if string.startswith( 'is' ): return string[ 2: ], '='
        # 'has' and 'no' are deliberately left in the returned string
        if string.startswith( 'has' ): return string, '>'
        if string.startswith( 'no' ): return string, '='
        raise ValueError( "Invalid relational operator" )
    elif spec == Operators.RELATIONAL_FOR_RATING_SERVICE:

        # "favourites service name > 3/5"
        # since service name can be all sorts of gubbins, we'll work backwards and KISS
        match = re.match( r'(?P<first>.*?)(?P<second>(dislike|like|\d+/\d+|\d+))$', string )

        if match:

            without_value_string_raw = match[ 'first' ]

            without_value_string = without_value_string_raw.strip()

            # operator_strings_and_results is sorted longest first, so 'is not' is seen before 'is'
            for ( operator_string, possible_operator ) in operator_strings_and_results:

                if without_value_string.endswith( operator_string ):

                    if possible_operator == UNICODE_NOT_EQUAL:

                        raise ValueError( 'Invalid rating operator--cannot select "is not"' )


                    service_name = without_value_string[ : -len( operator_string) ]

                    value = match[ 'second' ]

                    # hand back the name and value with the operator cut out of the middle
                    parsing_string = f'{service_name} {value}'

                    return ( parsing_string, possible_operator )


        raise ValueError( "Invalid rating operator" )
    elif spec == Operators.EQUAL:
        if string.startswith( '==' ): return string[ 2: ], '='
        if string.startswith( UNICODE_NOT_EQUAL ): return string[ 1: ], '!='
        if string.startswith( '!=' ): return string[ 2: ], '!='
        if string.startswith( '=' ): return string[ 1: ], '='
        if string.startswith( 'is not' ): return string[ 6: ], '!='
        if string.startswith( 'isn\'t' ): return string[ 5: ], '!='
        if string.startswith( 'is' ): return string[ 2: ], '='
        raise ValueError( "Invalid equality operator" )
    elif spec == Operators.EQUAL_NOT_CONSUMING:

        # hydev checking in here with some nonsense that catches an awkward situation
        # system:hash (md5) = blah
        # we want to see the = but not eat the md5, so in this special case, which isn't hard to parse otherwise, we'll just look for it and return no changes

        if '==' in string: return string, '='
        if UNICODE_NOT_EQUAL in string: return string, '!='
        if '!=' in string: return string, '!='
        if '=' in string: return string, '='
        if 'is not' in string: return string, '!='
        if 'isn\'t' in string: return string, '!='
        if 'is' in string: return string, '='
        raise ValueError( "Invalid equality operator" )
    elif spec == Operators.FILESERVICE_STATUS:
        match = re.match( '(is )?currently in', string )
        if match: return string[ len( match[ 0 ] ): ], 'is currently in'
        match = re.match( '((is )?not currently in)|isn\'t currently in', string )
        if match: return string[ len( match[ 0 ] ): ], 'is not currently in'
        match = re.match( '(is )?pending to', string )
        if match: return string[ len( match[ 0 ] ): ], 'is pending to'
        match = re.match( '((is )?not pending to)|isn\'t pending to', string )
        if match: return string[ len( match[ 0 ] ): ], 'is not pending to'
        raise ValueError( "Invalid operator, expected a file service relationship" )
    elif spec == Operators.TAG_RELATIONAL:

        # note this is in the correct order, also, to eliminate = vs == ambiguity
        # the operator strings are escaped so they are always matched literally
        all_operators_piped = '|'.join( ( re.escape( s_r[0] ) for s_r in operator_strings_and_results ) )

        match = re.match( rf'(?P<namespace>.*)\s+(?P<op>({all_operators_piped}))', string )

        if match:

            namespace = match[ 'namespace' ]

            if namespace == 'any namespace':

                namespace = '*'


            if namespace == 'unnamespaced':

                namespace = ''


            op_string = match[ 'op' ]

            op = operator_strings_to_results.get( op_string, UNICODE_APPROX_EQUAL )

            # only '<', '>' and approx-equal are passed on; any other operator collapses to approx-equal
            if op not in ( '<', '>', UNICODE_APPROX_EQUAL ):

                op = UNICODE_APPROX_EQUAL


            return string[ len( match[ 0 ] ): ], (namespace, op)


        raise ValueError( "Invalid operator, expected a tag followed by a relational operator" )

    elif spec == Operators.ONLY_EQUAL:
        if string.startswith( '==' ): return string[ 2: ], '='
        if string.startswith( '=' ): return string[ 1: ], '='
        if string.startswith( 'is' ): return string[ 2: ], '='
        raise ValueError( "Invalid equality operator" )
    elif spec == Operators.RATIO_OPERATORS:
        # the 'is wider/taller than' forms must be tested before the bare 'is' below
        if string.startswith( 'wider than' ): return string[ 10: ], 'wider than'
        if string.startswith( 'taller than' ): return string[ 11: ], 'taller than'
        if string.startswith( 'is wider than' ): return string[ 13: ], 'wider than'
        if string.startswith( 'is taller than' ): return string[ 14: ], 'taller than'
        if string.startswith( '==' ): return string[ 2: ], '='
        if string.startswith( '=' ): return string[ 1: ], '='
        if string.startswith( 'is' ): return string[ 2: ], '='
        if string.startswith( '~=' ): return string[ 2: ], UNICODE_APPROX_EQUAL
        if string.startswith( UNICODE_APPROX_EQUAL ): return string[ 1: ], UNICODE_APPROX_EQUAL
        raise ValueError( "Invalid ratio operator" )
    elif spec == Operators.RATIO_OPERATORS_SPECIAL:

        # the keyword itself is returned as the remaining string; if none is present we fall through to the error below
        if 'square' in string: return 'square', '='
        if 'portrait' in string: return 'portrait', 'taller than'
        if 'landscape' in string: return 'landscape', 'wider than'


    raise ValueError( "Invalid operator specification" )