# hydrus/hydrus/external/SystemPredicateParser.py

# made by prkc for Hydrus Network
# Licensed under the same terms as Hydrus Network
# hydev has changed a couple things here and there, and changed how filetypes work

# The basic idea here is to take a system predicate written as text and parse it into a (predicate type, operator, value, unit)
# tuple. The exact structure of the operator, value and unit members depends on the type of the predicate.
# For example, system:width < 500 would become (Predicate.WIDTH, '<', 500, None).
# The parsers recognize multiple forms for various units and operators, but always normalize to a single canonical form,
# which is given in the comments beside the various enums below.
# Some or all of them can be None, depending on the predicate.
# The "parsing" is done with regex, which is hacky but good enough for this use case.
# To extend the parser with additional predicates, first extend the Predicate, Value, Operators, Units enums if the
# already present options are not sufficient, then implement parsing for them in the corresponding parse_{unit,value,operator} functions.
# Finally, add a new entry to the SYSTEM_PREDICATES dict describing the new predicate.
# Initially everything below is independent from other Hydrus code so there is some redundancy.
# It might be better to switch to already established Hydrus enums and constants where possible.
# Errors are handled by throwing ValueErrors. The main function to call is parse_system_predicate.
# If this file is run by itself it will parse and print all the included examples. There are examples for each supported predicate type.

import math
import re
import datetime
from enum import Enum, auto

# TODO: This needs to be updated with all types that Hydrus supports.
# Registry of recognised filetype names -> tuple of filetype enum values.
# It starts empty and is filled in place by InitialiseFiletypes.
FILETYPES = dict()


def InitialiseFiletypes( str_to_enum ):
    """Register filetype names in the module-level FILETYPES registry.

    str_to_enum maps a filetype string to either a single int or an iterable
    of values. Every value is stored as a tuple. A name of the form
    'class/specific' is registered twice: under the part after the first '/'
    and under the full name.
    """
    for ( full_name, raw_value ) in str_to_enum.items():

        # A lone int is wrapped so every registry value is a tuple.
        codes = ( raw_value, ) if isinstance( raw_value, int ) else tuple( raw_value )

        ( _, slash, short_name ) = full_name.partition( '/' )

        if slash:

            FILETYPES[ short_name ] = codes

        FILETYPES[ full_name ] = codes


# Character that follows 'system' in the predicate prefix.
NAMESPACE_SEPARATOR = ':'
# parse_system_predicate rejects any input that does not start with this prefix.
SYSTEM_PREDICATE_PREFIX = ''.join( ( 'system', NAMESPACE_SEPARATOR ) )


# This enum lists all the recognized predicate types.
class Predicate( Enum ):
    """Identifiers for every predicate type the parser can return.

    A member of this enum is the first element of the tuple returned by
    parse_system_predicate. SYSTEM_PREDICATES maps the user-facing predicate
    names (written as regexes) onto these members.
    """
    # Values come from auto(), so each member's value follows its position in this list.

    # Predicates that take no operator, value or unit.
    EVERYTHING = auto()
    INBOX = auto()
    ARCHIVE = auto()
    HAS_DURATION = auto()
    NO_DURATION = auto()
    BEST_QUALITY_OF_GROUP = auto()
    NOT_BEST_QUALITY_OF_GROUP = auto()
    HAS_AUDIO = auto()
    NO_AUDIO = auto()
    HAS_EXIF = auto()
    NO_EXIF = auto()
    HAS_HUMAN_READABLE_EMBEDDED_METADATA = auto()
    NO_HUMAN_READABLE_EMBEDDED_METADATA = auto()
    HAS_ICC_PROFILE = auto()
    NO_ICC_PROFILE = auto()
    HAS_TAGS = auto()
    UNTAGGED = auto()
    # Predicates that take an operator and/or a value (see SYSTEM_PREDICATES for the exact formats).
    NUM_OF_TAGS = auto()
    NUM_OF_WORDS = auto()
    HEIGHT = auto()
    WIDTH = auto()
    FILESIZE = auto()
    SIMILAR_TO = auto()
    LIMIT = auto()
    FILETYPE = auto()
    HASH = auto()
    MOD_DATE = auto()
    ARCHIVED_DATE = auto()
    LAST_VIEWED_TIME = auto()
    TIME_IMPORTED = auto()
    DURATION = auto()
    FRAMERATE = auto()
    NUM_OF_FRAMES = auto()
    FILE_SERVICE = auto()
    NUM_FILE_RELS = auto()
    RATIO = auto()
    RATIO_SPECIAL = auto()
    NUM_PIXELS = auto()
    MEDIA_VIEWS = auto()
    PREVIEW_VIEWS = auto()
    ALL_VIEWS = auto()
    MEDIA_VIEWTIME = auto()
    PREVIEW_VIEWTIME = auto()
    ALL_VIEWTIME = auto()
    URL_REGEX = auto()
    NO_URL_REGEX = auto()
    URL = auto()
    NO_URL = auto()
    DOMAIN = auto()
    NO_DOMAIN = auto()
    URL_CLASS = auto()
    NO_URL_CLASS = auto()
    TAG_AS_NUMBER = auto()
    # Note predicates: the first two take nothing, the rest take a value.
    HAS_NOTES = auto()
    NO_NOTES = auto()
    NUM_NOTES = auto()
    HAS_NOTE_NAME = auto()
    NO_NOTE_NAME = auto()


# This enum lists the possible value formats a predicate can have (if it has a value).
# Parsing for each of these options is implemented in parse_value
class Value( Enum ):
    """Value formats a predicate can carry; each member is handled by one branch of parse_value."""
    NATURAL = auto()  # An int which holds a non-negative value
    HASHLIST_WITH_DISTANCE = auto()  # A 2-tuple, where the first part is a set of potential hashes (as strings), the second part is a non-negative integer
    HASHLIST_WITH_ALGORITHM = auto()  # A 2-tuple, where the first part is a set of potential hashes (as strings), the second part is one of 'sha256', 'md5', 'sha1', 'sha512' ('sha256' when none is given)
    FILETYPE_LIST = auto()  # A set of file types using the enum set in InitialiseFiletypes as defined in FILETYPES
    # Either a tuple of 4 non-negative integers: (years, months, days, hours) OR
    # a datetime.datetime object. For the latter, only the YYYY-MM-DD format is accepted.
    # NOTE(review): parse_value returns the interval numbers exactly as written; hours >= 24 are not folded into days.
    # dateutils has a function to try to guess and parse arbitrary date formats but I didn't use it here since it would be an additional dependency.
    DATE_OR_TIME_INTERVAL = auto()
    TIME_SEC_MSEC = auto()  # A tuple of two non-negative integers: (seconds, milliseconds) where the latter is <1000 (excess milliseconds are carried into seconds)
    ANY_STRING = auto()  # A string (accepts any string so can't use units after this since it consumes the entire remaining part of the input)
    TIME_INTERVAL = auto()  # A tuple of 4 non-negative integers: (days, hours, minutes, seconds) where hours < 24, minutes < 60, seconds < 60 (excess is carried upwards)
    INTEGER = auto()  # An integer
    RATIO = auto()  # A tuple of 2 ints, both non-negative
    RATIO_SPECIAL = auto() # Always the tuple (1, 1); accepted for 'square', 'landscape' and 'portrait' (the operator carries the distinction)


# Possible operator formats
# Implemented in parse_operator
class Operators( Enum ):
    """Operator formats a predicate can use; each member is handled by one branch of parse_operator."""
    RELATIONAL = auto()  # One of '=', '<', '>', '\u2260' ('≠') (takes '!=', 'is not' and "isn't" too), '\u2248' ('≈') (takes '~=' too)
    RELATIONAL_EXACT = auto() # Like RELATIONAL but without the not-equal and approximately-equal operators
    EQUAL = auto()  # One of '=' or '!='
    FILESERVICE_STATUS = auto()  # One of 'is not currently in', 'is currently in', 'is not pending to', 'is pending to'
    TAG_RELATIONAL = auto()  # A tuple of a string (a potential tag name) and a relational operator (as a string)
    ONLY_EQUAL = auto()  # Always '=' (takes '==' and 'is' too), since that's the only accepted operator
    RATIO_OPERATORS = auto()  # One of '=', 'wider than','taller than', '\u2248' ('≈') (takes '~=' too)
    RATIO_OPERATORS_SPECIAL = auto() # Input contains 'square', 'portrait' or 'landscape'; parsed to '=', 'taller than' or 'wider than' respectively


# Possible unit formats
# Implemented in parse_unit
class Units( Enum ):
    """Unit formats a predicate can use; each member is handled by one branch of parse_unit."""
    FILESIZE = auto()  # One of 'B', 'KB', 'MB', 'GB'
    FILE_RELATIONSHIP_TYPE = auto()  # One of 'not related/false positive', 'duplicates', 'alternates', 'potential duplicates'
    PIXELS_OR_NONE = auto()  # Always None (meaning pixels); an optional 'pixel', 'pixels' or 'px' suffix is accepted
    PIXELS = auto()  # One of 'pixels', 'kilopixels', 'megapixels'
    FPS_OR_NONE = auto() # Always None; an optional 'fps' suffix is accepted


# All system predicates
# A predicate is described by a 4-tuple of (predicate type, operator format, value format, unit format) (use None if some are not applicable)
# The keys are regexes matching the predicate names as written by the user.
# The parser will also automatically accept _ instead of space in the predicate names, always use space in this dict.
# ORDER MATTERS: parse_system_predicate walks this dict in insertion order and commits to the first
# key that matches the start of the input, so a pattern that is a prefix of a longer predicate name
# must be listed AFTER the longer one (or be anchored with $, as 'archive$' is).
# Keys containing regex escapes are raw strings; keys containing an apostrophe are double-quoted.
SYSTEM_PREDICATES = {
    'everything': (Predicate.EVERYTHING, None, None, None),
    'inbox': (Predicate.INBOX, None, None, None),
    'archive$': (Predicate.ARCHIVE, None, None, None), # $ so as not to clash with system:archive(d) date
    'has duration': (Predicate.HAS_DURATION, None, None, None),
    'no duration': (Predicate.NO_DURATION, None, None, None),
    '(is the )?best quality( file)? of( its)?( duplicate)? group': (Predicate.BEST_QUALITY_OF_GROUP, None, None, None),
    "(((is )?not)|(isn't))( the)? best quality( file)? of( its)?( duplicate)? group": (Predicate.NOT_BEST_QUALITY_OF_GROUP, None, None, None),
    'has audio': (Predicate.HAS_AUDIO, None, None, None),
    'no audio': (Predicate.NO_AUDIO, None, None, None),
    'has exif': (Predicate.HAS_EXIF, None, None, None),
    'no exif': (Predicate.NO_EXIF, None, None, None),
    'has.*embedded.*metadata': (Predicate.HAS_HUMAN_READABLE_EMBEDDED_METADATA, None, None, None),
    'no.*embedded.*metadata': (Predicate.NO_HUMAN_READABLE_EMBEDDED_METADATA, None, None, None),
    'has icc profile': (Predicate.HAS_ICC_PROFILE, None, None, None),
    'no icc profile': (Predicate.NO_ICC_PROFILE, None, None, None),
    'has tags': (Predicate.HAS_TAGS, None, None, None),
    'untagged|no tags': (Predicate.UNTAGGED, None, None, None),
    'number of tags': (Predicate.NUM_OF_TAGS, Operators.RELATIONAL, Value.NATURAL, None),
    'number of words': (Predicate.NUM_OF_WORDS, Operators.RELATIONAL, Value.NATURAL, None),
    'height': (Predicate.HEIGHT, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS_OR_NONE),
    'width': (Predicate.WIDTH, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS_OR_NONE),
    'file ?size': (Predicate.FILESIZE, Operators.RELATIONAL, Value.NATURAL, Units.FILESIZE),
    'similar to': (Predicate.SIMILAR_TO, None, Value.HASHLIST_WITH_DISTANCE, None),
    'limit': (Predicate.LIMIT, Operators.ONLY_EQUAL, Value.NATURAL, None),
    'file ?type': (Predicate.FILETYPE, Operators.ONLY_EQUAL, Value.FILETYPE_LIST, None),
    'hash': (Predicate.HASH, Operators.EQUAL, Value.HASHLIST_WITH_ALGORITHM, None),
    'archived? (date|time)|(date|time) archived': (Predicate.ARCHIVED_DATE, Operators.RELATIONAL, Value.DATE_OR_TIME_INTERVAL, None),
    'modified (date|time)|(date|time) modified': (Predicate.MOD_DATE, Operators.RELATIONAL, Value.DATE_OR_TIME_INTERVAL, None),
    'last view(ed)? (date|time)|(date|time) last viewed': (Predicate.LAST_VIEWED_TIME, Operators.RELATIONAL, Value.DATE_OR_TIME_INTERVAL, None),
    'import(ed)? (date|time)|(date|time) imported': (Predicate.TIME_IMPORTED, Operators.RELATIONAL, Value.DATE_OR_TIME_INTERVAL, None),
    'duration': (Predicate.DURATION, Operators.RELATIONAL, Value.TIME_SEC_MSEC, None),
    'framerate': (Predicate.FRAMERATE, Operators.RELATIONAL_EXACT, Value.NATURAL, Units.FPS_OR_NONE),
    'number of frames': (Predicate.NUM_OF_FRAMES, Operators.RELATIONAL, Value.NATURAL, None),
    'file service': (Predicate.FILE_SERVICE, Operators.FILESERVICE_STATUS, Value.ANY_STRING, None),
    'num(ber of)? file relationships': (Predicate.NUM_FILE_RELS, Operators.RELATIONAL, Value.NATURAL, Units.FILE_RELATIONSHIP_TYPE),
    # The lookaheads route 'ratio' to the numeric form when a digit follows and to the keyword form otherwise.
    r'ratio(?=.*\d)': (Predicate.RATIO, Operators.RATIO_OPERATORS, Value.RATIO, None),
    r'ratio(?!.*\d)': (Predicate.RATIO_SPECIAL, Operators.RATIO_OPERATORS_SPECIAL, Value.RATIO_SPECIAL, None),
    'num pixels': (Predicate.NUM_PIXELS, Operators.RELATIONAL, Value.NATURAL, Units.PIXELS),
    'media views': (Predicate.MEDIA_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'preview views': (Predicate.PREVIEW_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'all views': (Predicate.ALL_VIEWS, Operators.RELATIONAL, Value.NATURAL, None),
    'media viewtime': (Predicate.MEDIA_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'preview viewtime': (Predicate.PREVIEW_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'all viewtime': (Predicate.ALL_VIEWTIME, Operators.RELATIONAL, Value.TIME_INTERVAL, None),
    'has (a )?url matching regex': (Predicate.URL_REGEX, None, Value.ANY_STRING, None),
    "(does not|doesn't) have (a )?url matching regex": (Predicate.NO_URL_REGEX, None, Value.ANY_STRING, None),
    # The domain and url class patterns must precede the plain 'has url' / "doesn't have url" patterns:
    # otherwise "has url with domain x" and "has url with class x" are swallowed as a URL predicate
    # whose value is "with domain x" / "with class x".
    'has (a )?(url with )?domain': (Predicate.DOMAIN, None, Value.ANY_STRING, None),
    "(does not|doesn't) have (a )?(url with )?domain": (Predicate.NO_DOMAIN, None, Value.ANY_STRING, None),
    'has (a )?url with (url )?class': (Predicate.URL_CLASS, None, Value.ANY_STRING, None),
    "(does not|doesn't) have (a )?url with (url )?class": (Predicate.NO_URL_CLASS, None, Value.ANY_STRING, None),
    'has url': (Predicate.URL, None, Value.ANY_STRING, None),
    "(does not|doesn't) have url": (Predicate.NO_URL, None, Value.ANY_STRING, None),
    'tag as number': (Predicate.TAG_AS_NUMBER, Operators.TAG_RELATIONAL, Value.INTEGER, None),
    # The 'note with name' patterns must precede 'has notes?': the latter also matches the start of
    # "has note with name ...", which would then fail with leftover characters.
    '(has (a )?)?note with name': (Predicate.HAS_NOTE_NAME, None, Value.ANY_STRING, None),
    "(no|does not have|doesn't have) note with name": (Predicate.NO_NOTE_NAME, None, Value.ANY_STRING, None),
    'has notes?': (Predicate.HAS_NOTES, None, None, None),
    "(no|does not have|doesn't have) notes": (Predicate.NO_NOTES, None, None, None),
    'num(ber of)? notes': (Predicate.NUM_NOTES, Operators.RELATIONAL_EXACT, Value.NATURAL, None),
}


# Parsing is just finding a matching predicate name,
# then trying to parse it by consuming the input string.
# The parse_* functions consume some of the string and return a (remaining part of the string, parsed value) tuple.
def parse_system_predicate( string: str ):
    """Parse one textual system predicate into a 4-tuple.

    The input is lower-cased, stripped, and has every underscore replaced by a
    space before anything else happens. This applies to the whole input, so
    free-text values (URLs, regexes, names) are returned in that normalised
    form as well.

    The predicate name is matched against the keys of SYSTEM_PREDICATES in
    dict order; the first key that matches wins. The rest of the input is then
    consumed by parse_operator, parse_value and parse_unit, in that order.

    Returns (Predicate member, operator, value, unit); the last three may be
    None depending on the predicate.

    Raises ValueError if the input starts with '-', lacks the 'system:' prefix,
    names no known predicate, fails operator/value/unit parsing, or has
    characters left over after the unit.
    """
    string = string.lower().strip()
    string = string.replace( '_', ' ' )
    if string.startswith( "-" ):
        raise ValueError( "System predicate can't start with negation" )
    if not string.startswith( SYSTEM_PREDICATE_PREFIX ):
        raise ValueError( "Not a system predicate!" )
    string = string[ len( SYSTEM_PREDICATE_PREFIX ): ]
    for ( pred_regex, pred ) in SYSTEM_PREDICATES.items():
        # Wrap the name pattern in a non-capturing group before appending the optional colon.
        # Without the group, ':?' only attaches to the last alternative of keys that contain a
        # top-level '|' (e.g. 'modified (date|time)|(date|time) modified'), so "modified date: ..."
        # was rejected while "date modified: ..." was accepted.
        name_regex = '(?:' + pred_regex.replace( ' ', '([_ ]+)' ) + '):?'
        match = re.match( name_regex, string )
        if not match:
            continue
        ( pred_type, operator_spec, value_spec, unit_spec ) = pred
        remainder = string[ len( match[ 0 ] ): ]
        remainder, operator = parse_operator( remainder, operator_spec )
        remainder, value = parse_value( remainder, value_spec )
        remainder, unit = parse_unit( remainder, unit_spec )
        if remainder: raise ValueError( "Unrecognized characters at the end of the predicate: " + remainder )
        return pred_type, operator, value, unit
    raise ValueError( "Unknown system predicate!" )


def parse_unit( string: str, spec ):
    """Consume a unit from the start of ``string`` according to ``spec``.

    ``spec`` is None (no unit expected) or a member of Units. The input is
    stripped first. Returns (remaining string, canonical unit); see the
    comments on the Units enum for the canonical forms.

    Raises ValueError if the text does not start with an acceptable unit, or
    if ``spec`` is not a known unit specification.
    """
    string = string.strip()
    if spec is None:
        return string, None

    # Each branch picks an ordered list of (regex, canonical result) pairs; the first regex that
    # matches the start of the string wins.
    if spec == Units.FILESIZE:
        candidates = (
            # Longest alternative first: regex alternation takes the first alternative that matches,
            # so 'b|byte|bytes' only ever consumed the 'b' of "bytes" and left "ytes" behind.
            ( 'bytes|byte|b', 'B' ),
            ( 'kb|kilobytes|kilobyte', 'KB' ),
            ( 'mb|megabytes|megabyte', 'MB' ),
            ( 'gb|gigabytes|gigabyte', 'GB' ),
        )
        error_text = "Invalid unit, expected a filesize"
    elif spec == Units.FILE_RELATIONSHIP_TYPE:
        candidates = (
            ( 'duplicates', 'duplicates' ),
            ( 'alternates', 'alternates' ),
            ( '(not related/false positives?)|not related|(false positives?)', 'not related/false positive' ),
            ( 'potential duplicates', 'potential duplicates' ),
        )
        error_text = "Invalid unit, expected a file relationship"
    elif spec == Units.PIXELS_OR_NONE:
        # The unit is optional here and normalises to None either way.
        if not string:
            return string, None
        candidates = ( ( '(pixels?)|px', None ), )
        error_text = "Invalid unit, expected no unit or pixels"
    elif spec == Units.PIXELS:
        candidates = (
            ( 'px|pixels|pixel', 'pixels' ),
            ( 'kpx|kilopixels|kilopixel', 'kilopixels' ),
            ( 'mpx|megapixels|megapixel', 'megapixels' ),
        )
        error_text = "Invalid unit, expected pixels"
    elif spec == Units.FPS_OR_NONE:
        # The unit is optional here and normalises to None either way.
        if not string:
            return string, None
        candidates = ( ( 'fps', None ), )
        error_text = "Invalid unit, expected no unit or fps"
    else:
        raise ValueError( "Invalid unit specification" )

    for ( pattern, canonical ) in candidates:
        match = re.match( pattern, string )
        if match: return string[ len( match[ 0 ] ): ], canonical
    raise ValueError( error_text )


def parse_value( string: str, spec ):
    """Consume a value from the start of ``string`` according to ``spec``.

    ``spec`` is None (no value expected) or a member of Value. The input is
    stripped first. Returns (remaining string, parsed value); see the comments
    on the Value enum for the shape of each parsed value.

    Raises ValueError if the text does not start with an acceptable value, or
    if ``spec`` is not a known value specification. An impossible calendar
    date (e.g. month 13) also surfaces as the ValueError raised by
    datetime.datetime.
    """
    string = string.strip()
    if spec is None:
        return string, None
    elif spec == Value.NATURAL:
        match = re.match( r'0|([1-9][0-9]*)', string )
        if match: return string[ len( match[ 0 ] ): ], int( match[ 0 ] )
        raise ValueError( "Invalid value, expected a natural number" )
    elif spec == Value.HASHLIST_WITH_DISTANCE:
        match = re.match( r'(?P<hashes>([0-9a-f]+(\s|,)+)+)(with\s+)?distance\s+(?P<distance>0|([1-9][0-9]*))', string )
        if match:
            # The hashes group only contains hex runs, whitespace and commas.
            hashes = { hsh for hsh in re.split( r'[\s,]+', match[ 'hashes' ] ) if hsh }
            distance = int( match[ 'distance' ] )
            return string[ len( match[ 0 ] ): ], (hashes, distance)
        raise ValueError( "Invalid value, expected a list of hashes with distance" )
    elif spec == Value.HASHLIST_WITH_ALGORITHM:
        # (?!algorithm) stops the hash list from swallowing the leading 'a' of a bare "algorithm"
        # keyword ('a' is a hex digit). Without it "abcdef algorithm md5" produced the hash 'a' plus
        # unparseable leftovers.
        match = re.match( r'(?P<hashes>((?!algorithm)[0-9a-f]+(\s|,)*)+)((with\s+)?algorithm)?\s*(?P<algorithm>sha256|sha512|md5|sha1|)', string )
        if match:
            hashes = { hsh for hsh in re.split( r'[\s,]+', match[ 'hashes' ] ) if hsh }
            algorithm = match[ 'algorithm' ] if match[ 'algorithm' ] else 'sha256'
            return string[ len( match[ 0 ] ): ], (hashes, algorithm)
        raise ValueError( "Invalid value, expected a list of hashes with algorithm" )
    elif spec == Value.FILETYPE_LIST:
        # Longest names first so that e.g. 'image/jpeg' is preferred over 'image'.
        valid_values = sorted( FILETYPES.keys(), key = len, reverse = True )
        # Names are literal text, not regexes: escape them so characters such as '+' or '.'
        # (e.g. in 'image/svg+xml') match themselves.
        ftype_regex = '(' + '|'.join( '(' + re.escape( val ) + ')' for val in valid_values ) + ')'
        match = re.match( '(' + ftype_regex + r'(\s|,)+)*' + ftype_regex, string )
        if match:
            # NOTE(review): the matched text is split on whitespace/commas, so a registered name
            # that itself contains whitespace would not be found in FILETYPES here.
            found_ftypes_good = [ ]
            for ftype in re.split( r'[\s,]', match[ 0 ] ):
                if ftype and ftype in FILETYPES:
                    found_ftypes_good.extend( FILETYPES[ ftype ] )
            return string[ len( match[ 0 ] ): ], set( found_ftypes_good )
        raise ValueError( "Invalid value, expected a list of file types" )
    elif spec == Value.DATE_OR_TIME_INTERVAL:
        # Every part of the interval is optional, so the regex also matches the empty string;
        # the group check below is what decides whether an interval was actually given.
        match = re.match( r'((?P<year>0|([1-9][0-9]*))\s*(years|year))?\s*((?P<month>0|([1-9][0-9]*))\s*(months|month))?\s*((?P<day>0|([1-9][0-9]*))\s*(days|day))?\s*((?P<hour>0|([1-9][0-9]*))\s*(hours|hour|h))?', string )
        if match and (match.group( 'year' ) or match.group( 'month' ) or match.group( 'day' ) or match.group( 'hour' )):
            years = int( match.group( 'year' ) ) if match.group( 'year' ) else 0
            months = int( match.group( 'month' ) ) if match.group( 'month' ) else 0
            days = int( match.group( 'day' ) ) if match.group( 'day' ) else 0
            hours = int( match.group( 'hour' ) ) if match.group( 'hour' ) else 0
            # The numbers are returned as written; nothing is carried between the fields.
            return string[ len( match[ 0 ] ): ], (years, months, days, hours)
        match = re.match( r'(?P<year>[0-9][0-9][0-9][0-9])-(?P<month>[0-9][0-9]?)-(?P<day>[0-9][0-9]?)', string )
        if match:
            # good expansion here would be to parse a full date with 08:20am kind of thing, but we'll wait for better datetime parsing library for that I think!
            return string[ len( match[ 0 ] ): ], datetime.datetime( int( match.group( 'year' ) ), int( match.group( 'month' ) ), int( match.group( 'day' ) ) )
        raise ValueError( "Invalid value, expected a date or a time interval" )
    elif spec == Value.TIME_SEC_MSEC:
        match = re.match( r'((?P<sec>0|([1-9][0-9]*))\s*(seconds|second|secs|sec|s))?\s*((?P<msec>0|([1-9][0-9]*))\s*(milliseconds|millisecond|msecs|msec|ms))?', string )
        if match and (match.group( 'sec' ) or match.group( 'msec' )):
            seconds = int( match.group( 'sec' ) ) if match.group( 'sec' ) else 0
            mseconds = int( match.group( 'msec' ) ) if match.group( 'msec' ) else 0
            # Carry whole seconds out of the millisecond part (integer arithmetic, no float rounding).
            ( carried_seconds, mseconds ) = divmod( mseconds, 1000 )
            seconds += carried_seconds
            return string[ len( match[ 0 ] ): ], (seconds, mseconds)
        raise ValueError( "Invalid value, expected a duration" )
    elif spec == Value.ANY_STRING:
        # Takes everything that is left, so nothing can follow this kind of value.
        return "", string
    elif spec == Value.TIME_INTERVAL:
        match = re.match( r'((?P<day>0|([1-9][0-9]*))\s*(days|day))?\s*((?P<hour>0|([1-9][0-9]*))\s*(hours|hour|h))?\s*((?P<minute>0|([1-9][0-9]*))\s*(minutes|minute|mins|min))?\s*((?P<second>0|([1-9][0-9]*))\s*(seconds|second|secs|sec|s))?', string )
        if match and (match.group( 'day' ) or match.group( 'hour' ) or match.group( 'minute' ) or match.group( 'second' )):
            days = int( match.group( 'day' ) ) if match.group( 'day' ) else 0
            hours = int( match.group( 'hour' ) ) if match.group( 'hour' ) else 0
            minutes = int( match.group( 'minute' ) ) if match.group( 'minute' ) else 0
            seconds = int( match.group( 'second' ) ) if match.group( 'second' ) else 0
            # Carry overflow upwards: seconds -> minutes -> hours -> days.
            ( carry, seconds ) = divmod( seconds, 60 )
            minutes += carry
            ( carry, minutes ) = divmod( minutes, 60 )
            hours += carry
            ( carry, hours ) = divmod( hours, 24 )
            days += carry
            return string[ len( match[ 0 ] ): ], (days, hours, minutes, seconds)
        raise ValueError( "Invalid value, expected a time interval" )
    elif spec == Value.INTEGER:
        match = re.match( r'0|(-?[1-9][0-9]*)', string )
        if match: return string[ len( match[ 0 ] ): ], int( match[ 0 ] )
        raise ValueError( "Invalid value, expected an integer" )
    elif spec == Value.RATIO:
        match = re.match( r'(?P<first>0|([1-9][0-9]*)):(?P<second>0|([1-9][0-9]*))', string )
        if match: return string[ len( match[ 0 ] ): ], (int( match[ 'first' ] ), int( match[ 'second' ] ))
        raise ValueError( "Invalid value, expected a ratio" )
    elif spec == Value.RATIO_SPECIAL:
        # All three keywords map to 1:1; the operator parsed before this value tells them apart.
        if string in ( 'square', 'landscape', 'portrait' ):
            return ( '', ( 1, 1 ) )
        raise ValueError( "Invalid value, expected square, landscape or portrait" )

    raise ValueError( "Invalid value specification" )


def parse_operator( string: str, spec ):
    """Consume an operator from the start of ``string`` according to ``spec``.

    ``spec`` is None (no operator expected) or a member of Operators. The
    input is stripped first. Returns (remaining string, canonical operator).
    For Operators.TAG_RELATIONAL the operator is a (tag, relational operator)
    tuple. For Operators.RATIO_OPERATORS_SPECIAL the "remaining string" is the
    keyword itself, so that parse_value can consume it afterwards.

    Raises ValueError if the text does not start with an acceptable operator,
    or if ``spec`` is not a known operator specification.
    """
    string = string.strip()
    if spec is None:
        return string, None

    # Most branches reduce to an ordered list of (literal prefix, canonical operator) pairs that is
    # scanned at the bottom of the function. Order matters: a prefix must come before any shorter
    # prefix it starts with ('==' before '=', 'is not' and "isn't" before 'is').
    if spec == Operators.RELATIONAL or spec == Operators.RELATIONAL_EXACT:
        exact = spec == Operators.RELATIONAL_EXACT
        prefixes = [ ( '==', '=' ) ]
        if not exact:
            prefixes += [ ( '!=', '\u2260' ), ( 'is not', '\u2260' ), ( 'isn\'t', '\u2260' ), ( '~=', '\u2248' ) ]
        prefixes += [ ( '=', '=' ), ( '<', '<' ), ( '>', '>' ) ]
        if not exact:
            prefixes += [ ( '\u2260', '\u2260' ), ( '\u2248', '\u2248' ) ]
        prefixes.append( ( 'is', '=' ) )
        error_text = "Invalid relational operator"
    elif spec == Operators.EQUAL:
        prefixes = [
            ( '==', '=' ),
            ( '=', '=' ),
            ( '\u2260', '!=' ),
            ( '!=', '!=' ),
            ( 'is not', '!=' ),
            # "isn't" has to be tested before 'is'; in the other order it is unreachable and
            # "isn't" parses as '=' followed by the junk "n't".
            ( 'isn\'t', '!=' ),
            ( 'is', '=' ),
        ]
        error_text = "Invalid equality operator"
    elif spec == Operators.FILESERVICE_STATUS:
        status_patterns = (
            ( '(is )?currently in', 'is currently in' ),
            ( '((is )?not currently in)|isn\'t currently in', 'is not currently in' ),
            ( '(is )?pending to', 'is pending to' ),
            ( '((is )?not pending to)|isn\'t pending to', 'is not pending to' ),
        )
        for ( pattern, canonical ) in status_patterns:
            match = re.match( pattern, string )
            if match: return string[ len( match[ 0 ] ): ], canonical
        raise ValueError( "Invalid operator, expected a file service relationship" )
    elif spec == Operators.TAG_RELATIONAL:
        # Longer operators are listed before their own prefixes, otherwise '==' is read as '='
        # and 'is not' as 'is', leaving junk in front of the value.
        match = re.match( r"(?P<tag>.*)\s+(?P<op>is not|isn't|==|!=|~=|<|>|=|\u2248|\u2260|is)", string )
        # Test the match object itself; 'if re.match:' is always true and let a failed match
        # crash with a TypeError instead of the documented ValueError.
        if match:
            # Normalise spelled-out / ASCII forms to the same canonical symbols the RELATIONAL branch returns.
            canonical_ops = { '==': '=', 'is': '=', '~=': '\u2248', '!=': '\u2260', 'is not': '\u2260', 'isn\'t': '\u2260' }
            tag = match[ 'tag' ]
            op = match[ 'op' ]
            return string[ len( match[ 0 ] ): ], (tag, canonical_ops.get( op, op ))
        raise ValueError( "Invalid operator, expected a tag followed by a relational operator" )
    elif spec == Operators.ONLY_EQUAL:
        prefixes = [ ( '==', '=' ), ( '=', '=' ), ( 'is', '=' ) ]
        error_text = "Invalid equality operator"
    elif spec == Operators.RATIO_OPERATORS:
        prefixes = [
            ( 'wider than', 'wider than' ),
            ( 'taller than', 'taller than' ),
            ( 'is wider than', 'wider than' ),
            ( 'is taller than', 'taller than' ),
            ( '==', '=' ),
            ( '=', '=' ),
            ( 'is', '=' ),
            ( '~=', '\u2248' ),
            ( '\u2248', '\u2248' ),
        ]
        error_text = "Invalid ratio operator"
    elif spec == Operators.RATIO_OPERATORS_SPECIAL:
        # The keyword may appear anywhere in the text (e.g. after 'is'); it is handed back as the
        # remaining string so the value parser sees exactly the keyword.
        for ( keyword, canonical ) in ( ( 'square', '=' ), ( 'portrait', 'taller than' ), ( 'landscape', 'wider than' ) ):
            if keyword in string: return keyword, canonical
        raise ValueError( "Invalid operator, expected square, portrait or landscape" )
    else:
        raise ValueError( "Invalid operator specification" )

    for ( prefix, canonical ) in prefixes:
        if string.startswith( prefix ): return string[ len( prefix ): ], canonical
    raise ValueError( error_text )


# Sample predicate strings covering the supported predicate types. They are parsed and printed when
# this file is run directly. Stray spaces and underscores in some entries are deliberate: they
# exercise the whitespace/underscore handling of parse_system_predicate.
examples = [
    "system:everything",
    "system:inbox  ",
    "system:archive ",
    "system:has duration",
    "system:has_duration",
    "   system:no_duration",
    "system:no duration",
    "system:is the best quality file  of its group",
    "system:isn't the best quality file of its duplicate group",
    "system:has_audio",
    "system:no audio",
    "system:has icc profile",
    "system:no icc profile",
    "system:has tags",
    "system:no tags",
    "system:untagged",
    "system:number of tags > 5",
    "system:number of tags ~= 10",
    "system:number of tags > 0  ",
    "system:number of words < 2",
    "system:height = 600px",
    "system:height is 800",
    "system:height > 900",
    "system:width < 200",
    "system:width > 1000 pixels",
    "system:filesize ~= 50 kilobytes",
    "system:filesize > 10megabytes",
    "system:file size    < 1 GB",
    "system:file size > 0 B",
    "system:similar to abcdef1 abcdef2 abcdef3, abcdef4 with distance 3",
    "system:similar to abcdef distance 5",
    "system:limit is 5000",
    "system:limit = 100",
    # The filetype examples are disabled. FILETYPES starts empty and is only filled by
    # InitialiseFiletypes -- presumably that is why; TODO confirm.
    #"system:filetype is jpeg",
    #"system:filetype =   image/jpg, image/png, apng",
    "system:hash = abcdef1 abcdef2 abcdef3",
    "system:hash = abcdef1 abcdef, abcdef4 md5",
    "system:archived date < 7  years 45 days 70h",
    "system:modified date < 7  years 45 days 70h",
    "system:modified date > 2011-06-04",
    "system:date modified > 7 years 2    months",
    "system:date modified < 1 day",
    "system:date modified < 0 years 1 month 1 day 1 hour",
    "system:time_imported < 7 years 45 days 70h",
    "system:time imported > 2011-06-04",
    "system:time imported > 7 years 2 months",
    "system:time imported < 1 day",
    "system:time imported < 0 years 1 month 1 day 1 hour",
    " system:time imported ~= 2011-1-3 ",
    "system:import time < 7 years 45 days 70h",
    "system:import time > 2011-06-04",
    "system:import time > 7 years 2 months",
    "system:import time < 1 day",
    "system:import time = 1 day",
    "system:import time < 0 years 1 month 1 day 1 hour",
    " system:import time ~= 2011-1-3 ",
    "system:import time ~= 1996-05-2",
    "system:duration < 5 seconds",
    "system:duration ~= 5 sec 6000 msecs",
    "system:duration > 3 milliseconds",
    "system:framerate > 60fps",
    "system:number of frames > 6000",
    "system:file service is pending to my files",
    "   system:file service currently in my files",
    "system:file service isn't currently in my files",
    "system:file service is not pending to my files",
    "system:num file relationships < 3 alternates",
    "system:number of file relationships > 3 false positives",
    "system:ratio is wider than 16:9        ",
    "system:ratio is 16:9",
    "system:ratio taller than 1:1",
    "system:num pixels > 50 px",
    "system:num pixels < 1 megapixels ",
    "system:num pixels ~= 5 kilopixel",
    "system:media views ~= 10",
    "system:all views > 0",
    "system:preview views < 10  ",
    "system:media viewtime < 1 days 1 hour 0 minutes",
    "system:all viewtime > 1 hours 100 seconds",
    "system:preview viewtime ~= 1 day 30 hours 100 minutes 90s",
    " system:has url matching regex reg.*ex ",
    "system:does not have a url matching regex test",
    "system:has_url https://test.test/",
    " system:doesn't have url test url here  ",
    "system:has domain test.com",
    "system:doesn't have domain test.com",
    "system:has a url with class safebooru file page",
    "system:doesn't have a url with url class safebooru file page ",
    "system:tag as number page < 5",
    "system:has notes",
    "system:no notes",
    "system:does not have notes",
    "system:num notes is 5",
    "system:num notes > 1",
    "system:has note with name note name",
    "system:no note with name note name",
    "system:does not have note with name note name"
]

if __name__ == "__main__":
    # Demo run: echo each sample predicate, then the tuple it parses to.
    # The sample is printed before parsing, so a failing sample is visible above its traceback.
    for sample in examples:
        print( sample )
        parsed = parse_system_predicate( sample )
        print( parsed )