hydrus/hydrus/external/LogicExpressionQueryParser.py

#made by prkc for Hydrus Network
#Licensed under the same terms as Hydrus Network

"""
Accepted operators: not (!, -), and (&&), or (||), implies (=>), xor, xnor (iff, <=>), nand, nor.
Parentheses work the usual way. \ can be used to escape characters (eg. to search for tags including parentheses)
The usual precedence rules apply.
ValueErrors are thrown with a message on syntax/parser errors.

Some test inputs:
a or b
a OR b
a and b
not a
a implies b
a xor b
a nor b
a nand b
a xnor b
(a && b) and not (a xor !b)
blah blah blah and another_long_tag_241245!
a_and_b
test!
!test
aaaaa_\(bbb ccc \(\)\) and not x
(a || b) and c and d and e or f and x or not (y or k or z and (h or i or j or t and f))
"""

import re

#Generates tokens for the parser. Consumes the input string.
#As opposed to most lexers it doesn't split on spaces.
#In fact, it tries to avoid splitting when possible by only splitting on logical operators or parentheses.
#Lowercase input is assumed.
#Contains some special handling for:
# * escapes with the \ character (escaping any character is valid). 'a \or b' is parsed as a single tag 'a or b'.
# * tags ending with ! and other special chars, which need no escaping. '!a' is negation of 'a' but 'a!' is just a tag.
#Returns a token and the remaining (unconsumed) input
def next_token(src):
    """Split the next token off the front of src.

    Returns a (token, rest) pair. token is a (type, value) tuple:
      * ("tag", text) for a search term,
      * ("end", None) once nothing but whitespace is left,
      * (operator, None) otherwise, where operator is one of "not", "and",
        "or", "implies", "xor", "nor", "nand", "iff", "(" or ")".
        Both "xnor" and "<=>" are reported as "iff".
    rest is the input that has not been consumed yet.

    Raises ValueError if the term that was read is empty after stripping
    whitespace (e.g. an escaped space directly followed by an operator).
    """
    def check_tag_end(rest):
        # A keyword only terminates the tag when it stands alone as a word, i.e. it is
        # followed by whitespace, "(" or the end of the input. Without that requirement
        # a tag such as "red orange" would be cut in two at " or".
        if re.match(r"\s(and|or|implies|xor|nor|nand|xnor|iff)(?=[\s\(]|$)", rest): return True
        if re.match(r"&&|\|\||=>|<=>|\)|\(", rest): return True
        return False

    src = src.strip()
    if not src: return ("end", None), ""

    #A leading backslash means the first character belongs to a tag, whatever it is
    escape = False
    if src[0] == '\\' and len(src) > 1:
        escape = True
        src = src[1:]

    if not escape:
        if src.startswith(("!","-")):
            return ("not", None), src[1:]
        if src.startswith("&&"):
            return ("and", None), src[2:]
        if src.startswith("||"):
            return ("or", None), src[2:]
        if src.startswith("=>"):
            return ("implies", None), src[2:]
        if src.startswith("<=>"):
            return ("iff", None), src[3:]
        if src.startswith("("):
            return ("(", None), src[1:]
        if src.startswith(")"):
            return (")", None), src[1:]

        #Keyword operators must be followed by whitespace or "(" so that tags which
        #merely start with a keyword (e.g. "android") are not mistaken for operators
        m = re.match(r"(not|and|or|implies|xor|nor|nand|xnor|iff)[\s\(]", src)
        if m:
            kw = m.group(1)
            return (kw if kw != "xnor" else "iff", None), src[len(kw):]

    #Anything else is a tag: consume characters until an operator or parenthesis starts
    tag = ""
    if escape:
        tag += src[0]
        src = src[1:]
    while len(src) > 0 and not check_tag_end(src):
        if len(src) > 1 and src[0] == '\\':
            #Escaped character: drop the backslash, keep the next character verbatim
            tag += src[1]
            src = src[2:]
        else:
            tag += src[0]
            src = src[1:]
    tag = tag.strip()
    if len(tag) == 0:
        raise ValueError("Syntax error: empty search term")
    return ("tag", tag), src

#Roughly following conventional preferences, or C/C++ for rarely used operators
precedence_table = {
    "not": 10,
    "and": 9,
    "or": 8,
    "nor": 7,
    "nand": 7,
    "xor": 6,
    "implies": 5,
    "iff": 4,
}

def precedence(token):
    """Return the binding strength of an operator token (higher binds tighter).

    token is a (type, value) tuple; only the type is looked at.
    Raises ValueError if the type is not a key of precedence_table.
    """
    kind = token[0]
    level = precedence_table.get(kind)
    if level is None:
        raise ValueError("Syntax error: '{}' is not an operator".format(kind))
    return level

#A simple class representing a node in a logical expression tree
class Node:
    """An operator node of a logical expression tree.

    op is the operator name (e.g. "not", "and", "or"). children holds the
    operands, each either another Node or a plain string (a tag). The children
    passed in are copied into a new list, so the caller's sequence is never
    shared with the node.
    """
    def __init__(self, op, children = None):
        self.op = op
        self.children = [] if children is None else list(children)
    def __str__(self): #pretty string form, for debug purposes
        if self.op == "not":
            return "not ({})".format(self.children[0])
        #Every operand is parenthesised so the output is unambiguous without precedence rules
        return " {} ".format(self.op).join("({})".format(child) for child in self.children)

#Parse a string into a logical expression tree
#First uses the shunting-yard algorithm to parse into reverse polish notation (RPN),
#then builds the tree from that
def parse(src):
    """Parse a query string into a logical expression tree.

    The input is lowercased, tokenised with next_token(), converted into reverse
    polish notation (RPN) with the shunting-yard algorithm and then folded into a
    tree of Node objects. Tags are the leaves and stay plain strings, so a query
    that consists of a single tag returns just that string.

    Raises ValueError on empty input and on any syntax error (misplaced operator
    or operand, mismatched parentheses, missing argument).
    """
    src = src.lower()
    prev_tok_type = "start"
    tok_type = "start"
    rpn_result = []
    operator_stack = []
    #Parse into reverse polish notation using the shunting-yard algorithm
    #Basic algorithm:
    #https://en.wikipedia.org/wiki/Shunting-yard_algorithm
    #Handling of unary operators:
    #https://stackoverflow.com/questions/1593080/how-can-i-modify-my-shunting-yard-algorithm-so-it-accepts-unary-operators
    #tl;dr - make unary operators right associative and higher precedence than any infix operator
    #
    #Shunting-yard on its own does not care where a token appears, so it would also accept
    #prefix or postfix notation such as "a (&& b)" or "(a)(b) &&". To reject those, every token
    #is checked against the previous one: after a tag or ")" only an infix operator or ")" may
    #follow, everywhere else only a tag, "(" or a negation may follow.
    while True:
        prev_tok_type = tok_type
        token, src = next_token(src)
        tok_type = token[0]
        if tok_type == "end":
            break
        after_operand = prev_tok_type in ("tag", ")")
        if tok_type == "tag":
            if after_operand:
                raise ValueError("Syntax error: missing operator before '{}'".format(token[1]))
            rpn_result.append(token)
        elif tok_type == "(":
            if after_operand:
                raise ValueError("Syntax error: missing operator before '('")
            operator_stack.append(token)
        elif tok_type == ")":
            while operator_stack and operator_stack[-1][0] != "(":
                rpn_result.append(operator_stack.pop())
            if not operator_stack:
                raise ValueError("Syntax error: mismatched parentheses")
            if prev_tok_type == "(":
                raise ValueError("Syntax error: empty parentheses")
            if not after_operand:
                raise ValueError("Syntax error: missing operand before ')'")
            operator_stack.pop() #discard the matching "("
        else:
            if tok_type == "not":
                if after_operand:
                    raise ValueError("Syntax error: invalid negation")
            elif not after_operand:
                raise ValueError("Syntax error: missing operand before '{}'".format(tok_type))
            #Pop operators that bind tighter, or equally tight (infix operators are left associative).
            #"not" is right associative, so an equal-precedence "not" stays on the stack.
            while operator_stack and operator_stack[-1][0] != "(" and \
                    (precedence(operator_stack[-1]) > precedence(token) or (precedence(operator_stack[-1]) == precedence(token) and operator_stack[-1][0] != "not")):
                rpn_result.append(operator_stack.pop())
            operator_stack.append(token)

    #Flush the remaining operators; a leftover "(" was never closed
    while operator_stack:
        if operator_stack[-1][0] in ["(", ")"]:
            raise ValueError("Syntax error: mismatched parentheses")
        rpn_result.append(operator_stack.pop())

    if len(rpn_result) == 0:
        raise ValueError("Empty input!")

    #Convert RPN into a tree
    #The original shunting-yard algorithm doesn't check for wrong number of arguments so also check that here
    #(this is what catches a trailing operator such as "a &&")
    stack = []
    for rpn_type, rpn_val in rpn_result:
        if rpn_type == "tag":
            stack.append(rpn_val)
        elif rpn_type == "not":
            if len(stack) == 0:
                raise ValueError("Syntax error: wrong number of arguments")
            stack.append(Node("not", [stack.pop()]))
        else:
            if len(stack) < 2:
                raise ValueError("Syntax error: wrong number of arguments")
            right = stack.pop()
            left = stack.pop()
            stack.append(Node(rpn_type, [left, right]))

    #Safety net: anything but exactly one value left means the expression was malformed
    if len(stack) != 1:
        raise ValueError("Parser error: unused values left in stack")

    return stack[0]

#Input is an expression tree
#Convert all logical operators to 'and', 'or' and 'not'
def convert_to_and_or_not(node):
    """Return an equivalent tree that only uses 'and', 'or' and 'not'.

    Leaves (anything without an 'op' attribute) are returned unchanged. Every
    operator node is rebuilt as a new Node; 'implies', 'xor', 'nor', 'nand' and
    'iff' are expanded, any other operator keeps its name and gets converted
    children.
    """
    if not hasattr(node, 'op'):
        return node

    def negated(sub):
        #not(<converted sub>)
        return Node("not", [convert_to_and_or_not(sub)])

    kind = node.op
    kids = node.children

    if kind == "implies":
        #a => b  ==  !a || b
        return Node("or", [negated(kids[0]), convert_to_and_or_not(kids[1])])

    if kind == "xor":
        #a xor b  ==  (a && !b) || (!a && b)
        only_first = Node("and", [convert_to_and_or_not(kids[0]), negated(kids[1])])
        only_second = Node("and", [negated(kids[0]), convert_to_and_or_not(kids[1])])
        return Node("or", [only_first, only_second])

    if kind == "nor":
        #a nor b  ==  !(a || b)
        return negated(Node("or", kids))

    if kind == "nand":
        #a nand b  ==  !(a && b)
        return negated(Node("and", kids))

    if kind == "iff":
        #a iff b  ==  (a && b) || (!a && !b)
        both = convert_to_and_or_not(Node("and", kids))
        neither = Node("and", [negated(kids[0]), negated(kids[1])])
        return Node("or", [both, neither])

    return Node(kind, [convert_to_and_or_not(child) for child in kids])

#Move negation inwards (downwards in the expr. tree) by using De Morgan's law,
#until they apply directly to a term
#Also eliminates double negations
def move_not_inwards(node):
    """Return a tree in which negations of 'not', 'and' and 'or' nodes are resolved.

    not(not(x)) becomes x, not(and(a, b)) becomes or(not a, not b) and
    not(or(a, b)) becomes and(not a, not b), applied recursively. Leaves are
    returned unchanged; every other node is rebuilt with processed children.
    """
    if not hasattr(node, 'op'):
        return node

    if node.op == "not":
        inner = node.children[0]
        if hasattr(inner, 'op'):
            if inner.op == "not":
                #double negation cancels out
                return move_not_inwards(inner.children[0])
            #De Morgan: negating an 'and' gives an 'or' of negations and vice versa
            flipped = {"and": "or", "or": "and"}.get(inner.op)
            if flipped is not None:
                pushed = [move_not_inwards(Node("not", [inner.children[idx]])) for idx in (0, 1)]
                return Node(flipped, pushed)

    return Node(node.op, [move_not_inwards(child) for child in node.children])

#Use the distributive law to swap 'and's and 'or's so we get CNF
#Basically pushes 'or's downwards in the expression tree
def distribute_and_over_or(node):
    """Rewrite 'or' nodes that have an 'and' operand into an 'and' of 'or's.

    Leaves are returned unchanged. For operator nodes the children are processed
    first and stored back on the node (the node is modified in place). An 'or'
    whose first or second child is an 'and' is then replaced by a new 'and' node;
    any other node is returned as is.
    """
    if not hasattr(node, 'op'):
        return node

    node.children = [distribute_and_over_or(child) for child in node.children]
    if node.op != 'or':
        return node

    def is_and(candidate):
        return hasattr(candidate, 'op') and candidate.op == 'and'

    if is_and(node.children[0]):
        #(A && B) || C
        conjunction, other_index = node.children[0], 1
    elif is_and(node.children[1]):
        #C || (A && B)
        conjunction, other_index = node.children[1], 0
    else:
        return node

    #Either way the result is (A || C) && (B || C); the new 'or's may need distributing again
    a = conjunction.children[0]
    b = conjunction.children[1]
    c = node.children[other_index]
    left = distribute_and_over_or(Node("or", [a, c]))
    right = distribute_and_over_or(Node("or", [b, c]))
    return Node("and", [left, right])

#Flatten the tree so that 'and'/'or's don't have 'and'/'or's as direct children
#or(or(a,b),c) -> or(a,b,c)
#After this step, nodes can have more than two children
def flatten_tree(node):
    """Merge nested 'and' nodes into their 'and' parent and nested 'or' nodes into their 'or' parent.

    Works in place: the children lists of the operator nodes are replaced and the
    same node object is returned. Leaves are returned unchanged.
    """
    if not hasattr(node, 'op'):
        return node

    #Flatten bottom-up so that a child of the same kind is already flat when it gets merged
    node.children = [flatten_tree(child) for child in node.children]

    if node.op in ('and', 'or'):
        merged = []
        for child in node.children:
            if hasattr(child, 'op') and child.op == node.op:
                #same operator as the parent: lift the grandchildren up one level
                merged.extend(child.children)
            else:
                merged.append(child)
        node.children = merged

    return node

#Convert the flattened tree to a list of sets of terms
#Do some basic simplification: removing tautological or repeating clauses
def convert_to_list_and_simplify(node):
    """Turn a flattened tree into a list of clauses, each clause being a set of terms.

    A negated term is written as the term prefixed with "-". Clauses that contain
    both a term and its negation are dropped, and of several equal clauses only
    the last one is kept. If every clause was dropped as a tautology, the last
    such clause is returned on its own instead of an empty list.
    """
    def as_term(item):
        #inside a clause an operator node is read as a negation of its first child
        return "-"+item.children[0] if hasattr(item, "op") else item

    clauses = []
    if not hasattr(node, 'op'):
        clauses.append({node})
    elif node.op == 'and':
        for child in node.children:
            if type(child) == str:
                clauses.append({child})
            elif child.op == 'not':
                clauses.append({"-"+child.children[0]})
            else:
                clauses.append({as_term(item) for item in child.children})
    elif node.op == 'or':
        clauses.append({as_term(item) for item in node.children})
    elif node.op == 'not':
        clauses.append({"-"+node.children[0]})

    #Filter out tautologies, remembering the last one in case nothing else is left
    kept = []
    last_tautology = None
    for clause in clauses:
        if any("-"+term in clause for term in clause):
            last_tautology = clause
        else:
            kept.append(clause)

    #Remove repeating clauses: a clause survives only if no equal clause follows it
    unique = [clause for position, clause in enumerate(kept) if clause not in kept[position+1:]]

    #Do not return empty if all clauses are tautologies, return a single clause instead
    if not unique and last_tautology:
        unique.append(last_tautology)
    return unique

#This is the main function of this module that should be called from outside
def parse_logic_expression_query(input_str):
    """Parse input_str and return it in conjunctive normal form as a list of sets of terms.

    Runs the whole pipeline: parse, reduce to and/or/not, push negations inwards,
    distribute, flatten, then convert to the simplified clause list.
    """
    tree = parse(input_str)
    tree = convert_to_and_or_not(tree)
    tree = move_not_inwards(tree)
    tree = distribute_and_over_or(tree)
    tree = flatten_tree(tree)
    return convert_to_list_and_simplify(tree)