hydrus/hydrus/external/LogicExpressionQueryParser.py

328 lines
13 KiB
Python

#made by prkc for Hydrus Network
#Licensed under the same terms as Hydrus Network
"""
Accepted operators: not (!, -), and (&&), or (||), implies (=>), xor, xnor (iff, <=>), nand, nor.
Parentheses work the usual way. \ can be used to escape characters (eg. to search for tags including parentheses)
The usual precedence rules apply.
ValueErrors are thrown with a message on syntax/parser errors.
Some test inputs:
a or b
a OR b
a and b
not a
a implies b
a xor b
a nor b
a nand b
a xnor b
(a && b) and not (a xor !b)
blah blah blah and another_long_tag_241245!
a_and_b
test!
!test
aaaaa_\(bbb ccc \(\)\) and not x
(a || b) and c and d and e or f and x or not (y or k or z and (h or i or j or t and f))
"""
import re
#Generates tokens for the parser. Consumes the input string.
#As opposed to most lexers it doesn't split on spaces.
#In fact, it tries to avoid splitting when possible by only splitting on logical operators or parentheses.
#Lowercase input is assumed.
#Contains some special handling for:
# * escapes with the \ character (escaping any character is valid). 'a \or b' is parsed as a single tag 'a or b'.
# * tags ending with ! or other special chars, which need no escaping. '!a' is the negation of 'a' but 'a!' is just a tag.
#Returns a token and the remaining (unconsumed) input
def next_token(src):
    """Split the next token off the front of *src*.

    Input is expected to be lowercase already (keywords are matched in
    lowercase only). Leading and trailing whitespace of *src* is ignored.

    Returns a pair ``(token, rest)`` where ``rest`` is the unconsumed input
    and ``token`` is a ``(type, value)`` tuple:
      * ``("end", None)`` when the input is exhausted,
      * ``("(", None)`` / ``(")", None)`` for parentheses,
      * ``(op, None)`` for an operator, with op one of
        not, and, or, implies, xor, nor, nand, iff ('xnor' is reported as 'iff'),
      * ``("tag", text)`` for a search term.

    Raises ValueError if a search term turns out to be empty.
    """
    def check_tag_end(rest):
        # A keyword ends the running tag only when it is a whole word:
        # whitespace before it and whitespace, '(' or end of input after it.
        # Without the trailing check, tags such as 'red orange' or
        # 'blue android' would be cut in the middle (' or...', ' and...').
        if re.match(r"\s(and|or|implies|xor|nor|nand|xnor|iff)(?=[\s\(]|$)", rest):
            return True
        # Symbolic operators and parentheses end a tag wherever they occur.
        if re.match(r"&&|\|\||=>|<=>|\)|\(", rest):
            return True
        return False

    src = src.strip()
    if len(src) == 0:
        return ("end", None), ""

    # A leading backslash means the next character is literal tag text,
    # so operator recognition is skipped entirely for this token.
    escape = False
    if src[0] == '\\' and len(src) > 1:
        escape = True
        src = src[1:]

    if not escape:
        if src.startswith(("!", "-")):
            return ("not", None), src[1:]
        if src.startswith("&&"):
            return ("and", None), src[2:]
        if src.startswith("||"):
            return ("or", None), src[2:]
        if src.startswith("=>"):
            return ("implies", None), src[2:]
        if src.startswith("<=>"):
            return ("iff", None), src[3:]
        if src.startswith("("):
            return ("(", None), src[1:]
        if src.startswith(")"):
            return (")", None), src[1:]
        # Word operators must be followed by whitespace or '(' to count.
        m = re.match(r"(not|and|or|implies|xor|nor|nand|xnor|iff)[\s\(]", src)
        if m:
            kw = m.group(1)
            return (kw if kw != "xnor" else "iff", None), src[len(kw):]

    # Everything else is tag text; collect characters until a tag end is seen.
    pieces = []
    if escape:
        pieces.append(src[0])
        src = src[1:]
    while len(src) > 0 and not check_tag_end(src):
        if len(src) > 1 and src[0] == '\\':
            # Escaped character: drop the backslash, keep the character.
            pieces.append(src[1])
            src = src[2:]
        else:
            pieces.append(src[0])
            src = src[1:]
    tag = "".join(pieces).strip()
    if len(tag) == 0:
        raise ValueError("Syntax error: empty search term")
    return ("tag", tag), src
#Roughly following conventional preferences, or C/C++ for rarely used operators
# Binding strength of every operator token type; a larger number binds tighter.
precedence_table = {
    "not": 10,
    "and": 9,
    "or": 8,
    "nor": 7,
    "nand": 7,
    "xor": 6,
    "implies": 5,
    "iff": 4,
}


def precedence(token):
    """Return the precedence of an operator token.

    *token* is a ``(type, value)`` tuple; only its type is looked at.
    Raises ValueError when the type is not listed in ``precedence_table``.
    """
    kind = token[0]
    rank = precedence_table.get(kind)
    if rank is None:
        raise ValueError("Syntax error: '{}' is not an operator".format(kind))
    return rank
#A simple class representing a node in a logical expression tree
class Node:
    """A node in a logical expression tree.

    ``op`` is the operator name (e.g. "and", "or", "not") and ``children``
    is the list of operands, each either another Node or a plain string.
    """

    def __init__(self, op, children=None):
        """Create a node for *op* holding a private copy of *children*.

        *children* may be any iterable (or omitted for no operands); it is
        copied into a new list so later changes to the caller's sequence do
        not affect this node.
        """
        self.op = op
        self.children = [] if children is None else list(children)

    def __str__(self):
        """Return a pretty, fully parenthesised form, for debug purposes.

        A "not" node renders only its first operand; any other node joins all
        operands with the operator name. A node without operands renders as
        "not ()" or as the empty string instead of raising.
        """
        if self.op == "not":
            operand = str(self.children[0]) if self.children else ""
            return "not ({})".format(operand)
        # str() leaves plain strings untouched and recurses into Node children.
        wrapped = ["(" + str(child) + ")" for child in self.children]
        return (" " + self.op + " ").join(wrapped)
#Parse a string into a logical expression tree
#First uses the shunting-yard algorithm to parse into reverse polish notation (RPN),
#then builds the tree from that
def parse(src):
    """Parse a query string into a logical expression tree.

    The input is lowercased, tokenised with next_token() and converted to
    reverse polish notation with the shunting-yard algorithm; the tree is
    then built from the RPN sequence. Leaves are plain strings, inner nodes
    are Node objects.

    Raises ValueError on mismatched parentheses, a negation placed after a
    tag or ')', empty input, or a wrong number of operands.
    """
    remaining = src.lower()
    output = []   # tokens in RPN order
    pending = []  # operator stack
    kind = "start"

    # Shunting-yard. Basic algorithm:
    # https://en.wikipedia.org/wiki/Shunting-yard_algorithm
    # Unary operators are treated as right associative with the highest
    # precedence:
    # https://stackoverflow.com/questions/1593080/how-can-i-modify-my-shunting-yard-algorithm-so-it-accepts-unary-operators
    # That alone would also accept a prefix operator used as postfix, which
    # is rejected explicitly below.
    while True:
        previous_kind = kind
        token, remaining = next_token(remaining)
        kind = token[0]
        if kind == "end":
            break
        if kind == "tag":
            output.append(token)
            continue
        if kind == "(":
            pending.append(token)
            continue
        if kind == ")":
            # Flush operators back to the matching opening parenthesis.
            while pending and pending[-1][0] != "(":
                output.append(pending.pop())
            if not pending:
                raise ValueError("Syntax error: mismatched parentheses")
            pending.pop()
            continue

        # Anything else is an operator.
        if kind == "not" and previous_kind in ("tag", ")"):
            raise ValueError("Syntax error: invalid negation")
        while pending and pending[-1][0] != "(":
            top_rank = precedence(pending[-1])
            rank = precedence(token)
            # Equal precedence pops too (left associativity), except for the
            # right-associative 'not'.
            should_pop = top_rank > rank or (top_rank == rank and pending[-1][0] != "not")
            if not should_pop:
                break
            output.append(pending.pop())
        pending.append(token)

    # Whatever is left on the operator stack goes to the output.
    while pending:
        leftover = pending.pop()
        if leftover[0] in ("(", ")"):
            raise ValueError("Syntax error: mismatched parentheses")
        output.append(leftover)

    if not output:
        raise ValueError("Empty input!")

    # Build the tree from the RPN sequence. Shunting-yard itself does not
    # verify operand counts, so that is checked here.
    operands = []
    for token_kind, token_value in output:
        if token_kind == "tag":
            operands.append(token_value)
        elif token_kind == "not":
            if not operands:
                raise ValueError("Syntax error: wrong number of arguments")
            operands.append(Node("not", [operands.pop()]))
        else:
            if len(operands) < 2:
                raise ValueError("Syntax error: wrong number of arguments")
            right = operands.pop()
            left = operands.pop()
            operands.append(Node(token_kind, [left, right]))

    # More than one value left means some operand was never consumed.
    if len(operands) != 1:
        raise ValueError("Parser error: unused values left in stack")
    return operands[0]
#Input is an expression tree
#Convert all logical operators to 'and', 'or' and 'not'
def convert_to_and_or_not(node):
    """Rewrite an expression tree so it only uses 'and', 'or' and 'not'.

    Leaves (anything without an ``op`` attribute) are returned unchanged.
    Every operator node is replaced by a newly built Node; the input tree is
    not modified.
    """
    if not hasattr(node, 'op'):
        return node

    rewrite = convert_to_and_or_not

    def inverted(subtree):
        # Negation of the rewritten subtree.
        return Node("not", [rewrite(subtree)])

    operator = node.op
    operands = node.children

    if operator == "implies":
        # a => b   becomes   !a || b
        return Node("or", [inverted(operands[0]), rewrite(operands[1])])
    if operator == "xor":
        # a xor b   becomes   (a && !b) || (!a && b)
        first_only = Node("and", [rewrite(operands[0]), inverted(operands[1])])
        second_only = Node("and", [inverted(operands[0]), rewrite(operands[1])])
        return Node("or", [first_only, second_only])
    if operator == "nor":
        # a nor b   becomes   !(a || b)
        return inverted(Node("or", operands))
    if operator == "nand":
        # a nand b   becomes   !(a && b)
        return inverted(Node("and", operands))
    if operator == "iff":
        # a iff b   becomes   (a && b) || (!a && !b)
        both = rewrite(Node("and", operands))
        neither = Node("and", [inverted(operands[0]), inverted(operands[1])])
        return Node("or", [both, neither])
    # Any other operator is kept as is; only its operands are rewritten.
    return Node(operator, [rewrite(child) for child in operands])
#Move negation inwards (downwards in the expr. tree) by using De Morgan's law,
#until they apply directly to a term
#Also eliminates double negations
def move_not_inwards(node):
    """Push negations down the tree until they sit directly on terms.

    Double negations are removed and De Morgan's laws are applied to a
    negated 'and' / 'or'. The law is applied to every operand of the negated
    node, so nodes with more than two children are handled without losing
    operands. Leaves are returned unchanged; operator nodes are rebuilt as
    new Node objects.
    """
    if not hasattr(node, 'op'):
        return node

    if node.op == "not" and hasattr(node.children[0], 'op'):
        inner = node.children[0]
        if inner.op == "not":
            # !!a -> a
            return move_not_inwards(inner.children[0])
        if inner.op in ("and", "or"):
            # !(a && b && ...) -> !a || !b || ...   and the dual for 'or'
            dual = "or" if inner.op == "and" else "and"
            return Node(dual, [move_not_inwards(Node("not", [operand])) for operand in inner.children])

    # Not a negation that can be pushed further: just recurse into operands.
    return Node(node.op, [move_not_inwards(child) for child in node.children])
#Use the distributive law to swap 'and's and 'or's so we get CNF
#Basically pushes 'or's downwards in the expression tree
def distribute_and_over_or(node):
    """Push 'or' nodes below 'and' nodes using the distributive law.

    Operands are processed first (replacing ``node.children`` in place).
    Then, for an 'or' node whose first or second operand is an 'and', the
    result is a new 'and' node built from the first two operands of that
    'and' and the other operand of the 'or':
    (A && B) || C  ->  (A || C) && (B || C). Only these two operand positions
    are inspected. Anything else is returned as is.
    """
    if not hasattr(node, 'op'):
        return node

    node.children = [distribute_and_over_or(child) for child in node.children]

    if node.op == 'or':
        # Try the left operand first, then the right one.
        for and_pos, other_pos in ((0, 1), (1, 0)):
            candidate = node.children[and_pos]
            if hasattr(candidate, 'op') and candidate.op == 'and':
                left = candidate.children[0]
                right = candidate.children[1]
                rest = node.children[other_pos]
                # The new 'or' nodes may again contain an 'and', so recurse.
                return Node("and", [
                    distribute_and_over_or(Node("or", [left, rest])),
                    distribute_and_over_or(Node("or", [right, rest])),
                ])
    return node
#Flatten the tree so that 'and'/'or's don't have 'and'/'or's as direct children
#or(or(a,b),c) -> or(a,b,c)
#After this step, nodes can have more than two children
def flatten_tree(node):
    """Merge nested 'and'/'or' nodes into their parent of the same kind.

    or(or(a, b), c) becomes or(a, b, c). The tree is modified in place
    (``children`` lists are replaced) and the same node object is returned.
    Leaves are returned unchanged.
    """
    if not hasattr(node, 'op'):
        return node

    # Flatten bottom-up so each child is already flat when it is merged.
    node.children = [flatten_tree(child) for child in node.children]

    if node.op in ('and', 'or'):
        merged = []
        for child in node.children:
            if hasattr(child, 'op') and child.op == node.op:
                # Same operator as the parent: adopt the grandchildren.
                merged.extend(child.children)
            else:
                merged.append(child)
        node.children = merged
    return node
#Convert the flattened tree to a list of sets of terms
#Do some basic simplification: removing tautological or repeating clauses
def convert_to_list_and_simplify(node):
    """Turn a flattened tree into a list of clauses (sets of term strings).

    A top-level 'and' yields one clause per operand, a top-level 'or' or
    'not' yields a single clause, and a bare leaf yields a one-element
    clause. Negated terms are written with a leading "-".

    Clauses containing both a term and its negation are dropped, and of
    equal clauses only the last occurrence is kept. If every clause was
    dropped as always-true, the last such clause is returned so the result
    is not empty.
    """
    def as_literal(term):
        # A node here is a negated term; a leaf is the term itself.
        return "-" + term.children[0] if hasattr(term, "op") else term

    clauses = []
    if not hasattr(node, 'op'):
        clauses.append({node})
    elif node.op == 'and':
        for child in node.children:
            if type(child) is str:
                clauses.append({child})
            elif child.op == 'not':
                clauses.append({"-" + child.children[0]})
            else:
                clauses.append({as_literal(term) for term in child.children})
    elif node.op == 'or':
        clauses.append({as_literal(term) for term in node.children})
    elif node.op == 'not':
        clauses.append({"-" + node.children[0]})

    # Filter out tautologies, remembering the last one seen.
    kept = []
    fallback = None
    for clause in clauses:
        if any("-" + term in clause for term in clause):
            fallback = clause
        else:
            kept.append(clause)

    # Remove repeating clauses; walking backwards keeps the last occurrence
    # of each and preserves the relative order of the survivors.
    seen = set()
    survivors = []
    for clause in reversed(kept):
        key = frozenset(clause)
        if key not in seen:
            seen.add(key)
            survivors.append(clause)
    survivors.reverse()

    # Do not return an empty result if all clauses were tautologies.
    if not survivors and fallback:
        survivors.append(fallback)
    return survivors
#This is the main function of this module that should be called from outside
def parse_logic_expression_query(input_str):
    """Parse a logic expression query into a list of clauses.

    Runs the whole pipeline: parse the string into a tree, reduce it to
    and/or/not, push negations inwards, distribute to get CNF, flatten, and
    finally convert to a simplified list of sets of terms (negated terms are
    prefixed with "-"). Errors raised by parse() propagate as ValueError.
    """
    tree = parse(input_str)
    tree = convert_to_and_or_not(tree)
    tree = move_not_inwards(tree)
    tree = distribute_and_over_or(tree)
    tree = flatten_tree(tree)
    return convert_to_list_and_simplify(tree)