From 4ed500f346974a1aa9896e3f3103a92148a57936 Mon Sep 17 00:00:00 2001 From: Kriukov Aleksandr Almazovich Date: Wed, 10 May 2023 03:27:56 +0300 Subject: [PATCH 1/2] add html rule type for finding siblings --- hydrus/client/ClientParsing.py | 41 +++++++++++++++---- .../gui/parsing/ClientGUIParsingFormulae.py | 6 ++- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/hydrus/client/ClientParsing.py b/hydrus/client/ClientParsing.py index c00d4d07..4ea91f18 100644 --- a/hydrus/client/ClientParsing.py +++ b/hydrus/client/ClientParsing.py @@ -1433,12 +1433,14 @@ HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIAL HTML_RULE_TYPE_DESCENDING = 0 HTML_RULE_TYPE_ASCENDING = 1 +HTML_RULE_TYPE_NEXT_SIBLINGS = 2 +HTML_RULE_TYPE_PREV_SIBLINGS = 3 class ParseRuleHTML( HydrusSerialisable.SerialisableBase ): SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_RULE_HTML SERIALISABLE_NAME = 'HTML Parsing Rule' - SERIALISABLE_VERSION = 2 + SERIALISABLE_VERSION = 3 def __init__( self, rule_type = None, tag_name = None, tag_attributes = None, tag_index = None, tag_depth = None, should_test_tag_string = False, tag_string_string_match = None ): @@ -1454,7 +1456,7 @@ class ParseRuleHTML( HydrusSerialisable.SerialisableBase ): - if rule_type == HTML_RULE_TYPE_DESCENDING: + if rule_type in [ HTML_RULE_TYPE_DESCENDING, HTML_RULE_TYPE_NEXT_SIBLINGS, HTML_RULE_TYPE_PREV_SIBLINGS ]: if tag_attributes is None: @@ -1512,7 +1514,11 @@ class ParseRuleHTML( HydrusSerialisable.SerialisableBase ): new_serialisable_info = ( rule_type, tag_name, tag_attributes, tag_index, tag_depth, should_test_tag_string, serialisable_tag_string_string_match ) - return ( 2, new_serialisable_info ) + return ( 3, new_serialisable_info ) + + elif version == 2: + + return (3, old_serialisable_info) @@ -1522,7 +1528,7 @@ class ParseRuleHTML( HydrusSerialisable.SerialisableBase ): for node in nodes: - if self._rule_type == HTML_RULE_TYPE_DESCENDING: + if self._rule_type in [ HTML_RULE_TYPE_DESCENDING, HTML_RULE_TYPE_NEXT_SIBLINGS, HTML_RULE_TYPE_PREV_SIBLINGS ]: # having class : [ 'a', 'b' ] works here, but it does OR not AND # instead do node.find_all( lambda tag: 'class' in tag.attrs and 'a' in tag[ 'class' ] and 'b' in tag[ 'class' ] ) @@ -1535,8 +1541,19 @@ class ParseRuleHTML( HydrusSerialisable.SerialisableBase ): kwargs[ 'name' ] = self._tag_name - found_nodes = node.find_all( **kwargs ) + if self._rule_type == HTML_RULE_TYPE_DESCENDING: + + found_nodes = node.find_all( **kwargs ) + + elif self._rule_type == HTML_RULE_TYPE_NEXT_SIBLINGS: + + found_nodes = node.find_next_siblings( **kwargs ) + + elif self._rule_type == HTML_RULE_TYPE_PREV_SIBLINGS: + + found_nodes = node.find_previous_siblings( **kwargs ) + if self._tag_index is not None: try: @@ -1611,9 +1628,19 @@ class ParseRuleHTML( HydrusSerialisable.SerialisableBase ): def ToString( self ): - if self._rule_type == HTML_RULE_TYPE_DESCENDING: + if self._rule_type in [ HTML_RULE_TYPE_DESCENDING, HTML_RULE_TYPE_NEXT_SIBLINGS, HTML_RULE_TYPE_PREV_SIBLINGS ]: - s = 'search descendants for' + if self._rule_type == HTML_RULE_TYPE_DESCENDING: + + s = 'search descendants for' + + elif self._rule_type == HTML_RULE_TYPE_NEXT_SIBLINGS: + + s = 'search next siblings for' + + elif self._rule_type == HTML_RULE_TYPE_PREV_SIBLINGS: + + s = 'search prev siblings for' if self._tag_index is None: diff --git a/hydrus/client/gui/parsing/ClientGUIParsingFormulae.py b/hydrus/client/gui/parsing/ClientGUIParsingFormulae.py index 43746589..3ffa2a18 100644 --- a/hydrus/client/gui/parsing/ClientGUIParsingFormulae.py +++ b/hydrus/client/gui/parsing/ClientGUIParsingFormulae.py @@ -574,6 +574,8 @@ class EditHTMLTagRulePanel( ClientGUIScrolledPanels.EditPanel ): self._rule_type.addItem( 'search descendants', ClientParsing.HTML_RULE_TYPE_DESCENDING ) self._rule_type.addItem( 'walk back up ancestors', ClientParsing.HTML_RULE_TYPE_ASCENDING ) + self._rule_type.addItem( 'search next siblings', ClientParsing.HTML_RULE_TYPE_NEXT_SIBLINGS ) + self._rule_type.addItem( 'search previous siblings', ClientParsing.HTML_RULE_TYPE_PREV_SIBLINGS ) self._tag_name = QW.QLineEdit( self ) @@ -663,7 +665,7 @@ class EditHTMLTagRulePanel( ClientGUIScrolledPanels.EditPanel ): rule_type = self._rule_type.GetValue() - if rule_type == ClientParsing.HTML_RULE_TYPE_DESCENDING: + if rule_type in [ ClientParsing.HTML_RULE_TYPE_DESCENDING, ClientParsing.HTML_RULE_TYPE_NEXT_SIBLINGS, ClientParsing.HTML_RULE_TYPE_PREV_SIBLINGS ]: self._tag_attributes.setEnabled( True ) self._tag_index.setEnabled( True ) @@ -719,7 +721,7 @@ class EditHTMLTagRulePanel( ClientGUIScrolledPanels.EditPanel ): should_test_tag_string = self._should_test_tag_string.isChecked() tag_string_string_match = self._tag_string_string_match.GetValue() - if rule_type == ClientParsing.HTML_RULE_TYPE_DESCENDING: + if rule_type in [ ClientParsing.HTML_RULE_TYPE_DESCENDING, ClientParsing.HTML_RULE_TYPE_NEXT_SIBLINGS, ClientParsing.HTML_RULE_TYPE_PREV_SIBLINGS ]: tag_attributes = self._tag_attributes.GetValue() tag_index = self._tag_index.GetValue() From 7233135f0341bdf866f9864e82211ff70faf65e2 Mon Sep 17 00:00:00 2001 From: Kriukov Aleksandr Almazovich Date: Wed, 10 May 2023 03:27:56 +0300 Subject: [PATCH 2/2] add html rule type for finding siblings --- hydrus/client/ClientParsing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hydrus/client/ClientParsing.py b/hydrus/client/ClientParsing.py index 4ea91f18..f487fa33 100644 --- a/hydrus/client/ClientParsing.py +++ b/hydrus/client/ClientParsing.py @@ -1463,6 +1463,7 @@ class ParseRuleHTML( HydrusSerialisable.SerialisableBase ): tag_attributes = {} + elif rule_type == HTML_RULE_TYPE_ASCENDING: if tag_depth is None: @@ -1518,7 +1519,7 @@ class ParseRuleHTML( HydrusSerialisable.SerialisableBase ): elif version == 2: - return (3, old_serialisable_info) + return ( 3, old_serialisable_info )