hydrus/include/HydrusHTMLParsing.py

118 lines
3.0 KiB
Python

import bs4
import HydrusSerialisable
class ParseFormula( HydrusSerialisable.SerialisableBase ):
    """Serialisable recipe for pulling values out of an HTML document.

    A formula holds an ordered list of tag rules plus one content rule.

    Each tag rule is a ( name, attrs, index ) tuple. Parse() applies the
    rules in order: every rule runs find_all( name = name, attrs = attrs )
    beneath each tag matched by the previous rule, optionally keeping only
    the match at position index.

    The content rule decides what is read from every tag that remains:
    None reads the tag's .string, anything else is used as the key in
    tag[ key ].
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_HTML_PARSE_FORMULA
    SERIALISABLE_VERSION = 1
    
    def __init__( self ):
        """Create an empty formula: no tag rules and no content rule."""
        
        self._tag_rules = []
        self._content_rule = None
        
    
    def _GetSerialisableInfo( self ):
        """Return the state to persist as ( tag_rules, content_rule )."""
        
        return ( self._tag_rules, self._content_rule )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore state from a ( tag_rules, content_rule ) pair."""
        
        ( self._tag_rules, self._content_rule ) = serialisable_info
        
    
    def _ParseContent( self, root ):
        """Extract the configured content from a single matched tag.

        Returns root.string when no content rule is set, otherwise
        root[ content_rule ]. Any exception raised by that item access
        (e.g. the key being absent) propagates to the caller.
        """
        
        if self._content_rule is None:
            
            return root.string
            
        
        return root[ self._content_rule ]
        
    
    def _ParseTags( self, root, name, attrs, index ):
        """Apply one tag rule beneath root.

        Returns everything find_all matched when index is None, otherwise a
        one-element tuple holding the match at that position.

        Raises IndexError, with a message describing the rule, when index is
        outside the range of matches.
        """
        
        results = root.find_all( name = name, attrs = attrs )
        
        if index is None:
            
            return results
            
        
        try:
            
            return ( results[ index ], )
            
        except IndexError:
            
            # name defaults to None in PushTagsRule and need not be a string,
            # so it cannot simply be concatenated into the message. Using
            # %-formatting keeps string names byte-for-byte as before.
            if name is None:
                
                name_text = 'any'
                
            else:
                
                name_text = '%s' % ( name, )
                
            
            text = 'Trying to parse ' + name_text + ' tags '
            
            # Truthiness rather than len() so that attrs of None cannot mask
            # the real error with a TypeError.
            if attrs:
                
                text += 'with attrs ' + str( attrs ) + ' '
                
            
            text += 'failed because index ' + str( index ) + ' was requested, but only ' + str( len( results ) ) + ' tags were found.'
            
            raise IndexError( text )
            
        
    
    def IsValid( self ):
        """Return True when at least one tag rule and a content rule are set."""
        
        return len( self._tag_rules ) > 0 and self._content_rule is not None
        
    
    def PopTagsRule( self ):
        """Remove the most recently pushed tag rule.

        Raises IndexError (from list.pop) when there are no rules.
        """
        
        self._tag_rules.pop()
        
    
    def PushTagsRule( self, name = None, attrs = None, index = None ):
        """Append a ( name, attrs, index ) tag rule to the end of the formula.

        name and attrs are passed to find_all as-is when parsing. attrs left
        as None is stored as a fresh empty dict. index, when not None, keeps
        only the match at that position.
        """
        
        # A literal {} default would be created once and shared by every rule
        # pushed without attrs, so mutating one rule's attrs would silently
        # alter the others (and the default itself).
        if attrs is None:
            
            attrs = {}
            
        
        self._tag_rules.append( ( name, attrs, index ) )
        
    
    def Duplicate( self ):
        """Return a new ParseFormula with the same rules.

        Each rule's attrs dict is shallow-copied so the two formulas do not
        share those dict objects.
        """
        
        new_formula = ParseFormula()
        
        for ( name, attrs, index ) in self._tag_rules:
            
            new_formula.PushTagsRule( name, dict( attrs ), index )
            
        
        new_formula.SetContentRule( self._content_rule )
        
        return new_formula
        
    
    def Parse( self, html ):
        """Run the formula over html and return the list of extracted values.

        html is parsed with BeautifulSoup's 'lxml' parser. With no tag rules
        the content rule is applied to the whole parsed document.

        Raises IndexError (see _ParseTags) when an indexed rule finds too few
        matches beneath any of the current tags.
        """
        
        soup = bs4.BeautifulSoup( html, 'lxml' )
        
        roots = ( soup, )
        
        for ( name, attrs, index ) in self._tag_rules:
            
            next_roots = []
            
            for current_root in roots:
                
                next_roots.extend( self._ParseTags( current_root, name, attrs, index ) )
                
            
            roots = next_roots
            
        
        return [ self._ParseContent( tag ) for tag in roots ]
        
    
    def SetContentRule( self, attr = None ):
        """Set the content rule; None selects the tag's .string."""
        
        self._content_rule = attr
        
    
# Register ParseFormula in HydrusSerialisable's type -> class mapping under its
# serialisable type constant. Runs at import time.
# NOTE(review): presumably this is how the serialisation layer finds the class
# to instantiate when loading a stored formula -- confirm in HydrusSerialisable.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_HTML_PARSE_FORMULA ] = ParseFormula