118 lines
3.0 KiB
Python
118 lines
3.0 KiB
Python
import bs4
|
|
import HydrusSerialisable
|
|
|
|
class ParseFormula( HydrusSerialisable.SerialisableBase ):
|
|
|
|
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_HTML_PARSE_FORMULA
|
|
SERIALISABLE_VERSION = 1
|
|
|
|
def __init__( self ):
|
|
|
|
self._tag_rules = []
|
|
|
|
self._content_rule = None
|
|
|
|
|
|
def _GetSerialisableInfo( self ):
|
|
|
|
return ( self._tag_rules, self._content_rule )
|
|
|
|
|
|
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
|
|
|
|
( self._tag_rules, self._content_rule ) = serialisable_info
|
|
|
|
|
|
def _ParseContent( self, root ):
|
|
|
|
if self._content_rule is None:
|
|
|
|
return root.string
|
|
|
|
else:
|
|
|
|
return root[ self._content_rule ]
|
|
|
|
|
|
|
|
def _ParseTags( self, root, name, attrs, index ):
|
|
|
|
results = root.find_all( name = name, attrs = attrs )
|
|
|
|
if index is not None:
|
|
|
|
try:
|
|
|
|
results = ( results[index], )
|
|
|
|
except IndexError:
|
|
|
|
text = 'Trying to parse ' + name + ' tags '
|
|
if len( attrs ) > 0: text += 'with attrs ' + str( attrs ) + ' '
|
|
text += 'failed because index ' + str( index ) + ' was requested, but only ' + str( len( results ) ) + ' tags were found.'
|
|
|
|
raise IndexError( text )
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
|
def IsValid( self ):
|
|
|
|
return len( self._tag_rules ) > 0 and self._content_rule is not None
|
|
|
|
|
|
def PopTagsRule( self ):
|
|
|
|
self._tag_rules.pop()
|
|
|
|
|
|
def PushTagsRule( self, name = None, attrs = {}, index = None ):
|
|
|
|
self._tag_rules.append( ( name, attrs, index ) )
|
|
|
|
|
|
def Duplicate( self ):
|
|
|
|
new_formula = ParseFormula()
|
|
|
|
for ( name, attrs, index ) in self._tag_rules:
|
|
|
|
new_formula.PushTagsRule( name, dict( attrs ), index )
|
|
|
|
|
|
new_formula.SetContentRule( self._content_rule )
|
|
|
|
return new_formula
|
|
|
|
|
|
def Parse( self, html ):
|
|
|
|
root = bs4.BeautifulSoup( html, 'lxml' )
|
|
|
|
roots = ( root, )
|
|
|
|
for ( name, attrs, index ) in self._tag_rules:
|
|
|
|
next_roots = []
|
|
|
|
for root in roots:
|
|
|
|
next_roots.extend( self._ParseTags( root, name, attrs, index ) )
|
|
|
|
|
|
roots = next_roots
|
|
|
|
|
|
contents = [ self._ParseContent( root ) for root in roots ]
|
|
|
|
return contents
|
|
|
|
|
|
def SetContentRule( self, attr = None ):
|
|
|
|
self._content_rule = attr
|
|
|
|
|
|
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_HTML_PARSE_FORMULA ] = ParseFormula |