2016-07-20 19:57:10 +00:00
|
|
|
import bs4
|
2016-11-02 21:09:14 +00:00
|
|
|
import ClientNetworking
|
2016-09-21 19:54:04 +00:00
|
|
|
import HydrusConstants as HC
|
2016-09-07 20:01:05 +00:00
|
|
|
import HydrusData
|
2016-07-20 19:57:10 +00:00
|
|
|
import HydrusSerialisable
|
2016-10-26 20:45:34 +00:00
|
|
|
import HydrusTags
|
2016-10-19 20:02:56 +00:00
|
|
|
import os
|
2016-11-02 21:09:14 +00:00
|
|
|
import urlparse
|
2016-07-20 19:57:10 +00:00
|
|
|
|
2016-11-02 21:09:14 +00:00
|
|
|
def ChildHasDesiredContent( child, desired_content ):
    """Tell whether a parsing node can produce anything the caller asked for.

    desired_content is either the string 'all', which accepts every child, or
    a collection of content descriptions. In the second case the child is
    accepted when the set returned by child.GetParsableContent() shares at
    least one element with it.
    """
    
    if desired_content == 'all':
        
        return True
        
    
    overlap = child.GetParsableContent().intersection( desired_content )
    
    return len( overlap ) > 0
|
|
|
|
|
2016-10-26 20:45:34 +00:00
|
|
|
def ConvertContentResultToPrettyString( result ):
    """Render one parsed content result as a short human-readable string.

    result is a ( ( name, content_type, additional_info ), parsed_text ) pair.

    - HC.CONTENT_TYPE_MAPPINGS gives 'tag: ' followed by whatever
      HydrusTags.CombineTag( additional_info, parsed_text ) returns.
    - HC.CONTENT_TYPE_VETO gives the fixed string 'veto'.

    Raises NotImplementedError for any other content type.
    """
    
    ( content_description, parsed_text ) = result
    
    ( name, content_type, additional_info ) = content_description
    
    if content_type == HC.CONTENT_TYPE_MAPPINGS:
        
        combined_tag = HydrusTags.CombineTag( additional_info, parsed_text )
        
        return 'tag: ' + combined_tag
        
    
    if content_type == HC.CONTENT_TYPE_VETO:
        
        return 'veto'
        
    
    # no pretty form has been written for this content type
    raise NotImplementedError()
|
|
|
|
|
2016-11-02 21:09:14 +00:00
|
|
|
def ConvertParsableContentToPrettyString( parsable_content, include_veto = False ):
    """Summarise a collection of content descriptions as one readable string.

    parsable_content is an iterable of ( name, content_type, additional_info )
    triples. They are grouped by content type with
    HydrusData.BuildKeyToSetDict and each group contributes one fragment:

    - mappings: 'tags: ' plus the comma-separated additional_info values,
      with the empty string shown last as 'unnamespaced'
    - veto: 'veto', but only when include_veto is true

    The fragments are joined with ', '. If there are none, 'nothing' is
    returned.
    """
    
    type_and_info_pairs = ( ( content_type, additional_infos ) for ( name, content_type, additional_infos ) in parsable_content )
    
    content_type_to_additional_infos = HydrusData.BuildKeyToSetDict( type_and_info_pairs )
    
    fragments = []
    
    for ( content_type, additional_infos ) in content_type_to_additional_infos.items():
        
        if content_type == HC.CONTENT_TYPE_MAPPINGS:
            
            namespaces = []
            
            has_unnamespaced = False
            
            for namespace in additional_infos:
                
                if namespace == '':
                    
                    has_unnamespaced = True
                    
                else:
                    
                    namespaces.append( namespace )
                    
                
            
            # the empty namespace always goes at the end under a readable label
            if has_unnamespaced:
                
                namespaces.append( 'unnamespaced' )
                
            
            fragments.append( 'tags: ' + ', '.join( namespaces ) )
            
        elif content_type == HC.CONTENT_TYPE_VETO and include_veto:
            
            fragments.append( 'veto' )
            
        
    
    return ', '.join( fragments ) if len( fragments ) > 0 else 'nothing'
|
2016-10-05 20:22:40 +00:00
|
|
|
|
2016-11-02 21:09:14 +00:00
|
|
|
|
|
|
|
def GetChildrenContent( children, data, referral_url, desired_content ):
    """Collect the parsed content of every relevant child, unless any child vetoes.

    Every child is first asked child.Vetoes( data ). A single veto
    short-circuits the whole call and an empty list is returned. Otherwise
    each child that passes ChildHasDesiredContent is parsed with
    child.Parse( data, referral_url, desired_content ) and the results are
    concatenated in child order.
    """
    
    # one veto anywhere suppresses all content for this data
    if any( child.Vetoes( data ) for child in children ):
        
        return []
        
    
    gathered = []
    
    for child in children:
        
        if not ChildHasDesiredContent( child, desired_content ):
            
            continue
            
        
        gathered.extend( child.Parse( data, referral_url, desired_content ) )
        
    
    return gathered
|
|
|
|
|
|
|
|
def GetVetoes( parsed_texts, additional_info ):
    """Work out the veto entries that a veto rule produces for some parsed texts.

    additional_info is ( veto_if_matches_found, match_if_text_present, search_text ).

    A parsed text "matches" when search_text is contained in it
    (match_if_text_present true) or is not contained in it
    (match_if_text_present false).

    - veto_if_matches_found true: returns one 'veto' per matching text.
    - veto_if_matches_found false: returns [ 'veto through absence' ] when
      nothing matched, otherwise an empty list.
    """
    
    ( veto_if_matches_found, match_if_text_present, search_text ) = additional_info
    
    want_text_present = bool( match_if_text_present )
    
    matches = [ 'veto' for parsed_text in parsed_texts if ( search_text in parsed_text ) == want_text_present ]
    
    if veto_if_matches_found:
        
        return matches
        
    
    # inverted rule: the veto fires only when no text matched at all
    if len( matches ) > 0:
        
        return []
        
    
    return [ 'veto through absence' ]
|
|
|
|
|
2016-10-05 20:22:40 +00:00
|
|
|
|
|
|
|
|
2016-09-07 20:01:05 +00:00
|
|
|
def RenderTagRule( tag_rule ):
    """Describe one tag rule in plain English.

    tag_rule is a ( name, attrs, index ) sequence:

    - name: the tag name, used verbatim in the text
    - attrs: a dict of attribute name -> value strings, may be empty
    - index: None for "all matching tags", otherwise a zero-based position

    Returns e.g. 'all a tags' or 'all div tags with class = post'. When index
    is not None the ordinal comes from HydrusData.ConvertIntToFirst( index + 1 ).

    The rule is taken as one ordinary parameter and unpacked in the body.
    Tuple-unpacking parameters are Python 2 only syntax, and this form is
    called identically while being valid on both major versions.
    """
    
    ( name, attrs, index ) = tag_rule
    
    if index is None:
        
        result = 'all ' + name + ' tags'
        
    else:
        
        # NOTE(review): ParseFormulaHTML.ToPrettyMultilineString calls
        # HydrusData.ConvertIntToPrettyOrdinalString for the same job - confirm
        # ConvertIntToFirst is still the right helper here.
        result = HydrusData.ConvertIntToFirst( index + 1 ) + ' ' + name + ' tag'
        
    
    if len( attrs ) > 0:
        
        result += ' with ' + ' and '.join( [ key + ' = ' + value for ( key, value ) in attrs.items() ] )
        
    
    return result
|
|
|
|
|
2016-09-21 19:54:04 +00:00
|
|
|
class ParseFormulaHTML( HydrusSerialisable.SerialisableBase ):
    """Formula that pulls strings out of an HTML document by following tag rules.

    tag_rules is a list of ( name, attrs, index ) entries applied in order.
    Each rule is run with find_all( name = name, attrs = attrs ) on every tag
    matched by the previous rule, starting from the whole document. index None
    keeps every match; an integer keeps only the match at that position.

    content_rule selects what is read from the final tags: None reads the
    tag's .string, anything else is treated as an attribute name.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML
    SERIALISABLE_VERSION = 1
    
    def __init__( self, tag_rules = None, content_rule = None ):
        """Store the rules; with no tag_rules the formula matches every <a> tag."""
        
        if tag_rules is None:
            
            tag_rules = [ ( 'a', {}, None ) ]
            
        
        self._tag_rules = tag_rules
        
        self._content_rule = content_rule
        
        # I need extra rules here for chopping stuff off the beginning or end and appending or prepending strings
        
    
    def _GetSerialisableInfo( self ):
        """Return the ( tag_rules, content_rule ) pair used for serialisation."""
        
        return ( self._tag_rules, self._content_rule )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore tag_rules and content_rule from a serialised pair."""
        
        ( self._tag_rules, self._content_rule ) = serialisable_info
        
    
    def _ParseContent( self, root ):
        """Read the configured content from one tag.

        Returns None when the wanted attribute is missing or when the value
        read is the empty string. Otherwise returns the value as read.
        """
        
        if self._content_rule is None:
            
            result = root.string
            
        elif root.has_attr( self._content_rule ):
            
            result = root[ self._content_rule ]
            
        else:
            
            result = None
            
        
        # empty text is treated the same as no content at all
        return None if result == '' else result
        
    
    def _ParseTags( self, root, name, attrs, index ):
        """Find the tags under root that satisfy one tag rule.

        With index None every match is returned. Otherwise the result is a
        one-element list holding the match at that index, or an empty list
        when there are too few matches.
        """
        
        found = root.find_all( name = name, attrs = attrs )
        
        if index is None:
            
            return found
            
        
        if index + 1 > len( found ):
            
            return []
            
        
        return [ found[ index ] ]
        
    
    def Parse( self, html ):
        """Apply every tag rule to html and return the list of extracted contents.

        The document is parsed with BeautifulSoup's 'lxml' parser. Tags that
        yield no content are left out of the result.
        """
        
        soup = bs4.BeautifulSoup( html, 'lxml' )
        
        current_roots = ( soup, )
        
        for ( name, attrs, index ) in self._tag_rules:
            
            matched = []
            
            for current_root in current_roots:
                
                matched.extend( self._ParseTags( current_root, name, attrs, index ) )
                
            
            # the matches of this rule become the search roots of the next one
            current_roots = matched
            
        
        contents = []
        
        for current_root in current_roots:
            
            content = self._ParseContent( current_root )
            
            if content is not None:
                
                contents.append( content )
                
            
        
        return contents
        
    
    def ToPrettyMultilineString( self ):
        """Describe the formula step by step, one step per line.

        Lines are separated by os.linesep and every line after the first is
        prefixed with 'and then '.
        """
        
        steps = []
        
        for ( name, attrs, index ) in self._tag_rules:
            
            if index is None:
                
                step = 'get every'
                
            else:
                
                # index is zero-based, the description is one-based
                step = 'get the ' + HydrusData.ConvertIntToPrettyOrdinalString( index + 1 )
                
            
            step += ' <' + name + '> tag'
            
            if len( attrs ) > 0:
                
                attribute_strings = [ key + '=' + value for ( key, value ) in attrs.items() ]
                
                step += ' with attributes ' + ', '.join( attribute_strings )
                
            
            steps.append( step )
            
        
        if self._content_rule is None:
            
            steps.append( 'get the text content of those tags' )
            
        else:
            
            steps.append( 'get the ' + self._content_rule + ' attribute of those tags' )
            
        
        return ( os.linesep + 'and then ' ).join( steps )
        
    
    def ToTuple( self ):
        """Return ( tag_rules, content_rule )."""
        
        return ( self._tag_rules, self._content_rule )
|
2016-07-20 19:57:10 +00:00
|
|
|
|
|
|
|
|
2016-09-21 19:54:04 +00:00
|
|
|
# Register ParseFormulaHTML under its serialisable type id in HydrusSerialisable's type -> class lookup.
# Presumably this is what lets HydrusSerialisable.CreateFromSerialisableTuple rebuild instances - confirm.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML ] = ParseFormulaHTML
|
|
|
|
|
|
|
|
class ParseNodeContent( HydrusSerialisable.SerialisableBase ):
    """Leaf parsing node: runs a formula over some data and labels what it yields.

    Every result is paired with the node's content description, the tuple
    ( name, content_type, additional_info ). For HC.CONTENT_TYPE_VETO nodes
    the parsed strings are not returned directly. They are handed to
    GetVetoes together with additional_info, and the veto entries it produces
    are returned instead.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name = None, content_type = None, formula = None, additional_info = None ):
        """Fill in defaults for anything left as None.

        name defaults to '', content_type to HC.CONTENT_TYPE_MAPPINGS and
        formula to a default ParseFormulaHTML. additional_info defaults to ''
        for mappings only; for other content types None is stored as given.
        """
        
        if name is None:
            
            name = ''
            
        
        if content_type is None:
            
            content_type = HC.CONTENT_TYPE_MAPPINGS
            
        
        if formula is None:
            
            formula = ParseFormulaHTML()
            
        
        if additional_info is None and content_type == HC.CONTENT_TYPE_MAPPINGS:
            
            additional_info = ''
            
        
        self._name = name
        self._content_type = content_type
        self._formula = formula
        self._additional_info = additional_info
        
    
    def _GetContentDescription( self ):
        """Return the ( name, content_type, additional_info ) tuple that labels this node's results."""
        
        return ( self._name, self._content_type, self._additional_info )
        
    
    def _GetSerialisableInfo( self ):
        """Return ( name, content_type, serialised formula, additional_info )."""
        
        return ( self._name, self._content_type, self._formula.GetSerialisableTuple(), self._additional_info )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the node from the tuple produced by _GetSerialisableInfo."""
        
        ( self._name, self._content_type, serialisable_formula, self._additional_info ) = serialisable_info
        
        # additional_info ends up inside a set in GetParsableContent, so a list must become a hashable tuple
        if isinstance( self._additional_info, list ):
            
            self._additional_info = tuple( self._additional_info )
            
        
        self._formula = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formula )
        
    
    def GetParsableContent( self ):
        """Return a one-element set holding this node's content description."""
        
        return { self._GetContentDescription() }
        
    
    def Parse( self, data, referral_url, desired_content ):
        """Run the formula on data and return ( content description, text ) pairs.

        referral_url and desired_content are accepted but not used by this
        node type.
        """
        
        content_description = self._GetContentDescription()
        
        parsed_texts = self._formula.Parse( data )
        
        if self._content_type == HC.CONTENT_TYPE_VETO:
            
            texts_to_report = GetVetoes( parsed_texts, self._additional_info )
            
        else:
            
            texts_to_report = parsed_texts
            
        
        return [ ( content_description, text ) for text in texts_to_report ]
        
    
    def ToPrettyStrings( self ):
        """Return ( name, 'content', summary of what this node can parse )."""
        
        pretty_content = ConvertParsableContentToPrettyString( self.GetParsableContent(), include_veto = True )
        
        return ( self._name, 'content', pretty_content )
        
    
    def ToTuple( self ):
        """Return ( name, content_type, formula, additional_info )."""
        
        return ( self._name, self._content_type, self._formula, self._additional_info )
        
    
    def Vetoes( self, data ):
        """Return True when this is a veto node and its rule fires on data."""
        
        if self._content_type != HC.CONTENT_TYPE_VETO:
            
            return False
            
        
        parsed_texts = self._formula.Parse( data )
        
        return len( GetVetoes( parsed_texts, self._additional_info ) ) > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
2016-09-21 19:54:04 +00:00
|
|
|
# Register ParseNodeContent under its serialisable type id in HydrusSerialisable's type -> class lookup.
# Presumably this is what lets HydrusSerialisable.CreateFromSerialisableTuple rebuild instances - confirm.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT ] = ParseNodeContent
|
|
|
|
|
2016-10-05 20:22:40 +00:00
|
|
|
class ParseNodeContentLink( HydrusSerialisable.SerialisableBase ):
    """Parsing node that follows links found in a page and parses the linked pages.

    The formula extracts URLs from the data it is given. Each URL is made
    absolute against the referral URL and downloaded, and the downloaded page
    is handed to the child nodes.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name = None, formula = None, children = None ):
        """Fill in defaults: name '', a default ParseFormulaHTML, and no children."""
        
        if name is None:
            
            name = ''
            
        
        if formula is None:
            
            formula = ParseFormulaHTML()
            
        
        if children is None:
            
            children = []
            
        
        self._name = name
        self._formula = formula
        self._children = children
        
    
    def _GetSerialisableInfo( self ):
        """Return ( name, serialised formula, list of serialised children )."""
        
        serialisable_formula = self._formula.GetSerialisableTuple()
        serialisable_children = [ child.GetSerialisableTuple() for child in self._children ]
        
        return ( self._name, serialisable_formula, serialisable_children )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the node from the tuple produced by _GetSerialisableInfo."""
        
        ( self._name, serialisable_formula, serialisable_children ) = serialisable_info
        
        self._formula = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formula )
        self._children = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_child ) for serialisable_child in serialisable_children ]
        
    
    def GetParsableContent( self ):
        """Return the union of every child's parsable content."""
        
        children_parsable_content = set()
        
        for child in self._children:
            
            children_parsable_content.update( child.GetParsableContent() )
            
        
        return children_parsable_content
        
    
    def Parse( self, data, referral_url, desired_content ):
        """Follow every link found in data and return the children's content for each linked page.

        Each linked page is requested with referral_url as its Referer header.
        The children then parse the downloaded page, with the link's own URL
        as their referral URL.
        """
        
        search_urls = self.ParseURLs( data, referral_url )
        
        content = []
        
        for search_url in search_urls:
            
            headers = { 'Referer' : referral_url }
            
            response = ClientNetworking.RequestsGet( search_url, headers = headers )
            
            # The children describe the linked page, so they must be given what was just
            # downloaded. Passing the page that contained the link would only re-parse that
            # page once per link.
            linked_data = response.content
            
            children_content = GetChildrenContent( self._children, linked_data, search_url, desired_content )
            
            content.extend( children_content )
            
        
        return content
        
    
    def ParseURLs( self, data, referral_url ):
        """Return the URLs the formula finds in data, made absolute against referral_url."""
        
        basic_urls = self._formula.Parse( data )
        
        absolute_urls = [ urlparse.urljoin( referral_url, basic_url ) for basic_url in basic_urls ]
        
        return absolute_urls
        
    
    def ToPrettyStrings( self ):
        """Return ( name, 'link', summary of what the children can parse )."""
        
        return ( self._name, 'link', ConvertParsableContentToPrettyString( self.GetParsableContent() ) )
        
    
    def ToTuple( self ):
        """Return ( name, formula, children )."""
        
        return ( self._name, self._formula, self._children )
        
    
    def Vetoes( self, data ):
        """Link nodes never veto; always returns False."""
        
        return False
|
|
|
|
|
|
|
|
|
2016-10-05 20:22:40 +00:00
|
|
|
# Register ParseNodeContentLink under its serialisable type id in HydrusSerialisable's type -> class lookup.
# Presumably this is what lets HydrusSerialisable.CreateFromSerialisableTuple rebuild instances - confirm.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK ] = ParseNodeContentLink
|
|
|
|
|
|
|
|
# What a file lookup sends to the remote site to identify a file.
FILE_IDENTIFIER_TYPE_FILE = 0
FILE_IDENTIFIER_TYPE_MD5 = 1
FILE_IDENTIFIER_TYPE_SHA1 = 2
FILE_IDENTIFIER_TYPE_SHA256 = 3
FILE_IDENTIFIER_TYPE_SHA512 = 4
FILE_IDENTIFIER_TYPE_USER_INPUT = 5

# Human-readable label for each identifier type.
file_identifier_string_lookup = {
    FILE_IDENTIFIER_TYPE_FILE : 'the actual file (POST only)',
    FILE_IDENTIFIER_TYPE_MD5 : 'md5 hash',
    FILE_IDENTIFIER_TYPE_SHA1 : 'sha1 hash',
    FILE_IDENTIFIER_TYPE_SHA256 : 'sha256 hash',
    FILE_IDENTIFIER_TYPE_SHA512 : 'sha512 hash',
    FILE_IDENTIFIER_TYPE_USER_INPUT : 'custom user input'
}
|
2016-09-21 19:54:04 +00:00
|
|
|
|
2016-10-05 20:22:40 +00:00
|
|
|
class ParseRootFileLookup( HydrusSerialisable.SerialisableBaseNamed ):
    """Root of a parsing tree: queries a URL about one file and parses the reply.

    The request is a GET or POST to url carrying static_args plus, unless the
    identifier is the file itself, the encoded file identifier under
    file_identifier_arg_name. With FILE_IDENTIFIER_TYPE_FILE the identifier is
    a path and the file is uploaded under that argument name instead (POST
    only). The response body is then parsed by the child nodes.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name, url = None, query_type = None, file_identifier_type = None, file_identifier_encoding = None, file_identifier_arg_name = None, static_args = None, children = None ):
        """Store the lookup settings.

        static_args and children left as None become an empty dict and an
        empty list, because the other methods copy and iterate them
        unconditionally.
        """
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        if static_args is None:
            
            static_args = {}
            
        
        if children is None:
            
            children = []
            
        
        self._url = url
        self._query_type = query_type
        self._file_identifier_type = file_identifier_type
        self._file_identifier_encoding = file_identifier_encoding
        self._file_identifier_arg_name = file_identifier_arg_name
        self._static_args = static_args
        self._children = children
        
    
    def _GetSerialisableInfo( self ):
        """Return the settings tuple, with the children replaced by their serialised form."""
        
        serialisable_children = [ child.GetSerialisableTuple() for child in self._children ]
        
        return ( self._url, self._query_type, self._file_identifier_type, self._file_identifier_encoding, self._file_identifier_arg_name, self._static_args, serialisable_children )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the settings from the tuple produced by _GetSerialisableInfo."""
        
        ( self._url, self._query_type, self._file_identifier_type, self._file_identifier_encoding, self._file_identifier_arg_name, self._static_args, serialisable_children ) = serialisable_info
        
        self._children = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_child ) for serialisable_child in serialisable_children ]
        
    
    def FetchData( self, file_identifier ):
        """Perform the lookup request and return the response body.

        file_identifier is a path when the identifier type is
        FILE_IDENTIFIER_TYPE_FILE. Otherwise it is the value that is encoded
        with HydrusData.EncodeBytes and sent as a request argument.

        Raises Exception for a file identifier on a GET query and for a query
        type that is neither HC.GET nor HC.POST.
        """
        
        # copy so the per-request identifier never leaks into the stored static args
        request_args = dict( self._static_args )
        
        sending_file = self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE
        
        if not sending_file:
            
            request_args[ self._file_identifier_arg_name ] = HydrusData.EncodeBytes( self._file_identifier_encoding, file_identifier )
            
        
        if self._query_type == HC.GET:
            
            if sending_file:
                
                raise Exception( 'Cannot have a file as an argument on a GET query!' )
                
            
            response = ClientNetworking.RequestsGet( self._url, params = request_args )
            
            data = response.content
            
        elif self._query_type == HC.POST:
            
            if sending_file:
                
                path = file_identifier
                
                # the handle is only needed for the upload, so close it whether or not the request succeeds
                with open( path, 'rb' ) as f:
                    
                    files = { self._file_identifier_arg_name : f }
                    
                    response = ClientNetworking.RequestsPost( self._url, data = request_args, files = files )
                    
                    data = response.content
                    
                
            else:
                
                response = ClientNetworking.RequestsPost( self._url, data = request_args, files = None )
                
                data = response.content
                
            
        else:
            
            # without this the code would fall through to an unbound 'response'
            raise Exception( 'Unknown query type: ' + repr( self._query_type ) )
            
        
        return data
        
    
    def GetParsableContent( self ):
        """Return the union of every child's parsable content."""
        
        children_parsable_content = set()
        
        for child in self._children:
            
            children_parsable_content.update( child.GetParsableContent() )
            
        
        return children_parsable_content
        
    
    def DoQuery( self, file_identifier, desired_content ):
        """Fetch the lookup data for file_identifier and return the parsed content."""
        
        # this should eventually take a job_key that will be propagated down and will have obeyed cancel and so on
        
        data = self.FetchData( file_identifier )
        
        return self.Parse( data, desired_content )
        
    
    def GetFileIdentifier( self ):
        """Return ( file_identifier_type, file_identifier_encoding )."""
        
        return ( self._file_identifier_type, self._file_identifier_encoding )
        
    
    def Parse( self, data, desired_content ):
        """Parse already-fetched data with the children, using the lookup URL as referral URL."""
        
        content = GetChildrenContent( self._children, data, self._url, desired_content )
        
        return content
        
    
    def SetChildren( self, children ):
        """Replace the child nodes."""
        
        self._children = children
        
    
    def ToPrettyStrings( self ):
        """Return ( name, query type label, 'File Lookup', summary of what the children can parse )."""
        
        return ( self._name, HC.query_type_string_lookup[ self._query_type ], 'File Lookup', ConvertParsableContentToPrettyString( self.GetParsableContent() ) )
        
    
    def ToTuple( self ):
        """Return every setting, name first, with the live child objects last."""
        
        return ( self._name, self._url, self._query_type, self._file_identifier_type, self._file_identifier_encoding, self._file_identifier_arg_name, self._static_args, self._children )
|
2016-09-21 19:54:04 +00:00
|
|
|
|
|
|
|
|
2016-10-05 20:22:40 +00:00
|
|
|
# Register ParseRootFileLookup under its serialisable type id in HydrusSerialisable's type -> class lookup.
# Presumably this is what lets HydrusSerialisable.CreateFromSerialisableTuple rebuild instances - confirm.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP ] = ParseRootFileLookup
|