# hydrus/include/ClientParsing.py
#
# 888 lines
# 27 KiB
# Python

import bs4
import ClientNetworking
import HydrusConstants as HC
import HydrusData
import HydrusExceptions
import HydrusGlobals
import HydrusSerialisable
import HydrusTags
import os
import time
import urlparse
def ChildHasDesiredContent( child, desired_content ):
    """Return whether a parse node can produce any of the requested content.

    desired_content is either the string 'all', which matches every child,
    or a collection that is intersected with child.GetParsableContent().
    """
    
    if desired_content == 'all':
        
        return True
        
    
    overlap = child.GetParsableContent().intersection( desired_content )
    
    return len( overlap ) > 0
def ConvertContentResultToPrettyString( result ):
    """Render one parsed content result as a short human-readable string.

    result is a ( ( name, content_type, additional_info ), parsed_text )
    pair. Mappings render as 'tag: ' plus the tag built by
    HydrusTags.CombineTag; vetoes render as 'veto'.

    Raises NotImplementedError for any other content type.
    """
    
    ( content_description, parsed_text ) = result
    
    ( name, content_type, additional_info ) = content_description
    
    if content_type == HC.CONTENT_TYPE_MAPPINGS:
        
        combined_tag = HydrusTags.CombineTag( additional_info, parsed_text )
        
        return 'tag: ' + combined_tag
        
    
    if content_type == HC.CONTENT_TYPE_VETO:
        
        return 'veto'
        
    
    raise NotImplementedError()
def ConvertParsableContentToPrettyString( parsable_content, include_veto = False ):
    """Summarise a collection of parsable content descriptions as one string.

    parsable_content is an iterable of ( name, content_type, additional_info )
    tuples. Mappings are listed by namespace, with the empty namespace shown
    as 'unnamespaced'. Vetoes are only mentioned when include_veto is true.
    Returns 'nothing' when there is nothing to report.
    """
    
    type_and_info_pairs = ( ( content_type, additional_infos ) for ( name, content_type, additional_infos ) in parsable_content )
    
    # presumably maps each content type to the set of additional infos seen for it
    infos_by_content_type = HydrusData.BuildKeyToSetDict( type_and_info_pairs )
    
    descriptions = []
    
    for ( content_type, additional_infos ) in infos_by_content_type.items():
        
        if content_type == HC.CONTENT_TYPE_MAPPINGS:
            
            namespaces = [ namespace for namespace in additional_infos if namespace != '' ]
            
            # the empty namespace is always listed last, under a readable label
            if '' in additional_infos:
                
                namespaces.append( 'unnamespaced' )
                
            
            descriptions.append( 'tags: ' + ', '.join( namespaces ) )
            
        elif content_type == HC.CONTENT_TYPE_VETO and include_veto:
            
            descriptions.append( 'veto' )
            
        
    
    if len( descriptions ) == 0:
        
        return 'nothing'
        
    
    return ', '.join( descriptions )
def GetChildrenContent( job_key, children, data, referral_url, desired_content ):
    """Parse data with every interested child node and pool the results.

    If any child vetoes the data, nothing is parsed and an empty list is
    returned. Otherwise each child that ChildHasDesiredContent accepts is
    asked to Parse, and the returned lists are concatenated in child order.
    """
    
    # a single veto discards the whole document before any parsing happens
    if any( child.Vetoes( data ) for child in children ):
        
        return []
        
    
    pooled_content = []
    
    for child in children:
        
        if not ChildHasDesiredContent( child, desired_content ):
            
            continue
            
        
        pooled_content.extend( child.Parse( job_key, data, referral_url, desired_content ) )
        
    
    return pooled_content
def GetTagsFromContentResults( results ):
    """Extract the tags from a list of content results.

    Only results whose content type is HC.CONTENT_TYPE_MAPPINGS contribute.
    Each is combined with HydrusTags.CombineTag, and the whole collection is
    passed through HydrusTags.CleanTags before being returned.
    """
    
    combined_tags = [
        HydrusTags.CombineTag( additional_info, parsed_text )
        for ( ( name, content_type, additional_info ), parsed_text ) in results
        if content_type == HC.CONTENT_TYPE_MAPPINGS
    ]
    
    return HydrusTags.CleanTags( combined_tags )
def GetVetoes( parsed_texts, additional_info ):
    """Work out which vetoes a veto rule produces for the given texts.

    additional_info is ( veto_if_matches_found, match_if_text_present,
    search_text ). A parsed text 'matches' when search_text is present in it
    (or absent from it, if match_if_text_present is false).

    When veto_if_matches_found is true, one 'veto' is returned per match.
    Otherwise a single 'veto through absence' is returned only when there
    were no matches at all.
    """
    
    ( veto_if_matches_found, match_if_text_present, search_text ) = additional_info
    
    def is_match( parsed_text ):
        
        text_is_present = search_text in parsed_text
        
        if match_if_text_present:
            
            return text_is_present
            
        
        return not text_is_present
        
    
    matches = [ 'veto' for parsed_text in parsed_texts if is_match( parsed_text ) ]
    
    if veto_if_matches_found:
        
        return matches
        
    
    if len( matches ) == 0:
        
        return [ 'veto through absence' ]
        
    
    return []
def RenderTagRule( tag_rule ):
    """Describe one HTML tag rule in words.

    tag_rule is a ( name, attrs, index ) sequence. An index of None means
    every matching tag; otherwise index is zero-based and is rendered through
    HydrusData.ConvertIntToFirst( index + 1 ). Any attrs are appended as
    'key = value' pairs joined with ' and '.

    The rule is unpacked inside the body rather than in the signature, since
    tuple parameters are Python-2-only syntax. Callers still pass a single
    positional argument.
    """
    
    ( name, attrs, index ) = tag_rule
    
    if index is None:
        
        result = 'all ' + name + ' tags'
        
    else:
        
        result = HydrusData.ConvertIntToFirst( index + 1 ) + ' ' + name + ' tag'
        
    
    if len( attrs ) > 0:
        
        result += ' with ' + ' and '.join( [ key + ' = ' + value for ( key, value ) in attrs.items() ] )
        
    
    return result
class ParseFormulaHTML( HydrusSerialisable.SerialisableBase ):
    """A serialisable recipe for pulling strings out of an HTML document.
    
    A formula is made of:
    
    - tag_rules: a list of ( name, attrs, index ) steps. Each step searches
      beneath the tags found by the previous step. index None keeps every
      match, otherwise only the match at that zero-based position is kept.
    - content_rule: None to take the tag's text, or an attribute name to take
      that attribute's value.
    - culling_and_adding: ( cull_front, cull_back, prepend, append ) applied
      to every extracted string.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML
    SERIALISABLE_VERSION = 2
    
    def __init__( self, tag_rules = None, content_rule = None, culling_and_adding = None ):
        """Create a formula; by default it finds the text of every <a> tag."""
        
        if tag_rules is None:
            
            tag_rules = [ ( 'a', {}, None ) ]
            
        
        if culling_and_adding is None:
            
            culling_and_adding = ( 0, 0, '', '' )
            
        
        self._tag_rules = tag_rules
        
        self._content_rule = content_rule
        
        self._culling_and_adding = culling_and_adding
        
    
    def _CullAndAdd( self, text ):
        """Trim text per the cull settings, then add the prefix and suffix.
        
        A non-zero cull_front slices text[ cull_front : ] and a non-zero
        cull_back slices text[ : - cull_back ]. Returns None if nothing is
        left after culling, so empty results are never decorated.
        """
        
        ( cull_front, cull_back, prepend, append ) = self._culling_and_adding
        
        if cull_front != 0:
            
            text = text[ cull_front : ]
            
        
        if cull_back != 0:
            
            text = text[ : - cull_back ]
            
        
        if text == '':
            
            return None
            
        
        text = prepend + text + append
        
        return text
        
    
    def _GetSerialisableInfo( self ):
        """Return the state tuple that gets serialised."""
        
        return ( self._tag_rules, self._content_rule, self._culling_and_adding )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore state from a tuple produced by _GetSerialisableInfo."""
        
        ( self._tag_rules, self._content_rule, self._culling_and_adding ) = serialisable_info
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Upgrade version 1 info, which had no culling_and_adding, to version 2."""
        
        if version == 1:
            
            ( tag_rules, content_rule ) = old_serialisable_info
            
            culling_and_adding = ( 0, 0, '', '' )
            
            new_serialisable_info = ( tag_rules, content_rule, culling_and_adding )
            
            return ( 2, new_serialisable_info )
            
        
    
    def _ParseContent( self, root ):
        """Extract this formula's content from one matched tag.
        
        With no content rule, the tag's .string is used. Otherwise the named
        attribute is read. Returns None when the content is missing or empty,
        else the culled/decorated string.
        """
        
        if self._content_rule is None:
            
            result = root.string
            
        else:
            
            if root.has_attr( self._content_rule ):
                
                result = root[ self._content_rule ]
                
                # Beautiful Soup gives multi-valued attributes such as 'class' as a list and
                # everything else (href, src, ...) as a plain string. Indexing [0] blindly
                # would return only the first character of a normal attribute, so rejoin
                # lists and otherwise keep the whole value.
                if isinstance( result, list ):
                    
                    result = ' '.join( result )
                    
                
            else:
                
                result = None
                
            
        
        if result == '' or result is None:
            
            return None
            
        else:
            
            return self._CullAndAdd( result )
            
        
    
    def _ParseTags( self, root, name, attrs, index ):
        """Find matching descendants of root, optionally keeping only one.
        
        Returns a list. When index is given but out of range, the list is
        empty rather than an error being raised.
        """
        
        results = root.find_all( name = name, attrs = attrs )
        
        if index is not None:
            
            if len( results ) < index + 1:
                
                results = []
                
            else:
                
                results = [ results[ index ] ]
                
            
        
        return results
        
    
    def Parse( self, html ):
        """Run the formula over html and return the list of extracted strings."""
        
        root = bs4.BeautifulSoup( html, 'lxml' )
        
        roots = ( root, )
        
        # each rule narrows the search to tags found beneath the previous rule's matches
        for ( name, attrs, index ) in self._tag_rules:
            
            next_roots = []
            
            for root in roots:
                
                next_roots.extend( self._ParseTags( root, name, attrs, index ) )
                
            
            roots = next_roots
            
        
        contents = [ self._ParseContent( root ) for root in roots ]
        
        contents = [ content for content in contents if content is not None ]
        
        return contents
        
    
    def ToPrettyMultilineString( self ):
        """Describe the formula step by step, one step per line."""
        
        pretty_strings = []
        
        for ( name, attrs, index ) in self._tag_rules:
            
            s = ''
            
            if index is None:
                
                s += 'get every'
                
            else:
                
                num = index + 1
                
                s += 'get the ' + HydrusData.ConvertIntToPrettyOrdinalString( num )
                
            
            s += ' <' + name + '> tag'
            
            if len( attrs ) > 0:
                
                s += ' with attributes ' + ', '.join( key + '=' + value for ( key, value ) in attrs.items() )
                
            
            pretty_strings.append( s )
            
        
        if self._content_rule is None:
            
            pretty_strings.append( 'get the text content of those tags' )
            
        else:
            
            pretty_strings.append( 'get the ' + self._content_rule + ' attribute of those tags' )
            
        
        cull_munge_strings = []
        
        ( cull_front, cull_back, prepend, append ) = self._culling_and_adding
        
        # negative culls follow slice semantics: they keep that many characters from the other end
        if cull_front > 0:
            
            cull_munge_strings.append( 'the first ' + HydrusData.ConvertIntToPrettyString( cull_front ) + ' characters' )
            
        elif cull_front < 0:
            
            cull_munge_strings.append( 'all but the last ' + HydrusData.ConvertIntToPrettyString( abs( cull_front ) ) + ' characters' )
            
        
        if cull_back > 0:
            
            cull_munge_strings.append( 'the last ' + HydrusData.ConvertIntToPrettyString( cull_back ) + ' characters' )
            
        elif cull_back < 0:
            
            cull_munge_strings.append( 'all but the first ' + HydrusData.ConvertIntToPrettyString( abs( cull_back ) ) + ' characters' )
            
        
        if len( cull_munge_strings ) > 0:
            
            pretty_strings.append( 'remove ' + ' and '.join( cull_munge_strings ) )
            
        
        add_munge_strings = []
        
        if prepend != '':
            
            add_munge_strings.append( 'prepend "' + prepend + '"' )
            
        
        if append != '':
            
            add_munge_strings.append( 'append "' + append + '"' )
            
        
        if len( add_munge_strings ) > 0:
            
            pretty_strings.append( ' and '.join( add_munge_strings ) )
            
        
        separator = os.linesep + 'and then '
        
        pretty_multiline_string = separator.join( pretty_strings )
        
        return pretty_multiline_string
        
    
    def ToTuple( self ):
        """Return ( tag_rules, content_rule, culling_and_adding )."""
        
        return ( self._tag_rules, self._content_rule, self._culling_and_adding )
        
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML ] = ParseFormulaHTML
class ParseNodeContent( HydrusSerialisable.SerialisableBase ):
    """A leaf parse node: runs a formula over data and labels what it finds.
    
    Every result is paired with this node's content description,
    ( name, content_type, additional_info ). Nodes whose content type is
    HC.CONTENT_TYPE_VETO produce vetoes instead of content.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name = None, content_type = None, formula = None, additional_info = None ):
        """Create the node; omitted values default to an unnamespaced mappings node."""
        
        if content_type is None:
            
            content_type = HC.CONTENT_TYPE_MAPPINGS
            
        
        # only mappings get a default additional_info (the empty namespace)
        if additional_info is None and content_type == HC.CONTENT_TYPE_MAPPINGS:
            
            additional_info = ''
            
        
        self._name = '' if name is None else name
        self._content_type = content_type
        self._formula = ParseFormulaHTML() if formula is None else formula
        self._additional_info = additional_info
        
    
    def _GetSerialisableInfo( self ):
        """Return the state tuple that gets serialised, with the formula serialised too."""
        
        return ( self._name, self._content_type, self._formula.GetSerialisableTuple(), self._additional_info )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore state from a tuple produced by _GetSerialisableInfo."""
        
        ( self._name, self._content_type, serialisable_formula, additional_info ) = serialisable_info
        
        # a list here is converted back to a tuple
        if isinstance( additional_info, list ):
            
            additional_info = tuple( additional_info )
            
        
        self._additional_info = additional_info
        
        self._formula = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formula )
        
    
    def GetParsableContent( self ):
        """Return a one-element set holding this node's content description."""
        
        content_description = ( self._name, self._content_type, self._additional_info )
        
        return { content_description }
        
    
    def Parse( self, job_key, data, referral_url, desired_content ):
        """Return a list of ( content_description, text ) pairs parsed from data.
        
        For veto nodes, the texts are the vetoes produced by GetVetoes rather
        than the raw parsed strings.
        """
        
        content_description = ( self._name, self._content_type, self._additional_info )
        
        texts = self._formula.Parse( data )
        
        if self._content_type == HC.CONTENT_TYPE_VETO:
            
            texts = GetVetoes( texts, self._additional_info )
            
        
        return [ ( content_description, text ) for text in texts ]
        
    
    def ToPrettyStrings( self ):
        """Return ( name, 'content', summary of parsable content including vetoes )."""
        
        summary = ConvertParsableContentToPrettyString( self.GetParsableContent(), include_veto = True )
        
        return ( self._name, 'content', summary )
        
    
    def ToTuple( self ):
        """Return ( name, content_type, formula, additional_info )."""
        
        return ( self._name, self._content_type, self._formula, self._additional_info )
        
    
    def Vetoes( self, data ):
        """Return True if this is a veto node and its rule fires on data."""
        
        if self._content_type != HC.CONTENT_TYPE_VETO:
            
            return False
            
        
        parsed_texts = self._formula.Parse( data )
        
        return len( GetVetoes( parsed_texts, self._additional_info ) ) > 0
        
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT ] = ParseNodeContent
class ParseNodeContentLink( HydrusSerialisable.SerialisableBase ):
    """A parse node that follows links and hands the linked pages to its children.
    
    The formula extracts URLs from the incoming data. Each URL is fetched,
    and the response body is parsed by the child nodes.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name = None, formula = None, children = None ):
        """Create the node; omitted values give an unnamed node with no children."""
        
        self._name = '' if name is None else name
        self._formula = ParseFormulaHTML() if formula is None else formula
        self._children = [] if children is None else children
        
    
    def _GetSerialisableInfo( self ):
        """Return the state tuple that gets serialised, with nested objects serialised too."""
        
        return (
            self._name,
            self._formula.GetSerialisableTuple(),
            [ child.GetSerialisableTuple() for child in self._children ]
        )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore state from a tuple produced by _GetSerialisableInfo."""
        
        ( self._name, serialisable_formula, serialisable_children ) = serialisable_info
        
        rebuild = HydrusSerialisable.CreateFromSerialisableTuple
        
        self._formula = rebuild( serialisable_formula )
        
        self._children = [ rebuild( serialisable_child ) for serialisable_child in serialisable_children ]
        
    
    def GetParsableContent( self ):
        """Return the union of everything the children can parse."""
        
        parsable_content = set()
        
        for child in self._children:
            
            parsable_content |= set( child.GetParsableContent() )
            
        
        return parsable_content
        
    
    def Parse( self, job_key, data, referral_url, desired_content ):
        """Fetch every linked page and return the children's pooled content.
        
        A 404 or other network error on one link is reported on job_key and,
        after a two second pause, skipped. Raises
        HydrusExceptions.CancelledException if the job is found cancelled
        after a link has been processed.
        """
        
        pooled_content = []
        
        for search_url in self.ParseURLs( job_key, data, referral_url ):
            
            try:
                
                job_key.SetVariable( 'script_status', 'fetching ' + search_url )
                
                response = ClientNetworking.RequestsGet( search_url, headers = { 'Referer' : referral_url } )
                
            except HydrusExceptions.NotFoundException:
                
                job_key.SetVariable( 'script_status', '404 - nothing found' )
                
                time.sleep( 2 )
                
                continue
                
            except HydrusExceptions.NetworkException as e:
                
                job_key.SetVariable( 'script_status', 'Network error! Details written to log.' )
                
                HydrusData.PrintException( e )
                
                time.sleep( 2 )
                
                continue
                
            
            # the fetched page becomes the data, and its url the referral url, for the children
            pooled_content.extend( GetChildrenContent( job_key, self._children, response.content, search_url, desired_content ) )
            
            if job_key.IsCancelled():
                
                raise HydrusExceptions.CancelledException()
                
            
        
        return pooled_content
        
    
    def ParseURLs( self, job_key, data, referral_url ):
        """Extract links from data, make them absolute, and register them on job_key."""
        
        absolute_urls = []
        
        for basic_url in self._formula.Parse( data ):
            
            absolute_urls.append( urlparse.urljoin( referral_url, basic_url ) )
            
        
        for absolute_url in absolute_urls:
            
            job_key.AddURL( absolute_url )
            
        
        return absolute_urls
        
    
    def ToPrettyStrings( self ):
        """Return ( name, 'link', summary of the children's parsable content )."""
        
        summary = ConvertParsableContentToPrettyString( self.GetParsableContent() )
        
        return ( self._name, 'link', summary )
        
    
    def ToTuple( self ):
        """Return ( name, formula, children )."""
        
        return ( self._name, self._formula, self._children )
        
    
    def Vetoes( self, data ):
        """Link nodes never veto."""
        
        return False
        
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK ] = ParseNodeContentLink
# What a file lookup script sends to identify the file it is asking about.
FILE_IDENTIFIER_TYPE_FILE = 0
FILE_IDENTIFIER_TYPE_MD5 = 1
FILE_IDENTIFIER_TYPE_SHA1 = 2
FILE_IDENTIFIER_TYPE_SHA256 = 3
FILE_IDENTIFIER_TYPE_SHA512 = 4
FILE_IDENTIFIER_TYPE_USER_INPUT = 5

# Human-readable label for each identifier type.
file_identifier_string_lookup = {
    FILE_IDENTIFIER_TYPE_FILE : 'the actual file (POST only)',
    FILE_IDENTIFIER_TYPE_MD5 : 'md5 hash',
    FILE_IDENTIFIER_TYPE_SHA1 : 'sha1 hash',
    FILE_IDENTIFIER_TYPE_SHA256 : 'sha256 hash',
    FILE_IDENTIFIER_TYPE_SHA512 : 'sha512 hash',
    FILE_IDENTIFIER_TYPE_USER_INPUT : 'custom user input'
}
class ParseRootFileLookup( HydrusSerialisable.SerialisableBaseNamed ):
    """The root of a file lookup script.
    
    It sends a file identifier (a hash, the file itself, or user input) to a
    url by GET or POST, then hands the response to its child parse nodes.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name, url = None, query_type = None, file_identifier_type = None, file_identifier_encoding = None, file_identifier_arg_name = None, static_args = None, children = None ):
        """Create the script.
        
        static_args and children default to an empty dict and list, so a
        freshly constructed object can be serialised, queried and rendered
        without first being filled in.
        """
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        if static_args is None:
            
            static_args = {}
            
        
        if children is None:
            
            children = []
            
        
        self._url = url
        self._query_type = query_type
        self._file_identifier_type = file_identifier_type
        self._file_identifier_encoding = file_identifier_encoding
        self._file_identifier_arg_name = file_identifier_arg_name
        self._static_args = static_args
        self._children = children
        
    
    def _GetSerialisableInfo( self ):
        """Return the state tuple that gets serialised, with the children serialised too."""
        
        serialisable_children = [ child.GetSerialisableTuple() for child in self._children ]
        
        return ( self._url, self._query_type, self._file_identifier_type, self._file_identifier_encoding, self._file_identifier_arg_name, self._static_args, serialisable_children )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore state from a tuple produced by _GetSerialisableInfo."""
        
        ( self._url, self._query_type, self._file_identifier_type, self._file_identifier_encoding, self._file_identifier_arg_name, self._static_args, serialisable_children ) = serialisable_info
        
        self._children = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_child ) for serialisable_child in serialisable_children ]
        
    
    def ConvertMediaToFileIdentifier( self, media ):
        """Return the identifier this script should send for the given media.
        
        Depending on the identifier type this is the media's sha256 hash,
        another hash read from the client controller, or a local file path.
        
        Raises Exception if the script takes user input, if the requested
        hash is not known, or if the file is not available locally.
        """
        
        if self._file_identifier_type == FILE_IDENTIFIER_TYPE_USER_INPUT:
            
            raise Exception( 'Cannot convert media to file identifier--this script takes user input!' )
            
        elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA256:
            
            return media.GetHash()
            
        elif self._file_identifier_type in ( FILE_IDENTIFIER_TYPE_MD5, FILE_IDENTIFIER_TYPE_SHA1, FILE_IDENTIFIER_TYPE_SHA512 ):
            
            sha256_hash = media.GetHash()
            
            if self._file_identifier_type == FILE_IDENTIFIER_TYPE_MD5:
                
                hash_type = 'md5'
                
            elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA1:
                
                hash_type = 'sha1'
                
            elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA512:
                
                hash_type = 'sha512'
                
            
            try:
                
                ( other_hash, ) = HydrusGlobals.client_controller.Read( 'file_hashes', ( sha256_hash, ), 'sha256', hash_type )
                
                return other_hash
                
            except Exception:
                
                # Exception rather than a bare except, so KeyboardInterrupt/SystemExit are not
                # relabelled as a missing hash
                raise Exception( 'I do not know that file\'s ' + hash_type + ' hash, so I cannot look it up!' )
                
            
        elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
            
            file_hash = media.GetHash()
            mime = media.GetMime()
            
            client_files_manager = HydrusGlobals.client_controller.GetClientFilesManager()
            
            try:
                
                path = client_files_manager.GetFilePath( file_hash, mime )
                
                return path
                
            except HydrusExceptions.FileMissingException:
                
                raise Exception( 'That file is not in the database\'s local files, so I cannot look it up!' )
                
            
        
    
    def FetchData( self, job_key, file_identifier ):
        """Send the lookup request and return the response body.
        
        For hash and user-input identifiers, the encoded identifier is added
        to the static args. For file identifiers, file_identifier is a path
        and the file is uploaded as a multipart file, which needs POST.
        
        Raises Exception for a file identifier on a GET query or for an
        unknown query type, and HydrusExceptions.CancelledException if the
        job is found cancelled once the request has returned.
        """
        
        # add gauge report hook and cancel support here
        
        request_args = dict( self._static_args )
        
        if self._file_identifier_type != FILE_IDENTIFIER_TYPE_FILE:
            
            request_args[ self._file_identifier_arg_name ] = HydrusData.EncodeBytes( self._file_identifier_encoding, file_identifier )
            
        
        if self._query_type == HC.GET:
            
            if self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
                
                raise Exception( 'Cannot have a file as an argument on a GET query!' )
                
            
            # this rendered url is only used for status display and AddURL; the real request passes params separately
            rendered_url = self._url + '?' + '&'.join( ( HydrusData.ToByteString( key ) + '=' + HydrusData.ToByteString( value ) for ( key, value ) in request_args.items() ) )
            
            job_key.SetVariable( 'script_status', 'fetching ' + rendered_url )
            
            job_key.AddURL( rendered_url )
            
            response = ClientNetworking.RequestsGet( self._url, params = request_args )
            
        elif self._query_type == HC.POST:
            
            if self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
                
                job_key.SetVariable( 'script_status', 'uploading file' )
                
                path = file_identifier
                
                # keep the handle open only for the duration of the upload so it is always closed
                with open( path, 'rb' ) as f:
                    
                    files = { self._file_identifier_arg_name : f }
                    
                    response = ClientNetworking.RequestsPost( self._url, data = request_args, files = files )
                    
                
            else:
                
                job_key.SetVariable( 'script_status', 'uploading identifier' )
                
                response = ClientNetworking.RequestsPost( self._url, data = request_args, files = None )
                
            
        else:
            
            # previously this fell through and failed later on an unbound 'response'
            raise Exception( 'Unknown query type: ' + repr( self._query_type ) )
            
        
        if job_key.IsCancelled():
            
            raise HydrusExceptions.CancelledException()
            
        
        data = response.content
        
        return data
        
    
    def GetParsableContent( self ):
        """Return the union of everything the children can parse."""
        
        children_parsable_content = set()
        
        for child in self._children:
            
            children_parsable_content.update( child.GetParsableContent() )
            
        
        return children_parsable_content
        
    
    def DoQuery( self, job_key, file_identifier, desired_content ):
        """Fetch and parse in one go, reporting progress on job_key.
        
        Returns the list of content results, or an empty list on a 404, a
        network error, or cancellation. job_key.Finish() is always called.
        """
        
        try:
            
            try:
                
                data = self.FetchData( job_key, file_identifier )
                
            except HydrusExceptions.NotFoundException:
                
                job_key.SetVariable( 'script_status', '404 - nothing found' )
                
                return []
                
            except HydrusExceptions.NetworkException as e:
                
                job_key.SetVariable( 'script_status', 'Network error!' )
                
                HydrusData.ShowException( e )
                
                return []
                
            
            content_results = self.Parse( job_key, data, desired_content )
            
            if len( content_results ) == 0:
                
                job_key.SetVariable( 'script_status', 'Did not find anything.' )
                
            else:
                
                job_key.SetVariable( 'script_status', 'Found ' + HydrusData.ConvertIntToPrettyString( len( content_results ) ) + ' rows.' )
                
            
            return content_results
            
        except HydrusExceptions.CancelledException:
            
            job_key.SetVariable( 'script_status', 'Cancelled!' )
            
            return []
            
        finally:
            
            job_key.Finish()
            
        
    
    def UsesUserInput( self ):
        """Return True if the identifier is typed in by the user rather than derived from a file."""
        
        return self._file_identifier_type == FILE_IDENTIFIER_TYPE_USER_INPUT
        
    
    def Parse( self, job_key, data, desired_content ):
        """Parse data with the children, using this script's url as the referral url."""
        
        content = GetChildrenContent( job_key, self._children, data, self._url, desired_content )
        
        return content
        
    
    def SetChildren( self, children ):
        """Replace the child parse nodes."""
        
        self._children = children
        
    
    def ToPrettyStrings( self ):
        """Return ( name, query type label, 'File Lookup', summary of parsable content )."""
        
        return ( self._name, HC.query_type_string_lookup[ self._query_type ], 'File Lookup', ConvertParsableContentToPrettyString( self.GetParsableContent() ) )
        
    
    def ToTuple( self ):
        """Return every constructor argument, in constructor order."""
        
        return ( self._name, self._url, self._query_type, self._file_identifier_type, self._file_identifier_encoding, self._file_identifier_arg_name, self._static_args, self._children )
        
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP ] = ParseRootFileLookup