hydrus/include/ClientParsing.py

3035 lines
97 KiB
Python
Raw Normal View History

2016-07-20 19:57:10 +00:00
import bs4
2018-02-07 23:40:33 +00:00
import calendar
2018-04-18 22:10:15 +00:00
import ClientNetworkingDomain
import ClientNetworkingJobs
2017-12-13 22:33:07 +00:00
import collections
2018-08-15 20:40:30 +00:00
import cStringIO
2016-09-21 19:54:04 +00:00
import HydrusConstants as HC
2016-09-07 20:01:05 +00:00
import HydrusData
2016-11-09 23:13:22 +00:00
import HydrusExceptions
2017-05-10 21:33:58 +00:00
import HydrusGlobals as HG
2016-07-20 19:57:10 +00:00
import HydrusSerialisable
2016-10-26 20:45:34 +00:00
import HydrusTags
2018-01-31 22:58:15 +00:00
import json
2016-10-19 20:02:56 +00:00
import os
2017-09-13 20:50:41 +00:00
import re
2018-04-25 22:07:52 +00:00
import threading
2016-11-16 20:21:43 +00:00
import time
2016-11-02 21:09:14 +00:00
import urlparse
2016-07-20 19:57:10 +00:00
2018-04-11 22:30:40 +00:00
try:
import html5lib
HTML5LIB_IS_OK = True
except ImportError:
HTML5LIB_IS_OK = False
try:
import lxml
LXML_IS_OK = True
except ImportError:
LXML_IS_OK = False
2018-02-07 23:40:33 +00:00
def ConvertParseResultToPrettyString(result):
    """Describe a single parse result as one human-readable line.

    result is a ( ( name, content_type, additional_info ), parsed_text ) pair.
    How additional_info is read depends on content_type: a ( url_type,
    priority ) pair for urls, the value handed to HydrusTags.CombineTag for
    mappings, a label string for hashes, a timestamp type for timestamps and
    a priority for titles.

    Raises NotImplementedError when the content type, url type or timestamp
    type is not one of those handled below.
    """
    ((name, content_type, additional_info), parsed_text) = result

    if content_type == HC.CONTENT_TYPE_URLS:
        (url_type, priority) = additional_info
        suffix = ' (priority ' + str(priority) + '): ' + parsed_text

        if url_type == HC.URL_TYPE_DESIRED:
            return 'downloadable/pursuable url' + suffix
        elif url_type == HC.URL_TYPE_SOURCE:
            return 'associable/source url' + suffix
        elif url_type == HC.URL_TYPE_NEXT:
            return 'next page url' + suffix

    elif content_type == HC.CONTENT_TYPE_MAPPINGS:
        # Best effort: any failure to build the tag is reported in the text
        # instead of propagating. 'except Exception' rather than a bare
        # except so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            tag = HydrusTags.CleanTag(HydrusTags.CombineTag(additional_info, parsed_text))
        except Exception:
            tag = 'unparsable tag, will likely be discarded'

        return 'tag: ' + tag

    elif content_type == HC.CONTENT_TYPE_HASH:
        return additional_info + ' hash: ' + parsed_text.encode('hex')

    elif content_type == HC.CONTENT_TYPE_TIMESTAMP:
        timestamp_type = additional_info

        # Same best-effort policy as for tags above.
        try:
            timestamp = int(parsed_text)
            timestamp_string = HydrusData.ConvertTimestampToPrettyTime(timestamp)
        except Exception:
            timestamp_string = 'could not convert to integer'

        if timestamp_type == HC.TIMESTAMP_TYPE_SOURCE:
            return 'source time: ' + timestamp_string

    elif content_type == HC.CONTENT_TYPE_TITLE:
        priority = additional_info
        return 'watcher page title (priority ' + str(priority) + '): ' + parsed_text

    elif content_type == HC.CONTENT_TYPE_VETO:
        return 'veto: ' + name

    # Nothing above returned: unknown content type or sub-type.
    raise NotImplementedError()
2016-11-02 21:09:14 +00:00
def ConvertParsableContentToPrettyString(parsable_content, include_veto=False):
    """Summarise what a parser can produce as a comma-separated string.

    parsable_content is an iterable of ( name, content_type, additional_infos )
    triples. They are grouped by ( content_type, name ) through
    HydrusData.BuildKeyToSetDict and described in sorted key order. Vetoes are
    only mentioned when include_veto is true. Returns 'nothing' when there is
    nothing to describe.
    """
    grouped = HydrusData.BuildKeyToSetDict(
        ((content_type, name), additional_infos)
        for (name, content_type, additional_infos) in parsable_content
    )

    descriptions = []

    for ((content_type, name), additional_infos) in sorted(grouped.items()):
        if content_type == HC.CONTENT_TYPE_URLS:
            for (url_type, priority) in additional_infos:
                if url_type == HC.URL_TYPE_DESIRED:
                    descriptions.append('downloadable/pursuable url')
                elif url_type == HC.URL_TYPE_SOURCE:
                    descriptions.append('associable/source url')
                elif url_type == HC.URL_TYPE_NEXT:
                    descriptions.append('gallery next page url')

        elif content_type == HC.CONTENT_TYPE_MAPPINGS:
            # Named namespaces first, the empty namespace is listed last.
            labels = [namespace for namespace in additional_infos if namespace != '']
            if '' in additional_infos:
                labels.append('unnamespaced')
            descriptions.append('tags: ' + ', '.join(labels))

        elif content_type == HC.CONTENT_TYPE_HASH:
            if len(additional_infos) == 1:
                (only_hash_type,) = additional_infos
                descriptions.append('hash: ' + only_hash_type)
            else:
                descriptions.append('hashes: ' + ', '.join(sorted(additional_infos)))

        elif content_type == HC.CONTENT_TYPE_TIMESTAMP:
            for timestamp_type in additional_infos:
                if timestamp_type == HC.TIMESTAMP_TYPE_SOURCE:
                    descriptions.append('source time')

        elif content_type == HC.CONTENT_TYPE_TITLE:
            descriptions.append('watcher page title')

        elif content_type == HC.CONTENT_TYPE_VETO:
            if include_veto:
                descriptions.append('veto: ' + name)

    if not descriptions:
        return 'nothing'

    return ', '.join(descriptions)
2016-10-05 20:22:40 +00:00
2016-11-02 21:09:14 +00:00
2017-12-13 22:33:07 +00:00
def GetChildrenContent(job_key, children, data, referral_url):
    """Collect the content produced by every child parser/link node.

    ParseNodeContentLink children are parsed with ( job_key, data,
    referral_url ); ContentParser children are parsed with an empty parsing
    context and data. Children of any other type are skipped.

    Returns the concatenated content list, or an empty list as soon as any
    child raises HydrusExceptions.VetoException.
    """
    content = []

    for child in children:
        try:
            if isinstance(child, ParseNodeContentLink):
                child_content = child.Parse(job_key, data, referral_url)
            elif isinstance(child, ContentParser):
                child_content = child.Parse({}, data)
            else:
                # Previously an unrecognised child left child_content unbound
                # (or stale from the prior child, duplicating its content).
                continue
        except HydrusExceptions.VetoException:
            # A veto from any child discards everything gathered so far.
            return []

        content.extend(child_content)

    return content
2018-02-07 23:40:33 +00:00
def GetHashesFromParseResults(results):
    """Return ( additional_info, parsed_text ) for every hash result.

    results is an iterable of
    ( ( name, content_type, additional_info ), parsed_text ) pairs; only those
    whose content type is HC.CONTENT_TYPE_HASH are kept, in input order.
    """
    return [
        (additional_info, parsed_text)
        for ((name, content_type, additional_info), parsed_text) in results
        if content_type == HC.CONTENT_TYPE_HASH
    ]
2018-05-30 20:13:21 +00:00
def GetHTMLTagString(tag):
    """Return the first non-empty entry of tag.strings, or '' if there is none."""
    non_empty = [text for text in tag.strings if len(text) > 0]

    return non_empty[0] if non_empty else ''
2018-04-18 22:10:15 +00:00
def GetNamespacesFromParsableContent(parsable_content):
    """Return the additional_infos gathered for tag-mapping content.

    parsable_content is an iterable of ( name, content_type, additional_infos )
    triples. They are grouped by content type with
    HydrusData.BuildKeyToSetDict and the entry for HC.CONTENT_TYPE_MAPPINGS is
    returned; for mappings each additional_infos value is a namespace.
    """
    by_content_type = HydrusData.BuildKeyToSetDict(
        (content_type, additional_infos)
        for (name, content_type, additional_infos) in parsable_content
    )

    return by_content_type[HC.CONTENT_TYPE_MAPPINGS]
2018-04-11 22:30:40 +00:00
def GetSoup(html):
    """Parse html with BeautifulSoup using the best available backend.

    html5lib is preferred, lxml is the fallback. Raises
    HydrusExceptions.ParseException when neither library could be imported.
    """
    if HTML5LIB_IS_OK:
        return bs4.BeautifulSoup(html, 'html5lib')

    if LXML_IS_OK:
        return bs4.BeautifulSoup(html, 'lxml')

    raise HydrusExceptions.ParseException(
        'This client does not have access to either lxml or html5lib, and so it cannot parse html. Please install one of these parsing libraries and restart the client.'
    )
2018-02-07 23:40:33 +00:00
def GetTagsFromParseResults(results):
    """Build and clean the tags found in a set of parse results.

    Every result whose content type is HC.CONTENT_TYPE_MAPPINGS is combined
    with HydrusTags.CombineTag( additional_info, parsed_text ); the combined
    tags are then passed through HydrusTags.CleanTags and returned.
    """
    combined_tags = [
        HydrusTags.CombineTag(additional_info, parsed_text)
        for ((name, content_type, additional_info), parsed_text) in results
        if content_type == HC.CONTENT_TYPE_MAPPINGS
    ]

    return HydrusTags.CleanTags(combined_tags)
2018-02-07 23:40:33 +00:00
def GetTimestampFromParseResults(results, desired_timestamp_type):
    """Return the earliest parsed timestamp of the desired type, or None.

    results is an iterable of
    ( ( name, content_type, additional_info ), parsed_text ) pairs. Only
    timestamp results whose additional_info equals desired_timestamp_type are
    considered, and text that cannot be converted to an integer is ignored.
    Source timestamps are capped at thirty seconds before
    HydrusData.GetNow().
    """
    timestamps = []

    for ((name, content_type, additional_info), parsed_text) in results:
        if content_type != HC.CONTENT_TYPE_TIMESTAMP:
            continue

        timestamp_type = additional_info

        if timestamp_type != desired_timestamp_type:
            continue

        # Only conversion failures are expected here; a bare except would
        # also hide KeyboardInterrupt and unrelated programming errors.
        try:
            timestamp = int(parsed_text)
        except (TypeError, ValueError, OverflowError):
            continue

        if timestamp_type == HC.TIMESTAMP_TYPE_SOURCE:
            # Never report a source time later than thirty seconds ago.
            timestamp = min(HydrusData.GetNow() - 30, timestamp)

        timestamps.append(timestamp)

    if not timestamps:
        return None

    return min(timestamps)
def GetTitleFromAllParseResults(all_parse_results):
    """Return the highest-priority title across several result sets, or None.

    all_parse_results is an iterable of result collections, each holding
    ( ( name, content_type, additional_info ), parsed_text ) pairs. For title
    results additional_info is the priority. Candidates are compared as
    ( priority, text ) pairs, so equal priorities are decided by the text.
    """
    candidates = [
        (additional_info, parsed_text)
        for results in all_parse_results
        for ((name, content_type, additional_info), parsed_text) in results
        if content_type == HC.CONTENT_TYPE_TITLE
    ]

    if len(candidates) == 0:
        return None

    # The largest ( priority, text ) pair is what a descending sort puts first.
    (priority, title) = max(candidates)

    return title
2018-05-02 20:45:20 +00:00
def GetURLsFromParseResults(results, desired_url_types, only_get_top_priority=False):
    """Return the de-duplicated urls of the desired types from parse results.

    results is an iterable of
    ( ( name, content_type, additional_info ), parsed_text ) pairs; for url
    results additional_info is a ( url_type, priority ) pair. Urls are
    returned ordered by descending priority, keeping parse order inside each
    priority and only the first occurrence of a repeated url. When
    only_get_top_priority is true, only urls of the highest priority present
    are returned.
    """
    priority_to_urls = collections.defaultdict(list)

    for ((name, content_type, additional_info), parsed_text) in results:
        if content_type == HC.CONTENT_TYPE_URLS:
            (url_type, priority) = additional_info

            if url_type in desired_url_types:
                priority_to_urls[priority].append(parsed_text)

    # ( priority, url_list ) pairs, highest priority first. Sorting in both
    # modes keeps the output order deterministic; previously the
    # all-priorities case followed dict iteration order.
    prioritised = sorted(priority_to_urls.items(), key=lambda pair: pair[0], reverse=True)

    if only_get_top_priority:
        prioritised = prioritised[:1]

    url_list = []
    urls_seen = set()

    for (priority, urls) in prioritised:
        for url in urls:
            if url not in urls_seen:
                urls_seen.add(url)
                url_list.append(url)

    return url_list
2017-12-13 22:33:07 +00:00
2018-02-21 21:59:37 +00:00
def MakeParsedTextPretty(parsed_text):
    """Return parsed_text as unicode, or its repr() if it cannot be decoded."""
    try:
        return unicode(parsed_text)
    except UnicodeDecodeError:
        return repr(parsed_text)
2018-01-31 22:58:15 +00:00
def RenderJSONParseRule(parse_rule):
    """Describe a JSON parse rule in words.

    None means every item, an int is a zero-based index (rendered as a
    one-based ordinal), and anything else is rendered as a named entry.
    """
    if parse_rule is None:
        return 'get all items'

    if isinstance(parse_rule, int):
        ordinal = HydrusData.ConvertIntToPrettyOrdinalString(parse_rule + 1)
        return 'get the ' + ordinal + ' item'

    return 'get the "' + HydrusData.ToUnicode(parse_rule) + '" entry'
class ParseFormula(HydrusSerialisable.SerialisableBase):
    """Base class for parsing formulae.

    Subclasses implement _ParseRawContents to pull raw texts out of some data.
    Parse then tests each raw text against the string match and passes it
    through the string converter.
    """

    def __init__(self, string_match=None, string_converter=None):
        """Store the match and converter, creating defaults for None."""
        if string_match is None:
            string_match = StringMatch()

        if string_converter is None:
            string_converter = StringConverter(example_string='parsed information')

        self._string_match = string_match
        self._string_converter = string_converter

    def _GetParsePrettySeparator(self):
        """Return the text placed between results in ParsePretty."""
        return os.linesep

    def _ParseRawContents(self, parsing_context, data):
        """Return the raw texts found in data. Must be overridden."""
        raise NotImplementedError()

    def Parse(self, parsing_context, data):
        """Return the converted texts for every raw text that passes.

        A raw text whose test or conversion raises
        HydrusExceptions.ParseException is dropped.
        """
        converted = []

        for raw_text in self._ParseRawContents(parsing_context, data):
            try:
                self._string_match.Test(raw_text)
                converted.append(self._string_converter.Convert(raw_text))
            except HydrusExceptions.ParseException:
                continue

        return converted

    def ParsePretty(self, parsing_context, data):
        """Return the parse results as one display string with a header and footer."""
        body = [MakeParsedTextPretty(text) for text in self.Parse(parsing_context, data)]

        header = '*** ' + HydrusData.ToHumanInt(len(body)) + ' RESULTS BEGIN ***'
        footer = '*** RESULTS END ***'

        return self._GetParsePrettySeparator().join([header] + body + [footer])

    def ParsesSeparatedContent(self):
        """Return False; subclasses may override."""
        return False

    def ToPrettyString(self):
        """Return a one-line description. Must be overridden."""
        raise NotImplementedError()

    def ToPrettyMultilineString(self):
        """Return a multi-line description. Must be overridden."""
        raise NotImplementedError()
2016-09-07 20:01:05 +00:00
2018-01-31 22:58:15 +00:00
class ParseFormulaCompound(ParseFormula):
    """Formula that merges the results of several sub-formulae into one phrase.

    Each sub-formula produces a stream of texts. The sub phrase contains
    placeholders of the form \\1, \\2, ... which are replaced by the
    corresponding stream's text to build every output string.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_COMPOUND
    SERIALISABLE_NAME = 'Compound Parsing Formula'
    SERIALISABLE_VERSION = 1

    # A backslash followed by a number without a leading zero, e.g. \1 or \12.
    _SUB_COMPONENT_RE = re.compile(r'\\([1-9][0-9]*)')

    def __init__(self, formulae=None, sub_phrase=None, string_match=None, string_converter=None):
        """Store the sub-formulae and sub phrase.

        formulae defaults to a list holding one default ParseFormulaHTML and
        sub_phrase defaults to the single placeholder for the first formula.
        """
        ParseFormula.__init__(self, string_match, string_converter)

        if formulae is None:
            formulae = HydrusSerialisable.SerialisableList()
            formulae.append(ParseFormulaHTML())

        if sub_phrase is None:
            sub_phrase = '\\1'

        self._formulae = formulae
        self._sub_phrase = sub_phrase

    def _GetSerialisableInfo(self):
        serialisable_formulae = HydrusSerialisable.SerialisableList(self._formulae).GetSerialisableTuple()
        serialisable_string_match = self._string_match.GetSerialisableTuple()
        serialisable_string_converter = self._string_converter.GetSerialisableTuple()

        return (serialisable_formulae, self._sub_phrase, serialisable_string_match, serialisable_string_converter)

    def _InitialiseFromSerialisableInfo(self, serialisable_info):
        (serialisable_formulae, self._sub_phrase, serialisable_string_match, serialisable_string_converter) = serialisable_info

        self._formulae = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_formulae)
        self._string_match = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_string_match)
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_string_converter)

    def _RenderSubPhrase(self, streams, stream_index):
        """Return the sub phrase with its placeholders filled for one output row.

        All placeholders are replaced in a single pass, so text inserted from
        one stream is never scanned for further placeholders, and \\10 is not
        mistaken for \\1 followed by a literal 0.
        """
        num_streams = len(streams)

        def substitute(match):
            digits = match.group(1)

            # Prefer the longest run of digits that names an existing stream;
            # any remaining digits stay in the output as literal text.
            for end in range(len(digits), 0, -1):
                stream_num = int(digits[:end])

                if stream_num <= num_streams:
                    stream = streams[stream_num - 1]

                    # A stream shorter than the longest one repeats its last item.
                    return stream[min(stream_index, len(stream) - 1)] + digits[end:]

            # No such stream: leave the placeholder text untouched.
            return match.group(0)

        return self._SUB_COMPONENT_RE.sub(substitute, self._sub_phrase)

    def _ParseRawContents(self, parsing_context, data):
        """Return one substituted sub phrase per row of the longest stream.

        Returns an empty list when there are no sub-formulae or when any
        sub-formula finds nothing, since no complete string can then be made.
        """
        streams = []

        for formula in self._formulae:
            stream = formula.Parse(parsing_context, data)

            if len(stream) == 0:
                # No contents for one of the placeholders, so nothing valid can be built.
                return []

            streams.append(stream)

        if len(streams) == 0:
            # max() below would raise ValueError on an empty sequence.
            return []

        num_raw_contents_to_make = max(len(stream) for stream in streams)

        return [
            self._RenderSubPhrase(streams, stream_index)
            for stream_index in range(num_raw_contents_to_make)
        ]

    def ToPrettyString(self):
        return 'COMPOUND with ' + HydrusData.ToHumanInt(len(self._formulae)) + ' formulae.'

    def ToPrettyMultilineString(self):
        s = [formula.ToPrettyMultilineString() for formula in self._formulae]

        s.append('and substitute into ' + self._sub_phrase)

        separator = os.linesep * 2

        return '--COMPOUND--' + os.linesep * 2 + separator.join(s)

    def ToTuple(self):
        return (self._formulae, self._sub_phrase, self._string_match, self._string_converter)


HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_COMPOUND] = ParseFormulaCompound
2018-02-07 23:40:33 +00:00
class ParseFormulaContextVariable(ParseFormula):
    """Formula that reads one named variable from the parsing context."""

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_CONTEXT_VARIABLE
    SERIALISABLE_NAME = 'Context Variable Formula'
    SERIALISABLE_VERSION = 1

    def __init__(self, variable_name=None, string_match=None, string_converter=None):
        """Store the variable name, which defaults to 'url'."""
        ParseFormula.__init__(self, string_match, string_converter)

        self._variable_name = 'url' if variable_name is None else variable_name

    def _GetSerialisableInfo(self):
        return (
            self._variable_name,
            self._string_match.GetSerialisableTuple(),
            self._string_converter.GetSerialisableTuple(),
        )

    def _InitialiseFromSerialisableInfo(self, serialisable_info):
        (variable_name, serialisable_string_match, serialisable_string_converter) = serialisable_info

        self._variable_name = variable_name
        self._string_match = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_string_match)
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_string_converter)

    def _ParseRawContents(self, parsing_context, data):
        """Return a one-item list with the variable's value, or [] if it is absent."""
        if self._variable_name in parsing_context:
            return [parsing_context[self._variable_name]]

        return []

    def ToPrettyString(self):
        return 'CONTEXT VARIABLE: ' + self._variable_name

    def ToPrettyMultilineString(self):
        description = 'fetch the "' + self._variable_name + '" variable from the parsing context'

        return '--CONTEXT VARIABLE--' + os.linesep * 2 + (os.linesep * 2).join([description])

    def ToTuple(self):
        return (self._variable_name, self._string_match, self._string_converter)


HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_CONTEXT_VARIABLE] = ParseFormulaContextVariable
2017-12-13 22:33:07 +00:00
# What ParseFormulaHTML extracts from each tag it finds.
HTML_CONTENT_ATTRIBUTE = 0 # the value of a named attribute of the tag
HTML_CONTENT_STRING = 1 # the tag's first non-empty string
HTML_CONTENT_HTML = 2 # the tag itself converted to unicode text
2018-01-31 22:58:15 +00:00
class ParseFormulaHTML(ParseFormula):
    """Formula that walks an HTML document with tag rules and extracts content.

    The tag rules are applied in order, each narrowing or moving the current
    set of tags. From every resulting tag either an attribute, its string or
    its html is taken, depending on the content-to-fetch setting.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML
    SERIALISABLE_NAME = 'HTML Parsing Formula'
    SERIALISABLE_VERSION = 6

    def __init__(self, tag_rules=None, content_to_fetch=None, attribute_to_fetch=None, string_match=None, string_converter=None):
        """Store the rules and what to fetch.

        Defaults: a single default ParseRuleHTML, fetching the 'href'
        attribute.
        """
        ParseFormula.__init__(self, string_match, string_converter)

        if tag_rules is None:
            tag_rules = [ParseRuleHTML()]

        if content_to_fetch is None:
            content_to_fetch = HTML_CONTENT_ATTRIBUTE

        if attribute_to_fetch is None:
            attribute_to_fetch = 'href'

        self._tag_rules = HydrusSerialisable.SerialisableList(tag_rules)
        self._content_to_fetch = content_to_fetch
        self._attribute_to_fetch = attribute_to_fetch

    def _FindHTMLTags(self, root):
        """Apply every tag rule in turn, starting from root, and return the final tags."""
        current_tags = (root,)

        for rule in self._tag_rules:
            current_tags = rule.GetNodes(current_tags)

        return current_tags

    def _GetParsePrettySeparator(self):
        """Separate html results with a blank line, anything else with a newline."""
        if self._content_to_fetch == HTML_CONTENT_HTML:
            return os.linesep * 2

        return os.linesep

    def _GetRawContentFromTag(self, tag):
        """Return the configured content of one tag.

        Raises HydrusExceptions.ParseException when the attribute is missing
        or the content is empty.
        """
        if self._content_to_fetch == HTML_CONTENT_ATTRIBUTE:
            if not tag.has_attr(self._attribute_to_fetch):
                raise HydrusExceptions.ParseException('Attribute ' + self._attribute_to_fetch + ' not found!')

            attribute_value = tag[self._attribute_to_fetch]

            # 'class' attr returns a list because it has multiple values under html spec
            if isinstance(attribute_value, list):
                if len(attribute_value) == 0:
                    raise HydrusExceptions.ParseException('Attribute ' + self._attribute_to_fetch + ' not found!')

                result = ' '.join(attribute_value)
            else:
                result = attribute_value

        elif self._content_to_fetch == HTML_CONTENT_STRING:
            result = GetHTMLTagString(tag)

        elif self._content_to_fetch == HTML_CONTENT_HTML:
            result = unicode(tag)

        if result is None or result == '':
            raise HydrusExceptions.ParseException('Empty/No results found!')

        return result

    def _GetRawContentsFromTags(self, tags):
        """Return the content of every tag, skipping tags that raise ParseException."""
        collected = []

        for tag in tags:
            try:
                content = self._GetRawContentFromTag(tag)
            except HydrusExceptions.ParseException:
                continue
            else:
                collected.append(content)

        return collected

    def _GetSerialisableInfo(self):
        return (
            self._tag_rules.GetSerialisableTuple(),
            self._content_to_fetch,
            self._attribute_to_fetch,
            self._string_match.GetSerialisableTuple(),
            self._string_converter.GetSerialisableTuple(),
        )

    def _InitialiseFromSerialisableInfo(self, serialisable_info):
        (serialisable_tag_rules, content_to_fetch, attribute_to_fetch, serialisable_string_match, serialisable_string_converter) = serialisable_info

        self._content_to_fetch = content_to_fetch
        self._attribute_to_fetch = attribute_to_fetch
        self._tag_rules = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_tag_rules)
        self._string_match = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_string_match)
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_string_converter)

    def _ParseRawContents(self, parsing_context, data):
        """Parse data as HTML and return the raw content of the matching tags.

        Any failure to obtain the soup is re-raised as
        HydrusExceptions.ParseException.
        """
        try:
            root = HG.client_controller.parsing_cache.GetSoup(data)
        except Exception as e:
            raise HydrusExceptions.ParseException('Unable to parse that HTML: ' + HydrusData.ToUnicode(e))

        return self._GetRawContentsFromTags(self._FindHTMLTags(root))

    def _UpdateSerialisableInfo(self, version, old_serialisable_info):
        """Upgrade serialised info by one version and return ( new_version, new_info )."""
        if version == 1:
            # v1 -> v2: add an empty culling/adding tuple
            (tag_rules, attribute_to_fetch) = old_serialisable_info

            return (2, (tag_rules, attribute_to_fetch, (0, 0, '', '')))

        if version == 2:
            # v2 -> v3: culling/adding becomes a string converter
            (tag_rules, attribute_to_fetch, (cull_front, cull_back, prepend, append)) = old_serialisable_info

            transformations = []

            if cull_front > 0:
                transformations.append((STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING, cull_front))
            elif cull_front < 0:
                transformations.append((STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END, cull_front))

            if cull_back > 0:
                transformations.append((STRING_TRANSFORMATION_CLIP_TEXT_FROM_END, cull_back))
            elif cull_back < 0:
                transformations.append((STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING, cull_back))

            if prepend != '':
                transformations.append((STRING_TRANSFORMATION_PREPEND_TEXT, prepend))

            if append != '':
                transformations.append((STRING_TRANSFORMATION_APPEND_TEXT, append))

            converter = StringConverter(transformations, 'parsed information')

            return (3, (tag_rules, attribute_to_fetch, converter.GetSerialisableTuple()))

        if version == 3:
            # v3 -> v4: add a default string match
            (tag_rules, attribute_to_fetch, serialisable_string_converter) = old_serialisable_info

            serialisable_string_match = StringMatch().GetSerialisableTuple()

            return (4, (tag_rules, attribute_to_fetch, serialisable_string_match, serialisable_string_converter))

        if version == 4:
            # v4 -> v5: a None attribute used to mean 'fetch the string'
            (tag_rules, attribute_to_fetch, serialisable_string_match, serialisable_string_converter) = old_serialisable_info

            if attribute_to_fetch is None:
                content_to_fetch = HTML_CONTENT_STRING
                attribute_to_fetch = ''
            else:
                content_to_fetch = HTML_CONTENT_ATTRIBUTE

            return (5, (tag_rules, content_to_fetch, attribute_to_fetch, serialisable_string_match, serialisable_string_converter))

        if version == 5:
            # v5 -> v6: ( name, attrs, index ) tuples become descending ParseRuleHTML objects
            (tag_rules, content_to_fetch, attribute_to_fetch, serialisable_string_match, serialisable_string_converter) = old_serialisable_info

            upgraded_rules = HydrusSerialisable.SerialisableList()

            for (name, attrs, index) in tag_rules:
                upgraded_rules.append(ParseRuleHTML(rule_type=HTML_RULE_TYPE_DESCENDING, tag_name=name, tag_attributes=attrs, tag_index=index))

            return (6, (upgraded_rules.GetSerialisableTuple(), content_to_fetch, attribute_to_fetch, serialisable_string_match, serialisable_string_converter))

    def ParsesSeparatedContent(self):
        """Return True when this formula fetches html."""
        return self._content_to_fetch == HTML_CONTENT_HTML

    def ToPrettyString(self):
        return 'HTML with ' + HydrusData.ToHumanInt(len(self._tag_rules)) + ' tag rules.'

    def ToPrettyMultilineString(self):
        steps = [rule.ToString() for rule in self._tag_rules]

        if self._content_to_fetch == HTML_CONTENT_ATTRIBUTE:
            steps.append('get the ' + self._attribute_to_fetch + ' attribute of those tags')
        elif self._content_to_fetch == HTML_CONTENT_STRING:
            steps.append('get the text content of those tags')
        elif self._content_to_fetch == HTML_CONTENT_HTML:
            steps.append('get the html of those tags')

        steps.extend(self._string_converter.GetTransformationStrings())

        joiner = os.linesep + 'and then '

        return '--HTML--' + os.linesep + joiner.join(steps)

    def ToTuple(self):
        return (self._tag_rules, self._content_to_fetch, self._attribute_to_fetch, self._string_match, self._string_converter)


HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML] = ParseFormulaHTML
2018-04-11 22:30:40 +00:00
# Direction in which a ParseRuleHTML moves from each node in GetNodes.
HTML_RULE_TYPE_DESCENDING = 0 # search beneath the node with find_all
HTML_RULE_TYPE_ASCENDING = 1 # walk up through the node's parents
class ParseRuleHTML( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_RULE_HTML
SERIALISABLE_NAME = 'HTML Parsing Rule'
2018-05-30 20:13:21 +00:00
SERIALISABLE_VERSION = 2
2018-04-11 22:30:40 +00:00
2018-05-30 20:13:21 +00:00
def __init__(self, rule_type=None, tag_name=None, tag_attributes=None, tag_index=None, tag_depth=None, should_test_tag_string=False, tag_string_string_match=None):
    """Store the rule settings, filling in defaults for anything left as None.

    The rule type defaults to descending and the tag name to 'a'. A
    descending rule gets empty tag attributes by default, an ascending rule a
    depth of 1. The tag string match defaults to a fresh StringMatch.
    """
    HydrusSerialisable.SerialisableBase.__init__(self)

    if rule_type is None:
        rule_type = HTML_RULE_TYPE_DESCENDING

    if tag_name is None:
        tag_name = 'a'

    if rule_type == HTML_RULE_TYPE_DESCENDING and tag_attributes is None:
        tag_attributes = {}
    elif rule_type == HTML_RULE_TYPE_ASCENDING and tag_depth is None:
        tag_depth = 1

    if tag_string_string_match is None:
        tag_string_string_match = StringMatch()

    self._rule_type = rule_type
    self._tag_name = tag_name
    self._tag_attributes = tag_attributes
    self._tag_index = tag_index
    self._tag_depth = tag_depth
    self._should_test_tag_string = should_test_tag_string
    self._tag_string_string_match = tag_string_string_match
2018-04-11 22:30:40 +00:00
def _GetSerialisableInfo(self):
    """Return the rule's settings as a tuple, with the string match serialised."""
    return (
        self._rule_type,
        self._tag_name,
        self._tag_attributes,
        self._tag_index,
        self._tag_depth,
        self._should_test_tag_string,
        self._tag_string_string_match.GetSerialisableTuple(),
    )
2018-04-11 22:30:40 +00:00
def _InitialiseFromSerialisableInfo(self, serialisable_info):
    """Restore the rule's settings from the tuple made by _GetSerialisableInfo."""
    (rule_type, tag_name, tag_attributes, tag_index, tag_depth, should_test_tag_string, serialisable_tag_string_string_match) = serialisable_info

    self._rule_type = rule_type
    self._tag_name = tag_name
    self._tag_attributes = tag_attributes
    self._tag_index = tag_index
    self._tag_depth = tag_depth
    self._should_test_tag_string = should_test_tag_string
    self._tag_string_string_match = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_tag_string_string_match)
def _UpdateSerialisableInfo(self, version, old_serialisable_info):
    """Upgrade version 1 info to version 2 by adding the tag string test fields.

    The added fields are a disabled test flag and a default StringMatch.
    """
    if version == 1:
        (rule_type, tag_name, tag_attributes, tag_index, tag_depth) = old_serialisable_info

        default_match = StringMatch().GetSerialisableTuple()

        return (2, (rule_type, tag_name, tag_attributes, tag_index, tag_depth, False, default_match))
2018-04-11 22:30:40 +00:00
def GetNodes( self, nodes ):
    """Apply this rule to each of the given nodes and return the nodes it selects.

    A descending rule searches each node with find_all (by attributes, and by
    tag name when one is set) and, when a tag index is set, keeps only the
    result at that index. An ascending rule walks up through each node's
    parents, counting those that match the tag name (or every parent when no
    name is set), and keeps the one at which the count reaches the tag depth.
    When the tag-string test is enabled, the selected nodes are finally
    filtered by whether their tag string matches the StringMatch.
    """

    selected_nodes = []

    for node in nodes:

        if self._rule_type == HTML_RULE_TYPE_DESCENDING:

            search_kwargs = { 'attrs' : self._tag_attributes }

            if self._tag_name is not None:

                search_kwargs[ 'name' ] = self._tag_name

            found_nodes = node.find_all( **search_kwargs )

            if self._tag_index is not None:

                if self._tag_index + 1 <= len( found_nodes ):

                    found_nodes = [ found_nodes[ self._tag_index ] ]

                else:

                    found_nodes = []

        elif self._rule_type == HTML_RULE_TYPE_ASCENDING:

            found_nodes = []

            num_found = 0

            ancestor = node.parent

            # if we go one above html, we get the BS document itself, which is not a Tag, so the walk stops there
            while isinstance( ancestor, bs4.element.Tag ):

                if self._tag_name is None or ancestor.name == self._tag_name:

                    num_found += 1

                if num_found == self._tag_depth:

                    found_nodes = [ ancestor ]

                    break

                ancestor = ancestor.parent

        selected_nodes.extend( found_nodes )

    if not self._should_test_tag_string:

        return selected_nodes

    return [ candidate for candidate in selected_nodes if self._tag_string_string_match.Matches( GetHTMLTagString( candidate ) ) ]
def ToString( self ):
    """Return a short English description of what this rule does."""

    if self._rule_type == HTML_RULE_TYPE_DESCENDING:

        if self._tag_index is None:

            which_text = ' every'

        else:

            # the stored index is zero-based; the description counts from one
            which_text = ' the ' + HydrusData.ConvertIntToPrettyOrdinalString( self._tag_index + 1 )

        parts = [ 'search descendents for', which_text ]

        if self._tag_name is not None:

            parts.append( ' <' + self._tag_name + '>' )

        parts.append( ' tag' )

        if len( self._tag_attributes ) > 0:

            attributes_text = ', '.join( key + '=' + value for ( key, value ) in self._tag_attributes.items() )

            parts.append( ' with attributes ' + attributes_text )

    elif self._rule_type == HTML_RULE_TYPE_ASCENDING:

        parts = [ 'walk back up ancestors' ]

        if self._tag_name is None:

            parts.append( ' ' + HydrusData.ToHumanInt( self._tag_depth ) + ' tag levels' )

        else:

            parts.append( ' to the ' + HydrusData.ConvertIntToPrettyOrdinalString( self._tag_depth ) + ' <' + self._tag_name + '> tag' )

    if self._should_test_tag_string:

        parts.append( ' with strings that match ' + self._tag_string_string_match.ToUnicode() )

    return ''.join( parts )
def ToTuple( self ):
    """Return the rule's fields as a tuple, with the live StringMatch object (not its serialised form) last."""

    rule_fields = (
        self._rule_type,
        self._tag_name,
        self._tag_attributes,
        self._tag_index,
        self._tag_depth,
        self._should_test_tag_string,
        self._tag_string_string_match
    )

    return rule_fields
2018-04-11 22:30:40 +00:00
# register ParseRuleHTML under its serialisable type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_RULE_HTML ] = ParseRuleHTML

# what a JSON formula returns for each node it lands on: the value as a string, or the node re-dumped as JSON
( JSON_CONTENT_STRING, JSON_CONTENT_JSON ) = ( 0, 1 )
class ParseFormulaJSON( ParseFormula ):
    """Parse formula that descends through decoded JSON and returns what it lands on.

    Each parse rule narrows the current set of JSON nodes:

    - None expands every list node into its items
    - an int picks the item at that index from every list node
    - anything else is used as a key into every dict node

    Nodes of the wrong container type, or lacking the requested index/key, are
    dropped silently. The surviving nodes are returned either as strings or as
    JSON text, depending on content_to_fetch.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_JSON
    SERIALISABLE_NAME = 'JSON Parsing Formula'
    SERIALISABLE_VERSION = 1

    def __init__( self, parse_rules = None, content_to_fetch = None, string_match = None, string_converter = None ):
        """Create the formula.

        parse_rules defaults to [ 'posts' ] and content_to_fetch to
        JSON_CONTENT_STRING; string_match and string_converter are handed to
        ParseFormula.__init__ as given.
        """

        ParseFormula.__init__( self, string_match, string_converter )

        if parse_rules is None:

            parse_rules = [ 'posts' ]

        if content_to_fetch is None:

            content_to_fetch = JSON_CONTENT_STRING

        self._parse_rules = parse_rules
        self._content_to_fetch = content_to_fetch

    def _GetParsePrettySeparator( self ):
        """Return the separator for pretty output: a double line break for JSON content, a single one otherwise."""

        if self._content_to_fetch == JSON_CONTENT_JSON:

            return os.linesep * 2

        else:

            return os.linesep

    def _GetRawContentsFromJSON( self, j ):
        """Walk the decoded JSON object j with the parse rules and return a list of raw content strings."""

        roots = ( j, )

        for parse_rule in self._parse_rules:

            next_roots = []

            for root in roots:

                if parse_rule is None:

                    if not isinstance( root, list ):

                        continue

                    next_roots.extend( root )

                elif isinstance( parse_rule, int ):

                    if not isinstance( root, list ):

                        continue

                    # a plain length check misses negative indices, so an index the list does not have is
                    # caught here and the node is skipped, for negative and non-negative indices alike
                    try:

                        next_roots.append( root[ parse_rule ] )

                    except IndexError:

                        continue

                else:

                    if not isinstance( root, dict ):

                        continue

                    if parse_rule not in root:

                        continue

                    next_roots.append( root[ parse_rule ] )

            roots = next_roots

        raw_contents = []

        for root in roots:

            if self._content_to_fetch == JSON_CONTENT_STRING:

                # containers have no single string value, so they are skipped in string mode
                if isinstance( root, ( list, dict ) ):

                    continue

                raw_content = HydrusData.ToUnicode( root )

            elif self._content_to_fetch == JSON_CONTENT_JSON:

                raw_content = json.dumps( root )

            raw_contents.append( raw_content )

        return raw_contents

    def _GetSerialisableInfo( self ):
        """Return the formula's state with the string match and converter replaced by their serialisable tuples."""

        serialisable_string_match = self._string_match.GetSerialisableTuple()
        serialisable_string_converter = self._string_converter.GetSerialisableTuple()

        return ( self._parse_rules, self._content_to_fetch, serialisable_string_match, serialisable_string_converter )

    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the formula's state from the tuple produced by _GetSerialisableInfo."""

        ( self._parse_rules, self._content_to_fetch, serialisable_string_match, serialisable_string_converter ) = serialisable_info

        self._string_match = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_match )
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_converter )

    def _ParseRawContents( self, parsing_context, data ):
        """Decode data as JSON via the client's parsing cache and return the raw contents the rules select.

        Raises HydrusExceptions.ParseException if the JSON cannot be obtained.
        """

        try:

            j = HG.client_controller.parsing_cache.GetJSON( data )

        except Exception as e:

            # any failure to get the JSON is reported to callers as a parse problem
            raise HydrusExceptions.ParseException( 'Unable to parse that JSON: ' + HydrusData.ToUnicode( e ) )

        raw_contents = self._GetRawContentsFromJSON( j )

        return raw_contents

    def ParsesSeparatedContent( self ):
        """Return True when this formula fetches JSON rather than strings."""

        return self._content_to_fetch == JSON_CONTENT_JSON

    def ToPrettyString( self ):
        """Return a one-line summary stating how many parse rules the formula has."""

        return 'JSON with ' + HydrusData.ToHumanInt( len( self._parse_rules ) ) + ' parse rules.'

    def ToPrettyMultilineString( self ):
        """Return a multi-line description: each parse rule, the fetch mode, then the string converter's steps."""

        pretty_strings = [ RenderJSONParseRule( p_r ) for p_r in self._parse_rules ]

        if self._content_to_fetch == JSON_CONTENT_STRING:

            pretty_strings.append( 'get final data content, converting to strings as needed' )

        elif self._content_to_fetch == JSON_CONTENT_JSON:

            pretty_strings.append( 'get the json beneath' )

        pretty_strings.extend( self._string_converter.GetTransformationStrings() )

        separator = os.linesep + 'and then '

        pretty_multiline_string = '--JSON--' + os.linesep + separator.join( pretty_strings )

        return pretty_multiline_string

    def ToTuple( self ):
        """Return ( parse_rules, content_to_fetch, string_match, string_converter ) with live objects."""

        return ( self._parse_rules, self._content_to_fetch, self._string_match, self._string_converter )

# register ParseFormulaJSON under its serialisable type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_JSON ] = ParseFormulaJSON
2016-09-21 19:54:04 +00:00
2018-04-11 22:30:40 +00:00
class SimpleDownloaderParsingFormula( HydrusSerialisable.SerialisableBaseNamed ):
    """A named, serialisable wrapper around a single parse formula."""

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_PARSE_FORMULA
    SERIALISABLE_NAME = 'Simple Downloader Parsing Formula'
    SERIALISABLE_VERSION = 1

    def __init__( self, name = None, formula = None ):
        """Create the wrapper; name defaults to 'new parsing formula' and formula to a fresh ParseFormulaHTML."""

        chosen_name = 'new parsing formula' if name is None else name
        chosen_formula = ParseFormulaHTML() if formula is None else formula

        HydrusSerialisable.SerialisableBaseNamed.__init__( self, chosen_name )

        self._formula = chosen_formula

    def _GetSerialisableInfo( self ):
        """Return the wrapped formula's serialisable tuple; it is the whole of this object's serialised info."""

        return self._formula.GetSerialisableTuple()

    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Rebuild the wrapped formula from its serialisable tuple."""

        self._formula = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )

    def GetFormula( self ):
        """Return the wrapped formula object."""

        return self._formula

# register SimpleDownloaderParsingFormula under its serialisable type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_PARSE_FORMULA ] = SimpleDownloaderParsingFormula
2017-12-13 22:33:07 +00:00
class ContentParser( HydrusSerialisable.SerialisableBase ):
    """Runs one formula over some data and labels what it finds as a particular type of content.

    Each result is a ( ( name, content_type, additional_info ), parsed_text )
    pair. For veto content the parser yields no results and instead raises
    VetoException when its veto condition is met.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_CONTENT_PARSER
    SERIALISABLE_NAME = 'Content Parser'
    SERIALISABLE_VERSION = 3

    def __init__( self, name = None, content_type = None, formula = None, additional_info = None ):
        """Create the parser.

        Defaults: empty name, HC.CONTENT_TYPE_MAPPINGS content, a fresh
        ParseFormulaHTML, and (for mappings content only) an empty-string
        additional_info.
        """

        if name is None:

            name = ''

        if content_type is None:

            content_type = HC.CONTENT_TYPE_MAPPINGS

        if formula is None:

            formula = ParseFormulaHTML()

        if additional_info is None:

            if content_type == HC.CONTENT_TYPE_MAPPINGS:

                additional_info = ''

        self._name = name
        self._content_type = content_type
        self._formula = formula
        self._additional_info = additional_info

    def _GetSerialisableInfo( self ):
        """Return the parser's state in serialisable form."""

        serialisable_formula = self._formula.GetSerialisableTuple()

        if self._content_type == HC.CONTENT_TYPE_VETO:

            # veto additional_info holds a live StringMatch, which needs serialising itself
            ( veto_if_matches_found, string_match ) = self._additional_info

            serialisable_additional_info = ( veto_if_matches_found, string_match.GetSerialisableTuple() )

        else:

            serialisable_additional_info = self._additional_info

        return ( self._name, self._content_type, serialisable_formula, serialisable_additional_info )

    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the parser's state from the tuple produced by _GetSerialisableInfo."""

        ( self._name, self._content_type, serialisable_formula, serialisable_additional_info ) = serialisable_info

        if self._content_type == HC.CONTENT_TYPE_VETO:

            ( veto_if_matches_found, serialisable_string_match ) = serialisable_additional_info

            string_match = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_match )

            self._additional_info = ( veto_if_matches_found, string_match )

        else:

            self._additional_info = serialisable_additional_info

            # GetParsableContent puts additional_info inside a set member, so a list is converted to a hashable tuple
            if isinstance( self._additional_info, list ):

                self._additional_info = tuple( self._additional_info )

        self._formula = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formula )

    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Upgrade serialised info by one version and return ( new_version, new_info ).

        1 -> 2: veto additional_info changes from ( veto_if_matches_found,
        match_if_text_present, search_text ) to ( veto_if_matches_found,
        serialisable StringMatch ).
        2 -> 3: url types are remapped: FILE -> DESIRED, POST -> SOURCE,
        anything else -> NEXT.
        """

        if version == 1:

            ( name, content_type, serialisable_formula, additional_info ) = old_serialisable_info

            if content_type == HC.CONTENT_TYPE_VETO:

                ( veto_if_matches_found, match_if_text_present, search_text ) = additional_info

                if match_if_text_present:

                    string_match = StringMatch( match_type = STRING_MATCH_REGEX, match_value = search_text, example_string = search_text )

                else:

                    string_match = StringMatch()

                serialisable_string_match = string_match.GetSerialisableTuple()

                additional_info = ( veto_if_matches_found, serialisable_string_match )

            new_serialisable_info = ( name, content_type, serialisable_formula, additional_info )

            return ( 2, new_serialisable_info )

        if version == 2:

            ( name, content_type, serialisable_formula, additional_info ) = old_serialisable_info

            if content_type == HC.CONTENT_TYPE_URLS:

                ( url_type, priority ) = additional_info

                if url_type == HC.URL_TYPE_FILE:

                    url_type = HC.URL_TYPE_DESIRED

                elif url_type == HC.URL_TYPE_POST:

                    url_type = HC.URL_TYPE_SOURCE

                else:

                    url_type = HC.URL_TYPE_NEXT

                additional_info = ( url_type, priority )

            new_serialisable_info = ( name, content_type, serialisable_formula, additional_info )

            return ( 3, new_serialisable_info )

    def GetName( self ):
        """Return this parser's name."""

        return self._name

    def GetParsableContent( self ):
        """Return a one-element set holding this parser's ( name, content_type, additional_info ) description."""

        return { ( self._name, self._content_type, self._additional_info ) }

    def Parse( self, parsing_context, data ):
        """Run the formula over data and return a list of ( content_description, parsed_text ) pairs.

        URL content is joined against parsing_context[ 'url' ] when that key is
        present. Veto content returns an empty list, or raises
        HydrusExceptions.VetoException( name ) when the veto condition is met.
        A ParseException from the formula is re-raised with a
        'Content Parser <name>: ' prefix.
        """

        try:

            parsed_texts = self._formula.Parse( parsing_context, data )

        except HydrusExceptions.ParseException as e:

            prefix = 'Content Parser ' + self._name + ': '

            raise HydrusExceptions.ParseException( prefix + HydrusData.ToUnicode( e ) )

        if self._content_type == HC.CONTENT_TYPE_URLS:

            if 'url' in parsing_context:

                base_url = parsing_context[ 'url' ]

                parsed_texts = [ urlparse.urljoin( base_url, parsed_text ) for parsed_text in parsed_texts ]

        if self._content_type == HC.CONTENT_TYPE_VETO:

            ( veto_if_matches_found, string_match ) = self._additional_info

            match_found = True in ( string_match.Matches( parsed_text ) for parsed_text in parsed_texts )

            # the veto fires either on a match or on the absence of one, depending on how it is configured
            veto_if_missing = not veto_if_matches_found

            do_veto = ( veto_if_matches_found and match_found ) or ( veto_if_missing and not match_found )

            if do_veto:

                raise HydrusExceptions.VetoException( self._name )

            else:

                return []

        else:

            content_description = ( self._name, self._content_type, self._additional_info )

            return [ ( content_description, parsed_text ) for parsed_text in parsed_texts ]

    def ParsePretty( self, parsing_context, data ):
        """Run Parse and return the results as human-readable text between BEGIN/END marker lines.

        A veto is reported as a single 'veto: ...' result line. A ParseException
        from Parse propagates to the caller.
        """

        # Parse has already prefixed any ParseException with 'Content Parser <name>: ', so it is left to
        # propagate as-is; wrapping it again here would repeat the prefix in the message
        try:

            parse_results = self.Parse( parsing_context, data )

            results = [ ConvertParseResultToPrettyString( parse_result ) for parse_result in parse_results ]

        except HydrusExceptions.VetoException as e:

            results = [ 'veto: ' + HydrusData.ToUnicode( e ) ]

        result_lines = [ '*** ' + HydrusData.ToHumanInt( len( results ) ) + ' RESULTS BEGIN ***' ]

        result_lines.extend( results )

        result_lines.append( '*** RESULTS END ***' )

        results_text = os.linesep.join( result_lines )

        return results_text

    def SetName( self, name ):
        """Set this parser's name."""

        self._name = name

    def ToPrettyStrings( self ):
        """Return ( name, 'content', pretty description of the parsable content, vetoes included )."""

        return ( self._name, 'content', ConvertParsableContentToPrettyString( self.GetParsableContent(), include_veto = True ) )

    def ToTuple( self ):
        """Return ( name, content_type, formula, additional_info ) with live objects."""

        return ( self._name, self._content_type, self._formula, self._additional_info )

# register ContentParser under its serialisable type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_CONTENT_PARSER ] = ContentParser
class PageParser( HydrusSerialisable.SerialisableBaseNamed ):
    """Parses a whole page with a set of content parsers and, optionally, nested sub page parsers.

    Each sub page parser is a ( formula, page_parser ) pair: the formula splits
    the page into posts, and the paired page parser is run on every post.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PAGE_PARSER
    SERIALISABLE_NAME = 'Page Parser'
    SERIALISABLE_VERSION = 2

    def __init__( self, name, parser_key = None, string_converter = None, sub_page_parsers = None, content_parsers = None, example_urls = None, example_parsing_context = None ):
        """Create the parser.

        Defaults: a freshly generated parser key, a default StringConverter,
        empty lists for sub page parsers, content parsers and example urls, and
        an example parsing context holding a placeholder 'url'.
        """

        if parser_key is None:

            parser_key = HydrusData.GenerateKey()

        if string_converter is None:

            string_converter = StringConverter()

        if sub_page_parsers is None:

            sub_page_parsers = []

        if content_parsers is None:

            content_parsers = []

        if example_urls is None:

            example_urls = []

        if example_parsing_context is None:

            example_parsing_context = {}

            example_parsing_context[ 'url' ] = 'http://example.com/posts/index.php?id=123456'

        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )

        self._parser_key = parser_key
        self._string_converter = string_converter
        self._sub_page_parsers = sub_page_parsers
        self._content_parsers = content_parsers
        self._example_urls = example_urls
        self._example_parsing_context = example_parsing_context

    def _GetSerialisableInfo( self ):
        """Return the parser's state in serialisable form; the parser key is hex-encoded."""

        serialisable_parser_key = self._parser_key.encode( 'hex' )
        serialisable_string_converter = self._string_converter.GetSerialisableTuple()

        serialisable_sub_page_parsers = [ ( formula.GetSerialisableTuple(), page_parser.GetSerialisableTuple() ) for ( formula, page_parser ) in self._sub_page_parsers ]

        serialisable_content_parsers = HydrusSerialisable.SerialisableList( self._content_parsers ).GetSerialisableTuple()

        return ( self._name, serialisable_parser_key, serialisable_string_converter, serialisable_sub_page_parsers, serialisable_content_parsers, self._example_urls, self._example_parsing_context )

    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the parser's state from the tuple produced by _GetSerialisableInfo."""

        ( self._name, serialisable_parser_key, serialisable_string_converter, serialisable_sub_page_parsers, serialisable_content_parsers, self._example_urls, self._example_parsing_context ) = serialisable_info

        self._parser_key = serialisable_parser_key.decode( 'hex' )
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_converter )

        self._sub_page_parsers = [ ( HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formula ), HydrusSerialisable.CreateFromSerialisableTuple( serialisable_page_parser ) ) for ( serialisable_formula, serialisable_page_parser ) in serialisable_sub_page_parsers ]

        self._content_parsers = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_content_parsers )

    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Upgrade serialised info by one version; 1 -> 2 appends an example parsing context with a placeholder 'url'."""

        if version == 1:

            ( name, serialisable_parser_key, serialisable_string_converter, serialisable_sub_page_parsers, serialisable_content_parsers, example_urls ) = old_serialisable_info

            example_parsing_context = {}

            example_parsing_context[ 'url' ] = 'http://example.com/posts/index.php?id=123456'

            new_serialisable_info = ( name, serialisable_parser_key, serialisable_string_converter, serialisable_sub_page_parsers, serialisable_content_parsers, example_urls, example_parsing_context )

            return ( 2, new_serialisable_info )

    def GetContentParsers( self ):
        """Return ( sub_page_parsers, content_parsers )."""

        return ( self._sub_page_parsers, self._content_parsers )

    def GetExampleParsingContext( self ):
        """Return the example parsing context dict."""

        return self._example_parsing_context

    def GetExampleURLs( self ):
        """Return the list of example urls."""

        return self._example_urls

    def GetNamespaces( self ):
        """Return the namespaces derived from everything this parser can produce."""

        # this in future could expand to be more granular like:
        # 'I want the artist tags, but not the user-submitted.'
        # 'I want the title here, but not the title there.'
        # 'I want the original filename, but not the UNIX timestamp filename.'
        # which the parser could present with its sub-parsing element names

        return GetNamespacesFromParsableContent( self.GetParsableContent() )

    def GetParsableContent( self ):
        """Return the union of the parsable content of all sub page parsers and all content parsers."""

        parsable_content = set()

        for ( formula, page_parser ) in self._sub_page_parsers:

            parsable_content.update( page_parser.GetParsableContent() )

        for content_parser in self._content_parsers:

            parsable_content.update( content_parser.GetParsableContent() )

        return parsable_content

    def GetParserKey( self ):
        """Return this parser's key."""

        return self._parser_key

    def GetStringConverter( self ):
        """Return the StringConverter applied to page data before parsing."""

        return self._string_converter

    def Parse( self, parsing_context, page_data ):
        """Parse page_data and return a list of result groups, each a list of parse results.

        With no sub page parsers there is at most one group, holding the
        whole-page results. With sub page parsers there is one group per group
        returned by a sub page parser for each post, and the whole-page results
        are appended to every such group. Sub page parsers are run in order of
        their page parser's name. A post whose parsing raises VetoException is
        skipped; a veto from this parser's own content parsers propagates.

        Raises HydrusExceptions.ParseException, prefixed with
        'Page Parser <name>: ' for failures in parsing.
        """

        page_data = HydrusData.ToUnicode( page_data )

        try:

            converted_page_data = self._string_converter.Convert( page_data )

        except HydrusExceptions.StringConvertException as e:

            raise HydrusExceptions.ParseException( HydrusData.ToUnicode( e ) )

        except HydrusExceptions.ParseException as e:

            prefix = 'Page Parser ' + self._name + ': '

            raise HydrusExceptions.ParseException( prefix + HydrusData.ToUnicode( e ) )

        #

        whole_page_parse_results = []

        try:

            for content_parser in self._content_parsers:

                whole_page_parse_results.extend( content_parser.Parse( parsing_context, converted_page_data ) )

        except HydrusExceptions.ParseException as e:

            prefix = 'Page Parser ' + self._name + ': '

            raise HydrusExceptions.ParseException( prefix + HydrusData.ToUnicode( e ) )

        #

        all_parse_results = []

        if len( self._sub_page_parsers ) == 0:

            if len( whole_page_parse_results ) > 0:

                all_parse_results = [ whole_page_parse_results ]

        else:

            def sort_key( sub_page_parser ):

                ( formula, page_parser ) = sub_page_parser

                return page_parser.GetName()

            # iterate the name-sorted copy, so result order does not depend on the stored order of the pairs
            sub_page_parsers = sorted( self._sub_page_parsers, key = sort_key )

            try:

                for ( formula, page_parser ) in sub_page_parsers:

                    posts = formula.Parse( parsing_context, converted_page_data )

                    for post in posts:

                        try:

                            page_parser_all_parse_results = page_parser.Parse( parsing_context, post )

                        except HydrusExceptions.VetoException:

                            # a vetoed post is simply left out of the results
                            continue

                        for page_parser_parse_results in page_parser_all_parse_results:

                            page_parser_parse_results.extend( whole_page_parse_results )

                            all_parse_results.append( page_parser_parse_results )

            except HydrusExceptions.ParseException as e:

                prefix = 'Page Parser ' + self._name + ': '

                raise HydrusExceptions.ParseException( prefix + HydrusData.ToUnicode( e ) )

        return all_parse_results

    def ParsePretty( self, parsing_context, page_data ):
        """Run Parse and return the result groups as human-readable text between BEGIN/END marker lines.

        A veto is reported as a single 'veto: ...' entry and counted as one result.
        """

        try:

            all_parse_results = self.Parse( parsing_context, page_data )

            pretty_groups_of_parse_results = [ os.linesep.join( [ ConvertParseResultToPrettyString( parse_result ) for parse_result in parse_results ] ) for parse_results in all_parse_results ]

            group_separator = os.linesep * 2 + '*** SEPARATE FILE RESULTS BREAK ***' + os.linesep * 2

            pretty_parse_result_text = group_separator.join( pretty_groups_of_parse_results )

        except HydrusExceptions.VetoException as e:

            # a one-element placeholder so that the header line below reports one result
            all_parse_results = [ 1 ]

            pretty_parse_result_text = 'veto: ' + HydrusData.ToUnicode( e )

        result_lines = []

        result_lines.append( '*** ' + HydrusData.ToHumanInt( len( all_parse_results ) ) + ' RESULTS BEGIN ***' + os.linesep )

        result_lines.append( pretty_parse_result_text )

        result_lines.append( os.linesep + '*** RESULTS END ***' )

        results_text = os.linesep.join( result_lines )

        return results_text

    def RegenerateParserKey( self ):
        """Replace the parser key with a freshly generated one."""

        self._parser_key = HydrusData.GenerateKey()

# register PageParser under its serialisable type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PAGE_PARSER ] = PageParser
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
class ParseNodeContentLink( HydrusSerialisable.SerialisableBase ):
    """Parse node that extracts urls from some data, fetches each one, and hands the fetched data to its children."""

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK
    SERIALISABLE_NAME = 'Content Parsing Link'
    SERIALISABLE_VERSION = 1

    def __init__( self, name = None, formula = None, children = None ):
        """Create the node; defaults are an empty name, a fresh ParseFormulaHTML and no children."""

        if name is None:

            name = ''

        if formula is None:

            formula = ParseFormulaHTML()

        if children is None:

            children = []

        self._name = name
        self._formula = formula
        self._children = children

    def _GetSerialisableInfo( self ):
        """Return ( name, serialisable formula, list of serialisable children )."""

        serialisable_formula = self._formula.GetSerialisableTuple()
        serialisable_children = [ child.GetSerialisableTuple() for child in self._children ]

        return ( self._name, serialisable_formula, serialisable_children )

    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the node's state from the tuple produced by _GetSerialisableInfo."""

        ( self._name, serialisable_formula, serialisable_children ) = serialisable_info

        self._formula = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formula )
        self._children = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_child ) for serialisable_child in serialisable_children ]

    def GetParsableContent( self ):
        """Return the union of the parsable content of all children."""

        children_parsable_content = set()

        for child in self._children:

            children_parsable_content.update( child.GetParsableContent() )

        return children_parsable_content

    def Parse( self, job_key, data, referral_url ):
        """Fetch every url parsed from data and return the combined content the children parse from the responses.

        Progress is reported through job_key's 'script_status' variable. A url
        that fails with a network error is reported, then skipped after a
        two second pause. If the network job is cancelled while waiting, the
        loop stops and the content gathered so far is returned. Raises
        HydrusExceptions.CancelledException if job_key reports cancellation
        after a url has been processed.
        """

        search_urls = self.ParseURLs( job_key, data, referral_url )

        content = []

        for search_url in search_urls:

            job_key.SetVariable( 'script_status', 'fetching ' + search_url )

            network_job = ClientNetworkingJobs.NetworkJob( 'GET', search_url, referral_url = referral_url )

            network_job.OverrideBandwidth()

            HG.client_controller.network_engine.AddJob( network_job )

            try:

                network_job.WaitUntilDone()

            except HydrusExceptions.CancelledException:

                break

            except HydrusExceptions.NetworkException as e:

                # everything caught here is a NetworkException, so only the not-found case needs telling apart
                if isinstance( e, HydrusExceptions.NotFoundException ):

                    job_key.SetVariable( 'script_status', '404 - nothing found' )

                else:

                    job_key.SetVariable( 'script_status', 'Network error! Details written to log.' )

                    HydrusData.Print( 'Problem fetching ' + search_url + ':' )
                    HydrusData.PrintException( e )

                # pause after setting the status before moving on to the next url
                time.sleep( 2 )

                continue

            linked_data = network_job.GetContent()

            children_content = GetChildrenContent( job_key, self._children, linked_data, search_url )

            content.extend( children_content )

            if job_key.IsCancelled():

                raise HydrusExceptions.CancelledException()

        return content

    def ParseURLs( self, job_key, data, referral_url ):
        """Parse urls from data, make them absolute against referral_url, record each on job_key, and return them."""

        basic_urls = self._formula.Parse( {}, data )

        absolute_urls = [ urlparse.urljoin( referral_url, basic_url ) for basic_url in basic_urls ]

        for url in absolute_urls:

            job_key.AddURL( url )

        return absolute_urls

    def ToPrettyStrings( self ):
        """Return ( name, 'link', pretty description of the children's parsable content )."""

        return ( self._name, 'link', ConvertParsableContentToPrettyString( self.GetParsableContent() ) )

    def ToTuple( self ):
        """Return ( name, formula, children ) with live objects."""

        return ( self._name, self._formula, self._children )

# register ParseNodeContentLink under its serialisable type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK ] = ParseNodeContentLink
# file identifier types
FILE_IDENTIFIER_TYPE_FILE = 0
FILE_IDENTIFIER_TYPE_MD5 = 1
FILE_IDENTIFIER_TYPE_SHA1 = 2
FILE_IDENTIFIER_TYPE_SHA256 = 3
FILE_IDENTIFIER_TYPE_SHA512 = 4
FILE_IDENTIFIER_TYPE_USER_INPUT = 5

# human-readable description of each file identifier type
file_identifier_string_lookup = {
    FILE_IDENTIFIER_TYPE_FILE : 'the actual file (POST only)',
    FILE_IDENTIFIER_TYPE_MD5 : 'md5 hash',
    FILE_IDENTIFIER_TYPE_SHA1 : 'sha1 hash',
    FILE_IDENTIFIER_TYPE_SHA256 : 'sha256 hash',
    FILE_IDENTIFIER_TYPE_SHA512 : 'sha512 hash',
    FILE_IDENTIFIER_TYPE_USER_INPUT : 'custom user input'
}
2016-09-21 19:54:04 +00:00
2017-12-13 22:33:07 +00:00
# eventually transition this to be a flat 'generate page/gallery urls'
# the rest of the parsing system can pick those up automatically
# this nullifies the need for contentlink stuff, at least in its current borked form
2016-10-05 20:22:40 +00:00
class ParseRootFileLookup( HydrusSerialisable.SerialisableBaseNamed ):
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP
2017-11-29 21:48:23 +00:00
SERIALISABLE_NAME = 'File Lookup Script'
2017-11-15 22:35:49 +00:00
SERIALISABLE_VERSION = 2
2016-09-21 19:54:04 +00:00
2017-11-15 22:35:49 +00:00
def __init__( self, name, url = None, query_type = None, file_identifier_type = None, file_identifier_string_converter = None, file_identifier_arg_name = None, static_args = None, children = None ):
    """Create the lookup script, storing every argument as given.

    No defaults are substituted here: an argument left as None is stored as None.
    """

    HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )

    # the request itself
    ( self._url, self._query_type ) = ( url, query_type )

    # how the file is identified in the request
    ( self._file_identifier_type, self._file_identifier_string_converter, self._file_identifier_arg_name ) = ( file_identifier_type, file_identifier_string_converter, file_identifier_arg_name )

    # fixed request arguments, and the child nodes
    ( self._static_args, self._children ) = ( static_args, children )
def _GetSerialisableInfo( self ):
serialisable_children = [ child.GetSerialisableTuple() for child in self._children ]
2017-11-15 22:35:49 +00:00
serialisable_file_identifier_string_converter = self._file_identifier_string_converter.GetSerialisableTuple()
2016-09-21 19:54:04 +00:00
2017-11-15 22:35:49 +00:00
return ( self._url, self._query_type, self._file_identifier_type, serialisable_file_identifier_string_converter, self._file_identifier_arg_name, self._static_args, serialisable_children )
2016-09-21 19:54:04 +00:00
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
2017-11-15 22:35:49 +00:00
( self._url, self._query_type, self._file_identifier_type, serialisable_file_identifier_string_converter, self._file_identifier_arg_name, self._static_args, serialisable_children ) = serialisable_info
2016-09-21 19:54:04 +00:00
self._children = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_child ) for serialisable_child in serialisable_children ]
2017-11-15 22:35:49 +00:00
self._file_identifier_string_converter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_identifier_string_converter )
2016-09-21 19:54:04 +00:00
2017-11-15 22:35:49 +00:00
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( url, query_type, file_identifier_type, file_identifier_encoding, file_identifier_arg_name, static_args, serialisable_children ) = old_serialisable_info
transformations = []
if file_identifier_encoding == HC.ENCODING_RAW:
pass
elif file_identifier_encoding == HC.ENCODING_HEX:
transformations.append( ( STRING_TRANSFORMATION_ENCODE, 'hex' ) )
elif file_identifier_encoding == HC.ENCODING_BASE64:
transformations.append( ( STRING_TRANSFORMATION_ENCODE, 'base64' ) )
file_identifier_string_converter = StringConverter( transformations, 'some hash bytes' )
serialisable_file_identifier_string_converter = file_identifier_string_converter.GetSerialisableTuple()
new_serialisable_info = ( url, query_type, file_identifier_type, serialisable_file_identifier_string_converter, file_identifier_arg_name, static_args, serialisable_children )
return ( 2, new_serialisable_info )
2018-02-07 23:40:33 +00:00
2016-11-09 23:13:22 +00:00
def ConvertMediaToFileIdentifier( self, media ):
if self._file_identifier_type == FILE_IDENTIFIER_TYPE_USER_INPUT:
raise Exception( 'Cannot convert media to file identifier--this script takes user input!' )
elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA256:
return media.GetHash()
elif self._file_identifier_type in ( FILE_IDENTIFIER_TYPE_MD5, FILE_IDENTIFIER_TYPE_SHA1, FILE_IDENTIFIER_TYPE_SHA512 ):
sha256_hash = media.GetHash()
if self._file_identifier_type == FILE_IDENTIFIER_TYPE_MD5:
hash_type = 'md5'
elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA1:
hash_type = 'sha1'
elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA512:
hash_type = 'sha512'
try:
2017-05-10 21:33:58 +00:00
( other_hash, ) = HG.client_controller.Read( 'file_hashes', ( sha256_hash, ), 'sha256', hash_type )
2016-11-09 23:13:22 +00:00
return other_hash
except:
raise Exception( 'I do not know that file\'s ' + hash_type + ' hash, so I cannot look it up!' )
elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
hash = media.GetHash()
mime = media.GetMime()
2017-06-28 20:23:21 +00:00
client_files_manager = HG.client_controller.client_files_manager
2016-11-09 23:13:22 +00:00
try:
path = client_files_manager.GetFilePath( hash, mime )
return path
except HydrusExceptions.FileMissingException as e:
raise Exception( 'That file is not in the database\'s local files, so I cannot look it up!' )
2016-11-16 20:21:43 +00:00
def FetchData( self, job_key, file_identifier ):
2017-09-06 20:18:20 +00:00
# add gauge report hook and in-stream cancel support to the get/post calls
request_args = dict( self._static_args )
if self._file_identifier_type != FILE_IDENTIFIER_TYPE_FILE:
2016-10-05 20:22:40 +00:00
2017-11-15 22:35:49 +00:00
request_args[ self._file_identifier_arg_name ] = self._file_identifier_string_converter.Convert( file_identifier )
2016-10-05 20:22:40 +00:00
2017-09-06 20:18:20 +00:00
if self._query_type == HC.GET:
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
if self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
raise Exception( 'Cannot have a file as an argument on a GET query!' )
2016-11-02 21:09:14 +00:00
2018-08-22 21:10:59 +00:00
full_request_url = self._url + '?' + ClientNetworkingDomain.ConvertQueryDictToText( request_args )
2017-09-06 20:18:20 +00:00
job_key.SetVariable( 'script_status', 'fetching ' + full_request_url )
job_key.AddURL( full_request_url )
2018-04-18 22:10:15 +00:00
network_job = ClientNetworkingJobs.NetworkJob( 'GET', full_request_url )
2017-09-06 20:18:20 +00:00
elif self._query_type == HC.POST:
2018-08-15 20:40:30 +00:00
additional_headers = {}
files = None
2017-09-06 20:18:20 +00:00
if self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
job_key.SetVariable( 'script_status', 'uploading file' )
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
path = file_identifier
2016-11-02 21:09:14 +00:00
2018-08-15 20:40:30 +00:00
if self._file_identifier_string_converter.MakesChanges():
f_altered = cStringIO.StringIO()
with open( path, 'rb' ) as f:
file_content = f.read()
f_altered = self._file_identifier_string_converter.Convert( file_content )
request_args[ self._file_identifier_arg_name ] = f_altered
additional_headers[ 'content-type' ] = 'application/x-www-form-urlencoded'
else:
files = { self._file_identifier_arg_name : open( path, 'rb' ) }
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
else:
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
job_key.SetVariable( 'script_status', 'uploading identifier' )
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
files = None
2016-11-02 21:09:14 +00:00
2018-04-18 22:10:15 +00:00
network_job = ClientNetworkingJobs.NetworkJob( 'POST', self._url, body = request_args )
2017-10-25 21:45:15 +00:00
2018-08-15 20:40:30 +00:00
if files is not None:
network_job.SetFiles( files )
for ( key, value ) in additional_headers.items():
network_job.AddAdditionalHeader( key, value )
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
# send nj to nj control on this panel here
network_job.OverrideBandwidth()
HG.client_controller.network_engine.AddJob( network_job )
try:
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
network_job.WaitUntilDone()
2016-12-07 22:12:52 +00:00
except HydrusExceptions.NotFoundException:
job_key.SetVariable( 'script_status', '404 - nothing found' )
raise
except HydrusExceptions.NetworkException as e:
job_key.SetVariable( 'script_status', 'Network error!' )
HydrusData.ShowException( e )
raise
2016-11-16 20:21:43 +00:00
2016-10-05 20:22:40 +00:00
2017-09-06 20:18:20 +00:00
if job_key.IsCancelled():
raise HydrusExceptions.CancelledException()
data = network_job.GetContent()
return data
2016-10-05 20:22:40 +00:00
2016-09-21 19:54:04 +00:00
def GetParsableContent( self ):
children_parsable_content = set()
for child in self._children:
children_parsable_content.update( child.GetParsableContent() )
return children_parsable_content
2017-12-13 22:33:07 +00:00
def DoQuery( self, job_key, file_identifier ):
2016-09-21 19:54:04 +00:00
2016-11-16 20:21:43 +00:00
try:
try:
data = self.FetchData( job_key, file_identifier )
except HydrusExceptions.NetworkException as e:
return []
2018-02-07 23:40:33 +00:00
parse_results = self.Parse( job_key, data )
2016-11-16 20:21:43 +00:00
2018-02-07 23:40:33 +00:00
return parse_results
2016-11-16 20:21:43 +00:00
except HydrusExceptions.CancelledException:
job_key.SetVariable( 'script_status', 'Cancelled!' )
return []
finally:
job_key.Finish()
2016-10-19 20:02:56 +00:00
2016-11-09 23:13:22 +00:00
def UsesUserInput( self ):
2016-10-19 20:02:56 +00:00
2016-11-09 23:13:22 +00:00
return self._file_identifier_type == FILE_IDENTIFIER_TYPE_USER_INPUT
2016-10-19 20:02:56 +00:00
2017-12-13 22:33:07 +00:00
def Parse( self, job_key, data ):
2016-09-21 19:54:04 +00:00
2018-02-07 23:40:33 +00:00
parse_results = GetChildrenContent( job_key, self._children, data, self._url )
2016-09-21 19:54:04 +00:00
2018-02-07 23:40:33 +00:00
if len( parse_results ) == 0:
2016-12-07 22:12:52 +00:00
job_key.SetVariable( 'script_status', 'Did not find anything.' )
else:
2018-07-04 20:48:28 +00:00
job_key.SetVariable( 'script_status', 'Found ' + HydrusData.ToHumanInt( len( parse_results ) ) + ' rows.' )
2016-12-07 22:12:52 +00:00
2018-02-07 23:40:33 +00:00
return parse_results
2016-09-21 19:54:04 +00:00
def SetChildren( self, children ):
self._children = children
2016-10-05 20:22:40 +00:00
def ToPrettyStrings( self ):
2016-09-21 19:54:04 +00:00
2016-10-19 20:02:56 +00:00
return ( self._name, HC.query_type_string_lookup[ self._query_type ], 'File Lookup', ConvertParsableContentToPrettyString( self.GetParsableContent() ) )
2016-09-21 19:54:04 +00:00
def ToTuple( self ):
2017-11-15 22:35:49 +00:00
return ( self._name, self._url, self._query_type, self._file_identifier_type, self._file_identifier_string_converter, self._file_identifier_arg_name, self._static_args, self._children )
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
# map the file lookup serialisable type constant to its class in HydrusSerialisable's type-to-object table
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP ] = ParseRootFileLookup
2017-09-13 20:50:41 +00:00
2017-11-15 22:35:49 +00:00
# identifiers for the individual steps a StringConverter can apply
STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING = 0
STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END = 1
STRING_TRANSFORMATION_PREPEND_TEXT = 2
STRING_TRANSFORMATION_APPEND_TEXT = 3
STRING_TRANSFORMATION_ENCODE = 4
STRING_TRANSFORMATION_DECODE = 5
STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING = 6
STRING_TRANSFORMATION_CLIP_TEXT_FROM_END = 7
STRING_TRANSFORMATION_REVERSE = 8
STRING_TRANSFORMATION_REGEX_SUB = 9
STRING_TRANSFORMATION_DATE_DECODE = 10
STRING_TRANSFORMATION_INTEGER_ADDITION = 11

# short human-readable label for each transformation type
transformation_type_str_lookup = {
    STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING : 'remove text from beginning of string',
    STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END : 'remove text from end of string',
    STRING_TRANSFORMATION_PREPEND_TEXT : 'prepend text',
    STRING_TRANSFORMATION_APPEND_TEXT : 'append text',
    STRING_TRANSFORMATION_ENCODE : 'encode',
    STRING_TRANSFORMATION_DECODE : 'decode',
    STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING : 'take the start of the string',
    STRING_TRANSFORMATION_CLIP_TEXT_FROM_END : 'take the end of the string',
    STRING_TRANSFORMATION_REVERSE : 'reverse text',
    STRING_TRANSFORMATION_REGEX_SUB : 'regex substitution',
    STRING_TRANSFORMATION_DATE_DECODE : 'date decode',
    STRING_TRANSFORMATION_INTEGER_ADDITION : 'integer addition'
}
2017-11-15 22:35:49 +00:00
class StringConverter( HydrusSerialisable.SerialisableBase ):
    """A serialisable, ordered list of ( transformation_type, data ) steps that Convert applies to a string one after another."""
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_CONVERTER
    SERIALISABLE_NAME = 'String Converter'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, transformations = None, example_string = None ):
        """Store the steps and the example string; None becomes an empty list and 'example string' respectively."""
        
        if transformations is None:
            
            transformations = []
            
        
        if example_string is None:
            
            example_string = 'example string'
            
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.transformations = transformations
        
        self.example_string = example_string
        
    
    def _GetSerialisableInfo( self ):
        """Return ( transformations, example_string )."""
        
        return ( self.transformations, self.example_string )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the steps and the example string, turning any list data back into a tuple."""
        
        ( serialisable_transformations, self.example_string ) = serialisable_info
        
        self.transformations = []
        
        try: # I initialised this bad one time and broke a dialog on subsequent loads, fugg
            
            for ( transformation_type, data ) in serialisable_transformations:
                
                if isinstance( data, list ):
                    
                    data = tuple( data ) # convert from list to tuple thing
                    
                
                self.transformations.append( ( transformation_type, data ) )
                
            
        except Exception:
            
            # deliberately best-effort: malformed saved data leaves whatever steps loaded before the failure
            pass
            
        
    
    def Convert( self, s, max_steps_allowed = None ):
        """
        Apply the transformations to s in order and return the result.
        
        If max_steps_allowed is given, conversion stops after that many steps; note that a value below 1 still
        applies the first step. Any failure inside a step is re-raised as HydrusExceptions.StringConvertException
        naming the step and the string it was applied to.
        """
        
        for ( i, transformation ) in enumerate( self.transformations ):
            
            try:
                
                ( transformation_type, data ) = transformation
                
                if transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING:
                    
                    num_chars = data
                    
                    s = s[ num_chars : ]
                    
                elif transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END:
                    
                    num_chars = data
                    
                    # s[ : -0 ] is s[ : 0 ], which would wipe the whole string, so removing zero characters is a no-op
                    if num_chars != 0:
                        
                        s = s[ : - num_chars ]
                        
                    
                elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING:
                    
                    num_chars = data
                    
                    s = s[ : num_chars ]
                    
                elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_END:
                    
                    num_chars = data
                    
                    # s[ -0 : ] is the whole string, so taking the last zero characters needs an explicit empty slice
                    s = s[ - num_chars : ] if num_chars != 0 else s[ : 0 ]
                    
                elif transformation_type == STRING_TRANSFORMATION_PREPEND_TEXT:
                    
                    text = data
                    
                    s = text + s
                    
                elif transformation_type == STRING_TRANSFORMATION_APPEND_TEXT:
                    
                    text = data
                    
                    s = s + text
                    
                elif transformation_type == STRING_TRANSFORMATION_ENCODE:
                    
                    encode_type = data
                    
                    s = s.encode( encode_type )
                    
                elif transformation_type == STRING_TRANSFORMATION_DECODE:
                    
                    encode_type = data
                    
                    s = s.decode( encode_type )
                    
                elif transformation_type == STRING_TRANSFORMATION_REVERSE:
                    
                    s = s[::-1]
                    
                elif transformation_type == STRING_TRANSFORMATION_REGEX_SUB:
                    
                    ( pattern, repl ) = data
                    
                    s = re.sub( pattern, repl, s, flags = re.UNICODE )
                    
                elif transformation_type == STRING_TRANSFORMATION_DATE_DECODE:
                    
                    ( phrase, timezone, timezone_offset ) = data
                    
                    struct_time = time.strptime( s, phrase )
                    
                    if timezone == HC.TIMEZONE_GMT:
                        
                        # the given struct is in GMT, so calendar.timegm is appropriate here
                        
                        timestamp = int( calendar.timegm( struct_time ) )
                        
                    elif timezone == HC.TIMEZONE_LOCAL:
                        
                        # the given struct is in local time, so time.mktime is correct
                        
                        timestamp = int( time.mktime( struct_time ) )
                        
                    elif timezone == HC.TIMEZONE_OFFSET:
                        
                        # the given struct is in server time, which is the same as GMT minus an offset
                        # if we are 7200 seconds ahead, the correct GMT timestamp needs to be 7200 smaller
                        
                        timestamp = int( calendar.timegm( struct_time ) ) - timezone_offset
                        
                    
                    s = str( timestamp )
                    
                elif transformation_type == STRING_TRANSFORMATION_INTEGER_ADDITION:
                    
                    delta = data
                    
                    s = str( int( s ) + int( delta ) )
                    
                
            except Exception as e:
                
                raise HydrusExceptions.StringConvertException( 'ERROR: Could not apply "' + self.TransformationToUnicode( transformation ) + '" to string "' + repr( s ) + '":' + HydrusData.ToUnicode( e ) )
                
            
            if max_steps_allowed is not None and i + 1 >= max_steps_allowed:
                
                return s
                
            
        
        return s
        
    
    def GetTransformationStrings( self ):
        """Return the human-readable description of every step, in order."""
        
        return [ self.TransformationToUnicode( transformation ) for transformation in self.transformations ]
        
    
    def MakesChanges( self ):
        """Return True if there is at least one transformation step."""
        
        return len( self.transformations ) > 0
        
    
    @staticmethod
    def TransformationToUnicode( transformation ):
        """Return a human-readable description of a single ( transformation_type, data ) step."""
        
        ( transformation_type, data ) = transformation
        
        if transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING:
            
            return 'remove the first ' + HydrusData.ToHumanInt( data ) + ' characters'
            
        elif transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END:
            
            return 'remove the last ' + HydrusData.ToHumanInt( data ) + ' characters'
            
        elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING:
            
            return 'take the first ' + HydrusData.ToHumanInt( data ) + ' characters'
            
        elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_END:
            
            return 'take the last ' + HydrusData.ToHumanInt( data ) + ' characters'
            
        elif transformation_type == STRING_TRANSFORMATION_PREPEND_TEXT:
            
            return 'prepend with "' + data + '"'
            
        elif transformation_type == STRING_TRANSFORMATION_APPEND_TEXT:
            
            return 'append with "' + data + '"'
            
        elif transformation_type == STRING_TRANSFORMATION_ENCODE:
            
            return 'encode to ' + data
            
        elif transformation_type == STRING_TRANSFORMATION_DECODE:
            
            return 'decode from ' + data
            
        elif transformation_type == STRING_TRANSFORMATION_REVERSE:
            
            return transformation_type_str_lookup[ STRING_TRANSFORMATION_REVERSE ]
            
        elif transformation_type == STRING_TRANSFORMATION_REGEX_SUB:
            
            return 'regex substitution: ' + HydrusData.ToUnicode( data )
            
        elif transformation_type == STRING_TRANSFORMATION_DATE_DECODE:
            
            return 'date decode: ' + repr( data )
            
        elif transformation_type == STRING_TRANSFORMATION_INTEGER_ADDITION:
            
            return 'integer addition: add ' + HydrusData.ToUnicode( data )
            
        else:
            
            return 'unknown transformation'
            
        
    
2017-11-15 22:35:49 +00:00
# map the string converter serialisable type constant to its class in HydrusSerialisable's type-to-object table
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_CONVERTER ] = StringConverter
2017-09-13 20:50:41 +00:00
# how a StringMatch decides whether some text is acceptable
( STRING_MATCH_FIXED, STRING_MATCH_FLEXIBLE, STRING_MATCH_REGEX, STRING_MATCH_ANY ) = ( 0, 1, 2, 3 )

# character classes a flexible match can ask for
( ALPHA, ALPHANUMERIC, NUMERIC ) = ( 0, 1, 2 )
2017-09-27 21:52:54 +00:00
class StringMatch( HydrusSerialisable.SerialisableBase ):
    """A serialisable rule for testing text: optional length limits plus a fixed, flexible, regex, or 'any' match."""
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_MATCH
    SERIALISABLE_NAME = 'String Match'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, match_type = STRING_MATCH_ANY, match_value = '', min_chars = None, max_chars = None, example_string = 'example string' ):
        """
        Store the rule.
        
        match_value is compared exactly for a fixed match, against ALPHA/ALPHANUMERIC/NUMERIC for a flexible match,
        and used as the pattern for a regex match. A min_chars or max_chars of None means no limit on that side.
        """
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        # make a gui control that accepts one of these. displays expected input on the right and colours red/green (and does isvalid) based on current input
        # think about replacing the veto stuff above with this.
        
        self._match_type = match_type
        self._match_value = match_value
        
        self._min_chars = min_chars
        self._max_chars = max_chars
        
        self._example_string = example_string
        
    
    def _GetSerialisableInfo( self ):
        """Return ( match_type, match_value, min_chars, max_chars, example_string )."""
        
        return ( self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore the rule from the tuple produced by _GetSerialisableInfo."""
        
        ( self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string ) = serialisable_info
        
    
    def SetMaxChars( self, max_chars ):
        """Set the maximum permitted length, or None for no maximum."""
        
        self._max_chars = max_chars
        
    
    def SetMinChars( self, min_chars ):
        """Set the minimum permitted length, or None for no minimum."""
        
        self._min_chars = min_chars
        
    
    def Matches( self, text ):
        """Return True if Test accepts the text, False if it raises HydrusExceptions.StringMatchException."""
        
        try:
            
            self.Test( text )
            
            return True
            
        except HydrusExceptions.StringMatchException:
            
            return False
            
        
    
    def Test( self, text ):
        """
        Check text against this rule, returning None on success.
        
        Raises HydrusExceptions.StringMatchException describing the first failed check: too short, too long,
        not equal to the fixed value, not matching the pattern, or a pattern that could not be run.
        """
        
        text_len = len( text )
        
        presentation_text = '"' + text + '"'
        
        if self._min_chars is not None and text_len < self._min_chars:
            
            raise HydrusExceptions.StringMatchException( presentation_text + ' had fewer than ' + HydrusData.ToHumanInt( self._min_chars ) + ' characters' )
            
        
        if self._max_chars is not None and text_len > self._max_chars:
            
            raise HydrusExceptions.StringMatchException( presentation_text + ' had more than ' + HydrusData.ToHumanInt( self._max_chars ) + ' characters' )
            
        
        if self._match_type == STRING_MATCH_FIXED:
            
            if text != self._match_value:
                
                raise HydrusExceptions.StringMatchException( presentation_text + ' did not exactly match "' + self._match_value + '"' )
                
            
        elif self._match_type in ( STRING_MATCH_FLEXIBLE, STRING_MATCH_REGEX ):
            
            if self._match_type == STRING_MATCH_FLEXIBLE:
                
                # these are anchored with \Z rather than $, since $ also matches just before a trailing newline
                # and would let text like '123\n' through as purely numeric
                
                if self._match_value == ALPHA:
                    
                    r = r'^[a-zA-Z]+\Z'
                    fail_reason = ' had non-alpha characters'
                    
                elif self._match_value == ALPHANUMERIC:
                    
                    r = r'^[a-zA-Z\d]+\Z'
                    fail_reason = ' had non-alphanumeric characters'
                    
                elif self._match_value == NUMERIC:
                    
                    r = r'^\d+\Z'
                    fail_reason = ' had non-numeric characters'
                    
                
            elif self._match_type == STRING_MATCH_REGEX:
                
                r = self._match_value
                
                fail_reason = ' did not match "' + r + '"'
                
            
            try:
                
                result = re.search( r, text, flags = re.UNICODE )
                
            except Exception as e:
                
                raise HydrusExceptions.StringMatchException( 'That regex did not work! ' + HydrusData.ToUnicode( e ) )
                
            
            if result is None:
                
                raise HydrusExceptions.StringMatchException( presentation_text + fail_reason )
                
            
        elif self._match_type == STRING_MATCH_ANY:
            
            pass
            
        
    
    def ToTuple( self ):
        """Return ( match_type, match_value, min_chars, max_chars, example_string )."""
        
        return ( self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string )
        
    
    def ToUnicode( self ):
        """
        Return a human-readable description of this rule.
        
        A fixed match is described by its value alone. Flexible and regex matches get a length phrase, the kind
        of characters expected, and the example string. An 'any' match gets the length phrase and 'characters'.
        """
        
        result = ''
        
        if self._min_chars is None:
            
            if self._max_chars is None:
                
                result += 'any number of '
                
            else:
                
                result += 'at most ' + HydrusData.ToUnicode( self._max_chars ) + ' '
                
            
        else:
            
            if self._max_chars is None:
                
                result += 'at least ' + HydrusData.ToUnicode( self._min_chars ) + ' '
                
            else:
                
                result += 'between ' + HydrusData.ToUnicode( self._min_chars ) + ' and ' + HydrusData.ToUnicode( self._max_chars ) + ' '
                
            
        
        show_example = True
        
        if self._match_type == STRING_MATCH_ANY:
            
            result += 'characters'
            
            show_example = False
            
        elif self._match_type == STRING_MATCH_FIXED:
            
            # the fixed value replaces the length phrase entirely
            result = self._match_value
            
            show_example = False
            
        elif self._match_type == STRING_MATCH_FLEXIBLE:
            
            if self._match_value == ALPHA:
                
                result += 'alphabetical characters'
                
            elif self._match_value == ALPHANUMERIC:
                
                result += 'alphanumeric characters'
                
            elif self._match_value == NUMERIC:
                
                result += 'numeric characters'
                
            
        elif self._match_type == STRING_MATCH_REGEX:
            
            result += 'characters, matching regex "' + self._match_value + '"'
            
        
        if show_example:
            
            result += ', such as "' + self._example_string + '"'
            
        
        return result
        
    
# map the string match serialisable type constant to its class in HydrusSerialisable's type-to-object table
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_MATCH ] = StringMatch