2016-07-20 19:57:10 +00:00
import bs4
2018-02-07 23:40:33 +00:00
import calendar
2018-04-18 22:10:15 +00:00
import ClientNetworkingDomain
import ClientNetworkingJobs
2017-12-13 22:33:07 +00:00
import collections
2018-08-15 20:40:30 +00:00
import cStringIO
2016-09-21 19:54:04 +00:00
import HydrusConstants as HC
2016-09-07 20:01:05 +00:00
import HydrusData
2016-11-09 23:13:22 +00:00
import HydrusExceptions
2017-05-10 21:33:58 +00:00
import HydrusGlobals as HG
2016-07-20 19:57:10 +00:00
import HydrusSerialisable
2016-10-26 20:45:34 +00:00
import HydrusTags
2018-01-31 22:58:15 +00:00
import json
2016-10-19 20:02:56 +00:00
import os
2017-09-13 20:50:41 +00:00
import re
2018-04-25 22:07:52 +00:00
import threading
2016-11-16 20:21:43 +00:00
import time
2016-11-02 21:09:14 +00:00
import urlparse
2016-07-20 19:57:10 +00:00
2018-04-11 22:30:40 +00:00
# html5lib and lxml are optional HTML parser backends. Record whether each one
# could be imported so that GetSoup can choose a parser, or report that none is
# available.
try:
    import html5lib
except ImportError:
    HTML5LIB_IS_OK = False
else:
    HTML5LIB_IS_OK = True

try:
    import lxml
except ImportError:
    LXML_IS_OK = False
else:
    LXML_IS_OK = True
2018-02-07 23:40:33 +00:00
def ConvertParseResultToPrettyString( result ):
    """Render a single parse result as a human-readable string.
    
    result is a ( ( name, content_type, additional_info ), parsed_text ) pair.
    The meaning of additional_info depends on content_type:
    
    - urls: a ( url_type, priority ) pair
    - mappings: the value passed to HydrusTags.CombineTag alongside the parsed text
    - hash: a string prefix describing the hash
    - timestamp: the timestamp type
    - title: the priority
    
    Raises NotImplementedError when the content type (or the url/timestamp
    sub-type) is not one this function knows how to describe.
    """
    
    ( ( name, content_type, additional_info ), parsed_text ) = result
    
    if content_type == HC.CONTENT_TYPE_URLS:
        
        ( url_type, priority ) = additional_info
        
        if url_type == HC.URL_TYPE_DESIRED:
            return ' downloadable/pursuable url (priority ' + str( priority ) + ' ): ' + parsed_text
        elif url_type == HC.URL_TYPE_SOURCE:
            return ' associable/source url (priority ' + str( priority ) + ' ): ' + parsed_text
        elif url_type == HC.URL_TYPE_NEXT:
            return ' next page url (priority ' + str( priority ) + ' ): ' + parsed_text
        
    elif content_type == HC.CONTENT_TYPE_MAPPINGS:
        
        # Best effort: any failure to build/clean the tag is shown to the user
        # rather than raised. 'except Exception' (not a bare except) so that
        # KeyboardInterrupt and SystemExit still propagate.
        try:
            tag = HydrusTags.CleanTag( HydrusTags.CombineTag( additional_info, parsed_text ) )
        except Exception:
            tag = ' unparsable tag, will likely be discarded '
        
        return ' tag: ' + tag
        
    elif content_type == HC.CONTENT_TYPE_HASH:
        
        return additional_info + ' hash: ' + parsed_text.encode( ' hex ' )
        
    elif content_type == HC.CONTENT_TYPE_TIMESTAMP:
        
        timestamp_type = additional_info
        
        # Same best-effort policy as for tags above.
        try:
            timestamp = int( parsed_text )
            timestamp_string = HydrusData.ConvertTimestampToPrettyTime( timestamp )
        except Exception:
            timestamp_string = ' could not convert to integer '
        
        if timestamp_type == HC.TIMESTAMP_TYPE_SOURCE:
            return ' source time: ' + timestamp_string
        
    elif content_type == HC.CONTENT_TYPE_TITLE:
        
        priority = additional_info
        
        return ' watcher page title (priority ' + str( priority ) + ' ): ' + parsed_text
        
    elif content_type == HC.CONTENT_TYPE_VETO:
        
        return ' veto: ' + name
        
    
    # Nothing above returned, so this is an unknown content type or sub-type.
    raise NotImplementedError()
2016-11-02 21:09:14 +00:00
def ConvertParsableContentToPrettyString( parsable_content, include_veto = False ):
    """Summarise what a parser can produce as one comma-separated string.
    
    parsable_content is an iterable of ( name, content_type, additional_infos )
    triples. They are grouped by ( content_type, name ) via
    HydrusData.BuildKeyToSetDict and described in sorted key order. Vetoes are
    only listed when include_veto is true. An empty summary is reported with a
    fixed placeholder string.
    """
    
    grouped = HydrusData.BuildKeyToSetDict( ( ( ( content_type, name ), additional_infos ) for ( name, content_type, additional_infos ) in parsable_content ) )
    
    descriptions = []
    
    for ( ( content_type, name ), additional_infos ) in sorted( grouped.items() ):
        
        if content_type == HC.CONTENT_TYPE_URLS:
            
            # one entry per ( url_type, priority ) pair; priority is not shown
            for ( url_type, priority ) in additional_infos:
                
                if url_type == HC.URL_TYPE_DESIRED:
                    descriptions.append( ' downloadable/pursuable url ' )
                elif url_type == HC.URL_TYPE_SOURCE:
                    descriptions.append( ' associable/source url ' )
                elif url_type == HC.URL_TYPE_NEXT:
                    descriptions.append( ' gallery next page url ' )
                
            
        elif content_type == HC.CONTENT_TYPE_MAPPINGS:
            
            # the blank namespace is listed last, under a readable label
            namespaces = [ namespace for namespace in additional_infos if namespace != ' ' ]
            
            if ' ' in additional_infos:
                namespaces.append( ' unnamespaced ' )
            
            descriptions.append( ' tags: ' + ' , '.join( namespaces ) )
            
        elif content_type == HC.CONTENT_TYPE_HASH:
            
            if len( additional_infos ) == 1:
                
                ( hash_type, ) = additional_infos
                
                descriptions.append( ' hash: ' + hash_type )
                
            else:
                
                descriptions.append( ' hashes: ' + ' , '.join( sorted( additional_infos ) ) )
                
            
        elif content_type == HC.CONTENT_TYPE_TIMESTAMP:
            
            for timestamp_type in additional_infos:
                
                if timestamp_type == HC.TIMESTAMP_TYPE_SOURCE:
                    descriptions.append( ' source time ' )
                
            
        elif content_type == HC.CONTENT_TYPE_TITLE:
            
            descriptions.append( ' watcher page title ' )
            
        elif content_type == HC.CONTENT_TYPE_VETO:
            
            if include_veto:
                descriptions.append( ' veto: ' + name )
            
        
    
    if not descriptions:
        return ' nothing '
    
    return ' , '.join( descriptions )
2016-10-05 20:22:40 +00:00
2016-11-02 21:09:14 +00:00
2017-12-13 22:33:07 +00:00
def GetChildrenContent( job_key, children, data, referral_url ):
    """Parse data with every child node/parser and return the combined content.
    
    ParseNodeContentLink children are called with ( job_key, data, referral_url );
    ContentParser children are called with an empty parsing context and data.
    Children of any other type are skipped.
    
    If any child raises HydrusExceptions.VetoException, the whole result is
    discarded and an empty list is returned.
    """
    
    content = []
    
    for child in children:
        
        try:
            
            if isinstance( child, ParseNodeContentLink ):
                
                child_content = child.Parse( job_key, data, referral_url )
                
            elif isinstance( child, ContentParser ):
                
                child_content = child.Parse( {}, data )
                
            else:
                
                # Unknown child type: without this, child_content would be
                # unbound on the first iteration, or would still hold the
                # previous child's results and be added a second time.
                continue
                
            
        except HydrusExceptions.VetoException:
            
            return []
            
        
        content.extend( child_content )
        
    
    return content
2018-02-07 23:40:33 +00:00
def GetHashesFromParseResults( results ):
    """Return ( additional_info, parsed_text ) for every hash-type parse result.
    
    results is an iterable of ( ( name, content_type, additional_info ), parsed_text )
    pairs; order is preserved.
    """
    
    return [ ( additional_info, parsed_text ) for ( ( name, content_type, additional_info ), parsed_text ) in results if content_type == HC.CONTENT_TYPE_HASH ]
2018-05-30 20:13:21 +00:00
def GetHTMLTagString( tag ):
    """Return the first non-empty string from tag.strings.
    
    Falls back to a fixed placeholder string when there is no non-empty string.
    """
    
    for s in tag.strings:
        
        if len( s ) > 0:
            return s
        
    
    return ' '
2018-04-18 22:10:15 +00:00
def GetNamespacesFromParsableContent( parsable_content ):
    """Return the namespaces that the given parsable content can produce.
    
    parsable_content is an iterable of ( name, content_type, additional_infos )
    triples. The additional_infos are grouped by content type and the group
    for the mappings content type is returned.
    """
    
    type_and_info_pairs = ( ( content_type, additional_infos ) for ( name, content_type, additional_infos ) in parsable_content )
    
    infos_by_content_type = HydrusData.BuildKeyToSetDict( type_and_info_pairs )
    
    # for mappings, each additional_infos value is a namespace
    return infos_by_content_type[ HC.CONTENT_TYPE_MAPPINGS ]
2018-04-11 22:30:40 +00:00
def GetSoup( html ):
    """Parse html into a BeautifulSoup tree using the best available parser.
    
    html5lib is preferred, with lxml as the fallback. Raises
    HydrusExceptions.ParseException if neither library could be imported.
    """
    
    if not ( HTML5LIB_IS_OK or LXML_IS_OK ):
        
        message = ' This client does not have access to either lxml or html5lib, and so it cannot parse html. Please install one of these parsing libraries and restart the client. '
        
        raise HydrusExceptions.ParseException( message )
        
    
    parser = ' html5lib ' if HTML5LIB_IS_OK else ' lxml '
    
    return bs4.BeautifulSoup( html, parser )
2018-02-07 23:40:33 +00:00
def GetTagsFromParseResults( results ):
    """Return the cleaned tags found among the mapping-type parse results.
    
    Each mapping result's additional_info and parsed text are joined with
    HydrusTags.CombineTag, and the whole collection is then passed through
    HydrusTags.CleanTags.
    """
    
    combined_tags = [ HydrusTags.CombineTag( additional_info, parsed_text ) for ( ( name, content_type, additional_info ), parsed_text ) in results if content_type == HC.CONTENT_TYPE_MAPPINGS ]
    
    return HydrusTags.CleanTags( combined_tags )
2018-02-07 23:40:33 +00:00
def GetTimestampFromParseResults( results, desired_timestamp_type ):
    """Return the earliest parsed timestamp of the desired type, or None.
    
    Only timestamp-type results whose timestamp type equals
    desired_timestamp_type are considered. Text that cannot be converted with
    int() is ignored. Source timestamps are capped at 30 seconds before
    HydrusData.GetNow().
    """
    
    timestamp_results = []
    
    for ( ( name, content_type, additional_info ), parsed_text ) in results:
        
        if content_type != HC.CONTENT_TYPE_TIMESTAMP:
            continue
        
        timestamp_type = additional_info
        
        if timestamp_type != desired_timestamp_type:
            continue
        
        # Unconvertible text is skipped. 'except Exception' rather than a bare
        # except, so KeyboardInterrupt and SystemExit are not swallowed.
        try:
            timestamp = int( parsed_text )
        except Exception:
            continue
        
        if timestamp_type == HC.TIMESTAMP_TYPE_SOURCE:
            
            # cap at slightly before the current time
            timestamp = min( HydrusData.GetNow() - 30, timestamp )
            
        
        timestamp_results.append( timestamp )
        
    
    if len( timestamp_results ) == 0:
        return None
    
    return min( timestamp_results )
def GetTitleFromAllParseResults( all_parse_results ):
    """Return the highest-priority title across several sets of parse results.
    
    all_parse_results is an iterable of result lists. Title results carry
    their priority as additional_info. Candidates are ordered as
    ( priority, text ) pairs, largest first, and the text of the first one is
    returned. Returns None when there is no title at all.
    """
    
    candidates = []
    
    for results in all_parse_results:
        
        candidates.extend( ( additional_info, parsed_text ) for ( ( name, content_type, additional_info ), parsed_text ) in results if content_type == HC.CONTENT_TYPE_TITLE )
        
    
    if len( candidates ) == 0:
        return None
    
    # highest priority first
    ( priority, title ) = sorted( candidates, reverse = True )[ 0 ]
    
    return title
2018-05-02 20:45:20 +00:00
def GetURLsFromParseResults( results, desired_url_types, only_get_top_priority = False ):
    """Collect the parsed urls whose url type is in desired_url_types.
    
    Url results carry ( url_type, priority ) as additional_info. With
    only_get_top_priority set, only the urls of the single highest priority
    are returned; otherwise the urls of every priority are returned. In both
    cases duplicates are dropped, keeping the first occurrence.
    """
    
    urls_by_priority = collections.defaultdict( list )
    
    for ( ( name, content_type, additional_info ), parsed_text ) in results:
        
        if content_type == HC.CONTENT_TYPE_URLS:
            
            ( url_type, priority ) = additional_info
            
            if url_type in desired_url_types:
                
                urls_by_priority[ priority ].append( parsed_text )
                
            
        
    
    if only_get_top_priority:
        
        # ( priority, url_list ) pairs, ordered by descending priority
        prioritised = sorted( urls_by_priority.items(), reverse = True )
        
        if len( prioritised ) > 0:
            ( priority, candidate_urls ) = prioritised[ 0 ]
        else:
            candidate_urls = []
        
    else:
        
        candidate_urls = []
        
        for urls in urls_by_priority.values():
            candidate_urls.extend( urls )
        
    
    # order-preserving dedupe
    seen = set()
    url_list = []
    
    for url in candidate_urls:
        
        if url not in seen:
            
            seen.add( url )
            url_list.append( url )
            
        
    
    return url_list
2017-12-13 22:33:07 +00:00
2018-02-21 21:59:37 +00:00
def MakeParsedTextPretty( parsed_text ):
    """Return parsed_text as unicode, or its repr() if it cannot be decoded."""
    
    try:
        return unicode( parsed_text )
    except UnicodeDecodeError:
        # undecodable bytes: show the escaped representation instead
        return repr( parsed_text )
2018-01-31 22:58:15 +00:00
def RenderJSONParseRule( parse_rule ):
    """Describe a JSON parse rule in words.
    
    None means every item, an int is treated as a zero-based list index
    (rendered as a one-based ordinal), and anything else is rendered as a
    named entry.
    """
    
    if parse_rule is None:
        return ' get all items '
    
    if isinstance( parse_rule, int ):
        
        # the index is zero-based, people count from one
        ordinal = HydrusData.ConvertIntToPrettyOrdinalString( parse_rule + 1 )
        
        return ' get the ' + ordinal + ' item '
        
    
    return ' get the " ' + HydrusData.ToUnicode( parse_rule ) + ' " entry '
class ParseFormula( HydrusSerialisable.SerialisableBase ):
    """Base class for parsing formulae.
    
    A subclass supplies _ParseRawContents, which extracts raw texts from the
    data. Parse then tests each raw text against the formula's StringMatch and
    transforms the accepted ones with its StringConverter.
    """
    
    def __init__( self, string_match = None, string_converter = None ):
        
        # build the defaults here, not in the signature, so each formula gets its own objects
        self._string_match = StringMatch() if string_match is None else string_match
        self._string_converter = StringConverter( example_string = ' parsed information ' ) if string_converter is None else string_converter
        
    
    def _GetParsePrettySeparator( self ):
        """Return the string placed between results in ParsePretty output."""
        
        return os.linesep
        
    
    def _ParseRawContents( self, parsing_context, data ):
        """Return the raw texts found in data. Subclasses must override."""
        
        raise NotImplementedError()
        
    
    def Parse( self, parsing_context, data ):
        """Return the converted texts for every raw text that passes the string match.
        
        A raw text whose test or conversion raises
        HydrusExceptions.ParseException is silently dropped.
        """
        
        texts = []
        
        for raw_text in self._ParseRawContents( parsing_context, data ):
            
            try:
                
                self._string_match.Test( raw_text )
                
                converted_text = self._string_converter.Convert( raw_text )
                
            except HydrusExceptions.ParseException:
                
                continue
                
            
            texts.append( converted_text )
            
        
        return texts
        
    
    def ParsePretty( self, parsing_context, data ):
        """Return the parse results as one display string with begin/end marker lines."""
        
        pretty_texts = [ MakeParsedTextPretty( text ) for text in self.Parse( parsing_context, data ) ]
        
        header = ' *** ' + HydrusData.ToHumanInt( len( pretty_texts ) ) + ' RESULTS BEGIN *** '
        footer = ' *** RESULTS END *** '
        
        lines = [ header ] + pretty_texts + [ footer ]
        
        return self._GetParsePrettySeparator().join( lines )
        
    
    def ParsesSeparatedContent( self ):
        """Return False here; subclasses may override."""
        
        return False
        
    
    def ToPrettyString( self ):
        """Return a one-line description of the formula. Subclasses must override."""
        
        raise NotImplementedError()
        
    
    def ToPrettyMultilineString( self ):
        """Return a multi-line description of the formula. Subclasses must override."""
        
        raise NotImplementedError()
        
    
2016-09-07 20:01:05 +00:00
2018-01-31 22:58:15 +00:00
class ParseFormulaCompound( ParseFormula ):
    """A formula that combines the results of several sub-formulae.
    
    Every sub-formula is parsed into a 'stream' of texts. The numbered
    placeholders in the substitution phrase (a backslash prefix followed by
    the 1-based stream number) are then replaced with the matching entries of
    those streams, producing one raw text per row.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_COMPOUND
    SERIALISABLE_NAME = ' Compound Parsing Formula '
    SERIALISABLE_VERSION = 1
    
    def __init__( self, formulae = None, sub_phrase = None, string_match = None, string_converter = None ):
        
        ParseFormula.__init__( self, string_match, string_converter )
        
        if formulae is None:
            
            # default to a single HTML formula
            formulae = HydrusSerialisable.SerialisableList()
            
            formulae.append( ParseFormulaHTML() )
            
        
        if sub_phrase is None:
            
            # default phrase is just the first stream's placeholder
            sub_phrase = ' \\ 1 '
            
        
        self._formulae = formulae
        
        self._sub_phrase = sub_phrase
        
    
    def _GetSerialisableInfo( self ):
        
        serialisable_formulae = HydrusSerialisable.SerialisableList( self._formulae ).GetSerialisableTuple()
        serialisable_string_match = self._string_match.GetSerialisableTuple()
        serialisable_string_converter = self._string_converter.GetSerialisableTuple()
        
        return ( serialisable_formulae, self._sub_phrase, serialisable_string_match, serialisable_string_converter )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_formulae, self._sub_phrase, serialisable_string_match, serialisable_string_converter ) = serialisable_info
        
        self._formulae = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formulae )
        self._string_match = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_match )
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_converter )
        
    
    def _ParseRawContents( self, parsing_context, data ):
        """Return one substituted phrase per row of the sub-formulae's streams.
        
        The number of rows is the length of the longest stream; shorter
        streams repeat their last entry. If any sub-formula finds nothing, no
        phrase can be completed and an empty list is returned.
        """
        
        def get_stream_data( index, s ):
            
            if len( s ) == 0:
                
                return ' '
                
            elif index >= len( s ):
                
                # a short stream keeps supplying its last entry
                return s[ -1 ]
                
            else:
                
                return s[ index ]
                
            
        
        streams = []
        
        for formula in self._formulae:
            
            stream = formula.Parse( parsing_context, data )
            
            if len( stream ) == 0: # no contents were found for one of the /1 replace components, so no valid strings can be made.
                
                return []
                
            
            streams.append( stream )
            
        
        num_raw_contents_to_make = max( ( len( stream ) for stream in streams ) )
        
        # Substitute the highest-numbered placeholder first. The placeholder
        # for stream 1 is a prefix of the one for stream 10, 11, ..., so
        # replacing in ascending order would corrupt those later placeholders.
        numbered_streams = list( enumerate( streams, 1 ) ) # starts counting from 1
        
        numbered_streams.reverse()
        
        raw_contents = []
        
        for stream_index in range( num_raw_contents_to_make ):
            
            raw_content = self._sub_phrase
            
            for ( stream_num, stream ) in numbered_streams:
                
                sub_component = ' \\ ' + str( stream_num )
                
                replace_string = get_stream_data( stream_index, stream )
                
                raw_content = raw_content.replace( sub_component, replace_string )
                
            
            raw_contents.append( raw_content )
            
        
        return raw_contents
        
    
    def ToPrettyString( self ):
        
        return ' COMPOUND with ' + HydrusData.ToHumanInt( len( self._formulae ) ) + ' formulae. '
        
    
    def ToPrettyMultilineString( self ):
        
        # one paragraph per sub-formula, then the substitution phrase
        s = [ formula.ToPrettyMultilineString() for formula in self._formulae ]
        
        s.append( ' and substitute into ' + self._sub_phrase )
        
        separator = os.linesep * 2
        
        text = ' --COMPOUND-- ' + os.linesep * 2 + separator.join( s )
        
        return text
        
    
    def ToTuple( self ):
        
        return ( self._formulae, self._sub_phrase, self._string_match, self._string_converter )
        
    

# register the class so serialised objects of this type can be recreated
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_COMPOUND ] = ParseFormulaCompound
2018-02-07 23:40:33 +00:00
class ParseFormulaContextVariable( ParseFormula ):
    """A formula that reads a named variable out of the parsing context.
    
    The data being parsed is ignored; the only raw text is the context's
    value for the variable, if it has one.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_CONTEXT_VARIABLE
    SERIALISABLE_NAME = ' Context Variable Formula '
    SERIALISABLE_VERSION = 1
    
    def __init__( self, variable_name = None, string_match = None, string_converter = None ):
        
        ParseFormula.__init__( self, string_match, string_converter )
        
        self._variable_name = ' url ' if variable_name is None else variable_name
        
    
    def _GetSerialisableInfo( self ):
        
        return ( self._variable_name, self._string_match.GetSerialisableTuple(), self._string_converter.GetSerialisableTuple() )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._variable_name, serialisable_string_match, serialisable_string_converter ) = serialisable_info
        
        self._string_match = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_match )
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_converter )
        
    
    def _ParseRawContents( self, parsing_context, data ):
        """Return a one-item list with the variable's value, or an empty list if it is absent."""
        
        if self._variable_name not in parsing_context:
            return []
        
        return [ parsing_context[ self._variable_name ] ]
        
    
    def ToPrettyString( self ):
        
        return ' CONTEXT VARIABLE: ' + self._variable_name
        
    
    def ToPrettyMultilineString( self ):
        
        description_lines = [ ' fetch the " ' + self._variable_name + ' " variable from the parsing context ' ]
        
        paragraph_break = os.linesep * 2
        
        return ' --CONTEXT VARIABLE-- ' + os.linesep * 2 + paragraph_break.join( description_lines )
        
    
    def ToTuple( self ):
        
        return ( self._variable_name, self._string_match, self._string_converter )
        
    

# register the class so serialised objects of this type can be recreated
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_CONTEXT_VARIABLE ] = ParseFormulaContextVariable
2017-12-13 22:33:07 +00:00
# What ParseFormulaHTML extracts from each tag it finds:
# ATTRIBUTE - the value of one named attribute of the tag
# STRING - the tag's text, via GetHTMLTagString
# HTML - the tag itself converted to unicode
HTML_CONTENT_ATTRIBUTE = 0
HTML_CONTENT_STRING = 1
HTML_CONTENT_HTML = 2
2018-01-31 22:58:15 +00:00
class ParseFormulaHTML( ParseFormula ):
    """A formula that walks an HTML document with a chain of tag rules.
    
    Starting from the document root, each tag rule maps the current nodes to a
    new set of nodes. From every node left at the end, the formula extracts an
    attribute value, the tag's text, or the tag's html, depending on
    content_to_fetch.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML
    SERIALISABLE_NAME = ' HTML Parsing Formula '
    SERIALISABLE_VERSION = 6
    
    def __init__( self, tag_rules = None, content_to_fetch = None, attribute_to_fetch = None, string_match = None, string_converter = None ):
        
        ParseFormula.__init__( self, string_match, string_converter )
        
        if tag_rules is None:
            
            # default to one default tag rule
            tag_rules = HydrusSerialisable.SerialisableList()
            
            tag_rules.append( ParseRuleHTML() )
            
        
        if content_to_fetch is None:
            content_to_fetch = HTML_CONTENT_ATTRIBUTE
        
        if attribute_to_fetch is None:
            attribute_to_fetch = ' href '
        
        # always held as a SerialisableList, whatever sequence the caller passed
        self._tag_rules = HydrusSerialisable.SerialisableList( tag_rules )
        
        self._content_to_fetch = content_to_fetch
        self._attribute_to_fetch = attribute_to_fetch
        
    
    def _FindHTMLTags( self, root ):
        """Apply each tag rule in turn, starting from root, and return the final nodes."""
        
        current_nodes = ( root, )
        
        for tag_rule in self._tag_rules:
            
            current_nodes = tag_rule.GetNodes( current_nodes )
            
        
        return current_nodes
        
    
    def _GetParsePrettySeparator( self ):
        
        # html results get a double line break between them, everything else a single one
        if self._content_to_fetch == HTML_CONTENT_HTML:
            return os.linesep * 2
        
        return os.linesep
        
    
    def _GetRawContentFromTag( self, tag ):
        """Extract the configured content from one tag.
        
        Raises HydrusExceptions.ParseException if the attribute is missing or
        if the extracted content is empty.
        """
        
        if self._content_to_fetch == HTML_CONTENT_ATTRIBUTE:
            
            if not tag.has_attr( self._attribute_to_fetch ):
                
                raise HydrusExceptions.ParseException( ' Attribute ' + self._attribute_to_fetch + ' not found! ' )
                
            
            attribute_value = tag[ self._attribute_to_fetch ]
            
            # 'class' attr returns a list because it has multiple values under html spec, wew
            if isinstance( attribute_value, list ):
                
                if len( attribute_value ) == 0:
                    
                    raise HydrusExceptions.ParseException( ' Attribute ' + self._attribute_to_fetch + ' not found! ' )
                    
                
                result = ' '.join( attribute_value )
                
            else:
                
                result = attribute_value
                
            
        elif self._content_to_fetch == HTML_CONTENT_STRING:
            
            result = GetHTMLTagString( tag )
            
        elif self._content_to_fetch == HTML_CONTENT_HTML:
            
            result = unicode( tag )
            
        
        if result is None or result == ' ':
            
            raise HydrusExceptions.ParseException( ' Empty/No results found! ' )
            
        
        return result
        
    
    def _GetRawContentsFromTags( self, tags ):
        """Return the content of every tag that yields some, skipping tags that raise ParseException."""
        
        extracted = []
        
        for tag in tags:
            
            try:
                
                content = self._GetRawContentFromTag( tag )
                
            except HydrusExceptions.ParseException:
                
                continue
                
            
            extracted.append( content )
            
        
        return extracted
        
    
    def _GetSerialisableInfo( self ):
        
        return (
            self._tag_rules.GetSerialisableTuple(),
            self._content_to_fetch,
            self._attribute_to_fetch,
            self._string_match.GetSerialisableTuple(),
            self._string_converter.GetSerialisableTuple()
        )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_tag_rules, self._content_to_fetch, self._attribute_to_fetch, serialisable_string_match, serialisable_string_converter ) = serialisable_info
        
        self._tag_rules = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_rules )
        self._string_match = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_match )
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_converter )
        
    
    def _ParseRawContents( self, parsing_context, data ):
        """Parse data as HTML and return the raw content of every tag the rules select.
        
        Any failure to build the soup is re-raised as
        HydrusExceptions.ParseException.
        """
        
        try:
            
            root = HG.client_controller.parsing_cache.GetSoup( data )
            
        except Exception as e:
            
            raise HydrusExceptions.ParseException( ' Unable to parse that HTML: ' + HydrusData.ToUnicode( e ) )
            
        
        return self._GetRawContentsFromTags( self._FindHTMLTags( root ) )
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Upgrade serialised info by one version; returns ( new_version, new_info )."""
        
        if version == 1:
            
            # v1 -> v2: add a culling/adding tuple that changes nothing
            ( tag_rules, attribute_to_fetch ) = old_serialisable_info
            
            culling_and_adding = ( 0, 0, ' ', ' ' )
            
            return ( 2, ( tag_rules, attribute_to_fetch, culling_and_adding ) )
            
        
        if version == 2:
            
            # v2 -> v3: express the culling/adding tuple as a StringConverter
            ( tag_rules, attribute_to_fetch, culling_and_adding ) = old_serialisable_info
            
            ( cull_front, cull_back, prepend, append ) = culling_and_adding
            
            transformations = []
            
            if cull_front > 0:
                transformations.append( ( STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING, cull_front ) )
            elif cull_front < 0:
                transformations.append( ( STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END, cull_front ) )
            
            if cull_back > 0:
                transformations.append( ( STRING_TRANSFORMATION_CLIP_TEXT_FROM_END, cull_back ) )
            elif cull_back < 0:
                transformations.append( ( STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING, cull_back ) )
            
            if prepend != ' ':
                transformations.append( ( STRING_TRANSFORMATION_PREPEND_TEXT, prepend ) )
            
            if append != ' ':
                transformations.append( ( STRING_TRANSFORMATION_APPEND_TEXT, append ) )
            
            serialisable_string_converter = StringConverter( transformations, ' parsed information ' ).GetSerialisableTuple()
            
            return ( 3, ( tag_rules, attribute_to_fetch, serialisable_string_converter ) )
            
        
        if version == 3:
            
            # v3 -> v4: add a default StringMatch
            ( tag_rules, attribute_to_fetch, serialisable_string_converter ) = old_serialisable_info
            
            serialisable_string_match = StringMatch().GetSerialisableTuple()
            
            return ( 4, ( tag_rules, attribute_to_fetch, serialisable_string_match, serialisable_string_converter ) )
            
        
        if version == 4:
            
            # v4 -> v5: an attribute of None used to mean 'fetch the string'; make that explicit
            ( tag_rules, attribute_to_fetch, serialisable_string_match, serialisable_string_converter ) = old_serialisable_info
            
            if attribute_to_fetch is None:
                
                content_to_fetch = HTML_CONTENT_STRING
                attribute_to_fetch = ' '
                
            else:
                
                content_to_fetch = HTML_CONTENT_ATTRIBUTE
                
            
            return ( 5, ( tag_rules, content_to_fetch, attribute_to_fetch, serialisable_string_match, serialisable_string_converter ) )
            
        
        if version == 5:
            
            # v5 -> v6: turn ( name, attrs, index ) tuples into descending ParseRuleHTML objects
            ( tag_rules, content_to_fetch, attribute_to_fetch, serialisable_string_match, serialisable_string_converter ) = old_serialisable_info
            
            new_tag_rules = HydrusSerialisable.SerialisableList()
            
            for ( name, attrs, index ) in tag_rules:
                
                new_tag_rules.append( ParseRuleHTML( rule_type = HTML_RULE_TYPE_DESCENDING, tag_name = name, tag_attributes = attrs, tag_index = index ) )
                
            
            serialisable_new_tag_rules = new_tag_rules.GetSerialisableTuple()
            
            return ( 6, ( serialisable_new_tag_rules, content_to_fetch, attribute_to_fetch, serialisable_string_match, serialisable_string_converter ) )
            
        
    
    def ParsesSeparatedContent( self ):
        
        return self._content_to_fetch == HTML_CONTENT_HTML
        
    
    def ToPrettyString( self ):
        
        return ' HTML with ' + HydrusData.ToHumanInt( len( self._tag_rules ) ) + ' tag rules. '
        
    
    def ToPrettyMultilineString( self ):
        
        steps = [ tag_rule.ToString() for tag_rule in self._tag_rules ]
        
        if self._content_to_fetch == HTML_CONTENT_ATTRIBUTE:
            steps.append( ' get the ' + self._attribute_to_fetch + ' attribute of those tags ' )
        elif self._content_to_fetch == HTML_CONTENT_STRING:
            steps.append( ' get the text content of those tags ' )
        elif self._content_to_fetch == HTML_CONTENT_HTML:
            steps.append( ' get the html of those tags ' )
        
        steps.extend( self._string_converter.GetTransformationStrings() )
        
        step_separator = os.linesep + ' and then '
        
        return ' --HTML-- ' + os.linesep + step_separator.join( steps )
        
    
    def ToTuple( self ):
        
        return ( self._tag_rules, self._content_to_fetch, self._attribute_to_fetch, self._string_match, self._string_converter )
        
    

# register the class so serialised objects of this type can be recreated
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML ] = ParseFormulaHTML
2018-04-11 22:30:40 +00:00
# Kinds of ParseRuleHTML:
# DESCENDING - searches beneath each node with find_all (also the default rule type)
# ASCENDING - NOTE(review): presumably walks up through a node's ancestors; confirm against ParseRuleHTML.GetNodes
HTML_RULE_TYPE_DESCENDING = 0
HTML_RULE_TYPE_ASCENDING = 1
class ParseRuleHTML ( HydrusSerialisable . SerialisableBase ) :
SERIALISABLE_TYPE = HydrusSerialisable . SERIALISABLE_TYPE_PARSE_RULE_HTML
SERIALISABLE_NAME = ' HTML Parsing Rule '
2018-05-30 20:13:21 +00:00
SERIALISABLE_VERSION = 2
2018-04-11 22:30:40 +00:00
2018-05-30 20:13:21 +00:00
def __init__ ( self , rule_type = None , tag_name = None , tag_attributes = None , tag_index = None , tag_depth = None , should_test_tag_string = False , tag_string_string_match = None ) :
2018-04-11 22:30:40 +00:00
HydrusSerialisable . SerialisableBase . __init__ ( self )
if rule_type is None :
rule_type = HTML_RULE_TYPE_DESCENDING
if tag_name is None :
tag_name = ' a '
if rule_type == HTML_RULE_TYPE_DESCENDING :
if tag_attributes is None :
tag_attributes = { }
elif rule_type == HTML_RULE_TYPE_ASCENDING :
if tag_depth is None :
tag_depth = 1
2018-05-30 20:13:21 +00:00
if tag_string_string_match is None :
tag_string_string_match = StringMatch ( )
2018-04-11 22:30:40 +00:00
self . _rule_type = rule_type
self . _tag_name = tag_name
self . _tag_attributes = tag_attributes
self . _tag_index = tag_index
self . _tag_depth = tag_depth
2018-05-30 20:13:21 +00:00
self . _should_test_tag_string = should_test_tag_string
self . _tag_string_string_match = tag_string_string_match
2018-04-11 22:30:40 +00:00
def _GetSerialisableInfo( self ):
    """Return the rule's state as a tuple of serialisable values.

    The tag string StringMatch is stored as its own serialisable tuple;
    every other field is stored as-is.
    """
    
    string_match_tuple = self._tag_string_string_match.GetSerialisableTuple()
    
    search_fields = ( self._rule_type, self._tag_name, self._tag_attributes, self._tag_index, self._tag_depth )
    
    return search_fields + ( self._should_test_tag_string, string_match_tuple )
2018-04-11 22:30:40 +00:00
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
    """Restore the rule's state from the tuple made by _GetSerialisableInfo."""
    
    ( rule_type, tag_name, tag_attributes, tag_index, tag_depth, should_test_tag_string, string_match_tuple ) = serialisable_info
    
    self._rule_type = rule_type
    self._tag_name = tag_name
    self._tag_attributes = tag_attributes
    self._tag_index = tag_index
    self._tag_depth = tag_depth
    self._should_test_tag_string = should_test_tag_string
    
    # the string match was stored as a serialisable tuple, so rebuild the object
    self._tag_string_string_match = HydrusSerialisable.CreateFromSerialisableTuple( string_match_tuple )
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
    """Upgrade version 1 serialised data to version 2.

    Version 1 had no tag string test, so the upgrade turns the test off
    and stores a default StringMatch. Returns ( 2, new_info ) for
    version 1 input and None for any other version.
    """
    
    if version != 1:
        
        return None
        
    
    ( rule_type, tag_name, tag_attributes, tag_index, tag_depth ) = old_serialisable_info
    
    default_string_match_tuple = StringMatch().GetSerialisableTuple()
    
    upgraded_info = ( rule_type, tag_name, tag_attributes, tag_index, tag_depth, False, default_string_match_tuple )
    
    return ( 2, upgraded_info )
2018-04-11 22:30:40 +00:00
# Apply this rule to every node in 'nodes' and return the combined list of nodes it selects.
def GetNodes ( self , nodes ) :
new_nodes = [ ]
for node in nodes :
if self . _rule_type == HTML_RULE_TYPE_DESCENDING :
# descending: find_all below this node, filtering on the attributes and, when one is set, the tag name
kwargs = { ' attrs ' : self . _tag_attributes }
if self . _tag_name is not None :
kwargs [ ' name ' ] = self . _tag_name
found_nodes = node . find_all ( * * kwargs )
# a tag index keeps just that one match, or nothing when there are too few matches
if self . _tag_index is not None :
if len ( found_nodes ) < self . _tag_index + 1 :
found_nodes = [ ]
else :
found_nodes = [ found_nodes [ self . _tag_index ] ]
elif self . _rule_type == HTML_RULE_TYPE_ASCENDING :
# ascending: follow .parent links, counting ancestors (only those with the wanted name when a tag name is set)
# and stopping at the one that brings the count up to the tag depth
found_nodes = [ ]
still_in_tree = lambda node : isinstance ( node , bs4 . element . Tag ) # if we go one above html, we get the BS document itself
num_found = 0
potential_parent = node . parent
while still_in_tree ( potential_parent ) :
if self . _tag_name is None :
num_found + = 1
else :
if potential_parent . name == self . _tag_name :
num_found + = 1
if num_found == self . _tag_depth :
found_nodes = [ potential_parent ]
break
potential_parent = potential_parent . parent
new_nodes . extend ( found_nodes )
2018-05-30 20:13:21 +00:00
# optional final filter: keep only the nodes whose GetHTMLTagString text passes the string match
if self . _should_test_tag_string :
potential_nodes = new_nodes
new_nodes = [ ]
for node in potential_nodes :
s = GetHTMLTagString ( node )
if self . _tag_string_string_match . Matches ( s ) :
new_nodes . append ( node )
2018-04-11 22:30:40 +00:00
return new_nodes
# Build a one-line human-readable description of what this rule searches for.
def ToString ( self ) :
if self . _rule_type == HTML_RULE_TYPE_DESCENDING :
s = ' search descendents for '
# no tag index means every match is kept; otherwise the index is shown as a 1-based ordinal
if self . _tag_index is None :
s + = ' every '
else :
num = self . _tag_index + 1
s + = ' the ' + HydrusData . ConvertIntToPrettyOrdinalString ( num )
if self . _tag_name is not None :
s + = ' < ' + self . _tag_name + ' > '
s + = ' tag '
if len ( self . _tag_attributes ) > 0 :
s + = ' with attributes ' + ' , ' . join ( key + ' = ' + value for ( key , value ) in self . _tag_attributes . items ( ) )
elif self . _rule_type == HTML_RULE_TYPE_ASCENDING :
s = ' walk back up ancestors '
# without a tag name the depth counts every ancestor level; with one it counts only ancestors of that name
if self . _tag_name is None :
2018-07-04 20:48:28 +00:00
s + = ' ' + HydrusData . ToHumanInt ( self . _tag_depth ) + ' tag levels '
2018-04-11 22:30:40 +00:00
else :
s + = ' to the ' + HydrusData . ConvertIntToPrettyOrdinalString ( self . _tag_depth ) + ' < ' + self . _tag_name + ' > tag '
2018-05-30 20:13:21 +00:00
# mention the tag string test only when it is switched on
if self . _should_test_tag_string :
s + = ' with strings that match ' + self . _tag_string_string_match . ToUnicode ( )
2018-04-11 22:30:40 +00:00
return s
def ToTuple( self ):
    """Return the rule's fields as a tuple of live objects.

    Unlike the serialisable info, the tag string StringMatch is returned
    as the object itself.
    """
    
    fields = (
        self._rule_type,
        self._tag_name,
        self._tag_attributes,
        self._tag_index,
        self._tag_depth,
        self._should_test_tag_string,
        self._tag_string_string_match
    )
    
    return fields
2018-04-11 22:30:40 +00:00
# register ParseRuleHTML under its serialisable type id in the type -> class lookup
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_PARSE_RULE_HTML ] = ParseRuleHTML
2018-01-31 22:58:15 +00:00
# what ParseFormulaJSON fetches from the nodes it lands on: the value as a string, or the node dumped back out as json text
JSON_CONTENT_STRING = 0
JSON_CONTENT_JSON = 1
# A parse formula that walks a parsed JSON document with a list of parse rules and returns the content found at the end.
class ParseFormulaJSON ( ParseFormula ) :
SERIALISABLE_TYPE = HydrusSerialisable . SERIALISABLE_TYPE_PARSE_FORMULA_JSON
SERIALISABLE_NAME = ' JSON Parsing Formula '
SERIALISABLE_VERSION = 1
# Build a formula. string_match and string_converter are handed to ParseFormula.__init__;
# parse_rules defaults to a single dictionary-key rule and content_to_fetch defaults to JSON_CONTENT_STRING.
def __init__ ( self , parse_rules = None , content_to_fetch = None , string_match = None , string_converter = None ) :
ParseFormula . __init__ ( self , string_match , string_converter )
if parse_rules is None :
2017-11-22 21:03:07 +00:00
2018-01-31 22:58:15 +00:00
parse_rules = [ ' posts ' ]
2017-11-22 21:03:07 +00:00
2016-10-26 20:45:34 +00:00
2018-01-31 22:58:15 +00:00
if content_to_fetch is None :
content_to_fetch = JSON_CONTENT_STRING
self . _parse_rules = parse_rules
self . _content_to_fetch = content_to_fetch
2016-07-20 19:57:10 +00:00
2018-02-21 21:59:37 +00:00
def _GetParsePrettySeparator( self ):
    """Return the separator placed between results in pretty output.

    JSON results are separated by two line separators, string results by
    one.
    """
    
    num_line_breaks = 2 if self._content_to_fetch == JSON_CONTENT_JSON else 1
    
    return os.linesep * num_line_breaks
2018-01-31 22:58:15 +00:00
def _GetRawContentsFromJSON( self, j ):
    """Walk the parsed JSON object j with the parse rules and return the raw contents found.

    Each parse rule is applied in turn to every node reached so far:
    None steps into every item of a list, an int steps into that list
    index, and anything else is used as a dictionary key. Nodes of the
    wrong type, or lacking the index or key, are dropped.

    The surviving nodes are then converted to text. When fetching
    strings, lists and dicts are skipped and everything else goes
    through HydrusData.ToUnicode; when fetching json, every node is
    dumped with json.dumps.
    """
    
    def descend( node, rule ):
        
        # returns the children of node that this rule selects, or an empty list if node does not fit the rule
        
        if rule is None:
            
            return list( node ) if isinstance( node, list ) else []
            
        
        if isinstance( rule, int ):
            
            if isinstance( node, list ) and not len( node ) < rule + 1:
                
                return [ node[ rule ] ]
                
            
            return []
            
        
        if isinstance( node, dict ) and rule in node:
            
            return [ node[ rule ] ]
            
        
        return []
        
    
    current_nodes = ( j, )
    
    for rule in self._parse_rules:
        
        current_nodes = [ child for node in current_nodes for child in descend( node, rule ) ]
        
    
    raw_contents = []
    
    for node in current_nodes:
        
        if self._content_to_fetch == JSON_CONTENT_STRING:
            
            if isinstance( node, ( list, dict ) ):
                
                continue
                
            
            raw_content = HydrusData.ToUnicode( node )
            
        elif self._content_to_fetch == JSON_CONTENT_JSON:
            
            raw_content = json.dumps( node )
            
        
        raw_contents.append( raw_content )
        
    
    return raw_contents
def _GetSerialisableInfo( self ):
    """Return the formula's state as a tuple of serialisable values.

    The parse rules and content to fetch are stored as-is; the string
    match and string converter are stored as their serialisable tuples.
    """
    
    string_match_tuple = self._string_match.GetSerialisableTuple()
    string_converter_tuple = self._string_converter.GetSerialisableTuple()
    
    info = ( self._parse_rules, self._content_to_fetch, string_match_tuple, string_converter_tuple )
    
    return info
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
    """Restore the formula's state from the tuple made by _GetSerialisableInfo."""
    
    ( parse_rules, content_to_fetch, string_match_tuple, string_converter_tuple ) = serialisable_info
    
    self._parse_rules = parse_rules
    self._content_to_fetch = content_to_fetch
    
    # these two were stored as serialisable tuples, so rebuild the objects
    self._string_match = HydrusSerialisable.CreateFromSerialisableTuple( string_match_tuple )
    self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple( string_converter_tuple )
2018-02-21 21:59:37 +00:00
# Parse 'data' as JSON and return the raw contents this formula's rules select from it.
# Any exception from fetching the parsed JSON is re-raised as a HydrusExceptions.ParseException.
# parsing_context is not used in this method.
def _ParseRawContents ( self , parsing_context , data ) :
2018-01-31 22:58:15 +00:00
2018-02-21 21:59:37 +00:00
try :
2018-04-25 22:07:52 +00:00
# the parsed object is fetched through the client controller's parsing cache
j = HG . client_controller . parsing_cache . GetJSON ( data )
2018-02-21 21:59:37 +00:00
except Exception as e :
raise HydrusExceptions . ParseException ( ' Unable to parse that JSON: ' + HydrusData . ToUnicode ( e ) )
2018-01-31 22:58:15 +00:00
raw_contents = self . _GetRawContentsFromJSON ( j )
return raw_contents
def ParsesSeparatedContent( self ):
    """Return True when this formula fetches json rather than strings."""
    
    fetches_json = self._content_to_fetch == JSON_CONTENT_JSON
    
    return fetches_json
# One-line summary of this formula: just how many parse rules it has.
def ToPrettyString ( self ) :
2018-07-04 20:48:28 +00:00
return ' JSON with ' + HydrusData . ToHumanInt ( len ( self . _parse_rules ) ) + ' parse rules. '
2018-01-31 22:58:15 +00:00
# Multi-line description of this formula: a header line, then one line per step, the steps being
# each rendered parse rule, what is fetched at the end, and the string converter's transformations.
def ToPrettyMultilineString ( self ) :
pretty_strings = [ RenderJSONParseRule ( p_r ) for p_r in self . _parse_rules ]
if self . _content_to_fetch == JSON_CONTENT_STRING :
2016-10-19 20:02:56 +00:00
2018-01-31 22:58:15 +00:00
pretty_strings . append ( ' get final data content, converting to strings as needed ' )
2016-10-19 20:02:56 +00:00
2018-01-31 22:58:15 +00:00
elif self . _content_to_fetch == JSON_CONTENT_JSON :
2016-10-19 20:02:56 +00:00
2018-01-31 22:58:15 +00:00
pretty_strings . append ( ' get the json beneath ' )
2016-10-19 20:02:56 +00:00
2017-11-15 22:35:49 +00:00
pretty_strings . extend ( self . _string_converter . GetTransformationStrings ( ) )
2016-11-16 20:21:43 +00:00
2016-10-19 20:02:56 +00:00
# steps are joined with a line break plus this connective
separator = os . linesep + ' and then '
2018-01-31 22:58:15 +00:00
pretty_multiline_string = ' --JSON-- ' + os . linesep + separator . join ( pretty_strings )
2016-10-19 20:02:56 +00:00
return pretty_multiline_string
2016-09-07 20:01:05 +00:00
def ToTuple( self ):
    """Return the formula's fields as a tuple of live objects.

    The tuple holds, in order: the parse rules, the content to fetch,
    the string match and the string converter.
    """
    
    fields = (
        self._parse_rules,
        self._content_to_fetch,
        self._string_match,
        self._string_converter
    )
    
    return fields
2016-07-20 19:57:10 +00:00
2018-01-31 22:58:15 +00:00
# register ParseFormulaJSON under its serialisable type id in the type -> class lookup
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_PARSE_FORMULA_JSON ] = ParseFormulaJSON
2016-09-21 19:54:04 +00:00
2018-04-11 22:30:40 +00:00
# A named wrapper around a single parse formula. Its serialised form is just the wrapped formula's serialisable tuple.
class SimpleDownloaderParsingFormula ( HydrusSerialisable . SerialisableBaseNamed ) :
SERIALISABLE_TYPE = HydrusSerialisable . SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_PARSE_FORMULA
SERIALISABLE_NAME = ' Simple Downloader Parsing Formula '
SERIALISABLE_VERSION = 1
# name and formula both have defaults; the default formula is a default-constructed ParseFormulaHTML
def __init__ ( self , name = None , formula = None ) :
if name is None :
name = ' new parsing formula '
if formula is None :
formula = ParseFormulaHTML ( )
HydrusSerialisable . SerialisableBaseNamed . __init__ ( self , name )
self . _formula = formula
# only the formula is stored here; the name is passed to SerialisableBaseNamed.__init__ above
def _GetSerialisableInfo ( self ) :
serialisable_formula = self . _formula . GetSerialisableTuple ( )
return serialisable_formula
# rebuild the formula object from its serialisable tuple
def _InitialiseFromSerialisableInfo ( self , serialisable_info ) :
serialisable_formula = serialisable_info
self . _formula = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_formula )
# returns the wrapped formula object itself, not a copy
def GetFormula ( self ) :
return self . _formula
# register SimpleDownloaderParsingFormula under its serialisable type id in the type -> class lookup
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_PARSE_FORMULA ] = SimpleDownloaderParsingFormula
2017-12-13 22:33:07 +00:00
# Runs one formula over some data and labels what comes out with a name, a content type and type-specific additional info.
# For veto content the parser does not return results; it raises a VetoException or returns nothing.
class ContentParser ( HydrusSerialisable . SerialisableBase ) :
2016-09-21 19:54:04 +00:00
2017-12-13 22:33:07 +00:00
SERIALISABLE_TYPE = HydrusSerialisable . SERIALISABLE_TYPE_CONTENT_PARSER
SERIALISABLE_NAME = ' Content Parser '
2018-06-06 21:27:02 +00:00
# version 2 changed the veto additional info to hold a StringMatch, version 3 remapped the url types (see _UpdateSerialisableInfo)
SERIALISABLE_VERSION = 3
2016-09-21 19:54:04 +00:00
# Build a content parser, filling in defaults for arguments passed as None.
# The content type defaults to mappings and the formula to a default-constructed ParseFormulaHTML.
# additional_info only gets a default for mappings content; for other content types a None is stored as given.
def __init__ ( self , name = None , content_type = None , formula = None , additional_info = None ) :
2016-10-19 20:02:56 +00:00
if name is None :
name = ' '
if content_type is None :
content_type = HC . CONTENT_TYPE_MAPPINGS
if formula is None :
formula = ParseFormulaHTML ( )
if additional_info is None :
if content_type == HC . CONTENT_TYPE_MAPPINGS :
additional_info = ' '
2016-09-21 19:54:04 +00:00
self . _name = name
self . _content_type = content_type
self . _formula = formula
self . _additional_info = additional_info
def _GetSerialisableInfo( self ):
    """Return the parser's state as a tuple of serialisable values.

    The formula is stored as its serialisable tuple. For veto content
    the additional info contains a StringMatch object, which is also
    converted; for every other content type it is stored unchanged.
    """
    
    formula_tuple = self._formula.GetSerialisableTuple()
    
    additional_info = self._additional_info
    
    if self._content_type == HC.CONTENT_TYPE_VETO:
        
        ( veto_if_matches_found, string_match ) = additional_info
        
        additional_info = ( veto_if_matches_found, string_match.GetSerialisableTuple() )
        
    
    return ( self._name, self._content_type, formula_tuple, additional_info )
2016-09-21 19:54:04 +00:00
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
    """Restore the parser's state from the tuple made by _GetSerialisableInfo.

    For veto content the stored string match tuple is rebuilt into an
    object. For other content types, additional info that arrives as a
    list is converted to a tuple.
    """
    
    ( name, content_type, formula_tuple, stored_additional_info ) = serialisable_info
    
    self._name = name
    self._content_type = content_type
    
    if content_type == HC.CONTENT_TYPE_VETO:
        
        ( veto_if_matches_found, string_match_tuple ) = stored_additional_info
        
        additional_info = ( veto_if_matches_found, HydrusSerialisable.CreateFromSerialisableTuple( string_match_tuple ) )
        
    elif isinstance( stored_additional_info, list ):
        
        additional_info = tuple( stored_additional_info )
        
    else:
        
        additional_info = stored_additional_info
        
    
    self._additional_info = additional_info
    
    self._formula = HydrusSerialisable.CreateFromSerialisableTuple( formula_tuple )
2018-02-07 23:40:33 +00:00
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
    """Upgrade serialised data by one version step.

    Version 1 to 2: veto additional info changes from
    ( veto_if_matches_found, match_if_text_present, search_text ) to
    ( veto_if_matches_found, serialisable StringMatch ).

    Version 2 to 3: for url content, the file url type becomes the
    desired type, the post type becomes the source type, and anything
    else becomes the next type.

    Returns ( new_version, new_info ), or None for any other version.
    """
    
    if version == 1:
        
        ( name, content_type, serialisable_formula, additional_info ) = old_serialisable_info
        
        if content_type == HC.CONTENT_TYPE_VETO:
            
            ( veto_if_matches_found, match_if_text_present, search_text ) = additional_info
            
            if match_if_text_present:
                
                # the old search text becomes a regex match, with itself as the example string
                replacement_match = StringMatch( match_type = STRING_MATCH_REGEX, match_value = search_text, example_string = search_text )
                
            else:
                
                replacement_match = StringMatch()
                
            
            additional_info = ( veto_if_matches_found, replacement_match.GetSerialisableTuple() )
            
        
        return ( 2, ( name, content_type, serialisable_formula, additional_info ) )
        
    
    if version == 2:
        
        ( name, content_type, serialisable_formula, additional_info ) = old_serialisable_info
        
        if content_type == HC.CONTENT_TYPE_URLS:
            
            ( old_url_type, priority ) = additional_info
            
            if old_url_type == HC.URL_TYPE_FILE:
                
                new_url_type = HC.URL_TYPE_DESIRED
                
            elif old_url_type == HC.URL_TYPE_POST:
                
                new_url_type = HC.URL_TYPE_SOURCE
                
            else:
                
                new_url_type = HC.URL_TYPE_NEXT
                
            
            additional_info = ( new_url_type, priority )
            
        
        return ( 3, ( name, content_type, serialisable_formula, additional_info ) )
2018-02-07 23:40:33 +00:00
2018-01-24 23:09:42 +00:00
def GetName( self ):
    """Return the name given to this content parser."""
    
    name = self._name
    
    return name
2016-09-21 19:54:04 +00:00
def GetParsableContent( self ):
    """Return a one-element set describing what this parser produces.

    The element is the ( name, content type, additional info ) tuple.
    """
    
    content_description = ( self._name, self._content_type, self._additional_info )
    
    parsable_content = set()
    
    parsable_content.add( content_description )
    
    return parsable_content
2016-09-21 19:54:04 +00:00
2018-02-21 21:59:37 +00:00
# Run the formula over 'data' and return a list of ( content_description, parsed_text ) pairs,
# where content_description is ( name, content type, additional info ).
# For veto content nothing is returned: a VetoException carrying this parser's name is raised when the veto condition holds.
# A ParseException from the formula is re-raised with this parser's name prefixed to the message.
def Parse ( self , parsing_context , data ) :
2016-09-21 19:54:04 +00:00
2018-07-18 21:07:15 +00:00
try :
parsed_texts = self . _formula . Parse ( parsing_context , data )
except HydrusExceptions . ParseException as e :
prefix = ' Content Parser ' + self . _name + ' : '
e = HydrusExceptions . ParseException ( prefix + HydrusData . ToUnicode ( e ) )
raise e
2016-09-21 19:54:04 +00:00
2018-04-18 22:10:15 +00:00
# parsed urls are joined against the page url from the parsing context, when the context has one
if self . _content_type == HC . CONTENT_TYPE_URLS :
if ' url ' in parsing_context :
base_url = parsing_context [ ' url ' ]
parsed_texts = [ urlparse . urljoin ( base_url , parsed_text ) for parsed_text in parsed_texts ]
2016-11-02 21:09:14 +00:00
if self . _content_type == HC . CONTENT_TYPE_VETO :
2018-02-07 23:40:33 +00:00
( veto_if_matches_found , string_match ) = self . _additional_info
2016-11-02 21:09:14 +00:00
2018-02-07 23:40:33 +00:00
# a match is found if any parsed text passes the string match
match_found = True in ( string_match . Matches ( parsed_text ) for parsed_text in parsed_texts )
2017-12-13 22:33:07 +00:00
2018-02-07 23:40:33 +00:00
# the flag picks the veto direction: veto when something matched, or veto when nothing matched
veto_if_missing = not veto_if_matches_found
do_veto = ( veto_if_matches_found and match_found ) or ( veto_if_missing and not match_found )
2018-01-24 23:09:42 +00:00
if do_veto :
raise HydrusExceptions . VetoException ( self . _name )
else :
return [ ]
2016-11-02 21:09:14 +00:00
else :
2018-01-24 23:09:42 +00:00
content_description = ( self . _name , self . _content_type , self . _additional_info )
2016-11-02 21:09:14 +00:00
return [ ( content_description , parsed_text ) for parsed_text in parsed_texts ]
2016-09-21 19:54:04 +00:00
2018-02-21 21:59:37 +00:00
# Run Parse and return the results as one block of text: a header line with the result count,
# one line per result, and a footer line. A veto is reported as a single result line rather than raised.
def ParsePretty ( self , parsing_context , data ) :
try :
parse_results = self . Parse ( parsing_context , data )
results = [ ConvertParseResultToPrettyString ( parse_result ) for parse_result in parse_results ]
except HydrusExceptions . VetoException as e :
results = [ ' veto: ' + HydrusData . ToUnicode ( e ) ]
2018-07-18 21:07:15 +00:00
# NOTE(review): Parse already prefixes a formula's ParseException with this parser's name, so an error that arrives
# through that path gets the same prefix a second time here
except HydrusExceptions . ParseException as e :
prefix = ' Content Parser ' + self . _name + ' : '
e = HydrusExceptions . ParseException ( prefix + HydrusData . ToUnicode ( e ) )
raise e
2018-02-21 21:59:37 +00:00
2018-07-04 20:48:28 +00:00
result_lines = [ ' *** ' + HydrusData . ToHumanInt ( len ( results ) ) + ' RESULTS BEGIN *** ' ]
2018-02-21 21:59:37 +00:00
result_lines . extend ( results )
result_lines . append ( ' *** RESULTS END *** ' )
results_text = os . linesep . join ( result_lines )
return results_text
2018-01-24 23:09:42 +00:00
def SetName( self, name ):
    """Replace the name given to this content parser."""
    
    self._name = name
2016-10-19 20:02:56 +00:00
# Returns a ( name, kind label, pretty description of the parsable content ) tuple; veto content is included in the description.
def ToPrettyStrings ( self ) :
2016-09-21 19:54:04 +00:00
2016-11-02 21:09:14 +00:00
return ( self . _name , ' content ' , ConvertParsableContentToPrettyString ( self . GetParsableContent ( ) , include_veto = True ) )
2016-10-19 20:02:56 +00:00
def ToTuple( self ):
    """Return the parser's fields as a tuple of live objects.

    The tuple holds, in order: the name, the content type, the formula
    and the additional info.
    """
    
    fields = (
        self._name,
        self._content_type,
        self._formula,
        self._additional_info
    )
    
    return fields
2016-09-21 19:54:04 +00:00
2017-12-13 22:33:07 +00:00
# register ContentParser under its serialisable type id in the type -> class lookup
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_CONTENT_PARSER ] = ContentParser
# Parses a whole page. It converts the page data with a string converter, runs its content parsers over the result,
# and can split the page into posts with ( formula, page parser ) pairs that are each parsed by a nested PageParser.
class PageParser ( HydrusSerialisable . SerialisableBaseNamed ) :
SERIALISABLE_TYPE = HydrusSerialisable . SERIALISABLE_TYPE_PAGE_PARSER
SERIALISABLE_NAME = ' Page Parser '
2018-02-07 23:40:33 +00:00
# version 2 added the example parsing context (see _UpdateSerialisableInfo)
SERIALISABLE_VERSION = 2
2017-12-13 22:33:07 +00:00
2018-02-07 23:40:33 +00:00
# Build a page parser, filling in defaults for arguments passed as None: a freshly generated parser key,
# a default StringConverter, empty lists for the sub page parsers, content parsers and example urls,
# and an example parsing context holding a placeholder url.
def __init__ ( self , name , parser_key = None , string_converter = None , sub_page_parsers = None , content_parsers = None , example_urls = None , example_parsing_context = None ) :
2018-01-24 23:09:42 +00:00
if parser_key is None :
parser_key = HydrusData . GenerateKey ( )
if string_converter is None :
string_converter = StringConverter ( )
2018-01-31 22:58:15 +00:00
# sub_page_parsers is a list of ( formula, page parser ) pairs
if sub_page_parsers is None :
2018-01-24 23:09:42 +00:00
2018-01-31 22:58:15 +00:00
sub_page_parsers = [ ]
2016-11-02 21:09:14 +00:00
2017-12-13 22:33:07 +00:00
if content_parsers is None :
2016-11-02 21:09:14 +00:00
2017-12-13 22:33:07 +00:00
content_parsers = [ ]
2016-11-02 21:09:14 +00:00
2017-12-13 22:33:07 +00:00
2018-01-24 23:09:42 +00:00
if example_urls is None :
example_urls = [ ]
2018-02-07 23:40:33 +00:00
if example_parsing_context is None :
example_parsing_context = { }
example_parsing_context [ ' url ' ] = ' http://example.com/posts/index.php?id=123456 '
2017-12-13 22:33:07 +00:00
HydrusSerialisable . SerialisableBaseNamed . __init__ ( self , name )
2018-01-24 23:09:42 +00:00
self . _parser_key = parser_key
self . _string_converter = string_converter
2018-01-31 22:58:15 +00:00
self . _sub_page_parsers = sub_page_parsers
2017-12-13 22:33:07 +00:00
self . _content_parsers = content_parsers
2018-01-24 23:09:42 +00:00
self . _example_urls = example_urls
2018-02-07 23:40:33 +00:00
self . _example_parsing_context = example_parsing_context
2018-01-24 23:09:42 +00:00
2018-01-31 22:58:15 +00:00
# Return the parser's state as a tuple of serialisable values.
def _GetSerialisableInfo ( self ) :
# the parser key is stored hex-encoded
serialisable_parser_key = self . _parser_key . encode ( ' hex ' )
serialisable_string_converter = self . _string_converter . GetSerialisableTuple ( )
# each ( formula, page parser ) pair is stored as a pair of serialisable tuples
serialisable_sub_page_parsers = [ ( formula . GetSerialisableTuple ( ) , page_parser . GetSerialisableTuple ( ) ) for ( formula , page_parser ) in self . _sub_page_parsers ]
# the content parsers are wrapped in a SerialisableList and stored as one serialisable tuple
serialisable_content_parsers = HydrusSerialisable . SerialisableList ( self . _content_parsers ) . GetSerialisableTuple ( )
2018-02-07 23:40:33 +00:00
return ( self . _name , serialisable_parser_key , serialisable_string_converter , serialisable_sub_page_parsers , serialisable_content_parsers , self . _example_urls , self . _example_parsing_context )
2018-01-31 22:58:15 +00:00
# Restore the parser's state from the tuple made by _GetSerialisableInfo.
def _InitialiseFromSerialisableInfo ( self , serialisable_info ) :
2018-02-07 23:40:33 +00:00
# name, example urls and example parsing context are taken as stored; the rest are rebuilt below
( self . _name , serialisable_parser_key , serialisable_string_converter , serialisable_sub_page_parsers , serialisable_content_parsers , self . _example_urls , self . _example_parsing_context ) = serialisable_info
2018-01-31 22:58:15 +00:00
self . _parser_key = serialisable_parser_key . decode ( ' hex ' )
self . _string_converter = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_string_converter )
self . _sub_page_parsers = [ ( HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_formula ) , HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_page_parser ) ) for ( serialisable_formula , serialisable_page_parser ) in serialisable_sub_page_parsers ]
# this stores whatever CreateFromSerialisableTuple builds from the stored SerialisableList tuple
self . _content_parsers = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_content_parsers )
2018-02-07 23:40:33 +00:00
# Upgrade version 1 serialised data to version 2 by appending an example parsing context that holds a placeholder url.
# Returns ( 2, new_info ) for version 1 input.
def _UpdateSerialisableInfo ( self , version , old_serialisable_info ) :
if version == 1 :
( name , serialisable_parser_key , serialisable_string_converter , serialisable_sub_page_parsers , serialisable_content_parsers , example_urls ) = old_serialisable_info
example_parsing_context = { }
example_parsing_context [ ' url ' ] = ' http://example.com/posts/index.php?id=123456 '
new_serialisable_info = ( name , serialisable_parser_key , serialisable_string_converter , serialisable_sub_page_parsers , serialisable_content_parsers , example_urls , example_parsing_context )
return ( 2 , new_serialisable_info )
2018-04-11 22:30:40 +00:00
2018-01-24 23:09:42 +00:00
def GetContentParsers( self ):
    """Return ( sub page parsers, content parsers ) as held by this parser."""
    
    parsers = ( self._sub_page_parsers, self._content_parsers )
    
    return parsers
2018-01-24 23:09:42 +00:00
2018-02-07 23:40:33 +00:00
def GetExampleParsingContext( self ):
    """Return the example parsing context stored on this parser."""
    
    example_parsing_context = self._example_parsing_context
    
    return example_parsing_context
2018-01-24 23:09:42 +00:00
def GetExampleURLs( self ):
    """Return the example urls stored on this parser."""
    
    example_urls = self._example_urls
    
    return example_urls
2018-04-18 22:10:15 +00:00
def GetNamespaces( self ):
    """Return the namespaces found in everything this parser can produce.

    The work is done by GetNamespacesFromParsableContent on this
    parser's parsable content.
    """
    
    # this in future could expand to be more granular like:
    # 'I want the artist tags, but not the user-submitted.'
    # 'I want the title here, but not the title there.'
    # 'I want the original filename, but not the UNIX timestamp filename.'
    # which the parser could present with its sub-parsing element names
    
    parsable_content = self.GetParsableContent()
    
    return GetNamespacesFromParsableContent( parsable_content )
2018-01-31 22:58:15 +00:00
def GetParsableContent( self ):
    """Return the set of everything this parser and its sub page parsers can produce.

    The sub page parsers are asked first, then this parser's own content
    parsers.
    """
    
    sources = [ page_parser for ( formula, page_parser ) in self._sub_page_parsers ]
    
    sources.extend( self._content_parsers )
    
    parsable_content = set()
    
    for source in sources:
        
        parsable_content.update( source.GetParsableContent() )
        
    
    return parsable_content
2018-01-24 23:09:42 +00:00
def GetParserKey( self ):
    """Return this parser's key."""
    
    parser_key = self._parser_key
    
    return parser_key
def GetStringConverter( self ):
    """Return the string converter applied to page data before parsing."""
    
    string_converter = self._string_converter
    
    return string_converter
2017-12-13 22:33:07 +00:00
2018-02-21 21:59:37 +00:00
def Parse( self, parsing_context, page_data ):
    """Parse a page and return a list of result groups.

    Each group is a list of parse results as produced by the content
    parsers. The page data is converted to unicode and run through the
    string converter, then this parser's content parsers run over the
    whole converted page.

    With no sub page parsers, the whole-page results form the single
    group (or there are no groups if nothing was found). With sub page
    parsers, each one's formula splits the page into posts, each post
    is parsed by the paired page parser, and the whole-page results are
    appended to every group that comes back. Posts that raise a
    VetoException are skipped.

    Sub page parsers are visited in order of their page parser's name.
    Previously a name-sorted copy was built but the unsorted list was
    iterated, so the sort had no effect.

    Raises HydrusExceptions.ParseException if string conversion or any
    nested parse fails; nested parse errors get this parser's name
    prefixed to the message. A VetoException from this parser's own
    content parsers is not caught here.
    """
    
    def with_page_parser_prefix( e ):
        
        # re-label a nested parse error so the message says which page parser it came through
        
        prefix = ' Page Parser ' + self._name + ' : '
        
        return HydrusExceptions.ParseException( prefix + HydrusData.ToUnicode( e ) )
        
    
    page_data = HydrusData.ToUnicode( page_data )
    
    try:
        
        converted_page_data = self._string_converter.Convert( page_data )
        
    except HydrusExceptions.StringConvertException as e:
        
        raise HydrusExceptions.ParseException( HydrusData.ToUnicode( e ) )
        
    except HydrusExceptions.ParseException as e:
        
        raise with_page_parser_prefix( e )
        
    
    #
    
    whole_page_parse_results = []
    
    try:
        
        for content_parser in self._content_parsers:
            
            whole_page_parse_results.extend( content_parser.Parse( parsing_context, converted_page_data ) )
            
        
    except HydrusExceptions.ParseException as e:
        
        raise with_page_parser_prefix( e )
        
    
    #
    
    all_parse_results = []
    
    if len( self._sub_page_parsers ) == 0:
        
        if len( whole_page_parse_results ) > 0:
            
            all_parse_results = [ whole_page_parse_results ]
            
        
    else:
        
        def sort_key( sub_page_parser ):
            
            ( formula, page_parser ) = sub_page_parser
            
            return page_parser.GetName()
            
        
        # sort a copy so the stored list keeps its own order
        sub_page_parsers = sorted( self._sub_page_parsers, key = sort_key )
        
        try:
            
            for ( formula, page_parser ) in sub_page_parsers:
                
                posts = formula.Parse( parsing_context, converted_page_data )
                
                for post in posts:
                    
                    try:
                        
                        page_parser_all_parse_results = page_parser.Parse( parsing_context, post )
                        
                    except HydrusExceptions.VetoException:
                        
                        # a vetoed post is dropped, the rest of the page carries on
                        continue
                        
                    
                    for page_parser_parse_results in page_parser_all_parse_results:
                        
                        # whole-page content applies to every post found on the page
                        page_parser_parse_results.extend( whole_page_parse_results )
                        
                        all_parse_results.append( page_parser_parse_results )
                        
                    
                
            
        except HydrusExceptions.ParseException as e:
            
            raise with_page_parser_prefix( e )
            
        
    
    return all_parse_results
2018-01-24 23:09:42 +00:00
2018-02-21 21:59:37 +00:00
# Run Parse and return the results as one block of text: a header with the number of result groups,
# each group's results one per line with a break marker between groups, and a footer.
# A veto is reported in the text rather than raised.
def ParsePretty ( self , parsing_context , page_data ) :
try :
all_parse_results = self . Parse ( parsing_context , page_data )
pretty_groups_of_parse_results = [ os . linesep . join ( [ ConvertParseResultToPrettyString ( parse_result ) for parse_result in parse_results ] ) for parse_results in all_parse_results ]
group_separator = os . linesep * 2 + ' *** SEPARATE FILE RESULTS BREAK *** ' + os . linesep * 2
pretty_parse_result_text = group_separator . join ( pretty_groups_of_parse_results )
except HydrusExceptions . VetoException as e :
2018-05-02 20:45:20 +00:00
# one-element placeholder, only used for the count in the header line below
all_parse_results = [ 1 ]
2018-04-25 22:07:52 +00:00
pretty_parse_result_text = ' veto: ' + HydrusData . ToUnicode ( e )
2018-02-21 21:59:37 +00:00
result_lines = [ ]
2018-07-04 20:48:28 +00:00
result_lines . append ( ' *** ' + HydrusData . ToHumanInt ( len ( all_parse_results ) ) + ' RESULTS BEGIN *** ' + os . linesep )
2018-02-21 21:59:37 +00:00
result_lines . append ( pretty_parse_result_text )
result_lines . append ( os . linesep + ' *** RESULTS END *** ' )
results_text = os . linesep . join ( result_lines )
return results_text
2018-01-24 23:09:42 +00:00
def RegenerateParserKey( self ):
    """Replace this parser's key with a newly generated one."""
    
    fresh_key = HydrusData.GenerateKey()
    
    self._parser_key = fresh_key
2017-12-13 22:33:07 +00:00
2016-11-02 21:09:14 +00:00
2017-12-13 22:33:07 +00:00
# register PageParser under its serialisable type id in the type -> class lookup
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_PAGE_PARSER ] = PageParser
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
# A parse node that uses its formula to pull urls out of some data, fetches each url over the network,
# and hands the fetched data to its child nodes for parsing.
class ParseNodeContentLink ( HydrusSerialisable . SerialisableBase ) :
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
SERIALISABLE_TYPE = HydrusSerialisable . SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK
2017-11-29 21:48:23 +00:00
SERIALISABLE_NAME = ' Content Parsing Link '
2016-09-21 19:54:04 +00:00
SERIALISABLE_VERSION = 1
2016-10-19 20:02:56 +00:00
# Build a link node, filling in defaults for arguments passed as None.
# The formula defaults to a default-constructed ParseFormulaHTML and children to an empty list.
def __init__ ( self , name = None , formula = None , children = None ) :
2016-09-21 19:54:04 +00:00
2016-10-19 20:02:56 +00:00
if name is None :
name = ' '
if formula is None :
formula = ParseFormulaHTML ( )
if children is None :
children = [ ]
self . _name = name
2016-09-21 19:54:04 +00:00
self . _formula = formula
self . _children = children
def _GetSerialisableInfo( self ):
    """Return the node's state as a tuple of serialisable values.

    The formula and each child are stored as their serialisable tuples.
    """
    
    formula_tuple = self._formula.GetSerialisableTuple()
    
    child_tuples = []
    
    for child in self._children:
        
        child_tuples.append( child.GetSerialisableTuple() )
        
    
    return ( self._name, formula_tuple, child_tuples )
2016-09-21 19:54:04 +00:00
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
    """Restore the node's state from the tuple made by _GetSerialisableInfo."""
    
    ( name, formula_tuple, child_tuples ) = serialisable_info
    
    self._name = name
    
    self._formula = HydrusSerialisable.CreateFromSerialisableTuple( formula_tuple )
    
    children = []
    
    for child_tuple in child_tuples:
        
        children.append( HydrusSerialisable.CreateFromSerialisableTuple( child_tuple ) )
        
    
    self._children = children
def GetParsableContent( self ):
    """Return the set of everything this node's children can produce."""
    
    per_child_content = [ child.GetParsableContent() for child in self._children ]
    
    children_parsable_content = set()
    
    # update with no arguments is a no-op, so a node with no children gives an empty set
    children_parsable_content.update( *per_child_content )
    
    return children_parsable_content
2017-12-13 22:33:07 +00:00
# Parse the link urls out of 'data', fetch each one, and return the combined content the children parse from the fetched pages.
# Progress is reported through job_key's 'script_status' variable. A cancelled fetch stops the loop;
# a failed fetch is reported, followed by a two second sleep, and the loop moves on to the next url.
# Raises HydrusExceptions.CancelledException when job_key reports it has been cancelled.
def Parse ( self , job_key , data , referral_url ) :
2016-09-21 19:54:04 +00:00
2016-11-16 20:21:43 +00:00
search_urls = self . ParseURLs ( job_key , data , referral_url )
2016-09-21 19:54:04 +00:00
content = [ ]
for search_url in search_urls :
2017-08-30 20:27:47 +00:00
job_key . SetVariable ( ' script_status ' , ' fetching ' + search_url )
2018-04-18 22:10:15 +00:00
network_job = ClientNetworkingJobs . NetworkJob ( ' GET ' , search_url , referral_url = referral_url )
2017-08-30 20:27:47 +00:00
network_job . OverrideBandwidth ( )
HG . client_controller . network_engine . AddJob ( network_job )
2017-09-06 20:18:20 +00:00
try :
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
network_job . WaitUntilDone ( )
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
except HydrusExceptions . CancelledException :
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
break
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
except HydrusExceptions . NetworkException as e :
2016-11-16 20:21:43 +00:00
2017-08-30 20:27:47 +00:00
# a not-found error only updates the status; other network errors are also printed to the log
if isinstance ( e , HydrusExceptions . NotFoundException ) :
job_key . SetVariable ( ' script_status ' , ' 404 - nothing found ' )
time . sleep ( 2 )
continue
elif isinstance ( e , HydrusExceptions . NetworkException ) :
job_key . SetVariable ( ' script_status ' , ' Network error! Details written to log. ' )
HydrusData . Print ( ' Problem fetching ' + search_url + ' : ' )
HydrusData . PrintException ( e )
time . sleep ( 2 )
continue
# NOTE(review): this else cannot be reached, since the enclosing except clause only catches NetworkException instances
else :
2017-09-06 20:18:20 +00:00
raise
2017-08-30 20:27:47 +00:00
2016-11-16 20:21:43 +00:00
2016-09-21 19:54:04 +00:00
2017-08-30 20:27:47 +00:00
linked_data = network_job . GetContent ( )
2016-11-09 23:13:22 +00:00
2017-12-13 22:33:07 +00:00
# the fetched url is passed on to the children as their referral url
children_content = GetChildrenContent ( job_key , self . _children , linked_data , search_url )
2016-11-02 21:09:14 +00:00
content . extend ( children_content )
2016-09-21 19:54:04 +00:00
2016-11-16 20:21:43 +00:00
if job_key . IsCancelled ( ) :
raise HydrusExceptions . CancelledException ( )
2016-09-21 19:54:04 +00:00
return content
2016-11-16 20:21:43 +00:00
def ParseURLs( self, job_key, data, referral_url ):
    """Parse urls out of data and return them joined against referral_url.

    The formula is run with an empty parsing context. Every resulting
    url is also registered on job_key with AddURL.
    """
    
    absolute_urls = []
    
    for basic_url in self._formula.Parse( {}, data ):
        
        absolute_urls.append( urlparse.urljoin( referral_url, basic_url ) )
        
    
    for absolute_url in absolute_urls:
        
        job_key.AddURL( absolute_url )
        
    
    return absolute_urls
2016-10-19 20:02:56 +00:00
# Returns a ( name, kind label, pretty description of what the children can parse ) tuple.
def ToPrettyStrings ( self ) :
2016-09-21 19:54:04 +00:00
2016-10-19 20:02:56 +00:00
return ( self . _name , ' link ' , ConvertParsableContentToPrettyString ( self . GetParsableContent ( ) ) )
2016-09-21 19:54:04 +00:00
2016-11-02 21:09:14 +00:00
def ToTuple(self):
    """Return this node's (name, formula, children) as a tuple of the live objects."""
    node_tuple = (self._name, self._formula, self._children)

    return node_tuple
2016-10-05 20:22:40 +00:00
# Register ParseNodeContentLink in HydrusSerialisable's type-id -> class map.
# NOTE(review): presumably this is what lets CreateFromSerialisableTuple rebuild these nodes -- confirm in HydrusSerialisable.
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK ] = ParseNodeContentLink
# What a file lookup script sends to the remote site to identify a file.
FILE_IDENTIFIER_TYPE_FILE = 0
FILE_IDENTIFIER_TYPE_MD5 = 1
FILE_IDENTIFIER_TYPE_SHA1 = 2
FILE_IDENTIFIER_TYPE_SHA256 = 3
FILE_IDENTIFIER_TYPE_SHA512 = 4
FILE_IDENTIFIER_TYPE_USER_INPUT = 5

# Human-readable label for each file identifier type.
file_identifier_string_lookup = {
    FILE_IDENTIFIER_TYPE_FILE: ' the actual file (POST only) ',
    FILE_IDENTIFIER_TYPE_MD5: ' md5 hash ',
    FILE_IDENTIFIER_TYPE_SHA1: ' sha1 hash ',
    FILE_IDENTIFIER_TYPE_SHA256: ' sha256 hash ',
    FILE_IDENTIFIER_TYPE_SHA512: ' sha512 hash ',
    FILE_IDENTIFIER_TYPE_USER_INPUT: ' custom user input ',
}
2016-09-21 19:54:04 +00:00
2017-12-13 22:33:07 +00:00
# eventually transition this to be a flat 'generate page/gallery urls'
# the rest of the parsing system can pick those up automatically
# this nullifies the need for contentlink stuff, at least in its current borked form
2016-10-05 20:22:40 +00:00
class ParseRootFileLookup(HydrusSerialisable.SerialisableBaseNamed):
    """Root node of a file lookup script.

    Builds a GET or POST request that identifies a file (by a hash, by user
    input, or by uploading the file itself), fetches the response, and hands
    the response data to the child parse nodes.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP
    SERIALISABLE_NAME = ' File Lookup Script '
    SERIALISABLE_VERSION = 2

    def __init__(self, name, url=None, query_type=None, file_identifier_type=None, file_identifier_string_converter=None, file_identifier_arg_name=None, static_args=None, children=None):
        """Store the script definition.

        The None defaults are stored as given, not replaced. The serialise,
        fetch and parse methods need real values (a StringConverter, a dict of
        static args, a list of children) to have been supplied here or loaded
        through _InitialiseFromSerialisableInfo.
        """
        HydrusSerialisable.SerialisableBaseNamed.__init__(self, name)

        self._url = url
        self._query_type = query_type
        self._file_identifier_type = file_identifier_type
        self._file_identifier_string_converter = file_identifier_string_converter
        self._file_identifier_arg_name = file_identifier_arg_name
        self._static_args = static_args
        self._children = children

    def _GetSerialisableInfo(self):
        """Return the script as a tuple, with children and converter as serialisable tuples."""
        serialisable_children = [child.GetSerialisableTuple() for child in self._children]
        serialisable_file_identifier_string_converter = self._file_identifier_string_converter.GetSerialisableTuple()

        return (self._url, self._query_type, self._file_identifier_type, serialisable_file_identifier_string_converter, self._file_identifier_arg_name, self._static_args, serialisable_children)

    def _InitialiseFromSerialisableInfo(self, serialisable_info):
        """Populate the script from a tuple produced by _GetSerialisableInfo."""
        (self._url, self._query_type, self._file_identifier_type, serialisable_file_identifier_string_converter, self._file_identifier_arg_name, self._static_args, serialisable_children) = serialisable_info

        self._children = [HydrusSerialisable.CreateFromSerialisableTuple(serialisable_child) for serialisable_child in serialisable_children]
        self._file_identifier_string_converter = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_file_identifier_string_converter)

    def _UpdateSerialisableInfo(self, version, old_serialisable_info):
        """Upgrade version 1 info, replacing the encoding enum with a StringConverter.

        Returns (2, new_serialisable_info) for version 1; any other version
        falls through and returns None.
        """
        if version == 1:
            (url, query_type, file_identifier_type, file_identifier_encoding, file_identifier_arg_name, static_args, serialisable_children) = old_serialisable_info

            transformations = []

            if file_identifier_encoding == HC.ENCODING_RAW:
                pass
            elif file_identifier_encoding == HC.ENCODING_HEX:
                transformations.append((STRING_TRANSFORMATION_ENCODE, ' hex '))
            elif file_identifier_encoding == HC.ENCODING_BASE64:
                transformations.append((STRING_TRANSFORMATION_ENCODE, ' base64 '))

            file_identifier_string_converter = StringConverter(transformations, ' some hash bytes ')
            serialisable_file_identifier_string_converter = file_identifier_string_converter.GetSerialisableTuple()

            new_serialisable_info = (url, query_type, file_identifier_type, serialisable_file_identifier_string_converter, file_identifier_arg_name, static_args, serialisable_children)

            return (2, new_serialisable_info)

    def ConvertMediaToFileIdentifier(self, media):
        """Return the identifier this script needs for the given media.

        Returns the media's own hash for sha256 scripts, the matching
        md5/sha1/sha512 hash read from the database for those scripts, or the
        local file path for file-upload scripts.

        Raises Exception if the script takes user input, if the other hash is
        not known, or if the file is missing from local storage.
        """
        if self._file_identifier_type == FILE_IDENTIFIER_TYPE_USER_INPUT:
            raise Exception(' Cannot convert media to file identifier--this script takes user input! ')
        elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA256:
            return media.GetHash()
        elif self._file_identifier_type in (FILE_IDENTIFIER_TYPE_MD5, FILE_IDENTIFIER_TYPE_SHA1, FILE_IDENTIFIER_TYPE_SHA512):
            sha256_hash = media.GetHash()

            if self._file_identifier_type == FILE_IDENTIFIER_TYPE_MD5:
                hash_type = ' md5 '
            elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA1:
                hash_type = ' sha1 '
            elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA512:
                hash_type = ' sha512 '

            try:
                (other_hash,) = HG.client_controller.Read(' file_hashes ', (sha256_hash,), ' sha256 ', hash_type)

                return other_hash
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt and
                # SystemExit are not turned into a lookup failure
                raise Exception(' I do not know that file \' s ' + hash_type + ' hash, so I cannot look it up! ')
        elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
            file_hash = media.GetHash()
            mime = media.GetMime()

            client_files_manager = HG.client_controller.client_files_manager

            try:
                return client_files_manager.GetFilePath(file_hash, mime)
            except HydrusExceptions.FileMissingException:
                raise Exception(' That file is not in the database \' s local files, so I cannot look it up! ')

    def FetchData(self, job_key, file_identifier):
        """Run the lookup request and return the response content.

        file_identifier is a path when the script uploads the file itself,
        otherwise it is the value run through the string converter and sent as
        the identifier argument. Status text and the request url are reported
        to job_key.

        Raises Exception for a file upload on a GET query, re-raises
        NotFoundException / NetworkException from the network job, and raises
        CancelledException if job_key was cancelled while waiting.
        """
        # add gauge report hook and in-stream cancel support to the get/post calls

        request_args = dict(self._static_args)

        uploading_file = self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE

        if not uploading_file:
            request_args[self._file_identifier_arg_name] = self._file_identifier_string_converter.Convert(file_identifier)

        files = None

        try:
            if self._query_type == HC.GET:
                if uploading_file:
                    raise Exception(' Cannot have a file as an argument on a GET query! ')

                full_request_url = self._url + ' ? ' + ClientNetworkingDomain.ConvertQueryDictToText(request_args)

                job_key.SetVariable(' script_status ', ' fetching ' + full_request_url)
                job_key.AddURL(full_request_url)

                network_job = ClientNetworkingJobs.NetworkJob(' GET ', full_request_url)
            elif self._query_type == HC.POST:
                additional_headers = {}

                if uploading_file:
                    job_key.SetVariable(' script_status ', ' uploading file ')

                    path = file_identifier

                    if self._file_identifier_string_converter.MakesChanges():
                        # a converted file goes out as an ordinary form value, not a multipart upload
                        with open(path, ' rb ') as f:
                            file_content = f.read()

                        request_args[self._file_identifier_arg_name] = self._file_identifier_string_converter.Convert(file_content)

                        additional_headers[' content-type '] = ' application/x-www-form-urlencoded '
                    else:
                        # closed in the finally clause below
                        files = {self._file_identifier_arg_name: open(path, ' rb ')}
                else:
                    job_key.SetVariable(' script_status ', ' uploading identifier ')

                network_job = ClientNetworkingJobs.NetworkJob(' POST ', self._url, body=request_args)

                if files is not None:
                    network_job.SetFiles(files)

                for (key, value) in additional_headers.items():
                    network_job.AddAdditionalHeader(key, value)

            # send nj to nj control on this panel here

            network_job.OverrideBandwidth()

            HG.client_controller.network_engine.AddJob(network_job)

            try:
                network_job.WaitUntilDone()
            except HydrusExceptions.NotFoundException:
                job_key.SetVariable(' script_status ', ' 404 - nothing found ')
                raise
            except HydrusExceptions.NetworkException as e:
                job_key.SetVariable(' script_status ', ' Network error! ')
                HydrusData.ShowException(e)
                raise
        finally:
            # NOTE(review): assumes the job no longer reads the upload handles
            # once WaitUntilDone has returned or raised -- confirm in NetworkJob
            if files is not None:
                for file_handle in files.values():
                    file_handle.close()

        if job_key.IsCancelled():
            raise HydrusExceptions.CancelledException()

        return network_job.GetContent()

    def GetParsableContent(self):
        """Return the union of every child's parsable content as a set."""
        children_parsable_content = set()

        for child in self._children:
            children_parsable_content.update(child.GetParsableContent())

        return children_parsable_content

    def DoQuery(self, job_key, file_identifier):
        """Fetch and parse in one go, always finishing job_key.

        Returns the parse results, or an empty list on a network error or
        cancellation.
        """
        try:
            try:
                data = self.FetchData(job_key, file_identifier)
            except HydrusExceptions.NetworkException:
                return []

            return self.Parse(job_key, data)
        except HydrusExceptions.CancelledException:
            job_key.SetVariable(' script_status ', ' Cancelled! ')

            return []
        finally:
            job_key.Finish()

    def UsesUserInput(self):
        """Return True if the script's file identifier is typed in by the user."""
        return self._file_identifier_type == FILE_IDENTIFIER_TYPE_USER_INPUT

    def Parse(self, job_key, data):
        """Run the children over data and report the number of results to job_key."""
        parse_results = GetChildrenContent(job_key, self._children, data, self._url)

        if len(parse_results) == 0:
            job_key.SetVariable(' script_status ', ' Did not find anything. ')
        else:
            job_key.SetVariable(' script_status ', ' Found ' + HydrusData.ToHumanInt(len(parse_results)) + ' rows. ')

        return parse_results

    def SetChildren(self, children):
        """Replace the child parse nodes."""
        self._children = children

    def ToPrettyStrings(self):
        """Return a (name, query type label, script type label, parsable content summary) tuple."""
        return (self._name, HC.query_type_string_lookup[self._query_type], ' File Lookup ', ConvertParsableContentToPrettyString(self.GetParsableContent()))

    def ToTuple(self):
        """Return every field of the script as a tuple of the live objects."""
        return (self._name, self._url, self._query_type, self._file_identifier_type, self._file_identifier_string_converter, self._file_identifier_arg_name, self._static_args, self._children)
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
# Register ParseRootFileLookup in HydrusSerialisable's type-id -> class map.
# NOTE(review): presumably this is what lets CreateFromSerialisableTuple rebuild these scripts -- confirm in HydrusSerialisable.
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP ] = ParseRootFileLookup
2017-09-13 20:50:41 +00:00
2017-11-15 22:35:49 +00:00
# Step types understood by StringConverter.Convert.
STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING = 0
STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END = 1
STRING_TRANSFORMATION_PREPEND_TEXT = 2
STRING_TRANSFORMATION_APPEND_TEXT = 3
STRING_TRANSFORMATION_ENCODE = 4
STRING_TRANSFORMATION_DECODE = 5
STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING = 6
STRING_TRANSFORMATION_CLIP_TEXT_FROM_END = 7
STRING_TRANSFORMATION_REVERSE = 8
STRING_TRANSFORMATION_REGEX_SUB = 9
STRING_TRANSFORMATION_DATE_DECODE = 10
STRING_TRANSFORMATION_INTEGER_ADDITION = 11

# Human-readable label for each step type.
transformation_type_str_lookup = {
    STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING: ' remove text from beginning of string ',
    STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END: ' remove text from end of string ',
    STRING_TRANSFORMATION_PREPEND_TEXT: ' prepend text ',
    STRING_TRANSFORMATION_APPEND_TEXT: ' append text ',
    STRING_TRANSFORMATION_ENCODE: ' encode ',
    STRING_TRANSFORMATION_DECODE: ' decode ',
    STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING: ' take the start of the string ',
    STRING_TRANSFORMATION_CLIP_TEXT_FROM_END: ' take the end of the string ',
    STRING_TRANSFORMATION_REVERSE: ' reverse text ',
    STRING_TRANSFORMATION_REGEX_SUB: ' regex substitution ',
    STRING_TRANSFORMATION_DATE_DECODE: ' date decode ',
    STRING_TRANSFORMATION_INTEGER_ADDITION: ' integer addition ',
}
2017-11-15 22:35:49 +00:00
class StringConverter(HydrusSerialisable.SerialisableBase):
    """An ordered list of string transformations plus an example string.

    transformations is a list of (transformation_type, data) pairs, where
    transformation_type is one of the STRING_TRANSFORMATION_* constants and
    data is that step's argument.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_CONVERTER
    SERIALISABLE_NAME = ' String Converter '
    SERIALISABLE_VERSION = 1

    def __init__(self, transformations=None, example_string=None):
        """Store the steps (default: none) and the example string."""
        if transformations is None:
            transformations = []

        if example_string is None:
            example_string = ' example string '

        HydrusSerialisable.SerialisableBase.__init__(self)

        self.transformations = transformations
        self.example_string = example_string

    def _GetSerialisableInfo(self):
        """Return (transformations, example_string)."""
        return (self.transformations, self.example_string)

    def _InitialiseFromSerialisableInfo(self, serialisable_info):
        """Load the steps, converting list-typed step data back to tuples.

        Loading is best-effort: if a stored step is malformed, the steps read
        so far are kept and the rest are dropped.
        """
        (serialisable_transformations, self.example_string) = serialisable_info

        self.transformations = []

        try:  # I initialised this bad one time and broke a dialog on subsequent loads, fugg
            for (transformation_type, data) in serialisable_transformations:
                if isinstance(data, list):
                    data = tuple(data)  # convert from list to tuple thing

                self.transformations.append((transformation_type, data))
        except Exception:
            # still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
            pass

    def Convert(self, s, max_steps_allowed=None):
        """Apply the steps to s in order and return the result.

        If max_steps_allowed is given, at most that many steps are applied and
        the intermediate result is returned; 0 applies none.

        Raises HydrusExceptions.StringConvertException naming the failing step
        if any step raises.
        """
        for (i, transformation) in enumerate(self.transformations):
            # checked before the step runs, so a limit of 0 applies nothing
            if max_steps_allowed is not None and i >= max_steps_allowed:
                break

            try:
                (transformation_type, data) = transformation

                if transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING:
                    num_chars = data
                    s = s[num_chars:]
                elif transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END:
                    num_chars = data

                    # s[:-0] is s[:0], which would wipe the string instead of removing nothing
                    if num_chars != 0:
                        s = s[:-num_chars]
                elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING:
                    num_chars = data
                    s = s[:num_chars]
                elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_END:
                    num_chars = data

                    # s[-0:] is the whole string; taking the last 0 characters must give an empty one
                    s = s[-num_chars:] if num_chars != 0 else s[:0]
                elif transformation_type == STRING_TRANSFORMATION_PREPEND_TEXT:
                    text = data
                    s = text + s
                elif transformation_type == STRING_TRANSFORMATION_APPEND_TEXT:
                    text = data
                    s = s + text
                elif transformation_type == STRING_TRANSFORMATION_ENCODE:
                    encode_type = data
                    s = s.encode(encode_type)
                elif transformation_type == STRING_TRANSFORMATION_DECODE:
                    encode_type = data
                    s = s.decode(encode_type)
                elif transformation_type == STRING_TRANSFORMATION_REVERSE:
                    s = s[::-1]
                elif transformation_type == STRING_TRANSFORMATION_REGEX_SUB:
                    (pattern, repl) = data
                    s = re.sub(pattern, repl, s, flags=re.UNICODE)
                elif transformation_type == STRING_TRANSFORMATION_DATE_DECODE:
                    (phrase, timezone, timezone_offset) = data

                    struct_time = time.strptime(s, phrase)

                    if timezone == HC.TIMEZONE_GMT:
                        # the given struct is in GMT, so calendar.timegm is appropriate here
                        timestamp = int(calendar.timegm(struct_time))
                    elif timezone == HC.TIMEZONE_LOCAL:
                        # the given struct is in local time, so time.mktime is correct
                        timestamp = int(time.mktime(struct_time))
                    elif timezone == HC.TIMEZONE_OFFSET:
                        # the given struct is in server time, which is the same as GMT minus an offset
                        # if we are 7200 seconds ahead, the correct GMT timestamp needs to be 7200 smaller
                        timestamp = int(calendar.timegm(struct_time)) - timezone_offset

                    s = str(timestamp)
                elif transformation_type == STRING_TRANSFORMATION_INTEGER_ADDITION:
                    delta = data
                    s = str(int(s) + int(delta))
            except Exception as e:
                raise HydrusExceptions.StringConvertException(' ERROR: Could not apply " ' + self.TransformationToUnicode(transformation) + ' " to string " ' + repr(s) + ' " : ' + HydrusData.ToUnicode(e))

        return s

    def GetTransformationStrings(self):
        """Return a human-readable description of every step, in order."""
        return [self.TransformationToUnicode(transformation) for transformation in self.transformations]

    def MakesChanges(self):
        """Return True if there is at least one step."""
        return len(self.transformations) > 0

    @staticmethod
    def TransformationToUnicode(transformation):
        """Return a human-readable description of one (transformation_type, data) step."""
        (transformation_type, data) = transformation

        if transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING:
            return ' remove the first ' + HydrusData.ToHumanInt(data) + ' characters '
        elif transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END:
            return ' remove the last ' + HydrusData.ToHumanInt(data) + ' characters '
        elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING:
            return ' take the first ' + HydrusData.ToHumanInt(data) + ' characters '
        elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_END:
            return ' take the last ' + HydrusData.ToHumanInt(data) + ' characters '
        elif transformation_type == STRING_TRANSFORMATION_PREPEND_TEXT:
            return ' prepend with " ' + data + ' " '
        elif transformation_type == STRING_TRANSFORMATION_APPEND_TEXT:
            return ' append with " ' + data + ' " '
        elif transformation_type == STRING_TRANSFORMATION_ENCODE:
            return ' encode to ' + data
        elif transformation_type == STRING_TRANSFORMATION_DECODE:
            return ' decode from ' + data
        elif transformation_type == STRING_TRANSFORMATION_REVERSE:
            return transformation_type_str_lookup[STRING_TRANSFORMATION_REVERSE]
        elif transformation_type == STRING_TRANSFORMATION_REGEX_SUB:
            return ' regex substitution: ' + HydrusData.ToUnicode(data)
        elif transformation_type == STRING_TRANSFORMATION_DATE_DECODE:
            return ' date decode: ' + repr(data)
        elif transformation_type == STRING_TRANSFORMATION_INTEGER_ADDITION:
            return ' integer addition: add ' + HydrusData.ToUnicode(data)
        else:
            return ' unknown transformation '
# Register StringConverter in HydrusSerialisable's type-id -> class map.
# NOTE(review): presumably this is what lets CreateFromSerialisableTuple rebuild converters -- confirm in HydrusSerialisable.
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_STRING_CONVERTER ] = StringConverter
2017-09-13 20:50:41 +00:00
# StringMatch match types: exact text, a character class, a regex, or no content check.
(STRING_MATCH_FIXED, STRING_MATCH_FLEXIBLE, STRING_MATCH_REGEX, STRING_MATCH_ANY) = (0, 1, 2, 3)

# Character classes used as the match value of a STRING_MATCH_FLEXIBLE match.
(ALPHA, ALPHANUMERIC, NUMERIC) = (0, 1, 2)
2017-09-27 21:52:54 +00:00
class StringMatch(HydrusSerialisable.SerialisableBase):
    """A rule a piece of text can be tested against.

    Combines optional minimum/maximum lengths with a content check: exact
    text, a character class, a regex, or anything at all.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_MATCH
    SERIALISABLE_NAME = ' String Match '
    SERIALISABLE_VERSION = 1

    def __init__(self, match_type=STRING_MATCH_ANY, match_value=' ', min_chars=None, max_chars=None, example_string=' example string '):
        """Store the rule; a min_chars/max_chars of None means no limit on that side."""
        HydrusSerialisable.SerialisableBase.__init__(self)

        # make a gui control that accepts one of these. displays expected input on the right and colours red/green (and does isvalid) based on current input
        # think about replacing the veto stuff above with this.

        self._example_string = example_string
        self._max_chars = max_chars
        self._min_chars = min_chars
        self._match_value = match_value
        self._match_type = match_type

    def _GetSerialisableInfo(self):
        """Return the five rule fields as a tuple."""
        return (self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string)

    def _InitialiseFromSerialisableInfo(self, serialisable_info):
        """Load the five rule fields from a tuple produced by _GetSerialisableInfo."""
        (match_type, match_value, min_chars, max_chars, example_string) = serialisable_info

        self._match_type = match_type
        self._match_value = match_value
        self._min_chars = min_chars
        self._max_chars = max_chars
        self._example_string = example_string

    def SetMaxChars(self, max_chars):
        """Set the maximum allowed length (None for no limit)."""
        self._max_chars = max_chars

    def SetMinChars(self, min_chars):
        """Set the minimum allowed length (None for no limit)."""
        self._min_chars = min_chars

    def Matches(self, text):
        """Return True if text passes Test, False if it raises StringMatchException."""
        try:
            self.Test(text)
        except HydrusExceptions.StringMatchException:
            return False

        return True

    def Test(self, text):
        """Raise HydrusExceptions.StringMatchException describing why text fails the rule.

        Returns None when text passes.
        """
        presentation_text = ' " ' + text + ' " '
        text_len = len(text)

        too_short = self._min_chars is not None and text_len < self._min_chars

        if too_short:
            raise HydrusExceptions.StringMatchException(presentation_text + ' had fewer than ' + HydrusData.ToHumanInt(self._min_chars) + ' characters ')

        too_long = self._max_chars is not None and text_len > self._max_chars

        if too_long:
            raise HydrusExceptions.StringMatchException(presentation_text + ' had more than ' + HydrusData.ToHumanInt(self._max_chars) + ' characters ')

        if self._match_type == STRING_MATCH_FIXED:
            if text != self._match_value:
                raise HydrusExceptions.StringMatchException(presentation_text + ' did not exactly match " ' + self._match_value + ' " ')

            return

        if self._match_type not in (STRING_MATCH_FLEXIBLE, STRING_MATCH_REGEX):
            # STRING_MATCH_ANY (and anything unrecognised) has no content check
            return

        if self._match_type == STRING_MATCH_FLEXIBLE:
            if self._match_value == ALPHA:
                r = ' ^[a-zA-Z]+$ '
                fail_reason = ' had non-alpha characters '
            elif self._match_value == ALPHANUMERIC:
                r = ' ^[a-zA-Z \ d]+$ '
                fail_reason = ' had non-alphanumeric characters '
            elif self._match_value == NUMERIC:
                r = ' ^ \ d+$ '
                fail_reason = ' had non-numeric characters '
        else:
            r = self._match_value
            fail_reason = ' did not match " ' + r + ' " '

        try:
            result = re.search(r, text, flags=re.UNICODE)
        except Exception as e:
            raise HydrusExceptions.StringMatchException(' That regex did not work! ' + HydrusData.ToUnicode(e))

        if result is None:
            raise HydrusExceptions.StringMatchException(presentation_text + fail_reason)

    def ToTuple(self):
        """Return the five rule fields as a tuple."""
        return (self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string)

    def ToUnicode(self):
        """Return a human-readable description of the rule."""
        result = ' '

        no_min = self._min_chars is None
        no_max = self._max_chars is None

        if no_min and no_max:
            result += ' any number of '
        elif no_min:
            result += ' at most ' + HydrusData.ToUnicode(self._max_chars) + ' '
        elif no_max:
            result += ' at least ' + HydrusData.ToUnicode(self._min_chars) + ' '
        else:
            result += ' between ' + HydrusData.ToUnicode(self._min_chars) + ' and ' + HydrusData.ToUnicode(self._max_chars) + ' '

        show_example = True

        if self._match_type == STRING_MATCH_ANY:
            result += ' characters '
            show_example = False
        elif self._match_type == STRING_MATCH_FIXED:
            # a fixed match is described by its text alone, so the length phrase is discarded
            result = self._match_value
            show_example = False
        elif self._match_type == STRING_MATCH_FLEXIBLE:
            if self._match_value == ALPHA:
                result += ' alphabetical characters '
            elif self._match_value == ALPHANUMERIC:
                result += ' alphanumeric characters '
            elif self._match_value == NUMERIC:
                result += ' numeric characters '
        elif self._match_type == STRING_MATCH_REGEX:
            result += ' characters, matching regex " ' + self._match_value + ' " '

        if show_example:
            result += ' , such as " ' + self._example_string + ' " '

        return result
# Register StringMatch in HydrusSerialisable's type-id -> class map.
# NOTE(review): presumably this is what lets CreateFromSerialisableTuple rebuild string matches -- confirm in HydrusSerialisable.
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_STRING_MATCH ] = StringMatch