hydrus/include/ClientParsing.py

1480 lines
48 KiB
Python
Raw Normal View History

2016-07-20 19:57:10 +00:00
import bs4
2016-11-02 21:09:14 +00:00
import ClientNetworking
2017-12-13 22:33:07 +00:00
import collections
2016-09-21 19:54:04 +00:00
import HydrusConstants as HC
2016-09-07 20:01:05 +00:00
import HydrusData
2016-11-09 23:13:22 +00:00
import HydrusExceptions
2017-05-10 21:33:58 +00:00
import HydrusGlobals as HG
2016-07-20 19:57:10 +00:00
import HydrusSerialisable
2016-10-26 20:45:34 +00:00
import HydrusTags
2016-10-19 20:02:56 +00:00
import os
2017-09-13 20:50:41 +00:00
import re
2016-11-16 20:21:43 +00:00
import time
2016-11-02 21:09:14 +00:00
import urlparse
2016-07-20 19:57:10 +00:00
2016-10-26 20:45:34 +00:00
def ConvertContentResultToPrettyString( result ):
    """Render one parsed content result as a short human-readable string.

    result is a ( ( name, content_type, additional_info ), parsed_text ) pair.
    URL results become 'url: <text>', mapping results become 'tag: <combined tag>'
    (additional_info is handed to HydrusTags.CombineTag together with the text),
    and veto results become 'veto'.

    Raises NotImplementedError for any other content type.
    """
    
    ( content_description, parsed_text ) = result
    
    ( name, content_type, additional_info ) = content_description
    
    if content_type == HC.CONTENT_TYPE_URLS:
        
        return 'url: ' + parsed_text
        
    
    if content_type == HC.CONTENT_TYPE_MAPPINGS:
        
        combined_tag = HydrusTags.CombineTag( additional_info, parsed_text )
        
        return 'tag: ' + combined_tag
        
    
    if content_type == HC.CONTENT_TYPE_VETO:
        
        return 'veto'
        
    
    raise NotImplementedError()
2016-11-02 21:09:14 +00:00
def ConvertParsableContentToPrettyString( parsable_content, include_veto = False ):
    """Summarise what a set of content descriptions can produce.

    parsable_content is an iterable of ( name, content_type, additional_info )
    tuples. They are grouped by content type with HydrusData.BuildKeyToSetDict
    and each group contributes one phrase:

    - urls:     'urls'
    - mappings: 'tags: ' followed by the non-empty additional_info values, with
                'unnamespaced' appended when the empty string is among them
    - veto:     'veto', only when include_veto is true

    Returns the phrases joined with ', ', or 'nothing' when there are none.
    """
    
    type_and_info_pairs = ( ( content_type, additional_info ) for ( name, content_type, additional_info ) in parsable_content )
    
    grouped = HydrusData.BuildKeyToSetDict( type_and_info_pairs )
    
    phrases = []
    
    for ( content_type, additional_infos ) in grouped.items():
        
        if content_type == HC.CONTENT_TYPE_URLS:
            
            phrases.append( 'urls' )
            
        elif content_type == HC.CONTENT_TYPE_MAPPINGS:
            
            namespaces = [ namespace for namespace in additional_infos if namespace != '' ]
            
            # the empty namespace is reported last, under a readable label
            if '' in additional_infos:
                
                namespaces.append( 'unnamespaced' )
                
            
            phrases.append( 'tags: ' + ', '.join( namespaces ) )
            
        elif content_type == HC.CONTENT_TYPE_VETO and include_veto:
            
            phrases.append( 'veto' )
            
        
    
    if not phrases:
        
        return 'nothing'
        
    
    return ', '.join( phrases )
2016-10-05 20:22:40 +00:00
2016-11-02 21:09:14 +00:00
2017-12-13 22:33:07 +00:00
def GetChildrenContent( job_key, children, data, referral_url ):
    """Run every child's Parse over data and concatenate what they return.

    Each child is called as child.Parse( job_key, data, referral_url ). If any
    child raises HydrusExceptions.VetoException, everything gathered so far is
    discarded and an empty list is returned.
    """
    
    gathered = []
    
    for child in children:
        
        try:
            
            parsed = child.Parse( job_key, data, referral_url )
            
        except HydrusExceptions.VetoException:
            
            # a single veto invalidates the whole set of results
            return []
            
        else:
            
            gathered.extend( parsed )
            
        
    
    return gathered
2016-11-09 23:13:22 +00:00
def GetTagsFromContentResults( results ):
    """Collect the tags out of a list of parsed content results.

    Every result whose content type is HC.CONTENT_TYPE_MAPPINGS is combined via
    HydrusTags.CombineTag( additional_info, parsed_text ); the combined tags are
    then passed through HydrusTags.CleanTags and returned.
    """
    
    combined_tags = [
        HydrusTags.CombineTag( additional_info, parsed_text )
        for ( ( name, content_type, additional_info ), parsed_text ) in results
        if content_type == HC.CONTENT_TYPE_MAPPINGS
    ]
    
    return HydrusTags.CleanTags( combined_tags )
2017-12-13 22:33:07 +00:00
def GetURLsFromContentResults( results ):
    """Group parsed URLs by priority and return the groups, highest priority first.

    For results of type HC.CONTENT_TYPE_URLS the additional_info is the priority;
    a priority of None is treated as -1. Returns a list of url lists ordered by
    descending priority.
    """
    
    priority_to_urls = collections.defaultdict( list )
    
    for ( content_description, parsed_text ) in results:
        
        ( name, content_type, additional_info ) = content_description
        
        if content_type != HC.CONTENT_TYPE_URLS:
            
            continue
            
        
        priority = -1 if additional_info is None else additional_info
        
        priority_to_urls[ priority ].append( parsed_text )
        
    
    # priorities are unique dict keys, so ordering the keys orders the groups
    descending_priorities = sorted( priority_to_urls.keys(), reverse = True )
    
    return [ priority_to_urls[ priority ] for priority in descending_priorities ]
2016-11-02 21:09:14 +00:00
def GetVetoes( parsed_texts, additional_info ):
    """Work out which vetoes a list of parsed texts triggers.

    additional_info is ( veto_if_matches_found, match_if_text_present, search_text ).
    A parsed text 'matches' when search_text is in it (match_if_text_present true)
    or is not in it (match_if_text_present false).

    With veto_if_matches_found true, one 'veto' is returned per matching text.
    Otherwise a single 'veto through absence' is returned when nothing matched,
    and an empty list when at least one text matched.
    """
    
    ( veto_if_matches_found, match_if_text_present, search_text ) = additional_info
    
    want_present = bool( match_if_text_present )
    
    matches = [ 'veto' for parsed_text in parsed_texts if ( search_text in parsed_text ) == want_present ]
    
    if veto_if_matches_found:
        
        return matches
        
    
    if matches:
        
        return []
        
    
    return [ 'veto through absence' ]
2016-10-05 20:22:40 +00:00
2016-09-07 20:01:05 +00:00
def RenderTagRule( tag_rule ):
    """Describe a single ( name, attrs, index ) tag rule in plain English.

    tag_rule is a 3-tuple: the tag name, a dict of attribute name -> value the tag
    must carry, and either None (all matching tags) or a zero-based index (only
    that one, rendered via HydrusData.ConvertIntToFirst( index + 1 )).

    The tuple is unpacked inside the body rather than in the signature: tuple
    parameters were removed from the language by PEP 3113, and this form behaves
    the same for every existing positional caller.
    """
    
    ( name, attrs, index ) = tag_rule
    
    if index is None:
        
        result = 'all ' + name + ' tags'
        
    else:
        
        result = HydrusData.ConvertIntToFirst( index + 1 ) + ' ' + name + ' tag'
        
    
    if len( attrs ) > 0:
        
        result += ' with ' + ' and '.join( [ key + ' = ' + value for ( key, value ) in attrs.items() ] )
        
    
    return result
2017-12-13 22:33:07 +00:00
# Values for ParseFormulaHTML's content_to_fetch: which part of a matched tag is extracted.
# fetch the value of the attribute named by attribute_to_fetch
HTML_CONTENT_ATTRIBUTE = 0
# fetch the tag's .string
HTML_CONTENT_STRING = 1
# fetch the tag itself rendered as text
HTML_CONTENT_HTML = 2
2016-09-21 19:54:04 +00:00
class ParseFormulaHTML( HydrusSerialisable.SerialisableBase ):
    """Formula that walks an HTML document through a chain of tag rules and extracts text.

    Each tag rule is a ( name, attrs, index ) tuple. Parse applies the rules in
    order, each one searching beneath the tags matched by the previous rule, and
    then pulls an attribute value, the tag string or the tag html out of every
    surviving tag. Each extracted value is tested with the string match and
    passed through the string converter.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML
    SERIALISABLE_NAME = 'HTML Parsing Formula'
    SERIALISABLE_VERSION = 5
    
    def __init__( self, tag_rules = None, content_to_fetch = None, attribute_to_fetch = None, string_match = None, string_converter = None ):
        """Build a formula; every omitted argument gets a usable default.

        tag_rules          -- list of ( name, attrs, index ); defaults to every <a> tag
        content_to_fetch   -- one of the HTML_CONTENT_* constants; defaults to
                              HTML_CONTENT_ATTRIBUTE
        attribute_to_fetch -- attribute name used with HTML_CONTENT_ATTRIBUTE;
                              defaults to 'href'
        string_match       -- tested against each extracted value; defaults to StringMatch()
        string_converter   -- applied to each extracted value; defaults to an empty
                              StringConverter
        """
        
        if tag_rules is None:
            
            tag_rules = [ ( 'a', {}, None ) ]
            
        
        # Without these two defaults a default-constructed formula stored None for
        # both, and _ParseContent then failed with an unbound local on the first
        # matched tag. Fetching href pairs naturally with the default <a> rule.
        if content_to_fetch is None:
            
            content_to_fetch = HTML_CONTENT_ATTRIBUTE
            
        
        if attribute_to_fetch is None:
            
            attribute_to_fetch = 'href'
            
        
        if string_match is None:
            
            string_match = StringMatch()
            
        
        if string_converter is None:
            
            string_converter = StringConverter( example_string = 'parsed information' )
            
        
        self._tag_rules = tag_rules
        
        self._content_to_fetch = content_to_fetch
        self._attribute_to_fetch = attribute_to_fetch
        
        self._string_match = string_match
        self._string_converter = string_converter
        
    
    def _GetSerialisableInfo( self ):
        """Return the version 5 serialisable tuple for this formula."""
        
        serialisable_string_match = self._string_match.GetSerialisableTuple()
        serialisable_string_converter = self._string_converter.GetSerialisableTuple()
        
        return ( self._tag_rules, self._content_to_fetch, self._attribute_to_fetch, serialisable_string_match, serialisable_string_converter )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore state from a version 5 serialisable tuple."""
        
        ( self._tag_rules, self._content_to_fetch, self._attribute_to_fetch, serialisable_string_match, serialisable_string_converter ) = serialisable_info
        
        self._string_match = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_match )
        self._string_converter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_string_converter )
        
    
    def _ParseContent( self, root ):
        """Extract, test and convert the configured content of one matched tag.

        Raises HydrusExceptions.ParseException when the attribute is missing or
        empty, when the extracted content is None or '', or when content_to_fetch
        is not one of the HTML_CONTENT_* values. The string match's Test is called
        on the value before it is converted.
        """
        
        if self._content_to_fetch == HTML_CONTENT_ATTRIBUTE:
            
            if not root.has_attr( self._attribute_to_fetch ):
                
                raise HydrusExceptions.ParseException( 'Attribute ' + self._attribute_to_fetch + ' not found!' )
                
            
            unknown_attr_result = root[ self._attribute_to_fetch ]
            
            # 'class' attr returns a list because it has multiple values under html spec, wew
            if isinstance( unknown_attr_result, list ):
                
                if len( unknown_attr_result ) == 0:
                    
                    raise HydrusExceptions.ParseException( 'Attribute ' + self._attribute_to_fetch + ' not found!' )
                    
                
                result = ' '.join( unknown_attr_result )
                
            else:
                
                result = unknown_attr_result
                
            
        elif self._content_to_fetch == HTML_CONTENT_STRING:
            
            result = root.string
            
        elif self._content_to_fetch == HTML_CONTENT_HTML:
            
            result = unicode( root )
            
        else:
            
            # previously this fell through with result unbound; report it as a parse failure instead
            raise HydrusExceptions.ParseException( 'Unknown content type to fetch: ' + repr( self._content_to_fetch ) )
            
        
        if result is None or result == '':
            
            raise HydrusExceptions.ParseException( 'Empty/No results found!' )
            
        
        self._string_match.Test( result )
        
        return self._string_converter.Convert( result )
        
    
    def _ParseTags( self, root, name, attrs, index ):
        """Find the tags beneath root matching one rule.

        Returns every match when index is None, otherwise a list holding only the
        match at that index (empty when there are too few matches).
        """
        
        results = root.find_all( name = name, attrs = attrs )
        
        if index is not None:
            
            if len( results ) < index + 1:
                
                results = []
                
            else:
                
                results = [ results[ index ] ]
                
            
        
        return results
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Convert serialisable info from the given version to the next one.

        Returns ( new_version, new_serialisable_info ) for versions 1 through 4.
        """
        
        if version == 1:
            
            ( tag_rules, attribute_to_fetch ) = old_serialisable_info
            
            culling_and_adding = ( 0, 0, '', '' )
            
            new_serialisable_info = ( tag_rules, attribute_to_fetch, culling_and_adding )
            
            return ( 2, new_serialisable_info )
            
        
        if version == 2:
            
            # the cull/prepend/append tuple becomes an equivalent list of StringConverter transformations
            ( tag_rules, attribute_to_fetch, culling_and_adding ) = old_serialisable_info
            
            ( cull_front, cull_back, prepend, append ) = culling_and_adding
            
            transformations = []
            
            # NOTE(review): negative cull values are passed through unchanged (still negative) -- confirm that is what StringConverter expects
            if cull_front > 0:
                
                transformations.append( ( STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING, cull_front ) )
                
            elif cull_front < 0:
                
                transformations.append( ( STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END, cull_front ) )
                
            
            if cull_back > 0:
                
                transformations.append( ( STRING_TRANSFORMATION_CLIP_TEXT_FROM_END, cull_back ) )
                
            elif cull_back < 0:
                
                transformations.append( ( STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING, cull_back ) )
                
            
            if prepend != '':
                
                transformations.append( ( STRING_TRANSFORMATION_PREPEND_TEXT, prepend ) )
                
            
            if append != '':
                
                transformations.append( ( STRING_TRANSFORMATION_APPEND_TEXT, append ) )
                
            
            string_converter = StringConverter( transformations, 'parsed information' )
            
            serialisable_string_converter = string_converter.GetSerialisableTuple()
            
            new_serialisable_info = ( tag_rules, attribute_to_fetch, serialisable_string_converter )
            
            return ( 3, new_serialisable_info )
            
        
        if version == 3:
            
            # version 4 adds a string match; old objects get a default one
            ( tag_rules, attribute_to_fetch, serialisable_string_converter ) = old_serialisable_info
            
            string_match = StringMatch()
            
            serialisable_string_match = string_match.GetSerialisableTuple()
            
            new_serialisable_info = ( tag_rules, attribute_to_fetch, serialisable_string_match, serialisable_string_converter )
            
            return ( 4, new_serialisable_info )
            
        
        if version == 4:
            
            # version 5 makes the kind of content explicit; a None attribute used to mean 'fetch the string'
            ( tag_rules, attribute_to_fetch, serialisable_string_match, serialisable_string_converter ) = old_serialisable_info
            
            if attribute_to_fetch is None:
                
                content_to_fetch = HTML_CONTENT_STRING
                attribute_to_fetch = ''
                
            else:
                
                content_to_fetch = HTML_CONTENT_ATTRIBUTE
                
            
            new_serialisable_info = ( tag_rules, content_to_fetch, attribute_to_fetch, serialisable_string_match, serialisable_string_converter )
            
            return ( 5, new_serialisable_info )
            
        
    
    def Parse( self, html ):
        """Apply the formula to html and return the list of converted contents.

        Tags whose content raises HydrusExceptions.ParseException are skipped.
        """
        
        root = bs4.BeautifulSoup( html, 'lxml' )
        
        roots = ( root, )
        
        # each rule narrows the search to the tags found beneath the previous rule's matches
        for ( name, attrs, index ) in self._tag_rules:
            
            next_roots = []
            
            for root in roots:
                
                next_roots.extend( self._ParseTags( root, name, attrs, index ) )
                
            
            roots = next_roots
            
        
        contents = []
        
        for root in roots:
            
            try:
                
                content = self._ParseContent( root )
                
                contents.append( content )
                
            except HydrusExceptions.ParseException:
                
                continue
                
            
        
        return contents
        
    
    def ToPrettyMultilineString( self ):
        """Describe the formula step by step, one step per line joined with 'and then '."""
        
        pretty_strings = []
        
        for ( name, attrs, index ) in self._tag_rules:
            
            s = ''
            
            if index is None:
                
                s += 'get every'
                
            else:
                
                num = index + 1
                
                s += 'get the ' + HydrusData.ConvertIntToPrettyOrdinalString( num )
                
            
            s += ' <' + name + '> tag'
            
            if len( attrs ) > 0:
                
                s += ' with attributes ' + ', '.join( key + '=' + value for ( key, value ) in attrs.items() )
                
            
            pretty_strings.append( s )
            
        
        if self._content_to_fetch == HTML_CONTENT_ATTRIBUTE:
            
            pretty_strings.append( 'get the ' + self._attribute_to_fetch + ' attribute of those tags' )
            
        elif self._content_to_fetch == HTML_CONTENT_STRING:
            
            pretty_strings.append( 'get the text content of those tags' )
            
        elif self._content_to_fetch == HTML_CONTENT_HTML:
            
            pretty_strings.append( 'get the html of those tags' )
            
        
        pretty_strings.extend( self._string_converter.GetTransformationStrings() )
        
        separator = os.linesep + 'and then '
        
        pretty_multiline_string = separator.join( pretty_strings )
        
        return pretty_multiline_string
        
    
    def ToTuple( self ):
        """Return ( tag_rules, content_to_fetch, attribute_to_fetch, string_match, string_converter )."""
        
        return ( self._tag_rules, self._content_to_fetch, self._attribute_to_fetch, self._string_match, self._string_converter )
        
    
2016-07-20 19:57:10 +00:00
2016-09-21 19:54:04 +00:00
# Map the formula's serialisable type id to its class in HydrusSerialisable's type table
# (presumably consulted when objects are recreated from serialisable tuples -- confirm in HydrusSerialisable).
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_FORMULA_HTML ] = ParseFormulaHTML
2017-12-13 22:33:07 +00:00
class ContentParser( HydrusSerialisable.SerialisableBase ):
    """A named parser that runs one formula over data and labels what it finds.

    Every parsed text is paired with the ( name, content_type, additional_info )
    description of this parser. When the content type is a veto, nothing is
    returned; a HydrusExceptions.VetoException is raised if the veto fires.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_CONTENT_PARSER
    SERIALISABLE_NAME = 'Content Parser'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name = None, content_type = None, formula = None, additional_info = None ):
        """Defaults: empty name, mappings content type, a default ParseFormulaHTML.

        additional_info only receives a default ('') when the content type is mappings.
        """
        
        name = '' if name is None else name
        content_type = HC.CONTENT_TYPE_MAPPINGS if content_type is None else content_type
        formula = ParseFormulaHTML() if formula is None else formula
        
        if additional_info is None and content_type == HC.CONTENT_TYPE_MAPPINGS:
            
            additional_info = ''
            
        
        self._name = name
        self._content_type = content_type
        self._formula = formula
        self._additional_info = additional_info
        
    
    def _GetSerialisableInfo( self ):
        """Return ( name, content_type, serialisable formula, additional_info )."""
        
        return ( self._name, self._content_type, self._formula.GetSerialisableTuple(), self._additional_info )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore state from the tuple produced by _GetSerialisableInfo."""
        
        ( self._name, self._content_type, serialisable_formula, additional_info ) = serialisable_info
        
        # a list additional_info is turned back into a tuple
        if isinstance( additional_info, list ):
            
            additional_info = tuple( additional_info )
            
        
        self._additional_info = additional_info
        
        self._formula = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formula )
        
    
    def GetParsableContent( self ):
        """Return a one-element set holding this parser's content description."""
        
        description = ( self._name, self._content_type, self._additional_info )
        
        return { description }
        
    
    def Parse( self, job_key, data, referral_url ):
        """Parse data with the formula and return ( content_description, parsed_text ) pairs.

        job_key and referral_url are accepted but not used here. For a veto content
        type, HydrusExceptions.VetoException( name ) is raised when GetVetoes reports
        at least one veto; otherwise an empty list is returned.
        """
        
        parsed_texts = self._formula.Parse( data )
        
        if self._content_type != HC.CONTENT_TYPE_VETO:
            
            content_description = ( self._name, self._content_type, self._additional_info )
            
            return [ ( content_description, parsed_text ) for parsed_text in parsed_texts ]
            
        
        vetoes = GetVetoes( parsed_texts, self._additional_info )
        
        if len( vetoes ) > 0:
            
            raise HydrusExceptions.VetoException( self._name )
            
        
        return []
        
    
    def ToPrettyStrings( self ):
        """Return ( name, 'content', summary of parsable content including vetoes )."""
        
        summary = ConvertParsableContentToPrettyString( self.GetParsableContent(), include_veto = True )
        
        return ( self._name, 'content', summary )
        
    
    def ToTuple( self ):
        """Return ( name, content_type, formula, additional_info )."""
        
        return ( self._name, self._content_type, self._formula, self._additional_info )
        
    
2016-09-21 19:54:04 +00:00
2017-12-13 22:33:07 +00:00
# Map the content parser's serialisable type id to its class in HydrusSerialisable's type table.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_CONTENT_PARSER ] = ContentParser
# Parser type values for PageParser; PARSER_FILE_PAGE is what PageParser.__init__ uses when no type is given.
PARSER_FILE_PAGE = 0
# NOTE(review): presumably a page listing several files -- not referenced in the visible code
PARSER_FILES_PAGE = 1
class PageParser( HydrusSerialisable.SerialisableBaseNamed ):
    """A named collection of content parsers that are all run over one page.

    NOTE(review): no _GetSerialisableInfo/_InitialiseFromSerialisableInfo is
    defined on this class in the visible source, so it may not round-trip through
    serialisation yet -- confirm against SerialisableBaseNamed.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PAGE_PARSER
    SERIALISABLE_NAME = 'Page Parser'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name, parser_type = None, content_parsers = None ):
        """parser_type defaults to PARSER_FILE_PAGE and content_parsers to an empty list."""
        
        if parser_type is None:
            
            parser_type = PARSER_FILE_PAGE
            
        
        if content_parsers is None:
            
            content_parsers = []
            
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        # the type was previously accepted and then silently dropped
        self._parser_type = parser_type
        self._content_parsers = content_parsers
        
    
    def Parse( self, page_data, job_key = None, referral_url = None ):
        """Run every content parser over page_data and return the concatenated results.

        Content parsers are called as Parse( job_key, data, referral_url ), matching
        ContentParser.Parse; calling them with page_data alone raised TypeError.
        job_key and referral_url are optional here so existing one-argument callers
        keep working. ContentParser itself ignores both, but a parser that uses
        job_key will need a real one. Any exception a content parser raises
        (including HydrusExceptions.VetoException) propagates.
        """
        
        content_results = []
        
        for content_parser in self._content_parsers:
            
            content_results.extend( content_parser.Parse( job_key, page_data, referral_url ) )
            
        
        return content_results
        
    
2016-11-02 21:09:14 +00:00
2017-12-13 22:33:07 +00:00
# Map the page parser's serialisable type id to its class in HydrusSerialisable's type table.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PAGE_PARSER ] = PageParser
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
class ParseNodeContentLink( HydrusSerialisable.SerialisableBase ):
    """A parse node that follows links: its formula finds URLs, each URL is
    fetched, and the fetched data is handed to the child nodes for parsing.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK
    SERIALISABLE_NAME = 'Content Parsing Link'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name = None, formula = None, children = None ):
        """Defaults: empty name, a default ParseFormulaHTML and no children."""
        
        self._name = '' if name is None else name
        self._formula = ParseFormulaHTML() if formula is None else formula
        self._children = [] if children is None else children
        
    
    def _GetSerialisableInfo( self ):
        """Return ( name, serialisable formula, list of serialisable children )."""
        
        serialisable_children = [ child.GetSerialisableTuple() for child in self._children ]
        
        return ( self._name, self._formula.GetSerialisableTuple(), serialisable_children )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore state from the tuple produced by _GetSerialisableInfo."""
        
        ( self._name, serialisable_formula, serialisable_children ) = serialisable_info
        
        self._formula = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_formula )
        
        self._children = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_child ) for serialisable_child in serialisable_children ]
        
    
    def GetParsableContent( self ):
        """Return the union of everything the children can parse."""
        
        return set().union( *[ child.GetParsableContent() for child in self._children ] )
        
    
    def Parse( self, job_key, data, referral_url ):
        """Fetch every URL the formula finds in data and parse each page with the children.

        Progress is reported through job_key's 'script_status' variable. A URL that
        fails with a network error is reported, followed by a two second pause, and
        skipped. A cancelled fetch stops the loop. Raises
        HydrusExceptions.CancelledException if job_key is cancelled by the time
        the loop ends.
        """
        
        content = []
        
        for search_url in self.ParseURLs( job_key, data, referral_url ):
            
            job_key.SetVariable( 'script_status', 'fetching ' + search_url )
            
            network_job = ClientNetworking.NetworkJob( 'GET', search_url, referral_url = referral_url )
            
            network_job.OverrideBandwidth()
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            try:
                
                network_job.WaitUntilDone()
                
            except HydrusExceptions.CancelledException:
                
                break
                
            except HydrusExceptions.NetworkException as e:
                
                if isinstance( e, HydrusExceptions.NotFoundException ):
                    
                    job_key.SetVariable( 'script_status', '404 - nothing found' )
                    
                else:
                    
                    job_key.SetVariable( 'script_status', 'Network error! Details written to log.' )
                    
                    HydrusData.Print( 'Problem fetching ' + search_url + ':' )
                    HydrusData.PrintException( e )
                    
                
                # leave the status on display briefly before moving to the next url
                time.sleep( 2 )
                
                continue
                
            
            linked_data = network_job.GetContent()
            
            # the fetched url becomes the referral url for the children
            content.extend( GetChildrenContent( job_key, self._children, linked_data, search_url ) )
            
        
        if job_key.IsCancelled():
            
            raise HydrusExceptions.CancelledException()
            
        
        return content
        
    
    def ParseURLs( self, job_key, data, referral_url ):
        """Return the URLs the formula finds in data, joined onto referral_url.

        Every resulting URL is also registered with job_key.AddURL.
        """
        
        absolute_urls = []
        
        for basic_url in self._formula.Parse( data ):
            
            absolute_urls.append( urlparse.urljoin( referral_url, basic_url ) )
            
        
        for absolute_url in absolute_urls:
            
            job_key.AddURL( absolute_url )
            
        
        return absolute_urls
        
    
    def ToPrettyStrings( self ):
        """Return ( name, 'link', summary of what the children can parse )."""
        
        summary = ConvertParsableContentToPrettyString( self.GetParsableContent() )
        
        return ( self._name, 'link', summary )
        
    
    def ToTuple( self ):
        """Return ( name, formula, children )."""
        
        return ( self._name, self._formula, self._children )
        
    
2016-10-05 20:22:40 +00:00
# Map the content link node's serialisable type id to its class in HydrusSerialisable's type table.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_NODE_CONTENT_LINK ] = ParseNodeContentLink
# What a file lookup script sends to identify the file it is asking about.
FILE_IDENTIFIER_TYPE_FILE = 0
FILE_IDENTIFIER_TYPE_MD5 = 1
FILE_IDENTIFIER_TYPE_SHA1 = 2
FILE_IDENTIFIER_TYPE_SHA256 = 3
FILE_IDENTIFIER_TYPE_SHA512 = 4
FILE_IDENTIFIER_TYPE_USER_INPUT = 5

# Human-readable label for each file identifier type.
file_identifier_string_lookup = {
    FILE_IDENTIFIER_TYPE_FILE : 'the actual file (POST only)',
    FILE_IDENTIFIER_TYPE_MD5 : 'md5 hash',
    FILE_IDENTIFIER_TYPE_SHA1 : 'sha1 hash',
    FILE_IDENTIFIER_TYPE_SHA256 : 'sha256 hash',
    FILE_IDENTIFIER_TYPE_SHA512 : 'sha512 hash',
    FILE_IDENTIFIER_TYPE_USER_INPUT : 'custom user input'
}
2016-09-21 19:54:04 +00:00
2017-12-13 22:33:07 +00:00
# eventually transition this to be a flat 'generate page/gallery urls'
# the rest of the parsing system can pick those up automatically
# this nullifies the need for contentlink stuff, at least in its current borked form
2016-10-05 20:22:40 +00:00
class ParseRootFileLookup( HydrusSerialisable.SerialisableBaseNamed ):
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP
2017-11-29 21:48:23 +00:00
SERIALISABLE_NAME = 'File Lookup Script'
2017-11-15 22:35:49 +00:00
SERIALISABLE_VERSION = 2
2016-09-21 19:54:04 +00:00
2017-11-15 22:35:49 +00:00
def __init__( self, name, url = None, query_type = None, file_identifier_type = None, file_identifier_string_converter = None, file_identifier_arg_name = None, static_args = None, children = None ):
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
2016-11-02 21:09:14 +00:00
self._url = url
2016-09-21 19:54:04 +00:00
self._query_type = query_type
2016-10-05 20:22:40 +00:00
self._file_identifier_type = file_identifier_type
2017-11-15 22:35:49 +00:00
self._file_identifier_string_converter = file_identifier_string_converter
2016-10-05 20:22:40 +00:00
self._file_identifier_arg_name = file_identifier_arg_name
2016-09-21 19:54:04 +00:00
self._static_args = static_args
self._children = children
def _GetSerialisableInfo( self ):
serialisable_children = [ child.GetSerialisableTuple() for child in self._children ]
2017-11-15 22:35:49 +00:00
serialisable_file_identifier_string_converter = self._file_identifier_string_converter.GetSerialisableTuple()
2016-09-21 19:54:04 +00:00
2017-11-15 22:35:49 +00:00
return ( self._url, self._query_type, self._file_identifier_type, serialisable_file_identifier_string_converter, self._file_identifier_arg_name, self._static_args, serialisable_children )
2016-09-21 19:54:04 +00:00
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
2017-11-15 22:35:49 +00:00
( self._url, self._query_type, self._file_identifier_type, serialisable_file_identifier_string_converter, self._file_identifier_arg_name, self._static_args, serialisable_children ) = serialisable_info
2016-09-21 19:54:04 +00:00
self._children = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_child ) for serialisable_child in serialisable_children ]
2017-11-15 22:35:49 +00:00
self._file_identifier_string_converter = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_identifier_string_converter )
2016-09-21 19:54:04 +00:00
2017-11-15 22:35:49 +00:00
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( url, query_type, file_identifier_type, file_identifier_encoding, file_identifier_arg_name, static_args, serialisable_children ) = old_serialisable_info
transformations = []
if file_identifier_encoding == HC.ENCODING_RAW:
pass
elif file_identifier_encoding == HC.ENCODING_HEX:
transformations.append( ( STRING_TRANSFORMATION_ENCODE, 'hex' ) )
elif file_identifier_encoding == HC.ENCODING_BASE64:
transformations.append( ( STRING_TRANSFORMATION_ENCODE, 'base64' ) )
file_identifier_string_converter = StringConverter( transformations, 'some hash bytes' )
serialisable_file_identifier_string_converter = file_identifier_string_converter.GetSerialisableTuple()
new_serialisable_info = ( url, query_type, file_identifier_type, serialisable_file_identifier_string_converter, file_identifier_arg_name, static_args, serialisable_children )
return ( 2, new_serialisable_info )
2016-11-09 23:13:22 +00:00
def ConvertMediaToFileIdentifier( self, media ):
if self._file_identifier_type == FILE_IDENTIFIER_TYPE_USER_INPUT:
raise Exception( 'Cannot convert media to file identifier--this script takes user input!' )
elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA256:
return media.GetHash()
elif self._file_identifier_type in ( FILE_IDENTIFIER_TYPE_MD5, FILE_IDENTIFIER_TYPE_SHA1, FILE_IDENTIFIER_TYPE_SHA512 ):
sha256_hash = media.GetHash()
if self._file_identifier_type == FILE_IDENTIFIER_TYPE_MD5:
hash_type = 'md5'
elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA1:
hash_type = 'sha1'
elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_SHA512:
hash_type = 'sha512'
try:
2017-05-10 21:33:58 +00:00
( other_hash, ) = HG.client_controller.Read( 'file_hashes', ( sha256_hash, ), 'sha256', hash_type )
2016-11-09 23:13:22 +00:00
return other_hash
except:
raise Exception( 'I do not know that file\'s ' + hash_type + ' hash, so I cannot look it up!' )
elif self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
hash = media.GetHash()
mime = media.GetMime()
2017-06-28 20:23:21 +00:00
client_files_manager = HG.client_controller.client_files_manager
2016-11-09 23:13:22 +00:00
try:
path = client_files_manager.GetFilePath( hash, mime )
return path
except HydrusExceptions.FileMissingException as e:
raise Exception( 'That file is not in the database\'s local files, so I cannot look it up!' )
2016-11-16 20:21:43 +00:00
def FetchData( self, job_key, file_identifier ):
2017-09-06 20:18:20 +00:00
# add gauge report hook and in-stream cancel support to the get/post calls
request_args = dict( self._static_args )
if self._file_identifier_type != FILE_IDENTIFIER_TYPE_FILE:
2016-10-05 20:22:40 +00:00
2017-11-15 22:35:49 +00:00
request_args[ self._file_identifier_arg_name ] = self._file_identifier_string_converter.Convert( file_identifier )
2016-10-05 20:22:40 +00:00
2017-09-06 20:18:20 +00:00
if self._query_type == HC.GET:
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
if self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
raise Exception( 'Cannot have a file as an argument on a GET query!' )
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
full_request_url = ClientNetworking.CombineGETURLWithParameters( self._url, request_args )
job_key.SetVariable( 'script_status', 'fetching ' + full_request_url )
job_key.AddURL( full_request_url )
network_job = ClientNetworking.NetworkJob( 'GET', full_request_url )
elif self._query_type == HC.POST:
if self._file_identifier_type == FILE_IDENTIFIER_TYPE_FILE:
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
job_key.SetVariable( 'script_status', 'uploading file' )
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
path = file_identifier
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
files = { self._file_identifier_arg_name : open( path, 'rb' ) }
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
else:
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
job_key.SetVariable( 'script_status', 'uploading identifier' )
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
files = None
2016-11-02 21:09:14 +00:00
2017-10-25 21:45:15 +00:00
network_job = ClientNetworking.NetworkJob( 'POST', self._url, body = request_args )
network_job.SetFiles( files )
2016-11-02 21:09:14 +00:00
2017-09-06 20:18:20 +00:00
# send nj to nj control on this panel here
network_job.OverrideBandwidth()
HG.client_controller.network_engine.AddJob( network_job )
try:
2016-11-16 20:21:43 +00:00
2017-09-06 20:18:20 +00:00
network_job.WaitUntilDone()
2016-12-07 22:12:52 +00:00
except HydrusExceptions.NotFoundException:
job_key.SetVariable( 'script_status', '404 - nothing found' )
raise
except HydrusExceptions.NetworkException as e:
job_key.SetVariable( 'script_status', 'Network error!' )
HydrusData.ShowException( e )
raise
2016-11-16 20:21:43 +00:00
2016-10-05 20:22:40 +00:00
2017-09-06 20:18:20 +00:00
if job_key.IsCancelled():
raise HydrusExceptions.CancelledException()
data = network_job.GetContent()
return data
2016-10-05 20:22:40 +00:00
2016-09-21 19:54:04 +00:00
def GetParsableContent( self ):
children_parsable_content = set()
for child in self._children:
children_parsable_content.update( child.GetParsableContent() )
return children_parsable_content
2017-12-13 22:33:07 +00:00
def DoQuery( self, job_key, file_identifier ):
2016-09-21 19:54:04 +00:00
2016-11-16 20:21:43 +00:00
try:
try:
data = self.FetchData( job_key, file_identifier )
except HydrusExceptions.NetworkException as e:
return []
2017-12-13 22:33:07 +00:00
content_results = self.Parse( job_key, data )
2016-11-16 20:21:43 +00:00
return content_results
except HydrusExceptions.CancelledException:
job_key.SetVariable( 'script_status', 'Cancelled!' )
return []
finally:
job_key.Finish()
2016-10-19 20:02:56 +00:00
2016-11-09 23:13:22 +00:00
def UsesUserInput( self ):
2016-10-19 20:02:56 +00:00
2016-11-09 23:13:22 +00:00
return self._file_identifier_type == FILE_IDENTIFIER_TYPE_USER_INPUT
2016-10-19 20:02:56 +00:00
2017-12-13 22:33:07 +00:00
def Parse( self, job_key, data ):
    """Run the child parsers over data and report the outcome on job_key.
    
    The rows come from GetChildrenContent, called with this node's children and
    url. The 'script_status' variable on job_key is set to a summary of how many
    rows were found, and the rows are returned unchanged.
    """
    
    parsed_rows = GetChildrenContent( job_key, self._children, data, self._url )
    
    num_rows = len( parsed_rows )
    
    if num_rows == 0:
        
        status = 'Did not find anything.'
        
    else:
        
        status = 'Found ' + HydrusData.ConvertIntToPrettyString( num_rows ) + ' rows.'
        
    
    job_key.SetVariable( 'script_status', status )
    
    return parsed_rows
2016-09-21 19:54:04 +00:00
def SetChildren( self, children ):
    """Replace this node's children with the given object.
    
    The object is stored by reference, not copied.
    """
    
    self._children = children
2016-10-05 20:22:40 +00:00
def ToPrettyStrings( self ):
    """Return a 4-tuple of display values for this lookup script.
    
    The tuple is ( name, query type string, the literal 'File Lookup', pretty
    string of the parsable content ).
    """
    
    name = self._name
    query_type_string = HC.query_type_string_lookup[ self._query_type ]
    content_string = ConvertParsableContentToPrettyString( self.GetParsableContent() )
    
    return ( name, query_type_string, 'File Lookup', content_string )
2016-09-21 19:54:04 +00:00
def ToTuple( self ):
    """Return this lookup script's fields as an 8-tuple.
    
    Order: name, url, query type, file identifier type, file identifier string
    converter, file identifier arg name, static args, children. The stored
    objects themselves are returned, not copies.
    """
    
    return (
        self._name,
        self._url,
        self._query_type,
        self._file_identifier_type,
        self._file_identifier_string_converter,
        self._file_identifier_arg_name,
        self._static_args,
        self._children
    )
2016-09-21 19:54:04 +00:00
2016-10-05 20:22:40 +00:00
# Register ParseRootFileLookup in the serialisable type registry, keyed by its type constant.
# NOTE(review): presumably HydrusSerialisable uses this mapping to find the class when loading a saved object -- confirm there.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PARSE_ROOT_FILE_LOOKUP ] = ParseRootFileLookup
2017-09-13 20:50:41 +00:00
2017-11-15 22:35:49 +00:00
# Identifiers for the individual steps a StringConverter can apply.
# StringConverter serialises these values as part of its transformations, so do not renumber them.
STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING = 0
STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END = 1
STRING_TRANSFORMATION_PREPEND_TEXT = 2
STRING_TRANSFORMATION_APPEND_TEXT = 3
STRING_TRANSFORMATION_ENCODE = 4
STRING_TRANSFORMATION_DECODE = 5
STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING = 6
STRING_TRANSFORMATION_CLIP_TEXT_FROM_END = 7
STRING_TRANSFORMATION_REVERSE = 8
STRING_TRANSFORMATION_REGEX_SUB = 9

# Generic human-readable label for each transformation type.
transformation_type_str_lookup = {
    STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING : 'remove text from beginning of string',
    STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END : 'remove text from end of string',
    STRING_TRANSFORMATION_PREPEND_TEXT : 'prepend text',
    STRING_TRANSFORMATION_APPEND_TEXT : 'append text',
    STRING_TRANSFORMATION_ENCODE : 'encode',
    STRING_TRANSFORMATION_DECODE : 'decode',
    STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING : 'take the start of the string',
    STRING_TRANSFORMATION_CLIP_TEXT_FROM_END : 'take the end of the string',
    STRING_TRANSFORMATION_REVERSE : 'reverse text',
    STRING_TRANSFORMATION_REGEX_SUB : 'regex substitution'
}
2017-11-15 22:35:49 +00:00
class StringConverter( HydrusSerialisable.SerialisableBase ):
    """An ordered list of string transformations plus an example string.
    
    Each transformation is a ( transformation_type, data ) pair, where
    transformation_type is one of the STRING_TRANSFORMATION_* constants and
    data is that step's argument: a character count, a text fragment, an
    encoding name, or a ( pattern, repl ) tuple for regex substitution.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_CONVERTER
    SERIALISABLE_NAME = 'String Converter'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, transformations = None, example_string = None ):
        """Create a converter; transformations defaults to [] and example_string to 'example string'."""
        
        if transformations is None:
            
            transformations = []
            
        
        if example_string is None:
            
            example_string = 'example string'
            
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.transformations = transformations
        
        self.example_string = example_string
        
    
    def _GetSerialisableInfo( self ):
        
        return ( self.transformations, self.example_string )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_transformations, self.example_string ) = serialisable_info
        
        self.transformations = []
        
        for ( transformation_type, data ) in serialisable_transformations:
            
            if transformation_type == STRING_TRANSFORMATION_REGEX_SUB:
                
                data = tuple( data ) # the ( pattern, repl ) pair arrives as a list; restore it to a tuple
                
            
            self.transformations.append( ( transformation_type, data ) )
            
        
    
    def Convert( self, s, max_steps_allowed = None ):
        """Apply the transformations to s in order and return the result.
        
        s -- the string to convert
        max_steps_allowed -- if not None, stop once this many transformations
            have been applied. The limit is checked after each step, so the
            first transformation is always applied.
        
        Raises HydrusExceptions.StringConvertException if any step fails.
        """
        
        for ( i, transformation ) in enumerate( self.transformations ):
            
            try:
                
                ( transformation_type, data ) = transformation
                
                if transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING:
                    
                    num_chars = data
                    
                    s = s[ num_chars : ]
                    
                elif transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END:
                    
                    num_chars = data
                    
                    # s[ : -0 ] is s[ : 0 ], which would wipe the whole string, so removing zero characters must be a no-op
                    if num_chars != 0:
                        
                        s = s[ : - num_chars ]
                        
                    
                elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING:
                    
                    num_chars = data
                    
                    s = s[ : num_chars ]
                    
                elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_END:
                    
                    num_chars = data
                    
                    # s[ -0 : ] is s[ 0 : ], the whole string, so taking the last zero characters needs its own branch
                    if num_chars == 0:
                        
                        s = s[ : 0 ]
                        
                    else:
                        
                        s = s[ - num_chars : ]
                        
                    
                elif transformation_type == STRING_TRANSFORMATION_PREPEND_TEXT:
                    
                    text = data
                    
                    s = text + s
                    
                elif transformation_type == STRING_TRANSFORMATION_APPEND_TEXT:
                    
                    text = data
                    
                    s = s + text
                    
                elif transformation_type == STRING_TRANSFORMATION_ENCODE:
                    
                    encode_type = data
                    
                    s = s.encode( encode_type )
                    
                elif transformation_type == STRING_TRANSFORMATION_DECODE:
                    
                    encode_type = data
                    
                    s = s.decode( encode_type )
                    
                elif transformation_type == STRING_TRANSFORMATION_REVERSE:
                    
                    s = s[::-1]
                    
                elif transformation_type == STRING_TRANSFORMATION_REGEX_SUB:
                    
                    ( pattern, repl ) = data
                    
                    s = re.sub( pattern, repl, s, flags = re.UNICODE )
                    
                
            except Exception:
                
                # Exception rather than a bare except, so KeyboardInterrupt and SystemExit are not turned into conversion errors
                raise HydrusExceptions.StringConvertException( 'ERROR: Could not apply "' + self.TransformationToUnicode( transformation ) + '" to string "' + repr( s ) + '".' )
                
            
            if max_steps_allowed is not None and i + 1 >= max_steps_allowed:
                
                return s
                
            
        
        return s
        
    
    def GetTransformationStrings( self ):
        """Return the human-readable description of every transformation, in order."""
        
        return [ self.TransformationToUnicode( transformation ) for transformation in self.transformations ]
        
    
    @staticmethod
    def TransformationToUnicode( transformation ):
        """Return a human-readable description of one ( transformation_type, data ) pair.
        
        Returns None if transformation_type is not one of the known constants.
        """
        
        ( transformation_type, data ) = transformation
        
        if transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_BEGINNING:
            
            return 'remove the first ' + HydrusData.ConvertIntToPrettyString( data ) + ' characters'
            
        elif transformation_type == STRING_TRANSFORMATION_REMOVE_TEXT_FROM_END:
            
            return 'remove the last ' + HydrusData.ConvertIntToPrettyString( data ) + ' characters'
            
        elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_BEGINNING:
            
            return 'take the first ' + HydrusData.ConvertIntToPrettyString( data ) + ' characters'
            
        elif transformation_type == STRING_TRANSFORMATION_CLIP_TEXT_FROM_END:
            
            # this step keeps the end of the string, so it is described as 'last', not 'first'
            return 'take the last ' + HydrusData.ConvertIntToPrettyString( data ) + ' characters'
            
        elif transformation_type == STRING_TRANSFORMATION_PREPEND_TEXT:
            
            return 'prepend with "' + data + '"'
            
        elif transformation_type == STRING_TRANSFORMATION_APPEND_TEXT:
            
            return 'append with "' + data + '"'
            
        elif transformation_type == STRING_TRANSFORMATION_ENCODE:
            
            return 'encode to ' + data
            
        elif transformation_type == STRING_TRANSFORMATION_DECODE:
            
            return 'decode from ' + data
            
        elif transformation_type == STRING_TRANSFORMATION_REVERSE:
            
            return transformation_type_str_lookup[ STRING_TRANSFORMATION_REVERSE ]
            
        elif transformation_type == STRING_TRANSFORMATION_REGEX_SUB:
            
            return 'regex substitution: ' + HydrusData.ToUnicode( data )
            
        
    
2017-11-15 22:35:49 +00:00
# Register StringConverter in the serialisable type registry, keyed by its type constant.
# NOTE(review): presumably HydrusSerialisable uses this mapping to find the class when loading a saved object -- confirm there.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_CONVERTER ] = StringConverter
2017-09-13 20:50:41 +00:00
# Match types understood by StringMatch.
( STRING_MATCH_FIXED, STRING_MATCH_FLEXIBLE, STRING_MATCH_REGEX, STRING_MATCH_ANY ) = ( 0, 1, 2, 3 )

# Character classes StringMatch compares its match value against when the match type is STRING_MATCH_FLEXIBLE.
( ALPHA, ALPHANUMERIC, NUMERIC ) = ( 0, 1, 2 )
2017-09-27 21:52:54 +00:00
class StringMatch( HydrusSerialisable.SerialisableBase ):
    """A rule a string can be tested against.
    
    The rule combines optional minimum and maximum lengths with a match type:
    any text, an exact value, a character class, or a regex.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_STRING_MATCH
    SERIALISABLE_NAME = 'String Match'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, match_type = STRING_MATCH_ANY, match_value = '', min_chars = None, max_chars = None, example_string = 'example string' ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        # TODO: make a gui control that accepts one of these. displays expected input on the right and colours red/green (and does isvalid) based on current input
        # TODO: think about replacing the veto stuff above with this.
        
        self._match_type = match_type
        self._match_value = match_value
        
        # None means that length bound is not enforced
        self._min_chars = min_chars
        self._max_chars = max_chars
        
        self._example_string = example_string
        
    
    def _GetSerialisableInfo( self ):
        
        return ( self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( match_type, match_value, min_chars, max_chars, example_string ) = serialisable_info
        
        self._match_type = match_type
        self._match_value = match_value
        self._min_chars = min_chars
        self._max_chars = max_chars
        self._example_string = example_string
        
    
    def SetMaxChars( self, max_chars ):
        """Set the maximum permitted length."""
        
        self._max_chars = max_chars
        
    
    def SetMinChars( self, min_chars ):
        """Set the minimum permitted length."""
        
        self._min_chars = min_chars
        
    
    def Test( self, text ):
        """Check text against this rule.
        
        Returns None when text passes. Raises
        HydrusExceptions.StringMatchException, with a message naming the
        failed condition, when it does not.
        """
        
        num_chars = len( text )
        
        quoted_text = '"' + text + '"'
        
        too_short = self._min_chars is not None and num_chars < self._min_chars
        
        if too_short:
            
            raise HydrusExceptions.StringMatchException( quoted_text + ' had fewer than ' + HydrusData.ConvertIntToPrettyString( self._min_chars ) + ' characters' )
            
        
        too_long = self._max_chars is not None and num_chars > self._max_chars
        
        if too_long:
            
            raise HydrusExceptions.StringMatchException( quoted_text + ' had more than ' + HydrusData.ConvertIntToPrettyString( self._max_chars ) + ' characters' )
            
        
        if self._match_type == STRING_MATCH_FIXED:
            
            if text != self._match_value:
                
                raise HydrusExceptions.StringMatchException( quoted_text + ' did not exactly match "' + self._match_value + '"' )
                
            
            return
            
        
        if self._match_type == STRING_MATCH_FLEXIBLE:
            
            if self._match_value == ALPHA:
                
                pattern = r'^[a-zA-Z]+$'
                failure_text = ' had non-alpha characters'
                
            elif self._match_value == ALPHANUMERIC:
                
                pattern = r'^[a-zA-Z\d]+$'
                failure_text = ' had non-alphanumeric characters'
                
            elif self._match_value == NUMERIC:
                
                pattern = r'^\d+$'
                failure_text = ' had non-numeric characters'
                
            
        elif self._match_type == STRING_MATCH_REGEX:
            
            pattern = self._match_value
            failure_text = ' did not match "' + pattern + '"'
            
        else:
            
            # STRING_MATCH_ANY, or an unrecognised type: only the length limits above apply
            return
            
        
        if re.search( pattern, text, flags = re.UNICODE ) is None:
            
            raise HydrusExceptions.StringMatchException( quoted_text + failure_text )
            
        
    
    def ToTuple( self ):
        """Return ( match_type, match_value, min_chars, max_chars, example_string )."""
        
        return ( self._match_type, self._match_value, self._min_chars, self._max_chars, self._example_string )
        
    
    def ToUnicode( self ):
        """Return a human-readable description of this rule."""
        
        if self._min_chars is None and self._max_chars is None:
            
            length_text = 'any number of '
            
        elif self._min_chars is None:
            
            length_text = 'at most ' + HydrusData.ToUnicode( self._max_chars ) + ' '
            
        elif self._max_chars is None:
            
            length_text = 'at least ' + HydrusData.ToUnicode( self._min_chars ) + ' '
            
        else:
            
            length_text = 'between ' + HydrusData.ToUnicode( self._min_chars ) + ' and ' + HydrusData.ToUnicode( self._max_chars ) + ' '
            
        
        if self._match_type == STRING_MATCH_ANY:
            
            return length_text + 'characters'
            
        
        if self._match_type == STRING_MATCH_FIXED:
            
            # a fixed match is described by its value alone, with no length text and no example
            return self._match_value
            
        
        description = length_text
        
        if self._match_type == STRING_MATCH_FLEXIBLE:
            
            if self._match_value == ALPHA:
                
                description += 'alphabetical characters'
                
            elif self._match_value == ALPHANUMERIC:
                
                description += 'alphanumeric characters'
                
            elif self._match_value == NUMERIC:
                
                description += 'numeric characters'
                
            
        elif self._match_type == STRING_MATCH_REGEX:
            
            description += 'characters, matching regex "' + self._match_value + '"'
            
        
        return description + ', such as "' + self._example_string + '"'
        
    
# Register StringMatch in the serialisable type registry, keyed by its type constant.
# NOTE(review): presumably HydrusSerialisable uses this mapping to find the class when loading a saved object -- confirm there.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_STRING_MATCH ] = StringMatch