Merge pull request #1348 from CetaceanNation/master

Hash functions for string conversions and HTTP headers for content parsers
This commit is contained in:
Hydrus Network Developer 2023-03-25 13:01:55 -05:00 committed by GitHub
commit 67f8b3e651
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 199 additions and 7 deletions

View File

@ -139,7 +139,7 @@ def ConvertParseResultToPrettyString( result ):
if timestamp_type == HC.TIMESTAMP_TYPE_SOURCE:
return 'source time: ' + timestamp_string
elif content_type == HC.CONTENT_TYPE_TITLE:
@ -147,6 +147,12 @@ def ConvertParseResultToPrettyString( result ):
return 'watcher page title (priority ' + str( priority ) + '): ' + parsed_text
elif content_type == HC.CONTENT_TYPE_HTTP_HEADER:
header_name = additional_info
return 'http header "{}": "{}"'.format( header_name, parsed_text )
elif content_type == HC.CONTENT_TYPE_VETO:
return 'veto: ' + name
@ -239,6 +245,12 @@ def ConvertParsableContentToPrettyString( parsable_content, include_veto = False
pretty_strings.append( 'watcher page title' )
elif content_type == HC.CONTENT_TYPE_HTTP_HEADER:
headers = [ header for header in additional_infos if header not in ( '', None ) ]
pretty_strings.append( 'http headers: ' + ', '.join( headers ) )
elif content_type == HC.CONTENT_TYPE_VETO:
if include_veto:
@ -571,6 +583,23 @@ def GetTitleFromAllParseResults( all_parse_results ):
def GetHTTPHeadersFromParseResults( parse_results ):
    """
    Collect the HTTP headers described by one set of parse results.
    
    Each parse result is unpacked as ( ( name, content_type, additional_info ), parsed_text ). For results whose
    content type is HC.CONTENT_TYPE_HTTP_HEADER, additional_info is the header name and parsed_text is the value.
    
    Returns a dict mapping header name to header value. If the same header name appears more than once, the
    last parsed value wins. Results with a blank header name ( '' or None ) are ignored.
    """
    
    headers = {}
    
    for ( ( name, content_type, additional_info ), parsed_text ) in parse_results:
        
        if content_type != HC.CONTENT_TYPE_HTTP_HEADER:
            
            continue
            
        
        header_name = additional_info
        
        # a blank name cannot form a usable header; the pretty-string code for parsable content skips the same values
        if header_name in ( '', None ):
            
            continue
            
        
        headers[ header_name ] = parsed_text
        
    
    return headers
def GetURLsFromParseResults( results, desired_url_types, only_get_top_priority = False ):
url_results = collections.defaultdict( list )

View File

@ -1,5 +1,6 @@
import base64
import calendar
import hashlib
import html
import re
import typing
@ -25,6 +26,7 @@ STRING_CONVERSION_REGEX_SUB = 9
STRING_CONVERSION_DATE_DECODE = 10
STRING_CONVERSION_INTEGER_ADDITION = 11
STRING_CONVERSION_DATE_ENCODE = 12
STRING_CONVERSION_HASH_FUNCTION = 13
conversion_type_str_lookup = {}
@ -41,6 +43,7 @@ conversion_type_str_lookup[ STRING_CONVERSION_REGEX_SUB ] = 'regex substitution'
conversion_type_str_lookup[ STRING_CONVERSION_DATE_DECODE ] = 'datestring to timestamp'
conversion_type_str_lookup[ STRING_CONVERSION_INTEGER_ADDITION ] = 'integer addition'
conversion_type_str_lookup[ STRING_CONVERSION_DATE_ENCODE ] = 'timestamp to datestring'
conversion_type_str_lookup[ STRING_CONVERSION_HASH_FUNCTION ] = 'get hash of string'
class StringProcessingStep( HydrusSerialisable.SerialisableBase ):
@ -307,7 +310,27 @@ class StringConverter( StringProcessingStep ):
s = str( int( s ) + int( delta ) )
elif conversion_type == STRING_CONVERSION_HASH_FUNCTION:
hash_function = data
if hash_function == 'md5':
s = hashlib.md5(s.encode('utf-8')).hexdigest()
elif hash_function == 'sha1':
s = hashlib.sha1(s.encode('utf-8')).hexdigest()
elif hash_function == 'sha256':
s = hashlib.sha256(s.encode('utf-8')).hexdigest()
elif hash_function == 'sha512':
s = hashlib.sha512(s.encode('utf-8')).hexdigest()
except Exception as e:
raise HydrusExceptions.StringConvertException( 'ERROR: Could not apply "' + self.ConversionToString( conversion ) + '" to string "' + repr( s ) + '":' + str( e ) )
@ -429,6 +452,10 @@ class StringConverter( StringProcessingStep ):
return 'integer addition: add ' + str( data )
elif conversion_type == STRING_CONVERSION_HASH_FUNCTION:
return 'hash string with ' + str( data )
else:
return 'unknown conversion'

View File

@ -530,6 +530,24 @@ class EditFileSeedCachePanel( ClientGUIScrolledPanels.EditPanel ):
ClientGUIMenus.AppendMenu( menu, url_submenu, 'additional urls' )
#
headers = selected_file_seed.GetHTTPHeaders()

# the seed's headers can be None (never set) or an empty dict (a parse that produced no headers);
# both mean there is nothing to list, so avoid presenting an empty submenu
if not headers:
    
    ClientGUIMenus.AppendMenuLabel( menu, 'no additional headers')
    
else:
    
    header_submenu = QW.QMenu( menu )
    
    # one read-only label per header, shown as 'name: value'
    for ( key, value ) in headers.items():
        
        ClientGUIMenus.AppendMenuLabel( header_submenu, key + ': ' + value )
        
    
    ClientGUIMenus.AppendMenu( menu, header_submenu, 'additional headers' )
#

View File

@ -704,7 +704,7 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._conversion_type = ClientGUICommon.BetterChoice( self._control_panel )
for t_type in ( ClientStrings.STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING, ClientStrings.STRING_CONVERSION_REMOVE_TEXT_FROM_END, ClientStrings.STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING, ClientStrings.STRING_CONVERSION_CLIP_TEXT_FROM_END, ClientStrings.STRING_CONVERSION_PREPEND_TEXT, ClientStrings.STRING_CONVERSION_APPEND_TEXT, ClientStrings.STRING_CONVERSION_ENCODE, ClientStrings.STRING_CONVERSION_DECODE, ClientStrings.STRING_CONVERSION_REVERSE, ClientStrings.STRING_CONVERSION_REGEX_SUB, ClientStrings.STRING_CONVERSION_DATE_DECODE, ClientStrings.STRING_CONVERSION_DATE_ENCODE, ClientStrings.STRING_CONVERSION_INTEGER_ADDITION ):
for t_type in ( ClientStrings.STRING_CONVERSION_REMOVE_TEXT_FROM_BEGINNING, ClientStrings.STRING_CONVERSION_REMOVE_TEXT_FROM_END, ClientStrings.STRING_CONVERSION_CLIP_TEXT_FROM_BEGINNING, ClientStrings.STRING_CONVERSION_CLIP_TEXT_FROM_END, ClientStrings.STRING_CONVERSION_PREPEND_TEXT, ClientStrings.STRING_CONVERSION_APPEND_TEXT, ClientStrings.STRING_CONVERSION_ENCODE, ClientStrings.STRING_CONVERSION_DECODE, ClientStrings.STRING_CONVERSION_REVERSE, ClientStrings.STRING_CONVERSION_REGEX_SUB, ClientStrings.STRING_CONVERSION_DATE_DECODE, ClientStrings.STRING_CONVERSION_DATE_ENCODE, ClientStrings.STRING_CONVERSION_INTEGER_ADDITION, ClientStrings.STRING_CONVERSION_HASH_FUNCTION ):
self._conversion_type.addItem( ClientStrings.conversion_type_str_lookup[ t_type ], t_type )
@ -718,6 +718,7 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._data_timezone_decode = ClientGUICommon.BetterChoice( self._control_panel )
self._data_timezone_encode = ClientGUICommon.BetterChoice( self._control_panel )
self._data_timezone_offset = ClientGUICommon.BetterSpinBox( self._control_panel, min=-86400, max=86400 )
self._data_hash_function = ClientGUICommon.BetterChoice( self._control_panel )
for e in ( 'hex', 'base64', 'url percent encoding', 'unicode escape characters', 'html entities' ):
@ -736,6 +737,11 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._data_timezone_encode.addItem( 'UTC', HC.TIMEZONE_GMT )
self._data_timezone_encode.addItem( 'Local', HC.TIMEZONE_LOCAL )
for e in ( 'md5', 'sha1', 'sha256', 'sha512' ):
self._data_hash_function.addItem( e, e )
#
self._example_panel = ClientGUICommon.StaticBox( self, 'test results' )
@ -799,6 +805,10 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._data_text.setText( phrase )
self._data_timezone_encode.SetValue( timezone_type )
elif conversion_type == ClientStrings.STRING_CONVERSION_HASH_FUNCTION:
self._data_hash_function.SetValue(data)
elif data is not None:
@ -827,6 +837,7 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._data_timezone_decode_label = ClientGUICommon.BetterStaticText( self, 'date decode timezone: ' )
self._data_timezone_offset_label = ClientGUICommon.BetterStaticText( self, 'timezone offset: ' )
self._data_timezone_encode_label = ClientGUICommon.BetterStaticText( self, 'date encode timezone: ' )
self._data_hash_function_label = ClientGUICommon.BetterStaticText( self, 'hashing function: ' )
rows.append( ( 'conversion type: ', self._conversion_type ) )
rows.append( ( self._data_text_label, self._data_text ) )
@ -838,6 +849,7 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
rows.append( ( self._data_timezone_decode_label, self._data_timezone_decode ) )
rows.append( ( self._data_timezone_offset_label, self._data_timezone_offset ) )
rows.append( ( self._data_timezone_encode_label, self._data_timezone_encode ) )
rows.append( ( self._data_hash_function_label, self._data_hash_function) )
self._control_gridbox = ClientGUICommon.WrapInGrid( self._control_panel, rows )
@ -879,6 +891,7 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._data_timezone_decode.currentIndexChanged.connect( self._UpdateExampleText )
self._data_timezone_offset.valueChanged.connect( self._UpdateExampleText )
self._data_timezone_encode.currentIndexChanged.connect( self._UpdateExampleText )
self._data_hash_function.currentIndexChanged.connect( self._UpdateExampleText )
self._data_timezone_decode.currentIndexChanged.connect( self._UpdateDataControls )
self._data_timezone_encode.currentIndexChanged.connect( self._UpdateDataControls )
@ -897,6 +910,7 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._data_timezone_decode_label.setVisible( False )
self._data_timezone_offset_label.setVisible( False )
self._data_timezone_encode_label.setVisible( False )
self._data_hash_function_label.setVisible( False )
self._data_text.setVisible( False )
self._data_number.setVisible( False )
@ -907,6 +921,7 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._data_timezone_decode.setVisible( False )
self._data_timezone_offset.setVisible( False )
self._data_timezone_encode.setVisible( False )
self._data_hash_function.setVisible( False )
conversion_type = self._conversion_type.GetValue()
@ -919,6 +934,11 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
self._data_decoding_label.setVisible( True )
self._data_decoding.setVisible( True )
elif conversion_type == ClientStrings.STRING_CONVERSION_HASH_FUNCTION:
self._data_hash_function_label.setVisible( True )
self._data_hash_function.setVisible( True )
elif conversion_type in ( ClientStrings.STRING_CONVERSION_PREPEND_TEXT, ClientStrings.STRING_CONVERSION_APPEND_TEXT, ClientStrings.STRING_CONVERSION_DATE_DECODE, ClientStrings.STRING_CONVERSION_DATE_ENCODE, ClientStrings.STRING_CONVERSION_REGEX_SUB ):
@ -1079,6 +1099,10 @@ class EditStringConverterPanel( ClientGUIScrolledPanels.EditPanel ):
data = ( phrase, timezone_time )
elif conversion_type == ClientStrings.STRING_CONVERSION_HASH_FUNCTION:
data = self._data_hash_function.GetValue()
else:
data = None

View File

@ -529,6 +529,7 @@ class EditContentParserPanel( ClientGUIScrolledPanels.EditPanel ):
types_to_str[ HC.CONTENT_TYPE_HASH ] = 'file hash'
types_to_str[ HC.CONTENT_TYPE_TIMESTAMP ] = 'timestamp'
types_to_str[ HC.CONTENT_TYPE_TITLE ] = 'watcher title'
types_to_str[ HC.CONTENT_TYPE_HTTP_HEADER ] = 'http header'
types_to_str[ HC.CONTENT_TYPE_VETO ] = 'veto'
types_to_str[ HC.CONTENT_TYPE_VARIABLE ] = 'temporary variable'
@ -602,6 +603,12 @@ class EditContentParserPanel( ClientGUIScrolledPanels.EditPanel ):
#
self._header_panel = QW.QWidget( self._content_panel )
self._header_name = QW.QLineEdit( self._header_panel )
#
self._veto_panel = QW.QWidget( self._content_panel )
self._veto_if_matches_found = QW.QCheckBox( self._veto_panel )
@ -663,6 +670,12 @@ class EditContentParserPanel( ClientGUIScrolledPanels.EditPanel ):
self._title_priority.setValue( priority )
elif content_type == HC.CONTENT_TYPE_HTTP_HEADER:
header_name = additional_info
self._header_name.setText(header_name)
elif content_type == HC.CONTENT_TYPE_VETO:
( veto_if_matches_found, string_match ) = additional_info
@ -750,6 +763,27 @@ class EditContentParserPanel( ClientGUIScrolledPanels.EditPanel ):
self._title_panel.setLayout( gridbox )
#
self._urls_panel.setLayout( gridbox )
rows = []
rows.append( ( 'header name: ', self._header_name ) )
gridbox = ClientGUICommon.WrapInGrid( self._header_panel, rows )
vbox = QP.VBoxLayout()
label = 'The value from this content parser will be used for the specified HTTP header on all URLs derived.'
st = ClientGUICommon.BetterStaticText( self._header_panel, label = label )
st.setWordWrap( True )
QP.AddToLayout( vbox, st, CC.FLAGS_EXPAND_PERPENDICULAR )
QP.AddToLayout( vbox, gridbox, CC.FLAGS_EXPAND_SIZER_PERPENDICULAR )
self._header_panel.setLayout( vbox )
#
vbox = QP.VBoxLayout()
@ -794,6 +828,7 @@ class EditContentParserPanel( ClientGUIScrolledPanels.EditPanel ):
self._content_panel.Add( self._hash_panel, CC.FLAGS_EXPAND_SIZER_PERPENDICULAR )
self._content_panel.Add( self._timestamp_panel, CC.FLAGS_EXPAND_SIZER_PERPENDICULAR )
self._content_panel.Add( self._title_panel, CC.FLAGS_EXPAND_SIZER_PERPENDICULAR )
self._content_panel.Add( self._header_panel, CC.FLAGS_EXPAND_SIZER_PERPENDICULAR )
self._content_panel.Add( self._veto_panel, CC.FLAGS_EXPAND_SIZER_PERPENDICULAR )
self._content_panel.Add( self._temp_variable_panel, CC.FLAGS_EXPAND_SIZER_PERPENDICULAR )
@ -842,6 +877,7 @@ class EditContentParserPanel( ClientGUIScrolledPanels.EditPanel ):
self._hash_panel.setVisible( False )
self._timestamp_panel.setVisible( False )
self._title_panel.setVisible( False )
self._header_panel.setVisible( False )
self._veto_panel.setVisible( False )
self._temp_variable_panel.setVisible( False )
@ -873,6 +909,10 @@ class EditContentParserPanel( ClientGUIScrolledPanels.EditPanel ):
elif content_type == HC.CONTENT_TYPE_TITLE:
self._title_panel.show()
elif content_type == HC.CONTENT_TYPE_HTTP_HEADER:
self._header_panel.show()
elif content_type == HC.CONTENT_TYPE_VETO:
@ -937,6 +977,12 @@ class EditContentParserPanel( ClientGUIScrolledPanels.EditPanel ):
additional_info = priority
elif content_type == HC.CONTENT_TYPE_HTTP_HEADER:
header_name = self._header_name.text()
additional_info = header_name
elif content_type == HC.CONTENT_TYPE_VETO:
veto_if_matches_found = self._veto_if_matches_found.isChecked()
@ -1222,7 +1268,7 @@ class EditPageParserPanel( ClientGUIScrolledPanels.EditPanel ):
#
permitted_content_types = [ HC.CONTENT_TYPE_URLS, HC.CONTENT_TYPE_MAPPINGS, HC.CONTENT_TYPE_NOTES, HC.CONTENT_TYPE_HASH, HC.CONTENT_TYPE_TIMESTAMP, HC.CONTENT_TYPE_TITLE, HC.CONTENT_TYPE_VETO ]
permitted_content_types = [ HC.CONTENT_TYPE_URLS, HC.CONTENT_TYPE_MAPPINGS, HC.CONTENT_TYPE_NOTES, HC.CONTENT_TYPE_HASH, HC.CONTENT_TYPE_TIMESTAMP, HC.CONTENT_TYPE_TITLE, HC.CONTENT_TYPE_HTTP_HEADER, HC.CONTENT_TYPE_VETO ]
self._content_parsers = EditContentParsersPanel( content_parsers_panel, self._test_panel.GetTestDataForChild, permitted_content_types )

View File

@ -139,6 +139,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
self._cloudflare_last_modified_time = None
self._referral_url = None
self._request_headers = None
self._external_filterable_tags = set()
self._external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()
@ -260,6 +261,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
self.status,
self.note,
self._referral_url,
self._request_headers,
serialisable_external_filterable_tags,
serialisable_external_additional_service_keys_to_tags,
serialisable_primary_urls,
@ -281,6 +283,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
self.status,
self.note,
self._referral_url,
self._request_headers,
serialisable_external_filterable_tags,
serialisable_external_additional_service_keys_to_tags,
serialisable_primary_urls,
@ -577,6 +580,10 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
network_job = network_job_factory( 'GET', file_url, temp_path = temp_path, referral_url = referral_url )

# _request_headers is initialised to None and only becomes a dict via SetRequestHeaders,
# so it must be checked before iterating (the metadata-fetch path in this class does the same)
if self._request_headers is not None:
    
    for ( key, value ) in self._request_headers.items():
        
        network_job.AddAdditionalHeader( key, value )
        
    

if override_bandwidth:
    
    network_job.OverrideBandwidth( 3 )
@ -903,6 +910,10 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
def GetSourceURLs( self ):
return set( self._source_urls )
# Return the extra HTTP request headers stored on this file seed.
# The stored object itself is returned (no copy is made). It may be None: the attribute is
# initialised to None, so callers need to handle that case before iterating.
def GetHTTPHeaders( self ):
return self._request_headers
def HasHash( self ):
@ -1169,6 +1180,11 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
def SetReferralURL( self, referral_url: str ):
self._referral_url = referral_url
# Store the extra HTTP headers to use for this file seed's requests.
# The given mapping is kept by reference, not copied. Elsewhere in this class each key/value
# pair is passed to the network job through AddAdditionalHeader.
def SetRequestHeaders( self, request_headers: dict ):
self._request_headers = request_headers
def SetStatus( self, status: int, note: str = '', exception = None ):
@ -1290,6 +1306,14 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
if self._request_headers is not None:
for ( key, value ) in self._request_headers.items():
network_job.AddAdditionalHeader( key, value )
HG.client_controller.network_engine.AddJob( network_job )
with network_job_presentation_context_factory( network_job ) as njpc:
@ -1412,6 +1436,8 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
self.CheckPreFetchMetadata( tag_import_options )
parsed_request_headers = ClientParsing.GetHTTPHeadersFromParseResults( parse_results )
desired_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
child_urls = []
@ -1471,6 +1497,8 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
duplicate_file_seed.file_seed_data = child_url
duplicate_file_seed.SetReferralURL( url_for_child_referral )
duplicate_file_seed.SetRequestHeaders( parsed_request_headers )
if self._referral_url is not None:

View File

@ -104,6 +104,7 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
self.note = ''
self._referral_url = None
self._request_headers = None
self._force_next_page_url_generation = False
@ -144,7 +145,8 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
self.modified,
self.status,
self.note,
self._referral_url
self._referral_url,
self._request_headers
)
@ -159,7 +161,8 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
self.modified,
self.status,
self.note,
self._referral_url
self._referral_url,
self._request_headers
) = serialisable_info
self._external_filterable_tags = set( serialisable_external_filterable_tags )
@ -261,6 +264,11 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
self._referral_url = referral_url
# Store the extra HTTP headers to use for this gallery seed's requests.
# The given mapping is kept by reference, not copied. Elsewhere in this class the headers are
# added to the gallery page network job and also handed on to a file seed via its SetRequestHeaders.
def SetRequestHeaders( self, request_headers: dict ):
self._request_headers = request_headers
def SetRunToken( self, run_token: bytes ):
self._run_token = run_token
@ -355,6 +363,10 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )

# _request_headers is initialised to None and only becomes a dict via SetRequestHeaders,
# so it must be checked before iterating
if self._request_headers is not None:
    
    for ( key, value ) in self._request_headers.items():
        
        network_job.AddAdditionalHeader( key, value )
        
    

network_job.SetGalleryToken( gallery_token_name )

network_job.OverrideBandwidth( 30 )
@ -404,6 +416,8 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
file_seed.SetReferralURL( url_for_child_referral )
file_seed.SetRequestHeaders( self._request_headers )
file_seeds = [ file_seed ]

View File

@ -43,6 +43,8 @@ def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_impor
for parse_results in all_parse_results:
parsed_request_headers = ClientParsing.GetHTTPHeadersFromParseResults( parse_results )
parsed_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
parsed_urls = HydrusData.DedupeList( parsed_urls )
@ -59,6 +61,8 @@ def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_impor
file_seed.SetReferralURL( source_url )
file_seed.SetRequestHeaders( parsed_request_headers )
file_seed.AddParseResults( parse_results, file_import_options )
file_seeds.append( file_seed )

View File

@ -164,6 +164,7 @@ CONTENT_TYPE_NOTES = 18
CONTENT_TYPE_FILE_VIEWING_STATS = 19
CONTENT_TYPE_TAG = 20
CONTENT_TYPE_DEFINITIONS = 21
CONTENT_TYPE_HTTP_HEADER = 22
content_type_string_lookup = {
CONTENT_TYPE_MAPPINGS : 'mappings',
@ -186,7 +187,8 @@ content_type_string_lookup = {
CONTENT_TYPE_TITLE : 'title',
CONTENT_TYPE_NOTES : 'notes',
CONTENT_TYPE_FILE_VIEWING_STATS : 'file viewing stats',
CONTENT_TYPE_DEFINITIONS : 'definitions'
CONTENT_TYPE_DEFINITIONS : 'definitions',
CONTENT_TYPE_HTTP_HEADER : 'http header'
}
CONTENT_UPDATE_ADD = 0