2020-05-20 21:36:02 +00:00
import collections
import http . cookiejar
import os
import re
import threading
import time
import unicodedata
import urllib . parse
2020-04-22 21:00:35 +00:00
from hydrus . client import ClientConstants as CC
from hydrus . client import ClientParsing
from hydrus . client import ClientThreading
2020-05-20 21:36:02 +00:00
from hydrus . client . networking import ClientNetworkingContexts
2020-04-22 21:00:35 +00:00
from hydrus . core import HydrusConstants as HC
from hydrus . core import HydrusGlobals as HG
from hydrus . core import HydrusData
from hydrus . core import HydrusExceptions
from hydrus . core import HydrusNetworking
from hydrus . core import HydrusSerialisable
2017-09-13 20:50:41 +00:00
2020-04-16 02:14:58 +00:00
def AddCookieToSession( session, name, value, domain, path, expires, secure = False, rest = None ):
    """Build an http.cookiejar.Cookie from the given fields and add it to the session's cookie jar.
    
    expires may be None for a session cookie; rest defaults to an empty dict.
    """
    
    if rest is None:
        
        rest = {}
        
    
    cookie = http.cookiejar.Cookie(
        version = 0,
        name = name,
        value = value,
        port = None,
        port_specified = False,
        domain = domain,
        domain_specified = True,
        domain_initial_dot = domain.startswith( '.' ),
        path = path,
        path_specified = True,
        secure = secure,
        expires = expires,
        discard = False,
        comment = None,
        comment_url = None,
        rest = rest
    )
    
    session.cookies.set_cookie( cookie )
2018-08-22 21:10:59 +00:00
def AlphabetiseQueryText( query_text ):
    """Re-serialise the query text with its parameters in sorted key order."""
    
    ( query_dict, _original_param_order ) = ConvertQueryTextToDict( query_text )
    
    # deliberately discard the original ordering--ConvertQueryDictToText sorts when given no order
    return ConvertQueryDictToText( query_dict )
2018-04-18 22:10:15 +00:00
2018-10-31 21:41:14 +00:00
def ConvertDomainIntoAllApplicableDomains( domain, discard_www = True ):
    """Return the domain and every parent domain above it, most specific first.
    
    IP addresses, localhost-style hosts (possibly with a port), and bare
    hostnames come back as a single-element list.
    """
    
    # is an ip address or localhost, possibly with a port
    looks_like_address = re.search( r'^[\d\.:]+$', domain ) is not None
    
    if '.' not in domain or looks_like_address:
        
        return [ domain ]
        
    
    if discard_www:
        
        domain = RemoveWWWFromDomain( domain )
        
    
    applicable_domains = []
    
    while '.' in domain:
        
        applicable_domains.append( domain )
        
        domain = ConvertDomainIntoNextLevelDomain( domain )
        
    
    return applicable_domains
2020-05-06 21:31:41 +00:00
def ConvertDomainIntoNextLevelDomain( domain ):
    """Strip the leftmost subdomain, i.e. maps.google.com -> google.com."""
    
    ( _leftmost_subdomain, _dot, parent_domain ) = domain.partition( '.' )
    
    return parent_domain
2017-11-01 20:37:39 +00:00
def ConvertDomainIntoSecondLevelDomain( domain ):
    """Return the registrable 'blah.com' form of the given domain.
    
    Raises HydrusExceptions.URLClassException when no applicable domain can be derived.
    """
    
    applicable_domains = ConvertDomainIntoAllApplicableDomains( domain )
    
    if not applicable_domains:
        
        raise HydrusExceptions.URLClassException( 'That url or domain did not seem to be valid!' )
        
    
    # the least specific entry is the second-level domain
    return applicable_domains[-1]
2018-04-25 22:07:52 +00:00
def ConvertHTTPSToHTTP( url ):
    """Swap an https:// scheme for http://; http urls pass through unchanged.
    
    Raises Exception for any other scheme.
    """
    
    if url.startswith( 'https://' ):
        
        return 'http://' + url[8:]
        
    
    if url.startswith( 'http://' ):
        
        return url
        
    
    raise Exception( 'Given a url that did not have a scheme!' )
def ConvertHTTPToHTTPS( url ):
    """Swap an http:// scheme for https://; https urls pass through unchanged.
    
    Raises Exception for any other scheme.
    """
    
    if url.startswith( 'http://' ):
        
        return 'https://' + url[7:]
        
    
    if url.startswith( 'https://' ):
        
        return url
        
    
    raise Exception( 'Given a url that did not have a scheme!' )
2019-11-28 01:11:46 +00:00
def ConvertQueryDictToText( query_dict, param_order = None ):
    """Serialise a query dict back into 'a=1&b=2' text.
    
    With no param_order, pairs come out in sorted key order; otherwise the given
    order is respected and keys missing from the dict are skipped.
    """
    
    # we now do everything with requests, which does all the unicode -> %20 business naturally, phew
    # we still want to call str explicitly to coerce integers and so on that'll slip in here and there
    
    if param_order is None:
        
        param_pairs = sorted( query_dict.items() )
        
    else:
        
        param_pairs = [ ( key, query_dict[ key ] ) for key in param_order if key in query_dict ]
        
    
    serialised_pairs = [ str( key ) + '=' + str( value ) for ( key, value ) in param_pairs ]
    
    return '&'.join( serialised_pairs )
def _UnquoteQueryComponentIfFullyReversible( text ):
    """Return the %xx-unquoted form of a query key/value, but only when requoting it exactly reproduces the input.
    
    Any parsing problem leaves the text untouched (best effort).
    """
    
    # we generally do not want quote characters, %20 stuff, in our urls. we would prefer properly formatted unicode
    # so, let's replace with the unquoted version
    # -but-
    # we only replace if it is a completely reversable operation!
    # odd situations like '6+girls+skirt', which comes here encoded as '6%2Bgirls+skirt', shouldn't turn into '6+girls+skirt'
    # so if there are a mix of encoded and non-encoded, we won't touch it here m8
    # except these chars, which screw with GET arg syntax when unquoted
    bad_chars = [ '&', '=', '/', '?', '#' ]
    
    try:
        
        unquoted_text = urllib.parse.unquote( text )
        
        if not any( bad_char in unquoted_text for bad_char in bad_chars ):
            
            if urllib.parse.quote( unquoted_text ) == text:
                
                return unquoted_text
                
            
        
    except Exception:
        
        # was a bare except--narrowed so SystemExit/KeyboardInterrupt are not swallowed
        pass
        
    
    return text
    

def ConvertQueryTextToDict( query_text ):
    """Parse 'a=1&b=2' query text into ( query_dict, param_order ).
    
    Keys and values are unquoted to nice unicode where that is fully reversible;
    pairs without an '=' are ignored entirely.
    """
    
    param_order = []
    query_dict = {}
    
    for pair in query_text.split( '&' ):
        
        result = pair.split( '=', 1 )
        
        # for the moment, ignore tracker bugs and so on that have only key and no value
        if len( result ) == 2:
            
            ( key, value ) = result
            
            key = _UnquoteQueryComponentIfFullyReversible( key )
            value = _UnquoteQueryComponentIfFullyReversible( value )
            
            param_order.append( key )
            
            query_dict[ key ] = value
            
        
    
    return ( query_dict, param_order )
2018-08-22 21:10:59 +00:00
2019-05-08 21:06:42 +00:00
def ConvertURLClassesIntoAPIPairs( url_classes ):
    """Return ( url_class, api_url_class ) pairs for every class whose API URL is matched by another class in the list."""
    
    url_classes = list( url_classes )
    
    # most complex first, so the first class matching an api url is the most specific one
    NetworkDomainManager.STATICSortURLClassesDescendingComplexity( url_classes )
    
    pairs = []
    
    for url_class in url_classes:
        
        if not url_class.UsesAPIURL():
            
            continue
            
        
        api_url = url_class.GetAPIURL( url_class.GetExampleURL() )
        
        api_match = next(
            ( other for other in url_classes if not ( other == url_class ) and other.Matches( api_url ) ),
            None
        )
        
        if api_match is not None:
            
            pairs.append( ( url_class, api_match ) )
            
        
    
    return pairs
2017-11-01 20:37:39 +00:00
2017-10-04 17:51:58 +00:00
def ConvertURLIntoDomain( url ):
    """Extract the netloc domain from a url.
    
    Raises HydrusExceptions.URLClassException when the scheme or domain is missing.
    """
    
    parser_result = ParseURL( url )
    
    if parser_result.scheme == '':
        
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--did you forget the http:// or https://?' )
        
    
    if parser_result.netloc == '':
        
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--is it missing a domain?' )
        
    
    return parser_result.netloc
2018-08-22 21:10:59 +00:00
def ConvertURLIntoSecondLevelDomain( url ):
    """Return the second-level 'blah.com' domain for the given url."""
    
    return ConvertDomainIntoSecondLevelDomain( ConvertURLIntoDomain( url ) )
2018-04-25 22:07:52 +00:00
def DomainEqualsAnotherForgivingWWW( test_domain, wwwable_domain ):
    """True if test_domain is wwwable_domain, optionally prefixed with www., www2., and so on."""
    
    # domain is either the same or starts with www. or www2. or something
    optional_www_prefix = r'^(www[^\.]*\.)?'
    
    pattern = optional_www_prefix + re.escape( wwwable_domain ) + '$'
    
    return re.search( pattern, test_domain ) is not None
2018-11-14 23:10:55 +00:00
def CookieDomainMatches( cookie, search_domain ):
    """True if the cookie's domain applies to search_domain per the usual cookie scoping rules."""
    
    cookie_domain = cookie.domain
    
    # blah.com is viewable by blah.com
    if cookie_domain == search_domain:
        
        return True
        
    
    # .blah.com is viewable by blah.com
    if cookie_domain == '.' + search_domain:
        
        return True
        
    
    # .blah.com applies to subdomain.blah.com, blah.com does not
    return cookie_domain.startswith( '.' ) and search_domain.endswith( cookie_domain )
def GetCookie( cookies, search_domain, cookie_name_string_match ):
    """Return the first cookie whose domain covers search_domain and whose name passes the string match.
    
    Raises HydrusExceptions.DataMissing when nothing fits.
    """
    
    for cookie in cookies:
        
        domain_fits = CookieDomainMatches( cookie, search_domain )
        
        if domain_fits and cookie_name_string_match.Matches( cookie.name ):
            
            return cookie
            
        
    
    raise HydrusExceptions.DataMissing( 'Cookie "' + cookie_name_string_match.ToString() + '" not found for domain ' + search_domain + '!' )
2017-11-01 20:37:39 +00:00
2018-04-25 22:07:52 +00:00
def GetSearchURLs( url ):
    """Expand a url into the set of plausible variants to search under.
    
    Includes the original and (when recognisable) normalised urls, plus
    http/https flips, www/no-www flips, and trailing-slash flips of each.
    """
    
    search_urls = { url }
    
    try:
        
        normalised_url = HG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        
        search_urls.add( normalised_url )
        
    except HydrusExceptions.URLClassException:
        
        # not a url we know--just search the raw variants
        pass
        
    
    # http/https flips
    
    for search_url in list( search_urls ):
        
        if search_url.startswith( 'http://' ):
            
            search_urls.add( ConvertHTTPToHTTPS( search_url ) )
            
        elif search_url.startswith( 'https://' ):
            
            search_urls.add( ConvertHTTPSToHTTP( search_url ) )
            
        
    
    # www/no-www flips
    
    for search_url in list( search_urls ):
        
        p = ParseURL( search_url )
        
        netloc = p.netloc
        
        if netloc.startswith( 'www' ):
            
            try:
                
                netloc = ConvertDomainIntoSecondLevelDomain( netloc )
                
            except HydrusExceptions.URLClassException:
                
                continue
                
            
        else:
            
            netloc = 'www.' + netloc
            
        
        # params and fragment are deliberately discarded here
        flipped = urllib.parse.ParseResult( p.scheme, netloc, p.path, '', p.query, '' )
        
        search_urls.add( flipped.geturl() )
        
    
    # trailing slash flips
    
    for search_url in list( search_urls ):
        
        if search_url.endswith( '/' ):
            
            search_urls.add( search_url[:-1] )
            
        else:
            
            search_urls.add( search_url + '/' )
            
        
    
    return search_urls
2020-05-13 19:03:16 +00:00
def ParseURL( url: str ) -> urllib.parse.ParseResult:
    """Strip and unicode-normalise the url, then run it through urllib's parser."""
    
    clean_url = UnicodeNormaliseURL( url.strip() )
    
    return urllib.parse.urlparse( clean_url )
# characters urllib refuses to parse in a netloc--we swap them for underscores
OH_NO_NO_NETLOC_CHARACTERS = '?#'
OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE = dict.fromkeys( map( ord, OH_NO_NO_NETLOC_CHARACTERS ), '_' )

def UnicodeNormaliseURL( url: str ):
    """NFKC-normalise the netloc portion of a url and swap out characters urllib cannot parse.
    
    file: urls are returned untouched; so are urls with no :// scheme separator.
    """
    
    if url.startswith( 'file:' ):
        
        return url
        
    
    # the issue is netloc, blah.com, cannot have certain unicode characters that look like others, or double ( e + accent ) characters that can be one accented-e, so we normalise
    # urllib.urlparse throws a valueerror if these are in, so let's switch out
    
    scheme_splitter = '://'
    netloc_splitter = '/'
    
    if scheme_splitter not in url:
        
        return url
        
    
    ( scheme, netloc_and_path_and_rest ) = url.split( scheme_splitter, 1 )
    
    ( netloc, slash, path_and_rest ) = netloc_and_path_and_rest.partition( netloc_splitter )
    
    netloc = unicodedata.normalize( 'NFKC', netloc )
    netloc = netloc.translate( OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE )
    
    scheme_and_netloc = scheme_splitter.join( ( scheme, netloc ) )
    
    if slash == '':
        
        # there was no path component at all
        return scheme_and_netloc
        
    
    return netloc_splitter.join( ( scheme_and_netloc, path_and_rest ) )
2017-10-04 17:51:58 +00:00
# url class validity states
VALID_DENIED = 0
VALID_APPROVED = 1
VALID_UNKNOWN = 2

valid_str_lookup = {
    VALID_DENIED: 'denied',
    VALID_APPROVED: 'approved',
    VALID_UNKNOWN: 'unknown'
}
2017-10-04 17:51:58 +00:00
class NetworkDomainManager( HydrusSerialisable.SerialisableBase ):
    """Holds gugs, url classes, parsers, default tag import options, and custom headers for the network engine."""
    
    # identity and version for hydrus's serialisable persistence system
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_NETWORK_DOMAIN_MANAGER
    SERIALISABLE_NAME = 'Domain Manager'
    SERIALISABLE_VERSION = 6
2017-09-13 20:50:41 +00:00
2017-10-04 17:51:58 +00:00
    def __init__( self ):
        """Initialise empty stores and lookup caches; real content arrives via deserialisation or setters."""
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        # set externally once this manager is plugged into the network engine
        self.engine = None
        
        # canonical serialisable stores
        self._gugs = HydrusSerialisable.SerialisableList()
        self._url_classes = HydrusSerialisable.SerialisableList()
        self._parsers = HydrusSerialisable.SerialisableList()
        self._network_contexts_to_custom_header_dicts = collections.defaultdict( dict )
        
        self._parser_namespaces = []
        
        self._gug_keys_to_display = set()
        self._url_class_keys_to_display = set()
        self._url_class_keys_to_parser_keys = HydrusSerialisable.SerialisableBytesDictionary()
        
        # fast-lookup cache, rebuilt by _RecalcCache
        self._second_level_domains_to_url_classes = collections.defaultdict( list )
        
        self._second_level_domains_to_network_infrastructure_errors = collections.defaultdict( list )
        
        # local import--presumably avoids a module-level circular import; verify before hoisting
        from hydrus.client.importing import ClientImportOptions
        
        self._file_post_default_tag_import_options = ClientImportOptions.TagImportOptions()
        self._watchable_default_tag_import_options = ClientImportOptions.TagImportOptions()
        
        self._url_class_keys_to_default_tag_import_options = {}
        
        # lookup caches, rebuilt by _RecalcCache
        self._gug_keys_to_gugs = {}
        self._gug_names_to_gugs = {}
        self._parser_keys_to_parsers = {}
        
        self._dirty = False
        
        self._lock = threading.Lock()
        
        self._RecalcCache()
2017-09-13 20:50:41 +00:00
2020-06-17 21:31:54 +00:00
def _CleanURLClassKeysToParserKeys ( self ) :
api_pairs = ConvertURLClassesIntoAPIPairs ( self . _url_classes )
# anything that goes to an api url will be parsed by that api's parser--it can't have its own
for ( a , b ) in api_pairs :
unparseable_url_class_key = a . GetClassKey ( )
if unparseable_url_class_key in self . _url_class_keys_to_parser_keys :
del self . _url_class_keys_to_parser_keys [ unparseable_url_class_key ]
2018-07-04 20:48:28 +00:00
def _GetDefaultTagImportOptionsForURL ( self , url ) :
2019-05-08 21:06:42 +00:00
url_class = self . _GetURLClass ( url )
2018-07-04 20:48:28 +00:00
2019-05-08 21:06:42 +00:00
if url_class is None or url_class . GetURLType ( ) not in ( HC . URL_TYPE_POST , HC . URL_TYPE_WATCHABLE ) :
2018-07-04 20:48:28 +00:00
2018-08-08 20:29:54 +00:00
return self . _file_post_default_tag_import_options
2018-07-04 20:48:28 +00:00
try :
2019-05-08 21:06:42 +00:00
( url_class , url ) = self . _GetNormalisedAPIURLClassAndURL ( url )
2018-07-04 20:48:28 +00:00
2019-05-08 21:06:42 +00:00
except HydrusExceptions . URLClassException :
2018-07-04 20:48:28 +00:00
2018-08-08 20:29:54 +00:00
return self . _file_post_default_tag_import_options
2018-07-04 20:48:28 +00:00
2018-04-18 22:10:15 +00:00
2020-03-18 21:35:57 +00:00
# some lad decided to api convert one url type to another
if url_class . GetURLType ( ) not in ( HC . URL_TYPE_POST , HC . URL_TYPE_WATCHABLE ) :
return self . _file_post_default_tag_import_options
2020-06-17 21:31:54 +00:00
url_class_key = url_class . GetClassKey ( )
2018-04-18 22:10:15 +00:00
2019-05-08 21:06:42 +00:00
if url_class_key in self . _url_class_keys_to_default_tag_import_options :
2018-04-18 22:10:15 +00:00
2019-05-08 21:06:42 +00:00
return self . _url_class_keys_to_default_tag_import_options [ url_class_key ]
2018-04-18 22:10:15 +00:00
else :
2019-05-08 21:06:42 +00:00
url_type = url_class . GetURLType ( )
2018-04-18 22:10:15 +00:00
2018-08-08 20:29:54 +00:00
if url_type == HC . URL_TYPE_POST :
2018-06-20 20:20:22 +00:00
2018-08-08 20:29:54 +00:00
return self . _file_post_default_tag_import_options
2018-06-20 20:20:22 +00:00
2018-08-08 20:29:54 +00:00
elif url_type == HC . URL_TYPE_WATCHABLE :
2018-06-20 20:20:22 +00:00
2018-08-08 20:29:54 +00:00
return self . _watchable_default_tag_import_options
2018-06-20 20:20:22 +00:00
2018-08-08 20:29:54 +00:00
else :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not find tag import options for that kind of URL Class! ' )
2018-06-20 20:20:22 +00:00
2018-04-18 22:10:15 +00:00
2018-09-05 20:52:32 +00:00
def _GetGUG ( self , gug_key_and_name ) :
( gug_key , gug_name ) = gug_key_and_name
if gug_key in self . _gug_keys_to_gugs :
return self . _gug_keys_to_gugs [ gug_key ]
elif gug_name in self . _gug_names_to_gugs :
return self . _gug_names_to_gugs [ gug_name ]
else :
return None
2019-05-08 21:06:42 +00:00
def _GetNormalisedAPIURLClassAndURL ( self , url ) :
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
url_class = self . _GetURLClass ( url )
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
if url_class is None :
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not find a URL Class for ' + url + ' ! ' )
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
seen_url_classes = set ( )
2018-06-06 21:27:02 +00:00
2019-05-08 21:06:42 +00:00
seen_url_classes . add ( url_class )
2018-06-06 21:27:02 +00:00
2019-05-08 21:06:42 +00:00
api_url_class = url_class
2018-05-30 20:13:21 +00:00
api_url = url
2019-05-08 21:06:42 +00:00
while api_url_class . UsesAPIURL ( ) :
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
api_url = api_url_class . GetAPIURL ( api_url )
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
api_url_class = self . _GetURLClass ( api_url )
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
if api_url_class is None :
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not find an API URL Class for ' + api_url + ' URL, which originally came from ' + url + ' ! ' )
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
if api_url_class in seen_url_classes :
2018-06-06 21:27:02 +00:00
2019-05-08 21:06:42 +00:00
loop_size = len ( seen_url_classes )
2018-06-06 21:27:02 +00:00
if loop_size == 1 :
message = ' Could not find an API URL Class for ' + url + ' as the url class API-linked to itself! '
elif loop_size == 2 :
message = ' Could not find an API URL Class for ' + url + ' as the url class and its API url class API-linked to each other! '
else :
2018-07-04 20:48:28 +00:00
message = ' Could not find an API URL Class for ' + url + ' as it and its API url classes linked in a loop of size ' + HydrusData . ToHumanInt ( loop_size ) + ' ! '
2018-06-06 21:27:02 +00:00
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( message )
2018-06-06 21:27:02 +00:00
2019-05-08 21:06:42 +00:00
seen_url_classes . add ( api_url_class )
2018-06-06 21:27:02 +00:00
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
api_url = api_url_class . Normalise ( api_url )
2018-05-30 20:13:21 +00:00
2019-05-08 21:06:42 +00:00
return ( api_url_class , api_url )
2018-05-30 20:13:21 +00:00
2017-10-11 17:38:14 +00:00
    def _GetSerialisableInfo( self ):
        """Flatten all state into the serialisable tuple consumed by _InitialiseFromSerialisableInfo."""
        
        serialisable_gugs = self._gugs.GetSerialisableTuple()
        # bytes keys are stored as hex strings
        serialisable_gug_keys_to_display = [ gug_key.hex() for gug_key in self._gug_keys_to_display ]
        
        serialisable_url_classes = self._url_classes.GetSerialisableTuple()
        serialisable_url_class_keys_to_display = [ url_class_key.hex() for url_class_key in self._url_class_keys_to_display ]
        serialisable_url_class_keys_to_parser_keys = self._url_class_keys_to_parser_keys.GetSerialisableTuple()
        
        serialisable_file_post_default_tag_import_options = self._file_post_default_tag_import_options.GetSerialisableTuple()
        serialisable_watchable_default_tag_import_options = self._watchable_default_tag_import_options.GetSerialisableTuple()
        serialisable_url_class_keys_to_default_tag_import_options = [ ( url_class_key.hex(), tag_import_options.GetSerialisableTuple() ) for ( url_class_key, tag_import_options ) in list( self._url_class_keys_to_default_tag_import_options.items() ) ]
        
        serialisable_default_tag_import_options_tuple = ( serialisable_file_post_default_tag_import_options, serialisable_watchable_default_tag_import_options, serialisable_url_class_keys_to_default_tag_import_options )
        
        serialisable_parsers = self._parsers.GetSerialisableTuple()
        serialisable_network_contexts_to_custom_header_dicts = [ ( network_context.GetSerialisableTuple(), list( custom_header_dict.items() ) ) for ( network_context, custom_header_dict ) in list( self._network_contexts_to_custom_header_dicts.items() ) ]
        
        return ( serialisable_gugs, serialisable_gug_keys_to_display, serialisable_url_classes, serialisable_url_class_keys_to_display, serialisable_url_class_keys_to_parser_keys, serialisable_default_tag_import_options_tuple, serialisable_parsers, serialisable_network_contexts_to_custom_header_dicts )
2017-10-11 17:38:14 +00:00
2019-05-08 21:06:42 +00:00
def _GetURLClass ( self , url ) :
2017-09-27 21:52:54 +00:00
2020-04-01 21:51:42 +00:00
domain = ConvertURLIntoSecondLevelDomain ( url )
2017-09-27 21:52:54 +00:00
2020-01-16 02:08:23 +00:00
if domain in self . _second_level_domains_to_url_classes :
2017-09-27 21:52:54 +00:00
2020-01-16 02:08:23 +00:00
url_classes = self . _second_level_domains_to_url_classes [ domain ]
2017-09-27 21:52:54 +00:00
2019-05-08 21:06:42 +00:00
for url_class in url_classes :
2017-09-27 21:52:54 +00:00
2017-11-22 21:03:07 +00:00
try :
2019-05-08 21:06:42 +00:00
url_class . Test ( url )
2017-11-22 21:03:07 +00:00
2019-05-08 21:06:42 +00:00
return url_class
2017-09-27 21:52:54 +00:00
2019-05-08 21:06:42 +00:00
except HydrusExceptions . URLClassException :
2017-11-22 21:03:07 +00:00
continue
2017-09-27 21:52:54 +00:00
return None
2019-02-06 22:41:35 +00:00
def _GetURLToFetchAndParser ( self , url ) :
try :
2019-05-08 21:06:42 +00:00
( parser_url_class , parser_url ) = self . _GetNormalisedAPIURLClassAndURL ( url )
2019-02-06 22:41:35 +00:00
2019-05-08 21:06:42 +00:00
except HydrusExceptions . URLClassException as e :
2019-02-06 22:41:35 +00:00
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not find a parser for ' + url + ' ! ' + os . linesep * 2 + str ( e ) )
2019-02-06 22:41:35 +00:00
2020-06-17 21:31:54 +00:00
url_class_key = parser_url_class . GetClassKey ( )
2019-02-06 22:41:35 +00:00
2019-05-08 21:06:42 +00:00
if url_class_key in self . _url_class_keys_to_parser_keys :
2019-02-06 22:41:35 +00:00
2019-05-08 21:06:42 +00:00
parser_key = self . _url_class_keys_to_parser_keys [ url_class_key ]
2019-02-06 22:41:35 +00:00
if parser_key is not None and parser_key in self . _parser_keys_to_parsers :
return ( parser_url , self . _parser_keys_to_parsers [ parser_key ] )
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not find a parser for ' + parser_url_class . GetName ( ) + ' URL Class! ' )
2019-02-06 22:41:35 +00:00
2017-10-11 17:38:14 +00:00
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Load this manager's state from the tuple produced by _GetSerialisableInfo."""
        
        ( serialisable_gugs, serialisable_gug_keys_to_display, serialisable_url_classes, serialisable_url_class_keys_to_display, serialisable_url_class_keys_to_parser_keys, serialisable_default_tag_import_options_tuple, serialisable_parsers, serialisable_network_contexts_to_custom_header_dicts ) = serialisable_info
        
        self._gugs = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gugs )
        
        # keys were serialised as hex strings--convert back to bytes
        self._gug_keys_to_display = { bytes.fromhex( serialisable_gug_key ) for serialisable_gug_key in serialisable_gug_keys_to_display }
        
        self._url_classes = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_url_classes )
        
        self._url_class_keys_to_display = { bytes.fromhex( serialisable_url_class_key ) for serialisable_url_class_key in serialisable_url_class_keys_to_display }
        self._url_class_keys_to_parser_keys = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_url_class_keys_to_parser_keys )
        
        ( serialisable_file_post_default_tag_import_options, serialisable_watchable_default_tag_import_options, serialisable_url_class_keys_to_default_tag_import_options ) = serialisable_default_tag_import_options_tuple
        
        self._file_post_default_tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_post_default_tag_import_options )
        self._watchable_default_tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_watchable_default_tag_import_options )
        
        self._url_class_keys_to_default_tag_import_options = { bytes.fromhex( serialisable_url_class_key ) : HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_import_options ) for ( serialisable_url_class_key, serialisable_tag_import_options ) in serialisable_url_class_keys_to_default_tag_import_options }
        
        self._parsers = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_parsers )
        
        self._network_contexts_to_custom_header_dicts = collections.defaultdict( dict )
        
        for ( serialisable_network_context, custom_header_dict_items ) in serialisable_network_contexts_to_custom_header_dicts:
            
            network_context = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_network_context )
            custom_header_dict = dict( custom_header_dict_items )
            
            self._network_contexts_to_custom_header_dicts[ network_context ] = custom_header_dict
2017-10-04 17:51:58 +00:00
def _RecalcCache ( self ) :
2020-01-16 02:08:23 +00:00
self . _second_level_domains_to_url_classes = collections . defaultdict ( list )
2017-09-13 20:50:41 +00:00
2019-05-08 21:06:42 +00:00
for url_class in self . _url_classes :
2017-10-04 17:51:58 +00:00
2020-01-16 02:08:23 +00:00
domain = ConvertDomainIntoSecondLevelDomain ( url_class . GetDomain ( ) )
2017-10-04 17:51:58 +00:00
2020-01-16 02:08:23 +00:00
self . _second_level_domains_to_url_classes [ domain ] . append ( url_class )
2017-10-04 17:51:58 +00:00
2017-09-13 20:50:41 +00:00
2020-01-16 02:08:23 +00:00
for url_classes in self . _second_level_domains_to_url_classes . values ( ) :
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
NetworkDomainManager . STATICSortURLClassesDescendingComplexity ( url_classes )
2017-12-06 22:06:56 +00:00
2018-09-05 20:52:32 +00:00
self . _gug_keys_to_gugs = { gug . GetGUGKey ( ) : gug for gug in self . _gugs }
self . _gug_names_to_gugs = { gug . GetName ( ) : gug for gug in self . _gugs }
2018-01-17 22:52:10 +00:00
2018-09-05 20:52:32 +00:00
self . _parser_keys_to_parsers = { parser . GetParserKey ( ) : parser for parser in self . _parsers }
2018-01-17 22:52:10 +00:00
2018-08-08 20:29:54 +00:00
namespaces = set ( )
for parser in self . _parsers :
namespaces . update ( parser . GetNamespaces ( ) )
2020-05-13 19:03:16 +00:00
self . _parser_namespaces = sorted ( namespaces )
2018-08-08 20:29:54 +00:00
2017-10-04 17:51:58 +00:00
def _SetDirty ( self ) :
2017-09-13 20:50:41 +00:00
2017-10-04 17:51:58 +00:00
self . _dirty = True
2017-09-13 20:50:41 +00:00
2017-12-06 22:06:56 +00:00
def _UpdateSerialisableInfo ( self , version , old_serialisable_info ) :
if version == 1 :
2019-05-08 21:06:42 +00:00
( serialisable_url_classes , serialisable_network_contexts_to_custom_header_dicts ) = old_serialisable_info
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
url_classes = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_url_classes )
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
url_class_names_to_display = { }
url_class_names_to_page_parser_keys = HydrusSerialisable . SerialisableBytesDictionary ( )
url_class_names_to_gallery_parser_keys = HydrusSerialisable . SerialisableBytesDictionary ( )
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
for url_class in url_classes :
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
name = url_class . GetName ( )
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
if url_class . IsPostURL ( ) :
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
url_class_names_to_display [ name ] = True
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
url_class_names_to_page_parser_keys [ name ] = None
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
if url_class . IsGalleryURL ( ) or url_class . IsWatchableURL ( ) :
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
url_class_names_to_gallery_parser_keys [ name ] = None
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
serialisable_url_class_names_to_display = list ( url_class_names_to_display . items ( ) )
serialisable_url_class_names_to_page_parser_keys = url_class_names_to_page_parser_keys . GetSerialisableTuple ( )
serialisable_url_class_names_to_gallery_parser_keys = url_class_names_to_gallery_parser_keys . GetSerialisableTuple ( )
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_url_classes , serialisable_url_class_names_to_display , serialisable_url_class_names_to_page_parser_keys , serialisable_url_class_names_to_gallery_parser_keys , serialisable_network_contexts_to_custom_header_dicts )
2017-12-06 22:06:56 +00:00
return ( 2 , new_serialisable_info )
2018-01-17 22:52:10 +00:00
if version == 2 :
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
( serialisable_url_classes , serialisable_url_class_names_to_display , serialisable_url_class_names_to_page_parser_keys , serialisable_url_class_names_to_gallery_parser_keys , serialisable_network_contexts_to_custom_header_dicts ) = old_serialisable_info
2017-12-06 22:06:56 +00:00
2018-01-17 22:52:10 +00:00
parsers = HydrusSerialisable . SerialisableList ( )
serialisable_parsing_parsers = parsers . GetSerialisableTuple ( )
2019-05-08 21:06:42 +00:00
url_class_names_to_display = dict ( serialisable_url_class_names_to_display )
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
url_class_keys_to_display = [ ]
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
url_class_names_to_gallery_parser_keys = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_url_class_names_to_gallery_parser_keys )
url_class_names_to_page_parser_keys = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_url_class_names_to_page_parser_keys )
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
url_class_keys_to_parser_keys = HydrusSerialisable . SerialisableBytesDictionary ( )
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
url_classes = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_url_classes )
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
for url_class in url_classes :
2017-12-06 22:06:56 +00:00
2020-06-17 21:31:54 +00:00
url_class_key = url_class . GetClassKey ( )
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
name = url_class . GetName ( )
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
if name in url_class_names_to_display and url_class_names_to_display [ name ] :
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
url_class_keys_to_display . append ( url_class_key )
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
serialisable_url_classes = url_classes . GetSerialisableTuple ( ) # added random key this week, so save these changes back again!
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
serialisable_url_class_keys_to_display = [ url_class_key . hex ( ) for url_class_key in url_class_keys_to_display ]
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
serialisable_url_class_keys_to_parser_keys = url_class_keys_to_parser_keys . GetSerialisableTuple ( )
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_url_classes , serialisable_url_class_keys_to_display , serialisable_url_class_keys_to_parser_keys , serialisable_parsing_parsers , serialisable_network_contexts_to_custom_header_dicts )
2018-01-17 22:52:10 +00:00
return ( 3 , new_serialisable_info )
2017-12-06 22:06:56 +00:00
2018-04-18 22:10:15 +00:00
if version == 3 :
2019-05-08 21:06:42 +00:00
( serialisable_url_classes , serialisable_url_class_keys_to_display , serialisable_url_class_keys_to_parser_keys , serialisable_parsing_parsers , serialisable_network_contexts_to_custom_header_dicts ) = old_serialisable_info
2018-04-18 22:10:15 +00:00
2020-04-22 21:00:35 +00:00
from hydrus . client . importing import ClientImportOptions
2018-04-18 22:10:15 +00:00
self . _file_post_default_tag_import_options = ClientImportOptions . TagImportOptions ( )
self . _watchable_default_tag_import_options = ClientImportOptions . TagImportOptions ( )
2019-05-08 21:06:42 +00:00
self . _url_class_keys_to_default_tag_import_options = { }
2018-04-18 22:10:15 +00:00
serialisable_file_post_default_tag_import_options = self . _file_post_default_tag_import_options . GetSerialisableTuple ( )
serialisable_watchable_default_tag_import_options = self . _watchable_default_tag_import_options . GetSerialisableTuple ( )
2019-05-08 21:06:42 +00:00
serialisable_url_class_keys_to_default_tag_import_options = [ ( url_class_key . hex ( ) , tag_import_options . GetSerialisableTuple ( ) ) for ( url_class_key , tag_import_options ) in list ( self . _url_class_keys_to_default_tag_import_options . items ( ) ) ]
2018-04-18 22:10:15 +00:00
2019-05-08 21:06:42 +00:00
serialisable_default_tag_import_options_tuple = ( serialisable_file_post_default_tag_import_options , serialisable_watchable_default_tag_import_options , serialisable_url_class_keys_to_default_tag_import_options )
2018-04-18 22:10:15 +00:00
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_url_classes , serialisable_url_class_keys_to_display , serialisable_url_class_keys_to_parser_keys , serialisable_default_tag_import_options_tuple , serialisable_parsing_parsers , serialisable_network_contexts_to_custom_header_dicts )
2018-04-18 22:10:15 +00:00
return ( 4 , new_serialisable_info )
2018-08-29 20:20:41 +00:00
if version == 4 :
2019-05-08 21:06:42 +00:00
( serialisable_url_classes , serialisable_url_class_keys_to_display , serialisable_url_class_keys_to_parser_keys , serialisable_default_tag_import_options_tuple , serialisable_parsing_parsers , serialisable_network_contexts_to_custom_header_dicts ) = old_serialisable_info
2018-08-29 20:20:41 +00:00
gugs = HydrusSerialisable . SerialisableList ( )
serialisable_gugs = gugs . GetSerialisableTuple ( )
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_gugs , serialisable_url_classes , serialisable_url_class_keys_to_display , serialisable_url_class_keys_to_parser_keys , serialisable_default_tag_import_options_tuple , serialisable_parsing_parsers , serialisable_network_contexts_to_custom_header_dicts )
2018-08-29 20:20:41 +00:00
return ( 5 , new_serialisable_info )
2018-09-05 20:52:32 +00:00
if version == 5 :
2019-05-08 21:06:42 +00:00
( serialisable_gugs , serialisable_url_classes , serialisable_url_class_keys_to_display , serialisable_url_class_keys_to_parser_keys , serialisable_default_tag_import_options_tuple , serialisable_parsing_parsers , serialisable_network_contexts_to_custom_header_dicts ) = old_serialisable_info
2018-09-05 20:52:32 +00:00
gugs = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_gugs )
gug_keys_to_display = [ gug . GetGUGKey ( ) for gug in gugs if ' ugoira ' not in gug . GetName ( ) ]
2019-01-09 22:59:03 +00:00
serialisable_gug_keys_to_display = [ gug_key . hex ( ) for gug_key in gug_keys_to_display ]
2018-09-05 20:52:32 +00:00
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_gugs , serialisable_gug_keys_to_display , serialisable_url_classes , serialisable_url_class_keys_to_display , serialisable_url_class_keys_to_parser_keys , serialisable_default_tag_import_options_tuple , serialisable_parsing_parsers , serialisable_network_contexts_to_custom_header_dicts )
2018-09-05 20:52:32 +00:00
return ( 6 , new_serialisable_info )
def AddGUGs( self, new_gugs ):
    """Append the given GUGs to our list, renaming incoming ones so names stay unique."""
    with self._lock:
        updated_gugs = list( self._gugs )
        for new_gug in new_gugs:
            taken_names = [ existing.GetName() for existing in updated_gugs ]
            new_gug.SetNonDupeName( taken_names )
            updated_gugs.append( new_gug )
    # SetGUGs takes the lock itself, so it must be called outside the 'with'
    self.SetGUGs( updated_gugs )
def AddParsers( self, new_parsers ):
    """Append the given parsers to our list, renaming incoming ones so names stay unique."""
    with self._lock:
        updated_parsers = list( self._parsers )
        for new_parser in new_parsers:
            taken_names = [ existing.GetName() for existing in updated_parsers ]
            new_parser.SetNonDupeName( taken_names )
            updated_parsers.append( new_parser )
    # SetParsers takes the lock itself, so it must be called outside the 'with'
    self.SetParsers( updated_parsers )
2017-12-06 22:06:56 +00:00
2019-05-08 21:06:42 +00:00
def AddURLClasses( self, new_url_classes ):
    """Append the given url classes to our list, renaming incoming ones so names stay unique."""
    with self._lock:
        updated_url_classes = list( self._url_classes )
        for new_url_class in new_url_classes:
            taken_names = [ existing.GetName() for existing in updated_url_classes ]
            new_url_class.SetNonDupeName( taken_names )
            updated_url_classes.append( new_url_class )
    # SetURLClasses takes the lock itself, so it must be called outside the 'with'
    self.SetURLClasses( updated_url_classes )
2018-09-19 21:54:51 +00:00
2018-10-03 21:00:15 +00:00
def AlreadyHaveExactlyTheseHeaders( self, network_context, headers_list ):
    """True if this context's stored custom headers match headers_list exactly.
    
    Only keys and values are compared; approval state and reason are ignored.
    """
    with self._lock:
        custom_headers_dict = self._network_contexts_to_custom_header_dicts.get( network_context )
        if custom_headers_dict is None:
            return False
        if len( headers_list ) != len( custom_headers_dict ):
            return False
        for ( key, value, reason ) in headers_list:
            if key not in custom_headers_dict:
                return False
            ( existing_value, existing_approved, existing_reason ) = custom_headers_dict[ key ]
            if existing_value != value:
                return False
        return True
2018-09-19 21:54:51 +00:00
def AlreadyHaveExactlyThisGUG( self, new_gug ):
    """True if a stored gug is identical to new_gug once key/name are normalised away."""
    with self._lock:
        # absent irrelevant variables, do we have the exact same object already in?
        gug_key_and_name = new_gug.GetGUGKeyAndName()
        new_dump = new_gug.DumpToString()
        for existing_gug in self._gugs:
            dupe_gug = existing_gug.Duplicate()
            dupe_gug.SetGUGKeyAndName( gug_key_and_name )
            if dupe_gug.DumpToString() == new_dump:
                return True
    return False
def AlreadyHaveExactlyThisParser( self, new_parser ):
    """Return True if a stored parser is identical to new_parser, ignoring name/key/examples.
    
    Side effect on a match: the stored parser absorbs new_parser's example urls
    and the manager is marked dirty.
    """
    with self._lock:
        # absent irrelevant variables, do we have the exact same object already in?
        new_name = new_parser.GetName()
        new_parser_key = new_parser.GetParserKey()
        new_example_urls = new_parser.GetExampleURLs()
        new_example_parsing_context = new_parser.GetExampleParsingContext()
        # keep ( copy, original ) pairs: the copy is normalised for comparison,
        # the original is the object we mutate if we find a match
        dupe_parsers = [ ( parser.Duplicate(), parser ) for parser in self._parsers ]
        for ( dupe_parser, parser ) in dupe_parsers:
            # overwrite the 'irrelevant' variables on the copy so the serialised
            # dump comparison only reflects actual parsing content
            dupe_parser.SetName( new_name )
            dupe_parser.SetParserKey( new_parser_key )
            dupe_parser.SetExampleURLs( new_example_urls )
            dupe_parser.SetExampleParsingContext( new_example_parsing_context )
            if dupe_parser.DumpToString() == new_parser.DumpToString():
                # since these are the 'same', let's merge example urls
                parser_example_urls = set( parser.GetExampleURLs() )
                parser_example_urls.update( new_example_urls )
                parser_example_urls = list( parser_example_urls )
                parser.SetExampleURLs( parser_example_urls )
                self._SetDirty()
                return True
    return False
2019-05-08 21:06:42 +00:00
def AlreadyHaveExactlyThisURLClass( self, new_url_class ):
    """True if a stored url class is identical to new_url_class once name/key/example url are normalised away."""
    with self._lock:
        # absent irrelevant variables, do we have the exact same object already in?
        name = new_url_class.GetName()
        match_key = new_url_class.GetClassKey()
        example_url = new_url_class.GetExampleURL()
        new_dump = new_url_class.DumpToString()
        for existing_url_class in self._url_classes:
            dupe_url_class = existing_url_class.Duplicate()
            dupe_url_class.SetName( name )
            dupe_url_class.SetClassKey( match_key )
            dupe_url_class.SetExampleURL( example_url )
            if dupe_url_class.DumpToString() == new_dump:
                return True
    return False
2018-10-03 21:00:15 +00:00
def AutoAddDomainMetadatas( self, domain_metadatas, approved = False ):
    """Install custom header sets from the given domain metadata objects, one domain context per object."""
    for domain_metadata in domain_metadatas:
        if not domain_metadata.HasHeaders():
            continue
        with self._lock:
            domain = domain_metadata.GetDomain()
            network_context = ClientNetworkingContexts.NetworkContext( CC.NETWORK_CONTEXT_DOMAIN, domain )
            headers_list = domain_metadata.GetHeaders()
            self._network_contexts_to_custom_header_dicts[ network_context ] = { key : ( value, approved, reason ) for ( key, value, reason ) in headers_list }
2019-05-08 21:06:42 +00:00
def AutoAddURLClassesAndParsers( self, new_url_classes, dupe_url_classes, new_parsers ):
    """Import url classes and parsers from a downloader package and wire them together.
    
    new_url_classes/new_parsers get fresh keys; existing url classes that do the
    same job as incoming ones are kept but renamed out of the way; finally the
    incoming objects are linked url-class-key -> parser-key.
    """
    for url_class in new_url_classes:
        url_class.RegenerateClassKey()
    for parser in new_parsers:
        parser.RegenerateParserKey()
    # any existing url matches that already do the job of the new ones should be hung on to but renamed
    with self._lock:
        prefix = 'zzz - renamed due to auto-import - '
        renamees = []
        for existing_url_class in self._url_classes:
            if existing_url_class.GetName().startswith( prefix ):
                # already shunted aside by a previous auto-import
                continue
            for new_url_class in new_url_classes:
                if new_url_class.Matches( existing_url_class.GetExampleURL() ) and existing_url_class.Matches( new_url_class.GetExampleURL() ):
                    # the url matches match each other, so they are doing the same job
                    renamees.append( existing_url_class )
                    break
        for renamee in renamees:
            existing_names = [ url_class.GetName() for url_class in self._url_classes if url_class != renamee ]
            renamee.SetName( prefix + renamee.GetName() )
            renamee.SetNonDupeName( existing_names )
    # these take the lock themselves, so call them outside the 'with'
    self.AddURLClasses( new_url_classes )
    self.AddParsers( new_parsers )
    # we want to match these url matches and parsers together if possible
    with self._lock:
        url_classes_to_link = list( new_url_classes )
        # if downloader adds existing url match but updated parser, we want to update the existing link
        for dupe_url_class in dupe_url_classes:
            # this is to make sure we have the right match keys for the link update in a minute
            actual_existing_dupe_url_class = self._GetURLClass( dupe_url_class.GetExampleURL() )
            if actual_existing_dupe_url_class is not None:
                url_classes_to_link.append( actual_existing_dupe_url_class )
        new_url_class_keys_to_parser_keys = NetworkDomainManager.STATICLinkURLClassesAndParsers( url_classes_to_link, new_parsers, {} )
        self._url_class_keys_to_parser_keys.update( new_url_class_keys_to_parser_keys )
        self._CleanURLClassKeysToParserKeys()
    # let's do a trytolink just in case there are loose ends due to some dupe being discarded earlier (e.g. url match is new, but parser was not).
    self.TryToLinkURLClassesAndParsers()
2017-10-04 17:51:58 +00:00
def CanValidateInPopup( self, network_contexts ):
    """Whether header validation for these contexts can be handled by a popup dialog.
    
    Header validation never needs anything beyond the popup, so this is
    unconditionally True; network_contexts is currently unused.
    """
    # we can always do this for headers
    return True
2017-09-27 21:52:54 +00:00
2017-12-06 22:06:56 +00:00
def ConvertURLsToMediaViewerTuples( self, urls ):
    """Return sorted ( pretty name, url ) tuples suitable for the media viewer.
    
    Matched urls show their url class name (capped at 10); unmatched urls are
    optionally appended with their domain, depending on user options.
    """
    show_unmatched_urls_in_media_viewer = HG.client_controller.new_options.GetBoolean( 'show_unmatched_urls_in_media_viewer' )
    url_tuples = []
    unmatched_url_tuples = []
    with self._lock:
        for url in urls:
            try:
                url_class = self._GetURLClass( url )
            except HydrusExceptions.URLClassException:
                # unparseable url; just skip it
                continue
            if url_class is None:
                if show_unmatched_urls_in_media_viewer:
                    try:
                        domain = ConvertURLIntoDomain( url )
                    except HydrusExceptions.URLClassException:
                        continue
                    unmatched_url_tuples.append( ( domain, url ) )
            else:
                url_class_key = url_class.GetClassKey()
                # only display classes the user has ticked
                if url_class_key in self._url_class_keys_to_display:
                    url_class_name = url_class.GetName()
                    url_tuples.append( ( url_class_name, url ) )
            # cap matched urls at 10; unmatched urls are not capped here
            if len( url_tuples ) == 10:
                break
    url_tuples.sort()
    unmatched_url_tuples.sort()
    # unmatched go at the end, after the matched ones
    url_tuples.extend( unmatched_url_tuples )
    return url_tuples
2018-09-05 20:52:32 +00:00
def DeleteGUGs( self, deletee_names ):
    """Remove every gug whose name is in deletee_names."""
    with self._lock:
        kept_gugs = [ gug for gug in self._gugs if gug.GetName() not in deletee_names ]
    # SetGUGs takes the lock itself
    self.SetGUGs( kept_gugs )
2020-01-16 02:08:23 +00:00
def DeleteURLClasses( self, deletee_names ):
    """Remove every url class whose name is in deletee_names."""
    with self._lock:
        kept_url_classes = [ url_class for url_class in self._url_classes if url_class.GetName() not in deletee_names ]
    # SetURLClasses takes the lock itself
    self.SetURLClasses( kept_url_classes )
2020-04-16 00:09:42 +00:00
def DomainOK( self, url ):
    """Return False if this url's second-level domain has had too many recent network infrastructure errors.
    
    The threshold and time window come from user options; a threshold of 0
    disables the check entirely. Unparseable urls always count as ok.
    """
    with self._lock:
        try:
            domain = ConvertURLIntoSecondLevelDomain( url )
        except Exception:
            # was a bare 'except:', which would also swallow SystemExit/KeyboardInterrupt;
            # an unparseable url is simply treated as ok
            return True
        number_of_errors = HG.client_controller.new_options.GetInteger( 'domain_network_infrastructure_error_number' )
        error_time_delta = HG.client_controller.new_options.GetInteger( 'domain_network_infrastructure_error_time_delta' )
        if number_of_errors == 0:
            # the check is disabled
            return True
        # this will become flexible and customisable when I have domain profiles/status/ui
        # also should extend it to 'global', so if multiple domains are having trouble, we maybe assume the whole connection is down? it would really be nicer to have a better sockets-level check there
        if domain in self._second_level_domains_to_network_infrastructure_errors:
            network_infrastructure_errors = self._second_level_domains_to_network_infrastructure_errors[ domain ]
            # drop error timestamps that have aged out of the window
            network_infrastructure_errors = [ timestamp for timestamp in network_infrastructure_errors if not HydrusData.TimeHasPassed( timestamp + error_time_delta ) ]
            self._second_level_domains_to_network_infrastructure_errors[ domain ] = network_infrastructure_errors
            if len( network_infrastructure_errors ) >= number_of_errors:
                return False
            elif len( network_infrastructure_errors ) == 0:
                # nothing recent; tidy the entry away
                del self._second_level_domains_to_network_infrastructure_errors[ domain ]
        return True
2017-10-11 17:38:14 +00:00
def GenerateValidationPopupProcess( self, network_contexts ):
    """Build a popup process that asks the user about still-unvalidated custom headers for these contexts."""
    with self._lock:
        header_tuples = []
        for network_context in network_contexts:
            custom_header_dict = self._network_contexts_to_custom_header_dicts.get( network_context )
            if custom_header_dict is None:
                continue
            for ( key, ( value, approved, reason ) ) in custom_header_dict.items():
                if approved == VALID_UNKNOWN:
                    header_tuples.append( ( network_context, key, value, reason ) )
        process = DomainValidationPopupProcess( self, header_tuples )
        return process
2017-09-13 20:50:41 +00:00
2017-10-04 17:51:58 +00:00
2017-09-27 21:52:54 +00:00
2018-09-05 20:52:32 +00:00
def GetDefaultGUGKeyAndName( self ):
    """Fetch the user's default gug ( key, name ) pair from the options."""
    with self._lock:
        new_options = HG.client_controller.new_options
        gug_key = new_options.GetKey( 'default_gug_key' )
        gug_name = new_options.GetString( 'default_gug_name' )
        return ( gug_key, gug_name )
2018-09-05 20:52:32 +00:00
2018-04-18 22:10:15 +00:00
def GetDefaultTagImportOptions( self ):
    """Return ( file post default, watchable default, per-url-class defaults )."""
    with self._lock:
        defaults = ( self._file_post_default_tag_import_options, self._watchable_default_tag_import_options, self._url_class_keys_to_default_tag_import_options )
        return defaults
2018-04-18 22:10:15 +00:00
2018-07-11 20:23:51 +00:00
def GetDefaultTagImportOptionsForPosts( self ):
    """Return a copy of the default tag import options used for file post urls."""
    with self._lock:
        duplicate = self._file_post_default_tag_import_options.Duplicate()
        return duplicate
2018-04-18 22:10:15 +00:00
def GetDefaultTagImportOptionsForURL( self, url ):
    """Public, locked wrapper around _GetDefaultTagImportOptionsForURL."""
    with self._lock:
        tag_import_options = self._GetDefaultTagImportOptionsForURL( url )
        return tag_import_options
2018-04-18 22:10:15 +00:00
2017-09-27 21:52:54 +00:00
def GetDownloader( self, url ):
    """Stub: will eventually map a url to its downloader. Currently does nothing and returns None."""
    with self._lock:
        # this might be better as getdownloaderkey, but we'll see how it shakes out
        # might also be worth being a getifhasdownloader
        # match the url to a url_class, then lookup that in a 'this downloader can handle this url_class type' dict that we'll manage
        pass
2018-09-05 20:52:32 +00:00
def GetGUG( self, gug_key_and_name ):
    """Public, locked wrapper around _GetGUG."""
    with self._lock:
        gug = self._GetGUG( gug_key_and_name )
        return gug
2018-08-22 21:10:59 +00:00
def GetGUGs( self ):
    """Return a shallow copy of the gug list."""
    with self._lock:
        gugs = list( self._gugs )
    return gugs
2018-09-05 20:52:32 +00:00
def GetGUGKeysToDisplay( self ):
    """Return a copy of the set of gug keys the user wants displayed."""
    with self._lock:
        displayed = set( self._gug_keys_to_display )
    return displayed
2017-10-11 17:38:14 +00:00
def GetHeaders( self, network_contexts ):
    """Collect approved custom headers for the given contexts; later contexts override earlier ones on key clash."""
    with self._lock:
        headers = {}
        for network_context in network_contexts:
            custom_header_dict = self._network_contexts_to_custom_header_dicts.get( network_context )
            if custom_header_dict is None:
                continue
            for ( key, ( value, approved, reason ) ) in custom_header_dict.items():
                if approved == VALID_APPROVED:
                    headers[ key ] = value
        return headers
2018-09-05 20:52:32 +00:00
def GetInitialSearchText( self, gug_key_and_name ):
    """Return the search-box prompt for the given gug, or a placeholder if it is unknown."""
    with self._lock:
        gug = self._GetGUG( gug_key_and_name )
        if gug is None:
            return 'unknown downloader'
        return gug.GetInitialSearchText()
2017-10-11 17:38:14 +00:00
def GetNetworkContextsToCustomHeaderDicts( self ):
    """Return a shallow copy of the context -> custom header dict mapping."""
    with self._lock:
        mapping = dict( self._network_contexts_to_custom_header_dicts )
    return mapping
2018-06-20 20:20:22 +00:00
def GetParser( self, name ):
    """Return the first parser with the given name, or None if there is no match."""
    with self._lock:
        return next( ( parser for parser in self._parsers if parser.GetName() == name ), None )
2018-01-24 23:09:42 +00:00
def GetParsers( self ):
    """Return a shallow copy of the parser list."""
    with self._lock:
        parsers = list( self._parsers )
    return parsers
2018-08-08 20:29:54 +00:00
def GetParserNamespaces( self ):
    """Return a shallow copy of the cached parser namespace list."""
    with self._lock:
        namespaces = list( self._parser_namespaces )
    return namespaces
2019-10-09 22:03:03 +00:00
def GetReferralURL( self, url, referral_url ):
    """Let the url's class adjust the referral url; unmatched urls pass it through unchanged."""
    with self._lock:
        url_class = self._GetURLClass( url )
        if url_class is None:
            return referral_url
        return url_class.GetReferralURL( url, referral_url )
2018-10-03 21:00:15 +00:00
def GetShareableCustomHeaders( self, network_context ):
    """Return this context's custom headers as shareable ( key, value, reason ) tuples; approval state is stripped."""
    with self._lock:
        custom_header_dict = self._network_contexts_to_custom_header_dicts.get( network_context, {} )
        return [ ( key, value, reason ) for ( key, ( value, approved, reason ) ) in custom_header_dict.items() ]
2019-05-08 21:06:42 +00:00
def GetURLClass( self, url ):
    """Public, locked wrapper around _GetURLClass."""
    with self._lock:
        url_class = self._GetURLClass( url )
        return url_class
2018-05-09 20:23:00 +00:00
2019-05-08 21:06:42 +00:00
def GetURLClasses( self ):
    """Return a shallow copy of the url class list."""
    with self._lock:
        url_classes = list( self._url_classes )
    return url_classes
2017-11-29 21:48:23 +00:00
2019-05-08 21:06:42 +00:00
def GetURLClassKeysToParserKeys( self ):
    """Return a shallow copy of the url-class-key -> parser-key link map."""
    with self._lock:
        links = dict( self._url_class_keys_to_parser_keys )
    return links
2018-09-05 20:52:32 +00:00
2019-05-08 21:06:42 +00:00
def GetURLClassKeysToDisplay( self ):
    """Return a copy of the set of url class keys the user wants displayed."""
    with self._lock:
        displayed = set( self._url_class_keys_to_display )
    return displayed
2017-12-06 22:06:56 +00:00
2018-01-17 22:52:10 +00:00
def GetURLParseCapability( self, url ):
    """Return ( url type, url class name, can_parse ) for the given url.
    
    Unmatched urls report ( URL_TYPE_UNKNOWN, 'unknown url', False ).
    """
    with self._lock:
        url_class = self._GetURLClass( url )
        if url_class is None:
            return ( HC.URL_TYPE_UNKNOWN, 'unknown url', False )
        url_type = url_class.GetURLType()
        match_name = url_class.GetName()
        try:
            # we only care whether this resolves, not what it resolves to
            self._GetURLToFetchAndParser( url )
            can_parse = True
        except HydrusExceptions.URLClassException:
            can_parse = False
        return ( url_type, match_name, can_parse )
2018-01-17 22:52:10 +00:00
2018-02-07 23:40:33 +00:00
def GetURLToFetchAndParser( self, url ):
    """Resolve url to ( url_to_fetch, parser ), logging the lookup in network report mode."""
    with self._lock:
        result = self._GetURLToFetchAndParser( url )
        if HG.network_report_mode:
            ( url_to_fetch, parser ) = result
            url_class = self._GetURLClass( url )
            url_name = url_class.GetName()
            # NOTE(review): if url_to_fetch has no matching class this is a None deref;
            # presumably _GetURLToFetchAndParser guarantees a match — confirm
            url_to_fetch_match = self._GetURLClass( url_to_fetch )
            url_to_fetch_name = url_to_fetch_match.GetName()
            HydrusData.ShowText( 'request for URL to fetch and parser: {} ({}) -> {} ({}): {}'.format( url, url_name, url_to_fetch, url_to_fetch_name, parser.GetName() ) )
        return result
2018-01-17 22:52:10 +00:00
2018-10-03 21:00:15 +00:00
def HasCustomHeaders( self, network_context ):
    """True if this context has at least one custom header stored."""
    with self._lock:
        custom_header_dict = self._network_contexts_to_custom_header_dicts.get( network_context )
        return custom_header_dict is not None and len( custom_header_dict ) > 0
2017-12-06 22:06:56 +00:00
def Initialise( self ):
    """Rebuild the derived caches; called once after the manager is loaded."""
    self._RecalcCache()
2017-10-11 17:38:14 +00:00
def IsDirty( self ):
    """True if there are unsaved changes."""
    with self._lock:
        dirty = self._dirty
    return dirty
2017-10-04 17:51:58 +00:00
def IsValid( self, network_contexts ):
    """False if any custom header for these contexts is still awaiting user validation."""
    # for now, let's say that denied headers are simply not added, not that they invalidate a query
    for network_context in network_contexts:
        custom_header_dict = self._network_contexts_to_custom_header_dicts.get( network_context )
        if custom_header_dict is None:
            continue
        if any( approved == VALID_UNKNOWN for ( value, approved, reason ) in custom_header_dict.values() ):
            return False
    return True
2017-09-27 21:52:54 +00:00
def NormaliseURL( self, url ):
    """Normalise a url: matched urls use their class's rules; unmatched urls just get their query alphabetised."""
    with self._lock:
        url_class = self._GetURLClass( url )
        if url_class is not None:
            return url_class.Normalise( url )
        # no class, so the best we can do is a deterministic query order
        p = ParseURL( url )
        tidy_query = AlphabetiseQueryText( p.query )
        rebuilt = urllib.parse.ParseResult( p.scheme, p.netloc, p.path, p.params, tidy_query, p.fragment )
        return rebuilt.geturl()
2017-09-13 20:50:41 +00:00
2018-09-05 20:52:32 +00:00
def OverwriteDefaultGUGs( self, gug_names ):
    """Replace the named gugs with fresh defaults; same-named existing gugs keep their keys."""
    with self._lock:
        from hydrus.client import ClientDefaults
        default_gugs = ClientDefaults.GetDefaultGUGs()
        existing_names_to_keys = { gug.GetName() : gug.GetGUGKey() for gug in self._gugs }
        for gug in default_gugs:
            gug_name = gug.GetName()
            if gug_name in existing_names_to_keys:
                # preserve the old key so references to this gug survive
                gug.SetGUGKey( existing_names_to_keys[ gug_name ] )
            else:
                gug.RegenerateGUGKey()
        kept = [ gug for gug in self._gugs if gug.GetName() not in gug_names ]
        replacements = [ gug for gug in default_gugs if gug.GetName() in gug_names ]
    # SetGUGs takes the lock itself
    self.SetGUGs( kept + replacements )
2018-05-09 20:23:00 +00:00
def OverwriteDefaultParsers( self, parser_names ):
    """Replace the named parsers with fresh defaults; same-named existing parsers keep their keys."""
    with self._lock:
        from hydrus.client import ClientDefaults
        default_parsers = ClientDefaults.GetDefaultParsers()
        existing_names_to_keys = { parser.GetName() : parser.GetParserKey() for parser in self._parsers }
        for parser in default_parsers:
            name = parser.GetName()
            if name in existing_names_to_keys:
                # preserve the old key so url-class links to this parser survive
                parser.SetParserKey( existing_names_to_keys[ name ] )
            else:
                parser.RegenerateParserKey()
        kept = [ parser for parser in self._parsers if parser.GetName() not in parser_names ]
        replacements = [ parser for parser in default_parsers if parser.GetName() in parser_names ]
    # SetParsers takes the lock itself
    self.SetParsers( kept + replacements )
2019-05-08 21:06:42 +00:00
def OverwriteDefaultURLClasses( self, url_class_names ):
    """Replace the named url classes with fresh defaults.
    
    Same-named existing classes keep their class keys so parser links and
    display settings survive the overwrite; genuinely new classes get fresh
    keys. (A stray second loop used to regenerate every key after the
    preservation pass, defeating it; it has been removed.)
    """
    with self._lock:
        from hydrus.client import ClientDefaults
        default_url_classes = ClientDefaults.GetDefaultURLClasses()
        existing_class_names_to_keys = { url_class.GetName() : url_class.GetClassKey() for url_class in self._url_classes }
        for url_class in default_url_classes:
            name = url_class.GetName()
            if name in existing_class_names_to_keys:
                # preserve the old key so url-class -> parser links keep working
                url_class.SetClassKey( existing_class_names_to_keys[ name ] )
            else:
                url_class.RegenerateClassKey()
        existing_url_classes = list( self._url_classes )
        new_url_classes = [ url_class for url_class in existing_url_classes if url_class.GetName() not in url_class_names ]
        new_url_classes.extend( [ url_class for url_class in default_url_classes if url_class.GetName() in url_class_names ] )
    # SetURLClasses takes the lock itself
    self.SetURLClasses( new_url_classes )
2018-05-09 20:23:00 +00:00
2019-05-08 21:06:42 +00:00
def OverwriteParserLink( self, url_class, parser ):
    """Force this url class to use this parser, replacing any existing link."""
    with self._lock:
        self._url_class_keys_to_parser_keys[ url_class.GetClassKey() ] = parser.GetParserKey()
2018-06-20 20:20:22 +00:00
2020-04-16 00:09:42 +00:00
def ReportNetworkInfrastructureError( self, url ):
    """Record a network infrastructure error timestamp for this url's second-level domain.
    
    DomainOK reads _second_level_domains_to_network_infrastructure_errors keyed
    by second-level domain, so we must record under the same key — the previous
    ConvertURLIntoDomain recorded full subdomains, which DomainOK never found.
    """
    with self._lock:
        try:
            domain = ConvertURLIntoSecondLevelDomain( url )
        except Exception:
            # was a bare 'except:'; an unparseable url gives us nothing to record
            return
        self._second_level_domains_to_network_infrastructure_errors[ domain ].append( HydrusData.GetNow() )
2017-10-04 17:51:58 +00:00
def SetClean( self ):
    """Mark the manager as having no unsaved changes (typically after a save)."""
    with self._lock:
        self._dirty = False
2018-09-26 19:05:12 +00:00
def SetDefaultGUGKeyAndName( self, gug_key_and_name ):
    """Store the user's default gug ( key, name ) pair in the options."""
    with self._lock:
        ( gug_key, gug_name ) = gug_key_and_name
        new_options = HG.client_controller.new_options
        new_options.SetKey( 'default_gug_key', gug_key )
        new_options.SetString( 'default_gug_name', gug_name )
2019-05-08 21:06:42 +00:00
def SetDefaultTagImportOptions( self, file_post_default_tag_import_options, watchable_default_tag_import_options, url_class_keys_to_tag_import_options ):
    """Replace all three default tag import option stores and mark the manager dirty."""
    with self._lock:
        self._file_post_default_tag_import_options = file_post_default_tag_import_options
        self._watchable_default_tag_import_options = watchable_default_tag_import_options
        self._url_class_keys_to_default_tag_import_options = url_class_keys_to_tag_import_options
        self._SetDirty()
2018-04-18 22:10:15 +00:00
2018-08-22 21:10:59 +00:00
def SetGUGs( self, gugs ):
    """Replace the gug list; gugs we have never seen before default to being displayed."""
    with self._lock:
        # by default, we will show new gugs
        previous_keys = { gug.GetGUGKey() for gug in self._gugs }
        incoming_keys = { gug.GetGUGKey() for gug in gugs }
        self._gug_keys_to_display.update( incoming_keys.difference( previous_keys ) )
        #
        self._gugs = HydrusSerialisable.SerialisableList( gugs )
        self._RecalcCache()
        self._SetDirty()
def SetGUGKeysToDisplay( self, gug_keys_to_display ):
    """Replace the set of displayed gug keys and mark the manager dirty."""
    with self._lock:
        self._gug_keys_to_display = set( gug_keys_to_display )
        self._SetDirty()
2018-08-22 21:10:59 +00:00
2017-10-04 17:51:58 +00:00
def SetHeaderValidation( self, network_context, key, approved ):
    """Update the approval state of one custom header, if it exists, and mark the manager dirty."""
    with self._lock:
        custom_header_dict = self._network_contexts_to_custom_header_dicts.get( network_context )
        if custom_header_dict is not None and key in custom_header_dict:
            ( value, old_approved, reason ) = custom_header_dict[ key ]
            custom_header_dict[ key ] = ( value, approved, reason )
        self._SetDirty()
2017-10-04 17:51:58 +00:00
2017-10-11 17:38:14 +00:00
def SetNetworkContextsToCustomHeaderDicts( self, network_contexts_to_custom_header_dicts ):
    """Replace the whole context -> custom header mapping and mark the manager dirty."""
    with self._lock:
        self._network_contexts_to_custom_header_dicts = network_contexts_to_custom_header_dicts
        self._SetDirty()
2017-10-04 17:51:58 +00:00
2018-01-24 23:09:42 +00:00
def SetParsers( self, parsers ):
    """Replace the parser list, sort it by name, and drop url-class links to now-missing parsers."""
    with self._lock:
        self._parsers = HydrusSerialisable.SerialisableList()
        self._parsers.extend( parsers )
        self._parsers.sort( key = lambda p: p.GetName() )
        # delete orphans
        parser_keys = { parser.GetParserKey() for parser in parsers }
        deletee_url_class_keys = set()
        for ( url_class_key, parser_key ) in self._url_class_keys_to_parser_keys.items():
            if parser_key not in parser_keys:
                # this link points at a parser that no longer exists
                deletee_url_class_keys.add( url_class_key )
        # collected first, deleted second: we cannot delete while iterating the dict
        for deletee_url_class_key in deletee_url_class_keys:
            del self._url_class_keys_to_parser_keys[ deletee_url_class_key ]
        #
        self._RecalcCache()
        self._SetDirty()
2019-05-08 21:06:42 +00:00
def SetURLClasses( self, url_classes ):
    """Replace the stored url classes and tidy all dependent state.
    
    Newly added post url classes are displayed by default; display and parser
    links pointing at now-deleted url classes are pruned; url classes that
    redirect through an API conversion lose their direct parser link.
    """
    
    with self._lock:
        
        # by default, we will show post urls
        
        previously_known_post_keys = { url_class.GetClassKey() for url_class in self._url_classes if url_class.IsPostURL() }
        incoming_post_keys = { url_class.GetClassKey() for url_class in url_classes if url_class.IsPostURL() }
        
        self._url_class_keys_to_display.update( incoming_post_keys - previously_known_post_keys )
        
        #
        
        self._url_classes = HydrusSerialisable.SerialisableList()
        self._url_classes.extend( url_classes )
        self._url_classes.sort( key = lambda u: u.GetName() )
        
        # delete orphans
        
        surviving_keys = { url_class.GetClassKey() for url_class in url_classes }
        
        self._url_class_keys_to_display.intersection_update( surviving_keys )
        
        for orphaned_key in set( self._url_class_keys_to_parser_keys ) - surviving_keys:
            
            del self._url_class_keys_to_parser_keys[ orphaned_key ]
            
        
        # any url classes that link to another via the API conversion will not be using parsers
        
        for ( api_linked_url_class, api_url_class ) in ConvertURLClassesIntoAPIPairs( self._url_classes ):
            
            api_linked_key = api_linked_url_class.GetClassKey()
            
            if api_linked_key in self._url_class_keys_to_parser_keys:
                
                del self._url_class_keys_to_parser_keys[ api_linked_key ]
                
            
        
        self._RecalcCache()
        
        self._SetDirty()
2019-05-08 21:06:42 +00:00
def SetURLClassKeysToParserKeys( self, url_class_keys_to_parser_keys ):
    """Replace the url-class -> parser mapping, clean it of bad links, and mark dirty."""
    
    with self._lock:
        
        fresh_mapping = HydrusSerialisable.SerialisableBytesDictionary()
        
        fresh_mapping.update( url_class_keys_to_parser_keys )
        
        self._url_class_keys_to_parser_keys = fresh_mapping
        
        self._CleanURLClassKeysToParserKeys()
        
        self._SetDirty()
2019-05-08 21:06:42 +00:00
def SetURLClassKeysToDisplay( self, url_class_keys_to_display ):
    """Replace the set of url class keys whose urls should be displayed."""
    
    with self._lock:
        
        # take a fresh copy so the caller's collection is not shared
        self._url_class_keys_to_display = set( url_class_keys_to_display )
        
        self._SetDirty()
2018-04-18 22:10:15 +00:00
def ShouldAssociateURLWithFiles( self, url ):
    """Return whether this url should be remembered on files it produces.
    
    Unrecognised urls default to True.
    """
    
    with self._lock:
        
        matched_class = self._GetURLClass( url )
        
        return True if matched_class is None else matched_class.ShouldAssociateWithFiles()
2018-04-18 22:10:15 +00:00
2019-05-08 21:06:42 +00:00
def TryToLinkURLClassesAndParsers( self ):
    """Auto-link any unlinked url classes to parsers via example-url matching,
    then clean the mapping and mark dirty.
    """
    
    with self._lock:
        
        discovered_links = NetworkDomainManager.STATICLinkURLClassesAndParsers( self._url_classes, self._parsers, self._url_class_keys_to_parser_keys )
        
        self._url_class_keys_to_parser_keys.update( discovered_links )
        
        self._CleanURLClassKeysToParserKeys()
        
        self._SetDirty()
2018-05-09 20:23:00 +00:00
def URLCanReferToMultipleFiles( self, url ):
    """Return whether this url's class says it can produce multiple files.
    
    Unrecognised urls default to False.
    """
    
    with self._lock:
        
        matched_class = self._GetURLClass( url )
        
        return False if matched_class is None else matched_class.CanReferToMultipleFiles()
2018-04-25 22:07:52 +00:00
2018-04-18 22:10:15 +00:00
def URLDefinitelyRefersToOneFile( self, url ):
    """Return whether this url's class says it maps to exactly one file.
    
    Unrecognised urls default to False.
    """
    
    with self._lock:
        
        matched_class = self._GetURLClass( url )
        
        return False if matched_class is None else matched_class.RefersToOneFile()
2018-04-18 22:10:15 +00:00
2018-02-07 23:40:33 +00:00
@staticmethod
def STATICLinkURLClassesAndParsers( url_classes, parsers, existing_url_class_keys_to_parser_keys ):
    """Suggest new url-class -> parser links based on parsers' example urls.
    
    Returns a dict of { url_class_key : parser_key } containing only links that
    do not already exist in existing_url_class_keys_to_parser_keys. Inputs are
    not mutated (local copies are sorted).
    """
    
    # work on copies: we sort both lists locally
    url_classes = list( url_classes )
    
    # most complex/precise classes first, so lookups below find the 'correct' match first
    NetworkDomainManager.STATICSortURLClassesDescendingComplexity( url_classes )
    
    parsers = list( parsers )
    parsers.sort( key = lambda p: p.GetName() )
    
    new_url_class_keys_to_parser_keys = {}
    
    api_pairs = ConvertURLClassesIntoAPIPairs( url_classes )
    
    # anything that goes to an api url will be parsed by that api's parser--it can't have its own
    api_pair_unparsable_url_classes = set()
    
    for ( a, b ) in api_pairs:
        
        api_pair_unparsable_url_classes.add( a )
        
    
    #
    # I have to do this backwards, going through parsers and then url_classes, so I can do a proper url match lookup like the real domain manager does it
    # otherwise, if we iterate through url matches looking for parsers to match them, we have gallery url matches thinking they match parser post urls
    # e.g.
    # The page parser might say it supports https://danbooru.donmai.us/posts/3198277
    # But the gallery url class might think it recognises that as https://danbooru.donmai.us/posts
    #
    # So we have to do the normal lookup in the proper descending complexity order, not searching any further than the first, correct match
    for parser in parsers:
        
        example_urls = parser.GetExampleURLs()
        
        for example_url in example_urls:
            
            for url_class in url_classes:
                
                # api-redirected classes never get their own parser
                if url_class in api_pair_unparsable_url_classes:
                    
                    continue
                    
                
                if url_class.Matches( example_url ):
                    
                    # we have a match. this is the 'correct' match for this example url, and we should not search any more, so we break below
                    url_class_key = url_class.GetClassKey()
                    
                    parsable = url_class.IsParsable()
                    # don't overwrite an existing link or one we made earlier in this pass
                    linkable = url_class_key not in existing_url_class_keys_to_parser_keys and url_class_key not in new_url_class_keys_to_parser_keys
                    
                    if parsable and linkable:
                        
                        new_url_class_keys_to_parser_keys[ url_class_key ] = parser.GetParserKey()
                        
                    
                    break
                    
                
            
        
    # NOTE(review): the string below is dead code kept from an older forwards-iterating implementation
    '''
    #
    
    for url_class in url_classes:
        
        if not url_class.IsParsable() or url_class in api_pair_unparsable_url_classes:
            
            continue
            
        
        url_class_key = url_class.GetClassKey()
        
        if url_class_key in existing_url_class_keys_to_parser_keys:
            
            continue
            
        
        for parser in parsers:
            
            example_urls = parser.GetExampleURLs()
            
            if True in ( url_class.Matches( example_url ) for example_url in example_urls ):
                
                new_url_class_keys_to_parser_keys[ url_class_key ] = parser.GetParserKey()
                
                break
                
            
        
    '''
    
    return new_url_class_keys_to_parser_keys
2018-08-01 20:44:57 +00:00
@staticmethod
def STATICSortURLClassesDescendingComplexity( url_classes ):
    """Sort the given list in place so the most precise url classes come first.
    
    Precision ranks file > post > gallery/watchable; within a rank, an example
    url with more path components and more query parameters sorts earlier, so
    e.g. a 'post url/manga subpage' class is tried before a plain 'post url'.
    """
    
    # higher precision value = tried earlier (we sort in reverse)
    precision_lookup = { HC.URL_TYPE_FILE : 2, HC.URL_TYPE_POST : 1 }
    
    def sort_key( url_class ):
        
        precision = precision_lookup.get( url_class.GetURLType(), 0 )
        
        example_url = url_class.GetExampleURL()
        
        # '/' count ~ path depth, '=' count ~ number of query params
        return ( precision, example_url.count( '/' ), example_url.count( '=' ) )
        
    
    url_classes.sort( key = sort_key, reverse = True )
2018-02-07 23:40:33 +00:00
2017-10-04 17:51:58 +00:00
# register with the serialisation system so saved data can be rehydrated into this class
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_NETWORK_DOMAIN_MANAGER ] = NetworkDomainManager
2018-10-03 21:00:15 +00:00
class DomainMetadataPackage( HydrusSerialisable.SerialisableBase ):
    """A serialisable bundle of per-domain metadata: optional custom headers
    and optional bandwidth rules for one domain.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_DOMAIN_METADATA_PACKAGE
    SERIALISABLE_NAME = 'Domain Metadata'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, domain = None, headers_list = None, bandwidth_rules = None ):
        """
        domain: the domain this metadata applies to
        headers_list: a list of ( key, value, reason ) tuples, or None
        bandwidth_rules: a bandwidth rules object, or None
        """
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        if domain is None:
            
            domain = 'example.com'
            
        
        self._domain = domain
        self._headers_list = headers_list
        self._bandwidth_rules = bandwidth_rules
        
    
    def _GetSerialisableInfo( self ):
        
        # bandwidth rules are themselves serialisable; a None passes through untouched
        if self._bandwidth_rules is None:
            
            serialisable_bandwidth_rules = self._bandwidth_rules
            
        else:
            
            serialisable_bandwidth_rules = self._bandwidth_rules.GetSerialisableTuple()
            
        
        return ( self._domain, self._headers_list, serialisable_bandwidth_rules )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._domain, self._headers_list, serialisable_bandwidth_rules ) = serialisable_info
        
        if serialisable_bandwidth_rules is None:
            
            self._bandwidth_rules = serialisable_bandwidth_rules
            
        else:
            
            self._bandwidth_rules = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_bandwidth_rules )
            
        
    
    def GetBandwidthRules( self ):
        """Return the bandwidth rules object, or None if this package carries none."""
        
        return self._bandwidth_rules
        
    
    def GetDetailedSafeSummary( self ):
        """Return a multi-line, human-readable description of this package's contents."""
        
        components = [ 'For domain "' + self._domain + '":' ]
        
        if self.HasBandwidthRules():
            
            m = 'Bandwidth rules:'
            m += os.linesep
            m += os.linesep.join( [ HydrusNetworking.ConvertBandwidthRuleToString( rule ) for rule in self._bandwidth_rules.GetRules() ] )
            
            components.append( m )
            
        
        if self.HasHeaders():
            
            m = 'Headers:'
            m += os.linesep
            m += os.linesep.join( [ key + ': ' + value + ' - ' + reason for ( key, value, reason ) in self._headers_list ] )
            
            components.append( m )
            
        
        joiner = os.linesep * 2
        
        s = joiner.join( components )
        
        return s
        
    
    def GetDomain( self ):
        
        return self._domain
        
    
    def GetHeaders( self ):
        """Return the list of ( key, value, reason ) header tuples, or None."""
        
        return self._headers_list
        
    
    def GetSafeSummary( self ):
        """Return a one-line summary of what this package contains."""
        
        components = []
        
        if self.HasBandwidthRules():
            
            components.append( 'bandwidth rules' )
            
        
        if self.HasHeaders():
            
            components.append( 'headers' )
            
        
        return ' and '.join( components ) + ' - ' + self._domain
        
    
    def HasBandwidthRules( self ):
        
        return self._bandwidth_rules is not None
        
    
    def HasHeaders( self ):
        
        return self._headers_list is not None
        
    

# register with the serialisation system so saved data can be rehydrated into this class
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_DOMAIN_METADATA_PACKAGE ] = DomainMetadataPackage
2017-10-11 17:38:14 +00:00
class DomainValidationPopupProcess( object ):
    """Asks the user, via yes/no popup messages, to approve or deny a batch of
    custom headers, then records each answer in the domain manager.
    
    Start() blocks while polling for each popup's answer, so it is expected to
    run on a worker thread; IsDone() lets callers observe completion.
    """
    
    def __init__( self, domain_manager, header_tuples ):
        
        # domain_manager: receives SetHeaderValidation() calls with the user's answers
        # header_tuples: iterable of ( network_context, key, value, reason )
        self._domain_manager = domain_manager
        
        self._header_tuples = header_tuples
        
        self._is_done = False
        
    
    def IsDone( self ):
        """Return True once Start() has finished (or aborted on shutdown)."""
        
        return self._is_done
        
    
    def Start( self ):
        """Ask about each header in turn; blocks until all are answered or the client shuts down."""
        
        try:
            
            # NOTE(review): 'results' is never used below; looks like a leftover local
            results = []
            
            for ( network_context, key, value, reason ) in self._header_tuples:
                
                job_key = ClientThreading.JobKey()
                
                # generate question
                
                question = 'For the network context ' + network_context.ToString() + ', can the client set this header?'
                question += os.linesep * 2
                question += key + ': ' + value
                question += os.linesep * 2
                question += reason
                
                job_key.SetVariable( 'popup_yes_no_question', question )
                
                HG.client_controller.pub( 'message', job_key )
                
                # poll until the user answers; bail out entirely on client shutdown
                result = job_key.GetIfHasVariable( 'popup_yes_no_answer' )
                
                while result is None:
                    
                    if HG.view_shutdown:
                        
                        return
                        
                    
                    time.sleep( 0.25 )
                    
                    result = job_key.GetIfHasVariable( 'popup_yes_no_answer' )
                    
                
                if result:
                    
                    approved = VALID_APPROVED
                    
                else:
                    
                    approved = VALID_DENIED
                    
                
                self._domain_manager.SetHeaderValidation( network_context, key, approved )
                
            
        finally:
            
            # always flip the flag so watchers do not wait forever
            self._is_done = True
2017-09-13 20:50:41 +00:00
2018-08-15 20:40:30 +00:00
# how a gallery url encodes its page index: as a path component or as a query parameter
GALLERY_INDEX_TYPE_PATH_COMPONENT = 0
GALLERY_INDEX_TYPE_PARAMETER = 1
2018-08-22 21:10:59 +00:00
class GalleryURLGenerator( HydrusSerialisable.SerialisableBaseNamed ):
    """Turns a user's search text into a gallery url by substituting the
    (separator-joined, percent-encoded) search terms into a url template.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_URL_GENERATOR
    SERIALISABLE_NAME = 'Gallery URL Generator'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name, gug_key = None, url_template = None, replacement_phrase = None, search_terms_separator = None, initial_search_text = None, example_search_text = None ):
        """
        gug_key: unique id for this GUG; generated fresh if None
        url_template: url containing the replacement phrase, e.g. '...?q=%tags%'
        replacement_phrase: the token in the template replaced by search terms
        search_terms_separator: joins multiple search terms, e.g. '+'
        initial_search_text: placeholder shown in the search box
        example_search_text: used to build the example url
        """
        
        if gug_key is None:
            
            gug_key = HydrusData.GenerateKey()
            
        
        if url_template is None:
            
            url_template = 'https://example.com/search?q=%tags%&index=0'
            
        
        if replacement_phrase is None:
            
            replacement_phrase = '%tags%'
            
        
        if search_terms_separator is None:
            
            search_terms_separator = '+'
            
        
        if initial_search_text is None:
            
            initial_search_text = 'search tags'
            
        
        if example_search_text is None:
            
            example_search_text = 'blue_eyes blonde_hair'
            
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        self._gallery_url_generator_key = gug_key
        self._url_template = url_template
        self._replacement_phrase = replacement_phrase
        self._search_terms_separator = search_terms_separator
        self._initial_search_text = initial_search_text
        self._example_search_text = example_search_text
        
    
    def _GetSerialisableInfo( self ):
        
        # the key is bytes, so store it as hex
        serialisable_gallery_url_generator_key = self._gallery_url_generator_key.hex()
        
        return ( serialisable_gallery_url_generator_key, self._url_template, self._replacement_phrase, self._search_terms_separator, self._initial_search_text, self._example_search_text )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_gallery_url_generator_key, self._url_template, self._replacement_phrase, self._search_terms_separator, self._initial_search_text, self._example_search_text ) = serialisable_info
        
        self._gallery_url_generator_key = bytes.fromhex( serialisable_gallery_url_generator_key )
        
    
    def GenerateGalleryURL( self, query_text ):
        """Build a gallery url for the given query text.
        
        Raises HydrusExceptions.GUGException if the template/replacement phrase
        are inconsistent or substitution fails.
        """
        
        if self._replacement_phrase == '':
            
            raise HydrusExceptions.GUGException( 'No replacement phrase!' )
            
        
        if self._replacement_phrase not in self._url_template:
            
            raise HydrusExceptions.GUGException( 'Replacement phrase not in URL template!' )
            
        
        ( first_part, second_part ) = self._url_template.split( self._replacement_phrase, 1 )
        
        # no '?' before the replacement phrase implies the terms land in the url path
        search_phrase_seems_to_go_in_path = '?' not in first_part
        
        search_terms = query_text.split( ' ' )
        
        # if a user enters "%20" in a query, or any other percent-encoded char, we turn it into human here, lest it be re-quoted in a moment
        # if a user enters "%25", i.e. "%", followed by some characters, then all bets are off
        search_terms = [ urllib.parse.unquote( search_term ) for search_term in search_terms ]
        
        if search_phrase_seems_to_go_in_path:
            
            # encode all this gubbins since requests won't be able to do it
            # this basically fixes e621 searches for 'male/female', which through some httpconf trickery are embedded in path but end up in a query, so need to be encoded right beforehand
            
            encoded_search_terms = [ urllib.parse.quote( search_term, safe = '' ) for search_term in search_terms ]
            
        else:
            
            encoded_search_terms = []
            
            for search_term in search_terms:
                
                # when the tags separator is '+' but the tags include '6+girls', we run into fun internet land
                
                bad_chars = [ self._search_terms_separator, '&', '=', '/', '?', '#' ]
                
                if True in ( bad_char in search_term for bad_char in bad_chars ):
                    
                    search_term = urllib.parse.quote( search_term, safe = '' )
                    
                
                encoded_search_terms.append( search_term )
                
            
        
        try:
            
            search_phrase = self._search_terms_separator.join( encoded_search_terms )
            
            gallery_url = self._url_template.replace( self._replacement_phrase, search_phrase )
            
        except Exception as e:
            
            raise HydrusExceptions.GUGException( str( e ) )
            
        
        return gallery_url
        
    
    def GenerateGalleryURLs( self, query_text ):
        """Return a tuple of gallery urls (a single one for this simple GUG)."""
        
        return ( self.GenerateGalleryURL( query_text ), )
        
    
    def GetExampleURL( self ):
        
        return self.GenerateGalleryURL( self._example_search_text )
        
    
    def GetExampleURLs( self ):
        
        return ( self.GetExampleURL(), )
        
    
    def GetGUGKey( self ):
        
        return self._gallery_url_generator_key
        
    
    def GetGUGKeyAndName( self ):
        
        return ( self._gallery_url_generator_key, self._name )
        
    
    def GetInitialSearchText( self ):
        
        return self._initial_search_text
        
    
    def GetSafeSummary( self ):
        """Return a short summary safe to show in the ui."""
        
        return 'Downloader "' + self._name + '" - ' + ConvertURLIntoDomain( self.GetExampleURL() )
        
    
    def GetURLTemplateVariables( self ):
        
        return ( self._url_template, self._replacement_phrase, self._search_terms_separator, self._example_search_text )
        
    
    def SetGUGKey( self, gug_key: bytes ):
        
        self._gallery_url_generator_key = gug_key
        
    
    def SetGUGKeyAndName( self, gug_key_and_name ):
        
        ( gug_key, name ) = gug_key_and_name
        
        self._gallery_url_generator_key = gug_key
        self._name = name
        
    
    def IsFunctional( self ):
        """Return True if the example url both generates and is parsable by the current domain manager."""
        
        try:
            
            example_url = self.GetExampleURL()
            
            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( example_url )
            
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit; 'except Exception' would be safer
        except:
            
            return False
            
        
        return can_parse
        
    
    def RegenerateGUGKey( self ):
        """Assign a fresh random key, e.g. when duplicating a GUG."""
        
        self._gallery_url_generator_key = HydrusData.GenerateKey()
        
    

# register with the serialisation system so saved data can be rehydrated into this class
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_URL_GENERATOR ] = GalleryURLGenerator
class NestedGalleryURLGenerator( HydrusSerialisable.SerialisableBaseNamed ):
    """A GUG that fans one query out to several child GUGs, referenced by
    ( key, name ) pairs and resolved through the domain manager at call time.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_NESTED_GALLERY_URL_GENERATOR
    SERIALISABLE_NAME = 'Nested Gallery URL Generator'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name, gug_key = None, initial_search_text = None, gug_keys_and_names = None ):
        """
        gug_key: unique id for this nested GUG; generated fresh if None
        initial_search_text: placeholder shown in the search box
        gug_keys_and_names: list of ( gug_key, gug_name ) pairs of child GUGs
        """
        
        if gug_key is None:
            
            gug_key = HydrusData.GenerateKey()
            
        
        if initial_search_text is None:
            
            initial_search_text = 'search tags'
            
        
        if gug_keys_and_names is None:
            
            gug_keys_and_names = []
            
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        self._gallery_url_generator_key = gug_key
        
        self._initial_search_text = initial_search_text
        
        self._gug_keys_and_names = gug_keys_and_names
        
    
    def _GetSerialisableInfo( self ):
        
        # keys are bytes, so store them as hex
        serialisable_gug_key = self._gallery_url_generator_key.hex()
        serialisable_gug_keys_and_names = [ ( gug_key.hex(), gug_name ) for ( gug_key, gug_name ) in self._gug_keys_and_names ]
        
        return ( serialisable_gug_key, self._initial_search_text, serialisable_gug_keys_and_names )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_gug_key, self._initial_search_text, serialisable_gug_keys_and_names ) = serialisable_info
        
        self._gallery_url_generator_key = bytes.fromhex( serialisable_gug_key )
        self._gug_keys_and_names = [ ( bytes.fromhex( gug_key ), gug_name ) for ( gug_key, gug_name ) in serialisable_gug_keys_and_names ]
        
    
    def GenerateGalleryURLs( self, query_text ):
        """Return gallery urls from every resolvable child GUG; unresolvable children are skipped."""
        
        gallery_urls = []
        
        for gug_key_and_name in self._gug_keys_and_names:
            
            gug = HG.client_controller.network_engine.domain_manager.GetGUG( gug_key_and_name )
            
            if gug is not None:
                
                gallery_urls.append( gug.GenerateGalleryURL( query_text ) )
                
            
        
        return gallery_urls
        
    
    def GetExampleURLs( self ):
        """Return example urls from every resolvable child GUG."""
        
        example_urls = []
        
        for gug_key_and_name in self._gug_keys_and_names:
            
            gug = HG.client_controller.network_engine.domain_manager.GetGUG( gug_key_and_name )
            
            if gug is not None:
                
                example_urls.append( gug.GetExampleURL() )
                
            
        
        return example_urls
        
    
    def GetGUGKey( self ):
        
        return self._gallery_url_generator_key
        
    
    def GetGUGKeys( self ):
        """Return the keys of the child GUGs."""
        
        return [ gug_key for ( gug_key, gug_name ) in self._gug_keys_and_names ]
        
    
    def GetGUGKeysAndNames( self ):
        
        return list( self._gug_keys_and_names )
        
    
    def GetGUGKeyAndName( self ):
        
        return ( self._gallery_url_generator_key, self._name )
        
    
    def GetGUGNames( self ):
        """Return the names of the child GUGs."""
        
        return [ gug_name for ( gug_key, gug_name ) in self._gug_keys_and_names ]
        
    
    def GetInitialSearchText( self ):
        
        return self._initial_search_text
        
    
    def GetSafeSummary( self ):
        """Return a short summary safe to show in the ui."""
        
        return 'Nested downloader "' + self._name + '" - ' + ', '.join( ( name for ( gug_key, name ) in self._gug_keys_and_names ) )
        
    
    def IsFunctional( self ):
        """Return True if at least one child GUG resolves and is itself functional."""
        
        for gug_key_and_name in self._gug_keys_and_names:
            
            gug = HG.client_controller.network_engine.domain_manager.GetGUG( gug_key_and_name )
            
            if gug is not None:
                
                if gug.IsFunctional():
                    
                    return True
                    
                
            
        
        return False
        
    
    def RegenerateGUGKey( self ):
        """Assign a fresh random key, e.g. when duplicating a GUG."""
        
        self._gallery_url_generator_key = HydrusData.GenerateKey()
        
    
    def RepairGUGs( self, available_gugs ):
        """Re-point child references at available GUGs, matching by key first,
        then by name; children that match neither are dropped.
        """
        
        available_keys_to_gugs = { gug.GetGUGKey() : gug for gug in available_gugs }
        available_names_to_gugs = { gug.GetName() : gug for gug in available_gugs }
        
        good_gug_keys_and_names = []
        
        for ( gug_key, gug_name ) in self._gug_keys_and_names:
            
            if gug_key in available_keys_to_gugs:
                
                gug = available_keys_to_gugs[ gug_key ]
                
            elif gug_name in available_names_to_gugs:
                
                gug = available_names_to_gugs[ gug_name ]
                
            else:
                
                continue
                
            
            good_gug_keys_and_names.append( ( gug.GetGUGKey(), gug.GetName() ) )
            
        
        self._gug_keys_and_names = good_gug_keys_and_names
        
    
    def SetGUGKey( self, gug_key: bytes ):
        
        self._gallery_url_generator_key = gug_key
        
    
    def SetGUGKeyAndName( self, gug_key_and_name ):
        
        ( gug_key, name ) = gug_key_and_name
        
        self._gallery_url_generator_key = gug_key
        self._name = name
        
    

# register with the serialisation system so saved data can be rehydrated into this class
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_NESTED_GALLERY_URL_GENERATOR ] = NestedGalleryURLGenerator
2020-05-06 21:31:41 +00:00
def RemoveWWWFromDomain( domain ):
    """Strip a leading 'www'-style subdomain ('www.', 'www2.', ...) from a domain.
    
    A bare two-label domain like 'www.com' is left alone, as is any domain whose
    first label merely begins with 'www' (e.g. 'wwwhat.example.com')--the old
    startswith( 'www' ) test wrongly clipped those.
    """
    
    # require an actual www/www2-style first label, not just a 'www' prefix
    if domain.count( '.' ) > 1 and re.match( r'^www\d*\.', domain ) is not None:
        
        domain = ConvertDomainIntoNextLevelDomain( domain )
        
    
    return domain
2019-10-09 22:03:03 +00:00
SEND_REFERRAL_URL_ONLY_IF_PROVIDED = 0
SEND_REFERRAL_URL_NEVER = 1
SEND_REFERRAL_URL_CONVERTER_IF_NONE_PROVIDED = 2
SEND_REFERRAL_URL_ONLY_CONVERTER = 3
SEND_REFERRAL_URL_TYPES = [ SEND_REFERRAL_URL_ONLY_IF_PROVIDED , SEND_REFERRAL_URL_NEVER , SEND_REFERRAL_URL_CONVERTER_IF_NONE_PROVIDED , SEND_REFERRAL_URL_ONLY_CONVERTER ]
send_referral_url_string_lookup = { }
send_referral_url_string_lookup [ SEND_REFERRAL_URL_ONLY_IF_PROVIDED ] = ' send a referral url if available '
send_referral_url_string_lookup [ SEND_REFERRAL_URL_NEVER ] = ' never send a referral url '
send_referral_url_string_lookup [ SEND_REFERRAL_URL_CONVERTER_IF_NONE_PROVIDED ] = ' use the converter if no referral is available '
send_referral_url_string_lookup [ SEND_REFERRAL_URL_ONLY_CONVERTER ] = ' always use the converter referral url '
2019-05-08 21:06:42 +00:00
class URLClass( HydrusSerialisable.SerialisableBaseNamed ):
    """Recognises urls of one particular site/page shape and normalises them.
    
    (class definition continues beyond this chunk)
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_URL_CLASS
    SERIALISABLE_NAME = 'URL Class'
    SERIALISABLE_VERSION = 8
2017-09-27 21:52:54 +00:00
2019-11-28 01:11:46 +00:00
def __init__( self, name, url_class_key = None, url_type = None, preferred_scheme = 'https', netloc = 'hostname.com', path_components = None, parameters = None, api_lookup_converter = None, send_referral_url = SEND_REFERRAL_URL_ONLY_IF_PROVIDED, referral_url_converter = None, gallery_index_type = None, gallery_index_identifier = None, gallery_index_delta = 1, example_url = 'https://hostname.com/post/page.php?id=123456&s=view' ):
    """
    url_class_key: unique id for this class; generated fresh if None
    url_type: an HC.URL_TYPE_* constant; defaults to post url
    path_components: list of ( StringMatch, default ) pairs, one per path segment
    parameters: dict of query param name -> ( StringMatch, default )
    api_lookup_converter: StringConverter turning a matching url into its api url
    send_referral_url: a SEND_REFERRAL_URL_* policy constant
    referral_url_converter: StringConverter producing a referral url
    gallery_index_*: how gallery paging is encoded and stepped
    example_url: a url this class should match
    """
    
    if url_class_key is None:
        
        url_class_key = HydrusData.GenerateKey()
        
    
    if url_type is None:
        
        url_type = HC.URL_TYPE_POST
        
    
    if path_components is None:
        
        # default example shape: /post/page.php
        path_components = []
        
        path_components.append( ( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = 'post', example_string = 'post' ), None ) )
        path_components.append( ( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = 'page.php', example_string = 'page.php' ), None ) )
        
    
    if parameters is None:
        
        # default example shape: ?s=view&id=<number>
        parameters = {}
        
        parameters[ 's' ] = ( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = 'view', example_string = 'view' ), None )
        parameters[ 'id' ] = ( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FLEXIBLE, match_value = ClientParsing.NUMERIC, example_string = '123456' ), None )
        
    
    if api_lookup_converter is None:
        
        api_lookup_converter = ClientParsing.StringConverter( example_string = 'https://hostname.com/post/page.php?id=123456&s=view' )
        
    
    if referral_url_converter is None:
        
        referral_url_converter = ClientParsing.StringConverter( example_string = 'https://hostname.com/post/page.php?id=123456&s=view' )
        
    
    # if the args are not serialisable stuff, lets overwrite here
    
    path_components = HydrusSerialisable.SerialisableList( path_components )
    parameters = HydrusSerialisable.SerialisableDictionary( parameters )
    
    HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
    
    self._url_class_key = url_class_key
    self._url_type = url_type
    self._preferred_scheme = preferred_scheme
    self._netloc = netloc
    
    # matching/normalisation behaviour flags; these defaults are presumably
    # rewritten by deserialisation or the ui (not settable via this signature)
    self._match_subdomains = False
    self._keep_matched_subdomains = False
    self._alphabetise_get_parameters = True
    self._can_produce_multiple_files = False
    self._should_be_associated_with_files = True
    
    self._path_components = path_components
    self._parameters = parameters
    self._api_lookup_converter = api_lookup_converter
    
    self._send_referral_url = send_referral_url
    self._referral_url_converter = referral_url_converter
    
    self._gallery_index_type = gallery_index_type
    self._gallery_index_identifier = gallery_index_identifier
    self._gallery_index_delta = gallery_index_delta
    
    self._example_url = example_url
2017-09-13 20:50:41 +00:00
def _ClipNetLoc( self, netloc ):
    """Collapse an incidental subdomain down to this class's canonical netloc.
    
    When _keep_matched_subdomains is set (e.g. artistname.website.com, where
    removing the subdomain may break the url), the given netloc is preserved.
    Otherwise (e.g. mediaserver4.website.com, where many subdomains serve the
    same content), anything that is not a www-forgiving match for our netloc
    is replaced with the canonical one.
    """
    
    if not self._keep_matched_subdomains and not DomainEqualsAnotherForgivingWWW( netloc, self._netloc ):
        
        netloc = self._netloc
        
    
    return netloc
2018-08-29 20:20:41 +00:00
def _ClipAndFleshOutPath ( self , path , allow_clip = True ) :
2017-09-13 20:50:41 +00:00
# /post/show/1326143/akunim-anthro-armband-armwear-clothed-clothing-fem
while path . startswith ( ' / ' ) :
path = path [ 1 : ]
# post/show/1326143/akunim-anthro-armband-armwear-clothed-clothing-fem
path_components = path . split ( ' / ' )
2018-08-29 20:20:41 +00:00
if allow_clip or len ( path_components ) < len ( self . _path_components ) :
clipped_path_components = [ ]
for ( index , ( string_match , default ) ) in enumerate ( self . _path_components ) :
if len ( path_components ) > index : # the given path has the value
clipped_path_component = path_components [ index ]
elif default is not None :
clipped_path_component = default
else :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not clip path--given url appeared to be too short! ' )
2018-08-29 20:20:41 +00:00
clipped_path_components . append ( clipped_path_component )
path = ' / ' . join ( clipped_path_components )
2017-09-13 20:50:41 +00:00
# post/show/1326143
2019-02-06 22:41:35 +00:00
path = ' / ' + path
2017-09-13 20:50:41 +00:00
# /post/show/1326143
return path
2018-08-29 20:20:41 +00:00
def _ClipAndFleshOutQuery ( self , query , allow_clip = True ) :
2017-09-13 20:50:41 +00:00
2019-11-28 01:11:46 +00:00
( query_dict , param_order ) = ConvertQueryTextToDict ( query )
2017-09-13 20:50:41 +00:00
2018-08-29 20:20:41 +00:00
if allow_clip :
2019-11-28 01:11:46 +00:00
query_dict = { key : value for ( key , value ) in query_dict . items ( ) if key in self . _parameters }
2018-08-29 20:20:41 +00:00
2017-09-13 20:50:41 +00:00
2019-11-28 01:11:46 +00:00
for ( key , ( string_match , default ) ) in self . _parameters . items ( ) :
2018-08-29 20:20:41 +00:00
if key not in query_dict :
if default is None :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not flesh out query--no default for ' + key + ' defined! ' )
2018-08-29 20:20:41 +00:00
else :
query_dict [ key ] = default
2019-11-28 01:11:46 +00:00
if self . _alphabetise_get_parameters :
param_order = None
query = ConvertQueryDictToText ( query_dict , param_order = param_order )
2017-09-13 20:50:41 +00:00
return query
2018-01-17 22:52:10 +00:00
def _GetSerialisableInfo ( self ) :
2019-05-08 21:06:42 +00:00
serialisable_url_class_key = self . _url_class_key . hex ( )
2018-08-29 20:20:41 +00:00
serialisable_path_components = [ ( string_match . GetSerialisableTuple ( ) , default ) for ( string_match , default ) in self . _path_components ]
2019-01-09 22:59:03 +00:00
serialisable_parameters = [ ( key , ( string_match . GetSerialisableTuple ( ) , default ) ) for ( key , ( string_match , default ) ) in list ( self . _parameters . items ( ) ) ]
2018-01-17 22:52:10 +00:00
serialisable_api_lookup_converter = self . _api_lookup_converter . GetSerialisableTuple ( )
2019-10-09 22:03:03 +00:00
serialisable_referral_url_converter = self . _referral_url_converter . GetSerialisableTuple ( )
2018-01-17 22:52:10 +00:00
2019-11-28 01:11:46 +00:00
return ( serialisable_url_class_key , self . _url_type , self . _preferred_scheme , self . _netloc , self . _match_subdomains , self . _keep_matched_subdomains , self . _alphabetise_get_parameters , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , self . _send_referral_url , serialisable_referral_url_converter , self . _can_produce_multiple_files , self . _should_be_associated_with_files , self . _gallery_index_type , self . _gallery_index_identifier , self . _gallery_index_delta , self . _example_url )
2018-01-17 22:52:10 +00:00
def _InitialiseFromSerialisableInfo ( self , serialisable_info ) :
2019-11-28 01:11:46 +00:00
( serialisable_url_class_key , self . _url_type , self . _preferred_scheme , self . _netloc , self . _match_subdomains , self . _keep_matched_subdomains , self . _alphabetise_get_parameters , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , self . _send_referral_url , serialisable_referral_url_converter , self . _can_produce_multiple_files , self . _should_be_associated_with_files , self . _gallery_index_type , self . _gallery_index_identifier , self . _gallery_index_delta , self . _example_url ) = serialisable_info
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
self . _url_class_key = bytes . fromhex ( serialisable_url_class_key )
2018-08-29 20:20:41 +00:00
self . _path_components = [ ( HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_string_match ) , default ) for ( serialisable_string_match , default ) in serialisable_path_components ]
self . _parameters = { key : ( HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_string_match ) , default ) for ( key , ( serialisable_string_match , default ) ) in serialisable_parameters }
2018-01-17 22:52:10 +00:00
self . _api_lookup_converter = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_api_lookup_converter )
2019-10-09 22:03:03 +00:00
self . _referral_url_converter = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_referral_url_converter )
2018-01-17 22:52:10 +00:00
def _UpdateSerialisableInfo ( self , version , old_serialisable_info ) :
if version == 1 :
2018-04-25 22:07:52 +00:00
( url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , example_url ) = old_serialisable_info
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
url_class_key = HydrusData . GenerateKey ( )
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
serialisable_url_class_key = url_class_key . hex ( )
2018-01-17 22:52:10 +00:00
api_lookup_converter = ClientParsing . StringConverter ( example_string = example_url )
serialisable_api_lookup_converter = api_lookup_converter . GetSerialisableTuple ( )
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , example_url )
2018-01-17 22:52:10 +00:00
return ( 2 , new_serialisable_info )
2018-04-18 22:10:15 +00:00
if version == 2 :
2019-05-08 21:06:42 +00:00
( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , example_url ) = old_serialisable_info
2018-04-18 22:10:15 +00:00
if url_type in ( HC . URL_TYPE_FILE , HC . URL_TYPE_POST ) :
should_be_associated_with_files = True
else :
should_be_associated_with_files = False
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , should_be_associated_with_files , example_url )
2018-04-18 22:10:15 +00:00
return ( 3 , new_serialisable_info )
2018-05-09 20:23:00 +00:00
if version == 3 :
2019-05-08 21:06:42 +00:00
( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , should_be_associated_with_files , example_url ) = old_serialisable_info
2018-05-09 20:23:00 +00:00
can_produce_multiple_files = False
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , can_produce_multiple_files , should_be_associated_with_files , example_url )
2018-05-09 20:23:00 +00:00
return ( 4 , new_serialisable_info )
2018-08-15 20:40:30 +00:00
if version == 4 :
2019-05-08 21:06:42 +00:00
( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , can_produce_multiple_files , should_be_associated_with_files , example_url ) = old_serialisable_info
2018-08-15 20:40:30 +00:00
gallery_index_type = None
gallery_index_identifier = None
gallery_index_delta = 1
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , can_produce_multiple_files , should_be_associated_with_files , gallery_index_type , gallery_index_identifier , gallery_index_delta , example_url )
2018-08-15 20:40:30 +00:00
return ( 5 , new_serialisable_info )
2018-08-29 20:20:41 +00:00
if version == 5 :
2019-05-08 21:06:42 +00:00
( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , can_produce_multiple_files , should_be_associated_with_files , gallery_index_type , gallery_index_identifier , gallery_index_delta , example_url ) = old_serialisable_info
2018-08-29 20:20:41 +00:00
path_components = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_path_components )
parameters = HydrusSerialisable . CreateFromSerialisableTuple ( serialisable_parameters )
path_components = [ ( value , None ) for value in path_components ]
2019-01-09 22:59:03 +00:00
parameters = { key : ( value , None ) for ( key , value ) in list ( parameters . items ( ) ) }
2018-08-29 20:20:41 +00:00
serialisable_path_components = [ ( string_match . GetSerialisableTuple ( ) , default ) for ( string_match , default ) in path_components ]
2019-01-09 22:59:03 +00:00
serialisable_parameters = [ ( key , ( string_match . GetSerialisableTuple ( ) , default ) ) for ( key , ( string_match , default ) ) in list ( parameters . items ( ) ) ]
2018-08-29 20:20:41 +00:00
2019-05-08 21:06:42 +00:00
new_serialisable_info = ( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , can_produce_multiple_files , should_be_associated_with_files , gallery_index_type , gallery_index_identifier , gallery_index_delta , example_url )
2018-08-29 20:20:41 +00:00
return ( 6 , new_serialisable_info )
2019-10-09 22:03:03 +00:00
if version == 6 :
( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , can_produce_multiple_files , should_be_associated_with_files , gallery_index_type , gallery_index_identifier , gallery_index_delta , example_url ) = old_serialisable_info
send_referral_url = SEND_REFERRAL_URL_ONLY_IF_PROVIDED
referral_url_converter = ClientParsing . StringConverter ( example_string = ' https://hostname.com/post/page.php?id=123456&s=view ' )
serialisable_referrel_url_converter = referral_url_converter . GetSerialisableTuple ( )
new_serialisable_info = ( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , send_referral_url , serialisable_referrel_url_converter , can_produce_multiple_files , should_be_associated_with_files , gallery_index_type , gallery_index_identifier , gallery_index_delta , example_url )
return ( 7 , new_serialisable_info )
2019-11-28 01:11:46 +00:00
if version == 7 :
( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , send_referral_url , serialisable_referrel_url_converter , can_produce_multiple_files , should_be_associated_with_files , gallery_index_type , gallery_index_identifier , gallery_index_delta , example_url ) = old_serialisable_info
alphabetise_get_parameters = True
new_serialisable_info = ( serialisable_url_class_key , url_type , preferred_scheme , netloc , match_subdomains , keep_matched_subdomains , alphabetise_get_parameters , serialisable_path_components , serialisable_parameters , serialisable_api_lookup_converter , send_referral_url , serialisable_referrel_url_converter , can_produce_multiple_files , should_be_associated_with_files , gallery_index_type , gallery_index_identifier , gallery_index_delta , example_url )
return ( 8 , new_serialisable_info )
def AlphabetiseGetParameters ( self ) :
return self . _alphabetise_get_parameters
2018-08-15 20:40:30 +00:00
def CanGenerateNextGalleryPage ( self ) :
if self . _url_type == HC . URL_TYPE_GALLERY :
if self . _gallery_index_type is not None :
return True
return False
2018-05-09 20:23:00 +00:00
def CanReferToMultipleFiles ( self ) :
is_a_gallery_page = self . _url_type in ( HC . URL_TYPE_GALLERY , HC . URL_TYPE_WATCHABLE )
is_a_multipost_post_page = self . _url_type == HC . URL_TYPE_POST and self . _can_produce_multiple_files
return is_a_gallery_page or is_a_multipost_post_page
2018-01-17 22:52:10 +00:00
2018-08-29 20:20:41 +00:00
def ClippingIsAppropriate ( self ) :
return self . _should_be_associated_with_files or self . UsesAPIURL ( )
2018-05-30 20:13:21 +00:00
def GetAPIURL ( self , url = None ) :
if url is None :
url = self . _example_url
2018-01-17 22:52:10 +00:00
2018-07-11 20:23:51 +00:00
url = self . Normalise ( url )
2018-01-17 22:52:10 +00:00
return self . _api_lookup_converter . Convert ( url )
2017-10-04 17:51:58 +00:00
def GetDomain ( self ) :
2020-01-16 02:08:23 +00:00
return self . _netloc
2017-10-04 17:51:58 +00:00
2017-11-29 21:48:23 +00:00
def GetExampleURL ( self ) :
return self . _example_url
2018-08-15 20:40:30 +00:00
def GetGalleryIndexValues ( self ) :
return ( self . _gallery_index_type , self . _gallery_index_identifier , self . _gallery_index_delta )
2020-06-17 21:31:54 +00:00
def GetClassKey ( self ) :
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
return self . _url_class_key
2018-01-17 22:52:10 +00:00
2018-08-15 20:40:30 +00:00
def GetNextGalleryPage ( self , url ) :
2018-08-29 20:20:41 +00:00
url = self . Normalise ( url )
2020-05-13 19:03:16 +00:00
p = ParseURL ( url )
2018-08-15 20:40:30 +00:00
scheme = p . scheme
netloc = p . netloc
path = p . path
query = p . query
params = ' '
fragment = ' '
if self . _gallery_index_type == GALLERY_INDEX_TYPE_PATH_COMPONENT :
page_index_path_component_index = self . _gallery_index_identifier
while path . startswith ( ' / ' ) :
path = path [ 1 : ]
path_components = path . split ( ' / ' )
try :
page_index = path_components [ page_index_path_component_index ]
except IndexError :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not generate next gallery page--not enough path components! ' )
2018-08-15 20:40:30 +00:00
try :
page_index = int ( page_index )
except :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not generate next gallery page--index component was not an integer! ' )
2018-08-15 20:40:30 +00:00
2018-08-22 21:10:59 +00:00
path_components [ page_index_path_component_index ] = str ( page_index + self . _gallery_index_delta )
2018-08-15 20:40:30 +00:00
path = ' / ' + ' / ' . join ( path_components )
elif self . _gallery_index_type == GALLERY_INDEX_TYPE_PARAMETER :
page_index_name = self . _gallery_index_identifier
2019-11-28 01:11:46 +00:00
( query_dict , param_order ) = ConvertQueryTextToDict ( query )
2018-08-15 20:40:30 +00:00
if page_index_name not in query_dict :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not generate next gallery page--did not find ' + str ( self . _gallery_index_identifier ) + ' in parameters! ' )
2018-08-15 20:40:30 +00:00
page_index = query_dict [ page_index_name ]
try :
page_index = int ( page_index )
except :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( ' Could not generate next gallery page--index component was not an integer! ' )
2018-08-15 20:40:30 +00:00
query_dict [ page_index_name ] = page_index + self . _gallery_index_delta
2019-11-28 01:11:46 +00:00
if self . _alphabetise_get_parameters :
param_order = None
query = ConvertQueryDictToText ( query_dict , param_order = param_order )
2018-08-15 20:40:30 +00:00
else :
raise NotImplementedError ( ' Did not understand the next gallery page rules! ' )
2019-01-09 22:59:03 +00:00
r = urllib . parse . ParseResult ( scheme , netloc , path , params , query , fragment )
2018-08-15 20:40:30 +00:00
return r . geturl ( )
2019-10-09 22:03:03 +00:00
def GetReferralURL ( self , url , referral_url ) :
if self . _send_referral_url == SEND_REFERRAL_URL_ONLY_IF_PROVIDED :
return referral_url
elif self . _send_referral_url == SEND_REFERRAL_URL_NEVER :
return None
elif self . _send_referral_url in ( SEND_REFERRAL_URL_CONVERTER_IF_NONE_PROVIDED , SEND_REFERRAL_URL_ONLY_CONVERTER ) :
try :
converted_referral_url = self . _referral_url_converter . Convert ( url )
except HydrusExceptions . StringConvertException :
return referral_url
p1 = self . _send_referral_url == SEND_REFERRAL_URL_ONLY_CONVERTER
p2 = self . _send_referral_url == SEND_REFERRAL_URL_CONVERTER_IF_NONE_PROVIDED and referral_url is None
if p1 or p2 :
return converted_referral_url
else :
return referral_url
return referral_url
2018-09-19 21:54:51 +00:00
def GetSafeSummary ( self ) :
return ' URL Class " ' + self . _name + ' " - ' + ConvertURLIntoDomain ( self . GetExampleURL ( ) )
2019-11-28 01:11:46 +00:00
def GetURLBooleans ( self ) :
return ( self . _match_subdomains , self . _keep_matched_subdomains , self . _alphabetise_get_parameters , self . _can_produce_multiple_files , self . _should_be_associated_with_files )
2017-11-29 21:48:23 +00:00
def GetURLType ( self ) :
return self . _url_type
2017-12-06 22:06:56 +00:00
def IsGalleryURL ( self ) :
return self . _url_type == HC . URL_TYPE_GALLERY
2018-01-17 22:52:10 +00:00
def IsParsable ( self ) :
return self . _url_type in ( HC . URL_TYPE_POST , HC . URL_TYPE_GALLERY , HC . URL_TYPE_WATCHABLE )
2017-12-06 22:06:56 +00:00
def IsPostURL ( self ) :
return self . _url_type == HC . URL_TYPE_POST
2017-12-13 22:33:07 +00:00
def IsWatchableURL ( self ) :
return self . _url_type == HC . URL_TYPE_WATCHABLE
2018-01-17 22:52:10 +00:00
def Matches ( self , url ) :
try :
self . Test ( url )
return True
2019-05-08 21:06:42 +00:00
except HydrusExceptions . URLClassException :
2018-01-17 22:52:10 +00:00
return False
2019-12-11 23:18:37 +00:00
def MatchesSubdomains ( self ) :
return self . _match_subdomains
2018-01-17 22:52:10 +00:00
2017-09-27 21:52:54 +00:00
def Normalise ( self , url ) :
2017-09-13 20:50:41 +00:00
2020-05-13 19:03:16 +00:00
p = ParseURL ( url )
2017-09-13 20:50:41 +00:00
scheme = self . _preferred_scheme
params = ' '
fragment = ' '
2018-08-29 20:20:41 +00:00
if self . ClippingIsAppropriate ( ) :
2018-01-17 22:52:10 +00:00
netloc = self . _ClipNetLoc ( p . netloc )
2018-08-29 20:20:41 +00:00
path = self . _ClipAndFleshOutPath ( p . path )
query = self . _ClipAndFleshOutQuery ( p . query )
2018-01-17 22:52:10 +00:00
else :
netloc = p . netloc
2018-08-29 20:20:41 +00:00
path = self . _ClipAndFleshOutPath ( p . path , allow_clip = False )
query = self . _ClipAndFleshOutQuery ( p . query , allow_clip = False )
2018-01-17 22:52:10 +00:00
2019-01-09 22:59:03 +00:00
r = urllib . parse . ParseResult ( scheme , netloc , path , params , query , fragment )
2017-09-13 20:50:41 +00:00
return r . geturl ( )
2018-05-09 20:23:00 +00:00
def RefersToOneFile ( self ) :
2018-04-25 22:07:52 +00:00
2018-05-09 20:23:00 +00:00
is_a_direct_file_page = self . _url_type == HC . URL_TYPE_FILE
2018-04-25 22:07:52 +00:00
2018-05-09 20:23:00 +00:00
is_a_single_file_post_page = self . _url_type == HC . URL_TYPE_POST and not self . _can_produce_multiple_files
2018-04-18 22:10:15 +00:00
2018-05-09 20:23:00 +00:00
return is_a_direct_file_page or is_a_single_file_post_page
2018-04-18 22:10:15 +00:00
2020-06-17 21:31:54 +00:00
def RegenerateClassKey ( self ) :
2018-01-17 22:52:10 +00:00
2019-05-08 21:06:42 +00:00
self . _url_class_key = HydrusData . GenerateKey ( )
2018-01-17 22:52:10 +00:00
2018-09-19 21:54:51 +00:00
def SetExampleURL ( self , example_url ) :
self . _example_url = example_url
2020-06-17 21:31:54 +00:00
def SetClassKey ( self , match_key ) :
2018-09-19 21:54:51 +00:00
2019-05-08 21:06:42 +00:00
self . _url_class_key = match_key
2018-09-19 21:54:51 +00:00
2019-11-28 01:11:46 +00:00
def SetURLBooleans ( self , match_subdomains , keep_matched_subdomains , alphabetise_get_parameters , can_produce_multiple_files , should_be_associated_with_files ) :
self . _match_subdomains = match_subdomains
self . _keep_matched_subdomains = keep_matched_subdomains
self . _alphabetise_get_parameters = alphabetise_get_parameters
self . _can_produce_multiple_files = can_produce_multiple_files
self . _should_be_associated_with_files = should_be_associated_with_files
2018-04-18 22:10:15 +00:00
def ShouldAssociateWithFiles ( self ) :
return self . _should_be_associated_with_files
2017-09-13 20:50:41 +00:00
def Test ( self , url ) :
2020-05-13 19:03:16 +00:00
p = ParseURL ( url )
2017-09-13 20:50:41 +00:00
2018-04-25 22:07:52 +00:00
if self . _match_subdomains :
2017-11-29 21:48:23 +00:00
if p . netloc != self . _netloc and not p . netloc . endswith ( ' . ' + self . _netloc ) :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( p . netloc + ' (potentially excluding subdomains) did not match ' + self . _netloc )
2017-11-29 21:48:23 +00:00
else :
2018-04-25 22:07:52 +00:00
if not DomainEqualsAnotherForgivingWWW ( p . netloc , self . _netloc ) :
2017-11-29 21:48:23 +00:00
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( p . netloc + ' did not match ' + self . _netloc )
2017-11-29 21:48:23 +00:00
2017-09-13 20:50:41 +00:00
url_path = p . path
while url_path . startswith ( ' / ' ) :
url_path = url_path [ 1 : ]
2017-11-29 21:48:23 +00:00
url_path_components = url_path . split ( ' / ' )
2017-09-13 20:50:41 +00:00
2018-08-29 20:20:41 +00:00
for ( index , ( string_match , default ) ) in enumerate ( self . _path_components ) :
2017-09-13 20:50:41 +00:00
2018-08-29 20:20:41 +00:00
if len ( url_path_components ) > index :
2017-11-22 21:03:07 +00:00
2018-08-29 20:20:41 +00:00
url_path_component = url_path_components [ index ]
2017-09-13 20:50:41 +00:00
2018-08-29 20:20:41 +00:00
try :
string_match . Test ( url_path_component )
except HydrusExceptions . StringMatchException as e :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( str ( e ) )
2018-08-29 20:20:41 +00:00
2017-11-22 21:03:07 +00:00
2018-08-29 20:20:41 +00:00
elif default is None :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( url_path + ' did not have enough of the required path components! ' )
2017-09-13 20:50:41 +00:00
2019-11-28 01:11:46 +00:00
( url_parameters , param_order ) = ConvertQueryTextToDict ( p . query )
2017-11-29 21:48:23 +00:00
2019-01-09 22:59:03 +00:00
for ( key , ( string_match , default ) ) in list ( self . _parameters . items ( ) ) :
2017-09-13 20:50:41 +00:00
2017-11-29 21:48:23 +00:00
if key not in url_parameters :
2017-09-13 20:50:41 +00:00
2018-08-29 20:20:41 +00:00
if default is None :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( key + ' not found in ' + p . query )
2018-08-29 20:20:41 +00:00
else :
continue
2017-09-13 20:50:41 +00:00
2017-11-29 21:48:23 +00:00
value = url_parameters [ key ]
2017-09-13 20:50:41 +00:00
2017-11-22 21:03:07 +00:00
try :
2017-11-29 21:48:23 +00:00
string_match . Test ( value )
2017-09-13 20:50:41 +00:00
2017-11-22 21:03:07 +00:00
except HydrusExceptions . StringMatchException as e :
2019-05-08 21:06:42 +00:00
raise HydrusExceptions . URLClassException ( str ( e ) )
2017-09-13 20:50:41 +00:00
2017-11-29 21:48:23 +00:00
def ToTuple ( self ) :
2019-11-28 01:11:46 +00:00
return ( self . _url_type , self . _preferred_scheme , self . _netloc , self . _path_components , self . _parameters , self . _api_lookup_converter , self . _send_referral_url , self . _referral_url_converter , self . _example_url )
2017-11-29 21:48:23 +00:00
2018-02-07 23:40:33 +00:00
def UsesAPIURL ( self ) :
return self . _api_lookup_converter . MakesChanges ( )
2019-05-08 21:06:42 +00:00
# register this class with the serialisable system so saved objects can be deserialised back into URLClass by type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_URL_CLASS ] = URLClass