# hydrus/include/ClientNetworkingDomain.py
import ClientConstants as CC
import ClientParsing
import ClientThreading
import collections
import HydrusConstants as HC
import HydrusGlobals as HG
import HydrusData
import HydrusExceptions
import HydrusSerialisable
import threading
import time
import urlparse
def ConvertDomainIntoAllApplicableDomains( domain ):
    
    # walk up the domain hierarchy, collecting every domain worth tracking
    # e.g. maps.google.com -> [ 'maps.google.com', 'google.com' ]
    
    applicable_domains = []
    
    while '.' in domain:
        
        # let's discard www.blah.com so we don't end up tracking it separately to blah.com--there's not much point!
        is_www_style = domain.startswith( 'www' ) and domain.count( '.' ) > 1
        
        if not is_www_style:
            
            applicable_domains.append( domain )
            
        # strip off the leftmost subdomain, i.e. maps.google.com -> google.com
        domain = domain.split( '.', 1 )[1]
        
    return applicable_domains
def ConvertURLIntoDomain( url ):
    
    # parse the url and pull out its network location (the host), coerced to a byte string
    
    parse_result = urlparse.urlparse( url )
    
    return HydrusData.ToByteString( parse_result.netloc )
# per-header approval states, used by NetworkDomainManager.IsValid and the
# validation process below; headers start VALID_UNKNOWN until the user decides
VALID_DENIED = 0
VALID_APPROVED = 1
VALID_UNKNOWN = 2
# this should do network_contexts->user-agent as well, with some kind of approval system in place
# approval needs a new queue in the network engine. this will eventually test downloader validity and so on. failable at that stage
# user-agent info should be exportable/importable on the ui as well
# eventually extend this to do urlmatch->downloader_key, I think.
# hence we'll be able to do some kind of dnd_url->new thread watcher page
# hide urls on media viewer based on domain
# decide whether we want to add this to the dirtyobjects loop, and in which case, if anything is appropriate to store in the db separately
# hence making this a serialisableobject itself.
class NetworkDomainManager( HydrusSerialisable.SerialisableBase ):
    
    # Tracks URLMatch objects per domain and per-network-context custom headers.
    
    # BUG FIX: this was SERIALISABLE_TYPE_NETWORK_BANDWIDTH_MANAGER, which clashes with
    # the bandwidth manager; the registration below the class uses
    # SERIALISABLE_TYPE_NETWORK_DOMAIN_MANAGER, so the class constant must match or a
    # serialised dump of this object would round-trip as the wrong type.
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_NETWORK_DOMAIN_MANAGER
    SERIALISABLE_VERSION = 1
    
    def __init__( self ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        # set by the owning network engine after construction
        self.engine = None
        
        self._url_matches = HydrusSerialisable.SerialisableList()
        
        # network_context -> list of ( key, value, approved, reason ) header tuples
        self._network_contexts_to_custom_headers = {}
        
        # domain -> list of url matches whose example url lives on that domain
        self._domains_to_url_matches = collections.defaultdict( list )
        
        self._dirty = False
        
        self._lock = threading.Lock()
        
        self._RecalcCache()
        
    
    def _GetURLMatch( self, url ):
        
        # returns the first url match that accepts this url, or None
        # caller should hold the lock
        
        domain = ConvertURLIntoDomain( url )
        
        if domain in self._domains_to_url_matches:
            
            url_matches = self._domains_to_url_matches[ domain ]
            
            # it would be nice to somehow sort these based on descending complexity
            # maybe by length of example url
            # in this way, url matches can have overlapping domains
            # e.g. 'post url' vs 'post url, manga subpage'
            
            for url_match in url_matches:
                
                ( result_bool, result_reason ) = url_match.Test( url )
                
                if result_bool:
                    
                    return url_match
                    
                
            
        
        return None
        
    
    def _RecalcCache( self ):
        
        # rebuild the domain -> url match lookup from the authoritative list
        
        self._domains_to_url_matches = collections.defaultdict( list )
        
        for url_match in self._url_matches:
            
            domain = url_match.GetDomain()
            
            self._domains_to_url_matches[ domain ].append( url_match )
            
        
    
    def _SetDirty( self ):
        
        self._dirty = True
        
    
    def CanValidateInPopup( self, network_contexts ):
        
        # we can always do this for headers
        
        return True
        
    
    def GenerateValidationProcess( self, network_contexts ):
        
        # generate a process that will, when threadcalled maybe with .Start() , ask the user, one after another, all the key-value pairs
        # Should (network context) apply "(key)" header "(value)"?
        # Reason given is: "You need this to make it work lol."
        # once all the yes/nos are set, update db, reinitialise domain manager, set IsDone to true.
        
        # TODO: not yet implemented
        pass
        
    
    def GetCustomHeaders( self, network_contexts ):
        
        # returns a dict of header key -> value collected across the given contexts
        # NOTE(review): collection is not yet implemented--this currently always returns {}
        
        keys_to_values = {}
        
        with self._lock:
            
            pass
            
            # good order is global = least powerful, which I _think_ is how these come.
            # e.g. a site User-Agent should overwrite a global default
            
        
        return keys_to_values
        
    
    def GetDownloader( self, url ):
        
        with self._lock:
            
            # this might be better as getdownloaderkey, but we'll see how it shakes out
            # might also be worth being a getifhasdownloader
            
            # match the url to a url_match, then lookup that in a 'this downloader can handle this url_match type' dict that we'll manage
            
            # TODO: not yet implemented
            pass
            
        
    
    def IsValid( self, network_contexts ):
        
        # a query is invalid while any applicable custom header still awaits a user decision
        # for now, let's say that denied headers are simply not added, not that they invalidate a query
        
        for network_context in network_contexts:
            
            if network_context in self._network_contexts_to_custom_headers:
                
                custom_headers = self._network_contexts_to_custom_headers[ network_context ]
                
                for ( key, value, approved, reason ) in custom_headers:
                    
                    if approved == VALID_UNKNOWN:
                        
                        return False
                        
                    
                
            
        
        return True
        
    
    def NormaliseURL( self, url ):
        
        # call this before an entry into a seed cache or the db
        # use it in the dialog to review mass db-level changes
        
        with self._lock:
            
            url_match = self._GetURLMatch( url )
            
            if url_match is None:
                
                return url
                
            
            normalised_url = url_match.Normalise( url )
            
            return normalised_url
            
        
    
    def SetClean( self ):
        
        with self._lock:
            
            self._dirty = False
            
        
    
    def SetHeaderValidation( self, network_context, key, approved ):
        
        # record the user's approve/deny decision for the given header key
        
        # BUG FIX: this previously fetched the header list and did nothing with it; it
        # also indexed a plain dict, which would KeyError on an unknown context.
        
        with self._lock:
            
            if network_context in self._network_contexts_to_custom_headers:
                
                custom_headers = self._network_contexts_to_custom_headers[ network_context ]
                
                new_custom_headers = []
                
                for ( existing_key, value, existing_approved, reason ) in custom_headers:
                    
                    if existing_key == key:
                        
                        existing_approved = approved
                        
                    
                    new_custom_headers.append( ( existing_key, value, existing_approved, reason ) )
                    
                
                self._network_contexts_to_custom_headers[ network_context ] = new_custom_headers
                
                self._SetDirty()
                
            
        
    
# register the manager with the serialisable system so dumps can be loaded back into this class
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_NETWORK_DOMAIN_MANAGER ] = NetworkDomainManager
class DomainValidationProcess( object ):
    
    # Asks the user, one after another, to approve or deny each pending custom header
    # and reports each answer back to the domain manager.
    
    def __init__( self, domain_manager, header_tuples ):
        
        # header_tuples: iterable of ( network_context, key, value, approval_reason )
        
        self._domain_manager = domain_manager
        self._header_tuples = header_tuples
        
        self._is_done = False
        
    
    def IsDone( self ):
        
        return self._is_done
        
    
    def Start( self ):
        
        try:
            
            for ( network_context, key, value, approval_reason ) in self._header_tuples:
                
                job_key = ClientThreading.JobKey()
                
                # generate question
                
                question = 'intro text ' + approval_reason
                
                job_key.SetVariable( 'popup_yes_no_question', question )
                
                # pub it
                
                # BUG FIX: the answer was previously fetched only once, before the wait
                # loop, so a later user response was never seen and the loop spun until
                # shutdown. now we re-poll the job key every iteration.
                result = job_key.GetIfHasVariable( 'popup_yes_no_answer' )
                
                while result is None:
                    
                    if HG.view_shutdown:
                        
                        return
                        
                    
                    time.sleep( 0.25 )
                    
                    result = job_key.GetIfHasVariable( 'popup_yes_no_answer' )
                    
                
                if result:
                    
                    approved = VALID_APPROVED
                    
                else:
                    
                    approved = VALID_DENIED
                    
                
                self._domain_manager.SetHeaderValidation( network_context, key, approved )
                
            
        finally:
            
            # always mark done, even on shutdown or error, so callers do not wait forever
            self._is_done = True
            
        
    
# make this serialisable--maybe with name as the name of a named serialisable
# __hash__ for name? not sure
# maybe all serialisable should return __hash__ of ( type, name ) if they don't already
# that might lead to problems elsewhere, so careful
class URLMatch( HydrusSerialisable.SerialisableBaseNamed ):
    
    # Describes the shape of one kind of url (scheme, host, path components, query
    # parameters) so urls can be recognised ( Test ) and canonicalised ( Normalise ).
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_URL_MATCH
    SERIALISABLE_VERSION = 1
    
    def __init__( self, name, preferred_scheme = 'https', netloc = 'hostname.com', subdomain_is_important = False, path_components = None, parameters = None, example_url = 'https://hostname.com/post/page.php?id=123456&s=view' ):
        
        if path_components is None:
            
            path_components = HydrusSerialisable.SerialisableList()
            
            path_components.append( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = 'post', example_string = 'post' ) )
            path_components.append( ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = 'page.php', example_string = 'page.php' ) )
            
        
        if parameters is None:
            
            parameters = HydrusSerialisable.SerialisableDictionary()
            
            parameters[ 's' ] = ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = 'view', example_string = 'view' )
            parameters[ 'id' ] = ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FLEXIBLE, match_value = ClientParsing.NUMERIC, example_string = '123456' )
            
        
        # an edit dialog panel for this that has example url and testing of current values
        # a parent panel or something that lists all current urls in the db that match and how they will be clipped, is this ok? kind of thing.
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        self._preferred_scheme = preferred_scheme
        self._netloc = netloc
        self._subdomain_is_important = subdomain_is_important
        self._path_components = path_components
        self._parameters = parameters
        
        self._example_url = example_url
        
    
    def _ClipNetLoc( self, netloc ):
        
        if self._subdomain_is_important:
            
            # for domains like artistname.website.com, where removing the subdomain may break the url, we leave it alone
            
            pass
            
        else:
            
            # for domains like mediaserver4.website.com, where multiple subdomains serve the same content as the larger site
            # if the main site doesn't deliver the same content as the subdomain, then subdomain_is_important
            
            netloc = self._netloc
            
        
        return netloc
        
    
    def _GetSerialisableInfo( self ):
        
        serialisable_path_components = self._path_components.GetSerialisableTuple()
        serialisable_parameters = self._parameters.GetSerialisableTuple()
        
        return ( self._preferred_scheme, self._netloc, self._subdomain_is_important, serialisable_path_components, serialisable_parameters, self._example_url )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._preferred_scheme, self._netloc, self._subdomain_is_important, serialisable_path_components, serialisable_parameters, self._example_url ) = serialisable_info
        
        self._path_components = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_path_components )
        self._parameters = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_parameters )
        
    
    def _ClipPath( self, path ):
        
        # /post/show/1326143/akunim-anthro-armband-armwear-clothed-clothing-fem
        
        while path.startswith( '/' ):
            
            path = path[ 1 : ]
            
        
        # post/show/1326143/akunim-anthro-armband-armwear-clothed-clothing-fem
        
        path_components = path.split( '/' )
        
        path = '/'.join( path_components[ : len( self._path_components ) ] )
        
        # post/show/1326143
        
        if len( path ) > 0:
            
            path = '/' + path
            
        
        # /post/show/1326143
        
        return path
        
    
    def _ClipQuery( self, query ):
        
        # keep only the recognised parameters, sorted for a canonical ordering
        
        valid_parameters = []
        
        for ( key, value ) in urlparse.parse_qsl( query ):
            
            if key in self._parameters:
                
                valid_parameters.append( ( key, value ) )
                
            
        
        valid_parameters.sort()
        
        query = '&'.join( ( key + '=' + value for ( key, value ) in valid_parameters ) )
        
        return query
        
    
    def GetDomain( self ):
        
        return ConvertURLIntoDomain( self._example_url )
        
    
    def Normalise( self, url ):
        
        # canonicalise the url: preferred scheme, clipped netloc/path/query, no fragment
        
        p = urlparse.urlparse( url )
        
        scheme = self._preferred_scheme
        netloc = self._ClipNetLoc( p.netloc )
        path = self._ClipPath( p.path )
        params = ''
        query = self._ClipQuery( p.query )
        fragment = ''
        
        r = urlparse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        return r.geturl()
        
    
    def Test( self, url ):
        
        # returns ( bool, reason ) -- whether this match accepts the url
        
        # split the url into parts according to urlparse
        
        p = urlparse.urlparse( url )
        
        # TODO: test p.netloc with self._netloc, taking subdomain_is_important into account
        
        url_path = p.path
        
        while url_path.startswith( '/' ):
            
            url_path = url_path[ 1 : ]
            
        
        # BUG FIX: this previously split p.path, which still carries the leading slash,
        # so the first component was always '' and every fixed path match failed. we now
        # split the stripped url_path computed above.
        url_path_components = url_path.split( '/' )
        
        if len( url_path_components ) < len( self._path_components ):
            
            return ( False, p.path + ' did not have ' + str( len( self._path_components ) ) + ' components' )
            
        
        for ( url_path_component, expected_path_component ) in zip( url_path_components, self._path_components ):
            
            ( bool_result, reason ) = expected_path_component.Test( url_path_component )
            
            if not bool_result:
                
                return ( bool_result, reason )
                
            
        
        url_parameters_list = urlparse.parse_qsl( p.query )
        
        if len( url_parameters_list ) < len( self._parameters ):
            
            return ( False, p.query + ' did not have ' + str( len( self._parameters ) ) + ' value pairs' )
            
        
        for ( key, url_value ) in url_parameters_list:
            
            if key not in self._parameters:
                
                return ( False, key + ' not found in ' + p.query )
                
            
            expected_value = self._parameters[ key ]
            
            ( bool_result, reason ) = expected_value.Test( url_value )
            
            if not bool_result:
                
                return ( bool_result, reason )
                
            
        
        return ( True, 'good' )
        
    
# register the class with the serialisable system so dumps can be loaded back into it.
# BUG FIX: this was registered under SERIALISABLE_TYPE_URLS_IMPORT, which does not match
# URLMatch.SERIALISABLE_TYPE ( SERIALISABLE_TYPE_URL_MATCH ), so a dumped URLMatch could
# not be resolved back to its class.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_URL_MATCH ] = URLMatch