# hydrus/hydrus/client/networking/ClientNetworkingFunctions.py


import http.cookiejar
import re
import unicodedata
import urllib.parse
from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusExceptions
from hydrus.client import ClientGlobals as CG

def AddCookieToSession( session, name, value, domain, path, expires, secure = False, rest = None ):
    
    version = 0
    port = None
    port_specified = False
    domain_specified = True
    domain_initial_dot = domain.startswith( '.' )
    path_specified = True
    discard = False
    comment = None
    comment_url = None
    
    if rest is None:
        
        rest = {}
    
    cookie = http.cookiejar.Cookie( version, name, value, port, port_specified, domain, domain_specified, domain_initial_dot, path, path_specified, secure, expires, discard, comment, comment_url, rest )
    
    session.cookies.set_cookie( cookie )
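
# a minimal usage sketch, assuming `session` is a requests.Session (it exposes a cookiejar
# at session.cookies); the name/value/domain here are hypothetical:
#
#   session = requests.Session()
#   AddCookieToSession( session, 'session_id', 'abcd1234', '.example.com', '/', None, secure = True )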

def ConvertDomainIntoAllApplicableDomains( domain, discard_www = True ):
    
    # is an ip address or localhost, possibly with a port
    if '.' not in domain or re.search( r'^[\d.:]+$', domain ) is not None:
        
        return [ domain ]
    
    domains = []
    
    if discard_www:
        
        domain = RemoveWWWFromDomain( domain )
    
    while domain.count( '.' ) > 0:
        
        domains.append( domain )
        
        domain = ConvertDomainIntoNextLevelDomain( domain )
    
    return domains
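
# a quick illustration of the walk up the domain hierarchy:
#
#   ConvertDomainIntoAllApplicableDomains( 'maps.google.com' ) -> [ 'maps.google.com', 'google.com' ]
#   ConvertDomainIntoAllApplicableDomains( '127.0.0.1:8080' ) -> [ '127.0.0.1:8080' ]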

def ConvertDomainIntoNextLevelDomain( domain ):
    
    return '.'.join( domain.split( '.' )[1:] ) # i.e. strip off the leftmost subdomain: maps.google.com -> google.com

def ConvertDomainIntoSecondLevelDomain( domain ):
    
    domains = ConvertDomainIntoAllApplicableDomains( domain )
    
    if len( domains ) == 0:
        
        raise HydrusExceptions.URLClassException( 'That url or domain did not seem to be valid!' )
    
    return domains[-1]
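
# e.g. ConvertDomainIntoSecondLevelDomain( 'a.b.example.com' ) -> 'example.com'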

def ConvertHTTPSToHTTP( url ):
    
    if url.startswith( 'http://' ):
        
        return url
        
    elif url.startswith( 'https://' ):
        
        http_url = 'http://' + url[8:]
        
        return http_url
        
    else:
        
        raise Exception( 'Given a url that did not have a scheme!' )

def ConvertHTTPToHTTPS( url ):
    
    if url.startswith( 'https://' ):
        
        return url
        
    elif url.startswith( 'http://' ):
        
        https_url = 'https://' + url[7:]
        
        return https_url
        
    else:
        
        raise Exception( 'Given a url that did not have a scheme!' )
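
# these two are simple scheme swaps, e.g.:
#
#   ConvertHTTPToHTTPS( 'http://example.com/page' ) -> 'https://example.com/page'
#   ConvertHTTPSToHTTP( 'https://example.com/page' ) -> 'http://example.com/page'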

def ConvertQueryDictToText( query_dict, single_value_parameters, param_order = None ):
    
    # we now do everything with requests, which does all the unicode -> %20 business naturally, phew
    # we still want to call str explicitly to coerce integers and so on that'll slip in here and there
    
    if param_order is None:
        
        param_order = sorted( query_dict.keys() )
        
        single_value_parameters = list( single_value_parameters )
        single_value_parameters.sort()
        
        for i in range( len( single_value_parameters ) ):
            
            param_order.append( None )
    
    params = []
    
    single_value_parameter_index = 0
    
    for key in param_order:
        
        if key is None:
            
            try:
                
                params.append( single_value_parameters[ single_value_parameter_index ] )
                
            except IndexError:
                
                continue
            
            single_value_parameter_index += 1
            
        else:
            
            if key in query_dict:
                
                params.append( '{}={}'.format( key, query_dict[ key ] ) )
    
    query_text = '&'.join( params )
    
    return query_text
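
# a quick sketch of the output format (values are illustrative):
#
#   ConvertQueryDictToText( { 'page' : '1', 'tags' : 'blue_eyes' }, [ 'solo' ] )
#   -> 'page=1&tags=blue_eyes&solo'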

def ConvertQueryTextToDict( query_text ):
    
    # we generally do not want quote characters, %20 stuff, in our urls. we would prefer properly formatted unicode
    # so, let's replace all keys and values with unquoted versions
    # -but-
    # we only replace if it is a completely reversible operation!
    # odd situations like '6+girls+skirt', which comes here encoded as '6%2Bgirls+skirt', shouldn't turn into '6+girls+skirt'
    # so if there is a mix of encoded and non-encoded, we won't touch it here m8
    
    # except these chars, which screw with GET arg syntax when unquoted
    bad_chars = [ '&', '=', '/', '?', '#', ';', '+', ',' ]
    
    param_order = []
    query_dict = {}
    single_value_parameters = []
    
    pairs = query_text.split( '&' )
    
    for pair in pairs:
        
        result = pair.split( '=', 1 )
        
        # for the moment, ignore tracker bugs and so on that have only a key and no value
        
        if len( result ) == 1:
            
            ( value, ) = result
            
            if value == '':
                
                continue
            
            try:
                
                unquoted_value = urllib.parse.unquote( value )
                
                if True not in ( bad_char in unquoted_value for bad_char in bad_chars ):
                    
                    requoted_value = urllib.parse.quote( unquoted_value )
                    
                    if requoted_value == value:
                        
                        value = unquoted_value
                
            except:
                
                pass
            
            single_value_parameters.append( value )
            param_order.append( None )
            
        elif len( result ) == 2:
            
            ( key, value ) = result
            
            try:
                
                unquoted_key = urllib.parse.unquote( key )
                
                if True not in ( bad_char in unquoted_key for bad_char in bad_chars ):
                    
                    requoted_key = urllib.parse.quote( unquoted_key )
                    
                    if requoted_key == key:
                        
                        key = unquoted_key
                
            except:
                
                pass
            
            try:
                
                unquoted_value = urllib.parse.unquote( value )
                
                if True not in ( bad_char in unquoted_value for bad_char in bad_chars ):
                    
                    requoted_value = urllib.parse.quote( unquoted_value )
                    
                    if requoted_value == value:
                        
                        value = unquoted_value
                
            except:
                
                pass
            
            param_order.append( key )
            query_dict[ key ] = value
    
    return ( query_dict, single_value_parameters, param_order )
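
# a sketch of the reversibility rule above (values are illustrative):
#
#   ConvertQueryTextToDict( 'page=1&tags=6%2Bgirls+skirt&solo' )
#   -> ( { 'page' : '1', 'tags' : '6%2Bgirls+skirt' }, [ 'solo' ], [ 'page', 'tags', None ] )
#
# 'tags' stays quoted: its unquoted form contains '+', one of the bad_chars, so the swap is skipped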

def ConvertURLIntoDomain( url ):
    
    parser_result = ParseURL( url )
    
    if parser_result.scheme == '':
        
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--did you forget the http:// or https://?' )
    
    if parser_result.netloc == '':
        
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--is it missing a domain?' )
    
    domain = parser_result.netloc
    
    return domain

def ConvertURLIntoSecondLevelDomain( url ):
    
    domain = ConvertURLIntoDomain( url )
    
    return ConvertDomainIntoSecondLevelDomain( domain )
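
# e.g. ConvertURLIntoDomain( 'https://maps.google.com/mymap' ) -> 'maps.google.com'
# e.g. ConvertURLIntoSecondLevelDomain( 'https://maps.google.com/mymap' ) -> 'google.com'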

def CookieDomainMatches( cookie, search_domain ):
    
    cookie_domain = cookie.domain
    
    # blah.com is viewable by blah.com
    matches_exactly = cookie_domain == search_domain
    
    # .blah.com is viewable by blah.com
    matches_dot = cookie_domain == '.' + search_domain
    
    # .blah.com applies to subdomain.blah.com; blah.com does not
    valid_subdomain = cookie_domain.startswith( '.' ) and search_domain.endswith( cookie_domain )
    
    return matches_exactly or matches_dot or valid_subdomain
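
# e.g. a cookie set on '.example.com' matches search domains 'example.com' and 'login.example.com'
# e.g. a cookie set on 'example.com' matches 'example.com' but not 'login.example.com'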

def DomainEqualsAnotherForgivingWWW( test_domain, wwwable_domain ):
    
    # domain is either the same or starts with www. or www2. or something
    rule = r'^(www[^\.]*\.)?' + re.escape( wwwable_domain ) + '$'
    
    return re.search( rule, test_domain ) is not None
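
# e.g. DomainEqualsAnotherForgivingWWW( 'www2.example.com', 'example.com' ) -> True
# e.g. DomainEqualsAnotherForgivingWWW( 'cdn.example.com', 'example.com' ) -> False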

def GetCookie( cookies, search_domain, cookie_name_string_match ):
    
    for cookie in cookies:
        
        if CookieDomainMatches( cookie, search_domain ) and cookie_name_string_match.Matches( cookie.name ):
            
            return cookie
    
    raise HydrusExceptions.DataMissing( 'Cookie "' + cookie_name_string_match.ToString() + '" not found for domain ' + search_domain + '!' )

def GetSearchURLs( url ):
    
    search_urls = set()
    
    search_urls.add( url )
    
    try:
        normalised_url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        
        search_urls.add( normalised_url )
        
    except HydrusExceptions.URLClassException:
        
        pass
    
    for url in list( search_urls ):
        
        if url.startswith( 'http://' ):
            
            search_urls.add( ConvertHTTPToHTTPS( url ) )
            
        elif url.startswith( 'https://' ):
            
            search_urls.add( ConvertHTTPSToHTTP( url ) )
    
    for url in list( search_urls ):
        
        p = ParseURL( url )
        
        scheme = p.scheme
        netloc = p.netloc
        path = p.path
        params = ''
        query = p.query
        fragment = p.fragment
        
        if netloc.startswith( 'www' ):
            
            try:
                
                netloc = ConvertDomainIntoSecondLevelDomain( netloc )
                
            except HydrusExceptions.URLClassException:
                
                continue
            
        else:
            
            netloc = 'www.' + netloc
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        search_urls.add( r.geturl() )
    
    for url in list( search_urls ):
        
        if url.endswith( '/' ):
            
            search_urls.add( url[:-1] )
            
        else:
            
            search_urls.add( url + '/' )
    
    return search_urls
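
# an illustrative sketch (real output depends on the client's URL classes and normalisation):
#
#   GetSearchURLs( 'https://example.com/post/123' ) would include the original URL plus
#   http/https, www/no-www, and trailing-slash variants, e.g. 'http://example.com/post/123',
#   'https://www.example.com/post/123', 'https://example.com/post/123/'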

def LooksLikeAFullURL( text: str ) -> bool:
    
    try:
        
        result = urllib.parse.urlparse( text )
        
        if result.scheme == '':
            
            return False
        
        if result.netloc == '':
            
            return False
        
        return True
        
    except:
        
        return False
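
# e.g. LooksLikeAFullURL( 'https://example.com/page' ) -> True
# e.g. LooksLikeAFullURL( 'example.com/page' ) -> False, since there is no scheme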

def NormaliseAndFilterAssociableURLs( urls ):
    
    normalised_urls = set()
    
    for url in urls:
        
        try:
            url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url )
            
        except HydrusExceptions.URLClassException:
            
            continue # not a url--something like "file:///C:/Users/Tall%20Man/Downloads/maxresdefault.jpg" ha ha ha
        
        normalised_urls.add( url )
    
    associable_urls = { url for url in normalised_urls if CG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
    
    return associable_urls

def ParseURL( url: str ) -> urllib.parse.ParseResult:
    
    url = url.strip()
    
    url = UnicodeNormaliseURL( url )
    
    try:
        
        return urllib.parse.urlparse( url )
        
    except Exception as e:
        
        raise HydrusExceptions.URLClassException( str( e ) )

OH_NO_NO_NETLOC_CHARACTERS = '?#'
OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE = { ord( char ) : '_' for char in OH_NO_NO_NETLOC_CHARACTERS }

def RemoveWWWFromDomain( domain ):
    
    if domain.count( '.' ) > 1 and domain.startswith( 'www' ):
        
        domain = ConvertDomainIntoNextLevelDomain( domain )
    
    return domain
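
# e.g. RemoveWWWFromDomain( 'www.example.com' ) -> 'example.com'
# e.g. RemoveWWWFromDomain( 'www.com' ) -> 'www.com', since there is only one dot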

def UnicodeNormaliseURL( url: str ):
    
    if url.startswith( 'file:' ):
        
        return url
    
    # the issue is that netloc, blah.com, cannot have certain unicode characters that look like others, or double ( e + accent ) characters that can be one accented-e, so we normalise
    # urllib.urlparse throws a valueerror if these are in, so let's switch them out
    
    scheme_splitter = '://'
    netloc_splitter = '/'
    
    if scheme_splitter in url:
        
        ( scheme, netloc_and_path_and_rest ) = url.split( scheme_splitter, 1 )
        
        if netloc_splitter in netloc_and_path_and_rest:
            
            ( netloc, path_and_rest ) = netloc_and_path_and_rest.split( netloc_splitter, 1 )
            
        else:
            
            netloc = netloc_and_path_and_rest
            path_and_rest = None
        
        netloc = unicodedata.normalize( 'NFKC', netloc )
        netloc = netloc.translate( OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE )
        
        scheme_and_netloc = scheme_splitter.join( ( scheme, netloc ) )
        
        if path_and_rest is None:
            
            url = scheme_and_netloc
            
        else:
            
            url = netloc_splitter.join( ( scheme_and_netloc, path_and_rest ) )
    
    return url
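
# e.g. UnicodeNormaliseURL( 'https://ｅｘａｍｐｌｅ.com/page' ) -> 'https://example.com/page'
# (NFKC folds the fullwidth characters in the netloc; the path and query are left alone)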