hydrus/hydrus/client/networking/ClientNetworkingFunctions.py

import http.cookiejar
import re
import typing
import unicodedata
import urllib.parse

from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusExceptions

from hydrus.client import ClientGlobals as CG

def AddCookieToSession( session, name, value, domain, path, expires, secure = False, rest = None ):
    
    version = 0
    port = None
    port_specified = False
    domain_specified = True
    domain_initial_dot = domain.startswith( '.' )
    path_specified = True
    discard = False
    comment = None
    comment_url = None
    
    if rest is None:
        rest = {}
    
    cookie = http.cookiejar.Cookie( version, name, value, port, port_specified, domain, domain_specified, domain_initial_dot, path, path_specified, secure, expires, discard, comment, comment_url, rest )
    
    session.cookies.set_cookie( cookie )
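
# a quick illustrative sketch (hypothetical cookie values, not from the source), assuming `session` is a
# requests.Session, as the .cookies.set_cookie call suggests:
#
#   session = requests.Session()
#   AddCookieToSession( session, 'PHPSESSID', 'abcd1234', '.example.com', '/', None, secure = True )
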

def ConvertDomainIntoAllApplicableDomains( domain, discard_www = True ):
    
    # is an ip address or localhost, possibly with a port
    if '.' not in domain or re.search( r'^[\d.:]+$', domain ) is not None:
        return [ domain ]
    
    domains = []
    
    if discard_www:
        domain = RemoveWWWFromDomain( domain )
    
    while domain.count( '.' ) > 0:
        domains.append( domain )
        domain = ConvertDomainIntoNextLevelDomain( domain )
    
    return domains
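
# a worked example with a hypothetical subdomain, assuming the default discard_www = True:
#
#   ConvertDomainIntoAllApplicableDomains( 'www.maps.google.com' )
#   -> [ 'maps.google.com', 'google.com' ]
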

def ConvertDomainIntoNextLevelDomain( domain ):
    
    return '.'.join( domain.split( '.' )[1:] ) # i.e. strip off the leftmost subdomain: maps.google.com -> google.com

def ConvertDomainIntoSecondLevelDomain( domain ):
    
    domains = ConvertDomainIntoAllApplicableDomains( domain )
    
    if len( domains ) == 0:
        raise HydrusExceptions.URLClassException( 'That url or domain did not seem to be valid!' )
    
    return domains[-1]
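
# as a sketch, the 'second level domain' is just the last entry of the applicable-domains list above,
# e.g. a hypothetical 'img2.gelbooru.com' -> 'gelbooru.com'
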

def ConvertHTTPSToHTTP( url ):
    
    if url.startswith( 'http://' ):
        return url
    elif url.startswith( 'https://' ):
        http_url = 'http://' + url[8:]
        return http_url
    else:
        raise Exception( 'Given a url that did not have a scheme!' )

def ConvertHTTPToHTTPS( url ):
    
    if url.startswith( 'https://' ):
        return url
    elif url.startswith( 'http://' ):
        https_url = 'https://' + url[7:]
        return https_url
    else:
        raise Exception( 'Given a url that did not have a scheme!' )

def ConvertPathTextToList( path: str ) -> typing.List[ str ]:
    
    # yo sometimes you see a URL with double slashes in a weird place. maybe we should just split( '/' ) and then remove empty '' results?
    # /post/show/1326143/akunim-anthro-armband-armwear-clothed-clothing-fem
    # for a while we've had URLs like this:
    # https://img2.gelbooru.com//images/80/c8/80c8646b4a49395fb36c805f316c49a9.jpg
    # I was going to be careful as I unified all this to preserve the double-slash to help legacy known url storage matching, but it seems we've been nuking the extra slash for ages in actual db storage, so w/e!
    
    while path.startswith( '/' ):
        path = path[ 1 : ]
    
    # post/show/1326143/akunim-anthro-armband-armwear-clothed-clothing-fem
    path_components = path.split( '/' )
    
    return path_components
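
# a small illustrative sketch of the leading-slash stripping and split, using the double-slash path from the comment above:
#
#   ConvertPathTextToList( '//images/80/c8/80c8646b4a49395fb36c805f316c49a9.jpg' )
#   -> [ 'images', '80', 'c8', '80c8646b4a49395fb36c805f316c49a9.jpg' ]
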

def ConvertQueryDictToText( query_dict, single_value_parameters, param_order = None ):
    
    # we now do everything with requests, which does all the unicode -> %20 business naturally, phew
    # we still want to call str explicitly to coerce integers and so on that'll slip in here and there
    
    if param_order is None:
        
        param_order = sorted( query_dict.keys() )
        
        single_value_parameters = list( single_value_parameters )
        single_value_parameters.sort()
        
        for i in range( len( single_value_parameters ) ):
            param_order.append( None )
    
    params = []
    
    single_value_parameter_index = 0
    
    for key in param_order:
        
        if key is None:
            
            try:
                params.append( single_value_parameters[ single_value_parameter_index ] )
            except IndexError:
                continue
            
            single_value_parameter_index += 1
            
        else:
            
            if key in query_dict:
                params.append( f'{key}={query_dict[ key ]}' )
    
    query_text = '&'.join( params )
    
    return query_text
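
# a hypothetical round-trip sketch (values are illustrative); with no param_order given, keys come out sorted
# and single-value parameters are appended at the end:
#
#   ConvertQueryDictToText( { 'tags' : 'skirt', 'page' : '2' }, [ 'json' ] )
#   -> 'page=2&tags=skirt&json'
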

def ConvertQueryTextToDict( query_text ):
    
    # in the old version of this func, we played silly games with character encoding. I made the foolish decision to try to handle/save URLs with %20 stuff decoded
    # this led to complexity with odd situations like '6+girls+skirt', which would come here encoded as '6%2Bgirls+skirt'
    # I flipped back and forth and tried to preserve the encoding if it didn't step on x or change y, what a mess!
    # I no longer do this. I will encode if there is no '%' in there already, which catches cases of humans pasting/typing a URL with something human, but only if it is non-destructive
    # Update: I still hate this a bit. I should have a parameter that says 'from human=True' and then anything we ingest should go through a normalisation( from_human = True ) wash
    # I don't like the '+' exception we have to do here, and it would be better isolated to just the initial from_human wash rather than basically every time we look at an url for normalisation
    # indeed, instead of having 'from_human' in here, I could have an 'EncodeQueryDict' that does best-attempt smart encoding from_human, once
    # this guy would then just be a glorified dict parser, great
    
    param_order = []
    
    query_dict = {}
    single_value_parameters = []
    
    pairs = query_text.split( '&' )
    
    for pair in pairs:
        
        result = pair.split( '=', 1 )
        
        # for the moment, ignore tracker bugs and so on that have only a key and no value
        
        if len( result ) == 1:
            
            ( value, ) = result
            
            if value == '':
                continue
            
            single_value_parameters.append( value )
            param_order.append( None )
            
        elif len( result ) == 2:
            
            ( key, value ) = result
            
            param_order.append( key )
            query_dict[ key ] = value
    
    return ( query_dict, single_value_parameters, param_order )
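
# a hypothetical parse sketch showing how key=value pairs and bare single-value parameters come back,
# with a None placeholder in param_order marking where the bare parameter sat:
#
#   ConvertQueryTextToDict( 'page=2&tags=skirt&json' )
#   -> ( { 'page' : '2', 'tags' : 'skirt' }, [ 'json' ], [ 'page', 'tags', None ] )
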

def EnsureURLInfoIsEncoded( path_components: typing.List[ str ], query_dict: typing.Dict[ str, str ], single_value_parameters: typing.List[ str ] ):
    
    # ok so the user just posted a URL at us, and this query dict could either be from a real url, like "tags=skirt%20blonde_hair", or it could be a pretty URL they typed or whatever, "tags=skirt blonde_hair"
    # so, let's do our best to figure out if the thing was pre-encoded or not, and wash it through a safe encoding process so it is encoded when we give it back
    # what's the potential problem? '+' is a special character that may or may not be encoded, e.g. "tags=6%2Bgirls+skirt" WEW
    
    percent_encoding_re = re.compile( r'%[0-9A-Fa-f]{2}' )
    
    all_gubbins = set( path_components )
    all_gubbins.update( query_dict.keys() )
    all_gubbins.update( query_dict.values() )
    all_gubbins.update( single_value_parameters )
    
    there_are_percent_encoding_chars = True in ( percent_encoding_re.search( text ) is not None for text in all_gubbins )
    
    # if there are percent-encoded characters anywhere, we have to assume the whole URL is already encoded correctly!
    
    if not there_are_percent_encoding_chars:
        
        path_components = [ urllib.parse.quote( value, safe = '+' ) for value in path_components ]
        query_dict = { urllib.parse.quote( key, safe = '+' ) : urllib.parse.quote( value, safe = '+' ) for ( key, value ) in query_dict.items() }
        single_value_parameters = [ urllib.parse.quote( value, safe = '+' ) for value in single_value_parameters ]
    
    return ( path_components, query_dict, single_value_parameters )
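
# a sketch of the wash on a hypothetical human-typed query; since no %xx sequences are present anywhere,
# everything gets quoted, with '+' kept safe:
#
#   EnsureURLInfoIsEncoded( [ 'post' ], { 'tags' : '6+girls skirt' }, [] )
#   -> ( [ 'post' ], { 'tags' : '6+girls%20skirt' }, [] )
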

def ConvertURLIntoDomain( url ):
    
    parser_result = ParseURL( url )
    
    if parser_result.scheme == '':
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--did you forget the http:// or https://?' )
    
    if parser_result.netloc == '':
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--is it missing a domain?' )
    
    domain = parser_result.netloc
    
    return domain

def ConvertURLIntoSecondLevelDomain( url ):
    
    domain = ConvertURLIntoDomain( url )
    
    return ConvertDomainIntoSecondLevelDomain( domain )

def ConvertURLToHumanString( url: str ) -> str:
    
    # ok so the idea here is that we want to store 'ugly' urls behind the scenes, with quoted %20 gubbins, but any time we present to the user, we want to convert all that to real (URL-invalid) characters
    # although there are some caveats, we can pretty much just do a dequote on the whole string and it'll be fine most of the time mate
    # if we have a unicode domain, we'll need to figure out 'punycode' decoding, but w/e for now
    
    pretty_url = urllib.parse.unquote( url )
    
    return pretty_url
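
# a one-line illustration of the dequote, with a hypothetical URL:
#
#   ConvertURLToHumanString( 'https://example.com/post?tags=blonde_hair%20skirt' )
#   -> 'https://example.com/post?tags=blonde_hair skirt'
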

def CookieDomainMatches( cookie, search_domain ):
    
    cookie_domain = cookie.domain
    
    # blah.com is viewable by blah.com
    matches_exactly = cookie_domain == search_domain
    
    # .blah.com is viewable by blah.com
    matches_dot = cookie_domain == '.' + search_domain
    
    # .blah.com applies to subdomain.blah.com, blah.com does not
    valid_subdomain = cookie_domain.startswith( '.' ) and search_domain.endswith( cookie_domain )
    
    return matches_exactly or matches_dot or valid_subdomain
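
# spelling the three cases out with hypothetical domains: a cookie on 'site.com' or '.site.com' matches a
# search for 'site.com', and a cookie on '.site.com' also matches a search for 'sub.site.com', but a cookie
# on plain 'site.com' does not match 'sub.site.com'
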

def DomainEqualsAnotherForgivingWWW( test_domain, wwwable_domain ):
    
    # domain is either the same or starts with www. or www2. or something
    
    rule = r'^(www[^\.]*\.)?' + re.escape( wwwable_domain ) + '$'
    
    return re.search( rule, test_domain ) is not None
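
# a quick sketch with hypothetical domains: both DomainEqualsAnotherForgivingWWW( 'www2.example.com', 'example.com' )
# and DomainEqualsAnotherForgivingWWW( 'example.com', 'example.com' ) are True, but 'sub.example.com' is not
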

def GetCookie( cookies, search_domain, cookie_name_string_match ):
    
    for cookie in cookies:
        
        if CookieDomainMatches( cookie, search_domain ) and cookie_name_string_match.Matches( cookie.name ):
            return cookie
    
    raise HydrusExceptions.DataMissing( 'Cookie "' + cookie_name_string_match.ToString() + '" not found for domain ' + search_domain + '!' )

def GetSearchURLs( url ):
    
    search_urls = set()
    
    search_urls.add( url )
    
    try:
        
        ephemeral_normalised_url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url, for_server = True )
        
        search_urls.add( ephemeral_normalised_url )
        
        normalised_url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        
        search_urls.add( normalised_url )
        
    except HydrusExceptions.URLClassException:
        
        pass
    
    for url in list( search_urls ):
        
        if url.startswith( 'http://' ):
            search_urls.add( ConvertHTTPToHTTPS( url ) )
        elif url.startswith( 'https://' ):
            search_urls.add( ConvertHTTPSToHTTP( url ) )
    
    for url in list( search_urls ):
        
        p = ParseURL( url )
        
        scheme = p.scheme
        netloc = p.netloc
        path = p.path
        params = ''
        query = p.query
        fragment = p.fragment
        
        if netloc.startswith( 'www' ):
            
            try:
                netloc = ConvertDomainIntoSecondLevelDomain( netloc )
            except HydrusExceptions.URLClassException:
                continue
            
        else:
            
            netloc = 'www.' + netloc
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        search_urls.add( r.geturl() )
    
    for url in list( search_urls ):
        
        if url.endswith( '/' ):
            search_urls.add( url[:-1] )
        else:
            search_urls.add( url + '/' )
    
    return search_urls
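
# the net effect, roughly speaking, is a set of plausible 'known url' variants of the input: http/https,
# www/no-www, and trailing-slash/no-trailing-slash versions of the original url and its normalised forms
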

def LooksLikeAFullURL( text: str ) -> bool:
    
    try:
        
        p = ParseURL( text )
        
        if p.scheme == '':
            return False
        
        if p.netloc == '':
            return False
        
        return True
        
    except:
        
        return False

def NormaliseAndFilterAssociableURLs( urls ):
    
    normalised_urls = set()
    
    for url in urls:
        
        try:
            url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        except HydrusExceptions.URLClassException:
            continue # not a url--something like "file:///C:/Users/Tall%20Man/Downloads/maxresdefault.jpg" ha ha ha
        
        normalised_urls.add( url )
    
    associable_urls = { url for url in normalised_urls if CG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
    
    return associable_urls

def ParseURL( url: str ) -> urllib.parse.ParseResult:
    
    url = url.strip()
    
    url = UnicodeNormaliseURL( url )
    
    try:
        return urllib.parse.urlparse( url )
    except Exception as e:
        raise HydrusExceptions.URLClassException( str( e ) )

def WashURL( url: str, keep_fragment = True ) -> str:
    
    if not LooksLikeAFullURL( url ):
        return url
    
    try:
        
        p = ParseURL( url )
        
        scheme = p.scheme
        netloc = p.netloc
        params = p.params # just so you know, this is ancient web semicolon tech, can be ignored
        fragment = p.fragment
        
        path_components = ConvertPathTextToList( p.path )
        ( query_dict, single_value_parameters, param_order ) = ConvertQueryTextToDict( p.query )
        
        ( path_components, query_dict, single_value_parameters ) = EnsureURLInfoIsEncoded( path_components, query_dict, single_value_parameters )
        
        path = '/' + '/'.join( path_components )
        query = ConvertQueryDictToText( query_dict, single_value_parameters )
        
        if not keep_fragment:
            fragment = ''
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        clean_url = r.geturl()
        
        return clean_url
        
    except:
        
        return url
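
# a sketch of the whole wash on a hypothetical human-typed URL; the fragment is kept by default:
#
#   WashURL( 'https://example.com/post index?tags=blonde hair#top' )
#   -> 'https://example.com/post%20index?tags=blonde%20hair#top'
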
OH_NO_NO_NETLOC_CHARACTERS = '?#'
OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE = { ord( char ) : '_' for char in OH_NO_NO_NETLOC_CHARACTERS }

def RemoveWWWFromDomain( domain ):
    
    if domain.count( '.' ) > 1 and domain.startswith( 'www' ):
        domain = ConvertDomainIntoNextLevelDomain( domain )
    
    return domain
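
# only a leading 'www'-ish subdomain is stripped, and only when something is left over afterwards,
# e.g. a hypothetical 'www.example.com' -> 'example.com', but 'www.com' stays as it is
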

def UnicodeNormaliseURL( url: str ):
    
    if url.startswith( 'file:' ):
        return url
    
    # the issue is that netloc, blah.com, cannot have certain unicode characters that look like others, or double ( e + accent ) characters that can be one accented-e, so we normalise
    # urllib.urlparse throws a ValueError if these are in there, so let's switch them out
    
    scheme_splitter = '://'
    netloc_splitter = '/'
    
    if scheme_splitter in url:
        
        ( scheme, netloc_and_path_and_rest ) = url.split( scheme_splitter, 1 )
        
        if netloc_splitter in netloc_and_path_and_rest:
            ( netloc, path_and_rest ) = netloc_and_path_and_rest.split( netloc_splitter, 1 )
        else:
            netloc = netloc_and_path_and_rest
            path_and_rest = None
        
        netloc = unicodedata.normalize( 'NFKC', netloc )
        netloc = netloc.translate( OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE )
        
        scheme_and_netloc = scheme_splitter.join( ( scheme, netloc ) )
        
        if path_and_rest is None:
            url = scheme_and_netloc
        else:
            url = netloc_splitter.join( ( scheme_and_netloc, path_and_rest ) )
    
    return url
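
# a sketch of the NFKC wash on a hypothetical lookalike domain; the fullwidth characters collapse to plain ascii:
#
#   UnicodeNormaliseURL( 'https://ｅｘａｍｐｌｅ.com/page' )
#   -> 'https://example.com/page'
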