# scraped page metadata ('567 lines / 17 KiB / Python') removed; it was not part of the source
import http.cookiejar
|
|
import re
|
|
import typing
|
|
|
|
import unicodedata
|
|
import urllib.parse
|
|
|
|
from hydrus.core import HydrusGlobals as HG
|
|
from hydrus.core import HydrusExceptions
|
|
|
|
from hydrus.client import ClientGlobals as CG
|
|
def AddCookieToSession( session, name, value, domain, path, expires, secure = False, rest = None ):
    
    # Hand-build an http.cookiejar.Cookie and drop it into the given requests session's jar.
    # Most of the constructor arguments are fixed boilerplate for our purposes.
    
    cookie = http.cookiejar.Cookie(
        version = 0,
        name = name,
        value = value,
        port = None,
        port_specified = False,
        domain = domain,
        domain_specified = True,
        domain_initial_dot = domain.startswith( '.' ),
        path = path,
        path_specified = True,
        secure = secure,
        expires = expires,
        discard = False,
        comment = None,
        comment_url = None,
        rest = {} if rest is None else rest
    )
    
    session.cookies.set_cookie( cookie )
    
|
|
|
|
def ConvertDomainIntoAllApplicableDomains( domain, discard_www = True ):
    
    # Walk a domain up through its parents, e.g. a.b.c.com -> [ a.b.c.com, b.c.com, c.com ].
    
    # ip addresses, localhost, and port-only hosts have no superdomain chain to walk
    if '.' not in domain or re.search( r'^[\d.:]+$', domain ) is not None:
        
        return [ domain ]
        
    
    if discard_www:
        
        domain = RemoveWWWFromDomain( domain )
        
    
    applicable_domains = []
    
    while '.' in domain:
        
        applicable_domains.append( domain )
        
        domain = ConvertDomainIntoNextLevelDomain( domain )
        
    
    return applicable_domains
    
|
|
|
|
def ConvertDomainIntoNextLevelDomain( domain ):
    
    # Strip the leftmost subdomain, e.g. maps.google.com -> google.com.
    ( leftmost_subdomain, dot, remainder ) = domain.partition( '.' )
    
    return remainder
    
|
|
|
|
def ConvertDomainIntoSecondLevelDomain( domain ):
    
    # The last entry of the applicable-domains walk is the second-level domain.
    applicable_domains = ConvertDomainIntoAllApplicableDomains( domain )
    
    if not applicable_domains:
        
        raise HydrusExceptions.URLClassException( 'That url or domain did not seem to be valid!' )
        
    
    return applicable_domains[-1]
    
|
|
|
|
def ConvertHTTPSToHTTP( url ):
    
    # Flip an https url to http; http urls pass through unchanged.
    
    if url.startswith( 'https://' ):
        
        return 'http://' + url[ len( 'https://' ) : ]
        
    elif url.startswith( 'http://' ):
        
        return url
        
    else:
        
        raise Exception( 'Given a url that did not have a scheme!' )
        
    
|
|
|
|
|
|
def ConvertHTTPToHTTPS( url ):
    
    # Flip an http url to https; https urls pass through unchanged.
    
    if url.startswith( 'http://' ):
        
        return 'https://' + url[ len( 'http://' ) : ]
        
    elif url.startswith( 'https://' ):
        
        return url
        
    else:
        
        raise Exception( 'Given a url that did not have a scheme!' )
        
    
|
|
|
|
|
|
|
|
def ConvertPathTextToList( path: str ) -> typing.List[ str ]:
    
    # URLs sometimes turn up with doubled leading slashes, e.g.:
    # https://img2.gelbooru.com//images/80/c8/80c8646b4a49395fb36c805f316c49a9.jpg
    # db storage has long nuked that extra slash anyway, so we trim every leading slash before splitting
    
    path = path.lstrip( '/' )
    
    # 'post/show/1326143/blah' -> [ 'post', 'show', '1326143', 'blah' ]
    return path.split( '/' )
    
|
|
|
|
|
|
def ConvertQueryDictToText( query_dict, single_value_parameters, param_order = None ):
    
    # Reassemble a query string from its parsed parts.
    # requests does all the unicode -> %20 encoding business for us these days, phew.
    # we call str explicitly to coerce integers and so on that'll slip in here and there.
    #
    # param_order, when given (typically from ConvertQueryTextToDict), preserves the original
    # parameter order; None entries in it mark the positions of single-value parameters.
    # When it is not given, we emit keyed params and then single-value params, both sorted.
    #
    # BUGFIX: the sorting of single_value_parameters and the appending of None placeholders
    # used to run even when the caller supplied param_order, mutating the caller's list and
    # double-counting the None slots. That work now only happens when we build the order ourselves.
    
    if param_order is None:
        
        param_order = sorted( query_dict.keys() )
        
        single_value_parameters = sorted( single_value_parameters )
        
        # one None placeholder per single-value parameter, at the end
        param_order.extend( [ None ] * len( single_value_parameters ) )
        
    
    params = []
    
    single_value_parameter_index = 0
    
    for key in param_order:
        
        if key is None:
            
            # a None placeholder without a matching single-value parameter is simply skipped
            if single_value_parameter_index >= len( single_value_parameters ):
                
                continue
                
            
            params.append( str( single_value_parameters[ single_value_parameter_index ] ) )
            
            single_value_parameter_index += 1
            
        else:
            
            # keys in param_order but absent from the dict are skipped
            if key in query_dict:
                
                params.append( f'{key}={query_dict[ key ]}' )
                
            
        
    
    query_text = '&'.join( params )
    
    return query_text
    
|
|
|
|
|
|
def ConvertQueryTextToDict( query_text ):
    
    # Parse a raw query string into ( query_dict, single_value_parameters, param_order ).
    #
    # We used to play silly games with character encoding here, trying to save URLs with %20
    # stuff decoded, which led to a mess with things like '6+girls+skirt' arriving as
    # '6%2Bgirls+skirt'. We no longer do that--encoding decisions now live elsewhere
    # (see EnsureURLInfoIsEncoded), and this guy is just a glorified dict parser.
    #
    # param_order records the original parameter order; a None entry marks the position of a
    # single-value (keyless) parameter, e.g. tracker gubbins that have a key but no '=value'.
    
    query_dict = {}
    single_value_parameters = []
    param_order = []
    
    for pair_text in query_text.split( '&' ):
        
        ( key, equals, value ) = pair_text.partition( '=' )
        
        if equals == '':
            
            # no '=' at all: a single-value parameter. empty fragments are dropped
            if key == '':
                
                continue
                
            
            single_value_parameters.append( key )
            param_order.append( None )
            
        else:
            
            query_dict[ key ] = value
            param_order.append( key )
            
        
    
    return ( query_dict, single_value_parameters, param_order )
    
|
|
|
|
|
|
def EnsureURLInfoIsEncoded( path_components: typing.List[ str ], query_dict: typing.Dict[ str, str ], single_value_parameters: typing.List[ str ] ):
    
    # The user may have posted a real, already-encoded URL ("tags=skirt%20blonde_hair") or a
    # pretty human-typed one ("tags=skirt blonde_hair"). Our best guess: if any component
    # anywhere already contains a %XX escape, we assume the whole URL is correctly encoded and
    # leave it alone; otherwise we encode everything ourselves.
    # '+' is left untouched either way, since it may or may not be an encoded space ("tags=6%2Bgirls+skirt" WEW).
    
    percent_encoding_re = re.compile( r'%[0-9A-Fa-f]{2}' )
    
    def any_encoded( texts ):
        
        return any( percent_encoding_re.search( text ) is not None for text in texts )
        
    
    already_encoded = (
        any_encoded( path_components )
        or any_encoded( query_dict.keys() )
        or any_encoded( query_dict.values() )
        or any_encoded( single_value_parameters )
    )
    
    if not already_encoded:
        
        path_components = [ urllib.parse.quote( component, safe = '+' ) for component in path_components ]
        
        query_dict = { urllib.parse.quote( key, safe = '+' ) : urllib.parse.quote( value, safe = '+' ) for ( key, value ) in query_dict.items() }
        
        single_value_parameters = [ urllib.parse.quote( param, safe = '+' ) for param in single_value_parameters ]
        
    
    return ( path_components, query_dict, single_value_parameters )
    
|
|
|
|
|
|
def ConvertURLIntoDomain( url ):
    
    # Pull the netloc out of a full URL, complaining if the scheme or domain is missing.
    
    parse_result = ParseURL( url )
    
    if parse_result.scheme == '':
        
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--did you forget the http:// or https://?' )
        
    
    if parse_result.netloc == '':
        
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--is it missing a domain?' )
        
    
    return parse_result.netloc
    
|
|
|
|
def ConvertURLIntoSecondLevelDomain( url ):
    
    # Convenience wrapper: full URL -> second-level domain.
    return ConvertDomainIntoSecondLevelDomain( ConvertURLIntoDomain( url ) )
    
|
|
|
|
|
|
def ConvertURLToHumanString( url: str ) -> str:
    
    # We store 'ugly' urls behind the scenes, with quoted %20 gubbins, but for human eyes we
    # dequote the whole string back to real (URL-invalid) characters. There are some caveats,
    # but this is fine most of the time mate.
    # A unicode domain would need 'punycode' decoding on top of this, but w/e for now.
    
    return urllib.parse.unquote( url )
    
|
|
|
|
|
|
def CookieDomainMatches( cookie, search_domain ):
    
    # Does this cookie apply to search_domain, per normal cookie domain rules?
    
    cookie_domain = cookie.domain
    
    # exact: a blah.com cookie is viewable by blah.com
    if cookie_domain == search_domain:
        
        return True
        
    
    # dotted: a .blah.com cookie is viewable by blah.com
    if cookie_domain == '.' + search_domain:
        
        return True
        
    
    # subdomains: .blah.com applies to subdomain.blah.com; bare blah.com does not
    return cookie_domain.startswith( '.' ) and search_domain.endswith( cookie_domain )
    
|
|
|
|
def DomainEqualsAnotherForgivingWWW( test_domain, wwwable_domain ):
    
    # True if test_domain is wwwable_domain itself or a www-ish variant (www., www2., etc.).
    
    pattern = rf'^(www[^\.]*\.)?{re.escape( wwwable_domain )}$'
    
    return re.search( pattern, test_domain ) is not None
    
|
|
|
|
def GetCookie( cookies, search_domain, cookie_name_string_match ):
    
    # Return the first cookie whose domain applies to search_domain and whose name satisfies
    # the given string match, raising DataMissing if none qualifies.
    
    for cookie in cookies:
        
        if not CookieDomainMatches( cookie, search_domain ):
            
            continue
            
        
        if cookie_name_string_match.Matches( cookie.name ):
            
            return cookie
            
        
    
    raise HydrusExceptions.DataMissing( 'Cookie "' + cookie_name_string_match.ToString() + '" not found for domain ' + search_domain + '!' )
    
|
|
|
|
def GetSearchURLs( url ):
    
    # Build the set of URL variants we should search under for this URL: the URL itself, its
    # normalised forms, and http/https, www/non-www, and trailing-slash flips of all of those.
    
    search_urls = set()
    
    search_urls.add( url )
    
    try:
        
        # NOTE(review): for_server = True presumably produces the server-facing normalisation--confirm against domain manager
        ephemeral_normalised_url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url, for_server = True )
        
        search_urls.add( ephemeral_normalised_url )
        
        normalised_url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        
        search_urls.add( normalised_url )
        
    except HydrusExceptions.URLClassException:
        
        # no URL Class matched--we just search on what we were given
        pass
        
    
    # add the http <-> https flips of everything gathered so far
    for url in list( search_urls ):
        
        if url.startswith( 'http://' ):
            
            search_urls.add( ConvertHTTPToHTTPS( url ) )
            
        elif url.startswith( 'https://' ):
            
            search_urls.add( ConvertHTTPSToHTTP( url ) )
            
        
    
    # add the www <-> non-www flips
    for url in list( search_urls ):
        
        p = ParseURL( url )
        
        scheme = p.scheme
        
        netloc = p.netloc
        
        path = p.path
        
        # params is the ancient semicolon url tech; we deliberately blank it here
        params = ''
        
        query = p.query
        
        fragment = p.fragment
        
        if netloc.startswith( 'www' ):
            
            try:
                
                # www.blah.com (or www2. etc.) -> blah.com
                netloc = ConvertDomainIntoSecondLevelDomain( netloc )
                
            except HydrusExceptions.URLClassException:
                
                continue
                
            
        else:
            
            netloc = 'www.' + netloc
            
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        search_urls.add( r.geturl() )
        
    
    # add the trailing-slash flips
    for url in list( search_urls ):
        
        if url.endswith( '/' ):
            
            search_urls.add( url[:-1] )
            
        else:
            
            search_urls.add( url + '/' )
            
        
    
    return search_urls
    
|
|
|
|
|
|
def LooksLikeAFullURL( text: str ) -> bool:
    
    # True if the text parses as a URL with both a scheme and a netloc.
    
    try:
        
        p = ParseURL( text )
        
    except Exception:
        
        # narrowed from a bare except (which also swallowed KeyboardInterrupt/SystemExit);
        # ParseURL raises URLClassException on a bad parse, but we stay safe against anything else too
        return False
        
    
    return p.scheme != '' and p.netloc != ''
    
|
|
|
|
|
|
|
|
def NormaliseAndFilterAssociableURLs( urls ):
    
    # Normalise the given URLs and keep only those that should be associated with files.
    
    domain_manager = CG.client_controller.network_engine.domain_manager
    
    normalised_urls = set()
    
    for url in urls:
        
        try:
            
            normalised_urls.add( domain_manager.NormaliseURL( url ) )
            
        except HydrusExceptions.URLClassException:
            
            # not a url--something like "file:///C:/Users/Tall%20Man/Downloads/maxresdefault.jpg" ha ha ha
            continue
            
        
    
    return { url for url in normalised_urls if domain_manager.ShouldAssociateURLWithFiles( url ) }
    
|
|
|
|
|
|
def ParseURL( url: str ) -> urllib.parse.ParseResult:
    
    # Strip, unicode-normalise, and urlparse the URL, converting any parse failure into our
    # own URLClassException.
    
    washed_url = UnicodeNormaliseURL( url.strip() )
    
    try:
        
        return urllib.parse.urlparse( washed_url )
        
    except Exception as e:
        
        raise HydrusExceptions.URLClassException( str( e ) )
        
    
|
|
|
|
|
|
|
|
def WashURL( url: str, keep_fragment = True ) -> str:
    
    # Normalise a full URL's encoding: decompose it, ensure the path/query are percent-encoded,
    # and reassemble. Best-effort--anything that does not look like or parse as a full URL is
    # returned unchanged.
    
    if not LooksLikeAFullURL( url ):
        
        return url
        
    
    try:
        
        p = ParseURL( url )
        
        scheme = p.scheme
        
        netloc = p.netloc
        
        params = p.params # just so you know, this is ancient web semicolon tech, can be ignored
        
        fragment = p.fragment
        
        path_components = ConvertPathTextToList( p.path )
        
        # NOTE(review): param_order is captured but never passed to ConvertQueryDictToText, so
        # the washed query comes out with sorted params--presumably intentional normalisation, but confirm
        ( query_dict, single_value_parameters, param_order ) = ConvertQueryTextToDict( p.query )
        
        ( path_components, query_dict, single_value_parameters ) = EnsureURLInfoIsEncoded( path_components, query_dict, single_value_parameters )
        
        path = '/' + '/'.join( path_components )
        
        query = ConvertQueryDictToText( query_dict, single_value_parameters )
        
        if not keep_fragment:
            
            fragment = ''
            
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        clean_url = r.geturl()
        
        return clean_url
        
    except Exception:
        
        # narrowed from a bare except; we deliberately swallow parse failures and hand back the original url
        return url
        
    
|
|
|
|
|
|
|
|
# characters that must not appear in a netloc--urllib.parse.urlparse throws ValueError on them
OH_NO_NO_NETLOC_CHARACTERS = '?#'

# str.translate table mapping each forbidden character's ordinal to an underscore
OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE = { ord( char ) : '_' for char in OH_NO_NO_NETLOC_CHARACTERS }
|
|
|
|
def RemoveWWWFromDomain( domain ):
    
    # Strip a leading www-style component (www., www2., etc.), but only when a dotted domain
    # remains afterwards, so 'www.com' is left alone.
    
    if domain.startswith( 'www' ) and domain.count( '.' ) > 1:
        
        # drop the leftmost component, www.blah.com -> blah.com
        domain = '.'.join( domain.split( '.' )[1:] )
        
    
    return domain
    
|
|
|
|
|
|
def UnicodeNormaliseURL( url: str ):
    
    # The netloc, blah.com, cannot have certain unicode characters that look like others, or
    # double ( e + accent ) characters that can be one accented-e, so we NFKC-normalise it.
    # urllib's urlparse also throws ValueError on '?'/'#' in a netloc, so we switch those out.
    
    if url.startswith( 'file:' ):
        
        return url
        
    
    scheme_splitter = '://'
    
    if scheme_splitter not in url:
        
        # no scheme--nothing for us to do here
        return url
        
    
    ( scheme, netloc_and_path_and_rest ) = url.split( scheme_splitter, 1 )
    
    if '/' in netloc_and_path_and_rest:
        
        ( netloc, path_and_rest ) = netloc_and_path_and_rest.split( '/', 1 )
        
    else:
        
        ( netloc, path_and_rest ) = ( netloc_and_path_and_rest, None )
        
    
    netloc = unicodedata.normalize( 'NFKC', netloc )
    
    netloc = netloc.translate( OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE )
    
    scheme_and_netloc = scheme + scheme_splitter + netloc
    
    if path_and_rest is None:
        
        return scheme_and_netloc
        
    
    return scheme_and_netloc + '/' + path_and_rest
    
|