import http.cookiejar
import re
import unicodedata
import urllib.parse

from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusExceptions
from hydrus.client import ClientGlobals as CG

def AddCookieToSession( session, name, value, domain, path, expires, secure = False, rest = None ):
    
    version = 0
    port = None
    port_specified = False
    domain_specified = True
    domain_initial_dot = domain.startswith( '.' )
    path_specified = True
    discard = False
    comment = None
    comment_url = None
    
    if rest is None:
        rest = {}
    
    cookie = http.cookiejar.Cookie( version, name, value, port, port_specified, domain, domain_specified, domain_initial_dot, path, path_specified, secure, expires, discard, comment, comment_url, rest )
    
    session.cookies.set_cookie( cookie )

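# Illustrative usage (not from the original source; assumes 'session' is a requests.Session,
# which exposes .cookies.set_cookie):
#
#   import requests
#   session = requests.Session()
#   AddCookieToSession( session, 'PHPSESSID', 'abcd1234', '.example.com', '/', None, secure = True )
#
# The cookie then rides along on any subsequent session.get()/session.post() to that domain.
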
def ConvertDomainIntoAllApplicableDomains( domain, discard_www = True ):
    
    # is an ip address or localhost, possibly with a port
    if '.' not in domain or re.search( r'^[\d.:]+$', domain ) is not None:
        return [ domain ]
    
    domains = []
    
    if discard_www:
        domain = RemoveWWWFromDomain( domain )
    
    while domain.count( '.' ) > 0:
        
        domains.append( domain )
        
        domain = ConvertDomainIntoNextLevelDomain( domain )
    
    return domains

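# Illustrative behaviour (not from the original source):
#
#   ConvertDomainIntoAllApplicableDomains( 'www.files.example.com' )
#     -> [ 'files.example.com', 'example.com' ]   (the leading www. is discarded first)
#   ConvertDomainIntoAllApplicableDomains( '127.0.0.1:45869' )
#     -> [ '127.0.0.1:45869' ]   (ip addresses and localhost are returned as-is)
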
def ConvertDomainIntoNextLevelDomain( domain ):
    
    return '.'.join( domain.split( '.' )[1:] ) # i.e. strip off the leftmost subdomain, maps.google.com -> google.com

def ConvertDomainIntoSecondLevelDomain( domain ):
    
    domains = ConvertDomainIntoAllApplicableDomains( domain )
    
    if len( domains ) == 0:
        raise HydrusExceptions.URLClassException( 'That url or domain did not seem to be valid!' )
    
    return domains[-1]

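# e.g. (illustrative): 'maps.google.com' -> 'google.com'; the last entry of the
# applicable-domains list is always the shortest domain that still contains a dot.
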
def ConvertHTTPSToHTTP( url ):
    
    if url.startswith( 'http://' ):
        return url
    elif url.startswith( 'https://' ):
        
        http_url = 'http://' + url[8:]
        
        return http_url
        
    else:
        raise Exception( 'Given a url that did not have a scheme!' )

def ConvertHTTPToHTTPS( url ):
    
    if url.startswith( 'https://' ):
        return url
    elif url.startswith( 'http://' ):
        
        https_url = 'https://' + url[7:]
        
        return https_url
        
    else:
        raise Exception( 'Given a url that did not have a scheme!' )

def ConvertQueryDictToText( query_dict, single_value_parameters, param_order = None ):
    
    # we now do everything with requests, which does all the unicode -> %20 business naturally, phew
    # we still want to call str explicitly to coerce integers and so on that'll slip in here and there
    
    if param_order is None:
        
        param_order = sorted( query_dict.keys() )
        
        single_value_parameters = list( single_value_parameters )
        single_value_parameters.sort()
        
        for i in range( len( single_value_parameters ) ):
            param_order.append( None )
    
    params = []
    
    single_value_parameter_index = 0
    
    for key in param_order:
        
        if key is None:
            
            try:
                params.append( single_value_parameters[ single_value_parameter_index ] )
            except IndexError:
                continue
            
            single_value_parameter_index += 1
            
        else:
            
            if key in query_dict:
                params.append( '{}={}'.format( key, query_dict[ key ] ) )
    
    query_text = '&'.join( params )
    
    return query_text

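# Illustrative behaviour (not from the original source):
#
#   ConvertQueryDictToText( { 'page' : '2', 'tags' : 'blue_sky' }, [ 's' ] )
#     -> 'page=2&tags=blue_sky&s'
#
# With no explicit param_order, keys are emitted sorted and the single-value parameters
# are appended at the end, one per None placeholder.
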
def ConvertQueryTextToDict( query_text ):
    
    # we generally do not want quote characters, %20 stuff, in our urls. we would prefer properly formatted unicode
    # so, let's replace all keys and values with unquoted versions
    # -but-
    # we only replace if it is a completely reversible operation!
    # odd situations like '6+girls+skirt', which comes here encoded as '6%2Bgirls+skirt', shouldn't turn into '6+girls+skirt'
    # so if there are a mix of encoded and non-encoded, we won't touch it here m8
    
    # except these chars, which screw with GET arg syntax when unquoted
    bad_chars = [ '&', '=', '/', '?', '#', ';', '+', ',' ]
    
    param_order = []
    
    query_dict = {}
    single_value_parameters = []
    
    pairs = query_text.split( '&' )
    
    for pair in pairs:
        
        result = pair.split( '=', 1 )
        
        # for the moment, ignore tracker bugs and so on that have only key and no value
        
        if len( result ) == 1:
            
            ( value, ) = result
            
            if value == '':
                continue
            
            try:
                
                unquoted_value = urllib.parse.unquote( value )
                
                if True not in ( bad_char in unquoted_value for bad_char in bad_chars ):
                    
                    requoted_value = urllib.parse.quote( unquoted_value )
                    
                    if requoted_value == value:
                        value = unquoted_value
                
            except:
                pass
            
            single_value_parameters.append( value )
            param_order.append( None )
            
        elif len( result ) == 2:
            
            ( key, value ) = result
            
            try:
                
                unquoted_key = urllib.parse.unquote( key )
                
                if True not in ( bad_char in unquoted_key for bad_char in bad_chars ):
                    
                    requoted_key = urllib.parse.quote( unquoted_key )
                    
                    if requoted_key == key:
                        key = unquoted_key
                
            except:
                pass
            
            try:
                
                unquoted_value = urllib.parse.unquote( value )
                
                if True not in ( bad_char in unquoted_value for bad_char in bad_chars ):
                    
                    requoted_value = urllib.parse.quote( unquoted_value )
                    
                    if requoted_value == value:
                        value = unquoted_value
                
            except:
                pass
            
            param_order.append( key )
            
            query_dict[ key ] = value
    
    return ( query_dict, single_value_parameters, param_order )

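# Illustrative behaviour (not from the original source):
#
#   ConvertQueryTextToDict( 'tags=blue%20sky&page=1' )
#     -> ( { 'tags' : 'blue sky', 'page' : '1' }, [], [ 'tags', 'page' ] )
#
#   ConvertQueryTextToDict( 'tags=6%2Bgirls+skirt' )
#     -> ( { 'tags' : '6%2Bgirls+skirt' }, [], [ 'tags' ] )
#
# The second value is left quoted because unquoting it would introduce a '+', which is not reversible.
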
def ConvertURLIntoDomain( url ):
    
    parser_result = ParseURL( url )
    
    if parser_result.scheme == '':
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--did you forget the http:// or https://?' )
    
    if parser_result.netloc == '':
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--is it missing a domain?' )
    
    domain = parser_result.netloc
    
    return domain

def ConvertURLIntoSecondLevelDomain( url ):
    
    domain = ConvertURLIntoDomain( url )
    
    return ConvertDomainIntoSecondLevelDomain( domain )

def CookieDomainMatches( cookie, search_domain ):
    
    cookie_domain = cookie.domain
    
    # blah.com is viewable by blah.com
    matches_exactly = cookie_domain == search_domain
    
    # .blah.com is viewable by blah.com
    matches_dot = cookie_domain == '.' + search_domain
    
    # .blah.com applies to subdomain.blah.com, blah.com does not
    valid_subdomain = cookie_domain.startswith( '.' ) and search_domain.endswith( cookie_domain )
    
    return matches_exactly or matches_dot or valid_subdomain

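# Illustrative behaviour (not from the original source): a cookie set for '.example.com'
# matches the search domains 'example.com' and 'files.example.com', while a cookie set for
# 'example.com' (no leading dot) only matches 'example.com' exactly.
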
def DomainEqualsAnotherForgivingWWW( test_domain, wwwable_domain ):
    
    # domain is either the same or starts with www. or www2. or something
    rule = r'^(www[^\.]*\.)?' + re.escape( wwwable_domain ) + '$'
    
    return re.search( rule, test_domain ) is not None

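# e.g. (illustrative): 'example.com', 'www.example.com' and 'www2.example.com' all count as
# equal to 'example.com' under this test, but 'files.example.com' does not.
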
def GetCookie( cookies, search_domain, cookie_name_string_match ):
    
    for cookie in cookies:
        
        if CookieDomainMatches( cookie, search_domain ) and cookie_name_string_match.Matches( cookie.name ):
            return cookie
    
    raise HydrusExceptions.DataMissing( 'Cookie "' + cookie_name_string_match.ToString() + '" not found for domain ' + search_domain + '!' )

def GetSearchURLs( url ):
    
    search_urls = set()
    
    search_urls.add( url )
    
    try:
        
        normalised_url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        
        search_urls.add( normalised_url )
        
    except HydrusExceptions.URLClassException:
        
        pass
    
    for url in list( search_urls ):
        
        if url.startswith( 'http://' ):
            search_urls.add( ConvertHTTPToHTTPS( url ) )
        elif url.startswith( 'https://' ):
            search_urls.add( ConvertHTTPSToHTTP( url ) )
    
    for url in list( search_urls ):
        
        p = ParseURL( url )
        
        scheme = p.scheme
        netloc = p.netloc
        path = p.path
        params = ''
        query = p.query
        fragment = p.fragment
        
        if netloc.startswith( 'www' ):
            
            try:
                netloc = ConvertDomainIntoSecondLevelDomain( netloc )
            except HydrusExceptions.URLClassException:
                continue
            
        else:
            
            netloc = 'www.' + netloc
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        search_urls.add( r.geturl() )
    
    for url in list( search_urls ):
        
        if url.endswith( '/' ):
            search_urls.add( url[:-1] )
        else:
            search_urls.add( url + '/' )
    
    return search_urls

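# Illustrative behaviour (not from the original source): 'https://example.com/post/123'
# expands to its http/https, www/non-www and trailing-slash/no-slash variants, e.g.
# 'http://example.com/post/123', 'https://www.example.com/post/123',
# 'https://example.com/post/123/', and so on, presumably so that any previously stored
# variant of the same URL can be matched.
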
def LooksLikeAFullURL( text: str ) -> bool:
    
    try:
        
        result = urllib.parse.urlparse( text )
        
        if result.scheme == '':
            return False
        
        if result.netloc == '':
            return False
        
        return True
        
    except:
        
        return False

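# e.g. (illustrative): 'https://example.com/page' -> True, while 'example.com/page' -> False
# because urlparse gives it neither a scheme nor a netloc.
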
def NormaliseAndFilterAssociableURLs( urls ):
    
    normalised_urls = set()
    
    for url in urls:
        
        try:
            url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        except HydrusExceptions.URLClassException:
            continue # not a url--something like "file:///C:/Users/Tall%20Man/Downloads/maxresdefault.jpg" ha ha ha
        
        normalised_urls.add( url )
    
    associable_urls = { url for url in normalised_urls if CG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
    
    return associable_urls

def ParseURL( url: str ) -> urllib.parse.ParseResult:
    
    url = url.strip()
    
    url = UnicodeNormaliseURL( url )
    
    try:
        return urllib.parse.urlparse( url )
    except Exception as e:
        raise HydrusExceptions.URLClassException( str( e ) )

OH_NO_NO_NETLOC_CHARACTERS = '?#'
OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE = { ord( char ) : '_' for char in OH_NO_NO_NETLOC_CHARACTERS }

def RemoveWWWFromDomain( domain ):
    
    if domain.count( '.' ) > 1 and domain.startswith( 'www' ):
        domain = ConvertDomainIntoNextLevelDomain( domain )
    
    return domain

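# e.g. (illustrative): 'www.example.com' -> 'example.com', but 'www.com' is left alone
# because stripping its only subdomain would destroy the domain.
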
def UnicodeNormaliseURL( url: str ):
    
    if url.startswith( 'file:' ):
        return url
    
    # the issue is netloc, blah.com, cannot have certain unicode characters that look like others, or double ( e + accent ) characters that can be one accented-e, so we normalise
    # urllib.urlparse throws a valueerror if these are in, so let's switch out
    
    scheme_splitter = '://'
    netloc_splitter = '/'
    
    if scheme_splitter in url:
        
        ( scheme, netloc_and_path_and_rest ) = url.split( scheme_splitter, 1 )
        
        if netloc_splitter in netloc_and_path_and_rest:
            
            ( netloc, path_and_rest ) = netloc_and_path_and_rest.split( netloc_splitter, 1 )
            
        else:
            
            netloc = netloc_and_path_and_rest
            path_and_rest = None
        
        netloc = unicodedata.normalize( 'NFKC', netloc )
        
        netloc = netloc.translate( OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE )
        
        scheme_and_netlock = scheme_splitter.join( ( scheme, netloc ) )
        
        if path_and_rest is None:
            url = scheme_and_netlock
        else:
            url = netloc_splitter.join( ( scheme_and_netlock, path_and_rest ) )
    
    return url

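# Illustrative behaviour (not from the original source): NFKC normalisation collapses
# lookalike and decomposed characters in the netloc, e.g. a fullwidth 'ｅｘａｍｐｌｅ.com'
# becomes plain 'example.com' and an 'e' + combining accent becomes the single composed
# character, while any stray '?' or '#' in the netloc is swapped for '_' so urlparse does
# not choke on it.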