# scraped page metadata ('567 lines / 17 KiB / Python') removed; it was not part of the source
import http.cookiejar
|
|
import re
|
|
import typing
|
|
|
|
import unicodedata
|
|
import urllib.parse
|
|
|
|
from hydrus.core import HydrusGlobals as HG
|
|
from hydrus.core import HydrusExceptions
|
|
|
|
from hydrus.client import ClientGlobals as CG
|
|
def AddCookieToSession( session, name, value, domain, path, expires, secure = False, rest = None ):
    
    # Hand-build an http.cookiejar.Cookie and drop it into the given requests session's jar.
    # Most of the constructor arguments are fixed boilerplate for our purposes.
    
    cookie = http.cookiejar.Cookie(
        version = 0,
        name = name,
        value = value,
        port = None,
        port_specified = False,
        domain = domain,
        domain_specified = True,
        domain_initial_dot = domain.startswith( '.' ),
        path = path,
        path_specified = True,
        secure = secure,
        expires = expires,
        discard = False,
        comment = None,
        comment_url = None,
        rest = {} if rest is None else rest
    )
    
    session.cookies.set_cookie( cookie )
    
|
|
|
|
def ConvertDomainIntoAllApplicableDomains( domain, discard_www = True ):
    
    # Walk a domain up through its parents, e.g. a.b.c.com -> [ a.b.c.com, b.c.com, c.com ].
    
    # ip addresses, localhost, and port-only hosts have no superdomain chain to walk
    if '.' not in domain or re.search( r'^[\d.:]+$', domain ) is not None:
        
        return [ domain ]
        
    
    if discard_www:
        
        domain = RemoveWWWFromDomain( domain )
        
    
    applicable_domains = []
    
    while '.' in domain:
        
        applicable_domains.append( domain )
        
        domain = ConvertDomainIntoNextLevelDomain( domain )
        
    
    return applicable_domains
    
|
|
|
|
def ConvertDomainIntoNextLevelDomain( domain ):
    
    # Strip the leftmost subdomain, e.g. maps.google.com -> google.com.
    ( leftmost_subdomain, dot, remainder ) = domain.partition( '.' )
    
    return remainder
    
|
|
|
|
def ConvertDomainIntoSecondLevelDomain( domain ):
    
    # The last entry of the applicable-domains walk is the second-level domain.
    applicable_domains = ConvertDomainIntoAllApplicableDomains( domain )
    
    if not applicable_domains:
        
        raise HydrusExceptions.URLClassException( 'That url or domain did not seem to be valid!' )
        
    
    return applicable_domains[-1]
    
|
|
|
|
def ConvertHTTPSToHTTP( url ):
    
    # Flip an https url to http; http urls pass through unchanged.
    
    if url.startswith( 'https://' ):
        
        return 'http://' + url[ len( 'https://' ) : ]
        
    elif url.startswith( 'http://' ):
        
        return url
        
    else:
        
        raise Exception( 'Given a url that did not have a scheme!' )
        
    
|
|
|
|
|
|
def ConvertHTTPToHTTPS( url ):
    
    # Flip an http url to https; https urls pass through unchanged.
    
    if url.startswith( 'http://' ):
        
        return 'https://' + url[ len( 'http://' ) : ]
        
    elif url.startswith( 'https://' ):
        
        return url
        
    else:
        
        raise Exception( 'Given a url that did not have a scheme!' )
        
    
|
|
|
|
|
|
|
|
def ConvertPathTextToList( path: str ) -> typing.List[ str ]:
    
    # URLs sometimes turn up with doubled leading slashes, e.g.:
    # https://img2.gelbooru.com//images/80/c8/80c8646b4a49395fb36c805f316c49a9.jpg
    # db storage has long nuked that extra slash anyway, so we trim every leading slash before splitting
    
    path = path.lstrip( '/' )
    
    # 'post/show/1326143/blah' -> [ 'post', 'show', '1326143', 'blah' ]
    return path.split( '/' )
    
|
|
|
|
|
|
def ConvertQueryDictToText( query_dict, single_value_parameters, param_order = None ):
    
    # Reassemble a query string from its parsed parts.
    # requests does all the unicode -> %20 encoding business for us these days, phew.
    # we call str explicitly to coerce integers and so on that'll slip in here and there.
    #
    # param_order, when given (typically from ConvertQueryTextToDict), preserves the original
    # parameter order; None entries in it mark the positions of single-value parameters.
    # When it is not given, we emit keyed params and then single-value params, both sorted.
    #
    # BUGFIX: the sorting of single_value_parameters and the appending of None placeholders
    # used to run even when the caller supplied param_order, mutating the caller's list and
    # double-counting the None slots. That work now only happens when we build the order ourselves.
    
    if param_order is None:
        
        param_order = sorted( query_dict.keys() )
        
        single_value_parameters = sorted( single_value_parameters )
        
        # one None placeholder per single-value parameter, at the end
        param_order.extend( [ None ] * len( single_value_parameters ) )
        
    
    params = []
    
    single_value_parameter_index = 0
    
    for key in param_order:
        
        if key is None:
            
            # a None placeholder without a matching single-value parameter is simply skipped
            if single_value_parameter_index >= len( single_value_parameters ):
                
                continue
                
            
            params.append( str( single_value_parameters[ single_value_parameter_index ] ) )
            
            single_value_parameter_index += 1
            
        else:
            
            # keys in param_order but absent from the dict are skipped
            if key in query_dict:
                
                params.append( f'{key}={query_dict[ key ]}' )
                
            
        
    
    query_text = '&'.join( params )
    
    return query_text
    
|
|
|
|
|
|
def ConvertQueryTextToDict( query_text ):
    
    # Parse a raw query string into ( query_dict, single_value_parameters, param_order ).
    #
    # We used to play silly games with character encoding here, trying to save URLs with %20
    # stuff decoded, which led to a mess with things like '6+girls+skirt' arriving as
    # '6%2Bgirls+skirt'. We no longer do that--encoding decisions now live elsewhere
    # (see EnsureURLInfoIsEncoded), and this guy is just a glorified dict parser.
    #
    # param_order records the original parameter order; a None entry marks the position of a
    # single-value (keyless) parameter, e.g. tracker gubbins that have a key but no '=value'.
    
    query_dict = {}
    single_value_parameters = []
    param_order = []
    
    for pair_text in query_text.split( '&' ):
        
        ( key, equals, value ) = pair_text.partition( '=' )
        
        if equals == '':
            
            # no '=' at all: a single-value parameter. empty fragments are dropped
            if key == '':
                
                continue
                
            
            single_value_parameters.append( key )
            param_order.append( None )
            
        else:
            
            query_dict[ key ] = value
            param_order.append( key )
            
        
    
    return ( query_dict, single_value_parameters, param_order )
    
|
|
|
|
|
|
def EnsureURLInfoIsEncoded( path_components: typing.List[ str ], query_dict: typing.Dict[ str, str ], single_value_parameters: typing.List[ str ] ):
    
    # The user may have posted a real, already-encoded URL ("tags=skirt%20blonde_hair") or a
    # pretty human-typed one ("tags=skirt blonde_hair"). Our best guess: if any component
    # anywhere already contains a %XX escape, we assume the whole URL is correctly encoded and
    # leave it alone; otherwise we encode everything ourselves.
    # '+' is left untouched either way, since it may or may not be an encoded space ("tags=6%2Bgirls+skirt" WEW).
    
    percent_encoding_re = re.compile( r'%[0-9A-Fa-f]{2}' )
    
    def any_encoded( texts ):
        
        return any( percent_encoding_re.search( text ) is not None for text in texts )
        
    
    already_encoded = (
        any_encoded( path_components )
        or any_encoded( query_dict.keys() )
        or any_encoded( query_dict.values() )
        or any_encoded( single_value_parameters )
    )
    
    if not already_encoded:
        
        path_components = [ urllib.parse.quote( component, safe = '+' ) for component in path_components ]
        
        query_dict = { urllib.parse.quote( key, safe = '+' ) : urllib.parse.quote( value, safe = '+' ) for ( key, value ) in query_dict.items() }
        
        single_value_parameters = [ urllib.parse.quote( param, safe = '+' ) for param in single_value_parameters ]
        
    
    return ( path_components, query_dict, single_value_parameters )
    
|
|
|
|
|
|
def ConvertURLIntoDomain( url ):
    
    # Pull the netloc out of a full URL, complaining if the scheme or domain is missing.
    
    parse_result = ParseURL( url )
    
    if parse_result.scheme == '':
        
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--did you forget the http:// or https://?' )
        
    
    if parse_result.netloc == '':
        
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--is it missing a domain?' )
        
    
    return parse_result.netloc
    
|
|
|
|
def ConvertURLIntoSecondLevelDomain( url ):
    
    # Convenience wrapper: full URL -> second-level domain.
    return ConvertDomainIntoSecondLevelDomain( ConvertURLIntoDomain( url ) )
    
|
|
|
|
|
|
def ConvertURLToHumanString( url: str ) -> str:
    
    # We store 'ugly' urls behind the scenes, with quoted %20 gubbins, but for human eyes we
    # dequote the whole string back to real (URL-invalid) characters. There are some caveats,
    # but this is fine most of the time mate.
    # A unicode domain would need 'punycode' decoding on top of this, but w/e for now.
    
    return urllib.parse.unquote( url )
    
|
|
|
|
|
|
def CookieDomainMatches( cookie, search_domain ):
    
    # Does this cookie apply to search_domain, per normal cookie domain rules?
    
    cookie_domain = cookie.domain
    
    # exact: a blah.com cookie is viewable by blah.com
    if cookie_domain == search_domain:
        
        return True
        
    
    # dotted: a .blah.com cookie is viewable by blah.com
    if cookie_domain == '.' + search_domain:
        
        return True
        
    
    # subdomains: .blah.com applies to subdomain.blah.com; bare blah.com does not
    return cookie_domain.startswith( '.' ) and search_domain.endswith( cookie_domain )
    
|
|
|
|
def DomainEqualsAnotherForgivingWWW( test_domain, wwwable_domain ):
    
    # True if test_domain is wwwable_domain itself or a www-ish variant (www., www2., etc.).
    
    pattern = rf'^(www[^\.]*\.)?{re.escape( wwwable_domain )}$'
    
    return re.search( pattern, test_domain ) is not None
    
|
|
|
|
def GetCookie( cookies, search_domain, cookie_name_string_match ):
    
    # Return the first cookie whose domain applies to search_domain and whose name satisfies
    # the given string match, raising DataMissing if none qualifies.
    
    for cookie in cookies:
        
        if not CookieDomainMatches( cookie, search_domain ):
            
            continue
            
        
        if cookie_name_string_match.Matches( cookie.name ):
            
            return cookie
            
        
    
    raise HydrusExceptions.DataMissing( 'Cookie "' + cookie_name_string_match.ToString() + '" not found for domain ' + search_domain + '!' )
    
|
|
|
|
def GetSearchURLs( url ):
    
    # Build the set of URL variants we should search under for this URL: the URL itself, its
    # normalised forms, and http/https, www/non-www, and trailing-slash flips of all of those.
    
    search_urls = set()
    
    search_urls.add( url )
    
    try:
        
        # NOTE(review): for_server = True presumably produces the server-facing normalisation--confirm against domain manager
        ephemeral_normalised_url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url, for_server = True )
        
        search_urls.add( ephemeral_normalised_url )
        
        normalised_url = CG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        
        search_urls.add( normalised_url )
        
    except HydrusExceptions.URLClassException:
        
        # no URL Class matched--we just search on what we were given
        pass
        
    
    # add the http <-> https flips of everything gathered so far
    for url in list( search_urls ):
        
        if url.startswith( 'http://' ):
            
            search_urls.add( ConvertHTTPToHTTPS( url ) )
            
        elif url.startswith( 'https://' ):
            
            search_urls.add( ConvertHTTPSToHTTP( url ) )
            
        
    
    # add the www <-> non-www flips
    for url in list( search_urls ):
        
        p = ParseURL( url )
        
        scheme = p.scheme
        
        netloc = p.netloc
        
        path = p.path
        
        # params is the ancient semicolon url tech; we deliberately blank it here
        params = ''
        
        query = p.query
        
        fragment = p.fragment
        
        if netloc.startswith( 'www' ):
            
            try:
                
                # www.blah.com (or www2. etc.) -> blah.com
                netloc = ConvertDomainIntoSecondLevelDomain( netloc )
                
            except HydrusExceptions.URLClassException:
                
                continue
                
            
        else:
            
            netloc = 'www.' + netloc
            
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        search_urls.add( r.geturl() )
        
    
    # add the trailing-slash flips
    for url in list( search_urls ):
        
        if url.endswith( '/' ):
            
            search_urls.add( url[:-1] )
            
        else:
            
            search_urls.add( url + '/' )
            
        
    
    return search_urls
    
|
|
|
|
|
|
def LooksLikeAFullURL( text: str ) -> bool:
    
    # True if the text parses as a URL with both a scheme and a netloc.
    
    try:
        
        p = ParseURL( text )
        
    except Exception:
        
        # narrowed from a bare except (which also swallowed KeyboardInterrupt/SystemExit);
        # ParseURL raises URLClassException on a bad parse, but we stay safe against anything else too
        return False
        
    
    return p.scheme != '' and p.netloc != ''
    
|
|
|
|
|
|
|
|
def NormaliseAndFilterAssociableURLs( urls ):
    
    # Normalise the given URLs and keep only those that should be associated with files.
    
    domain_manager = CG.client_controller.network_engine.domain_manager
    
    normalised_urls = set()
    
    for url in urls:
        
        try:
            
            normalised_urls.add( domain_manager.NormaliseURL( url ) )
            
        except HydrusExceptions.URLClassException:
            
            # not a url--something like "file:///C:/Users/Tall%20Man/Downloads/maxresdefault.jpg" ha ha ha
            continue
            
        
    
    return { url for url in normalised_urls if domain_manager.ShouldAssociateURLWithFiles( url ) }
    
|
|
|
|
|
|
def ParseURL( url: str ) -> urllib.parse.ParseResult:
    
    # Strip, unicode-normalise, and urlparse the URL, converting any parse failure into our
    # own URLClassException.
    
    washed_url = UnicodeNormaliseURL( url.strip() )
    
    try:
        
        return urllib.parse.urlparse( washed_url )
        
    except Exception as e:
        
        raise HydrusExceptions.URLClassException( str( e ) )
        
    
|
|
|
|
|
|
|
|
def WashURL( url: str, keep_fragment = True ) -> str:
    
    # Normalise a full URL's encoding: decompose it, ensure the path/query are percent-encoded,
    # and reassemble. Best-effort--anything that does not look like or parse as a full URL is
    # returned unchanged.
    
    if not LooksLikeAFullURL( url ):
        
        return url
        
    
    try:
        
        p = ParseURL( url )
        
        scheme = p.scheme
        
        netloc = p.netloc
        
        params = p.params # just so you know, this is ancient web semicolon tech, can be ignored
        
        fragment = p.fragment
        
        path_components = ConvertPathTextToList( p.path )
        
        # NOTE(review): param_order is captured but never passed to ConvertQueryDictToText, so
        # the washed query comes out with sorted params--presumably intentional normalisation, but confirm
        ( query_dict, single_value_parameters, param_order ) = ConvertQueryTextToDict( p.query )
        
        ( path_components, query_dict, single_value_parameters ) = EnsureURLInfoIsEncoded( path_components, query_dict, single_value_parameters )
        
        path = '/' + '/'.join( path_components )
        
        query = ConvertQueryDictToText( query_dict, single_value_parameters )
        
        if not keep_fragment:
            
            fragment = ''
            
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        clean_url = r.geturl()
        
        return clean_url
        
    except Exception:
        
        # narrowed from a bare except; we deliberately swallow parse failures and hand back the original url
        return url
        
    
|
|
|
|
|
|
|
|
# characters that must not appear in a netloc--urllib.parse.urlparse throws ValueError on them
OH_NO_NO_NETLOC_CHARACTERS = '?#'

# str.translate table mapping each forbidden character's ordinal to an underscore
OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE = { ord( char ) : '_' for char in OH_NO_NO_NETLOC_CHARACTERS }
|
|
|
|
def RemoveWWWFromDomain( domain ):
    
    # Strip a leading www-style component (www., www2., etc.), but only when a dotted domain
    # remains afterwards, so 'www.com' is left alone.
    
    if domain.startswith( 'www' ) and domain.count( '.' ) > 1:
        
        # drop the leftmost component, www.blah.com -> blah.com
        domain = '.'.join( domain.split( '.' )[1:] )
        
    
    return domain
    
|
|
|
|
|
|
def UnicodeNormaliseURL( url: str ):
    
    # The netloc, blah.com, cannot have certain unicode characters that look like others, or
    # double ( e + accent ) characters that can be one accented-e, so we NFKC-normalise it.
    # urllib's urlparse also throws ValueError on '?'/'#' in a netloc, so we switch those out.
    
    if url.startswith( 'file:' ):
        
        return url
        
    
    scheme_splitter = '://'
    
    if scheme_splitter not in url:
        
        # no scheme--nothing for us to do here
        return url
        
    
    ( scheme, netloc_and_path_and_rest ) = url.split( scheme_splitter, 1 )
    
    if '/' in netloc_and_path_and_rest:
        
        ( netloc, path_and_rest ) = netloc_and_path_and_rest.split( '/', 1 )
        
    else:
        
        ( netloc, path_and_rest ) = ( netloc_and_path_and_rest, None )
        
    
    netloc = unicodedata.normalize( 'NFKC', netloc )
    
    netloc = netloc.translate( OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE )
    
    scheme_and_netloc = scheme + scheme_splitter + netloc
    
    if path_and_rest is None:
        
        return scheme_and_netloc
        
    
    return scheme_and_netloc + '/' + path_and_rest
    
|