import http.cookiejar
import re
import unicodedata
import urllib.parse

from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusExceptions

def AddCookieToSession( session, name, value, domain, path, expires, secure = False, rest = None ):
    
    version = 0
    port = None
    port_specified = False
    domain_specified = True
    domain_initial_dot = domain.startswith( '.' )
    path_specified = True
    discard = False
    comment = None
    comment_url = None
    
    if rest is None:
        rest = {}
    
    cookie = http.cookiejar.Cookie( version, name, value, port, port_specified, domain, domain_specified, domain_initial_dot, path, path_specified, secure, expires, discard, comment, comment_url, rest )
    
    session.cookies.set_cookie( cookie )

def ConvertDomainIntoAllApplicableDomains( domain, discard_www = True ):
    
    # is an ip address or localhost, possibly with a port
    if '.' not in domain or re.search( r'^[\d.:]+$', domain ) is not None:
        return [ domain ]
    
    domains = []
    
    if discard_www:
        domain = RemoveWWWFromDomain( domain )
    
    while domain.count( '.' ) > 0:
        
        domains.append( domain )
        
        domain = ConvertDomainIntoNextLevelDomain( domain )
    
    return domains

def ConvertDomainIntoNextLevelDomain( domain ):
    
    return '.'.join( domain.split( '.' )[1:] ) # i.e. strip off the leftmost subdomain, maps.google.com -> google.com

def ConvertDomainIntoSecondLevelDomain( domain ):
    
    domains = ConvertDomainIntoAllApplicableDomains( domain )
    
    if len( domains ) == 0:
        raise HydrusExceptions.URLClassException( 'That url or domain did not seem to be valid!' )
    
    return domains[-1]

def ConvertHTTPSToHTTP( url ):
    
    if url.startswith( 'http://' ):
        return url
    elif url.startswith( 'https://' ):
        
        http_url = 'http://' + url[8:]
        
        return http_url
        
    else:
        raise Exception( 'Given a url that did not have a scheme!' )

def ConvertHTTPToHTTPS( url ):
    
    if url.startswith( 'https://' ):
        return url
    elif url.startswith( 'http://' ):
        
        https_url = 'https://' + url[7:]
        
        return https_url
        
    else:
        raise Exception( 'Given a url that did not have a scheme!' )

def ConvertQueryDictToText( query_dict, single_value_parameters, param_order = None ):
    
    # we now do everything with requests, which does all the unicode -> %20 business naturally, phew
    # we still want to call str explicitly to coerce integers and so on that'll slip in here and there
    
    if param_order is None:
        
        param_order = sorted( query_dict.keys() )
        
        single_value_parameters = list( single_value_parameters )
        single_value_parameters.sort()
        
        for i in range( len( single_value_parameters ) ):
            param_order.append( None )
    
    params = []
    
    single_value_parameter_index = 0
    
    for key in param_order:
        
        if key is None:
            
            try:
                params.append( single_value_parameters[ single_value_parameter_index ] )
            except IndexError:
                continue
            
            single_value_parameter_index += 1
            
        else:
            
            if key in query_dict:
                params.append( '{}={}'.format( key, query_dict[ key ] ) )
    
    query_text = '&'.join( params )
    
    return query_text
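# for illustration, a couple of worked examples of the above, with made-up params:
#
#   ConvertQueryDictToText( { 'tags' : 'blue_eyes', 'page' : '1' }, [] )
#     -> 'page=1&tags=blue_eyes' ( no param_order given, so keys are sorted )
#
#   ConvertQueryDictToText( { 'page' : '1' }, [ 'posts' ], param_order = [ None, 'page' ] )
#     -> 'posts&page=1' ( None slots are filled from the single value parameters, in order )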
def ConvertQueryTextToDict( query_text ):
    
    # we generally do not want quote characters, %20 stuff, in our urls. we would prefer properly formatted unicode
    # so, let's replace all keys and values with unquoted versions
    # -but-
    # we only replace if it is a completely reversible operation!
    # odd situations like '6+girls+skirt', which comes here encoded as '6%2Bgirls+skirt', shouldn't turn into '6+girls+skirt'
    # so if there is a mix of encoded and non-encoded, we won't touch it here m8
    
    # except these chars, which screw with GET arg syntax when unquoted
    bad_chars = [ '&', '=', '/', '?', '#', ';', '+' ]
    
    param_order = []
    
    query_dict = {}
    single_value_parameters = []
    
    pairs = query_text.split( '&' )
    
    for pair in pairs:
        
        result = pair.split( '=', 1 )
        
        # for the moment, ignore tracker bugs and so on that have only key and no value
        
        if len( result ) == 1:
            
            ( value, ) = result
            
            if value == '':
                continue
            
            try:
                
                unquoted_value = urllib.parse.unquote( value )
                
                if True not in ( bad_char in unquoted_value for bad_char in bad_chars ):
                    
                    requoted_value = urllib.parse.quote( unquoted_value )
                    
                    if requoted_value == value:
                        value = unquoted_value
                
            except:
                pass
            
            single_value_parameters.append( value )
            param_order.append( None )
            
        elif len( result ) == 2:
            
            ( key, value ) = result
            
            try:
                
                unquoted_key = urllib.parse.unquote( key )
                
                if True not in ( bad_char in unquoted_key for bad_char in bad_chars ):
                    
                    requoted_key = urllib.parse.quote( unquoted_key )
                    
                    if requoted_key == key:
                        key = unquoted_key
                
            except:
                pass
            
            try:
                
                unquoted_value = urllib.parse.unquote( value )
                
                if True not in ( bad_char in unquoted_value for bad_char in bad_chars ):
                    
                    requoted_value = urllib.parse.quote( unquoted_value )
                    
                    if requoted_value == value:
                        value = unquoted_value
                
            except:
                pass
            
            param_order.append( key )
            
            query_dict[ key ] = value
    
    return ( query_dict, single_value_parameters, param_order )

def ConvertURLIntoDomain( url ):
    
    parser_result = ParseURL( url )
    
    if parser_result.scheme == '':
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--did you forget the http:// or https://?' )
    
    if parser_result.netloc == '':
        raise HydrusExceptions.URLClassException( 'URL "' + url + '" was not recognised--is it missing a domain?' )
    
    domain = parser_result.netloc
    
    return domain

def ConvertURLIntoSecondLevelDomain( url ):
    
    domain = ConvertURLIntoDomain( url )
    
    return ConvertDomainIntoSecondLevelDomain( domain )

def CookieDomainMatches( cookie, search_domain ):
    
    cookie_domain = cookie.domain
    
    # blah.com is viewable by blah.com
    matches_exactly = cookie_domain == search_domain
    
    # .blah.com is viewable by blah.com
    matches_dot = cookie_domain == '.' + search_domain
    
    # .blah.com applies to subdomain.blah.com, blah.com does not
    valid_subdomain = cookie_domain.startswith( '.' ) and search_domain.endswith( cookie_domain )
    
    return matches_exactly or matches_dot or valid_subdomain

def DomainEqualsAnotherForgivingWWW( test_domain, wwwable_domain ):
    
    # domain is either the same or starts with www. or www2. or something
    rule = r'^(www[^\.]*\.)?' + re.escape( wwwable_domain ) + '$'
    
    return re.search( rule, test_domain ) is not None

def GetCookie( cookies, search_domain, cookie_name_string_match ):
    
    for cookie in cookies:
        
        if CookieDomainMatches( cookie, search_domain ) and cookie_name_string_match.Matches( cookie.name ):
            return cookie
    
    raise HydrusExceptions.DataMissing( 'Cookie "' + cookie_name_string_match.ToString() + '" not found for domain ' + search_domain + '!' )
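# for illustration, how the cookie domain matching above is expected to behave, with a made-up domain:
#
#   cookie domain 'example.com'  matches search domain 'example.com' only
#   cookie domain '.example.com' matches search domain 'example.com' and subdomains like 'sub.example.com'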
def GetSearchURLs( url ):
    
    search_urls = set()
    
    search_urls.add( url )
    
    try:
        
        normalised_url = HG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        
        search_urls.add( normalised_url )
        
    except HydrusExceptions.URLClassException:
        pass
    
    for url in list( search_urls ):
        
        if url.startswith( 'http://' ):
            search_urls.add( ConvertHTTPToHTTPS( url ) )
        elif url.startswith( 'https://' ):
            search_urls.add( ConvertHTTPSToHTTP( url ) )
    
    for url in list( search_urls ):
        
        p = ParseURL( url )
        
        scheme = p.scheme
        netloc = p.netloc
        path = p.path
        params = ''
        query = p.query
        fragment = p.fragment
        
        if netloc.startswith( 'www' ):
            
            try:
                netloc = ConvertDomainIntoSecondLevelDomain( netloc )
            except HydrusExceptions.URLClassException:
                continue
            
        else:
            netloc = 'www.' + netloc
        
        r = urllib.parse.ParseResult( scheme, netloc, path, params, query, fragment )
        
        search_urls.add( r.geturl() )
    
    for url in list( search_urls ):
        
        if url.endswith( '/' ):
            search_urls.add( url[:-1] )
        else:
            search_urls.add( url + '/' )
    
    return search_urls

def NormaliseAndFilterAssociableURLs( urls ):
    
    normalised_urls = set()
    
    for url in urls:
        
        try:
            url = HG.client_controller.network_engine.domain_manager.NormaliseURL( url )
        except HydrusExceptions.URLClassException:
            continue # not a url--something like "file:///C:/Users/Tall%20Man/Downloads/maxresdefault.jpg" ha ha ha
        
        normalised_urls.add( url )
    
    associable_urls = { url for url in normalised_urls if HG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
    
    return associable_urls

def ParseURL( url: str ) -> urllib.parse.ParseResult:
    
    url = url.strip()
    
    url = UnicodeNormaliseURL( url )
    
    return urllib.parse.urlparse( url )

OH_NO_NO_NETLOC_CHARACTERS = '?#'
OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE = { ord( char ) : '_' for char in OH_NO_NO_NETLOC_CHARACTERS }

def RemoveWWWFromDomain( domain ):
    
    if domain.count( '.' ) > 1 and domain.startswith( 'www' ):
        domain = ConvertDomainIntoNextLevelDomain( domain )
    
    return domain

def UnicodeNormaliseURL( url: str ):
    
    if url.startswith( 'file:' ):
        return url
    
    # the issue is that netloc, blah.com, cannot have certain unicode characters that look like others, or double ( e + accent ) characters that can be one accented-e, so we normalise
    # urllib.parse.urlparse throws a ValueError if these are in, so let's switch them out
    
    scheme_splitter = '://'
    netloc_splitter = '/'
    
    if scheme_splitter in url:
        
        ( scheme, netloc_and_path_and_rest ) = url.split( scheme_splitter, 1 )
        
        if netloc_splitter in netloc_and_path_and_rest:
            ( netloc, path_and_rest ) = netloc_and_path_and_rest.split( netloc_splitter, 1 )
        else:
            
            netloc = netloc_and_path_and_rest
            path_and_rest = None
        
        netloc = unicodedata.normalize( 'NFKC', netloc )
        
        netloc = netloc.translate( OH_NO_NO_NETLOC_CHARACTERS_UNICODE_TRANSLATE )
        
        scheme_and_netloc = scheme_splitter.join( ( scheme, netloc ) )
        
        if path_and_rest is None:
            url = scheme_and_netloc
        else:
            url = netloc_splitter.join( ( scheme_and_netloc, path_and_rest ) )
    
    return url
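# a minimal sketch for poking at the pure helpers above from the command line, assuming the hydrus
# package is importable; the urls and domains here are made up
if __name__ == '__main__':
    
    print( ConvertDomainIntoAllApplicableDomains( 'maps.google.com' ) ) # [ 'maps.google.com', 'google.com' ]
    print( ConvertDomainIntoSecondLevelDomain( 'maps.google.com' ) ) # 'google.com'
    print( ConvertQueryTextToDict( 'page=1&tags=blue%20eyes' ) ) # ( { 'page' : '1', 'tags' : 'blue eyes' }, [], [ 'page', 'tags' ] )
    print( UnicodeNormaliseURL( 'https://ｅｘａｍｐｌｅ.com/123' ) ) # fullwidth characters in the netloc come out as plain ascii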