diff --git a/hydrus/client/importing/ClientImportFileSeeds.py b/hydrus/client/importing/ClientImportFileSeeds.py index fa0d5be0..576d5b57 100644 --- a/hydrus/client/importing/ClientImportFileSeeds.py +++ b/hydrus/client/importing/ClientImportFileSeeds.py @@ -33,7 +33,6 @@ from hydrus.client.importing.options import TagImportOptions from hydrus.client.media import ClientMediaManagers from hydrus.client.metadata import ClientTags from hydrus.client.networking import ClientNetworkingFunctions -from hydrus.client.networking import ClientNetworkingJobs FILE_SEED_TYPE_HDD = 0 FILE_SEED_TYPE_URL = 1 @@ -642,7 +641,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ): url_to_fetch = HG.client_controller.network_engine.domain_manager.GetURLToFetch( file_url ) - network_job: ClientNetworkingJobs.NetworkJob = network_job_factory( 'GET', url_to_fetch, temp_path = temp_path, referral_url = referral_url ) + network_job = network_job_factory( 'GET', url_to_fetch, temp_path = temp_path, referral_url = referral_url ) for ( key, value ) in self._request_headers.items(): @@ -668,15 +667,13 @@ class FileSeed( HydrusSerialisable.SerialisableBase ): self._AddPrimaryURLs( ( url_to_fetch, ) ) - #actual_fetched_url = network_job.GetActualFetchedURL() + actual_fetched_url = network_job.GetActualFetchedURL() - redirected_url = network_job.GetRedirectedUrl() - - if redirected_url is not None: + if actual_fetched_url not in ( file_url, url_to_fetch ): - self._AddPrimaryURLs( ( redirected_url, ) ) + self._AddPrimaryURLs( ( actual_fetched_url, ) ) - ( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url ) + ( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url ) if actual_url_type == HC.URL_TYPE_POST and actual_can_parse: @@ -684,7 +681,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ): if file_seed_cache is None: - raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, redirected_url ) ) + raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, actual_fetched_url ) ) else: @@ -692,10 +689,10 @@ class FileSeed( HydrusSerialisable.SerialisableBase ): if original_url_type == actual_url_type and original_match_name == actual_match_name: - raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, redirected_url ) ) + raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, actual_fetched_url ) ) - file_seed = FileSeed( FILE_SEED_TYPE_URL, redirected_url ) + file_seed = FileSeed( FILE_SEED_TYPE_URL, actual_fetched_url ) file_seed.SetReferralURL( file_url ) @@ -1383,19 +1380,19 @@ class FileSeed( HydrusSerialisable.SerialisableBase ): parsing_text = network_job.GetContentText() - redirected_url = network_job.GetRedirectedUrl() + actual_fetched_url = network_job.GetActualFetchedURL() - if redirected_url is not None: + if actual_fetched_url != url_to_check: # we have redirected, a 3XX response - ( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url ) + ( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url ) if actual_url_type == HC.URL_TYPE_POST and actual_can_parse: - self._AddPrimaryURLs( ( redirected_url, ) ) + self._AddPrimaryURLs( ( actual_fetched_url, ) ) - post_url = redirected_url + post_url = actual_fetched_url url_for_child_referral = post_url diff --git a/hydrus/client/importing/ClientImportGallerySeeds.py b/hydrus/client/importing/ClientImportGallerySeeds.py index 8900fe94..d216569f 100644 --- a/hydrus/client/importing/ClientImportGallerySeeds.py +++ b/hydrus/client/importing/ClientImportGallerySeeds.py @@ -447,19 +447,19 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ): parsing_text = network_job.GetContentText() - redirected_url = network_job.GetRedirectedUrl() + actual_fetched_url = network_job.GetActualFetchedURL() do_parse = True - if redirected_url is not None: + if actual_fetched_url != url_to_check: - ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url ) + ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url ) if url_type == HC.URL_TYPE_GALLERY: if can_parse: - gallery_url = redirected_url + gallery_url = actual_fetched_url url_for_child_referral = gallery_url @@ -480,7 +480,7 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ): from hydrus.client.importing import ClientImportFileSeeds - file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, redirected_url ) + file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url ) file_seed.SetReferralURL( url_for_child_referral ) diff --git a/hydrus/client/networking/ClientNetworkingJobs.py b/hydrus/client/networking/ClientNetworkingJobs.py index 0e49d338..0c9998cc 100644 --- a/hydrus/client/networking/ClientNetworkingJobs.py +++ b/hydrus/client/networking/ClientNetworkingJobs.py @@ -180,8 +180,6 @@ class NetworkJob( object ): self._actual_fetched_url = self._url self._temp_path = temp_path - self._redirected_url = None - self._response_server_header = None self._response_last_modified = None @@ -787,11 +785,11 @@ class NetworkJob( object ): snc = self._session_network_context - session: requests.Session = self.engine.session_manager.GetSession( snc ) + session = self.engine.session_manager.GetSession( snc ) ( connect_timeout, read_timeout ) = self._GetTimeouts() - response = session.request( method, url, data = data, files = files, headers = headers, stream = True, timeout = ( connect_timeout, read_timeout ), allow_redirects = False ) + response = session.request( method, url, data = data, files = files, headers = headers, stream = True, timeout = ( connect_timeout, read_timeout ) ) with self._lock: @@ -1223,14 +1221,6 @@ class NetworkJob( object ): - def GetRedirectedUrl( self ): - - with self._lock: - - return self._redirected_url - - - def GetContentBytes( self ): with self._lock: @@ -1322,7 +1312,7 @@ class NetworkJob( object ): - def GetSession( self ) -> requests.Session: + def GetSession( self ): with self._lock: @@ -1564,21 +1554,20 @@ class NetworkJob( object ): response = self._SendRequestAndGetResponse() - if response.is_redirect: + # I think tbh I would rather tell requests not to do 3XX, which is possible with allow_redirects = False on request, and then just raise various 3XX exceptions with url info, so I can requeue easier and keep a record + # figuring out correct new url seems a laugh, requests has slight helpers, but lots of exceptions + # SessionRedirectMixin here https://requests.readthedocs.io/en/latest/_modules/requests/sessions/ + # but this will do as a patch for now + self._actual_fetched_url = response.url + + if self._actual_fetched_url != self._url and HG.network_report_mode: - session = self.GetSession() - - self._redirected_url = session.get_redirect_target( response ) - - if HG.network_report_mode: - - HydrusData.ShowText( 'Network Jobs Redirect: {} -> {}'.format( self._url, self._redirected_url ) ) - + HydrusData.ShowText( 'Network Jobs Redirect: {} -> {}'.format( self._url, self._actual_fetched_url ) ) - elif response.ok: - - self._ParseFirstResponseHeaders( response ) + self._ParseFirstResponseHeaders( response ) + + if response.ok: with self._lock: @@ -1643,8 +1632,6 @@ class NetworkJob( object ): else: - self._ParseFirstResponseHeaders( response ) - with self._lock: self._status_text = str( response.status_code ) + ' - ' + str( response.reason )