Better handling of redirects (#1493)

Flatten the logic a bit

Co-authored-by: Paul Friederichsen <floogulinc@gmail.com>
This commit is contained in:
Hydrus Network Developer 2023-12-16 13:16:06 -06:00 committed by GitHub
parent e7c66e5626
commit 5fee29dbd7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 48 additions and 32 deletions

View File

@ -33,6 +33,7 @@ from hydrus.client.importing.options import TagImportOptions
from hydrus.client.media import ClientMediaManagers
from hydrus.client.metadata import ClientTags
from hydrus.client.networking import ClientNetworkingFunctions
from hydrus.client.networking import ClientNetworkingJobs
FILE_SEED_TYPE_HDD = 0
FILE_SEED_TYPE_URL = 1
@ -641,7 +642,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
url_to_fetch = HG.client_controller.network_engine.domain_manager.GetURLToFetch( file_url )
network_job = network_job_factory( 'GET', url_to_fetch, temp_path = temp_path, referral_url = referral_url )
network_job: ClientNetworkingJobs.NetworkJob = network_job_factory( 'GET', url_to_fetch, temp_path = temp_path, referral_url = referral_url )
for ( key, value ) in self._request_headers.items():
@ -667,13 +668,15 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
self._AddPrimaryURLs( ( url_to_fetch, ) )
actual_fetched_url = network_job.GetActualFetchedURL()
#actual_fetched_url = network_job.GetActualFetchedURL()
if actual_fetched_url not in ( file_url, url_to_fetch ):
redirected_url = network_job.GetRedirectedUrl()
if redirected_url is not None:
self._AddPrimaryURLs( ( actual_fetched_url, ) )
self._AddPrimaryURLs( ( redirected_url, ) )
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
if actual_url_type == HC.URL_TYPE_POST and actual_can_parse:
@ -681,7 +684,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
if file_seed_cache is None:
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, actual_fetched_url ) )
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, redirected_url ) )
else:
@ -689,10 +692,10 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
if original_url_type == actual_url_type and original_match_name == actual_match_name:
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, actual_fetched_url ) )
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, redirected_url ) )
file_seed = FileSeed( FILE_SEED_TYPE_URL, actual_fetched_url )
file_seed = FileSeed( FILE_SEED_TYPE_URL, redirected_url )
file_seed.SetReferralURL( file_url )
@ -1380,19 +1383,19 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
parsing_text = network_job.GetContentText()
actual_fetched_url = network_job.GetActualFetchedURL()
redirected_url = network_job.GetRedirectedUrl()
if actual_fetched_url != url_to_check:
if redirected_url is not None:
# we have redirected, a 3XX response
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
if actual_url_type == HC.URL_TYPE_POST and actual_can_parse:
self._AddPrimaryURLs( ( actual_fetched_url, ) )
self._AddPrimaryURLs( ( redirected_url, ) )
post_url = actual_fetched_url
post_url = redirected_url
url_for_child_referral = post_url

View File

@ -447,19 +447,19 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
parsing_text = network_job.GetContentText()
actual_fetched_url = network_job.GetActualFetchedURL()
redirected_url = network_job.GetRedirectedUrl()
do_parse = True
if actual_fetched_url != url_to_check:
if redirected_url is not None:
( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
if url_type == HC.URL_TYPE_GALLERY:
if can_parse:
gallery_url = actual_fetched_url
gallery_url = redirected_url
url_for_child_referral = gallery_url
@ -480,7 +480,7 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
from hydrus.client.importing import ClientImportFileSeeds
file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, redirected_url )
file_seed.SetReferralURL( url_for_child_referral )

View File

@ -180,6 +180,8 @@ class NetworkJob( object ):
self._actual_fetched_url = self._url
self._temp_path = temp_path
self._redirected_url = None
self._response_server_header = None
self._response_last_modified = None
@ -785,11 +787,11 @@ class NetworkJob( object ):
snc = self._session_network_context
session = self.engine.session_manager.GetSession( snc )
session: requests.Session = self.engine.session_manager.GetSession( snc )
( connect_timeout, read_timeout ) = self._GetTimeouts()
response = session.request( method, url, data = data, files = files, headers = headers, stream = True, timeout = ( connect_timeout, read_timeout ) )
response = session.request( method, url, data = data, files = files, headers = headers, stream = True, timeout = ( connect_timeout, read_timeout ), allow_redirects = False )
with self._lock:
@ -1221,6 +1223,14 @@ class NetworkJob( object ):
# Returns the redirect target stored on this job in self._redirected_url.
# The attribute is initialised to None and is only assigned when a response reports is_redirect,
# so callers treat a None result as "no redirect happened".
# The read is done while holding self._lock.
def GetRedirectedUrl( self ):
with self._lock:
return self._redirected_url
def GetContentBytes( self ):
with self._lock:
@ -1312,7 +1322,7 @@ class NetworkJob( object ):
def GetSession( self ):
def GetSession( self ) -> requests.Session:
with self._lock:
@ -1554,20 +1564,21 @@ class NetworkJob( object ):
response = self._SendRequestAndGetResponse()
# I think tbh I would rather tell requests not to do 3XX, which is possible with allow_redirects = False on request, and then just raise various 3XX exceptions with url info, so I can requeue easier and keep a record
# figuring out correct new url seems a laugh, requests has slight helpers, but lots of exceptions
# SessionRedirectMixin here https://requests.readthedocs.io/en/latest/_modules/requests/sessions/
# but this will do as a patch for now
self._actual_fetched_url = response.url
if self._actual_fetched_url != self._url and HG.network_report_mode:
if response.is_redirect:
HydrusData.ShowText( 'Network Jobs Redirect: {} -> {}'.format( self._url, self._actual_fetched_url ) )
session = self.GetSession()
self._redirected_url = session.get_redirect_target( response )
if HG.network_report_mode:
HydrusData.ShowText( 'Network Jobs Redirect: {} -> {}'.format( self._url, self._redirected_url ) )
self._ParseFirstResponseHeaders( response )
if response.ok:
elif response.ok:
self._ParseFirstResponseHeaders( response )
with self._lock:
@ -1632,6 +1643,8 @@ class NetworkJob( object ):
else:
self._ParseFirstResponseHeaders( response )
with self._lock:
self._status_text = str( response.status_code ) + ' - ' + str( response.reason )