Revert "Better handling of redirects (#1493)"

This reverts commit 5fee29dbd7.
This commit is contained in:
Hydrus Network Developer 2023-12-18 16:53:45 -06:00
parent 5fee29dbd7
commit a825fce02e
No known key found for this signature in database
GPG Key ID: 76249F053212133C
3 changed files with 32 additions and 48 deletions

View File

@ -33,7 +33,6 @@ from hydrus.client.importing.options import TagImportOptions
from hydrus.client.media import ClientMediaManagers
from hydrus.client.metadata import ClientTags
from hydrus.client.networking import ClientNetworkingFunctions
from hydrus.client.networking import ClientNetworkingJobs
FILE_SEED_TYPE_HDD = 0
FILE_SEED_TYPE_URL = 1
@ -642,7 +641,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
url_to_fetch = HG.client_controller.network_engine.domain_manager.GetURLToFetch( file_url )
network_job: ClientNetworkingJobs.NetworkJob = network_job_factory( 'GET', url_to_fetch, temp_path = temp_path, referral_url = referral_url )
network_job = network_job_factory( 'GET', url_to_fetch, temp_path = temp_path, referral_url = referral_url )
for ( key, value ) in self._request_headers.items():
@ -668,15 +667,13 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
self._AddPrimaryURLs( ( url_to_fetch, ) )
#actual_fetched_url = network_job.GetActualFetchedURL()
actual_fetched_url = network_job.GetActualFetchedURL()
redirected_url = network_job.GetRedirectedUrl()
if redirected_url is not None:
if actual_fetched_url not in ( file_url, url_to_fetch ):
self._AddPrimaryURLs( ( redirected_url, ) )
self._AddPrimaryURLs( ( actual_fetched_url, ) )
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
if actual_url_type == HC.URL_TYPE_POST and actual_can_parse:
@ -684,7 +681,7 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
if file_seed_cache is None:
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, redirected_url ) )
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, actual_fetched_url ) )
else:
@ -692,10 +689,10 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
if original_url_type == actual_url_type and original_match_name == actual_match_name:
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, redirected_url ) )
raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, actual_fetched_url ) )
file_seed = FileSeed( FILE_SEED_TYPE_URL, redirected_url )
file_seed = FileSeed( FILE_SEED_TYPE_URL, actual_fetched_url )
file_seed.SetReferralURL( file_url )
@ -1383,19 +1380,19 @@ class FileSeed( HydrusSerialisable.SerialisableBase ):
parsing_text = network_job.GetContentText()
redirected_url = network_job.GetRedirectedUrl()
actual_fetched_url = network_job.GetActualFetchedURL()
if redirected_url is not None:
if actual_fetched_url != url_to_check:
# we have redirected, a 3XX response
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
if actual_url_type == HC.URL_TYPE_POST and actual_can_parse:
self._AddPrimaryURLs( ( redirected_url, ) )
self._AddPrimaryURLs( ( actual_fetched_url, ) )
post_url = redirected_url
post_url = actual_fetched_url
url_for_child_referral = post_url

View File

@ -447,19 +447,19 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
parsing_text = network_job.GetContentText()
redirected_url = network_job.GetRedirectedUrl()
actual_fetched_url = network_job.GetActualFetchedURL()
do_parse = True
if redirected_url is not None:
if actual_fetched_url != url_to_check:
( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( redirected_url )
( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
if url_type == HC.URL_TYPE_GALLERY:
if can_parse:
gallery_url = redirected_url
gallery_url = actual_fetched_url
url_for_child_referral = gallery_url
@ -480,7 +480,7 @@ class GallerySeed( HydrusSerialisable.SerialisableBase ):
from hydrus.client.importing import ClientImportFileSeeds
file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, redirected_url )
file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
file_seed.SetReferralURL( url_for_child_referral )

View File

@ -180,8 +180,6 @@ class NetworkJob( object ):
self._actual_fetched_url = self._url
self._temp_path = temp_path
self._redirected_url = None
self._response_server_header = None
self._response_last_modified = None
@ -787,11 +785,11 @@ class NetworkJob( object ):
snc = self._session_network_context
session: requests.Session = self.engine.session_manager.GetSession( snc )
session = self.engine.session_manager.GetSession( snc )
( connect_timeout, read_timeout ) = self._GetTimeouts()
response = session.request( method, url, data = data, files = files, headers = headers, stream = True, timeout = ( connect_timeout, read_timeout ), allow_redirects = False )
response = session.request( method, url, data = data, files = files, headers = headers, stream = True, timeout = ( connect_timeout, read_timeout ) )
with self._lock:
@ -1223,14 +1221,6 @@ class NetworkJob( object ):
def GetRedirectedUrl( self ):
with self._lock:
return self._redirected_url
def GetContentBytes( self ):
with self._lock:
@ -1322,7 +1312,7 @@ class NetworkJob( object ):
def GetSession( self ) -> requests.Session:
def GetSession( self ):
with self._lock:
@ -1564,21 +1554,20 @@ class NetworkJob( object ):
response = self._SendRequestAndGetResponse()
if response.is_redirect:
# I think tbh I would rather tell requests not to do 3XX, which is possible with allow_redirects = False on request, and then just raise various 3XX exceptions with url info, so I can requeue easier and keep a record
# figuring out correct new url seems a laugh, requests has slight helpers, but lots of exceptions
# SessionRedirectMixin here https://requests.readthedocs.io/en/latest/_modules/requests/sessions/
# but this will do as a patch for now
self._actual_fetched_url = response.url
if self._actual_fetched_url != self._url and HG.network_report_mode:
session = self.GetSession()
self._redirected_url = session.get_redirect_target( response )
if HG.network_report_mode:
HydrusData.ShowText( 'Network Jobs Redirect: {} -> {}'.format( self._url, self._redirected_url ) )
HydrusData.ShowText( 'Network Jobs Redirect: {} -> {}'.format( self._url, self._actual_fetched_url ) )
elif response.ok:
self._ParseFirstResponseHeaders( response )
self._ParseFirstResponseHeaders( response )
if response.ok:
with self._lock:
@ -1643,8 +1632,6 @@ class NetworkJob( object ):
else:
self._ParseFirstResponseHeaders( response )
with self._lock:
self._status_text = str( response.status_code ) + ' - ' + str( response.reason )