hydrus/include/ClientImporting.py

369 lines
11 KiB
Python
Raw Normal View History

2019-01-09 22:59:03 +00:00
from . import ClientConstants as CC
from . import ClientData
from . import ClientDefaults
from . import ClientDownloading
from . import ClientFiles
from . import ClientImportOptions
from . import ClientImportFileSeeds
from . import ClientImportGallerySeeds
from . import ClientNetworkingContexts
from . import ClientNetworkingJobs
from . import ClientParsing
from . import ClientPaths
from . import ClientThreading
from . import HydrusConstants as HC
from . import HydrusData
from . import HydrusExceptions
from . import HydrusFileHandling
from . import HydrusGlobals as HG
from . import HydrusPaths
from . import HydrusSerialisable
from . import HydrusThreading
2015-07-22 19:40:39 +00:00
import os
2015-09-09 22:04:39 +00:00
import random
2015-06-03 21:05:13 +00:00
import threading
2015-07-22 19:40:39 +00:00
import time
2015-06-03 21:05:13 +00:00
import traceback
2015-07-22 19:40:39 +00:00
import wx
2015-06-03 21:05:13 +00:00
2017-11-08 22:07:12 +00:00
# Status values for a watcher/checker's liveness. CHECKER_STATUS_404 presumably
# marks a watched page that now returns HTTP 404 -- confirm against checker code.
CHECKER_STATUS_OK = 0
CHECKER_STATUS_DEAD = 1
CHECKER_STATUS_404 = 2

# Minimum sleep (seconds) after an import step that did substantial file work,
# to keep import loops from hammering the client.
DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME = 0.1

# Typical period (seconds) for the repeating maintenance jobs in this module.
REPEATING_JOB_TYPICAL_PERIOD = 30.0
2018-05-16 20:09:50 +00:00
2018-10-03 21:00:15 +00:00
def ConvertAllParseResultsToFileSeeds( all_parse_results, source_url, file_import_options ):
    """Turn a list of parse results into URL FileSeeds, deduplicated across the batch.
    
    For each parse_results, the 'desired' top-priority URLs are extracted, deduped
    within themselves and against URLs already seen in earlier parse_results, and
    each surviving URL becomes a FileSeed carrying source_url as its referral URL.
    
    Returns the list of new FileSeeds in discovery order.
    """
    
    result_seeds = []
    urls_already_handled = set()
    
    for parse_results in all_parse_results:
        
        desired_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
        
        desired_urls = HydrusData.DedupeList( desired_urls )
        
        fresh_urls = [ url for url in desired_urls if url not in urls_already_handled ]
        
        urls_already_handled.update( fresh_urls )
        
        # seeds must be built inside this loop: each url's tags/metadata come from
        # its own parse_results, so hoisting this out would attach the wrong tags
        for url in fresh_urls:
            
            new_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
            
            new_seed.SetReferralURL( source_url )
            
            new_seed.AddParseResults( parse_results, file_import_options )
            
            result_seeds.append( new_seed )
            
        
    
    return result_seeds
2018-05-09 20:23:00 +00:00
def GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key ):
    """Return a factory producing presentation contexts for a multi-job popup.
    
    Each context publishes its network job to job_key's 'popup_network_job'
    variable on entry. On exit it sets the variable to None rather than deleting
    it (cf. the 'single' factory below) -- presumably so the popup slot persists
    between consecutive jobs; confirm against the popup UI code.
    """
    
    def factory( network_job ):
        
        def on_enter():
            
            job_key.SetVariable( 'popup_network_job', network_job )
            
        
        def on_exit():
            
            job_key.SetVariable( 'popup_network_job', None )
            
        
        return NetworkJobPresentationContext( on_enter, on_exit )
        
    
    return factory
2018-05-02 20:45:20 +00:00
2018-05-09 20:23:00 +00:00
def GenerateSinglePopupNetworkJobPresentationContextFactory( job_key ):
    """Return a factory producing presentation contexts for a single-job popup.
    
    Each context publishes its network job to job_key's 'popup_network_job'
    variable on entry and deletes that variable entirely on exit (unlike the
    'multiple' factory, which sets it to None).
    """
    
    def factory( network_job ):
        
        def on_enter():
            
            job_key.SetVariable( 'popup_network_job', network_job )
            
        
        def on_exit():
            
            job_key.DeleteVariable( 'popup_network_job' )
            
        
        return NetworkJobPresentationContext( on_enter, on_exit )
        
    
    return factory
2018-05-23 21:05:06 +00:00
def GetRepeatingJobInitialDelay():
    """Return a randomised initial delay in seconds, uniform in [0.5, 1.0).
    
    The jitter spreads out repeating jobs created at the same moment so they
    do not all fire simultaneously.
    """
    
    jitter = random.random() * 0.5
    
    return 0.5 + jitter
2018-05-16 20:09:50 +00:00
def PageImporterShouldStopWorking( page_key ):
    """Return True when a page importer should halt: the client view is shutting
    down, or the page identified by page_key is no longer alive."""
    
    if HG.view_shutdown:
        
        return True
        
    
    return not HG.client_controller.PageAlive( page_key )
2018-05-16 20:09:50 +00:00
2018-10-17 21:00:09 +00:00
def PublishPresentationHashes( publishing_label, hashes, publish_to_popup_button, publish_files_to_page ):
    """Publish imported file hashes to the requested UI destinations.
    
    publishing_label -- label shown alongside the files.
    hashes -- iterable of file hashes to publish.
    publish_to_popup_button -- if True, post a mergable 'popup_files' message.
    publish_files_to_page -- if True, publish the hashes to a page via pubsub.
    """
    
    if publish_to_popup_button:
        
        popup_job_key = ClientThreading.JobKey()
        
        # 'popup_files_mergable' presumably lets consecutive batches with the
        # same label collapse into one popup button -- confirm in popup UI code
        popup_job_key.SetVariable( 'popup_files_mergable', True )
        popup_job_key.SetVariable( 'popup_files', ( list( hashes ), publishing_label ) )
        
        HG.client_controller.pub( 'message', popup_job_key )
        
    
    if publish_files_to_page:
        
        HG.client_controller.pub( 'imported_files_to_page', list( hashes ), publishing_label )
2017-07-19 21:21:41 +00:00
2018-05-09 20:23:00 +00:00
def THREADDownloadURL( job_key, url, url_string ):
    """Download a single raw file url and import it, reporting via job_key.
    
    Intended to run on its own thread (THREAD prefix). url_string becomes the
    popup title. job_key is always Finish()ed; any exception from the
    download/import propagates to the caller after that.
    """
    
    job_key.SetVariable( 'popup_title', url_string )
    job_key.SetVariable( 'popup_text_1', 'downloading and importing' )
    
    #
    
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        
        # bandwidth override for a user-initiated single download -- the
        # argument's unit (presumably seconds) is defined in NetworkJob; confirm
        network_job.OverrideBandwidth( 30 )
        
        return network_job
        
    
    network_job_presentation_context_factory = GenerateSinglePopupNetworkJobPresentationContextFactory( job_key )
    
    file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
    
    #
    
    try:
        
        file_seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory )
        
        status = file_seed.status
        
        if status in CC.SUCCESSFUL_IMPORT_STATES:
            
            if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                
                job_key.SetVariable( 'popup_text_1', 'successful!' )
                
            elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                
                job_key.SetVariable( 'popup_text_1', 'was already in the database!' )
                
            
            # publish the file to the popup's file button (note: 'hash' shadows
            # the builtin; kept as-is to match file conventions)
            if file_seed.HasHash():
                
                hash = file_seed.GetHash()
                
                job_key.SetVariable( 'popup_files', ( [ hash ], 'download' ) )
                
            
        elif status == CC.STATUS_DELETED:
            
            job_key.SetVariable( 'popup_text_1', 'had already been deleted!' )
            
        
    finally:
        
        # always close out the popup job, even if the import raised
        job_key.Finish()
2017-07-19 21:21:41 +00:00
def THREADDownloadURLs( job_key, urls, title ):
    """Download and import a batch of raw file urls, reporting via job_key.
    
    Intended to run on its own thread (THREAD prefix). Tracks per-status counts,
    presents successfully imported hashes (deduplicated) as popup files, and
    writes a summary line when done. Per-url failures are logged and counted
    rather than aborting the batch. Respects job_key pause/cancel between urls.
    """
    
    job_key.SetVariable( 'popup_title', title )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    
    num_successful = 0
    num_redundant = 0
    num_deleted = 0
    num_failed = 0
    
    # list preserves presentation order; the set gives O(1) dedupe membership
    presentation_hashes = []
    presentation_hashes_fast = set()
    
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        
        # no-arg override here, unlike the single-url variant's 30 -- see
        # NetworkJob.OverrideBandwidth for the difference
        network_job.OverrideBandwidth()
        
        return network_job
        
    
    network_job_presentation_context_factory = GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key )
    
    for ( i, url ) in enumerate( urls ):
        
        # honour user pause/cancel between downloads
        ( i_paused, should_quit ) = job_key.WaitIfNeeded()
        
        if should_quit:
            
            break
            
        
        job_key.SetVariable( 'popup_text_1', HydrusData.ConvertValueRangeToPrettyString( i + 1, len( urls ) ) )
        job_key.SetVariable( 'popup_gauge_1', ( i + 1, len( urls ) ) )
        
        file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, url )
        
        try:
            
            file_seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory )
            
            status = file_seed.status
            
            if status in CC.SUCCESSFUL_IMPORT_STATES:
                
                if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                    
                    num_successful += 1
                    
                elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    
                    num_redundant += 1
                    
                
                if file_seed.HasHash():
                    
                    hash = file_seed.GetHash()
                    
                    if hash not in presentation_hashes_fast:
                        
                        presentation_hashes.append( hash )
                        
                        presentation_hashes_fast.add( hash )
                        
                    
                
            elif status == CC.STATUS_DELETED:
                
                num_deleted += 1
                
            
        except Exception as e:
            
            # best-effort batch: log the failure and continue with the next url
            num_failed += 1
            
            HydrusData.Print( url + ' failed to import!' )
            HydrusData.PrintException( e )
            
        
    
    job_key.DeleteVariable( 'popup_network_job' )
    
    # build the human-readable summary from the non-zero counters
    text_components = []
    
    if num_successful > 0:
        
        text_components.append( HydrusData.ToHumanInt( num_successful ) + ' successful' )
        
    
    if num_redundant > 0:
        
        text_components.append( HydrusData.ToHumanInt( num_redundant ) + ' already in db' )
        
    
    if num_deleted > 0:
        
        text_components.append( HydrusData.ToHumanInt( num_deleted ) + ' deleted' )
        
    
    if num_failed > 0:
        
        text_components.append( HydrusData.ToHumanInt( num_failed ) + ' failed (errors written to log)' )
        
    
    job_key.SetVariable( 'popup_text_1', ', '.join( text_components ) )
    
    if len( presentation_hashes ) > 0:
        
        job_key.SetVariable( 'popup_files', ( presentation_hashes, 'downloads' ) )
        
    
    job_key.DeleteVariable( 'popup_gauge_1' )
    
    job_key.Finish()
2018-08-22 21:10:59 +00:00
def UpdateFileSeedCacheWithFileSeeds( file_seed_cache, file_seeds, max_new_urls_allowed = None ):
    """Add to file_seed_cache any file_seeds it does not already hold.
    
    file_seed_cache -- object exposing HasFileSeed() and AddFileSeeds().
    file_seeds -- candidate seeds, processed in order.
    max_new_urls_allowed -- optional cap on how many new seeds may be added;
        once reached, remaining candidates are skipped.
    
    Returns ( num_urls_added, num_urls_already_in_file_seed_cache,
    can_search_for_more_files, stop_reason ) -- can_search_for_more_files is
    False (with stop_reason 'hit file limit') only when the cap stopped the scan.
    """
    
    seeds_to_add = []
    
    num_added = 0
    num_already_present = 0
    
    hit_limit = False
    
    for candidate in file_seeds:
        
        if max_new_urls_allowed is not None and num_added >= max_new_urls_allowed:
            
            hit_limit = True
            
            break
            
        
        if file_seed_cache.HasFileSeed( candidate ):
            
            num_already_present += 1
            
        else:
            
            num_added += 1
            
            seeds_to_add.append( candidate )
            
        
    
    file_seed_cache.AddFileSeeds( seeds_to_add )
    
    stop_reason = 'hit file limit' if hit_limit else ''
    
    return ( num_added, num_already_present, not hit_limit, stop_reason )
2018-02-07 23:40:33 +00:00
2018-05-16 20:09:50 +00:00
def WakeRepeatingJob( job ):
    """Wake the given repeating job immediately; a None job is a silent no-op."""
    
    if job is None:
        
        return
        
    
    job.Wake()
2018-08-01 20:44:57 +00:00
class NetworkJobPresentationContext( object ):
    """Context manager pairing an arbitrary enter callback with an exit callback.
    
    Used to show a network job in popup UI while it runs and clear it afterwards.
    The exit callback always fires, even when the body raises; exceptions are
    never suppressed (__exit__ returns None).
    """
    
    def __init__( self, enter_call, exit_call ):
        
        self._on_enter = enter_call
        self._on_exit = exit_call
        
    
    def __enter__( self ):
        
        self._on_enter()
        
    
    def __exit__( self, exc_type, exc_val, exc_tb ):
        
        self._on_exit()
        
    
2018-02-14 21:47:18 +00:00
2018-05-16 20:09:50 +00:00