# hydrus/include/ClientImporting.py

import bs4
import ClientConstants as CC
import ClientData
import ClientDefaults
import ClientDownloading
import ClientFiles
import ClientImageHandling
import ClientImportOptions
import ClientNetworkingContexts
import ClientNetworkingDomain
import ClientNetworkingJobs
import ClientParsing
import ClientPaths
import ClientTags
import ClientThreading
import collections
import HydrusConstants as HC
import HydrusData
import HydrusExceptions
import HydrusFileHandling
import HydrusImageHandling
import HydrusGlobals as HG
import HydrusPaths
import HydrusSerialisable
import HydrusTags
import HydrusText
import json
import os
import random
import re
import shutil
import threading
import time
import traceback
import urlparse
import wx
import HydrusThreading
CHECKER_STATUS_OK = 0
CHECKER_STATUS_DEAD = 1
CHECKER_STATUS_404 = 2
DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME = 0.1
REPEATING_JOB_TYPICAL_PERIOD = 30.0
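
# The functions below are small closure factories: each binds a context key
# (a page, subscription or watcher) into a network job constructor or a popup
# presentation context, so downstream import code can create correctly
# attributed network jobs without knowing which UI object owns them.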
def GenerateDownloaderNetworkJobFactory( page_key ):
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, *args, **kwargs )
        
        return network_job
    
    return network_job_factory
def GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key ):
    
    def network_job_presentation_context_factory( network_job ):
        
        def enter_call():
            
            job_key.SetVariable( 'popup_network_job', network_job )
        
        def exit_call():
            
            pass
        
        return NetworkJobPresentationContext( enter_call, exit_call )
    
    return network_job_presentation_context_factory
def GenerateSeedCacheStatus( statuses_to_counts ):
    
    num_successful_and_new = statuses_to_counts[ CC.STATUS_SUCCESSFUL_AND_NEW ]
    num_successful_but_redundant = statuses_to_counts[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ]
    num_ignored = statuses_to_counts[ CC.STATUS_VETOED ]
    num_deleted = statuses_to_counts[ CC.STATUS_DELETED ]
    num_failed = statuses_to_counts[ CC.STATUS_ERROR ]
    num_skipped = statuses_to_counts[ CC.STATUS_SKIPPED ]
    num_unknown = statuses_to_counts[ CC.STATUS_UNKNOWN ]
    
    status_strings = []
    
    num_successful = num_successful_and_new + num_successful_but_redundant
    
    if num_successful > 0:
        
        s = HydrusData.ConvertIntToPrettyString( num_successful ) + ' successful'
        
        if num_successful_and_new > 0:
            
            if num_successful_but_redundant > 0:
                
                s += ' (' + HydrusData.ConvertIntToPrettyString( num_successful_but_redundant ) + ' already in db)'
            
        else:
            
            s += ' (all already in db)'
        
        status_strings.append( s )
    
    if num_ignored > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_ignored ) + ' ignored' )
    
    if num_deleted > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_deleted ) + ' previously deleted' )
    
    if num_failed > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_failed ) + ' failed' )
    
    if num_skipped > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_skipped ) + ' skipped' )
    
    status = ', '.join( status_strings )
    
    total = sum( statuses_to_counts.values() )
    total_processed = total - num_unknown
    
    return ( status, ( total_processed, total ) )
def GenerateSinglePopupNetworkJobPresentationContextFactory( job_key ):
    
    def network_job_presentation_context_factory( network_job ):
        
        def enter_call():
            
            job_key.SetVariable( 'popup_network_job', network_job )
        
        def exit_call():
            
            job_key.DeleteVariable( 'popup_network_job' )
        
        return NetworkJobPresentationContext( enter_call, exit_call )
    
    return network_job_presentation_context_factory
def GenerateSubscriptionNetworkJobFactory( subscription_key ):
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJobSubscription( subscription_key, *args, **kwargs )
        
        network_job.OverrideBandwidth( 30 )
        
        return network_job
    
    return network_job_factory

def GenerateWatcherNetworkJobFactory( watcher_key ):
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJobWatcherPage( watcher_key, *args, **kwargs )
        
        return network_job
    
    return network_job_factory
def GetRepeatingJobInitialDelay():
    
    return 0.5 + ( random.random() * 0.5 )

def PageImporterShouldStopWorking( page_key ):
    
    return HG.view_shutdown or not HG.client_controller.PageAlive( page_key )

def PublishPresentationHashes( name, hashes, publish_to_popup_button, publish_files_to_page ):
    
    if publish_to_popup_button:
        
        files_job_key = ClientThreading.JobKey()
        
        files_job_key.SetVariable( 'popup_files_mergable', True )
        files_job_key.SetVariable( 'popup_files', ( list( hashes ), name ) )
        
        HG.client_controller.pub( 'message', files_job_key )
    
    if publish_files_to_page:
        
        HG.client_controller.pub( 'imported_files_to_page', list( hashes ), name )
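
# Worker-thread entry points (hence the THREAD prefix). They run off the UI
# thread and report all progress back through job_key popup variables.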
def THREADDownloadURL( job_key, url, url_string ):
    
    job_key.SetVariable( 'popup_title', url_string )
    job_key.SetVariable( 'popup_text_1', 'downloading and importing' )
    
    #
    
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        
        network_job.OverrideBandwidth( 30 )
        
        return network_job
    
    network_job_presentation_context_factory = GenerateSinglePopupNetworkJobPresentationContextFactory( job_key )
    
    seed = Seed( SEED_TYPE_URL, url )
    
    #
    
    try:
        
        seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory )
        
        status = seed.status
        
        if status in CC.SUCCESSFUL_IMPORT_STATES:
            
            if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                
                job_key.SetVariable( 'popup_text_1', 'successful!' )
                
            elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                
                job_key.SetVariable( 'popup_text_1', 'was already in the database!' )
            
            hash = seed.GetHash()
            
            job_key.SetVariable( 'popup_files', ( [ hash ], 'download' ) )
            
        elif status == CC.STATUS_DELETED:
            
            job_key.SetVariable( 'popup_text_1', 'had already been deleted!' )
        
    finally:
        
        job_key.Finish()
def THREADDownloadURLs( job_key, urls, title ):
    
    job_key.SetVariable( 'popup_title', title )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    
    num_successful = 0
    num_redundant = 0
    num_deleted = 0
    num_failed = 0
    
    presentation_hashes = []
    presentation_hashes_fast = set()
    
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        
        network_job.OverrideBandwidth()
        
        return network_job
    
    network_job_presentation_context_factory = GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key )
    
    for ( i, url ) in enumerate( urls ):
        
        ( i_paused, should_quit ) = job_key.WaitIfNeeded()
        
        if should_quit:
            break
        
        job_key.SetVariable( 'popup_text_1', HydrusData.ConvertValueRangeToPrettyString( i + 1, len( urls ) ) )
        job_key.SetVariable( 'popup_gauge_1', ( i + 1, len( urls ) ) )
        
        seed = Seed( SEED_TYPE_URL, url )
        
        try:
            
            seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory )
            
            status = seed.status
            
            if status in CC.SUCCESSFUL_IMPORT_STATES:
                
                if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                    
                    num_successful += 1
                    
                elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    
                    num_redundant += 1
                
                hash = seed.GetHash()
                
                if hash not in presentation_hashes_fast:
                    
                    presentation_hashes.append( hash )
                    presentation_hashes_fast.add( hash )
                
            elif status == CC.STATUS_DELETED:
                
                num_deleted += 1
            
        except Exception as e:
            
            num_failed += 1
            
            HydrusData.Print( url + ' failed to import!' )
            HydrusData.PrintException( e )
    
    job_key.DeleteVariable( 'popup_network_job' )
    
    text_components = []
    
    if num_successful > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_successful ) + ' successful' )
    
    if num_redundant > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_redundant ) + ' already in db' )
    
    if num_deleted > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_deleted ) + ' deleted' )
    
    if num_failed > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_failed ) + ' failed (errors written to log)' )
    
    job_key.SetVariable( 'popup_text_1', ', '.join( text_components ) )
    
    if len( presentation_hashes ) > 0:
        
        job_key.SetVariable( 'popup_files', ( presentation_hashes, 'downloads' ) )
    
    job_key.DeleteVariable( 'popup_gauge_1' )
    
    job_key.Finish()
def UpdateSeedCacheWithAllParseResults( seed_cache, all_parse_results, source_url = None, tag_import_options = None ):
    
    # need a limit param here for 'stop at 40 total new because of file limit'
    
    new_seeds = []
    
    num_new = 0
    num_already_in = 0
    
    for parse_results in all_parse_results:
        
        parsed_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, HC.URL_TYPE_POST ), only_get_top_priority = True )
        
        for url in parsed_urls:
            
            seed = Seed( SEED_TYPE_URL, url )
            
            if source_url is not None:
                
                seed.AddURL( source_url )
            
            if seed_cache.HasSeed( seed ):
                
                num_already_in += 1
                
            else:
                
                num_new += 1
                
                seed.AddParseResults( parse_results )
                
                new_seeds.append( seed )
    
    seed_cache.AddSeeds( new_seeds )
    
    return ( num_new, num_already_in )
def WakeRepeatingJob( job ):
    
    if job is not None:
        
        job.Wake()
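
# FileImportJob bundles everything known about a single file mid-import: the
# temp path on disk, the pre-import status looked up from the db, and derived
# metadata (file info, thumbnail, perceptual hashes, extra hashes).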
class FileImportJob( object ):
    
    def __init__( self, temp_path, file_import_options = None ):
        
        if file_import_options is None:
            
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
        
        self._temp_path = temp_path
        self._file_import_options = file_import_options
        
        self._hash = None
        self._pre_import_status = None
        
        self._file_info = None
        self._thumbnail = None
        self._phashes = None
        self._extra_hashes = None
    
    def CheckIsGoodToImport( self ):
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        self._file_import_options.CheckFileIsValid( size, mime, width, height )
    
    def GetExtraHashes( self ):
        
        return self._extra_hashes
    
    def GetFileImportOptions( self ):
        
        return self._file_import_options
    
    def GetFileInfo( self ):
        
        return self._file_info
    
    def GetHash( self ):
        
        return self._hash
    
    def GetMime( self ):
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        return mime
    
    def GetPreImportStatus( self ):
        
        return self._pre_import_status
    
    def GetPHashes( self ):
        
        return self._phashes
    
    def GetTempPathAndThumbnail( self ):
        
        return ( self._temp_path, self._thumbnail )
    
    def PubsubContentUpdates( self ):
        
        if self._pre_import_status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
            
            if self._file_import_options.AutomaticallyArchives():
                
                service_keys_to_content_updates = { CC.COMBINED_LOCAL_FILE_SERVICE_KEY : [ HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_ARCHIVE, set( ( self._hash, ) ) ) ] }
                
                HG.client_controller.Write( 'content_updates', service_keys_to_content_updates )
    
    def IsNewToDB( self ):
        
        if self._pre_import_status == CC.STATUS_UNKNOWN:
            
            return True
        
        if self._pre_import_status == CC.STATUS_DELETED:
            
            if not self._file_import_options.ExcludesDeleted():
                
                return True
        
        return False
    
    def GenerateHashAndStatus( self ):
        
        HydrusImageHandling.ConvertToPngIfBmp( self._temp_path )
        
        self._hash = HydrusFileHandling.GetHashFromPath( self._temp_path )
        
        ( self._pre_import_status, hash, note ) = HG.client_controller.Read( 'hash_status', 'sha256', self._hash, prefix = 'recognised during import' )
        
        return ( self._pre_import_status, self._hash, note )
    
    def GenerateInfo( self ):
        
        mime = HydrusFileHandling.GetMime( self._temp_path )
        
        new_options = HG.client_controller.new_options
        
        if mime in HC.DECOMPRESSION_BOMB_IMAGES and not self._file_import_options.AllowsDecompressionBombs():
            
            if HydrusImageHandling.IsDecompressionBomb( self._temp_path ):
                
                raise HydrusExceptions.DecompressionBombException( 'Image seems to be a Decompression Bomb!' )
        
        self._file_info = HydrusFileHandling.GetFileInfo( self._temp_path, mime )
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        if mime in HC.MIMES_WITH_THUMBNAILS:
            
            percentage_in = HG.client_controller.new_options.GetInteger( 'video_thumbnail_percentage_in' )
            
            self._thumbnail = HydrusFileHandling.GenerateThumbnail( self._temp_path, mime, percentage_in = percentage_in )
        
        if mime in HC.MIMES_WE_CAN_PHASH:
            
            self._phashes = ClientImageHandling.GenerateShapePerceptualHashes( self._temp_path, mime )
        
        self._extra_hashes = HydrusFileHandling.GetExtraHashesFromPath( self._temp_path )
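
# GalleryImport drives a multi-query gallery downloader page. A gallery
# repeating job walks each query's gallery 'streams' page by page, collecting
# file/post urls into the SeedCache, while a separate files repeating job
# works those seeds into actual file imports.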
class GalleryImport( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IMPORT
    SERIALISABLE_NAME = 'Gallery Import'
    SERIALISABLE_VERSION = 2
    
    def __init__( self, gallery_identifier = None ):
        
        if gallery_identifier is None:
            
            gallery_identifier = ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEVIANT_ART )
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._gallery_identifier = gallery_identifier
        
        self._gallery_stream_identifiers = ClientDownloading.GetGalleryStreamIdentifiers( self._gallery_identifier )
        
        self._current_query = None
        self._current_query_num_urls = 0
        
        self._current_gallery_stream_identifier = None
        self._current_gallery_stream_identifier_page_index = 0
        self._current_gallery_stream_identifier_found_urls = set()
        self._pending_gallery_stream_identifiers = []
        
        self._pending_queries = []
        
        new_options = HG.client_controller.new_options
        
        self._file_limit = HC.options[ 'gallery_file_limit' ]
        
        self._gallery_paused = False
        self._files_paused = False
        
        self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
        self._tag_import_options = new_options.GetDefaultTagImportOptions( self._gallery_identifier )
        
        self._last_gallery_page_hit_timestamp = 0
        
        self._seed_cache = SeedCache()
        
        self._lock = threading.Lock()
        
        self._gallery = None
        
        self._gallery_status = ''
        self._gallery_status_can_change_timestamp = 0
        
        self._current_action = ''
        
        self._download_control_file_set = None
        self._download_control_file_clear = None
        self._download_control_gallery_set = None
        self._download_control_gallery_clear = None
        
        self._files_repeating_job = None
        self._gallery_repeating_job = None
        
        HG.client_controller.sub( self, 'NotifySeedsUpdated', 'seed_cache_seeds_updated' )
    def _GetSerialisableInfo( self ):
        
        serialisable_gallery_identifier = self._gallery_identifier.GetSerialisableTuple()
        serialisable_gallery_stream_identifiers = [ gallery_stream_identifier.GetSerialisableTuple() for gallery_stream_identifier in self._gallery_stream_identifiers ]
        
        if self._current_gallery_stream_identifier is None:
            
            serialisable_current_gallery_stream_identifier = None
            
        else:
            
            serialisable_current_gallery_stream_identifier = self._current_gallery_stream_identifier.GetSerialisableTuple()
        
        serialisable_current_gallery_stream_identifier_found_urls = list( self._current_gallery_stream_identifier_found_urls )
        serialisable_pending_gallery_stream_identifiers = [ pending_gallery_stream_identifier.GetSerialisableTuple() for pending_gallery_stream_identifier in self._pending_gallery_stream_identifiers ]
        
        serialisable_file_options = self._file_import_options.GetSerialisableTuple()
        serialisable_tag_options = self._tag_import_options.GetSerialisableTuple()
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        
        serialisable_current_query_stuff = ( self._current_query, self._current_query_num_urls, serialisable_current_gallery_stream_identifier, self._current_gallery_stream_identifier_page_index, serialisable_current_gallery_stream_identifier_found_urls, serialisable_pending_gallery_stream_identifiers )
        
        return ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, self._pending_queries, self._file_limit, self._gallery_paused, self._files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, self._pending_queries, self._file_limit, self._gallery_paused, self._files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache ) = serialisable_info
        
        ( self._current_query, self._current_query_num_urls, serialisable_current_gallery_stream_identifier, self._current_gallery_stream_identifier_page_index, serialisable_current_gallery_stream_identifier_found_urls, serialisable_pending_gallery_stream_identifiers ) = serialisable_current_query_stuff
        
        self._gallery_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_identifier )
        
        self._gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_stream_identifier ) for serialisable_gallery_stream_identifier in serialisable_gallery_stream_identifiers ]
        
        if serialisable_current_gallery_stream_identifier is None:
            
            self._current_gallery_stream_identifier = None
            
        else:
            
            self._current_gallery_stream_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_current_gallery_stream_identifier )
        
        self._current_gallery_stream_identifier_found_urls = set( serialisable_current_gallery_stream_identifier_found_urls )
        
        self._pending_gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_pending_gallery_stream_identifier ) for serialisable_pending_gallery_stream_identifier in serialisable_pending_gallery_stream_identifiers ]
        
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
        self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
    def _FileNetworkJobPresentationContextFactory( self, network_job ):
        
        def enter_call():
            
            with self._lock:
                
                if self._download_control_file_set is not None:
                    
                    wx.CallAfter( self._download_control_file_set, network_job )
        
        def exit_call():
            
            with self._lock:
                
                if self._download_control_file_clear is not None:
                    
                    wx.CallAfter( self._download_control_file_clear )
        
        return NetworkJobPresentationContext( enter_call, exit_call )
    
    def _SetGalleryStatus( self, status, timeout = None ):
        
        if HydrusData.TimeHasPassed( self._gallery_status_can_change_timestamp ):
            
            self._gallery_status = status
            
            if timeout is not None:
                
                self._gallery_status_can_change_timestamp = HydrusData.GetNow() + timeout
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, pending_queries, get_tags_if_url_known_and_file_redundant, file_limit, gallery_paused, files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache ) = old_serialisable_info
            
            new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, pending_queries, file_limit, gallery_paused, files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache )
            
            return ( 2, new_serialisable_info )
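    
    # Works one 'unknown' seed from the cache: predicts its status from known
    # urls/hashes, then downloads, imports and writes content updates as needed.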
    def _WorkOnFiles( self, page_key ):
        
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if seed is None:
            
            return
        
        did_substantial_work = False
        
        def network_job_factory( method, url, **kwargs ):
            
            network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, method, url, **kwargs )
            
            wx.CallAfter( self._download_control_file_set, network_job )
            
            return network_job
        
        try:
            
            gallery = ClientDownloading.GetGallery( self._gallery_identifier )
            
        except Exception as e:
            
            HydrusData.PrintException( e )
            
            with self._lock:
                
                self._files_paused = True
                self._gallery_paused = True
                
                HydrusData.ShowText( 'A downloader could not load its gallery! It has been paused and the full error has been written to the log!' )
                
                return
        
        gallery.SetNetworkJobFactory( network_job_factory )
        
        try:
            
            if seed.WorksInNewSystem():
                
                def status_hook( text ):
                    
                    with self._lock:
                        
                        self._current_action = text
                
                did_substantial_work = seed.WorkOnPostURL( self._file_import_options, self._tag_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._FileNetworkJobPresentationContextFactory )
                
                if seed.ShouldPresent( self._file_import_options ):
                    
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
                
            else:
                
                with self._lock:
                    
                    self._current_action = 'reviewing file'
                
                seed.PredictPreImportStatus( self._file_import_options )
                
                status = seed.status
                url = seed.seed_data
                
                if status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    
                    if self._tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB() and self._tag_import_options.WorthFetchingTags():
                        
                        downloaded_tags = gallery.GetTags( url )
                        
                        seed.AddTags( downloaded_tags )
                    
                elif status == CC.STATUS_UNKNOWN:
                    
                    ( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
                    
                    try:
                        
                        with self._lock:
                            
                            self._current_action = 'downloading file'
                        
                        if self._tag_import_options.WorthFetchingTags():
                            
                            downloaded_tags = gallery.GetFileAndTags( temp_path, url )
                            
                            seed.AddTags( downloaded_tags )
                            
                        else:
                            
                            gallery.GetFile( temp_path, url )
                        
                        seed.CheckPreFetchMetadata( self._tag_import_options )
                        
                        with self._lock:
                            
                            self._current_action = 'importing file'
                        
                        seed.Import( temp_path, self._file_import_options )
                        
                        did_substantial_work = True
                        
                    finally:
                        
                        HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                
                did_substantial_work = seed.WriteContentUpdates( self._tag_import_options )
                
                if seed.ShouldPresent( self._file_import_options ):
                    
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            note = HydrusData.ToUnicode( e )
            
            seed.SetStatus( status, note = note )
            
            if isinstance( e, HydrusExceptions.CancelledException ):
                
                time.sleep( 2 )
            
        except HydrusExceptions.NotFoundException:
            
            status = CC.STATUS_VETOED
            note = '404'
            
            seed.SetStatus( status, note = note )
            
            time.sleep( 2 )
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            seed.SetStatus( status, exception = e )
            
            time.sleep( 3 )
            
        finally:
            
            self._seed_cache.NotifySeedsUpdated( ( seed, ) )
            
            wx.CallAfter( self._download_control_file_clear )
            
            with self._lock:
                
                self._current_action = ''
        
        if did_substantial_work:
            
            time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
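    
    # Works one gallery page hit. Returns True if there is (or may be) more
    # gallery work to do, False when the current query is exhausted. Respects
    # the 'gallery_page_wait_period_pages' option between page hits.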
    def _WorkOnGallery( self, page_key ):
        
        with self._lock:
            
            if self._current_query is None:
                
                if len( self._pending_queries ) == 0:
                    
                    self._SetGalleryStatus( '' )
                    
                    return False
                    
                else:
                    
                    self._current_query = self._pending_queries.pop( 0 )
                    self._current_query_num_urls = 0
                    
                    self._current_gallery_stream_identifier = None
                    self._pending_gallery_stream_identifiers = list( self._gallery_stream_identifiers )
            
            if self._current_gallery_stream_identifier is None:
                
                if len( self._pending_gallery_stream_identifiers ) == 0:
                    
                    self._SetGalleryStatus( self._current_query + ': produced ' + HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls', 5 )
                    
                    self._current_query = None
                    
                    return False
                    
                else:
                    
                    self._current_gallery_stream_identifier = self._pending_gallery_stream_identifiers.pop( 0 )
                    
                    self._current_gallery_stream_identifier_page_index = 0
                    self._current_gallery_stream_identifier_found_urls = set()
            
            next_gallery_page_hit_timestamp = self._last_gallery_page_hit_timestamp + HG.client_controller.new_options.GetInteger( 'gallery_page_wait_period_pages' )
            
            if not HydrusData.TimeHasPassed( next_gallery_page_hit_timestamp ):
                
                if self._current_gallery_stream_identifier_page_index == 0:
                    
                    page_check_status = 'checking first page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
                    
                else:
                    
                    page_check_status = HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls found, checking next page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
                
                self._SetGalleryStatus( self._current_query + ': ' + page_check_status )
                
                return True
        
        def network_job_factory( method, url, **kwargs ):
            
            network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, method, url, **kwargs )
            
            network_job.OverrideBandwidth( 30 )
            
            wx.CallAfter( self._download_control_gallery_set, network_job )
            
            return network_job
        
        try:
            
            gallery = ClientDownloading.GetGallery( self._current_gallery_stream_identifier )
            
        except Exception as e:
            
            HydrusData.PrintException( e )
            
            with self._lock:
                
                self._files_paused = True
                self._gallery_paused = True
                
                HydrusData.ShowText( 'A downloader could not load its gallery! It has been paused and the full error has been written to the log!' )
                
                return False
        
        gallery.SetNetworkJobFactory( network_job_factory )
        
        query = self._current_query
        page_index = self._current_gallery_stream_identifier_page_index
        
        self._SetGalleryStatus( self._current_query + ': ' + HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls found, now checking page ' + HydrusData.ConvertIntToPrettyString( self._current_gallery_stream_identifier_page_index + 1 ) )
        
        error_occurred = False
        
        num_already_in_seed_cache = 0
        new_seeds = []
        
        try:
            
            try:
                
                ( page_of_seeds, definitely_no_more_pages ) = gallery.GetPage( query, page_index )
                
            finally:
                
                self._last_gallery_page_hit_timestamp = HydrusData.GetNow()
            
            with self._lock:
                
                no_urls_found = len( page_of_seeds ) == 0
                
                page_of_urls = [ seed.seed_data for seed in page_of_seeds ]
                
                no_new_urls = len( self._current_gallery_stream_identifier_found_urls.intersection( page_of_urls ) ) == len( page_of_seeds )
                
                if definitely_no_more_pages or no_urls_found or no_new_urls:
                    
                    self._current_gallery_stream_identifier = None
                    
                else:
                    
                    self._current_gallery_stream_identifier_page_index += 1
                    
                    self._current_gallery_stream_identifier_found_urls.update( page_of_urls )
            
            for seed in page_of_seeds:
                
                if self._seed_cache.HasSeed( seed ):
                    
                    num_already_in_seed_cache += 1
                    
                else:
                    
                    with self._lock:
                        
                        if self._file_limit is not None and self._current_query_num_urls + 1 > self._file_limit:
                            
                            self._current_gallery_stream_identifier = None
                            self._pending_gallery_stream_identifiers = []
                            
                            break
                        
                        self._current_query_num_urls += 1
                    
                    new_seeds.append( seed )
            
            self._seed_cache.AddSeeds( new_seeds )
            
            if len( new_seeds ) > 0:
                
                WakeRepeatingJob( self._files_repeating_job )
            
        except Exception as e:
            
            if isinstance( e, HydrusExceptions.NotFoundException ):
                
                text = 'gallery 404'
                
            else:
                
                text = HydrusData.ToUnicode( e )
                
                HydrusData.DebugPrint( traceback.format_exc() )
            
            with self._lock:
                
                self._current_gallery_stream_identifier = None
                
                self._SetGalleryStatus( text, 5 )
            
            time.sleep( 5 )
            
        finally:
            
            wx.CallAfter( self._download_control_gallery_clear )
        
        with self._lock:
            
            status = query + ': ' + HydrusData.ConvertIntToPrettyString( len( new_seeds ) ) + ' new urls found'
            
            if num_already_in_seed_cache > 0:
                
                status += ' (' + HydrusData.ConvertIntToPrettyString( num_already_in_seed_cache ) + ' of last page already in queue)'
            
            self._SetGalleryStatus( status )
        
        return True
    def AdvanceQueries( self, queries ):
        
        with self._lock:
            
            queries_lookup = set( queries )
            
            for query in queries:
                
                if query in self._pending_queries:
                    
                    index = self._pending_queries.index( query )
                    
                    if index > 0 and self._pending_queries[ index - 1 ] not in queries_lookup:
                        
                        self._pending_queries.remove( query )
                        
                        self._pending_queries.insert( index - 1, query )
    
    def CurrentlyWorking( self ):
        
        with self._lock:
            
            finished = not self._seed_cache.WorkToDo()
            
            return not finished and not self._files_paused
    
    def DelayQueries( self, queries ):
        
        with self._lock:
            
            queries = list( queries )
            
            queries.reverse()
            
            queries_lookup = set( queries )
            
            for query in queries:
                
                if query in self._pending_queries:
                    
                    index = self._pending_queries.index( query )
                    
                    if index + 1 < len( self._pending_queries ) and self._pending_queries[ index + 1 ] not in queries_lookup:
                        
                        self._pending_queries.remove( query )
                        
                        self._pending_queries.insert( index + 1, query )
    
    def DeleteQueries( self, queries ):
        
        with self._lock:
            
            for query in queries:
                
                if query in self._pending_queries:
                    
                    self._pending_queries.remove( query )
    
    def FinishCurrentQuery( self ):
        
        with self._lock:
            
            self._current_query = None
            self._gallery_paused = False
        
        WakeRepeatingJob( self._gallery_repeating_job )
    
    def GetGalleryIdentifier( self ):
        
        return self._gallery_identifier
    
    def GetOptions( self ):
        
        with self._lock:
            
            return ( self._file_import_options, self._tag_import_options, self._file_limit )
    
    def GetSeedCache( self ):
        
        return self._seed_cache
    
    def GetStatus( self ):
        
        with self._lock:
            
            cancellable = self._current_query is not None
            
            return ( list( self._pending_queries ), self._gallery_status, self._current_action, self._files_paused, self._gallery_paused, cancellable )
    
    def GetValueRange( self ):
        
        with self._lock:
            
            return self._seed_cache.GetValueRange()
    
    def NotifySeedsUpdated( self, seed_cache_key, seeds ):
        
        if seed_cache_key == self._seed_cache.GetSeedCacheKey():
            
            WakeRepeatingJob( self._files_repeating_job )
    
    def PausePlayFiles( self ):
        
        with self._lock:
            
            self._files_paused = not self._files_paused
        
        WakeRepeatingJob( self._files_repeating_job )
    
    def PausePlayGallery( self ):
        
        with self._lock:
            
            self._gallery_paused = not self._gallery_paused
        
        WakeRepeatingJob( self._gallery_repeating_job )
    
    def PendQuery( self, query ):
        
        with self._lock:
            
            if query not in self._pending_queries:
                
                self._pending_queries.append( query )
        
        WakeRepeatingJob( self._gallery_repeating_job )
    
    def SetDownloadControls( self, file_download_control, gallery_download_control ):
        
        with self._lock:
            
            self._download_control_file_set = file_download_control.SetNetworkJob
            self._download_control_file_clear = file_download_control.ClearNetworkJob
            
            self._download_control_gallery_set = gallery_download_control.SetNetworkJob
            self._download_control_gallery_clear = gallery_download_control.ClearNetworkJob
    
    def SetFileLimit( self, file_limit ):
        
        with self._lock:
            
            self._file_limit = file_limit
    
    def SetFileImportOptions( self, file_import_options ):
        
        with self._lock:
            
            self._file_import_options = file_import_options
    
    def SetTagImportOptions( self, tag_import_options ):
        
        with self._lock:
            
            self._tag_import_options = tag_import_options
    
    def Start( self, page_key ):
        
        self._files_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
        self._gallery_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnGallery, page_key )
    def REPEATINGWorkOnFiles( self, page_key ):
        
        with self._lock:
            
            if PageImporterShouldStopWorking( page_key ):
                
                self._files_repeating_job.Cancel()
                
                return
            
            work_to_do = self._seed_cache.WorkToDo() and not ( self._files_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
        
        while work_to_do:
            
            try:
                
                self._WorkOnFiles( page_key )
                
                HG.client_controller.WaitUntilViewFree()
                
            except Exception as e:
                
                HydrusData.ShowException( e )
            
            with self._lock:
                
                if PageImporterShouldStopWorking( page_key ):
                    
                    self._files_repeating_job.Cancel()
                    
                    return
                
                work_to_do = self._seed_cache.WorkToDo() and not ( self._files_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
    
    def REPEATINGWorkOnGallery( self, page_key ):
        
        with self._lock:
            
            if PageImporterShouldStopWorking( page_key ):
                
                self._gallery_repeating_job.Cancel()
                
                return
            
            ok_to_work = not ( self._gallery_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
        
        while ok_to_work:
            
            try:
                
                work_to_do = self._WorkOnGallery( page_key )
                
                if work_to_do:
                    
                    time.sleep( 1 )
                    
                else:
                    
                    return
                
                HG.client_controller.WaitUntilViewFree()
                
            except Exception as e:
                
                HydrusData.ShowException( e )
            
            with self._lock:
                
                if PageImporterShouldStopWorking( page_key ):
                    
                    self._gallery_repeating_job.Cancel()
                    
                    return
                
                ok_to_work = not ( self._gallery_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IMPORT ] = GalleryImport
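
# HDDImport handles 'import files from your hard drive' pages: a static list
# of paths seeded at construction time, with optional per-path tags and
# optional deletion of the source files after a successful import.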
class HDDImport( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_HDD_IMPORT
    SERIALISABLE_NAME = 'Local File Import'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, paths = None, file_import_options = None, paths_to_tags = None, delete_after_success = None ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        if paths is None:
            
            self._seed_cache = None
            
        else:
            
            self._seed_cache = SeedCache()
            
            seeds = []
            
            for path in paths:
                
                seed = Seed( SEED_TYPE_HDD, path )
                
                try:
                    
                    s = os.stat( path )
                    
                    seed.source_time = int( min( s.st_mtime, s.st_ctime ) )
                    
                except:
                    
                    pass
                
                seeds.append( seed )
            
            self._seed_cache.AddSeeds( seeds )
        
        self._file_import_options = file_import_options
        self._paths_to_tags = paths_to_tags
        self._delete_after_success = delete_after_success
        
        self._current_action = ''
        self._paused = False
        
        self._lock = threading.Lock()
        
        self._files_repeating_job = None
        
        HG.client_controller.sub( self, 'NotifySeedsUpdated', 'seed_cache_seeds_updated' )
    
    def _GetSerialisableInfo( self ):
        
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        serialisable_options = self._file_import_options.GetSerialisableTuple()
        serialisable_paths_to_tags = { path : { service_key.encode( 'hex' ) : tags for ( service_key, tags ) in service_keys_to_tags.items() } for ( path, service_keys_to_tags ) in self._paths_to_tags.items() }
        
        return ( serialisable_seed_cache, serialisable_options, serialisable_paths_to_tags, self._delete_after_success, self._paused )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_seed_cache, serialisable_options, serialisable_paths_to_tags, self._delete_after_success, self._paused ) = serialisable_info
        
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_options )
        self._paths_to_tags = { path : { service_key.decode( 'hex' ) : tags for ( service_key, tags ) in service_keys_to_tags.items() } for ( path, service_keys_to_tags ) in serialisable_paths_to_tags.items() }
    def _WorkOnFiles( self, page_key ):
        
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if seed is None:
            
            return
        
        did_substantial_work = False
        
        path = seed.seed_data
        
        with self._lock:
            
            if path in self._paths_to_tags:
                
                service_keys_to_tags = self._paths_to_tags[ path ]
                
            else:
                
                service_keys_to_tags = {}
        
        try:
            
            if not os.path.exists( path ):
                
                raise Exception( 'Source file does not exist!' )
            
            with self._lock:
                
                self._current_action = 'importing'
            
            seed.ImportPath( self._file_import_options )
            
            did_substantial_work = True
            
            if seed.status in CC.SUCCESSFUL_IMPORT_STATES:
                
                hash = seed.GetHash()
                
                service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( { hash }, service_keys_to_tags )
                
                if len( service_keys_to_content_updates ) > 0:
                    
                    HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                    
                    did_substantial_work = True
                
                if seed.ShouldPresent( self._file_import_options ):
                    
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
                
                if self._delete_after_success:
                    
                    try:
                        
                        ClientPaths.DeletePath( path )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'While attempting to delete ' + path + ', the following error occurred:' )
                        HydrusData.ShowException( e )
                    
                    txt_path = path + '.txt'
                    
                    if os.path.exists( txt_path ):
                        
                        try:
                            
                            ClientPaths.DeletePath( txt_path )
                            
                        except Exception as e:
                            
                            HydrusData.ShowText( 'While attempting to delete ' + txt_path + ', the following error occurred:' )
                            HydrusData.ShowException( e )
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            note = HydrusData.ToUnicode( e )
            
            seed.SetStatus( status, note = note )
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            seed.SetStatus( status, exception = e )
            
        finally:
            
            self._seed_cache.NotifySeedsUpdated( ( seed, ) )
        
        with self._lock:
            
            self._current_action = ''
        
        if did_substantial_work:
            
            time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
    def CurrentlyWorking( self ):
        
        with self._lock:
            
            work_to_do = self._seed_cache.WorkToDo()
            
            return work_to_do and not self._paused
    
    def GetFileImportOptions( self ):
        
        with self._lock:
            
            return self._file_import_options
    
    def GetSeedCache( self ):
        
        return self._seed_cache
    
    def GetStatus( self ):
        
        with self._lock:
            
            return ( self._current_action, self._paused )
    
    def GetValueRange( self ):
        
        with self._lock:
            
            return self._seed_cache.GetValueRange()
    
    def NotifySeedsUpdated( self, seed_cache_key, seeds ):
        
        if seed_cache_key == self._seed_cache.GetSeedCacheKey():
            
            WakeRepeatingJob( self._files_repeating_job )
    
    def PausePlay( self ):
        
        with self._lock:
            
            self._paused = not self._paused
        
        WakeRepeatingJob( self._files_repeating_job )
    
    def SetFileImportOptions( self, file_import_options ):
        
        with self._lock:
            
            self._file_import_options = file_import_options
    
    def Start( self, page_key ):
        
        self._files_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
    
    def REPEATINGWorkOnFiles( self, page_key ):
        
        with self._lock:
            
            if PageImporterShouldStopWorking( page_key ):
                
                self._files_repeating_job.Cancel()
                
                return
            
            work_to_do = self._seed_cache.WorkToDo() and not ( self._paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
        
        while work_to_do:
            
            try:
                
                self._WorkOnFiles( page_key )
                
                HG.client_controller.WaitUntilViewFree()
                
            except Exception as e:
                
                HydrusData.ShowException( e )
            
            with self._lock:
                
                if PageImporterShouldStopWorking( page_key ):
                    
                    self._files_repeating_job.Cancel()
                    
                    return
                
                work_to_do = self._seed_cache.WorkToDo() and not ( self._paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_HDD_IMPORT ] = HDDImport
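
# ImportFolder is the serialisable backing for a timed 'watch this directory'
# import. DoWork is called periodically by a daemon; it rescans the folder,
# imports any new paths, and then applies the per-status actions
# (ignore/move/delete) to the paths it has processed.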
class ImportFolder( HydrusSerialisable.SerialisableBaseNamed ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_IMPORT_FOLDER
    SERIALISABLE_NAME = 'Import Folder'
    SERIALISABLE_VERSION = 6
    
    def __init__( self, name, path = '', file_import_options = None, tag_import_options = None, tag_service_keys_to_filename_tagging_options = None, mimes = None, actions = None, action_locations = None, period = 3600, check_regularly = True, show_working_popup = True, publish_files_to_popup_button = True, publish_files_to_page = False ):
        
        if mimes is None:
            
            mimes = HC.ALLOWED_MIMES
        
        if file_import_options is None:
            
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'quiet' )
        
        if tag_import_options is None:
            
            tag_import_options = HG.client_controller.new_options.GetDefaultTagImportOptions( ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEFAULT ) )
        
        if tag_service_keys_to_filename_tagging_options is None:
            
            tag_service_keys_to_filename_tagging_options = {}
        
        if actions is None:
            
            actions = {}
            
            actions[ CC.STATUS_SUCCESSFUL_AND_NEW ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_DELETED ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_ERROR ] = CC.IMPORT_FOLDER_IGNORE
        
        if action_locations is None:
            
            action_locations = {}
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        self._path = path
        self._mimes = mimes
        self._file_import_options = file_import_options
        self._tag_import_options = tag_import_options
        self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._check_regularly = check_regularly
        
        self._seed_cache = SeedCache()
        
        self._last_checked = 0
        self._paused = False
        self._check_now = False
        
        self._show_working_popup = show_working_popup
        self._publish_files_to_popup_button = publish_files_to_popup_button
        self._publish_files_to_page = publish_files_to_page
    def _ActionPaths( self ):
        
        for status in ( CC.STATUS_SUCCESSFUL_AND_NEW, CC.STATUS_SUCCESSFUL_BUT_REDUNDANT, CC.STATUS_DELETED, CC.STATUS_ERROR ):
            
            action = self._actions[ status ]
            
            if action == CC.IMPORT_FOLDER_DELETE:
                
                while True:
                    
                    seed = self._seed_cache.GetNextSeed( status )
                    
                    if seed is None or HG.view_shutdown:
                        
                        break
                    
                    path = seed.seed_data
                    
                    try:
                        
                        if os.path.exists( path ):
                            
                            ClientPaths.DeletePath( path )
                        
                        txt_path = path + '.txt'
                        
                        if os.path.exists( txt_path ):
                            
                            ClientPaths.DeletePath( txt_path )
                        
                        self._seed_cache.RemoveSeeds( ( seed, ) )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'Import folder tried to delete ' + path + ', but could not:' )
                        HydrusData.ShowException( e )
                        HydrusData.ShowText( 'Import folder has been paused.' )
                        
                        self._paused = True
                        
                        return
                
            elif action == CC.IMPORT_FOLDER_MOVE:
                
                while True:
                    
                    seed = self._seed_cache.GetNextSeed( status )
                    
                    if seed is None or HG.view_shutdown:
                        
                        break
                    
                    path = seed.seed_data
                    
                    try:
                        
                        dest_dir = self._action_locations[ status ]
                        
                        if not os.path.exists( dest_dir ):
                            
                            raise HydrusExceptions.DataMissing( 'The move location "' + dest_dir + '" does not exist!' )
                        
                        if os.path.exists( path ):
                            
                            filename = os.path.basename( path )
                            
                            dest_path = os.path.join( dest_dir, filename )
                            
                            dest_path = HydrusPaths.AppendPathUntilNoConflicts( dest_path )
                            
                            HydrusPaths.MergeFile( path, dest_path )
                        
                        txt_path = path + '.txt'
                        
                        if os.path.exists( txt_path ):
                            
                            txt_filename = os.path.basename( txt_path )
                            
                            txt_dest_path = os.path.join( dest_dir, txt_filename )
                            
                            txt_dest_path = HydrusPaths.AppendPathUntilNoConflicts( txt_dest_path )
                            
                            HydrusPaths.MergeFile( txt_path, txt_dest_path )
                        
                        self._seed_cache.RemoveSeeds( ( seed, ) )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'Import folder tried to move ' + path + ', but could not:' )
                        HydrusData.ShowException( e )
                        HydrusData.ShowText( 'Import folder has been paused.' )
                        
                        self._paused = True
                        
                        return
                
            elif action == CC.IMPORT_FOLDER_IGNORE:
                
                pass
    def _CheckFolder( self, job_key ):
        
        filenames = os.listdir( HydrusData.ToUnicode( self._path ) )
        
        raw_paths = [ os.path.join( self._path, filename ) for filename in filenames ]
        
        all_paths = ClientFiles.GetAllPaths( raw_paths )
        
        all_paths = HydrusPaths.FilterFreePaths( all_paths )
        
        seeds = []
        
        for path in all_paths:
            
            if job_key.IsCancelled():
                
                break
            
            if path.endswith( '.txt' ):
                
                continue
            
            seed = Seed( SEED_TYPE_HDD, path )
            
            if not self._seed_cache.HasSeed( seed ):
                
                seeds.append( seed )
            
            job_key.SetVariable( 'popup_text_1', 'checking: found ' + HydrusData.ConvertIntToPrettyString( len( seeds ) ) + ' new files' )
        
        self._seed_cache.AddSeeds( seeds )
        
        self._last_checked = HydrusData.GetNow()
        self._check_now = False
    
    def _GetSerialisableInfo( self ):
        
        serialisable_file_import_options = self._file_import_options.GetSerialisableTuple()
        serialisable_tag_import_options = self._tag_import_options.GetSerialisableTuple()
        serialisable_tag_service_keys_to_filename_tagging_options = [ ( service_key.encode( 'hex' ), filename_tagging_options.GetSerialisableTuple() ) for ( service_key, filename_tagging_options ) in self._tag_service_keys_to_filename_tagging_options.items() ]
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        
        # json turns int dict keys to strings
        action_pairs = self._actions.items()
        action_location_pairs = self._action_locations.items()
        
        return ( self._path, self._mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, self._period, self._check_regularly, serialisable_seed_cache, self._last_checked, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page )
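    
    # Imports every currently-unknown seed in the cache, pushing progress to
    # the job_key. The folder's state is written back to the db every ten
    # minutes (time_to_save), so a long run is not lost if the client closes.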
    def _ImportFiles( self, job_key ):
        
        did_work = False
        
        time_to_save = HydrusData.GetNow() + 600
        
        num_files_imported = 0
        presentation_hashes = []
        presentation_hashes_fast = set()
        
        i = 0
        
        num_total = len( self._seed_cache )
        num_total_unknown = self._seed_cache.GetSeedCount( CC.STATUS_UNKNOWN )
        num_total_done = num_total - num_total_unknown
        
        while True:
            
            seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
            
            p1 = HC.options[ 'pause_import_folders_sync' ] or self._paused
            p2 = HydrusThreading.IsThreadShuttingDown()
            p3 = job_key.IsCancelled()
            
            if seed is None or p1 or p2 or p3:
                
                break
            
            if HydrusData.TimeHasPassed( time_to_save ):
                
                HG.client_controller.WriteSynchronous( 'serialisable', self )
                
                time_to_save = HydrusData.GetNow() + 600
            
            gauge_num_done = num_total_done + num_files_imported + 1
            
            job_key.SetVariable( 'popup_text_1', 'importing file ' + HydrusData.ConvertValueRangeToPrettyString( gauge_num_done, num_total ) )
            job_key.SetVariable( 'popup_gauge_1', ( gauge_num_done, num_total ) )
            
            path = seed.seed_data
            
            try:
                
                mime = HydrusFileHandling.GetMime( path )
                
                if mime in self._mimes:
                    
                    seed.ImportPath( self._file_import_options )
                    
                    hash = seed.GetHash()
                    
                    if seed.status in CC.SUCCESSFUL_IMPORT_STATES:
                        
                        downloaded_tags = []
                        
                        service_keys_to_content_updates = self._tag_import_options.GetServiceKeysToContentUpdates( hash, downloaded_tags ) # additional tags
                        
                        if len( service_keys_to_content_updates ) > 0:
                            
                            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                        
                        service_keys_to_tags = {}
                        
                        for ( tag_service_key, filename_tagging_options ) in self._tag_service_keys_to_filename_tagging_options.items():
                            
                            if not HG.client_controller.services_manager.ServiceExists( tag_service_key ):
                                
                                continue
                            
                            try:
                                
                                tags = filename_tagging_options.GetTags( tag_service_key, path )
                                
                                if len( tags ) > 0:
                                    
                                    service_keys_to_tags[ tag_service_key ] = tags
                                
                            except Exception as e:
                                
                                HydrusData.ShowText( 'Trying to parse filename tags in the import folder "' + self._name + '" threw an error!' )
                                HydrusData.ShowException( e )
                        
                        if len( service_keys_to_tags ) > 0:
                            
                            service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( { hash }, service_keys_to_tags )
                            
                            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                        
                        num_files_imported += 1
                        
                        if hash not in presentation_hashes_fast:
                            
                            if seed.ShouldPresent( self._file_import_options ):
                                
                                presentation_hashes.append( hash )
                                presentation_hashes_fast.add( hash )
                    
                else:
                    
                    seed.SetStatus( CC.STATUS_VETOED )
                
            except Exception as e:
                
                error_text = traceback.format_exc()
                
                HydrusData.Print( 'A file failed to import from import folder ' + self._name + ':' + path )
                
                seed.SetStatus( CC.STATUS_ERROR, exception = e )
                
            finally:
                
                did_work = True
            
            i += 1
            
            if i % 10 == 0:
                
                self._ActionPaths()
        
        if num_files_imported > 0:
            
            HydrusData.Print( 'Import folder ' + self._name + ' imported ' + HydrusData.ConvertIntToPrettyString( num_files_imported ) + ' files.' )
            
            if len( presentation_hashes ) > 0:
                
                PublishPresentationHashes( self._name, presentation_hashes, self._publish_files_to_popup_button, self._publish_files_to_page )
        
        self._ActionPaths()
        
        return did_work
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._path, self._mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, self._period, self._check_regularly, serialisable_seed_cache, self._last_checked, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page ) = serialisable_info
        
        self._actions = dict( action_pairs )
        self._action_locations = dict( action_location_pairs )
        
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_import_options )
        self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_import_options )
        self._tag_service_keys_to_filename_tagging_options = dict( [ ( encoded_service_key.decode( 'hex' ), HydrusSerialisable.CreateFromSerialisableTuple( serialisable_filename_tagging_options ) ) for ( encoded_service_key, serialisable_filename_tagging_options ) in serialisable_tag_service_keys_to_filename_tagging_options ] )
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            ( path, mimes, serialisable_file_import_options, action_pairs, action_location_pairs, period, open_popup, tag, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            service_keys_to_additional_tags = {}
            
            if tag is not None:
                
                service_keys_to_additional_tags[ CC.LOCAL_TAG_SERVICE_KEY ] = { tag }
            
            tag_import_options = ClientImportOptions.TagImportOptions( service_keys_to_additional_tags = service_keys_to_additional_tags )
            
            serialisable_tag_import_options = tag_import_options.GetSerialisableTuple()
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused )
            
            return ( 2, new_serialisable_info )
        
        if version == 2:
            
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            serialisable_txt_parse_tag_service_keys = []
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused )
            
            return ( 3, new_serialisable_info )
        
        if version == 3:
            
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            check_now = False
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now )
            
            return ( 4, new_serialisable_info )
        
        if version == 4:
            
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now ) = old_serialisable_info
            
            txt_parse_tag_service_keys = [ service_key.decode( 'hex' ) for service_key in serialisable_txt_parse_tag_service_keys ]
            
            tag_service_keys_to_filename_tagging_options = {}
            
            for service_key in txt_parse_tag_service_keys:
                
                filename_tagging_options = ClientImportOptions.FilenameTaggingOptions()
                
                filename_tagging_options._load_from_neighbouring_txt_files = True
                
                tag_service_keys_to_filename_tagging_options[ service_key ] = filename_tagging_options
            
            serialisable_tag_service_keys_to_filename_tagging_options = [ ( service_key.encode( 'hex' ), filename_tagging_options.GetSerialisableTuple() ) for ( service_key, filename_tagging_options ) in tag_service_keys_to_filename_tagging_options.items() ]
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now )
            
            return ( 5, new_serialisable_info )
        
        if version == 5:
            
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now ) = old_serialisable_info
            
            check_regularly = not paused
            show_working_popup = True
            publish_files_to_page = False
            publish_files_to_popup_button = open_popup
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, check_regularly, serialisable_seed_cache, last_checked, paused, check_now, show_working_popup, publish_files_to_popup_button, publish_files_to_page )
            
            return ( 6, new_serialisable_info )
    def CheckNow( self ):
        
        self._check_now = True
    
    def DoWork( self ):
        
        if HG.view_shutdown:
            
            return
        
        if HC.options[ 'pause_import_folders_sync' ] or self._paused:
            
            return
        
        if not os.path.exists( self._path ) or not os.path.isdir( self._path ):
            
            return
        
        pubbed_job_key = False
        
        job_key = ClientThreading.JobKey( pausable = False, cancellable = True )
        
        job_key.SetVariable( 'popup_title', 'import folder - ' + self._name )
        
        due_by_check_now = self._check_now
        due_by_period = self._check_regularly and HydrusData.TimeHasPassed( self._last_checked + self._period )
        
        checked_folder = False
        
        if due_by_check_now or due_by_period:
            
            if not pubbed_job_key and self._show_working_popup:
                
                HG.client_controller.pub( 'message', job_key )
                
                pubbed_job_key = True
            
            self._CheckFolder( job_key )
            
            checked_folder = True
        
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        did_import_file_work = False
        
        if seed is not None:
            
            if not pubbed_job_key and self._show_working_popup:
                
                HG.client_controller.pub( 'message', job_key )
                
                pubbed_job_key = True
            
            did_import_file_work = self._ImportFiles( job_key )
        
        if checked_folder or did_import_file_work:
            
            HG.client_controller.WriteSynchronous( 'serialisable', self )
        
        job_key.Delete()
    def GetSeedCache( self ):
        
        return self._seed_cache
    
    def ToListBoxTuple( self ):
        
        return ( self._name, self._path, self._period )
    
    def ToTuple( self ):
        
        return ( self._name, self._path, self._mimes, self._file_import_options, self._tag_import_options, self._tag_service_keys_to_filename_tagging_options, self._actions, self._action_locations, self._period, self._check_regularly, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page )
    
    def SetSeedCache( self, seed_cache ):
        
        self._seed_cache = seed_cache
    
    def SetTuple( self, name, path, mimes, file_import_options, tag_import_options, tag_service_keys_to_filename_tagging_options, actions, action_locations, period, check_regularly, paused, check_now, show_working_popup, publish_files_to_popup_button, publish_files_to_page ):
        
        if path != self._path:
            
            self._seed_cache = SeedCache()
        
        if set( mimes ) != set( self._mimes ):
            
            self._seed_cache.RemoveSeedsByStatus( ( CC.STATUS_VETOED, ) )
        
        self._name = name
        self._path = path
        self._mimes = mimes
        self._file_import_options = file_import_options
        self._tag_import_options = tag_import_options
        self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._check_regularly = check_regularly
        self._paused = paused
        self._check_now = check_now
        self._show_working_popup = show_working_popup
        self._publish_files_to_popup_button = publish_files_to_popup_button
        self._publish_files_to_page = publish_files_to_page

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_IMPORT_FOLDER ] = ImportFolder
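
# A minimal context manager that lets callers wrap 'this network job is now
# running' UI state around a job's lifetime via injected enter/exit callables.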
class NetworkJobPresentationContext( object ):
    
    def __init__( self, enter_call, exit_call ):
        
        self._enter_call = enter_call
        self._exit_call = exit_call
    
    def __enter__( self ):
        
        self._enter_call()
    
    def __exit__( self, exc_type, exc_val, exc_tb ):
        
        self._exit_call()
SEED_TYPE_HDD = 0
SEED_TYPE_URL = 1
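
# A Seed is one unit of import work: either a local path (SEED_TYPE_HDD) or a
# url (SEED_TYPE_URL), plus any tags, hashes and associable urls parsed for it
# along the way. Seeds hash and compare on ( seed_type, seed_data ), so a
# SeedCache can deduplicate them cheaply.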
class Seed( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED
    SERIALISABLE_NAME = 'File Import'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, seed_type = None, seed_data = None ):
        
        if seed_type is None:
            
            seed_type = SEED_TYPE_URL
        
        if seed_data is None:
            
            seed_data = 'https://big-guys.4u/monica_lewinsky_hott.tiff.exe.vbs'
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.seed_type = seed_type
        self.seed_data = seed_data
        
        self.created = HydrusData.GetNow()
        self.modified = self.created
        self.source_time = None
        self.status = CC.STATUS_UNKNOWN
        self.note = ''
        
        self._urls = set()
        self._tags = set()
        self._hashes = {}
    
    def __eq__( self, other ):
        
        return self.__hash__() == other.__hash__()
    
    def __hash__( self ):
        
        return ( self.seed_type, self.seed_data ).__hash__()
    
    def __ne__( self, other ):
        
        return self.__hash__() != other.__hash__()
    
    def _CheckTagsBlacklist( self, tags, tag_import_options ):
        
        tag_import_options.CheckBlacklist( tags )
    
    def _GetSerialisableInfo( self ):
        
        serialisable_urls = list( self._urls )
        serialisable_tags = list( self._tags )
        serialisable_hashes = [ ( hash_type, hash.encode( 'hex' ) ) for ( hash_type, hash ) in self._hashes.items() if hash is not None ]
        
        return ( self.seed_type, self.seed_data, self.created, self.modified, self.source_time, self.status, self.note, serialisable_urls, serialisable_tags, serialisable_hashes )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self.seed_type, self.seed_data, self.created, self.modified, self.source_time, self.status, self.note, serialisable_urls, serialisable_tags, serialisable_hashes ) = serialisable_info
        
        self._urls = set( serialisable_urls )
        self._tags = set( serialisable_tags )
        self._hashes = { hash_type : encoded_hash.decode( 'hex' ) for ( hash_type, encoded_hash ) in serialisable_hashes if encoded_hash is not None }
    
    def _NormaliseAndFilterAssociableURLs( self, urls ):
        
        normalised_urls = { HG.client_controller.network_engine.domain_manager.NormaliseURL( url ) for url in urls }
        
        associable_urls = { url for url in normalised_urls if HG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
        
        return associable_urls
    
    def _UpdateModified( self ):
        
        self.modified = HydrusData.GetNow()
def AddParseResults( self, parse_results ):
for ( hash_type, hash ) in ClientParsing.GetHashesFromParseResults( parse_results ):
if hash_type not in self._hashes:
self._hashes[ hash_type ] = hash
urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, HC.URL_TYPE_POST ) )
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
associable_urls.discard( self.seed_data )
self._urls.update( associable_urls )
tags = ClientParsing.GetTagsFromParseResults( parse_results )
self._tags.update( tags )
source_timestamp = ClientParsing.GetTimestampFromParseResults( parse_results, HC.TIMESTAMP_TYPE_SOURCE )
if source_timestamp is not None:
# clamp parsed timestamps to a little before now--sites sometimes report bogus times, and min( x, None ) only 'works' on py2 by accident, so test for None first
source_timestamp = min( HydrusData.GetNow() - 30, source_timestamp )
self.source_time = source_timestamp
self._UpdateModified()
def AddTags( self, tags ):
tags = HydrusTags.CleanTags( tags )
self._tags.update( tags )
self._UpdateModified()
def AddURL( self, url ):
urls = ( url, )
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
associable_urls.discard( self.seed_data )
self._urls.update( associable_urls )
def CheckPreFetchMetadata( self, tag_import_options ):
self._CheckTagsBlacklist( self._tags, tag_import_options )
def DownloadAndImportRawFile( self, file_url, file_import_options, network_job_factory, network_job_presentation_context_factory ):
self.AddURL( file_url )
( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
try:
if self.seed_data != file_url:
referral_url = self.seed_data
else:
referral_url = None
network_job = network_job_factory( 'GET', file_url, temp_path = temp_path, referral_url = referral_url )
HG.client_controller.network_engine.AddJob( network_job )
with network_job_presentation_context_factory( network_job ) as njpc:
network_job.WaitUntilDone()
self.Import( temp_path, file_import_options )
finally:
HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
def FetchPageMetadata( self, tag_import_options ):
pass
def PredictPreImportStatus( self, file_import_options, file_url = None ):
if self.status != CC.STATUS_UNKNOWN:
return
UNKNOWN_DEFAULT = ( CC.STATUS_UNKNOWN, None, '' )
( status, hash, note ) = UNKNOWN_DEFAULT
# urls
urls = set( self._urls )
if file_url is not None:
urls.add( file_url )
if self.seed_type == SEED_TYPE_URL:
urls.add( self.seed_data )
unrecognised_url_results = set()
for url in urls:
if HG.client_controller.network_engine.domain_manager.URLCanReferToMultipleFiles( url ):
continue
# we now only trust url-matched single urls and the post/file urls
# trusting unmatched source urls was too much of a hassle with too many boorus providing bad source urls like user account pages
if HG.client_controller.network_engine.domain_manager.URLDefinitelyRefersToOneFile( url ) or url in ( self.seed_data, file_url ):
results = HG.client_controller.Read( 'url_statuses', url )
if len( results ) == 0: # if no match found, no useful data discovered
continue
elif len( results ) > 1: # if more than one file claims this url, it cannot be relied on to guess the file
continue
else: # i.e. 1 match found
( status, hash, note ) = results[0]
if status != CC.STATUS_UNKNOWN:
break # if a known one-file url gives a single clear result, that result is reliable
# hashes
if status == CC.STATUS_UNKNOWN:
for ( hash_type, found_hash ) in self._hashes.items():
( status, hash, note ) = HG.client_controller.Read( 'hash_status', hash_type, found_hash )
if status != CC.STATUS_UNKNOWN:
break
#
if status == CC.STATUS_DELETED:
if not file_import_options.ExcludesDeleted():
status = CC.STATUS_UNKNOWN
note = ''
self.status = status
if hash is not None:
self._hashes[ 'sha256' ] = hash
self.note = note
self._UpdateModified()
def GetHash( self ):
if 'sha256' in self._hashes:
return self._hashes[ 'sha256' ]
return None
def GetSearchSeeds( self ):
if self.seed_type == SEED_TYPE_URL:
search_urls = ClientNetworkingDomain.GetSearchURLs( self.seed_data )
search_seeds = [ Seed( SEED_TYPE_URL, search_url ) for search_url in search_urls ]
else:
search_seeds = [ self ]
return search_seeds
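# one post url can have several equivalent forms (e.g. http vs https), so rather than testing a
# single url, callers get one search seed per variant that GetSearchURLs reports. a sketch
# (hypothetical url):
#
#   Seed( SEED_TYPE_URL, 'https://site.example/post/123' ).GetSearchSeeds()
#
# may return seeds for both the https and http forms, which a seed cache can then test against.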
def HasHash( self ):
return self.GetHash() is not None
def Import( self, temp_path, file_import_options ):
file_import_job = FileImportJob( temp_path, file_import_options )
( status, hash, note ) = HG.client_controller.client_files_manager.ImportFile( file_import_job )
self.SetStatus( status, note = note )
self.SetHash( hash )
def ImportPath( self, file_import_options ):
if self.seed_type != SEED_TYPE_HDD:
raise Exception( 'Attempted to import as a path, but I do not think I am a path!' )
( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
try:
path = self.seed_data
copied = HydrusPaths.MirrorFile( path, temp_path )
if not copied:
raise Exception( 'File failed to copy to temp path--see log for error.' )
self.Import( temp_path, file_import_options )
finally:
HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
def Normalise( self ):
if self.seed_type == SEED_TYPE_URL:
self.seed_data = HG.client_controller.network_engine.domain_manager.NormaliseURL( self.seed_data )
def PresentToPage( self, page_key ):
hash = self.GetHash()
if hash is not None:
( media_result, ) = HG.client_controller.Read( 'media_results', ( hash, ) )
HG.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
def SetHash( self, hash ):
if hash is not None:
self._hashes[ 'sha256' ] = hash
def SetStatus( self, status, note = '', exception = None ):
if exception is not None:
first_line = HydrusData.ToUnicode( exception ).split( os.linesep )[0]
note = first_line + u'\u2026 (Copy note to see full error)'
note += os.linesep
note += HydrusData.ToUnicode( traceback.format_exc() )
HydrusData.Print( 'Error when processing ' + self.seed_data + ' !' )
HydrusData.Print( traceback.format_exc() )
self.status = status
self.note = note
self._UpdateModified()
def ShouldDownloadFile( self ):
return self.status == CC.STATUS_UNKNOWN
def ShouldFetchPageMetadata( self, tag_import_options ):
if self.status == CC.STATUS_UNKNOWN:
return True
if self.status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
if tag_import_options.WorthFetchingTags() and tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB():
return True
return False
def ShouldPresent( self, file_import_options ):
hash = self.GetHash()
if hash is not None and self.status in CC.SUCCESSFUL_IMPORT_STATES:
if file_import_options.ShouldPresentIgnorantOfInbox( self.status ):
return True
in_inbox = HG.client_controller.Read( 'in_inbox', hash )
if file_import_options.ShouldPresent( self.status, in_inbox ):
return True
return False
def WorkOnFileURL( self, file_import_options, status_hook, network_job_factory, network_job_presentation_context_factory, tag_import_options = None ):
did_substantial_work = False
try:
status_hook( 'checking url status' )
self.PredictPreImportStatus( file_import_options )
if self.status == CC.STATUS_UNKNOWN:
file_url = self.seed_data
status_hook( 'downloading file' )
self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
did_substantial_work = True
did_substantial_work |= self.WriteContentUpdates( tag_import_options )
except HydrusExceptions.ShutdownException:
return False
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
self.SetStatus( status, note = note )
if isinstance( e, HydrusExceptions.CancelledException ):
status_hook( 'cancelled!' )
time.sleep( 2 )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
self.SetStatus( status, note = note )
status_hook( '404' )
time.sleep( 2 )
except Exception as e:
status = CC.STATUS_ERROR
self.SetStatus( status, exception = e )
status_hook( 'error!' )
time.sleep( 3 )
return did_substantial_work
def WorkOnPostURL( self, file_import_options, tag_import_options, status_hook, network_job_factory, network_job_presentation_context_factory ):
did_substantial_work = False
try:
status_hook( 'checking url status' )
self.PredictPreImportStatus( file_import_options )
if self.ShouldFetchPageMetadata( tag_import_options ):
post_url = self.seed_data
( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
status_hook( 'downloading page' )
network_job = network_job_factory( 'GET', url_to_check )
HG.client_controller.network_engine.AddJob( network_job )
with network_job_presentation_context_factory( network_job ) as njpc:
network_job.WaitUntilDone()
data = network_job.GetContent()
parsing_context = {}
parsing_context[ 'post_url' ] = post_url
parsing_context[ 'url' ] = url_to_check
all_parse_results = parser.Parse( parsing_context, data )
if len( all_parse_results ) == 0:
raise HydrusExceptions.VetoException( 'Could not parse any data!' )
parse_results = all_parse_results[0]
# this now needs to deal with multiple file post urls cleverly, which I think means no longer associating file_urls at this point--do that url association in DownloadAndImportRawFile only
self.AddParseResults( parse_results )
self.CheckPreFetchMetadata( tag_import_options )
file_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, ), only_get_top_priority = True )
if len( file_urls ) == 0:
raise HydrusExceptions.VetoException( 'Could not find a file URL!' )
elif len( file_urls ) == 1 or True: # let's still mandate this single-file path for a bit
file_url = file_urls[0]
self.PredictPreImportStatus( file_import_options, file_url )
if self.ShouldDownloadFile():
status_hook( 'downloading file' )
self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
did_substantial_work = True
else:
raise HydrusExceptions.VetoException( 'Multiple-file post pages are not yet supported!' )
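# NOTE: the loop below appears to be unreachable sketch code for future multiple-file post
# support (note the raise above)--kept here as a plan, per the comments that follow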
for file_url in file_urls:
duplicate_seed = self.Duplicate() # inherits all urls and tags from here
duplicate_seed.seed_data = file_url
duplicate_seed.AddURL( self.seed_data )
# set referral url as my seed_data--this should probably auto-do AddURL( self.seed_data ) tbh
# insert in my seed cache just after me
status = CC.STATUS_SUCCESSFUL_AND_NEW
note = 'Found ' + HydrusData.ConvertIntToPrettyString( len( file_urls ) ) + ' File URLs in this page.'
self.SetStatus( status, note = note )
# alter seeds so:
# referral url is saved and used in workonfileurl and workonposturl
# gallery/sub import loops can now handle workonfileurl
# there is also the question of pixiv manga pages, which may need linking from the mode=medium page, which is two jumps
# this presumably means adding a new url content type to the parser like 'addable post url' or something
did_substantial_work |= self.WriteContentUpdates( tag_import_options )
except HydrusExceptions.ShutdownException:
return False
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
self.SetStatus( status, note = note )
if isinstance( e, HydrusExceptions.CancelledException ):
status_hook( 'cancelled!' )
time.sleep( 2 )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
self.SetStatus( status, note = note )
status_hook( '404' )
time.sleep( 2 )
except Exception as e:
status = CC.STATUS_ERROR
self.SetStatus( status, exception = e )
status_hook( 'error!' )
time.sleep( 3 )
return did_substantial_work
def WorksInNewSystem( self ):
if self.seed_type == SEED_TYPE_URL:
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.seed_data )
if url_type == HC.URL_TYPE_POST and can_parse:
return True
return False
def WriteContentUpdates( self, tag_import_options = None ):
did_work = False
if self.status == CC.STATUS_ERROR:
return did_work
hash = self.GetHash()
if hash is None:
return did_work
service_keys_to_content_updates = collections.defaultdict( list )
urls = set( self._urls )
if self.seed_type == SEED_TYPE_URL:
urls.add( self.seed_data )
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
if len( associable_urls ) > 0:
content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( associable_urls, ( hash, ) ) )
service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
if tag_import_options is not None:
for ( service_key, content_updates ) in tag_import_options.GetServiceKeysToContentUpdates( hash, set( self._tags ) ).items():
service_keys_to_content_updates[ service_key ].extend( content_updates )
if len( service_keys_to_content_updates ) > 0:
HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
did_work = True
return did_work
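# shape of the batched write above, roughly (hypothetical values):
#
#   { CC.COMBINED_LOCAL_FILE_SERVICE_KEY : [ <url association ContentUpdate> ],
#     <tag service key> : [ <tag mapping ContentUpdates from tag_import_options> ] }
#
# bundling url and tag writes into one synchronous 'content_updates' call keeps them together
# in a single db job per seed.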
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED ] = Seed
class SeedCache( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE
SERIALISABLE_NAME = 'Import File Status Cache'
SERIALISABLE_VERSION = 8
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
self._seeds = HydrusSerialisable.SerialisableList()
self._seeds_to_indices = {}
self._seed_cache_key = HydrusData.GenerateKey()
self._status_cache = None
self._status_cache_generation_time = 0
self._dirty = True
self._lock = threading.Lock()
def __len__( self ):
return len( self._seeds )
def _GenerateStatus( self ):
statuses_to_counts = self._GetStatusesToCounts()
self._status_cache = GenerateSeedCacheStatus( statuses_to_counts )
self._status_cache_generation_time = HydrusData.GetNow()
self._dirty = False
def _GetStatusesToCounts( self ):
statuses_to_counts = collections.Counter()
for seed in self._seeds:
statuses_to_counts[ seed.status ] += 1
return statuses_to_counts
def _GetSeeds( self, status = None ):
if status is None:
return list( self._seeds )
else:
return [ seed for seed in self._seeds if seed.status == status ]
def _GetSerialisableInfo( self ):
with self._lock:
return self._seeds.GetSerialisableTuple()
def _GetSourceTimestamp( self, seed ):
source_timestamp = seed.source_time
if source_timestamp is None:
# decent fallback compromise
# -30 since added and 'last check' timestamps are often the same, and this messes up calculations
source_timestamp = seed.created - 30
return source_timestamp
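# worked example of the fallback: a seed created at t=1000 with no parsed source time is
# treated as source time 970. a later 'how many new files since the check at t=1000?' count
# therefore excludes it, which matches the intent--files found by a check should not look like
# files that arrived after it.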
def _HasSeed( self, seed ):
search_seeds = seed.GetSearchSeeds()
has_seed = True in ( search_seed in self._seeds_to_indices for search_seed in search_seeds )
return has_seed
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
with self._lock:
self._seeds = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
def _SetDirty( self ):
self._dirty = True
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
if 'note' in seed_info:
seed_info[ 'note' ] = HydrusData.ToUnicode( seed_info[ 'note' ] )
new_serialisable_info.append( ( seed, seed_info ) )
return ( 2, new_serialisable_info )
if version in ( 2, 3 ):
# gelbooru replaced their thumbnail links with this redirect spam
# 'https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY4ODA1OA=='
# I missed some http ones here, so I've broadened the test and rescheduled it
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
if 'gelbooru.com/redirect.php' in seed:
continue
new_serialisable_info.append( ( seed, seed_info ) )
return ( 4, new_serialisable_info )
if version == 4:
def ConvertRegularToRawURL( regular_url ):
# convert this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
# to this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# the 500 part can be a bunch of stuff, including letters
url_components = regular_url.split( '_' )
last_component = url_components[ -1 ]
( number_gubbins, file_ext ) = last_component.split( '.' )
raw_last_component = 'raw.' + file_ext
url_components[ -1 ] = raw_last_component
raw_url = '_'.join( url_components )
return raw_url
def Remove68Subdomain( long_url ):
# sometimes the 68 subdomain gives a 404 on the raw url, so:
# convert this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# to this:
# http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# I am not sure if it is always 68, but let's not assume
( scheme, rest ) = long_url.split( '://', 1 )
if rest.startswith( 'media.tumblr.com' ):
return long_url
( gumpf, shorter_rest ) = rest.split( '.', 1 )
shorter_url = scheme + '://' + shorter_rest
return shorter_url
new_serialisable_info = []
good_seeds = set()
for ( seed, seed_info ) in old_serialisable_info:
try:
parse = urlparse.urlparse( seed )
if 'media.tumblr.com' in parse.netloc:
seed = Remove68Subdomain( seed )
seed = ConvertRegularToRawURL( seed )
seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( seed )
if 'pixiv.net' in parse.netloc:
seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( seed )
if seed in good_seeds: # we hit a dupe, so skip it
continue
except:
pass
good_seeds.add( seed )
new_serialisable_info.append( ( seed, seed_info ) )
return ( 5, new_serialisable_info )
if version == 5:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
seed_info[ 'source_timestamp' ] = None
new_serialisable_info.append( ( seed, seed_info ) )
return ( 6, new_serialisable_info )
if version == 6:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
try:
magic_phrase = '//media.tumblr.com'
replacement = '//data.tumblr.com'
if magic_phrase in seed:
seed = seed.replace( magic_phrase, replacement )
except:
pass
new_serialisable_info.append( ( seed, seed_info ) )
return ( 7, new_serialisable_info )
if version == 7:
seeds = HydrusSerialisable.SerialisableList()
for ( seed_text, seed_info ) in old_serialisable_info:
if seed_text.startswith( 'http' ):
seed_type = SEED_TYPE_URL
else:
seed_type = SEED_TYPE_HDD
seed = Seed( seed_type, seed_text )
seed.status = seed_info[ 'status' ]
seed.created = seed_info[ 'added_timestamp' ]
seed.modified = seed_info[ 'last_modified_timestamp' ]
seed.source_time = seed_info[ 'source_timestamp' ]
seed.note = seed_info[ 'note' ]
seeds.append( seed )
new_serialisable_info = seeds.GetSerialisableTuple()
return ( 8, new_serialisable_info )
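# note the pattern here: each branch lifts the data exactly one step and returns, and the
# serialisable system keeps calling until the stored version reaches SERIALISABLE_VERSION,
# so a v1 cache walks 1->2->4->5->6->7->8 (versions 2 and 3 share the gelbooru scrub, hence
# the combined branch that jumps straight to 4).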
def AddSeeds( self, seeds ):
if len( seeds ) == 0:
return 0
new_seeds = []
with self._lock:
for seed in seeds:
if self._HasSeed( seed ):
continue
seed.Normalise()
new_seeds.append( seed )
self._seeds.append( seed )
self._seeds_to_indices[ seed ] = len( self._seeds ) - 1
self._SetDirty()
self.NotifySeedsUpdated( new_seeds )
return len( new_seeds )
def AdvanceSeed( self, seed ):
with self._lock:
if seed in self._seeds_to_indices:
index = self._seeds_to_indices[ seed ]
if index > 0:
self._seeds.remove( seed )
self._seeds.insert( index - 1, seed )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self.NotifySeedsUpdated( ( seed, ) )
def CanCompact( self, compact_before_this_source_time ):
with self._lock:
if len( self._seeds ) <= 100:
return False
for seed in self._seeds[:-100]:
if seed.status == CC.STATUS_UNKNOWN:
continue
if self._GetSourceTimestamp( seed ) < compact_before_this_source_time:
return True
return False
def Compact( self, compact_before_this_source_time ):
with self._lock:
if len( self._seeds ) <= 100:
return
new_seeds = HydrusSerialisable.SerialisableList()
for seed in self._seeds[:-100]:
still_to_do = seed.status == CC.STATUS_UNKNOWN
still_relevant = self._GetSourceTimestamp( seed ) > compact_before_this_source_time
if still_to_do or still_relevant:
new_seeds.append( seed )
new_seeds.extend( self._seeds[-100:] )
self._seeds = new_seeds
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self._SetDirty()
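# compaction sketch: the newest 100 seeds always survive; older seeds survive only if they are
# still unprocessed (STATUS_UNKNOWN) or their source time is after the cutoff. so with a cutoff
# of 'three months ago', completed imports older than that are dropped, while anything newer or
# still pending is kept.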
def DelaySeed( self, seed ):
with self._lock:
if seed in self._seeds_to_indices:
index = self._seeds_to_indices[ seed ]
if index < len( self._seeds ) - 1:
self._seeds.remove( seed )
self._seeds.insert( index + 1, seed )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self.NotifySeedsUpdated( ( seed, ) )
def GetEarliestSourceTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return None
earliest_timestamp = min( ( self._GetSourceTimestamp( seed ) for seed in self._seeds ) )
return earliest_timestamp
def GetLatestAddedTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return 0
latest_timestamp = max( ( seed.created for seed in self._seeds ) )
return latest_timestamp
def GetLatestSourceTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return 0
latest_timestamp = max( ( self._GetSourceTimestamp( seed ) for seed in self._seeds ) )
return latest_timestamp
def GetNextSeed( self, status ):
with self._lock:
for seed in self._seeds:
if seed.status == status:
return seed
return None
def GetNumNewFilesSince( self, since ):
num_files = 0
with self._lock:
for seed in self._seeds:
source_timestamp = self._GetSourceTimestamp( seed )
if source_timestamp >= since:
num_files += 1
return num_files
def GetPresentedHashes( self, file_import_options ):
with self._lock:
hashes = []
for seed in self._seeds:
if seed.HasHash() and seed.ShouldPresent( file_import_options ):
hashes.append( seed.GetHash() )
return hashes
def GetSeedCacheKey( self ):
return self._seed_cache_key
def GetSeedCount( self, status = None ):
result = 0
with self._lock:
if status is None:
result = len( self._seeds )
else:
for seed in self._seeds:
if seed.status == status:
result += 1
return result
def GetSeeds( self, status = None ):
with self._lock:
return self._GetSeeds( status )
def GetSeedIndex( self, seed ):
with self._lock:
return self._seeds_to_indices[ seed ]
def GetStatus( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
return self._status_cache
def GetStatusGenerationTime( self ):
with self._lock:
if self._dirty:
return HydrusData.GetNow()
return self._status_cache_generation_time
def GetStatusesToCounts( self ):
with self._lock:
return self._GetStatusesToCounts()
def GetValueRange( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
( status, ( total_processed, total ) ) = self._status_cache
return ( total_processed, total )
def HasSeed( self, seed ):
with self._lock:
return self._HasSeed( seed )
def NotifySeedsUpdated( self, seeds ):
with self._lock:
self._SetDirty()
HG.client_controller.pub( 'seed_cache_seeds_updated', self._seed_cache_key, seeds )
def RemoveSeeds( self, seeds ):
with self._lock:
seeds_to_delete = set( seeds )
self._seeds = HydrusSerialisable.SerialisableList( [ seed for seed in self._seeds if seed not in seeds_to_delete ] )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self._SetDirty()
self.NotifySeedsUpdated( seeds_to_delete )
def RemoveSeedsByStatus( self, statuses_to_remove ):
with self._lock:
seeds_to_delete = [ seed for seed in self._seeds if seed.status in statuses_to_remove ]
self.RemoveSeeds( seeds_to_delete )
def RemoveAllButUnknownSeeds( self ):
with self._lock:
seeds_to_delete = [ seed for seed in self._seeds if seed.status != CC.STATUS_UNKNOWN ]
self.RemoveSeeds( seeds_to_delete )
def RetryFailures( self ):
with self._lock:
failed_seeds = self._GetSeeds( CC.STATUS_ERROR )
for seed in failed_seeds:
seed.SetStatus( CC.STATUS_UNKNOWN )
self.NotifySeedsUpdated( failed_seeds )
def WorkToDo( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
( status, ( total_processed, total ) ) = self._status_cache
return total_processed < total
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE ] = SeedCache
class SimpleDownloaderImport( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_IMPORT
SERIALISABLE_NAME = 'Simple Downloader Import'
SERIALISABLE_VERSION = 4
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
self._pending_jobs = []
self._seed_cache = SeedCache()
self._file_import_options = file_import_options
self._formula_name = 'all files linked by images in page'
self._queue_paused = False
self._files_paused = False
self._parser_status = ''
self._current_action = ''
self._download_control_file_set = None
self._download_control_file_clear = None
self._download_control_page_set = None
self._download_control_page_clear = None
self._lock = threading.Lock()
self._files_repeating_job = None
self._queue_repeating_job = None
HG.client_controller.sub( self, 'NotifySeedsUpdated', 'seed_cache_seeds_updated' )
def _FileNetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_file_set is not None:
wx.CallAfter( self._download_control_file_set, network_job )
def exit_call():
with self._lock:
if self._download_control_file_clear is not None:
wx.CallAfter( self._download_control_file_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _GetSerialisableInfo( self ):
serialisable_pending_jobs = [ ( url, simple_downloader_formula.GetSerialisableTuple() ) for ( url, simple_downloader_formula ) in self._pending_jobs ]
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
return ( serialisable_pending_jobs, serialisable_seed_cache, serialisable_file_options, self._formula_name, self._queue_paused, self._files_paused )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_pending_jobs, serialisable_seed_cache, serialisable_file_options, self._formula_name, self._queue_paused, self._files_paused ) = serialisable_info
self._pending_jobs = [ ( url, HydrusSerialisable.CreateFromSerialisableTuple( serialisable_simple_downloader_formula ) ) for ( url, serialisable_simple_downloader_formula ) in serialisable_pending_jobs ]
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
def _PageNetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_page_set is not None:
wx.CallAfter( self._download_control_page_set, network_job )
def exit_call():
with self._lock:
if self._download_control_page_clear is not None:
wx.CallAfter( self._download_control_page_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, paused ) = old_serialisable_info
queue_paused = paused
files_paused = paused
new_serialisable_info = ( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, queue_paused, files_paused )
return ( 2, new_serialisable_info )
if version == 2:
( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, queue_paused, files_paused ) = old_serialisable_info
pending_jobs = []
new_serialisable_info = ( pending_jobs, serialisable_seed_cache, serialisable_file_options, queue_paused, files_paused )
return ( 3, new_serialisable_info )
if version == 3:
( pending_jobs, serialisable_seed_cache, serialisable_file_options, queue_paused, files_paused ) = old_serialisable_info
pending_jobs = []
formula_name = 'all files linked by images in page'
new_serialisable_info = ( pending_jobs, serialisable_seed_cache, serialisable_file_options, formula_name, queue_paused, files_paused )
return ( 4, new_serialisable_info )
def _WorkOnFiles( self, page_key ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return
did_substantial_work = False
file_url = seed.seed_data
try:
def status_hook( text ):
with self._lock:
self._current_action = text
did_substantial_work = seed.WorkOnFileURL( self._file_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._FileNetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
except Exception as e:
status = CC.STATUS_ERROR
seed.SetStatus( status, exception = e )
time.sleep( 3 )
finally:
self._seed_cache.NotifySeedsUpdated( ( seed, ) )
with self._lock:
self._current_action = ''
if did_substantial_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
def _WorkOnQueue( self, page_key ):
if len( self._pending_jobs ) > 0:
with self._lock:
( url, simple_downloader_formula ) = self._pending_jobs.pop( 0 )
self._parser_status = 'checking ' + url
error_occurred = False
try:
network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, 'GET', url )
network_job.OverrideBandwidth( 30 )
HG.client_controller.network_engine.AddJob( network_job )
with self._PageNetworkJobPresentationContextFactory( network_job ):
network_job.WaitUntilDone()
data = network_job.GetContent()
#
parsing_context = {}
parsing_context[ 'url' ] = url
parsing_formula = simple_downloader_formula.GetFormula()
file_urls = [ urlparse.urljoin( url, parsed_text ) for parsed_text in parsing_formula.Parse( parsing_context, data ) ]
seeds = [ Seed( SEED_TYPE_URL, file_url ) for file_url in file_urls ]
for seed in seeds:
seed.AddURL( url )
num_new = self._seed_cache.AddSeeds( seeds )
if num_new > 0:
WakeRepeatingJob( self._files_repeating_job )
parser_status = 'page checked OK - ' + HydrusData.ConvertIntToPrettyString( num_new ) + ' new urls'
num_already_in_seed_cache = len( file_urls ) - num_new
if num_already_in_seed_cache > 0:
parser_status += ' (' + HydrusData.ConvertIntToPrettyString( num_already_in_seed_cache ) + ' already in queue)'
except HydrusExceptions.ShutdownException:
return
except HydrusExceptions.NotFoundException:
error_occurred = True
parser_status = 'page 404'
except Exception as e:
error_occurred = True
parser_status = HydrusData.ToUnicode( e )
with self._lock:
self._parser_status = parser_status
if error_occurred:
time.sleep( 5 )
return True
else:
with self._lock:
self._parser_status = ''
return False
def AdvanceJob( self, job ):
with self._lock:
if job in self._pending_jobs:
index = self._pending_jobs.index( job )
if index - 1 >= 0:
self._pending_jobs.remove( job )
self._pending_jobs.insert( index - 1, job )
def CurrentlyWorking( self ):
with self._lock:
finished = not self._seed_cache.WorkToDo() and len( self._pending_jobs ) == 0 # only finished when both the file queue and the pending page jobs are exhausted
return not finished and not self._files_paused
def DelayJob( self, job ):
with self._lock:
if job in self._pending_jobs:
index = self._pending_jobs.index( job )
if index + 1 < len( self._pending_jobs ):
self._pending_jobs.remove( job )
self._pending_jobs.insert( index + 1, job )
def DeleteJob( self, job ):
with self._lock:
if job in self._pending_jobs:
self._pending_jobs.remove( job )
def GetSeedCache( self ):
with self._lock:
return self._seed_cache
def GetFileImportOptions( self ):
with self._lock:
return self._file_import_options
def GetFormulaName( self ):
with self._lock:
return self._formula_name
def GetStatus( self ):
with self._lock:
return ( list( self._pending_jobs ), self._parser_status, self._current_action, self._queue_paused, self._files_paused )
def GetValueRange( self ):
with self._lock:
return self._seed_cache.GetValueRange()
def NotifySeedsUpdated( self, seed_cache_key, seeds ):
if seed_cache_key == self._seed_cache.GetSeedCacheKey():
WakeRepeatingJob( self._files_repeating_job )
def PausePlayFiles( self ):
with self._lock:
self._files_paused = not self._files_paused
WakeRepeatingJob( self._files_repeating_job )
def PausePlayQueue( self ):
with self._lock:
self._queue_paused = not self._queue_paused
WakeRepeatingJob( self._queue_repeating_job )
def PendJob( self, job ):
with self._lock:
if job not in self._pending_jobs:
self._pending_jobs.append( job )
WakeRepeatingJob( self._queue_repeating_job )
def SetDownloadControlFile( self, download_control ):
with self._lock:
self._download_control_file_set = download_control.SetNetworkJob
self._download_control_file_clear = download_control.ClearNetworkJob
def SetDownloadControlPage( self, download_control ):
with self._lock:
self._download_control_page_set = download_control.SetNetworkJob
self._download_control_page_clear = download_control.ClearNetworkJob
def SetFileImportOptions( self, file_import_options ):
with self._lock:
self._file_import_options = file_import_options
def SetFormulaName( self, formula_name ):
with self._lock:
self._formula_name = formula_name
def Start( self, page_key ):
self._files_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
self._queue_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnQueue, page_key )
def REPEATINGWorkOnFiles( self, page_key ):
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._files_repeating_job.Cancel()
return
work_to_do = self._seed_cache.WorkToDo() and not ( self._files_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
while work_to_do:
try:
self._WorkOnFiles( page_key )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._files_repeating_job.Cancel()
return
work_to_do = self._seed_cache.WorkToDo() and not ( self._files_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
def REPEATINGWorkOnQueue( self, page_key ):
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._queue_repeating_job.Cancel()
return
ok_to_work = not ( self._queue_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
while ok_to_work:
try:
did_work = self._WorkOnQueue( page_key )
if did_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
else:
return
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._queue_repeating_job.Cancel()
return
ok_to_work = not ( self._queue_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_IMPORT ] = SimpleDownloaderImport
class Subscription( HydrusSerialisable.SerialisableBaseNamed ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION
SERIALISABLE_NAME = 'Subscription'
SERIALISABLE_VERSION = 6
def __init__( self, name ):
HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
self._gallery_identifier = ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEVIANT_ART )
self._gallery_stream_identifiers = ClientDownloading.GetGalleryStreamIdentifiers( self._gallery_identifier )
self._queries = []
new_options = HG.client_controller.new_options
self._checker_options = ClientDefaults.GetDefaultCheckerOptions( 'artist subscription' )
if HC.options[ 'gallery_file_limit' ] is None:
self._initial_file_limit = 200
else:
self._initial_file_limit = min( 200, HC.options[ 'gallery_file_limit' ] )
self._periodic_file_limit = 50
self._paused = False
self._file_import_options = new_options.GetDefaultFileImportOptions( 'quiet' )
self._tag_import_options = new_options.GetDefaultTagImportOptions( self._gallery_identifier )
self._last_gallery_page_hit_timestamp = 0
self._no_work_until = 0
self._no_work_until_reason = ''
self._publish_files_to_popup_button = True
self._publish_files_to_page = False
self._merge_query_publish_events = True
def _DelayWork( self, time_delta, reason ):
self._no_work_until = HydrusData.GetNow() + time_delta
self._no_work_until_reason = reason
def _GetExampleNetworkContexts( self, query ):
seed_cache = query.GetSeedCache()
seed = seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return [ ClientNetworkingContexts.NetworkContext( CC.NETWORK_CONTEXT_SUBSCRIPTION, self._GetNetworkJobSubscriptionKey( query ) ), ClientNetworkingContexts.GLOBAL_NETWORK_CONTEXT ]
url = seed.seed_data
example_nj = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), 'GET', url )
example_network_contexts = example_nj.GetNetworkContexts()
return example_network_contexts
def _GetNetworkJobSubscriptionKey( self, query ):
query_text = query.GetQueryText()
return self._name + ': ' + query_text
def _GetQueriesForProcessing( self ):
queries = list( self._queries )
if HG.client_controller.new_options.GetBoolean( 'process_subs_in_random_order' ):
random.shuffle( queries )
else:
def key( q ):
return q.GetQueryText()
queries.sort( key = key )
return queries
def _GetSerialisableInfo( self ):
serialisable_gallery_identifier = self._gallery_identifier.GetSerialisableTuple()
serialisable_gallery_stream_identifiers = [ gallery_stream_identifier.GetSerialisableTuple() for gallery_stream_identifier in self._gallery_stream_identifiers ]
serialisable_queries = [ query.GetSerialisableTuple() for query in self._queries ]
serialisable_checker_options = self._checker_options.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
serialisable_tag_options = self._tag_import_options.GetSerialisableTuple()
return ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, serialisable_file_options, serialisable_tag_options, self._no_work_until, self._no_work_until_reason, self._publish_files_to_popup_button, self._publish_files_to_page, self._merge_query_publish_events )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, serialisable_file_options, serialisable_tag_options, self._no_work_until, self._no_work_until_reason, self._publish_files_to_popup_button, self._publish_files_to_page, self._merge_query_publish_events ) = serialisable_info
self._gallery_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_identifier )
self._gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_stream_identifier ) for serialisable_gallery_stream_identifier in serialisable_gallery_stream_identifiers ]
self._queries = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_query ) for serialisable_query in serialisable_queries ]
self._checker_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_checker_options )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
def _NoDelays( self ):
return HydrusData.TimeHasPassed( self._no_work_until )
def _QueryBandwidthIsOK( self, query ):
example_network_contexts = self._GetExampleNetworkContexts( query )
# just a little padding here
expected_requests = 3
expected_bytes = 1048576
threshold = 30
result = HG.client_controller.network_engine.bandwidth_manager.CanDoWork( example_network_contexts, expected_requests = expected_requests, expected_bytes = expected_bytes, threshold = threshold )
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query.GetQueryText() + '" pre-work bandwidth test. Bandwidth ok: ' + str( result ) + '.' )
return result
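# the 'padding' above is deliberate: rather than asking 'can I make one request right now?',
# this asks the bandwidth manager whether roughly a file's worth of work (3 requests, ~1MB)
# will be doable within a 30s threshold, so a query does not start a sync it cannot usefully
# continue.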
def _ShowHitPeriodicFileLimitMessage( self, query_text ):
message = 'When syncing, the query "' + query_text + '" for subscription "' + self._name + '" hit its periodic file limit!'
message += os.linesep * 2
message += 'This may be because the query has not run in a while--so a backlog of files has built up--or because the site has changed how it presents file urls on its gallery pages (and so the subscription thinks it is seeing new files when it truly is not).'
message += os.linesep * 2
message += 'If the former is true, you might want to fill in the gap with a manual download page, but if the latter is true, the maintainer for the download parser (hydrus dev or whoever) would be interested in knowing this information so they can roll out a fix.'
HydrusData.ShowText( message )
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, last_error, serialisable_seed_cache ) = old_serialisable_info
check_now = False
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, serialisable_seed_cache )
return ( 2, new_serialisable_info )
if version == 2:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, serialisable_seed_cache ) = old_serialisable_info
no_work_until = 0
no_work_until_reason = ''
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, no_work_until, no_work_until_reason, serialisable_seed_cache )
return ( 3, new_serialisable_info )
if version == 3:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, no_work_until, no_work_until_reason, serialisable_seed_cache ) = old_serialisable_info
checker_options = ClientImportOptions.CheckerOptions( 5, period / 5, period * 10, ( 1, period * 10 ) )
seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
query = SubscriptionQuery( query )
query._seed_cache = seed_cache
query._last_check_time = last_checked
query.UpdateNextCheckTime( checker_options )
queries = [ query ]
serialisable_queries = [ query.GetSerialisableTuple() for query in queries ]
serialisable_checker_options = checker_options.GetSerialisableTuple()
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason )
return ( 4, new_serialisable_info )
if version == 4:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason ) = old_serialisable_info
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason )
return ( 5, new_serialisable_info )
if version == 5:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason ) = old_serialisable_info
publish_files_to_popup_button = True
publish_files_to_page = False
merge_query_publish_events = True
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason, publish_files_to_popup_button, publish_files_to_page, merge_query_publish_events )
return ( 6, new_serialisable_info )
def _WorkOnFiles( self, job_key ):
try:
gallery = ClientDownloading.GetGallery( self._gallery_identifier )
except Exception as e:
HydrusData.PrintException( e )
self._DelayWork( HC.UPDATE_DURATION, 'gallery would not load' )
self._paused = True
HydrusData.ShowText( 'The subscription ' + self._name + ' could not load its gallery! It has been paused and the full error has been written to the log!' )
return
error_count = 0
all_presentation_hashes = []
all_presentation_hashes_fast = set()
queries = self._GetQueriesForProcessing()
for query in queries:
this_query_has_done_work = False
query_text = query.GetQueryText()
seed_cache = query.GetSeedCache()
def network_job_factory( method, url, **kwargs ):
network_job = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), method, url, **kwargs )
network_job.OverrideBandwidth( 30 )
job_key.SetVariable( 'popup_network_job', network_job )
return network_job
gallery.SetNetworkJobFactory( network_job_factory )
text_1 = 'downloading files'
query_summary_name = self._name
if query_text != self._name:
text_1 += ' for "' + query_text + '"'
query_summary_name += ': ' + query_text
job_key.SetVariable( 'popup_text_1', text_1 )
num_urls = seed_cache.GetSeedCount()
presentation_hashes = []
presentation_hashes_fast = set()
while True:
num_unknown = seed_cache.GetSeedCount( CC.STATUS_UNKNOWN )
num_done = num_urls - num_unknown
seed = seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query_text + '" can do no more file work due to running out of unknown urls.' )
break
if job_key.IsCancelled():
self._DelayWork( 300, 'recently cancelled' )
break
p1 = HC.options[ 'pause_subs_sync' ]
p3 = HG.view_shutdown
p4 = not self._QueryBandwidthIsOK( query )
if p1 or p3 or p4:
if p4 and this_query_has_done_work:
job_key.SetVariable( 'popup_text_2', 'no more bandwidth to download files, will do some more later' )
time.sleep( 5 )
break
try:
x_out_of_y = 'file ' + HydrusData.ConvertValueRangeToPrettyString( num_done, num_urls ) + ': '
job_key.SetVariable( 'popup_gauge_2', ( num_done, num_urls ) )
if seed.WorksInNewSystem():
def status_hook( text ):
job_key.SetVariable( 'popup_text_2', x_out_of_y + text )
seed.WorkOnPostURL( self._file_import_options, self._tag_import_options, status_hook, GenerateSubscriptionNetworkJobFactory( self._GetNetworkJobSubscriptionKey( query ) ), GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key ) )
if seed.ShouldPresent( self._file_import_options ):
hash = seed.GetHash()
if hash not in presentation_hashes_fast:
if hash not in all_presentation_hashes_fast:
all_presentation_hashes.append( hash )
all_presentation_hashes_fast.add( hash )
presentation_hashes.append( hash )
presentation_hashes_fast.add( hash )
else:
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'checking url status' )
seed.PredictPreImportStatus( self._file_import_options )
status = seed.status
url = seed.seed_data
if status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
if self._tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB() and self._tag_import_options.WorthFetchingTags():
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'found file in db, fetching tags' )
downloaded_tags = gallery.GetTags( url )
seed.AddTags( downloaded_tags )
elif status == CC.STATUS_UNKNOWN:
( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
try:
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'downloading file' )
if self._tag_import_options.WorthFetchingTags():
downloaded_tags = gallery.GetFileAndTags( temp_path, url )
seed.AddTags( downloaded_tags )
else:
gallery.GetFile( temp_path, url )
seed.CheckPreFetchMetadata( self._tag_import_options )
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'importing file' )
seed.Import( temp_path, self._file_import_options )
hash = seed.GetHash()
if hash not in presentation_hashes_fast:
if seed.ShouldPresent( self._file_import_options ):
if hash not in all_presentation_hashes_fast:
all_presentation_hashes.append( hash )
all_presentation_hashes_fast.add( hash )
presentation_hashes.append( hash )
presentation_hashes_fast.add( hash )
finally:
HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
seed.WriteContentUpdates( self._tag_import_options )
except HydrusExceptions.CancelledException as e:
self._DelayWork( 300, HydrusData.ToUnicode( e ) )
break
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
seed.SetStatus( status, note = note )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
seed.SetStatus( status, note = note )
except Exception as e:
status = CC.STATUS_ERROR
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'file failed' )
seed.SetStatus( status, exception = e )
if isinstance( e, HydrusExceptions.DataMissing ):
# DataMissing is a quick thing to avoid subscription abandons when lots of deleted files in e621 (or any other booru)
# this should be richer in any case in the new system
pass
else:
error_count += 1
time.sleep( 10 )
if error_count > 4:
raise Exception( 'The subscription ' + self._name + ' encountered several errors when downloading files, so it abandoned its sync.' )
this_query_has_done_work = True
if len( presentation_hashes ) > 0:
job_key.SetVariable( 'popup_files', ( list( presentation_hashes ), query_summary_name ) )
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
HG.client_controller.WaitUntilViewFree()
if not self._merge_query_publish_events and len( presentation_hashes ) > 0:
PublishPresentationHashes( query_summary_name, presentation_hashes, self._publish_files_to_popup_button, self._publish_files_to_page )
if self._merge_query_publish_events and len( all_presentation_hashes ) > 0:
PublishPresentationHashes( self._name, all_presentation_hashes, self._publish_files_to_popup_button, self._publish_files_to_page )
job_key.DeleteVariable( 'popup_files' )
job_key.DeleteVariable( 'popup_text_1' )
job_key.DeleteVariable( 'popup_text_2' )
job_key.DeleteVariable( 'popup_gauge_2' )
def _WorkOnFilesCanDoWork( self ):
for query in self._queries:
if query.CanWorkOnFiles():
if self._QueryBandwidthIsOK( query ):
return True
return False
def _SyncQuery( self, job_key ):
have_made_an_initial_sync_bandwidth_notification = False
queries = self._GetQueriesForProcessing()
for query in queries:
can_sync = query.CanSync()
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query.GetQueryText() + '" started. Current can_sync is ' + str( can_sync ) + '.' )
if not can_sync:
continue
done_first_page = False
query_text = query.GetQueryText()
seed_cache = query.GetSeedCache()
this_is_initial_sync = query.IsInitialSync()
total_new_urls = 0
seeds_to_add = set()
seeds_to_add_ordered = []
prefix = 'synchronising'
if query_text != self._name:
prefix += ' "' + query_text + '"'
job_key.SetVariable( 'popup_text_1', prefix )
for gallery_stream_identifier in self._gallery_stream_identifiers:
if this_is_initial_sync:
if self._initial_file_limit is not None and total_new_urls + 1 > self._initial_file_limit:
break
else:
if self._periodic_file_limit is not None and total_new_urls + 1 > self._periodic_file_limit:
self._ShowHitPeriodicFileLimitMessage( query_text )
break
p1 = HC.options[ 'pause_subs_sync' ]
p2 = job_key.IsCancelled()
p3 = HG.view_shutdown
if p1 or p2 or p3:
break
try:
gallery = ClientDownloading.GetGallery( gallery_stream_identifier )
except Exception as e:
HydrusData.PrintException( e )
self._DelayWork( HC.UPDATE_DURATION, 'gallery would not load' )
self._paused = True
HydrusData.ShowText( 'The subscription ' + self._name + ' could not load its gallery! It has been paused and the full error has been written to the log!' )
return
def network_job_factory( method, url, **kwargs ):
network_job = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), method, url, **kwargs )
job_key.SetVariable( 'popup_network_job', network_job )
network_job.OverrideBandwidth( 30 )
return network_job
gallery.SetNetworkJobFactory( network_job_factory )
page_index = 0
num_existing_urls = 0
keep_checking = True
while keep_checking:
new_urls_this_page = 0
try:
p1 = HC.options[ 'pause_subs_sync' ]
p2 = HG.view_shutdown
if p1 or p2:
return
if job_key.IsCancelled():
raise HydrusExceptions.CancelledException( 'gallery parsing cancelled, likely by user' )
next_gallery_page_hit_timestamp = self._last_gallery_page_hit_timestamp + HG.client_controller.new_options.GetInteger( 'gallery_page_wait_period_subscriptions' )
if not HydrusData.TimeHasPassed( next_gallery_page_hit_timestamp ):
if not done_first_page:
page_check_status = 'checking first page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
else:
page_check_status = HydrusData.ConvertIntToPrettyString( total_new_urls ) + ' new urls found, checking next page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
job_key.SetVariable( 'popup_text_1', prefix + ': ' + page_check_status )
time.sleep( 1 )
continue
job_key.SetVariable( 'popup_text_1', prefix + ': found ' + HydrusData.ConvertIntToPrettyString( total_new_urls ) + ' new urls, checking next page' )
try:
( page_of_seeds, definitely_no_more_pages ) = gallery.GetPage( query_text, page_index )
finally:
self._last_gallery_page_hit_timestamp = HydrusData.GetNow()
done_first_page = True
page_index += 1
if definitely_no_more_pages:
keep_checking = False
for seed in page_of_seeds:
if this_is_initial_sync:
if self._initial_file_limit is not None and total_new_urls + 1 > self._initial_file_limit:
keep_checking = False
break
else:
if self._periodic_file_limit is not None and total_new_urls + 1 > self._periodic_file_limit:
self._ShowHitPeriodicFileLimitMessage( query_text )
keep_checking = False
break
if seed in seeds_to_add:
# this catches the occasional overflow when a new file is uploaded while gallery parsing is going on
continue
if seed_cache.HasSeed( seed ):
num_existing_urls += 1
if num_existing_urls > 5:
keep_checking = False
break
else:
seeds_to_add.add( seed )
seeds_to_add_ordered.append( seed )
new_urls_this_page += 1
total_new_urls += 1
if new_urls_this_page == 0:
keep_checking = False
except HydrusExceptions.CancelledException as e:
self._DelayWork( 300, HydrusData.ToUnicode( e ) )
break
except HydrusExceptions.NotFoundException:
# paheal now 404s when no results, so just naturally break
break
seeds_to_add_ordered.reverse()
# 'first' urls are now at the end, so the seed_cache should stay roughly in oldest->newest order
seed_cache.AddSeeds( seeds_to_add_ordered )
query.RegisterSyncComplete()
query.UpdateNextCheckTime( self._checker_options )
if query.IsDead():
if this_is_initial_sync:
HydrusData.ShowText( 'The query "' + query_text + '" for subscription "' + self._name + '" did not find any files on its first sync! Could the query text have a typo, like a missing underscore?' )
else:
HydrusData.ShowText( 'The query "' + query_text + '" for subscription "' + self._name + '" appears to be dead!' )
else:
if this_is_initial_sync:
if not self._QueryBandwidthIsOK( query ) and not have_made_an_initial_sync_bandwidth_notification:
HydrusData.ShowText( 'FYI: The query "' + query_text + '" for subscription "' + self._name + '" performed its initial sync ok, but that domain is short on bandwidth right now, so no files will be downloaded yet. The subscription will catch up in future as bandwidth becomes available. You can review the estimated time until bandwidth is available under the manage subscriptions dialog. If more queries are performing initial syncs in this run, they may be in the same position.' )
have_made_an_initial_sync_bandwidth_notification = True
def _SyncQueryCanDoWork( self ):
return True in ( query.CanSync() for query in self._queries )
def CanCheckNow( self ):
return True in ( query.CanCheckNow() for query in self._queries )
def CanCompact( self ):
return True in ( query.CanCompact( self._checker_options ) for query in self._queries )
def CanReset( self ):
return True in ( not query.IsInitialSync() for query in self._queries )
def CanRetryFailures( self ):
return True in ( query.CanRetryFailed() for query in self._queries )
def CanScrubDelay( self ):
return not HydrusData.TimeHasPassed( self._no_work_until )
def CheckNow( self ):
for query in self._queries:
query.CheckNow()
self.ScrubDelay()
def Compact( self ):
for query in self._queries:
query.Compact( self._checker_options )
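# the bandwidth estimates ask the bandwidth manager how long a query's example network contexts must wait before new jobs would be allowed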
def GetBandwidthWaitingEstimate( self, query ):
example_network_contexts = self._GetExampleNetworkContexts( query )
estimate = HG.client_controller.network_engine.bandwidth_manager.GetWaitingEstimate( example_network_contexts )
return estimate
def GetBandwidthWaitingEstimateMinMax( self ):
if len( self._queries ) == 0:
return ( 0, 0 )
estimates = []
for query in self._queries:
example_network_contexts = self._GetExampleNetworkContexts( query )
estimate = HG.client_controller.network_engine.bandwidth_manager.GetWaitingEstimate( example_network_contexts )
estimates.append( estimate )
min_estimate = min( estimates )
max_estimate = max( estimates )
return ( min_estimate, max_estimate )
def GetGalleryIdentifier( self ):
return self._gallery_identifier
def GetQueries( self ):
return self._queries
def GetPresentationOptions( self ):
return ( self._publish_files_to_popup_button, self._publish_files_to_page, self._merge_query_publish_events )
def GetTagImportOptions( self ):
return self._tag_import_options
def HasQuerySearchText( self, search_text ):
for query in self._queries:
query_text = query.GetQueryText()
if search_text in query_text:
return True
return False
def Merge( self, potential_mergee_subscriptions ):
unmergable_subscriptions = []
for subscription in potential_mergee_subscriptions:
if subscription._gallery_identifier == self._gallery_identifier:
my_new_queries = [ query.Duplicate() for query in subscription._queries ]
self._queries.extend( my_new_queries )
else:
unmergable_subscriptions.append( subscription )
return unmergable_subscriptions
def PauseResume( self ):
self._paused = not self._paused
def Reset( self ):
for query in self._queries:
query.Reset()
self.ScrubDelay()
def RetryFailures( self ):
for query in self._queries:
query.RetryFailures()
def ReviveDead( self ):
for query in self._queries:
if query.IsDead():
query.CheckNow()
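# Separate splits the given queries off into standalone one-query subscriptions named after their query text, removing them from this subscription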
def Separate( self, base_name, only_these_queries = None ):
if only_these_queries is None:
only_these_queries = set( self._queries )
else:
only_these_queries = set( only_these_queries )
subscriptions = []
for query in self._queries:
if query not in only_these_queries:
continue
subscription = self.Duplicate()
subscription._queries = [ query.Duplicate() ]
subscription.SetName( base_name + ': ' + query.GetQueryText() )
subscriptions.append( subscription )
self._queries = [ query for query in self._queries if query not in only_these_queries ]
return subscriptions
def SetCheckerOptions( self, checker_options ):
self._checker_options = checker_options
for query in self._queries:
query.UpdateNextCheckTime( self._checker_options )
def SetPresentationOptions( self, publish_files_to_popup_button, publish_files_to_page, merge_query_publish_events ):
self._publish_files_to_popup_button = publish_files_to_popup_button
self._publish_files_to_page = publish_files_to_page
self._merge_query_publish_events = merge_query_publish_events
def SetTuple( self, gallery_identifier, gallery_stream_identifiers, queries, checker_options, initial_file_limit, periodic_file_limit, paused, file_import_options, tag_import_options, no_work_until ):
self._gallery_identifier = gallery_identifier
self._gallery_stream_identifiers = gallery_stream_identifiers
self._queries = queries
self._checker_options = checker_options
self._initial_file_limit = initial_file_limit
self._periodic_file_limit = periodic_file_limit
self._paused = paused
self._file_import_options = file_import_options
self._tag_import_options = tag_import_options
self._no_work_until = no_work_until
def ScrubDelay( self ):
self._no_work_until = 0
self._no_work_until_reason = ''
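# Sync is the main work entry point: if we are unpaused, not shutting down, not delayed, and some query can sync or has file work, run the gallery sync and then the file work under a single popup job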
def Sync( self ):
p1 = not self._paused
p2 = not HG.view_shutdown
p3 = self._NoDelays()
p4 = self._SyncQueryCanDoWork()
p5 = self._WorkOnFilesCanDoWork()
if HG.subscription_report_mode:
message = 'Subscription "' + self._name + '" entered sync.'
message += os.linesep
message += 'Unpaused: ' + str( p1 )
message += os.linesep
message += 'Not in view shutdown: ' + str( p2 )
message += os.linesep
message += 'No delays: ' + str( p3 )
message += os.linesep
message += 'Sync can do work: ' + str( p4 )
message += os.linesep
message += 'Files can do work: ' + str( p5 )
HydrusData.ShowText( message )
if p1 and p2 and p3 and ( p4 or p5 ):
job_key = ClientThreading.JobKey( pausable = False, cancellable = True )
try:
job_key.SetVariable( 'popup_title', 'subscriptions - ' + self._name )
HG.client_controller.pub( 'message', job_key )
self._SyncQuery( job_key )
self._WorkOnFiles( job_key )
except HydrusExceptions.NetworkException as e:
if isinstance( e, HydrusExceptions.NetworkInfrastructureException ):
delay = 3600
else:
delay = HC.UPDATE_DURATION
HydrusData.Print( 'The subscription ' + self._name + ' encountered an exception when trying to sync:' )
HydrusData.PrintException( e )
job_key.SetVariable( 'popup_text_1', 'Encountered a network error, will retry again later' )
self._DelayWork( delay, 'network error: ' + HydrusData.ToUnicode( e ) )
time.sleep( 5 )
except Exception as e:
HydrusData.ShowText( 'The subscription ' + self._name + ' encountered an exception when trying to sync:' )
HydrusData.ShowException( e )
self._DelayWork( HC.UPDATE_DURATION, 'error: ' + HydrusData.ToUnicode( e ) )
finally:
job_key.DeleteVariable( 'popup_network_job' )
HG.client_controller.WriteSynchronous( 'serialisable', self )
if job_key.HasVariable( 'popup_files' ):
job_key.Finish()
else:
job_key.Delete()
def ToTuple( self ):
return ( self._name, self._gallery_identifier, self._gallery_stream_identifiers, self._queries, self._checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, self._file_import_options, self._tag_import_options, self._no_work_until, self._no_work_until_reason )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION ] = Subscription
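# a SubscriptionQuery pairs one search text with its seed cache and tracks check times, paused state, and whether the checker has declared it dead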
class SubscriptionQuery( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION_QUERY
SERIALISABLE_NAME = 'Subscription Query'
SERIALISABLE_VERSION = 1
def __init__( self, query = 'query text' ):
HydrusSerialisable.SerialisableBase.__init__( self )
self._query = query
self._check_now = False
self._last_check_time = 0
self._next_check_time = 0
self._paused = False
self._status = CHECKER_STATUS_OK
self._seed_cache = SeedCache()
def _GetSerialisableInfo( self ):
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
return ( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, serialisable_seed_cache )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, serialisable_seed_cache ) = serialisable_info
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
def CanWorkOnFiles( self ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + self._query + '" CanWorkOnFiles test. Next import is ' + repr( seed ) + '.' )
return seed is not None
def CanCheckNow( self ):
return not self._check_now
def CanCompact( self, checker_options ):
death_period = checker_options.GetDeathFileVelocityPeriod()
compact_before_this_source_time = self._last_check_time - ( death_period * 2 )
return self._seed_cache.CanCompact( compact_before_this_source_time )
def CanRetryFailed( self ):
return self._seed_cache.GetSeedCount( CC.STATUS_ERROR ) > 0
def CanSync( self ):
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + self._query + '" CanSync test. Paused status is ' + str( self._paused ) + ' and check time due is ' + str( HydrusData.TimeHasPassed( self._next_check_time ) ) + ' and check_now is ' + str( self._check_now ) + '.' )
if self._paused:
return False
return HydrusData.TimeHasPassed( self._next_check_time ) or self._check_now
def CheckNow( self ):
self._check_now = True
self._paused = False
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
def Compact( self, checker_options ):
death_period = checker_options.GetDeathFileVelocityPeriod()
compact_before_this_time = self._last_check_time - ( death_period * 2 )
return self._seed_cache.Compact( compact_before_this_time )
def GetLastChecked( self ):
return self._last_check_time
def GetLatestAddedTime( self ):
return self._seed_cache.GetLatestAddedTime()
def GetNextCheckStatusString( self ):
if self._check_now:
return 'checking on dialog ok'
elif self._status == CHECKER_STATUS_DEAD:
return 'dead, so not checking'
elif self._paused:
return 'paused, but would be ' + HydrusData.ConvertTimestampToPrettyPending( self._next_check_time )
else:
return HydrusData.ConvertTimestampToPrettyPending( self._next_check_time )
def GetNumURLsAndFailed( self ):
return ( self._seed_cache.GetSeedCount( CC.STATUS_UNKNOWN ), len( self._seed_cache ), self._seed_cache.GetSeedCount( CC.STATUS_ERROR ) )
def GetQueryText( self ):
return self._query
def GetSeedCache( self ):
return self._seed_cache
def IsDead( self ):
return self._status == CHECKER_STATUS_DEAD
def IsInitialSync( self ):
return self._last_check_time == 0
def IsPaused( self ):
return self._paused
def PausePlay( self ):
self._paused = not self._paused
def RegisterSyncComplete( self ):
self._last_check_time = HydrusData.GetNow()
self._check_now = False
def Reset( self ):
self._last_check_time = 0
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
self._paused = False
self._seed_cache = SeedCache()
def RetryFailures( self ):
self._seed_cache.RetryFailures()
def SetCheckNow( self, check_now ):
self._check_now = check_now
def SetPaused( self, paused ):
self._paused = paused
def SetQueryAndSeedCache( self, query, seed_cache ):
self._query = query
self._seed_cache = seed_cache
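# if check_now is set we schedule an immediate check; otherwise the checker options pick the next time from recent file velocity, marking the query dead (and pausing it) if that velocity has fallen below the death threshold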
def UpdateNextCheckTime( self, checker_options ):
if self._check_now:
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
else:
if checker_options.IsDead( self._seed_cache, self._last_check_time ):
self._status = CHECKER_STATUS_DEAD
self._paused = True
self._next_check_time = checker_options.GetNextCheckTime( self._seed_cache, self._last_check_time )
def ToTuple( self ):
return ( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, self._seed_cache )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION_QUERY ] = SubscriptionQuery
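# URLsImport backs the simple url import page: urls pended by the user sit in a seed cache and are worked on by a repeating job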
class URLsImport( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_URLS_IMPORT
SERIALISABLE_NAME = 'URL Import'
SERIALISABLE_VERSION = 1
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
self._seed_cache = SeedCache()
self._file_import_options = file_import_options
self._paused = False
self._seed_cache_status = ( 'initialising', ( 0, 1 ) )
self._download_control_file_set = None
self._download_control_file_clear = None
self._lock = threading.Lock()
self._files_repeating_job = None
HG.client_controller.sub( self, 'NotifySeedsUpdated', 'seed_cache_seeds_updated' )
def _GetSerialisableInfo( self ):
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
return ( serialisable_seed_cache, serialisable_file_options, self._paused )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_seed_cache, serialisable_file_options, self._paused ) = serialisable_info
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
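# this context hands the network job to the page's download control on entry and clears it on exit, via wx.CallAfter since we may not be on the ui thread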
def _NetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_file_set is not None:
wx.CallAfter( self._download_control_file_set, network_job )
def exit_call():
with self._lock:
if self._download_control_file_clear is not None:
wx.CallAfter( self._download_control_file_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _RegenerateSeedCacheStatus( self ):
new_seed_cache_status = self._seed_cache.GetStatus()
if self._seed_cache_status != new_seed_cache_status:
self._seed_cache_status = new_seed_cache_status
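# take the next unknown seed and route it by url class: raw/file urls are downloaded directly, post urls are parsed first, and gallery/watchable urls are not yet supported here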
def _WorkOnFiles( self, page_key ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return
did_substantial_work = False
url = seed.seed_data
try:
with self._lock:
self._RegenerateSeedCacheStatus()
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( url )
if url_type in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_POST, HC.URL_TYPE_WATCHABLE ) and not can_parse:
message = 'This URL was recognised as a "' + match_name + '" but this URL class does not yet have a parsing script linked to it!'
message += os.linesep * 2
message += 'Since this URL cannot be parsed, a downloader cannot be created for it! Please check your url class links under the \'networking\' menu.'
raise HydrusExceptions.ParseException( message )
if url_type in ( HC.URL_TYPE_UNKNOWN, HC.URL_TYPE_FILE ):
did_substantial_work = self._WorkOnFilesRawURL( page_key, seed )
elif url_type == HC.URL_TYPE_POST:
did_substantial_work = self._WorkOnFilesPostURL( page_key, seed )
elif url_type in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
raise NotImplementedError( 'Unfortunately, galleries and watchable urls do not work here yet!' )
except Exception as e:
status = CC.STATUS_ERROR
seed.SetStatus( status, exception = e )
time.sleep( 3 )
finally:
self._seed_cache.NotifySeedsUpdated( ( seed, ) )
with self._lock:
self._RegenerateSeedCacheStatus()
if did_substantial_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
def _WorkOnFilesPostURL( self, page_key, seed ):
url = seed.seed_data
tag_import_options = HG.client_controller.network_engine.domain_manager.GetDefaultTagImportOptionsForURL( url )
status_hook = lambda s: s # do nothing for now
did_substantial_work = seed.WorkOnPostURL( self._file_import_options, tag_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._NetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
return did_substantial_work
def _WorkOnFilesRawURL( self, page_key, seed ):
status_hook = lambda s: s # do nothing for now
did_substantial_work = seed.WorkOnFileURL( self._file_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._NetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
return did_substantial_work
def CurrentlyWorking( self ):
with self._lock:
finished = not self._seed_cache.WorkToDo()
return not finished and not self._paused
def GetSeedCache( self ):
return self._seed_cache
def GetOptions( self ):
with self._lock:
return self._file_import_options
def GetStatus( self ):
with self._lock:
return ( self._seed_cache_status, self._paused )
def GetValueRange( self ):
with self._lock:
return self._seed_cache.GetValueRange()
def NotifySeedsUpdated( self, seed_cache_key, seeds ):
if seed_cache_key == self._seed_cache.GetSeedCacheKey():
WakeRepeatingJob( self._files_repeating_job )
def PausePlay( self ):
with self._lock:
self._paused = not self._paused
WakeRepeatingJob( self._files_repeating_job )
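# urls pended here come from the ui (presumably paste or drag and drop); trivially short strings are filtered out before seeds are created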
def PendURLs( self, urls ):
with self._lock:
urls = filter( lambda u: len( u ) > 1, urls ) # > 1 rather than > 0 so the occasional stray single-character whitespace entry is discarded as well
seeds = [ Seed( SEED_TYPE_URL, url ) for url in urls ]
if len( seeds ) > 0:
self._seed_cache.AddSeeds( seeds )
WakeRepeatingJob( self._files_repeating_job )
def SetDownloadControlFile( self, download_control ):
with self._lock:
self._download_control_file_set = download_control.SetNetworkJob
self._download_control_file_clear = download_control.ClearNetworkJob
def SetFileImportOptions( self, file_import_options ):
with self._lock:
self._file_import_options = file_import_options
def Start( self, page_key ):
with self._lock:
self._RegenerateSeedCacheStatus()
self._files_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
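# this repeating job cancels itself when the page dies and otherwise loops over pending seeds while the page is open and we are unpaused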
def REPEATINGWorkOnFiles( self, page_key ):
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._files_repeating_job.Cancel()
return
work_to_do = self._seed_cache.WorkToDo() and not ( self._paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
while work_to_do:
try:
self._WorkOnFiles( page_key )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._files_repeating_job.Cancel()
return
work_to_do = self._seed_cache.WorkToDo() and not ( self._paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_URLS_IMPORT ] = URLsImport