# hydrus/include/ClientImporting.py

import bs4
import ClientConstants as CC
import ClientData
import ClientDefaults
import ClientDownloading
import ClientFiles
import ClientImageHandling
import ClientImportOptions
import ClientNetworkingContexts
import ClientNetworkingDomain
import ClientNetworkingJobs
import ClientParsing
import ClientTags
import ClientThreading
import collections
import HydrusConstants as HC
import HydrusData
import HydrusExceptions
import HydrusFileHandling
import HydrusGlobals as HG
import HydrusImageHandling
import HydrusPaths
import HydrusSerialisable
import HydrusTags
import HydrusText
import HydrusThreading
import json
import os
import random
import re
import shutil
import threading
import time
import traceback
import urlparse
import wx
CHECKER_STATUS_OK = 0
CHECKER_STATUS_DEAD = 1
CHECKER_STATUS_404 = 2
DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME = 0.1
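
# The factory functions below wrap the ClientNetworkingJobs job classes in
# closures, so a downloader, subscription, or watcher can mint network jobs
# bound to its page or subscription key without knowing the job class details.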
def GenerateDownloaderNetworkJobFactory( page_key ):
    def network_job_factory( *args, **kwargs ):
        network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, *args, **kwargs )
        
        return network_job
    
    return network_job_factory

def GenerateSubscriptionNetworkJobFactory( subscription_key ):
    def network_job_factory( *args, **kwargs ):
        network_job = ClientNetworkingJobs.NetworkJobSubscription( subscription_key, *args, **kwargs )
        
        network_job.OverrideBandwidth( 30 )
        
        return network_job
    
    return network_job_factory

def GenerateSubscriptionNetworkJobPresentationContextFactory( job_key ):
    def network_job_presentation_context_factory( network_job ):
        def enter_call():
            job_key.SetVariable( 'popup_network_job', network_job )
        
        def exit_call():
            pass
        
        return NetworkJobPresentationContext( enter_call, exit_call )
    
    return network_job_presentation_context_factory

def GenerateWatcherNetworkJobFactory( thread_key ):
    def network_job_factory( *args, **kwargs ):
        network_job = ClientNetworkingJobs.NetworkJobWatcherPage( thread_key, *args, **kwargs )
        
        return network_job
    
    return network_job_factory
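
# The THREAD* functions below run on worker threads. They report progress
# through the given job_key, which drives the client's popup message system.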
def THREADDownloadURL( job_key, url, url_string ):
    job_key.SetVariable( 'popup_title', url_string )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    
    ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
    
    try:
        network_job = ClientNetworkingJobs.NetworkJob( 'GET', url, temp_path = temp_path )
        
        network_job.OverrideBandwidth()
        
        HG.client_controller.network_engine.AddJob( network_job )
        
        job_key.SetVariable( 'popup_network_job', network_job )
        
        try:
            network_job.WaitUntilDone()
        except ( HydrusExceptions.ShutdownException, HydrusExceptions.CancelledException, HydrusExceptions.NetworkException ):
            job_key.Cancel()
            
            raise
        
        job_key.DeleteVariable( 'popup_network_job' )
        
        job_key.SetVariable( 'popup_text_1', 'importing' )
        
        file_import_job = FileImportJob( temp_path )
        
        ( status, hash ) = HG.client_controller.client_files_manager.ImportFile( file_import_job )
    finally:
        HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
    
    if status in CC.SUCCESSFUL_IMPORT_STATES:
        if status == CC.STATUS_SUCCESSFUL_AND_NEW:
            job_key.SetVariable( 'popup_text_1', 'successful!' )
        elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
            job_key.SetVariable( 'popup_text_1', 'was already in the database!' )
        
        job_key.SetVariable( 'popup_files', ( [ hash ], 'download' ) )
    elif status == CC.STATUS_DELETED:
        job_key.SetVariable( 'popup_text_1', 'had already been deleted!' )
    
    job_key.Finish()

def THREADDownloadURLs( job_key, urls, title ):
    job_key.SetVariable( 'popup_title', title )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    
    num_successful = 0
    num_redundant = 0
    num_deleted = 0
    num_failed = 0
    
    presentation_hashes = []
    presentation_hashes_fast = set()
    
    for ( i, url ) in enumerate( urls ):
        ( i_paused, should_quit ) = job_key.WaitIfNeeded()
        
        if should_quit:
            break
        
        job_key.SetVariable( 'popup_text_1', HydrusData.ConvertValueRangeToPrettyString( i + 1, len( urls ) ) )
        job_key.SetVariable( 'popup_gauge_1', ( i + 1, len( urls ) ) )
        
        ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
        
        try:
            network_job = ClientNetworkingJobs.NetworkJob( 'GET', url, temp_path = temp_path )
            
            network_job.OverrideBandwidth()
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            job_key.SetVariable( 'popup_network_job', network_job )
            
            try:
                network_job.WaitUntilDone()
            except ( HydrusExceptions.ShutdownException, HydrusExceptions.CancelledException, HydrusExceptions.NetworkException ):
                break
            
            try:
                job_key.SetVariable( 'popup_text_2', 'importing' )
                
                file_import_job = FileImportJob( temp_path )
                
                ( status, hash ) = HG.client_controller.client_files_manager.ImportFile( file_import_job )
            except Exception as e:
                job_key.DeleteVariable( 'popup_text_2' )
                
                HydrusData.Print( url + ' failed to import!' )
                HydrusData.PrintException( e )
                
                num_failed += 1
                
                continue
        finally:
            HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
        
        if status in CC.SUCCESSFUL_IMPORT_STATES:
            if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                num_successful += 1
            elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                num_redundant += 1
            
            if hash not in presentation_hashes_fast:
                presentation_hashes.append( hash )
                
                presentation_hashes_fast.add( hash )
        elif status == CC.STATUS_DELETED:
            num_deleted += 1
    
    job_key.DeleteVariable( 'popup_network_job' )
    
    text_components = []
    
    if num_successful > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_successful ) + ' successful' )
    
    if num_redundant > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_redundant ) + ' already in db' )
    
    if num_deleted > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_deleted ) + ' deleted' )
    
    if num_failed > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_failed ) + ' failed (errors written to log)' )
    
    job_key.SetVariable( 'popup_text_1', ', '.join( text_components ) )
    
    if len( presentation_hashes ) > 0:
        job_key.SetVariable( 'popup_files', ( presentation_hashes, 'downloads' ) )
    
    job_key.DeleteVariable( 'popup_gauge_1' )
    job_key.DeleteVariable( 'popup_text_2' )
    
    job_key.Finish()
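
# Converts a batch of parse results into url seeds and adds any that are not
# already present to the given seed cache. Returns ( num_new, num_already_in ).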
def UpdateSeedCacheWithAllParseResults( seed_cache, all_parse_results, source_url = None, tag_import_options = None ):
    # need a limit param here for 'stop at 40 total new because of file limit'
    
    new_seeds = []
    
    num_new = 0
    num_already_in = 0
    
    for parse_results in all_parse_results:
        parsed_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, HC.URL_TYPE_POST ), only_get_top_priority = True )
        
        for url in parsed_urls:
            seed = Seed( SEED_TYPE_URL, url )
            
            if source_url is not None:
                seed.AddURL( source_url )
            
            if seed_cache.HasSeed( seed ):
                num_already_in += 1
            else:
                num_new += 1
                
                seed.AddParseResults( parse_results )
                
                new_seeds.append( seed )
    
    seed_cache.AddSeeds( new_seeds )
    
    return ( num_new, num_already_in )
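
# FileImportJob bundles everything needed to import one file from a temp path.
# The expected call order (by the client files manager) is GenerateHashAndStatus,
# then GenerateInfo, then CheckIsGoodToImport, before the file is committed.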
class FileImportJob( object ):
    def __init__( self, temp_path, file_import_options = None ):
        if file_import_options is None:
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
        
        self._temp_path = temp_path
        self._file_import_options = file_import_options
        
        self._hash = None
        self._pre_import_status = None
        
        self._file_info = None
        self._thumbnail = None
        self._phashes = None
        self._extra_hashes = None
    
    def CheckIsGoodToImport( self ):
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        self._file_import_options.CheckFileIsValid( size, mime, width, height )
    
    def GetExtraHashes( self ):
        return self._extra_hashes
    
    def GetFileImportOptions( self ):
        return self._file_import_options
    
    def GetFileInfo( self ):
        return self._file_info
    
    def GetHash( self ):
        return self._hash
    
    def GetMime( self ):
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        return mime
    
    def GetPreImportStatus( self ):
        return self._pre_import_status
    
    def GetPHashes( self ):
        return self._phashes
    
    def GetTempPathAndThumbnail( self ):
        return ( self._temp_path, self._thumbnail )
    
    def PubsubContentUpdates( self ):
        if self._pre_import_status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
            if self._file_import_options.AutomaticallyArchives():
                service_keys_to_content_updates = { CC.COMBINED_LOCAL_FILE_SERVICE_KEY : [ HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_ARCHIVE, set( ( self._hash, ) ) ) ] }
                
                HG.client_controller.Write( 'content_updates', service_keys_to_content_updates )
    
    def IsNewToDB( self ):
        if self._pre_import_status == CC.STATUS_UNKNOWN:
            return True
        
        if self._pre_import_status == CC.STATUS_DELETED:
            if not self._file_import_options.ExcludesDeleted():
                return True
        
        return False
    
    def GenerateHashAndStatus( self ):
        HydrusImageHandling.ConvertToPngIfBmp( self._temp_path )
        
        self._hash = HydrusFileHandling.GetHashFromPath( self._temp_path )
        
        ( self._pre_import_status, hash, note ) = HG.client_controller.Read( 'hash_status', 'sha256', self._hash )
    
    def GenerateInfo( self ):
        mime = HydrusFileHandling.GetMime( self._temp_path )
        
        new_options = HG.client_controller.new_options
        
        if mime in HC.DECOMPRESSION_BOMB_IMAGES and not self._file_import_options.AllowsDecompressionBombs():
            if HydrusImageHandling.IsDecompressionBomb( self._temp_path ):
                raise HydrusExceptions.DecompressionBombException( 'Image seems to be a Decompression Bomb!' )
        
        self._file_info = HydrusFileHandling.GetFileInfo( self._temp_path, mime )
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        if mime in HC.MIMES_WITH_THUMBNAILS:
            self._thumbnail = HydrusFileHandling.GenerateThumbnail( self._temp_path, mime )
        
        if mime in HC.MIMES_WE_CAN_PHASH:
            self._phashes = ClientImageHandling.GenerateShapePerceptualHashes( self._temp_path, mime )
        
        self._extra_hashes = HydrusFileHandling.GetExtraHashesFromPath( self._temp_path )
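
# GalleryImport drives a gallery downloader page. It runs two worker threads:
# _THREADWorkOnGallery walks gallery pages and queues file urls into the seed
# cache, while _THREADWorkOnFiles downloads and imports the queued files.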
class GalleryImport( HydrusSerialisable.SerialisableBase ):
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IMPORT
    SERIALISABLE_NAME = 'Gallery Import'
    SERIALISABLE_VERSION = 2
    
    def __init__( self, gallery_identifier = None ):
        if gallery_identifier is None:
            gallery_identifier = ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEVIANT_ART )
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._gallery_identifier = gallery_identifier
        
        self._gallery_stream_identifiers = ClientDownloading.GetGalleryStreamIdentifiers( self._gallery_identifier )
        
        self._current_query = None
        self._current_query_num_urls = 0
        
        self._current_gallery_stream_identifier = None
        self._current_gallery_stream_identifier_page_index = 0
        self._current_gallery_stream_identifier_found_urls = set()
        self._pending_gallery_stream_identifiers = []
        
        self._pending_queries = []
        
        new_options = HG.client_controller.new_options
        
        self._file_limit = HC.options[ 'gallery_file_limit' ]
        
        self._gallery_paused = False
        self._files_paused = False
        
        self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
        self._tag_import_options = new_options.GetDefaultTagImportOptions( self._gallery_identifier )
        
        self._last_gallery_page_hit_timestamp = 0
        
        self._seed_cache = SeedCache()
        
        self._lock = threading.Lock()
        
        self._new_files_event = threading.Event()
        self._new_query_event = threading.Event()
        
        self._gallery = None
        
        self._gallery_status = ''
        self._gallery_status_can_change_timestamp = 0
        
        self._current_action = ''
        
        self._download_control_file_set = None
        self._download_control_file_clear = None
        
        self._download_control_gallery_set = None
        self._download_control_gallery_clear = None
    
    def _GetSerialisableInfo( self ):
        serialisable_gallery_identifier = self._gallery_identifier.GetSerialisableTuple()
        serialisable_gallery_stream_identifiers = [ gallery_stream_identifier.GetSerialisableTuple() for gallery_stream_identifier in self._gallery_stream_identifiers ]
        
        if self._current_gallery_stream_identifier is None:
            serialisable_current_gallery_stream_identifier = None
        else:
            serialisable_current_gallery_stream_identifier = self._current_gallery_stream_identifier.GetSerialisableTuple()
        
        serialisable_current_gallery_stream_identifier_found_urls = list( self._current_gallery_stream_identifier_found_urls )
        
        serialisable_pending_gallery_stream_identifiers = [ pending_gallery_stream_identifier.GetSerialisableTuple() for pending_gallery_stream_identifier in self._pending_gallery_stream_identifiers ]
        
        serialisable_file_options = self._file_import_options.GetSerialisableTuple()
        serialisable_tag_options = self._tag_import_options.GetSerialisableTuple()
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        
        serialisable_current_query_stuff = ( self._current_query, self._current_query_num_urls, serialisable_current_gallery_stream_identifier, self._current_gallery_stream_identifier_page_index, serialisable_current_gallery_stream_identifier_found_urls, serialisable_pending_gallery_stream_identifiers )
        
        return ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, self._pending_queries, self._file_limit, self._gallery_paused, self._files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, self._pending_queries, self._file_limit, self._gallery_paused, self._files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache ) = serialisable_info
        
        ( self._current_query, self._current_query_num_urls, serialisable_current_gallery_stream_identifier, self._current_gallery_stream_identifier_page_index, serialisable_current_gallery_stream_identifier_found_urls, serialisable_pending_gallery_stream_identifiers ) = serialisable_current_query_stuff
        
        self._gallery_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_identifier )
        
        self._gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_stream_identifier ) for serialisable_gallery_stream_identifier in serialisable_gallery_stream_identifiers ]
        
        if serialisable_current_gallery_stream_identifier is None:
            self._current_gallery_stream_identifier = None
        else:
            self._current_gallery_stream_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_current_gallery_stream_identifier )
        
        self._current_gallery_stream_identifier_found_urls = set( serialisable_current_gallery_stream_identifier_found_urls )
        
        self._pending_gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_pending_gallery_stream_identifier ) for serialisable_pending_gallery_stream_identifier in serialisable_pending_gallery_stream_identifiers ]
        
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
        self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
    
    def _FileNetworkJobPresentationContextFactory( self, network_job ):
        def enter_call():
            with self._lock:
                if self._download_control_file_set is not None:
                    wx.CallAfter( self._download_control_file_set, network_job )
        
        def exit_call():
            with self._lock:
                if self._download_control_file_clear is not None:
                    wx.CallAfter( self._download_control_file_clear )
        
        return NetworkJobPresentationContext( enter_call, exit_call )
    
    def _SetGalleryStatus( self, status, timeout = None ):
        if HydrusData.TimeHasPassed( self._gallery_status_can_change_timestamp ):
            self._gallery_status = status
            
            if timeout is not None:
                self._gallery_status_can_change_timestamp = HydrusData.GetNow() + timeout
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        if version == 1:
            ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, pending_queries, get_tags_if_url_known_and_file_redundant, file_limit, gallery_paused, files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache ) = old_serialisable_info
            
            new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, pending_queries, file_limit, gallery_paused, files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache )
            
            return ( 2, new_serialisable_info )
    
    def _WorkOnFiles( self, page_key ):
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if seed is None:
            return
        
        did_substantial_work = False
        
        def network_job_factory( method, url, **kwargs ):
            network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, method, url, **kwargs )
            
            wx.CallAfter( self._download_control_file_set, network_job )
            
            return network_job
        
        try:
            gallery = ClientDownloading.GetGallery( self._gallery_identifier )
        except Exception as e:
            HydrusData.PrintException( e )
            
            with self._lock:
                self._files_paused = True
                self._gallery_paused = True
            
            HydrusData.ShowText( 'A downloader could not load its gallery! It has been paused and the full error has been written to the log!' )
            
            return
        
        gallery.SetNetworkJobFactory( network_job_factory )
        
        try:
            if seed.WorksInNewSystem():
                def status_hook( text ):
                    with self._lock:
                        self._current_action = text
                
                did_substantial_work = seed.WorkOnPostURL( self._file_import_options, self._tag_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._FileNetworkJobPresentationContextFactory )
                
                if seed.ShouldPresent( self._file_import_options ):
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
            else:
                with self._lock:
                    self._current_action = 'reviewing file'
                
                seed.PredictPreImportStatus( self._file_import_options )
                
                status = seed.status
                url = seed.seed_data
                
                if status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    if self._tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB() and self._tag_import_options.WorthFetchingTags():
                        downloaded_tags = gallery.GetTags( url )
                        
                        seed.AddTags( downloaded_tags )
                elif status == CC.STATUS_UNKNOWN:
                    ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
                    
                    try:
                        with self._lock:
                            self._current_action = 'downloading file'
                        
                        if self._tag_import_options.WorthFetchingTags():
                            downloaded_tags = gallery.GetFileAndTags( temp_path, url )
                            
                            seed.AddTags( downloaded_tags )
                        else:
                            gallery.GetFile( temp_path, url )
                        
                        seed.CheckPreFetchMetadata( self._tag_import_options )
                        
                        with self._lock:
                            self._current_action = 'importing file'
                        
                        seed.Import( temp_path, self._file_import_options )
                        
                        did_substantial_work = True
                    finally:
                        HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                
                did_substantial_work = seed.WriteContentUpdates( self._tag_import_options )
                
                if seed.ShouldPresent( self._file_import_options ):
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
        except HydrusExceptions.VetoException as e:
            status = CC.STATUS_VETOED
            
            note = HydrusData.ToUnicode( e )
            
            seed.SetStatus( status, note = note )
            
            if isinstance( e, HydrusExceptions.CancelledException ):
                time.sleep( 2 )
        except HydrusExceptions.NotFoundException:
            status = CC.STATUS_VETOED
            
            note = '404'
            
            seed.SetStatus( status, note = note )
            
            time.sleep( 2 )
        except Exception as e:
            status = CC.STATUS_ERROR
            
            seed.SetStatus( status, exception = e )
            
            time.sleep( 3 )
        finally:
            self._seed_cache.NotifySeedsUpdated( ( seed, ) )
            
            HG.client_controller.pub( 'refresh_page_name', page_key )
            
            wx.CallAfter( self._download_control_file_clear )
        
        with self._lock:
            self._current_action = ''
        
        if did_substantial_work:
            time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
    
    def _WorkOnGallery( self, page_key ):
        with self._lock:
            if self._current_query is None:
                if len( self._pending_queries ) == 0:
                    self._SetGalleryStatus( '' )
                    
                    return False
                else:
                    self._current_query = self._pending_queries.pop( 0 )
                    self._current_query_num_urls = 0
                    
                    self._current_gallery_stream_identifier = None
                    self._pending_gallery_stream_identifiers = list( self._gallery_stream_identifiers )
            
            if self._current_gallery_stream_identifier is None:
                if len( self._pending_gallery_stream_identifiers ) == 0:
                    self._SetGalleryStatus( self._current_query + ': produced ' + HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls', 5 )
                    
                    self._current_query = None
                    
                    return False
                else:
                    self._current_gallery_stream_identifier = self._pending_gallery_stream_identifiers.pop( 0 )
                    
                    self._current_gallery_stream_identifier_page_index = 0
                    self._current_gallery_stream_identifier_found_urls = set()
            
            next_gallery_page_hit_timestamp = self._last_gallery_page_hit_timestamp + HG.client_controller.new_options.GetInteger( 'gallery_page_wait_period_pages' )
            
            if not HydrusData.TimeHasPassed( next_gallery_page_hit_timestamp ):
                if self._current_gallery_stream_identifier_page_index == 0:
                    page_check_status = 'checking first page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
                else:
                    page_check_status = HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls found, checking next page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
                
                self._SetGalleryStatus( self._current_query + ': ' + page_check_status )
                
                return True
        
        def network_job_factory( method, url, **kwargs ):
            network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, method, url, **kwargs )
            
            network_job.OverrideBandwidth()
            
            wx.CallAfter( self._download_control_gallery_set, network_job )
            
            return network_job
        
        try:
            gallery = ClientDownloading.GetGallery( self._current_gallery_stream_identifier )
        except Exception as e:
            HydrusData.PrintException( e )
            
            with self._lock:
                self._files_paused = True
                self._gallery_paused = True
            
            HydrusData.ShowText( 'A downloader could not load its gallery! It has been paused and the full error has been written to the log!' )
            
            return False
        
        gallery.SetNetworkJobFactory( network_job_factory )
        
        query = self._current_query
        page_index = self._current_gallery_stream_identifier_page_index
        
        self._SetGalleryStatus( self._current_query + ': ' + HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls found, now checking page ' + HydrusData.ConvertIntToPrettyString( self._current_gallery_stream_identifier_page_index + 1 ) )
        
        num_already_in_seed_cache = 0
        new_seeds = []
        
        try:
            try:
                ( page_of_seeds, definitely_no_more_pages ) = gallery.GetPage( query, page_index )
            finally:
                self._last_gallery_page_hit_timestamp = HydrusData.GetNow()
            
            with self._lock:
                no_urls_found = len( page_of_seeds ) == 0
                
                page_of_urls = [ seed.seed_data for seed in page_of_seeds ]
                
                no_new_urls = len( self._current_gallery_stream_identifier_found_urls.intersection( page_of_urls ) ) == len( page_of_seeds )
                
                if definitely_no_more_pages or no_urls_found or no_new_urls:
                    self._current_gallery_stream_identifier = None
                else:
                    self._current_gallery_stream_identifier_page_index += 1
                    
                    self._current_gallery_stream_identifier_found_urls.update( page_of_urls )
            
            for seed in page_of_seeds:
                if self._seed_cache.HasSeed( seed ):
                    num_already_in_seed_cache += 1
                else:
                    with self._lock:
                        if self._file_limit is not None and self._current_query_num_urls + 1 > self._file_limit:
                            self._current_gallery_stream_identifier = None
                            self._pending_gallery_stream_identifiers = []
                            
                            break
                        
                        self._current_query_num_urls += 1
                    
                    new_seeds.append( seed )
            
            self._seed_cache.AddSeeds( new_seeds )
            
            if len( new_seeds ) > 0:
                self._new_files_event.set()
        except Exception as e:
            if isinstance( e, HydrusExceptions.NotFoundException ):
                text = 'gallery 404'
            else:
                text = HydrusData.ToUnicode( e )
                
                HydrusData.DebugPrint( traceback.format_exc() )
            
            with self._lock:
                self._current_gallery_stream_identifier = None
                
                self._SetGalleryStatus( text, 5 )
            
            time.sleep( 5 )
        finally:
            HG.client_controller.pub( 'refresh_page_name', page_key )
            
            wx.CallAfter( self._download_control_gallery_clear )
        
        with self._lock:
            status = self._current_query + ': ' + HydrusData.ConvertIntToPrettyString( len( new_seeds ) ) + ' new urls found'
            
            if num_already_in_seed_cache > 0:
                status += ' (' + HydrusData.ConvertIntToPrettyString( num_already_in_seed_cache ) + ' of last page already in queue)'
            
            self._SetGalleryStatus( status )
        
        return True
    
    def _THREADWorkOnFiles( self, page_key ):
        while not ( HG.view_shutdown or HG.client_controller.PageCompletelyDestroyed( page_key ) ):
            no_work_to_do = self._files_paused or not self._seed_cache.WorkToDo()
            
            if no_work_to_do or HG.client_controller.PageClosedButNotDestroyed( page_key ):
                self._new_files_event.wait( 5 )
            else:
                try:
                    self._WorkOnFiles( page_key )
                    
                    HG.client_controller.WaitUntilViewFree()
                except Exception as e:
                    HydrusData.ShowException( e )
                    
                    return
            
            self._new_files_event.clear()
    
    def _THREADWorkOnGallery( self, page_key ):
        while not ( HG.view_shutdown or HG.client_controller.PageCompletelyDestroyed( page_key ) ):
            if self._gallery_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ):
                self._new_query_event.wait( 5 )
            else:
                try:
                    work_to_do = self._WorkOnGallery( page_key )
                    
                    if work_to_do:
                        time.sleep( 1 )
                    else:
                        self._new_query_event.wait( 5 )
                    
                    HG.client_controller.WaitUntilViewFree()
                except Exception as e:
                    HydrusData.ShowException( e )
                    
                    return
            
            self._new_query_event.clear()
    
    def AdvanceQueries( self, queries ):
        with self._lock:
            queries_lookup = set( queries )
            
            for query in queries:
                if query in self._pending_queries:
                    index = self._pending_queries.index( query )
                    
                    if index > 0 and self._pending_queries[ index - 1 ] not in queries_lookup:
                        self._pending_queries.remove( query )
                        
                        self._pending_queries.insert( index - 1, query )
    
    def CurrentlyWorking( self ):
        with self._lock:
            finished = not self._seed_cache.WorkToDo()
            
            return not finished and not self._files_paused
    
    def DelayQueries( self, queries ):
        with self._lock:
            queries = list( queries )
            queries.reverse()
            
            queries_lookup = set( queries )
            
            for query in queries:
                if query in self._pending_queries:
                    index = self._pending_queries.index( query )
                    
                    if index + 1 < len( self._pending_queries ) and self._pending_queries[ index + 1 ] not in queries_lookup:
                        self._pending_queries.remove( query )
                        
                        self._pending_queries.insert( index + 1, query )
    
    def DeleteQueries( self, queries ):
        with self._lock:
            for query in queries:
                if query in self._pending_queries:
                    self._pending_queries.remove( query )
    
    def FinishCurrentQuery( self ):
        with self._lock:
            self._current_query = None
            self._gallery_paused = False
    
    def GetGalleryIdentifier( self ):
        return self._gallery_identifier
    
    def GetOptions( self ):
        with self._lock:
            return ( self._file_import_options, self._tag_import_options, self._file_limit )
    
    def GetSeedCache( self ):
        return self._seed_cache
    
    def GetStatus( self ):
        with self._lock:
            cancellable = self._current_query is not None
            
            return ( list( self._pending_queries ), self._gallery_status, self._current_action, self._files_paused, self._gallery_paused, cancellable )
    
    def GetValueRange( self ):
        with self._lock:
            return self._seed_cache.GetValueRange()
    
    def PausePlayFiles( self ):
        with self._lock:
            self._files_paused = not self._files_paused
            
            self._new_files_event.set()
    
    def PausePlayGallery( self ):
        with self._lock:
            self._gallery_paused = not self._gallery_paused
            
            self._new_query_event.set()
    
    def PendQuery( self, query ):
        with self._lock:
            if query not in self._pending_queries:
                self._pending_queries.append( query )
                
                self._new_query_event.set()
    
    def SetDownloadControls( self, file_download_control, gallery_download_control ):
        with self._lock:
            self._download_control_file_set = file_download_control.SetNetworkJob
            self._download_control_file_clear = file_download_control.ClearNetworkJob
            
            self._download_control_gallery_set = gallery_download_control.SetNetworkJob
            self._download_control_gallery_clear = gallery_download_control.ClearNetworkJob
    
    def SetFileLimit( self, file_limit ):
        with self._lock:
            self._file_limit = file_limit
    
    def SetFileImportOptions( self, file_import_options ):
        with self._lock:
            self._file_import_options = file_import_options
    
    def SetTagImportOptions( self, tag_import_options ):
        with self._lock:
            self._tag_import_options = tag_import_options
    
    def Start( self, page_key ):
        HG.client_controller.CallToThreadLongRunning( self._THREADWorkOnGallery, page_key )
        HG.client_controller.CallToThreadLongRunning( self._THREADWorkOnFiles, page_key )
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IMPORT ] = GalleryImport
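
# HDDImport backs the 'import files' page for local paths. Each path becomes a
# SEED_TYPE_HDD seed, optionally carrying per-path tags and, if requested,
# deleting the source file (and any sidecar .txt) after a successful import.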
class HDDImport( HydrusSerialisable.SerialisableBase ):
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_HDD_IMPORT
    SERIALISABLE_NAME = 'Local File Import'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, paths = None, file_import_options = None, paths_to_tags = None, delete_after_success = None ):
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        if paths is None:
            self._seed_cache = None
        else:
            self._seed_cache = SeedCache()
            
            seeds = []
            
            for path in paths:
                seed = Seed( SEED_TYPE_HDD, path )
                
                try:
                    s = os.stat( path )
                    
                    seed.source_time = int( min( s.st_mtime, s.st_ctime ) )
                except:
                    pass
                
                seeds.append( seed )
            
            self._seed_cache.AddSeeds( seeds )
        
        self._file_import_options = file_import_options
        self._paths_to_tags = paths_to_tags
        self._delete_after_success = delete_after_success
        
        self._current_action = ''
        self._paused = False
        
        self._lock = threading.Lock()
        
        self._new_files_event = threading.Event()
    
    def _GetSerialisableInfo( self ):
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        serialisable_options = self._file_import_options.GetSerialisableTuple()
        serialisable_paths_to_tags = { path : { service_key.encode( 'hex' ) : tags for ( service_key, tags ) in service_keys_to_tags.items() } for ( path, service_keys_to_tags ) in self._paths_to_tags.items() }
        
        return ( serialisable_seed_cache, serialisable_options, serialisable_paths_to_tags, self._delete_after_success, self._paused )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( serialisable_seed_cache, serialisable_options, serialisable_paths_to_tags, self._delete_after_success, self._paused ) = serialisable_info
        
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_options )
        self._paths_to_tags = { path : { service_key.decode( 'hex' ) : tags for ( service_key, tags ) in service_keys_to_tags.items() } for ( path, service_keys_to_tags ) in serialisable_paths_to_tags.items() }
    
    def _WorkOnFiles( self, page_key ):
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if seed is None:
            return
        
        did_substantial_work = False
        
        path = seed.seed_data
        
        with self._lock:
            if path in self._paths_to_tags:
                service_keys_to_tags = self._paths_to_tags[ path ]
            else:
                service_keys_to_tags = {}
        
        try:
            if not os.path.exists( path ):
                raise Exception( 'Source file does not exist!' )
            
            with self._lock:
                self._current_action = 'importing'
            
            seed.ImportPath( self._file_import_options )
            
            did_substantial_work = True
            
            if seed.status in CC.SUCCESSFUL_IMPORT_STATES:
                hash = seed.GetHash()
                
                service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( { hash }, service_keys_to_tags )
                
                if len( service_keys_to_content_updates ) > 0:
                    HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                    
                    did_substantial_work = True
                
                if seed.ShouldPresent( self._file_import_options ):
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
                
                if self._delete_after_success:
                    try:
                        ClientData.DeletePath( path )
                    except Exception as e:
                        HydrusData.ShowText( 'While attempting to delete ' + path + ', the following error occurred:' )
                        HydrusData.ShowException( e )
                    
                    txt_path = path + '.txt'
                    
                    if os.path.exists( txt_path ):
                        try:
                            ClientData.DeletePath( txt_path )
                        except Exception as e:
                            HydrusData.ShowText( 'While attempting to delete ' + txt_path + ', the following error occurred:' )
                            HydrusData.ShowException( e )
        except HydrusExceptions.VetoException as e:
            status = CC.STATUS_VETOED
            
            note = HydrusData.ToUnicode( e )
            
            seed.SetStatus( status, note = note )
        except Exception as e:
            status = CC.STATUS_ERROR
            
            seed.SetStatus( status, exception = e )
        finally:
            self._seed_cache.NotifySeedsUpdated( ( seed, ) )
            
            HG.client_controller.pub( 'refresh_page_name', page_key )
        
        with self._lock:
            self._current_action = ''
        
        if did_substantial_work:
            time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
    
    def _THREADWork( self, page_key ):
        while not ( HG.view_shutdown or HG.client_controller.PageCompletelyDestroyed( page_key ) ):
            no_work_to_do = self._paused or not self._seed_cache.WorkToDo()
            
            if no_work_to_do or HG.client_controller.PageClosedButNotDestroyed( page_key ):
                self._new_files_event.wait( 5 )
            else:
                try:
                    self._WorkOnFiles( page_key )
                    
                    HG.client_controller.WaitUntilViewFree()
                except Exception as e:
                    HydrusData.ShowException( e )
                    
                    return
            
            self._new_files_event.clear()
    
    def CurrentlyWorking( self ):
        with self._lock:
            work_to_do = self._seed_cache.WorkToDo()
            
            return work_to_do and not self._paused
    
    def GetFileImportOptions( self ):
        with self._lock:
            return self._file_import_options
    
    def GetSeedCache( self ):
        return self._seed_cache
    
    def GetStatus( self ):
        with self._lock:
            return ( self._current_action, self._paused )
    
    def GetValueRange( self ):
        with self._lock:
            return self._seed_cache.GetValueRange()
    
    def PausePlay( self ):
        with self._lock:
            self._paused = not self._paused
            
            self._new_files_event.set()
    
    def SetFileImportOptions( self, file_import_options ):
        with self._lock:
            self._file_import_options = file_import_options
    
    def Start( self, page_key ):
        HG.client_controller.CallToThreadLongRunning( self._THREADWork, page_key )
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_HDD_IMPORT ] = HDDImport
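
# ImportFolder periodically scans a directory, imports whatever matches its
# mime filter, and then deletes, moves, or ignores each file according to the
# action set for its import status.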
class ImportFolder( HydrusSerialisable.SerialisableBaseNamed ):
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_IMPORT_FOLDER
    SERIALISABLE_NAME = 'Import Folder'
    SERIALISABLE_VERSION = 6
    
    def __init__( self, name, path = '', file_import_options = None, tag_import_options = None, tag_service_keys_to_filename_tagging_options = None, mimes = None, actions = None, action_locations = None, period = 3600, check_regularly = True, show_working_popup = True, publish_files_to_popup_button = True, publish_files_to_page = False ):
        if mimes is None:
            mimes = HC.ALLOWED_MIMES
        
        if file_import_options is None:
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'quiet' )
        
        if tag_import_options is None:
            tag_import_options = HG.client_controller.new_options.GetDefaultTagImportOptions( ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEFAULT ) )
        
        if tag_service_keys_to_filename_tagging_options is None:
            tag_service_keys_to_filename_tagging_options = {}
        
        if actions is None:
            actions = {}
            
            actions[ CC.STATUS_SUCCESSFUL_AND_NEW ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_DELETED ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_ERROR ] = CC.IMPORT_FOLDER_IGNORE
        
        if action_locations is None:
            action_locations = {}
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        self._path = path
        self._mimes = mimes
        self._file_import_options = file_import_options
        self._tag_import_options = tag_import_options
        self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._check_regularly = check_regularly
        
        self._seed_cache = SeedCache()
        self._last_checked = 0
        self._paused = False
        self._check_now = False
        
        self._show_working_popup = show_working_popup
        self._publish_files_to_popup_button = publish_files_to_popup_button
        self._publish_files_to_page = publish_files_to_page
    
    def _ActionPaths( self ):
        for status in ( CC.STATUS_SUCCESSFUL_AND_NEW, CC.STATUS_SUCCESSFUL_BUT_REDUNDANT, CC.STATUS_DELETED, CC.STATUS_ERROR ):
            action = self._actions[ status ]
            
            if action == CC.IMPORT_FOLDER_DELETE:
                while True:
                    seed = self._seed_cache.GetNextSeed( status )
                    
                    if seed is None or HG.view_shutdown:
                        break
                    
                    path = seed.seed_data
                    
                    try:
                        if os.path.exists( path ):
                            ClientData.DeletePath( path )
                        
                        txt_path = path + '.txt'
                        
                        if os.path.exists( txt_path ):
                            ClientData.DeletePath( txt_path )
                        
                        self._seed_cache.RemoveSeeds( ( seed, ) )
                    except Exception as e:
                        HydrusData.ShowText( 'Import folder tried to delete ' + path + ', but could not:' )
                        HydrusData.ShowException( e )
                        HydrusData.ShowText( 'Import folder has been paused.' )
                        
                        self._paused = True
                        
                        return
            elif action == CC.IMPORT_FOLDER_MOVE:
                while True:
                    seed = self._seed_cache.GetNextSeed( status )
                    
                    if seed is None or HG.view_shutdown:
                        break
                    
                    path = seed.seed_data
                    
                    try:
                        dest_dir = self._action_locations[ status ]
                        
                        if not os.path.exists( dest_dir ):
                            raise HydrusExceptions.DataMissing( 'The move location "' + dest_dir + '" does not exist!' )
                        
                        if os.path.exists( path ):
                            filename = os.path.basename( path )
                            
                            dest_path = os.path.join( dest_dir, filename )
                            
                            dest_path = HydrusPaths.AppendPathUntilNoConflicts( dest_path )
                            
                            HydrusPaths.MergeFile( path, dest_path )
                        
                        txt_path = path + '.txt'
                        
                        if os.path.exists( txt_path ):
                            txt_filename = os.path.basename( txt_path )
                            
                            txt_dest_path = os.path.join( dest_dir, txt_filename )
                            
                            txt_dest_path = HydrusPaths.AppendPathUntilNoConflicts( txt_dest_path )
                            
                            HydrusPaths.MergeFile( txt_path, txt_dest_path )
                        
                        self._seed_cache.RemoveSeeds( ( seed, ) )
                    except Exception as e:
                        HydrusData.ShowText( 'Import folder tried to move ' + path + ', but could not:' )
                        HydrusData.ShowException( e )
                        HydrusData.ShowText( 'Import folder has been paused.' )
                        
                        self._paused = True
                        
                        return
            elif action == CC.IMPORT_FOLDER_IGNORE:
                pass
    
    def _CheckFolder( self, job_key ):
        filenames = os.listdir( HydrusData.ToUnicode( self._path ) )
        
        raw_paths = [ os.path.join( self._path, filename ) for filename in filenames ]
        
        all_paths = ClientFiles.GetAllPaths( raw_paths )
        
        all_paths = HydrusPaths.FilterFreePaths( all_paths )
        
        seeds = []
        
        for path in all_paths:
            if job_key.IsCancelled():
                break
            
            if path.endswith( '.txt' ):
                continue
            
            seed = Seed( SEED_TYPE_HDD, path )
            
            if not self._seed_cache.HasSeed( seed ):
                seeds.append( seed )
                
                job_key.SetVariable( 'popup_text_1', 'checking: found ' + HydrusData.ConvertIntToPrettyString( len( seeds ) ) + ' new files' )
        
        self._seed_cache.AddSeeds( seeds )
        
        self._last_checked = HydrusData.GetNow()
        self._check_now = False
    
    def _GetSerialisableInfo( self ):
        serialisable_file_import_options = self._file_import_options.GetSerialisableTuple()
        serialisable_tag_import_options = self._tag_import_options.GetSerialisableTuple()
        serialisable_tag_service_keys_to_filename_tagging_options = [ ( service_key.encode( 'hex' ), filename_tagging_options.GetSerialisableTuple() ) for ( service_key, filename_tagging_options ) in self._tag_service_keys_to_filename_tagging_options.items() ]
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        
        # json turns int dict keys to strings
        action_pairs = self._actions.items()
        action_location_pairs = self._action_locations.items()
        
        return ( self._path, self._mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, self._period, self._check_regularly, serialisable_seed_cache, self._last_checked, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page )
    
    def _ImportFiles( self, job_key ):
        did_work = False
        
        time_to_save = HydrusData.GetNow() + 600
        
        num_files_imported = 0
        presentation_hashes = []
        presentation_hashes_fast = set()
        
        i = 0
        
        num_total = len( self._seed_cache )
        num_total_unknown = self._seed_cache.GetSeedCount( CC.STATUS_UNKNOWN )
        num_total_done = num_total - num_total_unknown
        
        while True:
            seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
            
            p1 = HC.options[ 'pause_import_folders_sync' ] or self._paused
            p2 = HydrusThreading.IsThreadShuttingDown()
            p3 = job_key.IsCancelled()
            
            if seed is None or p1 or p2 or p3:
                break
            
            if HydrusData.TimeHasPassed( time_to_save ):
                HG.client_controller.WriteSynchronous( 'serialisable', self )
                
                time_to_save = HydrusData.GetNow() + 600
            
            gauge_num_done = num_total_done + num_files_imported + 1
            
            job_key.SetVariable( 'popup_text_1', 'importing file ' + HydrusData.ConvertValueRangeToPrettyString( gauge_num_done, num_total ) )
            job_key.SetVariable( 'popup_gauge_1', ( gauge_num_done, num_total ) )
            
            path = seed.seed_data
            
            try:
                mime = HydrusFileHandling.GetMime( path )
                
                if mime in self._mimes:
                    seed.ImportPath( self._file_import_options )
                    if seed.status in CC.SUCCESSFUL_IMPORT_STATES:
                        hash = seed.GetHash()
                        
                        downloaded_tags = []
                        
                        service_keys_to_content_updates = self._tag_import_options.GetServiceKeysToContentUpdates( hash, downloaded_tags ) # additional tags
                        
                        if len( service_keys_to_content_updates ) > 0:
                            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                        service_keys_to_tags = {}
                        
                        for ( tag_service_key, filename_tagging_options ) in self._tag_service_keys_to_filename_tagging_options.items():
                            if not HG.client_controller.services_manager.ServiceExists( tag_service_key ):
                                continue
                            
                            try:
                                tags = filename_tagging_options.GetTags( tag_service_key, path )
                                
                                if len( tags ) > 0:
                                    service_keys_to_tags[ tag_service_key ] = tags
                            except Exception as e:
                                HydrusData.ShowText( 'Trying to parse filename tags in the import folder "' + self._name + '" threw an error!' )
                                HydrusData.ShowException( e )
                        
                        if len( service_keys_to_tags ) > 0:
                            service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( { hash }, service_keys_to_tags )
                            
                            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                        
                        num_files_imported += 1
                        
                        if hash not in presentation_hashes_fast:
                            if seed.ShouldPresent( self._file_import_options ):
                                presentation_hashes.append( hash )
                                
                                presentation_hashes_fast.add( hash )
                else:
                    seed.SetStatus( CC.STATUS_VETOED )
            except Exception as e:
                error_text = traceback.format_exc()
                
                HydrusData.Print( 'A file failed to import from import folder ' + self._name + ': ' + path )
                HydrusData.Print( error_text )
                
                seed.SetStatus( CC.STATUS_ERROR, exception = e )
            finally:
                did_work = True
            
            i += 1
            
            if i % 10 == 0:
                self._ActionPaths()
        
        if num_files_imported > 0:
            HydrusData.Print( 'Import folder ' + self._name + ' imported ' + HydrusData.ConvertIntToPrettyString( num_files_imported ) + ' files.' )
        
        if len( presentation_hashes ) > 0:
            if self._publish_files_to_popup_button:
                job_key = ClientThreading.JobKey()
                
                job_key.SetVariable( 'popup_files_mergable', True )
                job_key.SetVariable( 'popup_files', ( list( presentation_hashes ), self._name ) )
                
                HG.client_controller.pub( 'message', job_key )
            
            if self._publish_files_to_page:
                HG.client_controller.pub( 'imported_files_to_page', list( presentation_hashes ), self._name )
        
        self._ActionPaths()
        
        return did_work
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( self._path, self._mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, self._period, self._check_regularly, serialisable_seed_cache, self._last_checked, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page ) = serialisable_info
        
        self._actions = dict( action_pairs )
        self._action_locations = dict( action_location_pairs )
        
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_import_options )
        self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_import_options )
        self._tag_service_keys_to_filename_tagging_options = dict( [ ( encoded_service_key.decode( 'hex' ), HydrusSerialisable.CreateFromSerialisableTuple( serialisable_filename_tagging_options ) ) for ( encoded_service_key, serialisable_filename_tagging_options ) in serialisable_tag_service_keys_to_filename_tagging_options ] )
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        if version == 1:
            ( path, mimes, serialisable_file_import_options, action_pairs, action_location_pairs, period, open_popup, tag, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            service_keys_to_additional_tags = {}
            
            if tag is not None:
                service_keys_to_additional_tags[ CC.LOCAL_TAG_SERVICE_KEY ] = { tag }
            
            tag_import_options = ClientImportOptions.TagImportOptions( service_keys_to_additional_tags = service_keys_to_additional_tags )
            
            serialisable_tag_import_options = tag_import_options.GetSerialisableTuple()
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused )
            
            return ( 2, new_serialisable_info )
        
        if version == 2:
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            serialisable_txt_parse_tag_service_keys = []
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused )
            
            return ( 3, new_serialisable_info )
        
        if version == 3:
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            check_now = False
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now )
            
            return ( 4, new_serialisable_info )
        
        if version == 4:
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now ) = old_serialisable_info
            
            txt_parse_tag_service_keys = [ service_key.decode( 'hex' ) for service_key in serialisable_txt_parse_tag_service_keys ]
            
            tag_service_keys_to_filename_tagging_options = {}
            
            for service_key in txt_parse_tag_service_keys:
                filename_tagging_options = ClientImportOptions.FilenameTaggingOptions()
                
                filename_tagging_options._load_from_neighbouring_txt_files = True
                
                tag_service_keys_to_filename_tagging_options[ service_key ] = filename_tagging_options
            
            serialisable_tag_service_keys_to_filename_tagging_options = [ ( service_key.encode( 'hex' ), filename_tagging_options.GetSerialisableTuple() ) for ( service_key, filename_tagging_options ) in tag_service_keys_to_filename_tagging_options.items() ]
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now )
            
            return ( 5, new_serialisable_info )
        
        if version == 5:
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now ) = old_serialisable_info
            
            check_regularly = not paused
            show_working_popup = True
            publish_files_to_page = False
            publish_files_to_popup_button = open_popup
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, check_regularly, serialisable_seed_cache, last_checked, paused, check_now, show_working_popup, publish_files_to_popup_button, publish_files_to_page )
            
            return ( 6, new_serialisable_info )
    
    def CheckNow( self ):
        self._check_now = True
    
    def DoWork( self ):
        if HG.view_shutdown:
            return
        
        if HC.options[ 'pause_import_folders_sync' ] or self._paused:
            return
        
        if not os.path.exists( self._path ) or not os.path.isdir( self._path ):
            return
        
        pubbed_job_key = False
        
        job_key = ClientThreading.JobKey( pausable = False, cancellable = True )
        
        job_key.SetVariable( 'popup_title', 'import folder - ' + self._name )
        
        due_by_check_now = self._check_now
        due_by_period = self._check_regularly and HydrusData.TimeHasPassed( self._last_checked + self._period )
        
        checked_folder = False
        
        if due_by_check_now or due_by_period:
            if not pubbed_job_key and self._show_working_popup:
                HG.client_controller.pub( 'message', job_key )
                
                pubbed_job_key = True
            
            self._CheckFolder( job_key )
            
            checked_folder = True
        
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        did_import_file_work = False
        
        if seed is not None:
            if not pubbed_job_key and self._show_working_popup:
                HG.client_controller.pub( 'message', job_key )
                
                pubbed_job_key = True
            
            did_import_file_work = self._ImportFiles( job_key )
        
        if checked_folder or did_import_file_work:
            HG.client_controller.WriteSynchronous( 'serialisable', self )
        
        job_key.Delete()
    
    def GetSeedCache( self ):
        return self._seed_cache
    
    def ToListBoxTuple( self ):
        return ( self._name, self._path, self._period )
    
    def ToTuple( self ):
        return ( self._name, self._path, self._mimes, self._file_import_options, self._tag_import_options, self._tag_service_keys_to_filename_tagging_options, self._actions, self._action_locations, self._period, self._check_regularly, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page )
    
    def SetSeedCache( self, seed_cache ):
        self._seed_cache = seed_cache
    
    def SetTuple( self, name, path, mimes, file_import_options, tag_import_options, tag_service_keys_to_filename_tagging_options, actions, action_locations, period, check_regularly, paused, check_now, show_working_popup, publish_files_to_popup_button, publish_files_to_page ):
        if path != self._path:
            self._seed_cache = SeedCache()
        
        if set( mimes ) != set( self._mimes ):
            self._seed_cache.RemoveSeedsByStatus( CC.STATUS_VETOED )
        
        self._name = name
        self._path = path
        self._mimes = mimes
        self._file_import_options = file_import_options
        self._tag_import_options = tag_import_options
        self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._check_regularly = check_regularly
        self._paused = paused
        self._check_now = check_now
        self._show_working_popup = show_working_popup
        self._publish_files_to_popup_button = publish_files_to_popup_button
        self._publish_files_to_page = publish_files_to_page
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_IMPORT_FOLDER ] = ImportFolder
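
# NetworkJobPresentationContext is a minimal context manager: enter_call hooks
# a network job up to some presentation surface (e.g. a popup or download
# control) and exit_call unhooks it when the job finishes.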
class NetworkJobPresentationContext( object ):
    def __init__( self, enter_call, exit_call ):
        self._enter_call = enter_call
        self._exit_call = exit_call
    
    def __enter__( self ):
        self._enter_call()
    
    def __exit__( self, exc_type, exc_val, exc_tb ):
        self._exit_call()
    
SEED_TYPE_HDD = 0
SEED_TYPE_URL = 1
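
# A Seed is one item in an import queue: a local path (SEED_TYPE_HDD) or a url
# (SEED_TYPE_URL), together with any urls, tags, and hashes parsed for it and
# its current import status and note.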
class Seed( HydrusSerialisable.SerialisableBase ):
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED
    SERIALISABLE_NAME = 'File Import'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, seed_type = None, seed_data = None ):
        if seed_type is None:
            seed_type = SEED_TYPE_URL
        
        if seed_data is None:
            seed_data = 'https://big-guys.4u/monica_lewinsky_hott.tiff.exe.vbs'
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.seed_type = seed_type
        self.seed_data = seed_data
        
        self.created = HydrusData.GetNow()
        self.modified = self.created
        self.source_time = None
        self.status = CC.STATUS_UNKNOWN
        self.note = ''
        
        self._urls = set()
        self._tags = set()
        self._hashes = {}
    
    def __eq__( self, other ):
        return self.__hash__() == other.__hash__()
    
    def __hash__( self ):
        return ( self.seed_type, self.seed_data ).__hash__()
    
    def __ne__( self, other ):
        return self.__hash__() != other.__hash__()
    
    def _CheckTagsBlacklist( self, tags, tag_import_options ):
        tag_import_options.CheckBlacklist( tags )
    
    def _GetSerialisableInfo( self ):
        serialisable_urls = list( self._urls )
        serialisable_tags = list( self._tags )
        serialisable_hashes = [ ( hash_type, hash.encode( 'hex' ) ) for ( hash_type, hash ) in self._hashes.items() ]
        
        return ( self.seed_type, self.seed_data, self.created, self.modified, self.source_time, self.status, self.note, serialisable_urls, serialisable_tags, serialisable_hashes )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        ( self.seed_type, self.seed_data, self.created, self.modified, self.source_time, self.status, self.note, serialisable_urls, serialisable_tags, serialisable_hashes ) = serialisable_info
        
        self._urls = set( serialisable_urls )
        self._tags = set( serialisable_tags )
        self._hashes = { hash_type : encoded_hash.decode( 'hex' ) for ( hash_type, encoded_hash ) in serialisable_hashes }
    
    def _NormaliseAndFilterAssociableURLs( self, urls ):
        normalised_urls = { HG.client_controller.network_engine.domain_manager.NormaliseURL( url ) for url in urls }
        
        associable_urls = { url for url in normalised_urls if url != self.seed_data and HG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
        
        return associable_urls
    
    def _UpdateModified( self ):
        self.modified = HydrusData.GetNow()
    
    def AddParseResults( self, parse_results ):
        for ( hash_type, hash ) in ClientParsing.GetHashesFromParseResults( parse_results ):
            if hash_type not in self._hashes:
                self._hashes[ hash_type ] = hash
        
        urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, HC.URL_TYPE_POST ) )
        
        associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
        
        self._urls.update( associable_urls )
        
        tags = ClientParsing.GetTagsFromParseResults( parse_results )
        
        self._tags.update( tags )
        
        source_timestamp = ClientParsing.GetTimestampFromParseResults( parse_results, HC.TIMESTAMP_TYPE_SOURCE )
        
        if source_timestamp is not None:
            # clamp to at least 30s in the past so a parsed time cannot sit in the future
            source_timestamp = min( HydrusData.GetNow() - 30, source_timestamp )
            
            self.source_time = source_timestamp
        
        self._UpdateModified()
    
    def AddTags( self, tags ):
        self._tags.update( tags )
        
        self._UpdateModified()
    
    def AddURL( self, url ):
        urls = ( url, )
        
        associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
        
        self._urls.update( associable_urls )
    
    def CheckPreFetchMetadata( self, tag_import_options ):
        self._CheckTagsBlacklist( self._tags, tag_import_options )
    
    def DownloadAndImportRawFile( self, file_url, file_import_options, network_job_factory, network_job_presentation_context_factory ):
        self.AddURL( file_url )
        
        ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
        
        try:
            if self.seed_data != file_url:
                referral_url = self.seed_data
            else:
                referral_url = None
            
            network_job = network_job_factory( 'GET', file_url, temp_path = temp_path, referral_url = referral_url )
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            with network_job_presentation_context_factory( network_job ):
                network_job.WaitUntilDone()
            
            self.Import( temp_path, file_import_options )
        finally:
            HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
    
    def FetchPageMetadata( self, tag_import_options ):
        pass
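    
    # PredictPreImportStatus works out, from already-known urls and hashes
    # alone, whether this seed is probably new, already in the db, or deleted,
    # letting callers skip downloads they do not need.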
    def PredictPreImportStatus( self, file_import_options ):
        if self.status != CC.STATUS_UNKNOWN:
            return
        
        UNKNOWN_DEFAULT = ( CC.STATUS_UNKNOWN, None, '' )
        
        ( status, hash, note ) = UNKNOWN_DEFAULT
        
        # urls
        
        def select_best_url_result( url_results ):
            # most of the time, this is just going to be selecting the one and only interesting result
            # but if there are 1->n conflicts in url->hash mappings due to dupe/sample/cloudflare gubbins, we'll prefer redundant results over deleted
            # if nothing interesting, default back to unknown
            
            for ( status, hash, note ) in url_results:
                if status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    return ( status, hash, note )
            
            for ( status, hash, note ) in url_results:
                if status == CC.STATUS_DELETED:
                    return ( status, hash, note )
            
            return UNKNOWN_DEFAULT
        
        urls = set( self._urls )
        
        if self.seed_type == SEED_TYPE_URL:
            urls.add( self.seed_data )
        
        unrecognised_url_results = set()
        
        for url in urls:
            if HG.client_controller.network_engine.domain_manager.URLDefinitelyRefersToMultipleFiles( url ):
                continue
            
            results = HG.client_controller.Read( 'url_statuses', url )
            
            if HG.client_controller.network_engine.domain_manager.URLDefinitelyRefersToOneFile( url ):
                ( status, hash, note ) = select_best_url_result( results )
                
                if status != CC.STATUS_UNKNOWN:
                    break # if a known one-file url gives a clear result, that result is reliable
            else:
                if len( results ) == 0: # this url has no result, so it is a vote, absent clear evidence otherwise, for 'this is a new unknown thing'
                    result = UNKNOWN_DEFAULT
                elif len( results ) == 1: # this url is possibly a one-file url
                    result = results[0]
                else: # this url is likely a gallery url, which are useless for determining url status
                    continue
                
                unrecognised_url_results.add( result )
        
        if status == CC.STATUS_UNKNOWN and len( unrecognised_url_results ) > 0:
            # no known one-file url gave us a response, so let's check our unrecognised urls
            # these are likely one-file urls but could also be gallery urls, so if they have non-unknown results, we should nonetheless proceed with the least certain scenario
            # if any of them say unknown, the whole thing should stay unknown
            # but if they are all known, we are probably in luck
            
            unrecognised_url_results = list( unrecognised_url_results )
            
            seen_statuses = { status for ( status, hash, note ) in unrecognised_url_results }
            
            if CC.STATUS_UNKNOWN in seen_statuses:
                ( status, hash, note ) = UNKNOWN_DEFAULT
            else:
                ( status, hash, note ) = select_best_url_result( unrecognised_url_results )
        
        # hashes
        
        if status == CC.STATUS_UNKNOWN:
            for ( hash_type, found_hash ) in self._hashes.items():
                ( status, hash, note ) = HG.client_controller.Read( 'hash_status', hash_type, found_hash )
                
                if status != CC.STATUS_UNKNOWN:
                    break
        
        #
        
        if status == CC.STATUS_DELETED:
            if not file_import_options.ExcludesDeleted():
                status = CC.STATUS_UNKNOWN
                note = ''
        
        self.status = status
        
        if hash is not None:
            self._hashes[ 'sha256' ] = hash
        
        self.note = note
        
        self._UpdateModified()
    
def GetHash( self ):
if 'sha256' in self._hashes:
return self._hashes[ 'sha256' ]
return None
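# a url seed can have multiple equivalent 'search' urls (ClientNetworkingDomain.GetSearchURLs generates them), so 'have we seen this before?' tests should check them all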
def GetSearchSeeds( self ):
if self.seed_type == SEED_TYPE_URL:
search_urls = ClientNetworkingDomain.GetSearchURLs( self.seed_data )
search_seeds = [ Seed( SEED_TYPE_URL, search_url ) for search_url in search_urls ]
else:
search_seeds = [ self ]
return search_seeds
def Import( self, temp_path, file_import_options ):
file_import_job = FileImportJob( temp_path, file_import_options )
( status, hash ) = HG.client_controller.client_files_manager.ImportFile( file_import_job )
self.SetStatus( status )
self.SetHash( hash )
def ImportPath( self, file_import_options ):
if self.seed_type != SEED_TYPE_HDD:
raise Exception( 'Attempted to import as a path, but I do not think I am a path!' )
( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
try:
path = self.seed_data
copied = HydrusPaths.MirrorFile( path, temp_path )
if not copied:
raise Exception( 'File failed to copy to temp path--see log for error.' )
self.Import( temp_path, file_import_options )
finally:
HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
def Normalise( self ):
if self.seed_type == SEED_TYPE_URL:
self.seed_data = HG.client_controller.network_engine.domain_manager.NormaliseURL( self.seed_data )
def PresentToPage( self, page_key ):
hash = self.GetHash()
if hash is not None:
( media_result, ) = HG.client_controller.Read( 'media_results', ( hash, ) )
HG.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
def SetHash( self, hash ):
self._hashes[ 'sha256' ] = hash
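# if an exception is provided, the note gets the exception's first line plus the full traceback, and the error is printed to the log as well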
def SetStatus( self, status, note = '', exception = None ):
if exception is not None:
first_line = HydrusData.ToUnicode( exception ).split( os.linesep )[0]
note = first_line + u'\u2026 (Copy note to see full error)'
note += os.linesep
note += HydrusData.ToUnicode( traceback.format_exc() )
HydrusData.Print( 'Error when processing ' + self.seed_data + ' !' )
HydrusData.Print( traceback.format_exc() )
self.status = status
self.note = note
self._UpdateModified()
def ShouldDownloadFile( self ):
return self.status == CC.STATUS_UNKNOWN
def ShouldFetchPageMetadata( self, tag_import_options ):
if self.status == CC.STATUS_UNKNOWN:
return True
if self.status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
if tag_import_options.WorthFetchingTags() and tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB():
return True
return False
def ShouldPresent( self, file_import_options ):
if 'sha256' not in self._hashes:
return False
hash = self._hashes[ 'sha256' ]
if self.status in CC.SUCCESSFUL_IMPORT_STATES:
if file_import_options.ShouldPresentIgnorantOfInbox( self.status ):
return True
in_inbox = HG.client_controller.Read( 'in_inbox', hash )
if file_import_options.ShouldPresent( self.status, in_inbox ):
return True
return False
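# the main worker for a raw file url seed: predict status, download and import if still unknown, then write url/tag content updates
# vetoes and 404s become STATUS_VETOED; anything else unexpected becomes STATUS_ERROR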
def WorkOnFileURL( self, file_import_options, status_hook, network_job_factory, network_job_presentation_context_factory, tag_import_options = None ):
did_substantial_work = False
try:
status_hook( 'checking url status' )
self.PredictPreImportStatus( file_import_options )
if self.status == CC.STATUS_UNKNOWN:
file_url = self.seed_data
status_hook( 'downloading file' )
self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
did_substantial_work = True
did_substantial_work |= self.WriteContentUpdates( tag_import_options )
except HydrusExceptions.ShutdownException:
return False
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
self.SetStatus( status, note = note )
if isinstance( e, HydrusExceptions.CancelledException ):
status_hook( 'cancelled!' )
time.sleep( 2 )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
self.SetStatus( status, note = note )
status_hook( '404' )
time.sleep( 2 )
except Exception as e:
status = CC.STATUS_ERROR
self.SetStatus( status, exception = e )
status_hook( 'error!' )
time.sleep( 3 )
return did_substantial_work
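# the main worker for a post url seed: fetch and parse the page, absorb tags/urls/source time, re-predict status, and only then download the top-priority file url if it is still needed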
def WorkOnPostURL( self, file_import_options, tag_import_options, status_hook, network_job_factory, network_job_presentation_context_factory ):
did_substantial_work = False
try:
status_hook( 'checking url status' )
self.PredictPreImportStatus( file_import_options )
if self.ShouldFetchPageMetadata( tag_import_options ):
post_url = self.seed_data
( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
if parser is None:
raise HydrusExceptions.ParseException( 'Could not find a parser for the given URL!' )
status_hook( 'downloading page' )
network_job = network_job_factory( 'GET', url_to_check )
HG.client_controller.network_engine.AddJob( network_job )
with network_job_presentation_context_factory( network_job ) as njpc:
network_job.WaitUntilDone()
data = network_job.GetContent()
parsing_context = {}
parsing_context[ 'post_url' ] = post_url
parsing_context[ 'url' ] = url_to_check
all_parse_results = parser.Parse( parsing_context, data )
if len( all_parse_results ) == 0:
raise HydrusExceptions.VetoException( 'Could not parse any data!' )
parse_results = all_parse_results[0]
self.AddParseResults( parse_results )
self.CheckPreFetchMetadata( tag_import_options )
self.PredictPreImportStatus( file_import_options )
if self.ShouldDownloadFile():
file_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, ), only_get_top_priority = True )
if len( file_urls ) == 0:
raise HydrusExceptions.VetoException( 'Could not find a file URL!' )
file_url = file_urls[0]
status_hook( 'downloading file' )
self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
did_substantial_work = True
did_substantial_work |= self.WriteContentUpdates( tag_import_options )
except HydrusExceptions.ShutdownException:
return False
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
self.SetStatus( status, note = note )
if isinstance( e, HydrusExceptions.CancelledException ):
status_hook( 'cancelled!' )
time.sleep( 2 )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
self.SetStatus( status, note = note )
status_hook( '404' )
time.sleep( 2 )
except Exception as e:
status = CC.STATUS_ERROR
self.SetStatus( status, exception = e )
status_hook( 'error!' )
time.sleep( 3 )
return did_substantial_work
def WorksInNewSystem( self ):
if self.seed_type == SEED_TYPE_URL:
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.seed_data )
if url_type == HC.URL_TYPE_POST and can_parse:
return True
return False
def WriteContentUpdates( self, tag_import_options = None ):
did_work = False
if self.status == CC.STATUS_ERROR:
return did_work
if 'sha256' not in self._hashes:
return did_work
hash = self._hashes[ 'sha256' ]
service_keys_to_content_updates = collections.defaultdict( list )
urls = set( self._urls )
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
if self.seed_type == SEED_TYPE_URL:
associable_urls.add( self.seed_data )
if len( associable_urls ) > 0:
content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( hash, associable_urls ) )
service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
if tag_import_options is not None:
for ( service_key, content_updates ) in tag_import_options.GetServiceKeysToContentUpdates( hash, set( self._tags ) ).items():
service_keys_to_content_updates[ service_key ].extend( content_updates )
if len( service_keys_to_content_updates ) > 0:
HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
did_work = True
return did_work
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED ] = Seed
class SeedCache( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE
SERIALISABLE_NAME = 'Import File Status Cache'
SERIALISABLE_VERSION = 8
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
self._seeds = HydrusSerialisable.SerialisableList()
self._seeds_to_indices = {}
self._seed_cache_key = HydrusData.GenerateKey()
self._status_cache = None
self._dirty = True
self._lock = threading.Lock()
def __len__( self ):
return len( self._seeds )
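# builds the human-readable status summary and the ( processed, total ) progress pair
# this is cached and only regenerated when a seed update marks us dirty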
def _GenerateStatus( self ):
statuses_to_counts = collections.Counter()
for seed in self._seeds:
statuses_to_counts[ seed.status ] += 1
num_successful_and_new = statuses_to_counts[ CC.STATUS_SUCCESSFUL_AND_NEW ]
num_successful_but_redundant = statuses_to_counts[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ]
num_ignored = statuses_to_counts[ CC.STATUS_VETOED ]
num_deleted = statuses_to_counts[ CC.STATUS_DELETED ]
num_failed = statuses_to_counts[ CC.STATUS_ERROR ]
num_skipped = statuses_to_counts[ CC.STATUS_SKIPPED ]
num_unknown = statuses_to_counts[ CC.STATUS_UNKNOWN ]
status_strings = []
num_successful = num_successful_and_new + num_successful_but_redundant
if num_successful > 0:
s = HydrusData.ConvertIntToPrettyString( num_successful ) + ' successful'
if num_successful_and_new > 0:
if num_successful_but_redundant > 0:
s += ' (' + HydrusData.ConvertIntToPrettyString( num_successful_but_redundant ) + ' already in db)'
else:
s += ' (all already in db)'
status_strings.append( s )
if num_ignored > 0:
status_strings.append( HydrusData.ConvertIntToPrettyString( num_ignored ) + ' ignored' )
if num_deleted > 0:
status_strings.append( HydrusData.ConvertIntToPrettyString( num_deleted ) + ' previously deleted' )
if num_failed > 0:
status_strings.append( HydrusData.ConvertIntToPrettyString( num_failed ) + ' failed' )
if num_skipped > 0:
status_strings.append( HydrusData.ConvertIntToPrettyString( num_skipped ) + ' skipped' )
status = ', '.join( status_strings )
total = len( self._seeds )
total_processed = total - num_unknown
self._status_cache = ( status, ( total_processed, total ) )
self._dirty = False
def _GetSeeds( self, status = None ):
if status is None:
return list( self._seeds )
else:
return [ seed for seed in self._seeds if seed.status == status ]
def _GetSerialisableInfo( self ):
with self._lock:
return self._seeds.GetSerialisableTuple()
def _GetSourceTimestamp( self, seed ):
source_timestamp = seed.source_time
if source_timestamp is None:
# decent fallback compromise
# -30 since added and 'last check' timestamps are often the same, and this messes up calculations
source_timestamp = seed.created - 30
return source_timestamp
def _HasSeed( self, seed ):
search_seeds = seed.GetSearchSeeds()
has_seed = True in ( search_seed in self._seeds_to_indices for search_seed in search_seeds )
return has_seed
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
with self._lock:
self._seeds = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
def _SetDirty( self ):
self._dirty = True
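# serialisation version migrations--each block upgrades one saved version to the next, so an old object steps through them in sequence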
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
if 'note' in seed_info:
seed_info[ 'note' ] = HydrusData.ToUnicode( seed_info[ 'note' ] )
new_serialisable_info.append( ( seed, seed_info ) )
return ( 2, new_serialisable_info )
if version in ( 2, 3 ):
# gelbooru replaced their thumbnail links with this redirect spam
# 'https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY4ODA1OA=='
# I missed some http ones here, so I've broadened the test and rescheduled it
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
if 'gelbooru.com/redirect.php' in seed:
continue
new_serialisable_info.append( ( seed, seed_info ) )
return ( 4, new_serialisable_info )
if version == 4:
def ConvertRegularToRawURL( regular_url ):
# convert this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
# to this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# the 500 part can be a bunch of stuff, including letters
url_components = regular_url.split( '_' )
last_component = url_components[ -1 ]
( number_gubbins, file_ext ) = last_component.split( '.' )
raw_last_component = 'raw.' + file_ext
url_components[ -1 ] = raw_last_component
raw_url = '_'.join( url_components )
return raw_url
def Remove68Subdomain( long_url ):
# sometimes the 68 subdomain gives a 404 on the raw url, so:
# convert this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# to this:
# http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# I am not sure if it is always 68, but let's not assume
( scheme, rest ) = long_url.split( '://', 1 )
if rest.startswith( 'media.tumblr.com' ):
return long_url
( gumpf, shorter_rest ) = rest.split( '.', 1 )
shorter_url = scheme + '://' + shorter_rest
return shorter_url
new_serialisable_info = []
good_seeds = set()
for ( seed, seed_info ) in old_serialisable_info:
try:
parse = urlparse.urlparse( seed )
if 'media.tumblr.com' in parse.netloc:
seed = Remove68Subdomain( seed )
seed = ConvertRegularToRawURL( seed )
seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( seed )
if 'pixiv.net' in parse.netloc:
seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( seed )
if seed in good_seeds: # we hit a dupe, so skip it
continue
except:
pass
good_seeds.add( seed )
new_serialisable_info.append( ( seed, seed_info ) )
return ( 5, new_serialisable_info )
if version == 5:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
seed_info[ 'source_timestamp' ] = None
new_serialisable_info.append( ( seed, seed_info ) )
return ( 6, new_serialisable_info )
if version == 6:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
try:
magic_phrase = '//media.tumblr.com'
replacement = '//data.tumblr.com'
if magic_phrase in seed:
seed = seed.replace( magic_phrase, replacement )
except:
pass
new_serialisable_info.append( ( seed, seed_info ) )
return ( 7, new_serialisable_info )
if version == 7:
seeds = HydrusSerialisable.SerialisableList()
for ( seed_text, seed_info ) in old_serialisable_info:
if seed_text.startswith( 'http' ):
seed_type = SEED_TYPE_URL
else:
seed_type = SEED_TYPE_HDD
seed = Seed( seed_type, seed_text )
seed.status = seed_info[ 'status' ]
seed.created = seed_info[ 'added_timestamp' ]
seed.modified = seed_info[ 'last_modified_timestamp' ]
seed.source_time = seed_info[ 'source_timestamp' ]
seed.note = seed_info[ 'note' ]
seeds.append( seed )
new_serialisable_info = seeds.GetSerialisableTuple()
return ( 8, new_serialisable_info )
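# appends any genuinely new seeds (checking all their search urls for dupes) and normalises as we go, returning the number actually added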
def AddSeeds( self, seeds ):
if len( seeds ) == 0:
return 0
new_seeds = []
with self._lock:
for seed in seeds:
if self._HasSeed( seed ):
continue
seed.Normalise()
new_seeds.append( seed )
self._seeds.append( seed )
self._seeds_to_indices[ seed ] = len( self._seeds ) - 1
self._SetDirty()
self.NotifySeedsUpdated( new_seeds )
return len( new_seeds )
def AdvanceSeed( self, seed ):
with self._lock:
if seed in self._seeds_to_indices:
index = self._seeds_to_indices[ seed ]
if index > 0:
self._seeds.remove( seed )
self._seeds.insert( index - 1, seed )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self.NotifySeedsUpdated( ( seed, ) )
def DelaySeed( self, seed ):
with self._lock:
if seed in self._seeds_to_indices:
index = self._seeds_to_indices[ seed ]
if index < len( self._seeds ) - 1:
self._seeds.remove( seed )
self._seeds.insert( index + 1, seed )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self.NotifySeedsUpdated( ( seed, ) )
def GetEarliestSourceTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return None
earliest_timestamp = min( ( self._GetSourceTimestamp( seed ) for seed in self._seeds ) )
return earliest_timestamp
def GetLatestAddedTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return 0
latest_timestamp = max( ( seed.created for seed in self._seeds ) )
return latest_timestamp
def GetLatestSourceTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return 0
latest_timestamp = max( ( self._GetSourceTimestamp( seed ) for seed in self._seeds ) )
return latest_timestamp
def GetNextSeed( self, status ):
with self._lock:
for seed in self._seeds:
if seed.status == status:
return seed
return None
def GetNumNewFilesSince( self, since ):
num_files = 0
with self._lock:
for seed in self._seeds:
source_timestamp = self._GetSourceTimestamp( seed )
if source_timestamp >= since:
num_files += 1
return num_files
def GetSeedCacheKey( self ):
return self._seed_cache_key
def GetSeedCount( self, status = None ):
result = 0
with self._lock:
if status is None:
result = len( self._seeds )
else:
for seed in self._seeds:
if seed.status == status:
result += 1
return result
def GetSeeds( self, status = None ):
with self._lock:
return self._GetSeeds( status )
def GetSeedIndex( self, seed ):
with self._lock:
return self._seeds_to_indices[ seed ]
def GetStatus( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
return self._status_cache
def GetValueRange( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
( status, ( total_processed, total ) ) = self._status_cache
return ( total_processed, total )
def HasSeed( self, seed ):
with self._lock:
return self._HasSeed( seed )
def NotifySeedsUpdated( self, seeds ):
with self._lock:
self._SetDirty()
HG.client_controller.pub( 'seed_cache_seeds_updated', self._seed_cache_key, seeds )
def RemoveProcessedSeeds( self ):
with self._lock:
seeds_to_delete = [ seed for seed in self._seeds if seed.status != CC.STATUS_UNKNOWN ]
self.RemoveSeeds( seeds_to_delete )
def RemoveSeeds( self, seeds ):
with self._lock:
seeds_to_delete = set( seeds )
self._seeds = HydrusSerialisable.SerialisableList( [ seed for seed in self._seeds if seed not in seeds_to_delete ] )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self._SetDirty()
self.NotifySeedsUpdated( seeds_to_delete )
def RemoveSeedsByStatus( self, status ):
with self._lock:
seeds_to_delete = [ seed for seed in self._seeds if seed.status == status ]
self.RemoveSeeds( seeds_to_delete )
def RemoveSuccessfulSeeds( self ):
with self._lock:
seeds_to_delete = [ seed for seed in self._seeds if seed.status in CC.SUCCESSFUL_IMPORT_STATES ]
self.RemoveSeeds( seeds_to_delete )
def RetryFailures( self ):
with self._lock:
failed_seeds = self._GetSeeds( CC.STATUS_ERROR )
for seed in failed_seeds:
seed.SetStatus( CC.STATUS_UNKNOWN )
self.NotifySeedsUpdated( failed_seeds )
def WorkToDo( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
( status, ( total_processed, total ) ) = self._status_cache
return total_processed < total
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE ] = SeedCache
class SimpleDownloaderImport( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_IMPORT
SERIALISABLE_NAME = 'Simple Downloader Import'
SERIALISABLE_VERSION = 4
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
self._pending_jobs = []
self._seed_cache = SeedCache()
self._file_import_options = file_import_options
self._formula_name = 'all files linked by images in page'
self._queue_paused = False
self._files_paused = False
self._parser_status = ''
self._current_action = ''
self._download_control_file_set = None
self._download_control_file_clear = None
self._download_control_page_set = None
self._download_control_page_clear = None
self._lock = threading.Lock()
self._new_files_event = threading.Event()
self._new_page_event = threading.Event()
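# wires the page's download ui (if any) up to the network job for its lifetime--the set/clear callbacks are marshalled to the wx thread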
def _FileNetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_file_set is not None:
wx.CallAfter( self._download_control_file_set, network_job )
def exit_call():
with self._lock:
if self._download_control_file_clear is not None:
wx.CallAfter( self._download_control_file_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _GetSerialisableInfo( self ):
serialisable_pending_jobs = [ ( url, simple_downloader_formula.GetSerialisableTuple() ) for ( url, simple_downloader_formula ) in self._pending_jobs ]
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
return ( serialisable_pending_jobs, serialisable_seed_cache, serialisable_file_options, self._formula_name, self._queue_paused, self._files_paused )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_pending_jobs, serialisable_seed_cache, serialisable_file_options, self._formula_name, self._queue_paused, self._files_paused ) = serialisable_info
self._pending_jobs = [ ( url, HydrusSerialisable.CreateFromSerialisableTuple( serialisable_simple_downloader_formula ) ) for ( url, serialisable_simple_downloader_formula ) in serialisable_pending_jobs ]
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
def _PageNetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_page_set is not None:
wx.CallAfter( self._download_control_page_set, network_job )
def exit_call():
with self._lock:
if self._download_control_page_clear is not None:
wx.CallAfter( self._download_control_page_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, paused ) = old_serialisable_info
queue_paused = paused
files_paused = paused
new_serialisable_info = ( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, queue_paused, files_paused )
return ( 2, new_serialisable_info )
if version == 2:
( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, queue_paused, files_paused ) = old_serialisable_info
pending_jobs = []
new_serialisable_info = ( pending_jobs, serialisable_seed_cache, serialisable_file_options, queue_paused, files_paused )
return ( 3, new_serialisable_info )
if version == 3:
( pending_jobs, serialisable_seed_cache, serialisable_file_options, queue_paused, files_paused ) = old_serialisable_info
pending_jobs = []
formula_name = 'all files linked by images in page'
new_serialisable_info = ( pending_jobs, serialisable_seed_cache, serialisable_file_options, formula_name, queue_paused, files_paused )
return ( 4, new_serialisable_info )
def _WorkOnFiles( self, page_key ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return
did_substantial_work = False
file_url = seed.seed_data
try:
def status_hook( text ):
with self._lock:
self._current_action = text
did_substantial_work = seed.WorkOnFileURL( self._file_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._FileNetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
except Exception as e:
status = CC.STATUS_ERROR
seed.SetStatus( status, exception = e )
time.sleep( 3 )
finally:
self._seed_cache.NotifySeedsUpdated( ( seed, ) )
HG.client_controller.pub( 'refresh_page_name', page_key )
with self._lock:
self._current_action = ''
if did_substantial_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
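# pops one pending ( url, formula ) job, downloads the page, runs the formula over it, and seeds any file urls found
# returns True if a job was attempted (even if it errored), False if the queue was empty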
def _WorkOnQueue( self, page_key ):
if len( self._pending_jobs ) > 0:
with self._lock:
( url, simple_downloader_formula ) = self._pending_jobs.pop( 0 )
self._parser_status = 'checking ' + url
error_occurred = False
try:
network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, 'GET', url )
network_job.OverrideBandwidth()
HG.client_controller.network_engine.AddJob( network_job )
with self._PageNetworkJobPresentationContextFactory( network_job ):
network_job.WaitUntilDone()
data = network_job.GetContent()
#
parsing_context = {}
parsing_context[ 'url' ] = url
parsing_formula = simple_downloader_formula.GetFormula()
file_urls = [ urlparse.urljoin( url, parsed_text ) for parsed_text in parsing_formula.Parse( parsing_context, data ) ]
seeds = [ Seed( SEED_TYPE_URL, file_url ) for file_url in file_urls ]
for seed in seeds:
seed.AddURL( url )
num_new = self._seed_cache.AddSeeds( seeds )
if num_new > 0:
self._new_files_event.set()
parser_status = 'page checked OK - ' + HydrusData.ConvertIntToPrettyString( num_new ) + ' new urls'
num_already_in_seed_cache = len( file_urls ) - num_new
if num_already_in_seed_cache > 0:
parser_status += ' (' + HydrusData.ConvertIntToPrettyString( num_already_in_seed_cache ) + ' already in queue)'
except HydrusExceptions.ShutdownException:
return
except HydrusExceptions.NotFoundException:
error_occurred = True
parser_status = 'page 404'
except Exception as e:
error_occurred = True
parser_status = HydrusData.ToUnicode( e )
finally:
HG.client_controller.pub( 'refresh_page_name', page_key )
with self._lock:
self._parser_status = parser_status
if error_occurred:
time.sleep( 5 )
return True
else:
with self._lock:
self._parser_status = ''
return False
def _THREADWorkOnFiles( self, page_key ):
while not ( HG.view_shutdown or HG.client_controller.PageCompletelyDestroyed( page_key ) ):
no_work_to_do = self._files_paused or not self._seed_cache.WorkToDo()
if no_work_to_do or HG.client_controller.PageClosedButNotDestroyed( page_key ):
self._new_files_event.wait( 5 )
else:
try:
self._WorkOnFiles( page_key )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
return
self._new_files_event.clear()
def _THREADWorkOnQueue( self, page_key ):
while not ( HG.view_shutdown or HG.client_controller.PageCompletelyDestroyed( page_key ) ):
if self._queue_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ):
self._new_page_event.wait( 5 )
else:
try:
did_work = self._WorkOnQueue( page_key )
if did_work:
time.sleep( 5 )
else:
self._new_page_event.wait( 5 )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
return
self._new_page_event.clear()
def AdvanceJob( self, job ):
with self._lock:
if job in self._pending_jobs:
index = self._pending_jobs.index( job )
if index - 1 >= 0:
self._pending_jobs.remove( job )
self._pending_jobs.insert( index - 1, job )
def CurrentlyWorking( self ):
with self._lock:
work_to_do = self._seed_cache.WorkToDo() or len( self._pending_jobs ) > 0
return work_to_do and not self._files_paused
def DelayJob( self, job ):
with self._lock:
if job in self._pending_jobs:
index = self._pending_jobs.index( job )
if index + 1 < len( self._pending_jobs ):
self._pending_jobs.remove( job )
self._pending_jobs.insert( index + 1, job )
def DeleteJob( self, job ):
with self._lock:
if job in self._pending_jobs:
self._pending_jobs.remove( job )
def GetSeedCache( self ):
with self._lock:
return self._seed_cache
def GetFileImportOptions( self ):
with self._lock:
return self._file_import_options
def GetFormulaName( self ):
with self._lock:
return self._formula_name
def GetStatus( self ):
with self._lock:
return ( list( self._pending_jobs ), self._parser_status, self._current_action, self._queue_paused, self._files_paused )
def GetValueRange( self ):
with self._lock:
return self._seed_cache.GetValueRange()
def PausePlayFiles( self ):
with self._lock:
self._files_paused = not self._files_paused
self._new_files_event.set()
def PausePlayQueue( self ):
with self._lock:
self._queue_paused = not self._queue_paused
self._new_page_event.set()
def PendJob( self, job ):
with self._lock:
if job not in self._pending_jobs:
self._pending_jobs.append( job )
self._new_page_event.set()
def SetDownloadControlFile( self, download_control ):
with self._lock:
self._download_control_file_set = download_control.SetNetworkJob
self._download_control_file_clear = download_control.ClearNetworkJob
def SetDownloadControlPage( self, download_control ):
with self._lock:
self._download_control_page_set = download_control.SetNetworkJob
self._download_control_page_clear = download_control.ClearNetworkJob
def SetFileImportOptions( self, file_import_options ):
with self._lock:
self._file_import_options = file_import_options
def SetFormulaName( self, formula_name ):
with self._lock:
self._formula_name = formula_name
def Start( self, page_key ):
HG.client_controller.CallToThreadLongRunning( self._THREADWorkOnQueue, page_key )
HG.client_controller.CallToThreadLongRunning( self._THREADWorkOnFiles, page_key )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_IMPORT ] = SimpleDownloaderImport
class Subscription( HydrusSerialisable.SerialisableBaseNamed ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION
SERIALISABLE_NAME = 'Subscription'
SERIALISABLE_VERSION = 5
def __init__( self, name ):
HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
self._gallery_identifier = ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEVIANT_ART )
self._gallery_stream_identifiers = ClientDownloading.GetGalleryStreamIdentifiers( self._gallery_identifier )
self._queries = []
self._checker_options = ClientData.CheckerOptions( intended_files_per_check = 5, never_faster_than = 86400, never_slower_than = 90 * 86400, death_file_velocity = ( 1, 90 * 86400 ) )
if HC.options[ 'gallery_file_limit' ] is None:
self._initial_file_limit = 200
else:
self._initial_file_limit = min( 200, HC.options[ 'gallery_file_limit' ] )
self._periodic_file_limit = 50
self._paused = False
self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'quiet' )
new_options = HG.client_controller.new_options
self._tag_import_options = new_options.GetDefaultTagImportOptions( self._gallery_identifier )
self._last_gallery_page_hit_timestamp = 0
self._no_work_until = 0
self._no_work_until_reason = ''
def _DelayWork( self, time_delta, reason ):
self._no_work_until = HydrusData.GetNow() + time_delta
self._no_work_until_reason = reason
def _GetExampleNetworkContexts( self, query ):
seed_cache = query.GetSeedCache()
seed = seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return [ ClientNetworkingContexts.NetworkContext( CC.NETWORK_CONTEXT_SUBSCRIPTION, self._GetNetworkJobSubscriptionKey( query ) ), ClientNetworkingContexts.GLOBAL_NETWORK_CONTEXT ]
url = seed.seed_data
example_nj = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), 'GET', url )
example_network_contexts = example_nj.GetNetworkContexts()
return example_network_contexts
def _GetNetworkJobSubscriptionKey( self, query ):
query_text = query.GetQueryText()
return self._name + ': ' + query_text
def _GetQueriesForProcessing( self ):
queries = list( self._queries )
if HG.client_controller.new_options.GetBoolean( 'process_subs_in_random_order' ):
random.shuffle( queries )
else:
def key( q ):
return q.GetQueryText()
queries.sort( key = key )
return queries
def _GetSerialisableInfo( self ):
serialisable_gallery_identifier = self._gallery_identifier.GetSerialisableTuple()
serialisable_gallery_stream_identifiers = [ gallery_stream_identifier.GetSerialisableTuple() for gallery_stream_identifier in self._gallery_stream_identifiers ]
serialisable_queries = [ query.GetSerialisableTuple() for query in self._queries ]
serialisable_checker_options = self._checker_options.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
serialisable_tag_options = self._tag_import_options.GetSerialisableTuple()
return ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, serialisable_file_options, serialisable_tag_options, self._no_work_until, self._no_work_until_reason )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, serialisable_file_options, serialisable_tag_options, self._no_work_until, self._no_work_until_reason ) = serialisable_info
self._gallery_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_identifier )
self._gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_stream_identifier ) for serialisable_gallery_stream_identifier in serialisable_gallery_stream_identifiers ]
self._queries = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_query ) for serialisable_query in serialisable_queries ]
self._checker_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_checker_options )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
def _NoDelays( self ):
return HydrusData.TimeHasPassed( self._no_work_until )
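# asks the bandwidth manager if this query's domains could handle roughly one more page of work--the numbers here are generous estimates, not measurements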
def _QueryBandwidthIsOK( self, query ):
example_network_contexts = self._GetExampleNetworkContexts( query )
# just a little padding here
expected_requests = 3
expected_bytes = 1048576
threshold = 30
result = HG.client_controller.network_engine.bandwidth_manager.CanDoWork( example_network_contexts, expected_requests = expected_requests, expected_bytes = expected_bytes, threshold = threshold )
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query.GetQueryText() + '" pre-work bandwidth test. Bandwidth ok: ' + str( result ) + '.' )
return result
def _ShowHitPeriodicFileLimitMessage( self, query_text ):
message = 'When syncing, the query "' + query_text + '" for subscription "' + self._name + '" hit its periodic file limit!'
message += os.linesep * 2
message += 'This may be because the query has not run in a while--so a backlog of files has built up--or because the site has changed how it presents file urls on its gallery pages (so the subscription thinks it is seeing new files when it truly is not).'
message += os.linesep * 2
message += 'If the former is true, you might want to fill in the gap with a manual download page, but if the latter is true, the maintainer of the download parser (hydrus dev or whoever) would be interested in knowing, so they can roll out a fix.'
HydrusData.ShowText( message )
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, last_error, serialisable_seed_cache ) = old_serialisable_info
check_now = False
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, serialisable_seed_cache )
return ( 2, new_serialisable_info )
if version == 2:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, serialisable_seed_cache ) = old_serialisable_info
no_work_until = 0
no_work_until_reason = ''
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, no_work_until, no_work_until_reason, serialisable_seed_cache )
return ( 3, new_serialisable_info )
if version == 3:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, no_work_until, no_work_until_reason, serialisable_seed_cache ) = old_serialisable_info
checker_options = ClientData.CheckerOptions( 5, period / 5, period * 10, ( 1, period * 10 ) )
seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
query = SubscriptionQuery( query )
query._seed_cache = seed_cache
query._last_check_time = last_checked
query.UpdateNextCheckTime( checker_options )
queries = [ query ]
serialisable_queries = [ query.GetSerialisableTuple() for query in queries ]
serialisable_checker_options = checker_options.GetSerialisableTuple()
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason )
return ( 4, new_serialisable_info )
if version == 4:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason ) = old_serialisable_info
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason )
return ( 5, new_serialisable_info )
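# downloads outstanding file urls for each query, respecting bandwidth and the pause options, and collects presentation hashes into per-query and whole-subscription popup messages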
def _WorkOnFiles( self, job_key ):
try:
gallery = ClientDownloading.GetGallery( self._gallery_identifier )
except Exception as e:
HydrusData.PrintException( e )
self._DelayWork( HC.UPDATE_DURATION, 'gallery would not load' )
self._paused = True
HydrusData.ShowText( 'The subscription ' + self._name + ' could not load its gallery! It has been paused and the full error has been written to the log!' )
return
error_count = 0
all_presentation_hashes = []
all_presentation_hashes_fast = set()
queries = self._GetQueriesForProcessing()
for query in queries:
this_query_has_done_work = False
query_text = query.GetQueryText()
seed_cache = query.GetSeedCache()
def network_job_factory( method, url, **kwargs ):
network_job = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), method, url, **kwargs )
network_job.OverrideBandwidth( 30 )
job_key.SetVariable( 'popup_network_job', network_job )
return network_job
gallery.SetNetworkJobFactory( network_job_factory )
text_1 = 'downloading files'
file_popup_text = self._name
if query_text != self._name:
text_1 += ' for "' + query_text + '"'
file_popup_text += ': ' + query_text
job_key.SetVariable( 'popup_text_1', text_1 )
num_urls = seed_cache.GetSeedCount()
presentation_hashes = []
presentation_hashes_fast = set()
while True:
num_unknown = seed_cache.GetSeedCount( CC.STATUS_UNKNOWN )
num_done = num_urls - num_unknown
seed = seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query_text + '" can do no more file work due to running out of unknown urls.' )
break
if job_key.IsCancelled():
self._DelayWork( 300, 'recently cancelled' )
break
p1 = HC.options[ 'pause_subs_sync' ]
p3 = HG.view_shutdown
p4 = not self._QueryBandwidthIsOK( query )
if p1 or p3 or p4:
if p4 and this_query_has_done_work:
job_key.SetVariable( 'popup_text_2', 'no more bandwidth to download files, will do some more later' )
time.sleep( 5 )
break
try:
x_out_of_y = 'file ' + HydrusData.ConvertValueRangeToPrettyString( num_done, num_urls ) + ': '
job_key.SetVariable( 'popup_gauge_2', ( num_done, num_urls ) )
if seed.WorksInNewSystem():
def status_hook( text ):
job_key.SetVariable( 'popup_text_2', x_out_of_y + text )
seed.WorkOnPostURL( self._file_import_options, self._tag_import_options, status_hook, GenerateSubscriptionNetworkJobFactory( self._GetNetworkJobSubscriptionKey( query ) ), GenerateSubscriptionNetworkJobPresentationContextFactory( job_key ) )
if seed.ShouldPresent( self._file_import_options ):
hash = seed.GetHash()
if hash not in presentation_hashes_fast:
if hash not in all_presentation_hashes_fast:
all_presentation_hashes.append( hash )
all_presentation_hashes_fast.add( hash )
presentation_hashes.append( hash )
presentation_hashes_fast.add( hash )
else:
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'checking url status' )
seed.PredictPreImportStatus( self._file_import_options )
status = seed.status
url = seed.seed_data
if status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
if self._tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB() and self._tag_import_options.WorthFetchingTags():
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'found file in db, fetching tags' )
downloaded_tags = gallery.GetTags( url )
seed.AddTags( downloaded_tags )
elif status == CC.STATUS_UNKNOWN:
( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
try:
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'downloading file' )
if self._tag_import_options.WorthFetchingTags():
downloaded_tags = gallery.GetFileAndTags( temp_path, url )
seed.AddTags( downloaded_tags )
else:
gallery.GetFile( temp_path, url )
seed.CheckPreFetchMetadata( self._tag_import_options )
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'importing file' )
seed.Import( temp_path, self._file_import_options )
hash = seed.GetHash()
if hash not in presentation_hashes_fast:
if seed.ShouldPresent( self._file_import_options ):
if hash not in all_presentation_hashes_fast:
all_presentation_hashes.append( hash )
all_presentation_hashes_fast.add( hash )
presentation_hashes.append( hash )
presentation_hashes_fast.add( hash )
finally:
HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
seed.WriteContentUpdates( self._tag_import_options )
except HydrusExceptions.CancelledException as e:
self._DelayWork( 300, HydrusData.ToUnicode( e ) )
break
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
seed.SetStatus( status, note = note )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
seed.SetStatus( status, note = note )
except Exception as e:
status = CC.STATUS_ERROR
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'file failed' )
seed.SetStatus( status, exception = e )
if isinstance( e, HydrusExceptions.DataMissing ):
# DataMissing is a quick thing to avoid subscription abandons when lots of deleted files in e621 (or any other booru)
# this should be richer in any case in the new system
pass
else:
error_count += 1
time.sleep( 10 )
if error_count > 4:
raise Exception( 'The subscription ' + self._name + ' encountered several errors when downloading files, so it abandoned its sync.' )
this_query_has_done_work = True
if len( presentation_hashes ) > 0:
job_key.SetVariable( 'popup_files', ( list( presentation_hashes ), file_popup_text ) )
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
HG.client_controller.WaitUntilViewFree()
if len( all_presentation_hashes ) > 0:
files_job_key = ClientThreading.JobKey()
files_job_key.SetVariable( 'popup_files_mergable', True )
files_job_key.SetVariable( 'popup_files', ( all_presentation_hashes, self._name ) )
HG.client_controller.pub( 'message', files_job_key )
job_key.DeleteVariable( 'popup_files' )
job_key.DeleteVariable( 'popup_text_1' )
job_key.DeleteVariable( 'popup_text_2' )
job_key.DeleteVariable( 'popup_gauge_2' )
def _WorkOnFilesCanDoWork( self ):
for query in self._queries:
if query.CanWorkOnFiles():
if self._QueryBandwidthIsOK( query ):
return True
return False
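# walks gallery pages for every due query, collecting new urls until the initial/periodic file limit is hit, a page yields nothing new, or several already-known urls suggest we have caught up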
def _SyncQuery( self, job_key ):
have_made_an_initial_sync_bandwidth_notification = False
queries = self._GetQueriesForProcessing()
for query in queries:
can_sync = query.CanSync()
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query.GetQueryText() + '" started. Current can_sync is ' + str( can_sync ) + '.' )
if not can_sync:
continue
done_first_page = False
query_text = query.GetQueryText()
seed_cache = query.GetSeedCache()
this_is_initial_sync = query.IsInitialSync()
total_new_urls = 0
seeds_to_add = set()
seeds_to_add_ordered = []
prefix = 'synchronising'
if query_text != self._name:
prefix += ' "' + query_text + '"'
job_key.SetVariable( 'popup_text_1', prefix )
for gallery_stream_identifier in self._gallery_stream_identifiers:
if this_is_initial_sync:
if self._initial_file_limit is not None and total_new_urls + 1 > self._initial_file_limit:
break
else:
if self._periodic_file_limit is not None and total_new_urls + 1 > self._periodic_file_limit:
self._ShowHitPeriodicFileLimitMessage( query_text )
break
p1 = HC.options[ 'pause_subs_sync' ]
p2 = job_key.IsCancelled()
p3 = HG.view_shutdown
if p1 or p2 or p3:
break
try:
gallery = ClientDownloading.GetGallery( gallery_stream_identifier )
except Exception as e:
HydrusData.PrintException( e )
self._DelayWork( HC.UPDATE_DURATION, 'gallery would not load' )
self._paused = True
HydrusData.ShowText( 'The subscription ' + self._name + ' could not load its gallery! It has been paused and the full error has been written to the log!' )
return
def network_job_factory( method, url, **kwargs ):
network_job = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), method, url, **kwargs )
job_key.SetVariable( 'popup_network_job', network_job )
network_job.OverrideBandwidth()
return network_job
gallery.SetNetworkJobFactory( network_job_factory )
page_index = 0
num_existing_urls = 0
keep_checking = True
while keep_checking:
new_urls_this_page = 0
try:
p1 = HC.options[ 'pause_subs_sync' ]
p2 = HG.view_shutdown
if p1 or p2:
return
if job_key.IsCancelled():
raise HydrusExceptions.CancelledException( 'gallery parsing cancelled, likely by user' )
next_gallery_page_hit_timestamp = self._last_gallery_page_hit_timestamp + HG.client_controller.new_options.GetInteger( 'gallery_page_wait_period_subscriptions' )
if not HydrusData.TimeHasPassed( next_gallery_page_hit_timestamp ):
if not done_first_page:
page_check_status = 'checking first page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
else:
page_check_status = HydrusData.ConvertIntToPrettyString( total_new_urls ) + ' new urls found, checking next page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
job_key.SetVariable( 'popup_text_1', prefix + ': ' + page_check_status )
time.sleep( 1 )
continue
job_key.SetVariable( 'popup_text_1', prefix + ': found ' + HydrusData.ConvertIntToPrettyString( total_new_urls ) + ' new urls, checking next page' )
try:
( page_of_seeds, definitely_no_more_pages ) = gallery.GetPage( query_text, page_index )
finally:
self._last_gallery_page_hit_timestamp = HydrusData.GetNow()
done_first_page = True
page_index += 1
if definitely_no_more_pages:
keep_checking = False
for seed in page_of_seeds:
if this_is_initial_sync:
if self._initial_file_limit is not None and total_new_urls + 1 > self._initial_file_limit:
keep_checking = False
break
else:
if self._periodic_file_limit is not None and total_new_urls + 1 > self._periodic_file_limit:
self._ShowHitPeriodicFileLimitMessage( query_text )
keep_checking = False
break
if seed in seeds_to_add:
# this catches the occasional overflow when a new file is uploaded while gallery parsing is going on
continue
if seed_cache.HasSeed( seed ):
num_existing_urls += 1
if num_existing_urls > 5:
keep_checking = False
break
else:
seeds_to_add.add( seed )
seeds_to_add_ordered.append( seed )
new_urls_this_page += 1
total_new_urls += 1
if new_urls_this_page == 0:
keep_checking = False
except HydrusExceptions.CancelledException as e:
self._DelayWork( 300, HydrusData.ToUnicode( e ) )
break
except HydrusExceptions.NotFoundException:
# paheal now 404s when no results, so just naturally break
break
seeds_to_add_ordered.reverse()
# 'first' urls are now at the end, so the seed_cache should stay roughly in oldest->newest order
seed_cache.AddSeeds( seeds_to_add_ordered )
query.RegisterSyncComplete()
query.UpdateNextCheckTime( self._checker_options )
if query.IsDead():
if this_is_initial_sync:
HydrusData.ShowText( 'The query "' + query_text + '" for subscription "' + self._name + '" did not find any files on its first sync! Could the query text have a typo, like a missing underscore?' )
else:
HydrusData.ShowText( 'The query "' + query_text + '" for subscription "' + self._name + '" appears to be dead!' )
else:
if this_is_initial_sync:
if not self._QueryBandwidthIsOK( query ) and not have_made_an_initial_sync_bandwidth_notification:
HydrusData.ShowText( 'FYI: The query "' + query_text + '" for subscription "' + self._name + '" performed its initial sync ok, but that domain is short on bandwidth right now, so no files will be downloaded yet. The subscription will catch up in future as bandwidth becomes available. You can review the estimated time until bandwidth is available under the manage subscriptions dialog. If other queries are performing initial syncs in this run, the same may apply to them.' )
have_made_an_initial_sync_bandwidth_notification = True
def _SyncQueryCanDoWork( self ):
return True in ( query.CanSync() for query in self._queries )
def CanCheckNow( self ):
return True in ( query.CanCheckNow() for query in self._queries )
def CanReset( self ):
return True in ( not query.IsInitialSync() for query in self._queries )
def CanRetryFailures( self ):
return True in ( query.CanRetryFailed() for query in self._queries )
def CanScrubDelay( self ):
return not HydrusData.TimeHasPassed( self._no_work_until )
def CheckNow( self ):
for query in self._queries:
query.CheckNow()
self.ScrubDelay()
def GetBandwidthWaitingEstimate( self, query ):
example_network_contexts = self._GetExampleNetworkContexts( query )
estimate = HG.client_controller.network_engine.bandwidth_manager.GetWaitingEstimate( example_network_contexts )
return estimate
def GetBandwidthWaitingEstimateMinMax( self ):
if len( self._queries ) == 0:
return ( 0, 0 )
estimates = []
for query in self._queries:
example_network_contexts = self._GetExampleNetworkContexts( query )
estimate = HG.client_controller.network_engine.bandwidth_manager.GetWaitingEstimate( example_network_contexts )
estimates.append( estimate )
min_estimate = min( estimates )
max_estimate = max( estimates )
return ( min_estimate, max_estimate )
def GetGalleryIdentifier( self ):
return self._gallery_identifier
def GetQueries( self ):
return self._queries
def GetTagImportOptions( self ):
return self._tag_import_options
def HasQuerySearchText( self, search_text ):
for query in self._queries:
query_text = query.GetQueryText()
if search_text in query_text:
return True
return False
def Merge( self, potential_mergee_subscriptions ):
unmergable_subscriptions = []
for subscription in potential_mergee_subscriptions:
if subscription._gallery_identifier == self._gallery_identifier:
my_new_queries = [ query.Duplicate() for query in subscription._queries ]
self._queries.extend( my_new_queries )
else:
unmergable_subscriptions.append( subscription )
return unmergable_subscriptions
def PauseResume( self ):
self._paused = not self._paused
def Reset( self ):
for query in self._queries:
query.Reset()
self.ScrubDelay()
def RetryFailures( self ):
for query in self._queries:
query.RetryFailures()
def ReviveDead( self ):
for query in self._queries:
if query.IsDead():
query.CheckNow()
def Separate( self, base_name, only_these_queries = None ):
if only_these_queries is None:
only_these_queries = set( self._queries )
else:
only_these_queries = set( only_these_queries )
subscriptions = []
for query in self._queries:
if query not in only_these_queries:
continue
subscription = self.Duplicate()
subscription._queries = [ query.Duplicate() ]
subscription.SetName( base_name + ': ' + query.GetQueryText() )
subscriptions.append( subscription )
self._queries = [ query for query in self._queries if query not in only_these_queries ]
return subscriptions
def SetCheckerOptions( self, checker_options ):
self._checker_options = checker_options
for query in self._queries:
query.UpdateNextCheckTime( self._checker_options )
def SetTuple( self, gallery_identifier, gallery_stream_identifiers, queries, checker_options, initial_file_limit, periodic_file_limit, paused, file_import_options, tag_import_options, no_work_until ):
self._gallery_identifier = gallery_identifier
self._gallery_stream_identifiers = gallery_stream_identifiers
self._queries = queries
self._checker_options = checker_options
self._initial_file_limit = initial_file_limit
self._periodic_file_limit = periodic_file_limit
self._paused = paused
self._file_import_options = file_import_options
self._tag_import_options = tag_import_options
self._no_work_until = no_work_until
def ScrubDelay( self ):
self._no_work_until = 0
self._no_work_until_reason = ''
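# top-level sync entry: if we are unpaused, undelayed, and either the gallery sync or the file queue can do work, run both under one popup job
# network errors delay future work rather than killing the subscription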
def Sync( self ):
p1 = not self._paused
p2 = not HG.view_shutdown
p3 = self._NoDelays()
p4 = self._SyncQueryCanDoWork()
p5 = self._WorkOnFilesCanDoWork()
if HG.subscription_report_mode:
message = 'Subscription "' + self._name + '" entered sync.'
message += os.linesep
message += 'Unpaused: ' + str( p1 )
message += os.linesep
message += 'No delays: ' + str( p3 )
message += os.linesep
message += 'Sync can do work: ' + str( p4 )
message += os.linesep
message += 'Files can do work: ' + str( p5 )
HydrusData.ShowText( message )
if p1 and p2 and p3 and ( p4 or p5 ):
job_key = ClientThreading.JobKey( pausable = False, cancellable = True )
try:
job_key.SetVariable( 'popup_title', 'subscriptions - ' + self._name )
HG.client_controller.pub( 'message', job_key )
self._SyncQuery( job_key )
self._WorkOnFiles( job_key )
except HydrusExceptions.NetworkException as e:
if isinstance( e, HydrusExceptions.NetworkInfrastructureException ):
delay = 3600
else:
delay = HC.UPDATE_DURATION
HydrusData.Print( 'The subscription ' + self._name + ' encountered an exception when trying to sync:' )
HydrusData.PrintException( e )
job_key.SetVariable( 'popup_text_1', 'Encountered a network error, will retry again later' )
self._DelayWork( delay, 'network error: ' + HydrusData.ToUnicode( e ) )
time.sleep( 5 )
except Exception as e:
HydrusData.ShowText( 'The subscription ' + self._name + ' encountered an exception when trying to sync:' )
HydrusData.ShowException( e )
self._DelayWork( HC.UPDATE_DURATION, 'error: ' + HydrusData.ToUnicode( e ) )
finally:
job_key.DeleteVariable( 'popup_network_job' )
HG.client_controller.WriteSynchronous( 'serialisable', self )
if job_key.HasVariable( 'popup_files' ):
job_key.Finish()
else:
job_key.Delete()
def ToTuple( self ):
return ( self._name, self._gallery_identifier, self._gallery_stream_identifiers, self._queries, self._checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, self._file_import_options, self._tag_import_options, self._no_work_until, self._no_work_until_reason )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION ] = Subscription
class SubscriptionQuery( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION_QUERY
SERIALISABLE_NAME = 'Subscription Query'
SERIALISABLE_VERSION = 1
def __init__( self, query = 'query text' ):
HydrusSerialisable.SerialisableBase.__init__( self )
self._query = query
self._check_now = False
self._last_check_time = 0
self._next_check_time = 0
self._paused = False
self._status = CHECKER_STATUS_OK
self._seed_cache = SeedCache()
def _GetSerialisableInfo( self ):
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
return ( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, serialisable_seed_cache )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, serialisable_seed_cache ) = serialisable_info
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
def CanWorkOnFiles( self ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + self._query + '" CanWorkOnFiles test. Next import is ' + repr( seed ) + '.' )
return seed is not None
def CanCheckNow( self ):
return not self._check_now
def CanRetryFailed( self ):
return self._seed_cache.GetSeedCount( CC.STATUS_ERROR ) > 0
def CanSync( self ):
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + self._query + '" CanSync test. Paused status is ' + str( self._paused ) + ' and check time due is ' + str( HydrusData.TimeHasPassed( self._next_check_time ) ) + ' and check_now is ' + str( self._check_now ) + '.' )
if self._paused:
return False
return HydrusData.TimeHasPassed( self._next_check_time ) or self._check_now
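# CheckNow() is the 'force a check' reset: it unpauses the query, revives a
# dead status and zeroes the next check time, so CanSync() passes immediately.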
def CheckNow( self ):
self._check_now = True
self._paused = False
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
def GetLastChecked( self ):
return self._last_check_time
def GetLatestAddedTime( self ):
return self._seed_cache.GetLatestAddedTime()
def GetNextCheckStatusString( self ):
if self._check_now:
return 'checking on dialog ok'
elif self._status == CHECKER_STATUS_DEAD:
return 'dead, so not checking'
elif self._paused:
return 'paused, but would be ' + HydrusData.ConvertTimestampToPrettyPending( self._next_check_time )
else:
return HydrusData.ConvertTimestampToPrettyPending( self._next_check_time )
def GetNumURLsAndFailed( self ):
return ( self._seed_cache.GetSeedCount( CC.STATUS_UNKNOWN ), len( self._seed_cache ), self._seed_cache.GetSeedCount( CC.STATUS_ERROR ) )
def GetQueryText( self ):
return self._query
def GetSeedCache( self ):
return self._seed_cache
def IsDead( self ):
return self._status == CHECKER_STATUS_DEAD
def IsInitialSync( self ):
return self._last_check_time == 0
def IsPaused( self ):
return self._paused
def PausePlay( self ):
self._paused = not self._paused
def RegisterSyncComplete( self ):
self._last_check_time = HydrusData.GetNow()
self._check_now = False
def Reset( self ):
self._last_check_time = 0
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
self._paused = False
self._seed_cache = SeedCache()
def RetryFailures( self ):
self._seed_cache.RetryFailures()
def SetCheckNow( self, check_now ):
self._check_now = check_now
def SetPaused( self, paused ):
self._paused = paused
def SetQueryAndSeedCache( self, query, seed_cache ):
self._query = query
self._seed_cache = seed_cache
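# the death logic below is delegated to the checker options object, which
# (judging by the ClientData.CheckerOptions defaults used in the migration
# code further down) compares recent file velocity in the seed cache against
# a death_file_velocity threshold. a hypothetical caller flow, for reference:
#
#   query.UpdateNextCheckTime( checker_options )
#
#   if query.IsDead():
#       pass # stays paused until the user forces CheckNow()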
def UpdateNextCheckTime( self, checker_options ):
if self._check_now:
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
else:
if checker_options.IsDead( self._seed_cache, self._last_check_time ):
self._status = CHECKER_STATUS_DEAD
self._paused = True
self._next_check_time = checker_options.GetNextCheckTime( self._seed_cache, self._last_check_time )
def ToTuple( self ):
return ( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, self._seed_cache )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION_QUERY ] = SubscriptionQuery
class ThreadWatcherImport( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_THREAD_WATCHER_IMPORT
SERIALISABLE_NAME = 'Thread Watcher'
SERIALISABLE_VERSION = 4
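# minimum gap, in seconds, between a completed check and a forced 'check now' recheck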
MIN_CHECK_PERIOD = 30
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
new_options = HG.client_controller.new_options
tag_import_options = new_options.GetDefaultTagImportOptions( ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_THREAD_WATCHER ) )
self._thread_url = ''
self._seed_cache = SeedCache()
self._urls_to_filenames = {}
self._urls_to_md5_base64 = {}
self._checker_options = new_options.GetDefaultThreadCheckerOptions()
self._file_import_options = file_import_options
self._tag_import_options = tag_import_options
self._last_check_time = 0
self._thread_status = CHECKER_STATUS_OK
self._thread_subject = 'unknown subject'
self._next_check_time = None
self._download_control_file_set = None
self._download_control_file_clear = None
self._download_control_thread_set = None
self._download_control_thread_clear = None
self._check_now = False
self._files_paused = False
self._thread_paused = False
self._no_work_until = 0
self._no_work_until_reason = ''
self._file_velocity_status = ''
self._current_action = ''
self._watcher_status = ''
self._thread_key = HydrusData.GenerateKey()
self._lock = threading.Lock()
self._last_pubbed_page_name = ''
self._new_files_event = threading.Event()
self._new_thread_event = threading.Event()
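# threading model: Start() launches two long-running workers, one that checks
# the thread itself and one that works on queued file seeds. they coordinate
# through self._lock and the two events above; UI-facing calls just flip state
# under the lock and set an event, and a sleeping worker wakes within five
# seconds.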
def _CheckThread( self, page_key ):
error_occurred = False
watcher_status_should_stick = True
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self._thread_url )
if url_type != HC.URL_TYPE_WATCHABLE:
error_occurred = True
watcher_status = 'Did not understand the given URL as watchable!'
elif not can_parse:
error_occurred = True
watcher_status = 'Could not parse the given URL!'
if not error_occurred:
# convert to API url as appropriate
( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( self._thread_url )
if parser is None:
error_occurred = True
watcher_status = 'Could not find a parser for the given URL!'
if error_occurred:
self._FinishCheck( page_key, watcher_status, error_occurred, watcher_status_should_stick )
return
#
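# the check proper: fetch the (possibly api-converted) url with a watcher
# network job, parse the response for a thread subject and file urls, and
# fold anything new into the seed cache so the files worker can pick it up.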
with self._lock:
self._watcher_status = 'checking thread'
try:
network_job = ClientNetworkingJobs.NetworkJobWatcherPage( self._thread_key, 'GET', url_to_check )
network_job.OverrideBandwidth()
HG.client_controller.network_engine.AddJob( network_job )
with self._ThreadNetworkJobPresentationContextFactory( network_job ):
network_job.WaitUntilDone()
data = network_job.GetContent()
parsing_context = {}
parsing_context[ 'thread_url' ] = self._thread_url
parsing_context[ 'url' ] = url_to_check
all_parse_results = parser.Parse( parsing_context, data )
subject = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
if subject is None:
subject = ''
with self._lock:
self._thread_subject = subject
( num_new, num_already_in ) = UpdateSeedCacheWithAllParseResults( self._seed_cache, all_parse_results, source_url = self._thread_url, tag_import_options = self._tag_import_options )
watcher_status = 'thread checked OK - ' + HydrusData.ConvertIntToPrettyString( num_new ) + ' new urls'
watcher_status_should_stick = False
if num_new > 0:
self._new_files_event.set()
except HydrusExceptions.ShutdownException:
return
except HydrusExceptions.ParseException as e:
error_occurred = True
watcher_status = 'Was unable to parse the returned data! Full error written to log!'
HydrusData.PrintException( e )
except HydrusExceptions.NotFoundException:
error_occurred = True
with self._lock:
self._thread_status = CHECKER_STATUS_404
watcher_status = ''
except HydrusExceptions.NetworkException as e:
self._DelayWork( 4 * 3600, 'Network problem: ' + HydrusData.ToUnicode( e ) )
watcher_status = ''
HydrusData.PrintException( e )
except Exception as e:
error_occurred = True
watcher_status = HydrusData.ToUnicode( e )
HydrusData.PrintException( e )
self._FinishCheck( page_key, watcher_status, error_occurred, watcher_status_should_stick )
def _DelayWork( self, time_delta, reason ):
self._no_work_until = HydrusData.GetNow() + time_delta
self._no_work_until_reason = reason
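# the two presentation context factories below hook a live network job up to
# the page's download controls. enter/exit are marshalled to the wx main
# thread via wx.CallAfter since they fire from worker threads;
# NetworkJobPresentationContext (defined earlier in this file) is presumably
# a small context manager that wraps the two callables.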
def _FileNetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_file_set is not None:
wx.CallAfter( self._download_control_file_set, network_job )
def exit_call():
with self._lock:
if self._download_control_file_clear is not None:
wx.CallAfter( self._download_control_file_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _FinishCheck( self, page_key, watcher_status, error_occurred, watcher_status_should_stick ):
if error_occurred:
# the [DEAD] page-name stuff can overwrite the watcher status, so give the error a few seconds to display
with self._lock:
self._thread_paused = True
self._watcher_status = watcher_status
time.sleep( 5 )
with self._lock:
if self._check_now:
self._check_now = False
self._watcher_status = watcher_status
self._last_check_time = HydrusData.GetNow()
self._UpdateFileVelocityStatus()
self._UpdateNextCheckTime()
self._PublishPageName( page_key )
if not watcher_status_should_stick:
time.sleep( 5 )
with self._lock:
self._watcher_status = ''
HG.client_controller.pub( 'refresh_page_name', page_key )
def _GetSerialisableInfo( self ):
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
serialisable_checker_options = self._checker_options.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
serialisable_tag_options = self._tag_import_options.GetSerialisableTuple()
return ( self._thread_url, serialisable_seed_cache, self._urls_to_filenames, self._urls_to_md5_base64, serialisable_checker_options, serialisable_file_options, serialisable_tag_options, self._last_check_time, self._files_paused, self._thread_paused, self._thread_status, self._thread_subject, self._no_work_until, self._no_work_until_reason )
def _HasThread( self ):
return self._thread_url != ''
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( self._thread_url, serialisable_seed_cache, self._urls_to_filenames, self._urls_to_md5_base64, serialisable_checker_options, serialisable_file_options, serialisable_tag_options, self._last_check_time, self._files_paused, self._thread_paused, self._thread_status, self._thread_subject, self._no_work_until, self._no_work_until_reason ) = serialisable_info
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
self._checker_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_checker_options )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
def _PublishPageName( self, page_key ):
new_options = HG.client_controller.new_options
cannot_rename = not new_options.GetBoolean( 'permit_watchers_to_name_their_pages' )
if cannot_rename or self._thread_subject in ( '', 'unknown subject' ):
page_name = 'thread watcher'
else:
page_name = self._thread_subject
if self._thread_status == CHECKER_STATUS_404:
thread_watcher_not_found_page_string = new_options.GetNoneableString( 'thread_watcher_not_found_page_string' )
if thread_watcher_not_found_page_string is not None:
page_name = thread_watcher_not_found_page_string + ' ' + page_name
elif self._thread_status == CHECKER_STATUS_DEAD:
thread_watcher_dead_page_string = new_options.GetNoneableString( 'thread_watcher_dead_page_string' )
if thread_watcher_dead_page_string is not None:
page_name = thread_watcher_dead_page_string + ' ' + page_name
elif self._thread_paused:
thread_watcher_paused_page_string = new_options.GetNoneableString( 'thread_watcher_paused_page_string' )
if thread_watcher_paused_page_string is not None:
page_name = thread_watcher_paused_page_string + ' ' + page_name
if page_name != self._last_pubbed_page_name:
HG.client_controller.pub( 'rename_page', page_key, page_name )
self._last_pubbed_page_name = page_name
def _ThreadNetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_thread_set is not None:
wx.CallAfter( self._download_control_thread_set, network_job )
def exit_call():
with self._lock:
if self._download_control_thread_clear is not None:
wx.CallAfter( self._download_control_thread_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _UpdateFileVelocityStatus( self ):
self._file_velocity_status = self._checker_options.GetPrettyCurrentVelocity( self._seed_cache, self._last_check_time )
def _UpdateNextCheckTime( self ):
if self._check_now:
self._next_check_time = self._last_check_time + self.MIN_CHECK_PERIOD
else:
if not HydrusData.TimeHasPassed( self._no_work_until ):
self._next_check_time = self._no_work_until + 1
else:
if self._thread_status != CHECKER_STATUS_404:
if self._checker_options.IsDead( self._seed_cache, self._last_check_time ):
self._thread_status = CHECKER_STATUS_DEAD
self._thread_paused = True
self._next_check_time = self._checker_options.GetNextCheckTime( self._seed_cache, self._last_check_time )
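# serialisable version migrations: each block below upgrades one saved-tuple
# version to the next (1->2 swapped the fixed times_to_check/check_period
# schedule for checker options, 2->3 added thread status and subject, 3->4
# added the no_work_until delay fields); the serialisation system is assumed
# to chain these until SERIALISABLE_VERSION is reached.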
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( thread_url, serialisable_seed_cache, urls_to_filenames, urls_to_md5_base64, serialisable_file_options, serialisable_tag_options, times_to_check, check_period, last_check_time, paused ) = old_serialisable_info
checker_options = ClientData.CheckerOptions( intended_files_per_check = 8, never_faster_than = 300, never_slower_than = 86400, death_file_velocity = ( 1, 86400 ) )
serialisable_checker_options = checker_options.GetSerialisableTuple()
files_paused = paused
thread_paused = paused
new_serialisable_info = ( thread_url, serialisable_seed_cache, urls_to_filenames, urls_to_md5_base64, serialisable_checker_options, serialisable_file_options, serialisable_tag_options, last_check_time, files_paused, thread_paused )
return ( 2, new_serialisable_info )
if version == 2:
( thread_url, serialisable_seed_cache, urls_to_filenames, urls_to_md5_base64, serialisable_checker_options, serialisable_file_options, serialisable_tag_options, last_check_time, files_paused, thread_paused ) = old_serialisable_info
thread_status = CHECKER_STATUS_OK
thread_subject = 'unknown subject'
new_serialisable_info = ( thread_url, serialisable_seed_cache, urls_to_filenames, urls_to_md5_base64, serialisable_checker_options, serialisable_file_options, serialisable_tag_options, last_check_time, files_paused, thread_paused, thread_status, thread_subject )
return ( 3, new_serialisable_info )
if version == 3:
( thread_url, serialisable_seed_cache, urls_to_filenames, urls_to_md5_base64, serialisable_checker_options, serialisable_file_options, serialisable_tag_options, last_check_time, files_paused, thread_paused, thread_status, thread_subject ) = old_serialisable_info
no_work_until = 0
no_work_until_reason = ''
new_serialisable_info = ( thread_url, serialisable_seed_cache, urls_to_filenames, urls_to_md5_base64, serialisable_checker_options, serialisable_file_options, serialisable_tag_options, last_check_time, files_paused, thread_paused, thread_status, thread_subject, no_work_until, no_work_until_reason )
return ( 4, new_serialisable_info )
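# file work: pop the next STATUS_UNKNOWN seed, let it download and import via
# the watcher network job factory, and present it to the page if the file
# import options say so. failures are recorded on the seed itself rather than
# raised, so one bad url does not stall the rest of the queue.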
def _WorkOnFiles( self, page_key ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return
did_substantial_work = False
file_url = seed.seed_data
try:
def status_hook( text ):
with self._lock:
self._current_action = text
did_substantial_work = seed.WorkOnFileURL( self._file_import_options, status_hook, GenerateWatcherNetworkJobFactory( self._thread_key ), self._FileNetworkJobPresentationContextFactory, tag_import_options = self._tag_import_options )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
except HydrusExceptions.ShutdownException:
return
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
seed.SetStatus( status, note = note )
if isinstance( e, HydrusExceptions.CancelledException ):
time.sleep( 2 )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
seed.SetStatus( status, note = note )
except Exception as e:
status = CC.STATUS_ERROR
seed.SetStatus( status, exception = e )
time.sleep( 3 )
finally:
self._seed_cache.NotifySeedsUpdated( ( seed, ) )
HG.client_controller.pub( 'refresh_page_name', page_key )
with self._lock:
self._current_action = ''
if did_substantial_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
def _THREADWorkOnFiles( self, page_key ):
while not ( HG.view_shutdown or HG.client_controller.PageCompletelyDestroyed( page_key ) ):
no_work_to_do = self._files_paused or self._thread_url == '' or not self._seed_cache.WorkToDo()
if no_work_to_do or HG.client_controller.PageClosedButNotDestroyed( page_key ):
self._new_files_event.wait( 5 )
else:
try:
self._WorkOnFiles( page_key )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
return
self._new_files_event.clear()
def _THREADWorkOnThread( self, page_key ):
while not ( HG.view_shutdown or HG.client_controller.PageCompletelyDestroyed( page_key ) ):
with self._lock:
able_to_check = self._HasThread() and not self._thread_paused
check_due = HydrusData.TimeHasPassed( self._next_check_time )
no_delays = HydrusData.TimeHasPassed( self._no_work_until )
time_to_check = able_to_check and check_due and no_delays
if not time_to_check or HG.client_controller.PageClosedButNotDestroyed( page_key ):
self._new_thread_event.wait( 5 )
else:
try:
self._CheckThread( page_key )
time.sleep( 5 )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
return
with self._lock:
self._PublishPageName( page_key )
self._new_thread_event.clear()
def CheckNow( self ):
with self._lock:
self._check_now = True
self._thread_paused = False
self._no_work_until = 0
self._no_work_until_reason = ''
self._thread_status = CHECKER_STATUS_OK
self._UpdateNextCheckTime()
self._new_thread_event.set()
def CurrentlyWorking( self ):
with self._lock:
finished = not self._seed_cache.WorkToDo()
return not finished and not self._files_paused
def GetSeedCache( self ):
return self._seed_cache
def GetOptions( self ):
with self._lock:
return ( self._thread_url, self._file_import_options, self._tag_import_options )
def GetStatus( self ):
with self._lock:
if not HydrusData.TimeHasPassed( self._no_work_until ):
watcher_status = self._no_work_until_reason + ' - next check ' + HydrusData.ConvertTimestampToPrettyPending( self._next_check_time )
elif self._thread_status == CHECKER_STATUS_404:
watcher_status = 'Thread 404'
elif self._thread_status == CHECKER_STATUS_DEAD:
watcher_status = 'Thread dead'
else:
watcher_status = self._watcher_status
return ( self._current_action, self._files_paused, self._file_velocity_status, self._next_check_time, watcher_status, self._thread_subject, self._thread_status, self._check_now, self._thread_paused )
def GetCheckerOptions( self ):
with self._lock:
return self._checker_options
def GetValueRange( self ):
with self._lock:
return self._seed_cache.GetValueRange()
def HasThread( self ):
with self._lock:
return self._HasThread()
def IsDead( self ):
with self._lock:
return self._thread_status in ( CHECKER_STATUS_404, CHECKER_STATUS_DEAD )
def PausePlayFiles( self ):
with self._lock:
self._files_paused = not self._files_paused
self._new_files_event.set()
def PausePlayThread( self ):
with self._lock:
if self._thread_paused and self._checker_options.IsDead( self._seed_cache, self._last_check_time ):
return # thread is dead, so don't unpause until a 'check now' event
else:
self._thread_paused = not self._thread_paused
self._new_thread_event.set()
def SetDownloadControlFile( self, download_control ):
with self._lock:
self._download_control_file_set = download_control.SetNetworkJob
self._download_control_file_clear = download_control.ClearNetworkJob
def SetDownloadControlThread( self, download_control ):
with self._lock:
self._download_control_thread_set = download_control.SetNetworkJob
self._download_control_thread_clear = download_control.ClearNetworkJob
def SetFileImportOptions( self, file_import_options ):
with self._lock:
self._file_import_options = file_import_options
def SetTagImportOptions( self, tag_import_options ):
with self._lock:
self._tag_import_options = tag_import_options
def SetThreadURL( self, thread_url ):
if thread_url is None:
thread_url = ''
if thread_url != '':
thread_url = HG.client_controller.network_engine.domain_manager.NormaliseURL( thread_url )
with self._lock:
self._thread_url = thread_url
self._new_thread_event.set()
def SetCheckerOptions( self, checker_options ):
with self._lock:
self._checker_options = checker_options
self._thread_paused = False
self._UpdateNextCheckTime()
self._UpdateFileVelocityStatus()
self._new_thread_event.set()
def Start( self, page_key ):
self._UpdateNextCheckTime()
self._PublishPageName( page_key )
self._UpdateFileVelocityStatus()
HG.client_controller.CallToThreadLongRunning( self._THREADWorkOnThread, page_key )
HG.client_controller.CallToThreadLongRunning( self._THREADWorkOnFiles, page_key )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_THREAD_WATCHER_IMPORT ] = ThreadWatcherImport
class URLsImport( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_URLS_IMPORT
SERIALISABLE_NAME = 'URL Import'
SERIALISABLE_VERSION = 1
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
self._seed_cache = SeedCache()
self._file_import_options = file_import_options
self._paused = False
self._seed_cache_status = ( 'initialising', ( 0, 1 ) )
self._download_control_file_set = None
self._download_control_file_clear = None
self._lock = threading.Lock()
self._new_urls_event = threading.Event()
def _GetSerialisableInfo( self ):
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
return ( serialisable_seed_cache, serialisable_file_options, self._paused )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_seed_cache, serialisable_file_options, self._paused ) = serialisable_info
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
def _NetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_file_set is not None:
wx.CallAfter( self._download_control_file_set, network_job )
def exit_call():
with self._lock:
if self._download_control_file_clear is not None:
wx.CallAfter( self._download_control_file_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _RegenerateSeedCacheStatus( self ):
new_seed_cache_status = self._seed_cache.GetStatus()
if self._seed_cache_status != new_seed_cache_status:
self._seed_cache_status = new_seed_cache_status
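# raw url work: each pended url is classified by the domain manager and then
# dispatched; unknown/file urls are downloaded directly, post urls go through
# a parser, and gallery/watchable urls are rejected as not yet supported here.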
def _WorkOnFiles( self, page_key ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return
did_substantial_work = False
url = seed.seed_data
try:
with self._lock:
self._RegenerateSeedCacheStatus()
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( url )
if url_type in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_POST, HC.URL_TYPE_WATCHABLE ) and not can_parse:
message = 'This URL was recognised as a "' + match_name + '" but this URL class does not yet have a parsing script linked to it!'
message += os.linesep * 2
message += 'Since this URL cannot be parsed, a downloader cannot be created for it! Please check your url class links under the \'networking\' menu.'
raise HydrusExceptions.ParseException( message )
if url_type in ( HC.URL_TYPE_UNKNOWN, HC.URL_TYPE_FILE ):
did_substantial_work = self._WorkOnFilesRawURL( page_key, seed )
elif url_type == HC.URL_TYPE_POST:
did_substantial_work = self._WorkOnFilesPostURL( page_key, seed )
elif url_type in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
raise NotImplementedError( 'Unfortunately, galleries and watchable urls do not work here yet!' )
except Exception as e:
status = CC.STATUS_ERROR
seed.SetStatus( status, exception = e )
time.sleep( 3 )
finally:
self._seed_cache.NotifySeedsUpdated( ( seed, ) )
HG.client_controller.pub( 'refresh_page_name', page_key )
with self._lock:
self._RegenerateSeedCacheStatus()
if did_substantial_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
def _WorkOnFilesPostURL( self, page_key, seed ):
url = seed.seed_data
tag_import_options = HG.client_controller.network_engine.domain_manager.GetDefaultTagImportOptionsForURL( url )
status_hook = lambda s: s # do nothing for now
did_substantial_work = seed.WorkOnPostURL( self._file_import_options, tag_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._NetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
return did_substantial_work
def _WorkOnFilesRawURL( self, page_key, seed ):
status_hook = lambda s: s # do nothing for now
did_substantial_work = seed.WorkOnFileURL( self._file_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._NetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
return did_substantial_work
def _THREADWork( self, page_key ):
with self._lock:
self._RegenerateSeedCacheStatus()
while not ( HG.view_shutdown or HG.client_controller.PageCompletelyDestroyed( page_key ) ):
no_work_to_do = self._paused or not self._seed_cache.WorkToDo()
if no_work_to_do or HG.client_controller.PageClosedButNotDestroyed( page_key ):
self._new_urls_event.wait( 5 )
else:
try:
self._WorkOnFiles( page_key )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
return
self._new_urls_event.clear()
def CurrentlyWorking( self ):
with self._lock:
finished = not self._seed_cache.WorkToDo()
return not finished and not self._paused
def GetSeedCache( self ):
return self._seed_cache
def GetOptions( self ):
with self._lock:
return self._file_import_options
def GetStatus( self ):
with self._lock:
return ( self._seed_cache_status, self._paused )
def GetValueRange( self ):
with self._lock:
return self._seed_cache.GetValueRange()
def PausePlay( self ):
with self._lock:
self._paused = not self._paused
self._new_urls_event.set()
def PendURLs( self, urls ):
with self._lock:
urls = [ url for url in urls if len( url ) > 1 ] # > 1 to drop the occasional stray whitespace fragment
seeds = [ Seed( SEED_TYPE_URL, url ) for url in urls ]
if len( seeds ) > 0:
self._seed_cache.AddSeeds( seeds )
self._new_urls_event.set()
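# a rough usage sketch for this class (hypothetical caller code, not from the
# client itself):
#
#   urls_import = URLsImport()
#   urls_import.PendURLs( [ 'https://example.com/post/123' ] )
#   urls_import.Start( page_key )
#
# PendURLs drops fragments of length <= 1 and wakes the worker thread via the
# new-urls event.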
def SetDownloadControlFile( self, download_control ):
with self._lock:
self._download_control_file_set = download_control.SetNetworkJob
self._download_control_file_clear = download_control.ClearNetworkJob
def SetFileImportOptions( self, file_import_options ):
with self._lock:
self._file_import_options = file_import_options
def Start( self, page_key ):
HG.client_controller.CallToThreadLongRunning( self._THREADWork, page_key )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_URLS_IMPORT ] = URLsImport