# hydrus/include/ClientImporting.py
#
# (original file: 2990 lines, 103 KiB, Python)
import bs4
import ClientConstants as CC
import ClientData
import ClientDefaults
import ClientDownloading
import ClientFiles
import ClientThreading
import collections
import HydrusConstants as HC
import HydrusData
import HydrusExceptions
import HydrusFileHandling
import HydrusGlobals
import HydrusPaths
import HydrusSerialisable
import HydrusTags
import json
import os
import random
import shutil
import threading
import time
import traceback
import urlparse
import wx
import HydrusThreading
class GalleryImport( HydrusSerialisable.SerialisableBase ):
    """Serialisable downloader that walks gallery queries for a booru/art site and imports the files found.
    
    A daemon thread (started via Start) alternates between _WorkOnGallery (paging
    through gallery search results to collect file urls into self._seed_cache) and
    _WorkOnFiles (downloading/importing one url at a time).  All mutable state is
    guarded by self._lock (a plain, non-reentrant threading.Lock — callers must not
    nest acquisitions).  Written for Python 2, matching the rest of this file.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IMPORT
    SERIALISABLE_VERSION = 1
    
    def __init__( self, gallery_identifier = None ):
        """Initialise an empty import queue for the given site.
        
        gallery_identifier: a ClientDownloading.GalleryIdentifier; defaults to Deviant Art.
        """
        
        if gallery_identifier is None:
            
            gallery_identifier = ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEVIANT_ART )
            
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._gallery_identifier = gallery_identifier
        
        # a site can expose several 'streams' (e.g. artist gallery + scraps) per query
        self._gallery_stream_identifiers = ClientDownloading.GetGalleryStreamIdentifiers( self._gallery_identifier )
        
        # progress state for the query currently being paged through
        self._current_query = None
        self._current_query_num_urls = 0
        
        # progress state for the stream currently being paged through
        self._current_gallery_stream_identifier = None
        self._current_gallery_stream_identifier_page_index = 0
        self._current_gallery_stream_identifier_found_urls = set()
        self._pending_gallery_stream_identifiers = []
        
        self._pending_queries = []
        self._get_tags_if_redundant = True
        self._file_limit = HC.options[ 'gallery_file_limit' ]  # None means unlimited
        self._gallery_paused = False
        self._files_paused = False
        
        self._import_file_options = ClientDefaults.GetDefaultImportFileOptions()
        
        new_options = HydrusGlobals.client_controller.GetNewOptions()
        
        self._import_tag_options = new_options.GetDefaultImportTagOptions( self._gallery_identifier )
        
        self._seed_cache = SeedCache()
        
        self._lock = threading.Lock()
        
        # transient ui/status state, not serialised
        self._gallery = None
        self._gallery_status = 'ready to start'
        self._seed_cache_status = ( 'initialising', ( 0, 1 ) )
        self._file_download_hook = None
        
    
    def _GetSerialisableInfo( self ):
        """Snapshot this object into a plain tuple for the serialisation system.
        
        Tuple order here must stay in sync with _InitialiseFromSerialisableInfo.
        """
        
        serialisable_gallery_identifier = self._gallery_identifier.GetSerialisableTuple()
        serialisable_gallery_stream_identifiers = [ gallery_stream_identifier.GetSerialisableTuple() for gallery_stream_identifier in self._gallery_stream_identifiers ]
        
        if self._current_gallery_stream_identifier is None:
            
            serialisable_current_gallery_stream_identifier = None
            
        else:
            
            serialisable_current_gallery_stream_identifier = self._current_gallery_stream_identifier.GetSerialisableTuple()
            
        
        serialisable_current_gallery_stream_identifier_found_urls = list( self._current_gallery_stream_identifier_found_urls )
        
        serialisable_pending_gallery_stream_identifiers = [ pending_gallery_stream_identifier.GetSerialisableTuple() for pending_gallery_stream_identifier in self._pending_gallery_stream_identifiers ]
        
        serialisable_file_options = self._import_file_options.GetSerialisableTuple()
        serialisable_tag_options = self._import_tag_options.GetSerialisableTuple()
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        
        serialisable_current_query_stuff = ( self._current_query, self._current_query_num_urls, serialisable_current_gallery_stream_identifier, self._current_gallery_stream_identifier_page_index, serialisable_current_gallery_stream_identifier_found_urls, serialisable_pending_gallery_stream_identifiers )
        
        return ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, self._pending_queries, self._get_tags_if_redundant, self._file_limit, self._gallery_paused, self._files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Rebuild state from the tuple produced by _GetSerialisableInfo."""
        
        ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, self._pending_queries, self._get_tags_if_redundant, self._file_limit, self._gallery_paused, self._files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache ) = serialisable_info
        
        # NOTE(review): this local is named singular but holds the *list* of pending
        # stream identifiers serialised above — consistent within this method, just
        # misleadingly named.
        ( self._current_query, self._current_query_num_urls, serialisable_current_gallery_stream_identifier, self._current_gallery_stream_identifier_page_index, serialisable_current_gallery_stream_identifier_found_urls, serialisable_pending_gallery_stream_identifier ) = serialisable_current_query_stuff
        
        self._gallery_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_identifier )
        
        self._gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_stream_identifier ) for serialisable_gallery_stream_identifier in serialisable_gallery_stream_identifiers ]
        
        if serialisable_current_gallery_stream_identifier is None:
            
            self._current_gallery_stream_identifier = None
            
        else:
            
            self._current_gallery_stream_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_current_gallery_stream_identifier )
            
        
        self._current_gallery_stream_identifier_found_urls = set( serialisable_current_gallery_stream_identifier_found_urls )
        
        self._pending_gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_pending_gallery_stream_identifier ) for serialisable_pending_gallery_stream_identifier in serialisable_pending_gallery_stream_identifier ]
        
        self._import_file_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
        self._import_tag_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
        
    
    def _RegenerateSeedCacheStatus( self, page_key ):
        """Refresh the cached seed-cache status and notify the page if it changed.
        
        Caller must hold self._lock.
        """
        
        new_seed_cache_status = self._seed_cache.GetStatus()
        
        if self._seed_cache_status != new_seed_cache_status:
            
            self._seed_cache_status = new_seed_cache_status
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
        
    
    def _SetGalleryStatus( self, page_key, text ):
        """Update the gallery status line and notify the page if it changed.
        
        Caller must hold self._lock.
        """
        
        if self._gallery_status != text:
            
            self._gallery_status = text
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
        
    
    def _WorkOnFiles( self, page_key ):
        """Pull the next unknown url from the seed cache and download/import it.
        
        Skips the download when the db already knows the url (redundant/deleted),
        optionally fetching just the tags for redundant files.  Waits politely
        after any network hit so we do not hammer the site.
        """
        
        if self._files_paused:
            
            return
            
        
        do_wait = True
        
        url = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if url is None:
            
            return
            
        
        gallery = ClientDownloading.GetGallery( self._gallery_identifier )
        
        try:
            
            # hash here shadows the builtin, but renaming would be a code change
            ( status, hash ) = HydrusGlobals.client_controller.Read( 'url_status', url )
            
            if status == CC.STATUS_DELETED:
                
                # user may want previously deleted files back
                if not self._import_file_options.GetExcludeDeleted():
                    
                    status = CC.STATUS_NEW
                    
                
            
            tags = []
            
            if status == CC.STATUS_REDUNDANT:
                
                # file already in db; optionally hit the site for tags only
                if self._get_tags_if_redundant and self._import_tag_options.ShouldFetchTags():
                    
                    tags = gallery.GetTags( url, report_hooks = [ self._file_download_hook ] )
                    
                else:
                    
                    do_wait = False  # no network request happened
                    
                
            elif status == CC.STATUS_NEW:
                
                ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
                
                try:
                    
                    # status: x_out_of_y + 'downloading file'
                    
                    if self._import_tag_options.ShouldFetchTags():
                        
                        tags = gallery.GetFileAndTags( temp_path, url, report_hooks = [ self._file_download_hook ] )
                        
                    else:
                        
                        gallery.GetFile( temp_path, url, report_hooks = [ self._file_download_hook ] )
                        
                    
                    client_files_manager = HydrusGlobals.client_controller.GetClientFilesManager()
                    
                    ( status, hash ) = client_files_manager.ImportFile( temp_path, import_file_options = self._import_file_options, url = url )
                    
                finally:
                    
                    HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                    
                
            else:
                
                do_wait = False
                
            
            self._seed_cache.UpdateSeedStatus( url, status )
            
            if status in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
                
                service_keys_to_content_updates = self._import_tag_options.GetServiceKeysToContentUpdates( hash, tags )
                
                if len( service_keys_to_content_updates ) > 0:
                    
                    HydrusGlobals.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                    
                
                ( media_result, ) = HydrusGlobals.client_controller.Read( 'media_results', ( hash, ) )
                
                HydrusGlobals.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
                
            
        except HydrusExceptions.MimeException as e:
            
            status = CC.STATUS_UNINTERESTING_MIME
            
            self._seed_cache.UpdateSeedStatus( url, status )
            
        except Exception as e:
            
            status = CC.STATUS_FAILED
            
            self._seed_cache.UpdateSeedStatus( url, status, exception = e )
            
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        if do_wait:
            
            ClientData.WaitPolitely( page_key )
            
        
    
    def _WorkOnGallery( self, page_key ):
        """Fetch one gallery page for the current query/stream and queue any new urls.
        
        Advances through pending queries and their streams; a stream finishes when a
        page yields nothing new or the site says there are no more pages.  Respects
        self._file_limit across the whole query.
        """
        
        with self._lock:
            
            if self._gallery_paused:
                
                self._SetGalleryStatus( page_key, 'paused' )
                
                return
                
            
            if self._current_query is None:
                
                if len( self._pending_queries ) == 0:
                    
                    self._SetGalleryStatus( page_key, '' )
                    
                    return
                    
                else:
                    
                    # start the next query fresh
                    self._current_query = self._pending_queries.pop( 0 )
                    self._current_query_num_urls = 0
                    
                    self._current_gallery_stream_identifier = None
                    self._pending_gallery_stream_identifiers = list( self._gallery_stream_identifiers )
                    
                
            
            if self._current_gallery_stream_identifier is None:
                
                if len( self._pending_gallery_stream_identifiers ) == 0:
                    
                    # all streams exhausted: the query is done
                    self._SetGalleryStatus( page_key, self._current_query + ' produced ' + HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls' )
                    
                    self._current_query = None
                    
                    return
                    
                else:
                    
                    self._current_gallery_stream_identifier = self._pending_gallery_stream_identifiers.pop( 0 )
                    
                    self._current_gallery_stream_identifier_page_index = 0
                    self._current_gallery_stream_identifier_found_urls = set()
                    
                
            
            gallery = ClientDownloading.GetGallery( self._current_gallery_stream_identifier )
            
            query = self._current_query
            page_index = self._current_gallery_stream_identifier_page_index
            
            self._SetGalleryStatus( page_key, HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls found, now checking page ' + HydrusData.ConvertIntToPrettyString( self._current_gallery_stream_identifier_page_index + 1 ) )
            
        
        # NOTE(review): error_occured is never read (and is a typo for 'occurred')
        error_occured = False
        
        try:
            
            # network hit happens outside the lock
            ( page_of_urls, definitely_no_more_pages ) = gallery.GetPage( query, page_index )
            
            with self._lock:
                
                no_urls_found = len( page_of_urls ) == 0
                
                # a page of entirely already-seen urls means we have wrapped around
                no_new_urls = len( self._current_gallery_stream_identifier_found_urls.intersection( page_of_urls ) ) == len( page_of_urls )
                
                if definitely_no_more_pages or no_urls_found or no_new_urls:
                    
                    self._current_gallery_stream_identifier = None
                    
                else:
                    
                    self._current_gallery_stream_identifier_page_index += 1
                    self._current_gallery_stream_identifier_found_urls.update( page_of_urls )
                    
                
            
            for url in page_of_urls:
                
                if not self._seed_cache.HasSeed( url ):
                    
                    with self._lock:
                        
                        # enforce the per-query file limit; abandon remaining streams too
                        if self._file_limit is not None and self._current_query_num_urls + 1 > self._file_limit:
                            
                            self._current_gallery_stream_identifier = None
                            
                            self._pending_gallery_stream_identifiers = []
                            
                            break
                            
                        
                        self._current_query_num_urls += 1
                        
                    
                    self._seed_cache.AddSeed( url )
                    
                
            
        except Exception as e:
            
            if isinstance( e, HydrusExceptions.NotFoundException ):
                
                text = 'Gallery 404'
                
            else:
                
                text = str( e )
                
                traceback.print_exc()
                
            
            with self._lock:
                
                # give up on this stream and surface the error to the user
                self._current_gallery_stream_identifier = None
                
                self._SetGalleryStatus( page_key, text )
                
            
            time.sleep( 5 )
            
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
            self._SetGalleryStatus( page_key, HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls found so far for ' + query )
            
        
        ClientData.WaitPolitely( page_key )
        
    
    def _THREADWork( self, page_key ):
        """Daemon loop: alternate gallery paging and file downloading until shutdown or page deletion."""
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        while not ( HydrusGlobals.view_shutdown or HydrusGlobals.client_controller.PageDeleted( page_key ) ):
            
            if HydrusGlobals.client_controller.PageHidden( page_key ):
                
                time.sleep( 0.1 )
                
            else:
                
                try:
                    
                    if not self._gallery_paused:
                        
                        self._WorkOnGallery( page_key )
                        
                    
                    if not self._files_paused:
                        
                        self._WorkOnFiles( page_key )
                        
                    
                    time.sleep( 0.1 )
                    
                    HydrusGlobals.client_controller.WaitUntilPubSubsEmpty()
                    
                except Exception as e:
                    
                    HydrusData.ShowException( e )
                    
                    return
                    
                
            
        
    
    def AdvanceQuery( self, query ):
        """Move a pending query one slot earlier in the queue."""
        
        with self._lock:
            
            if query in self._pending_queries:
                
                index = self._pending_queries.index( query )
                
                if index - 1 >= 0:
                    
                    self._pending_queries.remove( query )
                    
                    self._pending_queries.insert( index - 1, query )
                    
                
            
        
    
    def DelayQuery( self, query ):
        """Move a pending query one slot later in the queue."""
        
        with self._lock:
            
            if query in self._pending_queries:
                
                index = self._pending_queries.index( query )
                
                if index + 1 < len( self._pending_queries ):
                    
                    self._pending_queries.remove( query )
                    
                    self._pending_queries.insert( index + 1, query )
                    
                
            
        
    
    def DeleteQuery( self, query ):
        """Remove a query from the pending queue."""
        
        with self._lock:
            
            if query in self._pending_queries:
                
                self._pending_queries.remove( query )
                
            
        
    
    def FinishCurrentQuery( self ):
        """Abandon the current query and make sure the gallery is unpaused."""
        
        with self._lock:
            
            self._current_query = None
            self._gallery_paused = False
            
        
    
    def GetGalleryIdentifier( self ):
        """Return the site identifier this importer was created for."""
        
        return self._gallery_identifier
        
    
    def GetOptions( self ):
        """Return ( import_file_options, import_tag_options, get_tags_if_redundant, file_limit )."""
        
        with self._lock:
            
            return ( self._import_file_options, self._import_tag_options, self._get_tags_if_redundant, self._file_limit )
            
        
    
    def GetSeedCache( self ):
        """Return the underlying SeedCache of queued/processed urls."""
        
        return self._seed_cache
        
    
    def GetStatus( self ):
        """Return ui status: ( pending_queries, gallery_status, seed_cache_status, files_paused, gallery_paused, cancellable )."""
        
        with self._lock:
            
            cancellable = self._current_query is not None
            
            return ( list( self._pending_queries ), self._gallery_status, self._seed_cache_status, self._files_paused, self._gallery_paused, cancellable )
            
        
    
    def PausePlayFiles( self ):
        """Toggle the file-download side of the worker."""
        
        with self._lock:
            
            self._files_paused = not self._files_paused
            
        
    
    def PausePlayGallery( self ):
        """Toggle the gallery-paging side of the worker."""
        
        with self._lock:
            
            self._gallery_paused = not self._gallery_paused
            
        
    
    def PendQuery( self, query ):
        """Append a new query to the pending queue (no duplicates)."""
        
        with self._lock:
            
            if query not in self._pending_queries:
                
                self._pending_queries.append( query )
                
            
        
    
    def SetDownloadHook( self, hook ):
        """Set the progress callback handed to gallery download calls."""
        
        with self._lock:
            
            self._file_download_hook = hook
            
        
    
    def SetFileLimit( self, file_limit ):
        """Set the per-query url cap (None for unlimited)."""
        
        with self._lock:
            
            self._file_limit = file_limit
            
        
    
    def SetGetTagsIfRedundant( self, get_tags_if_redundant ):
        """Set whether to fetch tags for files the db already has."""
        
        with self._lock:
            
            self._get_tags_if_redundant = get_tags_if_redundant
            
        
    
    def SetImportFileOptions( self, import_file_options ):
        """Replace the file import options."""
        
        with self._lock:
            
            self._import_file_options = import_file_options
            
        
    
    def SetImportTagOptions( self, import_tag_options ):
        """Replace the tag import options."""
        
        with self._lock:
            
            self._import_tag_options = import_tag_options
            
        
    
    def Start( self, page_key ):
        """Spawn the worker thread for this importer's page."""
        
        threading.Thread( target = self._THREADWork, args = ( page_key, ) ).start()
        
    
# register with the serialisation system so stored tuples deserialise back into GalleryImport
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IMPORT ] = GalleryImport
class HDDImport( HydrusSerialisable.SerialisableBase ):
    """Imports a fixed list of local file paths, optionally tagging each and deleting the originals.
    
    A daemon thread (see Start) processes one path at a time from self._paths_cache.
    Written for Python 2 (service keys are hex-codec'd via str.encode( 'hex' )).
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_HDD_IMPORT
    SERIALISABLE_VERSION = 1
    
    def __init__( self, paths = None, import_file_options = None, paths_to_tags = None, delete_after_success = None ):
        """Queue the given paths for import.
        
        paths: iterable of filesystem paths, or None when about to deserialise.
        paths_to_tags: dict of path -> { service_key : tags } applied on success.
        delete_after_success: delete the source file once imported.
        """
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        if paths is None:
            
            # deserialisation path: _InitialiseFromSerialisableInfo will fill this in
            self._paths_cache = None
            
        else:
            
            self._paths_cache = SeedCache()
            
            for path in paths:
                
                self._paths_cache.AddSeed( path )
                
            
        
        self._import_file_options = import_file_options
        self._paths_to_tags = paths_to_tags
        self._delete_after_success = delete_after_success
        self._paused = False
        
        # transient ui state, not serialised
        self._seed_cache_status = ( 'initialising', ( 0, 1 ) )
        
        self._lock = threading.Lock()
        
    
    def _GetSerialisableInfo( self ):
        """Snapshot state for serialisation; order must match _InitialiseFromSerialisableInfo."""
        
        serialisable_url_cache = self._paths_cache.GetSerialisableTuple()
        
        serialisable_options = self._import_file_options.GetSerialisableTuple()
        
        # service keys are raw bytes, so hex-encode them for storage (Python 2 codec)
        serialisable_paths_to_tags = { path : { service_key.encode( 'hex' ) : tags for ( service_key, tags ) in service_keys_to_tags.items() } for ( path, service_keys_to_tags ) in self._paths_to_tags.items() }
        
        return ( serialisable_url_cache, serialisable_options, serialisable_paths_to_tags, self._delete_after_success, self._paused )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Rebuild state from the tuple produced by _GetSerialisableInfo."""
        
        ( serialisable_url_cache, serialisable_options, serialisable_paths_to_tags, self._delete_after_success, self._paused ) = serialisable_info
        
        self._paths_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_url_cache )
        
        self._import_file_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_options )
        
        self._paths_to_tags = { path : { service_key.decode( 'hex' ) : tags for ( service_key, tags ) in service_keys_to_tags.items() } for ( path, service_keys_to_tags ) in serialisable_paths_to_tags.items() }
        
    
    def _RegenerateSeedCacheStatus( self, page_key ):
        """Refresh the cached path-cache status and notify the page if it changed.
        
        Caller must hold self._lock.
        """
        
        new_seed_cache_status = self._paths_cache.GetStatus()
        
        if self._seed_cache_status != new_seed_cache_status:
            
            self._seed_cache_status = new_seed_cache_status
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
        
    
    def _WorkOnFiles( self, page_key ):
        """Copy the next unknown path to a temp file, import it, tag it, and optionally delete the source."""
        
        path = self._paths_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if path is None:
            
            # nothing left to do; idle politely
            time.sleep( 1 )
            
            return
            
        
        with self._lock:
            
            if path in self._paths_to_tags:
                
                service_keys_to_tags = self._paths_to_tags[ path ]
                
            else:
                
                service_keys_to_tags = {}
                
            
        
        try:
            
            ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
            
            try:
                
                # work on a copy so the original is untouched until we know the import worked
                HydrusPaths.MirrorFile( path, temp_path )
                
                client_files_manager = HydrusGlobals.client_controller.GetClientFilesManager()
                
                ( status, hash ) = client_files_manager.ImportFile( temp_path, import_file_options = self._import_file_options )
                
            finally:
                
                HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                
            
            self._paths_cache.UpdateSeedStatus( path, status )
            
            if status in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
                
                service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( { hash }, service_keys_to_tags )
                
                if len( service_keys_to_content_updates ) > 0:
                    
                    HydrusGlobals.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                    
                
                ( media_result, ) = HydrusGlobals.client_controller.Read( 'media_results', ( hash, ) )
                
                HydrusGlobals.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
                
                if self._delete_after_success:
                    
                    try:
                        
                        ClientData.DeletePath( path )
                        
                    except Exception as e:
                        
                        # deletion failure is non-fatal; the import itself succeeded
                        HydrusData.ShowText( 'While attempting to delete ' + path + ', the following error occured:' )
                        HydrusData.ShowException( e )
                        
                    
                
            
        except HydrusExceptions.MimeException as e:
            
            status = CC.STATUS_UNINTERESTING_MIME
            
            self._paths_cache.UpdateSeedStatus( path, status )
            
        except Exception as e:
            
            status = CC.STATUS_FAILED
            
            self._paths_cache.UpdateSeedStatus( path, status, exception = e )
            
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        HydrusGlobals.client_controller.pub( 'update_status', page_key )
        
    
    def _THREADWork( self, page_key ):
        """Daemon loop: import paths one at a time until shutdown or page deletion."""
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        HydrusGlobals.client_controller.pub( 'update_status', page_key )
        
        while not ( HydrusGlobals.view_shutdown or HydrusGlobals.client_controller.PageDeleted( page_key ) ):
            
            if self._paused or HydrusGlobals.client_controller.PageHidden( page_key ):
                
                time.sleep( 0.1 )
                
            else:
                
                try:
                    
                    self._WorkOnFiles( page_key )
                    
                    HydrusGlobals.client_controller.WaitUntilPubSubsEmpty()
                    
                except Exception as e:
                    
                    HydrusData.ShowException( e )
                    
                    return
                    
                
            
        
    
    def GetSeedCache( self ):
        """Return the underlying SeedCache of queued/processed paths."""
        
        return self._paths_cache
        
    
    def GetStatus( self ):
        """Return ( seed_cache_status, paused )."""
        
        with self._lock:
            
            return ( self._seed_cache_status, self._paused )
            
        
    
    def PausePlay( self ):
        """Toggle the worker pause flag."""
        
        with self._lock:
            
            self._paused = not self._paused
            
        
    
    def Start( self, page_key ):
        """Spawn the worker thread for this importer's page."""
        
        threading.Thread( target = self._THREADWork, args = ( page_key, ) ).start()
        
    
# register with the serialisation system so stored tuples deserialise back into HDDImport
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_HDD_IMPORT ] = HDDImport
class ImportFolder( HydrusSerialisable.SerialisableBaseNamed ):
    """A named folder the client periodically scans, importing new files and optionally tagging them.
    
    DoWork (called by a maintenance daemon) lists the folder, imports anything new,
    applies sidecar-.txt tags, then _ActionPaths disposes of each source file
    according to the per-status action (delete / move / ignore).  Written for
    Python 2, matching the rest of this file.
    
    Fixes in this revision:
    * _ActionPaths compared `status` (an import status) against the action constant
      CC.IMPORT_FOLDER_IGNORE; the sibling branches test `action`, so the ignore
      branch was unreachable.  Behaviour is unchanged (ignore is a no-op) but the
      comparison is now correct.
    * DoWork captured the failure traceback into `error_text` and never printed it;
      it is now included in the log output.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_IMPORT_FOLDER
    SERIALISABLE_VERSION = 3
    
    def __init__( self, name, path = '', import_file_options = None, import_tag_options = None, txt_parse_tag_service_keys = None, mimes = None, actions = None, action_locations = None, period = 3600, open_popup = True ):
        """Create an import folder watching the given path.
        
        name: unique name for this folder (SerialisableBaseNamed).
        path: directory to scan.
        txt_parse_tag_service_keys: tag services that receive tags parsed from sidecar .txt files.
        mimes: importable mimes; anything else is marked uninteresting.
        actions: dict of import status -> CC.IMPORT_FOLDER_* action.
        action_locations: dict of import status -> destination dir, used by the MOVE action.
        period: seconds between scans.
        open_popup: show a popup listing successful imports after each scan.
        """
        
        if mimes is None:
            
            mimes = HC.ALLOWED_MIMES
            
        
        if import_file_options is None:
            
            import_file_options = ClientDefaults.GetDefaultImportFileOptions()
            
        
        if import_tag_options is None:
            
            new_options = HydrusGlobals.client_controller.GetNewOptions()
            
            import_tag_options = new_options.GetDefaultImportTagOptions( ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEFAULT ) )
            
        
        if txt_parse_tag_service_keys is None:
            
            txt_parse_tag_service_keys = []
            
        
        if actions is None:
            
            # default: clean up everything that was dealt with; keep failures in place for inspection
            actions = {}
            
            actions[ CC.STATUS_SUCCESSFUL ] = CC.IMPORT_FOLDER_DELETE
            actions[ CC.STATUS_REDUNDANT ] = CC.IMPORT_FOLDER_DELETE
            actions[ CC.STATUS_DELETED ] = CC.IMPORT_FOLDER_DELETE
            actions[ CC.STATUS_FAILED ] = CC.IMPORT_FOLDER_IGNORE
            
        
        if action_locations is None:
            
            action_locations = {}
            
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        self._path = path
        self._mimes = mimes
        self._import_file_options = import_file_options
        self._import_tag_options = import_tag_options
        self._txt_parse_tag_service_keys = txt_parse_tag_service_keys
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._open_popup = open_popup
        
        self._path_cache = SeedCache()
        self._last_checked = 0
        self._paused = False
        
    
    def _ActionPaths( self ):
        """Dispose of processed source files per the configured per-status action.
        
        Any filesystem error pauses the whole folder so we do not spam failures
        on every subsequent scan.
        """
        
        for status in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT, CC.STATUS_DELETED, CC.STATUS_FAILED ):
            
            action = self._actions[ status ]
            
            if action == CC.IMPORT_FOLDER_DELETE:
                
                while True:
                    
                    path = self._path_cache.GetNextSeed( status )
                    
                    if path is None or HydrusGlobals.view_shutdown:
                        
                        break
                        
                    
                    try:
                        
                        if os.path.exists( path ):
                            
                            ClientData.DeletePath( path )
                            
                        
                        # also remove any tag sidecar
                        txt_path = path + '.txt'
                        
                        if os.path.exists( txt_path ):
                            
                            ClientData.DeletePath( txt_path )
                            
                        
                        self._path_cache.RemoveSeed( path )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'Import folder tried to delete ' + path + ', but could not:' )
                        HydrusData.ShowException( e )
                        HydrusData.ShowText( 'Import folder has been paused.' )
                        
                        self._paused = True
                        
                        return
                        
                    
                
            elif action == CC.IMPORT_FOLDER_MOVE:
                
                while True:
                    
                    path = self._path_cache.GetNextSeed( status )
                    
                    if path is None or HydrusGlobals.view_shutdown:
                        
                        break
                        
                    
                    try:
                        
                        if os.path.exists( path ):
                            
                            dest_dir = self._action_locations[ status ]
                            
                            filename = os.path.basename( path )
                            
                            dest_path = os.path.join( dest_dir, filename )
                            
                            # do not overwrite anything already at the destination
                            dest_path = HydrusPaths.AppendPathUntilNoConflicts( dest_path )
                            
                            HydrusPaths.MergeFile( path, dest_path )
                            
                        
                        # move the tag sidecar along with the file
                        txt_path = path + '.txt'
                        
                        if os.path.exists( txt_path ):
                            
                            dest_dir = self._action_locations[ status ]
                            
                            txt_filename = os.path.basename( txt_path )
                            
                            txt_dest_path = os.path.join( dest_dir, txt_filename )
                            
                            txt_dest_path = HydrusPaths.AppendPathUntilNoConflicts( txt_dest_path )
                            
                            HydrusPaths.MergeFile( txt_path, txt_dest_path )
                            
                        
                        self._path_cache.RemoveSeed( path )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'Import folder tried to move ' + path + ', but could not:' )
                        HydrusData.ShowException( e )
                        HydrusData.ShowText( 'Import folder has been paused.' )
                        
                        self._paused = True
                        
                        return
                        
                    
                
            elif action == CC.IMPORT_FOLDER_IGNORE:
                
                # fix: this previously tested `status == CC.IMPORT_FOLDER_IGNORE`,
                # which could never be true; ignore means leave the file where it is
                pass
                
            
        
    
    def _GetSerialisableInfo( self ):
        """Snapshot state for serialisation; order must match _InitialiseFromSerialisableInfo."""
        
        serialisable_import_file_options = self._import_file_options.GetSerialisableTuple()
        serialisable_import_tag_options = self._import_tag_options.GetSerialisableTuple()
        
        # service keys are raw bytes, so hex-encode them for storage (Python 2 codec)
        serialisable_txt_parse_tag_service_keys = [ service_key.encode( 'hex' ) for service_key in self._txt_parse_tag_service_keys ]
        
        serialisable_path_cache = self._path_cache.GetSerialisableTuple()
        
        # json turns int dict keys to strings, so store the dicts as pair lists
        action_pairs = self._actions.items()
        action_location_pairs = self._action_locations.items()
        
        return ( self._path, self._mimes, serialisable_import_file_options, serialisable_import_tag_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, self._period, self._open_popup, serialisable_path_cache, self._last_checked, self._paused )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Rebuild state from the tuple produced by _GetSerialisableInfo."""
        
        ( self._path, self._mimes, serialisable_import_file_options, serialisable_import_tag_options, serialisable_txt_parse_service_keys, action_pairs, action_location_pairs, self._period, self._open_popup, serialisable_path_cache, self._last_checked, self._paused ) = serialisable_info
        
        self._actions = dict( action_pairs )
        self._action_locations = dict( action_location_pairs )
        
        self._import_file_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_import_file_options )
        self._import_tag_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_import_tag_options )
        
        self._txt_parse_tag_service_keys = [ service_key.decode( 'hex' ) for service_key in serialisable_txt_parse_service_keys ]
        
        self._path_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_path_cache )
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Migrate serialised tuples from older versions up to the current layout.
        
        v1 -> v2: a single explicit tag became full import tag options.
        v2 -> v3: added txt sidecar tag service keys.
        """
        
        if version == 1:
            
            ( path, mimes, serialisable_import_file_options, action_pairs, action_location_pairs, period, open_popup, tag, serialisable_path_cache, last_checked, paused ) = old_serialisable_info
            
            service_keys_to_explicit_tags = {}
            
            if tag is not None:
                
                service_keys_to_explicit_tags[ CC.LOCAL_TAG_SERVICE_KEY ] = { tag }
                
            
            import_tag_options = ClientData.ImportTagOptions( service_keys_to_explicit_tags = service_keys_to_explicit_tags )
            
            serialisable_import_tag_options = import_tag_options.GetSerialisableTuple()
            
            new_serialisable_info = ( path, mimes, serialisable_import_file_options, serialisable_import_tag_options, action_pairs, action_location_pairs, period, open_popup, serialisable_path_cache, last_checked, paused )
            
            return ( 2, new_serialisable_info )
            
        
        if version == 2:
            
            ( path, mimes, serialisable_import_file_options, serialisable_import_tag_options, action_pairs, action_location_pairs, period, open_popup, serialisable_path_cache, last_checked, paused ) = old_serialisable_info
            
            serialisable_txt_parse_tag_service_keys = []
            
            new_serialisable_info = ( path, mimes, serialisable_import_file_options, serialisable_import_tag_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_path_cache, last_checked, paused )
            
            return ( 3, new_serialisable_info )
            
        
    
    def DoWork( self ):
        """Scan the folder if due: queue new paths, import them, tag them, then action the sources.
        
        No-op when paused, when the period has not elapsed, or at shutdown.
        Persists itself at the end so progress survives a restart.
        """
        
        if HydrusGlobals.view_shutdown:
            
            return
            
        
        if not self._paused and HydrusData.TimeHasPassed( self._last_checked + self._period ):
            
            if os.path.exists( self._path ) and os.path.isdir( self._path ):
                
                filenames = os.listdir( HydrusData.ToUnicode( self._path ) )
                
                raw_paths = [ os.path.join( self._path, filename ) for filename in filenames ]
                
                all_paths = ClientFiles.GetAllPaths( raw_paths )
                
                # skip anything another process may still be writing
                all_paths = HydrusPaths.FilterFreePaths( all_paths )
                
                for path in all_paths:
                    
                    if path.endswith( '.txt' ):
                        
                        continue  # sidecars are consumed alongside their file, not imported
                        
                    
                    if not self._path_cache.HasSeed( path ):
                        
                        self._path_cache.AddSeed( path )
                        
                    
                
                successful_hashes = set()
                
                while True:
                    
                    path = self._path_cache.GetNextSeed( CC.STATUS_UNKNOWN )
                    
                    if path is None or HydrusGlobals.view_shutdown:
                        
                        break
                        
                    
                    try:
                        
                        mime = HydrusFileHandling.GetMime( path )
                        
                        if mime in self._mimes:
                            
                            ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
                            
                            try:
                                
                                # import from a copy so the original is untouched until actioned
                                HydrusPaths.MirrorFile( path, temp_path )
                                
                                client_files_manager = HydrusGlobals.client_controller.GetClientFilesManager()
                                
                                ( status, hash ) = client_files_manager.ImportFile( temp_path, import_file_options = self._import_file_options )
                                
                            finally:
                                
                                HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                                
                            
                            self._path_cache.UpdateSeedStatus( path, status )
                            
                            if status in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
                                
                                service_keys_to_content_updates = self._import_tag_options.GetServiceKeysToContentUpdates( hash, set() )
                                
                                if len( service_keys_to_content_updates ) > 0:
                                    
                                    HydrusGlobals.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                                    
                                
                                txt_path = path + '.txt'
                                
                                if len( self._txt_parse_tag_service_keys ) > 0 and os.path.exists( txt_path ):
                                    
                                    try:
                                        
                                        with open( txt_path, 'rb' ) as f:
                                            
                                            txt_tags_string = f.read()
                                            
                                        
                                        txt_tags = [ HydrusData.ToUnicode( tag ) for tag in HydrusData.SplitByLinesep( txt_tags_string ) ]
                                        
                                        txt_tags = HydrusTags.CleanTags( txt_tags )
                                        
                                        service_keys_to_tags = { service_key : txt_tags for service_key in self._txt_parse_tag_service_keys }
                                        
                                        service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( { hash }, service_keys_to_tags )
                                        
                                        if len( service_keys_to_content_updates ) > 0:
                                            
                                            HydrusGlobals.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                                            
                                        
                                    except Exception as e:
                                        
                                        # bad sidecar is non-fatal; the file import itself succeeded
                                        HydrusData.ShowText( 'Trying to load tags from the .txt file "' + txt_path + '" in the import folder "' + self._name + '" threw an error!' )
                                        HydrusData.ShowException( e )
                                        
                                    
                                
                                if status == CC.STATUS_SUCCESSFUL:
                                    
                                    successful_hashes.add( hash )
                                    
                                
                            
                        else:
                            
                            self._path_cache.UpdateSeedStatus( path, CC.STATUS_UNINTERESTING_MIME )
                            
                        
                    except Exception as e:
                        
                        error_text = traceback.format_exc()
                        
                        HydrusData.Print( 'A file failed to import from import folder ' + self._name + ':' )
                        HydrusData.Print( error_text )  # fix: the captured traceback was previously discarded
                        
                        self._path_cache.UpdateSeedStatus( path, CC.STATUS_FAILED, exception = e )
                        
                    
                
                if len( successful_hashes ) > 0:
                    
                    HydrusData.Print( 'Import folder ' + self._name + ' imported ' + HydrusData.ConvertIntToPrettyString( len( successful_hashes ) ) + ' files.' )
                    
                    if self._open_popup:
                        
                        job_key = ClientThreading.JobKey()
                        
                        job_key.SetVariable( 'popup_title', 'import folder - ' + self._name )
                        job_key.SetVariable( 'popup_files', successful_hashes )
                        
                        HydrusGlobals.client_controller.pub( 'message', job_key )
                        
                    
                
                self._ActionPaths()
                
            
            self._last_checked = HydrusData.GetNow()
            
            HydrusGlobals.client_controller.WriteSynchronous( 'serialisable', self )
            
        
    
    def GetSeedCache( self ):
        """Return the underlying SeedCache of queued/processed paths."""
        
        return self._path_cache
        
    
    def ToListBoxTuple( self ):
        """Return ( name, path, period ) for ui list display."""
        
        return ( self._name, self._path, self._period )
        
    
    def ToTuple( self ):
        """Return the full settings tuple for the edit dialog."""
        
        return ( self._name, self._path, self._mimes, self._import_file_options, self._import_tag_options, self._txt_parse_tag_service_keys, self._actions, self._action_locations, self._period, self._open_popup, self._paused )
        
    
    def SetSeedCache( self, seed_cache ):
        """Replace the path cache wholesale (e.g. after a ui-side edit)."""
        
        self._path_cache = seed_cache
        
    
    def SetTuple( self, name, path, mimes, import_file_options, import_tag_options, txt_parse_tag_service_keys, actions, action_locations, period, open_popup, paused ):
        """Apply a full settings tuple from the edit dialog.
        
        A changed path resets the cache entirely; a changed mime set drops the
        'uninteresting mime' seeds so those files get re-evaluated.
        """
        
        if path != self._path:
            
            self._path_cache = SeedCache()
            
        
        if set( mimes ) != set( self._mimes ):
            
            self._path_cache.RemoveSeeds( CC.STATUS_UNINTERESTING_MIME )
            
        
        self._name = name
        self._path = path
        self._mimes = mimes
        self._import_file_options = import_file_options
        self._import_tag_options = import_tag_options
        self._txt_parse_tag_service_keys = txt_parse_tag_service_keys
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._open_popup = open_popup
        self._paused = paused
        
    
# register with the serialisation system so stored tuples deserialise back into ImportFolder
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_IMPORT_FOLDER ] = ImportFolder
class PageOfImagesImport( HydrusSerialisable.SerialisableBase ):
    """Importer that scrapes image urls from a queue of user-supplied web pages.
    
    A daemon thread (started by Start) alternates between parsing pending page
    urls for file urls (_WorkOnQueue) and downloading/importing those file urls
    (_WorkOnFiles). State shared with the gui is guarded by self._lock.
    Progress is published via the 'update_status' pubsub for the owning page.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_PAGE_OF_IMAGES_IMPORT
    SERIALISABLE_VERSION = 1
    
    def __init__( self ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        import_file_options = ClientDefaults.GetDefaultImportFileOptions()
        
        self._pending_page_urls = [] # pages still to be parsed, in order
        self._urls_cache = SeedCache() # file urls discovered so far, with per-url status
        self._import_file_options = import_file_options
        self._download_image_links = True # follow <a> tags that contain an <img>
        self._download_unlinked_images = False # also fetch <img> tags not wrapped in an <a>
        self._paused = False
        
        # transient presentation state; not part of the serialisable info
        self._parser_status = ''
        self._seed_cache_status = ( 'initialising', ( 0, 1 ) )
        self._file_download_hook = None
        
        self._lock = threading.Lock()
        
    
    def _GetSerialisableInfo( self ):
        
        serialisable_url_cache = self._urls_cache.GetSerialisableTuple()
        
        serialisable_file_options = self._import_file_options.GetSerialisableTuple()
        
        return ( self._pending_page_urls, serialisable_url_cache, serialisable_file_options, self._download_image_links, self._download_unlinked_images, self._paused )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._pending_page_urls, serialisable_url_cache, serialisable_file_options, self._download_image_links, self._download_unlinked_images, self._paused ) = serialisable_info
        
        self._urls_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_url_cache )
        self._import_file_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
        
    
    def _RegenerateSeedCacheStatus( self, page_key ):
        # caller should hold self._lock; only publishes when the status actually changed
        
        new_seed_cache_status = self._urls_cache.GetStatus()
        
        if self._seed_cache_status != new_seed_cache_status:
            
            self._seed_cache_status = new_seed_cache_status
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
        
    
    def _SetParserStatus( self, page_key, text ):
        # caller should hold self._lock; only publishes when the text actually changed
        
        if self._parser_status != text:
            
            self._parser_status = text
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
        
    
    def _WorkOnFiles( self, page_key ):
        """Take the next unknown-status file url, download and import it, and record the result."""
        
        do_wait = True
        
        file_url = self._urls_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if file_url is None:
            
            return
            
        
        try:
            
            ( status, hash ) = HydrusGlobals.client_controller.Read( 'url_status', file_url )
            
            if status == CC.STATUS_DELETED:
                
                # previously-deleted files are retried unless the options exclude them
                if not self._import_file_options.GetExcludeDeleted():
                    
                    status = CC.STATUS_NEW
                    
                
            
            if status == CC.STATUS_NEW:
                
                ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
                
                try:
                    
                    report_hooks = []
                    
                    with self._lock:
                        
                        if self._file_download_hook is not None:
                            
                            report_hooks.append( self._file_download_hook )
                            
                        
                    
                    HydrusGlobals.client_controller.DoHTTP( HC.GET, file_url, report_hooks = report_hooks, temp_path = temp_path )
                    
                    client_files_manager = HydrusGlobals.client_controller.GetClientFilesManager()
                    
                    ( status, hash ) = client_files_manager.ImportFile( temp_path, import_file_options = self._import_file_options, url = file_url )
                    
                finally:
                    
                    HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                    
                
            else:
                
                # nothing was fetched from the network, so no need to be polite
                do_wait = False
                
            
            self._urls_cache.UpdateSeedStatus( file_url, status )
            
            if status in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
                
                ( media_result, ) = HydrusGlobals.client_controller.Read( 'media_results', ( hash, ) )
                
                HydrusGlobals.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
                
            
        except HydrusExceptions.MimeException as e:
            
            status = CC.STATUS_UNINTERESTING_MIME
            
            self._urls_cache.UpdateSeedStatus( file_url, status )
            
        except Exception as e:
            
            status = CC.STATUS_FAILED
            
            self._urls_cache.UpdateSeedStatus( file_url, status, exception = e )
            
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        HydrusGlobals.client_controller.pub( 'update_status', page_key )
        
        if do_wait:
            
            ClientData.WaitPolitely( page_key )
            
        
    
    def _WorkOnQueue( self, page_key ):
        """If no file urls are pending, parse the next pending page url for image links."""
        
        file_url = self._urls_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if file_url is not None:
            
            # there is still file work to do, so hold off on parsing more pages
            return
            
        
        # NOTE(review): the len() check reads _pending_page_urls outside the lock; the pop is locked
        if len( self._pending_page_urls ) > 0:
            
            with self._lock:
                
                page_url = self._pending_page_urls.pop( 0 )
                
                self._SetParserStatus( page_key, 'checking ' + page_url )
                
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
            error_occurred = False
            
            try:
                
                html = HydrusGlobals.client_controller.DoHTTP( HC.GET, page_url )
                
                soup = ClientDownloading.GetSoup( html )
                
                #
                
                all_links = soup.find_all( 'a' )
                
                links_with_images = [ link for link in all_links if len( link.find_all( 'img' ) ) > 0 ]
                
                all_linked_images = []
                
                for link in all_links:
                    
                    images = link.find_all( 'img' )
                    
                    all_linked_images.extend( images )
                    
                
                all_images = soup.find_all( 'img' )
                
                unlinked_images = [ image for image in all_images if image not in all_linked_images ]
                
                #
                
                file_urls = []
                
                if self._download_image_links:
                    
                    # urljoin resolves relative hrefs against the page url
                    file_urls.extend( [ urlparse.urljoin( page_url, link[ 'href' ] ) for link in links_with_images ] )
                    
                
                if self._download_unlinked_images:
                    
                    file_urls.extend( [ urlparse.urljoin( page_url, image[ 'src' ] ) for image in unlinked_images ] )
                    
                
                num_new = 0
                
                for file_url in file_urls:
                    
                    if not self._urls_cache.HasSeed( file_url ):
                        
                        num_new += 1
                        
                        self._urls_cache.AddSeed( file_url )
                        
                    
                
                parser_status = 'page checked OK - ' + HydrusData.ConvertIntToPrettyString( num_new ) + ' new urls'
                
            except HydrusExceptions.NotFoundException:
                
                error_occurred = True
                
                parser_status = 'page 404'
                
            except Exception as e:
                
                error_occurred = True
                
                parser_status = HydrusData.ToUnicode( e )
                
            
            with self._lock:
                
                self._SetParserStatus( page_key, parser_status )
                
                self._RegenerateSeedCacheStatus( page_key )
                
            
            if error_occurred:
                
                time.sleep( 5 )
                
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
            ClientData.WaitPolitely( page_key )
            
        else:
            
            with self._lock:
                
                self._SetParserStatus( page_key, '' )
                
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
        
    
    def _THREADWork( self, page_key ):
        """Daemon loop: parse pages and import files until shutdown or the page is deleted."""
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        HydrusGlobals.client_controller.pub( 'update_status', page_key )
        
        while not ( HydrusGlobals.view_shutdown or HydrusGlobals.client_controller.PageDeleted( page_key ) ):
            
            if self._paused or HydrusGlobals.client_controller.PageHidden( page_key ):
                
                time.sleep( 0.1 )
                
            else:
                
                try:
                    
                    self._WorkOnQueue( page_key )
                    
                    self._WorkOnFiles( page_key )
                    
                    time.sleep( 0.1 )
                    
                    HydrusGlobals.client_controller.WaitUntilPubSubsEmpty()
                    
                except Exception as e:
                    
                    # unexpected error kills this import thread after reporting
                    HydrusData.ShowException( e )
                    
                    return
                    
                
            
        
    
    def AdvancePageURL( self, page_url ):
        # move the page url one position earlier in the pending queue
        
        with self._lock:
            
            if page_url in self._pending_page_urls:
                
                index = self._pending_page_urls.index( page_url )
                
                if index - 1 >= 0:
                    
                    self._pending_page_urls.remove( page_url )
                    
                    self._pending_page_urls.insert( index - 1, page_url )
                    
                
            
        
    
    def DelayPageURL( self, page_url ):
        # move the page url one position later in the pending queue
        
        with self._lock:
            
            if page_url in self._pending_page_urls:
                
                index = self._pending_page_urls.index( page_url )
                
                if index + 1 < len( self._pending_page_urls ):
                    
                    self._pending_page_urls.remove( page_url )
                    
                    self._pending_page_urls.insert( index + 1, page_url )
                    
                
            
        
    
    def DeletePageURL( self, page_url ):
        
        with self._lock:
            
            if page_url in self._pending_page_urls:
                
                self._pending_page_urls.remove( page_url )
                
            
        
    
    def GetSeedCache( self ):
        
        return self._urls_cache
        
    
    def GetOptions( self ):
        
        with self._lock:
            
            return ( self._import_file_options, self._download_image_links, self._download_unlinked_images )
            
        
    
    def GetStatus( self ):
        # returns a copy of the pending url list so the caller cannot mutate internal state
        
        with self._lock:
            
            return ( list( self._pending_page_urls ), self._parser_status, self._seed_cache_status, self._paused )
            
        
    
    def PausePlay( self ):
        
        with self._lock:
            
            self._paused = not self._paused
            
        
    
    def PendPageURL( self, page_url ):
        
        with self._lock:
            
            if page_url not in self._pending_page_urls:
                
                self._pending_page_urls.append( page_url )
                
            
        
    
    def SetDownloadHook( self, hook ):
        
        with self._lock:
            
            self._file_download_hook = hook
            
        
    
    def SetDownloadImageLinks( self, value ):
        
        with self._lock:
            
            self._download_image_links = value
            
        
    
    def SetDownloadUnlinkedImages( self, value ):
        
        with self._lock:
            
            self._download_unlinked_images = value
            
        
    
    def SetImportFileOptions( self, import_file_options ):
        
        with self._lock:
            
            self._import_file_options = import_file_options
            
        
    
    def Start( self, page_key ):
        # spin up the worker; the thread exits when the page is deleted or the app shuts down
        
        threading.Thread( target = self._THREADWork, args = ( page_key, ) ).start()
        
    
# register the class so HydrusSerialisable can reconstruct dumped PageOfImagesImport objects by type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_PAGE_OF_IMAGES_IMPORT ] = PageOfImagesImport
class SeedCache( HydrusSerialisable.SerialisableBase ):
    """Thread-safe ordered collection of import 'seeds' (urls or file paths).
    
    Each seed carries a status (a CC.STATUS_* value), added/last-modified
    timestamps and a free-text note, stored in self._seeds_to_info; processing
    order is kept in self._seeds_ordered. Every public method takes
    self._lock so the cache can be shared between an import worker thread and
    the gui. Changes are announced on the 'seed_cache_seed_updated' pubsub.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE
    SERIALISABLE_VERSION = 2
    
    def __init__( self ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._seeds_ordered = []
        self._seeds_to_info = {}
        
        self._lock = threading.Lock()
        
    
    def _GetSeedTuple( self, seed ):
        # caller must hold self._lock; raises KeyError for an unknown seed
        
        seed_info = self._seeds_to_info[ seed ]
        
        status = seed_info[ 'status' ]
        added_timestamp = seed_info[ 'added_timestamp' ]
        last_modified_timestamp = seed_info[ 'last_modified_timestamp' ]
        note = seed_info[ 'note' ]
        
        return ( seed, status, added_timestamp, last_modified_timestamp, note )
        
    
    def _GetSerialisableInfo( self ):
        
        with self._lock:
            
            serialisable_info = []
            
            for seed in self._seeds_ordered:
                
                seed_info = self._seeds_to_info[ seed ]
                
                serialisable_info.append( ( seed, seed_info ) )
                
            
            return serialisable_info
            
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        with self._lock:
            
            for ( seed, seed_info ) in serialisable_info:
                
                self._seeds_ordered.append( seed )
                
                self._seeds_to_info[ seed ] = seed_info
                
            
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            # v1 -> v2: notes were coerced to unicode
            new_serialisable_info = []
            
            for ( seed, seed_info ) in old_serialisable_info:
                
                if 'note' in seed_info:
                    
                    seed_info[ 'note' ] = HydrusData.ToUnicode( seed_info[ 'note' ] )
                    
                
                new_serialisable_info.append( ( seed, seed_info ) )
                
            
            return ( 2, new_serialisable_info )
            
        
    
    def AddSeed( self, seed ):
        """Append the seed with STATUS_UNKNOWN; re-adding an existing seed resets it and moves it to the end."""
        
        with self._lock:
            
            if seed in self._seeds_to_info:
                
                self._seeds_ordered.remove( seed )
                
            
            self._seeds_ordered.append( seed )
            
            now = HydrusData.GetNow()
            
            seed_info = {}
            
            seed_info[ 'status' ] = CC.STATUS_UNKNOWN
            seed_info[ 'added_timestamp' ] = now
            seed_info[ 'last_modified_timestamp' ] = now
            seed_info[ 'note' ] = ''
            
            self._seeds_to_info[ seed ] = seed_info
            
        
        HydrusGlobals.client_controller.pub( 'seed_cache_seed_updated', seed )
        
    
    def AdvanceSeed( self, seed ):
        """Move the seed one position earlier in the processing order."""
        
        with self._lock:
            
            if seed in self._seeds_to_info:
                
                index = self._seeds_ordered.index( seed )
                
                if index > 0:
                    
                    self._seeds_ordered.remove( seed )
                    
                    self._seeds_ordered.insert( index - 1, seed )
                    
                
            
        
        HydrusGlobals.client_controller.pub( 'seed_cache_seed_updated', seed )
        
    
    def DelaySeed( self, seed ):
        """Move the seed one position later in the processing order."""
        
        with self._lock:
            
            if seed in self._seeds_to_info:
                
                index = self._seeds_ordered.index( seed )
                
                if index < len( self._seeds_ordered ) - 1:
                    
                    self._seeds_ordered.remove( seed )
                    
                    self._seeds_ordered.insert( index + 1, seed )
                    
                
            
        
        HydrusGlobals.client_controller.pub( 'seed_cache_seed_updated', seed )
        
    
    def Duplicate( self ):
        """Return an independent copy of this cache.
        
        The per-seed info dicts are copied, so the duplicate can change
        statuses without affecting this cache. Takes self._lock for a
        consistent snapshot — previously this read the internal structures
        unlocked, racing against concurrent AddSeed/RemoveSeed calls from a
        worker thread.
        """
        
        new_seed_cache = SeedCache()
        
        with self._lock:
            
            new_seed_cache._seeds_ordered = list( self._seeds_ordered )
            new_seed_cache._seeds_to_info = { seed : dict( info ) for ( seed, info ) in self._seeds_to_info.items() }
            
        
        return new_seed_cache
        
    
    def GetNextSeed( self, status ):
        """Return the first seed with the given status, or None if there is none."""
        
        with self._lock:
            
            for seed in self._seeds_ordered:
                
                seed_info = self._seeds_to_info[ seed ]
                
                if seed_info[ 'status' ] == status:
                    
                    return seed
                    
                
            
        
        return None
        
    
    def GetSeedCount( self, status = None ):
        """Count all seeds, or only those with the given status."""
        
        result = 0
        
        with self._lock:
            
            if status is None:
                
                result = len( self._seeds_ordered )
                
            else:
                
                for seed in self._seeds_ordered:
                    
                    seed_info = self._seeds_to_info[ seed ]
                    
                    if seed_info[ 'status' ] == status:
                        
                        result += 1
                        
                    
                
            
        
        return result
        
    
    def GetSeeds( self ):
        # returns a copy so the caller cannot mutate internal state
        
        with self._lock:
            
            return list( self._seeds_ordered )
            
        
    
    def GetSeedsWithInfo( self ):
        """Return ( seed, status, added, last_modified, note ) tuples in processing order."""
        
        with self._lock:
            
            all_info = []
            
            for seed in self._seeds_ordered:
                
                seed_tuple = self._GetSeedTuple( seed )
                
                all_info.append( seed_tuple )
                
            
            return all_info
            
        
    
    def GetSeedInfo( self, seed ):
        
        with self._lock:
            
            return self._GetSeedTuple( seed )
            
        
    
    def GetStatus( self ):
        """Return ( status_string, ( num_processed, num_total ) ) summarising the cache."""
        
        with self._lock:
            
            statuses_to_counts = collections.Counter()
            
            for seed_info in self._seeds_to_info.values():
                
                statuses_to_counts[ seed_info[ 'status' ] ] += 1
                
            
            num_successful = statuses_to_counts[ CC.STATUS_SUCCESSFUL ]
            num_failed = statuses_to_counts[ CC.STATUS_FAILED ]
            num_deleted = statuses_to_counts[ CC.STATUS_DELETED ]
            num_redundant = statuses_to_counts[ CC.STATUS_REDUNDANT ]
            num_unknown = statuses_to_counts[ CC.STATUS_UNKNOWN ]
            
            status_strings = []
            
            if num_successful > 0: status_strings.append( str( num_successful ) + ' successful' )
            if num_failed > 0: status_strings.append( str( num_failed ) + ' failed' )
            if num_deleted > 0: status_strings.append( str( num_deleted ) + ' already deleted' )
            if num_redundant > 0: status_strings.append( str( num_redundant ) + ' already in db' )
            
            status = ', '.join( status_strings )
            
            total_processed = len( self._seeds_ordered ) - num_unknown
            total = len( self._seeds_ordered )
            
            return ( status, ( total_processed, total ) )
            
        
    
    def HasSeed( self, seed ):
        
        with self._lock:
            
            return seed in self._seeds_to_info
            
        
    
    def RemoveSeed( self, seed ):
        
        with self._lock:
            
            if seed in self._seeds_to_info:
                
                del self._seeds_to_info[ seed ]
                
                self._seeds_ordered.remove( seed )
                
            
        
        HydrusGlobals.client_controller.pub( 'seed_cache_seed_updated', seed )
        
    
    def RemoveSeeds( self, status ):
        """Remove every seed that currently has the given status."""
        
        with self._lock:
            
            seeds_to_delete = set()
            
            for ( seed, seed_info ) in self._seeds_to_info.items():
                
                if seed_info[ 'status' ] == status:
                    
                    seeds_to_delete.add( seed )
                    
                
            
            for seed in seeds_to_delete:
                
                del self._seeds_to_info[ seed ]
                
                self._seeds_ordered.remove( seed )
                
            
        
        # publish outside the lock, one event per removed seed
        for seed in seeds_to_delete:
            
            HydrusGlobals.client_controller.pub( 'seed_cache_seed_updated', seed )
            
        
    
    def UpdateSeedStatus( self, seed, status, note = '', exception = None ):
        """Set the seed's status. If an exception is given, its summary and
        traceback become the note and the error is printed to the log."""
        
        with self._lock:
            
            if exception is not None:
                
                first_line = HydrusData.ToUnicode( exception ).split( os.linesep )[0]
                
                note = first_line + u'\u2026 (Copy note to see full error)'
                note += os.linesep
                note += traceback.format_exc()
                
                HydrusData.Print( 'Error when processing ' + seed + '!' )
                HydrusData.Print( traceback.format_exc() )
                
            
            note = HydrusData.ToUnicode( note )
            
            seed_info = self._seeds_to_info[ seed ]
            
            seed_info[ 'status' ] = status
            seed_info[ 'last_modified_timestamp' ] = HydrusData.GetNow()
            seed_info[ 'note' ] = note
            
        
        HydrusGlobals.client_controller.pub( 'seed_cache_seed_updated', seed )
        
    
# register the class so HydrusSerialisable can reconstruct dumped SeedCache objects by type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE ] = SeedCache
class Subscription( HydrusSerialisable.SerialisableBaseNamed ):
    """A named, periodically-synced gallery query.
    
    Sync() runs the two phases: _SyncQuery walks the gallery pages for new
    file urls (respecting the initial/periodic file limits) and adds them to
    the seed cache; _WorkOnFiles downloads/imports the unknown seeds.
    Progress and errors are reported through a popup job_key. After repeated
    errors the sub backs off until HC.UPDATE_DURATION has passed.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION
    SERIALISABLE_VERSION = 2
    
    def __init__( self, name ):
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        self._gallery_identifier = ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEVIANT_ART )
        
        self._gallery_stream_identifiers = ClientDownloading.GetGalleryStreamIdentifiers( self._gallery_identifier )
        
        self._query = ''
        
        self._period = 86400 * 7 # default check period: one week, in seconds
        
        self._get_tags_if_redundant = False
        
        # first sync is capped at 200 files (or lower, if the global gallery limit is lower)
        if HC.options[ 'gallery_file_limit' ] is None:
            
            self._initial_file_limit = 200
            
        else:
            
            self._initial_file_limit = min( 200, HC.options[ 'gallery_file_limit' ] )
            
        
        self._periodic_file_limit = None # no per-check cap by default
        self._paused = False
        
        self._import_file_options = ClientDefaults.GetDefaultImportFileOptions()
        self._import_tag_options = ClientData.ImportTagOptions()
        
        self._last_checked = 0 # 0 means never checked, i.e. the initial sync is still pending
        self._last_error = 0
        self._check_now = False
        self._seed_cache = SeedCache()
        
    
    def _GetSerialisableInfo( self ):
        
        serialisable_gallery_identifier = self._gallery_identifier.GetSerialisableTuple()
        serialisable_gallery_stream_identifiers = [ gallery_stream_identifier.GetSerialisableTuple() for gallery_stream_identifier in self._gallery_stream_identifiers ]
        serialisable_file_options = self._import_file_options.GetSerialisableTuple()
        serialisable_tag_options = self._import_tag_options.GetSerialisableTuple()
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        
        return ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, self._query, self._period, self._get_tags_if_redundant, self._initial_file_limit, self._periodic_file_limit, self._paused, serialisable_file_options, serialisable_tag_options, self._last_checked, self._check_now, self._last_error, serialisable_seed_cache )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, self._query, self._period, self._get_tags_if_redundant, self._initial_file_limit, self._periodic_file_limit, self._paused, serialisable_file_options, serialisable_tag_options, self._last_checked, self._check_now, self._last_error, serialisable_seed_cache ) = serialisable_info
        
        self._gallery_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_identifier )
        self._gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_stream_identifier ) for serialisable_gallery_stream_identifier in serialisable_gallery_stream_identifiers ]
        self._import_file_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
        self._import_tag_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
        
    
    def _NoRecentErrors( self ):
        # a manual 'check now' overrides the post-error cooldown
        
        return HydrusData.TimeHasPassed( self._last_error + HC.UPDATE_DURATION ) or self._check_now
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            # v1 -> v2: the check_now flag was added to the serialised tuple
            ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, last_error, serialisable_seed_cache ) = old_serialisable_info
            
            check_now = False
            
            new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, serialisable_seed_cache )
            
            return ( 2, new_serialisable_info )
            
        
    
    def _WorkOnFiles( self, job_key ):
        """Download/import every unknown seed, reporting progress on the job_key.
        
        Aborts the whole sync with an exception after more than four download
        errors. Stops early on user cancel, global subs pause, or shutdown.
        """
        
        error_count = 0
        
        num_urls = self._seed_cache.GetSeedCount()
        
        successful_hashes = set()
        
        gallery = ClientDownloading.GetGallery( self._gallery_identifier )
        
        def hook( gauge_range, gauge_value ):
            # secondary gauge shows per-file download progress
            
            job_key.SetVariable( 'popup_gauge_2', ( gauge_value, gauge_range ) )
            
        
        while True:
            
            num_unknown = self._seed_cache.GetSeedCount( CC.STATUS_UNKNOWN )
            num_done = num_urls - num_unknown
            
            do_wait = True
            
            url = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
            
            if url is None:
                
                break
                
            
            p1 = HC.options[ 'pause_subs_sync' ]
            p2 = job_key.IsCancelled()
            p3 = HydrusGlobals.view_shutdown
            
            if p1 or p2 or p3:
                
                break
                
            
            try:
                
                x_out_of_y = 'file ' + HydrusData.ConvertValueRangeToPrettyString( num_done, num_urls ) + ': '
                
                job_key.SetVariable( 'popup_text_1', x_out_of_y + 'checking url status' )
                job_key.SetVariable( 'popup_gauge_1', ( num_done, num_urls ) )
                
                ( status, hash ) = HydrusGlobals.client_controller.Read( 'url_status', url )
                
                if status == CC.STATUS_DELETED:
                    
                    # previously-deleted files are retried unless the options exclude them
                    if not self._import_file_options.GetExcludeDeleted():
                        
                        status = CC.STATUS_NEW
                        
                    
                
                tags = []
                
                if status == CC.STATUS_REDUNDANT:
                    
                    # file already in db; optionally still fetch its tags
                    if self._get_tags_if_redundant and self._import_tag_options.ShouldFetchTags():
                        
                        job_key.SetVariable( 'popup_text_1', x_out_of_y + 'found file in db, fetching tags' )
                        
                        tags = gallery.GetTags( url, report_hooks = [ hook ] )
                        
                    else:
                        
                        do_wait = False
                        
                    
                elif status == CC.STATUS_NEW:
                    
                    ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
                    
                    try:
                        
                        job_key.SetVariable( 'popup_text_1', x_out_of_y + 'downloading file' )
                        
                        if self._import_tag_options.ShouldFetchTags():
                            
                            tags = gallery.GetFileAndTags( temp_path, url, report_hooks = [ hook ] )
                            
                        else:
                            
                            gallery.GetFile( temp_path, url, report_hooks = [ hook ] )
                            
                        
                        job_key.SetVariable( 'popup_text_1', x_out_of_y + 'importing file' )
                        
                        client_files_manager = HydrusGlobals.client_controller.GetClientFilesManager()
                        
                        ( status, hash ) = client_files_manager.ImportFile( temp_path, import_file_options = self._import_file_options, url = url )
                        
                        if status == CC.STATUS_SUCCESSFUL:
                            
                            successful_hashes.add( hash )
                            
                        
                    finally:
                        
                        HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                        
                    
                else:
                    
                    do_wait = False
                    
                
                self._seed_cache.UpdateSeedStatus( url, status )
                
                if hash is not None:
                    
                    service_keys_to_content_updates = self._import_tag_options.GetServiceKeysToContentUpdates( hash, tags )
                    
                    if len( service_keys_to_content_updates ) > 0:
                        
                        HydrusGlobals.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                        
                    
                
            except HydrusExceptions.MimeException as e:
                
                status = CC.STATUS_UNINTERESTING_MIME
                
                self._seed_cache.UpdateSeedStatus( url, status )
                
            except Exception as e:
                
                error_count += 1
                
                status = CC.STATUS_FAILED
                
                self._seed_cache.UpdateSeedStatus( url, status, exception = e )
                
                time.sleep( 10 )
                
                if error_count > 4:
                    
                    raise Exception( 'The subscription ' + self._name + ' encountered several errors when downloading files, so it abandoned its sync.' )
                    
                
            
            if len( successful_hashes ) > 0:
                
                # keep the popup's file button up to date as imports land
                job_key.SetVariable( 'popup_files', set( successful_hashes ) )
                
            
            if do_wait:
                
                ClientData.WaitPolitely()
                
            
        
        job_key.DeleteVariable( 'popup_text_1' )
        job_key.DeleteVariable( 'popup_gauge_1' )
        job_key.DeleteVariable( 'popup_gauge_2' )
        
    
    def _WorkOnFilesCanDoWork( self ):
        
        return self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN ) is not None
        
    
    def _SyncQuery( self, job_key ):
        """Walk each gallery stream page by page, collecting new file urls into the seed cache.
        
        Stops a stream when a page yields nothing new, a known seed is hit,
        or the applicable (initial/periodic) file limit is reached.
        """
        
        if self._SyncQueryCanDoWork():
            
            this_is_initial_sync = self._last_checked == 0
            
            total_new_urls = 0
            
            urls_to_add = set()
            urls_to_add_ordered = []
            
            prefix = 'synchronising gallery query'
            
            job_key.SetVariable( 'popup_text_1', prefix )
            
            for gallery_stream_identifier in self._gallery_stream_identifiers:
                
                # skip remaining streams once the relevant file limit is hit
                if this_is_initial_sync:
                    
                    if self._initial_file_limit is not None and total_new_urls + 1 > self._initial_file_limit:
                        
                        continue
                        
                    
                else:
                    
                    if self._periodic_file_limit is not None and total_new_urls + 1 > self._periodic_file_limit:
                        
                        continue
                        
                    
                
                p1 = HC.options[ 'pause_subs_sync' ]
                p2 = job_key.IsCancelled()
                p3 = HydrusGlobals.view_shutdown
                
                if p1 or p2 or p3:
                    
                    return
                    
                
                gallery = ClientDownloading.GetGallery( gallery_stream_identifier )
                
                page_index = 0
                keep_checking = True
                
                while keep_checking:
                    
                    new_urls_this_page = 0
                    
                    try:
                        
                        ( page_of_urls, definitely_no_more_pages ) = gallery.GetPage( self._query, page_index )
                        
                        page_index += 1
                        
                        if definitely_no_more_pages:
                            
                            keep_checking = False
                            
                        
                        for url in page_of_urls:
                            
                            if this_is_initial_sync:
                                
                                if self._initial_file_limit is not None and total_new_urls + 1 > self._initial_file_limit:
                                    
                                    keep_checking = False
                                    
                                    break
                                    
                                
                            else:
                                
                                if self._periodic_file_limit is not None and total_new_urls + 1 > self._periodic_file_limit:
                                    
                                    keep_checking = False
                                    
                                    break
                                    
                                
                            
                            if url in urls_to_add:
                                
                                # this catches the occasional overflow when a new file is uploaded while gallery parsing is going on
                                
                                continue
                                
                            
                            if self._seed_cache.HasSeed( url ):
                                
                                # reached urls seen on a previous sync, so this stream is done
                                keep_checking = False
                                
                                break
                                
                            else:
                                
                                urls_to_add.add( url )
                                urls_to_add_ordered.append( url )
                                
                                new_urls_this_page += 1
                                total_new_urls += 1
                                
                            
                        
                    except HydrusExceptions.NotFoundException:
                        
                        # paheal now 404s when no results, so just move on and naturally break
                        
                        pass
                        
                    
                    if new_urls_this_page == 0:
                        
                        keep_checking = False
                        
                    
                    job_key.SetVariable( 'popup_text_1', prefix + ': found ' + HydrusData.ConvertIntToPrettyString( total_new_urls ) + ' new urls' )
                    
                    ClientData.WaitPolitely()
                    
                
            
            self._last_checked = HydrusData.GetNow()
            
            urls_to_add_ordered.reverse()
            
            # 'first' urls are now at the end, so the seed_cache should stay roughly in oldest->newest order
            
            for url in urls_to_add_ordered:
                
                if not self._seed_cache.HasSeed( url ):
                    
                    self._seed_cache.AddSeed( url )
                    
                
            
        
    
    def _SyncQueryCanDoWork( self ):
        
        return HydrusData.TimeHasPassed( self._last_checked + self._period ) or self._check_now
        
    
    def CheckNow( self ):
        
        self._check_now = True
        
    
    def GetGalleryIdentifier( self ):
        
        return self._gallery_identifier
        
    
    def GetLastCheckedText( self ):
        """Return a human-readable summary of the last check and the next due time."""
        
        periodic_next_check_time = self._last_checked + self._period
        error_next_check_time = self._last_error + HC.UPDATE_DURATION
        
        # a pending error cooldown overrides the normal schedule in the display
        if error_next_check_time > periodic_next_check_time and not HydrusData.TimeHasPassed( error_next_check_time ):
            
            interim_text = ' | due to error ' + HydrusData.ConvertTimestampToPrettySync( self._last_error ) + ', next check '
            next_check_time = error_next_check_time
            
        else:
            
            interim_text = ' | next check '
            next_check_time = periodic_next_check_time
            
        
        return 'last checked ' + HydrusData.ConvertTimestampToPrettySync( self._last_checked ) + interim_text + HydrusData.ConvertTimestampToPrettyPending( next_check_time )
        
    
    def GetQuery( self ):
        
        return self._query
        
    
    def GetImportTagOptions( self ):
        
        return self._import_tag_options
        
    
    def GetSeedCache( self ):
        
        return self._seed_cache
        
    
    def Reset( self ):
        # forget all history so the next sync behaves like an initial sync
        
        self._last_checked = 0
        self._last_error = 0
        self._seed_cache = SeedCache()
        
    
    def SetSeedCache( self, seed_cache ):
        
        self._seed_cache = seed_cache
        
    
    def SetTuple( self, gallery_identifier, gallery_stream_identifiers, query, period, get_tags_if_redundant, initial_file_limit, periodic_file_limit, paused, import_file_options, import_tag_options ):
        
        self._gallery_identifier = gallery_identifier
        self._gallery_stream_identifiers = gallery_stream_identifiers
        self._query = query
        self._period = period
        self._get_tags_if_redundant = get_tags_if_redundant
        self._initial_file_limit = initial_file_limit
        self._periodic_file_limit = periodic_file_limit
        self._paused = paused
        self._import_file_options = import_file_options
        self._import_tag_options = import_tag_options
        
    
    def Sync( self ):
        """Run one sync pass (query then files) if the sub is due and not paused/erroring."""
        
        p1 = not self._paused
        p2 = not HydrusGlobals.view_shutdown
        p3 = self._NoRecentErrors()
        p4 = self._SyncQueryCanDoWork()
        p5 = self._WorkOnFilesCanDoWork()
        
        if p1 and p2 and p3 and ( p4 or p5 ):
            
            job_key = ClientThreading.JobKey( pausable = False, cancellable = True )
            
            try:
                
                job_key.SetVariable( 'popup_title', 'subscriptions - ' + self._name )
                
                HydrusGlobals.client_controller.pub( 'message', job_key )
                
                self._SyncQuery( job_key )
                
                self._WorkOnFiles( job_key )
                
            except Exception as e:
                
                HydrusData.ShowText( 'The subscription ' + self._name + ' encountered an exception when trying to sync:' )
                HydrusData.ShowException( e )
                
                # start the error cooldown
                self._last_error = HydrusData.GetNow()
                
            finally:
                
                self._check_now = False
                
            
            # persist updated timestamps/seed cache whatever happened
            HydrusGlobals.client_controller.WriteSynchronous( 'serialisable', self )
            
            # keep the popup alive if it is showing imported files, otherwise dismiss it
            if job_key.HasVariable( 'popup_files' ):
                
                job_key.Finish()
                
            else:
                
                job_key.Delete()
                
            
        
    
    def ToTuple( self ):
        
        return ( self._gallery_identifier, self._gallery_stream_identifiers, self._query, self._period, self._get_tags_if_redundant, self._initial_file_limit, self._periodic_file_limit, self._paused, self._import_file_options, self._import_tag_options )
        
    
# register the class so HydrusSerialisable can reconstruct dumped Subscription objects by type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION ] = Subscription
class ThreadWatcherImport( HydrusSerialisable.SerialisableBase ):
    """Importer that repeatedly polls an imageboard thread's JSON for new files.
    
    A daemon thread (started by Start) re-checks the thread every
    self._check_period seconds while self._times_to_check > 0, downloading
    and importing any newly-posted files. Known md5s allow skipping files
    already in the db. Shared state is guarded by self._lock.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_THREAD_WATCHER_IMPORT
    SERIALISABLE_VERSION = 1
    
    MIN_CHECK_PERIOD = 30 # seconds; floor for both the period and the 'check now' throttle
    
    def __init__( self ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        import_file_options = ClientDefaults.GetDefaultImportFileOptions()
        
        ( times_to_check, check_period ) = HC.options[ 'thread_checker_timings' ]
        
        self._thread_url = ''
        self._urls_cache = SeedCache() # file urls found in the thread, with per-url status
        self._urls_to_filenames = {} # file url -> original filename, used for the filename: tag
        self._urls_to_md5_base64 = {} # file url -> md5 (base64), when the board supplies it
        self._import_file_options = import_file_options
        self._import_tag_options = ClientData.ImportTagOptions()
        self._times_to_check = times_to_check # remaining checks before the watcher stops
        self._check_period = check_period
        self._last_time_checked = 0
        
        # transient state; not serialised
        self._file_download_hook = None
        self._check_now = False
        self._paused = False
        
        self._watcher_status = 'ready to start'
        self._seed_cache_status = ( 'initialising', ( 0, 1 ) )
        
        self._lock = threading.Lock()
        
    
    def _GetSerialisableInfo( self ):
        
        serialisable_url_cache = self._urls_cache.GetSerialisableTuple()
        
        serialisable_file_options = self._import_file_options.GetSerialisableTuple()
        serialisable_tag_options = self._import_tag_options.GetSerialisableTuple()
        
        return ( self._thread_url, serialisable_url_cache, self._urls_to_filenames, self._urls_to_md5_base64, serialisable_file_options, serialisable_tag_options, self._times_to_check, self._check_period, self._last_time_checked, self._paused )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._thread_url, serialisable_url_cache, self._urls_to_filenames, self._urls_to_md5_base64, serialisable_file_options, serialisable_tag_options, self._times_to_check, self._check_period, self._last_time_checked, self._paused ) = serialisable_info
        
        self._urls_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_url_cache )
        self._import_file_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
        self._import_tag_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
        
    
    def _RegenerateSeedCacheStatus( self, page_key ):
        # caller should hold self._lock; only publishes when the status actually changed
        
        new_seed_cache_status = self._urls_cache.GetStatus()
        
        if self._seed_cache_status != new_seed_cache_status:
            
            self._seed_cache_status = new_seed_cache_status
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
        
    
    def _SetWatcherStatus( self, page_key, text ):
        # caller should hold self._lock; only publishes when the text actually changed
        
        if self._watcher_status != text:
            
            self._watcher_status = text
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
        
    
    def _WorkOnFiles( self, page_key ):
        """Take the next unknown-status file url, download/import it and apply a filename: tag."""
        
        do_wait = True
        
        file_url = self._urls_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if file_url is None:
            
            return
            
        
        try:
            
            file_original_filename = self._urls_to_filenames[ file_url ]
            
            tags = [ 'filename:' + file_original_filename ]
            
            if file_url in self._urls_to_md5_base64:
                
                # md5 known in advance, so we can check the db before downloading
                file_md5_base64 = self._urls_to_md5_base64[ file_url ]
                
                file_md5 = file_md5_base64.decode( 'base64' ) # Python 2 base64 codec
                
                ( status, hash ) = HydrusGlobals.client_controller.Read( 'md5_status', file_md5 )
                
            else:
                
                status = CC.STATUS_NEW
                
            
            if status == CC.STATUS_DELETED:
                
                # previously-deleted files are retried unless the options exclude them
                if not self._import_file_options.GetExcludeDeleted():
                    
                    status = CC.STATUS_NEW
                    
                
            
            if status == CC.STATUS_NEW:
                
                ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
                
                try:
                    
                    report_hooks = []
                    
                    with self._lock:
                        
                        if self._file_download_hook is not None:
                            
                            report_hooks.append( self._file_download_hook )
                            
                        
                    
                    HydrusGlobals.client_controller.DoHTTP( HC.GET, file_url, report_hooks = report_hooks, temp_path = temp_path )
                    
                    client_files_manager = HydrusGlobals.client_controller.GetClientFilesManager()
                    
                    ( status, hash ) = client_files_manager.ImportFile( temp_path, import_file_options = self._import_file_options, url = file_url )
                    
                finally:
                    
                    HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                    
                
            else:
                
                # nothing was fetched from the network, so no need to be polite
                do_wait = False
                
            
            self._urls_cache.UpdateSeedStatus( file_url, status )
            
            if status in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
                
                with self._lock:
                    
                    service_keys_to_content_updates = self._import_tag_options.GetServiceKeysToContentUpdates( hash, tags )
                    
                
                if len( service_keys_to_content_updates ) > 0:
                    
                    HydrusGlobals.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                    
                
                ( media_result, ) = HydrusGlobals.client_controller.Read( 'media_results', ( hash, ) )
                
                HydrusGlobals.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
                
            
        except HydrusExceptions.MimeException as e:
            
            status = CC.STATUS_UNINTERESTING_MIME
            
            self._urls_cache.UpdateSeedStatus( file_url, status )
            
        except Exception as e:
            
            status = CC.STATUS_FAILED
            
            self._urls_cache.UpdateSeedStatus( file_url, status, exception = e )
            
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        HydrusGlobals.client_controller.pub( 'update_status', page_key )
        
        if do_wait:
            
            ClientData.WaitPolitely( page_key )
            
        
    
    def _WorkOnThread( self, page_key ):
        """If a check is due, fetch the thread JSON, queue any new file urls and update timings."""
        
        do_wait = False
        error_occurred = False
        
        with self._lock:
            
            # p1: manual check requested, throttled to MIN_CHECK_PERIOD
            # p2: periodic check due and checks remaining
            p1 = self._check_now and HydrusData.TimeHasPassed( self._last_time_checked + self.MIN_CHECK_PERIOD )
            p2 = self._times_to_check > 0 and HydrusData.TimeHasPassed( self._last_time_checked + self._check_period )
            
        
        if p1 or p2:
            
            with self._lock:
                
                self._SetWatcherStatus( page_key, 'checking thread' )
                
            
            HydrusGlobals.client_controller.pub( 'update_status', page_key )
            
            try:
                
                json_url = ClientDownloading.GetImageboardThreadJSONURL( self._thread_url )
                
                do_wait = True
                
                raw_json = HydrusGlobals.client_controller.DoHTTP( HC.GET, json_url )
                
                file_infos = ClientDownloading.ParseImageboardFileURLsFromJSON( self._thread_url, raw_json )
                
                num_new = 0
                
                for ( file_url, file_md5_base64, file_original_filename ) in file_infos:
                    
                    if not self._urls_cache.HasSeed( file_url ):
                        
                        num_new += 1
                        
                        self._urls_cache.AddSeed( file_url )
                        
                        self._urls_to_filenames[ file_url ] = file_original_filename
                        
                        if file_md5_base64 is not None:
                            
                            self._urls_to_md5_base64[ file_url ] = file_md5_base64
                            
                        
                    
                
                watcher_status = 'thread checked OK - ' + HydrusData.ConvertIntToPrettyString( num_new ) + ' new urls'
                
            except HydrusExceptions.NotFoundException:
                
                error_occurred = True
                
                watcher_status = 'thread 404'
                
                # the thread is gone; cancel all remaining checks
                with self._lock:
                    
                    for i in range( self._times_to_check ):
                        
                        HydrusGlobals.client_controller.pub( 'decrement_times_to_check', page_key )
                        
                    
                    self._times_to_check = 0
                    
                
            except Exception as e:
                
                error_occurred = True
                
                watcher_status = HydrusData.ToUnicode( e )
                
            
            with self._lock:
                
                # a manual check does not consume one of the scheduled checks
                if self._check_now:
                    
                    self._check_now = False
                    
                else:
                    
                    self._times_to_check -= 1
                    
                    HydrusGlobals.client_controller.pub( 'decrement_times_to_check', page_key )
                    
                
                self._last_time_checked = HydrusData.GetNow()
                
            
        else:
            
            with self._lock:
                
                if self._check_now or self._times_to_check > 0:
                    
                    if self._check_now:
                        
                        delay = self.MIN_CHECK_PERIOD
                        
                    else:
                        
                        delay = self._check_period
                        
                    
                    watcher_status = 'checking again ' + HydrusData.ConvertTimestampToPrettyPending( self._last_time_checked + delay )
                    
                else:
                    
                    watcher_status = 'checking finished'
                    
                
            
        
        with self._lock:
            
            self._SetWatcherStatus( page_key, watcher_status )
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        if do_wait:
            
            ClientData.WaitPolitely( page_key )
            
        
        if error_occurred:
            
            time.sleep( 5 )
            
        
    
    def _THREADWork( self, page_key ):
        """Daemon loop: poll the thread and import files until shutdown or the page is deleted."""
        
        with self._lock:
            
            self._RegenerateSeedCacheStatus( page_key )
            
        
        HydrusGlobals.client_controller.pub( 'update_status', page_key )
        
        while not ( HydrusGlobals.view_shutdown or HydrusGlobals.client_controller.PageDeleted( page_key ) ):
            
            if self._paused or HydrusGlobals.client_controller.PageHidden( page_key ):
                
                time.sleep( 0.1 )
                
            else:
                
                try:
                    
                    if self._thread_url != '':
                        
                        self._WorkOnThread( page_key )
                        
                        self._WorkOnFiles( page_key )
                        
                    
                    time.sleep( 0.1 )
                    
                    HydrusGlobals.client_controller.WaitUntilPubSubsEmpty()
                    
                except Exception as e:
                    
                    # unexpected error kills this watcher thread after reporting
                    HydrusData.ShowException( e )
                    
                    return
                    
                
            
        
    
    def CheckNow( self ):
        
        with self._lock:
            
            self._check_now = True
            
        
    
    def GetSeedCache( self ):
        
        return self._urls_cache
        
    
    def GetOptions( self ):
        
        with self._lock:
            
            return ( self._thread_url, self._import_file_options, self._import_tag_options, self._times_to_check, self._check_period )
            
        
    
    def GetStatus( self ):
        
        with self._lock:
            
            return ( self._watcher_status, self._seed_cache_status, self._check_now, self._paused )
            
        
    
    def HasThread( self ):
        
        with self._lock:
            
            return self._thread_url != ''
            
        
    
    def PausePlay( self ):
        
        with self._lock:
            
            self._paused = not self._paused
            
        
    
    def SetCheckPeriod( self, check_period ):
        # clamp to the minimum so the watcher cannot hammer the server
        
        with self._lock:
            
            self._check_period = max( self.MIN_CHECK_PERIOD, check_period )
            
        
    
    def SetDownloadHook( self, hook ):
        
        with self._lock:
            
            self._file_download_hook = hook
            
        
    
    def SetImportFileOptions( self, import_file_options ):
        
        with self._lock:
            
            self._import_file_options = import_file_options
            
        
    
    def SetImportTagOptions( self, import_tag_options ):
        
        with self._lock:
            
            self._import_tag_options = import_tag_options
            
        
    
    def SetThreadURL( self, thread_url ):
        
        with self._lock:
            
            self._thread_url = thread_url
            
        
    
    def SetTimesToCheck( self, times_to_check ):
        
        with self._lock:
            
            self._times_to_check = times_to_check
            
        
    
    def Start( self, page_key ):
        # spin up the worker; the thread exits when the page is deleted or the app shuts down
        
        threading.Thread( target = self._THREADWork, args = ( page_key, ) ).start()
        
    
# register the class so HydrusSerialisable can reconstruct dumped ThreadWatcherImport objects by type id
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_THREAD_WATCHER_IMPORT ] = ThreadWatcherImport