# hydrus/include/ClientImportSeeds.py
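
# This module implements the client's import-pipeline bookkeeping: FileImportJob wraps
# a single file sitting at a temp path on its way into the database, Seed represents one
# import item (a local path or a URL) plus everything learned about it, and SeedCache is
# the serialisable, lock-guarded list of seeds that the various importers work through.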


import ClientConstants as CC
import ClientImageHandling
import ClientNetworkingDomain
import ClientParsing
import ClientPaths
import collections
import HydrusConstants as HC
import HydrusData
import HydrusExceptions
import HydrusFileHandling
import HydrusImageHandling
import HydrusGlobals as HG
import HydrusPaths
import HydrusSerialisable
import HydrusTags
import os
import threading
import time
import traceback
import urlparse

def GenerateSeedCacheStatus( statuses_to_counts ):
    
    num_successful_and_new = statuses_to_counts[ CC.STATUS_SUCCESSFUL_AND_NEW ]
    num_successful_but_redundant = statuses_to_counts[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ]
    num_ignored = statuses_to_counts[ CC.STATUS_VETOED ]
    num_deleted = statuses_to_counts[ CC.STATUS_DELETED ]
    num_failed = statuses_to_counts[ CC.STATUS_ERROR ]
    num_skipped = statuses_to_counts[ CC.STATUS_SKIPPED ]
    num_unknown = statuses_to_counts[ CC.STATUS_UNKNOWN ]
    
    status_strings = []
    
    num_successful = num_successful_and_new + num_successful_but_redundant
    
    if num_successful > 0:
        
        s = HydrusData.ConvertIntToPrettyString( num_successful ) + ' successful'
        
        if num_successful_and_new > 0:
            if num_successful_but_redundant > 0:
                s += ' (' + HydrusData.ConvertIntToPrettyString( num_successful_but_redundant ) + ' already in db)'
        else:
            s += ' (all already in db)'
        
        status_strings.append( s )
    
    if num_ignored > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_ignored ) + ' ignored' )
    
    if num_deleted > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_deleted ) + ' previously deleted' )
    
    if num_failed > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_failed ) + ' failed' )
    
    if num_skipped > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_skipped ) + ' skipped' )
    
    status = ', '.join( status_strings )
    
    total = sum( statuses_to_counts.values() )
    total_processed = total - num_unknown
    
    return ( status, ( total_processed, total ) )
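
# FileImportJob holds a file sitting at a temp path while it is checked and imported:
# it computes the sha256 and asks the db for a pre-import status, then gathers the mime,
# dimensions, thumbnail, perceptual hashes and extra hashes needed for the actual import.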
class FileImportJob( object ):
    
    def __init__( self, temp_path, file_import_options = None ):
        
        if file_import_options is None:
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
        
        self._temp_path = temp_path
        self._file_import_options = file_import_options
        
        self._hash = None
        self._pre_import_status = None
        self._file_info = None
        self._thumbnail = None
        self._phashes = None
        self._extra_hashes = None
    
    def CheckIsGoodToImport( self ):
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        self._file_import_options.CheckFileIsValid( size, mime, width, height )
    
    def GetExtraHashes( self ):
        
        return self._extra_hashes
    
    def GetFileImportOptions( self ):
        
        return self._file_import_options
    
    def GetFileInfo( self ):
        
        return self._file_info
    
    def GetHash( self ):
        
        return self._hash
    
    def GetMime( self ):
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        return mime
    
    def GetPreImportStatus( self ):
        
        return self._pre_import_status
    
    def GetPHashes( self ):
        
        return self._phashes
    
    def GetTempPathAndThumbnail( self ):
        
        return ( self._temp_path, self._thumbnail )
    
    def PubsubContentUpdates( self ):
        
        if self._pre_import_status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
            
            if self._file_import_options.AutomaticallyArchives():
                
                service_keys_to_content_updates = { CC.COMBINED_LOCAL_FILE_SERVICE_KEY : [ HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_ARCHIVE, set( ( self._hash, ) ) ) ] }
                
                HG.client_controller.Write( 'content_updates', service_keys_to_content_updates )
    
    def IsNewToDB( self ):
        
        if self._pre_import_status == CC.STATUS_UNKNOWN:
            
            return True
        
        if self._pre_import_status == CC.STATUS_DELETED:
            
            if not self._file_import_options.ExcludesDeleted():
                
                return True
        
        return False
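    
    # The two Generate calls below do the pre-import work: GenerateHashAndStatus computes
    # the sha256 and asks the db whether the file is already known or previously deleted,
    # and GenerateInfo rejects decompression bombs and fills in mime, size, resolution,
    # thumbnail, perceptual hashes and the extra hashes.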
    def GenerateHashAndStatus( self ):
        
        HydrusImageHandling.ConvertToPngIfBmp( self._temp_path )
        
        self._hash = HydrusFileHandling.GetHashFromPath( self._temp_path )
        
        ( self._pre_import_status, hash, note ) = HG.client_controller.Read( 'hash_status', 'sha256', self._hash, prefix = 'recognised during import' )
        
        return ( self._pre_import_status, self._hash, note )
    
    def GenerateInfo( self ):
        
        mime = HydrusFileHandling.GetMime( self._temp_path )
        
        new_options = HG.client_controller.new_options
        
        if mime in HC.DECOMPRESSION_BOMB_IMAGES and not self._file_import_options.AllowsDecompressionBombs():
            
            if HydrusImageHandling.IsDecompressionBomb( self._temp_path ):
                
                raise HydrusExceptions.DecompressionBombException( 'Image seems to be a Decompression Bomb!' )
        
        self._file_info = HydrusFileHandling.GetFileInfo( self._temp_path, mime )
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        if mime in HC.MIMES_WITH_THUMBNAILS:
            
            percentage_in = HG.client_controller.new_options.GetInteger( 'video_thumbnail_percentage_in' )
            
            self._thumbnail = HydrusFileHandling.GenerateThumbnail( self._temp_path, mime, percentage_in = percentage_in )
        
        if mime in HC.MIMES_WE_CAN_PHASH:
            
            self._phashes = ClientImageHandling.GenerateShapePerceptualHashes( self._temp_path, mime )
        
        self._extra_hashes = HydrusFileHandling.GetExtraHashesFromPath( self._temp_path )
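
# A Seed is a single import item. seed_type says whether seed_data is a local path
# (SEED_TYPE_HDD) or a URL (SEED_TYPE_URL); the rest of the object tracks status, note,
# timestamps and any urls, tags and hashes parsed for it along the way.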
SEED_TYPE_HDD = 0
SEED_TYPE_URL = 1

class Seed( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED
    SERIALISABLE_NAME = 'File Import'
    SERIALISABLE_VERSION = 2
    
    def __init__( self, seed_type = None, seed_data = None ):
        
        if seed_type is None:
            seed_type = SEED_TYPE_URL
        
        if seed_data is None:
            seed_data = 'https://big-guys.4u/monica_lewinsky_hott.tiff.exe.vbs'
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.seed_type = seed_type
        self.seed_data = seed_data
        
        self.created = HydrusData.GetNow()
        self.modified = self.created
        self.source_time = None
        self.status = CC.STATUS_UNKNOWN
        self.note = ''
        
        self._referral_url = None
        
        self._urls = set()
        self._tags = set()
        self._hashes = {}
    
    def __eq__( self, other ):
        
        return self.__hash__() == other.__hash__()
    
    def __hash__( self ):
        
        return ( self.seed_type, self.seed_data ).__hash__()
    
    def __ne__( self, other ):
        
        return self.__hash__() != other.__hash__()
    
    def _CheckTagsBlacklist( self, tags, tag_import_options ):
        
        tag_import_options.CheckBlacklist( tags )
    
    def _GetSerialisableInfo( self ):
        
        serialisable_urls = list( self._urls )
        serialisable_tags = list( self._tags )
        serialisable_hashes = [ ( hash_type, hash.encode( 'hex' ) ) for ( hash_type, hash ) in self._hashes.items() if hash is not None ]
        
        return ( self.seed_type, self.seed_data, self.created, self.modified, self.source_time, self.status, self.note, self._referral_url, serialisable_urls, serialisable_tags, serialisable_hashes )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self.seed_type, self.seed_data, self.created, self.modified, self.source_time, self.status, self.note, self._referral_url, serialisable_urls, serialisable_tags, serialisable_hashes ) = serialisable_info
        
        self._urls = set( serialisable_urls )
        self._tags = set( serialisable_tags )
        self._hashes = { hash_type : encoded_hash.decode( 'hex' ) for ( hash_type, encoded_hash ) in serialisable_hashes if encoded_hash is not None }
    
    def _NormaliseAndFilterAssociableURLs( self, urls ):
        
        normalised_urls = { HG.client_controller.network_engine.domain_manager.NormaliseURL( url ) for url in urls }
        
        associable_urls = { url for url in normalised_urls if HG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
        
        return associable_urls
    
    def _UpdateModified( self ):
        
        self.modified = HydrusData.GetNow()
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            ( seed_type, seed_data, created, modified, source_time, status, note, serialisable_urls, serialisable_tags, serialisable_hashes ) = old_serialisable_info
            
            referral_url = None
            
            new_serialisable_info = ( seed_type, seed_data, created, modified, source_time, status, note, referral_url, serialisable_urls, serialisable_tags, serialisable_hashes )
            
            return ( 2, new_serialisable_info )
    
    def AddParseResults( self, parse_results ):
        
        for ( hash_type, hash ) in ClientParsing.GetHashesFromParseResults( parse_results ):
            
            if hash_type not in self._hashes:
                
                self._hashes[ hash_type ] = hash
        
        urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_SOURCE, ) )
        
        associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
        
        associable_urls.discard( self.seed_data )
        
        self._urls.update( associable_urls )
        
        tags = ClientParsing.GetTagsFromParseResults( parse_results )
        
        self._tags.update( tags )
        
        source_timestamp = ClientParsing.GetTimestampFromParseResults( parse_results, HC.TIMESTAMP_TYPE_SOURCE )
        
        if source_timestamp is not None:
            
            # clamp the parsed time to at least 30s in the past, so a bad parse cannot put 'source time' in the future
            source_timestamp = min( HydrusData.GetNow() - 30, source_timestamp )
            
            self.source_time = source_timestamp
        
        self._UpdateModified()
    
    def AddTags( self, tags ):
        
        tags = HydrusTags.CleanTags( tags )
        
        self._tags.update( tags )
        
        self._UpdateModified()
    
    def AddURL( self, url ):
        
        urls = ( url, )
        
        associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
        
        associable_urls.discard( self.seed_data )
        
        self._urls.update( associable_urls )
    
    def CheckPreFetchMetadata( self, tag_import_options ):
        
        self._CheckTagsBlacklist( self._tags, tag_import_options )
    
    def DownloadAndImportRawFile( self, file_url, file_import_options, network_job_factory, network_job_presentation_context_factory ):
        
        self.AddURL( file_url )
        
        ( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
        
        try:
            
            if self.seed_data != file_url:
                referral_url = self.seed_data
            else:
                referral_url = self._referral_url
            
            network_job = network_job_factory( 'GET', file_url, temp_path = temp_path, referral_url = referral_url )
            
            network_job.SetFileImportOptions( file_import_options )
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            with network_job_presentation_context_factory( network_job ) as njpc:
                
                network_job.WaitUntilDone()
            
            self.Import( temp_path, file_import_options )
            
        finally:
            
            HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
    
    def FetchPageMetadata( self, tag_import_options ):
        
        pass
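    
    # PredictPreImportStatus tries to settle a seed's status without downloading anything:
    # first by looking up urls that can only refer to one file, then by looking up any
    # hashes already parsed. A 'deleted' verdict is dropped back to 'unknown' if the file
    # import options do not exclude previously deleted files.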
    def PredictPreImportStatus( self, file_import_options, file_url = None ):
        
        if self.status != CC.STATUS_UNKNOWN:
            
            return
        
        UNKNOWN_DEFAULT = ( CC.STATUS_UNKNOWN, None, '' )
        
        ( status, hash, note ) = UNKNOWN_DEFAULT
        
        # urls
        
        urls = set( self._urls )
        
        if file_url is not None:
            urls.add( file_url )
        
        if self.seed_type == SEED_TYPE_URL:
            urls.add( self.seed_data )
        
        unrecognised_url_results = set()
        
        for url in urls:
            
            if HG.client_controller.network_engine.domain_manager.URLCanReferToMultipleFiles( url ):
                
                continue
            
            # we now only trust url-matched single urls and the post/file urls
            # trusting unmatched source urls was too much of a hassle with too many boorus providing bad source urls like user account pages
            
            if HG.client_controller.network_engine.domain_manager.URLDefinitelyRefersToOneFile( url ) or url in ( self.seed_data, file_url ):
                
                results = HG.client_controller.Read( 'url_statuses', url )
                
                if len( results ) == 0: # if no match found, no useful data discovered
                    
                    continue
                    
                elif len( results ) > 1: # if more than one file claims this url, it cannot be relied on to guess the file
                    
                    continue
                    
                else: # i.e. 1 match found
                    
                    ( status, hash, note ) = results[0]
                    
                    if status != CC.STATUS_UNKNOWN:
                        
                        break # if a known one-file url gives a single clear result, that result is reliable
        
        # hashes
        
        if status == CC.STATUS_UNKNOWN:
            
            for ( hash_type, found_hash ) in self._hashes.items():
                
                ( status, hash, note ) = HG.client_controller.Read( 'hash_status', hash_type, found_hash )
                
                if status != CC.STATUS_UNKNOWN:
                    
                    break
        
        #
        
        if status == CC.STATUS_DELETED:
            
            if not file_import_options.ExcludesDeleted():
                
                status = CC.STATUS_UNKNOWN
                note = ''
        
        self.status = status
        
        if hash is not None:
            
            self._hashes[ 'sha256' ] = hash
        
        self.note = note
        
        self._UpdateModified()
    
    def GetHash( self ):
        
        if 'sha256' in self._hashes:
            
            return self._hashes[ 'sha256' ]
        
        return None
    
    def GetSearchSeeds( self ):
        
        if self.seed_type == SEED_TYPE_URL:
            
            search_urls = ClientNetworkingDomain.GetSearchURLs( self.seed_data )
            
            search_seeds = [ Seed( SEED_TYPE_URL, search_url ) for search_url in search_urls ]
            
        else:
            
            search_seeds = [ self ]
        
        return search_seeds
    
    def HasHash( self ):
        
        return self.GetHash() is not None
    
    def Import( self, temp_path, file_import_options ):
        
        file_import_job = FileImportJob( temp_path, file_import_options )
        
        ( status, hash, note ) = HG.client_controller.client_files_manager.ImportFile( file_import_job )
        
        self.SetStatus( status, note = note )
        self.SetHash( hash )
    
    def ImportPath( self, file_import_options ):
        
        if self.seed_type != SEED_TYPE_HDD:
            
            raise Exception( 'Attempted to import as a path, but I do not think I am a path!' )
        
        ( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
        
        try:
            
            path = self.seed_data
            
            copied = HydrusPaths.MirrorFile( path, temp_path )
            
            if not copied:
                
                raise Exception( 'File failed to copy to temp path--see log for error.' )
            
            self.Import( temp_path, file_import_options )
            
        finally:
            
            HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
    
    def IsAPostURL( self ):
        
        if self.seed_type == SEED_TYPE_URL:
            
            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.seed_data )
            
            if url_type == HC.URL_TYPE_POST:
                
                return True
        
        return False
    
    def Normalise( self ):
        
        if self.seed_type == SEED_TYPE_URL:
            
            self.seed_data = HG.client_controller.network_engine.domain_manager.NormaliseURL( self.seed_data )
    
    def PresentToPage( self, page_key ):
        
        hash = self.GetHash()
        
        if hash is not None:
            
            ( media_result, ) = HG.client_controller.Read( 'media_results', ( hash, ) )
            
            HG.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
    
    def SetHash( self, hash ):
        
        if hash is not None:
            
            self._hashes[ 'sha256' ] = hash
    
    def SetReferralURL( self, referral_url ):
        
        self._referral_url = referral_url
    
    def SetStatus( self, status, note = '', exception = None ):
        
        if exception is not None:
            
            first_line = HydrusData.ToUnicode( exception ).split( os.linesep )[0]
            
            note = first_line + u'\u2026 (Copy note to see full error)'
            note += os.linesep
            note += HydrusData.ToUnicode( traceback.format_exc() )
            
            HydrusData.Print( 'Error when processing ' + self.seed_data + ' !' )
            HydrusData.Print( traceback.format_exc() )
        
        self.status = status
        self.note = note
        
        self._UpdateModified()
    
    def ShouldDownloadFile( self ):
        
        return self.status == CC.STATUS_UNKNOWN
    
    def ShouldFetchPageMetadata( self, tag_import_options ):
        
        if self.status == CC.STATUS_UNKNOWN:
            
            return True
        
        if self.status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
            
            if tag_import_options.WorthFetchingTags() and tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB():
                
                return True
        
        return False
    
    def ShouldPresent( self, file_import_options ):
        
        hash = self.GetHash()
        
        if hash is not None and self.status in CC.SUCCESSFUL_IMPORT_STATES:
            
            if file_import_options.ShouldPresentIgnorantOfInbox( self.status ):
                
                return True
            
            in_inbox = HG.client_controller.Read( 'in_inbox', hash )
            
            if file_import_options.ShouldPresent( self.status, in_inbox ):
                
                return True
        
        return False
    
    def WorksInNewSystem( self ):
        
        if self.seed_type == SEED_TYPE_URL:
            
            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.seed_data )
            
            if url_type == HC.URL_TYPE_FILE:
                
                return True
            
            if url_type == HC.URL_TYPE_POST and can_parse:
                
                return True
            
            if url_type == HC.URL_TYPE_UNKNOWN and self._referral_url is not None: # this is likely a multi-file child of a post url seed
                
                ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self._referral_url )
                
                if url_type == HC.URL_TYPE_POST: # we must have got here through parsing that m8, so let's assume this is an unrecognised file url
                    
                    return True
        
        return False
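    
    # WorkOnURL is the main per-seed routine for url downloads. Roughly: predict the
    # status; if this is a post url, fetch and parse the page, then either download the
    # single file url found or spawn child seeds (inserted just after this one in the
    # seed cache) for multi-file results; otherwise treat the seed as a raw file url.
    # Vetoes, 404s and other errors are recorded on the seed rather than raised.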
    def WorkOnURL( self, seed_cache, status_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, tag_import_options = None ):
        
        did_substantial_work = False
        
        try:
            
            status_hook( 'checking url status' )
            
            self.PredictPreImportStatus( file_import_options )
            
            if self.IsAPostURL():
                
                if tag_import_options is None:
                    
                    tag_import_options = HG.client_controller.network_engine.domain_manager.GetDefaultTagImportOptionsForURL( self.seed_data )
                
                if self.ShouldFetchPageMetadata( tag_import_options ):
                    
                    did_substantial_work = True
                    
                    post_url = self.seed_data
                    
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
                    
                    status_hook( 'downloading page' )
                    
                    if self._referral_url not in ( post_url, url_to_check ):
                        referral_url = self._referral_url
                    else:
                        referral_url = None
                    
                    network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
                    
                    HG.client_controller.network_engine.AddJob( network_job )
                    
                    with network_job_presentation_context_factory( network_job ) as njpc:
                        
                        network_job.WaitUntilDone()
                    
                    data = network_job.GetContent()
                    
                    parsing_context = {}
                    
                    parsing_context[ 'post_url' ] = post_url
                    parsing_context[ 'url' ] = url_to_check
                    
                    all_parse_results = parser.Parse( parsing_context, data )
                    
                    if len( all_parse_results ) == 0:
                        
                        raise HydrusExceptions.VetoException( 'Could not parse any data!' )
                    
                    parse_results = all_parse_results[0]
                    
                    self.AddParseResults( parse_results )
                    
                    self.CheckPreFetchMetadata( tag_import_options )
                    
                    desired_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
                    
                    child_urls = []
                    
                    if len( desired_urls ) == 0:
                        
                        raise HydrusExceptions.VetoException( 'Could not find a file or post URL to download!' )
                        
                    elif len( desired_urls ) == 1:
                        
                        desired_url = desired_urls[0]
                        
                        ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( desired_url )
                        
                        if url_type in ( HC.URL_TYPE_FILE, HC.URL_TYPE_UNKNOWN ):
                            
                            file_url = desired_url
                            
                            self.PredictPreImportStatus( file_import_options, file_url )
                            
                            if self.ShouldDownloadFile():
                                
                                status_hook( 'downloading file' )
                                
                                self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
                                
                        elif url_type == HC.URL_TYPE_POST and can_parse:
                            
                            # a pixiv mode=medium page has spawned a mode=manga page, so we need a new seed to go pursue that
                            
                            child_urls = [ desired_url ]
                            
                        else:
                            
                            raise HydrusExceptions.VetoException( 'Found a URL--' + desired_url + '--but could not understand/parse it!' )
                        
                    else:
                        
                        child_urls = desired_urls
                    
                    if len( child_urls ) > 0:
                        
                        child_seeds = []
                        
                        for child_url in child_urls:
                            
                            duplicate_seed = self.Duplicate() # inherits all urls and tags from here
                            
                            duplicate_seed.seed_data = child_url
                            
                            duplicate_seed.SetReferralURL( self.seed_data )
                            
                            if self._referral_url is not None:
                                
                                duplicate_seed.AddURL( self._referral_url )
                            
                            child_seeds.append( duplicate_seed )
                        
                        try:
                            
                            my_index = seed_cache.GetSeedIndex( self )
                            
                            insertion_index = my_index + 1
                            
                        except:
                            
                            insertion_index = len( seed_cache )
                        
                        seed_cache.InsertSeeds( insertion_index, child_seeds )
                        
                        status = CC.STATUS_SUCCESSFUL_AND_NEW
                        note = 'Found ' + HydrusData.ConvertIntToPrettyString( len( child_urls ) ) + ' new URLs.'
                        
                        self.SetStatus( status, note = note )
                
            else:
                
                if tag_import_options is None:
                    
                    if self._referral_url is not None:
                        tio_lookup_url = self._referral_url
                    else:
                        tio_lookup_url = self.seed_data
                    
                    tag_import_options = HG.client_controller.network_engine.domain_manager.GetDefaultTagImportOptionsForURL( tio_lookup_url )
                
                if self.ShouldDownloadFile():
                    
                    did_substantial_work = True
                    
                    file_url = self.seed_data
                    
                    status_hook( 'downloading file' )
                    
                    self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
            
            did_substantial_work |= self.WriteContentUpdates( tag_import_options )
            
        except HydrusExceptions.ShutdownException:
            
            return False
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            note = HydrusData.ToUnicode( e )
            
            self.SetStatus( status, note = note )
            
            if isinstance( e, HydrusExceptions.CancelledException ):
                
                status_hook( 'cancelled!' )
                
                time.sleep( 2 )
            
        except HydrusExceptions.NotFoundException:
            
            status = CC.STATUS_VETOED
            note = '404'
            
            self.SetStatus( status, note = note )
            
            status_hook( '404' )
            
            time.sleep( 2 )
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            self.SetStatus( status, exception = e )
            
            status_hook( 'error!' )
            
            time.sleep( 3 )
        
        return did_substantial_work
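    
    # WriteContentUpdates associates the seed's known urls (and referral url) with the
    # imported file's hash and, when tag import options are supplied, writes the parsed
    # tags to the relevant services. It returns whether any work was actually written.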
    def WriteContentUpdates( self, tag_import_options = None ):
        
        did_work = False
        
        if self.status == CC.STATUS_ERROR:
            
            return did_work
        
        hash = self.GetHash()
        
        if hash is None:
            
            return did_work
        
        service_keys_to_content_updates = collections.defaultdict( list )
        
        urls = set( self._urls )
        
        if self.seed_type == SEED_TYPE_URL:
            
            urls.add( self.seed_data )
        
        if self._referral_url is not None:
            
            urls.add( self._referral_url )
        
        associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
        
        if len( associable_urls ) > 0:
            
            content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( associable_urls, ( hash, ) ) )
            
            service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
        
        if tag_import_options is not None:
            
            for ( service_key, content_updates ) in tag_import_options.GetServiceKeysToContentUpdates( hash, set( self._tags ) ).items():
                
                service_keys_to_content_updates[ service_key ].extend( content_updates )
        
        if len( service_keys_to_content_updates ) > 0:
            
            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
            
            did_work = True
        
        return did_work
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED ] = Seed
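
# SeedCache is the ordered, serialisable list of seeds behind an importer's file log.
# It keeps a seed -> index lookup for fast membership tests, a lazily regenerated status
# summary, and a lock guarding reads and mutations of the list.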
class SeedCache( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE
    SERIALISABLE_NAME = 'Import File Status Cache'
    SERIALISABLE_VERSION = 8
    
    def __init__( self ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._seeds = HydrusSerialisable.SerialisableList()
        
        self._seeds_to_indices = {}
        
        self._seed_cache_key = HydrusData.GenerateKey()
        
        self._status_cache = None
        self._status_cache_generation_time = 0
        
        self._status_dirty = True
        
        self._lock = threading.Lock()
    
    def __len__( self ):
        
        return len( self._seeds )
    
    def _GenerateStatus( self ):
        
        statuses_to_counts = self._GetStatusesToCounts()
        
        self._status_cache = GenerateSeedCacheStatus( statuses_to_counts )
        self._status_cache_generation_time = HydrusData.GetNow()
        
        self._status_dirty = False
    
    def _GetStatusesToCounts( self ):
        
        statuses_to_counts = collections.Counter()
        
        for seed in self._seeds:
            
            statuses_to_counts[ seed.status ] += 1
        
        return statuses_to_counts
    
    def _GetSeeds( self, status = None ):
        
        if status is None:
            
            return list( self._seeds )
            
        else:
            
            return [ seed for seed in self._seeds if seed.status == status ]
    
    def _GetSerialisableInfo( self ):
        
        with self._lock:
            
            return self._seeds.GetSerialisableTuple()
    
    def _GetSourceTimestamp( self, seed ):
        
        source_timestamp = seed.source_time
        
        if source_timestamp is None:
            
            # decent fallback compromise
            # -30 since added and 'last check' timestamps are often the same, and this messes up calculations
            
            source_timestamp = seed.created - 30
        
        return source_timestamp
    
    def _HasSeed( self, seed ):
        
        search_seeds = seed.GetSearchSeeds()
        
        has_seed = True in ( search_seed in self._seeds_to_indices for search_seed in search_seeds )
        
        return has_seed
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        with self._lock:
            
            self._seeds = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )
            
            self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
    
    def _SetStatusDirty( self ):
        
        self._status_dirty = True
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            new_serialisable_info = []
            
            for ( seed, seed_info ) in old_serialisable_info:
                
                if 'note' in seed_info:
                    
                    seed_info[ 'note' ] = HydrusData.ToUnicode( seed_info[ 'note' ] )
                
                new_serialisable_info.append( ( seed, seed_info ) )
            
            return ( 2, new_serialisable_info )
        
        if version in ( 2, 3 ):
            
            # gelbooru replaced their thumbnail links with this redirect spam
            # 'https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY4ODA1OA=='
            # I missed some http ones here, so I've broadened the test and rescheduled it
            
            new_serialisable_info = []
            
            for ( seed, seed_info ) in old_serialisable_info:
                
                if 'gelbooru.com/redirect.php' in seed:
                    
                    continue
                
                new_serialisable_info.append( ( seed, seed_info ) )
            
            return ( 4, new_serialisable_info )
        
        if version == 4:
            
            def ConvertRegularToRawURL( regular_url ):
                
                # convert this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
                # to this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # the 500 part can be a bunch of stuff, including letters
                
                url_components = regular_url.split( '_' )
                
                last_component = url_components[ -1 ]
                
                ( number_gubbins, file_ext ) = last_component.split( '.' )
                
                raw_last_component = 'raw.' + file_ext
                
                url_components[ -1 ] = raw_last_component
                
                raw_url = '_'.join( url_components )
                
                return raw_url
            
            def Remove68Subdomain( long_url ):
                
                # sometimes the 68 subdomain gives a 404 on the raw url, so:
                # convert this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # to this:
                # http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # I am not sure if it is always 68, but let's not assume
                
                ( scheme, rest ) = long_url.split( '://', 1 )
                
                if rest.startswith( 'media.tumblr.com' ):
                    
                    return long_url
                
                ( gumpf, shorter_rest ) = rest.split( '.', 1 )
                
                shorter_url = scheme + '://' + shorter_rest
                
                return shorter_url
            
            new_serialisable_info = []
            
            good_seeds = set()
            
            for ( seed, seed_info ) in old_serialisable_info:
                
                try:
                    
                    parse = urlparse.urlparse( seed )
                    
                    if 'media.tumblr.com' in parse.netloc:
                        
                        seed = Remove68Subdomain( seed )
                        
                        seed = ConvertRegularToRawURL( seed )
                        
                        seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( seed )
                    
                    if 'pixiv.net' in parse.netloc:
                        
                        seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( seed )
                    
                    if seed in good_seeds: # we hit a dupe, so skip it
                        
                        continue
                    
                except:
                    
                    pass
                
                good_seeds.add( seed )
                
                new_serialisable_info.append( ( seed, seed_info ) )
            
            return ( 5, new_serialisable_info )
        
        if version == 5:
            
            new_serialisable_info = []
            
            for ( seed, seed_info ) in old_serialisable_info:
                
                seed_info[ 'source_timestamp' ] = None
                
                new_serialisable_info.append( ( seed, seed_info ) )
            
            return ( 6, new_serialisable_info )
        
        if version == 6:
            
            new_serialisable_info = []
            
            for ( seed, seed_info ) in old_serialisable_info:
                
                try:
                    
                    magic_phrase = '//media.tumblr.com'
                    replacement = '//data.tumblr.com'
                    
                    if magic_phrase in seed:
                        
                        seed = seed.replace( magic_phrase, replacement )
                    
                except:
                    
                    pass
                
                new_serialisable_info.append( ( seed, seed_info ) )
            
            return ( 7, new_serialisable_info )
        
        if version == 7:
            
            seeds = HydrusSerialisable.SerialisableList()
            
            for ( seed_text, seed_info ) in old_serialisable_info:
                
                if seed_text.startswith( 'http' ):
                    seed_type = SEED_TYPE_URL
                else:
                    seed_type = SEED_TYPE_HDD
                
                seed = Seed( seed_type, seed_text )
                
                seed.status = seed_info[ 'status' ]
                seed.created = seed_info[ 'added_timestamp' ]
                seed.modified = seed_info[ 'last_modified_timestamp' ]
                seed.source_time = seed_info[ 'source_timestamp' ]
                seed.note = seed_info[ 'note' ]
                
                seeds.append( seed )
            
            new_serialisable_info = seeds.GetSerialisableTuple()
            
            return ( 8, new_serialisable_info )
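    
    # The mutation methods below do their list work under the lock and then publish via
    # NotifySeedsUpdated; since that call takes the lock itself, it is made only after
    # the lock has been released.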
    def AddSeeds( self, seeds ):
        
        if len( seeds ) == 0:
            
            return 0
        
        new_seeds = []
        
        with self._lock:
            
            for seed in seeds:
                
                if self._HasSeed( seed ):
                    
                    continue
                
                seed.Normalise()
                
                new_seeds.append( seed )
                
                self._seeds.append( seed )
                
                self._seeds_to_indices[ seed ] = len( self._seeds ) - 1
            
            self._SetStatusDirty()
        
        self.NotifySeedsUpdated( new_seeds )
        
        return len( new_seeds )
    
    def AdvanceSeed( self, seed ):
        
        with self._lock:
            
            if seed in self._seeds_to_indices:
                
                index = self._seeds_to_indices[ seed ]
                
                if index > 0:
                    
                    self._seeds.remove( seed )
                    
                    self._seeds.insert( index - 1, seed )
                    
                    self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
        
        self.NotifySeedsUpdated( ( seed, ) )
    
    def CanCompact( self, compact_before_this_source_time ):
        
        with self._lock:
            
            if len( self._seeds ) <= 100:
                
                return False
            
            for seed in self._seeds[:-100]:
                
                if seed.status == CC.STATUS_UNKNOWN:
                    
                    continue
                
                if self._GetSourceTimestamp( seed ) < compact_before_this_source_time:
                    
                    return True
            
            return False
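    
    # Compact drops old, finished seeds: everything before the last 100 entries is
    # discarded unless it is still unprocessed or its source time is newer than the given
    # cutoff. CanCompact above is the cheap check for whether this is worth doing.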
    def Compact( self, compact_before_this_source_time ):
        
        with self._lock:
            
            if len( self._seeds ) <= 100:
                
                return
            
            new_seeds = HydrusSerialisable.SerialisableList()
            
            for seed in self._seeds[:-100]:
                
                still_to_do = seed.status == CC.STATUS_UNKNOWN
                still_relevant = self._GetSourceTimestamp( seed ) > compact_before_this_source_time
                
                if still_to_do or still_relevant:
                    
                    new_seeds.append( seed )
            
            new_seeds.extend( self._seeds[-100:] )
            
            self._seeds = new_seeds
            self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
            
            self._SetStatusDirty()
    
    def DelaySeed( self, seed ):
        
        with self._lock:
            
            if seed in self._seeds_to_indices:
                
                index = self._seeds_to_indices[ seed ]
                
                if index < len( self._seeds ) - 1:
                    
                    self._seeds.remove( seed )
                    
                    self._seeds.insert( index + 1, seed )
                    
                    self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
        
        self.NotifySeedsUpdated( ( seed, ) )
    
    def GetEarliestSourceTime( self ):
        
        with self._lock:
            
            if len( self._seeds ) == 0:
                
                return None
            
            earliest_timestamp = min( ( self._GetSourceTimestamp( seed ) for seed in self._seeds ) )
            
            return earliest_timestamp
    
    def GetLatestAddedTime( self ):
        
        with self._lock:
            
            if len( self._seeds ) == 0:
                
                return 0
            
            latest_timestamp = max( ( seed.created for seed in self._seeds ) )
            
            return latest_timestamp
    
    def GetLatestSourceTime( self ):
        
        with self._lock:
            
            if len( self._seeds ) == 0:
                
                return 0
            
            latest_timestamp = max( ( self._GetSourceTimestamp( seed ) for seed in self._seeds ) )
            
            return latest_timestamp
    
    def GetNextSeed( self, status ):
        
        with self._lock:
            
            for seed in self._seeds:
                
                if seed.status == status:
                    
                    return seed
        
        return None
    
    def GetNumNewFilesSince( self, since ):
        
        num_files = 0
        
        with self._lock:
            
            for seed in self._seeds:
                
                source_timestamp = self._GetSourceTimestamp( seed )
                
                if source_timestamp >= since:
                    
                    num_files += 1
        
        return num_files
    
    def GetPresentedHashes( self, file_import_options ):
        
        with self._lock:
            
            hashes = []
            
            for seed in self._seeds:
                
                if seed.HasHash() and seed.ShouldPresent( file_import_options ):
                    
                    hashes.append( seed.GetHash() )
            
            return hashes
    
    def GetSeedCacheKey( self ):
        
        return self._seed_cache_key
    
    def GetSeedCount( self, status = None ):
        
        result = 0
        
        with self._lock:
            
            if status is None:
                
                result = len( self._seeds )
                
            else:
                
                for seed in self._seeds:
                    
                    if seed.status == status:
                        
                        result += 1
        
        return result
    
    def GetSeeds( self, status = None ):
        
        with self._lock:
            
            return self._GetSeeds( status )
    
    def GetSeedIndex( self, seed ):
        
        with self._lock:
            
            return self._seeds_to_indices[ seed ]
    
    def GetStatus( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
            
            return self._status_cache
    
    def GetStatusGenerationTime( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                return HydrusData.GetNow()
            
            return self._status_cache_generation_time
    
    def GetStatusesToCounts( self ):
        
        with self._lock:
            
            return self._GetStatusesToCounts()
    
    def GetValueRange( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
            
            ( status, ( total_processed, total ) ) = self._status_cache
            
            return ( total_processed, total )
    
    def HasSeed( self, seed ):
        
        with self._lock:
            
            return self._HasSeed( seed )
    
    def InsertSeeds( self, index, seeds ):
        
        if len( seeds ) == 0:
            
            return 0
        
        new_seeds = []
        
        with self._lock:
            
            index = min( index, len( self._seeds ) )
            
            for seed in seeds:
                
                if self._HasSeed( seed ):
                    
                    continue
                
                seed.Normalise()
                
                new_seeds.append( seed )
                
                self._seeds.insert( index, seed )
                
                index += 1
            
            self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
            
            self._SetStatusDirty()
        
        self.NotifySeedsUpdated( new_seeds )
        
        return len( new_seeds )
    
    def NotifySeedsUpdated( self, seeds ):
        
        with self._lock:
            
            self._SetStatusDirty()
        
        HG.client_controller.pub( 'seed_cache_seeds_updated', self._seed_cache_key, seeds )
    
    def RemoveSeeds( self, seeds ):
        
        with self._lock:
            
            seeds_to_delete = set( seeds )
            
            self._seeds = HydrusSerialisable.SerialisableList( [ seed for seed in self._seeds if seed not in seeds_to_delete ] )
            
            self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
            
            self._SetStatusDirty()
        
        self.NotifySeedsUpdated( seeds_to_delete )
    
    def RemoveSeedsByStatus( self, statuses_to_remove ):
        
        with self._lock:
            
            seeds_to_delete = [ seed for seed in self._seeds if seed.status in statuses_to_remove ]
        
        self.RemoveSeeds( seeds_to_delete )
    
    def RemoveAllButUnknownSeeds( self ):
        
        with self._lock:
            
            seeds_to_delete = [ seed for seed in self._seeds if seed.status != CC.STATUS_UNKNOWN ]
        
        self.RemoveSeeds( seeds_to_delete )
    
    def RetryFailures( self ):
        
        with self._lock:
            
            failed_seeds = self._GetSeeds( CC.STATUS_ERROR )
            
            for seed in failed_seeds:
                
                seed.SetStatus( CC.STATUS_UNKNOWN )
        
        self.NotifySeedsUpdated( failed_seeds )
    
    def WorkToDo( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
            
            ( status, ( total_processed, total ) ) = self._status_cache
            
            return total_processed < total
    
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE ] = SeedCache
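
# A rough sketch of how these classes fit together in a download loop. The factory
# callables and options objects named here are illustrative assumptions taken from the
# method signatures above, not something this module provides:
#
#   seed_cache = SeedCache()
#   seed_cache.AddSeeds( [ Seed( SEED_TYPE_URL, url ) for url in gallery_urls ] )
#
#   while seed_cache.WorkToDo():
#
#       seed = seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
#
#       if seed is None:
#
#           break
#
#       seed.WorkOnURL( seed_cache, status_hook, network_job_factory, network_job_presentation_context_factory, file_import_options )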