2084 lines
67 KiB
Python
2084 lines
67 KiB
Python
import ClientConstants as CC
|
|
import ClientImageHandling
|
|
import ClientImporting
|
|
import ClientNetworkingDomain
|
|
import ClientParsing
|
|
import ClientPaths
|
|
import collections
|
|
import HydrusConstants as HC
|
|
import HydrusData
|
|
import HydrusExceptions
|
|
import HydrusFileHandling
|
|
import HydrusImageHandling
|
|
import HydrusGlobals as HG
|
|
import HydrusPaths
|
|
import HydrusSerialisable
|
|
import HydrusTags
|
|
import os
|
|
import threading
|
|
import time
|
|
import traceback
|
|
import urlparse
|
|
|
|
def GenerateFileSeedCacheStatus( file_seed_cache ):
    
    # summarise one cache by tallying its seeds' statuses and formatting that
    
    counts = file_seed_cache.GetStatusesToCounts()
    
    return GenerateStatusesToCountsStatus( counts )
|
|
|
|
def GenerateFileSeedCachesStatus( file_seed_caches ):
    
    # merge every cache's statuses-to-counts tally into one Counter, then format
    
    merged_counts = collections.Counter()
    
    for cache in file_seed_caches:
        
        merged_counts.update( cache.GetStatusesToCounts() )
        
    
    return GenerateStatusesToCountsStatus( merged_counts )
|
|
|
|
def GenerateStatusesToCountsStatus( statuses_to_counts ):
    
    # build the two human-readable summaries used by import UI:
    # returns ( status, simple_status, ( total_processed, total ) )
    
    num_successful_and_new = statuses_to_counts[ CC.STATUS_SUCCESSFUL_AND_NEW ]
    num_successful_but_redundant = statuses_to_counts[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ]
    num_ignored = statuses_to_counts[ CC.STATUS_VETOED ]
    num_deleted = statuses_to_counts[ CC.STATUS_DELETED ]
    num_failed = statuses_to_counts[ CC.STATUS_ERROR ]
    num_skipped = statuses_to_counts[ CC.STATUS_SKIPPED ]
    num_unknown = statuses_to_counts[ CC.STATUS_UNKNOWN ]
    
    status_strings = []
    
    num_successful = num_successful_and_new + num_successful_but_redundant
    
    if num_successful > 0:
        
        s = HydrusData.ToHumanInt( num_successful ) + ' successful'
        
        if num_successful_and_new > 0:
            
            # some new, some possibly redundant--note the redundant portion
            if num_successful_but_redundant > 0:
                
                s += ' (' + HydrusData.ToHumanInt( num_successful_but_redundant ) + ' already in db)'
                
            
        else:
            
            # every success was a file we already had
            s += ' (all already in db)'
            
        
        status_strings.append( s )
        
    
    # the remaining categories all follow the same 'N label' pattern
    for ( count, label ) in ( ( num_ignored, ' ignored' ), ( num_deleted, ' previously deleted' ), ( num_failed, ' failed' ), ( num_skipped, ' skipped' ) ):
        
        if count > 0:
            
            status_strings.append( HydrusData.ToHumanInt( count ) + label )
            
        
    
    status = ', '.join( status_strings )
    
    #
    
    total = sum( statuses_to_counts.values() )
    
    total_processed = total - num_unknown
    
    #
    
    simple_status = ''
    
    if total > 0:
        
        if num_unknown > 0:
            
            # still work to do--show 'x/y' progress
            simple_status += HydrusData.ConvertValueRangeToPrettyString( total_processed, total )
            
        else:
            
            # all done--just show the count
            simple_status += HydrusData.ToHumanInt( total_processed )
            
        
        simple_status_strings = []
        
        if num_ignored > 0:
            
            simple_status_strings.append( HydrusData.ToHumanInt( num_ignored ) + 'Ig' )
            
        
        show_deleted_on_file_seed_short_summary = HG.client_controller.new_options.GetBoolean( 'show_deleted_on_file_seed_short_summary' )
        
        if show_deleted_on_file_seed_short_summary and num_deleted > 0:
            
            simple_status_strings.append( HydrusData.ToHumanInt( num_deleted ) + 'D' )
            
        
        if num_failed > 0:
            
            simple_status_strings.append( HydrusData.ToHumanInt( num_failed ) + 'F' )
            
        
        if num_skipped > 0:
            
            simple_status_strings.append( HydrusData.ToHumanInt( num_skipped ) + 'S' )
            
        
        if len( simple_status_strings ) > 0:
            
            simple_status += ' - ' + ''.join( simple_status_strings )
            
        
    
    return ( status, simple_status, ( total_processed, total ) )
|
|
|
|
class FileImportJob( object ):
    
    # Holds all state for importing one file from a temp path: the raw file's
    # hash and pre-import db status, derived file info, thumbnail, and hashes.
    
    def __init__( self, temp_path, file_import_options = None ):
        
        # temp_path: path to the candidate file on disk
        # file_import_options: if None, the client's 'loud' defaults are used
        
        if file_import_options is None:
            
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
            
        
        self._temp_path = temp_path
        self._file_import_options = file_import_options
        
        # populated by GenerateHashAndStatus
        self._hash = None
        self._pre_import_status = None
        
        # populated by GenerateInfo
        self._file_info = None
        self._thumbnail = None
        self._phashes = None
        self._extra_hashes = None
        
    
    def CheckIsGoodToImport( self ):
        
        # raises if the file fails the import options' size/mime/resolution rules
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        self._file_import_options.CheckFileIsValid( size, mime, width, height )
        
    
    def GetExtraHashes( self ):
        
        # md5/sha1/sha512 etc., or None before GenerateInfo has run
        return self._extra_hashes
        
    
    def GetFileImportOptions( self ):
        
        return self._file_import_options
        
    
    def GetFileInfo( self ):
        
        # the ( size, mime, width, height, duration, num_frames, num_words ) tuple
        return self._file_info
        
    
    def GetHash( self ):
        
        # sha256, or None before GenerateHashAndStatus has run
        return self._hash
        
    
    def GetMime( self ):
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        return mime
        
    
    def GetPreImportStatus( self ):
        
        return self._pre_import_status
        
    
    def GetPHashes( self ):
        
        # shape perceptual hashes, or None if not generated/applicable
        return self._phashes
        
    
    def GetTempPathAndThumbnail( self ):
        
        return ( self._temp_path, self._thumbnail )
        
    
    def PubsubContentUpdates( self ):
        
        # if we re-encountered a file we already have and the options say so,
        # archive it now
        
        if self._pre_import_status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
            
            if self._file_import_options.AutomaticallyArchives():
                
                service_keys_to_content_updates = { CC.COMBINED_LOCAL_FILE_SERVICE_KEY : [ HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_ARCHIVE, set( ( self._hash, ) ) ) ] }
                
                HG.client_controller.Write( 'content_updates', service_keys_to_content_updates )
                
            
        
    
    def IsNewToDB( self ):
        
        # a genuinely unknown file is new; a previously deleted one counts as
        # new only when the options permit reimporting deleted files
        
        if self._pre_import_status == CC.STATUS_UNKNOWN:
            
            return True
            
        
        if self._pre_import_status == CC.STATUS_DELETED:
            
            if not self._file_import_options.ExcludesDeleted():
                
                return True
                
            
        
        return False
        
    
    def GenerateHashAndStatus( self ):
        
        # hash the temp file and look up what the db already knows about it
        
        HydrusImageHandling.ConvertToPngIfBmp( self._temp_path )
        
        self._hash = HydrusFileHandling.GetHashFromPath( self._temp_path )
        
        ( self._pre_import_status, hash, note ) = HG.client_controller.Read( 'hash_status', 'sha256', self._hash, prefix = 'recognised during import' )
        
        return ( self._pre_import_status, self._hash, note )
        
    
    def GenerateInfo( self ):
        
        # derive file info, thumbnail, perceptual hashes, and extra hashes
        
        mime = HydrusFileHandling.GetMime( self._temp_path )
        
        if mime in HC.DECOMPRESSION_BOMB_IMAGES and not self._file_import_options.AllowsDecompressionBombs():
            
            if HydrusImageHandling.IsDecompressionBomb( self._temp_path ):
                
                raise HydrusExceptions.DecompressionBombException( 'Image seems to be a Decompression Bomb!' )
                
            
        
        self._file_info = HydrusFileHandling.GetFileInfo( self._temp_path, mime )
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        if mime in HC.MIMES_WITH_THUMBNAILS:
            
            percentage_in = HG.client_controller.new_options.GetInteger( 'video_thumbnail_percentage_in' )
            
            self._thumbnail = HydrusFileHandling.GenerateThumbnail( self._temp_path, mime, percentage_in = percentage_in )
            
        
        if mime in HC.MIMES_WE_CAN_PHASH:
            
            self._phashes = ClientImageHandling.GenerateShapePerceptualHashes( self._temp_path, mime )
            
        
        self._extra_hashes = HydrusFileHandling.GetExtraHashesFromPath( self._temp_path )
        
    
|
|
|
|
|
|
# The two kinds of file seed: a local file path on disk, or a URL to download.
FILE_SEED_TYPE_HDD = 0
FILE_SEED_TYPE_URL = 1
|
|
|
|
class FileSeed( HydrusSerialisable.SerialisableBase ):
    
    # One import work item: a path or url, plus everything learned while
    # working on it--status/note, tags, known urls, hashes, and timestamps.
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED
    SERIALISABLE_NAME = 'File Import'
    SERIALISABLE_VERSION = 2
|
|
|
|
    def __init__( self, file_seed_type = None, file_seed_data = None ):
        
        # file_seed_type: FILE_SEED_TYPE_HDD (a path) or FILE_SEED_TYPE_URL
        # file_seed_data: the path or url string itself
        
        if file_seed_type is None:
            
            file_seed_type = FILE_SEED_TYPE_URL
            
        
        if file_seed_data is None:
            
            # joke placeholder default
            file_seed_data = 'https://big-guys.4u/monica_lewinsky_hott.tiff.exe.vbs'
            
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.file_seed_type = file_seed_type
        self.file_seed_data = file_seed_data
        
        # bookkeeping timestamps and current import status
        self.created = HydrusData.GetNow()
        self.modified = self.created
        self.source_time = None
        self.status = CC.STATUS_UNKNOWN
        self.note = ''
        
        self._referral_url = None
        
        # metadata accumulated while parsing/downloading
        self._urls = set()
        self._tags = set()
        self._hashes = {}
        
    
|
|
|
|
|
|
def __eq__( self, other ):
|
|
|
|
return self.__hash__() == other.__hash__()
|
|
|
|
|
|
def __hash__( self ):
|
|
|
|
return ( self.file_seed_type, self.file_seed_data ).__hash__()
|
|
|
|
|
|
def __ne__( self, other ):
|
|
|
|
return self.__hash__() != other.__hash__()
|
|
|
|
|
|
    def _CheckTagsBlacklist( self, tags, tag_import_options ):
        
        # raises if any of the given tags hit the options' blacklist
        tag_import_options.CheckBlacklist( tags )
        
    
|
|
|
|
|
|
    def _GetSerialisableInfo( self ):
        
        # sets and raw binary hashes are not json-friendly--convert to lists/hex
        serialisable_urls = list( self._urls )
        serialisable_tags = list( self._tags )
        serialisable_hashes = [ ( hash_type, hash.encode( 'hex' ) ) for ( hash_type, hash ) in self._hashes.items() if hash is not None ]
        
        # tuple order must match _InitialiseFromSerialisableInfo and version 2
        return ( self.file_seed_type, self.file_seed_data, self.created, self.modified, self.source_time, self.status, self.note, self._referral_url, serialisable_urls, serialisable_tags, serialisable_hashes )
        
    
|
|
|
|
|
|
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self.file_seed_type, self.file_seed_data, self.created, self.modified, self.source_time, self.status, self.note, self._referral_url, serialisable_urls, serialisable_tags, serialisable_hashes ) = serialisable_info
        
        # rebuild sets and hex-decode hashes back to binary (py2 str.decode)
        self._urls = set( serialisable_urls )
        self._tags = set( serialisable_tags )
        self._hashes = { hash_type : encoded_hash.decode( 'hex' ) for ( hash_type, encoded_hash ) in serialisable_hashes if encoded_hash is not None }
        
    
|
|
|
|
|
|
def _NormaliseAndFilterAssociableURLs( self, urls ):
|
|
|
|
normalised_urls = set()
|
|
|
|
for url in urls:
|
|
|
|
try:
|
|
|
|
url = HG.client_controller.network_engine.domain_manager.NormaliseURL( url )
|
|
|
|
except HydrusExceptions.URLMatchException:
|
|
|
|
continue # not a url--something like "file:///C:/Users/Tall%20Man/Downloads/maxresdefault.jpg" ha ha ha
|
|
|
|
|
|
normalised_urls.add( url )
|
|
|
|
|
|
associable_urls = { url for url in normalised_urls if HG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
|
|
|
|
return associable_urls
|
|
|
|
|
|
def _SetupTagImportOptions( self, given_tag_import_options ):
|
|
|
|
if given_tag_import_options.IsDefault():
|
|
|
|
if self.IsAPostURL():
|
|
|
|
tio_lookup_url = self.file_seed_data
|
|
|
|
else:
|
|
|
|
if self._referral_url is not None:
|
|
|
|
tio_lookup_url = self._referral_url
|
|
|
|
else:
|
|
|
|
tio_lookup_url = self.file_seed_data
|
|
|
|
|
|
|
|
tag_import_options = HG.client_controller.network_engine.domain_manager.GetDefaultTagImportOptionsForURL( tio_lookup_url )
|
|
|
|
else:
|
|
|
|
tag_import_options = given_tag_import_options
|
|
|
|
|
|
return tag_import_options
|
|
|
|
|
|
    def _UpdateModified( self ):
        
        # stamp 'now' on the modified time; called after any metadata change
        self.modified = HydrusData.GetNow()
        
    
|
|
|
|
|
|
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            # version 2 added the referral url to the serialised tuple
            
            ( file_seed_type, file_seed_data, created, modified, source_time, status, note, serialisable_urls, serialisable_tags, serialisable_hashes ) = old_serialisable_info
            
            referral_url = None
            
            new_serialisable_info = ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_urls, serialisable_tags, serialisable_hashes )
            
            return ( 2, new_serialisable_info )
            
        
    
|
|
|
|
|
|
|
|
def AddParseResults( self, parse_results ):
|
|
|
|
for ( hash_type, hash ) in ClientParsing.GetHashesFromParseResults( parse_results ):
|
|
|
|
if hash_type not in self._hashes:
|
|
|
|
self._hashes[ hash_type ] = hash
|
|
|
|
|
|
|
|
urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_SOURCE, ) )
|
|
|
|
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
|
|
|
|
associable_urls.discard( self.file_seed_data )
|
|
|
|
self._urls.update( associable_urls )
|
|
|
|
tags = ClientParsing.GetTagsFromParseResults( parse_results )
|
|
|
|
self._tags.update( tags )
|
|
|
|
source_timestamp = ClientParsing.GetTimestampFromParseResults( parse_results, HC.TIMESTAMP_TYPE_SOURCE )
|
|
|
|
source_timestamp = min( HydrusData.GetNow() - 30, source_timestamp )
|
|
|
|
if source_timestamp is not None:
|
|
|
|
self.source_time = source_timestamp
|
|
|
|
|
|
self._UpdateModified()
|
|
|
|
|
|
def AddTags( self, tags ):
|
|
|
|
tags = HydrusTags.CleanTags( tags )
|
|
|
|
self._tags.update( tags )
|
|
|
|
self._UpdateModified()
|
|
|
|
|
|
def AddURL( self, url ):
|
|
|
|
urls = ( url, )
|
|
|
|
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
|
|
|
|
associable_urls.discard( self.file_seed_data )
|
|
|
|
self._urls.update( associable_urls )
|
|
|
|
|
|
    def CheckPreFetchMetadata( self, tag_import_options ):
        
        # called before downloading--veto now if known tags are blacklisted
        self._CheckTagsBlacklist( self._tags, tag_import_options )
        
    
|
|
|
|
|
|
    def DownloadAndImportRawFile( self, file_url, file_import_options, network_job_factory, network_job_presentation_context_factory, override_bandwidth = False ):
        
        # fetch file_url to a temp file and run the full import pipeline on it
        
        self.AddURL( file_url )
        
        ( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
        
        try:
            
            # when fetching a different (direct file) url, send our own url as
            # the referrer; otherwise fall back to any stored referral url
            if self.file_seed_data != file_url:
                
                referral_url = self.file_seed_data
                
            else:
                
                referral_url = self._referral_url
                
            
            network_job = network_job_factory( 'GET', file_url, temp_path = temp_path, referral_url = referral_url )
            
            if override_bandwidth:
                
                # wait up to 30s for bandwidth rather than obeying rules strictly
                network_job.OverrideBandwidth( 30 )
                
            
            network_job.SetFileImportOptions( file_import_options )
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            with network_job_presentation_context_factory( network_job ) as njpc:
                
                network_job.WaitUntilDone()
                
            
            self.Import( temp_path, file_import_options )
            
        finally:
            
            # always release the temp file, even on error
            HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
            
        
    
|
|
|
|
|
|
|
|
    def FetchPageMetadata( self, tag_import_options ):
        
        # intentionally a no-op here
        pass
        
    
|
|
|
|
|
|
def GetHash( self ):
|
|
|
|
if 'sha256' in self._hashes:
|
|
|
|
return self._hashes[ 'sha256' ]
|
|
|
|
|
|
return None
|
|
|
|
|
|
def GetSearchFileSeeds( self ):
|
|
|
|
if self.file_seed_type == FILE_SEED_TYPE_URL:
|
|
|
|
search_urls = ClientNetworkingDomain.GetSearchURLs( self.file_seed_data )
|
|
|
|
search_file_seeds = [ FileSeed( FILE_SEED_TYPE_URL, search_url ) for search_url in search_urls ]
|
|
|
|
else:
|
|
|
|
search_file_seeds = [ self ]
|
|
|
|
|
|
return search_file_seeds
|
|
|
|
|
|
def HasHash( self ):
|
|
|
|
return self.GetHash() is not None
|
|
|
|
|
|
def Import( self, temp_path, file_import_options ):
|
|
|
|
file_import_job = FileImportJob( temp_path, file_import_options )
|
|
|
|
( status, hash, note ) = HG.client_controller.client_files_manager.ImportFile( file_import_job )
|
|
|
|
self.SetStatus( status, note = note )
|
|
self.SetHash( hash )
|
|
|
|
|
|
    def ImportPath( self, file_seed_cache, file_import_options, limited_mimes = None ):
        
        # import a file from a local path, copying to a temp location first;
        # failures are recorded as this seed's status rather than raised
        
        try:
            
            if self.file_seed_type != FILE_SEED_TYPE_HDD:
                
                raise HydrusExceptions.VetoException( 'Attempted to import as a path, but I do not think I am a path!' )
                
            
            path = self.file_seed_data
            
            if not os.path.exists( path ):
                
                raise HydrusExceptions.VetoException( 'Source file does not exist!' )
                
            
            if limited_mimes is not None:
                
                mime = HydrusFileHandling.GetMime( path )
                
                if mime not in limited_mimes:
                    
                    raise HydrusExceptions.VetoException( 'Not in allowed mimes!' )
                    
                
            
            # work on a copy so the original file is never touched
            ( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
            
            try:
                
                copied = HydrusPaths.MirrorFile( path, temp_path )
                
                if not copied:
                    
                    raise Exception( 'File failed to copy to temp path--see log for error.' )
                    
                
                self.Import( temp_path, file_import_options )
                
            finally:
                
                HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                
            
        except HydrusExceptions.MimeException as e:
            
            self.SetStatus( CC.STATUS_ERROR, exception = e )
            
        except HydrusExceptions.VetoException as e:
            
            self.SetStatus( CC.STATUS_VETOED, note = HydrusData.ToUnicode( e ) )
            
        except Exception as e:
            
            self.SetStatus( CC.STATUS_ERROR, exception = e )
            
        
        # let listening UI know this seed's status may have changed
        file_seed_cache.NotifyFileSeedsUpdated( ( self, ) )
        
    
|
|
|
|
|
|
def IsAPostURL( self ):
|
|
|
|
if self.file_seed_type == FILE_SEED_TYPE_URL:
|
|
|
|
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
|
|
|
|
if url_type == HC.URL_TYPE_POST:
|
|
|
|
return True
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
def Normalise( self ):
|
|
|
|
if self.file_seed_type == FILE_SEED_TYPE_URL:
|
|
|
|
self.file_seed_data = HG.client_controller.network_engine.domain_manager.NormaliseURL( self.file_seed_data )
|
|
|
|
|
|
|
|
    def PredictPreImportStatus( self, file_import_options, tag_import_options, file_url = None ):
        
        # guess our status from what the db already knows about our urls and
        # hashes, then decide what (if anything) still needs downloading;
        # returns ( should_download_metadata, should_download_file )
        
        # atm, if url recognised, then hash is always recognised because url sets sha256 hash wew
        # however, we will move these to now be 'setting' methods soonish, and do the status/note/etc... set here, at which point this will be _more_ accurate, at least first time around
        # the should_download_file test should take into account future url/hash checkboxes in a similar way, which this will matter more
        # and in fact, it may be appropriate to not actually do/'trust' url/hash status sets if the file ones are checked
        
        url_recognised_and_file_already_in_db = self.PredictPreImportStatusURL( file_import_options, file_url = file_url )
        hash_recognised_and_file_already_in_db = self.PredictPreImportStatusHash( file_import_options )
        
        should_download_metadata = self.status == CC.STATUS_UNKNOWN # if the file is unknown, we need the metadata to get the file_url!
        
        if not should_download_metadata and tag_import_options.WorthFetchingTags():
            
            # even for a known file, the options may want a fetch just for tags
            url_override = url_recognised_and_file_already_in_db and tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB()
            hash_override = hash_recognised_and_file_already_in_db and tag_import_options.ShouldFetchTagsEvenIfHashKnownAndFileAlreadyInDB()
            
            if url_override or hash_override:
                
                should_download_metadata = True
                
            
        
        should_download_file = self.status == CC.STATUS_UNKNOWN
        
        return ( should_download_metadata, should_download_file )
        
    
|
|
|
|
|
|
    def PredictPreImportStatusHash( self, file_import_options ):
        
        # check any parsed hashes against the db to predict our status before
        # downloading; returns True if the file is recognised and already in db
        
        UNKNOWN_DEFAULT = ( CC.STATUS_UNKNOWN, None, '' )
        
        ( status, hash, note ) = UNKNOWN_DEFAULT
        
        # hashes
        
        if status == CC.STATUS_UNKNOWN:
            
            for ( hash_type, found_hash ) in self._hashes.items():
                
                ( status, hash, note ) = HG.client_controller.Read( 'hash_status', hash_type, found_hash )
                
                # first definitive answer wins
                if status != CC.STATUS_UNKNOWN:
                    
                    break
                    
                
            
        
        hash_recognised_and_file_already_in_db = status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT
        
        #
        
        if status == CC.STATUS_DELETED:
            
            # options may permit reimporting previously deleted files
            if not file_import_options.ExcludesDeleted():
                
                status = CC.STATUS_UNKNOWN
                note = ''
                
            
        
        # only set our status if nothing has decided it yet
        if self.status == CC.STATUS_UNKNOWN:
            
            self.status = status
            
            if hash is not None:
                
                self._hashes[ 'sha256' ] = hash
                
            
            self.note = note
            
            self._UpdateModified()
            
        
        return hash_recognised_and_file_already_in_db
        
    
|
|
|
|
|
|
def PredictPreImportStatusURL( self, file_import_options, file_url = None ):
|
|
|
|
UNKNOWN_DEFAULT = ( CC.STATUS_UNKNOWN, None, '' )
|
|
|
|
( status, hash, note ) = UNKNOWN_DEFAULT
|
|
|
|
# urls
|
|
|
|
urls = set( self._urls )
|
|
|
|
if file_url is not None:
|
|
|
|
urls.add( file_url )
|
|
|
|
|
|
if self.file_seed_type == FILE_SEED_TYPE_URL:
|
|
|
|
urls.add( self.file_seed_data )
|
|
|
|
|
|
unrecognised_url_results = set()
|
|
|
|
for url in urls:
|
|
|
|
if HG.client_controller.network_engine.domain_manager.URLCanReferToMultipleFiles( url ):
|
|
|
|
continue
|
|
|
|
|
|
# we now only trust url-matched single urls and the post/file urls
|
|
# trusting unmatched source urls was too much of a hassle with too many boorus providing bad source urls like user account pages
|
|
|
|
if HG.client_controller.network_engine.domain_manager.URLDefinitelyRefersToOneFile( url ) or url in ( self.file_seed_data, file_url ):
|
|
|
|
results = HG.client_controller.Read( 'url_statuses', url )
|
|
|
|
if len( results ) == 0: # if no match found, no useful data discovered
|
|
|
|
continue
|
|
|
|
elif len( results ) > 1: # if more than one file claims this url, it cannot be relied on to guess the file
|
|
|
|
continue
|
|
|
|
else: # i.e. 1 match found
|
|
|
|
( status, hash, note ) = results[0]
|
|
|
|
if status != CC.STATUS_UNKNOWN:
|
|
|
|
# a known one-file url has given a single clear result. sounds good
|
|
|
|
we_have_a_match = True
|
|
|
|
if self.file_seed_type == FILE_SEED_TYPE_URL:
|
|
|
|
# to double-check, let's see if the file that claims that url has any other interesting urls
|
|
# if the file has another url with the same url class as ours, then this is prob an unreliable 'alternate' source url attribution, and untrustworthy
|
|
|
|
my_url = self.file_seed_data
|
|
|
|
if url != my_url:
|
|
|
|
my_url_match = HG.client_controller.network_engine.domain_manager.GetURLMatch( my_url )
|
|
|
|
( media_result, ) = HG.client_controller.Read( 'media_results', ( hash, ) )
|
|
|
|
this_files_urls = media_result.GetLocationsManager().GetURLs()
|
|
|
|
for this_files_url in this_files_urls:
|
|
|
|
if this_files_url != my_url:
|
|
|
|
this_url_match = HG.client_controller.network_engine.domain_manager.GetURLMatch( this_files_url )
|
|
|
|
if my_url_match == this_url_match:
|
|
|
|
# oh no, the file this source url refers to has a different known url in this same domain
|
|
# it is more likely that an edit on this site points to the original elsewhere
|
|
|
|
( status, hash, note ) = UNKNOWN_DEFAULT
|
|
|
|
we_have_a_match = False
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if we_have_a_match:
|
|
|
|
break # if a known one-file url gives a single clear result, that result is reliable
|
|
|
|
|
|
|
|
|
|
|
|
|
|
url_recognised_and_file_already_in_db = status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT
|
|
|
|
#
|
|
|
|
if status == CC.STATUS_DELETED:
|
|
|
|
if not file_import_options.ExcludesDeleted():
|
|
|
|
status = CC.STATUS_UNKNOWN
|
|
note = ''
|
|
|
|
|
|
|
|
if self.status == CC.STATUS_UNKNOWN:
|
|
|
|
self.status = status
|
|
|
|
if hash is not None:
|
|
|
|
self._hashes[ 'sha256' ] = hash
|
|
|
|
|
|
self.note = note
|
|
|
|
self._UpdateModified()
|
|
|
|
|
|
return url_recognised_and_file_already_in_db
|
|
|
|
|
|
def PresentToPage( self, page_key ):
|
|
|
|
hash = self.GetHash()
|
|
|
|
if hash is not None:
|
|
|
|
( media_result, ) = HG.client_controller.Read( 'media_results', ( hash, ) )
|
|
|
|
HG.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
|
|
|
|
|
|
|
|
    def SetHash( self, hash ):
        
        # a None hash means 'unknown'--do not clobber anything we have
        if hash is not None:
            
            self._hashes[ 'sha256' ] = hash
            
        
    
|
|
|
|
|
|
|
|
    def SetReferralURL( self, referral_url ):
        
        # the url that led us here; used as http referrer and for tag options lookup
        self._referral_url = referral_url
        
    
|
|
|
|
|
|
    def SetStatus( self, status, note = '', exception = None ):
        
        # set this seed's import status; when an exception is supplied, the
        # note is built from it and the full trace goes to the log
        
        if exception is not None:
            
            first_line = HydrusData.ToUnicode( exception ).split( os.linesep )[0]
            
            note = first_line + u'\u2026 (Copy note to see full error)'
            note += os.linesep
            note += HydrusData.ToUnicode( traceback.format_exc() )
            
            HydrusData.Print( 'Error when processing ' + self.file_seed_data + ' !' )
            HydrusData.Print( traceback.format_exc() )
            
        
        self.status = status
        self.note = note
        
        self._UpdateModified()
        
    
|
|
|
|
|
|
    def ShouldPresent( self, file_import_options, in_inbox = None ):
        
        # should this seed's file be published to the UI? in_inbox may be
        # passed in to save a db hit; otherwise it is fetched only when the
        # options actually depend on inbox state
        
        hash = self.GetHash()
        
        if hash is not None and self.status in CC.SUCCESSFUL_IMPORT_STATES:
            
            if in_inbox is None:
                
                # try to answer without touching the db first
                if file_import_options.ShouldPresentIgnorantOfInbox( self.status ):
                    
                    return True
                    
                
                if file_import_options.ShouldNotPresentIgnorantOfInbox( self.status ):
                    
                    return False
                    
                
                # inbox state matters--ask the db
                in_inbox = HG.client_controller.Read( 'in_inbox', hash )
                
            
            if file_import_options.ShouldPresent( self.status, in_inbox ):
                
                return True
                
            
        
        return False
        
    
|
|
|
|
|
|
def WorksInNewSystem( self ):
|
|
|
|
if self.file_seed_type == FILE_SEED_TYPE_URL:
|
|
|
|
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
|
|
|
|
if url_type == HC.URL_TYPE_FILE:
|
|
|
|
return True
|
|
|
|
|
|
if url_type == HC.URL_TYPE_POST and can_parse:
|
|
|
|
return True
|
|
|
|
|
|
if url_type == HC.URL_TYPE_UNKNOWN and self._referral_url is not None: # this is likely be a multi-file child of a post url file_seed
|
|
|
|
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self._referral_url )
|
|
|
|
if url_type == HC.URL_TYPE_POST: # we must have got here through parsing that m8, so let's assume this is an unrecognised file url
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
    def WorkOnURL( self, file_seed_cache, status_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, tag_import_options ):
        
        # the main url-import state machine: check status, fetch/parse post
        # pages, spawn child seeds for multi-file posts, and download files;
        # returns True if substantial (bandwidth/db) work was done
        
        did_substantial_work = False
        
        try:
            
            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
            
            if url_type not in ( HC.URL_TYPE_POST, HC.URL_TYPE_FILE, HC.URL_TYPE_UNKNOWN ):
                
                raise HydrusExceptions.VetoException( 'Did not recognise this as a File or Post URL!' )
                
            
            if url_type == HC.URL_TYPE_POST and not can_parse:
                
                raise HydrusExceptions.VetoException( 'Did not have a parser for this URL!' )
                
            
            tag_import_options = self._SetupTagImportOptions( tag_import_options )
            
            status_hook( 'checking url status' )
            
            ( should_download_metadata, should_download_file ) = self.PredictPreImportStatus( file_import_options, tag_import_options )
            
            if self.IsAPostURL():
                
                if should_download_metadata:
                    
                    did_substantial_work = True
                    
                    post_url = self.file_seed_data
                    
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
                    
                    status_hook( 'downloading page' )
                    
                    # do not send ourselves as our own referrer
                    if self._referral_url not in ( post_url, url_to_check ):
                        
                        referral_url = self._referral_url
                        
                    else:
                        
                        referral_url = None
                        
                    
                    network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
                    
                    HG.client_controller.network_engine.AddJob( network_job )
                    
                    with network_job_presentation_context_factory( network_job ) as njpc:
                        
                        network_job.WaitUntilDone()
                        
                    
                    data = network_job.GetContent()
                    
                    parsing_context = {}
                    
                    parsing_context[ 'post_url' ] = post_url
                    parsing_context[ 'url' ] = url_to_check
                    
                    all_parse_results = parser.Parse( parsing_context, data )
                    
                    if len( all_parse_results ) == 0:
                        
                        raise HydrusExceptions.VetoException( 'No data found in document!' )
                        
                    elif len( all_parse_results ) > 1:
                        
                        # several posts in one page--spin each off as its own seed
                        
                        file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, self.file_seed_data )
                        
                        ( num_urls_added, num_urls_already_in_file_seed_cache, can_add_more_file_urls, stop_reason ) = ClientImporting.UpdateFileSeedCacheWithFileSeeds( file_seed_cache, file_seeds )
                        
                        status = CC.STATUS_SUCCESSFUL_AND_NEW
                        note = 'Found ' + HydrusData.ToHumanInt( num_urls_added ) + ' new URLs.'
                        
                        self.SetStatus( status, note = note )
                        
                    else:
                        
                        parse_results = all_parse_results[0]
                        
                        self.AddParseResults( parse_results )
                        
                        self.CheckPreFetchMetadata( tag_import_options )
                        
                        desired_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
                        
                        child_urls = []
                        
                        if len( desired_urls ) == 0:
                            
                            raise HydrusExceptions.VetoException( 'Could not find a file or post URL to download!' )
                            
                        elif len( desired_urls ) == 1:
                            
                            desired_url = desired_urls[0]
                            
                            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( desired_url )
                            
                            if url_type in ( HC.URL_TYPE_FILE, HC.URL_TYPE_UNKNOWN ):
                                
                                file_url = desired_url
                                
                                # re-predict now that we know the direct file url
                                ( should_download_metadata, should_download_file ) = self.PredictPreImportStatus( file_import_options, tag_import_options, file_url )
                                
                                if should_download_file:
                                    
                                    status_hook( 'downloading file' )
                                    
                                    self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory, override_bandwidth = True )
                                    
                                
                            elif url_type == HC.URL_TYPE_POST and can_parse:
                                
                                # a pixiv mode=medium page has spawned a mode=manga page, so we need a new file_seed to go pursue that
                                
                                child_urls = [ desired_url ]
                                
                            else:
                                
                                raise HydrusExceptions.VetoException( 'Found a URL--' + desired_url + '--but could not understand/parse it!' )
                                
                            
                        else:
                            
                            child_urls = desired_urls
                            
                        
                        if len( child_urls ) > 0:
                            
                            child_file_seeds = []
                            
                            for child_url in child_urls:
                                
                                duplicate_file_seed = self.Duplicate() # inherits all urls and tags from here
                                
                                duplicate_file_seed.file_seed_data = child_url
                                
                                duplicate_file_seed.SetReferralURL( self.file_seed_data )
                                
                                if self._referral_url is not None:
                                    
                                    duplicate_file_seed.AddURL( self._referral_url )
                                    
                                
                                child_file_seeds.append( duplicate_file_seed )
                                
                            
                            # insert the children right after us in the queue
                            try:
                                
                                my_index = file_seed_cache.GetFileSeedIndex( self )
                                
                                insertion_index = my_index + 1
                                
                            except:
                                
                                insertion_index = len( file_seed_cache )
                                
                            
                            file_seed_cache.InsertFileSeeds( insertion_index, child_file_seeds )
                            
                            status = CC.STATUS_SUCCESSFUL_AND_NEW
                            note = 'Found ' + HydrusData.ToHumanInt( len( child_urls ) ) + ' new URLs.'
                            
                            self.SetStatus( status, note = note )
                            
                        
                    
                
            else:
                
                # a direct file (or unknown) url--just download it
                if should_download_file:
                    
                    did_substantial_work = True
                    
                    file_url = self.file_seed_data
                    
                    status_hook( 'downloading file' )
                    
                    self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
                    
                
            
            did_substantial_work |= self.WriteContentUpdates( tag_import_options )
            
        except HydrusExceptions.ShutdownException:
            
            return False
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            
            note = HydrusData.ToUnicode( e )
            
            self.SetStatus( status, note = note )
            
            if isinstance( e, HydrusExceptions.CancelledException ):
                
                status_hook( 'cancelled!' )
                
                time.sleep( 2 )
                
            
        except HydrusExceptions.ForbiddenException:
            
            status = CC.STATUS_VETOED
            note = '403'
            
            self.SetStatus( status, note = note )
            
            status_hook( '403' )
            
            time.sleep( 2 )
            
        except HydrusExceptions.NotFoundException:
            
            status = CC.STATUS_VETOED
            note = '404'
            
            self.SetStatus( status, note = note )
            
            status_hook( '404' )
            
            time.sleep( 2 )
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            self.SetStatus( status, exception = e )
            
            status_hook( 'error!' )
            
            time.sleep( 3 )
            
        
        # let listening UI know this seed's status may have changed
        file_seed_cache.NotifyFileSeedsUpdated( ( self, ) )
        
        return did_substantial_work
        
    
|
|
|
|
|
|
    def WriteContentUpdates( self, tag_import_options = None ):
        # Push this seed's associable URLs and (when tag_import_options is given)
        # its parsed tags to the database as content updates.
        # Returns True if anything was actually written.
        
        did_work = False
        
        # an errored seed's data is not trustworthy, so write nothing
        if self.status == CC.STATUS_ERROR:
            
            return did_work
            
        
        hash = self.GetHash()
        
        # no file hash means there is nothing to attach the urls/tags to
        if hash is None:
            
            return did_work
            
        
        service_keys_to_content_updates = collections.defaultdict( list )
        
        urls = set( self._urls )
        
        # a url-type seed's own data is a url worth associating too
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            urls.add( self.file_seed_data )
            
        
        if self._referral_url is not None:
            
            urls.add( self._referral_url )
            
        
        associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
        
        if len( associable_urls ) > 0:
            
            content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( associable_urls, ( hash, ) ) )
            
            service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
            
        
        if tag_import_options is not None:
            
            # inbox state is handed to the options object; presumably it affects
            # which tag actions apply--confirm against GetServiceKeysToContentUpdates
            in_inbox = HG.client_controller.Read( 'in_inbox', hash )
            
            for ( service_key, content_updates ) in tag_import_options.GetServiceKeysToContentUpdates( self.status, in_inbox, hash, set( self._tags ) ).items():
                
                service_keys_to_content_updates[ service_key ].extend( content_updates )
                
            
        
        if len( service_keys_to_content_updates ) > 0:
            
            # write the accumulated updates in one synchronous call
            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
            
            did_work = True
            
        
        return did_work
|
|
|
|
|
|
# register FileSeed with the serialisation system so stored blobs can be rehydrated
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED ] = FileSeed
|
|
|
|
class FileSeedCache( HydrusSerialisable.SerialisableBase ):
    """An ordered, deduplicated queue of FileSeeds with a lazily cached aggregate status."""
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE
    SERIALISABLE_NAME = 'Import File Status Cache'
    SERIALISABLE_VERSION = 8
    
    # Compact() always preserves at least this many of the newest seeds
    COMPACT_NUMBER = 100
|
|
|
|
    def __init__( self ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        # ordered list of FileSeed objects; this is what gets serialised
        self._file_seeds = HydrusSerialisable.SerialisableList()
        
        # reverse lookup: FileSeed -> its position in self._file_seeds
        self._file_seeds_to_indices = {}
        
        # identity key used on the 'file_seed_cache_file_seeds_updated' pubsub
        self._file_seed_cache_key = HydrusData.GenerateKey()
        
        # lazily generated ( status, simple_status, ( value, range ) ) tuple
        self._status_cache = None
        
        self._status_cache_generation_time = 0
        
        # True when seeds have changed since the last _GenerateStatus
        self._status_dirty = True
        
        # non-reentrant lock guarding all mutable state above
        self._lock = threading.Lock()
|
def __len__( self ):
|
|
|
|
return len( self._file_seeds )
|
|
|
|
|
|
def _GenerateStatus( self ):
|
|
|
|
self._status_cache = GenerateStatusesToCountsStatus( self._GetStatusesToCounts() )
|
|
self._status_cache_generation_time = HydrusData.GetNow()
|
|
|
|
self._status_dirty = False
|
|
|
|
|
|
def _GetFileSeeds( self, status = None ):
|
|
|
|
if status is None:
|
|
|
|
return list( self._file_seeds )
|
|
|
|
else:
|
|
|
|
return [ file_seed for file_seed in self._file_seeds if file_seed.status == status ]
|
|
|
|
|
|
|
|
def _GetSerialisableInfo( self ):
|
|
|
|
with self._lock:
|
|
|
|
return self._file_seeds.GetSerialisableTuple()
|
|
|
|
|
|
|
|
def _GetSourceTimestamp( self, file_seed ):
|
|
|
|
source_timestamp = file_seed.source_time
|
|
|
|
if source_timestamp is None:
|
|
|
|
# decent fallback compromise
|
|
# -30 since added and 'last check' timestamps are often the same, and this messes up calculations
|
|
|
|
source_timestamp = file_seed.created - 30
|
|
|
|
|
|
return source_timestamp
|
|
|
|
|
|
def _GetStatusesToCounts( self ):
|
|
|
|
statuses_to_counts = collections.Counter()
|
|
|
|
for file_seed in self._file_seeds:
|
|
|
|
statuses_to_counts[ file_seed.status ] += 1
|
|
|
|
|
|
return statuses_to_counts
|
|
|
|
|
|
def _HasFileSeed( self, file_seed ):
|
|
|
|
search_file_seeds = file_seed.GetSearchFileSeeds()
|
|
|
|
has_file_seed = True in ( search_file_seed in self._file_seeds_to_indices for search_file_seed in search_file_seeds )
|
|
|
|
return has_file_seed
|
|
|
|
|
|
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
|
|
|
|
with self._lock:
|
|
|
|
self._file_seeds = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )
|
|
|
|
self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
|
|
|
|
|
|
|
|
    def _SetStatusDirty( self ):
        # flag the cached aggregate status for lazy regeneration on next fetch
        
        self._status_dirty = True
|
|
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        # Steps serialised data forward one schema version per call; the
        # serialisation system keeps calling this until SERIALISABLE_VERSION.
        
        if version == 1:
            
            # 1 -> 2: force stored notes to unicode
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                if 'note' in file_seed_info:
                    
                    file_seed_info[ 'note' ] = HydrusData.ToUnicode( file_seed_info[ 'note' ] )
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 2, new_serialisable_info )
            
        
        if version in ( 2, 3 ):
            
            # 2/3 -> 4: drop redirect-spam urls
            
            # gelbooru replaced their thumbnail links with this redirect spam
            # 'https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY4ODA1OA=='
            
            # I missed some http ones here, so I've broadened the test and rescheduled it
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                if 'gelbooru.com/redirect.php' in file_seed:
                    
                    continue
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 4, new_serialisable_info )
            
        
        if version == 4:
            
            # 4 -> 5: rewrite tumblr urls to raw-quality https and dedupe the results
            
            def ConvertRegularToRawURL( regular_url ):
                
                # convert this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
                # to this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # the 500 part can be a bunch of stuff, including letters
                
                url_components = regular_url.split( '_' )
                
                last_component = url_components[ -1 ]
                
                ( number_gubbins, file_ext ) = last_component.split( '.' )
                
                raw_last_component = 'raw.' + file_ext
                
                url_components[ -1 ] = raw_last_component
                
                raw_url = '_'.join( url_components )
                
                return raw_url
                
            
            def Remove68Subdomain( long_url ):
                
                # sometimes the 68 subdomain gives a 404 on the raw url, so:
                
                # convert this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # to this:
                # http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                
                # I am not sure if it is always 68, but let's not assume
                
                ( scheme, rest ) = long_url.split( '://', 1 )
                
                if rest.startswith( 'media.tumblr.com' ):
                    
                    return long_url
                    
                
                ( gumpf, shorter_rest ) = rest.split( '.', 1 )
                
                shorter_url = scheme + '://' + shorter_rest
                
                return shorter_url
                
            
            new_serialisable_info = []
            
            good_file_seeds = set()
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                try:
                    
                    parse = urlparse.urlparse( file_seed )
                    
                    if 'media.tumblr.com' in parse.netloc:
                        
                        file_seed = Remove68Subdomain( file_seed )
                        
                        file_seed = ConvertRegularToRawURL( file_seed )
                        
                        file_seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( file_seed )
                        
                    
                    if 'pixiv.net' in parse.netloc:
                        
                        file_seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( file_seed )
                        
                    
                    if file_seed in good_file_seeds: # we hit a dupe, so skip it
                        
                        continue
                        
                    
                except:
                    
                    # if the url failed to parse/convert, keep it as-is rather than lose it
                    pass
                    
                
                good_file_seeds.add( file_seed )
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 5, new_serialisable_info )
            
        
        if version == 5:
            
            # 5 -> 6: add the then-new source_timestamp field
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                file_seed_info[ 'source_timestamp' ] = None
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 6, new_serialisable_info )
            
        
        if version == 6:
            
            # 6 -> 7: move tumblr urls from the media subdomain to the data subdomain
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                try:
                    
                    magic_phrase = '//media.tumblr.com'
                    replacement = '//data.tumblr.com'
                    
                    if magic_phrase in file_seed:
                        
                        file_seed = file_seed.replace( magic_phrase, replacement )
                        
                    
                except:
                    
                    pass
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 7, new_serialisable_info )
            
        
        if version == 7:
            
            # 7 -> 8: convert the old ( text, info_dict ) tuples into full FileSeed objects
            
            file_seeds = HydrusSerialisable.SerialisableList()
            
            for ( file_seed_text, file_seed_info ) in old_serialisable_info:
                
                if file_seed_text.startswith( 'http' ):
                    
                    file_seed_type = FILE_SEED_TYPE_URL
                    
                else:
                    
                    file_seed_type = FILE_SEED_TYPE_HDD
                    
                
                file_seed = FileSeed( file_seed_type, file_seed_text )
                
                file_seed.status = file_seed_info[ 'status' ]
                file_seed.created = file_seed_info[ 'added_timestamp' ]
                file_seed.modified = file_seed_info[ 'last_modified_timestamp' ]
                file_seed.source_time = file_seed_info[ 'source_timestamp' ]
                file_seed.note = file_seed_info[ 'note' ]
                
                file_seeds.append( file_seed )
                
            
            new_serialisable_info = file_seeds.GetSerialisableTuple()
            
            return ( 8, new_serialisable_info )
|
|
|
def AddFileSeeds( self, file_seeds ):
|
|
|
|
if len( file_seeds ) == 0:
|
|
|
|
return 0
|
|
|
|
|
|
new_file_seeds = []
|
|
|
|
with self._lock:
|
|
|
|
for file_seed in file_seeds:
|
|
|
|
if self._HasFileSeed( file_seed ):
|
|
|
|
continue
|
|
|
|
|
|
file_seed.Normalise()
|
|
|
|
new_file_seeds.append( file_seed )
|
|
|
|
self._file_seeds.append( file_seed )
|
|
|
|
self._file_seeds_to_indices[ file_seed ] = len( self._file_seeds ) - 1
|
|
|
|
|
|
self._SetStatusDirty()
|
|
|
|
|
|
self.NotifyFileSeedsUpdated( new_file_seeds )
|
|
|
|
return len( new_file_seeds )
|
|
|
|
|
|
def AdvanceFileSeed( self, file_seed ):
|
|
|
|
with self._lock:
|
|
|
|
if file_seed in self._file_seeds_to_indices:
|
|
|
|
index = self._file_seeds_to_indices[ file_seed ]
|
|
|
|
if index > 0:
|
|
|
|
self._file_seeds.remove( file_seed )
|
|
|
|
self._file_seeds.insert( index - 1, file_seed )
|
|
|
|
|
|
self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
|
|
|
|
|
|
|
|
self.NotifyFileSeedsUpdated( ( file_seed, ) )
|
|
|
|
|
|
def CanCompact( self, compact_before_this_source_time ):
|
|
|
|
with self._lock:
|
|
|
|
if len( self._file_seeds ) <= self.COMPACT_NUMBER:
|
|
|
|
return False
|
|
|
|
|
|
for file_seed in self._file_seeds[:-self.COMPACT_NUMBER]:
|
|
|
|
if file_seed.status == CC.STATUS_UNKNOWN:
|
|
|
|
continue
|
|
|
|
|
|
if self._GetSourceTimestamp( file_seed ) < compact_before_this_source_time:
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
def Compact( self, compact_before_this_source_time ):
|
|
|
|
with self._lock:
|
|
|
|
if len( self._file_seeds ) <= self.COMPACT_NUMBER:
|
|
|
|
return
|
|
|
|
|
|
new_file_seeds = HydrusSerialisable.SerialisableList()
|
|
|
|
for file_seed in self._file_seeds[:-self.COMPACT_NUMBER]:
|
|
|
|
still_to_do = file_seed.status == CC.STATUS_UNKNOWN
|
|
still_relevant = self._GetSourceTimestamp( file_seed ) > compact_before_this_source_time
|
|
|
|
if still_to_do or still_relevant:
|
|
|
|
new_file_seeds.append( file_seed )
|
|
|
|
|
|
|
|
new_file_seeds.extend( self._file_seeds[-self.COMPACT_NUMBER:] )
|
|
|
|
self._file_seeds = new_file_seeds
|
|
self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
|
|
|
|
self._SetStatusDirty()
|
|
|
|
|
|
|
|
def DelayFileSeed( self, file_seed ):
|
|
|
|
with self._lock:
|
|
|
|
if file_seed in self._file_seeds_to_indices:
|
|
|
|
index = self._file_seeds_to_indices[ file_seed ]
|
|
|
|
if index < len( self._file_seeds ) - 1:
|
|
|
|
self._file_seeds.remove( file_seed )
|
|
|
|
self._file_seeds.insert( index + 1, file_seed )
|
|
|
|
|
|
self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
|
|
|
|
|
|
|
|
self.NotifyFileSeedsUpdated( ( file_seed, ) )
|
|
|
|
|
|
def GetEarliestSourceTime( self ):
|
|
|
|
with self._lock:
|
|
|
|
if len( self._file_seeds ) == 0:
|
|
|
|
return None
|
|
|
|
|
|
earliest_timestamp = min( ( self._GetSourceTimestamp( file_seed ) for file_seed in self._file_seeds ) )
|
|
|
|
|
|
return earliest_timestamp
|
|
|
|
|
|
    def GetFileSeedCacheKey( self ):
        # the identity key used on the 'file_seed_cache_file_seeds_updated' pubsub
        
        return self._file_seed_cache_key
|
|
|
|
|
|
def GetFileSeedCount( self, status = None ):
|
|
|
|
result = 0
|
|
|
|
with self._lock:
|
|
|
|
if status is None:
|
|
|
|
result = len( self._file_seeds )
|
|
|
|
else:
|
|
|
|
for file_seed in self._file_seeds:
|
|
|
|
if file_seed.status == status:
|
|
|
|
result += 1
|
|
|
|
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
def GetFileSeeds( self, status = None ):
|
|
|
|
with self._lock:
|
|
|
|
return self._GetFileSeeds( status )
|
|
|
|
|
|
|
|
def GetFileSeedIndex( self, file_seed ):
|
|
|
|
with self._lock:
|
|
|
|
return self._file_seeds_to_indices[ file_seed ]
|
|
|
|
|
|
|
|
def GetLatestAddedTime( self ):
|
|
|
|
with self._lock:
|
|
|
|
if len( self._file_seeds ) == 0:
|
|
|
|
return 0
|
|
|
|
|
|
latest_timestamp = max( ( file_seed.created for file_seed in self._file_seeds ) )
|
|
|
|
|
|
return latest_timestamp
|
|
|
|
|
|
def GetLatestSourceTime( self ):
|
|
|
|
with self._lock:
|
|
|
|
if len( self._file_seeds ) == 0:
|
|
|
|
return 0
|
|
|
|
|
|
latest_timestamp = max( ( self._GetSourceTimestamp( file_seed ) for file_seed in self._file_seeds ) )
|
|
|
|
|
|
return latest_timestamp
|
|
|
|
|
|
def GetNextFileSeed( self, status ):
|
|
|
|
with self._lock:
|
|
|
|
for file_seed in self._file_seeds:
|
|
|
|
if file_seed.status == status:
|
|
|
|
return file_seed
|
|
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
def GetNumNewFilesSince( self, since ):
|
|
|
|
num_files = 0
|
|
|
|
with self._lock:
|
|
|
|
for file_seed in self._file_seeds:
|
|
|
|
source_timestamp = self._GetSourceTimestamp( file_seed )
|
|
|
|
if source_timestamp >= since:
|
|
|
|
num_files += 1
|
|
|
|
|
|
|
|
|
|
return num_files
|
|
|
|
|
|
    def GetPresentedHashes( self, file_import_options ):
        # Return the hashes of seeds that should be shown to the user, according
        # to each seed's own ShouldPresent logic and the given import options.
        
        with self._lock:
            
            # only seeds that resolved to an actual file have a hash to present
            eligible_file_seeds = [ file_seed for file_seed in self._file_seeds if file_seed.HasHash() ]
            
            file_seed_hashes = [ file_seed.GetHash() for file_seed in eligible_file_seeds ]
            
            # NOTE(review): this controller read happens while holding the cache
            # lock--confirm it cannot stall other threads waiting on the cache
            inbox_hashes = HG.client_controller.Read( 'in_inbox', file_seed_hashes )
            
            hashes = []
            
            for file_seed in eligible_file_seeds:
                
                hash = file_seed.GetHash()
                
                in_inbox = hash in inbox_hashes
                
                if file_seed.ShouldPresent( file_import_options, in_inbox = in_inbox ):
                    
                    hashes.append( hash )
                    
                
            
            return hashes
|
|
|
|
|
|
|
|
def GetStatus( self ):
|
|
|
|
with self._lock:
|
|
|
|
if self._status_dirty:
|
|
|
|
self._GenerateStatus()
|
|
|
|
|
|
return self._status_cache
|
|
|
|
|
|
|
|
def GetStatusGenerationTime( self ):
|
|
|
|
with self._lock:
|
|
|
|
if self._status_dirty:
|
|
|
|
return HydrusData.GetNow()
|
|
|
|
|
|
return self._status_cache_generation_time
|
|
|
|
|
|
|
|
def GetStatusesToCounts( self ):
|
|
|
|
with self._lock:
|
|
|
|
return self._GetStatusesToCounts()
|
|
|
|
|
|
|
|
def GetValueRange( self ):
|
|
|
|
with self._lock:
|
|
|
|
if self._status_dirty:
|
|
|
|
self._GenerateStatus()
|
|
|
|
|
|
( status, simple_status, ( total_processed, total ) ) = self._status_cache
|
|
|
|
return ( total_processed, total )
|
|
|
|
|
|
|
|
def HasFileSeed( self, file_seed ):
|
|
|
|
with self._lock:
|
|
|
|
return self._HasFileSeed( file_seed )
|
|
|
|
|
|
|
|
    def InsertFileSeeds( self, index, file_seeds ):
        # Insert any not-yet-present seeds at the given position, preserving
        # their relative order. Returns the number actually inserted.
        
        if len( file_seeds ) == 0:
            
            return 0
            
        
        new_file_seeds = []
        
        with self._lock:
            
            # clamp so a stale index from the caller cannot overshoot the list
            index = min( index, len( self._file_seeds ) )
            
            for file_seed in file_seeds:
                
                if self._HasFileSeed( file_seed ):
                    
                    continue
                    
                
                file_seed.Normalise()
                
                new_file_seeds.append( file_seed )
                
                self._file_seeds.insert( index, file_seed )
                
                # bump so the next seed lands after this one, keeping input order
                index += 1
                
            
            # insertions shifted later positions, so rebuild the reverse lookup
            self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
            
            self._SetStatusDirty()
            
        
        # NotifyFileSeedsUpdated takes the lock itself, so call it after releasing
        self.NotifyFileSeedsUpdated( new_file_seeds )
        
        return len( new_file_seeds )
|
|
|
|
|
|
def NotifyFileSeedsUpdated( self, file_seeds ):
|
|
|
|
with self._lock:
|
|
|
|
self._SetStatusDirty()
|
|
|
|
|
|
HG.client_controller.pub( 'file_seed_cache_file_seeds_updated', self._file_seed_cache_key, file_seeds )
|
|
|
|
|
|
def RemoveFileSeeds( self, file_seeds ):
|
|
|
|
with self._lock:
|
|
|
|
file_seeds_to_delete = set( file_seeds )
|
|
|
|
self._file_seeds = HydrusSerialisable.SerialisableList( [ file_seed for file_seed in self._file_seeds if file_seed not in file_seeds_to_delete ] )
|
|
|
|
self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
|
|
|
|
self._SetStatusDirty()
|
|
|
|
|
|
self.NotifyFileSeedsUpdated( file_seeds_to_delete )
|
|
|
|
|
|
def RemoveFileSeedsByStatus( self, statuses_to_remove ):
|
|
|
|
with self._lock:
|
|
|
|
file_seeds_to_delete = [ file_seed for file_seed in self._file_seeds if file_seed.status in statuses_to_remove ]
|
|
|
|
|
|
self.RemoveFileSeeds( file_seeds_to_delete )
|
|
|
|
|
|
def RemoveAllButUnknownFileSeeds( self ):
|
|
|
|
with self._lock:
|
|
|
|
file_seeds_to_delete = [ file_seed for file_seed in self._file_seeds if file_seed.status != CC.STATUS_UNKNOWN ]
|
|
|
|
|
|
self.RemoveFileSeeds( file_seeds_to_delete )
|
|
|
|
|
|
def RetryFailures( self ):
|
|
|
|
with self._lock:
|
|
|
|
failed_file_seeds = self._GetFileSeeds( CC.STATUS_ERROR )
|
|
|
|
for file_seed in failed_file_seeds:
|
|
|
|
file_seed.SetStatus( CC.STATUS_UNKNOWN )
|
|
|
|
|
|
|
|
self.NotifyFileSeedsUpdated( failed_file_seeds )
|
|
|
|
|
|
def RetryIgnored( self ):
|
|
|
|
with self._lock:
|
|
|
|
ignored_file_seeds = self._GetFileSeeds( CC.STATUS_VETOED )
|
|
|
|
for file_seed in ignored_file_seeds:
|
|
|
|
file_seed.SetStatus( CC.STATUS_UNKNOWN )
|
|
|
|
|
|
|
|
self.NotifyFileSeedsUpdated( ignored_file_seeds )
|
|
|
|
|
|
def WorkToDo( self ):
|
|
|
|
with self._lock:
|
|
|
|
if self._status_dirty:
|
|
|
|
self._GenerateStatus()
|
|
|
|
|
|
( status, simple_status, ( total_processed, total ) ) = self._status_cache
|
|
|
|
return total_processed < total
|
|
|
|
|
|
|
|
# register FileSeedCache with the serialisation system so stored blobs can be rehydrated
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE ] = FileSeedCache
|