# hydrus/hydrus/client/importing/ClientImportFileSeeds.py
import collections
import itertools
import os
import random
import threading
import time
import traceback
import typing
import urllib.parse

from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.core import HydrusExceptions
from hydrus.core import HydrusFileHandling
from hydrus.core import HydrusImageHandling
from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusPaths
from hydrus.core import HydrusSerialisable
from hydrus.core import HydrusTags

from hydrus.client import ClientConstants as CC
from hydrus.client import ClientData
from hydrus.client import ClientImageHandling
from hydrus.client import ClientParsing
from hydrus.client import ClientPaths
from hydrus.client.importing import ClientImporting
from hydrus.client.importing import ClientImportOptions
from hydrus.client.metadata import ClientTags
from hydrus.client.networking import ClientNetworkingDomain
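# An overview of this module, for orientation:
#
# - FileImportJob does the per-file work of an import: hashing, metadata, thumbnail,
#   and the final handoff to the client files manager and database.
# - FileSeed is one item in an import queue, either a local path or a URL, and carries
#   the hashes, tags and URLs discovered for it along the way.
# - FileSeedCacheStatus summarises a queue's per-status counts for the UI.
# - FileSeedCache is the ordered, serialisable queue of file seeds itself.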
class FileImportJob( object ):
    
    def __init__( self, temp_path, file_import_options = None ):
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job created for path {}.'.format( temp_path ) )
        
        if file_import_options is None:
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
        
        self._temp_path = temp_path
        self._file_import_options = file_import_options
        
        self._hash = None
        self._pre_import_status = None
        self._file_info = None
        self._thumbnail_bytes = None
        self._phashes = None
        self._extra_hashes = None
        self._file_modified_timestamp = None
    
    def CheckIsGoodToImport( self ):
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job testing if good to import for file import options' )
        
        ( size, mime, width, height, duration, num_frames, has_audio, num_words ) = self._file_info
        
        self._file_import_options.CheckFileIsValid( size, mime, width, height )
    
    def DoWork( self, status_hook = None ):
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job starting work.' )
        
        if status_hook is not None:
            status_hook( 'calculating pre-import status' )
        
        ( pre_import_status, hash, note ) = self.GenerateHashAndStatus()
        
        if self.IsNewToDB():
            
            if status_hook is not None:
                status_hook( 'generating metadata' )
            
            self.GenerateInfo()
            self.CheckIsGoodToImport()
            
            mime = self.GetMime()
            
            if status_hook is not None:
                status_hook( 'copying file' )
            
            HG.client_controller.client_files_manager.AddFile( hash, mime, self._temp_path, thumbnail_bytes = self._thumbnail_bytes )
            
            if status_hook is not None:
                status_hook( 'updating database' )
            
            ( import_status, note ) = HG.client_controller.WriteSynchronous( 'import_file', self )
            
        else:
            
            import_status = pre_import_status
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job is done, now publishing content updates' )
        
        self.PubsubContentUpdates()
        
        return ( import_status, hash, note )
    
    def GenerateHashAndStatus( self ):
        
        HydrusImageHandling.ConvertToPNGIfBMP( self._temp_path )
        
        self._hash = HydrusFileHandling.GetHashFromPath( self._temp_path )
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job hash: {}'.format( self._hash.hex() ) )
        
        ( self._pre_import_status, hash, note ) = HG.client_controller.Read( 'hash_status', 'sha256', self._hash, prefix = 'file recognised' )
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job pre-import status: {}, {}'.format( CC.status_string_lookup[ self._pre_import_status ], note ) )
        
        return ( self._pre_import_status, self._hash, note )
    
    def GenerateInfo( self ):
        
        mime = HydrusFileHandling.GetMime( self._temp_path )
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job mime: {}'.format( HC.mime_string_lookup[ mime ] ) )
        
        new_options = HG.client_controller.new_options
        
        if mime in HC.DECOMPRESSION_BOMB_IMAGES and not self._file_import_options.AllowsDecompressionBombs():
            
            if HG.file_import_report_mode:
                HydrusData.ShowText( 'File import job testing for decompression bomb' )
            
            if HydrusImageHandling.IsDecompressionBomb( self._temp_path ):
                
                if HG.file_import_report_mode:
                    HydrusData.ShowText( 'File import job: it was a decompression bomb' )
                
                raise HydrusExceptions.DecompressionBombException( 'Image seems to be a Decompression Bomb!' )
        
        self._file_info = HydrusFileHandling.GetFileInfo( self._temp_path, mime )
        
        ( size, mime, width, height, duration, num_frames, has_audio, num_words ) = self._file_info
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job file info: {}'.format( self._file_info ) )
        
        if mime in HC.MIMES_WITH_THUMBNAILS:
            
            if HG.file_import_report_mode:
                HydrusData.ShowText( 'File import job generating thumbnail' )
            
            bounding_dimensions = HG.client_controller.options[ 'thumbnail_dimensions' ]
            target_resolution = HydrusImageHandling.GetThumbnailResolution( ( width, height ), bounding_dimensions )
            percentage_in = HG.client_controller.new_options.GetInteger( 'video_thumbnail_percentage_in' )
            
            self._thumbnail_bytes = HydrusFileHandling.GenerateThumbnailBytes( self._temp_path, target_resolution, mime, duration, num_frames, percentage_in = percentage_in )
        
        if mime in HC.MIMES_WE_CAN_PHASH:
            
            if HG.file_import_report_mode:
                HydrusData.ShowText( 'File import job generating phashes' )
            
            self._phashes = ClientImageHandling.GenerateShapePerceptualHashes( self._temp_path, mime )
            
            if HG.file_import_report_mode:
                HydrusData.ShowText( 'File import job generated {} phashes: {}'.format( len( self._phashes ), [ phash.hex() for phash in self._phashes ] ) )
        
        if HG.file_import_report_mode:
            HydrusData.ShowText( 'File import job generating other hashes' )
        
        self._extra_hashes = HydrusFileHandling.GetExtraHashesFromPath( self._temp_path )
        
        self._file_modified_timestamp = HydrusFileHandling.GetFileModifiedTimestamp( self._temp_path )
    
    def GetExtraHashes( self ):
        
        return self._extra_hashes
    
    def GetFileImportOptions( self ):
        
        return self._file_import_options
    
    def GetFileInfo( self ):
        
        return self._file_info
    
    def GetFileModifiedTimestamp( self ):
        
        return self._file_modified_timestamp
    
    def GetHash( self ):
        
        return self._hash
    
    def GetMime( self ):
        
        ( size, mime, width, height, duration, num_frames, has_audio, num_words ) = self._file_info
        
        return mime
    
    def GetPreImportStatus( self ):
        
        return self._pre_import_status
    
    def GetPHashes( self ):
        
        return self._phashes
    
    def PubsubContentUpdates( self ):
        
        if self._pre_import_status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
            
            if self._file_import_options.AutomaticallyArchives():
                
                service_keys_to_content_updates = { CC.COMBINED_LOCAL_FILE_SERVICE_KEY : [ HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_ARCHIVE, set( ( self._hash, ) ) ) ] }
                
                HG.client_controller.Write( 'content_updates', service_keys_to_content_updates )
    
    def IsNewToDB( self ):
        
        if self._pre_import_status == CC.STATUS_UNKNOWN:
            return True
        
        if self._pre_import_status == CC.STATUS_DELETED:
            
            if not self._file_import_options.ExcludesDeleted():
                return True
        
        return False
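# A file seed is a single import item: either a file already on disk
# (FILE_SEED_TYPE_HDD) or something to download (FILE_SEED_TYPE_URL).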
FILE_SEED_TYPE_HDD = 0
FILE_SEED_TYPE_URL = 1

class FileSeed( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED
    SERIALISABLE_NAME = 'File Import'
    SERIALISABLE_VERSION = 4
    def __init__( self, file_seed_type: int = None, file_seed_data: str = None ):
        
        if file_seed_type is None:
            file_seed_type = FILE_SEED_TYPE_URL
        
        if file_seed_data is None:
            file_seed_data = 'https://big-guys.4u/monica_lewinsky_hott.tiff.exe.vbs'
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.file_seed_type = file_seed_type
        self.file_seed_data = file_seed_data
        
        self.created = HydrusData.GetNow()
        self.modified = self.created
        self.source_time = None
        self.status = CC.STATUS_UNKNOWN
        self.note = ''
        
        self._referral_url = None
        
        self._external_filterable_tags = set()
        self._external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()
        
        self._urls = set()
        self._tags = set()
        self._hashes = {}
    
    def __eq__( self, other ):
        
        if isinstance( other, FileSeed ):
            return self.__hash__() == other.__hash__()
        
        return NotImplemented
    
    def __hash__( self ):
        
        return ( self.file_seed_type, self.file_seed_data ).__hash__()
    
    def __ne__( self, other ):
        
        return self.__hash__() != other.__hash__()
    
    def _CheckTagsVeto( self, tags, tag_import_options: ClientImportOptions.TagImportOptions ):
        
        tags_to_siblings = HG.client_controller.Read( 'tag_siblings_lookup', CC.COMBINED_TAG_SERVICE_KEY, tags )
        
        all_chain_tags = set( itertools.chain.from_iterable( tags_to_siblings.values() ) )
        
        tag_import_options.CheckTagsVeto( tags, all_chain_tags )
    
    def _GetSerialisableInfo( self ):
        
        serialisable_external_filterable_tags = list( self._external_filterable_tags )
        serialisable_external_additional_service_keys_to_tags = self._external_additional_service_keys_to_tags.GetSerialisableTuple()
        
        serialisable_urls = list( self._urls )
        serialisable_tags = list( self._tags )
        serialisable_hashes = [ ( hash_type, hash.hex() ) for ( hash_type, hash ) in self._hashes.items() if hash is not None ]
        
        return ( self.file_seed_type, self.file_seed_data, self.created, self.modified, self.source_time, self.status, self.note, self._referral_url, serialisable_external_filterable_tags, serialisable_external_additional_service_keys_to_tags, serialisable_urls, serialisable_tags, serialisable_hashes )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self.file_seed_type, self.file_seed_data, self.created, self.modified, self.source_time, self.status, self.note, self._referral_url, serialisable_external_filterable_tags, serialisable_external_additional_service_keys_to_tags, serialisable_urls, serialisable_tags, serialisable_hashes ) = serialisable_info
        
        self._external_filterable_tags = set( serialisable_external_filterable_tags )
        self._external_additional_service_keys_to_tags = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_external_additional_service_keys_to_tags )
        
        self._urls = set( serialisable_urls )
        self._tags = set( serialisable_tags )
        self._hashes = { hash_type : bytes.fromhex( encoded_hash ) for ( hash_type, encoded_hash ) in serialisable_hashes if encoded_hash is not None }
    
    def _NormaliseAndFilterAssociableURLs( self, urls ):
        
        normalised_urls = set()
        
        for url in urls:
            
            try:
                url = HG.client_controller.network_engine.domain_manager.NormaliseURL( url )
            except HydrusExceptions.URLClassException:
                continue # not a url--something like "file:///C:/Users/Tall%20Man/Downloads/maxresdefault.jpg" ha ha ha
            
            normalised_urls.add( url )
        
        associable_urls = { url for url in normalised_urls if HG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
        
        return associable_urls
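    # When the given tag import options are the 'default' placeholder, the real options
    # are looked up by URL: a post URL uses itself, while other seeds prefer their
    # referral URL and fall back to their own data.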
    def _SetupTagImportOptions( self, given_tag_import_options: ClientImportOptions.TagImportOptions ) -> ClientImportOptions.TagImportOptions:
        
        if given_tag_import_options.IsDefault():
            
            if self.IsAPostURL():
                
                tio_lookup_url = self.file_seed_data
                
            else:
                
                if self._referral_url is not None:
                    tio_lookup_url = self._referral_url
                else:
                    tio_lookup_url = self.file_seed_data
            
            tag_import_options = HG.client_controller.network_engine.domain_manager.GetDefaultTagImportOptionsForURL( tio_lookup_url )
            
        else:
            
            tag_import_options = given_tag_import_options
        
        return tag_import_options
    
    def _UpdateModified( self ):
        
        self.modified = HydrusData.GetNow()
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            ( file_seed_type, file_seed_data, created, modified, source_time, status, note, serialisable_urls, serialisable_tags, serialisable_hashes ) = old_serialisable_info
            
            referral_url = None
            
            new_serialisable_info = ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_urls, serialisable_tags, serialisable_hashes )
            
            return ( 2, new_serialisable_info )
        
        if version == 2:
            
            ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_urls, serialisable_tags, serialisable_hashes ) = old_serialisable_info
            
            external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()
            
            serialisable_external_additional_service_keys_to_tags = external_additional_service_keys_to_tags.GetSerialisableTuple()
            
            new_serialisable_info = ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_external_additional_service_keys_to_tags, serialisable_urls, serialisable_tags, serialisable_hashes )
            
            return ( 3, new_serialisable_info )
        
        if version == 3:
            
            ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_external_additional_service_keys_to_tags, serialisable_urls, serialisable_tags, serialisable_hashes ) = old_serialisable_info
            
            external_filterable_tags = set()
            
            serialisable_external_filterable_tags = list( external_filterable_tags )
            
            new_serialisable_info = ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_external_filterable_tags, serialisable_external_additional_service_keys_to_tags, serialisable_urls, serialisable_tags, serialisable_hashes )
            
            return ( 4, new_serialisable_info )
    def AddParseResults( self, parse_results, file_import_options: ClientImportOptions.FileImportOptions ):
        
        for ( hash_type, hash ) in ClientParsing.GetHashesFromParseResults( parse_results ):
            
            if hash_type not in self._hashes:
                self._hashes[ hash_type ] = hash
        
        if file_import_options.ShouldAssociateSourceURLs():
            
            source_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_SOURCE, ) )
            
            associable_urls = self._NormaliseAndFilterAssociableURLs( source_urls )
            
            associable_urls.discard( self.file_seed_data )
            
            self._urls.update( associable_urls )
        
        tags = ClientParsing.GetTagsFromParseResults( parse_results )
        
        self._tags.update( tags )
        
        source_timestamp = ClientParsing.GetTimestampFromParseResults( parse_results, HC.TIMESTAMP_TYPE_SOURCE )
        
        if source_timestamp is not None:
            
            source_timestamp = min( HydrusData.GetNow() - 30, source_timestamp )
            
            self.source_time = source_timestamp
        
        self._UpdateModified()
    
    def AddTags( self, tags ):
        
        tags = HydrusTags.CleanTags( tags )
        
        self._tags.update( tags )
        
        self._UpdateModified()
    def AddURL( self, url: str ):
        
        urls = ( url, )
        
        associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
        
        associable_urls.discard( self.file_seed_data )
        
        self._urls.update( associable_urls )
    
    def CheckPreFetchMetadata( self, tag_import_options: ClientImportOptions.TagImportOptions ):
        
        self._CheckTagsVeto( self._tags, tag_import_options )
    
    def DownloadAndImportRawFile( self, file_url: str, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook, override_bandwidth = False ):
        
        self.AddURL( file_url )
        
        ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
        
        try:
            
            if self.file_seed_data != file_url:
                referral_url = self.file_seed_data
            else:
                referral_url = self._referral_url
            
            status_hook( 'downloading file' )
            
            network_job = network_job_factory( 'GET', file_url, temp_path = temp_path, referral_url = referral_url )
            
            if override_bandwidth:
                network_job.OverrideBandwidth( 3 )
            
            network_job.SetFileImportOptions( file_import_options )
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            with network_job_presentation_context_factory( network_job ) as njpc:
                network_job.WaitUntilDone()
            
            status_hook( 'importing file' )
            
            self.Import( temp_path, file_import_options, status_hook = status_hook )
            
        finally:
            
            HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
    def FetchPageMetadata( self, tag_import_options: ClientImportOptions.TagImportOptions ):
        
        pass
    
    def GetAPIInfoDict( self, simple: bool ):
        
        d = {}
        
        d[ 'import_data' ] = self.file_seed_data
        d[ 'created' ] = self.created
        d[ 'modified' ] = self.modified
        d[ 'source_time' ] = self.source_time
        d[ 'status' ] = self.status
        d[ 'note' ] = self.note
        
        return d
    
    def GetExampleNetworkJob( self, network_job_factory ):
        
        if self.IsAPostURL():
            
            post_url = self.file_seed_data
            
            try:
                ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
            except HydrusExceptions.URLClassException:
                url_to_check = post_url
            
        else:
            
            url_to_check = self.file_seed_data
        
        network_job = network_job_factory( 'GET', url_to_check )
        
        return network_job
    
    def GetHash( self ):
        
        if 'sha256' in self._hashes:
            return self._hashes[ 'sha256' ]
        
        return None
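    # The two 'prediction' methods below estimate a seed's import status before any
    # download happens--one from hashes we already know, one from URLs we already
    # know--so redundant or previously deleted files can be skipped early.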
    def GetPreImportStatusPredictionHash( self, file_import_options: ClientImportOptions.FileImportOptions ):
        
        UNKNOWN_DEFAULT = ( CC.STATUS_UNKNOWN, None, '' )
        
        ( status, hash, note ) = UNKNOWN_DEFAULT
        
        if file_import_options.DoNotCheckHashesBeforeImporting():
            
            return ( status, hash, note )
        
        # hashes
        
        if status == CC.STATUS_UNKNOWN:
            
            for ( hash_type, found_hash ) in self._hashes.items():
                
                ( status, hash, note ) = HG.client_controller.Read( 'hash_status', hash_type, found_hash, prefix = 'hash recognised' )
                
                if status != CC.STATUS_UNKNOWN:
                    break
        
        if status == CC.STATUS_DELETED:
            
            if not file_import_options.ExcludesDeleted():
                
                ( status, hash, note ) = UNKNOWN_DEFAULT
        
        return ( status, hash, note )
    def GetPreImportStatusPredictionURL( self, file_import_options: ClientImportOptions.FileImportOptions, file_url = None ):
        
        UNKNOWN_DEFAULT = ( CC.STATUS_UNKNOWN, None, '' )
        
        ( status, hash, note ) = UNKNOWN_DEFAULT
        
        if file_import_options.DoNotCheckKnownURLsBeforeImporting():
            
            return ( status, hash, note )
        
        # urls
        
        urls = set( self._urls )
        
        if file_url is not None:
            urls.add( file_url )
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            urls.add( self.file_seed_data )
        
        unrecognised_url_results = set()
        
        for url in urls:
            
            if HG.client_controller.network_engine.domain_manager.URLCanReferToMultipleFiles( url ):
                continue
            
            # we now only trust url-matched single urls and the post/file urls
            # trusting unmatched source urls was too much of a hassle with too many boorus providing bad source urls like user account pages
            
            if HG.client_controller.network_engine.domain_manager.URLDefinitelyRefersToOneFile( url ) or url in ( self.file_seed_data, file_url ):
                
                results = HG.client_controller.Read( 'url_statuses', url )
                
                if len( results ) == 0: # if no match found, no useful data discovered
                    
                    continue
                    
                elif len( results ) > 1: # if more than one file claims this url, it cannot be relied on to guess the file
                    
                    continue
                    
                else: # i.e. 1 match found
                    
                    ( status, hash, note ) = results[0]
                    
                    if status != CC.STATUS_UNKNOWN:
                        
                        # a known one-file url has given a single clear result. sounds good
                        
                        we_have_a_match = True
                        
                        if self.file_seed_type == FILE_SEED_TYPE_URL:
                            
                            # to double-check, let's see if the file that claims that url has any other interesting urls
                            # if the file has another url with the same url class as ours, then this is prob an unreliable 'alternate' source url attribution, and untrustworthy
                            
                            my_url = self.file_seed_data
                            
                            if url != my_url:
                                
                                my_url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( my_url )
                                
                                media_result = HG.client_controller.Read( 'media_result', hash )
                                
                                this_files_urls = media_result.GetLocationsManager().GetURLs()
                                
                                for this_files_url in this_files_urls:
                                    
                                    if this_files_url != my_url:
                                        
                                        try:
                                            this_url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( this_files_url )
                                        except HydrusExceptions.URLClassException:
                                            continue
                                        
                                        if my_url_class == this_url_class:
                                            
                                            # oh no, the file this source url refers to has a different known url in this same domain
                                            # it is more likely that an edit on this site points to the original elsewhere
                                            
                                            ( status, hash, note ) = UNKNOWN_DEFAULT
                                            
                                            we_have_a_match = False
                                            
                                            break
                        
                        if we_have_a_match:
                            break # if a known one-file url gives a single clear result, that result is reliable
        
        if status == CC.STATUS_DELETED:
            
            if not file_import_options.ExcludesDeleted():
                
                ( status, hash, note ) = UNKNOWN_DEFAULT
        
        return ( status, hash, note )
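    # GetSearchFileSeeds expands a URL seed into the variant 'search' forms of its URL
    # (as produced by ClientNetworkingDomain.GetSearchURLs), which appears intended to
    # let caches recognise the same file arriving under slightly different URLs.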
    def GetSearchFileSeeds( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            search_urls = ClientNetworkingDomain.GetSearchURLs( self.file_seed_data )
            
            search_file_seeds = [ FileSeed( FILE_SEED_TYPE_URL, search_url ) for search_url in search_urls ]
            
        else:
            
            search_file_seeds = [ self ]
        
        return search_file_seeds
    
    def HasHash( self ):
        
        return self.GetHash() is not None
    
    def Import( self, temp_path: str, file_import_options: ClientImportOptions.FileImportOptions, status_hook = None ):
        
        file_import_job = FileImportJob( temp_path, file_import_options )
        
        ( status, hash, note ) = file_import_job.DoWork( status_hook = status_hook )
        
        self.SetStatus( status, note = note )
        self.SetHash( hash )
    
    def ImportPath( self, file_seed_cache: "FileSeedCache", file_import_options: ClientImportOptions.FileImportOptions, limited_mimes = None, status_hook = None ):
        
        try:
            
            if self.file_seed_type != FILE_SEED_TYPE_HDD:
                raise HydrusExceptions.VetoException( 'Attempted to import as a path, but I do not think I am a path!' )
            
            path = self.file_seed_data
            
            if not os.path.exists( path ):
                raise HydrusExceptions.VetoException( 'Source file does not exist!' )
            
            if limited_mimes is not None:
                
                mime = HydrusFileHandling.GetMime( path )
                
                if mime not in limited_mimes:
                    raise HydrusExceptions.VetoException( 'Not in allowed mimes!' )
            
            ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
            
            try:
                
                copied = HydrusPaths.MirrorFile( path, temp_path )
                
                if not copied:
                    raise Exception( 'File failed to copy to temp path--see log for error.' )
                
                self.Import( temp_path, file_import_options, status_hook = status_hook )
                
            finally:
                
                HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
            
            self.WriteContentUpdates()
            
        except HydrusExceptions.UnsupportedFileException as e:
            
            self.SetStatus( CC.STATUS_ERROR, exception = e )
            
        except HydrusExceptions.VetoException as e:
            
            self.SetStatus( CC.STATUS_VETOED, note = str( e ) )
            
        except Exception as e:
            
            self.SetStatus( CC.STATUS_ERROR, exception = e )
        
        file_seed_cache.NotifyFileSeedsUpdated( ( self, ) )
    def IsAPostURL( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
            
            if url_type == HC.URL_TYPE_POST:
                return True
        
        return False
    
    def IsDeleted( self ):
        
        return self.status == CC.STATUS_DELETED
    
    def Normalise( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            try:
                self.file_seed_data = HG.client_controller.network_engine.domain_manager.NormaliseURL( self.file_seed_data )
            except HydrusExceptions.URLClassException:
                pass
    
    def PredictPreImportStatus( self, file_import_options: ClientImportOptions.FileImportOptions, tag_import_options: ClientImportOptions.TagImportOptions, file_url = None ):
        
        ( url_status, url_hash, url_note ) = self.GetPreImportStatusPredictionURL( file_import_options, file_url = file_url )
        ( hash_status, hash_hash, hash_note ) = self.GetPreImportStatusPredictionHash( file_import_options )
        
        url_recognised_and_file_already_in_db = url_status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT
        hash_recognised_and_file_already_in_db = hash_status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT
        
        # now let's set the prediction
        
        if hash_status != CC.STATUS_UNKNOWN: # trust hashes over urls m8
            ( status, hash, note ) = ( hash_status, hash_hash, hash_note )
        else:
            ( status, hash, note ) = ( url_status, url_hash, url_note )
        
        if self.status == CC.STATUS_UNKNOWN and status != CC.STATUS_UNKNOWN:
            
            self.status = status
            
            if hash is not None:
                self._hashes[ 'sha256' ] = hash
            
            self.note = note
            
            self._UpdateModified()
        
        # and make some recommendations
        
        should_download_file = self.status == CC.STATUS_UNKNOWN
        
        should_download_metadata = should_download_file # if we want the file, we need the metadata to get the file_url!
        
        # but if we otherwise still want to force some tags, let's do it
        
        if not should_download_metadata and tag_import_options.WorthFetchingTags():
            
            url_override = url_recognised_and_file_already_in_db and tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB()
            hash_override = hash_recognised_and_file_already_in_db and tag_import_options.ShouldFetchTagsEvenIfHashKnownAndFileAlreadyInDB()
            
            if url_override or hash_override:
                should_download_metadata = True
        
        return ( should_download_metadata, should_download_file )
    def PresentToPage( self, page_key: bytes ):
        
        hash = self.GetHash()
        
        if hash is not None:
            
            media_result = HG.client_controller.Read( 'media_result', hash )
            
            HG.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
    
    def SetExternalAdditionalServiceKeysToTags( self, service_keys_to_tags ):
        
        self._external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags( service_keys_to_tags )
    
    def SetExternalFilterableTags( self, tags ):
        
        self._external_filterable_tags = set( tags )
    
    def SetHash( self, hash ):
        
        if hash is not None:
            self._hashes[ 'sha256' ] = hash
    
    def SetReferralURL( self, referral_url: str ):
        
        self._referral_url = referral_url
    
    def SetStatus( self, status: int, note: str = '', exception = None ):
        
        if exception is not None:
            
            first_line = str( exception ).split( os.linesep )[0]
            
            note = first_line + '\u2026 (Copy note to see full error)'
            note += os.linesep
            note += traceback.format_exc()
            
            HydrusData.Print( 'Error when processing {}!'.format( self.file_seed_data ) )
            HydrusData.Print( traceback.format_exc() )
        
        self.status = status
        self.note = note
        
        self._UpdateModified()
    def ShouldPresent( self, file_import_options: ClientImportOptions.FileImportOptions, in_inbox = None ):
        
        hash = self.GetHash()
        
        if hash is not None and self.status in CC.SUCCESSFUL_IMPORT_STATES:
            
            if in_inbox is None:
                
                if file_import_options.ShouldPresentIgnorantOfInbox( self.status ):
                    return True
                
                if file_import_options.ShouldNotPresentIgnorantOfInbox( self.status ):
                    return False
                
                in_inbox = HG.client_controller.Read( 'in_inbox', hash )
            
            if file_import_options.ShouldPresent( self.status, in_inbox ):
                return True
        
        return False
    
    def WorksInNewSystem( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
            
            if url_type == HC.URL_TYPE_FILE:
                return True
            
            if url_type == HC.URL_TYPE_POST and can_parse:
                return True
            
            if url_type == HC.URL_TYPE_UNKNOWN and self._referral_url is not None: # this is likely to be a multi-file child of a post url file_seed
                
                ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self._referral_url )
                
                if url_type == HC.URL_TYPE_POST: # we must have got here through parsing that m8, so let's assume this is an unrecognised file url
                    return True
        
        return False
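    # WorkOnURL is the main driver for a URL seed: it predicts pre-import status,
    # downloads and parses a post page if needed, spawns child seeds for multi-file
    # posts, downloads the actual file, and finally writes content updates.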
    def WorkOnURL( self, file_seed_cache: "FileSeedCache", status_hook, network_job_factory, network_job_presentation_context_factory, file_import_options: ClientImportOptions.FileImportOptions, tag_import_options: ClientImportOptions.TagImportOptions ):
        
        did_substantial_work = False
        
        try:
            
            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
            
            if url_type not in ( HC.URL_TYPE_POST, HC.URL_TYPE_FILE, HC.URL_TYPE_UNKNOWN ):
                raise HydrusExceptions.VetoException( 'This URL appeared to be a "{}", which is not a File or Post URL!'.format( match_name ) )
            
            if url_type == HC.URL_TYPE_POST and not can_parse:
                raise HydrusExceptions.VetoException( 'Did not have a parser for this URL!' )
            
            tag_import_options = self._SetupTagImportOptions( tag_import_options )
            
            status_hook( 'checking url status' )
            
            ( should_download_metadata, should_download_file ) = self.PredictPreImportStatus( file_import_options, tag_import_options )
            
            if self.IsAPostURL():
                
                if should_download_metadata:
                    
                    did_substantial_work = True
                    
                    post_url = self.file_seed_data
                    
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
                    
                    status_hook( 'downloading file page' )
                    
                    if self._referral_url not in ( post_url, url_to_check ):
                        referral_url = self._referral_url
                    else:
                        referral_url = None
                    
                    network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
                    
                    HG.client_controller.network_engine.AddJob( network_job )
                    
                    with network_job_presentation_context_factory( network_job ) as njpc:
                        network_job.WaitUntilDone()
                    
                    parsing_text = network_job.GetContentText()
                    
                    actual_fetched_url = network_job.GetActualFetchedURL()
                    
                    if actual_fetched_url != url_to_check:
                        
                        ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
                        
                        if url_type == HC.URL_TYPE_POST and can_parse:
                            
                            post_url = actual_fetched_url
                            
                            ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
                    
                    parsing_context = {}
                    
                    parsing_context[ 'post_url' ] = post_url
                    parsing_context[ 'url' ] = url_to_check
                    
                    all_parse_results = parser.Parse( parsing_context, parsing_text )
                    
                    if len( all_parse_results ) == 0:
                        
                        it_was_a_real_file = False
                        
                        ( os_file_handle, temp_path ) = HydrusPaths.GetTempPath()
                        
                        try:
                            
                            with open( temp_path, 'wb' ) as f:
                                f.write( network_job.GetContentBytes() )
                            
                            mime = HydrusFileHandling.GetMime( temp_path )
                            
                            if mime in HC.ALLOWED_MIMES:
                                
                                it_was_a_real_file = True
                                
                                status_hook( 'page was actually a file, trying to import' )
                                
                                self.Import( temp_path, file_import_options, status_hook = status_hook )
                            
                        except:
                            
                            pass # in this special occasion, we will swallow the error
                            
                        finally:
                            
                            HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                        
                        if not it_was_a_real_file:
                            raise HydrusExceptions.VetoException( 'The parser found nothing in the document, nor did it seem to be an importable file!' )
                        
                    elif len( all_parse_results ) > 1:
                        
                        # multiple child urls generated by a subsidiary page parser
                        
                        file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, self.file_seed_data, file_import_options )
                        
                        for file_seed in file_seeds:
                            
                            file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                            file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                            
                            file_seed._urls.update( self._urls )
                            file_seed._tags.update( self._tags )
                        
                        try:
                            
                            my_index = file_seed_cache.GetFileSeedIndex( self )
                            
                            insertion_index = my_index + 1
                            
                        except:
                            
                            insertion_index = len( file_seed_cache )
                        
                        num_urls_added = file_seed_cache.InsertFileSeeds( insertion_index, file_seeds )
                        
                        status = CC.STATUS_SUCCESSFUL_AND_NEW
                        note = 'Found {} new URLs.'.format( HydrusData.ToHumanInt( num_urls_added ) )
                        
                        self.SetStatus( status, note = note )
                        
                    else:
                        
                        # no subsidiary page parser results, just one
                        
                        parse_results = all_parse_results[0]
                        
                        self.AddParseResults( parse_results, file_import_options )
                        
                        self.CheckPreFetchMetadata( tag_import_options )
                        
                        desired_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
                        
                        child_urls = []
                        
                        if len( desired_urls ) == 0:
                            
                            raise HydrusExceptions.VetoException( 'Could not find a file or post URL to download!' )
                            
                        elif len( desired_urls ) == 1:
                            
                            desired_url = desired_urls[0]
                            
                            ( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( desired_url )
                            
                            if url_type in ( HC.URL_TYPE_FILE, HC.URL_TYPE_UNKNOWN ):
                                
                                file_url = desired_url
                                
                                ( should_download_metadata, should_download_file ) = self.PredictPreImportStatus( file_import_options, tag_import_options, file_url )
                                
                                if should_download_file:
                                    self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook, override_bandwidth = True )
                                
                            elif url_type == HC.URL_TYPE_POST and can_parse:
                                
                                # a pixiv mode=medium page has spawned a mode=manga page, so we need a new file_seed to go pursue that
                                
                                child_urls = [ desired_url ]
                                
                            else:
                                
                                raise HydrusExceptions.VetoException( 'Found a URL--{}--but could not understand/parse it!'.format( desired_url ) )
                            
                        else:
                            
                            child_urls = desired_urls
                        
                        if len( child_urls ) > 0:
                            
                            child_file_seeds = []
                            
                            for child_url in child_urls:
                                
                                duplicate_file_seed = self.Duplicate() # inherits all urls and tags from here
                                
                                duplicate_file_seed.file_seed_data = child_url
                                
                                duplicate_file_seed.SetReferralURL( self.file_seed_data )
                                
                                if self._referral_url is not None:
                                    duplicate_file_seed.AddURL( self._referral_url )
                                
                                child_file_seeds.append( duplicate_file_seed )
                            
                            try:
                                
                                my_index = file_seed_cache.GetFileSeedIndex( self )
                                
                                insertion_index = my_index + 1
                                
                            except:
                                
                                insertion_index = len( file_seed_cache )
                            
                            num_urls_added = file_seed_cache.InsertFileSeeds( insertion_index, child_file_seeds )
                            
                            status = CC.STATUS_SUCCESSFUL_AND_NEW
                            note = 'Found {} new URLs.'.format( HydrusData.ToHumanInt( num_urls_added ) )
                            
                            self.SetStatus( status, note = note )
                
            else:
                
                if should_download_file:
                    
                    self.CheckPreFetchMetadata( tag_import_options )
                    
                    did_substantial_work = True
                    
                    file_url = self.file_seed_data
                    
                    self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook )
            
            did_substantial_work |= self.WriteContentUpdates( tag_import_options )
            
        except HydrusExceptions.ShutdownException:
            
            return False
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            
            note = str( e )
            
            self.SetStatus( status, note = note )
            
            if isinstance( e, HydrusExceptions.CancelledException ):
                
                status_hook( 'cancelled!' )
                
                time.sleep( 2 )
            
        except HydrusExceptions.InsufficientCredentialsException:
            
            status = CC.STATUS_VETOED
            note = '403'
            
            self.SetStatus( status, note = note )
            
            status_hook( '403' )
            
            time.sleep( 2 )
            
        except HydrusExceptions.NotFoundException:
            
            status = CC.STATUS_VETOED
            note = '404'
            
            self.SetStatus( status, note = note )
            
            status_hook( '404' )
            
            time.sleep( 2 )
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            self.SetStatus( status, exception = e )
            
            status_hook( 'error!' )
            
            time.sleep( 3 )
        
        file_seed_cache.NotifyFileSeedsUpdated( ( self, ) )
        
        return did_substantial_work
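    # WriteContentUpdates pushes this seed's URL associations and tags to the database.
    # URL-only writes deliberately do not count as 'work done' here; see the comment
    # in the method body.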
    def WriteContentUpdates( self, tag_import_options: typing.Optional[ ClientImportOptions.TagImportOptions ] = None ):
        
        did_work = False
        
        if self.status == CC.STATUS_ERROR:
            return did_work
        
        hash = self.GetHash()
        
        if hash is None:
            return did_work
        
        # changed this to say that urls alone are not 'did work' since all url results are doing this, and when they have no tags, they are usually superfast db hits anyway
        # better to scream through an 'already in db' import list than flicker
        
        service_keys_to_content_updates = collections.defaultdict( list )
        
        urls = set( self._urls )
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            urls.add( self.file_seed_data )
        
        if self._referral_url is not None:
            urls.add( self._referral_url )
        
        associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
        
        if len( associable_urls ) > 0:
            
            content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( associable_urls, ( hash, ) ) )
            
            service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
        
        if tag_import_options is None:
            
            for ( service_key, content_updates ) in ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( ( hash, ), self._external_additional_service_keys_to_tags ).items():
                
                service_keys_to_content_updates[ service_key ].extend( content_updates )
                
                did_work = True
            
        else:
            
            media_result = HG.client_controller.Read( 'media_result', hash )
            
            for ( service_key, content_updates ) in tag_import_options.GetServiceKeysToContentUpdates( self.status, media_result, set( self._tags ), external_filterable_tags = self._external_filterable_tags, external_additional_service_keys_to_tags = self._external_additional_service_keys_to_tags ).items():
                
                service_keys_to_content_updates[ service_key ].extend( content_updates )
                
                did_work = True
        
        if len( service_keys_to_content_updates ) > 0:
            
            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
        
        return did_work

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED ] = FileSeed
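# FileSeedCacheStatus is a cheap, serialisable snapshot of a queue's per-status counts
# and latest-addition time, from which the UI's progress and summary strings are built.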
class FileSeedCacheStatus( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE_STATUS
    SERIALISABLE_NAME = 'Import File Status Cache Status'
    SERIALISABLE_VERSION = 1
    
    def __init__( self ):
        
        self._generation_time = HydrusData.GetNow()
        self._statuses_to_counts = collections.Counter()
        self._latest_added_time = 0
    
    def _GetSerialisableInfo( self ):
        
        serialisable_statuses_to_counts = list( self._statuses_to_counts.items() )
        
        return ( self._generation_time, serialisable_statuses_to_counts, self._latest_added_time )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._generation_time, serialisable_statuses_to_counts, self._latest_added_time ) = serialisable_info
        
        self._statuses_to_counts = collections.Counter()
        
        self._statuses_to_counts.update( dict( serialisable_statuses_to_counts ) )
    
    def GetFileSeedCount( self, status: typing.Optional[ int ] = None ) -> int:
        
        if status is None:
            return sum( self._statuses_to_counts.values() )
        else:
            return self._statuses_to_counts[ status ]
    
    def GetGenerationTime( self ) -> int:
        
        return self._generation_time
    
    def GetLatestAddedTime( self ) -> int:
        
        return self._latest_added_time
    
    def GetStatusText( self, simple = False ) -> str:
        
        num_successful_and_new = self._statuses_to_counts[ CC.STATUS_SUCCESSFUL_AND_NEW ]
        num_successful_but_redundant = self._statuses_to_counts[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ]
        num_ignored = self._statuses_to_counts[ CC.STATUS_VETOED ]
        num_deleted = self._statuses_to_counts[ CC.STATUS_DELETED ]
        num_failed = self._statuses_to_counts[ CC.STATUS_ERROR ]
        num_skipped = self._statuses_to_counts[ CC.STATUS_SKIPPED ]
        num_unknown = self._statuses_to_counts[ CC.STATUS_UNKNOWN ]
        
        if simple:
            
            total = sum( self._statuses_to_counts.values() )
            
            total_processed = total - num_unknown
            
            #
            
            status_text = ''
            
            if total > 0:
                
                if num_unknown > 0:
                    status_text += HydrusData.ConvertValueRangeToPrettyString( total_processed, total )
                else:
                    status_text += HydrusData.ToHumanInt( total_processed )
                
                show_new_on_file_seed_short_summary = HG.client_controller.new_options.GetBoolean( 'show_new_on_file_seed_short_summary' )
                
                if show_new_on_file_seed_short_summary and num_successful_and_new:
                    status_text += ' - {}N'.format( HydrusData.ToHumanInt( num_successful_and_new ) )
                
                simple_status_strings = []
                
                if num_ignored > 0:
                    simple_status_strings.append( '{}Ig'.format( HydrusData.ToHumanInt( num_ignored ) ) )
                
                show_deleted_on_file_seed_short_summary = HG.client_controller.new_options.GetBoolean( 'show_deleted_on_file_seed_short_summary' )
                
                if show_deleted_on_file_seed_short_summary and num_deleted > 0:
                    simple_status_strings.append( '{}D'.format( HydrusData.ToHumanInt( num_deleted ) ) )
                
                if num_failed > 0:
                    simple_status_strings.append( '{}F'.format( HydrusData.ToHumanInt( num_failed ) ) )
                
                if num_skipped > 0:
                    simple_status_strings.append( '{}S'.format( HydrusData.ToHumanInt( num_skipped ) ) )
                
                if len( simple_status_strings ) > 0:
                    status_text += ' - {}'.format( ''.join( simple_status_strings ) )
            
        else:
            
            status_strings = []
            
            num_successful = num_successful_and_new + num_successful_but_redundant
            
            if num_successful > 0:
                
                s = '{} successful'.format( HydrusData.ToHumanInt( num_successful ) )
                
                if num_successful_and_new > 0:
                    
                    if num_successful_but_redundant > 0:
                        s += ' ({} already in db)'.format( HydrusData.ToHumanInt( num_successful_but_redundant ) )
                    
                else:
                    
                    s += ' (all already in db)'
                
                status_strings.append( s )
            
            if num_ignored > 0:
                status_strings.append( '{} ignored'.format( HydrusData.ToHumanInt( num_ignored ) ) )
            
            if num_deleted > 0:
                status_strings.append( '{} previously deleted'.format( HydrusData.ToHumanInt( num_deleted ) ) )
            
            if num_failed > 0:
                status_strings.append( '{} failed'.format( HydrusData.ToHumanInt( num_failed ) ) )
            
            if num_skipped > 0:
                status_strings.append( '{} skipped'.format( HydrusData.ToHumanInt( num_skipped ) ) )
            
            status_text = ', '.join( status_strings )
        
        return status_text
    
    def GetStatusesToCounts( self ) -> typing.Mapping[ int, int ]:
        
        return self._statuses_to_counts
    
    def GetValueRange( self ) -> typing.Tuple[ int, int ]:
        
        total = sum( self._statuses_to_counts.values() )
        
        num_unknown = self._statuses_to_counts[ CC.STATUS_UNKNOWN ]
        
        total_processed = total - num_unknown
        
        return ( total_processed, total )
    
    def HasWorkToDo( self ):
        
        ( num_done, num_total ) = self.GetValueRange()
        
        return num_done < num_total
    
    def Merge( self, file_seed_cache_status: "FileSeedCacheStatus" ):
        
        self._latest_added_time = max( self._latest_added_time, file_seed_cache_status.GetLatestAddedTime() )
        
        self._statuses_to_counts.update( file_seed_cache_status.GetStatusesToCounts() )
    
    def SetStatusesToCounts( self, statuses_to_counts: typing.Mapping[ int, int ] ):
        
        self._statuses_to_counts = collections.Counter()
        
        self._statuses_to_counts.update( statuses_to_counts )
    
    def SetLatestAddedTime( self, latest_added_time: int ):
        
        self._latest_added_time = latest_added_time

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE_STATUS ] = FileSeedCacheStatus
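# FileSeedCache is the import queue proper: a serialisable list of file seeds plus a
# seed->index lookup, guarded by a lock, with a lazily regenerated status snapshot.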
2018-06-27 19:27:05 +00:00
class FileSeedCache( HydrusSerialisable.SerialisableBase ):
2018-06-06 21:27:02 +00:00
2018-06-27 19:27:05 +00:00
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE
2018-06-06 21:27:02 +00:00
SERIALISABLE_NAME = 'Import File Status Cache'
SERIALISABLE_VERSION = 8
2018-10-24 21:34:02 +00:00
COMPACT_NUMBER = 250
2018-08-22 21:10:59 +00:00
2018-06-06 21:27:02 +00:00
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
2018-06-27 19:27:05 +00:00
self._file_seeds = HydrusSerialisable.SerialisableList()
2018-06-06 21:27:02 +00:00
2018-06-27 19:27:05 +00:00
self._file_seeds_to_indices = {}
2018-06-06 21:27:02 +00:00
2018-06-27 19:27:05 +00:00
self._file_seed_cache_key = HydrusData.GenerateKey()
2018-06-06 21:27:02 +00:00
2020-06-11 12:01:08 +00:00
self._status_cache = FileSeedCacheStatus()
2018-06-06 21:27:02 +00:00
self._status_dirty = True
self._lock = threading.Lock()
def __len__( self ):
2018-06-27 19:27:05 +00:00
return len( self._file_seeds )
2018-06-06 21:27:02 +00:00
def _GenerateStatus( self ):
2020-06-11 12:01:08 +00:00
fscs = FileSeedCacheStatus()
fscs.SetLatestAddedTime( self._GetLatestAddedTime() )
fscs.SetStatusesToCounts( self._GetStatusesToCounts() )
self._status_cache = fscs
2018-06-06 21:27:02 +00:00
self._status_dirty = False
2020-04-29 21:44:12 +00:00
def _GetFileSeeds( self, status: int = None ):
2018-06-06 21:27:02 +00:00
if status is None:
2018-06-27 19:27:05 +00:00
return list( self._file_seeds )
2018-06-06 21:27:02 +00:00
else:
2018-06-27 19:27:05 +00:00
return [ file_seed for file_seed in self._file_seeds if file_seed.status == status ]
2018-06-06 21:27:02 +00:00
2020-06-11 12:01:08 +00:00
def _GetLatestAddedTime( self ):
if len( self._file_seeds ) == 0:
latest_timestamp = 0
else:
latest_timestamp = max( ( file_seed.created for file_seed in self._file_seeds ) )
return latest_timestamp
def _GetNextFileSeed( self, status: int ) -> typing.Optional[ FileSeed ]:
for file_seed in self._file_seeds:
if file_seed.status == status:
return file_seed
return None
2018-06-06 21:27:02 +00:00
def _GetSerialisableInfo( self ):
2019-02-06 22:41:35 +00:00
return self._file_seeds.GetSerialisableTuple()
2018-06-06 21:27:02 +00:00
2020-04-29 21:44:12 +00:00
def _GetSourceTimestamp( self, file_seed: FileSeed ):
2018-06-06 21:27:02 +00:00
2018-06-27 19:27:05 +00:00
source_timestamp = file_seed.source_time
2018-06-06 21:27:02 +00:00
if source_timestamp is None:
# decent fallback compromise
# -30 since added and 'last check' timestamps are often the same, and this messes up calculations
2018-06-27 19:27:05 +00:00
source_timestamp = file_seed.created - 30
2018-06-06 21:27:02 +00:00
return source_timestamp
2018-07-04 20:48:28 +00:00
def _GetStatusesToCounts( self ):
statuses_to_counts = collections.Counter()
for file_seed in self._file_seeds:
statuses_to_counts[ file_seed.status ] += 1
return statuses_to_counts
2020-04-29 21:44:12 +00:00
def _HasFileSeed( self, file_seed: FileSeed ):
2018-06-06 21:27:02 +00:00
2018-06-27 19:27:05 +00:00
search_file_seeds = file_seed.GetSearchFileSeeds()
2018-06-06 21:27:02 +00:00
2018-06-27 19:27:05 +00:00
has_file_seed = True in ( search_file_seed in self._file_seeds_to_indices for search_file_seed in search_file_seeds )
2018-06-06 21:27:02 +00:00
2018-06-27 19:27:05 +00:00
return has_file_seed
2018-06-06 21:27:02 +00:00
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
with self._lock:
2018-06-27 19:27:05 +00:00
self._file_seeds = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )
2018-06-06 21:27:02 +00:00
2018-06-27 19:27:05 +00:00
self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
2018-06-06 21:27:02 +00:00
def _SetStatusDirty( self ):
self._status_dirty = True
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                if 'note' in file_seed_info:
                    
                    file_seed_info[ 'note' ] = str( file_seed_info[ 'note' ] )
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 2, new_serialisable_info )
            
        
        if version in ( 2, 3 ):
            
            # gelbooru replaced their thumbnail links with this redirect spam
            # 'https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY4ODA1OA=='
            
            # I missed some http ones here, so I've broadened the test and rescheduled it
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                if 'gelbooru.com/redirect.php' in file_seed:
                    
                    continue
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 4, new_serialisable_info )
            
        
        if version == 4:
            
            def ConvertRegularToRawURL( regular_url ):
                
                # convert this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
                # to this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # the 500 part can be a bunch of stuff, including letters
                
                url_components = regular_url.split( '_' )
                
                last_component = url_components[ -1 ]
                
                ( number_gubbins, file_ext ) = last_component.split( '.' )
                
                raw_last_component = 'raw.{}'.format( file_ext )
                
                url_components[ -1 ] = raw_last_component
                
                raw_url = '_'.join( url_components )
                
                return raw_url
                
            
            def Remove68Subdomain( long_url ):
                
                # sometimes the 68 subdomain gives a 404 on the raw url, so:
                
                # convert this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # to this:
                # http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                
                # I am not sure if it is always 68, but let's not assume
                
                ( scheme, rest ) = long_url.split( '://', 1 )
                
                if rest.startswith( 'media.tumblr.com' ):
                    
                    return long_url
                    
                
                ( gumpf, shorter_rest ) = rest.split( '.', 1 )
                
                shorter_url = '{}://{}'.format( scheme, shorter_rest )
                
                return shorter_url
                
            
            new_serialisable_info = []
            
            good_file_seeds = set()
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                try:
                    
                    parse = urllib.parse.urlparse( file_seed )
                    
                    if 'media.tumblr.com' in parse.netloc:
                        
                        file_seed = Remove68Subdomain( file_seed )
                        
                        file_seed = ConvertRegularToRawURL( file_seed )
                        
                        file_seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( file_seed )
                        
                    
                    if 'pixiv.net' in parse.netloc:
                        
                        file_seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( file_seed )
                        
                    
                    if file_seed in good_file_seeds: # we hit a dupe, so skip it
                        
                        continue
                        
                    
                except:
                    
                    pass
                    
                
                good_file_seeds.add( file_seed )
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 5, new_serialisable_info )
            
        
        if version == 5:
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                file_seed_info[ 'source_timestamp' ] = None
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 6, new_serialisable_info )
            
        
        if version == 6:
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                try:
                    
                    magic_phrase = '//media.tumblr.com'
                    replacement = '//data.tumblr.com'
                    
                    if magic_phrase in file_seed:
                        
                        file_seed = file_seed.replace( magic_phrase, replacement )
                        
                    
                except:
                    
                    pass
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 7, new_serialisable_info )
            
        
        if version == 7:
            
            file_seeds = HydrusSerialisable.SerialisableList()
            
            for ( file_seed_text, file_seed_info ) in old_serialisable_info:
                
                if file_seed_text.startswith( 'http' ):
                    
                    file_seed_type = FILE_SEED_TYPE_URL
                    
                else:
                    
                    file_seed_type = FILE_SEED_TYPE_HDD
                    
                
                file_seed = FileSeed( file_seed_type, file_seed_text )
                
                file_seed.status = file_seed_info[ 'status' ]
                file_seed.created = file_seed_info[ 'added_timestamp' ]
                file_seed.modified = file_seed_info[ 'last_modified_timestamp' ]
                file_seed.source_time = file_seed_info[ 'source_timestamp' ]
                file_seed.note = file_seed_info[ 'note' ]
                
                file_seeds.append( file_seed )
                
            
            new_serialisable_info = file_seeds.GetSerialisableTuple()
            
            return ( 8, new_serialisable_info )
            
        
    
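    # A worked example of the version 4 tumblr conversion above, using the example URL from the comments
    # (just the three helpers applied in the order the code does):
    #
    #     http://68.media.tumblr.com/5af0.../tumblr_opl012ZBOu1tiyj7vo1_500.jpg
    #       -> Remove68Subdomain:      http://media.tumblr.com/5af0.../tumblr_opl012ZBOu1tiyj7vo1_500.jpg
    #       -> ConvertRegularToRawURL: http://media.tumblr.com/5af0.../tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
    #       -> ConvertHTTPToHTTPS:     https://media.tumblr.com/5af0.../tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
    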
    def AddFileSeeds( self, file_seeds: typing.Collection[ FileSeed ] ):
        
        if len( file_seeds ) == 0:
            
            return 0
            
        
        new_file_seeds = []
        
        with self._lock:
            
            for file_seed in file_seeds:
                
                if self._HasFileSeed( file_seed ):
                    
                    continue
                    
                
                try:
                    
                    file_seed.Normalise()
                    
                except HydrusExceptions.URLClassException:
                    
                    # this is some borked 'https://' url that makes no sense
                    
                    continue
                    
                
                new_file_seeds.append( file_seed )
                
                self._file_seeds.append( file_seed )
                
                self._file_seeds_to_indices[ file_seed ] = len( self._file_seeds ) - 1
                
            
            self._SetStatusDirty()
            
        
        self.NotifyFileSeedsUpdated( new_file_seeds )
        
        return len( new_file_seeds )
        
    
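    # A rough usage sketch ( 'urls' and 'file_seed_cache' are hypothetical names, assuming url-type seeds ):
    #
    #     file_seeds = [ FileSeed( FILE_SEED_TYPE_URL, url ) for url in urls ]
    #     
    #     num_added = file_seed_cache.AddFileSeeds( file_seeds )
    #
    # num_added can be smaller than len( file_seeds ), since dupes and urls that fail Normalise are skipped silently.
    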
    def AdvanceFileSeed( self, file_seed: FileSeed ):
        
        with self._lock:
            
            if file_seed in self._file_seeds_to_indices:
                
                index = self._file_seeds_to_indices[ file_seed ]
                
                if index > 0:
                    
                    self._file_seeds.remove( file_seed )
                    
                    self._file_seeds.insert( index - 1, file_seed )
                    
                    self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
                    
                
            
        
        self.NotifyFileSeedsUpdated( ( file_seed, ) )
        
    
    def CanCompact( self, compact_before_this_source_time: int ):
        
        with self._lock:
            
            if len( self._file_seeds ) <= self.COMPACT_NUMBER:
                
                return False
                
            
            for file_seed in self._file_seeds[:-self.COMPACT_NUMBER]:
                
                if file_seed.status == CC.STATUS_UNKNOWN:
                    
                    continue
                    
                
                if self._GetSourceTimestamp( file_seed ) < compact_before_this_source_time:
                    
                    return True
                    
                
            
        
        return False
        
    
    def Compact( self, compact_before_this_source_time: int ):
        
        with self._lock:
            
            if len( self._file_seeds ) <= self.COMPACT_NUMBER:
                
                return
                
            
            new_file_seeds = HydrusSerialisable.SerialisableList()
            
            for file_seed in self._file_seeds[:-self.COMPACT_NUMBER]:
                
                still_to_do = file_seed.status == CC.STATUS_UNKNOWN
                still_relevant = self._GetSourceTimestamp( file_seed ) > compact_before_this_source_time
                
                if still_to_do or still_relevant:
                    
                    new_file_seeds.append( file_seed )
                    
                
            
            new_file_seeds.extend( self._file_seeds[-self.COMPACT_NUMBER:] )
            
            self._file_seeds = new_file_seeds
            self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
            
            self._SetStatusDirty()
            
        
    
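    # Compaction in a sketch: only seeds outside the newest COMPACT_NUMBER are candidates, and a candidate
    # survives if it is still unprocessed or its source time is after the cutoff. A typical caller might do:
    #
    #     if file_seed_cache.CanCompact( compact_before_this_source_time ):
    #         
    #         file_seed_cache.Compact( compact_before_this_source_time )
    #         
    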
    def DelayFileSeed( self, file_seed: FileSeed ):
        
        with self._lock:
            
            if file_seed in self._file_seeds_to_indices:
                
                index = self._file_seeds_to_indices[ file_seed ]
                
                if index < len( self._file_seeds ) - 1:
                    
                    self._file_seeds.remove( file_seed )
                    
                    self._file_seeds.insert( index + 1, file_seed )
                    
                    self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
                    
                
            
        
        self.NotifyFileSeedsUpdated( ( file_seed, ) )
        
    
    def GetAPIInfoDict( self, simple: bool ):
        
        with self._lock:
            
            d = {}
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            d[ 'status' ] = self._status_cache.GetStatusText()
            d[ 'simple_status' ] = self._status_cache.GetStatusText( simple = True )
            
            ( num_done, num_total ) = self._status_cache.GetValueRange()
            
            d[ 'total_processed' ] = num_done
            d[ 'total_to_process' ] = num_total
            
            if not simple:
                
                d[ 'import_items' ] = [ file_seed.GetAPIInfoDict( simple ) for file_seed in self._file_seeds ]
                
            
            return d
            
        
    
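    # Roughly, the returned dict looks like this (example values; the status strings come from the status
    # cache, and 'import_items' only appears when simple is False):
    #
    #     {
    #         'status' : '3 successful, 7 to do',
    #         'simple_status' : '7',
    #         'total_processed' : 3,
    #         'total_to_process' : 10,
    #         'import_items' : [ ... ]
    #     }
    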
    def GetEarliestSourceTime( self ):
        
        with self._lock:
            
            if len( self._file_seeds ) == 0:
                
                return None
                
            
            earliest_timestamp = min( ( self._GetSourceTimestamp( file_seed ) for file_seed in self._file_seeds ) )
            
        
        return earliest_timestamp
        
    
    def GetExampleFileSeed( self ):
        
        with self._lock:
            
            if len( self._file_seeds ) == 0:
                
                return None
                
            else:
                
                example_seed = self._GetNextFileSeed( CC.STATUS_UNKNOWN )
                
                if example_seed is None:
                    
                    example_seed = random.choice( self._file_seeds[-10:] )
                    
                
                if example_seed.file_seed_type == FILE_SEED_TYPE_HDD:
                    
                    return None
                    
                else:
                    
                    return example_seed
                    
                
            
        
    
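    # In brief: prefer the next unprocessed seed, otherwise sample one of the last ten added. hdd-type seeds
    # come back as None, presumably because callers want a url-shaped example to inspect.
    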
    def GetFileSeedCacheKey( self ):
        
        return self._file_seed_cache_key
        
    
    def GetFileSeedCount( self, status: int = None ):
        
        result = 0
        
        with self._lock:
            
            if status is None:
                
                result = len( self._file_seeds )
                
            else:
                
                for file_seed in self._file_seeds:
                    
                    if file_seed.status == status:
                        
                        result += 1
                        
                    
                
            
        
        return result
        
    
    def GetFileSeeds( self, status: int = None ):
        
        with self._lock:
            
            return self._GetFileSeeds( status )
            
        
    
    def GetFileSeedIndex( self, file_seed: FileSeed ):
        
        with self._lock:
            
            return self._file_seeds_to_indices[ file_seed ]
            
        
    
    def GetHashes( self ):
        
        with self._lock:
            
            hashes = [ file_seed.GetHash() for file_seed in self._file_seeds if file_seed.HasHash() ]
            
        
        return hashes
        
    
    def GetLatestSourceTime( self ):
        
        with self._lock:
            
            if len( self._file_seeds ) == 0:
                
                return 0
                
            
            latest_timestamp = max( ( self._GetSourceTimestamp( file_seed ) for file_seed in self._file_seeds ) )
            
        
        return latest_timestamp
        
    
    def GetNextFileSeed( self, status: int ):
        
        with self._lock:
            
            return self._GetNextFileSeed( status )
            
        
    
    def GetNumNewFilesSince( self, since: int ):
        
        num_files = 0
        
        with self._lock:
            
            for file_seed in self._file_seeds:
                
                source_timestamp = self._GetSourceTimestamp( file_seed )
                
                if source_timestamp >= since:
                    
                    num_files += 1
                    
                
            
        
        return num_files
        
    
    def GetPresentedHashes( self, file_import_options: ClientImportOptions.FileImportOptions ):
        
        with self._lock:
            
            eligible_file_seeds = [ file_seed for file_seed in self._file_seeds if file_seed.HasHash() ]
            
            file_seed_hashes = [ file_seed.GetHash() for file_seed in eligible_file_seeds ]
            
            if len( file_seed_hashes ) > 0:
                
                inbox_hashes = HG.client_controller.Read( 'in_inbox', file_seed_hashes )
                
            else:
                
                inbox_hashes = set()
                
            
            hashes = []
            hashes_seen = set()
            
            for file_seed in eligible_file_seeds:
                
                hash = file_seed.GetHash()
                
                if hash in hashes_seen:
                    
                    continue
                    
                
                in_inbox = hash in inbox_hashes
                
                if file_seed.ShouldPresent( file_import_options, in_inbox = in_inbox ):
                    
                    hashes.append( hash )
                    hashes_seen.add( hash )
                    
                
            
            return hashes
            
        
    
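    # Presentation flow in a sketch (hypothetical caller names): one batched 'in_inbox' read up front, then
    # each unique hash is kept only if its seed says it should present under the given import options:
    #
    #     presented = file_seed_cache.GetPresentedHashes( file_import_options )
    #
    # The hashes_seen set keeps the result de-duplicated while preserving first-seen order.
    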
    def GetStatus( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            return self._status_cache
            
        
    
    def GetValueRange( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            return self._status_cache.GetValueRange()
            
        
    
    def HasFileSeed( self, file_seed: FileSeed ):
        
        with self._lock:
            
            return self._HasFileSeed( file_seed )
            
        
    
    def InsertFileSeeds( self, index: int, file_seeds: typing.Collection[ FileSeed ] ):
        
        if len( file_seeds ) == 0:
            
            return 0
            
        
        new_file_seeds = set()
        
        with self._lock:
            
            index = min( index, len( self._file_seeds ) )
            
            for file_seed in file_seeds:
                
                if self._HasFileSeed( file_seed ) or file_seed in new_file_seeds:
                    
                    continue
                    
                
                file_seed.Normalise()
                
                new_file_seeds.add( file_seed )
                
                self._file_seeds.insert( index, file_seed )
                
                index += 1
                
            
            self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
            
            self._SetStatusDirty()
            
        
        self.NotifyFileSeedsUpdated( new_file_seeds )
        
        return len( new_file_seeds )
        
    
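    # A rough usage sketch (hypothetical names), inserting at the front of the queue for a 'try these first'
    # workflow:
    #
    #     num_added = file_seed_cache.InsertFileSeeds( 0, file_seeds )
    #
    # index is clamped to the current length, so an oversized index degrades to an append.
    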
    def NotifyFileSeedsUpdated( self, file_seeds: typing.Collection[ FileSeed ] ):
        
        with self._lock:
            
            self._SetStatusDirty()
            
        
        HG.client_controller.pub( 'file_seed_cache_file_seeds_updated', self._file_seed_cache_key, file_seeds )
        
    
    def RemoveFileSeeds( self, file_seeds: typing.Iterable[ FileSeed ] ):
        
        with self._lock:
            
            file_seeds_to_delete = set( file_seeds )
            
            self._file_seeds = HydrusSerialisable.SerialisableList( [ file_seed for file_seed in self._file_seeds if file_seed not in file_seeds_to_delete ] )
            
            self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
            
            self._SetStatusDirty()
            
        
        self.NotifyFileSeedsUpdated( file_seeds_to_delete )
        
    
    def RemoveFileSeedsByStatus( self, statuses_to_remove: typing.Collection[ int ] ):
        
        with self._lock:
            
            file_seeds_to_delete = [ file_seed for file_seed in self._file_seeds if file_seed.status in statuses_to_remove ]
            
        
        self.RemoveFileSeeds( file_seeds_to_delete )
        
    
    def RemoveAllButUnknownFileSeeds( self ):
        
        with self._lock:
            
            file_seeds_to_delete = [ file_seed for file_seed in self._file_seeds if file_seed.status != CC.STATUS_UNKNOWN ]
            
        
        self.RemoveFileSeeds( file_seeds_to_delete )
        
    
    def RetryFailed( self ):
        
        with self._lock:
            
            failed_file_seeds = self._GetFileSeeds( CC.STATUS_ERROR )
            
            for file_seed in failed_file_seeds:
                
                file_seed.SetStatus( CC.STATUS_UNKNOWN )
                
            
        
        self.NotifyFileSeedsUpdated( failed_file_seeds )
        
    
    def RetryIgnored( self ):
        
        with self._lock:
            
            ignored_file_seeds = self._GetFileSeeds( CC.STATUS_VETOED )
            
            for file_seed in ignored_file_seeds:
                
                file_seed.SetStatus( CC.STATUS_UNKNOWN )
                
            
        
        self.NotifyFileSeedsUpdated( ignored_file_seeds )
        
    
    def WorkToDo( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            return self._status_cache.HasWorkToDo()
            
        
    
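    # The status cache pattern used by GetStatus/GetValueRange/WorkToDo above: mutations only flip
    # _status_dirty, and _GenerateStatus recomputes lazily on the next read, so bulk updates stay cheap.
    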
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE ] = FileSeedCache
def GenerateFileSeedCachesStatus( file_seed_caches: typing.Iterable[ FileSeedCache ] ):
    
    fscs = FileSeedCacheStatus()
    
    for file_seed_cache in file_seed_caches:
        
        fscs.Merge( file_seed_cache.GetStatus() )
        
    
    return fscs
    
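# A rough usage sketch ( 'queries' is a hypothetical collection of objects exposing GetFileSeedCache, as
# subscription queries do ): merging several caches into one status for a combined progress readout.
#
#     file_seed_caches = [ query.GetFileSeedCache() for query in queries ]
#     
#     fscs = GenerateFileSeedCachesStatus( file_seed_caches )
#     
#     ( num_done, num_total ) = fscs.GetValueRange()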