import bisect
import collections
import itertools
import os
import random
import re
import threading
import time
import traceback
import typing
import urllib.parse

from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.core import HydrusExceptions
from hydrus.core import HydrusFileHandling
from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusPaths
from hydrus.core import HydrusSerialisable
from hydrus.core import HydrusTags
from hydrus.core import HydrusTemp

from hydrus.client import ClientConstants as CC
from hydrus.client import ClientData
from hydrus.client import ClientParsing
from hydrus.client import ClientTime
from hydrus.client.importing import ClientImportFiles
from hydrus.client.importing import ClientImporting
from hydrus.client.importing.options import FileImportOptions
from hydrus.client.importing.options import PresentationImportOptions
from hydrus.client.importing.options import TagImportOptions
from hydrus.client.metadata import ClientTags
from hydrus.client.networking import ClientNetworkingDomain
from hydrus.client.networking import ClientNetworkingFunctions

FILE_SEED_TYPE_HDD = 0
FILE_SEED_TYPE_URL = 1

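# A FileSeed represents one item in an import queue: either a URL to download
# or a local path to ingest. A minimal usage sketch (hypothetical, for
# illustration only -- real callers go through FileSeedCache and the importing
# pipelines):
#
#   file_seed = FileSeed( FILE_SEED_TYPE_URL, 'https://example.com/post/123' )
#   file_seed.Normalise()
#
#   hdd_seed = FileSeed( FILE_SEED_TYPE_HDD, '/path/to/file.jpg' )
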
class FileSeed( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED
    SERIALISABLE_NAME = 'File Import'
    SERIALISABLE_VERSION = 5
    
    def __init__( self, file_seed_type: int = None, file_seed_data: str = None ):
        
        if file_seed_type is None:
            
            file_seed_type = FILE_SEED_TYPE_URL
            
        
        if file_seed_data is None:
            
            file_seed_data = 'https://big-guys.4u/monica_lewinsky_hott.tiff.exe.vbs'
            
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.file_seed_type = file_seed_type
        self.file_seed_data = file_seed_data
        
        self.created = HydrusData.GetNow()
        self.modified = self.created
        self.source_time = None
        self.status = CC.STATUS_UNKNOWN
        self.note = ''
        
        self._cloudflare_last_modified_time = None
        
        self._referral_url = None
        
        self._external_filterable_tags = set()
        self._external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()
        
        self._primary_urls = set()
        self._source_urls = set()
        self._tags = set()
        self._hashes = {}
        
    
    def __eq__( self, other ):
        
        if isinstance( other, FileSeed ):
            
            return self.__hash__() == other.__hash__()
            
        
        return NotImplemented
        
    
    def __hash__( self ):
        
        return ( self.file_seed_type, self.file_seed_data ).__hash__()
        
    
    def __ne__( self, other ):
        
        return self.__hash__() != other.__hash__()
        
    
    def _AddPrimaryURLs( self, urls ):
        
        if len( urls ) == 0:
            
            return
            
        
        urls = ClientNetworkingFunctions.NormaliseAndFilterAssociableURLs( urls )
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            urls.discard( self.file_seed_data )
            
        
        if self._referral_url is not None:
            
            urls.discard( self._referral_url )
            
        
        self._primary_urls.update( urls )
        self._source_urls.difference_update( urls )
        
    
    def _AddSourceURLs( self, urls ):
        
        if len( urls ) == 0:
            
            return
            
        
        urls = ClientNetworkingFunctions.NormaliseAndFilterAssociableURLs( urls )
        
        all_primary_urls = set()
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            all_primary_urls.add( self.file_seed_data )
            
        
        if self._referral_url is not None:
            
            all_primary_urls.add( self._referral_url )
            
        
        all_primary_urls.update( self._primary_urls )
        
        urls.difference_update( all_primary_urls )
        
        primary_url_classes = { HG.client_controller.network_engine.domain_manager.GetURLClass( url ) for url in all_primary_urls }
        primary_url_classes.discard( None )
        
        # ok when a booru has a """"""source"""""" url that points to a file alternate on the same booru, that isn't what we call a source url
        # so anything that has a source url with the same url class as our primaries, just some same-site loopback, we'll dump
        urls = { url for url in urls if HG.client_controller.network_engine.domain_manager.GetURLClass( url ) not in primary_url_classes }
        
        self._source_urls.update( urls )
        
    
    def _CheckTagsVeto( self, tags, tag_import_options: TagImportOptions.TagImportOptions ):
        
        if len( tags ) > 0:
            
            tags_to_siblings = HG.client_controller.Read( 'tag_siblings_lookup', CC.COMBINED_TAG_SERVICE_KEY, tags )
            
            all_chain_tags = set( itertools.chain.from_iterable( tags_to_siblings.values() ) )
            
            tag_import_options.CheckTagsVeto( tags, all_chain_tags )
            
        
    
    def _GetSerialisableInfo( self ):
        
        serialisable_external_filterable_tags = list( self._external_filterable_tags )
        serialisable_external_additional_service_keys_to_tags = self._external_additional_service_keys_to_tags.GetSerialisableTuple()
        
        serialisable_primary_urls = list( self._primary_urls )
        serialisable_source_urls = list( self._source_urls )
        serialisable_tags = list( self._tags )
        serialisable_hashes = [ ( hash_type, hash.hex() ) for ( hash_type, hash ) in list( self._hashes.items() ) if hash is not None ]
        
        return (
            self.file_seed_type,
            self.file_seed_data,
            self.created,
            self.modified,
            self.source_time,
            self.status,
            self.note,
            self._referral_url,
            serialisable_external_filterable_tags,
            serialisable_external_additional_service_keys_to_tags,
            serialisable_primary_urls,
            serialisable_source_urls,
            serialisable_tags,
            serialisable_hashes
        )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        (
            self.file_seed_type,
            self.file_seed_data,
            self.created,
            self.modified,
            self.source_time,
            self.status,
            self.note,
            self._referral_url,
            serialisable_external_filterable_tags,
            serialisable_external_additional_service_keys_to_tags,
            serialisable_primary_urls,
            serialisable_source_urls,
            serialisable_tags,
            serialisable_hashes
        ) = serialisable_info
        
        self._external_filterable_tags = set( serialisable_external_filterable_tags )
        self._external_additional_service_keys_to_tags = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_external_additional_service_keys_to_tags )
        
        self._primary_urls = set( serialisable_primary_urls )
        self._source_urls = set( serialisable_source_urls )
        self._tags = set( serialisable_tags )
        self._hashes = { hash_type : bytes.fromhex( encoded_hash ) for ( hash_type, encoded_hash ) in serialisable_hashes if encoded_hash is not None }
        
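    # The hash dict survives serialisation as hex text. A minimal sketch of the
    # round-trip used above (hash value invented for illustration):
    #
    #   hashes = { 'md5' : bytes.fromhex( '0123456789abcdef0123456789abcdef' ) }
    #   encoded = [ ( t, h.hex() ) for ( t, h ) in hashes.items() if h is not None ]
    #   decoded = { t : bytes.fromhex( e ) for ( t, e ) in encoded if e is not None }
    #   assert decoded == hashes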
    def _SetupTagImportOptions( self, given_tag_import_options: TagImportOptions.TagImportOptions ) -> TagImportOptions.TagImportOptions:
        
        if given_tag_import_options.IsDefault():
            
            if self.IsAPostURL():
                
                tio_lookup_url = self.file_seed_data
                
            else:
                
                if self._referral_url is not None:
                    
                    tio_lookup_url = self._referral_url
                    
                else:
                    
                    tio_lookup_url = self.file_seed_data
                    
                
            
            tag_import_options = HG.client_controller.network_engine.domain_manager.GetDefaultTagImportOptionsForURL( tio_lookup_url )
            
        else:
            
            tag_import_options = given_tag_import_options
            
        
        return tag_import_options
        
    
    def _UpdateModified( self ):
        
        self.modified = HydrusData.GetNow()
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            ( file_seed_type, file_seed_data, created, modified, source_time, status, note, serialisable_urls, serialisable_tags, serialisable_hashes ) = old_serialisable_info
            
            referral_url = None
            
            new_serialisable_info = ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_urls, serialisable_tags, serialisable_hashes )
            
            return ( 2, new_serialisable_info )
            
        
        if version == 2:
            
            ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_urls, serialisable_tags, serialisable_hashes ) = old_serialisable_info
            
            external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()
            
            serialisable_external_additional_service_keys_to_tags = external_additional_service_keys_to_tags.GetSerialisableTuple()
            
            new_serialisable_info = ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_external_additional_service_keys_to_tags, serialisable_urls, serialisable_tags, serialisable_hashes )
            
            return ( 3, new_serialisable_info )
            
        
        if version == 3:
            
            ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_external_additional_service_keys_to_tags, serialisable_urls, serialisable_tags, serialisable_hashes ) = old_serialisable_info
            
            external_filterable_tags = set()
            
            serialisable_external_filterable_tags = list( external_filterable_tags )
            
            new_serialisable_info = ( file_seed_type, file_seed_data, created, modified, source_time, status, note, referral_url, serialisable_external_filterable_tags, serialisable_external_additional_service_keys_to_tags, serialisable_urls, serialisable_tags, serialisable_hashes )
            
            return ( 4, new_serialisable_info )
            
        
        if version == 4:
            
            (
                file_seed_type,
                file_seed_data,
                created,
                modified,
                source_time,
                status,
                note,
                referral_url,
                serialisable_external_filterable_tags,
                serialisable_external_additional_service_keys_to_tags,
                serialisable_urls,
                serialisable_tags,
                serialisable_hashes
            ) = old_serialisable_info
            
            serialisable_primary_urls = serialisable_urls
            serialisable_source_urls = []
            
            new_serialisable_info = (
                file_seed_type,
                file_seed_data,
                created,
                modified,
                source_time,
                status,
                note,
                referral_url,
                serialisable_external_filterable_tags,
                serialisable_external_additional_service_keys_to_tags,
                serialisable_primary_urls,
                serialisable_source_urls,
                serialisable_tags,
                serialisable_hashes
            )
            
            return ( 5, new_serialisable_info )
            
        
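    # Serialisable updates apply one step at a time: a version-1 tuple is
    # upgraded 1 -> 2 -> 3 -> 4 -> 5 by repeated calls to the method above. A
    # hedged sketch of the driving loop (the real one lives in
    # HydrusSerialisable; exact details may differ):
    #
    #   while version < self.SERIALISABLE_VERSION:
    #       ( version, serialisable_info ) = self._UpdateSerialisableInfo( version, serialisable_info )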
    def AddParseResults( self, parse_results, file_import_options: FileImportOptions.FileImportOptions ):
        
        for ( hash_type, hash ) in ClientParsing.GetHashesFromParseResults( parse_results ):
            
            if hash_type not in self._hashes:
                
                self._hashes[ hash_type ] = hash
                
            
        
        source_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_SOURCE, ) )
        
        self._AddSourceURLs( source_urls )
        
        tags = ClientParsing.GetTagsFromParseResults( parse_results )
        
        self._tags.update( tags )
        
        source_timestamp = ClientParsing.GetTimestampFromParseResults( parse_results, HC.TIMESTAMP_TYPE_SOURCE )
        
        if source_timestamp is not None:
            
            source_timestamp = min( HydrusData.GetNow() - 30, source_timestamp )
            
            self.source_time = source_timestamp
            
        
        self._UpdateModified()
        
    
    def AddTags( self, tags ):
        
        tags = HydrusTags.CleanTags( tags )
        
        self._tags.update( tags )
        
        self._UpdateModified()
        
    
    def AddPrimaryURLs( self, urls ):
        
        self._AddPrimaryURLs( urls )
        
    
    def AddSourceURLs( self, urls ):
        
        self._AddSourceURLs( urls )
        
    
    def CheckPreFetchMetadata( self, tag_import_options: TagImportOptions.TagImportOptions ):
        
        self._CheckTagsVeto( self._tags, tag_import_options )
        
    
    def DownloadAndImportRawFile( self, file_url: str, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook, override_bandwidth = False, forced_referral_url = None, file_seed_cache = None ):
        
        self.AddPrimaryURLs( ( file_url, ) )
        
        ( os_file_handle, temp_path ) = HydrusTemp.GetTempPath()
        
        try:
            
            if forced_referral_url is not None:
                
                referral_url = forced_referral_url
                
            elif self.file_seed_data != file_url:
                
                referral_url = self.file_seed_data
                
            else:
                
                referral_url = self._referral_url
                
            
            status_hook( 'downloading file' )
            
            network_job = network_job_factory( 'GET', file_url, temp_path = temp_path, referral_url = referral_url )
            
            if override_bandwidth:
                
                network_job.OverrideBandwidth( 3 )
                
            
            network_job.SetFileImportOptions( file_import_options )
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            with network_job_presentation_context_factory( network_job ) as njpc:
                
                network_job.WaitUntilDone()
                
            
            actual_fetched_url = network_job.GetActualFetchedURL()
            
            if actual_fetched_url != file_url:
                
                self._AddPrimaryURLs( ( actual_fetched_url, ) )
                
                ( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
                
                if actual_url_type == HC.URL_TYPE_POST and actual_can_parse:
                    
                    # we just had a 3XX redirect to a Post URL!
                    
                    if file_seed_cache is None:
                        
                        raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}", but then there was no file log in which to queue that download!'.format( file_url, actual_fetched_url ) )
                        
                    else:
                        
                        ( original_url_type, original_match_name, original_can_parse, original_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
                        
                        if original_url_type == actual_url_type and original_match_name == actual_match_name:
                            
                            raise Exception( 'The downloader thought it had a raw file url with "{}", but that redirected to the apparent Post URL "{}". As that URL has the same class as this import job\'s original URL, we are stopping here in case this is a looping redirect!'.format( file_url, actual_fetched_url ) )
                            
                        
                        file_seed = FileSeed( FILE_SEED_TYPE_URL, actual_fetched_url )
                        
                        file_seed.SetReferralURL( file_url )
                        
                        file_seeds = [ file_seed ]
                        
                        file_seed_cache.AddFileSeeds( file_seeds )
                        
                        status = CC.STATUS_SUCCESSFUL_AND_CHILD_FILES
                        
                        note = 'was redirected on file download to a post url, which has been queued in the parent file log'
                        
                        self.SetStatus( status, note = note )
                        
                        return
                        
                    
                
            
            last_modified_time = network_job.GetLastModifiedTime()
            
            if self.source_time is not None and last_modified_time is not None:
                
                # even with timezone weirdness, does the current source time have something reasonable?
                current_source_time_looks_good = HydrusData.TimeHasPassed( self.source_time - 86400 )
                
                # if CF is delivering a timestamp from 17 days before source time, this is probably some unusual CDN situation or delayed post
                # we don't _really_ want this CF timestamp since it throws the domain-based timestamp ordering out
                # in future maybe we'll save it as a misc 'cloudflare' domain or something, but for now we'll discard
                if network_job.IsCloudFlareCache() and abs( self.source_time - last_modified_time ) > 86400 * 2:
                    
                    self._cloudflare_last_modified_time = last_modified_time
                    
                    last_modified_time = None
                    
                
            
            self.source_time = ClientTime.MergeModifiedTimes( self.source_time, last_modified_time )
            
            status_hook( 'importing file' )
            
            self.Import( temp_path, file_import_options, status_hook = status_hook )
            
        finally:
            
            HydrusTemp.CleanUpTempPath( os_file_handle, temp_path )
            
        
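    # DownloadAndImportRawFile is driven by injected callables. A hedged sketch
    # of the shapes callers pass in (names here are illustrative assumptions,
    # not the real implementations):
    #
    #   def network_job_factory( method, url, **kwargs ):
    #       return ClientNetworkingJobs.NetworkJob( method, url, **kwargs )
    #
    #   def status_hook( text ):
    #       pass  # e.g. update a gui status label
    #
    #   network_job_presentation_context_factory( network_job ) returns a
    #   context manager that presents the job in the ui while it runs.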
    def FetchPageMetadata( self, tag_import_options: TagImportOptions.TagImportOptions ):
        
        pass
        
    
    def GetAPIInfoDict( self, simple: bool ):
        
        d = {}
        
        d[ 'import_data' ] = self.file_seed_data
        d[ 'created' ] = self.created
        d[ 'modified' ] = self.modified
        d[ 'source_time' ] = self.source_time
        d[ 'status' ] = self.status
        d[ 'note' ] = self.note
        
        return d
        
    
    def GetExampleNetworkJob( self, network_job_factory ):
        
        if self.IsAPostURL():
            
            post_url = self.file_seed_data
            
            try:
                
                ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
                
            except HydrusExceptions.URLClassException:
                
                url_to_check = post_url
                
            
        else:
            
            url_to_check = self.file_seed_data
            
        
        network_job = network_job_factory( 'GET', url_to_check )
        
        return network_job
        
    
    def GetHash( self ):
        
        if 'sha256' in self._hashes:
            
            return self._hashes[ 'sha256' ]
            
        
        return None
        
    
    def GetHashTypesToHashes( self ):
        
        return dict( self._hashes )
        
    
    def GetPreImportStatusPredictionHash( self, file_import_options: FileImportOptions.FileImportOptions ) -> typing.Tuple[ bool, ClientImportFiles.FileImportStatus ]:
        
        hash_match_found = False
        
        if file_import_options.DoNotCheckHashesBeforeImporting() or len( self._hashes ) == 0:
            
            return ( hash_match_found, ClientImportFiles.FileImportStatus.STATICGetUnknownStatus() )
            
        
        # hashes
        
        jobs = []
        
        if 'sha256' in self._hashes:
            
            jobs.append( ( 'sha256', self._hashes[ 'sha256' ] ) )
            
        
        for ( hash_type, found_hash ) in self._hashes.items():
            
            if hash_type == 'sha256':
                
                continue
                
            
            jobs.append( ( hash_type, found_hash ) )
            
        
        first_result = None
        
        for ( hash_type, found_hash ) in jobs:
            
            file_import_status = HG.client_controller.Read( 'hash_status', hash_type, found_hash, prefix = '{} hash recognised'.format( hash_type ) )
            
            hash_match_found = True
            
            file_import_status = ClientImportFiles.CheckFileImportStatus( file_import_status )
            
            if first_result is None:
                
                first_result = file_import_status
                
            
            if not file_import_status.ShouldImport( file_import_options ):
                
                return ( hash_match_found, file_import_status )
                
            
        
        # we do first_result gubbins rather than generating a fresh unknown one to capture correct sha256 hash and mime if db provided it
        if first_result is None:
            
            return ( hash_match_found, ClientImportFiles.FileImportStatus.STATICGetUnknownStatus() )
            
        else:
            
            return ( hash_match_found, first_result )
            
        
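    # The job list above is deliberately ordered: sha256 is the client's native
    # file identity, so it is checked before md5/sha1 etc., which only map to a
    # file via the db. A hedged illustration (hash bytes elided):
    #
    #   self._hashes = { 'md5' : b'...', 'sha256' : b'...' }
    #   jobs -> [ ( 'sha256', b'...' ), ( 'md5', b'...' ) ]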
    def GetPreImportStatusPredictionURL( self, file_import_options: FileImportOptions.FileImportOptions, file_url = None ) -> ClientImportFiles.FileImportStatus:
        
        if file_import_options.DoNotCheckKnownURLsBeforeImporting():
            
            return ClientImportFiles.FileImportStatus.STATICGetUnknownStatus()
            
        
        # urls
        
        urls = []
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            urls.append( self.file_seed_data )
            
        
        if file_url is not None:
            
            urls.append( file_url )
            
        
        urls.extend( self._primary_urls )
        
        # now that we store primary and source urls separately, we'll trust any primary but be careful about source
        # trusting classless source urls was too much of a hassle with too many boorus providing bad source urls like user account pages
        
        urls.extend( ( url for url in self._source_urls if HG.client_controller.network_engine.domain_manager.URLDefinitelyRefersToOneFile( url ) ) )
        
        # now discard gallery pages or post urls that can hold multiple files
        urls = [ url for url in urls if not HG.client_controller.network_engine.domain_manager.URLCanReferToMultipleFiles( url ) ]
        
        unrecognised_url_results = set()
        
        first_result = None
        
        for url in urls:
            
            results = HG.client_controller.Read( 'url_statuses', url )
            
            if len( results ) == 0: # if no match found, no useful data discovered
                
                continue
                
            elif len( results ) > 1: # if more than one file claims this url, it cannot be relied on to guess the file
                
                continue
                
            else: # i.e. 1 match found
                
                file_import_status = results[0]
                
                file_import_status = ClientImportFiles.CheckFileImportStatus( file_import_status )
                
                if first_result is None:
                    
                    first_result = file_import_status
                    
                
                if not file_import_status.ShouldImport( file_import_options ):
                    
                    hash = file_import_status.hash
                    
                    # a known one-file url has given a single clear result. sounds good
                    
                    we_have_a_match = True
                    
                    if self.file_seed_type == FILE_SEED_TYPE_URL:
                        
                        # to double-check, let's see if the file that claims that url has any other interesting urls
                        # if the file has another url with the same url class as ours, then this is prob an unreliable 'alternate' source url attribution, and untrustworthy
                        
                        my_url = self.file_seed_data
                        
                        if url != my_url:
                            
                            my_url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( my_url )
                            
                            media_result = HG.client_controller.Read( 'media_result', hash )
                            
                            this_files_urls = media_result.GetLocationsManager().GetURLs()
                            
                            for this_files_url in this_files_urls:
                                
                                if this_files_url != my_url:
                                    
                                    try:
                                        
                                        this_url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( this_files_url )
                                        
                                    except HydrusExceptions.URLClassException:
                                        
                                        continue
                                        
                                    
                                    if my_url_class == this_url_class:
                                        
                                        # oh no, the file this source url refers to has a different known url in this same domain
                                        # it is more likely that an edit on this site points to the original elsewhere
                                        
                                        we_have_a_match = False
                                        
                                        break
                                        
                                    
                                
                            
                        
                    
                    if we_have_a_match:
                        
                        # if a known one-file url gives a single clear result, that result is reliable
                        
                        return file_import_status
                        
                    
                
            
        
        # we do first_result gubbins rather than generating a fresh unknown one to capture correct sha256 hash and mime if db provided it
        if first_result is None:
            
            return ClientImportFiles.FileImportStatus.STATICGetUnknownStatus()
            
        else:
            
            return first_result
            
        
    
    def GetSearchFileSeeds( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            search_urls = ClientNetworkingFunctions.GetSearchURLs( self.file_seed_data )
            
            search_file_seeds = [ FileSeed( FILE_SEED_TYPE_URL, search_url ) for search_url in search_urls ]
            
        else:
            
            search_file_seeds = [ self ]
            
        
        return search_file_seeds
        
    
    def GetExternalTags( self ):
        
        t = set( self._tags )
        
        t.update( self._external_filterable_tags )
        
        return t
        
    
    def GetPrimaryURLs( self ):
        
        return set( self._primary_urls )
        
    
    def GetReferralURL( self ):
        
        return self._referral_url
        
    
    def GetSourceURLs( self ):
        
        return set( self._source_urls )
        
    
    def HasHash( self ):
        
        return self.GetHash() is not None
        
    
    def Import( self, temp_path: str, file_import_options: FileImportOptions.FileImportOptions, status_hook = None ):
        
        file_import_job = ClientImportFiles.FileImportJob( temp_path, file_import_options )
        
        file_import_status = file_import_job.DoWork( status_hook = status_hook )
        
        self.SetStatus( file_import_status.status, note = file_import_status.note )
        self.SetHash( file_import_status.hash )
        
    
    def ImportPath( self, file_seed_cache: "FileSeedCache", file_import_options: FileImportOptions.FileImportOptions, limited_mimes = None, status_hook = None ):
        
        try:
            
            if self.file_seed_type != FILE_SEED_TYPE_HDD:
                
                raise HydrusExceptions.VetoException( 'Attempted to import as a path, but I do not think I am a path!' )
                
            
            path = self.file_seed_data
            
            if not os.path.exists( path ):
                
                raise HydrusExceptions.VetoException( 'Source file does not exist!' )
                
            
            ( os_file_handle, temp_path ) = HydrusTemp.GetTempPath()
            
            try:
                
                if status_hook is not None:
                    
                    status_hook( 'copying file to temp location' )
                    
                
                copied = HydrusPaths.MirrorFile( path, temp_path )
                
                if not copied:
                    
                    raise Exception( 'File failed to copy to temp path--see log for error.' )
                    
                
                if limited_mimes is not None:
                    
                    # I think this thing should and will be rolled into file import options later
                    
                    if status_hook is not None:
                        
                        status_hook( 'testing file type' )
                        
                    
                    mime = HydrusFileHandling.GetMime( temp_path )
                    
                    if mime not in limited_mimes:
                        
                        raise HydrusExceptions.VetoException( 'Not in allowed mimes!' )
                        
                    
                
                self.Import( temp_path, file_import_options, status_hook = status_hook )
                
            finally:
                
                HydrusTemp.CleanUpTempPath( os_file_handle, temp_path )
                
            
            self.WriteContentUpdates( file_import_options = file_import_options )
            
        except HydrusExceptions.VetoException as e:
            
            self.SetStatus( CC.STATUS_VETOED, note = str( e ) )
            
        except HydrusExceptions.UnsupportedFileException as e:
            
            self.SetStatus( CC.STATUS_ERROR, note = str( e ) )
            
        except Exception as e:
            
            self.SetStatus( CC.STATUS_ERROR, exception = e )
            
        
        file_seed_cache.NotifyFileSeedsUpdated( ( self, ) )
        
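    # A hedged usage sketch for the HDD path flow above (names illustrative;
    # real callers are the local import pages and import folders):
    #
    #   file_seed = FileSeed( FILE_SEED_TYPE_HDD, '/some/dir/image.jpg' )
    #   file_seed.ImportPath( file_seed_cache, file_import_options )
    #
    # Note the method never raises: success, veto, and error all end up as a
    # status on the seed, and the cache is notified either way.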
    def IsAPostURL( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            try:
                
                ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
                
            except HydrusExceptions.URLClassException:
                
                return False
                
            
            if url_type == HC.URL_TYPE_POST:
                
                return True
                
            
        
        return False
        
    
    def IsDeleted( self ):
        
        return self.status == CC.STATUS_DELETED
        
    
    def IsLocalFileImport( self ):
        
        return self.file_seed_type == FILE_SEED_TYPE_HDD
        
    
    def IsProbablyMasterPostURL( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            if self._referral_url is not None:
                
                try:
                    
                    # if our given referral is a post url, we are most probably a multi-file url
                    
                    ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self._referral_url )
                    
                    if url_type == HC.URL_TYPE_POST:
                        
                        return False
                        
                    
                except:
                    
                    # screw it
                    return True
                    
                
            
        
        return True
        
    
    def IsURLFileImport( self ):
        
        return self.file_seed_type == FILE_SEED_TYPE_URL
        
    
    def Normalise( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            try:
                
                self.file_seed_data = HG.client_controller.network_engine.domain_manager.NormaliseURL( self.file_seed_data )
                
            except HydrusExceptions.URLClassException:
                
                pass
                
            
        
    
    def PredictPreImportStatus( self, file_import_options: FileImportOptions.FileImportOptions, tag_import_options: TagImportOptions.TagImportOptions, file_url = None ):
        
        ( hash_match_found, hash_file_import_status ) = self.GetPreImportStatusPredictionHash( file_import_options )
        
        # now let's set the prediction
        
        url_file_import_status = None
        
        if hash_match_found: # trust hashes over urls m8
            
            file_import_status = hash_file_import_status
            
        else:
            
            url_file_import_status = self.GetPreImportStatusPredictionURL( file_import_options, file_url = file_url )
            
            file_import_status = url_file_import_status
            
        
        # and make some recommendations
        
        should_download_file = file_import_status.ShouldImport( file_import_options )
        
        should_download_metadata = should_download_file # if we want the file, we need the metadata to get the file_url!
        
        # but if we otherwise still want to force some tags, let's do it
        if not should_download_metadata and tag_import_options.WorthFetchingTags():
            
            url_override = False
            
            if tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB():
                
                if url_file_import_status is None:
                    
                    url_file_import_status = self.GetPreImportStatusPredictionURL( file_import_options, file_url = file_url )
                    
                
                if url_file_import_status.AlreadyInDB():
                    
                    url_override = True
                    
                
            
            hash_override = hash_file_import_status.AlreadyInDB() and tag_import_options.ShouldFetchTagsEvenIfHashKnownAndFileAlreadyInDB()
            
            if url_override or hash_override:
                
                should_download_metadata = True
                
            
        
        # update private status store if predictions are useful
        
        if self.status == CC.STATUS_UNKNOWN and not should_download_file:
            
            self.status = file_import_status.status
            
            if file_import_status.hash is not None:
                
                self._hashes[ 'sha256' ] = file_import_status.hash
                
            
            self.note = file_import_status.note
            
            self._UpdateModified()
            
        
        return ( should_download_metadata, should_download_file )
        
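    # Roughly, the pair of booleans returned above behaves like this (a hedged
    # summary of the logic, not exhaustive):
    #
    #   prediction says 'new file'                 -> ( True, True )
    #   already in db, but forced tag fetch wanted -> ( True, False )
    #   already in db, nothing else wanted         -> ( False, False )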
    def PresentToPage( self, page_key: bytes ):
        
        hash = self.GetHash()
        
        if hash is not None:
            
            media_result = HG.client_controller.Read( 'media_result', hash )
            
            HG.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
            
        
    
    def SetExternalAdditionalServiceKeysToTags( self, service_keys_to_tags ):
        
        self._external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags( service_keys_to_tags )
        
    
    def SetExternalFilterableTags( self, tags ):
        
        self._external_filterable_tags = set( tags )
        
    
    def SetHash( self, hash ):
        
        if hash is not None:
            
            self._hashes[ 'sha256' ] = hash
            
        
    
    def SetReferralURL( self, referral_url: str ):
        
        self._referral_url = referral_url
        
    
    def SetStatus( self, status: int, note: str = '', exception = None ):
        
        if exception is not None:
            
            first_line = str( exception ).split( os.linesep )[0]
            
            note = first_line + '\u2026 (Copy note to see full error)'
            note += os.linesep
            note += traceback.format_exc()
            
            HydrusData.Print( 'Error when processing {}!'.format( self.file_seed_data ) )
            HydrusData.Print( traceback.format_exc() )
            
        
        self.status = status
        self.note = note
        
        self._UpdateModified()
        
    
    def ShouldPresent( self, presentation_import_options: PresentationImportOptions.PresentationImportOptions ):
        
        if not self.HasHash():
            
            return False
            
        
        was_just_imported = not HydrusData.TimeHasPassed( self.modified + 5 )
        
        should_check_location = not was_just_imported
        
        return presentation_import_options.ShouldPresentHashAndStatus( self.GetHash(), self.status, should_check_location = should_check_location )
        
    
    def WorksInNewSystem( self ):
        
        if self.file_seed_type == FILE_SEED_TYPE_URL:
            
            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
            
            if url_type == HC.URL_TYPE_FILE:
                
                return True
                
            
            if url_type == HC.URL_TYPE_POST and can_parse:
                
                return True
                
            
            if url_type == HC.URL_TYPE_UNKNOWN and self._referral_url is not None: # this is likely to be a multi-file child of a post url file_seed
                
                ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self._referral_url )
                
                if url_type == HC.URL_TYPE_POST: # we must have got here through parsing that m8, so let's assume this is an unrecognised file url
                    
                    return True
                    
                
            
        
        return False
        
    def WorkOnURL( self, file_seed_cache: "FileSeedCache", status_hook, network_job_factory, network_job_presentation_context_factory, file_import_options: FileImportOptions.FileImportOptions, tag_import_options: TagImportOptions.TagImportOptions ):
        
        did_substantial_work = False
        
        try:
            
            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.file_seed_data )
            
            if url_type not in ( HC.URL_TYPE_POST, HC.URL_TYPE_FILE, HC.URL_TYPE_UNKNOWN ):
                
                raise HydrusExceptions.VetoException( 'This URL appeared to be a "{}", which is not a File or Post URL!'.format( match_name ) )
                
            
            if url_type == HC.URL_TYPE_POST and not can_parse:
                
                raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason ) )
                
            
            tag_import_options = self._SetupTagImportOptions( tag_import_options )
            
            status_hook( 'checking url status' )
            
            ( should_download_metadata, should_download_file ) = self.PredictPreImportStatus( file_import_options, tag_import_options )
            
            if self.IsAPostURL():
                
                if should_download_metadata:
                    
                    did_substantial_work = True
                    
                    post_url = self.file_seed_data
                    
                    url_for_child_referral = post_url
                    
                    ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
                    
                    status_hook( 'downloading file page' )
                    
                    if self._referral_url is not None and self._referral_url != url_to_check:
                        
                        referral_url = self._referral_url
                        
                    elif url_to_check != post_url:
                        
                        referral_url = post_url
                        
                    else:
                        
                        referral_url = None
                        
                    
                    network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
                    
                    HG.client_controller.network_engine.AddJob( network_job )
                    
                    with network_job_presentation_context_factory( network_job ) as njpc:
                        
                        network_job.WaitUntilDone()
                        
                    
                    parsing_text = network_job.GetContentText()
                    
                    actual_fetched_url = network_job.GetActualFetchedURL()
                    
                    if actual_fetched_url != url_to_check:
                        
                        # we have redirected, a 3XX response
                        
                        ( actual_url_type, actual_match_name, actual_can_parse, actual_cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
                        
                        if actual_url_type == HC.URL_TYPE_POST and actual_can_parse:
                            
                            self._AddPrimaryURLs( ( actual_fetched_url, ) )
                            
                            post_url = actual_fetched_url
                            
                            url_for_child_referral = post_url
                            
                            ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
                            
                        
                    
                    parsing_context = {}
                    
                    parsing_context[ 'post_url' ] = post_url
                    parsing_context[ 'url' ] = url_to_check
                    
                    all_parse_results = parser.Parse( parsing_context, parsing_text )
                    
                    if len( all_parse_results ) == 0:
                        
                        it_was_a_real_file = False
                        
                        ( os_file_handle, temp_path ) = HydrusTemp.GetTempPath()
                        
                        try:
                            
                            with open( temp_path, 'wb' ) as f:
                                
                                f.write( network_job.GetContentBytes() )
                                
                            
                            mime = HydrusFileHandling.GetMime( temp_path )
                            
                            if mime in HC.ALLOWED_MIMES:
                                
                                it_was_a_real_file = True
                                
                                status_hook( 'page was actually a file, trying to import' )
                                
                                self.Import( temp_path, file_import_options, status_hook = status_hook )
                                
                            
                        except:
                            
                            pass # in this special occasion, we will swallow the error
                            
                        finally:
                            
                            HydrusTemp.CleanUpTempPath( os_file_handle, temp_path )
                            
                        
                        if not it_was_a_real_file:
                            
                            raise HydrusExceptions.VetoException( 'The parser found nothing in the document, nor did it seem to be an importable file!' )
                            
                        
                    elif len( all_parse_results ) > 1:
                        
                        # multiple child urls generated by a subsidiary page parser
                        
                        file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
                        
                        for file_seed in file_seeds:
                            
                            file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                            file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                            
                            file_seed.AddPrimaryURLs( set( self._primary_urls ) )
                            
                            file_seed.AddSourceURLs( set( self._source_urls ) )
                            
                            file_seed.AddTags( set( self._tags ) )
                            
                        
                        try:
                            
                            my_index = file_seed_cache.GetFileSeedIndex( self )
                            
                            insertion_index = my_index + 1
                            
                        except:
                            
                            insertion_index = len( file_seed_cache )
                            
                        
                        num_urls_added = file_seed_cache.InsertFileSeeds( insertion_index, file_seeds )
                        
                        status = CC.STATUS_SUCCESSFUL_AND_CHILD_FILES
                        note = 'Found {} new URLs.'.format( HydrusData.ToHumanInt( num_urls_added ) )
                        
                        self.SetStatus( status, note = note )
                        
                    else:
                        
                        # no subsidiary page parser results, just one
                        
                        parse_results = all_parse_results[0]
                        
                        self.AddParseResults( parse_results, file_import_options )
                        
                        self.CheckPreFetchMetadata( tag_import_options )
                        
                        desired_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_DESIRED, ), only_get_top_priority = True )
                        
                        child_urls = []
                        
                        if len( desired_urls ) == 0:
                            
                            raise HydrusExceptions.VetoException( 'Could not find a file or post URL to download!' )
                            
                        elif len( desired_urls ) == 1:
                            
                            desired_url = desired_urls[0]
                            
                            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( desired_url )
                            
                            if url_type in ( HC.URL_TYPE_FILE, HC.URL_TYPE_UNKNOWN ):
                                
                                file_url = desired_url
                                
                                ( should_download_metadata, should_download_file ) = self.PredictPreImportStatus( file_import_options, tag_import_options, file_url )
                                
                                if should_download_file:
                                    
                                    self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook, override_bandwidth = True, forced_referral_url = url_for_child_referral, file_seed_cache = file_seed_cache )
                                    
                                
                            elif url_type == HC.URL_TYPE_POST and can_parse:
                                
                                # a pixiv mode=medium page has spawned a mode=manga page, so we need a new file_seed to go pursue that
                                
                                child_urls = [ desired_url ]
                                
                            else:
                                
                                if can_parse:
                                    
                                    raise HydrusExceptions.VetoException( 'Found a URL--{}--but could not understand it!'.format( desired_url ) )
                                    
                                else:
                                    
                                    raise HydrusExceptions.VetoException( 'Found a URL--{}--but could not parse it: {}'.format( desired_url, cannot_parse_reason ) )
                                    
                                
                            
                        else:
                            
                            child_urls = desired_urls
                            
                        
                        if len( child_urls ) > 0:
                            
                            child_file_seeds = []
                            
                            for child_url in child_urls:
                                
                                duplicate_file_seed = self.Duplicate() # inherits all urls and tags from here
                                
                                duplicate_file_seed.file_seed_data = child_url
                                
                                duplicate_file_seed.SetReferralURL( url_for_child_referral )
                                
                                if self._referral_url is not None:
                                    
                                    duplicate_file_seed.AddSourceURLs( ( self._referral_url, ) )
                                    
                                
                                child_file_seeds.append( duplicate_file_seed )
                                
                            
                            try:
                                
                                my_index = file_seed_cache.GetFileSeedIndex( self )
                                
                                insertion_index = my_index + 1
                                
                            except:
                                
                                insertion_index = len( file_seed_cache )
                                
                            
                            num_urls_added = file_seed_cache.InsertFileSeeds( insertion_index, child_file_seeds )
                            
                            status = CC.STATUS_SUCCESSFUL_AND_CHILD_FILES
                            note = 'Found {} new URLs.'.format( HydrusData.ToHumanInt( num_urls_added ) )
                            
                            self.SetStatus( status, note = note )
                            
                        
                    
                
            else:
                
                if should_download_file:
                    
                    self.CheckPreFetchMetadata( tag_import_options )
                    
                    did_substantial_work = True
                    
                    file_url = self.file_seed_data
                    
                    self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory, status_hook, file_seed_cache = file_seed_cache )
                    
                
            
            did_substantial_work |= self.WriteContentUpdates( file_import_options = file_import_options, tag_import_options = tag_import_options )
            
        except HydrusExceptions.ShutdownException:
            
            return False
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            
            note = str( e )
            
            self.SetStatus( status, note = note )
            
            if isinstance( e, HydrusExceptions.CancelledException ):
                
                status_hook( 'cancelled!' )
                
                time.sleep( 2 )
                
            
        except HydrusExceptions.InsufficientCredentialsException:
            
            status = CC.STATUS_VETOED
            note = '403'
            
            self.SetStatus( status, note = note )
            
            status_hook( '403' )
            
            time.sleep( 2 )
            
        except HydrusExceptions.NotFoundException:
            
            status = CC.STATUS_VETOED
            note = '404'
            
            self.SetStatus( status, note = note )
            
            status_hook( '404' )
            
            time.sleep( 2 )
            
        except HydrusExceptions.UnsupportedFileException as e:
            
            status = CC.STATUS_ERROR
            
            note = str( e )
            
            self.SetStatus( status, note = note )
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            self.SetStatus( status, exception = e )
            
            status_hook( 'error!' )
            
            time.sleep( 3 )
            
        finally:
            
            file_seed_cache.NotifyFileSeedsUpdated( ( self, ) )
            
        
        return did_substantial_work
        
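    # WorkOnURL is the main state machine for a URL seed. A hedged sketch of a
    # driver loop (illustrative assumptions; the real gallery/watcher importers
    # also manage bandwidth, pausing, and threading):
    #
    #   file_seed = file_seed_cache.GetNextFileSeed( CC.STATUS_UNKNOWN )
    #
    #   if file_seed is not None:
    #       file_seed.WorkOnURL( file_seed_cache, status_hook, network_job_factory,
    #           network_job_presentation_context_factory, file_import_options, tag_import_options )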
    def WriteContentUpdates( self, file_import_options: typing.Optional[ FileImportOptions.FileImportOptions ] = None, tag_import_options: typing.Optional[ TagImportOptions.TagImportOptions ] = None ):
        
        did_work = False
        
        if self.status == CC.STATUS_ERROR:
            
            return did_work
            
        
        hash = self.GetHash()
        
        if hash is None:
            
            return did_work
            
        
        # changed this to say that urls alone are not 'did work' since all url results are doing this, and when they have no tags, they are usually superfast db hits anyway
        # better to scream through an 'already in db' import list than flicker
        
        service_keys_to_content_updates = collections.defaultdict( list )
        
        potentially_associable_urls = set()
        
        if file_import_options is not None:
            
            if file_import_options.ShouldAssociatePrimaryURLs():
                
                potentially_associable_urls.update( self._primary_urls )
                
                if self.file_seed_type == FILE_SEED_TYPE_URL:
                    
                    potentially_associable_urls.add( self.file_seed_data )
                    
                    domain = ClientNetworkingFunctions.ConvertURLIntoDomain( self.file_seed_data )
                    
                    if self.source_time is None:
                        
                        domain_modified_timestamp = self.created
                        
                    else:
                        
                        domain_modified_timestamp = self.source_time
                        
                    
                    content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_TIMESTAMP, HC.CONTENT_UPDATE_ADD, ( 'domain', hash, ( domain, domain_modified_timestamp ) ) )
                    
                    service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
                    
                    if self._cloudflare_last_modified_time is not None:
                        
                        content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_TIMESTAMP, HC.CONTENT_UPDATE_ADD, ( 'domain', hash, ( 'cloudflare.com', self._cloudflare_last_modified_time ) ) )
                        
                        service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
                        
                    
                
                if self._referral_url is not None:
                    
                    potentially_associable_urls.add( self._referral_url )
                    
                
            
            if file_import_options.ShouldAssociateSourceURLs():
                
                potentially_associable_urls.update( self._source_urls )
                
            
        
        associable_urls = ClientNetworkingFunctions.NormaliseAndFilterAssociableURLs( potentially_associable_urls )
        
        if len( associable_urls ) > 0:
            
            content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( associable_urls, ( hash, ) ) )
            
            service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
            
        
        if tag_import_options is None:
            
            for ( service_key, content_updates ) in ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( ( hash, ), self._external_additional_service_keys_to_tags ).items():
                
                service_keys_to_content_updates[ service_key ].extend( content_updates )
                
                did_work = True
                
            
        else:
            
            media_result = HG.client_controller.Read( 'media_result', hash )
            
            for ( service_key, content_updates ) in tag_import_options.GetServiceKeysToContentUpdates( self.status, media_result, set( self._tags ), external_filterable_tags = self._external_filterable_tags, external_additional_service_keys_to_tags = self._external_additional_service_keys_to_tags ).items():
                
                service_keys_to_content_updates[ service_key ].extend( content_updates )
                
                did_work = True
                
            
        
        if len( service_keys_to_content_updates ) > 0:
            
            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
            
        
        return did_work
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED ] = FileSeed

class FileSeedCacheStatus( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE_STATUS
    SERIALISABLE_NAME = 'Import File Status Cache Status'
    SERIALISABLE_VERSION = 1
    
    def __init__( self ):
        
        self._generation_time = HydrusData.GetNow()
        self._statuses_to_counts = collections.Counter()
        self._latest_added_time = 0
        
    
    def _GetSerialisableInfo( self ):
        
        serialisable_statuses_to_counts = list( self._statuses_to_counts.items() )
        
        return ( self._generation_time, serialisable_statuses_to_counts, self._latest_added_time )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._generation_time, serialisable_statuses_to_counts, self._latest_added_time ) = serialisable_info
        
        self._statuses_to_counts = collections.Counter()
        
        self._statuses_to_counts.update( dict( serialisable_statuses_to_counts ) )
        
    
    def GetFileSeedCount( self, status: typing.Optional[ int ] = None ) -> int:
        
        if status is None:
            
            return sum( self._statuses_to_counts.values() )
            
        else:
            
            return self._statuses_to_counts[ status ]
            
        
    
    def GetGenerationTime( self ) -> int:
        
        return self._generation_time
        
    
    def GetLatestAddedTime( self ) -> int:
        
        return self._latest_added_time
        
    def GetStatusText( self, simple = False ) -> str:
        
        num_successful_and_new = self._statuses_to_counts[ CC.STATUS_SUCCESSFUL_AND_NEW ]
        num_successful_but_redundant = self._statuses_to_counts[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ]
        num_ignored = self._statuses_to_counts[ CC.STATUS_VETOED ]
        num_deleted = self._statuses_to_counts[ CC.STATUS_DELETED ]
        num_failed = self._statuses_to_counts[ CC.STATUS_ERROR ]
        num_skipped = self._statuses_to_counts[ CC.STATUS_SKIPPED ]
        num_unknown = self._statuses_to_counts[ CC.STATUS_UNKNOWN ]
        
        if simple:
            
            total = sum( self._statuses_to_counts.values() )
            
            total_processed = total - num_unknown
            
            #
            
            status_text = ''
            
            if total > 0:
                
                if num_unknown > 0:
                    
                    status_text += HydrusData.ConvertValueRangeToPrettyString( total_processed, total )
                    
                else:
                    
                    status_text += HydrusData.ToHumanInt( total_processed )
                    
                
                show_new_on_file_seed_short_summary = HG.client_controller.new_options.GetBoolean( 'show_new_on_file_seed_short_summary' )
                
                if show_new_on_file_seed_short_summary and num_successful_and_new:
                    
                    status_text += ' - {}N'.format( HydrusData.ToHumanInt( num_successful_and_new ) )
                    
                
                simple_status_strings = []
                
                if num_ignored > 0:
                    
                    simple_status_strings.append( '{}Ig'.format( HydrusData.ToHumanInt( num_ignored ) ) )
                    
                
                show_deleted_on_file_seed_short_summary = HG.client_controller.new_options.GetBoolean( 'show_deleted_on_file_seed_short_summary' )
                
                if show_deleted_on_file_seed_short_summary and num_deleted > 0:
                    
                    simple_status_strings.append( '{}D'.format( HydrusData.ToHumanInt( num_deleted ) ) )
                    
                
                if num_failed > 0:
                    
                    simple_status_strings.append( '{}F'.format( HydrusData.ToHumanInt( num_failed ) ) )
                    
                
                if num_skipped > 0:
                    
                    simple_status_strings.append( '{}S'.format( HydrusData.ToHumanInt( num_skipped ) ) )
                    
                
                if len( simple_status_strings ) > 0:
                    
                    status_text += ' - {}'.format( ''.join( simple_status_strings ) )
                    
                
            
        else:
            
            status_strings = []
            
            num_successful = num_successful_and_new + num_successful_but_redundant
            
            if num_successful > 0:
                
                s = '{} successful'.format( HydrusData.ToHumanInt( num_successful ) )
                
                if num_successful_and_new > 0:
                    
                    if num_successful_but_redundant > 0:
                        
                        s += ' ({} already in db)'.format( HydrusData.ToHumanInt( num_successful_but_redundant ) )
                        
                    
                else:
                    
                    s += ' (all already in db)'
                    
                
                status_strings.append( s )
                
            
            if num_ignored > 0:
                
                status_strings.append( '{} ignored'.format( HydrusData.ToHumanInt( num_ignored ) ) )
                
            
            if num_deleted > 0:
                
                status_strings.append( '{} previously deleted'.format( HydrusData.ToHumanInt( num_deleted ) ) )
                
            
            if num_failed > 0:
                
                status_strings.append( '{} failed'.format( HydrusData.ToHumanInt( num_failed ) ) )
                
            
            if num_skipped > 0:
                
                status_strings.append( '{} skipped'.format( HydrusData.ToHumanInt( num_skipped ) ) )
                
            
            status_text = ', '.join( status_strings )
            
        
        return status_text
        
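    # Hedged examples of the two formats above, for a run of 100 seeds: 10 still
    # unknown, 5 new, 80 redundant, 3 ignored, 2 failed (counts invented, and the
    # short-summary options assumed on):
    #
    #   simple = True  ->  '90/100 - 5N - 3Ig2F'
    #   simple = False ->  '85 successful (80 already in db), 3 ignored, 2 failed'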
    def GetStatusesToCounts( self ) -> typing.Mapping[ int, int ]:
        
        return self._statuses_to_counts
        
    
    def GetValueRange( self ) -> typing.Tuple[ int, int ]:
        
        total = sum( self._statuses_to_counts.values() )
        
        num_unknown = self._statuses_to_counts[ CC.STATUS_UNKNOWN ]
        
        total_processed = total - num_unknown
        
        return ( total_processed, total )
        
    
    def HasWorkToDo( self ):
        
        ( num_done, num_total ) = self.GetValueRange()
        
        return num_done < num_total
        
    
    def Merge( self, file_seed_cache_status: "FileSeedCacheStatus" ):
        
        self._latest_added_time = max( self._latest_added_time, file_seed_cache_status.GetLatestAddedTime() )
        self._statuses_to_counts.update( file_seed_cache_status.GetStatusesToCounts() )
        
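    # Merge relies on collections.Counter semantics: update() with another
    # mapping sums per-status counts rather than overwriting them. For example:
    #
    #   a = collections.Counter( { CC.STATUS_UNKNOWN : 3 } )
    #   a.update( { CC.STATUS_UNKNOWN : 2 } )
    #   a[ CC.STATUS_UNKNOWN ]  # -> 5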
    def SetStatusesToCounts( self, statuses_to_counts: typing.Mapping[ int, int ] ):
        
        self._statuses_to_counts = collections.Counter()
        
        self._statuses_to_counts.update( statuses_to_counts )
        
    
    def SetLatestAddedTime( self, latest_added_time: int ):
        
        self._latest_added_time = latest_added_time
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE_STATUS ] = FileSeedCacheStatus

class FileSeedCache( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE
    SERIALISABLE_NAME = 'Import File Status Cache'
    SERIALISABLE_VERSION = 8
    
    COMPACT_NUMBER = 250
    
    def __init__( self ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._file_seeds = HydrusSerialisable.SerialisableList()
        
        self._file_seeds_to_indices = {}
        
        self._statuses_to_indexed_file_seeds = collections.defaultdict( list )
        
        self._file_seed_cache_key = HydrusData.GenerateKey()
        
        self._status_cache = FileSeedCacheStatus()
        
        self._status_dirty = True
        self._statuses_to_indexed_file_seeds_dirty = True
        
        self._lock = threading.Lock()
        
    
    def __len__( self ):
        
        return len( self._file_seeds )
        
    
    def _FixFileSeedsStatusPosition( self, file_seeds ):
        
        indices_and_file_seeds_affected = []
        
        for file_seed in file_seeds:
            
            if file_seed in self._file_seeds_to_indices:
                
                indices_and_file_seeds_affected.append( ( self._file_seeds_to_indices[ file_seed ], file_seed ) )
                
            else:
                
                self._SetStatusesToFileSeedsDirty()
                
                return
                
            
        
        for row in indices_and_file_seeds_affected:
            
            correct_status = row[1].status
            
            if row in self._statuses_to_indexed_file_seeds[ correct_status ]:
                
                continue
                
            
            for ( status, indices_and_file_seeds ) in self._statuses_to_indexed_file_seeds.items():
                
                if status == correct_status:
                    
                    continue
                    
                
                if row in indices_and_file_seeds:
                    
                    indices_and_file_seeds.remove( row )
                    
                    bisect.insort( self._statuses_to_indexed_file_seeds[ correct_status ], row )
                    
                    break
                    
                
            
        
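    # The per-status lists hold ( index, file_seed ) rows sorted by index, so
    # bisect.insort keeps queue order intact when a row moves between statuses.
    # A small illustration (indices invented):
    #
    #   rows = [ ( 0, seed_a ), ( 5, seed_c ) ]
    #   bisect.insort( rows, ( 2, seed_b ) )
    #   # rows -> [ ( 0, seed_a ), ( 2, seed_b ), ( 5, seed_c ) ]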
    def _GenerateStatus( self ):
        
        fscs = FileSeedCacheStatus()
        
        fscs.SetLatestAddedTime( self._GetLatestAddedTime() )
        fscs.SetStatusesToCounts( self._GetStatusesToCounts() )
        
        self._status_cache = fscs
        
        self._status_dirty = False
        
    
    def _GetFileSeeds( self, status: int = None ):
        
        if status is None:
            
            return list( self._file_seeds )
            
        else:
            
            if self._statuses_to_indexed_file_seeds_dirty:
                
                self._RegenerateStatusesToFileSeeds()
                
            
            return [ file_seed for ( index, file_seed ) in self._statuses_to_indexed_file_seeds[ status ] ]
            
        
    
    def _GetLatestAddedTime( self ):
        
        if len( self._file_seeds ) == 0:
            
            latest_timestamp = 0
            
        else:
            
            latest_timestamp = max( ( file_seed.created for file_seed in self._file_seeds ) )
            
        
        return latest_timestamp
        
    
    def _GetMyFileSeed( self, file_seed: FileSeed ) -> typing.Optional[ FileSeed ]:
        
        search_file_seeds = file_seed.GetSearchFileSeeds()
        
        for f_s in self._file_seeds:
            
            if f_s in search_file_seeds:
                
                return f_s
                
            
        
        return None
        
    
    def _GetNextFileSeed( self, status: int ) -> typing.Optional[ FileSeed ]:
        
        # the problem with this is if a file seed recently changed but 'notifyupdated' hasn't had a chance to go yet
        # there could be a FS in a list other than the one we are looking at that has the status we want
        # _however_, it seems like I do not do any async calls to notifyupdated in the actual FSC, only from notifyupdated to GUI elements, so we _seem_ to be good
        
        if self._statuses_to_indexed_file_seeds_dirty:
            
            self._RegenerateStatusesToFileSeeds()
            
        
        indexed_file_seeds = self._statuses_to_indexed_file_seeds[ status ]
        
        while len( indexed_file_seeds ) > 0:
            
            row = indexed_file_seeds[ 0 ]
            
            file_seed = row[1]
            
            if file_seed.status == status:
                
                return file_seed
                
            else:
                
                self._FixFileSeedsStatusPosition( ( file_seed, ) )
                
            
            indexed_file_seeds = self._statuses_to_indexed_file_seeds[ status ]
            
        
        return None
        
    
    def _GetSerialisableInfo( self ):
        
        return self._file_seeds.GetSerialisableTuple()
        
    
    def _GetSourceTimestampForVelocityCalculations( self, file_seed: FileSeed ):
        
        source_timestamp = file_seed.source_time
        
        if source_timestamp is None:
            
            # decent fallback compromise
            # -30 since added and 'last check' timestamps are often the same, and this messes up calculations
            
            source_timestamp = file_seed.created - 30
            
        
        return source_timestamp
        
    
    def _GetStatusesToCounts( self ):
        
        statuses_to_counts = collections.Counter()
        
        if self._statuses_to_indexed_file_seeds_dirty:
            
            self._RegenerateStatusesToFileSeeds()
            
        
        for ( status, indexed_file_seeds ) in self._statuses_to_indexed_file_seeds.items():
            
            count = len( indexed_file_seeds )
            
            if count > 0:
                
                statuses_to_counts[ status ] = count
                
            
        
        return statuses_to_counts
        
    
    def _HasFileSeed( self, file_seed: FileSeed ):
        
        search_file_seeds = file_seed.GetSearchFileSeeds()
        
        has_file_seed = True in ( search_file_seed in self._file_seeds_to_indices for search_file_seed in search_file_seeds )
        
        return has_file_seed
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        with self._lock:
            
            self._file_seeds = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )
            
            self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
            
            self._SetStatusesToFileSeedsDirty()
            
        
    
    def _RegenerateStatusesToFileSeeds( self ):
        
        self._statuses_to_indexed_file_seeds = collections.defaultdict( list )
        
        for ( file_seed, index ) in self._file_seeds_to_indices.items():
            
            self._statuses_to_indexed_file_seeds[ file_seed.status ].append( ( index, file_seed ) )
            
        
        for indexed_file_seeds in self._statuses_to_indexed_file_seeds.values():
            
            indexed_file_seeds.sort()
            
        
        self._statuses_to_indexed_file_seeds_dirty = False
        
    
    def _SetStatusesToFileSeedsDirty( self ):
        
        self._statuses_to_indexed_file_seeds_dirty = True
        
    
    def _SetStatusDirty( self ):
        
        self._status_dirty = True
        
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                if 'note' in file_seed_info:
                    
                    file_seed_info[ 'note' ] = str( file_seed_info[ 'note' ] )
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 2, new_serialisable_info )
            
        
        if version in ( 2, 3 ):
            
            # gelbooru replaced their thumbnail links with this redirect spam:
            # 'https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY4ODA1OA=='
            
            # I missed some http ones here, so I have broadened the test and rescheduled it
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                if 'gelbooru.com/redirect.php' in file_seed:
                    
                    continue
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 4, new_serialisable_info )
            
        
        if version == 4:
            
            def ConvertRegularToRawURL( regular_url ):
                
                # convert this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
                # to this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # the 500 part can be a bunch of stuff, including letters
                
                url_components = regular_url.split( '_' )
                
                last_component = url_components[ -1 ]
                
                ( number_gubbins, file_ext ) = last_component.split( '.' )
                
                raw_last_component = 'raw.{}'.format( file_ext )
                
                url_components[ -1 ] = raw_last_component
                
                raw_url = '_'.join( url_components )
                
                return raw_url
                
            
            def Remove68Subdomain( long_url ):
                
                # sometimes the 68 subdomain gives a 404 on the raw url, so:
                
                # convert this:
                # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                # to this:
                # http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
                
                # I am not sure if it is always 68, so let's not assume
                
                ( scheme, rest ) = long_url.split( '://', 1 )
                
                if rest.startswith( 'media.tumblr.com' ):
                    
                    return long_url
                    
                
                ( gumpf, shorter_rest ) = rest.split( '.', 1 )
                
                shorter_url = '{}://{}'.format( scheme, shorter_rest )
                
                return shorter_url
                
            
            new_serialisable_info = []
            
            good_file_seeds = set()
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                try:
                    
                    parse = urllib.parse.urlparse( file_seed )
                    
                    if 'media.tumblr.com' in parse.netloc:
                        
                        file_seed = Remove68Subdomain( file_seed )
                        
                        file_seed = ConvertRegularToRawURL( file_seed )
                        
                        file_seed = ClientNetworkingFunctions.ConvertHTTPToHTTPS( file_seed )
                        
                    
                    if 'pixiv.net' in parse.netloc:
                        
                        file_seed = ClientNetworkingFunctions.ConvertHTTPToHTTPS( file_seed )
                        
                    
                    if file_seed in good_file_seeds: # we hit a dupe, so skip it
                        
                        continue
                        
                    
                except:
                    
                    pass
                    
                
                good_file_seeds.add( file_seed )
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 5, new_serialisable_info )
            
        
        if version == 5:
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                file_seed_info[ 'source_timestamp' ] = None
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 6, new_serialisable_info )
            
        
        if version == 6:
            
            new_serialisable_info = []
            
            for ( file_seed, file_seed_info ) in old_serialisable_info:
                
                try:
                    
                    magic_phrase = '//media.tumblr.com'
                    replacement = '//data.tumblr.com'
                    
                    if magic_phrase in file_seed:
                        
                        file_seed = file_seed.replace( magic_phrase, replacement )
                        
                    
                except:
                    
                    pass
                    
                
                new_serialisable_info.append( ( file_seed, file_seed_info ) )
                
            
            return ( 7, new_serialisable_info )
            
        
        if version == 7:
            
            file_seeds = HydrusSerialisable.SerialisableList()
            
            for ( file_seed_text, file_seed_info ) in old_serialisable_info:
                
                if file_seed_text.startswith( 'http' ):
                    
                    file_seed_type = FILE_SEED_TYPE_URL
                    
                else:
                    
                    file_seed_type = FILE_SEED_TYPE_HDD
                    
                
                file_seed = FileSeed( file_seed_type, file_seed_text )
                
                file_seed.status = file_seed_info[ 'status' ]
                file_seed.created = file_seed_info[ 'added_timestamp' ]
                file_seed.modified = file_seed_info[ 'last_modified_timestamp' ]
                file_seed.source_time = file_seed_info[ 'source_timestamp' ]
                file_seed.note = file_seed_info[ 'note' ]
                
                file_seeds.append( file_seed )
                
            
            new_serialisable_info = file_seeds.GetSerialisableTuple()
            
            return ( 8, new_serialisable_info )
            
        
    
    def AddFileSeeds( self, file_seeds: typing.Collection[ FileSeed ], dupe_try_again = True ):
        
        if len( file_seeds ) == 0:
            
            return 0
            
        
        updated_or_new_file_seeds = []
        
        with self._lock:
            
            for file_seed in file_seeds:
                
                if self._HasFileSeed( file_seed ):
                    
                    if dupe_try_again:
                        
                        f_s = self._GetMyFileSeed( file_seed )
                        
                        if f_s is not None:
                            
                            if f_s.status == CC.STATUS_ERROR:
                                
                                f_s.SetStatus( CC.STATUS_UNKNOWN )
                                
                                updated_or_new_file_seeds.append( f_s )
                                
                            
                        
                    
                    continue
                    
                
                try:
                    
                    file_seed.Normalise()
                    
                except HydrusExceptions.URLClassException:
                    
                    # this is a borked 'https://' url that makes no sense
                    continue
                    
                
                updated_or_new_file_seeds.append( file_seed )
                
                self._file_seeds.append( file_seed )
                
                index = len( self._file_seeds ) - 1
                
                self._file_seeds_to_indices[ file_seed ] = index
                
                if not self._statuses_to_indexed_file_seeds_dirty:
                    
                    self._statuses_to_indexed_file_seeds[ file_seed.status ].append( ( index, file_seed ) )
                    
                
            
            self._SetStatusDirty()
            
        
        # NotifyFileSeedsUpdated takes the lock itself, so call it outside the 'with' block
        self.NotifyFileSeedsUpdated( updated_or_new_file_seeds )
        
        return len( updated_or_new_file_seeds )
        
    
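    # Typical call pattern, as a sketch ('urls' is hypothetical input from a parser or a
    # drag-and-drop event):
    #
    #     file_seeds = [ FileSeed( FILE_SEED_TYPE_URL, url ) for url in urls ]
    #     
    #     num_added = file_seed_cache.AddFileSeeds( file_seeds )
    #
    # Dupes already in the cache are skipped, except that with dupe_try_again = True an
    # existing errored copy is reset to CC.STATUS_UNKNOWN so it will be attempted again.
    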
    def AdvanceFileSeed( self, file_seed: FileSeed ):
        
        with self._lock:
            
            if file_seed in self._file_seeds_to_indices:
                
                index = self._file_seeds_to_indices[ file_seed ]
                
                if index > 0:
                    
                    self._file_seeds.remove( file_seed )
                    
                    self._file_seeds.insert( index - 1, file_seed )
                    
                
                self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
                
                self._SetStatusesToFileSeedsDirty()
                
            
        
        self.NotifyFileSeedsUpdated( ( file_seed, ) )
        
    
    def CanCompact( self, compact_before_this_source_time: int ):
        
        with self._lock:
            
            if len( self._file_seeds ) <= self.COMPACT_NUMBER:
                
                return False
                
            
            for file_seed in self._file_seeds[:-self.COMPACT_NUMBER]:
                
                if file_seed.status == CC.STATUS_UNKNOWN:
                    
                    continue
                    
                
                if self._GetSourceTimestampForVelocityCalculations( file_seed ) < compact_before_this_source_time:
                    
                    return True
                    
                
            
        
        return False
        
    
    def Compact( self, compact_before_this_source_time: int ):
        
        with self._lock:
            
            if len( self._file_seeds ) <= self.COMPACT_NUMBER:
                
                return
                
            
            new_file_seeds = HydrusSerialisable.SerialisableList()
            
            for file_seed in self._file_seeds[:-self.COMPACT_NUMBER]:
                
                still_to_do = file_seed.status == CC.STATUS_UNKNOWN
                still_relevant = self._GetSourceTimestampForVelocityCalculations( file_seed ) > compact_before_this_source_time
                
                if still_to_do or still_relevant:
                    
                    new_file_seeds.append( file_seed )
                    
                
            
            new_file_seeds.extend( self._file_seeds[-self.COMPACT_NUMBER:] )
            
            self._file_seeds = new_file_seeds
            self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
            
            self._SetStatusesToFileSeedsDirty()
            
            self._SetStatusDirty()
            
        
    
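    # Compaction sketch, assuming COMPACT_NUMBER is the class constant giving how many
    # trailing seeds are always preserved. A subscription wanting to forget year-old,
    # finished entries might do:
    #
    #     compact_before_this_source_time = HydrusData.GetNow() - ( 365 * 86400 )
    #     
    #     if file_seed_cache.CanCompact( compact_before_this_source_time ):
    #         
    #         file_seed_cache.Compact( compact_before_this_source_time )
    #
    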
    def DelayFileSeed( self, file_seed: FileSeed ):
        
        with self._lock:
            
            if file_seed in self._file_seeds_to_indices:
                
                index = self._file_seeds_to_indices[ file_seed ]
                
                if index < len( self._file_seeds ) - 1:
                    
                    self._file_seeds.remove( file_seed )
                    
                    self._file_seeds.insert( index + 1, file_seed )
                    
                
                self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
                
                self._SetStatusesToFileSeedsDirty()
                
            
        
        self.NotifyFileSeedsUpdated( ( file_seed, ) )
        
    
    def GetAPIInfoDict( self, simple: bool ):
        
        with self._lock:
            
            d = {}
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            d[ 'status' ] = self._status_cache.GetStatusText()
            d[ 'simple_status' ] = self._status_cache.GetStatusText( simple = True )
            
            ( num_done, num_total ) = self._status_cache.GetValueRange()
            
            d[ 'total_processed' ] = num_done
            d[ 'total_to_process' ] = num_total
            
            if not simple:
                
                d[ 'import_items' ] = [ file_seed.GetAPIInfoDict( simple ) for file_seed in self._file_seeds ]
                
            
            return d
            
        
    
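    # The returned dict looks something like this (the status strings here are
    # illustrative guesses at GetStatusText output, not verbatim):
    #
    #     {
    #         'status' : '3,605 successful, 2 failed',
    #         'simple_status' : '3,607',
    #         'total_processed' : 3607,
    #         'total_to_process' : 3700
    #     }
    #
    # With simple = False, an 'import_items' key is added, holding one per-seed info dict
    # per file seed in the cache.
    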
    def GetApproxNumMasterFileSeeds( self ):
        
        # take the lock, as the other getters do, since we iterate over the seed list
        with self._lock:
            
            return len( [ file_seed for file_seed in self._file_seeds if file_seed.IsProbablyMasterPostURL() ] )
            
        
    
    def GetEarliestSourceTime( self ):
        
        with self._lock:
            
            if len( self._file_seeds ) == 0:
                
                return None
                
            
            earliest_timestamp = min( ( self._GetSourceTimestampForVelocityCalculations( file_seed ) for file_seed in self._file_seeds ) )
            
        
        return earliest_timestamp
        
    
    def GetExampleFileSeed( self ):
        
        with self._lock:
            
            if len( self._file_seeds ) == 0:
                
                return None
                
            else:
                
                good_file_seeds = [ file_seed for file_seed in self._file_seeds[-30:] if file_seed.status in CC.SUCCESSFUL_IMPORT_STATES ]
                
                if len( good_file_seeds ) > 0:
                    
                    example_seed = random.choice( good_file_seeds )
                    
                else:
                    
                    example_seed = self._GetNextFileSeed( CC.STATUS_UNKNOWN )
                    
                
                if example_seed is None:
                    
                    example_seed = random.choice( self._file_seeds[-10:] )
                    
                
                if example_seed.file_seed_type == FILE_SEED_TYPE_HDD:
                    
                    return None
                    
                else:
                    
                    return example_seed
                    
                
            
        
    
    def GetFileSeedCacheKey( self ):
        
        return self._file_seed_cache_key
        
    
    def GetFileSeedCount( self, status: int = None ):
        
        with self._lock:
            
            if status is None:
                
                return len( self._file_seeds )
                
            else:
                
                if self._statuses_to_indexed_file_seeds_dirty:
                    
                    self._RegenerateStatusesToFileSeeds()
                    
                
                return len( self._statuses_to_indexed_file_seeds[ status ] )
                
            
        
    
    def GetFileSeeds( self, status: int = None ):
        
        with self._lock:
            
            return self._GetFileSeeds( status )
            
        
    
    def GetFileSeedIndex( self, file_seed: FileSeed ):
        
        with self._lock:
            
            return self._file_seeds_to_indices[ file_seed ]
            
        
    
    def GetHashes( self ):
        
        with self._lock:
            
            hashes = [ file_seed.GetHash() for file_seed in self._file_seeds if file_seed.HasHash() ]
            
        
        return hashes
        
    
    def GetLatestSourceTime( self ):
        
        with self._lock:
            
            if len( self._file_seeds ) == 0:
                
                return 0
                
            
            latest_timestamp = max( ( self._GetSourceTimestampForVelocityCalculations( file_seed ) for file_seed in self._file_seeds ) )
            
        
        return latest_timestamp
        
    
    def GetNextFileSeed( self, status: int ) -> typing.Optional[ FileSeed ]:
        
        with self._lock:
            
            return self._GetNextFileSeed( status )
            
        
    
    def GetNumNewFilesSince( self, since: int ):
        
        num_files = 0
        
        with self._lock:
            
            for file_seed in self._file_seeds:
                
                source_timestamp = self._GetSourceTimestampForVelocityCalculations( file_seed )
                
                if source_timestamp >= since:
                    
                    num_files += 1
                    
                
            
        
        return num_files
        
    
    def GetPresentedHashes( self, presentation_import_options: PresentationImportOptions.PresentationImportOptions ):
        
        with self._lock:
            
            hashes_and_statuses = [ ( file_seed.GetHash(), file_seed.status ) for file_seed in self._file_seeds if file_seed.HasHash() ]
            
        
        return presentation_import_options.GetPresentedHashes( hashes_and_statuses )
        
    
    def GetStatus( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            return self._status_cache
            
        
    
    def GetValueRange( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            return self._status_cache.GetValueRange()
            
        
    
    def HasFileSeed( self, file_seed: FileSeed ):
        
        with self._lock:
            
            return self._HasFileSeed( file_seed )
            
        
    
    def InsertFileSeeds( self, index: int, file_seeds: typing.Collection[ FileSeed ] ):
        
        if len( file_seeds ) == 0:
            
            return 0
            
        
        new_file_seeds = set()
        
        with self._lock:
            
            index = min( index, len( self._file_seeds ) )
            
            for file_seed in file_seeds:
                
                if self._HasFileSeed( file_seed ) or file_seed in new_file_seeds:
                    
                    continue
                    
                
                file_seed.Normalise()
                
                new_file_seeds.add( file_seed )
                
                self._file_seeds.insert( index, file_seed )
                
                index += 1
                
            
            self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
            
            self._SetStatusesToFileSeedsDirty()
            
            self._SetStatusDirty()
            
        
        # NotifyFileSeedsUpdated takes the lock itself, so call it outside the 'with' block
        self.NotifyFileSeedsUpdated( new_file_seeds )
        
        return len( new_file_seeds )
        
    
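    # Note the contrast with AddFileSeeds: InsertFileSeeds places new seeds at a specific
    # index (e.g. right after the seed that spawned them), and it does not catch
    # URLClassException from Normalise, so callers should pass pre-vetted seeds. A sketch:
    #
    #     index = file_seed_cache.GetFileSeedIndex( parent_file_seed ) + 1
    #     
    #     file_seed_cache.InsertFileSeeds( index, child_file_seeds )
    #
    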
    def NotifyFileSeedsUpdated( self, file_seeds: typing.Collection[ FileSeed ] ):
        
        with self._lock:
            
            if not self._statuses_to_indexed_file_seeds_dirty:
                
                self._FixFileSeedsStatusPosition( file_seeds )
                
            
            #
            
            self._SetStatusDirty()
            
        
        HG.client_controller.pub( 'file_seed_cache_file_seeds_updated', self._file_seed_cache_key, file_seeds )
        
    
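    # GUI elements listen for these updates over the client pubsub. A hedged sketch of the
    # subscriber side (the sub() signature is assumed from the pub() call above, not
    # confirmed here):
    #
    #     HG.client_controller.sub( self, 'NotifyFileSeedsUpdated', 'file_seed_cache_file_seeds_updated' )
    #
    # The subscriber then compares the broadcast file_seed_cache_key against its own
    # cache's GetFileSeedCacheKey() to see if the update is for it.
    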
    def RemoveFileSeeds( self, file_seeds: typing.Iterable[ FileSeed ] ):
        
        with self._lock:
            
            file_seeds_to_delete = set( file_seeds )
            
            self._file_seeds = HydrusSerialisable.SerialisableList( [ file_seed for file_seed in self._file_seeds if file_seed not in file_seeds_to_delete ] )
            
            self._file_seeds_to_indices = { file_seed : index for ( index, file_seed ) in enumerate( self._file_seeds ) }
            
            self._SetStatusesToFileSeedsDirty()
            
            self._SetStatusDirty()
            
        
        self.NotifyFileSeedsUpdated( file_seeds_to_delete )
        
    
    def RemoveFileSeedsByStatus( self, statuses_to_remove: typing.Collection[ int ] ):
        
        with self._lock:
            
            file_seeds_to_delete = [ file_seed for file_seed in self._file_seeds if file_seed.status in statuses_to_remove ]
            
        
        # RemoveFileSeeds takes the lock itself, so release it first
        self.RemoveFileSeeds( file_seeds_to_delete )
        
    
    def RemoveAllButUnknownFileSeeds( self ):
        
        with self._lock:
            
            file_seeds_to_delete = [ file_seed for file_seed in self._file_seeds if file_seed.status != CC.STATUS_UNKNOWN ]
            
        
        # RemoveFileSeeds takes the lock itself, so release it first
        self.RemoveFileSeeds( file_seeds_to_delete )
        
    
    def RetryFailed( self ):
        
        with self._lock:
            
            failed_file_seeds = self._GetFileSeeds( CC.STATUS_ERROR )
            
            for file_seed in failed_file_seeds:
                
                file_seed.SetStatus( CC.STATUS_UNKNOWN )
                
            
        
        self.NotifyFileSeedsUpdated( failed_file_seeds )
        
    
    def RetryIgnored( self, ignored_regex = None ):
        
        with self._lock:
            
            ignored_file_seeds = self._GetFileSeeds( CC.STATUS_VETOED )
            
            for file_seed in ignored_file_seeds:
                
                if ignored_regex is not None:
                    
                    if re.search( ignored_regex, file_seed.note ) is None:
                        
                        continue
                        
                    
                
                file_seed.SetStatus( CC.STATUS_UNKNOWN )
                
            
        
        self.NotifyFileSeedsUpdated( ignored_file_seeds )
        
    
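    # The optional ignored_regex narrows which vetoed seeds get retried by matching against
    # their notes, e.g. (the note text here is illustrative):
    #
    #     file_seed_cache.RetryIgnored( ignored_regex = '.*blacklisted.*' )
    #
    # would reset only those vetoed seeds whose veto note mentions 'blacklisted'.
    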
    def WorkToDo( self ):
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            return self._status_cache.HasWorkToDo()
            
        
    

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_FILE_SEED_CACHE ] = FileSeedCache

def GenerateFileSeedCachesStatus( file_seed_caches: typing.Iterable[ FileSeedCache ] ):
    
    fscs = FileSeedCacheStatus()
    
    for file_seed_cache in file_seed_caches:
        
        fscs.Merge( file_seed_cache.GetStatus() )
        
    
    return fscs
    
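# A sketch of aggregating several caches' statuses, e.g. for a multi-downloader summary
# ('watchers' is a hypothetical collection of objects exposing GetFileSeedCache()):
#
#     file_seed_caches = [ watcher.GetFileSeedCache() for watcher in watchers ]
#     
#     fscs = GenerateFileSeedCachesStatus( file_seed_caches )
#     
#     ( num_done, num_total ) = fscs.GetValueRange()
#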