hydrus/hydrus/client/importing/ClientImportGallerySeeds.py

1217 lines
41 KiB
Python
Raw Normal View History

2020-05-20 21:36:02 +00:00
import collections
import itertools
import os
2020-06-11 12:01:08 +00:00
import random
2020-05-20 21:36:02 +00:00
import threading
import time
import traceback
2020-06-11 12:01:08 +00:00
import typing
2020-05-20 21:36:02 +00:00
2020-04-22 21:00:35 +00:00
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.core import HydrusExceptions
from hydrus.core import HydrusGlobals as HG
from hydrus.core import HydrusSerialisable
2018-06-27 19:27:05 +00:00
2020-07-29 20:52:44 +00:00
from hydrus.client import ClientConstants as CC
from hydrus.client import ClientParsing
from hydrus.client.importing import ClientImporting
from hydrus.client.metadata import ClientTags
2020-07-29 20:52:44 +00:00
2018-06-27 19:27:05 +00:00
def GenerateGallerySeedLogStatus( statuses_to_counts ):
    """Build a short human-readable summary and progress pair from per-status counts.

    :param statuses_to_counts: mapping of CC.STATUS_* value -> number of gallery seeds in that state.
        It is indexed directly for every status below, so it should default missing keys
        (this file passes a collections.Counter).
    :return: ( status_text, ( total_processed, total ) ). total is the sum of every count in the
        mapping and total_processed is total minus the STATUS_UNKNOWN count.
    """
    
    # add some kind of '(512 files found (so far))', which may be asking too much here
    # might be this is complicated and needs to be (partly) done in the object, which will know if it is paused or whatever.
    
    num_unknown = statuses_to_counts[ CC.STATUS_UNKNOWN ]
    
    # order here is the order the phrases appear in the final text
    counts_and_labels = (
        ( statuses_to_counts[ CC.STATUS_SUCCESSFUL_AND_NEW ], ' successful' ),
        ( statuses_to_counts[ CC.STATUS_VETOED ], ' ignored' ),
        ( statuses_to_counts[ CC.STATUS_ERROR ], ' failed' ),
        ( statuses_to_counts[ CC.STATUS_SKIPPED ], ' skipped' ),
        ( num_unknown, ' pending' )
    )
    
    # only non-zero counts get a phrase
    phrases = [ HydrusData.ToHumanInt( count ) + label for ( count, label ) in counts_and_labels if count > 0 ]
    
    status_text = ', '.join( phrases )
    
    grand_total = sum( statuses_to_counts.values() )
    
    return ( status_text, ( grand_total - num_unknown, grand_total ) )
2018-07-04 20:48:28 +00:00
class GallerySeed( HydrusSerialisable.SerialisableBase ):
    """One gallery (or watchable) page URL in a gallery log, plus its fetch status and note.

    Identity (equality and hashing) is the pair ( url, run token ). The run token is generated
    fresh in __init__ and is not serialised; child seeds created by WorkOnURL inherit the
    parent's token through SetRunToken.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_SEED
    SERIALISABLE_NAME = 'Gallery Log Entry'
    SERIALISABLE_VERSION = 3
    
    def __init__( self, url = None, can_generate_more_pages = True ):
        """
        :param url: the gallery page url. Normalised through the domain manager when possible.
            When None, a dummy example url is stored instead.
        :param can_generate_more_pages: whether WorkOnURL may queue 'next page' seeds from this one.
        """
        
        if url is None:
            
            url = 'https://nostrils-central.cx/index.php?post=s&tag=hyper_nostrils&page=3'
            
        else:
            
            try:
                
                url = HG.client_controller.network_engine.domain_manager.NormaliseURL( url )
                
            except HydrusExceptions.URLClassException:
                
                # normalisation is best-effort: keep the url exactly as given
                pass
                
            
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.url = url
        self._can_generate_more_pages = can_generate_more_pages
        
        self._external_filterable_tags = set()
        self._external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()
        
        self.created = HydrusData.GetNow()
        self.modified = self.created
        self.status = CC.STATUS_UNKNOWN
        self.note = ''
        
        self._referral_url = None
        
        self._force_next_page_url_generation = False
        
        self._run_token = HydrusData.GenerateKey()
        
    
    def __eq__( self, other ):
        
        if isinstance( other, GallerySeed ):
            
            return self.__hash__() == other.__hash__()
            
        
        return NotImplemented
        
    
    def __hash__( self ):
        
        return ( self.url, self._run_token ).__hash__()
        
    
    def __ne__( self, other ):
        
        # mirror __eq__: only compare against other GallerySeeds, and let python fall back to its
        # default handling for anything else (previously this hashed arbitrary objects and raised
        # TypeError on unhashable ones)
        if isinstance( other, GallerySeed ):
            
            return self.__hash__() != other.__hash__()
            
        
        return NotImplemented
        
    
    def _GetSerialisableInfo( self ):
        
        serialisable_external_filterable_tags = list( self._external_filterable_tags )
        serialisable_external_additional_service_keys_to_tags = self._external_additional_service_keys_to_tags.GetSerialisableTuple()
        
        return (
            self.url,
            self._can_generate_more_pages,
            serialisable_external_filterable_tags,
            serialisable_external_additional_service_keys_to_tags,
            self.created,
            self.modified,
            self.status,
            self.note,
            self._referral_url
        )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        (
            self.url,
            self._can_generate_more_pages,
            serialisable_external_filterable_tags,
            serialisable_external_additional_service_keys_to_tags,
            self.created,
            self.modified,
            self.status,
            self.note,
            self._referral_url
        ) = serialisable_info
        
        self._external_filterable_tags = set( serialisable_external_filterable_tags )
        self._external_additional_service_keys_to_tags = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_external_additional_service_keys_to_tags )
        
    
    def _UpdateModified( self ):
        
        self.modified = HydrusData.GetNow()
        
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        """Upgrade serialised data one version step at a time; returns ( new_version, new_info )."""
        
        if version == 1:
            
            # v1 -> v2: add an (empty) external additional service keys to tags entry
            
            ( url, can_generate_more_pages, created, modified, status, note, referral_url ) = old_serialisable_info
            
            external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()
            
            serialisable_external_additional_service_keys_to_tags = external_additional_service_keys_to_tags.GetSerialisableTuple()
            
            new_serialisable_info = ( url, can_generate_more_pages, serialisable_external_additional_service_keys_to_tags, created, modified, status, note, referral_url )
            
            return ( 2, new_serialisable_info )
            
        
        if version == 2:
            
            # v2 -> v3: add an (empty) external filterable tags entry
            
            ( url, can_generate_more_pages, serialisable_external_additional_service_keys_to_tags, created, modified, status, note, referral_url ) = old_serialisable_info
            
            external_filterable_tags = set()
            
            serialisable_external_filterable_tags = list( external_filterable_tags )
            
            new_serialisable_info = ( url, can_generate_more_pages, serialisable_external_filterable_tags, serialisable_external_additional_service_keys_to_tags, created, modified, status, note, referral_url )
            
            return ( 3, new_serialisable_info )
            
        
    
    def ForceNextPageURLGeneration( self ):
        
        self._force_next_page_url_generation = True
        
    
    def GenerateRestartedDuplicate( self, can_generate_more_pages ):
        """Return a fresh GallerySeed for the same url (new run token, unknown status)."""
        
        gallery_seed = GallerySeed( url = self.url, can_generate_more_pages = can_generate_more_pages )
        
        if can_generate_more_pages:
            
            gallery_seed.ForceNextPageURLGeneration()
            
        
        return gallery_seed
        
    
    def GetAPIInfoDict( self, simple ):
        """Return url/created/modified/status/note as a dict. 'simple' is currently not consulted."""
        
        d = {}
        
        d[ 'url' ] = self.url
        d[ 'created' ] = self.created
        d[ 'modified' ] = self.modified
        d[ 'status' ] = self.status
        d[ 'note' ] = self.note
        
        return d
        
    
    def GetExampleNetworkJob( self, network_job_factory ):
        """Build (but do not start) a GET job for the url that would be fetched for this seed."""
        
        try:
            
            ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( self.url )
            
        except HydrusExceptions.URLClassException:
            
            url_to_check = self.url
            
        
        network_job = network_job_factory( 'GET', url_to_check )
        
        return network_job
        
    
    def SetExternalAdditionalServiceKeysToTags( self, service_keys_to_tags ):
        
        self._external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags( service_keys_to_tags )
        
    
    def SetExternalFilterableTags( self, tags ):
        
        self._external_filterable_tags = set( tags )
        
    
    def SetReferralURL( self, referral_url ):
        
        self._referral_url = referral_url
        
    
    def SetRunToken( self, run_token: bytes ):
        
        self._run_token = run_token
        
    
    def SetStatus( self, status, note = '', exception = None ):
        """Set status and note and bump the modified time.

        When an exception is given, the note is replaced by the exception's first line plus the
        current traceback, and the error is printed to the log. traceback.format_exc() reports the
        exception currently being handled, so this is meant to be called from an except block.
        """
        
        if exception is not None:
            
            first_line = str( exception ).split( os.linesep )[0]
            
            note = first_line + '\u2026 (Copy note to see full error)'
            note += os.linesep
            note += traceback.format_exc()
            
            HydrusData.Print( 'Error when processing ' + self.url + ' !' )
            HydrusData.Print( traceback.format_exc() )
            
        
        self.status = status
        self.note = note
        
        self._UpdateModified()
        
    
    def WorksInNewSystem( self ):
        """Return True if the domain manager sees this url as a gallery url it can parse."""
        
        ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.url )
        
        if url_type == HC.URL_TYPE_GALLERY and can_parse:
            
            return True
            
        
        return False
        
    
    def WorkOnURL( self, gallery_token_name, gallery_seed_log: "GallerySeedLog", file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before = None ):
        """Fetch this gallery page, parse it, and queue what it finds.

        Parsed file urls go to file_seeds_callable; sub-gallery and next-page urls are added to
        gallery_seed_log as child seeds (inserted after this seed) carrying this seed's run token
        and external tags. This seed's status/note are updated to describe the outcome, and the log
        is always notified at the end.

        :param gallery_token_name: passed to the network job's SetGalleryToken.
        :param gallery_seed_log: the log that receives child seeds and the update notification.
        :param file_seeds_callable: called with the parsed file seeds; in the parse path it must
            return ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ).
        :param status_hook: called with short progress strings.
        :param title_hook: called with the parsed page title, when there is one.
        :param network_job_factory: called as factory( 'GET', url, referral_url = ... ).
        :param network_job_presentation_context_factory: called with the job; result is used as a context manager while waiting.
        :param file_import_options: forwarded to ClientImporting.ConvertAllParseResultsToFileSeeds.
        :param gallery_urls_seen_before: set of gallery urls already visited; it is mutated in place.
        :return: ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )
        :raises HydrusExceptions.NetworkException: re-raised after being recorded on this seed. Other exceptions are recorded and swallowed.
        """
        
        if gallery_urls_seen_before is None:
            
            gallery_urls_seen_before = set()
            
        
        gallery_urls_seen_before.add( self.url )
        
        # maybe something like 'append urls' vs 'reverse-prepend' for subs or something
        
        # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop
        
        num_urls_added = 0
        num_urls_already_in_file_seed_cache = 0
        num_urls_total = 0
        result_404 = False
        added_new_gallery_pages = False
        stop_reason = ''
        
        try:
            
            gallery_url = self.url
            
            url_for_child_referral = gallery_url
            
            ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( gallery_url )
            
            if url_type not in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
                
                raise HydrusExceptions.VetoException( 'Did not recognise this as a gallery or watchable URL!' )
                
            
            if not can_parse:
                
                raise HydrusExceptions.VetoException( 'Cannot parse {}: {}'.format( match_name, cannot_parse_reason) )
                
            
            ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
            
            status_hook( 'downloading gallery page' )
            
            # prefer an explicitly set referral url; otherwise refer from the gallery url when the
            # url we actually fetch differs from it
            if self._referral_url is not None and self._referral_url != url_to_check:
                
                referral_url = self._referral_url
                
            elif gallery_url != url_to_check:
                
                referral_url = gallery_url
                
            else:
                
                referral_url = None
                
            
            network_job = network_job_factory( 'GET', url_to_check, referral_url = referral_url )
            
            network_job.SetGalleryToken( gallery_token_name )
            
            network_job.OverrideBandwidth( 30 )
            
            HG.client_controller.network_engine.AddJob( network_job )
            
            with network_job_presentation_context_factory( network_job ):
                
                network_job.WaitUntilDone()
                
            
            parsing_text = network_job.GetContentText()
            
            actual_fetched_url = network_job.GetActualFetchedURL()
            
            do_parse = True
            
            if actual_fetched_url != url_to_check:
                
                # we ended up somewhere other than where we asked for, so re-evaluate what we have
                
                ( url_type, match_name, can_parse, cannot_parse_reason ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( actual_fetched_url )
                
                if url_type == HC.URL_TYPE_GALLERY:
                    
                    if can_parse:
                        
                        gallery_url = actual_fetched_url
                        
                        url_for_child_referral = gallery_url
                        
                        ( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( gallery_url )
                        
                    else:
                        
                        do_parse = False
                        
                        status = CC.STATUS_ERROR
                        
                        note = 'Could not parse {}: {}'.format( match_name, cannot_parse_reason )
                        
                    
                else:
                    
                    do_parse = False
                    
                    from hydrus.client.importing import ClientImportFileSeeds
                    
                    file_seed = ClientImportFileSeeds.FileSeed( ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url )
                    
                    file_seed.SetReferralURL( url_for_child_referral )
                    
                    file_seeds_callable( ( file_seed, ) )
                    
                    status = CC.STATUS_SUCCESSFUL_AND_NEW
                    
                    note = 'was redirected to a non-gallery url, which has been queued as a file import'
                    
                
            
            if do_parse:
                
                parsing_context = {}
                
                parsing_context[ 'gallery_url' ] = gallery_url
                parsing_context[ 'url' ] = url_to_check
                parsing_context[ 'post_index' ] = '0'
                
                all_parse_results = parser.Parse( parsing_context, parsing_text )
                
                if len( all_parse_results ) == 0:
                    
                    raise HydrusExceptions.VetoException( 'The parser found nothing in the document!' )
                    
                
                file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds( all_parse_results, url_for_child_referral, file_import_options )
                
                title = ClientParsing.GetTitleFromAllParseResults( all_parse_results )
                
                if title is not None:
                    
                    title_hook( title )
                    
                
                for file_seed in file_seeds:
                    
                    file_seed.SetExternalFilterableTags( self._external_filterable_tags )
                    file_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                    
                
                num_urls_total = len( file_seeds )
                
                ( num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason ) = file_seeds_callable( file_seeds )
                
                status = CC.STATUS_SUCCESSFUL_AND_NEW
                
                note = HydrusData.ToHumanInt( num_urls_added ) + ' new urls found'
                
                if num_urls_already_in_file_seed_cache > 0:
                    
                    note += ' (' + HydrusData.ToHumanInt( num_urls_already_in_file_seed_cache ) + ' of page already in)'
                    
                
                if not can_search_for_more_files:
                    
                    note += ' - ' + stop_reason
                    
                
                if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                    
                    can_add_more_gallery_urls = True
                    
                else:
                    
                    # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                    can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files
                    
                
                flattened_results = list( itertools.chain.from_iterable( all_parse_results ) )
                
                sub_gallery_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_SUB_GALLERY, ), only_get_top_priority = True )
                
                sub_gallery_urls = HydrusData.DedupeList( sub_gallery_urls )
                
                new_sub_gallery_urls = [ sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before ]
                
                num_new_sub_gallery_urls = len( new_sub_gallery_urls )
                
                if num_new_sub_gallery_urls > 0:
                    
                    sub_gallery_seeds = [ GallerySeed( sub_gallery_url ) for sub_gallery_url in new_sub_gallery_urls ]
                    
                    for sub_gallery_seed in sub_gallery_seeds:
                        
                        sub_gallery_seed.SetRunToken( self._run_token )
                        sub_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                        sub_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                        
                    
                    gallery_seed_log.AddGallerySeeds( sub_gallery_seeds, parent_gallery_seed = self )
                    
                    added_new_gallery_pages = True
                    
                    gallery_urls_seen_before.update( sub_gallery_urls )
                    
                    note += ' - {} sub-gallery urls found'.format( HydrusData.ToHumanInt( num_new_sub_gallery_urls ) )
                    
                
                if self._can_generate_more_pages and can_add_more_gallery_urls:
                    
                    next_page_urls = ClientParsing.GetURLsFromParseResults( flattened_results, ( HC.URL_TYPE_NEXT, ), only_get_top_priority = True )
                    
                    # a page that links to itself as 'next' is not a next page
                    if self.url in next_page_urls:
                        
                        next_page_urls.remove( self.url )
                        
                    
                    if url_to_check in next_page_urls:
                        
                        next_page_urls.remove( url_to_check )
                        
                    
                    if len( next_page_urls ) > 0:
                        
                        next_page_generation_phrase = ' next gallery pages found'
                        
                    else:
                        
                        # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one
                        
                        url_class = HG.client_controller.network_engine.domain_manager.GetURLClass( url_to_check )
                        
                        if url_class is not None and url_class.CanGenerateNextGalleryPage():
                            
                            try:
                                
                                next_page_url = url_class.GetNextGalleryPage( url_to_check )
                                
                                next_page_urls = [ next_page_url ]
                                
                            except Exception:
                                
                                # best-effort: record the failure on the note and carry on without a next page
                                note += ' - Attempted to generate a next gallery page url, but failed!'
                                note += os.linesep
                                note += traceback.format_exc()
                                
                            
                        
                        next_page_generation_phrase = ' next gallery pages extrapolated from url class'
                        
                    
                    if len( next_page_urls ) > 0:
                        
                        next_page_urls = HydrusData.DedupeList( next_page_urls )
                        
                        new_next_page_urls = [ next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before ]
                        
                        # dupes are the candidate urls we have already visited. this must intersect with the full
                        # candidate list: new_next_page_urls has had the seen urls filtered out, so intersecting
                        # with that was always empty
                        duplicate_next_page_urls = gallery_urls_seen_before.intersection( next_page_urls )
                        
                        num_new_next_page_urls = len( new_next_page_urls )
                        num_dupe_next_page_urls = len( duplicate_next_page_urls )
                        
                        if num_new_next_page_urls > 0:
                            
                            next_gallery_seeds = [ GallerySeed( next_page_url ) for next_page_url in new_next_page_urls ]
                            
                            for next_gallery_seed in next_gallery_seeds:
                                
                                next_gallery_seed.SetRunToken( self._run_token )
                                next_gallery_seed.SetReferralURL( url_for_child_referral )
                                next_gallery_seed.SetExternalFilterableTags( self._external_filterable_tags )
                                next_gallery_seed.SetExternalAdditionalServiceKeysToTags( self._external_additional_service_keys_to_tags )
                                
                            
                            gallery_seed_log.AddGallerySeeds( next_gallery_seeds, parent_gallery_seed = self )
                            
                            added_new_gallery_pages = True
                            
                            gallery_urls_seen_before.update( new_next_page_urls )
                            
                            if num_dupe_next_page_urls == 0:
                                
                                note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase
                                
                            else:
                                
                                note += ' - ' + HydrusData.ToHumanInt( num_new_next_page_urls ) + next_page_generation_phrase + ', but ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + ' had already been visited this run and were not added'
                                
                            
                        else:
                            
                            note += ' - ' + HydrusData.ToHumanInt( num_dupe_next_page_urls ) + next_page_generation_phrase + ', but they had already been visited this run and were not added'
                            
                        
                    
                
            
            self.SetStatus( status, note = note )
            
        except HydrusExceptions.ShutdownException:
            
            pass
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            
            note = str( e )
            
            self.SetStatus( status, note = note )
            
            if isinstance( e, HydrusExceptions.CancelledException ):
                
                status_hook( 'cancelled!' )
                
                time.sleep( 2 )
                
            
        except HydrusExceptions.InsufficientCredentialsException:
            
            status = CC.STATUS_VETOED
            note = '403'
            
            self.SetStatus( status, note = note )
            
            status_hook( '403' )
            
            time.sleep( 2 )
            
            # NOTE(review): a 403 is reported to the caller through the same flag as a 404
            result_404 = True
            
        except HydrusExceptions.NotFoundException:
            
            status = CC.STATUS_VETOED
            note = '404'
            
            self.SetStatus( status, note = note )
            
            status_hook( '404' )
            
            time.sleep( 2 )
            
            result_404 = True
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            self.SetStatus( status, exception = e )
            
            status_hook( 'error!' )
            
            time.sleep( 3 )
            
            if isinstance( e, HydrusExceptions.NetworkException ): # so the larger queue can set a delaywork or whatever
                
                raise
                
            
        finally:
            
            gallery_seed_log.NotifyGallerySeedsUpdated( ( self, ) )
            
        
        return ( num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason )
        
    
2018-07-04 20:48:28 +00:00
# map the gallery seed serialisable type id to its class in HydrusSerialisable's type table
# (presumably consulted when objects are recreated from serialised tuples -- confirm in HydrusSerialisable)
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_SEED ] = GallerySeed
2018-06-27 19:27:05 +00:00
class GallerySeedLog( HydrusSerialisable.SerialisableBase ):
    """An ordered, lock-protected list of GallerySeeds with a cached status summary.

    self._lock is a plain threading.Lock, which is not reentrant: public methods take it, underscore
    helpers assume the caller already holds it, and calls to other public methods (notably
    NotifyGallerySeedsUpdated) are made only after the lock has been released.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_SEED_LOG
    SERIALISABLE_NAME = 'Gallery Log'
    SERIALISABLE_VERSION = 1
    
    # Compact/CanCompact never touch the most recent COMPACT_NUMBER seeds
    COMPACT_NUMBER = 100
    
    def __init__( self ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._gallery_seeds = HydrusSerialisable.SerialisableList()
        
        # seed -> position in self._gallery_seeds, kept in step with the list for fast membership/index lookups
        self._gallery_seeds_to_indices = {}
        
        self._gallery_seed_log_key = HydrusData.GenerateKey()
        
        self._status_cache = None
        self._status_dirty = True
        
        self._lock = threading.Lock()
        
    
    def __len__( self ):
        
        return len( self._gallery_seeds )
        
    
    def _GenerateStatus( self ):
        
        statuses_to_counts = self._GetStatusesToCounts()
        
        self._status_cache = GenerateGallerySeedLogStatus( statuses_to_counts )
        
        self._status_dirty = False
        
    
    def _GetNextGallerySeed( self, status: int ) -> typing.Optional[ GallerySeed ]:
        """Return the first seed, in list order, with the given status, or None."""
        
        for gallery_seed in self._gallery_seeds:
            
            if gallery_seed.status == status:
                
                return gallery_seed
                
            
        
        return None
        
    
    def _GetStatusesToCounts( self ):
        
        statuses_to_counts = collections.Counter()
        
        for gallery_seed in self._gallery_seeds:
            
            statuses_to_counts[ gallery_seed.status ] += 1
            
        
        return statuses_to_counts
        
    
    def _GetGallerySeeds( self, status = None ):
        
        if status is None:
            
            return list( self._gallery_seeds )
            
        else:
            
            return [ gallery_seed for gallery_seed in self._gallery_seeds if gallery_seed.status == status ]
            
        
    
    def _GetSerialisableInfo( self ):
        
        return self._gallery_seeds.GetSerialisableTuple()
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        with self._lock:
            
            self._gallery_seeds = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )
            
            self._gallery_seeds_to_indices = { gallery_seed : index for ( index, gallery_seed ) in enumerate( self._gallery_seeds ) }
            
        
    
    def _SetStatusDirty( self ):
        
        self._status_dirty = True
        
    
    def AddGallerySeeds( self, gallery_seeds, parent_gallery_seed: typing.Optional[ GallerySeed ] = None ) -> int:
        """Insert seeds not already present, directly after parent_gallery_seed or at the end.

        Seeds are skipped when an equal seed is already in the log, or when an earlier seed in the
        same call has the same url.

        :return: the number of seeds actually added.
        """
        
        if len( gallery_seeds ) == 0:
            
            return 0
            
        
        seen_urls = set()
        
        new_gallery_seeds = []
        
        with self._lock:
            
            for gallery_seed in gallery_seeds:
                
                if gallery_seed.url in seen_urls:
                    
                    continue
                    
                
                if gallery_seed in self._gallery_seeds_to_indices:
                    
                    continue
                    
                
                new_gallery_seeds.append( gallery_seed )
                
                seen_urls.add( gallery_seed.url )
                
            
            if len( new_gallery_seeds ) == 0:
                
                return 0
                
            
            if parent_gallery_seed is None or parent_gallery_seed not in self._gallery_seeds:
                
                insertion_index = len( self._gallery_seeds )
                
            else:
                
                insertion_index = self._gallery_seeds.index( parent_gallery_seed ) + 1
                
            
            original_insertion_index = insertion_index
            
            for gallery_seed in new_gallery_seeds:
                
                self._gallery_seeds.insert( insertion_index, gallery_seed )
                
                insertion_index += 1
                
            
            self._gallery_seeds_to_indices = { gallery_seed : index for ( index, gallery_seed ) in enumerate( self._gallery_seeds ) }
            
            self._SetStatusDirty()
            
            # everything from the insertion point onwards is either new or has a shifted index
            updated_gallery_seeds = self._gallery_seeds[ original_insertion_index : ]
            
        
        self.NotifyGallerySeedsUpdated( updated_gallery_seeds )
        
        return len( new_gallery_seeds )
        
    
    def AdvanceGallerySeed( self, gallery_seed ):
        """Move the seed one place towards the front of the list, if it is present and not already first."""
        
        updated_gallery_seeds = []
        
        with self._lock:
            
            if gallery_seed in self._gallery_seeds_to_indices:
                
                index = self._gallery_seeds_to_indices[ gallery_seed ]
                
                if index > 0:
                    
                    swapped_gallery_seed = self._gallery_seeds[ index - 1 ]
                    
                    self._gallery_seeds.remove( gallery_seed )
                    
                    self._gallery_seeds.insert( index - 1, gallery_seed )
                    
                    self._gallery_seeds_to_indices[ gallery_seed ] = index - 1
                    self._gallery_seeds_to_indices[ swapped_gallery_seed ] = index
                    
                    updated_gallery_seeds = ( gallery_seed, swapped_gallery_seed )
                    
                
            
        
        self.NotifyGallerySeedsUpdated( updated_gallery_seeds )
        
    
    def CanCompact( self, compact_before_this_source_time ):
        """Return True if any finished seed outside the newest COMPACT_NUMBER was created before the given time."""
        
        with self._lock:
            
            if len( self._gallery_seeds ) <= self.COMPACT_NUMBER:
                
                return False
                
            
            for gallery_seed in self._gallery_seeds[:-self.COMPACT_NUMBER]:
                
                if gallery_seed.status == CC.STATUS_UNKNOWN:
                    
                    continue
                    
                
                if gallery_seed.created < compact_before_this_source_time:
                    
                    return True
                    
                
            
        
        return False
        
    
    def CanRestartFailedSearch( self ):
        """Return True if the log is non-empty and its last seed has STATUS_ERROR, else False."""
        
        with self._lock:
            
            if len( self._gallery_seeds ) == 0:
                
                return False
                
            
            last_gallery_seed = self._gallery_seeds[-1]
            
            # always answer with a real bool (this used to fall through and return None)
            return last_gallery_seed.status == CC.STATUS_ERROR
            
        
    
    def Compact( self, compact_before_this_source_time ):
        """Drop old finished seeds, always keeping the newest COMPACT_NUMBER and anything still unknown."""
        
        with self._lock:
            
            if len( self._gallery_seeds ) <= self.COMPACT_NUMBER:
                
                return
                
            
            new_gallery_seeds = HydrusSerialisable.SerialisableList()
            
            for gallery_seed in self._gallery_seeds[:-self.COMPACT_NUMBER]:
                
                still_to_do = gallery_seed.status == CC.STATUS_UNKNOWN
                still_relevant = gallery_seed.created > compact_before_this_source_time
                
                if still_to_do or still_relevant:
                    
                    new_gallery_seeds.append( gallery_seed )
                    
                
            
            new_gallery_seeds.extend( self._gallery_seeds[-self.COMPACT_NUMBER:] )
            
            self._gallery_seeds = new_gallery_seeds
            self._gallery_seeds_to_indices = { gallery_seed : index for ( index, gallery_seed ) in enumerate( self._gallery_seeds ) }
            
            self._SetStatusDirty()
            
        
    
    def DelayGallerySeed( self, gallery_seed ):
        """Move the seed one place towards the end of the list, if it is present and not already last."""
        
        updated_gallery_seeds = []
        
        with self._lock:
            
            if gallery_seed in self._gallery_seeds_to_indices:
                
                index = self._gallery_seeds_to_indices[ gallery_seed ]
                
                if index < len( self._gallery_seeds ) - 1:
                    
                    swapped_gallery_seed = self._gallery_seeds[ index + 1 ]
                    
                    self._gallery_seeds.remove( gallery_seed )
                    
                    self._gallery_seeds.insert( index + 1, gallery_seed )
                    
                    self._gallery_seeds_to_indices[ swapped_gallery_seed ] = index
                    self._gallery_seeds_to_indices[ gallery_seed ] = index + 1
                    
                    updated_gallery_seeds = ( swapped_gallery_seed, gallery_seed )
                    
                
            
        
        self.NotifyGallerySeedsUpdated( updated_gallery_seeds )
        
    
    def GetExampleGallerySeed( self ):
        """Return the first unknown-status seed, else a random one of the last ten, else None when empty."""
        
        with self._lock:
            
            if len( self._gallery_seeds ) == 0:
                
                return None
                
            else:
                
                example_seed = self._GetNextGallerySeed( CC.STATUS_UNKNOWN )
                
                if example_seed is None:
                    
                    example_seed = random.choice( self._gallery_seeds[-10:] )
                    
                
                return example_seed
                
            
        
    
    def GetAPIInfoDict( self, simple ):
        """Return the status summary as a dict; per-seed 'log_items' are included when simple is False."""
        
        with self._lock:
            
            d = {}
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            ( status, ( total_processed, total ) ) = self._status_cache
            
            d[ 'status' ] = status
            d[ 'total_processed' ] = total_processed
            d[ 'total_to_process' ] = total
            
            if not simple:
                
                d[ 'log_items' ] = [ gallery_seed.GetAPIInfoDict( simple ) for gallery_seed in self._gallery_seeds ]
                
            
            return d
            
        
    
    def GetGallerySeedLogKey( self ):
        
        return self._gallery_seed_log_key
        
    
    def GetGallerySeedCount( self, status = None ):
        """Return the number of seeds, optionally only those with the given status."""
        
        with self._lock:
            
            if status is None:
                
                return len( self._gallery_seeds )
                
            
            return sum( 1 for gallery_seed in self._gallery_seeds if gallery_seed.status == status )
            
        
    
    def GetGallerySeeds( self, status = None ):
        
        with self._lock:
            
            return self._GetGallerySeeds( status )
            
        
    
    def GetGallerySeedIndex( self, gallery_seed ):
        """Return the seed's position in the log. Raises KeyError if it is not present."""
        
        with self._lock:
            
            return self._gallery_seeds_to_indices[ gallery_seed ]
            
        
    
    def GetNextGallerySeed( self, status ):
        
        with self._lock:
            
            return self._GetNextGallerySeed( status )
            
        
    
    def GetStatus( self ):
        """Return the cached ( status_text, ( total_processed, total ) ), regenerating it if dirty."""
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            return self._status_cache
            
        
    
    def GetStatusesToCounts( self ):
        
        with self._lock:
            
            return self._GetStatusesToCounts()
            
        
    
    def HasGallerySeed( self, gallery_seed ):
        
        with self._lock:
            
            return gallery_seed in self._gallery_seeds_to_indices
            
        
    
    def HasGalleryURL( self, url ):
        """Return True if any seed in the log has this url, after the same normalisation GallerySeed applies."""
        
        # build the throwaway seed before taking the lock; its constructor normalises the url
        search_gallery_seed = GallerySeed( url )
        
        search_url = search_gallery_seed.url
        
        # read the list under the lock like every other reader, so we never iterate it mid-mutation
        with self._lock:
            
            return any( gallery_seed.url == search_url for gallery_seed in self._gallery_seeds )
            
        
    
    def NotifyGallerySeedsUpdated( self, gallery_seeds ):
        """Mark the status cache dirty and publish the changed seeds. Does nothing for an empty collection."""
        
        if len( gallery_seeds ) == 0:
            
            return
            
        
        with self._lock:
            
            self._SetStatusDirty()
            
        
        # published after the lock is released
        HG.client_controller.pub( 'gallery_seed_log_gallery_seeds_updated', self._gallery_seed_log_key, gallery_seeds )
        
    
    def RemoveGallerySeeds( self, gallery_seeds_to_delete ):
        """Remove the given seeds (those present) and notify about them and every seed whose index shifted."""
        
        with self._lock:
            
            gallery_seeds_to_delete = { gallery_seed for gallery_seed in gallery_seeds_to_delete if gallery_seed in self._gallery_seeds_to_indices }
            
            if len( gallery_seeds_to_delete ) == 0:
                
                return
                
            
            earliest_affected_index = min( ( self._gallery_seeds_to_indices[ gallery_seed ] for gallery_seed in gallery_seeds_to_delete ) )
            
            self._gallery_seeds = HydrusSerialisable.SerialisableList( [ gallery_seed for gallery_seed in self._gallery_seeds if gallery_seed not in gallery_seeds_to_delete ] )
            
            self._gallery_seeds_to_indices = { gallery_seed : index for ( index, gallery_seed ) in enumerate( self._gallery_seeds ) }
            
            self._SetStatusDirty()
            
            index_shuffled_gallery_seeds = self._gallery_seeds[ earliest_affected_index : ]
            
            updated_gallery_seeds = gallery_seeds_to_delete.union( index_shuffled_gallery_seeds )
            
        
        self.NotifyGallerySeedsUpdated( updated_gallery_seeds )
        
    
    def RemoveGallerySeedsByStatus( self, statuses_to_remove ):
        
        with self._lock:
            
            gallery_seeds_to_delete = [ gallery_seed for gallery_seed in self._gallery_seeds if gallery_seed.status in statuses_to_remove ]
            
        
        # RemoveGallerySeeds takes the lock itself, so it must be called after we release it
        self.RemoveGallerySeeds( gallery_seeds_to_delete )
        
    
    def RemoveAllButUnknownGallerySeeds( self ):
        
        with self._lock:
            
            gallery_seeds_to_delete = [ gallery_seed for gallery_seed in self._gallery_seeds if gallery_seed.status != CC.STATUS_UNKNOWN ]
            
        
        # RemoveGallerySeeds takes the lock itself, so it must be called after we release it
        self.RemoveGallerySeeds( gallery_seeds_to_delete )
        
    
    def RestartFailedSearch( self ):
        """If the last seed errored, append a restarted duplicate of it that may generate further pages."""
        
        with self._lock:
            
            if len( self._gallery_seeds ) == 0:
                
                return
                
            
            last_gallery_seed = self._gallery_seeds[-1]
            
            if last_gallery_seed.status != CC.STATUS_ERROR:
                
                return
                
            
            can_generate_more_pages = True
            
            new_gallery_seeds = ( last_gallery_seed.GenerateRestartedDuplicate( can_generate_more_pages ), )
            
        
        self.AddGallerySeeds( new_gallery_seeds )
        self.NotifyGallerySeedsUpdated( new_gallery_seeds )
        
    
    def RetryFailed( self ):
        """Reset every STATUS_ERROR seed back to STATUS_UNKNOWN (clearing its note) and notify."""
        
        with self._lock:
            
            failed_gallery_seeds = self._GetGallerySeeds( CC.STATUS_ERROR )
            
            for gallery_seed in failed_gallery_seeds:
                
                gallery_seed.SetStatus( CC.STATUS_UNKNOWN )
                
            
        
        self.NotifyGallerySeedsUpdated( failed_gallery_seeds )
        
    
    def WorkToDo( self ):
        """Return True while at least one seed still has STATUS_UNKNOWN."""
        
        with self._lock:
            
            if self._status_dirty:
                
                self._GenerateStatus()
                
            
            ( status, ( total_processed, total ) ) = self._status_cache
            
            return total_processed < total
            
        
    
# map the gallery seed log serialisable type id to its class in HydrusSerialisable's type table
# (presumably consulted when objects are recreated from serialised tuples -- confirm in HydrusSerialisable)
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_SEED_LOG ] = GallerySeedLog