2020-05-20 21:36:02 +00:00
import collections
import itertools
import os
2020-06-11 12:01:08 +00:00
import random
2020-05-20 21:36:02 +00:00
import threading
import time
import traceback
2020-06-11 12:01:08 +00:00
import typing
2020-05-20 21:36:02 +00:00
2020-04-22 21:00:35 +00:00
from hydrus . core import HydrusConstants as HC
from hydrus . core import HydrusData
from hydrus . core import HydrusExceptions
from hydrus . core import HydrusGlobals as HG
from hydrus . core import HydrusSerialisable
2018-06-27 19:27:05 +00:00
2020-07-29 20:52:44 +00:00
from hydrus . client import ClientConstants as CC
from hydrus . client import ClientParsing
from hydrus . client . importing import ClientImporting
2020-08-05 20:10:36 +00:00
from hydrus . client . metadata import ClientTags
2020-07-29 20:52:44 +00:00
2018-06-27 19:27:05 +00:00
def GenerateGallerySeedLogStatus(statuses_to_counts):
    """Summarise per-status seed counts as a status string and a progress pair.

    statuses_to_counts: mapping of CC.STATUS_* value -> number of gallery seeds
    with that status. Statuses absent from the mapping count as zero, so a
    plain dict works as well as a Counter, and the mapping is never mutated.

    Returns (status, (total_processed, total)) where total is the sum of all
    counts and total_processed excludes the seeds still in STATUS_UNKNOWN.
    """
    num_successful = statuses_to_counts.get(CC.STATUS_SUCCESSFUL_AND_NEW, 0)
    num_ignored = statuses_to_counts.get(CC.STATUS_VETOED, 0)
    num_failed = statuses_to_counts.get(CC.STATUS_ERROR, 0)
    num_skipped = statuses_to_counts.get(CC.STATUS_SKIPPED, 0)
    num_unknown = statuses_to_counts.get(CC.STATUS_UNKNOWN, 0)

    # add some kind of '(512 files found (so far))', which may be asking too much here
    # might be this is complicated and needs to be (partly) done in the object, which will know if it is paused or whatever.

    # Display order is fixed; only non-zero counts are mentioned.
    labelled_counts = (
        (num_successful, ' successful '),
        (num_ignored, ' ignored '),
        (num_failed, ' failed '),
        (num_skipped, ' skipped '),
        (num_unknown, ' pending '),
    )

    status_strings = [
        HydrusData.ToHumanInt(count) + label
        for (count, label) in labelled_counts
        if count > 0
    ]

    status = ' , '.join(status_strings)

    total = sum(statuses_to_counts.values())
    total_processed = total - num_unknown

    return (status, (total_processed, total))
2018-07-04 20:48:28 +00:00
class GallerySeed ( HydrusSerialisable . SerialisableBase ) :
SERIALISABLE_TYPE = HydrusSerialisable . SERIALISABLE_TYPE_GALLERY_SEED
SERIALISABLE_NAME = ' Gallery Log Entry '
2020-09-16 20:46:54 +00:00
SERIALISABLE_VERSION = 3
2018-07-04 20:48:28 +00:00
def __init__(self, url=None, can_generate_more_pages=True):
    """Create a gallery log entry for url, initially in STATUS_UNKNOWN.

    When url is None a placeholder example URL is used. Otherwise the URL is
    normalised through the client's domain manager; if that raises
    URLClassException the URL is kept exactly as given.
    """
    if url is not None:
        try:
            url = HG.client_controller.network_engine.domain_manager.NormaliseURL(url)
        except HydrusExceptions.URLClassException:
            # no url class could normalise this, so keep the caller's url
            pass
    else:
        url = ' https://nostrils-central.cx/index.php?post=s&tag=hyper_nostrils&page=3 '

    HydrusSerialisable.SerialisableBase.__init__(self)

    self.url = url
    self._can_generate_more_pages = can_generate_more_pages
    self._external_filterable_tags = set()
    self._external_additional_service_keys_to_tags = ClientTags.ServiceKeysToTags()

    # a fresh seed has never been modified, so both timestamps start equal
    now = HydrusData.GetNow()
    self.created = now
    self.modified = now

    self.status = CC.STATUS_UNKNOWN
    self.note = ' '

    self._referral_url = None
    self._force_next_page_url_generation = False

    # part of this seed's hash/equality identity alongside the url
    self._run_token = HydrusData.GenerateKey()
2018-07-04 20:48:28 +00:00
def __eq__ ( self , other ) :
2020-01-22 21:04:43 +00:00
if isinstance ( other , GallerySeed ) :
return self . __hash__ ( ) == other . __hash__ ( )
return NotImplemented
2018-07-04 20:48:28 +00:00
def __hash__ ( self ) :
2020-07-08 22:00:33 +00:00
return ( self . url , self . _run_token ) . __hash__ ( )
2018-07-04 20:48:28 +00:00
def __ne__ ( self , other ) :
return self . __hash__ ( ) != other . __hash__ ( )
def _GetSerialisableInfo(self):
    """Return this seed's state as the version 3 serialisable tuple.

    The filterable tag set is emitted as a list and the service-keys-to-tags
    object as its own serialisable tuple. The run token and the
    force-next-page flag are not included.
    """
    return (
        self.url,
        self._can_generate_more_pages,
        list(self._external_filterable_tags),
        self._external_additional_service_keys_to_tags.GetSerialisableTuple(),
        self.created,
        self.modified,
        self.status,
        self.note,
        self._referral_url,
    )
2018-07-04 20:48:28 +00:00
def _InitialiseFromSerialisableInfo(self, serialisable_info):
    """Restore this seed's state from a version 3 serialisable tuple."""
    (
        url,
        can_generate_more_pages,
        serialisable_filterable_tags,
        serialisable_service_keys_to_tags,
        created,
        modified,
        status,
        note,
        referral_url,
    ) = serialisable_info

    self.url = url
    self._can_generate_more_pages = can_generate_more_pages
    self.created = created
    self.modified = modified
    self.status = status
    self.note = note
    self._referral_url = referral_url

    # the two tag containers are stored in serialised form and rebuilt here
    self._external_filterable_tags = set(serialisable_filterable_tags)
    self._external_additional_service_keys_to_tags = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_service_keys_to_tags)
2018-07-04 20:48:28 +00:00
def _UpdateModified(self):
    """Stamp the seed's modified time with the current time."""
    now = HydrusData.GetNow()

    self.modified = now
2019-02-27 23:03:30 +00:00
def _UpdateSerialisableInfo(self, version, old_serialisable_info):
    """Upgrade a serialised tuple by one version step.

    version 1 -> 2 inserts an empty serialised service-keys-to-tags entry
    after can_generate_more_pages; version 2 -> 3 inserts an empty filterable
    tag list at the same position. Returns (new_version, new_tuple).
    """
    if version == 1:
        (url, can_generate_more_pages, created, modified, status, note, referral_url) = old_serialisable_info

        empty_service_keys_to_tags = ClientTags.ServiceKeysToTags().GetSerialisableTuple()

        upgraded = (url, can_generate_more_pages, empty_service_keys_to_tags, created, modified, status, note, referral_url)

        return (2, upgraded)

    if version == 2:
        (url, can_generate_more_pages, serialisable_service_keys_to_tags, created, modified, status, note, referral_url) = old_serialisable_info

        # older entries carried no filterable tags, so they start with an empty list
        empty_filterable_tags = []

        upgraded = (url, can_generate_more_pages, empty_filterable_tags, serialisable_service_keys_to_tags, created, modified, status, note, referral_url)

        return (3, upgraded)
2018-10-17 21:00:09 +00:00
def ForceNextPageURLGeneration ( self ) :
self . _force_next_page_url_generation = True
2018-09-12 21:36:26 +00:00
def GenerateRestartedDuplicate(self, can_generate_more_pages):
    """Return a fresh GallerySeed for the same url.

    If the duplicate is allowed to generate more pages, it is also flagged to
    force next-page URL generation.
    """
    duplicate = GallerySeed(url=self.url, can_generate_more_pages=can_generate_more_pages)

    if not can_generate_more_pages:
        return duplicate

    duplicate.ForceNextPageURLGeneration()

    return duplicate
2019-08-21 21:34:01 +00:00
def GetAPIInfoDict ( self , simple ) :
d = { }
d [ ' url ' ] = self . url
d [ ' created ' ] = self . created
d [ ' modified ' ] = self . modified
d [ ' status ' ] = self . status
d [ ' note ' ] = self . note
return d
2018-10-31 21:41:14 +00:00
def GetExampleNetworkJob(self, network_job_factory):
    """Build a network job for the URL this seed would actually fetch.

    The domain manager is asked for the URL to fetch; if it raises
    URLClassException the seed's own url is used instead.
    """
    domain_manager = HG.client_controller.network_engine.domain_manager

    try:
        # the parser half of the pair is not needed for an example job
        (url_to_check, _parser) = domain_manager.GetURLToFetchAndParser(self.url)
    except HydrusExceptions.URLClassException:
        url_to_check = self.url

    return network_job_factory(' GET ', url_to_check)
2020-09-16 20:46:54 +00:00
def SetExternalAdditionalServiceKeysToTags(self, service_keys_to_tags):
    """Store a new ServiceKeysToTags built from service_keys_to_tags."""
    wrapped = ClientTags.ServiceKeysToTags(service_keys_to_tags)

    self._external_additional_service_keys_to_tags = wrapped
def SetExternalFilterableTags ( self , tags ) :
2019-02-27 23:03:30 +00:00
2020-09-16 20:46:54 +00:00
self . _external_filterable_tags = set ( tags )
2019-02-27 23:03:30 +00:00
2018-07-04 20:48:28 +00:00
def SetReferralURL ( self , referral_url ) :
self . _referral_url = referral_url
2020-07-08 22:00:33 +00:00
def SetRunToken ( self , run_token : bytes ) :
self . _run_token = run_token
2018-07-04 20:48:28 +00:00
def SetStatus(self, status, note=' ', exception=None):
    """Set the seed's status and note and refresh its modified time.

    When exception is given, the note argument is replaced by the first line
    of the exception text followed by the traceback of the exception currently
    being handled, and the error is also printed to the log.
    """
    if exception is not None:
        # splitlines recognises every line ending, whereas splitting on
        # os.linesep misses '\n' inside messages when linesep is '\r\n'
        message_lines = str(exception).splitlines()
        first_line = message_lines[0] if len(message_lines) > 0 else ''

        # format once and reuse for both the note and the log
        trace = traceback.format_exc()

        note = first_line + ' \u2026 (Copy note to see full error) '
        note += os.linesep
        note += trace

        HydrusData.Print(' Error when processing ' + self.url + ' ! ')
        HydrusData.Print(trace)

    self.status = status
    self.note = note

    self._UpdateModified()
def WorksInNewSystem(self):
    """Return True when the domain manager reports the url as a parseable gallery URL."""
    domain_manager = HG.client_controller.network_engine.domain_manager

    (url_type, _match_name, can_parse, _cannot_parse_reason) = domain_manager.GetURLParseCapability(self.url)

    if url_type != HC.URL_TYPE_GALLERY:
        return False

    if not can_parse:
        return False

    return True
2023-01-25 22:59:39 +00:00
def WorkOnURL(self, gallery_token_name, gallery_seed_log: "GallerySeedLog", file_seeds_callable, status_hook, title_hook, network_job_factory, network_job_presentation_context_factory, file_import_options, gallery_urls_seen_before=None):
    """Fetch and parse this seed's gallery page, queueing what it finds.

    File URLs from the parse are handed to file_seeds_callable; new sub-gallery
    and next-page URLs are added to gallery_seed_log as child seeds placed
    after this one. gallery_urls_seen_before (a set, mutated in place) records
    the URLs visited this run so pages are not queued twice. The seed's own
    status and note are updated to describe the outcome, and gallery_seed_log
    is notified of the change whatever happens.

    Returns (num_urls_added, num_urls_already_in_file_seed_cache,
    num_urls_total, result_404, added_new_gallery_pages, stop_reason).

    Veto, 403 and 404 outcomes are recorded on the seed and swallowed, and a
    ShutdownException is swallowed silently. Any other exception is recorded
    as an error and swallowed unless it is a HydrusExceptions.NetworkException,
    which is re-raised.
    """
    if gallery_urls_seen_before is None:
        gallery_urls_seen_before = set()

    gallery_urls_seen_before.add(self.url)

    # maybe something like 'append urls' vs 'reverse-prepend' for subs or something

    # should also take--and populate--a set of urls we have seen this 'run', so we can bomb out if next_gallery_url ends up in some loop

    num_urls_added = 0
    num_urls_already_in_file_seed_cache = 0
    num_urls_total = 0
    result_404 = False
    added_new_gallery_pages = False
    stop_reason = ' '

    try:
        gallery_url = self.url

        url_for_child_referral = gallery_url

        (url_type, match_name, can_parse, cannot_parse_reason) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability(gallery_url)

        if url_type not in (HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE):
            raise HydrusExceptions.VetoException(' Did not recognise this as a gallery or watchable URL! ')

        if not can_parse:
            raise HydrusExceptions.VetoException(' Cannot parse {} : {} '.format(match_name, cannot_parse_reason))

        (url_to_check, parser) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser(gallery_url)

        status_hook(' downloading gallery page ')

        # prefer an explicit referral url, falling back to the gallery url
        # when the url actually fetched differs from it
        if self._referral_url is not None and self._referral_url != url_to_check:
            referral_url = self._referral_url
        elif gallery_url != url_to_check:
            referral_url = gallery_url
        else:
            referral_url = None

        network_job = network_job_factory(' GET ', url_to_check, referral_url=referral_url)

        network_job.SetGalleryToken(gallery_token_name)

        network_job.OverrideBandwidth(30)

        HG.client_controller.network_engine.AddJob(network_job)

        with network_job_presentation_context_factory(network_job):
            network_job.WaitUntilDone()

        parsing_text = network_job.GetContentText()

        actual_fetched_url = network_job.GetActualFetchedURL()

        do_parse = True

        if actual_fetched_url != url_to_check:
            # the fetch ended up at a different url, so re-evaluate what we actually got
            (url_type, match_name, can_parse, cannot_parse_reason) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability(actual_fetched_url)

            if url_type == HC.URL_TYPE_GALLERY:
                if can_parse:
                    gallery_url = actual_fetched_url

                    url_for_child_referral = gallery_url

                    (url_to_check, parser) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser(gallery_url)
                else:
                    do_parse = False

                    status = CC.STATUS_ERROR

                    note = ' Could not parse {} : {} '.format(match_name, cannot_parse_reason)
            else:
                do_parse = False

                from hydrus . client . importing import ClientImportFileSeeds

                file_seed = ClientImportFileSeeds.FileSeed(ClientImportFileSeeds.FILE_SEED_TYPE_URL, actual_fetched_url)

                file_seed.SetReferralURL(url_for_child_referral)

                file_seeds_callable((file_seed,))

                status = CC.STATUS_SUCCESSFUL_AND_NEW

                note = ' was redirected to a non-gallery url, which has been queued as a file import '

        if do_parse:
            parsing_context = {}

            parsing_context[' gallery_url '] = gallery_url
            parsing_context[' url '] = url_to_check
            parsing_context[' post_index '] = ' 0 '

            all_parse_results = parser.Parse(parsing_context, parsing_text)

            if len(all_parse_results) == 0:
                raise HydrusExceptions.VetoException(' The parser found nothing in the document! ')

            file_seeds = ClientImporting.ConvertAllParseResultsToFileSeeds(all_parse_results, url_for_child_referral, file_import_options)

            title = ClientParsing.GetTitleFromAllParseResults(all_parse_results)

            if title is not None:
                title_hook(title)

            for file_seed in file_seeds:
                file_seed.SetExternalFilterableTags(self._external_filterable_tags)
                file_seed.SetExternalAdditionalServiceKeysToTags(self._external_additional_service_keys_to_tags)

            num_urls_total = len(file_seeds)

            (num_urls_added, num_urls_already_in_file_seed_cache, can_search_for_more_files, stop_reason) = file_seeds_callable(file_seeds)

            status = CC.STATUS_SUCCESSFUL_AND_NEW

            note = HydrusData.ToHumanInt(num_urls_added) + ' new urls found '

            if num_urls_already_in_file_seed_cache > 0:
                note += ' ( ' + HydrusData.ToHumanInt(num_urls_already_in_file_seed_cache) + ' of page already in) '

            if not can_search_for_more_files:
                note += ' - ' + stop_reason

            if parser.CanOnlyGenerateGalleryURLs() or self._force_next_page_url_generation:
                can_add_more_gallery_urls = True
            else:
                # only keep searching if we found any files, otherwise this could be a blank results page with another stub page
                can_add_more_gallery_urls = num_urls_added > 0 and can_search_for_more_files

            flattened_results = list(itertools.chain.from_iterable(all_parse_results))

            sub_gallery_urls = ClientParsing.GetURLsFromParseResults(flattened_results, (HC.URL_TYPE_SUB_GALLERY,), only_get_top_priority=True)

            sub_gallery_urls = HydrusData.DedupeList(sub_gallery_urls)

            new_sub_gallery_urls = [sub_gallery_url for sub_gallery_url in sub_gallery_urls if sub_gallery_url not in gallery_urls_seen_before]

            num_new_sub_gallery_urls = len(new_sub_gallery_urls)

            if num_new_sub_gallery_urls > 0:
                sub_gallery_seeds = [GallerySeed(sub_gallery_url) for sub_gallery_url in new_sub_gallery_urls]

                for sub_gallery_seed in sub_gallery_seeds:
                    # children share this seed's run token and tag settings
                    sub_gallery_seed.SetRunToken(self._run_token)
                    sub_gallery_seed.SetExternalFilterableTags(self._external_filterable_tags)
                    sub_gallery_seed.SetExternalAdditionalServiceKeysToTags(self._external_additional_service_keys_to_tags)

                gallery_seed_log.AddGallerySeeds(sub_gallery_seeds, parent_gallery_seed=self)

                added_new_gallery_pages = True

                gallery_urls_seen_before.update(sub_gallery_urls)

                note += ' - {} sub-gallery urls found '.format(HydrusData.ToHumanInt(num_new_sub_gallery_urls))

            if self._can_generate_more_pages and can_add_more_gallery_urls:
                next_page_urls = ClientParsing.GetURLsFromParseResults(flattened_results, (HC.URL_TYPE_NEXT,), only_get_top_priority=True)

                # a page that names itself as its own next page is not a real next page
                if self.url in next_page_urls:
                    next_page_urls.remove(self.url)

                if url_to_check in next_page_urls:
                    next_page_urls.remove(url_to_check)

                if len(next_page_urls) > 0:
                    next_page_generation_phrase = ' next gallery pages found '
                else:
                    # we have failed to parse a next page url, but we would still like one, so let's see if the url match can provide one

                    url_class = HG.client_controller.network_engine.domain_manager.GetURLClass(url_to_check)

                    if url_class is not None and url_class.CanGenerateNextGalleryPage():
                        try:
                            next_page_url = url_class.GetNextGalleryPage(url_to_check)

                            next_page_urls = [next_page_url]
                        except Exception:
                            note += ' - Attempted to generate a next gallery page url, but failed! '
                            note += os.linesep
                            note += traceback.format_exc()

                    next_page_generation_phrase = ' next gallery pages extrapolated from url class '

                if len(next_page_urls) > 0:
                    next_page_urls = HydrusData.DedupeList(next_page_urls)

                    new_next_page_urls = [next_page_url for next_page_url in next_page_urls if next_page_url not in gallery_urls_seen_before]

                    # compare against every candidate, not just the new ones:
                    # the new ones are by construction never in the seen set,
                    # so intersecting with them always gave an empty result
                    duplicate_next_page_urls = gallery_urls_seen_before.intersection(next_page_urls)

                    num_new_next_page_urls = len(new_next_page_urls)
                    num_dupe_next_page_urls = len(duplicate_next_page_urls)

                    if num_new_next_page_urls > 0:
                        next_gallery_seeds = [GallerySeed(next_page_url) for next_page_url in new_next_page_urls]

                        for next_gallery_seed in next_gallery_seeds:
                            next_gallery_seed.SetRunToken(self._run_token)
                            next_gallery_seed.SetReferralURL(url_for_child_referral)
                            next_gallery_seed.SetExternalFilterableTags(self._external_filterable_tags)
                            next_gallery_seed.SetExternalAdditionalServiceKeysToTags(self._external_additional_service_keys_to_tags)

                        gallery_seed_log.AddGallerySeeds(next_gallery_seeds, parent_gallery_seed=self)

                        added_new_gallery_pages = True

                        gallery_urls_seen_before.update(new_next_page_urls)

                        if num_dupe_next_page_urls == 0:
                            note += ' - ' + HydrusData.ToHumanInt(num_new_next_page_urls) + next_page_generation_phrase
                        else:
                            note += ' - ' + HydrusData.ToHumanInt(num_new_next_page_urls) + next_page_generation_phrase + ' , but ' + HydrusData.ToHumanInt(num_dupe_next_page_urls) + ' had already been visited this run and were not added '
                    else:
                        note += ' - ' + HydrusData.ToHumanInt(num_dupe_next_page_urls) + next_page_generation_phrase + ' , but they had already been visited this run and were not added '

        self.SetStatus(status, note=note)

    except HydrusExceptions.ShutdownException:
        pass

    except HydrusExceptions.VetoException as e:
        status = CC.STATUS_VETOED

        note = str(e)

        self.SetStatus(status, note=note)

        if isinstance(e, HydrusExceptions.CancelledException):
            status_hook(' cancelled! ')

            time.sleep(2)

    except HydrusExceptions.InsufficientCredentialsException:
        status = CC.STATUS_VETOED
        note = ' 403 '

        self.SetStatus(status, note=note)

        status_hook(' 403 ')

        time.sleep(2)

        # a 403 is reported to the caller through the same flag as a 404
        result_404 = True

    except HydrusExceptions.NotFoundException:
        status = CC.STATUS_VETOED
        note = ' 404 '

        self.SetStatus(status, note=note)

        status_hook(' 404 ')

        time.sleep(2)

        result_404 = True

    except Exception as e:
        status = CC.STATUS_ERROR

        self.SetStatus(status, exception=e)

        status_hook(' error! ')

        time.sleep(3)

        if isinstance(e, HydrusExceptions.NetworkException):  # so the larger queue can set a delaywork or whatever
            raise

    finally:
        gallery_seed_log.NotifyGallerySeedsUpdated((self,))

    return (num_urls_added, num_urls_already_in_file_seed_cache, num_urls_total, result_404, added_new_gallery_pages, stop_reason)
2018-07-04 20:48:28 +00:00
# Register GallerySeed in the serialisable type lookup under its SERIALISABLE_TYPE_GALLERY_SEED id.
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_GALLERY_SEED ] = GallerySeed
2018-06-27 19:27:05 +00:00
class GallerySeedLog ( HydrusSerialisable . SerialisableBase ) :
SERIALISABLE_TYPE = HydrusSerialisable . SERIALISABLE_TYPE_GALLERY_SEED_LOG
SERIALISABLE_NAME = ' Gallery Log '
SERIALISABLE_VERSION = 1
2018-08-22 21:10:59 +00:00
COMPACT_NUMBER = 100
2018-06-27 19:27:05 +00:00
def __init__(self):
    """Create an empty gallery log with a fresh key and a dirty status cache."""
    HydrusSerialisable.SerialisableBase.__init__(self)

    # guards the seed list, the index map and the status cache
    self._lock = threading.Lock()

    # ordered seeds plus a seed -> position map for fast membership and lookup
    self._gallery_seeds = HydrusSerialisable.SerialisableList()
    self._gallery_seeds_to_indices = {}

    self._gallery_seed_log_key = HydrusData.GenerateKey()

    # status summary is computed lazily; dirty means it must be regenerated
    self._status_cache = None
    self._status_dirty = True
def __len__ ( self ) :
return len ( self . _gallery_seeds )
def _GenerateStatus(self):
    """Recompute the cached status summary and mark it clean."""
    self._status_cache = GenerateGallerySeedLogStatus(self._GetStatusesToCounts())

    self._status_dirty = False
2020-06-11 12:01:08 +00:00
def _GetNextGallerySeed ( self , status : int ) - > typing . Optional [ GallerySeed ] :
for gallery_seed in self . _gallery_seeds :
if gallery_seed . status == status :
return gallery_seed
return None
2018-06-27 19:27:05 +00:00
def _GetStatusesToCounts ( self ) :
statuses_to_counts = collections . Counter ( )
for gallery_seed in self . _gallery_seeds :
statuses_to_counts [ gallery_seed . status ] + = 1
return statuses_to_counts
def _GetGallerySeeds ( self , status = None ) :
if status is None :
return list ( self . _gallery_seeds )
else :
return [ gallery_seed for gallery_seed in self . _gallery_seeds if gallery_seed . status == status ]
def _GetSerialisableInfo(self):
    """Serialise the log as the serialisable tuple of its seed list."""
    serialisable_gallery_seeds = self._gallery_seeds.GetSerialisableTuple()

    return serialisable_gallery_seeds
2018-06-27 19:27:05 +00:00
def _InitialiseFromSerialisableInfo(self, serialisable_info):
    """Rebuild the seed list and its index map from serialised data, under the lock."""
    with self._lock:
        restored_seeds = HydrusSerialisable.CreateFromSerialisableTuple(serialisable_info)

        self._gallery_seeds = restored_seeds

        # the index map is derived data, so regenerate it from the restored list
        self._gallery_seeds_to_indices = {
            seed: position
            for (position, seed) in enumerate(self._gallery_seeds)
        }
def _SetStatusDirty ( self ) :
self . _status_dirty = True
2023-01-25 22:59:39 +00:00
def AddGallerySeeds ( self , gallery_seeds , parent_gallery_seed : typing . Optional [ GallerySeed ] = None ) - > int :
2018-06-27 19:27:05 +00:00
if len ( gallery_seeds ) == 0 :
2018-07-04 20:48:28 +00:00
return 0
2018-06-27 19:27:05 +00:00
2020-07-08 22:00:33 +00:00
seen_urls = set ( )
2018-06-27 19:27:05 +00:00
new_gallery_seeds = [ ]
with self . _lock :
for gallery_seed in gallery_seeds :
2020-07-08 22:00:33 +00:00
if gallery_seed . url in seen_urls :
continue
2018-06-27 19:27:05 +00:00
if gallery_seed in self . _gallery_seeds_to_indices :
continue
new_gallery_seeds . append ( gallery_seed )
2023-01-25 22:59:39 +00:00
seen_urls . add ( gallery_seed . url )
2018-06-27 19:27:05 +00:00
2023-01-25 22:59:39 +00:00
if len ( new_gallery_seeds ) == 0 :
2018-06-27 19:27:05 +00:00
2023-01-25 22:59:39 +00:00
return 0
if parent_gallery_seed is None or parent_gallery_seed not in self . _gallery_seeds :
insertion_index = len ( self . _gallery_seeds )
else :
insertion_index = self . _gallery_seeds . index ( parent_gallery_seed ) + 1
original_insertion_index = insertion_index
for gallery_seed in new_gallery_seeds :
self . _gallery_seeds . insert ( insertion_index , gallery_seed )
2020-07-08 22:00:33 +00:00
2023-01-25 22:59:39 +00:00
insertion_index + = 1
self . _gallery_seeds_to_indices = { gallery_seed : index for ( index , gallery_seed ) in enumerate ( self . _gallery_seeds ) }
2018-06-27 19:27:05 +00:00
self . _SetStatusDirty ( )
2023-01-25 22:59:39 +00:00
updated_gallery_seeds = self . _gallery_seeds [ original_insertion_index : ]
2018-06-27 19:27:05 +00:00
2023-01-25 22:59:39 +00:00
self . NotifyGallerySeedsUpdated ( updated_gallery_seeds )
2018-06-27 19:27:05 +00:00
return len ( new_gallery_seeds )
def AdvanceGallerySeed ( self , gallery_seed ) :
2023-01-25 22:59:39 +00:00
updated_gallery_seeds = [ ]
2018-06-27 19:27:05 +00:00
with self . _lock :
if gallery_seed in self . _gallery_seeds_to_indices :
index = self . _gallery_seeds_to_indices [ gallery_seed ]
if index > 0 :
2023-01-25 22:59:39 +00:00
swapped_gallery_seed = self . _gallery_seeds [ index - 1 ]
2018-06-27 19:27:05 +00:00
self . _gallery_seeds . remove ( gallery_seed )
self . _gallery_seeds . insert ( index - 1 , gallery_seed )
2023-01-25 22:59:39 +00:00
self . _gallery_seeds_to_indices [ gallery_seed ] = index - 1
self . _gallery_seeds_to_indices [ swapped_gallery_seed ] = index
updated_gallery_seeds = ( gallery_seed , swapped_gallery_seed )
2018-06-27 19:27:05 +00:00
2023-01-25 22:59:39 +00:00
self . NotifyGallerySeedsUpdated ( updated_gallery_seeds )
2018-06-27 19:27:05 +00:00
2018-08-15 20:40:30 +00:00
def CanCompact ( self , compact_before_this_source_time ) :
with self . _lock :
2018-08-22 21:10:59 +00:00
if len ( self . _gallery_seeds ) < = self . COMPACT_NUMBER :
2018-08-15 20:40:30 +00:00
return False
2018-08-22 21:10:59 +00:00
for gallery_seed in self . _gallery_seeds [ : - self . COMPACT_NUMBER ] :
2018-08-15 20:40:30 +00:00
if gallery_seed . status == CC . STATUS_UNKNOWN :
continue
if gallery_seed . created < compact_before_this_source_time :
return True
return False
2018-10-17 21:00:09 +00:00
def CanRestartFailedSearch ( self ) :
with self . _lock :
if len ( self . _gallery_seeds ) == 0 :
return False
last_gallery_seed = self . _gallery_seeds [ - 1 ]
if last_gallery_seed . status == CC . STATUS_ERROR :
return True
2018-08-15 20:40:30 +00:00
def Compact ( self , compact_before_this_source_time ) :
with self . _lock :
2018-08-22 21:10:59 +00:00
if len ( self . _gallery_seeds ) < = self . COMPACT_NUMBER :
2018-08-15 20:40:30 +00:00
return
new_gallery_seeds = HydrusSerialisable . SerialisableList ( )
2018-08-22 21:10:59 +00:00
for gallery_seed in self . _gallery_seeds [ : - self . COMPACT_NUMBER ] :
2018-08-15 20:40:30 +00:00
still_to_do = gallery_seed . status == CC . STATUS_UNKNOWN
still_relevant = gallery_seed . created > compact_before_this_source_time
if still_to_do or still_relevant :
new_gallery_seeds . append ( gallery_seed )
2018-08-22 21:10:59 +00:00
new_gallery_seeds . extend ( self . _gallery_seeds [ - self . COMPACT_NUMBER : ] )
2018-08-15 20:40:30 +00:00
self . _gallery_seeds = new_gallery_seeds
self . _gallery_seeds_to_indices = { gallery_seed : index for ( index , gallery_seed ) in enumerate ( self . _gallery_seeds ) }
self . _SetStatusDirty ( )
2018-06-27 19:27:05 +00:00
def DelayGallerySeed ( self , gallery_seed ) :
2023-01-25 22:59:39 +00:00
updated_gallery_seeds = [ ]
2018-06-27 19:27:05 +00:00
with self . _lock :
if gallery_seed in self . _gallery_seeds_to_indices :
index = self . _gallery_seeds_to_indices [ gallery_seed ]
if index < len ( self . _gallery_seeds ) - 1 :
2023-01-25 22:59:39 +00:00
swapped_gallery_seed = self . _gallery_seeds [ index + 1 ]
2018-06-27 19:27:05 +00:00
self . _gallery_seeds . remove ( gallery_seed )
self . _gallery_seeds . insert ( index + 1 , gallery_seed )
2023-01-25 22:59:39 +00:00
self . _gallery_seeds_to_indices [ swapped_gallery_seed ] = index
self . _gallery_seeds_to_indices [ gallery_seed ] = index + 1
updated_gallery_seeds = ( swapped_gallery_seed , gallery_seed )
2018-06-27 19:27:05 +00:00
2023-01-25 22:59:39 +00:00
self . NotifyGallerySeedsUpdated ( updated_gallery_seeds )
2018-06-27 19:27:05 +00:00
2020-06-11 12:01:08 +00:00
def GetExampleGallerySeed ( self ) :
2018-06-27 19:27:05 +00:00
with self . _lock :
2020-06-11 12:01:08 +00:00
if len ( self . _gallery_seeds ) == 0 :
return None
else :
2018-06-27 19:27:05 +00:00
2020-06-11 12:01:08 +00:00
example_seed = self . _GetNextGallerySeed ( CC . STATUS_UNKNOWN )
if example_seed is None :
2018-06-27 19:27:05 +00:00
2020-06-11 12:01:08 +00:00
example_seed = random . choice ( self . _gallery_seeds [ - 10 : ] )
2018-06-27 19:27:05 +00:00
2020-06-11 12:01:08 +00:00
return example_seed
2018-06-27 19:27:05 +00:00
2019-08-21 21:34:01 +00:00
def GetAPIInfoDict ( self , simple ) :
with self . _lock :
d = { }
if self . _status_dirty :
self . _GenerateStatus ( )
( status , ( total_processed , total ) ) = self . _status_cache
d [ ' status ' ] = status
d [ ' total_processed ' ] = total_processed
d [ ' total_to_process ' ] = total
if not simple :
d [ ' log_items ' ] = [ gallery_seed . GetAPIInfoDict ( simple ) for gallery_seed in self . _gallery_seeds ]
return d
2018-06-27 19:27:05 +00:00
def GetGallerySeedLogKey ( self ) :
return self . _gallery_seed_log_key
def GetGallerySeedCount ( self , status = None ) :
result = 0
with self . _lock :
if status is None :
result = len ( self . _gallery_seeds )
else :
for gallery_seed in self . _gallery_seeds :
if gallery_seed . status == status :
result + = 1
return result
def GetGallerySeeds ( self , status = None ) :
with self . _lock :
return self . _GetGallerySeeds ( status )
def GetGallerySeedIndex(self, gallery_seed):
    """Look up ``gallery_seed`` in the seed-to-index map and return its index.

    The lookup is a plain subscript on ``_gallery_seeds_to_indices``, so a
    seed that is not in the map propagates that mapping's lookup error.
    """
    with self._lock:
        index_lookup = self._gallery_seeds_to_indices

        return index_lookup[gallery_seed]
2020-06-11 12:01:08 +00:00
def GetNextGallerySeed(self, status):
    """Locked wrapper: return ``_GetNextGallerySeed(status)`` while holding the lock."""
    with self._lock:
        next_seed = self._GetNextGallerySeed(status)

        return next_seed
2018-06-27 19:27:05 +00:00
2020-06-11 12:01:08 +00:00
def GetStatus(self):
    """Return the cached status, regenerating it first if it is flagged dirty.

    The value handed back is whatever ``_status_cache`` holds after the
    optional ``_GenerateStatus()`` call.
    """
    with self._lock:
        cache_is_stale = self._status_dirty

        if cache_is_stale:
            self._GenerateStatus()

        return self._status_cache
2018-06-27 19:27:05 +00:00
def GetStatusesToCounts(self):
    """Locked wrapper: return ``_GetStatusesToCounts()`` while holding the lock."""
    with self._lock:
        counts_by_status = self._GetStatusesToCounts()

        return counts_by_status
def HasGallerySeed(self, gallery_seed):
    """Return True if ``gallery_seed`` is a key of the seed-to-index map."""
    with self._lock:
        index_lookup = self._gallery_seeds_to_indices

        return gallery_seed in index_lookup
2018-07-04 20:48:28 +00:00
def HasGalleryURL(self, url):
    """Return True if any seed in the log has the same ``url`` as a seed built from ``url``.

    The argument is first wrapped in a throwaway ``GallerySeed`` and that
    object's ``url`` attribute is what gets compared against each existing
    seed's ``url`` (presumably so both sides are in the same normalised
    form -- verify against ``GallerySeed``).

    NOTE(review): unlike the neighbouring accessors, this reads
    ``_gallery_seeds`` without taking ``self._lock`` -- confirm that is
    intentional.
    """
    probe_url = GallerySeed(url).url

    existing_urls = (seed.url for seed in self._gallery_seeds)

    return probe_url in existing_urls
2018-06-27 19:27:05 +00:00
def NotifyGallerySeedsUpdated(self, gallery_seeds):
    """Flag the status cache dirty and publish an update for ``gallery_seeds``.

    Does nothing when ``gallery_seeds`` is empty. Otherwise the status is
    marked dirty under the lock, and then the seeds are published through
    ``HG.client_controller.pub`` together with this log's key.

    NOTE(review): the topic literal is reproduced exactly as found,
    including the padding spaces inside the quotes -- confirm that padding
    is intended.
    """
    if len(gallery_seeds) == 0:
        # Nothing changed, so there is nothing to invalidate or announce.
        return

    with self._lock:
        self._SetStatusDirty()

    topic = ' gallery_seed_log_gallery_seeds_updated '
    log_key = self._gallery_seed_log_key

    HG.client_controller.pub(topic, log_key, gallery_seeds)
2023-01-25 22:59:39 +00:00
def RemoveGallerySeeds(self, gallery_seeds_to_delete):
    """Remove the given seeds from the log and notify about everything affected.

    Seeds that are not currently in the seed-to-index map are ignored; if
    none remain, the method returns without changing anything. Otherwise
    the seed list and the index map are rebuilt without the removed seeds,
    the status is flagged dirty, and a notification is sent covering the
    removed seeds plus every surviving seed from the earliest removed
    position onwards.
    """
    with self._lock:
        index_lookup = self._gallery_seeds_to_indices

        removable = {
            seed for seed in gallery_seeds_to_delete if seed in index_lookup
        }

        if len(removable) == 0:
            return

        # Survivors from this position onwards are included in the
        # notification alongside the removed seeds.
        earliest_affected_index = min(index_lookup[seed] for seed in removable)

        survivors = [seed for seed in self._gallery_seeds if seed not in removable]

        self._gallery_seeds = HydrusSerialisable.SerialisableList(survivors)
        self._gallery_seeds_to_indices = {
            seed: position for (position, seed) in enumerate(self._gallery_seeds)
        }

        self._SetStatusDirty()

        shifted_seeds = self._gallery_seeds[earliest_affected_index:]

        updated_gallery_seeds = removable.union(shifted_seeds)

    # NotifyGallerySeedsUpdated takes the lock itself, so call it after release.
    self.NotifyGallerySeedsUpdated(updated_gallery_seeds)
2018-06-27 19:27:05 +00:00
def RemoveGallerySeedsByStatus(self, statuses_to_remove):
    """Remove every seed whose ``status`` is contained in ``statuses_to_remove``.

    The matching seeds are collected under the lock and then handed to
    ``RemoveGallerySeeds``.
    """
    with self._lock:
        doomed_seeds = [
            seed for seed in self._gallery_seeds if seed.status in statuses_to_remove
        ]

    # RemoveGallerySeeds takes the lock itself, so call it after release.
    self.RemoveGallerySeeds(doomed_seeds)
def RemoveAllButUnknownGallerySeeds(self):
    """Remove every seed whose ``status`` is not ``CC.STATUS_UNKNOWN``.

    The matching seeds are collected under the lock and then handed to
    ``RemoveGallerySeeds``.
    """
    with self._lock:
        doomed_seeds = [
            seed for seed in self._gallery_seeds if seed.status != CC.STATUS_UNKNOWN
        ]

    # RemoveGallerySeeds takes the lock itself, so call it after release.
    self.RemoveGallerySeeds(doomed_seeds)
2018-10-17 21:00:09 +00:00
def RestartFailedSearch(self):
    """If the last seed in the log has error status, append a restarted copy of it.

    Returns without doing anything when the log is empty or the last
    seed's status is not ``CC.STATUS_ERROR``. Otherwise the last seed's
    ``GenerateRestartedDuplicate(True)`` result is added through
    ``AddGallerySeeds`` and then announced through
    ``NotifyGallerySeedsUpdated``.
    """
    with self._lock:
        if len(self._gallery_seeds) == 0:
            return

        final_seed = self._gallery_seeds[-1]

        if final_seed.status != CC.STATUS_ERROR:
            return

        can_generate_more_pages = True

        restarted_seed = final_seed.GenerateRestartedDuplicate(can_generate_more_pages)

        new_gallery_seeds = (restarted_seed,)

    # Called after the lock is released; NotifyGallerySeedsUpdated takes the lock itself.
    self.AddGallerySeeds(new_gallery_seeds)
    self.NotifyGallerySeedsUpdated(new_gallery_seeds)
2020-06-11 12:01:08 +00:00
def RetryFailed(self):
    """Reset every seed with error status back to unknown status.

    The seeds returned by ``_GetGallerySeeds(CC.STATUS_ERROR)`` each get
    ``SetStatus(CC.STATUS_UNKNOWN)`` under the lock; the same collection is
    then passed to ``NotifyGallerySeedsUpdated``.
    """
    with self._lock:
        errored_seeds = self._GetGallerySeeds(CC.STATUS_ERROR)

        for seed in errored_seeds:
            seed.SetStatus(CC.STATUS_UNKNOWN)

    # NotifyGallerySeedsUpdated takes the lock itself, so call it after release.
    self.NotifyGallerySeedsUpdated(errored_seeds)
def WorkToDo(self):
    """Return True while the processed count in the status cache is below the total.

    The status cache is regenerated first if it is flagged dirty.
    """
    with self._lock:
        if self._status_dirty:
            self._GenerateStatus()

        _status_text, (num_processed, num_total) = self._status_cache

        return num_processed < num_total
# Register GallerySeedLog in HydrusSerialisable's type-to-class map under
# SERIALISABLE_TYPE_GALLERY_SEED_LOG (presumably so stored objects of that
# type can be mapped back to this class -- verify in HydrusSerialisable).
HydrusSerialisable . SERIALISABLE_TYPES_TO_OBJECT_TYPES [ HydrusSerialisable . SERIALISABLE_TYPE_GALLERY_SEED_LOG ] = GallerySeedLog