# hydrus/include/ClientImporting.py

import bs4
import ClientConstants as CC
import ClientData
import ClientDefaults
import ClientDownloading
import ClientFiles
import ClientImageHandling
import ClientImportOptions
import ClientNetworkingContexts
import ClientNetworkingDomain
import ClientNetworkingJobs
import ClientParsing
import ClientPaths
import ClientTags
import ClientThreading
import collections
import HydrusConstants as HC
import HydrusData
import HydrusExceptions
import HydrusFileHandling
import HydrusImageHandling
import HydrusGlobals as HG
import HydrusPaths
import HydrusSerialisable
import HydrusTags
import HydrusText
import json
import os
import random
import re
import shutil
import threading
import time
import traceback
import urlparse
import wx
import HydrusThreading
CHECKER_STATUS_OK = 0
CHECKER_STATUS_DEAD = 1
CHECKER_STATUS_404 = 2
DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME = 0.1
REPEATING_JOB_TYPICAL_PERIOD = 30.0
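
# The functions below are small closure factories: each binds a context key
# (a page, subscription or watcher) into a network job constructor or a popup
# presentation context, so downstream import code can create correctly
# attributed network jobs without knowing which UI object owns them.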
def GenerateDownloaderNetworkJobFactory( page_key ):
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, *args, **kwargs )
        
        return network_job
    
    return network_job_factory
def GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key ):
    
    def network_job_presentation_context_factory( network_job ):
        
        def enter_call():
            
            job_key.SetVariable( 'popup_network_job', network_job )
        
        def exit_call():
            
            pass
        
        return NetworkJobPresentationContext( enter_call, exit_call )
    
    return network_job_presentation_context_factory
def GenerateSeedCacheStatus( statuses_to_counts ):
    
    num_successful_and_new = statuses_to_counts[ CC.STATUS_SUCCESSFUL_AND_NEW ]
    num_successful_but_redundant = statuses_to_counts[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ]
    num_ignored = statuses_to_counts[ CC.STATUS_VETOED ]
    num_deleted = statuses_to_counts[ CC.STATUS_DELETED ]
    num_failed = statuses_to_counts[ CC.STATUS_ERROR ]
    num_skipped = statuses_to_counts[ CC.STATUS_SKIPPED ]
    num_unknown = statuses_to_counts[ CC.STATUS_UNKNOWN ]
    
    status_strings = []
    
    num_successful = num_successful_and_new + num_successful_but_redundant
    
    if num_successful > 0:
        
        s = HydrusData.ConvertIntToPrettyString( num_successful ) + ' successful'
        
        if num_successful_and_new > 0:
            
            if num_successful_but_redundant > 0:
                
                s += ' (' + HydrusData.ConvertIntToPrettyString( num_successful_but_redundant ) + ' already in db)'
            
        else:
            
            s += ' (all already in db)'
        
        status_strings.append( s )
    
    if num_ignored > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_ignored ) + ' ignored' )
    
    if num_deleted > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_deleted ) + ' previously deleted' )
    
    if num_failed > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_failed ) + ' failed' )
    
    if num_skipped > 0:
        status_strings.append( HydrusData.ConvertIntToPrettyString( num_skipped ) + ' skipped' )
    
    status = ', '.join( status_strings )
    
    total = sum( statuses_to_counts.values() )
    total_processed = total - num_unknown
    
    return ( status, ( total_processed, total ) )
def GenerateSinglePopupNetworkJobPresentationContextFactory( job_key ):
    
    def network_job_presentation_context_factory( network_job ):
        
        def enter_call():
            
            job_key.SetVariable( 'popup_network_job', network_job )
        
        def exit_call():
            
            job_key.DeleteVariable( 'popup_network_job' )
        
        return NetworkJobPresentationContext( enter_call, exit_call )
    
    return network_job_presentation_context_factory
def GenerateSubscriptionNetworkJobFactory( subscription_key ):
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJobSubscription( subscription_key, *args, **kwargs )
        
        network_job.OverrideBandwidth( 30 )
        
        return network_job
    
    return network_job_factory

def GenerateWatcherNetworkJobFactory( watcher_key ):
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJobWatcherPage( watcher_key, *args, **kwargs )
        
        return network_job
    
    return network_job_factory
def GetRepeatingJobInitialDelay():
    
    return 0.5 + ( random.random() * 0.5 )

def PageImporterShouldStopWorking( page_key ):
    
    return HG.view_shutdown or not HG.client_controller.PageAlive( page_key )

def PublishPresentationHashes( name, hashes, publish_to_popup_button, publish_files_to_page ):
    
    if publish_to_popup_button:
        
        files_job_key = ClientThreading.JobKey()
        
        files_job_key.SetVariable( 'popup_files_mergable', True )
        files_job_key.SetVariable( 'popup_files', ( list( hashes ), name ) )
        
        HG.client_controller.pub( 'message', files_job_key )
    
    if publish_files_to_page:
        
        HG.client_controller.pub( 'imported_files_to_page', list( hashes ), name )
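
# Worker-thread entry points (hence the THREAD prefix). They run off the UI
# thread and report all progress back through job_key popup variables.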
def THREADDownloadURL( job_key, url, url_string ):
    
    job_key.SetVariable( 'popup_title', url_string )
    job_key.SetVariable( 'popup_text_1', 'downloading and importing' )
    
    #
    
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        
        network_job.OverrideBandwidth( 30 )
        
        return network_job
    
    network_job_presentation_context_factory = GenerateSinglePopupNetworkJobPresentationContextFactory( job_key )
    
    seed = Seed( SEED_TYPE_URL, url )
    
    #
    
    try:
        
        seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory )
        
        status = seed.status
        
        if status in CC.SUCCESSFUL_IMPORT_STATES:
            
            if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                
                job_key.SetVariable( 'popup_text_1', 'successful!' )
                
            elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                
                job_key.SetVariable( 'popup_text_1', 'was already in the database!' )
            
            hash = seed.GetHash()
            
            job_key.SetVariable( 'popup_files', ( [ hash ], 'download' ) )
            
        elif status == CC.STATUS_DELETED:
            
            job_key.SetVariable( 'popup_text_1', 'had already been deleted!' )
        
    finally:
        
        job_key.Finish()
def THREADDownloadURLs( job_key, urls, title ):
    
    job_key.SetVariable( 'popup_title', title )
    job_key.SetVariable( 'popup_text_1', 'initialising' )
    
    num_successful = 0
    num_redundant = 0
    num_deleted = 0
    num_failed = 0
    
    presentation_hashes = []
    presentation_hashes_fast = set()
    
    file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
    
    def network_job_factory( *args, **kwargs ):
        
        network_job = ClientNetworkingJobs.NetworkJob( *args, **kwargs )
        
        network_job.OverrideBandwidth()
        
        return network_job
    
    network_job_presentation_context_factory = GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key )
    
    for ( i, url ) in enumerate( urls ):
        
        ( i_paused, should_quit ) = job_key.WaitIfNeeded()
        
        if should_quit:
            break
        
        job_key.SetVariable( 'popup_text_1', HydrusData.ConvertValueRangeToPrettyString( i + 1, len( urls ) ) )
        job_key.SetVariable( 'popup_gauge_1', ( i + 1, len( urls ) ) )
        
        seed = Seed( SEED_TYPE_URL, url )
        
        try:
            
            seed.DownloadAndImportRawFile( url, file_import_options, network_job_factory, network_job_presentation_context_factory )
            
            status = seed.status
            
            if status in CC.SUCCESSFUL_IMPORT_STATES:
                
                if status == CC.STATUS_SUCCESSFUL_AND_NEW:
                    
                    num_successful += 1
                    
                elif status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    
                    num_redundant += 1
                
                hash = seed.GetHash()
                
                if hash not in presentation_hashes_fast:
                    
                    presentation_hashes.append( hash )
                    presentation_hashes_fast.add( hash )
                
            elif status == CC.STATUS_DELETED:
                
                num_deleted += 1
            
        except Exception as e:
            
            num_failed += 1
            
            HydrusData.Print( url + ' failed to import!' )
            HydrusData.PrintException( e )
    
    job_key.DeleteVariable( 'popup_network_job' )
    
    text_components = []
    
    if num_successful > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_successful ) + ' successful' )
    
    if num_redundant > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_redundant ) + ' already in db' )
    
    if num_deleted > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_deleted ) + ' deleted' )
    
    if num_failed > 0:
        text_components.append( HydrusData.ConvertIntToPrettyString( num_failed ) + ' failed (errors written to log)' )
    
    job_key.SetVariable( 'popup_text_1', ', '.join( text_components ) )
    
    if len( presentation_hashes ) > 0:
        
        job_key.SetVariable( 'popup_files', ( presentation_hashes, 'downloads' ) )
    
    job_key.DeleteVariable( 'popup_gauge_1' )
    
    job_key.Finish()
def UpdateSeedCacheWithAllParseResults( seed_cache, all_parse_results, source_url = None, tag_import_options = None ):
    
    # need a limit param here for 'stop at 40 total new because of file limit'
    
    new_seeds = []
    
    num_new = 0
    num_already_in = 0
    
    for parse_results in all_parse_results:
        
        parsed_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, HC.URL_TYPE_POST ), only_get_top_priority = True )
        
        for url in parsed_urls:
            
            seed = Seed( SEED_TYPE_URL, url )
            
            if source_url is not None:
                
                seed.AddURL( source_url )
            
            if seed_cache.HasSeed( seed ):
                
                num_already_in += 1
                
            else:
                
                num_new += 1
                
                seed.AddParseResults( parse_results )
                
                new_seeds.append( seed )
    
    seed_cache.AddSeeds( new_seeds )
    
    return ( num_new, num_already_in )
def WakeRepeatingJob( job ):
    
    if job is not None:
        
        job.Wake()
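
# FileImportJob bundles everything known about a single file mid-import: the
# temp path on disk, the pre-import status looked up from the db, and derived
# metadata (file info, thumbnail, perceptual hashes, extra hashes).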
class FileImportJob( object ):
    
    def __init__( self, temp_path, file_import_options = None ):
        
        if file_import_options is None:
            
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
        
        self._temp_path = temp_path
        self._file_import_options = file_import_options
        
        self._hash = None
        self._pre_import_status = None
        
        self._file_info = None
        self._thumbnail = None
        self._phashes = None
        self._extra_hashes = None
    
    def CheckIsGoodToImport( self ):
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        self._file_import_options.CheckFileIsValid( size, mime, width, height )
    
    def GetExtraHashes( self ):
        
        return self._extra_hashes
    
    def GetFileImportOptions( self ):
        
        return self._file_import_options
    
    def GetFileInfo( self ):
        
        return self._file_info
    
    def GetHash( self ):
        
        return self._hash
    
    def GetMime( self ):
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        return mime
    
    def GetPreImportStatus( self ):
        
        return self._pre_import_status
    
    def GetPHashes( self ):
        
        return self._phashes
    
    def GetTempPathAndThumbnail( self ):
        
        return ( self._temp_path, self._thumbnail )
    
    def PubsubContentUpdates( self ):
        
        if self._pre_import_status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
            
            if self._file_import_options.AutomaticallyArchives():
                
                service_keys_to_content_updates = { CC.COMBINED_LOCAL_FILE_SERVICE_KEY : [ HydrusData.ContentUpdate( HC.CONTENT_TYPE_FILES, HC.CONTENT_UPDATE_ARCHIVE, set( ( self._hash, ) ) ) ] }
                
                HG.client_controller.Write( 'content_updates', service_keys_to_content_updates )
    
    def IsNewToDB( self ):
        
        if self._pre_import_status == CC.STATUS_UNKNOWN:
            
            return True
        
        if self._pre_import_status == CC.STATUS_DELETED:
            
            if not self._file_import_options.ExcludesDeleted():
                
                return True
        
        return False
    
    def GenerateHashAndStatus( self ):
        
        HydrusImageHandling.ConvertToPngIfBmp( self._temp_path )
        
        self._hash = HydrusFileHandling.GetHashFromPath( self._temp_path )
        
        ( self._pre_import_status, hash, note ) = HG.client_controller.Read( 'hash_status', 'sha256', self._hash, prefix = 'recognised during import' )
        
        return ( self._pre_import_status, self._hash, note )
    
    def GenerateInfo( self ):
        
        mime = HydrusFileHandling.GetMime( self._temp_path )
        
        new_options = HG.client_controller.new_options
        
        if mime in HC.DECOMPRESSION_BOMB_IMAGES and not self._file_import_options.AllowsDecompressionBombs():
            
            if HydrusImageHandling.IsDecompressionBomb( self._temp_path ):
                
                raise HydrusExceptions.DecompressionBombException( 'Image seems to be a Decompression Bomb!' )
        
        self._file_info = HydrusFileHandling.GetFileInfo( self._temp_path, mime )
        
        ( size, mime, width, height, duration, num_frames, num_words ) = self._file_info
        
        if mime in HC.MIMES_WITH_THUMBNAILS:
            
            percentage_in = HG.client_controller.new_options.GetInteger( 'video_thumbnail_percentage_in' )
            
            self._thumbnail = HydrusFileHandling.GenerateThumbnail( self._temp_path, mime, percentage_in = percentage_in )
        
        if mime in HC.MIMES_WE_CAN_PHASH:
            
            self._phashes = ClientImageHandling.GenerateShapePerceptualHashes( self._temp_path, mime )
        
        self._extra_hashes = HydrusFileHandling.GetExtraHashesFromPath( self._temp_path )
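
# GalleryImport drives a multi-query gallery downloader page. A gallery
# repeating job walks each query's gallery 'streams' page by page, collecting
# file/post urls into the SeedCache, while a separate files repeating job
# works those seeds into actual file imports.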
class GalleryImport( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IMPORT
    SERIALISABLE_NAME = 'Gallery Import'
    SERIALISABLE_VERSION = 2
    
    def __init__( self, gallery_identifier = None ):
        
        if gallery_identifier is None:
            
            gallery_identifier = ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEVIANT_ART )
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._gallery_identifier = gallery_identifier
        
        self._gallery_stream_identifiers = ClientDownloading.GetGalleryStreamIdentifiers( self._gallery_identifier )
        
        self._current_query = None
        self._current_query_num_urls = 0
        
        self._current_gallery_stream_identifier = None
        self._current_gallery_stream_identifier_page_index = 0
        self._current_gallery_stream_identifier_found_urls = set()
        self._pending_gallery_stream_identifiers = []
        
        self._pending_queries = []
        
        new_options = HG.client_controller.new_options
        
        self._file_limit = HC.options[ 'gallery_file_limit' ]
        
        self._gallery_paused = False
        self._files_paused = False
        
        self._file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
        self._tag_import_options = new_options.GetDefaultTagImportOptions( self._gallery_identifier )
        
        self._last_gallery_page_hit_timestamp = 0
        
        self._seed_cache = SeedCache()
        
        self._lock = threading.Lock()
        
        self._gallery = None
        
        self._gallery_status = ''
        self._gallery_status_can_change_timestamp = 0
        
        self._current_action = ''
        
        self._download_control_file_set = None
        self._download_control_file_clear = None
        self._download_control_gallery_set = None
        self._download_control_gallery_clear = None
        
        self._files_repeating_job = None
        self._gallery_repeating_job = None
        
        HG.client_controller.sub( self, 'NotifySeedsUpdated', 'seed_cache_seeds_updated' )
    def _GetSerialisableInfo( self ):
        
        serialisable_gallery_identifier = self._gallery_identifier.GetSerialisableTuple()
        serialisable_gallery_stream_identifiers = [ gallery_stream_identifier.GetSerialisableTuple() for gallery_stream_identifier in self._gallery_stream_identifiers ]
        
        if self._current_gallery_stream_identifier is None:
            
            serialisable_current_gallery_stream_identifier = None
            
        else:
            
            serialisable_current_gallery_stream_identifier = self._current_gallery_stream_identifier.GetSerialisableTuple()
        
        serialisable_current_gallery_stream_identifier_found_urls = list( self._current_gallery_stream_identifier_found_urls )
        serialisable_pending_gallery_stream_identifiers = [ pending_gallery_stream_identifier.GetSerialisableTuple() for pending_gallery_stream_identifier in self._pending_gallery_stream_identifiers ]
        
        serialisable_file_options = self._file_import_options.GetSerialisableTuple()
        serialisable_tag_options = self._tag_import_options.GetSerialisableTuple()
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        
        serialisable_current_query_stuff = ( self._current_query, self._current_query_num_urls, serialisable_current_gallery_stream_identifier, self._current_gallery_stream_identifier_page_index, serialisable_current_gallery_stream_identifier_found_urls, serialisable_pending_gallery_stream_identifiers )
        
        return ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, self._pending_queries, self._file_limit, self._gallery_paused, self._files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, self._pending_queries, self._file_limit, self._gallery_paused, self._files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache ) = serialisable_info
        
        ( self._current_query, self._current_query_num_urls, serialisable_current_gallery_stream_identifier, self._current_gallery_stream_identifier_page_index, serialisable_current_gallery_stream_identifier_found_urls, serialisable_pending_gallery_stream_identifiers ) = serialisable_current_query_stuff
        
        self._gallery_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_identifier )
        
        self._gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_stream_identifier ) for serialisable_gallery_stream_identifier in serialisable_gallery_stream_identifiers ]
        
        if serialisable_current_gallery_stream_identifier is None:
            
            self._current_gallery_stream_identifier = None
            
        else:
            
            self._current_gallery_stream_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_current_gallery_stream_identifier )
        
        self._current_gallery_stream_identifier_found_urls = set( serialisable_current_gallery_stream_identifier_found_urls )
        
        self._pending_gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_pending_gallery_stream_identifier ) for serialisable_pending_gallery_stream_identifier in serialisable_pending_gallery_stream_identifiers ]
        
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
        self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
    def _FileNetworkJobPresentationContextFactory( self, network_job ):
        
        def enter_call():
            
            with self._lock:
                
                if self._download_control_file_set is not None:
                    
                    wx.CallAfter( self._download_control_file_set, network_job )
        
        def exit_call():
            
            with self._lock:
                
                if self._download_control_file_clear is not None:
                    
                    wx.CallAfter( self._download_control_file_clear )
        
        return NetworkJobPresentationContext( enter_call, exit_call )
    
    def _SetGalleryStatus( self, status, timeout = None ):
        
        if HydrusData.TimeHasPassed( self._gallery_status_can_change_timestamp ):
            
            self._gallery_status = status
            
            if timeout is not None:
                
                self._gallery_status_can_change_timestamp = HydrusData.GetNow() + timeout
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, pending_queries, get_tags_if_url_known_and_file_redundant, file_limit, gallery_paused, files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache ) = old_serialisable_info
            
            new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_current_query_stuff, pending_queries, file_limit, gallery_paused, files_paused, serialisable_file_options, serialisable_tag_options, serialisable_seed_cache )
            
            return ( 2, new_serialisable_info )
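    
    # Works one 'unknown' seed from the cache: predicts its status from known
    # urls/hashes, then downloads, imports and writes content updates as needed.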
    def _WorkOnFiles( self, page_key ):
        
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if seed is None:
            
            return
        
        did_substantial_work = False
        
        def network_job_factory( method, url, **kwargs ):
            
            network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, method, url, **kwargs )
            
            wx.CallAfter( self._download_control_file_set, network_job )
            
            return network_job
        
        try:
            
            gallery = ClientDownloading.GetGallery( self._gallery_identifier )
            
        except Exception as e:
            
            HydrusData.PrintException( e )
            
            with self._lock:
                
                self._files_paused = True
                self._gallery_paused = True
                
                HydrusData.ShowText( 'A downloader could not load its gallery! It has been paused and the full error has been written to the log!' )
                
                return
        
        gallery.SetNetworkJobFactory( network_job_factory )
        
        try:
            
            if seed.WorksInNewSystem():
                
                def status_hook( text ):
                    
                    with self._lock:
                        
                        self._current_action = text
                
                did_substantial_work = seed.WorkOnPostURL( self._file_import_options, self._tag_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._FileNetworkJobPresentationContextFactory )
                
                if seed.ShouldPresent( self._file_import_options ):
                    
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
                
            else:
                
                with self._lock:
                    
                    self._current_action = 'reviewing file'
                
                seed.PredictPreImportStatus( self._file_import_options )
                
                status = seed.status
                url = seed.seed_data
                
                if status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
                    
                    if self._tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB() and self._tag_import_options.WorthFetchingTags():
                        
                        downloaded_tags = gallery.GetTags( url )
                        
                        seed.AddTags( downloaded_tags )
                    
                elif status == CC.STATUS_UNKNOWN:
                    
                    ( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
                    
                    try:
                        
                        with self._lock:
                            
                            self._current_action = 'downloading file'
                        
                        if self._tag_import_options.WorthFetchingTags():
                            
                            downloaded_tags = gallery.GetFileAndTags( temp_path, url )
                            
                            seed.AddTags( downloaded_tags )
                            
                        else:
                            
                            gallery.GetFile( temp_path, url )
                        
                        seed.CheckPreFetchMetadata( self._tag_import_options )
                        
                        with self._lock:
                            
                            self._current_action = 'importing file'
                        
                        seed.Import( temp_path, self._file_import_options )
                        
                        did_substantial_work = True
                        
                    finally:
                        
                        HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
                
                did_substantial_work = seed.WriteContentUpdates( self._tag_import_options )
                
                if seed.ShouldPresent( self._file_import_options ):
                    
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            note = HydrusData.ToUnicode( e )
            
            seed.SetStatus( status, note = note )
            
            if isinstance( e, HydrusExceptions.CancelledException ):
                
                time.sleep( 2 )
            
        except HydrusExceptions.NotFoundException:
            
            status = CC.STATUS_VETOED
            note = '404'
            
            seed.SetStatus( status, note = note )
            
            time.sleep( 2 )
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            seed.SetStatus( status, exception = e )
            
            time.sleep( 3 )
            
        finally:
            
            self._seed_cache.NotifySeedsUpdated( ( seed, ) )
            
            wx.CallAfter( self._download_control_file_clear )
            
            with self._lock:
                
                self._current_action = ''
        
        if did_substantial_work:
            
            time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
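    
    # Works one gallery page hit. Returns True if there is (or may be) more
    # gallery work to do, False when the current query is exhausted. Respects
    # the 'gallery_page_wait_period_pages' option between page hits.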
    def _WorkOnGallery( self, page_key ):
        
        with self._lock:
            
            if self._current_query is None:
                
                if len( self._pending_queries ) == 0:
                    
                    self._SetGalleryStatus( '' )
                    
                    return False
                    
                else:
                    
                    self._current_query = self._pending_queries.pop( 0 )
                    self._current_query_num_urls = 0
                    
                    self._current_gallery_stream_identifier = None
                    self._pending_gallery_stream_identifiers = list( self._gallery_stream_identifiers )
            
            if self._current_gallery_stream_identifier is None:
                
                if len( self._pending_gallery_stream_identifiers ) == 0:
                    
                    self._SetGalleryStatus( self._current_query + ': produced ' + HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls', 5 )
                    
                    self._current_query = None
                    
                    return False
                    
                else:
                    
                    self._current_gallery_stream_identifier = self._pending_gallery_stream_identifiers.pop( 0 )
                    
                    self._current_gallery_stream_identifier_page_index = 0
                    self._current_gallery_stream_identifier_found_urls = set()
            
            next_gallery_page_hit_timestamp = self._last_gallery_page_hit_timestamp + HG.client_controller.new_options.GetInteger( 'gallery_page_wait_period_pages' )
            
            if not HydrusData.TimeHasPassed( next_gallery_page_hit_timestamp ):
                
                if self._current_gallery_stream_identifier_page_index == 0:
                    
                    page_check_status = 'checking first page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
                    
                else:
                    
                    page_check_status = HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls found, checking next page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
                
                self._SetGalleryStatus( self._current_query + ': ' + page_check_status )
                
                return True
        
        def network_job_factory( method, url, **kwargs ):
            
            network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, method, url, **kwargs )
            
            network_job.OverrideBandwidth( 30 )
            
            wx.CallAfter( self._download_control_gallery_set, network_job )
            
            return network_job
        
        try:
            
            gallery = ClientDownloading.GetGallery( self._current_gallery_stream_identifier )
            
        except Exception as e:
            
            HydrusData.PrintException( e )
            
            with self._lock:
                
                self._files_paused = True
                self._gallery_paused = True
                
                HydrusData.ShowText( 'A downloader could not load its gallery! It has been paused and the full error has been written to the log!' )
                
                return False
        
        gallery.SetNetworkJobFactory( network_job_factory )
        
        query = self._current_query
        page_index = self._current_gallery_stream_identifier_page_index
        
        self._SetGalleryStatus( self._current_query + ': ' + HydrusData.ConvertIntToPrettyString( self._current_query_num_urls ) + ' urls found, now checking page ' + HydrusData.ConvertIntToPrettyString( self._current_gallery_stream_identifier_page_index + 1 ) )
        
        error_occurred = False
        
        num_already_in_seed_cache = 0
        new_seeds = []
        
        try:
            
            try:
                
                ( page_of_seeds, definitely_no_more_pages ) = gallery.GetPage( query, page_index )
                
            finally:
                
                self._last_gallery_page_hit_timestamp = HydrusData.GetNow()
            
            with self._lock:
                
                no_urls_found = len( page_of_seeds ) == 0
                
                page_of_urls = [ seed.seed_data for seed in page_of_seeds ]
                
                no_new_urls = len( self._current_gallery_stream_identifier_found_urls.intersection( page_of_urls ) ) == len( page_of_seeds )
                
                if definitely_no_more_pages or no_urls_found or no_new_urls:
                    
                    self._current_gallery_stream_identifier = None
                    
                else:
                    
                    self._current_gallery_stream_identifier_page_index += 1
                    
                    self._current_gallery_stream_identifier_found_urls.update( page_of_urls )
            
            for seed in page_of_seeds:
                
                if self._seed_cache.HasSeed( seed ):
                    
                    num_already_in_seed_cache += 1
                    
                else:
                    
                    with self._lock:
                        
                        if self._file_limit is not None and self._current_query_num_urls + 1 > self._file_limit:
                            
                            self._current_gallery_stream_identifier = None
                            self._pending_gallery_stream_identifiers = []
                            
                            break
                        
                        self._current_query_num_urls += 1
                    
                    new_seeds.append( seed )
            
            self._seed_cache.AddSeeds( new_seeds )
            
            if len( new_seeds ) > 0:
                
                WakeRepeatingJob( self._files_repeating_job )
            
        except Exception as e:
            
            if isinstance( e, HydrusExceptions.NotFoundException ):
                
                text = 'gallery 404'
                
            else:
                
                text = HydrusData.ToUnicode( e )
                
                HydrusData.DebugPrint( traceback.format_exc() )
            
            with self._lock:
                
                self._current_gallery_stream_identifier = None
                
                self._SetGalleryStatus( text, 5 )
            
            time.sleep( 5 )
            
        finally:
            
            wx.CallAfter( self._download_control_gallery_clear )
        
        with self._lock:
            
            status = query + ': ' + HydrusData.ConvertIntToPrettyString( len( new_seeds ) ) + ' new urls found'
            
            if num_already_in_seed_cache > 0:
                
                status += ' (' + HydrusData.ConvertIntToPrettyString( num_already_in_seed_cache ) + ' of last page already in queue)'
            
            self._SetGalleryStatus( status )
        
        return True
    def AdvanceQueries( self, queries ):
        
        with self._lock:
            
            queries_lookup = set( queries )
            
            for query in queries:
                
                if query in self._pending_queries:
                    
                    index = self._pending_queries.index( query )
                    
                    if index > 0 and self._pending_queries[ index - 1 ] not in queries_lookup:
                        
                        self._pending_queries.remove( query )
                        
                        self._pending_queries.insert( index - 1, query )
    
    def CurrentlyWorking( self ):
        
        with self._lock:
            
            finished = not self._seed_cache.WorkToDo()
            
            return not finished and not self._files_paused
    
    def DelayQueries( self, queries ):
        
        with self._lock:
            
            queries = list( queries )
            
            queries.reverse()
            
            queries_lookup = set( queries )
            
            for query in queries:
                
                if query in self._pending_queries:
                    
                    index = self._pending_queries.index( query )
                    
                    if index + 1 < len( self._pending_queries ) and self._pending_queries[ index + 1 ] not in queries_lookup:
                        
                        self._pending_queries.remove( query )
                        
                        self._pending_queries.insert( index + 1, query )
    
    def DeleteQueries( self, queries ):
        
        with self._lock:
            
            for query in queries:
                
                if query in self._pending_queries:
                    
                    self._pending_queries.remove( query )
    
    def FinishCurrentQuery( self ):
        
        with self._lock:
            
            self._current_query = None
            self._gallery_paused = False
        
        WakeRepeatingJob( self._gallery_repeating_job )
    
    def GetGalleryIdentifier( self ):
        
        return self._gallery_identifier
    
    def GetOptions( self ):
        
        with self._lock:
            
            return ( self._file_import_options, self._tag_import_options, self._file_limit )
    
    def GetSeedCache( self ):
        
        return self._seed_cache
    
    def GetStatus( self ):
        
        with self._lock:
            
            cancellable = self._current_query is not None
            
            return ( list( self._pending_queries ), self._gallery_status, self._current_action, self._files_paused, self._gallery_paused, cancellable )
    
    def GetValueRange( self ):
        
        with self._lock:
            
            return self._seed_cache.GetValueRange()
    
    def NotifySeedsUpdated( self, seed_cache_key, seeds ):
        
        if seed_cache_key == self._seed_cache.GetSeedCacheKey():
            
            WakeRepeatingJob( self._files_repeating_job )
    
    def PausePlayFiles( self ):
        
        with self._lock:
            
            self._files_paused = not self._files_paused
        
        WakeRepeatingJob( self._files_repeating_job )
    
    def PausePlayGallery( self ):
        
        with self._lock:
            
            self._gallery_paused = not self._gallery_paused
        
        WakeRepeatingJob( self._gallery_repeating_job )
    
    def PendQuery( self, query ):
        
        with self._lock:
            
            if query not in self._pending_queries:
                
                self._pending_queries.append( query )
        
        WakeRepeatingJob( self._gallery_repeating_job )
    
    def SetDownloadControls( self, file_download_control, gallery_download_control ):
        
        with self._lock:
            
            self._download_control_file_set = file_download_control.SetNetworkJob
            self._download_control_file_clear = file_download_control.ClearNetworkJob
            
            self._download_control_gallery_set = gallery_download_control.SetNetworkJob
            self._download_control_gallery_clear = gallery_download_control.ClearNetworkJob
    
    def SetFileLimit( self, file_limit ):
        
        with self._lock:
            
            self._file_limit = file_limit
    
    def SetFileImportOptions( self, file_import_options ):
        
        with self._lock:
            
            self._file_import_options = file_import_options
    
    def SetTagImportOptions( self, tag_import_options ):
        
        with self._lock:
            
            self._tag_import_options = tag_import_options
    
    def Start( self, page_key ):
        
        self._files_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
        self._gallery_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnGallery, page_key )
    def REPEATINGWorkOnFiles( self, page_key ):
        
        with self._lock:
            
            if PageImporterShouldStopWorking( page_key ):
                
                self._files_repeating_job.Cancel()
                
                return
            
            work_to_do = self._seed_cache.WorkToDo() and not ( self._files_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
        
        while work_to_do:
            
            try:
                
                self._WorkOnFiles( page_key )
                
                HG.client_controller.WaitUntilViewFree()
                
            except Exception as e:
                
                HydrusData.ShowException( e )
            
            with self._lock:
                
                if PageImporterShouldStopWorking( page_key ):
                    
                    self._files_repeating_job.Cancel()
                    
                    return
                
                work_to_do = self._seed_cache.WorkToDo() and not ( self._files_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
    
    def REPEATINGWorkOnGallery( self, page_key ):
        
        with self._lock:
            
            if PageImporterShouldStopWorking( page_key ):
                
                self._gallery_repeating_job.Cancel()
                
                return
            
            ok_to_work = not ( self._gallery_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
        
        while ok_to_work:
            
            try:
                
                work_to_do = self._WorkOnGallery( page_key )
                
                if work_to_do:
                    
                    time.sleep( 1 )
                    
                else:
                    
                    return
                
                HG.client_controller.WaitUntilViewFree()
                
            except Exception as e:
                
                HydrusData.ShowException( e )
            
            with self._lock:
                
                if PageImporterShouldStopWorking( page_key ):
                    
                    self._gallery_repeating_job.Cancel()
                    
                    return
                
                ok_to_work = not ( self._gallery_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IMPORT ] = GalleryImport
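
# HDDImport handles 'import files from your hard drive' pages: a static list
# of paths seeded at construction time, with optional per-path tags and
# optional deletion of the source files after a successful import.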
class HDDImport( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_HDD_IMPORT
    SERIALISABLE_NAME = 'Local File Import'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, paths = None, file_import_options = None, paths_to_tags = None, delete_after_success = None ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        if paths is None:
            
            self._seed_cache = None
            
        else:
            
            self._seed_cache = SeedCache()
            
            seeds = []
            
            for path in paths:
                
                seed = Seed( SEED_TYPE_HDD, path )
                
                try:
                    
                    s = os.stat( path )
                    
                    seed.source_time = int( min( s.st_mtime, s.st_ctime ) )
                    
                except:
                    
                    pass
                
                seeds.append( seed )
            
            self._seed_cache.AddSeeds( seeds )
        
        self._file_import_options = file_import_options
        self._paths_to_tags = paths_to_tags
        self._delete_after_success = delete_after_success
        
        self._current_action = ''
        self._paused = False
        
        self._lock = threading.Lock()
        
        self._files_repeating_job = None
        
        HG.client_controller.sub( self, 'NotifySeedsUpdated', 'seed_cache_seeds_updated' )
    
    def _GetSerialisableInfo( self ):
        
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        serialisable_options = self._file_import_options.GetSerialisableTuple()
        serialisable_paths_to_tags = { path : { service_key.encode( 'hex' ) : tags for ( service_key, tags ) in service_keys_to_tags.items() } for ( path, service_keys_to_tags ) in self._paths_to_tags.items() }
        
        return ( serialisable_seed_cache, serialisable_options, serialisable_paths_to_tags, self._delete_after_success, self._paused )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( serialisable_seed_cache, serialisable_options, serialisable_paths_to_tags, self._delete_after_success, self._paused ) = serialisable_info
        
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_options )
        self._paths_to_tags = { path : { service_key.decode( 'hex' ) : tags for ( service_key, tags ) in service_keys_to_tags.items() } for ( path, service_keys_to_tags ) in serialisable_paths_to_tags.items() }
    def _WorkOnFiles( self, page_key ):
        
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        if seed is None:
            
            return
        
        did_substantial_work = False
        
        path = seed.seed_data
        
        with self._lock:
            
            if path in self._paths_to_tags:
                
                service_keys_to_tags = self._paths_to_tags[ path ]
                
            else:
                
                service_keys_to_tags = {}
        
        try:
            
            if not os.path.exists( path ):
                
                raise Exception( 'Source file does not exist!' )
            
            with self._lock:
                
                self._current_action = 'importing'
            
            seed.ImportPath( self._file_import_options )
            
            did_substantial_work = True
            
            if seed.status in CC.SUCCESSFUL_IMPORT_STATES:
                
                hash = seed.GetHash()
                
                service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( { hash }, service_keys_to_tags )
                
                if len( service_keys_to_content_updates ) > 0:
                    
                    HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                    
                    did_substantial_work = True
                
                if seed.ShouldPresent( self._file_import_options ):
                    
                    seed.PresentToPage( page_key )
                    
                    did_substantial_work = True
                
                if self._delete_after_success:
                    
                    try:
                        
                        ClientPaths.DeletePath( path )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'While attempting to delete ' + path + ', the following error occurred:' )
                        HydrusData.ShowException( e )
                    
                    txt_path = path + '.txt'
                    
                    if os.path.exists( txt_path ):
                        
                        try:
                            
                            ClientPaths.DeletePath( txt_path )
                            
                        except Exception as e:
                            
                            HydrusData.ShowText( 'While attempting to delete ' + txt_path + ', the following error occurred:' )
                            HydrusData.ShowException( e )
            
        except HydrusExceptions.VetoException as e:
            
            status = CC.STATUS_VETOED
            note = HydrusData.ToUnicode( e )
            
            seed.SetStatus( status, note = note )
            
        except Exception as e:
            
            status = CC.STATUS_ERROR
            
            seed.SetStatus( status, exception = e )
            
        finally:
            
            self._seed_cache.NotifySeedsUpdated( ( seed, ) )
        
        with self._lock:
            
            self._current_action = ''
        
        if did_substantial_work:
            
            time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
    def CurrentlyWorking( self ):
        
        with self._lock:
            
            work_to_do = self._seed_cache.WorkToDo()
            
            return work_to_do and not self._paused
    
    def GetFileImportOptions( self ):
        
        with self._lock:
            
            return self._file_import_options
    
    def GetSeedCache( self ):
        
        return self._seed_cache
    
    def GetStatus( self ):
        
        with self._lock:
            
            return ( self._current_action, self._paused )
    
    def GetValueRange( self ):
        
        with self._lock:
            
            return self._seed_cache.GetValueRange()
    
    def NotifySeedsUpdated( self, seed_cache_key, seeds ):
        
        if seed_cache_key == self._seed_cache.GetSeedCacheKey():
            
            WakeRepeatingJob( self._files_repeating_job )
    
    def PausePlay( self ):
        
        with self._lock:
            
            self._paused = not self._paused
        
        WakeRepeatingJob( self._files_repeating_job )
    
    def SetFileImportOptions( self, file_import_options ):
        
        with self._lock:
            
            self._file_import_options = file_import_options
    
    def Start( self, page_key ):
        
        self._files_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
    
    def REPEATINGWorkOnFiles( self, page_key ):
        
        with self._lock:
            
            if PageImporterShouldStopWorking( page_key ):
                
                self._files_repeating_job.Cancel()
                
                return
            
            work_to_do = self._seed_cache.WorkToDo() and not ( self._paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
        
        while work_to_do:
            
            try:
                
                self._WorkOnFiles( page_key )
                
                HG.client_controller.WaitUntilViewFree()
                
            except Exception as e:
                
                HydrusData.ShowException( e )
            
            with self._lock:
                
                if PageImporterShouldStopWorking( page_key ):
                    
                    self._files_repeating_job.Cancel()
                    
                    return
                
                work_to_do = self._seed_cache.WorkToDo() and not ( self._paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_HDD_IMPORT ] = HDDImport
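
# ImportFolder is the serialisable backing for a timed 'watch this directory'
# import. DoWork is called periodically by a daemon; it rescans the folder,
# imports any new paths, and then applies the per-status actions
# (ignore/move/delete) to the paths it has processed.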
class ImportFolder( HydrusSerialisable.SerialisableBaseNamed ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_IMPORT_FOLDER
    SERIALISABLE_NAME = 'Import Folder'
    SERIALISABLE_VERSION = 6
    
    def __init__( self, name, path = '', file_import_options = None, tag_import_options = None, tag_service_keys_to_filename_tagging_options = None, mimes = None, actions = None, action_locations = None, period = 3600, check_regularly = True, show_working_popup = True, publish_files_to_popup_button = True, publish_files_to_page = False ):
        
        if mimes is None:
            
            mimes = HC.ALLOWED_MIMES
        
        if file_import_options is None:
            
            file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'quiet' )
        
        if tag_import_options is None:
            
            tag_import_options = HG.client_controller.new_options.GetDefaultTagImportOptions( ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEFAULT ) )
        
        if tag_service_keys_to_filename_tagging_options is None:
            
            tag_service_keys_to_filename_tagging_options = {}
        
        if actions is None:
            
            actions = {}
            
            actions[ CC.STATUS_SUCCESSFUL_AND_NEW ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_SUCCESSFUL_BUT_REDUNDANT ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_DELETED ] = CC.IMPORT_FOLDER_IGNORE
            actions[ CC.STATUS_ERROR ] = CC.IMPORT_FOLDER_IGNORE
        
        if action_locations is None:
            
            action_locations = {}
        
        HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
        
        self._path = path
        self._mimes = mimes
        self._file_import_options = file_import_options
        self._tag_import_options = tag_import_options
        self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._check_regularly = check_regularly
        
        self._seed_cache = SeedCache()
        
        self._last_checked = 0
        self._paused = False
        self._check_now = False
        
        self._show_working_popup = show_working_popup
        self._publish_files_to_popup_button = publish_files_to_popup_button
        self._publish_files_to_page = publish_files_to_page
    def _ActionPaths( self ):
        
        for status in ( CC.STATUS_SUCCESSFUL_AND_NEW, CC.STATUS_SUCCESSFUL_BUT_REDUNDANT, CC.STATUS_DELETED, CC.STATUS_ERROR ):
            
            action = self._actions[ status ]
            
            if action == CC.IMPORT_FOLDER_DELETE:
                
                while True:
                    
                    seed = self._seed_cache.GetNextSeed( status )
                    
                    if seed is None or HG.view_shutdown:
                        
                        break
                    
                    path = seed.seed_data
                    
                    try:
                        
                        if os.path.exists( path ):
                            
                            ClientPaths.DeletePath( path )
                        
                        txt_path = path + '.txt'
                        
                        if os.path.exists( txt_path ):
                            
                            ClientPaths.DeletePath( txt_path )
                        
                        self._seed_cache.RemoveSeeds( ( seed, ) )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'Import folder tried to delete ' + path + ', but could not:' )
                        HydrusData.ShowException( e )
                        HydrusData.ShowText( 'Import folder has been paused.' )
                        
                        self._paused = True
                        
                        return
                
            elif action == CC.IMPORT_FOLDER_MOVE:
                
                while True:
                    
                    seed = self._seed_cache.GetNextSeed( status )
                    
                    if seed is None or HG.view_shutdown:
                        
                        break
                    
                    path = seed.seed_data
                    
                    try:
                        
                        dest_dir = self._action_locations[ status ]
                        
                        if not os.path.exists( dest_dir ):
                            
                            raise HydrusExceptions.DataMissing( 'The move location "' + dest_dir + '" does not exist!' )
                        
                        if os.path.exists( path ):
                            
                            filename = os.path.basename( path )
                            
                            dest_path = os.path.join( dest_dir, filename )
                            
                            dest_path = HydrusPaths.AppendPathUntilNoConflicts( dest_path )
                            
                            HydrusPaths.MergeFile( path, dest_path )
                        
                        txt_path = path + '.txt'
                        
                        if os.path.exists( txt_path ):
                            
                            txt_filename = os.path.basename( txt_path )
                            
                            txt_dest_path = os.path.join( dest_dir, txt_filename )
                            
                            txt_dest_path = HydrusPaths.AppendPathUntilNoConflicts( txt_dest_path )
                            
                            HydrusPaths.MergeFile( txt_path, txt_dest_path )
                        
                        self._seed_cache.RemoveSeeds( ( seed, ) )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'Import folder tried to move ' + path + ', but could not:' )
                        HydrusData.ShowException( e )
                        HydrusData.ShowText( 'Import folder has been paused.' )
                        
                        self._paused = True
                        
                        return
                
            elif action == CC.IMPORT_FOLDER_IGNORE:
                
                pass
    def _CheckFolder( self, job_key ):
        
        filenames = os.listdir( HydrusData.ToUnicode( self._path ) )
        
        raw_paths = [ os.path.join( self._path, filename ) for filename in filenames ]
        
        all_paths = ClientFiles.GetAllPaths( raw_paths )
        
        all_paths = HydrusPaths.FilterFreePaths( all_paths )
        
        seeds = []
        
        for path in all_paths:
            
            if job_key.IsCancelled():
                
                break
            
            if path.endswith( '.txt' ):
                
                continue
            
            seed = Seed( SEED_TYPE_HDD, path )
            
            if not self._seed_cache.HasSeed( seed ):
                
                seeds.append( seed )
            
            job_key.SetVariable( 'popup_text_1', 'checking: found ' + HydrusData.ConvertIntToPrettyString( len( seeds ) ) + ' new files' )
        
        self._seed_cache.AddSeeds( seeds )
        
        self._last_checked = HydrusData.GetNow()
        self._check_now = False
    
    def _GetSerialisableInfo( self ):
        
        serialisable_file_import_options = self._file_import_options.GetSerialisableTuple()
        serialisable_tag_import_options = self._tag_import_options.GetSerialisableTuple()
        serialisable_tag_service_keys_to_filename_tagging_options = [ ( service_key.encode( 'hex' ), filename_tagging_options.GetSerialisableTuple() ) for ( service_key, filename_tagging_options ) in self._tag_service_keys_to_filename_tagging_options.items() ]
        serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
        
        # json turns int dict keys to strings
        action_pairs = self._actions.items()
        action_location_pairs = self._action_locations.items()
        
        return ( self._path, self._mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, self._period, self._check_regularly, serialisable_seed_cache, self._last_checked, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page )
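    
    # Imports every currently-unknown seed in the cache, pushing progress to
    # the job_key. The folder's state is written back to the db every ten
    # minutes (time_to_save), so a long run is not lost if the client closes.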
    def _ImportFiles( self, job_key ):
        
        did_work = False
        
        time_to_save = HydrusData.GetNow() + 600
        
        num_files_imported = 0
        presentation_hashes = []
        presentation_hashes_fast = set()
        
        i = 0
        
        num_total = len( self._seed_cache )
        num_total_unknown = self._seed_cache.GetSeedCount( CC.STATUS_UNKNOWN )
        num_total_done = num_total - num_total_unknown
        
        while True:
            
            seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
            
            p1 = HC.options[ 'pause_import_folders_sync' ] or self._paused
            p2 = HydrusThreading.IsThreadShuttingDown()
            p3 = job_key.IsCancelled()
            
            if seed is None or p1 or p2 or p3:
                
                break
            
            if HydrusData.TimeHasPassed( time_to_save ):
                
                HG.client_controller.WriteSynchronous( 'serialisable', self )
                
                time_to_save = HydrusData.GetNow() + 600
            
            gauge_num_done = num_total_done + num_files_imported + 1
            
            job_key.SetVariable( 'popup_text_1', 'importing file ' + HydrusData.ConvertValueRangeToPrettyString( gauge_num_done, num_total ) )
            job_key.SetVariable( 'popup_gauge_1', ( gauge_num_done, num_total ) )
            
            path = seed.seed_data
            
            try:
                
                mime = HydrusFileHandling.GetMime( path )
                
                if mime in self._mimes:
                    
                    seed.ImportPath( self._file_import_options )
                    
                    hash = seed.GetHash()
                    
                    if seed.status in CC.SUCCESSFUL_IMPORT_STATES:
                        
                        downloaded_tags = []
                        
                        service_keys_to_content_updates = self._tag_import_options.GetServiceKeysToContentUpdates( hash, downloaded_tags ) # additional tags
                        
                        if len( service_keys_to_content_updates ) > 0:
                            
                            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                        
                        service_keys_to_tags = {}
                        
                        for ( tag_service_key, filename_tagging_options ) in self._tag_service_keys_to_filename_tagging_options.items():
                            
                            if not HG.client_controller.services_manager.ServiceExists( tag_service_key ):
                                
                                continue
                            
                            try:
                                
                                tags = filename_tagging_options.GetTags( tag_service_key, path )
                                
                                if len( tags ) > 0:
                                    
                                    service_keys_to_tags[ tag_service_key ] = tags
                                
                            except Exception as e:
                                
                                HydrusData.ShowText( 'Trying to parse filename tags in the import folder "' + self._name + '" threw an error!' )
                                HydrusData.ShowException( e )
                        
                        if len( service_keys_to_tags ) > 0:
                            
                            service_keys_to_content_updates = ClientData.ConvertServiceKeysToTagsToServiceKeysToContentUpdates( { hash }, service_keys_to_tags )
                            
                            HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
                        
                        num_files_imported += 1
                        
                        if hash not in presentation_hashes_fast:
                            
                            if seed.ShouldPresent( self._file_import_options ):
                                
                                presentation_hashes.append( hash )
                                presentation_hashes_fast.add( hash )
                    
                else:
                    
                    seed.SetStatus( CC.STATUS_VETOED )
                
            except Exception as e:
                
                error_text = traceback.format_exc()
                
                HydrusData.Print( 'A file failed to import from import folder ' + self._name + ':' + path )
                
                seed.SetStatus( CC.STATUS_ERROR, exception = e )
                
            finally:
                
                did_work = True
            
            i += 1
            
            if i % 10 == 0:
                
                self._ActionPaths()
        
        if num_files_imported > 0:
            
            HydrusData.Print( 'Import folder ' + self._name + ' imported ' + HydrusData.ConvertIntToPrettyString( num_files_imported ) + ' files.' )
            
            if len( presentation_hashes ) > 0:
                
                PublishPresentationHashes( self._name, presentation_hashes, self._publish_files_to_popup_button, self._publish_files_to_page )
        
        self._ActionPaths()
        
        return did_work
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._path, self._mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, self._period, self._check_regularly, serialisable_seed_cache, self._last_checked, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page ) = serialisable_info
        
        self._actions = dict( action_pairs )
        self._action_locations = dict( action_location_pairs )
        
        self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_import_options )
        self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_import_options )
        self._tag_service_keys_to_filename_tagging_options = dict( [ ( encoded_service_key.decode( 'hex' ), HydrusSerialisable.CreateFromSerialisableTuple( serialisable_filename_tagging_options ) ) for ( encoded_service_key, serialisable_filename_tagging_options ) in serialisable_tag_service_keys_to_filename_tagging_options ] )
        self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
    
    def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
        
        if version == 1:
            
            ( path, mimes, serialisable_file_import_options, action_pairs, action_location_pairs, period, open_popup, tag, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            service_keys_to_additional_tags = {}
            
            if tag is not None:
                
                service_keys_to_additional_tags[ CC.LOCAL_TAG_SERVICE_KEY ] = { tag }
            
            tag_import_options = ClientImportOptions.TagImportOptions( service_keys_to_additional_tags = service_keys_to_additional_tags )
            
            serialisable_tag_import_options = tag_import_options.GetSerialisableTuple()
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused )
            
            return ( 2, new_serialisable_info )
        
        if version == 2:
            
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            serialisable_txt_parse_tag_service_keys = []
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused )
            
            return ( 3, new_serialisable_info )
        
        if version == 3:
            
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused ) = old_serialisable_info
            
            check_now = False
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now )
            
            return ( 4, new_serialisable_info )
        
        if version == 4:
            
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_txt_parse_tag_service_keys, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now ) = old_serialisable_info
            
            txt_parse_tag_service_keys = [ service_key.decode( 'hex' ) for service_key in serialisable_txt_parse_tag_service_keys ]
            
            tag_service_keys_to_filename_tagging_options = {}
            
            for service_key in txt_parse_tag_service_keys:
                
                filename_tagging_options = ClientImportOptions.FilenameTaggingOptions()
                
                filename_tagging_options._load_from_neighbouring_txt_files = True
                
                tag_service_keys_to_filename_tagging_options[ service_key ] = filename_tagging_options
            
            serialisable_tag_service_keys_to_filename_tagging_options = [ ( service_key.encode( 'hex' ), filename_tagging_options.GetSerialisableTuple() ) for ( service_key, filename_tagging_options ) in tag_service_keys_to_filename_tagging_options.items() ]
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now )
            
            return ( 5, new_serialisable_info )
        
        if version == 5:
            
            ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, open_popup, serialisable_seed_cache, last_checked, paused, check_now ) = old_serialisable_info
            
            check_regularly = not paused
            show_working_popup = True
            publish_files_to_page = False
            publish_files_to_popup_button = open_popup
            
            new_serialisable_info = ( path, mimes, serialisable_file_import_options, serialisable_tag_import_options, serialisable_tag_service_keys_to_filename_tagging_options, action_pairs, action_location_pairs, period, check_regularly, serialisable_seed_cache, last_checked, paused, check_now, show_working_popup, publish_files_to_popup_button, publish_files_to_page )
            
            return ( 6, new_serialisable_info )
    def CheckNow( self ):
        
        self._check_now = True
    
    def DoWork( self ):
        
        if HG.view_shutdown:
            
            return
        
        if HC.options[ 'pause_import_folders_sync' ] or self._paused:
            
            return
        
        if not os.path.exists( self._path ) or not os.path.isdir( self._path ):
            
            return
        
        pubbed_job_key = False
        
        job_key = ClientThreading.JobKey( pausable = False, cancellable = True )
        
        job_key.SetVariable( 'popup_title', 'import folder - ' + self._name )
        
        due_by_check_now = self._check_now
        due_by_period = self._check_regularly and HydrusData.TimeHasPassed( self._last_checked + self._period )
        
        checked_folder = False
        
        if due_by_check_now or due_by_period:
            
            if not pubbed_job_key and self._show_working_popup:
                
                HG.client_controller.pub( 'message', job_key )
                
                pubbed_job_key = True
            
            self._CheckFolder( job_key )
            
            checked_folder = True
        
        seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
        
        did_import_file_work = False
        
        if seed is not None:
            
            if not pubbed_job_key and self._show_working_popup:
                
                HG.client_controller.pub( 'message', job_key )
                
                pubbed_job_key = True
            
            did_import_file_work = self._ImportFiles( job_key )
        
        if checked_folder or did_import_file_work:
            
            HG.client_controller.WriteSynchronous( 'serialisable', self )
        
        job_key.Delete()
    def GetSeedCache( self ):
        
        return self._seed_cache
    
    def ToListBoxTuple( self ):
        
        return ( self._name, self._path, self._period )
    
    def ToTuple( self ):
        
        return ( self._name, self._path, self._mimes, self._file_import_options, self._tag_import_options, self._tag_service_keys_to_filename_tagging_options, self._actions, self._action_locations, self._period, self._check_regularly, self._paused, self._check_now, self._show_working_popup, self._publish_files_to_popup_button, self._publish_files_to_page )
    
    def SetSeedCache( self, seed_cache ):
        
        self._seed_cache = seed_cache
    
    def SetTuple( self, name, path, mimes, file_import_options, tag_import_options, tag_service_keys_to_filename_tagging_options, actions, action_locations, period, check_regularly, paused, check_now, show_working_popup, publish_files_to_popup_button, publish_files_to_page ):
        
        if path != self._path:
            
            self._seed_cache = SeedCache()
        
        if set( mimes ) != set( self._mimes ):
            
            self._seed_cache.RemoveSeedsByStatus( ( CC.STATUS_VETOED, ) )
        
        self._name = name
        self._path = path
        self._mimes = mimes
        self._file_import_options = file_import_options
        self._tag_import_options = tag_import_options
        self._tag_service_keys_to_filename_tagging_options = tag_service_keys_to_filename_tagging_options
        self._actions = actions
        self._action_locations = action_locations
        self._period = period
        self._check_regularly = check_regularly
        self._paused = paused
        self._check_now = check_now
        self._show_working_popup = show_working_popup
        self._publish_files_to_popup_button = publish_files_to_popup_button
        self._publish_files_to_page = publish_files_to_page

HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_IMPORT_FOLDER ] = ImportFolder
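
# A minimal context manager that lets callers wrap 'this network job is now
# running' UI state around a job's lifetime via injected enter/exit callables.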
class NetworkJobPresentationContext( object ):
    
    def __init__( self, enter_call, exit_call ):
        
        self._enter_call = enter_call
        self._exit_call = exit_call
    
    def __enter__( self ):
        
        self._enter_call()
    
    def __exit__( self, exc_type, exc_val, exc_tb ):
        
        self._exit_call()
SEED_TYPE_HDD = 0
SEED_TYPE_URL = 1
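
# A Seed is one unit of import work: either a local path (SEED_TYPE_HDD) or a
# url (SEED_TYPE_URL), plus any tags, hashes and associable urls parsed for it
# along the way. Seeds hash and compare on ( seed_type, seed_data ), so a
# SeedCache can deduplicate them cheaply.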
class Seed( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED
    SERIALISABLE_NAME = 'File Import'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, seed_type = None, seed_data = None ):
        
        if seed_type is None:
            
            seed_type = SEED_TYPE_URL
        
        if seed_data is None:
            
            seed_data = 'https://big-guys.4u/monica_lewinsky_hott.tiff.exe.vbs'
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self.seed_type = seed_type
        self.seed_data = seed_data
        
        self.created = HydrusData.GetNow()
        self.modified = self.created
        self.source_time = None
        self.status = CC.STATUS_UNKNOWN
        self.note = ''
        
        self._urls = set()
        self._tags = set()
        self._hashes = {}
    
    def __eq__( self, other ):
        
        return self.__hash__() == other.__hash__()
    
    def __hash__( self ):
        
        return ( self.seed_type, self.seed_data ).__hash__()
    
    def __ne__( self, other ):
        
        return self.__hash__() != other.__hash__()
    
    def _CheckTagsBlacklist( self, tags, tag_import_options ):
        
        tag_import_options.CheckBlacklist( tags )
    
    def _GetSerialisableInfo( self ):
        
        serialisable_urls = list( self._urls )
        serialisable_tags = list( self._tags )
        serialisable_hashes = [ ( hash_type, hash.encode( 'hex' ) ) for ( hash_type, hash ) in self._hashes.items() if hash is not None ]
        
        return ( self.seed_type, self.seed_data, self.created, self.modified, self.source_time, self.status, self.note, serialisable_urls, serialisable_tags, serialisable_hashes )
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self.seed_type, self.seed_data, self.created, self.modified, self.source_time, self.status, self.note, serialisable_urls, serialisable_tags, serialisable_hashes ) = serialisable_info
        
        self._urls = set( serialisable_urls )
        self._tags = set( serialisable_tags )
        self._hashes = { hash_type : encoded_hash.decode( 'hex' ) for ( hash_type, encoded_hash ) in serialisable_hashes if encoded_hash is not None }
    
    def _NormaliseAndFilterAssociableURLs( self, urls ):
        
        normalised_urls = { HG.client_controller.network_engine.domain_manager.NormaliseURL( url ) for url in urls }
        
        associable_urls = { url for url in normalised_urls if HG.client_controller.network_engine.domain_manager.ShouldAssociateURLWithFiles( url ) }
        
        return associable_urls
    
    def _UpdateModified( self ):
        
        self.modified = HydrusData.GetNow()
def AddParseResults( self, parse_results ):
for ( hash_type, hash ) in ClientParsing.GetHashesFromParseResults( parse_results ):
if hash_type not in self._hashes:
self._hashes[ hash_type ] = hash
urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, HC.URL_TYPE_POST ) )
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
associable_urls.discard( self.seed_data )
self._urls.update( associable_urls )
tags = ClientParsing.GetTagsFromParseResults( parse_results )
self._tags.update( tags )
source_timestamp = ClientParsing.GetTimestampFromParseResults( parse_results, HC.TIMESTAMP_TYPE_SOURCE )
if source_timestamp is not None:
# clamp parsed timestamps to a little before now--sites sometimes report bogus times, and min( x, None ) only 'works' on py2 by accident, so test for None first
source_timestamp = min( HydrusData.GetNow() - 30, source_timestamp )
self.source_time = source_timestamp
self._UpdateModified()
def AddTags( self, tags ):
tags = HydrusTags.CleanTags( tags )
self._tags.update( tags )
self._UpdateModified()
def AddURL( self, url ):
urls = ( url, )
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
associable_urls.discard( self.seed_data )
self._urls.update( associable_urls )
def CheckPreFetchMetadata( self, tag_import_options ):
self._CheckTagsBlacklist( self._tags, tag_import_options )
def DownloadAndImportRawFile( self, file_url, file_import_options, network_job_factory, network_job_presentation_context_factory ):
self.AddURL( file_url )
( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
try:
if self.seed_data != file_url:
referral_url = self.seed_data
else:
referral_url = None
network_job = network_job_factory( 'GET', file_url, temp_path = temp_path, referral_url = referral_url )
HG.client_controller.network_engine.AddJob( network_job )
with network_job_presentation_context_factory( network_job ) as njpc:
network_job.WaitUntilDone()
self.Import( temp_path, file_import_options )
finally:
HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
def FetchPageMetadata( self, tag_import_options ):
pass
def PredictPreImportStatus( self, file_import_options, file_url = None ):
if self.status != CC.STATUS_UNKNOWN:
return
UNKNOWN_DEFAULT = ( CC.STATUS_UNKNOWN, None, '' )
( status, hash, note ) = UNKNOWN_DEFAULT
# urls
urls = set( self._urls )
if file_url is not None:
urls.add( file_url )
if self.seed_type == SEED_TYPE_URL:
urls.add( self.seed_data )
unrecognised_url_results = set()
for url in urls:
if HG.client_controller.network_engine.domain_manager.URLCanReferToMultipleFiles( url ):
continue
# we now only trust url-matched single urls and the post/file urls
# trusting unmatched source urls was too much of a hassle with too many boorus providing bad source urls like user account pages
if HG.client_controller.network_engine.domain_manager.URLDefinitelyRefersToOneFile( url ) or url in ( self.seed_data, file_url ):
results = HG.client_controller.Read( 'url_statuses', url )
if len( results ) == 0: # if no match found, no useful data discovered
continue
elif len( results ) > 1: # if more than one file claims this url, it cannot be relied on to guess the file
continue
else: # i.e. 1 match found
( status, hash, note ) = results[0]
if status != CC.STATUS_UNKNOWN:
break # if a known one-file url gives a single clear result, that result is reliable
# hashes
if status == CC.STATUS_UNKNOWN:
for ( hash_type, found_hash ) in self._hashes.items():
( status, hash, note ) = HG.client_controller.Read( 'hash_status', hash_type, found_hash )
if status != CC.STATUS_UNKNOWN:
break
#
if status == CC.STATUS_DELETED:
if not file_import_options.ExcludesDeleted():
status = CC.STATUS_UNKNOWN
note = ''
self.status = status
if hash is not None:
self._hashes[ 'sha256' ] = hash
self.note = note
self._UpdateModified()
def GetHash( self ):
if 'sha256' in self._hashes:
return self._hashes[ 'sha256' ]
return None
def GetSearchSeeds( self ):
if self.seed_type == SEED_TYPE_URL:
search_urls = ClientNetworkingDomain.GetSearchURLs( self.seed_data )
search_seeds = [ Seed( SEED_TYPE_URL, search_url ) for search_url in search_urls ]
else:
search_seeds = [ self ]
return search_seeds
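# one post url can have several equivalent forms (e.g. http vs https), so rather than testing a
# single url, callers get one search seed per variant that GetSearchURLs reports. a sketch
# (hypothetical url):
#
#   Seed( SEED_TYPE_URL, 'https://site.example/post/123' ).GetSearchSeeds()
#
# may return seeds for both the https and http forms, which a seed cache can then test against.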
def HasHash( self ):
return self.GetHash() is not None
def Import( self, temp_path, file_import_options ):
file_import_job = FileImportJob( temp_path, file_import_options )
( status, hash, note ) = HG.client_controller.client_files_manager.ImportFile( file_import_job )
self.SetStatus( status, note = note )
self.SetHash( hash )
def ImportPath( self, file_import_options ):
if self.seed_type != SEED_TYPE_HDD:
raise Exception( 'Attempted to import as a path, but I do not think I am a path!' )
( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
try:
path = self.seed_data
copied = HydrusPaths.MirrorFile( path, temp_path )
if not copied:
raise Exception( 'File failed to copy to temp path--see log for error.' )
self.Import( temp_path, file_import_options )
finally:
HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
def Normalise( self ):
if self.seed_type == SEED_TYPE_URL:
self.seed_data = HG.client_controller.network_engine.domain_manager.NormaliseURL( self.seed_data )
def PresentToPage( self, page_key ):
hash = self.GetHash()
if hash is not None:
( media_result, ) = HG.client_controller.Read( 'media_results', ( hash, ) )
HG.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
def SetHash( self, hash ):
if hash is not None:
self._hashes[ 'sha256' ] = hash
def SetStatus( self, status, note = '', exception = None ):
if exception is not None:
first_line = HydrusData.ToUnicode( exception ).split( os.linesep )[0]
note = first_line + u'\u2026 (Copy note to see full error)'
note += os.linesep
note += HydrusData.ToUnicode( traceback.format_exc() )
HydrusData.Print( 'Error when processing ' + self.seed_data + ' !' )
HydrusData.Print( traceback.format_exc() )
self.status = status
self.note = note
self._UpdateModified()
def ShouldDownloadFile( self ):
return self.status == CC.STATUS_UNKNOWN
def ShouldFetchPageMetadata( self, tag_import_options ):
if self.status == CC.STATUS_UNKNOWN:
return True
if self.status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
if tag_import_options.WorthFetchingTags() and tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB():
return True
return False
def ShouldPresent( self, file_import_options ):
hash = self.GetHash()
if hash is not None and self.status in CC.SUCCESSFUL_IMPORT_STATES:
if file_import_options.ShouldPresentIgnorantOfInbox( self.status ):
return True
in_inbox = HG.client_controller.Read( 'in_inbox', hash )
if file_import_options.ShouldPresent( self.status, in_inbox ):
return True
return False
def WorkOnFileURL( self, file_import_options, status_hook, network_job_factory, network_job_presentation_context_factory, tag_import_options = None ):
did_substantial_work = False
try:
status_hook( 'checking url status' )
self.PredictPreImportStatus( file_import_options )
if self.status == CC.STATUS_UNKNOWN:
file_url = self.seed_data
status_hook( 'downloading file' )
self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
did_substantial_work = True
did_substantial_work |= self.WriteContentUpdates( tag_import_options )
except HydrusExceptions.ShutdownException:
return False
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
self.SetStatus( status, note = note )
if isinstance( e, HydrusExceptions.CancelledException ):
status_hook( 'cancelled!' )
time.sleep( 2 )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
self.SetStatus( status, note = note )
status_hook( '404' )
time.sleep( 2 )
except Exception as e:
status = CC.STATUS_ERROR
self.SetStatus( status, exception = e )
status_hook( 'error!' )
time.sleep( 3 )
return did_substantial_work
def WorkOnPostURL( self, file_import_options, tag_import_options, status_hook, network_job_factory, network_job_presentation_context_factory ):
did_substantial_work = False
try:
status_hook( 'checking url status' )
self.PredictPreImportStatus( file_import_options )
if self.ShouldFetchPageMetadata( tag_import_options ):
post_url = self.seed_data
( url_to_check, parser ) = HG.client_controller.network_engine.domain_manager.GetURLToFetchAndParser( post_url )
status_hook( 'downloading page' )
network_job = network_job_factory( 'GET', url_to_check )
HG.client_controller.network_engine.AddJob( network_job )
with network_job_presentation_context_factory( network_job ) as njpc:
network_job.WaitUntilDone()
data = network_job.GetContent()
parsing_context = {}
parsing_context[ 'post_url' ] = post_url
parsing_context[ 'url' ] = url_to_check
all_parse_results = parser.Parse( parsing_context, data )
if len( all_parse_results ) == 0:
raise HydrusExceptions.VetoException( 'Could not parse any data!' )
parse_results = all_parse_results[0]
# this now needs to deal with multiple file post urls cleverly, which I think means no longer associating file_urls at this point--do that url association in DownloadAndImportRawFile only
self.AddParseResults( parse_results )
self.CheckPreFetchMetadata( tag_import_options )
file_urls = ClientParsing.GetURLsFromParseResults( parse_results, ( HC.URL_TYPE_FILE, ), only_get_top_priority = True )
if len( file_urls ) == 0:
raise HydrusExceptions.VetoException( 'Could not find a file URL!' )
elif len( file_urls ) == 1 or True: # let's still mandate this single-file path for a bit
file_url = file_urls[0]
self.PredictPreImportStatus( file_import_options, file_url )
if self.ShouldDownloadFile():
status_hook( 'downloading file' )
self.DownloadAndImportRawFile( file_url, file_import_options, network_job_factory, network_job_presentation_context_factory )
did_substantial_work = True
else:
raise HydrusExceptions.VetoException( 'Multiple-file post pages are not yet supported!' )
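# NOTE: the loop below appears to be unreachable sketch code for future multiple-file post
# support (note the raise above)--kept here as a plan, per the comments that follow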
for file_url in file_urls:
duplicate_seed = self.Duplicate() # inherits all urls and tags from here
duplicate_seed.seed_data = file_url
duplicate_seed.AddURL( self.seed_data )
# set referral url as my seed_data--this should probably auto-do AddURL( self.seed_data ) tbh
# insert in my seed cache just after me
status = CC.STATUS_SUCCESSFUL_AND_NEW
note = 'Found ' + HydrusData.ConvertIntToPrettyString( len( file_urls ) ) + ' File URLs in this page.'
self.SetStatus( status, note = note )
# alter seeds so:
# referral url is saved and used in workonfileurl and workonposturl
# gallery/sub import loops can now handle workonfileurl
# there is also the question of pixiv manga pages, which may need linking from the mode=medium page, which is two jumps
# this presumably means adding a new url content type to the parser like 'addable post url' or something
did_substantial_work |= self.WriteContentUpdates( tag_import_options )
except HydrusExceptions.ShutdownException:
return False
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
self.SetStatus( status, note = note )
if isinstance( e, HydrusExceptions.CancelledException ):
status_hook( 'cancelled!' )
time.sleep( 2 )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
self.SetStatus( status, note = note )
status_hook( '404' )
time.sleep( 2 )
except Exception as e:
status = CC.STATUS_ERROR
self.SetStatus( status, exception = e )
status_hook( 'error!' )
time.sleep( 3 )
return did_substantial_work
def WorksInNewSystem( self ):
if self.seed_type == SEED_TYPE_URL:
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( self.seed_data )
if url_type == HC.URL_TYPE_POST and can_parse:
return True
return False
def WriteContentUpdates( self, tag_import_options = None ):
did_work = False
if self.status == CC.STATUS_ERROR:
return did_work
hash = self.GetHash()
if hash is None:
return did_work
service_keys_to_content_updates = collections.defaultdict( list )
urls = set( self._urls )
if self.seed_type == SEED_TYPE_URL:
urls.add( self.seed_data )
associable_urls = self._NormaliseAndFilterAssociableURLs( urls )
if len( associable_urls ) > 0:
content_update = HydrusData.ContentUpdate( HC.CONTENT_TYPE_URLS, HC.CONTENT_UPDATE_ADD, ( associable_urls, ( hash, ) ) )
service_keys_to_content_updates[ CC.COMBINED_LOCAL_FILE_SERVICE_KEY ].append( content_update )
if tag_import_options is not None:
for ( service_key, content_updates ) in tag_import_options.GetServiceKeysToContentUpdates( hash, set( self._tags ) ).items():
service_keys_to_content_updates[ service_key ].extend( content_updates )
if len( service_keys_to_content_updates ) > 0:
HG.client_controller.WriteSynchronous( 'content_updates', service_keys_to_content_updates )
did_work = True
return did_work
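# shape of the batched write above, roughly (hypothetical values):
#
#   { CC.COMBINED_LOCAL_FILE_SERVICE_KEY : [ <url association ContentUpdate> ],
#     <tag service key> : [ <tag mapping ContentUpdates from tag_import_options> ] }
#
# bundling url and tag writes into one synchronous 'content_updates' call keeps them together
# in a single db job per seed.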
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED ] = Seed
class SeedCache( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE
SERIALISABLE_NAME = 'Import File Status Cache'
SERIALISABLE_VERSION = 8
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
self._seeds = HydrusSerialisable.SerialisableList()
self._seeds_to_indices = {}
self._seed_cache_key = HydrusData.GenerateKey()
self._status_cache = None
self._status_cache_generation_time = 0
self._dirty = True
self._lock = threading.Lock()
def __len__( self ):
return len( self._seeds )
def _GenerateStatus( self ):
statuses_to_counts = self._GetStatusesToCounts()
self._status_cache = GenerateSeedCacheStatus( statuses_to_counts )
self._status_cache_generation_time = HydrusData.GetNow()
self._dirty = False
def _GetStatusesToCounts( self ):
statuses_to_counts = collections.Counter()
for seed in self._seeds:
statuses_to_counts[ seed.status ] += 1
return statuses_to_counts
def _GetSeeds( self, status = None ):
if status is None:
return list( self._seeds )
else:
return [ seed for seed in self._seeds if seed.status == status ]
def _GetSerialisableInfo( self ):
with self._lock:
return self._seeds.GetSerialisableTuple()
def _GetSourceTimestamp( self, seed ):
source_timestamp = seed.source_time
if source_timestamp is None:
# decent fallback compromise
# -30 since added and 'last check' timestamps are often the same, and this messes up calculations
source_timestamp = seed.created - 30
return source_timestamp
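# worked example of the fallback: a seed created at t=1000 with no parsed source time is
# treated as source time 970. a later 'how many new files since the check at t=1000?' count
# therefore excludes it, which matches the intent--files found by a check should not look like
# files that arrived after it.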
def _HasSeed( self, seed ):
search_seeds = seed.GetSearchSeeds()
has_seed = True in ( search_seed in self._seeds_to_indices for search_seed in search_seeds )
return has_seed
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
with self._lock:
self._seeds = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_info )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
def _SetDirty( self ):
self._dirty = True
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
if 'note' in seed_info:
seed_info[ 'note' ] = HydrusData.ToUnicode( seed_info[ 'note' ] )
new_serialisable_info.append( ( seed, seed_info ) )
return ( 2, new_serialisable_info )
if version in ( 2, 3 ):
# gelbooru replaced their thumbnail links with this redirect spam
# 'https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY4ODA1OA=='
# I missed some http ones here, so I've broadened the test and rescheduled it
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
if 'gelbooru.com/redirect.php' in seed:
continue
new_serialisable_info.append( ( seed, seed_info ) )
return ( 4, new_serialisable_info )
if version == 4:
def ConvertRegularToRawURL( regular_url ):
# convert this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
# to this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# the 500 part can be a bunch of stuff, including letters
url_components = regular_url.split( '_' )
last_component = url_components[ -1 ]
( number_gubbins, file_ext ) = last_component.split( '.' )
raw_last_component = 'raw.' + file_ext
url_components[ -1 ] = raw_last_component
raw_url = '_'.join( url_components )
return raw_url
def Remove68Subdomain( long_url ):
# sometimes the 68 subdomain gives a 404 on the raw url, so:
# convert this:
# http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# to this:
# http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
# I am not sure if it is always 68, but let's not assume
( scheme, rest ) = long_url.split( '://', 1 )
if rest.startswith( 'media.tumblr.com' ):
return long_url
( gumpf, shorter_rest ) = rest.split( '.', 1 )
shorter_url = scheme + '://' + shorter_rest
return shorter_url
new_serialisable_info = []
good_seeds = set()
for ( seed, seed_info ) in old_serialisable_info:
try:
parse = urlparse.urlparse( seed )
if 'media.tumblr.com' in parse.netloc:
seed = Remove68Subdomain( seed )
seed = ConvertRegularToRawURL( seed )
seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( seed )
if 'pixiv.net' in parse.netloc:
seed = ClientNetworkingDomain.ConvertHTTPToHTTPS( seed )
if seed in good_seeds: # we hit a dupe, so skip it
continue
except:
pass
good_seeds.add( seed )
new_serialisable_info.append( ( seed, seed_info ) )
return ( 5, new_serialisable_info )
if version == 5:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
seed_info[ 'source_timestamp' ] = None
new_serialisable_info.append( ( seed, seed_info ) )
return ( 6, new_serialisable_info )
if version == 6:
new_serialisable_info = []
for ( seed, seed_info ) in old_serialisable_info:
try:
magic_phrase = '//media.tumblr.com'
replacement = '//data.tumblr.com'
if magic_phrase in seed:
seed = seed.replace( magic_phrase, replacement )
except:
pass
new_serialisable_info.append( ( seed, seed_info ) )
return ( 7, new_serialisable_info )
if version == 7:
seeds = HydrusSerialisable.SerialisableList()
for ( seed_text, seed_info ) in old_serialisable_info:
if seed_text.startswith( 'http' ):
seed_type = SEED_TYPE_URL
else:
seed_type = SEED_TYPE_HDD
seed = Seed( seed_type, seed_text )
seed.status = seed_info[ 'status' ]
seed.created = seed_info[ 'added_timestamp' ]
seed.modified = seed_info[ 'last_modified_timestamp' ]
seed.source_time = seed_info[ 'source_timestamp' ]
seed.note = seed_info[ 'note' ]
seeds.append( seed )
new_serialisable_info = seeds.GetSerialisableTuple()
return ( 8, new_serialisable_info )
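# note the pattern here: each branch lifts the data exactly one step and returns, and the
# serialisable system keeps calling until the stored version reaches SERIALISABLE_VERSION,
# so a v1 cache walks 1->2->4->5->6->7->8 (versions 2 and 3 share the gelbooru scrub, hence
# the combined branch that jumps straight to 4).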
def AddSeeds( self, seeds ):
if len( seeds ) == 0:
return 0
new_seeds = []
with self._lock:
for seed in seeds:
if self._HasSeed( seed ):
continue
seed.Normalise()
new_seeds.append( seed )
self._seeds.append( seed )
self._seeds_to_indices[ seed ] = len( self._seeds ) - 1
self._SetDirty()
self.NotifySeedsUpdated( new_seeds )
return len( new_seeds )
def AdvanceSeed( self, seed ):
with self._lock:
if seed in self._seeds_to_indices:
index = self._seeds_to_indices[ seed ]
if index > 0:
self._seeds.remove( seed )
self._seeds.insert( index - 1, seed )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self.NotifySeedsUpdated( ( seed, ) )
def CanCompact( self, compact_before_this_source_time ):
with self._lock:
if len( self._seeds ) <= 100:
return False
for seed in self._seeds[:-100]:
if seed.status == CC.STATUS_UNKNOWN:
continue
if self._GetSourceTimestamp( seed ) < compact_before_this_source_time:
return True
return False
def Compact( self, compact_before_this_source_time ):
with self._lock:
if len( self._seeds ) <= 100:
return
new_seeds = HydrusSerialisable.SerialisableList()
for seed in self._seeds[:-100]:
still_to_do = seed.status == CC.STATUS_UNKNOWN
still_relevant = self._GetSourceTimestamp( seed ) > compact_before_this_source_time
if still_to_do or still_relevant:
new_seeds.append( seed )
new_seeds.extend( self._seeds[-100:] )
self._seeds = new_seeds
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self._SetDirty()
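# compaction sketch: the newest 100 seeds always survive; older seeds survive only if they are
# still unprocessed (STATUS_UNKNOWN) or their source time is after the cutoff. so with a cutoff
# of 'three months ago', completed imports older than that are dropped, while anything newer or
# still pending is kept.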
def DelaySeed( self, seed ):
with self._lock:
if seed in self._seeds_to_indices:
index = self._seeds_to_indices[ seed ]
if index < len( self._seeds ) - 1:
self._seeds.remove( seed )
self._seeds.insert( index + 1, seed )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self.NotifySeedsUpdated( ( seed, ) )
def GetEarliestSourceTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return None
earliest_timestamp = min( ( self._GetSourceTimestamp( seed ) for seed in self._seeds ) )
return earliest_timestamp
def GetLatestAddedTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return 0
latest_timestamp = max( ( seed.created for seed in self._seeds ) )
return latest_timestamp
def GetLatestSourceTime( self ):
with self._lock:
if len( self._seeds ) == 0:
return 0
latest_timestamp = max( ( self._GetSourceTimestamp( seed ) for seed in self._seeds ) )
return latest_timestamp
def GetNextSeed( self, status ):
with self._lock:
for seed in self._seeds:
if seed.status == status:
return seed
return None
def GetNumNewFilesSince( self, since ):
num_files = 0
with self._lock:
for seed in self._seeds:
source_timestamp = self._GetSourceTimestamp( seed )
if source_timestamp >= since:
num_files += 1
return num_files
def GetPresentedHashes( self, file_import_options ):
with self._lock:
hashes = []
for seed in self._seeds:
if seed.HasHash() and seed.ShouldPresent( file_import_options ):
hashes.append( seed.GetHash() )
return hashes
def GetSeedCacheKey( self ):
return self._seed_cache_key
def GetSeedCount( self, status = None ):
result = 0
with self._lock:
if status is None:
result = len( self._seeds )
else:
for seed in self._seeds:
if seed.status == status:
result += 1
return result
def GetSeeds( self, status = None ):
with self._lock:
return self._GetSeeds( status )
def GetSeedIndex( self, seed ):
with self._lock:
return self._seeds_to_indices[ seed ]
def GetStatus( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
return self._status_cache
def GetStatusGenerationTime( self ):
with self._lock:
if self._dirty:
return HydrusData.GetNow()
return self._status_cache_generation_time
def GetStatusesToCounts( self ):
with self._lock:
return self._GetStatusesToCounts()
def GetValueRange( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
( status, ( total_processed, total ) ) = self._status_cache
return ( total_processed, total )
def HasSeed( self, seed ):
with self._lock:
return self._HasSeed( seed )
def NotifySeedsUpdated( self, seeds ):
with self._lock:
self._SetDirty()
HG.client_controller.pub( 'seed_cache_seeds_updated', self._seed_cache_key, seeds )
def RemoveSeeds( self, seeds ):
with self._lock:
seeds_to_delete = set( seeds )
self._seeds = HydrusSerialisable.SerialisableList( [ seed for seed in self._seeds if seed not in seeds_to_delete ] )
self._seeds_to_indices = { seed : index for ( index, seed ) in enumerate( self._seeds ) }
self._SetDirty()
self.NotifySeedsUpdated( seeds_to_delete )
def RemoveSeedsByStatus( self, statuses_to_remove ):
with self._lock:
seeds_to_delete = [ seed for seed in self._seeds if seed.status in statuses_to_remove ]
self.RemoveSeeds( seeds_to_delete )
def RemoveAllButUnknownSeeds( self ):
with self._lock:
seeds_to_delete = [ seed for seed in self._seeds if seed.status != CC.STATUS_UNKNOWN ]
self.RemoveSeeds( seeds_to_delete )
def RetryFailures( self ):
with self._lock:
failed_seeds = self._GetSeeds( CC.STATUS_ERROR )
for seed in failed_seeds:
seed.SetStatus( CC.STATUS_UNKNOWN )
self.NotifySeedsUpdated( failed_seeds )
def WorkToDo( self ):
with self._lock:
if self._dirty:
self._GenerateStatus()
( status, ( total_processed, total ) ) = self._status_cache
return total_processed < total
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SEED_CACHE ] = SeedCache
class SimpleDownloaderImport( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_IMPORT
SERIALISABLE_NAME = 'Simple Downloader Import'
SERIALISABLE_VERSION = 4
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
self._pending_jobs = []
self._seed_cache = SeedCache()
self._file_import_options = file_import_options
self._formula_name = 'all files linked by images in page'
self._queue_paused = False
self._files_paused = False
self._parser_status = ''
self._current_action = ''
self._download_control_file_set = None
self._download_control_file_clear = None
self._download_control_page_set = None
self._download_control_page_clear = None
self._lock = threading.Lock()
self._files_repeating_job = None
self._queue_repeating_job = None
HG.client_controller.sub( self, 'NotifySeedsUpdated', 'seed_cache_seeds_updated' )
def _FileNetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_file_set is not None:
wx.CallAfter( self._download_control_file_set, network_job )
def exit_call():
with self._lock:
if self._download_control_file_clear is not None:
wx.CallAfter( self._download_control_file_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _GetSerialisableInfo( self ):
serialisable_pending_jobs = [ ( url, simple_downloader_formula.GetSerialisableTuple() ) for ( url, simple_downloader_formula ) in self._pending_jobs ]
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
return ( serialisable_pending_jobs, serialisable_seed_cache, serialisable_file_options, self._formula_name, self._queue_paused, self._files_paused )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_pending_jobs, serialisable_seed_cache, serialisable_file_options, self._formula_name, self._queue_paused, self._files_paused ) = serialisable_info
self._pending_jobs = [ ( url, HydrusSerialisable.CreateFromSerialisableTuple( serialisable_simple_downloader_formula ) ) for ( url, serialisable_simple_downloader_formula ) in serialisable_pending_jobs ]
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
def _PageNetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_page_set is not None:
wx.CallAfter( self._download_control_page_set, network_job )
def exit_call():
with self._lock:
if self._download_control_page_clear is not None:
wx.CallAfter( self._download_control_page_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, paused ) = old_serialisable_info
queue_paused = paused
files_paused = paused
new_serialisable_info = ( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, queue_paused, files_paused )
return ( 2, new_serialisable_info )
if version == 2:
( pending_page_urls, serialisable_seed_cache, serialisable_file_options, download_image_links, download_unlinked_images, queue_paused, files_paused ) = old_serialisable_info
pending_jobs = []
new_serialisable_info = ( pending_jobs, serialisable_seed_cache, serialisable_file_options, queue_paused, files_paused )
return ( 3, new_serialisable_info )
if version == 3:
( pending_jobs, serialisable_seed_cache, serialisable_file_options, queue_paused, files_paused ) = old_serialisable_info
pending_jobs = []
formula_name = 'all files linked by images in page'
new_serialisable_info = ( pending_jobs, serialisable_seed_cache, serialisable_file_options, formula_name, queue_paused, files_paused )
return ( 4, new_serialisable_info )
def _WorkOnFiles( self, page_key ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return
did_substantial_work = False
file_url = seed.seed_data
try:
def status_hook( text ):
with self._lock:
self._current_action = text
did_substantial_work = seed.WorkOnFileURL( self._file_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._FileNetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
except Exception as e:
status = CC.STATUS_ERROR
seed.SetStatus( status, exception = e )
time.sleep( 3 )
finally:
self._seed_cache.NotifySeedsUpdated( ( seed, ) )
with self._lock:
self._current_action = ''
if did_substantial_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
def _WorkOnQueue( self, page_key ):
if len( self._pending_jobs ) > 0:
with self._lock:
( url, simple_downloader_formula ) = self._pending_jobs.pop( 0 )
self._parser_status = 'checking ' + url
error_occurred = False
try:
network_job = ClientNetworkingJobs.NetworkJobDownloader( page_key, 'GET', url )
network_job.OverrideBandwidth( 30 )
HG.client_controller.network_engine.AddJob( network_job )
with self._PageNetworkJobPresentationContextFactory( network_job ):
network_job.WaitUntilDone()
data = network_job.GetContent()
#
parsing_context = {}
parsing_context[ 'url' ] = url
parsing_formula = simple_downloader_formula.GetFormula()
file_urls = [ urlparse.urljoin( url, parsed_text ) for parsed_text in parsing_formula.Parse( parsing_context, data ) ]
seeds = [ Seed( SEED_TYPE_URL, file_url ) for file_url in file_urls ]
for seed in seeds:
seed.AddURL( url )
num_new = self._seed_cache.AddSeeds( seeds )
if num_new > 0:
WakeRepeatingJob( self._files_repeating_job )
parser_status = 'page checked OK - ' + HydrusData.ConvertIntToPrettyString( num_new ) + ' new urls'
num_already_in_seed_cache = len( file_urls ) - num_new
if num_already_in_seed_cache > 0:
parser_status += ' (' + HydrusData.ConvertIntToPrettyString( num_already_in_seed_cache ) + ' already in queue)'
except HydrusExceptions.ShutdownException:
return
except HydrusExceptions.NotFoundException:
error_occurred = True
parser_status = 'page 404'
except Exception as e:
error_occurred = True
parser_status = HydrusData.ToUnicode( e )
with self._lock:
self._parser_status = parser_status
if error_occurred:
time.sleep( 5 )
return True
else:
with self._lock:
self._parser_status = ''
return False
def AdvanceJob( self, job ):
with self._lock:
if job in self._pending_jobs:
index = self._pending_jobs.index( job )
if index - 1 >= 0:
self._pending_jobs.remove( job )
self._pending_jobs.insert( index - 1, job )
def CurrentlyWorking( self ):
with self._lock:
finished = not self._seed_cache.WorkToDo() and len( self._pending_jobs ) == 0 # only finished when both the file queue and the pending page jobs are exhausted
return not finished and not self._files_paused
def DelayJob( self, job ):
with self._lock:
if job in self._pending_jobs:
index = self._pending_jobs.index( job )
if index + 1 < len( self._pending_jobs ):
self._pending_jobs.remove( job )
self._pending_jobs.insert( index + 1, job )
def DeleteJob( self, job ):
with self._lock:
if job in self._pending_jobs:
self._pending_jobs.remove( job )
def GetSeedCache( self ):
with self._lock:
return self._seed_cache
def GetFileImportOptions( self ):
with self._lock:
return self._file_import_options
def GetFormulaName( self ):
with self._lock:
return self._formula_name
def GetStatus( self ):
with self._lock:
return ( list( self._pending_jobs ), self._parser_status, self._current_action, self._queue_paused, self._files_paused )
def GetValueRange( self ):
with self._lock:
return self._seed_cache.GetValueRange()
def NotifySeedsUpdated( self, seed_cache_key, seeds ):
if seed_cache_key == self._seed_cache.GetSeedCacheKey():
WakeRepeatingJob( self._files_repeating_job )
def PausePlayFiles( self ):
with self._lock:
self._files_paused = not self._files_paused
WakeRepeatingJob( self._files_repeating_job )
def PausePlayQueue( self ):
with self._lock:
self._queue_paused = not self._queue_paused
WakeRepeatingJob( self._queue_repeating_job )
def PendJob( self, job ):
with self._lock:
if job not in self._pending_jobs:
self._pending_jobs.append( job )
WakeRepeatingJob( self._queue_repeating_job )
def SetDownloadControlFile( self, download_control ):
with self._lock:
self._download_control_file_set = download_control.SetNetworkJob
self._download_control_file_clear = download_control.ClearNetworkJob
def SetDownloadControlPage( self, download_control ):
with self._lock:
self._download_control_page_set = download_control.SetNetworkJob
self._download_control_page_clear = download_control.ClearNetworkJob
def SetFileImportOptions( self, file_import_options ):
with self._lock:
self._file_import_options = file_import_options
def SetFormulaName( self, formula_name ):
with self._lock:
self._formula_name = formula_name
def Start( self, page_key ):
self._files_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
self._queue_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnQueue, page_key )
def REPEATINGWorkOnFiles( self, page_key ):
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._files_repeating_job.Cancel()
return
work_to_do = self._seed_cache.WorkToDo() and not ( self._files_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
while work_to_do:
try:
self._WorkOnFiles( page_key )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._files_repeating_job.Cancel()
return
work_to_do = self._seed_cache.WorkToDo() and not ( self._files_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
def REPEATINGWorkOnQueue( self, page_key ):
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._queue_repeating_job.Cancel()
return
ok_to_work = not ( self._queue_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
while ok_to_work:
try:
did_work = self._WorkOnQueue( page_key )
if did_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
else:
return
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._queue_repeating_job.Cancel()
return
ok_to_work = not ( self._queue_paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SIMPLE_DOWNLOADER_IMPORT ] = SimpleDownloaderImport
class Subscription( HydrusSerialisable.SerialisableBaseNamed ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION
SERIALISABLE_NAME = 'Subscription'
SERIALISABLE_VERSION = 6
def __init__( self, name ):
HydrusSerialisable.SerialisableBaseNamed.__init__( self, name )
self._gallery_identifier = ClientDownloading.GalleryIdentifier( HC.SITE_TYPE_DEVIANT_ART )
self._gallery_stream_identifiers = ClientDownloading.GetGalleryStreamIdentifiers( self._gallery_identifier )
self._queries = []
new_options = HG.client_controller.new_options
self._checker_options = ClientDefaults.GetDefaultCheckerOptions( 'artist subscription' )
if HC.options[ 'gallery_file_limit' ] is None:
self._initial_file_limit = 200
else:
self._initial_file_limit = min( 200, HC.options[ 'gallery_file_limit' ] )
self._periodic_file_limit = 50
self._paused = False
self._file_import_options = new_options.GetDefaultFileImportOptions( 'quiet' )
self._tag_import_options = new_options.GetDefaultTagImportOptions( self._gallery_identifier )
self._last_gallery_page_hit_timestamp = 0
self._no_work_until = 0
self._no_work_until_reason = ''
self._publish_files_to_popup_button = True
self._publish_files_to_page = False
self._merge_query_publish_events = True
def _DelayWork( self, time_delta, reason ):
self._no_work_until = HydrusData.GetNow() + time_delta
self._no_work_until_reason = reason
def _GetExampleNetworkContexts( self, query ):
seed_cache = query.GetSeedCache()
seed = seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return [ ClientNetworkingContexts.NetworkContext( CC.NETWORK_CONTEXT_SUBSCRIPTION, self._GetNetworkJobSubscriptionKey( query ) ), ClientNetworkingContexts.GLOBAL_NETWORK_CONTEXT ]
url = seed.seed_data
example_nj = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), 'GET', url )
example_network_contexts = example_nj.GetNetworkContexts()
return example_network_contexts
def _GetNetworkJobSubscriptionKey( self, query ):
query_text = query.GetQueryText()
return self._name + ': ' + query_text
def _GetQueriesForProcessing( self ):
queries = list( self._queries )
if HG.client_controller.new_options.GetBoolean( 'process_subs_in_random_order' ):
random.shuffle( queries )
else:
def key( q ):
return q.GetQueryText()
queries.sort( key = key )
return queries
def _GetSerialisableInfo( self ):
serialisable_gallery_identifier = self._gallery_identifier.GetSerialisableTuple()
serialisable_gallery_stream_identifiers = [ gallery_stream_identifier.GetSerialisableTuple() for gallery_stream_identifier in self._gallery_stream_identifiers ]
serialisable_queries = [ query.GetSerialisableTuple() for query in self._queries ]
serialisable_checker_options = self._checker_options.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
serialisable_tag_options = self._tag_import_options.GetSerialisableTuple()
return ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, serialisable_file_options, serialisable_tag_options, self._no_work_until, self._no_work_until_reason, self._publish_files_to_popup_button, self._publish_files_to_page, self._merge_query_publish_events )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, serialisable_file_options, serialisable_tag_options, self._no_work_until, self._no_work_until_reason, self._publish_files_to_popup_button, self._publish_files_to_page, self._merge_query_publish_events ) = serialisable_info
self._gallery_identifier = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_identifier )
self._gallery_stream_identifiers = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_gallery_stream_identifier ) for serialisable_gallery_stream_identifier in serialisable_gallery_stream_identifiers ]
self._queries = [ HydrusSerialisable.CreateFromSerialisableTuple( serialisable_query ) for serialisable_query in serialisable_queries ]
self._checker_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_checker_options )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
self._tag_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_tag_options )
def _NoDelays( self ):
return HydrusData.TimeHasPassed( self._no_work_until )
def _QueryBandwidthIsOK( self, query ):
example_network_contexts = self._GetExampleNetworkContexts( query )
# just a little padding here
expected_requests = 3
expected_bytes = 1048576
threshold = 30
result = HG.client_controller.network_engine.bandwidth_manager.CanDoWork( example_network_contexts, expected_requests = expected_requests, expected_bytes = expected_bytes, threshold = threshold )
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query.GetQueryText() + '" pre-work bandwidth test. Bandwidth ok: ' + str( result ) + '.' )
return result
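# the 'padding' above is deliberate: rather than asking 'can I make one request right now?',
# this asks the bandwidth manager whether roughly a file's worth of work (3 requests, ~1MB)
# will be doable within a 30s threshold, so a query does not start a sync it cannot usefully
# continue.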
def _ShowHitPeriodicFileLimitMessage( self, query_text ):
message = 'When syncing, the query "' + query_text + '" for subscription "' + self._name + '" hit its periodic file limit!'
message += os.linesep * 2
message += 'This may be because the query has not run in a while--so a backlog of files has built up--or because the site has changed how it presents file urls on its gallery pages (and so the subscription thinks it is seeing new files when it truly is not).'
message += os.linesep * 2
message += 'If the former is true, you might want to fill in the gap with a manual download page, but if the latter is true, the maintainer for the download parser (hydrus dev or whoever) would be interested in knowing this information so they can roll out a fix.'
HydrusData.ShowText( message )
def _UpdateSerialisableInfo( self, version, old_serialisable_info ):
if version == 1:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, last_error, serialisable_seed_cache ) = old_serialisable_info
check_now = False
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, serialisable_seed_cache )
return ( 2, new_serialisable_info )
if version == 2:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, serialisable_seed_cache ) = old_serialisable_info
no_work_until = 0
no_work_until_reason = ''
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, no_work_until, no_work_until_reason, serialisable_seed_cache )
return ( 3, new_serialisable_info )
if version == 3:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, query, period, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, last_checked, check_now, last_error, no_work_until, no_work_until_reason, serialisable_seed_cache ) = old_serialisable_info
checker_options = ClientImportOptions.CheckerOptions( 5, period / 5, period * 10, ( 1, period * 10 ) )
seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
query = SubscriptionQuery( query )
query._seed_cache = seed_cache
query._last_check_time = last_checked
query.UpdateNextCheckTime( checker_options )
queries = [ query ]
serialisable_queries = [ query.GetSerialisableTuple() for query in queries ]
serialisable_checker_options = checker_options.GetSerialisableTuple()
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason )
return ( 4, new_serialisable_info )
if version == 4:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, get_tags_if_url_known_and_file_redundant, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason ) = old_serialisable_info
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason )
return ( 5, new_serialisable_info )
if version == 5:
( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason ) = old_serialisable_info
publish_files_to_popup_button = True
publish_files_to_page = False
merge_query_publish_events = True
new_serialisable_info = ( serialisable_gallery_identifier, serialisable_gallery_stream_identifiers, serialisable_queries, serialisable_checker_options, initial_file_limit, periodic_file_limit, paused, serialisable_file_options, serialisable_tag_options, no_work_until, no_work_until_reason, publish_files_to_popup_button, publish_files_to_page, merge_query_publish_events )
return ( 6, new_serialisable_info )
def _WorkOnFiles( self, job_key ):
try:
gallery = ClientDownloading.GetGallery( self._gallery_identifier )
except Exception as e:
HydrusData.PrintException( e )
self._DelayWork( HC.UPDATE_DURATION, 'gallery would not load' )
self._paused = True
HydrusData.ShowText( 'The subscription ' + self._name + ' could not load its gallery! It has been paused and the full error has been written to the log!' )
return
error_count = 0
all_presentation_hashes = []
all_presentation_hashes_fast = set()
queries = self._GetQueriesForProcessing()
for query in queries:
this_query_has_done_work = False
query_text = query.GetQueryText()
seed_cache = query.GetSeedCache()
def network_job_factory( method, url, **kwargs ):
network_job = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), method, url, **kwargs )
network_job.OverrideBandwidth( 30 )
job_key.SetVariable( 'popup_network_job', network_job )
return network_job
gallery.SetNetworkJobFactory( network_job_factory )
text_1 = 'downloading files'
query_summary_name = self._name
if query_text != self._name:
text_1 += ' for "' + query_text + '"'
query_summary_name += ': ' + query_text
job_key.SetVariable( 'popup_text_1', text_1 )
num_urls = seed_cache.GetSeedCount()
presentation_hashes = []
presentation_hashes_fast = set()
while True:
num_unknown = seed_cache.GetSeedCount( CC.STATUS_UNKNOWN )
num_done = num_urls - num_unknown
seed = seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query_text + '" can do no more file work due to running out of unknown urls.' )
break
if job_key.IsCancelled():
self._DelayWork( 300, 'recently cancelled' )
break
p1 = HC.options[ 'pause_subs_sync' ]
p3 = HG.view_shutdown
p4 = not self._QueryBandwidthIsOK( query )
if p1 or p3 or p4:
if p4 and this_query_has_done_work:
job_key.SetVariable( 'popup_text_2', 'no more bandwidth to download files, will do some more later' )
time.sleep( 5 )
break
try:
x_out_of_y = 'file ' + HydrusData.ConvertValueRangeToPrettyString( num_done, num_urls ) + ': '
job_key.SetVariable( 'popup_gauge_2', ( num_done, num_urls ) )
if seed.WorksInNewSystem():
def status_hook( text ):
job_key.SetVariable( 'popup_text_2', x_out_of_y + text )
seed.WorkOnPostURL( self._file_import_options, self._tag_import_options, status_hook, GenerateSubscriptionNetworkJobFactory( self._GetNetworkJobSubscriptionKey( query ) ), GenerateMultiplePopupNetworkJobPresentationContextFactory( job_key ) )
if seed.ShouldPresent( self._file_import_options ):
hash = seed.GetHash()
if hash not in presentation_hashes_fast:
if hash not in all_presentation_hashes_fast:
all_presentation_hashes.append( hash )
all_presentation_hashes_fast.add( hash )
presentation_hashes.append( hash )
presentation_hashes_fast.add( hash )
else:
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'checking url status' )
seed.PredictPreImportStatus( self._file_import_options )
status = seed.status
url = seed.seed_data
if status == CC.STATUS_SUCCESSFUL_BUT_REDUNDANT:
if self._tag_import_options.ShouldFetchTagsEvenIfURLKnownAndFileAlreadyInDB() and self._tag_import_options.WorthFetchingTags():
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'found file in db, fetching tags' )
downloaded_tags = gallery.GetTags( url )
seed.AddTags( downloaded_tags )
elif status == CC.STATUS_UNKNOWN:
( os_file_handle, temp_path ) = ClientPaths.GetTempPath()
try:
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'downloading file' )
if self._tag_import_options.WorthFetchingTags():
downloaded_tags = gallery.GetFileAndTags( temp_path, url )
seed.AddTags( downloaded_tags )
else:
gallery.GetFile( temp_path, url )
seed.CheckPreFetchMetadata( self._tag_import_options )
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'importing file' )
seed.Import( temp_path, self._file_import_options )
hash = seed.GetHash()
if hash not in presentation_hashes_fast:
if seed.ShouldPresent( self._file_import_options ):
if hash not in all_presentation_hashes_fast:
all_presentation_hashes.append( hash )
all_presentation_hashes_fast.add( hash )
presentation_hashes.append( hash )
presentation_hashes_fast.add( hash )
finally:
HydrusPaths.CleanUpTempPath( os_file_handle, temp_path )
seed.WriteContentUpdates( self._tag_import_options )
except HydrusExceptions.CancelledException as e:
self._DelayWork( 300, HydrusData.ToUnicode( e ) )
break
except HydrusExceptions.VetoException as e:
status = CC.STATUS_VETOED
note = HydrusData.ToUnicode( e )
seed.SetStatus( status, note = note )
except HydrusExceptions.NotFoundException:
status = CC.STATUS_VETOED
note = '404'
seed.SetStatus( status, note = note )
except Exception as e:
status = CC.STATUS_ERROR
job_key.SetVariable( 'popup_text_2', x_out_of_y + 'file failed' )
seed.SetStatus( status, exception = e )
if isinstance( e, HydrusExceptions.DataMissing ):
# DataMissing is a quick thing to avoid subscription abandons when lots of deleted files in e621 (or any other booru)
# this should be richer in any case in the new system
pass
else:
error_count += 1
time.sleep( 10 )
if error_count > 4:
raise Exception( 'The subscription ' + self._name + ' encountered several errors when downloading files, so it abandoned its sync.' )
this_query_has_done_work = True
if len( presentation_hashes ) > 0:
job_key.SetVariable( 'popup_files', ( list( presentation_hashes ), query_summary_name ) )
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
HG.client_controller.WaitUntilViewFree()
if not self._merge_query_publish_events and len( presentation_hashes ) > 0:
PublishPresentationHashes( query_summary_name, presentation_hashes, self._publish_files_to_popup_button, self._publish_files_to_page )
if self._merge_query_publish_events and len( all_presentation_hashes ) > 0:
PublishPresentationHashes( self._name, all_presentation_hashes, self._publish_files_to_popup_button, self._publish_files_to_page )
job_key.DeleteVariable( 'popup_files' )
job_key.DeleteVariable( 'popup_text_1' )
job_key.DeleteVariable( 'popup_text_2' )
job_key.DeleteVariable( 'popup_gauge_2' )
def _WorkOnFilesCanDoWork( self ):
for query in self._queries:
if query.CanWorkOnFiles():
if self._QueryBandwidthIsOK( query ):
return True
return False
def _SyncQuery( self, job_key ):
have_made_an_initial_sync_bandwidth_notification = False
queries = self._GetQueriesForProcessing()
for query in queries:
can_sync = query.CanSync()
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + query.GetQueryText() + '" started. Current can_sync is ' + str( can_sync ) + '.' )
if not can_sync:
continue
done_first_page = False
query_text = query.GetQueryText()
seed_cache = query.GetSeedCache()
this_is_initial_sync = query.IsInitialSync()
total_new_urls = 0
seeds_to_add = set()
seeds_to_add_ordered = []
prefix = 'synchronising'
if query_text != self._name:
prefix += ' "' + query_text + '"'
job_key.SetVariable( 'popup_text_1', prefix )
for gallery_stream_identifier in self._gallery_stream_identifiers:
if this_is_initial_sync:
if self._initial_file_limit is not None and total_new_urls + 1 > self._initial_file_limit:
break
else:
if self._periodic_file_limit is not None and total_new_urls + 1 > self._periodic_file_limit:
self._ShowHitPeriodicFileLimitMessage( query_text )
break
p1 = HC.options[ 'pause_subs_sync' ]
p2 = job_key.IsCancelled()
p3 = HG.view_shutdown
if p1 or p2 or p3:
break
try:
gallery = ClientDownloading.GetGallery( gallery_stream_identifier )
except Exception as e:
HydrusData.PrintException( e )
self._DelayWork( HC.UPDATE_DURATION, 'gallery would not load' )
self._paused = True
HydrusData.ShowText( 'The subscription ' + self._name + ' could not load its gallery! It has been paused and the full error has been written to the log!' )
return
def network_job_factory( method, url, **kwargs ):
network_job = ClientNetworkingJobs.NetworkJobSubscription( self._GetNetworkJobSubscriptionKey( query ), method, url, **kwargs )
job_key.SetVariable( 'popup_network_job', network_job )
network_job.OverrideBandwidth( 30 )
return network_job
gallery.SetNetworkJobFactory( network_job_factory )
page_index = 0
num_existing_urls = 0
keep_checking = True
while keep_checking:
new_urls_this_page = 0
try:
p1 = HC.options[ 'pause_subs_sync' ]
p2 = HG.view_shutdown
if p1 or p2:
return
if job_key.IsCancelled():
raise HydrusExceptions.CancelledException( 'gallery parsing cancelled, likely by user' )
next_gallery_page_hit_timestamp = self._last_gallery_page_hit_timestamp + HG.client_controller.new_options.GetInteger( 'gallery_page_wait_period_subscriptions' )
if not HydrusData.TimeHasPassed( next_gallery_page_hit_timestamp ):
if not done_first_page:
page_check_status = 'checking first page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
else:
page_check_status = HydrusData.ConvertIntToPrettyString( total_new_urls ) + ' new urls found, checking next page ' + HydrusData.ConvertTimestampToPrettyPending( next_gallery_page_hit_timestamp )
job_key.SetVariable( 'popup_text_1', prefix + ': ' + page_check_status )
time.sleep( 1 )
continue
job_key.SetVariable( 'popup_text_1', prefix + ': found ' + HydrusData.ConvertIntToPrettyString( total_new_urls ) + ' new urls, checking next page' )
try:
( page_of_seeds, definitely_no_more_pages ) = gallery.GetPage( query_text, page_index )
finally:
self._last_gallery_page_hit_timestamp = HydrusData.GetNow()
done_first_page = True
page_index += 1
if definitely_no_more_pages:
keep_checking = False
for seed in page_of_seeds:
if this_is_initial_sync:
if self._initial_file_limit is not None and total_new_urls + 1 > self._initial_file_limit:
keep_checking = False
break
else:
if self._periodic_file_limit is not None and total_new_urls + 1 > self._periodic_file_limit:
self._ShowHitPeriodicFileLimitMessage( query_text )
keep_checking = False
break
if seed in seeds_to_add:
# this catches the occasional overflow when a new file is uploaded while gallery parsing is going on
continue
if seed_cache.HasSeed( seed ):
num_existing_urls += 1
if num_existing_urls > 5:
keep_checking = False
break
else:
seeds_to_add.add( seed )
seeds_to_add_ordered.append( seed )
new_urls_this_page += 1
total_new_urls += 1
if new_urls_this_page == 0:
keep_checking = False
except HydrusExceptions.CancelledException as e:
self._DelayWork( 300, HydrusData.ToUnicode( e ) )
break
except HydrusExceptions.NotFoundException:
# paheal now 404s when no results, so just naturally break
break
seeds_to_add_ordered.reverse()
# 'first' urls are now at the end, so the seed_cache should stay roughly in oldest->newest order
seed_cache.AddSeeds( seeds_to_add_ordered )
query.RegisterSyncComplete()
query.UpdateNextCheckTime( self._checker_options )
if query.IsDead():
if this_is_initial_sync:
HydrusData.ShowText( 'The query "' + query_text + '" for subscription "' + self._name + '" did not find any files on its first sync! Could the query text have a typo, like a missing underscore?' )
else:
HydrusData.ShowText( 'The query "' + query_text + '" for subscription "' + self._name + '" appears to be dead!' )
else:
if this_is_initial_sync:
if not self._QueryBandwidthIsOK( query ) and not have_made_an_initial_sync_bandwidth_notification:
HydrusData.ShowText( 'FYI: The query "' + query_text + '" for subscription "' + self._name + '" performed its initial sync ok, but that domain is short on bandwidth right now, so no files will be downloaded yet. The subscription will catch up in future as bandwidth becomes available. You can review the estimated time until bandwidth is available under the manage subscriptions dialog. If more queries are performing initial syncs in this run, they may be in the same position.' )
have_made_an_initial_sync_bandwidth_notification = True
def _SyncQueryCanDoWork( self ):
return True in ( query.CanSync() for query in self._queries )
def CanCheckNow( self ):
return True in ( query.CanCheckNow() for query in self._queries )
def CanCompact( self ):
return True in ( query.CanCompact( self._checker_options ) for query in self._queries )
def CanReset( self ):
return True in ( not query.IsInitialSync() for query in self._queries )
def CanRetryFailures( self ):
return True in ( query.CanRetryFailed() for query in self._queries )
def CanScrubDelay( self ):
return not HydrusData.TimeHasPassed( self._no_work_until )
def CheckNow( self ):
for query in self._queries:
query.CheckNow()
self.ScrubDelay()
def Compact( self ):
for query in self._queries:
query.Compact( self._checker_options )
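# the bandwidth estimates ask the bandwidth manager how long a query's example network contexts must wait before new jobs would be allowed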
def GetBandwidthWaitingEstimate( self, query ):
example_network_contexts = self._GetExampleNetworkContexts( query )
estimate = HG.client_controller.network_engine.bandwidth_manager.GetWaitingEstimate( example_network_contexts )
return estimate
def GetBandwidthWaitingEstimateMinMax( self ):
if len( self._queries ) == 0:
return ( 0, 0 )
estimates = []
for query in self._queries:
example_network_contexts = self._GetExampleNetworkContexts( query )
estimate = HG.client_controller.network_engine.bandwidth_manager.GetWaitingEstimate( example_network_contexts )
estimates.append( estimate )
min_estimate = min( estimates )
max_estimate = max( estimates )
return ( min_estimate, max_estimate )
def GetGalleryIdentifier( self ):
return self._gallery_identifier
def GetQueries( self ):
return self._queries
def GetPresentationOptions( self ):
return ( self._publish_files_to_popup_button, self._publish_files_to_page, self._merge_query_publish_events )
def GetTagImportOptions( self ):
return self._tag_import_options
def HasQuerySearchText( self, search_text ):
for query in self._queries:
query_text = query.GetQueryText()
if search_text in query_text:
return True
return False
def Merge( self, potential_mergee_subscriptions ):
unmergable_subscriptions = []
for subscription in potential_mergee_subscriptions:
if subscription._gallery_identifier == self._gallery_identifier:
my_new_queries = [ query.Duplicate() for query in subscription._queries ]
self._queries.extend( my_new_queries )
else:
unmergable_subscriptions.append( subscription )
return unmergable_subscriptions
def PauseResume( self ):
self._paused = not self._paused
def Reset( self ):
for query in self._queries:
query.Reset()
self.ScrubDelay()
def RetryFailures( self ):
for query in self._queries:
query.RetryFailures()
def ReviveDead( self ):
for query in self._queries:
if query.IsDead():
query.CheckNow()
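# Separate splits the given queries off into standalone one-query subscriptions named after their query text, removing them from this subscription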
def Separate( self, base_name, only_these_queries = None ):
if only_these_queries is None:
only_these_queries = set( self._queries )
else:
only_these_queries = set( only_these_queries )
subscriptions = []
for query in self._queries:
if query not in only_these_queries:
continue
subscription = self.Duplicate()
subscription._queries = [ query.Duplicate() ]
subscription.SetName( base_name + ': ' + query.GetQueryText() )
subscriptions.append( subscription )
self._queries = [ query for query in self._queries if query not in only_these_queries ]
return subscriptions
def SetCheckerOptions( self, checker_options ):
self._checker_options = checker_options
for query in self._queries:
query.UpdateNextCheckTime( self._checker_options )
def SetPresentationOptions( self, publish_files_to_popup_button, publish_files_to_page, merge_query_publish_events ):
self._publish_files_to_popup_button = publish_files_to_popup_button
self._publish_files_to_page = publish_files_to_page
self._merge_query_publish_events = merge_query_publish_events
def SetTuple( self, gallery_identifier, gallery_stream_identifiers, queries, checker_options, initial_file_limit, periodic_file_limit, paused, file_import_options, tag_import_options, no_work_until ):
self._gallery_identifier = gallery_identifier
self._gallery_stream_identifiers = gallery_stream_identifiers
self._queries = queries
self._checker_options = checker_options
self._initial_file_limit = initial_file_limit
self._periodic_file_limit = periodic_file_limit
self._paused = paused
self._file_import_options = file_import_options
self._tag_import_options = tag_import_options
self._no_work_until = no_work_until
def ScrubDelay( self ):
self._no_work_until = 0
self._no_work_until_reason = ''
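# Sync is the main work entry point: if we are unpaused, not shutting down, not delayed, and some query can sync or has file work, run the gallery sync and then the file work under a single popup job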
def Sync( self ):
p1 = not self._paused
p2 = not HG.view_shutdown
p3 = self._NoDelays()
p4 = self._SyncQueryCanDoWork()
p5 = self._WorkOnFilesCanDoWork()
if HG.subscription_report_mode:
message = 'Subscription "' + self._name + '" entered sync.'
message += os.linesep
message += 'Unpaused: ' + str( p1 )
message += os.linesep
message += 'Not in view shutdown: ' + str( p2 )
message += os.linesep
message += 'No delays: ' + str( p3 )
message += os.linesep
message += 'Sync can do work: ' + str( p4 )
message += os.linesep
message += 'Files can do work: ' + str( p5 )
HydrusData.ShowText( message )
if p1 and p2 and p3 and ( p4 or p5 ):
job_key = ClientThreading.JobKey( pausable = False, cancellable = True )
try:
job_key.SetVariable( 'popup_title', 'subscriptions - ' + self._name )
HG.client_controller.pub( 'message', job_key )
self._SyncQuery( job_key )
self._WorkOnFiles( job_key )
except HydrusExceptions.NetworkException as e:
if isinstance( e, HydrusExceptions.NetworkInfrastructureException ):
delay = 3600
else:
delay = HC.UPDATE_DURATION
HydrusData.Print( 'The subscription ' + self._name + ' encountered an exception when trying to sync:' )
HydrusData.PrintException( e )
job_key.SetVariable( 'popup_text_1', 'Encountered a network error, will retry again later' )
self._DelayWork( delay, 'network error: ' + HydrusData.ToUnicode( e ) )
time.sleep( 5 )
except Exception as e:
HydrusData.ShowText( 'The subscription ' + self._name + ' encountered an exception when trying to sync:' )
HydrusData.ShowException( e )
self._DelayWork( HC.UPDATE_DURATION, 'error: ' + HydrusData.ToUnicode( e ) )
finally:
job_key.DeleteVariable( 'popup_network_job' )
HG.client_controller.WriteSynchronous( 'serialisable', self )
if job_key.HasVariable( 'popup_files' ):
job_key.Finish()
else:
job_key.Delete()
def ToTuple( self ):
return ( self._name, self._gallery_identifier, self._gallery_stream_identifiers, self._queries, self._checker_options, self._initial_file_limit, self._periodic_file_limit, self._paused, self._file_import_options, self._tag_import_options, self._no_work_until, self._no_work_until_reason )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION ] = Subscription
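# a SubscriptionQuery pairs one search text with its seed cache and tracks check times, paused state, and whether the checker has declared it dead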
class SubscriptionQuery( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION_QUERY
SERIALISABLE_NAME = 'Subscription Query'
SERIALISABLE_VERSION = 1
def __init__( self, query = 'query text' ):
HydrusSerialisable.SerialisableBase.__init__( self )
self._query = query
self._check_now = False
self._last_check_time = 0
self._next_check_time = 0
self._paused = False
self._status = CHECKER_STATUS_OK
self._seed_cache = SeedCache()
def _GetSerialisableInfo( self ):
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
return ( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, serialisable_seed_cache )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, serialisable_seed_cache ) = serialisable_info
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
def CanWorkOnFiles( self ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + self._query + '" CanWorkOnFiles test. Next import is ' + repr( seed ) + '.' )
return seed is not None
def CanCheckNow( self ):
return not self._check_now
def CanCompact( self, checker_options ):
death_period = checker_options.GetDeathFileVelocityPeriod()
compact_before_this_source_time = self._last_check_time - ( death_period * 2 )
return self._seed_cache.CanCompact( compact_before_this_source_time )
def CanRetryFailed( self ):
return self._seed_cache.GetSeedCount( CC.STATUS_ERROR ) > 0
def CanSync( self ):
if HG.subscription_report_mode:
HydrusData.ShowText( 'Query "' + self._query + '" CanSync test. Paused status is ' + str( self._paused ) + ' and check time due is ' + str( HydrusData.TimeHasPassed( self._next_check_time ) ) + ' and check_now is ' + str( self._check_now ) + '.' )
if self._paused:
return False
return HydrusData.TimeHasPassed( self._next_check_time ) or self._check_now
def CheckNow( self ):
self._check_now = True
self._paused = False
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
def Compact( self, checker_options ):
death_period = checker_options.GetDeathFileVelocityPeriod()
compact_before_this_time = self._last_check_time - ( death_period * 2 )
return self._seed_cache.Compact( compact_before_this_time )
def GetLastChecked( self ):
return self._last_check_time
def GetLatestAddedTime( self ):
return self._seed_cache.GetLatestAddedTime()
def GetNextCheckStatusString( self ):
if self._check_now:
return 'checking on dialog ok'
elif self._status == CHECKER_STATUS_DEAD:
return 'dead, so not checking'
elif self._paused:
return 'paused, but would be ' + HydrusData.ConvertTimestampToPrettyPending( self._next_check_time )
else:
return HydrusData.ConvertTimestampToPrettyPending( self._next_check_time )
def GetNumURLsAndFailed( self ):
return ( self._seed_cache.GetSeedCount( CC.STATUS_UNKNOWN ), len( self._seed_cache ), self._seed_cache.GetSeedCount( CC.STATUS_ERROR ) )
def GetQueryText( self ):
return self._query
def GetSeedCache( self ):
return self._seed_cache
def IsDead( self ):
return self._status == CHECKER_STATUS_DEAD
def IsInitialSync( self ):
return self._last_check_time == 0
def IsPaused( self ):
return self._paused
def PausePlay( self ):
self._paused = not self._paused
def RegisterSyncComplete( self ):
self._last_check_time = HydrusData.GetNow()
self._check_now = False
def Reset( self ):
self._last_check_time = 0
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
self._paused = False
self._seed_cache = SeedCache()
def RetryFailures( self ):
self._seed_cache.RetryFailures()
def SetCheckNow( self, check_now ):
self._check_now = check_now
def SetPaused( self, paused ):
self._paused = paused
def SetQueryAndSeedCache( self, query, seed_cache ):
self._query = query
self._seed_cache = seed_cache
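# if check_now is set we schedule an immediate check; otherwise the checker options pick the next time from recent file velocity, marking the query dead (and pausing it) if that velocity has fallen below the death threshold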
def UpdateNextCheckTime( self, checker_options ):
if self._check_now:
self._next_check_time = 0
self._status = CHECKER_STATUS_OK
else:
if checker_options.IsDead( self._seed_cache, self._last_check_time ):
self._status = CHECKER_STATUS_DEAD
self._paused = True
self._next_check_time = checker_options.GetNextCheckTime( self._seed_cache, self._last_check_time )
def ToTuple( self ):
return ( self._query, self._check_now, self._last_check_time, self._next_check_time, self._paused, self._status, self._seed_cache )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_SUBSCRIPTION_QUERY ] = SubscriptionQuery
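# URLsImport backs the simple url import page: urls pended by the user sit in a seed cache and are worked on by a repeating job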
class URLsImport( HydrusSerialisable.SerialisableBase ):
SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_URLS_IMPORT
SERIALISABLE_NAME = 'URL Import'
SERIALISABLE_VERSION = 1
def __init__( self ):
HydrusSerialisable.SerialisableBase.__init__( self )
file_import_options = HG.client_controller.new_options.GetDefaultFileImportOptions( 'loud' )
self._seed_cache = SeedCache()
self._file_import_options = file_import_options
self._paused = False
self._seed_cache_status = ( 'initialising', ( 0, 1 ) )
self._download_control_file_set = None
self._download_control_file_clear = None
self._lock = threading.Lock()
self._files_repeating_job = None
HG.client_controller.sub( self, 'NotifySeedsUpdated', 'seed_cache_seeds_updated' )
def _GetSerialisableInfo( self ):
serialisable_seed_cache = self._seed_cache.GetSerialisableTuple()
serialisable_file_options = self._file_import_options.GetSerialisableTuple()
return ( serialisable_seed_cache, serialisable_file_options, self._paused )
def _InitialiseFromSerialisableInfo( self, serialisable_info ):
( serialisable_seed_cache, serialisable_file_options, self._paused ) = serialisable_info
self._seed_cache = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_seed_cache )
self._file_import_options = HydrusSerialisable.CreateFromSerialisableTuple( serialisable_file_options )
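# this context hands the network job to the page's download control on entry and clears it on exit, via wx.CallAfter since we may not be on the ui thread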
def _NetworkJobPresentationContextFactory( self, network_job ):
def enter_call():
with self._lock:
if self._download_control_file_set is not None:
wx.CallAfter( self._download_control_file_set, network_job )
def exit_call():
with self._lock:
if self._download_control_file_clear is not None:
wx.CallAfter( self._download_control_file_clear )
return NetworkJobPresentationContext( enter_call, exit_call )
def _RegenerateSeedCacheStatus( self ):
new_seed_cache_status = self._seed_cache.GetStatus()
if self._seed_cache_status != new_seed_cache_status:
self._seed_cache_status = new_seed_cache_status
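# take the next unknown seed and route it by url class: raw/file urls are downloaded directly, post urls are parsed first, and gallery/watchable urls are not yet supported here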
def _WorkOnFiles( self, page_key ):
seed = self._seed_cache.GetNextSeed( CC.STATUS_UNKNOWN )
if seed is None:
return
did_substantial_work = False
url = seed.seed_data
try:
with self._lock:
self._RegenerateSeedCacheStatus()
( url_type, match_name, can_parse ) = HG.client_controller.network_engine.domain_manager.GetURLParseCapability( url )
if url_type in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_POST, HC.URL_TYPE_WATCHABLE ) and not can_parse:
message = 'This URL was recognised as a "' + match_name + '" but this URL class does not yet have a parsing script linked to it!'
message += os.linesep * 2
message += 'Since this URL cannot be parsed, a downloader cannot be created for it! Please check your url class links under the \'networking\' menu.'
raise HydrusExceptions.ParseException( message )
if url_type in ( HC.URL_TYPE_UNKNOWN, HC.URL_TYPE_FILE ):
did_substantial_work = self._WorkOnFilesRawURL( page_key, seed )
elif url_type == HC.URL_TYPE_POST:
did_substantial_work = self._WorkOnFilesPostURL( page_key, seed )
elif url_type in ( HC.URL_TYPE_GALLERY, HC.URL_TYPE_WATCHABLE ):
raise NotImplementedError( 'Unfortunately, galleries and watchable urls do not work here yet!' )
except Exception as e:
status = CC.STATUS_ERROR
seed.SetStatus( status, exception = e )
time.sleep( 3 )
finally:
self._seed_cache.NotifySeedsUpdated( ( seed, ) )
with self._lock:
self._RegenerateSeedCacheStatus()
if did_substantial_work:
time.sleep( DID_SUBSTANTIAL_FILE_WORK_MINIMUM_SLEEP_TIME )
def _WorkOnFilesPostURL( self, page_key, seed ):
url = seed.seed_data
tag_import_options = HG.client_controller.network_engine.domain_manager.GetDefaultTagImportOptionsForURL( url )
status_hook = lambda s: s # do nothing for now
did_substantial_work = seed.WorkOnPostURL( self._file_import_options, tag_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._NetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
return did_substantial_work
def _WorkOnFilesRawURL( self, page_key, seed ):
status_hook = lambda s: s # do nothing for now
did_substantial_work = seed.WorkOnFileURL( self._file_import_options, status_hook, GenerateDownloaderNetworkJobFactory( page_key ), self._NetworkJobPresentationContextFactory )
if seed.ShouldPresent( self._file_import_options ):
seed.PresentToPage( page_key )
did_substantial_work = True
return did_substantial_work
def CurrentlyWorking( self ):
with self._lock:
finished = not self._seed_cache.WorkToDo()
return not finished and not self._paused
def GetSeedCache( self ):
return self._seed_cache
def GetOptions( self ):
with self._lock:
return self._file_import_options
def GetStatus( self ):
with self._lock:
return ( self._seed_cache_status, self._paused )
def GetValueRange( self ):
with self._lock:
return self._seed_cache.GetValueRange()
def NotifySeedsUpdated( self, seed_cache_key, seeds ):
if seed_cache_key == self._seed_cache.GetSeedCacheKey():
WakeRepeatingJob( self._files_repeating_job )
def PausePlay( self ):
with self._lock:
self._paused = not self._paused
WakeRepeatingJob( self._files_repeating_job )
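# urls pended here come from the ui (presumably paste or drag and drop); trivially short strings are filtered out before seeds are created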
def PendURLs( self, urls ):
with self._lock:
urls = filter( lambda u: len( u ) > 1, urls ) # > 1 rather than > 0 so the occasional stray single-character whitespace entry is discarded as well
seeds = [ Seed( SEED_TYPE_URL, url ) for url in urls ]
if len( seeds ) > 0:
self._seed_cache.AddSeeds( seeds )
WakeRepeatingJob( self._files_repeating_job )
def SetDownloadControlFile( self, download_control ):
with self._lock:
self._download_control_file_set = download_control.SetNetworkJob
self._download_control_file_clear = download_control.ClearNetworkJob
def SetFileImportOptions( self, file_import_options ):
with self._lock:
self._file_import_options = file_import_options
def Start( self, page_key ):
with self._lock:
self._RegenerateSeedCacheStatus()
self._files_repeating_job = HG.client_controller.CallRepeating( GetRepeatingJobInitialDelay(), REPEATING_JOB_TYPICAL_PERIOD, self.REPEATINGWorkOnFiles, page_key )
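# this repeating job cancels itself when the page dies and otherwise loops over pending seeds while the page is open and we are unpaused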
def REPEATINGWorkOnFiles( self, page_key ):
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._files_repeating_job.Cancel()
return
work_to_do = self._seed_cache.WorkToDo() and not ( self._paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
while work_to_do:
try:
self._WorkOnFiles( page_key )
HG.client_controller.WaitUntilViewFree()
except Exception as e:
HydrusData.ShowException( e )
with self._lock:
if PageImporterShouldStopWorking( page_key ):
self._files_repeating_job.Cancel()
return
work_to_do = self._seed_cache.WorkToDo() and not ( self._paused or HG.client_controller.PageClosedButNotDestroyed( page_key ) )
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_URLS_IMPORT ] = URLsImport