# hydrus/include/HydrusDownloading.py
import bs4
import collections
import httplib
import HydrusConstants as HC
import json
import lxml
import pafy
import threading
import time
import traceback
import urllib
import urlparse
import wx
def ConvertServiceIdentifiersToTagsToServiceIdentifiersToContentUpdates( hash, service_identifiers_to_tags ):
    
    hashes = set( ( hash, ) )
    
    service_identifiers_to_content_updates = {}
    
    for ( service_identifier, tags ) in service_identifiers_to_tags.items():
        
        # local tags can be added outright; remote tags must be pended for upload
        if service_identifier == HC.LOCAL_TAG_SERVICE_IDENTIFIER: action = HC.CONTENT_UPDATE_ADD
        else: action = HC.CONTENT_UPDATE_PENDING
        
        content_updates = [ HC.ContentUpdate( HC.CONTENT_DATA_TYPE_MAPPINGS, action, ( tag, hashes ) ) for tag in tags ]
        
        service_identifiers_to_content_updates[ service_identifier ] = content_updates
        
    return service_identifiers_to_content_updates
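# An illustrative sketch, not called anywhere in the client: given a hash and a
# mapping like the below, the helper above yields ADD updates for the local tag
# service and PENDING updates for anything remote. The remote identifier here is
# a placeholder.
#
#   service_identifiers_to_tags = {
#       HC.LOCAL_TAG_SERVICE_IDENTIFIER : [ 'creator:someone', 'title:example' ],
#       some_remote_tag_service_identifier : [ 'creator:someone' ]
#   }
#
#   updates = ConvertServiceIdentifiersToTagsToServiceIdentifiersToContentUpdates( hash, service_identifiers_to_tags )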
def GetDownloader( site_download_type, *args ):
    
    if site_download_type == HC.SITE_DOWNLOAD_TYPE_BOORU: c = DownloaderBooru
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_DEVIANT_ART: c = DownloaderDeviantArt
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_GIPHY: c = DownloaderGiphy
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_HENTAI_FOUNDRY: c = DownloaderHentaiFoundry
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_PIXIV: c = DownloaderPixiv
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_TUMBLR: c = DownloaderTumblr
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_NEWGROUNDS: c = DownloaderNewgrounds
    
    return c( *args )
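# A minimal usage sketch, assuming a running client with a booru object from the db;
# the tag list is illustrative and this helper is not part of the module proper.
def _ExampleGetDownloader( booru ):
    
    downloader = GetDownloader( HC.SITE_DOWNLOAD_TYPE_BOORU, booru, [ 'blue_eyes', 'blonde_hair' ] )
    
    return downloader.GetAnotherPage() # a list of ( url, ) tuples for the first gallery page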
def ConvertTagsToServiceIdentifiersToTags( tags, advanced_tag_options ):
    
    tags = [ tag for tag in tags if tag is not None ]
    
    service_identifiers_to_tags = {}
    
    siblings_manager = HC.app.GetManager( 'tag_siblings' )
    parents_manager = HC.app.GetManager( 'tag_parents' )
    
    for ( service_identifier, namespaces ) in advanced_tag_options.items():
        
        if len( namespaces ) > 0:
            
            tags_to_add_here = []
            
            for namespace in namespaces:
                
                if namespace == '': tags_to_add_here.extend( [ HC.CleanTag( tag ) for tag in tags if not ':' in tag ] )
                else: tags_to_add_here.extend( [ HC.CleanTag( tag ) for tag in tags if tag.startswith( namespace + ':' ) ] )
                
            if len( tags_to_add_here ) > 0:
                
                tags_to_add_here = siblings_manager.CollapseTags( tags_to_add_here )
                tags_to_add_here = parents_manager.ExpandTags( service_identifier, tags_to_add_here )
                
                service_identifiers_to_tags[ service_identifier ] = tags_to_add_here
                
    return service_identifiers_to_tags
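# advanced_tag_options maps each tag service to the namespaces it should receive, with
# '' standing for unnamespaced tags. A hedged example call (the mapping is illustrative):
#
#   advanced_tag_options = { HC.LOCAL_TAG_SERVICE_IDENTIFIER : [ '', 'creator', 'title' ] }
#   service_identifiers_to_tags = ConvertTagsToServiceIdentifiersToTags( tags, advanced_tag_options )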
def DownloadYoutubeURL( job_key, url, message_string ):
    
    try:
        
        parse_result = urlparse.urlparse( url )
        
        connection = httplib.HTTPConnection( parse_result.hostname, timeout = 20 )
        
        connection.request( 'GET', url )
        
        response = connection.getresponse()
        
        try:
            
            total_num_bytes = int( response.getheader( 'Content-Length' ) )
            
            get_message = lambda num_bytes_so_far: message_string + ' - ' + HC.ConvertIntToBytes( num_bytes_so_far ) + '/' + HC.ConvertIntToBytes( total_num_bytes )
            
        except:
            
            # no (or unparseable) Content-Length header, so we cannot report a total
            total_num_bytes = None
            
            get_message = lambda num_bytes_so_far: message_string + ' - ' + HC.ConvertIntToBytes( num_bytes_so_far )
            
        block_size = 64 * 1024
        num_bytes_so_far = 0
        
        temp_path = HC.GetTempPath()
        
        HC.pubsub.pub( 'message_gauge_info', job_key, total_num_bytes, num_bytes_so_far, get_message( num_bytes_so_far ) )
        
        with open( temp_path, 'wb' ) as f:
            
            while True:
                
                if HC.shutdown or job_key.IsCancelled(): return
                
                block = response.read( block_size )
                
                num_bytes_so_far += len( block )
                
                HC.pubsub.pub( 'message_gauge_info', job_key, total_num_bytes, num_bytes_so_far, get_message( num_bytes_so_far ) )
                
                if block == '': break
                
                f.write( block )
                
        HC.pubsub.pub( 'message_gauge_info', job_key, None, None, 'importing ' + message_string )
        
        ( result, hash ) = HC.app.WriteSynchronous( 'import_file', temp_path )
        
        if result in ( 'successful', 'redundant' ): HC.pubsub.pub( 'message_gauge_show_file_button', job_key, message_string, { hash } )
        elif result == 'deleted': HC.pubsub.pub( 'message_gauge_info', job_key, None, None, 'File was already deleted!' )
        
    except Exception as e:
        
        HC.pubsub.pub( 'message_gauge_info', job_key, None, None, 'Error with ' + message_string + '!' )
        
        HC.ShowException( e )
def GetYoutubeFormats( youtube_url ):
    
    try: p = pafy.Pafy( youtube_url )
    except: raise Exception( 'Could not fetch video info from youtube!' )
    
    info = { ( s.extension, s.resolution ) : ( s.url, s.title ) for s in p.streams if s.extension in ( 'flv', 'mp4' ) }
    
    return info
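# A sketch of how the two youtube functions fit together, assuming a job_key from the
# messaging system; the ( extension, resolution ) keys come straight from pafy's stream
# attributes, so their exact values differ per video.
def _ExampleYoutubeDownload( job_key, youtube_url ):
    
    info = GetYoutubeFormats( youtube_url )
    
    # pick any offered format; keys look something like ( 'mp4', '640x360' )
    ( ( extension, resolution ), ( url, title ) ) = list( info.items() )[0]
    
    DownloadYoutubeURL( job_key, url, title )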
class Downloader():
    
    def __init__( self ):
        
        self._we_are_done = False
        self._connections = {}
        self._report_hooks = []
        self._all_urls_so_far = set()
        self._num_pages_done = 0
        
    def _DownloadFile( self, connection, *args, **kwargs ):
        
        for hook in self._report_hooks: connection.AddReportHook( hook )
        
        response = connection.geturl( *args, **kwargs )
        
        connection.ClearReportHooks()
        
        return response
        
    def _EstablishSession( self, connection ): pass
    
    def _GetConnection( self, url ):
        
        parse_result = urlparse.urlparse( url )
        
        ( scheme, host, port ) = ( parse_result.scheme, parse_result.hostname, parse_result.port )
        
        if ( scheme, host, port ) not in self._connections:
            
            connection = HC.get_connection( scheme = scheme, host = host, port = port )
            
            self._EstablishSession( connection )
            
            self._connections[ ( scheme, host, port ) ] = connection
            
        return self._connections[ ( scheme, host, port ) ]
        
    def _GetNextGalleryPageURLs( self ): return ( self._GetNextGalleryPageURL(), )
    
    def AddReportHook( self, hook ): self._report_hooks.append( hook )
    
    def ClearReportHooks( self ): self._report_hooks = []
    
    def GetAnotherPage( self ):
        
        if self._we_are_done: return []
        
        urls = self._GetNextGalleryPageURLs()
        
        url_info = []
        
        for url in urls:
            
            connection = self._GetConnection( url )
            
            data = connection.geturl( url )
            
            page_of_url_info = self._ParseGalleryPage( data, url )
            
            # stop ourselves getting into an accidental infinite loop
            url_info += [ info for info in page_of_url_info if info[0] not in self._all_urls_so_far ]
            
            self._all_urls_so_far.update( [ info[0] for info in url_info ] )
            
            # now url_info only contains new url info
            
        self._num_pages_done += 1
        
        return url_info
        
    def GetFile( self, url, *args ):
        
        connection = self._GetConnection( url )
        
        return self._DownloadFile( connection, url, response_to_path = True )
        
    def GetFileAndTags( self, url, *args ):
        
        temp_path = self.GetFile( url, *args )
        
        tags = self.GetTags( url, *args )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ): pass
    
    def SetupGallerySearch( self ): pass
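# The driving pattern any Downloader subclass expects, sketched for reference: keep
# asking for pages until an empty list comes back. GetAnotherPage filters out urls it
# has already seen, so this also terminates on galleries that repeat their last page.
def _ExampleDriveDownloader( downloader ):
    
    all_url_info = []
    
    while True:
        
        page_of_url_info = downloader.GetAnotherPage()
        
        if len( page_of_url_info ) == 0: break
        
        all_url_info.extend( page_of_url_info )
        
    return all_url_info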
class DownloaderBooru( Downloader ):
    
    def __init__( self, booru, tags ):
        
        self._booru = booru
        self._tags = tags
        
        self._gallery_advance_num = None
        
        ( self._search_url, self._advance_by_page_num, self._search_separator, self._thumb_classname ) = booru.GetGalleryParsingInfo()
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ):
        
        if self._advance_by_page_num: index = 1 + self._num_pages_done
        else:
            
            # this booru advances by result offset; the step size is learned from the
            # number of thumbnails on the first page we parse
            if self._gallery_advance_num is None: index = 0
            else: index = self._num_pages_done * self._gallery_advance_num
            
        return self._search_url.replace( '%tags%', self._search_separator.join( self._tags ) ).replace( '%index%', HC.u( index ) )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
        def starts_with_classname( classname ): return classname is not None and classname.startswith( self._thumb_classname )
        
        thumbnails = soup.find_all( class_ = starts_with_classname )
        
        if self._gallery_advance_num is None:
            
            if len( thumbnails ) == 0: self._we_are_done = True
            else: self._gallery_advance_num = len( thumbnails )
            
        for thumbnail in thumbnails:
            
            links = thumbnail.find_all( 'a' )
            
            if thumbnail.name == 'a': links.append( thumbnail )
            
            for link in links:
                
                if link.string is not None and link.string == 'Image Only': continue # rule 34 @ paheal fix
                
                url = link[ 'href' ]
                
                url = urlparse.urljoin( url_base, url )
                
                if url not in urls_set:
                    
                    urls_set.add( url )
                    urls.append( ( url, ) )
                    
        return urls
        
    def _ParseImagePage( self, html, url_base ):
        
        ( search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces ) = self._booru.GetData()
        
        soup = bs4.BeautifulSoup( html )
        
        image_base = None
        
        if image_id is not None:
            
            image = soup.find( id = image_id )
            
            image_url = image[ 'src' ]
            
        if image_data is not None:
            
            links = soup.find_all( 'a' )
            
            for link in links:
                
                if link.string == image_data: image_url = link[ 'href' ]
                
        image_url = urlparse.urljoin( url_base, image_url )
        
        image_url = image_url.replace( 'sample/sample-', '' ) # fix for danbooru resizing
        
        tags = []
        
        for ( tag_classname, namespace ) in tag_classnames_to_namespaces.items():
            
            tag_list_entries = soup.find_all( class_ = tag_classname )
            
            for tag_list_entry in tag_list_entries:
                
                links = tag_list_entry.find_all( 'a' )
                
                if tag_list_entry.name == 'a': links.append( tag_list_entry )
                
                for link in links:
                    
                    if link.string not in ( '?', '-', '+' ):
                        
                        if namespace == '': tags.append( link.string )
                        else: tags.append( namespace + ':' + link.string )
                        
        return ( image_url, tags )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return self._DownloadFile( connection, file_url, response_to_path = True )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        temp_path = self._DownloadFile( connection, file_url, response_to_path = True )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
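# A hedged end-to-end sketch for the booru downloader: crawl one gallery page, then
# fetch each file and its parsed tags. Real callers thread this and feed the results
# into the import pipeline rather than collecting them in a list.
def _ExampleBooruPage( booru, tags ):
    
    downloader = DownloaderBooru( booru, tags )
    
    results = []
    
    for ( url, ) in downloader.GetAnotherPage():
        
        ( temp_path, file_tags ) = downloader.GetFileAndTags( url )
        
        results.append( ( temp_path, file_tags ) )
        
    return results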
class DownloaderDeviantArt( Downloader ):
    
    def __init__( self, artist ):
        
        self._gallery_url = 'http://' + artist + '.deviantart.com/gallery/?catpath=/&offset='
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url + HC.u( self._num_pages_done * 24 )
    
    def _ParseGalleryPage( self, html, url_base ):
        
        results = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbs_container = soup.find( class_ = 'zones-container' )
        
        def starts_with_thumb( classname ): return classname is not None and classname.startswith( 'thumb' )
        
        links = thumbs_container.find_all( 'a', class_ = starts_with_thumb )
        
        for link in links:
            
            try: # starts_with_thumb picks up some false positives, but they break
                
                page_url = link[ 'href' ] # something in the form of blah.da.com/art/blah-123456
                
                raw_title = link[ 'title' ] # sweet dolls by ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
                
                raw_title_reversed = raw_title[::-1] # yrleweJ ;tg& serutainiM ;tg& stfarC nasitrA ni 2102 ,92 beF ,CainegnA~ yb sllod teews
                
                ( creator_and_date_and_tags_reversed, title_reversed ) = raw_title_reversed.split( ' yb ', 1 )
                
                creator_and_date_and_tags = creator_and_date_and_tags_reversed[::-1] # ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
                
                ( creator_with_username_char, date_and_tags ) = creator_and_date_and_tags.split( ',', 1 )
                
                creator = creator_with_username_char[1:] # AngeniaC
                
                title = title_reversed[::-1] # sweet dolls
                
                try:
                    
                    ( date_gumpf, raw_category_tags ) = date_and_tags.split( ' in ', 1 )
                    
                    category_tags = raw_category_tags.split( ' > ' )
                    
                except Exception as e:
                    
                    HC.ShowException( e )
                    
                    category_tags = []
                    
                tags = []
                
                tags.append( 'title:' + title )
                tags.append( 'creator:' + creator )
                tags.extend( category_tags )
                
                results.append( ( page_url, tags ) )
                
            except: pass
            
        return results
        
    def _ParseImagePage( self, html ):
        
        soup = bs4.BeautifulSoup( html )
        
        # if can find download link:
        if False:
            
            pass # go fetch the popup page using tokens as appropriate. feels like it needs the GET token and a referrer, as middle click just redirects back to image page
            
        else:
            
            img = soup.find( class_ = 'dev-content-full' )
            
            src = img[ 'src' ]
            
            return src
            
    def _GetFileURL( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html )
        
    def GetFile( self, url, tags ):
        
        file_url = self._GetFileURL( url )
        
        connection = self._GetConnection( file_url )
        
        return self._DownloadFile( connection, file_url, response_to_path = True )
        
    def GetTags( self, url, tags ): return tags
class DownloaderGiphy( Downloader ):
    
    def __init__( self, tag ):
        
        self._gallery_url = 'http://giphy.com/api/gifs?tag=' + tag.replace( ' ', '+' ) + '&page='
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url + HC.u( self._num_pages_done + 1 )
    
    def _ParseGalleryPage( self, data, url_base ):
        
        json_dict = json.loads( data )
        
        if 'data' in json_dict:
            
            json_data = json_dict[ 'data' ]
            
            return [ ( d[ 'image_original_url' ], d[ 'id' ] ) for d in json_data ]
            
        else: return []
        
    def GetTags( self, url, id ):
        
        url = 'http://giphy.com/api/gifs/' + HC.u( id )
        
        connection = self._GetConnection( url )
        
        try:
            
            raw_json = connection.geturl( url )
            
            json_dict = json.loads( raw_json )
            
            tags_data = json_dict[ 'data' ][ 'tags' ]
            
            tags = [ tag_data[ 'name' ] for tag_data in tags_data ]
            
        except Exception as e:
            
            HC.ShowException( e )
            
            tags = []
            
        return tags
class DownloaderHentaiFoundry( Downloader ):
    
    def __init__( self, query_type, query, advanced_hentai_foundry_options ):
        
        self._query_type = query_type
        self._query = query
        self._advanced_hentai_foundry_options = advanced_hentai_foundry_options
        
        Downloader.__init__( self )
        
    def _EstablishSession( self, connection ):
        
        manager = HC.app.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'hentai foundry' )
        
        for ( key, value ) in cookies.items(): connection.SetCookie( key, value )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def _GetNextGalleryPageURL( self ):
        
        if self._query_type in ( 'artist', 'artist pictures' ):
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist
            
            return gallery_url + '/page/' + HC.u( self._num_pages_done + 1 )
            
        elif self._query_type == 'artist scraps':
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist + '/scraps'
            
            return gallery_url + '/page/' + HC.u( self._num_pages_done + 1 )
            
        elif self._query_type == 'tags':
            
            tags = self._query
            
            # scraps = 0 means hide
            # -1 means show both
            # 1 means scraps only. wetf
            return 'http://www.hentai-foundry.com/search/pictures?query=' + '+'.join( tags ) + '&search_in=all&scraps=-1&page=' + HC.u( self._num_pages_done + 1 )
            
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        
        soup = bs4.BeautifulSoup( html )
        
        def correct_url( href ):
            
            # a good url is in the form "/pictures/user/artist_name/file_id/title"
            if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):
                
                ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )
                
                # /pictures/user/artist_name/page/3 is a pagination link, not a picture
                if file_id != 'page': return True
                
            return False
            
        links = soup.find_all( 'a', href = correct_url )
        
        urls = [ 'http://www.hentai-foundry.com' + link['href'] for link in links ]
        
        result_urls = []
        
        for url in urls:
            
            if url not in urls_set:
                
                urls_set.add( url )
                
                result_urls.append( ( url, ) )
                
        # this is copied from old code. surely we can improve it?
        if 'class="next"' not in html: self._we_are_done = True
        
        return result_urls
        
    def _ParseImagePage( self, html, url_base ):
        
        # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
        # find http://pictures.hentai-foundry.com//
        # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144.jpg
        # the .jpg bit is what we really need, but whatever
        try:
            
            index = html.index( 'http://pictures.hentai-foundry.com//' )
            
            stuff = html[ index : index + 100 ]
            
            try: ( image_url, gumpf ) = stuff.split( '"', 1 )
            except: ( image_url, gumpf ) = stuff.split( "'", 1 )
            
        except: raise Exception( 'Could not parse image url!' )
        
        soup = bs4.BeautifulSoup( html )
        
        tags = []
        
        try:
            
            title = soup.find( 'title' )
            
            ( data, nothing ) = HC.u( title.string ).split( ' - Hentai Foundry' )
            
            data_reversed = data[::-1] # want to do it right-side first, because title might have ' by ' in it
            
            ( artist_reversed, title_reversed ) = data_reversed.split( ' yb ' )
            
            artist = artist_reversed[::-1]
            title = title_reversed[::-1]
            
            tags.append( 'creator:' + artist )
            tags.append( 'title:' + title )
            
        except: pass
        
        tag_links = soup.find_all( 'a', rel = 'tag' )
        
        for tag_link in tag_links: tags.append( tag_link.string )
        
        return ( image_url, tags )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return self._DownloadFile( connection, file_url, response_to_path = True )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        temp_path = self._DownloadFile( connection, file_url, response_to_path = True )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
    def SetupGallerySearch( self ):
        
        connection = self._GetConnection( 'http://www.hentai-foundry.com/site/filters' )
        
        cookies = connection.GetCookies()
        
        raw_csrf = cookies[ 'YII_CSRF_TOKEN' ] # YII_CSRF_TOKEN=19b05b536885ec60b8b37650a32f8deb11c08cd1s%3A40%3A%222917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32%22%3B
        
        processed_csrf = urllib.unquote( raw_csrf ) # 19b05b536885ec60b8b37650a32f8deb11c08cd1s:40:"2917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32";
        
        csrf_token = processed_csrf.split( '"' )[1] # the 2917... bit
        
        self._advanced_hentai_foundry_options[ 'YII_CSRF_TOKEN' ] = csrf_token
        
        body = urllib.urlencode( self._advanced_hentai_foundry_options )
        
        headers = {}
        headers[ 'Content-Type' ] = 'application/x-www-form-urlencoded'
        
        connection.request( 'POST', '/site/filters', headers = headers, body = body )
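# SetupGallerySearch has to run before the gallery walk, since HF filters results
# per-session. A hedged usage sketch (the keys inside the options dict are site form
# fields and are not enumerated here):
#
#   downloader = DownloaderHentaiFoundry( 'artist', 'some_artist', advanced_hentai_foundry_options )
#   downloader.SetupGallerySearch() # POSTs the filters along with the session's CSRF token
#   first_page = downloader.GetAnotherPage()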
class DownloaderNewgrounds( Downloader ):
    
    def __init__( self, query ):
        
        self._query = query
        
        Downloader.__init__( self )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def _GetNextGalleryPageURLs( self ):
        
        artist = self._query
        
        gallery_urls = []
        
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/games/' )
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/movies/' )
        
        self._we_are_done = True
        
        return gallery_urls
        
    def _ParseGalleryPage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        fatcol = soup.find( 'div', class_ = 'fatcol' )
        
        links = fatcol.find_all( 'a' )
        
        urls_set = set()
        
        result_urls = []
        
        for link in links:
            
            try:
                
                url = link[ 'href' ]
                
                if url not in urls_set:
                    
                    if url.startswith( 'http://www.newgrounds.com/portal/view/' ):
                        
                        urls_set.add( url )
                        
                        result_urls.append( ( url, ) )
                        
            except: pass
            
        return result_urls
        
    def _ParseImagePage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        tags = set()
        
        author_links = soup.find( 'ul', class_ = 'authorlinks' )
        
        if author_links is not None:
            
            authors = set()
            
            links = author_links.find_all( 'a' )
            
            for link in links:
                
                try:
                    
                    href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com
                    
                    creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )
                    
                    tags.add( u'creator:' + creator )
                    
                except: pass
                
        try:
            
            title = soup.find( 'title' )
            
            tags.add( u'title:' + title.string )
            
        except: pass
        
        all_links = soup.find_all( 'a' )
        
        for link in all_links:
            
            try:
                
                href = link[ 'href' ]
                
                if '/browse/tag/' in href: tags.add( link.string )
                
            except: pass
            
        #
        
        try:
            
            components = html.split( '"http://uploads.ungrounded.net/' )
            
            # there is sometimes another bit of api flash earlier on that we don't want
            # it is called http://uploads.ungrounded.net/apiassets/sandbox.swf
            if len( components ) == 2: flash_url = components[1]
            else: flash_url = components[2]
            
            flash_url = flash_url.split( '"', 1 )[0]
            
            flash_url = 'http://uploads.ungrounded.net/' + flash_url
            
        except: raise Exception( 'Could not find the swf file! It was probably an mp4!' )
        
        return ( flash_url, tags )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return self._DownloadFile( connection, file_url, response_to_path = True )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        temp_path = self._DownloadFile( connection, file_url, response_to_path = True )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
class DownloaderPixiv( Downloader ):
    
    def __init__( self, query_type, query ):
        
        self._query_type = query_type
        self._query = query
        
        Downloader.__init__( self )
        
    def _EstablishSession( self, connection ):
        
        manager = HC.app.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'pixiv' )
        
        for ( key, value ) in cookies.items(): connection.SetCookie( key, value )
        
    def _GetNextGalleryPageURL( self ):
        
        if self._query_type == 'artist':
            
            artist_id = self._query
            
            gallery_url = 'http://www.pixiv.net/member_illust.php?id=' + HC.u( artist_id )
            
        elif self._query_type == 'tag':
            
            tag = self._query
            
            tag = urllib.quote( tag.encode( 'utf-8' ) )
            
            gallery_url = 'http://www.pixiv.net/search.php?word=' + tag + '&s_mode=s_tag_full&order=date_d'
            
        return gallery_url + '&p=' + HC.u( self._num_pages_done + 1 )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        results = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbnail_links = soup.find_all( class_ = 'work' )
        
        for thumbnail_link in thumbnail_links:
            
            url = urlparse.urljoin( url_base, thumbnail_link[ 'href' ] ) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
            
            results.append( ( url, ) )
            
        return results
        
    def _ParseImagePage( self, html, page_url ):
        
        soup = bs4.BeautifulSoup( html )
        
        #
        
        # this is the page that holds the full size of the image.
        # pixiv won't serve the image unless it thinks this page is the referrer
        referral_url = page_url.replace( 'medium', 'big' ) # http://www.pixiv.net/member_illust.php?mode=big&illust_id=33500690
        
        #
        
        works_display = soup.find( class_ = 'works_display' )
        
        img = works_display.find( 'img' )
        
        img_url = img[ 'src' ] # http://i2.pixiv.net/img122/img/amanekukagenoyuragi/34992468_m.png
        
        image_url = img_url.replace( '_m.', '.' ) # http://i2.pixiv.net/img122/img/amanekukagenoyuragi/34992468.png
        
        #
        
        tags = soup.find( 'ul', class_ = 'tags' )
        
        tags = [ a_item.string for a_item in tags.find_all( 'a', class_ = 'text' ) ]
        
        user = soup.find( 'h1', class_ = 'user' )
        
        tags.append( 'creator:' + user.string )
        
        title_parent = soup.find( 'section', class_ = 'work-info' )
        
        title = title_parent.find( 'h1', class_ = 'title' )
        
        tags.append( 'title:' + title.string )
        
        try: tags.append( 'creator:' + image_url.split( '/' )[ -2 ] ) # http://i2.pixiv.net/img02/img/dnosuke/462657.jpg -> dnosuke
        except: pass
        
        return ( referral_url, image_url, tags )
        
    def _GetReferralURLFileURLAndTags( self, page_url ):
        
        connection = self._GetConnection( page_url )
        
        html = connection.geturl( page_url )
        
        return self._ParseImagePage( html, page_url )
        
    def GetFile( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        connection = self._GetConnection( image_url )
        
        headers = { 'Referer' : referral_url }
        
        return self._DownloadFile( connection, image_url, headers = headers, response_to_path = True )
        
    def GetFileAndTags( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        connection = self._GetConnection( image_url )
        
        headers = { 'Referer' : referral_url }
        
        temp_path = self._DownloadFile( connection, image_url, headers = headers, response_to_path = True )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        return tags
class DownloaderTumblr( Downloader ):
    
    def __init__( self, username ):
        
        self._gallery_url = 'http://' + username + '.tumblr.com/api/read/json?start=%start%&num=50'
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url.replace( '%start%', HC.u( self._num_pages_done * 50 ) )
    
    def _ParseGalleryPage( self, data, url_base ):
        
        processed_raw_json = data.split( 'var tumblr_api_read = ' )[1][:-2] # -2 takes a couple newline chars off at the end
        
        json_object = json.loads( processed_raw_json )
        
        results = []
        
        if 'posts' in json_object:
            
            for post in json_object[ 'posts' ]:
                
                if 'tags' in post: tags = post[ 'tags' ]
                else: tags = []
                
                post_type = post[ 'type' ]
                
                if post_type == 'photo':
                    
                    if len( post[ 'photos' ] ) == 0:
                        
                        try: results.append( ( post[ 'photo-url-1280' ], tags ) )
                        except: pass
                        
                    else:
                        
                        for photo in post[ 'photos' ]:
                            
                            try: results.append( ( photo[ 'photo-url-1280' ], tags ) )
                            except: pass
                            
        return results
        
    def GetTags( self, url, tags ): return tags
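# The tumblr api/read/json endpoint pages by raw offset in steps of 50, so for a
# username of 'example', page two of the gallery would come from:
#
#   http://example.tumblr.com/api/read/json?start=50&num=50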
class DownloaderEngine(): # rename this to something more import related
    
    # this should be a yamlable thing
    
    def __init__( self, page_key, import_queue_generator ):
        
        self._page_key = page_key
        self._import_queue_generator = import_queue_generator
        
        self._current_queue_processor = None
        self._pending_queue_jobs = []
        
    def GetCurrentQueueProcessor( self ): return self._current_queue_processor
    
    def ToTuple( self ): return ( self._pending_queue_jobs, )
    
    def PendQueueJob( self, job ): self._pending_queue_jobs.append( job )
    
    def THREADProcessJobs( self ):
        
        while True:
            
            if len( self._pending_queue_jobs ) > 0:
                
                job = self._pending_queue_jobs.pop( 0 )
                
                self._current_queue_processor = self._import_queue_generator( job )
                
                self._current_queue_processor.ProcessQueue()
                
            else: time.sleep( 1 ) # idle politely rather than busy-wait
class ImportQueueProcessor():
    
    def __init__( self, page_key, import_args_generator ):
        
        self._page_key = page_key
        self._import_args_generator = import_args_generator
        
        self._queue_is_done = False
        self._queue = []
        self._paused = False
        
        self._current_position = 0
        
        self._lock = threading.Lock()
        
        HC.pubsub.sub( self, 'SetPaused', 'pause_import_queue_processor' )
        
    def AddToQueue( self, queue_objects ):
        
        with self._lock: self._queue.extend( queue_objects )
        
    def QueueIsDone( self ): self._queue_is_done = True
    
    def SetPaused( self, status ): self._paused = status
    
    def ToTuple( self ):
        
        with self._lock: return ( self._current_position, len( self._queue ) )
        
    def ProcessQueue( self ):
        
        while not self._queue_is_done:
            
            with self._lock: queue_length = len( self._queue )
            
            if not self._paused and self._current_position < queue_length:
                
                with self._lock: queue_object = self._queue[ self._current_position ]
                
                # reorder these params as is best
                ( temp_path, url, tags, anything_else ) = self._import_args_generator( self._page_key, queue_object )
                
                # synchronously write import to db
                
                self._current_position += 1
                
            time.sleep( 1 )
            
    def PathGeneratorBooru( self, page_key, queue_object ):
        
        # unpack queue_object
        # test url or whatever as appropriate
        # fetch file, possibly with help of downloader or whatever!
        # downloader should write file to path, returning temp_path
        # we should return temp_path
        
        pass
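# A hedged sketch of how the two classes above are meant to cooperate once finished;
# the import_queue_generator argument is whatever callable turns a pended job into an
# ImportQueueProcessor, and the job payload here is a placeholder.
def _ExampleStartEngine( page_key, import_queue_generator ):
    
    engine = DownloaderEngine( page_key, import_queue_generator )
    
    threading.Thread( target = engine.THREADProcessJobs, name = 'downloader engine' ).start()
    
    engine.PendQueueJob( 'a query or url list' )
    
    return engine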