# hydrus/include/ClientDownloading.py
import bs4
import ClientConstants as CC
import ClientData
import ClientNetworking
import HydrusConstants as HC
import HydrusData
import HydrusExceptions
import HydrusGlobals as HG
import HydrusPaths
import HydrusSerialisable
import json
import lxml # to force import for later bs4 stuff
import os
import pafy
import re
import requests
import threading
import time
import urllib
import urlparse

# This is fairly ugly, but it works for what I need it to do

URL_EXTRA_INFO = {}
URL_EXTRA_INFO_LOCK = threading.Lock()

def GetExtraURLInfo( url ):
    
    with URL_EXTRA_INFO_LOCK:
        
        if url in URL_EXTRA_INFO:
            return URL_EXTRA_INFO[ url ]
        else:
            return None
    
def GetGalleryStreamIdentifiers( gallery_identifier ):
    
    site_type = gallery_identifier.GetSiteType()
    
    if site_type == HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST:
        
        gallery_stream_identifiers = [ GalleryIdentifier( HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST_PICTURES ), GalleryIdentifier( HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST_SCRAPS ) ]
        
    elif site_type == HC.SITE_TYPE_NEWGROUNDS:
        
        gallery_stream_identifiers = [ GalleryIdentifier( HC.SITE_TYPE_NEWGROUNDS_GAMES ), GalleryIdentifier( HC.SITE_TYPE_NEWGROUNDS_MOVIES ) ]
        
    else:
        
        gallery_stream_identifiers = [ gallery_identifier ]
        
    return gallery_stream_identifiers
    
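# A rough sketch of the expansion this performs -- a single artist identifier
# becomes one identifier per download stream:
#
#   GetGalleryStreamIdentifiers( GalleryIdentifier( HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST ) )
#   # -> [ GalleryIdentifier( HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST_PICTURES ), GalleryIdentifier( HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST_SCRAPS ) ]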
def SetExtraURLInfo( url, info ):
    
    with URL_EXTRA_INFO_LOCK:
        
        URL_EXTRA_INFO[ url ] = info
    
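# Rough usage sketch for the cache above: gallery page parsers stash per-url
# data (tags, ids) at parse time, and the GetTags methods read it back later.
# The url here is hypothetical:
#
#   SetExtraURLInfo( 'http://example.com/post/123', [ 'creator:someone' ] )
#   GetExtraURLInfo( 'http://example.com/post/123' ) # -> [ 'creator:someone' ]
#   GetExtraURLInfo( 'http://example.com/post/456' ) # -> None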
def GetGallery( gallery_identifier ):
    
    site_type = gallery_identifier.GetSiteType()
    
    if site_type == HC.SITE_TYPE_BOORU:
        
        booru_name = gallery_identifier.GetAdditionalInfo()
        
        return GalleryBooru( booru_name )
        
    elif site_type == HC.SITE_TYPE_DEVIANT_ART:
        return GalleryDeviantArt()
    elif site_type == HC.SITE_TYPE_GIPHY:
        return GalleryGiphy()
    elif site_type in ( HC.SITE_TYPE_HENTAI_FOUNDRY, HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST ):
        return GalleryHentaiFoundry()
    elif site_type == HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST_PICTURES:
        return GalleryHentaiFoundryArtistPictures()
    elif site_type == HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST_SCRAPS:
        return GalleryHentaiFoundryArtistScraps()
    elif site_type == HC.SITE_TYPE_HENTAI_FOUNDRY_TAGS:
        return GalleryHentaiFoundryTags()
    elif site_type == HC.SITE_TYPE_NEWGROUNDS:
        return GalleryNewgrounds()
    elif site_type == HC.SITE_TYPE_NEWGROUNDS_GAMES:
        return GalleryNewgroundsGames()
    elif site_type == HC.SITE_TYPE_NEWGROUNDS_MOVIES:
        return GalleryNewgroundsMovies()
    elif site_type == HC.SITE_TYPE_PIXIV:
        return GalleryPixiv()
    elif site_type == HC.SITE_TYPE_PIXIV_ARTIST_ID:
        return GalleryPixivArtistID()
    elif site_type == HC.SITE_TYPE_PIXIV_TAG:
        return GalleryPixivTag()
    elif site_type == HC.SITE_TYPE_TUMBLR:
        return GalleryTumblr()
    
_8CHAN_BOARDS_TO_MEDIA_HOSTS = {}

def GetImageboardFileURL( thread_url, filename, ext ):
    
    ( thread_url, host, board, thread_id ) = ParseImageboardThreadURL( thread_url )
    
    is_4chan = '4chan.org' in host
    is_8chan = '8ch.net' in host
    
    if is_4chan:
        
        return 'https://i.4cdn.org/' + board + '/' + filename + ext
        
    elif is_8chan:
        
        if len( filename ) == 64: # new sha256 filename
            
            return 'https://media.8ch.net/file_store/' + filename + ext
            
        else:
            
            if board not in _8CHAN_BOARDS_TO_MEDIA_HOSTS:
                
                try:
                    
                    html_url = 'https://8ch.net/' + board + '/res/' + thread_id + '.html'
                    
                    network_job = ClientNetworking.NetworkJob( 'GET', html_url )
                    
                    network_job.OverrideBandwidth()
                    
                    HG.client_controller.network_engine.AddJob( network_job )
                    
                    network_job.WaitUntilDone()
                    
                    thread_html = network_job.GetContent()
                    
                    soup = GetSoup( thread_html )
                    
                    file_infos = soup.find_all( 'p', class_ = 'fileinfo' )
                    
                    example_file_url = file_infos[0].find( 'a' )[ 'href' ]
                    
                    parse_result = urlparse.urlparse( example_file_url )
                    
                    hostname = parse_result.hostname
                    
                    if hostname is None:
                        hostname = '8ch.net'
                    
                    _8CHAN_BOARDS_TO_MEDIA_HOSTS[ board ] = hostname
                    
                except Exception as e:
                    
                    _8CHAN_BOARDS_TO_MEDIA_HOSTS[ board ] = 'media.8ch.net'
                    
            media_host = _8CHAN_BOARDS_TO_MEDIA_HOSTS[ board ]
            
            return 'https://' + media_host + '/' + board + '/src/' + filename + ext
    
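# e.g. the simple 4chan case (board, filename, and ext here are hypothetical):
#
#   GetImageboardFileURL( 'https://boards.4chan.org/asp/thread/382059', '1463698563866', '.webm' )
#   # -> 'https://i.4cdn.org/asp/1463698563866.webm'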
def GetImageboardThreadJSONURL( thread_url ):
    
    ( thread_url, host, board, thread_id ) = ParseImageboardThreadURL( thread_url )
    
    is_4chan = '4chan.org' in host
    is_8chan = '8ch.net' in host
    
    # 4chan
    # https://a.4cdn.org/asp/thread/382059.json
    
    # 8chan
    # https://8ch.net/v/res/406061.json
    
    if is_4chan:
        return 'https://a.4cdn.org/' + board + '/thread/' + thread_id + '.json'
    elif is_8chan:
        return 'https://8ch.net/' + board + '/res/' + thread_id + '.json'
    
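# e.g.:
#
#   GetImageboardThreadJSONURL( 'https://boards.4chan.org/asp/thread/382059' )
#   # -> 'https://a.4cdn.org/asp/thread/382059.json'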
def GetSoup( html ):
    
    return bs4.BeautifulSoup( html, 'lxml' )
    
def GetYoutubeFormats( youtube_url ):
    
    try:
        p = pafy.new( youtube_url )
    except Exception as e:
        raise Exception( 'Could not fetch video info from youtube!' + os.linesep + HydrusData.ToUnicode( e ) )
    
    info = { ( s.extension, s.resolution ) : ( s.url, s.title ) for s in p.streams if s.extension in ( 'flv', 'mp4', 'webm' ) }
    
    return info
    
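# The result maps ( extension, resolution ) to ( stream url, title ), roughly
# like this (values are illustrative, not real):
#
#   { ( 'webm', '640x360' ) : ( 'https://...googlevideo.com/videoplayback?...', 'some video title' ) }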
def Parse4chanPostScreen( html ):
    
    soup = GetSoup( html )
    
    title_tag = soup.find( 'title' )
    
    if title_tag.string == 'Post successful!':
        
        return ( 'success', None )
        
    elif title_tag.string == '4chan - Banned':
        
        HydrusData.Print( soup )
        
        text = 'You are banned from this board! html written to log.'
        
        HydrusData.ShowText( text )
        
        return ( 'big error', text )
        
    else:
        
        try:
            
            problem_tag = soup.find( id = 'errmsg' )
            
            if problem_tag is None:
                
                HydrusData.Print( soup )
                
                text = 'Unknown problem; html written to log.'
                
                HydrusData.ShowText( text )
                
                return ( 'error', text )
                
            problem = HydrusData.ToUnicode( problem_tag )
            
            if 'CAPTCHA' in problem:
                return ( 'captcha', None )
            elif 'seconds' in problem:
                return ( 'too quick', None )
            elif 'Duplicate' in problem:
                return ( 'error', 'duplicate file detected' )
            else:
                return ( 'error', problem )
            
        except:
            
            return ( 'error', 'unknown error' )
    
def ParseImageboardFileURLFromPost( thread_url, post, source_timestamp ):
    
    url_filename = str( post[ 'tim' ] )
    url_ext = post[ 'ext' ]
    
    file_original_filename = post[ 'filename' ]
    
    file_url = GetImageboardFileURL( thread_url, url_filename, url_ext )
    
    if 'md5' in post:
        file_md5_base64 = post[ 'md5' ]
    else:
        file_md5_base64 = None
    
    return ( file_url, file_md5_base64, file_original_filename, source_timestamp )
    
def ParseImageboardFileURLsFromJSON( thread_url, raw_json ):
    
    json_dict = json.loads( raw_json )
    
    posts_list = json_dict[ 'posts' ]
    
    file_infos = []
    
    for post in posts_list:
        
        if 'filename' not in post:
            continue
        
        if 'time' in post:
            source_timestamp = post[ 'time' ]
        else:
            source_timestamp = HydrusData.GetNow()
        
        file_infos.append( ParseImageboardFileURLFromPost( thread_url, post, source_timestamp ) )
        
        if 'extra_files' in post:
            
            for extra_file in post[ 'extra_files' ]:
                
                if 'filename' not in extra_file:
                    continue
                
                file_infos.append( ParseImageboardFileURLFromPost( thread_url, extra_file, source_timestamp ) )
                
    return file_infos
    
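# The thread json these two parsers walk looks roughly like this (4chan API
# field names; 'extra_files' is the 8chan extension for multi-file posts):
#
#   { 'posts' : [
#       { 'tim' : 1463698563866, 'ext' : '.jpg', 'filename' : 'original name',
#         'time' : 1463698563, 'md5' : '...base64...',
#         'extra_files' : [ { 'tim' : ..., 'ext' : ..., 'filename' : ... } ] } ] }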
def IsImageboardThread( url ):
    
    if '4chan.org' in url:
        
        if '/thread/' in url:
            return True
        
    if '8ch.net' in url:
        
        if '/res/' in url:
            return True
        
    return False
    
def ParseImageboardThreadURL( thread_url ):
    
    try:
        
        if '#' in thread_url:
            ( thread_url, post_anchor_gumpf ) = thread_url.split( '#', 1 )
        
        parse_result = urlparse.urlparse( thread_url )
        
        host = parse_result.hostname
        request = parse_result.path
        
        if host is None or request is None:
            raise Exception()
        
    except:
        
        raise Exception( 'Could not understand that url!' )
    
    is_4chan = '4chan.org' in host
    is_8chan = '8ch.net' in host
    
    if not ( is_4chan or is_8chan ):
        
        raise Exception( 'This only works for 4chan and 8chan right now!' )
    
    try:
        
        # 4chan
        # /asp/thread/382059/post-your-favourite-martial-arts-video-if-martin
        
        # 8chan
        # /v/res/406061.html
        
        if is_4chan:
            
            ( board, rest_of_request ) = request[1:].split( '/thread/', 1 )
            
            if '/' in rest_of_request:
                ( thread_id, gumpf ) = rest_of_request.split( '/', 1 ) # only split once--the slug could contain further slashes
            else:
                thread_id = rest_of_request
            
        elif is_8chan:
            
            ( board, rest_of_request ) = request[1:].split( '/res/', 1 )
            
            thread_id = rest_of_request[:-5] # trim the '.html'
            
    except Exception as e:
        
        raise Exception( 'Could not understand that thread url! Either the board or the thread id components were malformed or missing.' )
    
    return ( thread_url, host, board, thread_id )
    
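# e.g. (url shapes as in the comments above):
#
#   ParseImageboardThreadURL( 'https://boards.4chan.org/asp/thread/382059/post-your-favourite' )
#   # -> ( 'https://boards.4chan.org/asp/thread/382059/post-your-favourite', 'boards.4chan.org', 'asp', '382059' )
#
#   ParseImageboardThreadURL( 'https://8ch.net/v/res/406061.html' )
#   # -> ( 'https://8ch.net/v/res/406061.html', '8ch.net', 'v', '406061' )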
def ParsePageForURLs( html, starting_url ):
    
    soup = GetSoup( html )
    
    all_links = soup.find_all( 'a' )
    
    links_with_images = [ link for link in all_links if len( link.find_all( 'img' ) ) > 0 ]
    
    urls = [ urlparse.urljoin( starting_url, link[ 'href' ] ) for link in links_with_images ]
    
    return urls
    
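# i.e. given html containing <a href="/post/123"><img src="thumb.jpg"></a> and
# a starting_url of 'http://example.com/gallery' (both hypothetical), this
# returns [ 'http://example.com/post/123' ] -- only links wrapping an image count.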
class GalleryIdentifier( HydrusSerialisable.SerialisableBase ):
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IDENTIFIER
    SERIALISABLE_VERSION = 1
    
    def __init__( self, site_type = None, additional_info = None ):
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._site_type = site_type
        self._additional_info = additional_info
        
    def __eq__( self, other ):
        
        return self.__hash__() == other.__hash__()
        
    def __hash__( self ):
        
        return ( self._site_type, self._additional_info ).__hash__()
        
    def __ne__( self, other ):
        
        return self.__hash__() != other.__hash__()
        
    def __repr__( self ):
        
        text = 'Gallery Identifier: ' + HC.site_type_string_lookup[ self._site_type ]
        
        if self._site_type == HC.SITE_TYPE_BOORU:
            
            text += ': ' + HydrusData.ToUnicode( self._additional_info )
            
        return text
        
    def _GetSerialisableInfo( self ):
        
        return ( self._site_type, self._additional_info )
        
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        
        ( self._site_type, self._additional_info ) = serialisable_info
        
    def GetAdditionalInfo( self ):
        
        return self._additional_info
        
    def GetSiteType( self ):
        
        return self._site_type
        
    def ToString( self ):
        
        text = HC.site_type_string_lookup[ self._site_type ]
        
        if self._site_type == HC.SITE_TYPE_BOORU and self._additional_info is not None:
            
            booru_name = self._additional_info
            
            text = HC.site_type_string_lookup[ self._site_type ] + ': ' + booru_name
            
        return text
        
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IDENTIFIER ] = GalleryIdentifier
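# Typical flow, as a rough sketch -- the booru name has to exist in the client
# db for GetGallery to succeed:
#
#   gallery_identifier = GalleryIdentifier( HC.SITE_TYPE_BOORU, additional_info = 'gelbooru' )
#   gallery = GetGallery( gallery_identifier ) # -> a GalleryBooru instance
#   ( page_of_urls, definitely_no_more_pages ) = gallery.GetPage( 'blue_sky', 0 )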
class Gallery( object ):
    
    def __init__( self ):
        
        self._network_job_factory = self._DefaultNetworkJobFactory
        
    def _DefaultNetworkJobFactory( self, method, url, **kwargs ):
        
        return ClientNetworking.NetworkJob( method, url, **kwargs )
        
    def _EnsureLoggedIn( self ):
        
        pass
        
    def _FetchData( self, url, referral_url = None, temp_path = None ):
        
        self._EnsureLoggedIn()
        
        network_job = self._network_job_factory( 'GET', url, referral_url = referral_url, temp_path = temp_path )
        
        HG.client_controller.network_engine.AddJob( network_job )
        
        try:
            
            network_job.WaitUntilDone()
            
        except Exception as e:
            
            HydrusData.Print( 'The url ' + url + ' gave the following problem:' )
            HydrusData.PrintException( e )
            
            raise
            
        if temp_path is None:
            
            return network_job.GetContent()
            
    def _GetGalleryPageURL( self, query, page_index ):
        
        raise NotImplementedError()
        
    def _ParseGalleryPage( self, data, url ):
        
        raise NotImplementedError()
        
    def GetFile( self, temp_path, url ):
        
        self._FetchData( url, temp_path = temp_path )
        
    def GetFileAndTags( self, temp_path, url ):
        
        self.GetFile( temp_path, url )
        
        tags = self.GetTags( url )
        
        return tags
        
    def GetPage( self, query, page_index ):
        
        gallery_url = self._GetGalleryPageURL( query, page_index )
        
        data = self._FetchData( gallery_url )
        
        ( page_of_urls, definitely_no_more_pages ) = self._ParseGalleryPage( data, gallery_url )
        
        return ( page_of_urls, definitely_no_more_pages )
        
    def GetTags( self, url ):
        
        raise NotImplementedError()
        
    def SetNetworkJobFactory( self, network_job_factory ):
        
        self._network_job_factory = network_job_factory
        
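# A minimal subclass sketch, assuming a simple site that takes one query per
# gallery page url. Everything here is hypothetical; the real subclasses follow:
#
#   class GalleryExample( Gallery ):
#       
#       def _GetGalleryPageURL( self, query, page_index ):
#           
#           return 'http://example.com/search?q=' + query + '&page=' + str( page_index )
#           
#       def _ParseGalleryPage( self, html, url_base ):
#           
#           soup = GetSoup( html )
#           
#           urls = [ urlparse.urljoin( url_base, link[ 'href' ] ) for link in soup.find_all( 'a', class_ = 'post' ) ]
#           
#           definitely_no_more_pages = len( urls ) == 0
#           
#           return ( urls, definitely_no_more_pages )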
class GalleryBooru( Gallery ):
    
    def __init__( self, booru_name ):
        
        try:
            
            self._booru = HG.client_controller.Read( 'remote_booru', booru_name )
            
        except:
            
            raise Exception( 'Attempted to find booru "' + booru_name + '", but it was missing from the database!' )
            
        self._gallery_advance_num = None
        
        ( self._search_url, self._advance_by_page_num, self._search_separator, self._thumb_classname ) = self._booru.GetGalleryParsingInfo()
        
        Gallery.__init__( self )
        
    def _GetGalleryPageURL( self, query, page_index ):
        
        if self._advance_by_page_num:
            
            url_index = page_index + 1
            
        else:
            
            if self._gallery_advance_num is None:
                
                if page_index == 0:
                    
                    url_index = page_index
                    
                else:
                    
                    self.GetPage( query, 0 )
                    
                    if self._gallery_advance_num is None:
                        
                        raise Exception( 'Unable to calculate the booru\'s gallery advance number.' )
                        
                    else:
                        
                        url_index = page_index * self._gallery_advance_num
                        
            else:
                
                url_index = page_index * self._gallery_advance_num
                
        tags = query.split( ' ' )
        
        if 'e621' in self._search_url:
            
            tags_to_use = []
            
            for tag in tags:
                
                if '/' in tag:
                    tag = tag.replace( '/', '%-2F' )
                
                tags_to_use.append( tag )
                
            tags = tags_to_use
            
        tags_replace = self._search_separator.join( [ urllib.quote( HydrusData.ToByteString( tag ), '' ) for tag in tags ] )
        
        return self._search_url.replace( '%tags%', tags_replace ).replace( '%index%', str( url_index ) )
        
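    # The search_url template carries '%tags%' and '%index%' placeholders, so,
    # with a plausible gelbooru-style template as an example:
    #
    #   search_url = 'https://gelbooru.com/index.php?page=post&s=list&tags=%tags%&pid=%index%'
    #   query = 'blue_sky rating:safe', page_index = 1, gallery_advance_num = 42
    #   # -> 'https://gelbooru.com/index.php?page=post&s=list&tags=blue_sky+rating%3Asafe&pid=42'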
    def _ParseGalleryPage( self, html, url_base ):
        
        definitely_no_more_pages = False
        
        urls_set = set()
        urls = []
        
        soup = GetSoup( html )
        
        # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
        def starts_with_classname( classname ):
            
            return classname is not None and classname.startswith( self._thumb_classname )
            
        thumbnails = soup.find_all( class_ = starts_with_classname )
        
        # this is a sankaku thing
        popular_thumbnail_parent = soup.find( id = 'popular-preview' )
        
        if popular_thumbnail_parent is not None:
            
            popular_thumbnails = popular_thumbnail_parent.find_all( class_ = starts_with_classname )
            
            thumbnails = thumbnails[ len( popular_thumbnails ) : ]
            
        for thumbnail in thumbnails:
            
            links = thumbnail.find_all( 'a' )
            
            if thumbnail.name == 'a':
                links.append( thumbnail )
            
            for link in links:
                
                if link.string is not None and link.string == 'Image Only':
                    continue # rule 34 @ paheal fix
                
                url = link[ 'href' ]
                
                url = urlparse.urljoin( url_base, url )
                
                if url not in urls_set:
                    
                    urls_set.add( url )
                    urls.append( url )
                    
        if self._gallery_advance_num is None:
            
            if len( urls ) == 0:
                
                definitely_no_more_pages = True
                
            else:
                
                self._gallery_advance_num = len( urls )
                
        if 'gelbooru.com' in url_base:
            
            # they now use redirect urls for thumbs, wew lad
            
            bad_urls = urls
            
            urls = []
            
            session = requests.Session()
            
            for bad_url in bad_urls:
                
                # the garbage after the redirect.php is the redirect in base64
                # https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY5NDEyMg==
                
                if 'redirect.php' in bad_url:
                    
                    try:
                        
                        encoded_location = bad_url.split( '?s=' )[1]
                        
                        location = encoded_location.decode( 'base64' )
                        
                        url = urlparse.urljoin( bad_url, location )
                        
                        urls.append( url )
                        
                    except Exception as e:
                        
                        HydrusData.ShowText( 'gelbooru parsing problem!' )
                        HydrusData.ShowException( e )
                        
                        time.sleep( 2 )
                        
                        break
                        
                else:
                    
                    urls.append( bad_url )
                    
        # giving 404 on some content servers for http, no redirect for some reason
        urls = [ ClientData.ConvertHTTPToHTTPS( url ) for url in urls ]
        
        return ( urls, definitely_no_more_pages )
        
    def _ParseImagePage( self, html, url_base ):
        
        ( search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces ) = self._booru.GetData()
        
        soup = GetSoup( html )
        
        image_url = None
        
        try:
            
            if image_id is not None:
                
                image = soup.find( id = image_id )
                
                if image is None:
                    
                    image_string = soup.find( text = re.compile( 'Save this file' ) )
                    
                    if image_string is None:
                        image_string = soup.find( text = re.compile( 'Save this video' ) )
                    
                    if image_string is None:
                        
                        # catchall for rule34hentai.net's webms
                        
                        if image_url is None:
                            
                            a_tags = soup.find_all( 'a' )
                            
                            for a_tag in a_tags:
                                
                                href = a_tag[ 'href' ]
                                
                                if href is not None:
                                    
                                    if href.endswith( '.webm' ):
                                        
                                        image_url = href
                                        
                                        break
                                        
                        # catchall for rule34hentai.net's mp4s, which are loaded in a mickey-mouse flv player
                        
                        if image_url is None:
                            
                            magic_phrase = 'document.write("<source src=\''
                            
                            if magic_phrase in html:
                                
                                # /image/252605' type='video/mp4...
                                
                                image_url_and_gumpf = html.split( magic_phrase, 1 )[1]
                                
                                image_url = image_url_and_gumpf.split( '\'', 1 )[0]
                                
                    else:
                        
                        image = image_string.parent
                        
                        image_url = image[ 'href' ]
                        
                else:
                    
                    if image.name in ( 'img', 'video' ):
                        
                        image_url = image[ 'src' ]
                        
                        if 'Running Danbooru' in html:
                            
                            # possible danbooru resized image
                            
                            possible_better_image = soup.find( id = 'image-resize-link' )
                            
                            if possible_better_image is not None:
                                image_url = possible_better_image[ 'href' ]
                            
                    elif image.name == 'a':
                        
                        image_url = image[ 'href' ]
                        
            if image_data is not None:
                
                links = soup.find_all( 'a' )
                
                ok_link = None
                better_link = None
                
                for link in links:
                    
                    if link.string is not None:
                        
                        if link.string.startswith( image_data ):
                            ok_link = link[ 'href' ]
                        
                        if link.string.startswith( 'Download PNG' ):
                            
                            better_link = link[ 'href' ]
                            
                            break
                            
                if better_link is not None:
                    image_url = better_link
                else:
                    image_url = ok_link
                
        except Exception as e:
            
            raise HydrusExceptions.DataMissing( 'Could not parse a download link for ' + url_base + '!' + os.linesep + HydrusData.ToUnicode( e ) )
            
        if image_url is None:
            
            raise HydrusExceptions.DataMissing( 'Could not parse a download link for ' + url_base + '!' )
            
        image_url = urlparse.urljoin( url_base, image_url )
        
        if 'gelbooru.com' in url_base:
            
            # giving 404 on some content servers for http, no redirect for some reason
            image_url = ClientData.ConvertHTTPToHTTPS( image_url )
            
        tags = []
        
        for ( tag_classname, namespace ) in tag_classnames_to_namespaces.items():
            
            tag_list_entries = soup.find_all( class_ = tag_classname )
            
            for tag_list_entry in tag_list_entries:
                
                links = tag_list_entry.find_all( 'a' )
                
                if tag_list_entry.name == 'a':
                    links.append( tag_list_entry )
                
                for link in links:
                    
                    if link.string is None:
                        continue
                    
                    if link.string not in ( '?', '-', '+' ):
                        
                        if namespace == '':
                            tags.append( link.string )
                        else:
                            tags.append( namespace + ':' + link.string )
                        
        return ( image_url, tags )
        
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
        
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, referral_url = url, temp_path = temp_path )
        
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, referral_url = url, temp_path = temp_path )
        
        return tags
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
class GalleryDeviantArt( Gallery ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        artist = query
        
        return 'http://' + artist + '.deviantart.com/gallery/?catpath=/&offset=' + str( page_index * 24 )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        definitely_no_more_pages = False
        
        urls = []
        
        soup = GetSoup( html )
        
        thumbs_container = soup.find( 'div', class_ = 'torpedo-container' )
        
        artist = url_base.split( 'http://' )[1].split( '.deviantart.com' )[0]
        
        thumbs = thumbs_container.find_all( 'span', class_ = 'thumb' )
        
        for thumb in thumbs:
            
            url = thumb[ 'href' ] # something in the form of blah.da.com/art/blah-123456
            
            urls.append( url )
            
            tags = []
            
            tags.append( 'creator:' + artist )
            
            title_tag = thumb.find( 'span', class_ = 'title' )
            
            if title_tag is not None:
                
                title = title_tag.string
                
                if title is not None and title != '':
                    
                    tags.append( 'title:' + title )
                    
            SetExtraURLInfo( url, tags )
            
        return ( urls, definitely_no_more_pages )
        
    def _ParseImagePage( self, html, referral_url ):
        
        soup = GetSoup( html )
        
        download_button = soup.find( 'a', class_ = 'dev-page-download' )
        
        if download_button is None:
            
            # this method maxes out at 1024 width
            
            img = soup.find( class_ = 'dev-content-full' )
            
            if img is None:
                
                # nsfw
                
                # used to fetch this from a tumblr share url, now we grab from some hidden gubbins behind an age gate
                
                a_ismatures = soup.find_all( 'a', class_ = 'ismature' )
                
                imgs = []
                
                for a_ismature in a_ismatures:
                    
                    imgs.extend( a_ismature.find_all( 'img' ) )
                    
                for img in imgs:
                    
                    # <img width="150" height="75" alt="Jelly gals by ArtInCase" src="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg" data-src="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg" srcset="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 150w,http://t00.deviantart.net/ELwFngzSW07znskrO2jToktP2Og=/fit-in/700x350/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 698w,http://t04.deviantart.net/53Saq2w0esrTTjZIfHap4ItNNkQ=/fit-in/800x400/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w" sizes="150px">
                    
                    if img.has_attr( 'srcset' ):
                        
                        # http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 150w,http://t00.deviantart.net/ELwFngzSW07znskrO2jToktP2Og=/fit-in/700x350/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 698w,http://t04.deviantart.net/53Saq2w0esrTTjZIfHap4ItNNkQ=/fit-in/800x400/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w
                        # the last url here is what we want
                        
                        srcset = img[ 'srcset' ]
                        
                        # 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg
                        gubbins_and_url = srcset.split( ' ' )[ -2 ]
                        
                        img_url = gubbins_and_url.split( ',' )[1]
                        
                        break
                        
            else:
                
                img_url = img[ 'src' ]
                
        else:
            
            # something like http://www.deviantart.com/download/518046750/varda_and_the_sacred_trees_of_valinor_by_implosinoatic-d8kfjfi.jpg?token=476cb73aa2ab22bb8554542bc9f14982e09bd534&ts=1445717843
            # given the right cookies, it redirects to the truly fullsize image_url
            # otherwise, it seems to redirect to a small interstitial redirect page that heads back to the original image page
            
            img_url = download_button[ 'href' ]
            
        return img_url
        
    def _GetFileURL( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
        
    def GetFile( self, temp_path, url ):
        
        file_url = self._GetFileURL( url )
        
        self._FetchData( file_url, referral_url = url, temp_path = temp_path )
        
    def GetTags( self, url ):
        
        result = GetExtraURLInfo( url )
        
        if result is None:
            return []
        else:
            return result
        
class GalleryGiphy( Gallery ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        tag = query
        
        return 'http://giphy.com/api/gifs?tag=' + urllib.quote( HydrusData.ToByteString( tag ).replace( ' ', '+' ), '' ) + '&page=' + str( page_index + 1 )
        
    def _ParseGalleryPage( self, data, url_base ):
        
        definitely_no_more_pages = False
        
        json_dict = json.loads( data )
        
        urls = []
        
        if 'data' in json_dict:
            
            json_data = json_dict[ 'data' ]
            
            for d in json_data:
                
                url = d[ 'image_original_url' ]
                id = d[ 'id' ]
                
                SetExtraURLInfo( url, id )
                
                urls.append( url )
                
        return ( urls, definitely_no_more_pages )
        
    def GetTags( self, url ):
        
        id = GetExtraURLInfo( url )
        
        if id is None:
            
            return []
            
        else:
            
            url = 'http://giphy.com/api/gifs/' + str( id )
            
            try:
                
                raw_json = self._FetchData( url )
                
                json_dict = json.loads( raw_json )
                
                tags_data = json_dict[ 'data' ][ 'tags' ]
                
                return [ tag_data[ 'name' ] for tag_data in tags_data ]
                
            except Exception as e:
                
                HydrusData.ShowException( e )
                
                return []
            
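# The per-gif lookup in GetTags above expects json of roughly this shape
# (values illustrative):
#
#   { 'data' : { 'id' : 'feqkVgjJpYtjy', 'tags' : [ { 'name' : 'cat' }, { 'name' : 'funny' } ] } }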
class GalleryHentaiFoundry( Gallery ):
    
    def _EnsureLoggedIn( self ):
        
        manager = HG.client_controller.GetManager( 'web_sessions' )
        
        manager.EnsureLoggedIn( 'hentai foundry' )
        
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        definitely_no_more_pages = False
        
        urls_set = set()
        
        soup = GetSoup( html )
        
        def correct_url( href ):
            
            if href is None:
                
                return False
                
            # a good url is in the form "/pictures/user/artist_name/file_id/title"
            if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):
                
                ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )
                
                # /pictures/user/artist_name/page/3
                if file_id != 'page':
                    
                    return True
                    
            return False
            
        urls = []
        
        links = soup.find_all( 'a', href = correct_url )
        
        for link in links:
            
            url = 'http://www.hentai-foundry.com' + link[ 'href' ]
            
            if url not in urls_set:
                
                urls_set.add( url )
                
                urls.append( url )
                
        # this is copied from old code. surely we can improve it?
        if 'class="next"' not in html:
            
            definitely_no_more_pages = True
            
        return ( urls, definitely_no_more_pages )
        
    def _ParseImagePage( self, html, url_base ):
        
        # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
        # find http://pictures.hentai-foundry.com//
        # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144/image.jpg
        # the .jpg bit is what we really need, but whatever
        
        try:
            
            index = html.index( 'pictures.hentai-foundry.com' )
            
            image_url = html[ index : index + 256 ]
            
            if '"' in image_url:
                ( image_url, gumpf ) = image_url.split( '"', 1 )
            
            if '&#039;' in image_url:
                ( image_url, gumpf ) = image_url.split( '&#039;', 1 )
            
            image_url = 'http://' + image_url
            
        except Exception as e:
            
            raise Exception( 'Could not parse image url!' + os.linesep + HydrusData.ToUnicode( e ) )
            
        soup = GetSoup( html )
        
        tags = []
        
        try:
            
            title = soup.find( 'title' )
            
            ( data, nothing ) = title.string.split( ' - Hentai Foundry' )
            
            data_reversed = data[::-1] # want to do it right-side first, because title might have ' by ' in it
            
            ( artist_reversed, title_reversed ) = data_reversed.split( ' yb ', 1 )
            
            artist = artist_reversed[::-1]
            
            title = title_reversed[::-1]
            
            tags.append( 'creator:' + artist )
            tags.append( 'title:' + title )
            
        except:
            
            pass
            
        return ( image_url, tags )
        
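    # Worked example of the reversal trick above, with a hypothetical page title:
    #
    #   title.string = 'Cool Picture by SomeArtist - Hentai Foundry'
    #   data = 'Cool Picture by SomeArtist'
    #   reversing and splitting once on ' yb ' takes the rightmost ' by ' as the
    #   separator, so a title that itself contains ' by ' still parses:
    #   # -> [ 'creator:SomeArtist', 'title:Cool Picture' ]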
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, referral_url = url, temp_path = temp_path )
        
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, referral_url = url, temp_path = temp_path )
        
        return tags
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
class GalleryHentaiFoundryArtistPictures( GalleryHentaiFoundry ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        artist = query
        
        gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist
        
        return gallery_url + '/page/' + str( page_index + 1 )
        
class GalleryHentaiFoundryArtistScraps( GalleryHentaiFoundry ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        artist = query
        
        gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist + '/scraps'
        
        return gallery_url + '/page/' + str( page_index + 1 )
        
class GalleryHentaiFoundryTags( GalleryHentaiFoundry ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        tags = query.split( ' ' )
        
        # scraps = 0 means hide
        # -1 means show both
        # 1 means scraps only. wetf
        
        return 'http://www.hentai-foundry.com/search/pictures?query=' + '+'.join( tags ) + '&search_in=all&scraps=-1&page=' + str( page_index + 1 )
        
class GalleryNewgrounds( Gallery ):
    
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        soup = GetSoup( html )
        
        fatcol = soup.find( 'div', class_ = 'fatcol' )
        
        links = fatcol.find_all( 'a' )
        
        urls_set = set()
        
        urls = []
        
        for link in links:
            
            try:
                
                url = link[ 'href' ]
                
                if url not in urls_set:
                    
                    if url.startswith( 'http://www.newgrounds.com/portal/view/' ):
                        
                        urls_set.add( url )
                        
                        urls.append( url )
                        
            except:
                
                pass
                
        definitely_no_more_pages = True
        
        return ( urls, definitely_no_more_pages )
        
    def _ParseImagePage( self, html, url_base ):
        
        soup = GetSoup( html )
        
        tags = set()
        
        author_links = soup.find( 'ul', class_ = 'authorlinks' )
        
        if author_links is not None:
            
            authors = set()
            
            links = author_links.find_all( 'a' )
            
            for link in links:
                
                try:
                    
                    href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com
                    
                    creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )
                    
                    tags.add( u'creator:' + creator )
                    
                except:
                    
                    pass
                    
        try:
            
            title = soup.find( 'title' )
            
            tags.add( u'title:' + title.string )
            
        except:
            
            pass
            
        all_links = soup.find_all( 'a' )
        
        for link in all_links:
            
            try:
                
                href = link[ 'href' ]
                
                if '/browse/tag/' in href:
                    tags.add( link.string )
                
            except:
                
                pass
                
        #
        
        flash_url = html.split( '"http:\/\/uploads.ungrounded.net\/', 1 )[1]
        
        flash_url = flash_url.split( '"', 1 )[0]
        
        flash_url = flash_url.replace( "\/", '/' )
        
        flash_url = 'http://uploads.ungrounded.net/' + flash_url
        
        return ( flash_url, tags )
        
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, referral_url = url, temp_path = temp_path )
        
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, referral_url = url, temp_path = temp_path )
        
        return tags
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
class GalleryNewgroundsGames( GalleryNewgrounds ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        artist = query
        
        return 'http://' + artist + '.newgrounds.com/games/'
        
class GalleryNewgroundsMovies( GalleryNewgrounds ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        artist = query
        
        return 'http://' + artist + '.newgrounds.com/movies/'
        
class GalleryPixiv( Gallery ):
    
    def _EnsureLoggedIn( self ):
        
        manager = HG.client_controller.GetManager( 'web_sessions' )
        
        manager.EnsureLoggedIn( 'pixiv' )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        definitely_no_more_pages = False
        
        urls = []
        
        soup = GetSoup( html )
        
        manga_links = soup.find_all( class_ = 'manga' )
        thumbnail_links = soup.find_all( class_ = 'work' )
        
        manga_urls = { manga_link[ 'href' ] for manga_link in manga_links }
        thumbnail_urls = [ thumbnail_link[ 'href' ] for thumbnail_link in thumbnail_links ]
        
        for thumbnail_url in thumbnail_urls:
            
            if thumbnail_url in manga_urls:
                
                # I think the best way to handle this is to wait until I have cbz support or whatever and sort it out in getfileandtags
                # download each file in turn and write to a cbz on temp_path
                # replace this with the mode=manga url so it is easily noticed at that stage
                # but for now, just skip it
                
                pass
                
            else:
                
                url = urlparse.urljoin( url_base, thumbnail_url ) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
                
                urls.append( url )
                
        return ( urls, definitely_no_more_pages )
        
    def _ParseImagePage( self, html, page_url ):
        
        if 'member_illust.php?mode=manga' in html:
            
            manga_url = page_url.replace( 'medium', 'manga' )
            
            raise HydrusExceptions.MimeException( page_url + ' was manga, not a single image, so could not be downloaded.' )
            
        if 'member_illust.php?mode=ugoira_view' in html:
            
            raise HydrusExceptions.MimeException( page_url + ' was ugoira, not a single image, so could not be downloaded.' )
            
        soup = GetSoup( html )
        
        #
        
        original_image = soup.find( class_ = 'original-image' )
        
        image_url = original_image[ 'data-src' ] # http://i3.pixiv.net/img-original/img/2014/01/25/19/21/56/41171994_p0.jpg
        
        #
        
        tags_parent = soup.find( 'section', class_ = 'work-tags' )
        
        # <a href="/search.php?s_mode=s_tag_full&amp;word=%E3%83%8F%E3%83%B3%E3%83%89%E3%83%A1%E3%82%A4%E3%83%89" class="text">[unicode tag here]</a>
        tags = [ link.string for link in tags_parent.find_all( 'a', class_ = 'text' ) ]
        
        user = soup.find( 'h1', class_ = 'user' )
        
        if user is not None:
            
            tags.append( 'creator:' + user.string )
            
        title_parent = soup.find( 'section', class_ = re.compile( 'work-info' ) )
        
        if title_parent is not None:
            
            title = title_parent.find( 'h1', class_ = 'title' )
            
            if title is not None:
                
                tags.append( 'title:' + title.string )
                
        return ( image_url, tags )
        
    def _GetFileURLAndTags( self, page_url ):
        
        html = self._FetchData( page_url )
        
        return self._ParseImagePage( html, page_url )
        
    def GetFile( self, temp_path, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( image_url, referral_url = url, temp_path = temp_path )
        
    def GetFileAndTags( self, temp_path, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( image_url, referral_url = url, temp_path = temp_path )
        
        return tags
        
    def GetTags( self, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
class GalleryPixivArtistID( GalleryPixiv ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        artist_id = query
        
        gallery_url = 'https://www.pixiv.net/member_illust.php?type=illust&id=' + str( artist_id )
        
        return gallery_url + '&p=' + str( page_index + 1 )
        
class GalleryPixivTag( GalleryPixiv ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        tag = query
        
        gallery_url = 'https://www.pixiv.net/search.php?word=' + urllib.quote( HydrusData.ToByteString( tag ), '' ) + '&s_mode=s_tag_full&order=date_d'
        
        return gallery_url + '&p=' + str( page_index + 1 )
        
class GalleryTumblr( Gallery ):
    
    def _GetGalleryPageURL( self, query, page_index ):
        
        username = query
        
        return 'https://' + username + '.tumblr.com/api/read/json?start=' + str( page_index * 50 ) + '&num=50'
        
    def _ParseGalleryPage( self, data, url_base ):
        
        def ConvertRegularToRawURL( regular_url ):
            
            # convert this:
            # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
            # to this:
            # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
            # the 500 part can be a bunch of stuff, including letters
            
            url_components = regular_url.split( '_' )
            
            last_component = url_components[ -1 ]
            
            ( number_gubbins, file_ext ) = last_component.split( '.' )
            
            raw_last_component = 'raw.' + file_ext
            
            url_components[ -1 ] = raw_last_component
            
            raw_url = '_'.join( url_components )
            
            return raw_url
            
        def Remove68Subdomain( long_url ):
            
            # sometimes the 68 subdomain gives a 404 on the raw url, so:
            # convert this:
            # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
            # to this:
            # http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
            # I am not sure if it is always 68, but let's not assume
            
            ( scheme, rest ) = long_url.split( '://', 1 )
            
            if rest.startswith( 'media.tumblr.com' ):
                
                return long_url
                
            ( gumpf, shorter_rest ) = rest.split( '.', 1 )
            
            shorter_url = scheme + '://' + shorter_rest
            
            return shorter_url
            
        definitely_no_more_pages = False
        
        processed_raw_json = data.split( 'var tumblr_api_read = ' )[1][:-2] # [:-2] takes the trailing js ';' and newline off the end
        
        json_object = json.loads( processed_raw_json )
        
        urls = []
        
        if 'posts' in json_object:
            
            for post in json_object[ 'posts' ]:
                
                # 2012-06-20 15:59:00 GMT
                date = post[ 'date-gmt' ]
                
                date_struct = time.strptime( date, '%Y-%m-%d %H:%M:%S %Z' )
                
                raw_url_available = date_struct.tm_year > 2012
                
                if 'tags' in post:
                    tags = post[ 'tags' ]
                else:
                    tags = []
                
                post_type = post[ 'type' ]
                
                if post_type == 'photo':
                    
                    if len( post[ 'photos' ] ) == 0:
                        
                        photos = [ post ]
                        
                    else:
                        
                        photos = post[ 'photos' ]
                        
                    for photo in photos:
                        
                        try:
                            
                            url = photo[ 'photo-url-1280' ]
                            
                            # some urls are given in the form:
                            # https://68.media.tumblr.com/tumblr_m5yb5m2O6A1rso2eyo1_540.jpg
                            # which is missing the hex key in the middle
                            # these urls are unavailable as raws from the main media server
                            # these seem to all be the pre-2013 files, but we'll double-check just in case anyway
                            
                            unusual_hexless_url = url.count( '/' ) == 3
                            
                            if not unusual_hexless_url:
                                
                                if raw_url_available:
                                    
                                    url = ConvertRegularToRawURL( url )
                                    
                                    url = Remove68Subdomain( url )
                                    
                                    url = ClientData.ConvertHTTPToHTTPS( url )
                                    
                            SetExtraURLInfo( url, tags )
                            
                            urls.append( url )
                            
                        except:
                            
                            pass
                            
        return ( urls, definitely_no_more_pages )
        
    def GetTags( self, url ):
        
        result = GetExtraURLInfo( url )
        
        if result is None:
            
            return []
            
        else:
            
            return result
        