2013-04-10 18:10:37 +00:00
import bs4
import HydrusConstants as HC
import json
import lxml
import traceback
2013-04-17 21:48:18 +00:00
import urllib
2013-04-10 18:10:37 +00:00
import urlparse
import wx
def ConvertServiceIdentifiersToTagsToContentUpdates( hash, service_identifiers_to_tags ):
    """Build a list of HC.ContentUpdate edit-log updates for a single file hash.
    
    service_identifiers_to_tags maps a tag service identifier to the tags that
    should be applied to the file on that service. Services mapped to an empty
    tag collection are skipped entirely.
    """
    content_updates = []
    
    for ( service_identifier, tags ) in service_identifiers_to_tags.items():
        
        if len( tags ) == 0: continue # nothing to do for this service
        
        # tags on the local service apply immediately; remote services get a pend
        action = HC.CONTENT_UPDATE_ADD if service_identifier == HC.LOCAL_TAG_SERVICE_IDENTIFIER else HC.CONTENT_UPDATE_PENDING
        
        edit_log = [ ( action, tag ) for tag in tags ]
        
        content_updates.append( HC.ContentUpdate( HC.CONTENT_UPDATE_EDIT_LOG, service_identifier, ( hash, ), info = edit_log ) )
        
    return content_updates
2013-04-17 21:48:18 +00:00
def GetDownloader( site_download_type, *args ):
    """Instantiate the downloader class matching site_download_type.
    
    Any extra positional arguments are forwarded to the chosen class's
    constructor.
    """
    # dispatch table instead of an if/elif chain
    downloader_classes = {
        HC.SITE_DOWNLOAD_TYPE_BOORU : DownloaderBooru,
        HC.SITE_DOWNLOAD_TYPE_DEVIANT_ART : DownloaderDeviantArt,
        HC.SITE_DOWNLOAD_TYPE_GIPHY : DownloaderGiphy,
        HC.SITE_DOWNLOAD_TYPE_HENTAI_FOUNDRY : DownloaderHentaiFoundry,
        HC.SITE_DOWNLOAD_TYPE_PIXIV : DownloaderPixiv,
        HC.SITE_DOWNLOAD_TYPE_TUMBLR : DownloaderTumblr,
        HC.SITE_DOWNLOAD_TYPE_NEWGROUNDS : DownloaderNewgrounds
    }
    
    c = downloader_classes[ site_download_type ]
    
    return c( *args )
def ConvertTagsToServiceIdentifiersToTags( tags, advanced_tag_options ):
    """Distribute parsed tags among tag services per the user's advanced tag options.
    
    advanced_tag_options maps a tag service identifier to the namespaces the
    user wants from that service; the empty-string namespace means unnamespaced
    tags. Returns a dict of service identifier -> cleaned tags, omitting
    services that end up with no tags.
    """
    tags = [ tag for tag in tags if tag is not None ]
    
    service_identifiers_to_tags = {}
    
    for ( service_identifier, namespaces ) in advanced_tag_options.items():
        
        if len( namespaces ) == 0: continue # user wants nothing from this service
        
        tags_to_add_here = []
        
        for namespace in namespaces:
            
            # idiomatic 'not in' (was 'not X in Y'); '' namespace catches unnamespaced tags
            if namespace == '': tags_to_add_here.extend( [ HC.CleanTag( tag ) for tag in tags if ':' not in tag ] )
            else: tags_to_add_here.extend( [ HC.CleanTag( tag ) for tag in tags if tag.startswith( namespace + ':' ) ] )
            
        if len( tags_to_add_here ) > 0: service_identifiers_to_tags[ service_identifier ] = tags_to_add_here
        
    return service_identifiers_to_tags
class Downloader():
    """Base class for the site-specific gallery downloaders.
    
    Subclasses must provide _GetNextGalleryPageURL (or override
    _GetNextGalleryPageURLs) and _ParseGalleryPage, and may override
    _EstablishSession, GetFile, GetFileAndTags, GetTags and SetupGallerySearch.
    """
    
    def __init__( self ):
        
        self._we_are_done = False # subclasses flip this when the gallery is exhausted
        
        self._connections = {} # ( scheme, host, port ) -> reusable http connection
        
        self._all_urls_so_far = set() # everything seen, to avoid accidental loops
        
        self._num_pages_done = 0
        
    def _EstablishSession( self, connection ): pass # hook for login/cookie setup
    
    def _GetConnection( self, url ):
        
        # one cached connection per ( scheme, host, port )
        parsed = urlparse.urlparse( url )
        
        key = ( parsed.scheme, parsed.hostname, parsed.port )
        
        if key not in self._connections:
            
            ( scheme, host, port ) = key
            
            new_connection = HC.AdvancedHTTPConnection( scheme = scheme, host = host, port = port )
            
            self._EstablishSession( new_connection )
            
            self._connections[ key ] = new_connection
            
        return self._connections[ key ]
        
    def _GetNextGalleryPageURLs( self ):
        
        # default: one gallery page per step
        return ( self._GetNextGalleryPageURL(), )
        
    def GetAnotherPage( self ):
        """Fetch and parse the next gallery page(s), returning only url info not seen before."""
        
        if self._we_are_done: return []
        
        url_info = []
        
        for gallery_url in self._GetNextGalleryPageURLs():
            
            connection = self._GetConnection( gallery_url )
            
            data = connection.geturl( gallery_url )
            
            page_of_url_info = self._ParseGalleryPage( data, gallery_url )
            
            # stop ourselves getting into an accidental infinite loop
            url_info.extend( info for info in page_of_url_info if info[0] not in self._all_urls_so_far )
            
            self._all_urls_so_far.update( info[0] for info in url_info )
            
            # now url_info only contains new url info
            
        self._num_pages_done += 1
        
        return url_info
        
    def GetFile( self, url, *args ):
        
        connection = self._GetConnection( url )
        
        return connection.geturl( url )
        
    def GetFileAndTags( self, url, *args ):
        
        file = self.GetFile( url, *args )
        tags = self.GetTags( url, *args )
        
        return ( file, tags )
        
    def GetTags( self, url ): pass # hook: subclasses that can produce tags override this
    
    def SetupGallerySearch( self ): pass # hook: pre-search setup (e.g. posting filter prefs)
2013-04-10 18:10:37 +00:00
class DownloaderBooru( Downloader ):
    """Downloader for generic booru-style sites.
    
    All site-specific details (search url template, pagination style, thumbnail
    css class, tag css classes) come from the booru object passed in.
    """
    
    def __init__( self, booru, tags ):
        
        # booru: project booru object holding the site's parsing info
        # tags: the search tags for this query
        self._booru = booru
        self._tags = tags
        
        # thumbs-per-page, learned from the first parsed gallery page; None until then
        self._gallery_advance_num = None
        
        ( self._search_url, self._advance_by_page_num, self._search_separator, self._thumb_classname ) = booru.GetGalleryParsingInfo()
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ):
        
        # some boorus paginate by page number, others by result offset
        if self._advance_by_page_num: index = 1 + self._num_pages_done
        else:
            
            if self._gallery_advance_num is None: index = 0
            else: index = self._num_pages_done * self._gallery_advance_num
            
        return self._search_url.replace( '%tags%', self._search_separator.join( self._tags ) ).replace( '%index%', str( index ) )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
        def starts_with_classname( classname ): return classname is not None and classname.startswith( self._thumb_classname )
        
        thumbnails = soup.find_all( class_ = starts_with_classname )
        
        # first page tells us how many thumbs per page; zero thumbs means we are done
        if self._gallery_advance_num is None:
            
            if len( thumbnails ) == 0: self._we_are_done = True
            else: self._gallery_advance_num = len( thumbnails )
            
        for thumbnail in thumbnails:
            
            links = thumbnail.find_all( 'a' )
            
            # the thumbnail element itself may be the link
            if thumbnail.name == 'a': links.append( thumbnail )
            
            for link in links:
                
                if link.string is not None and link.string == 'Image Only': continue # rule 34 @ paheal fix
                
                url = link[ 'href' ]
                
                url = urlparse.urljoin( url_base, url )
                
                if url not in urls_set:
                    
                    urls_set.add( url )
                    urls.append( ( url, ) )
                    
        return urls
        
    def _ParseImagePage( self, html, url_base ):
        """Parse a post page into ( image_url, tags )."""
        
        ( search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces ) = self._booru.GetData()
        
        soup = bs4.BeautifulSoup( html )
        
        image_base = None
        
        # the booru config says whether the full image is found by element id or by link text
        if image_id is not None:
            
            image = soup.find( id = image_id )
            
            image_url = image[ 'src' ]
            
        if image_data is not None:
            
            links = soup.find_all( 'a' )
            
            for link in links:
                
                if link.string == image_data: image_url = link[ 'href' ]
                
        image_url = urlparse.urljoin( url_base, image_url )
        
        image_url = image_url.replace( 'sample/sample-', '' ) # fix for danbooru resizing
        
        tags = []
        
        for ( tag_classname, namespace ) in tag_classnames_to_namespaces.items():
            
            tag_list_entries = soup.find_all( class_ = tag_classname )
            
            for tag_list_entry in tag_list_entries:
                
                links = tag_list_entry.find_all( 'a' )
                
                if tag_list_entry.name == 'a': links.append( tag_list_entry )
                
                for link in links:
                    
                    # skip the wiki/add/subtract helper links some boorus put beside each tag
                    if link.string not in ( '?', '-', '+' ):
                        
                        if namespace == '': tags.append( link.string )
                        else: tags.append( namespace + ':' + link.string )
                        
        return ( image_url, tags )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return connection.geturl( file_url )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        file = connection.geturl( file_url )
        
        return ( file, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
class DownloaderDeviantArt( Downloader ):
    """Downloader for a deviantart artist's gallery.
    
    Tags are scraped from the gallery page's thumbnail 'title' attributes, so
    GetTags needs no extra network round trip.
    """
    
    def __init__( self, artist ):
        
        self._gallery_url = 'http://' + artist + '.deviantart.com/gallery/?catpath=/&offset='
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url + str( self._num_pages_done * 24 ) # 24 thumbs per gallery page
    
    def _ParseGalleryPage( self, html, url_base ):
        
        results = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbs_container = soup.find( class_ = 'zones-container' )
        
        def starts_with_thumb( classname ): return classname is not None and classname.startswith( 'thumb' )
        
        links = thumbs_container.find_all( 'a', class_ = starts_with_thumb )
        
        for link in links:
            
            page_url = link[ 'href' ] # something in the form of blah.da.com/art/blah-123456
            
            raw_title = link[ 'title' ] # sweet dolls by ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
            
            # parse from the right, because the title text itself may contain ' by '
            raw_title_reversed = raw_title[ : : -1 ] # yrleweJ ;tg& serutainiM ;tg& stfarC nasitrA ni 2102 ,92 beF ,CainegnA~ yb sllod teews
            
            ( creator_and_date_and_tags_reversed, title_reversed ) = raw_title_reversed.split( ' yb ', 1 )
            
            creator_and_date_and_tags = creator_and_date_and_tags_reversed[ : : -1 ] # ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
            
            ( creator_with_username_char, date_and_tags ) = creator_and_date_and_tags.split( ',', 1 )
            
            creator = creator_with_username_char[ 1 : ] # AngeniaC
            
            title = title_reversed[ : : -1 ] # sweet dolls
            
            try: ( date_gumpf, raw_category_tags ) = date_and_tags.split( ' in ', 1 )
            except:
                
                # unexpected title format; dump it for debugging, then re-raise
                print( raw_title )
                print( date_and_tags )
                
                raise
                
            category_tags = raw_category_tags.split( ' > ' )
            
            tags = []
            
            tags.append( 'title:' + title )
            tags.append( 'creator:' + creator )
            tags.extend( category_tags )
            
            results.append( ( page_url, tags ) )
            
        return results
        
    def _ParseImagePage( self, html ):
        
        soup = bs4.BeautifulSoup( html )
        
        # if can find download link:
        if False:
            
            pass # go fetch the popup page using tokens as appropriate. feels like it needs the GET token and a referrer, as middle click just redirects back to image page
            
        else:
            
            # fall back to the full-size view embedded in the page
            img = soup.find( id = 'gmi-ResViewSizer_fullimg' )
            
            src = img[ 'src' ]
            
            return src
            
    def _GetFileURL( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html )
        
    def GetFile( self, url, tags ):
        
        file_url = self._GetFileURL( url )
        
        connection = self._GetConnection( file_url )
        
        return connection.geturl( file_url )
        
    def GetTags( self, url, tags ): return tags # tags were already gathered at gallery-parse time
class DownloaderGiphy( Downloader ):
    """Downloader for giphy.com tag searches via its json api."""
    
    def __init__( self, tag ):
        
        self._gallery_url = 'http://giphy.com/api/gifs?tag=' + tag.replace( ' ', '+' ) + '&page='
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url + str( self._num_pages_done + 1 ) # pages are 1-indexed
    
    def _ParseGalleryPage( self, data, url_base ):
        
        json_dict = json.loads( data )
        
        if 'data' in json_dict:
            
            json_data = json_dict[ 'data' ]
            
            # ( file url, giphy id ); the id is needed later to fetch tags
            return [ ( d[ 'image_original_url' ], d[ 'id' ] ) for d in json_data ]
            
        else: return []
        
    def GetTags( self, url, id ):
        
        # tags come from a second api call keyed on the gif's giphy id
        url = 'http://giphy.com/api/gifs/' + str( id )
        
        connection = self._GetConnection( url )
        
        try:
            
            raw_json = connection.geturl( url )
            
            json_dict = json.loads( raw_json )
            
            tags_data = json_dict[ 'data' ][ 'tags' ]
            
            tags = [ tag_data[ 'name' ] for tag_data in tags_data ]
            
        except:
            
            # best-effort: any fetch/parse failure just means no tags
            print( traceback.format_exc() )
            
            tags = []
            
        return tags
class DownloaderHentaiFoundry( Downloader ):
    """Downloader for Hentai Foundry artist galleries, scraps and tag searches.
    
    Needs the client's hentai foundry web session cookies, and posts the user's
    filter settings before searching (SetupGallerySearch).
    """
    
    def __init__( self, query_type, query, advanced_hentai_foundry_options ):
        
        # query_type: 'artist', 'artist pictures', 'artist scraps' or 'tags'
        self._query_type = query_type
        self._query = query
        
        # filter settings POSTed to the site in SetupGallerySearch
        self._advanced_hentai_foundry_options = advanced_hentai_foundry_options
        
        Downloader.__init__( self )
        
    def _EstablishSession( self, connection ):
        
        # copy the client's hentai foundry session cookies onto this connection
        cookies = wx.GetApp().GetWebCookies( 'hentai foundry' )
        
        for ( key, value ) in cookies.items(): connection.SetCookie( key, value )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def _GetNextGalleryPageURL( self ):
        
        if self._query_type in ( 'artist', 'artist pictures' ):
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist
            
            return gallery_url + '/page/' + str( self._num_pages_done + 1 )
            
        elif self._query_type == 'artist scraps':
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist + '/scraps'
            
            return gallery_url + '/page/' + str( self._num_pages_done + 1 )
            
        elif self._query_type == 'tags':
            
            tags = self._query
            
            # scraps = 0 hide
            # -1 means show both
            # 1 means scraps only. wetf
            return 'http://www.hentai-foundry.com/search/pictures?query=' + '+'.join( tags ) + '&search_in=all&scraps=-1&page=' + str( self._num_pages_done + 1 )
            
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        
        soup = bs4.BeautifulSoup( html )
        
        def correct_url( href ):
            
            # a good url is in the form "/pictures/user/artist_name/file_id/title"
            
            if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):
                
                ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )
                
                # /pictures/user/artist_name/page/3 is the paginator, not a picture
                if file_id != 'page': return True
                
            return False
            
        links = soup.find_all( 'a', href = correct_url )
        
        urls = [ 'http://www.hentai-foundry.com' + link[ 'href' ] for link in links ]
        
        result_urls = []
        
        for url in urls:
            
            if url not in urls_set:
                
                urls_set.add( url )
                
                result_urls.append( ( url, ) )
                
        # this is copied from old code. surely we can improve it?
        if 'class="next"' not in html: self._we_are_done = True
        
        return result_urls
        
    def _ParseImagePage( self, html, url_base ):
        
        # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
        # find http://pictures.hentai-foundry.com//
        # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144.jpg
        # the .jpg bit is what we really need, but whatever
        try:
            
            index = html.index( 'http://pictures.hentai-foundry.com//' )
            
            stuff = html[ index : index + 100 ]
            
            # the url ends at whichever quote character the attribute used
            try: ( image_url, gumpf ) = stuff.split( '"', 1 )
            except: ( image_url, gumpf ) = stuff.split( "'", 1 )
            
        except: raise Exception( 'Could not parse image url!' )
        
        soup = bs4.BeautifulSoup( html )
        
        tags = []
        
        try:
            
            title = soup.find( 'title' )
            
            ( data, nothing ) = unicode( title.string ).split( ' - Hentai Foundry' )
            
            data_reversed = data[ : : -1 ] # want to do it right-side first, because title might have ' by ' in it
            
            ( artist_reversed, title_reversed ) = data_reversed.split( ' yb ' )
            
            artist = artist_reversed[ : : -1 ]
            
            title = title_reversed[ : : -1 ]
            
            tags.append( 'creator:' + artist )
            tags.append( 'title:' + title )
            
        except: pass # title parsing is best-effort
        
        tag_links = soup.find_all( 'a', rel = 'tag' )
        
        for tag_link in tag_links: tags.append( tag_link.string )
        
        return ( image_url, tags )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return connection.geturl( file_url )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        file = connection.geturl( file_url )
        
        return ( file, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
    def SetupGallerySearch( self ):
        
        # POST the user's filter settings so the subsequent gallery search respects them
        connection = self._GetConnection( 'http://www.hentai-foundry.com/site/filters' )
        
        cookies = connection.GetCookies()
        
        raw_csrf = cookies[ 'YII_CSRF_TOKEN' ] # YII_CSRF_TOKEN=19b05b536885ec60b8b37650a32f8deb11c08cd1s%3A40%3A%222917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32%22%3B
        
        processed_csrf = urllib.unquote( raw_csrf ) # 19b05b536885ec60b8b37650a32f8deb11c08cd1s:40:"2917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32";
        
        csrf_token = processed_csrf.split( '"' )[1] # the 2917... bit
        
        self._advanced_hentai_foundry_options[ 'YII_CSRF_TOKEN' ] = csrf_token
        
        body = urllib.urlencode( self._advanced_hentai_foundry_options )
        
        headers = {}
        
        headers[ 'Content-Type' ] = 'application/x-www-form-urlencoded'
        
        connection.request( 'POST', '/site/filters', headers = headers, body = body )
2013-05-15 18:58:14 +00:00
class DownloaderNewgrounds( Downloader ):
    """Downloader for a newgrounds artist's games and movies.
    
    Only swf submissions are supported; mp4 movies are detected and rejected.
    """
    
    def __init__( self, query ):
        
        # query: the artist's username
        self._query = query
        
        Downloader.__init__( self )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def _GetNextGalleryPageURLs( self ):
        
        # newgrounds galleries are not paginated: fetch games and movies in one go
        artist = self._query
        
        gallery_urls = []
        
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/games/' )
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/movies/' )
        
        self._we_are_done = True
        
        return gallery_urls
        
    def _ParseGalleryPage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        fatcol = soup.find( 'div', class_ = 'fatcol' )
        
        links = fatcol.find_all( 'a' )
        
        urls_set = set()
        
        result_urls = []
        
        for link in links:
            
            try:
                
                url = link[ 'href' ]
                
                if url not in urls_set:
                    
                    if url.startswith( 'http://www.newgrounds.com/portal/view/' ):
                        
                        urls_set.add( url )
                        
                        result_urls.append( ( url, ) )
                        
            except: pass # links without an href
            
        return result_urls
        
    def _ParseImagePage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        tags = []
        
        author_links = soup.find( 'ul', class_ = 'authorlinks' )
        
        if author_links is not None:
            
            authors = set()
            
            links = author_links.find_all( 'a' )
            
            for link in links:
                
                try:
                    
                    href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com
                    
                    creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )
                    
                    tags.append( 'creator:' + creator )
                    
                except: pass
                
        try:
            
            title = soup.find( 'title' )
            
            tags.append( 'title:' + title.string )
            
        except: pass
        
        all_links = soup.find_all( 'a' )
        
        for link in all_links:
            
            try:
                
                href = link[ 'href' ]
                
                if '/browse/tag/' in href: tags.append( link.string )
                
            except: pass
            
        # dig the swf url out of the embedded player config json
        try:
            
            components = html.split( '"src"' )
            
            # there is sometimes another bit of api flash earlier on that we don't want
            # it is called http://uploads.ungrounded.net/apiassets/sandbox.swf
            if len( components ) == 2: flash_url = components[1]
            else: flash_url = components[2]
            
            #"src": "http:\/\/flash.ngfiles.com\/video_player\/videoplayer.swf",
            #"width": "100%",
            #"src": "http:\/\/uploads.ungrounded.net\/593000\/593806_Kitty.swf",
            #"width": "100%",
            
            flash_url = flash_url.split( '"width"', 1 )[0]
            
            flash_url = flash_url.split( '"' )[1]
            
            flash_url = flash_url.replace( '\\/', '/' ) # unescape the json slashes
            
        except: raise Exception( 'Could not find the swf file!' )
        
        if flash_url == 'http://flash.ngfiles.com/video_player/videoplayer.swf': raise Exception( 'It was an mp4 movie, not a swf!' )
        
        return ( flash_url, tags )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return connection.geturl( file_url )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        file = connection.geturl( file_url )
        
        return ( file, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
2013-04-10 18:10:37 +00:00
class DownloaderPixiv( Downloader ):
    """Downloader for pixiv artist galleries and tag searches.
    
    Needs the client's pixiv login cookies, and pixiv refuses to serve full-size
    images unless the request carries the 'big' page as its referrer.
    """
    
    def __init__( self, query_type, query ):
        
        # query_type: 'artist' (query is an artist id) or 'tag' (query is a tag string)
        self._query_type = query_type
        self._query = query
        
        Downloader.__init__( self )
        
    def _EstablishSession( self, connection ):
        
        # copy the client's pixiv session cookies onto this connection
        cookies = wx.GetApp().GetWebCookies( 'pixiv' )
        
        for ( key, value ) in cookies.items(): connection.SetCookie( key, value )
        
    def _GetNextGalleryPageURL( self ):
        
        if self._query_type == 'artist':
            
            artist_id = self._query
            
            gallery_url = 'http://www.pixiv.net/member_illust.php?id=' + str( artist_id )
            
        elif self._query_type == 'tag':
            
            tag = self._query
            
            tag = urllib.quote( tag.encode( 'utf-8' ) )
            
            gallery_url = 'http://www.pixiv.net/search.php?word=' + tag + '&s_mode=s_tag_full&order=date_d'
            
        return gallery_url + '&p=' + str( self._num_pages_done + 1 ) # pages are 1-indexed
        
    def _ParseGalleryPage( self, html, url_base ):
        
        results = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbnail_links = soup.find_all( class_ = 'work' )
        
        for thumbnail_link in thumbnail_links:
            
            url = urlparse.urljoin( url_base, thumbnail_link[ 'href' ] ) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
            
            results.append( ( url, ) )
            
        return results
        
    def _ParseImagePage( self, html, page_url ):
        """Parse a medium page into ( referral_url, image_url, tags )."""
        
        soup = bs4.BeautifulSoup( html )
        
        #
        
        # this is the page that holds the full size of the image.
        # pixiv won't serve the image unless it thinks this page is the referrer
        referral_url = page_url.replace( 'medium', 'big' ) # http://www.pixiv.net/member_illust.php?mode=big&illust_id=33500690
        
        #
        
        works_display = soup.find( class_ = 'works_display' )
        
        img = works_display.find( 'img' )
        
        img_url = img[ 'src' ] # http://i2.pixiv.net/img122/img/amanekukagenoyuragi/34992468_m.png
        
        image_url = img_url.replace( '_m.', '.' ) # http://i2.pixiv.net/img122/img/amanekukagenoyuragi/34992468.png
        
        #
        
        tags = soup.find( 'ul', class_ = 'tags' )
        
        tags = [ a_item.string for a_item in tags.find_all( 'a', class_ = 'text' ) ]
        
        user = soup.find( 'h1', class_ = 'user' )
        
        tags.append( 'creator:' + user.string )
        
        title_parent = soup.find( 'section', class_ = 'work-info' )
        
        title = title_parent.find( 'h1', class_ = 'title' )
        
        tags.append( 'title:' + title.string )
        
        try: tags.append( 'creator:' + image_url.split( '/' )[-2] ) # http://i2.pixiv.net/img02/img/dnosuke/462657.jpg -> dnosuke
        except: pass
        
        return ( referral_url, image_url, tags )
        
    def _GetReferralURLFileURLAndTags( self, page_url ):
        
        connection = self._GetConnection( page_url )
        
        html = connection.geturl( page_url )
        
        return self._ParseImagePage( html, page_url )
        
    def GetFile( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        connection = self._GetConnection( image_url )
        
        headers = { 'Referer' : referral_url }
        
        return connection.geturl( image_url, headers = headers )
        
    def GetFileAndTags( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        connection = self._GetConnection( image_url )
        
        headers = { 'Referer' : referral_url }
        
        file = connection.geturl( image_url, headers = headers )
        
        return ( file, tags )
        
    def GetTags( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        return tags
2013-04-10 18:10:37 +00:00
class DownloaderTumblr( Downloader ):
    """Downloader for a tumblr blog's photo posts via the old /api/read/json endpoint."""
    
    def __init__( self, username ):
        
        self._gallery_url = 'http://' + username + '.tumblr.com/api/read/json?start=%start%&num=50'
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url.replace( '%start%', str( self._num_pages_done * 50 ) ) # 50 posts per page
    
    def _ParseGalleryPage( self, data, url_base ):
        
        # the response is jsonp: 'var tumblr_api_read = {...};' plus trailing newlines
        processed_raw_json = data.split( 'var tumblr_api_read = ' )[1][:-2] # -2 takes a couple newline chars off at the end
        
        json_object = json.loads( processed_raw_json )
        
        results = []
        
        if 'posts' in json_object:
            
            for post in json_object[ 'posts' ]:
                
                if 'tags' in post: tags = post[ 'tags' ]
                else: tags = []
                
                post_type = post[ 'type' ]
                
                if post_type == 'photo':
                    
                    # single-photo posts have an empty 'photos' list; photosets fill it
                    if len( post[ 'photos' ] ) == 0:
                        
                        try: results.append( ( post[ 'photo-url-1280' ], tags ) )
                        except: pass # not every post has the 1280 size
                        
                    else:
                        
                        for photo in post[ 'photos' ]:
                            
                            try: results.append( ( photo[ 'photo-url-1280' ], tags ) )
                            except: pass
                            
        return results
        
    def GetTags( self, url, tags ): return tags # tags were already gathered at gallery-parse time