2013-04-10 18:10:37 +00:00
import bs4
2013-06-12 22:53:31 +00:00
import collections
2013-09-11 21:28:19 +00:00
import httplib
2013-04-10 18:10:37 +00:00
import HydrusConstants as HC
import json
import lxml
2013-09-11 21:28:19 +00:00
import pafy
2013-04-10 18:10:37 +00:00
import traceback
2013-04-17 21:48:18 +00:00
import urllib
2013-04-10 18:10:37 +00:00
import urlparse
import wx
2013-06-12 22:53:31 +00:00
def ConvertServiceIdentifiersToTagsToServiceIdentifiersToContentUpdates(hash, service_identifiers_to_tags):
    """Build per-service content updates that attach the given tags to one file hash.
    
    Tags for the local tag service are applied immediately (ADD); tags for any
    other service are only pended for later upload (PENDING).
    """
    
    hashes = {hash}
    
    service_identifiers_to_content_updates = {}
    
    for (service_identifier, tags) in service_identifiers_to_tags.items():
        
        if service_identifier == HC.LOCAL_TAG_SERVICE_IDENTIFIER:
            action = HC.CONTENT_UPDATE_ADD
        else:
            action = HC.CONTENT_UPDATE_PENDING
        
        service_identifiers_to_content_updates[service_identifier] = [
            HC.ContentUpdate(HC.CONTENT_DATA_TYPE_MAPPINGS, action, (tag, hashes))
            for tag in tags
        ]
        
    
    return service_identifiers_to_content_updates
2013-04-10 18:10:37 +00:00
2013-04-17 21:48:18 +00:00
def GetDownloader(site_download_type, *args):
    """Instantiate the Downloader subclass for the given site download type.
    
    *args are forwarded to the subclass's constructor.
    
    Fix: previously an unrecognised site_download_type fell through every
    elif branch and crashed with a confusing UnboundLocalError; now raises
    an explicit Exception instead.
    """
    
    downloaders = {
        HC.SITE_DOWNLOAD_TYPE_BOORU: DownloaderBooru,
        HC.SITE_DOWNLOAD_TYPE_DEVIANT_ART: DownloaderDeviantArt,
        HC.SITE_DOWNLOAD_TYPE_GIPHY: DownloaderGiphy,
        HC.SITE_DOWNLOAD_TYPE_HENTAI_FOUNDRY: DownloaderHentaiFoundry,
        HC.SITE_DOWNLOAD_TYPE_PIXIV: DownloaderPixiv,
        HC.SITE_DOWNLOAD_TYPE_TUMBLR: DownloaderTumblr,
        HC.SITE_DOWNLOAD_TYPE_NEWGROUNDS: DownloaderNewgrounds
    }
    
    if site_download_type not in downloaders:
        raise Exception('Unknown site download type: ' + HC.u(site_download_type))
    
    return downloaders[site_download_type](*args)
def ConvertTagsToServiceIdentifiersToTags(tags, advanced_tag_options):
    """Filter and process tags per the user's advanced tag options.
    
    For each (service_identifier, namespaces) pair, keep only the tags in the
    requested namespaces, clean them, collapse siblings and expand parents.
    Services whose namespace list is empty, or that end up with no matching
    tags, are omitted from the result.
    """
    
    tags = [tag for tag in tags if tag is not None]
    
    service_identifiers_to_tags = {}
    
    siblings_manager = HC.app.GetTagSiblingsManager()
    parents_manager = HC.app.GetTagParentsManager()
    
    for (service_identifier, namespaces) in advanced_tag_options.items():
        
        if len(namespaces) == 0: continue
        
        filtered_tags = []
        
        for namespace in namespaces:
            
            # the empty namespace stands for 'unnamespaced tags'
            if namespace == '':
                filtered_tags.extend([HC.CleanTag(tag) for tag in tags if ':' not in tag])
            else:
                filtered_tags.extend([HC.CleanTag(tag) for tag in tags if tag.startswith(namespace + ':')])
            
        
        if len(filtered_tags) == 0: continue
        
        filtered_tags = siblings_manager.CollapseTags(filtered_tags)
        filtered_tags = parents_manager.ExpandTags(service_identifier, filtered_tags)
        
        service_identifiers_to_tags[service_identifier] = filtered_tags
        
    
    return service_identifiers_to_tags
2013-09-25 20:20:10 +00:00
def DownloadYoutubeURL(job_key, url, message_string):
    """Download a youtube media url to a temp file and import it into the db.
    
    Progress is reported through the 'message_gauge_info' pubsub; a file
    button is published on a successful or redundant import. Returns early
    (leaving the partial temp file) on shutdown or job cancellation. All
    errors are reported via pubsub and HC.ShowException rather than raised.
    
    Fix: the HTTPConnection was previously never closed; it is now closed in
    a finally block once the body has been read.
    """
    
    try:
        
        parse_result = urlparse.urlparse(url)
        
        connection = httplib.HTTPConnection(parse_result.hostname, timeout=20)
        
        try:
            
            connection.request('GET', url)
            
            response = connection.getresponse()
            
            try:
                
                total_num_bytes = int(response.getheader('Content-Length'))
                
                get_message = lambda num_bytes_so_far: message_string + ' - ' + HC.ConvertIntToBytes(num_bytes_so_far) + '/' + HC.ConvertIntToBytes(total_num_bytes)
                
            except Exception:
                
                # no (or unparseable) Content-Length header; show an open-ended count
                total_num_bytes = None
                
                get_message = lambda num_bytes_so_far: message_string + ' - ' + HC.ConvertIntToBytes(num_bytes_so_far)
                
            
            block_size = 64 * 1024
            
            num_bytes_so_far = 0
            
            temp_path = HC.GetTempPath()
            
            HC.pubsub.pub('message_gauge_info', job_key, total_num_bytes, num_bytes_so_far, get_message(num_bytes_so_far))
            
            with open(temp_path, 'wb') as f:
                
                while True:
                    
                    if HC.shutdown or job_key.IsCancelled(): return
                    
                    block = response.read(block_size)
                    
                    num_bytes_so_far += len(block)
                    
                    HC.pubsub.pub('message_gauge_info', job_key, total_num_bytes, num_bytes_so_far, get_message(num_bytes_so_far))
                    
                    if block == '': break
                    
                    f.write(block)
                    
                
            
        finally:
            
            connection.close() # previously leaked
            
        
        HC.pubsub.pub('message_gauge_info', job_key, None, None, 'importing ' + message_string)
        
        (result, hash) = HC.app.WriteSynchronous('import_file', temp_path)
        
        if result in ('successful', 'redundant'): HC.pubsub.pub('message_gauge_show_file_button', job_key, message_string, {hash})
        elif result == 'deleted': HC.pubsub.pub('message_gauge_info', job_key, None, None, 'File was already deleted!')
        
    except Exception as e:
        
        HC.pubsub.pub('message_gauge_info', job_key, None, None, 'Error with ' + message_string + '!')
        
        HC.ShowException(e)
2013-09-11 21:28:19 +00:00
def GetYoutubeFormats(youtube_url):
    """Return { (extension, resolution): (stream_url, title) } for the video's flv/mp4 streams."""
    
    try: p = pafy.Pafy(youtube_url)
    except: raise Exception('Could not fetch video info from youtube!')
    
    wanted_extensions = ('flv', 'mp4')
    
    info = {}
    
    for stream in p.streams:
        
        if stream.extension in wanted_extensions:
            
            info[(stream.extension, stream.resolution)] = (stream.url, stream.title)
            
        
    
    return info
2013-04-10 18:10:37 +00:00
class Downloader():
    """Base class for per-site gallery downloaders.
    
    Handles connection caching, gallery paging and de-duplication of
    previously seen urls; subclasses supply _GetNextGalleryPageURL(s),
    _ParseGalleryPage and, where needed, _EstablishSession.
    """
    
    def __init__(self):
        
        self._we_are_done = False
        
        self._connections = {}
        
        self._report_hooks = []
        
        self._all_urls_so_far = set()
        
        self._num_pages_done = 0
        
    
    def _DownloadFile(self, connection, *args, **kwargs):
        
        # attach our progress hooks just for the duration of this download
        for hook in self._report_hooks: connection.AddReportHook(hook)
        
        result = connection.geturl(*args, **kwargs)
        
        connection.ClearReportHooks()
        
        return result
        
    
    def _EstablishSession(self, connection): pass # subclasses set cookies/log in here
    
    def _GetConnection(self, url):
        
        # one cached connection per (scheme, host, port)
        parse_result = urlparse.urlparse(url)
        
        key = (parse_result.scheme, parse_result.hostname, parse_result.port)
        
        if key not in self._connections:
            
            (scheme, host, port) = key
            
            connection = HC.get_connection(scheme=scheme, host=host, port=port)
            
            self._EstablishSession(connection)
            
            self._connections[key] = connection
            
        
        return self._connections[key]
        
    
    def _GetNextGalleryPageURLs(self): return (self._GetNextGalleryPageURL(),)
    
    def AddReportHook(self, hook): self._report_hooks.append(hook)
    
    def ClearReportHooks(self): self._report_hooks = []
    
    def GetAnotherPage(self):
        
        if self._we_are_done: return []
        
        url_info = []
        
        for gallery_url in self._GetNextGalleryPageURLs():
            
            connection = self._GetConnection(gallery_url)
            
            data = connection.geturl(gallery_url)
            
            page_of_url_info = self._ParseGalleryPage(data, gallery_url)
            
            # drop anything already seen, so we cannot get into an accidental infinite loop
            url_info += [info for info in page_of_url_info if info[0] not in self._all_urls_so_far]
            
            self._all_urls_so_far.update([info[0] for info in url_info])
            
        
        # url_info now only contains new url info
        
        self._num_pages_done += 1
        
        return url_info
        
    
    def GetFile(self, url, *args):
        
        connection = self._GetConnection(url)
        
        return self._DownloadFile(connection, url, response_to_path=True)
        
    
    def GetFileAndTags(self, url, *args):
        
        temp_path = self.GetFile(url, *args)
        
        tags = self.GetTags(url, *args)
        
        return (temp_path, tags)
        
    
    def GetTags(self, url): pass
    
    def SetupGallerySearch(self): pass
2013-04-10 18:10:37 +00:00
class DownloaderBooru(Downloader):
    """Downloader for a generic booru, driven by the booru object's parsing info.
    
    Fix: removed the dead local 'image_base = None' in _ParseImagePage, which
    was assigned but never read.
    """
    
    def __init__(self, booru, tags):
        
        self._booru = booru
        self._tags = tags
        
        # thumbs-per-page, learnt from the first gallery page we parse
        self._gallery_advance_num = None
        
        (self._search_url, self._advance_by_page_num, self._search_separator, self._thumb_classname) = booru.GetGalleryParsingInfo()
        
        Downloader.__init__(self)
        
    
    def _GetNextGalleryPageURL(self):
        
        # some boorus page by page number, others by result offset
        if self._advance_by_page_num: index = 1 + self._num_pages_done
        else:
            
            if self._gallery_advance_num is None: index = 0
            else: index = self._num_pages_done * self._gallery_advance_num
            
        
        return self._search_url.replace('%tags%', self._search_separator.join(self._tags)).replace('%index%', HC.u(index))
        
    
    def _ParseGalleryPage(self, html, url_base):
        """Return a list of (post_url,) tuples for the thumbnails on a gallery page."""
        
        urls_set = set()
        urls = []
        
        soup = bs4.BeautifulSoup(html)
        
        # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
        def starts_with_classname(classname): return classname is not None and classname.startswith(self._thumb_classname)
        
        thumbnails = soup.find_all(class_=starts_with_classname)
        
        if self._gallery_advance_num is None:
            
            if len(thumbnails) == 0: self._we_are_done = True
            else: self._gallery_advance_num = len(thumbnails)
            
        
        for thumbnail in thumbnails:
            
            links = thumbnail.find_all('a')
            
            if thumbnail.name == 'a': links.append(thumbnail)
            
            for link in links:
                
                if link.string is not None and link.string == 'Image Only': continue # rule 34 @ paheal fix
                
                url = link['href']
                
                url = urlparse.urljoin(url_base, url)
                
                if url not in urls_set:
                    
                    urls_set.add(url)
                    
                    urls.append((url,))
                    
                
            
        
        return urls
        
    
    def _ParseImagePage(self, html, url_base):
        """Parse a booru post page, returning (image_url, tags)."""
        
        (search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces) = self._booru.GetData()
        
        soup = bs4.BeautifulSoup(html)
        
        if image_id is not None:
            
            image = soup.find(id=image_id)
            
            image_url = image['src']
            
        
        if image_data is not None:
            
            links = soup.find_all('a')
            
            for link in links:
                
                if link.string == image_data: image_url = link['href']
                
            
        
        image_url = urlparse.urljoin(url_base, image_url)
        
        image_url = image_url.replace('sample/sample-', '') # fix for danbooru resizing
        
        tags = []
        
        for (tag_classname, namespace) in tag_classnames_to_namespaces.items():
            
            tag_list_entries = soup.find_all(class_=tag_classname)
            
            for tag_list_entry in tag_list_entries:
                
                links = tag_list_entry.find_all('a')
                
                if tag_list_entry.name == 'a': links.append(tag_list_entry)
                
                for link in links:
                    
                    # '?', '-', '+' are the wiki/add/remove affordance links, not tags
                    if link.string not in ('?', '-', '+'):
                        
                        if namespace == '': tags.append(link.string)
                        else: tags.append(namespace + ':' + link.string)
                        
                    
                
            
        
        return (image_url, tags)
        
    
    def _GetFileURLAndTags(self, url):
        
        connection = self._GetConnection(url)
        
        html = connection.geturl(url)
        
        return self._ParseImagePage(html, url)
        
    
    def GetFile(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        connection = self._GetConnection(file_url)
        
        return self._DownloadFile(connection, file_url, response_to_path=True)
        
    
    def GetFileAndTags(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        connection = self._GetConnection(file_url)
        
        temp_path = self._DownloadFile(connection, file_url, response_to_path=True)
        
        return (temp_path, tags)
        
    
    def GetTags(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        return tags
class DownloaderDeviantArt(Downloader):
    """Downloader for a deviantart artist's gallery (24 thumbs per page)."""
    
    def __init__(self, artist):
        
        self._gallery_url = 'http://' + artist + '.deviantart.com/gallery/?catpath=/&offset='
        
        Downloader.__init__(self)
        
    
    def _GetNextGalleryPageURL(self): return self._gallery_url + HC.u(self._num_pages_done * 24)
    
    def _ParseGalleryPage(self, html, url_base):
        
        results = []
        
        soup = bs4.BeautifulSoup(html)
        
        thumbs_container = soup.find(class_='zones-container')
        
        def starts_with_thumb(classname): return classname is not None and classname.startswith('thumb')
        
        for link in thumbs_container.find_all('a', class_=starts_with_thumb):
            
            try: # starts_with_thumb picks up some false positives, but they break
                
                page_url = link['href'] # something in the form of blah.da.com/art/blah-123456
                
                raw_title = link['title'] # sweet dolls by ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
                
                # split right-to-left on ' by ' by reversing the string first,
                # since the title itself may contain ' by '
                raw_title_reversed = raw_title[::-1]
                
                (creator_and_date_and_tags_reversed, title_reversed) = raw_title_reversed.split(' yb ', 1)
                
                creator_and_date_and_tags = creator_and_date_and_tags_reversed[::-1] # ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
                
                (creator_with_username_char, date_and_tags) = creator_and_date_and_tags.split(',', 1)
                
                creator = creator_with_username_char[1:] # AngeniaC
                
                title = title_reversed[::-1] # sweet dolls
                
                try:
                    
                    (date_gumpf, raw_category_tags) = date_and_tags.split(' in ', 1)
                    
                    category_tags = raw_category_tags.split(' > ')
                    
                except Exception as e:
                    
                    HC.ShowException(e)
                    
                    category_tags = []
                    
                
                tags = ['title:' + title, 'creator:' + creator]
                
                tags.extend(category_tags)
                
                results.append((page_url, tags))
                
            except: pass
            
        
        return results
        
    
    def _ParseImagePage(self, html):
        
        soup = bs4.BeautifulSoup(html)
        
        # if can find download link:
        if False:
            
            pass # go fetch the popup page using tokens as appropriate. feels like it needs the GET token and a referrer, as middle click just redirects back to image page
            
        else:
            
            img = soup.find(class_='dev-content-full')
            
            return img['src']
            
        
    
    def _GetFileURL(self, url):
        
        connection = self._GetConnection(url)
        
        html = connection.geturl(url)
        
        return self._ParseImagePage(html)
        
    
    def GetFile(self, url, tags):
        
        file_url = self._GetFileURL(url)
        
        connection = self._GetConnection(file_url)
        
        return self._DownloadFile(connection, file_url, response_to_path=True)
        
    
    def GetTags(self, url, tags): return tags
class DownloaderGiphy(Downloader):
    """Downloader for giphy's gif search api."""
    
    def __init__(self, tag):
        
        self._gallery_url = 'http://giphy.com/api/gifs?tag=' + tag.replace(' ', '+') + '&page='
        
        Downloader.__init__(self)
        
    
    def _GetNextGalleryPageURL(self): return self._gallery_url + HC.u(self._num_pages_done + 1)
    
    def _ParseGalleryPage(self, data, url_base):
        
        json_dict = json.loads(data)
        
        if 'data' not in json_dict: return []
        
        return [(d['image_original_url'], d['id']) for d in json_dict['data']]
        
    
    def GetTags(self, url, id):
        
        # fetch the per-gif api record, which carries the tags
        url = 'http://giphy.com/api/gifs/' + HC.u(id)
        
        connection = self._GetConnection(url)
        
        try:
            
            raw_json = connection.geturl(url)
            
            json_dict = json.loads(raw_json)
            
            tags = [tag_data['name'] for tag_data in json_dict['data']['tags']]
            
        except Exception as e:
            
            HC.ShowException(e)
            
            tags = []
            
        
        return tags
class DownloaderHentaiFoundry(Downloader):
    """Downloader for hentai-foundry artist/tag searches.
    
    Requires a logged-in session (cookies from HC.app) and posts the user's
    filter options, with the site's CSRF token, before searching.
    """
    
    def __init__(self, query_type, query, advanced_hentai_foundry_options):
        
        self._query_type = query_type
        self._query = query
        self._advanced_hentai_foundry_options = advanced_hentai_foundry_options
        
        Downloader.__init__(self)
        
    
    def _EstablishSession(self, connection):
        
        cookies = HC.app.GetWebCookies('hentai foundry')
        
        for (key, value) in cookies.items(): connection.SetCookie(key, value)
        
    
    def _GetFileURLAndTags(self, url):
        
        connection = self._GetConnection(url)
        
        html = connection.geturl(url)
        
        return self._ParseImagePage(html, url)
        
    
    def _GetNextGalleryPageURL(self):
        
        next_page = HC.u(self._num_pages_done + 1)
        
        if self._query_type in ('artist', 'artist pictures'):
            
            artist = self._query
            
            return 'http://www.hentai-foundry.com/pictures/user/' + artist + '/page/' + next_page
            
        elif self._query_type == 'artist scraps':
            
            artist = self._query
            
            return 'http://www.hentai-foundry.com/pictures/user/' + artist + '/scraps' + '/page/' + next_page
            
        elif self._query_type == 'tags':
            
            tags = self._query
            
            # scraps = 0 hide
            # -1 means show both
            # 1 means scraps only. wetf
            return 'http://www.hentai-foundry.com/search/pictures?query=' + '+'.join(tags) + '&search_in=all&scraps=-1&page=' + next_page
            
        
    
    def _ParseGalleryPage(self, html, url_base):
        
        soup = bs4.BeautifulSoup(html)
        
        def correct_url(href):
            
            # a good url is in the form "/pictures/user/artist_name/file_id/title"
            if href.count('/') == 5 and href.startswith('/pictures/user/'):
                
                (nothing, pictures, user, artist_name, file_id, title) = href.split('/')
                
                # /pictures/user/artist_name/page/3
                if file_id != 'page': return True
                
            
            return False
            
        
        links = soup.find_all('a', href=correct_url)
        
        seen = set()
        result_urls = []
        
        for link in links:
            
            url = 'http://www.hentai-foundry.com' + link['href']
            
            if url not in seen:
                
                seen.add(url)
                
                result_urls.append((url,))
                
            
        
        # this is copied from old code. surely we can improve it?
        if 'class="next"' not in html: self._we_are_done = True
        
        return result_urls
        
    
    def _ParseImagePage(self, html, url_base):
        
        # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
        # find http://pictures.hentai-foundry.com//
        # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144.jpg
        # the .jpg bit is what we really need, but whatever
        try:
            
            index = html.index('http://pictures.hentai-foundry.com//')
            
            stuff = html[index:index + 100]
            
            try: (image_url, gumpf) = stuff.split('"', 1)
            except: (image_url, gumpf) = stuff.split("'", 1)
            
        except: raise Exception('Could not parse image url!')
        
        soup = bs4.BeautifulSoup(html)
        
        tags = []
        
        try:
            
            title = soup.find('title')
            
            (data, nothing) = HC.u(title.string).split(' - Hentai Foundry')
            
            # split right-side first, because the title might have ' by ' in it
            data_reversed = data[::-1]
            
            (artist_reversed, title_reversed) = data_reversed.split(' yb ')
            
            artist = artist_reversed[::-1]
            title = title_reversed[::-1]
            
            tags.append('creator:' + artist)
            tags.append('title:' + title)
            
        except: pass
        
        for tag_link in soup.find_all('a', rel='tag'): tags.append(tag_link.string)
        
        return (image_url, tags)
        
    
    def GetFile(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        connection = self._GetConnection(file_url)
        
        return self._DownloadFile(connection, file_url, response_to_path=True)
        
    
    def GetFileAndTags(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        connection = self._GetConnection(file_url)
        
        temp_path = self._DownloadFile(connection, file_url, response_to_path=True)
        
        return (temp_path, tags)
        
    
    def GetTags(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        return tags
        
    
    def SetupGallerySearch(self):
        
        connection = self._GetConnection('http://www.hentai-foundry.com/site/filters')
        
        cookies = connection.GetCookies()
        
        raw_csrf = cookies['YII_CSRF_TOKEN'] # YII_CSRF_TOKEN=19b05b536885ec60b8b37650a32f8deb11c08cd1s%3A40%3A%222917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32%22%3B
        
        processed_csrf = urllib.unquote(raw_csrf) # 19b05b536885ec60b8b37650a32f8deb11c08cd1s:40:"2917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32";
        
        csrf_token = processed_csrf.split('"')[1] # the 2917... bit
        
        self._advanced_hentai_foundry_options['YII_CSRF_TOKEN'] = csrf_token
        
        body = urllib.urlencode(self._advanced_hentai_foundry_options)
        
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        
        connection.request('POST', '/site/filters', headers=headers, body=body)
2013-05-15 18:58:14 +00:00
class DownloaderNewgrounds(Downloader):
    """Downloader for a newgrounds artist's games and movies (swf files)."""
    
    def __init__(self, query):
        
        self._query = query
        
        Downloader.__init__(self)
        
    
    def _GetFileURLAndTags(self, url):
        
        connection = self._GetConnection(url)
        
        html = connection.geturl(url)
        
        return self._ParseImagePage(html, url)
        
    
    def _GetNextGalleryPageURLs(self):
        
        # no paging here: fetch both listings in one go and declare done
        artist = self._query
        
        self._we_are_done = True
        
        return ['http://' + artist + '.newgrounds.com/games/', 'http://' + artist + '.newgrounds.com/movies/']
        
    
    def _ParseGalleryPage(self, html, url_base):
        
        soup = bs4.BeautifulSoup(html)
        
        fatcol = soup.find('div', class_='fatcol')
        
        seen = set()
        result_urls = []
        
        for link in fatcol.find_all('a'):
            
            try:
                
                href = link['href']
                
                if href not in seen and href.startswith('http://www.newgrounds.com/portal/view/'):
                    
                    seen.add(href)
                    
                    result_urls.append((href,))
                    
                
            except: pass
            
        
        return result_urls
        
    
    def _ParseImagePage(self, html, url_base):
        
        soup = bs4.BeautifulSoup(html)
        
        tags = set()
        
        # creators, from the author link boxes
        author_links = soup.find('ul', class_='authorlinks')
        
        if author_links is not None:
            
            for link in author_links.find_all('a'):
                
                try:
                    
                    href = link['href'] # http://warlord-of-noodles.newgrounds.com
                    
                    creator = href.replace('http://', '').replace('.newgrounds.com', '')
                    
                    tags.add(u'creator:' + creator)
                    
                except: pass
                
            
        
        # title
        try:
            
            title = soup.find('title')
            
            tags.add(u'title:' + title.string)
            
        except: pass
        
        # site tags
        for link in soup.find_all('a'):
            
            try:
                
                href = link['href']
                
                if '/browse/tag/' in href: tags.add(link.string)
                
            except: pass
            
        
        # the swf itself lives on uploads.ungrounded.net
        try:
            
            components = html.split('"http://uploads.ungrounded.net/')
            
            # there is sometimes another bit of api flash earlier on that we don't want
            # it is called http://uploads.ungrounded.net/apiassets/sandbox.swf
            if len(components) == 2: flash_url = components[1]
            else: flash_url = components[2]
            
            flash_url = flash_url.split('"', 1)[0]
            
            flash_url = 'http://uploads.ungrounded.net/' + flash_url
            
        except: raise Exception('Could not find the swf file! It was probably an mp4!')
        
        return (flash_url, tags)
        
    
    def GetFile(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        connection = self._GetConnection(file_url)
        
        return self._DownloadFile(connection, file_url, response_to_path=True)
        
    
    def GetFileAndTags(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        connection = self._GetConnection(file_url)
        
        temp_path = self._DownloadFile(connection, file_url, response_to_path=True)
        
        return (temp_path, tags)
        
    
    def GetTags(self, url):
        
        (file_url, tags) = self._GetFileURLAndTags(url)
        
        return tags
2013-04-10 18:10:37 +00:00
class DownloaderPixiv(Downloader):
    """Downloader for pixiv artist-id or tag searches.
    
    Needs a logged-in session; pixiv also refuses to serve full-size images
    unless the 'big' page is given as the referrer.
    """
    
    def __init__(self, query_type, query):
        
        self._query_type = query_type
        self._query = query
        
        Downloader.__init__(self)
        
    
    def _EstablishSession(self, connection):
        
        cookies = HC.app.GetWebCookies('pixiv')
        
        for (key, value) in cookies.items(): connection.SetCookie(key, value)
        
    
    def _GetNextGalleryPageURL(self):
        
        if self._query_type == 'artist':
            
            artist_id = self._query
            
            gallery_url = 'http://www.pixiv.net/member_illust.php?id=' + HC.u(artist_id)
            
        elif self._query_type == 'tag':
            
            tag = self._query
            
            tag = urllib.quote(tag.encode('utf-8'))
            
            gallery_url = 'http://www.pixiv.net/search.php?word=' + tag + '&s_mode=s_tag_full&order=date_d'
            
        
        return gallery_url + '&p=' + HC.u(self._num_pages_done + 1)
        
    
    def _ParseGalleryPage(self, html, url_base):
        
        soup = bs4.BeautifulSoup(html)
        
        # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
        return [(urlparse.urljoin(url_base, thumbnail_link['href']),) for thumbnail_link in soup.find_all(class_='work')]
        
    
    def _ParseImagePage(self, html, page_url):
        
        soup = bs4.BeautifulSoup(html)
        
        #
        
        # this is the page that holds the full size of the image.
        # pixiv won't serve the image unless it thinks this page is the referrer
        referral_url = page_url.replace('medium', 'big') # http://www.pixiv.net/member_illust.php?mode=big&illust_id=33500690
        
        #
        
        works_display = soup.find(class_='works_display')
        
        img = works_display.find('img')
        
        img_url = img['src'] # http://i2.pixiv.net/img122/img/amanekukagenoyuragi/34992468_m.png
        
        image_url = img_url.replace('_m.', '.') # http://i2.pixiv.net/img122/img/amanekukagenoyuragi/34992468.png
        
        #
        
        tags = soup.find('ul', class_='tags')
        
        tags = [a_item.string for a_item in tags.find_all('a', class_='text')]
        
        user = soup.find('h1', class_='user')
        
        tags.append('creator:' + user.string)
        
        title_parent = soup.find('section', class_='work-info')
        
        title = title_parent.find('h1', class_='title')
        
        tags.append('title:' + title.string)
        
        # http://i2.pixiv.net/img02/img/dnosuke/462657.jpg -> dnosuke
        try: tags.append('creator:' + image_url.split('/')[-2])
        except: pass
        
        return (referral_url, image_url, tags)
        
    
    def _GetReferralURLFileURLAndTags(self, page_url):
        
        connection = self._GetConnection(page_url)
        
        html = connection.geturl(page_url)
        
        return self._ParseImagePage(html, page_url)
        
    
    def GetFile(self, url):
        
        (referral_url, image_url, tags) = self._GetReferralURLFileURLAndTags(url)
        
        connection = self._GetConnection(image_url)
        
        headers = {'Referer': referral_url}
        
        return self._DownloadFile(connection, image_url, headers=headers, response_to_path=True)
        
    
    def GetFileAndTags(self, url):
        
        (referral_url, image_url, tags) = self._GetReferralURLFileURLAndTags(url)
        
        connection = self._GetConnection(image_url)
        
        headers = {'Referer': referral_url}
        
        temp_path = self._DownloadFile(connection, image_url, headers=headers, response_to_path=True)
        
        return (temp_path, tags)
        
    
    def GetTags(self, url):
        
        (referral_url, image_url, tags) = self._GetReferralURLFileURLAndTags(url)
        
        return tags
2013-04-10 18:10:37 +00:00
class DownloaderTumblr(Downloader):
    """Downloader for a tumblr blog, via the old /api/read/json endpoint (50 posts per page)."""
    
    def __init__(self, username):
        
        self._gallery_url = 'http://' + username + '.tumblr.com/api/read/json?start=%start%&num=50'
        
        Downloader.__init__(self)
        
    
    def _GetNextGalleryPageURL(self): return self._gallery_url.replace('%start%', HC.u(self._num_pages_done * 50))
    
    def _ParseGalleryPage(self, data, url_base):
        
        # the endpoint returns jsonp; strip the 'var tumblr_api_read = ' prefix
        # and the trailing couple of newline chars before parsing
        processed_raw_json = data.split('var tumblr_api_read = ')[1][:-2]
        
        json_object = json.loads(processed_raw_json)
        
        results = []
        
        if 'posts' in json_object:
            
            for post in json_object['posts']:
                
                tags = post['tags'] if 'tags' in post else []
                
                if post['type'] != 'photo': continue
                
                if len(post['photos']) == 0:
                    
                    # single-photo post
                    try: results.append((post['photo-url-1280'], tags))
                    except: pass
                    
                else:
                    
                    # photoset post
                    for photo in post['photos']:
                        
                        try: results.append((photo['photo-url-1280'], tags))
                        except: pass
                        
                    
                
            
        
        return results
        
    
    def GetTags(self, url, tags): return tags