# hydrus/include/HydrusDownloading.py
import bs4
import collections
import httplib
import HydrusConstants as HC
import json
import lxml
import pafy
import threading
import time
import traceback
import urllib
import urlparse
import wx
def ConvertServiceIdentifiersToTagsToServiceIdentifiersToContentUpdates( hash, service_identifiers_to_tags ):
    
    hashes = set( ( hash, ) )
    
    service_identifiers_to_content_updates = {}
    
    for ( service_identifier, tags ) in service_identifiers_to_tags.items():
        
        # local tags can be added outright; remote tags must be pended for upload
        if service_identifier == HC.LOCAL_TAG_SERVICE_IDENTIFIER: action = HC.CONTENT_UPDATE_ADD
        else: action = HC.CONTENT_UPDATE_PENDING
        
        content_updates = [ HC.ContentUpdate( HC.CONTENT_DATA_TYPE_MAPPINGS, action, ( tag, hashes ) ) for tag in tags ]
        
        service_identifiers_to_content_updates[ service_identifier ] = content_updates
        
    return service_identifiers_to_content_updates
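# An illustrative sketch, not called anywhere in the client: given a hash and a
# mapping like the below, the helper above yields ADD updates for the local tag
# service and PENDING updates for anything remote. The remote identifier here is
# a placeholder.
#
#   service_identifiers_to_tags = {
#       HC.LOCAL_TAG_SERVICE_IDENTIFIER : [ 'creator:someone', 'title:example' ],
#       some_remote_tag_service_identifier : [ 'creator:someone' ]
#   }
#
#   updates = ConvertServiceIdentifiersToTagsToServiceIdentifiersToContentUpdates( hash, service_identifiers_to_tags )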
def GetDownloader( site_download_type, *args ):
    
    if site_download_type == HC.SITE_DOWNLOAD_TYPE_BOORU: c = DownloaderBooru
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_DEVIANT_ART: c = DownloaderDeviantArt
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_GIPHY: c = DownloaderGiphy
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_HENTAI_FOUNDRY: c = DownloaderHentaiFoundry
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_PIXIV: c = DownloaderPixiv
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_TUMBLR: c = DownloaderTumblr
    elif site_download_type == HC.SITE_DOWNLOAD_TYPE_NEWGROUNDS: c = DownloaderNewgrounds
    
    return c( *args )
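# A minimal usage sketch, assuming a running client with a booru object from the db;
# the tag list is illustrative and this helper is not part of the module proper.
def _ExampleGetDownloader( booru ):
    
    downloader = GetDownloader( HC.SITE_DOWNLOAD_TYPE_BOORU, booru, [ 'blue_eyes', 'blonde_hair' ] )
    
    return downloader.GetAnotherPage() # a list of ( url, ) tuples for the first gallery page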
def ConvertTagsToServiceIdentifiersToTags( tags, advanced_tag_options ):
    
    tags = [ tag for tag in tags if tag is not None ]
    
    service_identifiers_to_tags = {}
    
    siblings_manager = HC.app.GetManager( 'tag_siblings' )
    parents_manager = HC.app.GetManager( 'tag_parents' )
    
    for ( service_identifier, namespaces ) in advanced_tag_options.items():
        
        if len( namespaces ) > 0:
            
            tags_to_add_here = []
            
            for namespace in namespaces:
                
                if namespace == '': tags_to_add_here.extend( [ HC.CleanTag( tag ) for tag in tags if not ':' in tag ] )
                else: tags_to_add_here.extend( [ HC.CleanTag( tag ) for tag in tags if tag.startswith( namespace + ':' ) ] )
                
            if len( tags_to_add_here ) > 0:
                
                tags_to_add_here = siblings_manager.CollapseTags( tags_to_add_here )
                tags_to_add_here = parents_manager.ExpandTags( service_identifier, tags_to_add_here )
                
                service_identifiers_to_tags[ service_identifier ] = tags_to_add_here
                
    return service_identifiers_to_tags
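# advanced_tag_options maps each tag service to the namespaces it should receive, with
# '' standing for unnamespaced tags. A hedged example call (the mapping is illustrative):
#
#   advanced_tag_options = { HC.LOCAL_TAG_SERVICE_IDENTIFIER : [ '', 'creator', 'title' ] }
#   service_identifiers_to_tags = ConvertTagsToServiceIdentifiersToTags( tags, advanced_tag_options )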
def DownloadYoutubeURL( job_key, url, message_string ):
    
    try:
        
        parse_result = urlparse.urlparse( url )
        
        connection = httplib.HTTPConnection( parse_result.hostname, timeout = 20 )
        
        connection.request( 'GET', url )
        
        response = connection.getresponse()
        
        try:
            
            total_num_bytes = int( response.getheader( 'Content-Length' ) )
            
            get_message = lambda num_bytes_so_far: message_string + ' - ' + HC.ConvertIntToBytes( num_bytes_so_far ) + '/' + HC.ConvertIntToBytes( total_num_bytes )
            
        except:
            
            # no (or unparseable) Content-Length header, so we cannot report a total
            total_num_bytes = None
            
            get_message = lambda num_bytes_so_far: message_string + ' - ' + HC.ConvertIntToBytes( num_bytes_so_far )
            
        block_size = 64 * 1024
        num_bytes_so_far = 0
        
        temp_path = HC.GetTempPath()
        
        HC.pubsub.pub( 'message_gauge_info', job_key, total_num_bytes, num_bytes_so_far, get_message( num_bytes_so_far ) )
        
        with open( temp_path, 'wb' ) as f:
            
            while True:
                
                if HC.shutdown or job_key.IsCancelled(): return
                
                block = response.read( block_size )
                
                num_bytes_so_far += len( block )
                
                HC.pubsub.pub( 'message_gauge_info', job_key, total_num_bytes, num_bytes_so_far, get_message( num_bytes_so_far ) )
                
                if block == '': break
                
                f.write( block )
                
        HC.pubsub.pub( 'message_gauge_info', job_key, None, None, 'importing ' + message_string )
        
        ( result, hash ) = HC.app.WriteSynchronous( 'import_file', temp_path )
        
        if result in ( 'successful', 'redundant' ): HC.pubsub.pub( 'message_gauge_show_file_button', job_key, message_string, { hash } )
        elif result == 'deleted': HC.pubsub.pub( 'message_gauge_info', job_key, None, None, 'File was already deleted!' )
        
    except Exception as e:
        
        HC.pubsub.pub( 'message_gauge_info', job_key, None, None, 'Error with ' + message_string + '!' )
        
        HC.ShowException( e )
def GetYoutubeFormats( youtube_url ):
    
    try: p = pafy.Pafy( youtube_url )
    except: raise Exception( 'Could not fetch video info from youtube!' )
    
    info = { ( s.extension, s.resolution ) : ( s.url, s.title ) for s in p.streams if s.extension in ( 'flv', 'mp4' ) }
    
    return info
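# A sketch of how the two youtube functions fit together, assuming a job_key from the
# messaging system; the ( extension, resolution ) keys come straight from pafy's stream
# attributes, so their exact values differ per video.
def _ExampleYoutubeDownload( job_key, youtube_url ):
    
    info = GetYoutubeFormats( youtube_url )
    
    # pick any offered format; keys look something like ( 'mp4', '640x360' )
    ( ( extension, resolution ), ( url, title ) ) = list( info.items() )[0]
    
    DownloadYoutubeURL( job_key, url, title )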
class Downloader():
    
    def __init__( self ):
        
        self._we_are_done = False
        self._connections = {}
        self._report_hooks = []
        self._all_urls_so_far = set()
        self._num_pages_done = 0
        
    def _DownloadFile( self, connection, *args, **kwargs ):
        
        for hook in self._report_hooks: connection.AddReportHook( hook )
        
        response = connection.geturl( *args, **kwargs )
        
        connection.ClearReportHooks()
        
        return response
        
    def _EstablishSession( self, connection ): pass
    
    def _GetConnection( self, url ):
        
        parse_result = urlparse.urlparse( url )
        
        ( scheme, host, port ) = ( parse_result.scheme, parse_result.hostname, parse_result.port )
        
        if ( scheme, host, port ) not in self._connections:
            
            connection = HC.get_connection( scheme = scheme, host = host, port = port )
            
            self._EstablishSession( connection )
            
            self._connections[ ( scheme, host, port ) ] = connection
            
        return self._connections[ ( scheme, host, port ) ]
        
    def _GetNextGalleryPageURLs( self ): return ( self._GetNextGalleryPageURL(), )
    
    def AddReportHook( self, hook ): self._report_hooks.append( hook )
    
    def ClearReportHooks( self ): self._report_hooks = []
    
    def GetAnotherPage( self ):
        
        if self._we_are_done: return []
        
        urls = self._GetNextGalleryPageURLs()
        
        url_info = []
        
        for url in urls:
            
            connection = self._GetConnection( url )
            
            data = connection.geturl( url )
            
            page_of_url_info = self._ParseGalleryPage( data, url )
            
            # stop ourselves getting into an accidental infinite loop
            url_info += [ info for info in page_of_url_info if info[0] not in self._all_urls_so_far ]
            
            self._all_urls_so_far.update( [ info[0] for info in url_info ] )
            
            # now url_info only contains new url info
            
        self._num_pages_done += 1
        
        return url_info
        
    def GetFile( self, url, *args ):
        
        connection = self._GetConnection( url )
        
        return self._DownloadFile( connection, url, response_to_path = True )
        
    def GetFileAndTags( self, url, *args ):
        
        temp_path = self.GetFile( url, *args )
        
        tags = self.GetTags( url, *args )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ): pass
    
    def SetupGallerySearch( self ): pass
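# The driving pattern any Downloader subclass expects, sketched for reference: keep
# asking for pages until an empty list comes back. GetAnotherPage filters out urls it
# has already seen, so this also terminates on galleries that repeat their last page.
def _ExampleDriveDownloader( downloader ):
    
    all_url_info = []
    
    while True:
        
        page_of_url_info = downloader.GetAnotherPage()
        
        if len( page_of_url_info ) == 0: break
        
        all_url_info.extend( page_of_url_info )
        
    return all_url_info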
class DownloaderBooru( Downloader ):
    
    def __init__( self, booru, tags ):
        
        self._booru = booru
        self._tags = tags
        
        self._gallery_advance_num = None
        
        ( self._search_url, self._advance_by_page_num, self._search_separator, self._thumb_classname ) = booru.GetGalleryParsingInfo()
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ):
        
        if self._advance_by_page_num: index = 1 + self._num_pages_done
        else:
            
            # this booru advances by result offset; the step size is learned from the
            # number of thumbnails on the first page we parse
            if self._gallery_advance_num is None: index = 0
            else: index = self._num_pages_done * self._gallery_advance_num
            
        return self._search_url.replace( '%tags%', self._search_separator.join( self._tags ) ).replace( '%index%', HC.u( index ) )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
        def starts_with_classname( classname ): return classname is not None and classname.startswith( self._thumb_classname )
        
        thumbnails = soup.find_all( class_ = starts_with_classname )
        
        if self._gallery_advance_num is None:
            
            if len( thumbnails ) == 0: self._we_are_done = True
            else: self._gallery_advance_num = len( thumbnails )
            
        for thumbnail in thumbnails:
            
            links = thumbnail.find_all( 'a' )
            
            if thumbnail.name == 'a': links.append( thumbnail )
            
            for link in links:
                
                if link.string is not None and link.string == 'Image Only': continue # rule 34 @ paheal fix
                
                url = link[ 'href' ]
                
                url = urlparse.urljoin( url_base, url )
                
                if url not in urls_set:
                    
                    urls_set.add( url )
                    urls.append( ( url, ) )
                    
        return urls
        
    def _ParseImagePage( self, html, url_base ):
        
        ( search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces ) = self._booru.GetData()
        
        soup = bs4.BeautifulSoup( html )
        
        image_base = None
        
        if image_id is not None:
            
            image = soup.find( id = image_id )
            
            image_url = image[ 'src' ]
            
        if image_data is not None:
            
            links = soup.find_all( 'a' )
            
            for link in links:
                
                if link.string == image_data: image_url = link[ 'href' ]
                
        image_url = urlparse.urljoin( url_base, image_url )
        
        image_url = image_url.replace( 'sample/sample-', '' ) # fix for danbooru resizing
        
        tags = []
        
        for ( tag_classname, namespace ) in tag_classnames_to_namespaces.items():
            
            tag_list_entries = soup.find_all( class_ = tag_classname )
            
            for tag_list_entry in tag_list_entries:
                
                links = tag_list_entry.find_all( 'a' )
                
                if tag_list_entry.name == 'a': links.append( tag_list_entry )
                
                for link in links:
                    
                    if link.string not in ( '?', '-', '+' ):
                        
                        if namespace == '': tags.append( link.string )
                        else: tags.append( namespace + ':' + link.string )
                        
        return ( image_url, tags )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return self._DownloadFile( connection, file_url, response_to_path = True )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        temp_path = self._DownloadFile( connection, file_url, response_to_path = True )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
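# A hedged end-to-end sketch for the booru downloader: crawl one gallery page, then
# fetch each file and its parsed tags. Real callers thread this and feed the results
# into the import pipeline rather than collecting them in a list.
def _ExampleBooruPage( booru, tags ):
    
    downloader = DownloaderBooru( booru, tags )
    
    results = []
    
    for ( url, ) in downloader.GetAnotherPage():
        
        ( temp_path, file_tags ) = downloader.GetFileAndTags( url )
        
        results.append( ( temp_path, file_tags ) )
        
    return results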
class DownloaderDeviantArt( Downloader ):
    
    def __init__( self, artist ):
        
        self._gallery_url = 'http://' + artist + '.deviantart.com/gallery/?catpath=/&offset='
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url + HC.u( self._num_pages_done * 24 )
    
    def _ParseGalleryPage( self, html, url_base ):
        
        results = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbs_container = soup.find( class_ = 'zones-container' )
        
        def starts_with_thumb( classname ): return classname is not None and classname.startswith( 'thumb' )
        
        links = thumbs_container.find_all( 'a', class_ = starts_with_thumb )
        
        for link in links:
            
            try: # starts_with_thumb picks up some false positives, but they break
                
                page_url = link[ 'href' ] # something in the form of blah.da.com/art/blah-123456
                
                raw_title = link[ 'title' ] # sweet dolls by ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
                
                raw_title_reversed = raw_title[::-1] # yrleweJ ;tg& serutainiM ;tg& stfarC nasitrA ni 2102 ,92 beF ,CainegnA~ yb sllod teews
                
                ( creator_and_date_and_tags_reversed, title_reversed ) = raw_title_reversed.split( ' yb ', 1 )
                
                creator_and_date_and_tags = creator_and_date_and_tags_reversed[::-1] # ~AngeniaC, Feb 29, 2012 in Artisan Crafts > Miniatures > Jewelry
                
                ( creator_with_username_char, date_and_tags ) = creator_and_date_and_tags.split( ',', 1 )
                
                creator = creator_with_username_char[1:] # AngeniaC
                
                title = title_reversed[::-1] # sweet dolls
                
                try:
                    
                    ( date_gumpf, raw_category_tags ) = date_and_tags.split( ' in ', 1 )
                    
                    category_tags = raw_category_tags.split( ' > ' )
                    
                except Exception as e:
                    
                    HC.ShowException( e )
                    
                    category_tags = []
                    
                tags = []
                
                tags.append( 'title:' + title )
                tags.append( 'creator:' + creator )
                tags.extend( category_tags )
                
                results.append( ( page_url, tags ) )
                
            except: pass
            
        return results
        
    def _ParseImagePage( self, html ):
        
        soup = bs4.BeautifulSoup( html )
        
        # if can find download link:
        if False:
            
            pass # go fetch the popup page using tokens as appropriate. feels like it needs the GET token and a referrer, as middle click just redirects back to image page
            
        else:
            
            img = soup.find( class_ = 'dev-content-full' )
            
            src = img[ 'src' ]
            
            return src
            
    def _GetFileURL( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html )
        
    def GetFile( self, url, tags ):
        
        file_url = self._GetFileURL( url )
        
        connection = self._GetConnection( file_url )
        
        return self._DownloadFile( connection, file_url, response_to_path = True )
        
    def GetTags( self, url, tags ): return tags
class DownloaderGiphy( Downloader ):
    
    def __init__( self, tag ):
        
        self._gallery_url = 'http://giphy.com/api/gifs?tag=' + tag.replace( ' ', '+' ) + '&page='
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url + HC.u( self._num_pages_done + 1 )
    
    def _ParseGalleryPage( self, data, url_base ):
        
        json_dict = json.loads( data )
        
        if 'data' in json_dict:
            
            json_data = json_dict[ 'data' ]
            
            return [ ( d[ 'image_original_url' ], d[ 'id' ] ) for d in json_data ]
            
        else: return []
        
    def GetTags( self, url, id ):
        
        url = 'http://giphy.com/api/gifs/' + HC.u( id )
        
        connection = self._GetConnection( url )
        
        try:
            
            raw_json = connection.geturl( url )
            
            json_dict = json.loads( raw_json )
            
            tags_data = json_dict[ 'data' ][ 'tags' ]
            
            tags = [ tag_data[ 'name' ] for tag_data in tags_data ]
            
        except Exception as e:
            
            HC.ShowException( e )
            
            tags = []
            
        return tags
class DownloaderHentaiFoundry( Downloader ):
    
    def __init__( self, query_type, query, advanced_hentai_foundry_options ):
        
        self._query_type = query_type
        self._query = query
        self._advanced_hentai_foundry_options = advanced_hentai_foundry_options
        
        Downloader.__init__( self )
        
    def _EstablishSession( self, connection ):
        
        manager = HC.app.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'hentai foundry' )
        
        for ( key, value ) in cookies.items(): connection.SetCookie( key, value )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def _GetNextGalleryPageURL( self ):
        
        if self._query_type in ( 'artist', 'artist pictures' ):
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist
            
            return gallery_url + '/page/' + HC.u( self._num_pages_done + 1 )
            
        elif self._query_type == 'artist scraps':
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist + '/scraps'
            
            return gallery_url + '/page/' + HC.u( self._num_pages_done + 1 )
            
        elif self._query_type == 'tags':
            
            tags = self._query
            
            # scraps = 0 means hide
            # -1 means show both
            # 1 means scraps only. wetf
            return 'http://www.hentai-foundry.com/search/pictures?query=' + '+'.join( tags ) + '&search_in=all&scraps=-1&page=' + HC.u( self._num_pages_done + 1 )
            
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        
        soup = bs4.BeautifulSoup( html )
        
        def correct_url( href ):
            
            # a good url is in the form "/pictures/user/artist_name/file_id/title"
            if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):
                
                ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )
                
                # /pictures/user/artist_name/page/3 is a pagination link, not a picture
                if file_id != 'page': return True
                
            return False
            
        links = soup.find_all( 'a', href = correct_url )
        
        urls = [ 'http://www.hentai-foundry.com' + link['href'] for link in links ]
        
        result_urls = []
        
        for url in urls:
            
            if url not in urls_set:
                
                urls_set.add( url )
                
                result_urls.append( ( url, ) )
                
        # this is copied from old code. surely we can improve it?
        if 'class="next"' not in html: self._we_are_done = True
        
        return result_urls
        
    def _ParseImagePage( self, html, url_base ):
        
        # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
        # find http://pictures.hentai-foundry.com//
        # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144.jpg
        # the .jpg bit is what we really need, but whatever
        try:
            
            index = html.index( 'http://pictures.hentai-foundry.com//' )
            
            stuff = html[ index : index + 100 ]
            
            try: ( image_url, gumpf ) = stuff.split( '"', 1 )
            except: ( image_url, gumpf ) = stuff.split( "'", 1 )
            
        except: raise Exception( 'Could not parse image url!' )
        
        soup = bs4.BeautifulSoup( html )
        
        tags = []
        
        try:
            
            title = soup.find( 'title' )
            
            ( data, nothing ) = HC.u( title.string ).split( ' - Hentai Foundry' )
            
            data_reversed = data[::-1] # want to do it right-side first, because title might have ' by ' in it
            
            ( artist_reversed, title_reversed ) = data_reversed.split( ' yb ' )
            
            artist = artist_reversed[::-1]
            title = title_reversed[::-1]
            
            tags.append( 'creator:' + artist )
            tags.append( 'title:' + title )
            
        except: pass
        
        tag_links = soup.find_all( 'a', rel = 'tag' )
        
        for tag_link in tag_links: tags.append( tag_link.string )
        
        return ( image_url, tags )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return self._DownloadFile( connection, file_url, response_to_path = True )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        temp_path = self._DownloadFile( connection, file_url, response_to_path = True )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
    def SetupGallerySearch( self ):
        
        connection = self._GetConnection( 'http://www.hentai-foundry.com/site/filters' )
        
        cookies = connection.GetCookies()
        
        raw_csrf = cookies[ 'YII_CSRF_TOKEN' ] # YII_CSRF_TOKEN=19b05b536885ec60b8b37650a32f8deb11c08cd1s%3A40%3A%222917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32%22%3B
        
        processed_csrf = urllib.unquote( raw_csrf ) # 19b05b536885ec60b8b37650a32f8deb11c08cd1s:40:"2917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32";
        
        csrf_token = processed_csrf.split( '"' )[1] # the 2917... bit
        
        self._advanced_hentai_foundry_options[ 'YII_CSRF_TOKEN' ] = csrf_token
        
        body = urllib.urlencode( self._advanced_hentai_foundry_options )
        
        headers = {}
        headers[ 'Content-Type' ] = 'application/x-www-form-urlencoded'
        
        connection.request( 'POST', '/site/filters', headers = headers, body = body )
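# SetupGallerySearch has to run before the gallery walk, since HF filters results
# per-session. A hedged usage sketch (the keys inside the options dict are site form
# fields and are not enumerated here):
#
#   downloader = DownloaderHentaiFoundry( 'artist', 'some_artist', advanced_hentai_foundry_options )
#   downloader.SetupGallerySearch() # POSTs the filters along with the session's CSRF token
#   first_page = downloader.GetAnotherPage()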
class DownloaderNewgrounds( Downloader ):
    
    def __init__( self, query ):
        
        self._query = query
        
        Downloader.__init__( self )
        
    def _GetFileURLAndTags( self, url ):
        
        connection = self._GetConnection( url )
        
        html = connection.geturl( url )
        
        return self._ParseImagePage( html, url )
        
    def _GetNextGalleryPageURLs( self ):
        
        artist = self._query
        
        gallery_urls = []
        
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/games/' )
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/movies/' )
        
        self._we_are_done = True
        
        return gallery_urls
        
    def _ParseGalleryPage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        fatcol = soup.find( 'div', class_ = 'fatcol' )
        
        links = fatcol.find_all( 'a' )
        
        urls_set = set()
        
        result_urls = []
        
        for link in links:
            
            try:
                
                url = link[ 'href' ]
                
                if url not in urls_set:
                    
                    if url.startswith( 'http://www.newgrounds.com/portal/view/' ):
                        
                        urls_set.add( url )
                        
                        result_urls.append( ( url, ) )
                        
            except: pass
            
        return result_urls
        
    def _ParseImagePage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        tags = set()
        
        author_links = soup.find( 'ul', class_ = 'authorlinks' )
        
        if author_links is not None:
            
            authors = set()
            
            links = author_links.find_all( 'a' )
            
            for link in links:
                
                try:
                    
                    href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com
                    
                    creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )
                    
                    tags.add( u'creator:' + creator )
                    
                except: pass
                
        try:
            
            title = soup.find( 'title' )
            
            tags.add( u'title:' + title.string )
            
        except: pass
        
        all_links = soup.find_all( 'a' )
        
        for link in all_links:
            
            try:
                
                href = link[ 'href' ]
                
                if '/browse/tag/' in href: tags.add( link.string )
                
            except: pass
            
        #
        
        try:
            
            components = html.split( '"http://uploads.ungrounded.net/' )
            
            # there is sometimes another bit of api flash earlier on that we don't want
            # it is called http://uploads.ungrounded.net/apiassets/sandbox.swf
            if len( components ) == 2: flash_url = components[1]
            else: flash_url = components[2]
            
            flash_url = flash_url.split( '"', 1 )[0]
            
            flash_url = 'http://uploads.ungrounded.net/' + flash_url
            
        except: raise Exception( 'Could not find the swf file! It was probably an mp4!' )
        
        return ( flash_url, tags )
        
    def GetFile( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        return self._DownloadFile( connection, file_url, response_to_path = True )
        
    def GetFileAndTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        connection = self._GetConnection( file_url )
        
        temp_path = self._DownloadFile( connection, file_url, response_to_path = True )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
class DownloaderPixiv( Downloader ):
    
    def __init__( self, query_type, query ):
        
        self._query_type = query_type
        self._query = query
        
        Downloader.__init__( self )
        
    def _EstablishSession( self, connection ):
        
        manager = HC.app.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'pixiv' )
        
        for ( key, value ) in cookies.items(): connection.SetCookie( key, value )
        
    def _GetNextGalleryPageURL( self ):
        
        if self._query_type == 'artist':
            
            artist_id = self._query
            
            gallery_url = 'http://www.pixiv.net/member_illust.php?id=' + HC.u( artist_id )
            
        elif self._query_type == 'tag':
            
            tag = self._query
            
            tag = urllib.quote( tag.encode( 'utf-8' ) )
            
            gallery_url = 'http://www.pixiv.net/search.php?word=' + tag + '&s_mode=s_tag_full&order=date_d'
            
        return gallery_url + '&p=' + HC.u( self._num_pages_done + 1 )
        
    def _ParseGalleryPage( self, html, url_base ):
        
        results = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbnail_links = soup.find_all( class_ = 'work' )
        
        for thumbnail_link in thumbnail_links:
            
            url = urlparse.urljoin( url_base, thumbnail_link[ 'href' ] ) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
            
            results.append( ( url, ) )
            
        return results
        
    def _ParseImagePage( self, html, page_url ):
        
        soup = bs4.BeautifulSoup( html )
        
        #
        
        # this is the page that holds the full size of the image.
        # pixiv won't serve the image unless it thinks this page is the referrer
        referral_url = page_url.replace( 'medium', 'big' ) # http://www.pixiv.net/member_illust.php?mode=big&illust_id=33500690
        
        #
        
        works_display = soup.find( class_ = 'works_display' )
        
        img = works_display.find( 'img' )
        
        img_url = img[ 'src' ] # http://i2.pixiv.net/img122/img/amanekukagenoyuragi/34992468_m.png
        
        image_url = img_url.replace( '_m.', '.' ) # http://i2.pixiv.net/img122/img/amanekukagenoyuragi/34992468.png
        
        #
        
        tags = soup.find( 'ul', class_ = 'tags' )
        
        tags = [ a_item.string for a_item in tags.find_all( 'a', class_ = 'text' ) ]
        
        user = soup.find( 'h1', class_ = 'user' )
        
        tags.append( 'creator:' + user.string )
        
        title_parent = soup.find( 'section', class_ = 'work-info' )
        
        title = title_parent.find( 'h1', class_ = 'title' )
        
        tags.append( 'title:' + title.string )
        
        try: tags.append( 'creator:' + image_url.split( '/' )[ -2 ] ) # http://i2.pixiv.net/img02/img/dnosuke/462657.jpg -> dnosuke
        except: pass
        
        return ( referral_url, image_url, tags )
        
    def _GetReferralURLFileURLAndTags( self, page_url ):
        
        connection = self._GetConnection( page_url )
        
        html = connection.geturl( page_url )
        
        return self._ParseImagePage( html, page_url )
        
    def GetFile( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        connection = self._GetConnection( image_url )
        
        headers = { 'Referer' : referral_url }
        
        return self._DownloadFile( connection, image_url, headers = headers, response_to_path = True )
        
    def GetFileAndTags( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        connection = self._GetConnection( image_url )
        
        headers = { 'Referer' : referral_url }
        
        temp_path = self._DownloadFile( connection, image_url, headers = headers, response_to_path = True )
        
        return ( temp_path, tags )
        
    def GetTags( self, url ):
        
        ( referral_url, image_url, tags ) = self._GetReferralURLFileURLAndTags( url )
        
        return tags
class DownloaderTumblr( Downloader ):
    
    def __init__( self, username ):
        
        self._gallery_url = 'http://' + username + '.tumblr.com/api/read/json?start=%start%&num=50'
        
        Downloader.__init__( self )
        
    def _GetNextGalleryPageURL( self ): return self._gallery_url.replace( '%start%', HC.u( self._num_pages_done * 50 ) )
    
    def _ParseGalleryPage( self, data, url_base ):
        
        processed_raw_json = data.split( 'var tumblr_api_read = ' )[1][:-2] # -2 takes a couple newline chars off at the end
        
        json_object = json.loads( processed_raw_json )
        
        results = []
        
        if 'posts' in json_object:
            
            for post in json_object[ 'posts' ]:
                
                if 'tags' in post: tags = post[ 'tags' ]
                else: tags = []
                
                post_type = post[ 'type' ]
                
                if post_type == 'photo':
                    
                    if len( post[ 'photos' ] ) == 0:
                        
                        try: results.append( ( post[ 'photo-url-1280' ], tags ) )
                        except: pass
                        
                    else:
                        
                        for photo in post[ 'photos' ]:
                            
                            try: results.append( ( photo[ 'photo-url-1280' ], tags ) )
                            except: pass
                            
        return results
        
    def GetTags( self, url, tags ): return tags
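# The tumblr api/read/json endpoint pages by raw offset in steps of 50, so for a
# username of 'example', page two of the gallery would come from:
#
#   http://example.tumblr.com/api/read/json?start=50&num=50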
class DownloaderEngine(): # rename this to something more import related
    
    # this should be a yamlable thing
    
    def __init__( self, page_key, import_queue_generator ):
        
        self._page_key = page_key
        self._import_queue_generator = import_queue_generator
        
        self._current_queue_processor = None
        self._pending_queue_jobs = []
        
    def GetCurrentQueueProcessor( self ): return self._current_queue_processor
    
    def ToTuple( self ): return ( self._pending_queue_jobs, )
    
    def PendQueueJob( self, job ): self._pending_queue_jobs.append( job )
    
    def THREADProcessJobs( self ):
        
        while True:
            
            if len( self._pending_queue_jobs ) > 0:
                
                job = self._pending_queue_jobs.pop( 0 )
                
                self._current_queue_processor = self._import_queue_generator( job )
                
                self._current_queue_processor.ProcessQueue()
                
            else: time.sleep( 1 ) # idle politely rather than busy-wait
class ImportQueueProcessor():
    
    def __init__( self, page_key, import_args_generator ):
        
        self._page_key = page_key
        self._import_args_generator = import_args_generator
        
        self._queue_is_done = False
        self._queue = []
        self._paused = False
        
        self._current_position = 0
        
        self._lock = threading.Lock()
        
        HC.pubsub.sub( self, 'SetPaused', 'pause_import_queue_processor' )
        
    def AddToQueue( self, queue_objects ):
        
        with self._lock: self._queue.extend( queue_objects )
        
    def QueueIsDone( self ): self._queue_is_done = True
    
    def SetPaused( self, status ): self._paused = status
    
    def ToTuple( self ):
        
        with self._lock: return ( self._current_position, len( self._queue ) )
        
    def ProcessQueue( self ):
        
        while not self._queue_is_done:
            
            with self._lock: queue_length = len( self._queue )
            
            if not self._paused and self._current_position < queue_length:
                
                with self._lock: queue_object = self._queue[ self._current_position ]
                
                # reorder these params as is best
                ( temp_path, url, tags, anything_else ) = self._import_args_generator( self._page_key, queue_object )
                
                # synchronously write import to db
                
                self._current_position += 1
                
            time.sleep( 1 )
            
    def PathGeneratorBooru( self, page_key, queue_object ):
        
        # unpack queue_object
        # test url or whatever as appropriate
        # fetch file, possibly with help of downloader or whatever!
        # downloader should write file to path, returning temp_path
        # we should return temp_path
        
        pass
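# A hedged sketch of how the two classes above are meant to cooperate once finished;
# the import_queue_generator argument is whatever callable turns a pended job into an
# ImportQueueProcessor, and the job payload here is a placeholder.
def _ExampleStartEngine( page_key, import_queue_generator ):
    
    engine = DownloaderEngine( page_key, import_queue_generator )
    
    threading.Thread( target = engine.THREADProcessJobs, name = 'downloader engine' ).start()
    
    engine.PendQueueJob( 'a query or url list' )
    
    return engine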