# hydrus/include/ClientDownloading.py
import bs4
import collections
import httplib
import HydrusConstants as HC
import HydrusExceptions
import HydrusNetworking
import HydrusThreading
import json
import os
import pafy
import re
import sys
import threading
import time
import traceback
import urllib
import urlparse
import wx
import HydrusTags
import HydrusData
import HydrusFileHandling
import ClientData
import ClientConstants as CC
import HydrusGlobals

# This is fairly ugly, but it works for what I need it to do

URL_EXTRA_INFO = {}
URL_EXTRA_INFO_LOCK = threading.Lock()

def GetExtraURLInfo( url ):
    
    with URL_EXTRA_INFO_LOCK:
        
        if url in URL_EXTRA_INFO:
            return URL_EXTRA_INFO[ url ]
        else:
            return None

def SetExtraURLInfo( url, info ):
    
    with URL_EXTRA_INFO_LOCK:
        
        URL_EXTRA_INFO[ url ] = info
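
# A sketch of the intended flow (see the deviant art, giphy and tumblr parsers below):
# _ParseGalleryPage stashes per-url info with SetExtraURLInfo( file_url, tags_or_id )
# as it walks a gallery page, and a later GetTags( file_url ) call reads that stash
# back with GetExtraURLInfo.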

def ConvertServiceKeysToTagsToServiceKeysToContentUpdates( hash, service_keys_to_tags ):
    
    hashes = set( ( hash, ) )
    
    service_keys_to_content_updates = {}
    
    for ( service_key, tags ) in service_keys_to_tags.items():
        
        if service_key == CC.LOCAL_TAG_SERVICE_KEY: action = HC.CONTENT_UPDATE_ADD
        else: action = HC.CONTENT_UPDATE_PEND
        
        content_updates = [ HydrusData.ContentUpdate( HC.CONTENT_DATA_TYPE_MAPPINGS, action, ( tag, hashes ) ) for tag in tags ]
        
        service_keys_to_content_updates[ service_key ] = content_updates
    
    return service_keys_to_content_updates

def GetGalleryParser( site_type, *args ):
    
    if site_type == HC.SITE_TYPE_BOORU: c = GalleryParserBooru
    elif site_type == HC.SITE_TYPE_DEVIANT_ART: c = GalleryParserDeviantArt
    elif site_type == HC.SITE_TYPE_GIPHY: c = GalleryParserGiphy
    elif site_type == HC.SITE_TYPE_HENTAI_FOUNDRY: c = GalleryParserHentaiFoundry
    elif site_type == HC.SITE_TYPE_PIXIV: c = GalleryParserPixiv
    elif site_type == HC.SITE_TYPE_TUMBLR: c = GalleryParserTumblr
    elif site_type == HC.SITE_TYPE_NEWGROUNDS: c = GalleryParserNewgrounds
    
    return c( *args )
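
# Illustrative calls only; the extra args must match the parser __init__s below, e.g.:
# GetGalleryParser( HC.SITE_TYPE_BOORU, booru_name, tags )
# GetGalleryParser( HC.SITE_TYPE_DEVIANT_ART, artist )
# GetGalleryParser( HC.SITE_TYPE_PIXIV, query_type, query )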

def GetImageboardThreadURLs( thread_url ):
    
    try:
        
        parse_result = urlparse.urlparse( thread_url )
        
        host = parse_result.hostname
        request = parse_result.path
        
        if host is None or request is None: raise Exception()
        
    except: raise Exception( 'Could not understand that url!' )
    
    is_4chan = '4chan.org' in host
    is_8chan = '8chan.co' in host or '8ch.net' in host
    
    if not ( is_4chan or is_8chan ): raise Exception( 'This only works for 4chan and 8chan right now!' )
    
    try:
        
        # 4chan
        # /asp/thread/382059/post-your-favourite-martial-arts-video-if-martin
        # http://a.4cdn.org/asp/thread/382059.json
        # http://i.4cdn.org/asp/ for images
        
        # 8chan
        # /v/res/406061.html
        # http://8chan.co/v/res/406061.json
        # http://8chan.co/v/src/ for images
        
        if is_4chan:
            
            ( board, rest_of_request ) = request[1:].split( '/thread/', 1 )
            
            if '/' in rest_of_request: ( thread_id, gumpf ) = rest_of_request.split( '/' )
            else: thread_id = rest_of_request
            
            json_url = 'http://a.4cdn.org/' + board + '/thread/' + thread_id + '.json'
            file_base = 'http://i.4cdn.org/' + board + '/'
            
        elif is_8chan:
            
            ( board, rest_of_request ) = request[1:].split( '/res/', 1 )
            
            json_url = thread_url[:-4] + 'json'
            file_base = 'http://8ch.net/' + board + '/src/'
        
    except: raise Exception( 'Could not understand the board or thread id!' )
    
    return ( json_url, file_base )
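
# Worked example, using the 4chan sample from the comments above (host assumed to be
# boards.4chan.org):
# GetImageboardThreadURLs( 'http://boards.4chan.org/asp/thread/382059/post-your-favourite-martial-arts-video-if-martin' )
# -> ( 'http://a.4cdn.org/asp/thread/382059.json', 'http://i.4cdn.org/asp/' )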

def ConvertTagsToServiceKeysToTags( tags, advanced_tag_options ):
    
    tags = [ tag for tag in tags if tag is not None ]
    
    service_keys_to_tags = {}
    
    siblings_manager = HydrusGlobals.client_controller.GetManager( 'tag_siblings' )
    parents_manager = HydrusGlobals.client_controller.GetManager( 'tag_parents' )
    
    for ( service_key, namespaces ) in advanced_tag_options.items():
        
        if len( namespaces ) > 0:
            
            tags_to_add_here = []
            
            for namespace in namespaces:
                
                if namespace == '': tags_to_add_here.extend( [ tag for tag in tags if not ':' in tag ] )
                else: tags_to_add_here.extend( [ tag for tag in tags if tag.startswith( namespace + ':' ) ] )
            
            tags_to_add_here = HydrusTags.CleanTags( tags_to_add_here )
            
            if len( tags_to_add_here ) > 0:
                
                tags_to_add_here = siblings_manager.CollapseTags( tags_to_add_here )
                tags_to_add_here = parents_manager.ExpandTags( service_key, tags_to_add_here )
                
                service_keys_to_tags[ service_key ] = tags_to_add_here
    
    return service_keys_to_tags
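
# Sketch of the advanced_tag_options shape this expects, inferred from the loop above:
# { service_key : [ '', 'creator' ] } would keep unnamespaced tags plus 'creator:...'
# tags for that service, while a service mapped to [] gets nothing at all.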

def GetYoutubeFormats( youtube_url ):
    
    try: p = pafy.Pafy( youtube_url )
    except Exception as e:
        
        raise Exception( 'Could not fetch video info from youtube!' + os.linesep + HydrusData.ToString( e ) )
    
    info = { ( s.extension, s.resolution ) : ( s.url, s.title ) for s in p.streams if s.extension in ( 'flv', 'mp4' ) }
    
    return info
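
# The result maps ( extension, resolution ) to ( url, title ); values here illustrative:
# { ( 'mp4', '1280x720' ) : ( stream_url, title ), ( 'flv', '640x360' ) : ( stream_url, title ) }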

def THREADDownloadURL( job_key, url, url_string ):
    
    job_key.SetVariable( 'popup_text_1', url_string + ' - initialising' )
    
    def hook( gauge_range, gauge_value ):
        
        if gauge_range is None: text = url_string + ' - ' + HydrusData.ConvertIntToBytes( gauge_value )
        else: text = url_string + ' - ' + HydrusData.ConvertValueRangeToPrettyString( gauge_value, gauge_range )
        
        job_key.SetVariable( 'popup_text_1', text )
        job_key.SetVariable( 'popup_gauge_1', ( gauge_value, gauge_range ) )
    
    ( os_file_handle, temp_path ) = HydrusFileHandling.GetTempPath()
    
    try:
        
        HydrusGlobals.client_controller.DoHTTP( HC.GET, url, temp_path = temp_path, report_hooks = [ hook ] )
        
        job_key.DeleteVariable( 'popup_gauge_1' )
        job_key.SetVariable( 'popup_text_1', 'importing ' + url_string )
        
        ( result, hash ) = HydrusGlobals.client_controller.WriteSynchronous( 'import_file', temp_path )
        
    finally:
        
        HydrusFileHandling.CleanUpTempPath( os_file_handle, temp_path )
    
    if result in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
        
        if result == CC.STATUS_SUCCESSFUL:
            
            job_key.SetVariable( 'popup_text_1', url_string )
            
        else:
            
            job_key.SetVariable( 'popup_text_1', url_string + ' was already in the database!' )
        
        job_key.SetVariable( 'popup_files', { hash } )
        
    elif result == CC.STATUS_DELETED:
        
        job_key.SetVariable( 'popup_text_1', url_string + ' had already been deleted!' )
    
    job_key.Finish()

def Parse4chanPostScreen( html ):
    
    soup = bs4.BeautifulSoup( html )
    
    title_tag = soup.find( 'title' )
    
    if title_tag.string == 'Post successful!': return ( 'success', None )
    elif title_tag.string == '4chan - Banned':
        
        print( repr( soup ) )
        
        text = 'You are banned from this board! html written to log.'
        
        HydrusData.ShowText( text )
        
        return ( 'big error', text )
        
    else:
        
        try:
            
            problem_tag = soup.find( id = 'errmsg' )
            
            if problem_tag is None:
                
                try: print( repr( soup ) )
                except: pass
                
                text = 'Unknown problem; html written to log.'
                
                HydrusData.ShowText( text )
                
                return ( 'error', text )
            
            problem = HydrusData.ToString( problem_tag )
            
            if 'CAPTCHA' in problem: return ( 'captcha', None )
            elif 'seconds' in problem: return ( 'too quick', None )
            elif 'Duplicate' in problem: return ( 'error', 'duplicate file detected' )
            else: return ( 'error', problem )
            
        except: return ( 'error', 'unknown error' )

def ParsePageForURLs( html, starting_url ):
    
    soup = bs4.BeautifulSoup( html )
    
    all_links = soup.find_all( 'a' )
    
    links_with_images = [ link for link in all_links if len( link.find_all( 'img' ) ) > 0 ]
    
    urls = [ urlparse.urljoin( starting_url, link[ 'href' ] ) for link in links_with_images ]
    
    return urls

class GalleryParser( object ):
    
    def __init__( self ):
        
        self._we_are_done = False
        
        self._report_hooks = []
        
        self._all_urls_so_far = set()
    
    def _AddSessionCookies( self, request_headers ): pass
    
    def _FetchData( self, url, request_headers = None, report_hooks = None, temp_path = None ):
        
        if request_headers is None: request_headers = {}
        if report_hooks is None: report_hooks = []
        
        self._AddSessionCookies( request_headers )
        
        return HydrusGlobals.client_controller.DoHTTP( HC.GET, url, request_headers = request_headers, report_hooks = report_hooks, temp_path = temp_path )
    
    def _GetGalleryPageURL( self, page_index ):
        
        return ''
    
    def _GetGalleryPageURLs( self, page_index ):
        
        return ( self._GetGalleryPageURL( page_index ), )
    
    def _ParseGalleryPage( self, data, url ):
        
        raise NotImplementedError()
    
    def AddReportHook( self, hook ): self._report_hooks.append( hook )
    
    def ClearReportHooks( self ): self._report_hooks = []
    
    def GetFile( self, temp_path, url ): self._FetchData( url, report_hooks = self._report_hooks, temp_path = temp_path )
    
    def GetFileAndTags( self, temp_path, url ):
        
        # GetFile writes to temp_path and returns nothing, so there is no result to keep
        self.GetFile( temp_path, url )
        
        tags = self.GetTags( url )
        
        return tags
    
    def GetPage( self, page_index ):
        
        if self._we_are_done: return []
        
        gallery_urls = self._GetGalleryPageURLs( page_index )
        
        all_urls = []
        
        for gallery_url in gallery_urls:
            
            data = self._FetchData( gallery_url )
            
            page_of_urls = self._ParseGalleryPage( data, gallery_url )
            
            # stop ourselves getting into an accidental infinite loop
            
            all_urls += [ url for url in page_of_urls if url not in self._all_urls_so_far ]
            
            self._all_urls_so_far.update( page_of_urls )
        
        return all_urls
    
    def GetTags( self, url ): pass
    
    def SetupGallerySearch( self ): pass

class GalleryParserBooru( GalleryParser ):
    
    def __init__( self, booru_name, tags ):
        
        try:
            
            self._booru = HydrusGlobals.client_controller.Read( 'remote_booru', booru_name )
            
        except:
            
            raise HydrusExceptions.NotFoundException( 'Attempted to find booru "' + booru_name + '", but it was missing from the database!' )
        
        self._tags = tags
        self._gallery_advance_num = None
        
        ( self._search_url, self._advance_by_page_num, self._search_separator, self._thumb_classname ) = self._booru.GetGalleryParsingInfo()
        
        GalleryParser.__init__( self )
    
    def _GetGalleryPageURL( self, page_index ):
        
        if self._advance_by_page_num: url_index = page_index + 1
        else:
            
            if self._gallery_advance_num is None: url_index = 0
            else: url_index = page_index * self._gallery_advance_num
        
        tags_to_use = self._tags
        
        if 'e621' in self._search_url:
            
            tags_to_use = []
            
            for tag in self._tags:
                
                if '/' in tag:
                    
                    tag = tag.replace( '/', '%-2F' )
                
                tags_to_use.append( tag )
        
        return self._search_url.replace( '%tags%', self._search_separator.join( [ urllib.quote( tag.encode( 'utf-8' ), '' ) for tag in tags_to_use ] ) ).replace( '%index%', HydrusData.ToString( url_index ) )
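    
    # Illustrative substitution with a made-up gelbooru-style template: a search_url of
    # 'http://example.booru/index.php?tags=%tags%&pid=%index%' with tags [ 'blue_eyes' ]
    # and url_index 42 gives 'http://example.booru/index.php?tags=blue_eyes&pid=42'.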
    
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
        def starts_with_classname( classname ): return classname is not None and classname.startswith( self._thumb_classname )
        
        thumbnails = soup.find_all( class_ = starts_with_classname )
        
        # this is a sankaku thing
        popular_thumbnail_parent = soup.find( id = 'popular-preview' )
        
        if popular_thumbnail_parent is not None:
            
            popular_thumbnails = popular_thumbnail_parent.find_all( class_ = starts_with_classname )
            
            thumbnails = thumbnails[ len( popular_thumbnails ) : ]
        
        if self._gallery_advance_num is None:
            
            if len( thumbnails ) == 0: self._we_are_done = True
            else: self._gallery_advance_num = len( thumbnails )
        
        for thumbnail in thumbnails:
            
            links = thumbnail.find_all( 'a' )
            
            if thumbnail.name == 'a': links.append( thumbnail )
            
            for link in links:
                
                if link.string is not None and link.string == 'Image Only': continue # rule 34 @ paheal fix
                
                url = link[ 'href' ]
                
                url = urlparse.urljoin( url_base, url )
                
                if url not in urls_set:
                    
                    urls_set.add( url )
                    
                    urls.append( url )
        
        return urls
    
    def _ParseImagePage( self, html, url_base ):
        
        ( search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces ) = self._booru.GetData()
        
        soup = bs4.BeautifulSoup( html )
        
        image_base = None
        
        image_url = None
        
        try:
            
            if image_id is not None:
                
                image = soup.find( id = image_id )
                
                if image is None:
                    
                    image_string = soup.find( text = re.compile( 'Save this file' ) )
                    
                    if image_string is None: image_string = soup.find( text = re.compile( 'Save this video' ) )
                    
                    image = image_string.parent
                    
                    image_url = image[ 'href' ]
                    
                else:
                    
                    if image.name in ( 'img', 'video' ):
                        
                        image_url = image[ 'src' ]
                        
                        if 'sample/sample-' in image_url:
                            
                            # danbooru resized image
                            
                            image = soup.find( id = 'image-resize-link' )
                            
                            image_url = image[ 'href' ]
                        
                    elif image.name == 'a':
                        
                        image_url = image[ 'href' ]
            
            if image_data is not None:
                
                links = soup.find_all( 'a' )
                
                ok_link = None
                better_link = None
                
                for link in links:
                    
                    if link.string is not None:
                        
                        if link.string.startswith( image_data ):
                            
                            ok_link = link[ 'href' ]
                        
                        if link.string.startswith( 'Download PNG' ):
                            
                            better_link = link[ 'href' ]
                            
                            break
                
                if better_link is not None:
                    
                    image_url = better_link
                    
                else:
                    
                    image_url = ok_link
            
        except Exception as e:
            
            raise HydrusExceptions.NotFoundException( 'Could not parse a download link for ' + url_base + '!' + os.linesep + HydrusData.ToString( e ) )
        
        if image_url is None:
            
            raise HydrusExceptions.NotFoundException( 'Could not parse a download link for ' + url_base + '!' )
        
        image_url = urlparse.urljoin( url_base, image_url )
        
        tags = []
        
        for ( tag_classname, namespace ) in tag_classnames_to_namespaces.items():
            
            tag_list_entries = soup.find_all( class_ = tag_classname )
            
            for tag_list_entry in tag_list_entries:
                
                links = tag_list_entry.find_all( 'a' )
                
                if tag_list_entry.name == 'a': links.append( tag_list_entry )
                
                for link in links:
                    
                    if link.string not in ( '?', '-', '+' ):
                        
                        if namespace == '': tags.append( link.string )
                        else: tags.append( namespace + ':' + link.string )
        
        return ( image_url, tags )
    
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
    
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
    
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
        return tags
    
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags

class GalleryParserDeviantArt( GalleryParser ):
    
    def __init__( self, artist ):
        
        self._gallery_url = 'http://' + artist + '.deviantart.com/gallery/?catpath=/&offset='
        
        self._artist = artist
        
        GalleryParser.__init__( self )
    
    def _GetGalleryPageURL( self, page_index ):
        
        return self._gallery_url + HydrusData.ToString( page_index * 24 )
    
    def _ParseGalleryPage( self, html, url_base ):
        
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbs_container = soup.find( class_ = 'zones-container' )
        
        links = thumbs_container.find_all( 'a', class_ = 'thumb' )
        
        for link in links:
            
            url = link[ 'href' ] # something in the form of blah.da.com/art/blah-123456
            
            urls.append( url )
            
            tags = []
            
            tags.append( 'creator:' + self._artist )
            
            try: # starts_with_thumb picks up some false positives, but they break
                
                raw_title = link[ 'title' ] # sweet dolls by AngeniaC, date, blah blah blah
                
                raw_title_reversed = raw_title[::-1] # trAtnaiveD no CainegnA yb sllod teews
                
                ( creator_and_gumpf_reversed, title_reversed ) = raw_title_reversed.split( ' yb ', 1 )
                
                title = title_reversed[::-1] # sweet dolls
                
                tags.append( 'title:' + title )
                
            except: pass
            
            SetExtraURLInfo( url, tags )
        
        return urls
    
    def _ParseImagePage( self, html ):
        
        soup = bs4.BeautifulSoup( html )
        
        img = soup.find( class_ = 'dev-content-full' )
        
        if img is None:
            
            # this probably means it is mature
            # DA hide the url pretty much everywhere except the tumblr share thing
            
            a_tumblr = soup.find( id = 'gmi-ResourceViewShareTumblr' )
            
            tumblr_url = a_tumblr[ 'href' ] # http://www.tumblr.com/share/photo?source=http%3A%2F%2Fimg09.deviantart.net%2Ff19a%2Fi%2F2015%2F054%2Fe%2Fd%2Fass_by_gmgkaiser-d8j7ija.png&caption=%3Ca+href%3D%22http%3A%2F%2Fgmgkaiser.deviantart.com%2Fart%2Fass-515992726%22%3Eass%3C%2Fa%3E+by+%3Ca+href%3D%22http%3A%2F%2Fgmgkaiser.deviantart.com%2F%22%3EGMGkaiser%3C%2Fa%3E&clickthru=http%3A%2F%2Fgmgkaiser.deviantart.com%2Fart%2Fass-515992726
            
            parse_result = urlparse.urlparse( tumblr_url )
            
            query_parse_result = urlparse.parse_qs( parse_result.query )
            
            img_url = query_parse_result[ 'source' ][0] # http://img09.deviantart.net/f19a/i/2015/054/e/d/ass_by_gmgkaiser-d8j7ija.png
            
        else:
            
            img_url = img[ 'src' ]
        
        return img_url
    
    def _GetFileURL( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html )
    
    def GetFile( self, temp_path, url ):
        
        file_url = self._GetFileURL( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
    
    def GetTags( self, url ):
        
        result = GetExtraURLInfo( url )
        
        if result is None:
            
            return []
            
        else:
            
            return result

class GalleryParserGiphy( GalleryParser ):
    
    def __init__( self, tag ):
        
        self._gallery_url = 'http://giphy.com/api/gifs?tag=' + urllib.quote( tag.encode( 'utf-8' ).replace( ' ', '+' ), '' ) + '&page='
        
        GalleryParser.__init__( self )
    
    def _GetGalleryPageURL( self, page_index ):
        
        return self._gallery_url + HydrusData.ToString( page_index + 1 )
    
    def _ParseGalleryPage( self, data, url_base ):
        
        json_dict = json.loads( data )
        
        urls = []
        
        if 'data' in json_dict:
            
            json_data = json_dict[ 'data' ]
            
            for d in json_data:
                
                url = d[ 'image_original_url' ]
                id = d[ 'id' ]
                
                SetExtraURLInfo( url, id )
                
                urls.append( url )
        
        return urls
    
    def GetTags( self, url ):
        
        id = GetExtraURLInfo( url )
        
        if id is None:
            
            return []
            
        else:
            
            url = 'http://giphy.com/api/gifs/' + HydrusData.ToString( id )
            
            try:
                
                raw_json = self._FetchData( url )
                
                json_dict = json.loads( raw_json )
                
                tags_data = json_dict[ 'data' ][ 'tags' ]
                
                return [ tag_data[ 'name' ] for tag_data in tags_data ]
                
            except Exception as e:
                
                HydrusData.ShowException( e )
                
                return []

class GalleryParserHentaiFoundry( GalleryParser ):
    
    def __init__( self, query_type, query, advanced_hentai_foundry_options ):
        
        self._query_type = query_type
        self._query = query
        self._advanced_hentai_foundry_options = advanced_hentai_foundry_options
        
        GalleryParser.__init__( self )
    
    def _AddSessionCookies( self, request_headers ):
        
        manager = HydrusGlobals.client_controller.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'hentai foundry' )
        
        HydrusNetworking.AddCookiesToHeaders( cookies, request_headers )
    
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
    
    def _GetGalleryPageURL( self, page_index ):
        
        if self._query_type in ( 'artist', 'artist pictures' ):
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist
            
            return gallery_url + '/page/' + HydrusData.ToString( page_index + 1 )
            
        elif self._query_type == 'artist scraps':
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist + '/scraps'
            
            return gallery_url + '/page/' + HydrusData.ToString( page_index + 1 )
            
        elif self._query_type == 'tags':
            
            tags = self._query
            
            return 'http://www.hentai-foundry.com/search/pictures?query=' + '+'.join( tags ) + '&search_in=all&scraps=-1&page=' + HydrusData.ToString( page_index + 1 )
            
            # scraps = 0 hide
            # -1 means show both
            # 1 means scraps only. wetf
    
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        
        soup = bs4.BeautifulSoup( html )
        
        def correct_url( href ):
            
            # a good url is in the form "/pictures/user/artist_name/file_id/title"
            
            if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):
                
                ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )
                
                # /pictures/user/artist_name/page/3
                if file_id != 'page': return True
            
            return False
        
        urls = []
        
        links = soup.find_all( 'a', href = correct_url )
        
        for link in links:
            
            url = 'http://www.hentai-foundry.com' + link[ 'href' ]
            
            if url not in urls_set:
                
                urls_set.add( url )
                
                urls.append( url )
        
        # this is copied from old code. surely we can improve it?
        if 'class="next"' not in html: self._we_are_done = True
        
        return urls
    
    def _ParseImagePage( self, html, url_base ):
        
        # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
        # find http://pictures.hentai-foundry.com//
        # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144.jpg
        # the .jpg bit is what we really need, but whatever
        
        try:
            
            index = html.index( 'pictures.hentai-foundry.com' )
            
            image_url = html[ index : index + 256 ]
            
            if '"' in image_url: ( image_url, gumpf ) = image_url.split( '"', 1 )
            if '&#039;' in image_url: ( image_url, gumpf ) = image_url.split( '&#039;', 1 )
            
            image_url = 'http://' + image_url
            
        except Exception as e:
            
            raise Exception( 'Could not parse image url!' + os.linesep + HydrusData.ToString( e ) )
        
        soup = bs4.BeautifulSoup( html )
        
        tags = []
        
        try:
            
            title = soup.find( 'title' )
            
            ( data, nothing ) = HydrusData.ToString( title.string ).split( ' - Hentai Foundry' )
            
            data_reversed = data[::-1] # want to do it right-side first, because title might have ' by ' in it
            
            ( artist_reversed, title_reversed ) = data_reversed.split( ' yb ' )
            
            artist = artist_reversed[::-1]
            
            title = title_reversed[::-1]
            
            tags.append( 'creator:' + artist )
            tags.append( 'title:' + title )
            
        except: pass
        
        tag_links = soup.find_all( 'a', rel = 'tag' )
        
        for tag_link in tag_links: tags.append( tag_link.string )
        
        return ( image_url, tags )
    
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
    
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
        return tags
    
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
    
    def SetupGallerySearch( self ):
        
        manager = HydrusGlobals.client_controller.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'hentai foundry' )
        
        raw_csrf = cookies[ 'YII_CSRF_TOKEN' ] # 19b05b536885ec60b8b37650a32f8deb11c08cd1s%3A40%3A%222917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32%22%3B
        
        processed_csrf = urllib.unquote( raw_csrf ) # 19b05b536885ec60b8b37650a32f8deb11c08cd1s:40:"2917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32";
        
        csrf_token = processed_csrf.split( '"' )[1] # the 2917... bit
        
        self._advanced_hentai_foundry_options[ 'YII_CSRF_TOKEN' ] = csrf_token
        
        body = urllib.urlencode( self._advanced_hentai_foundry_options )
        
        request_headers = {}
        request_headers[ 'Content-Type' ] = 'application/x-www-form-urlencoded'
        
        self._AddSessionCookies( request_headers )
        
        HydrusGlobals.client_controller.DoHTTP( HC.POST, 'http://www.hentai-foundry.com/site/filters', request_headers = request_headers, body = body )

class GalleryParserNewgrounds( GalleryParser ):
    
    def __init__( self, query ):
        
        self._query = query
        
        GalleryParser.__init__( self )
    
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
    
    def _GetGalleryPageURLs( self, page_index ):
        
        artist = self._query
        
        gallery_urls = []
        
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/games/' )
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/movies/' )
        
        self._we_are_done = True
        
        return gallery_urls
    
    def _ParseGalleryPage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        fatcol = soup.find( 'div', class_ = 'fatcol' )
        
        links = fatcol.find_all( 'a' )
        
        urls_set = set()
        
        urls = []
        
        for link in links:
            
            try:
                
                url = link[ 'href' ]
                
                if url not in urls_set:
                    
                    if url.startswith( 'http://www.newgrounds.com/portal/view/' ):
                        
                        urls_set.add( url )
                        
                        urls.append( url )
                
            except: pass
        
        return urls
    
    def _ParseImagePage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        tags = set()
        
        author_links = soup.find( 'ul', class_ = 'authorlinks' )
        
        if author_links is not None:
            
            authors = set()
            
            links = author_links.find_all( 'a' )
            
            for link in links:
                
                try:
                    
                    href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com
                    
                    creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )
                    
                    tags.add( u'creator:' + creator )
                    
                except: pass
        
        try:
            
            title = soup.find( 'title' )
            
            tags.add( u'title:' + title.string )
            
        except: pass
        
        all_links = soup.find_all( 'a' )
        
        for link in all_links:
            
            try:
                
                href = link[ 'href' ]
                
                if '/browse/tag/' in href: tags.add( link.string )
                
            except: pass
        
        #
        
        try:
            
            components = html.split( '"http://uploads.ungrounded.net/' )
            
            # there is sometimes another bit of api flash earlier on that we don't want
            # it is called http://uploads.ungrounded.net/apiassets/sandbox.swf
            
            if len( components ) == 2: flash_url = components[1]
            else: flash_url = components[2]
            
            flash_url = flash_url.split( '"', 1 )[0]
            
            flash_url = 'http://uploads.ungrounded.net/' + flash_url
            
        except: raise Exception( 'Could not find the swf file! It was probably an mp4!' )
        
        return ( flash_url, tags )
    
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
    
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
        return tags
    
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags

class GalleryParserPixiv( GalleryParser ):
    
    def __init__( self, query_type, query ):
        
        self._query_type = query_type
        self._query = query
        
        GalleryParser.__init__( self )
    
    def _AddSessionCookies( self, request_headers ):
        
        manager = HydrusGlobals.client_controller.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'pixiv' )
        
        HydrusNetworking.AddCookiesToHeaders( cookies, request_headers )
    
    def _GetGalleryPageURL( self, page_index ):
        
        if self._query_type == 'artist_id':
            
            artist_id = self._query
            
            gallery_url = 'http://www.pixiv.net/member_illust.php?id=' + HydrusData.ToString( artist_id )
            
        elif self._query_type == 'tags':
            
            tag = self._query
            
            gallery_url = 'http://www.pixiv.net/search.php?word=' + urllib.quote( tag.encode( 'utf-8' ), '' ) + '&s_mode=s_tag_full&order=date_d'
        
        return gallery_url + '&p=' + HydrusData.ToString( page_index + 1 )
    
    def _ParseGalleryPage( self, html, url_base ):
        
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbnail_links = soup.find_all( class_ = 'work' )
        
        for thumbnail_link in thumbnail_links:
            
            url = urlparse.urljoin( url_base, thumbnail_link[ 'href' ] ) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
            
            urls.append( url )
        
        return urls
    
    def _ParseImagePage( self, html, page_url ):
        
        if 'member_illust.php?mode=manga' in html:
            
            manga_url = page_url.replace( 'medium', 'manga' )
            
            raise Exception( page_url + ' was manga, not a single image, so could not be downloaded.' )
        
        soup = bs4.BeautifulSoup( html )
        
        #
        
        # this is the page that holds the full size of the image.
        # pixiv won't serve the image unless it thinks this page is the referrer
        #referral_url = page_url.replace( 'medium', 'big' ) # http://www.pixiv.net/member_illust.php?mode=big&illust_id=33500690
        
        #
        
        original_image = soup.find( class_ = 'original-image' )
        
        image_url = original_image[ 'data-src' ] # http://i3.pixiv.net/img-original/img/2014/01/25/19/21/56/41171994_p0.jpg
        
        #
        
        tags = soup.find( 'ul', class_ = 'tagCloud' )
        
        # <a href="/member_illust.php?id=5754629&amp;tag=Ib">Ib<span class="cnt">(2)</span></a> -> Ib
        tags = [ a_item.contents[0] for a_item in tags.find_all( 'a' ) ]
        
        user = soup.find( 'h1', class_ = 'user' )
        
        tags.append( 'creator:' + user.string )
        
        title_parent = soup.find( 'section', class_ = re.compile( 'work-info' ) )
        
        title = title_parent.find( 'h1', class_ = 'title' )
        
        tags.append( 'title:' + title.string )
        
        return ( image_url, tags )
    
    def _GetFileURLAndTags( self, page_url ):
        
        html = self._FetchData( page_url )
        
        return self._ParseImagePage( html, page_url )
    
    def GetFile( self, temp_path, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        request_headers = { 'Referer' : url }
        
        self._FetchData( image_url, request_headers = request_headers, report_hooks = self._report_hooks, temp_path = temp_path )
    
    def GetFileAndTags( self, temp_path, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        request_headers = { 'Referer' : url }
        
        self._FetchData( image_url, request_headers = request_headers, report_hooks = self._report_hooks, temp_path = temp_path )
        
        return tags
    
    def GetTags( self, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags

class GalleryParserTumblr( GalleryParser ):
    
    def __init__( self, username ):
        
        self._gallery_url = 'http://' + username + '.tumblr.com/api/read/json?start=%start%&num=50'
        
        self._urls_to_tags = {}
        
        GalleryParser.__init__( self )
    
    def _GetGalleryPageURL( self, page_index ):
        
        return self._gallery_url.replace( '%start%', HydrusData.ToString( page_index * 50 ) )
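    
    # e.g. with username 'example', page_index 2 gives
    # 'http://example.tumblr.com/api/read/json?start=100&num=50'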
    
    def _ParseGalleryPage( self, data, url_base ):
        
        processed_raw_json = data.split( 'var tumblr_api_read = ' )[1][:-2] # -2 takes a couple newline chars off at the end
        
        json_object = json.loads( processed_raw_json )
        
        urls = []
        
        if 'posts' in json_object:
            
            for post in json_object[ 'posts' ]:
                
                if 'tags' in post: tags = post[ 'tags' ]
                else: tags = []
                
                post_type = post[ 'type' ]
                
                if post_type == 'photo':
                    
                    if len( post[ 'photos' ] ) == 0:
                        
                        try:
                            
                            url = post[ 'photo-url-1280' ]
                            
                            SetExtraURLInfo( url, tags )
                            
                            urls.append( url )
                            
                        except: pass
                        
                    else:
                        
                        for photo in post[ 'photos' ]:
                            
                            try:
                                
                                url = photo[ 'photo-url-1280' ]
                                
                                SetExtraURLInfo( url, tags )
                                
                                urls.append( url )
                                
                            except: pass
        
        return urls
    
    def GetTags( self, url ):
        
        result = GetExtraURLInfo( url )
        
        if result is None:
            
            return []
            
        else:
            
            return result

class ImportArgsGenerator( object ):
    
    def __init__( self, job_key, item, import_file_options ):
        
        self._job_key = job_key
        self._item = item
        self._import_file_options = import_file_options
    
    def __call__( self ):
        
        try:
            
            ( result, media_result ) = self._CheckCurrentStatus()
            
            if result == CC.STATUS_NEW:
                
                ( os_file_handle, temp_path ) = HydrusFileHandling.GetTempPath()
                
                try:
                    
                    ( name, service_keys_to_tags, url ) = self._GetArgs( temp_path )
                    
                    self._job_key.SetVariable( 'status', 'importing' )
                    
                    ( result, media_result ) = HydrusGlobals.client_controller.WriteSynchronous( 'import_file', temp_path, import_file_options = self._import_file_options, service_keys_to_tags = service_keys_to_tags, generate_media_result = True, url = url )
                    
                finally:
                    
                    HydrusFileHandling.CleanUpTempPath( os_file_handle, temp_path )
            
            self._job_key.SetVariable( 'result', result )
            
            if result in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
                
                page_key = self._job_key.GetVariable( 'page_key' )
                
                if media_result is not None and page_key is not None:
                    
                    HydrusGlobals.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
            
            self._job_key.SetVariable( 'status', '' )
            
            self._job_key.Finish()
            
            self._CleanUp()
            
        except Exception as e:
            
            self._job_key.SetVariable( 'result', CC.STATUS_FAILED )
            
            if 'name' in locals(): HydrusData.ShowText( 'There was a problem importing ' + name + '!' )
            
            HydrusData.ShowException( e )
            
            time.sleep( 5 )
            
            self._job_key.Cancel()
    
    def _CleanUp( self ): pass
    
    def _CheckCurrentStatus( self ): return ( CC.STATUS_NEW, None )
    
    def _GetArgs( self, temp_path ):
        
        raise NotImplementedError()

class ImportArgsGeneratorGallery( ImportArgsGenerator ):
    
    def __init__( self, job_key, item, import_file_options, advanced_tag_options, gallery_parsers_factory ):
        
        ImportArgsGenerator.__init__( self, job_key, item, import_file_options )
        
        self._advanced_tag_options = advanced_tag_options
        self._gallery_parsers_factory = gallery_parsers_factory
    
    def _GetArgs( self, temp_path ):
        
        url = self._item
        
        self._job_key.SetVariable( 'status', 'downloading' )
        
        gallery_parser = self._gallery_parsers_factory( 'example' )[0]
        
        def hook( gauge_range, gauge_value ):
            
            self._job_key.SetVariable( 'range', gauge_range )
            self._job_key.SetVariable( 'value', gauge_value )
        
        gallery_parser.AddReportHook( hook )
        
        do_tags = len( self._advanced_tag_options ) > 0
        
        if do_tags:
            
            tags = gallery_parser.GetFileAndTags( temp_path, url )
            
        else:
            
            gallery_parser.GetFile( temp_path, url )
            
            tags = []
        
        gallery_parser.ClearReportHooks()
        
        service_keys_to_tags = ConvertTagsToServiceKeysToTags( tags, self._advanced_tag_options )
        
        time.sleep( HC.options[ 'website_download_polite_wait' ] )
        
        return ( url, service_keys_to_tags, url )
    
    def _CheckCurrentStatus( self ):
        
        url = self._item
        
        self._job_key.SetVariable( 'status', 'checking url status' )
        
        gallery_parser = self._gallery_parsers_factory( 'example' )[0]
        
        ( status, hash ) = HydrusGlobals.client_controller.Read( 'url_status', url )
        
        if status == CC.STATUS_DELETED and not self._import_file_options[ 'exclude_deleted_files' ]: status = CC.STATUS_NEW
        
        if status == CC.STATUS_REDUNDANT:
            
            ( media_result, ) = HydrusGlobals.client_controller.Read( 'media_results', CC.LOCAL_FILE_SERVICE_KEY, ( hash, ) )
            
            do_tags = len( self._advanced_tag_options ) > 0
            
            if do_tags:
                
                tags = gallery_parser.GetTags( url )
                
                service_keys_to_tags = ConvertTagsToServiceKeysToTags( tags, self._advanced_tag_options )
                
                service_keys_to_content_updates = ConvertServiceKeysToTagsToServiceKeysToContentUpdates( hash, service_keys_to_tags )
                
                HydrusGlobals.client_controller.Write( 'content_updates', service_keys_to_content_updates )
                
                time.sleep( HC.options[ 'website_download_polite_wait' ] )
            
            return ( status, media_result )
            
        else: return ( status, None )

class ImportArgsGeneratorURLs( ImportArgsGenerator ):
    
    def _GetArgs( self, temp_path ):
        
        url = self._item
        
        self._job_key.SetVariable( 'status', 'downloading' )
        
        def hook( gauge_range, gauge_value ):
            
            self._job_key.SetVariable( 'range', gauge_range )
            self._job_key.SetVariable( 'value', gauge_value )
        
        HydrusGlobals.client_controller.DoHTTP( HC.GET, url, report_hooks = [ hook ], temp_path = temp_path )
        
        service_keys_to_tags = {}
        
        return ( url, service_keys_to_tags, url )
    
    def _CheckCurrentStatus( self ):
        
        url = self._item
        
        self._job_key.SetVariable( 'status', 'checking url status' )
        
        ( status, hash ) = HydrusGlobals.client_controller.Read( 'url_status', url )
        
        if status == CC.STATUS_DELETED and not self._import_file_options[ 'exclude_deleted_files' ]: status = CC.STATUS_NEW
        
        if status == CC.STATUS_REDUNDANT:
            
            ( media_result, ) = HydrusGlobals.client_controller.Read( 'media_results', CC.LOCAL_FILE_SERVICE_KEY, ( hash, ) )
            
            return ( status, media_result )
            
        else: return ( status, None )

class ImportController( object ):
    
    def __init__( self, import_args_generator_factory, import_queue_builder_factory, page_key = None ):
        
        self._controller_job_key = self._GetNewJobKey( 'controller' )
        
        self._import_args_generator_factory = import_args_generator_factory
        self._import_queue_builder_factory = import_queue_builder_factory
        self._page_key = page_key
        
        self._import_job_key = self._GetNewJobKey( 'import' )
        self._import_queue_job_key = self._GetNewJobKey( 'import_queue' )
        self._import_queue_builder_job_key = self._GetNewJobKey( 'import_queue_builder' )
        
        self._pending_import_queue_jobs = []
        
        self._lock = threading.Lock()
    
    def _GetNewJobKey( self, job_type ):
        
        job_key = HydrusThreading.JobKey( pausable = True, cancellable = True )
        
        if job_type == 'controller':
            
            result_counts = {}
            
            result_counts[ CC.STATUS_SUCCESSFUL ] = 0
            result_counts[ CC.STATUS_FAILED ] = 0
            result_counts[ CC.STATUS_DELETED ] = 0
            result_counts[ CC.STATUS_REDUNDANT ] = 0
            
            job_key.SetVariable( 'result_counts', result_counts )
            
        else:
            
            job_key.SetVariable( 'status', '' )
            
            if job_type == 'import':
                
                job_key.SetVariable( 'page_key', self._page_key )
                job_key.SetVariable( 'range', 1 )
                job_key.SetVariable( 'value', 0 )
                
            elif job_type == 'import_queue':
                
                job_key.SetVariable( 'queue_position', 0 )
                
            elif job_type == 'import_queue_builder':
                
                job_key.SetVariable( 'queue', [] )
        
        return job_key
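    
    # How the four job keys divide the work, per the setup above:
    # 'controller' aggregates result_counts; 'import' tracks the current file
    # ( page_key, range, value ); 'import_queue' tracks queue_position; and
    # 'import_queue_builder' owns the growing url list in 'queue'.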
    
    def CleanBeforeDestroy( self ): self._controller_job_key.Cancel()
    
    def GetJobKey( self, job_type ):
        
        with self._lock:
            
            if job_type == 'controller': return self._controller_job_key
            elif job_type == 'import': return self._import_job_key
            elif job_type == 'import_queue': return self._import_queue_job_key
            elif job_type == 'import_queue_builder': return self._import_queue_builder_job_key
    
    def GetPendingImportQueueJobs( self ):
        
        with self._lock: return self._pending_import_queue_jobs
    
    def PendImportQueueJob( self, job ):
        
        with self._lock: self._pending_import_queue_jobs.append( job )
    
    def RemovePendingImportQueueJob( self, job ):
        
        with self._lock:
            
            if job in self._pending_import_queue_jobs: self._pending_import_queue_jobs.remove( job )
    
    def MovePendingImportQueueJobUp( self, job ):
        
        with self._lock:
            
            if job in self._pending_import_queue_jobs:
                
                index = self._pending_import_queue_jobs.index( job )
                
                if index > 0:
                    
                    self._pending_import_queue_jobs.remove( job )
                    
                    self._pending_import_queue_jobs.insert( index - 1, job )
    
    def MovePendingImportQueueJobDown( self, job ):
        
        with self._lock:
            
            if job in self._pending_import_queue_jobs:
                
                index = self._pending_import_queue_jobs.index( job )
                
                if index + 1 < len( self._pending_import_queue_jobs ):
                    
                    self._pending_import_queue_jobs.remove( job )
                    
                    self._pending_import_queue_jobs.insert( index + 1, job )
    
    def MainLoop( self ):
        
        try:
            
            while not self._controller_job_key.IsDone():
                
                create_import_item = False
                create_import_queue_item = False
                
                while self._controller_job_key.IsPaused():
                    
                    time.sleep( 0.1 )
                    
                    self._import_job_key.Pause()
                    self._import_queue_job_key.Pause()
                    self._import_queue_builder_job_key.Pause()
                    
                    if HydrusGlobals.view_shutdown or self._controller_job_key.IsDone(): break
                
                if HydrusGlobals.view_shutdown or self._controller_job_key.IsDone(): break
                
                with self._lock:
                    
                    queue_position = self._import_queue_job_key.GetVariable( 'queue_position' )
                    
                    queue = self._import_queue_builder_job_key.GetVariable( 'queue' )
                    
                    if self._import_job_key.IsDone():
                        
                        result = self._import_job_key.GetVariable( 'result' )
                        
                        result_counts = self._controller_job_key.GetVariable( 'result_counts' )
                        
                        result_counts[ result ] += 1
                        
                        self._import_job_key = self._GetNewJobKey( 'import' )
                        
                        queue_position += 1
                        
                        self._import_queue_job_key.SetVariable( 'queue_position', queue_position )
                    
                    position_string = HydrusData.ConvertValueRangeToPrettyString( queue_position + 1, len( queue ) )
                    
                    if self._import_queue_job_key.IsPaused(): self._import_queue_job_key.SetVariable( 'status', 'paused at ' + position_string )
                    elif self._import_queue_job_key.IsWorking():
                        
                        if self._import_job_key.IsWorking():
                            
                            self._import_queue_job_key.SetVariable( 'status', 'processing ' + position_string )
                            
                        else:
                            
                            if queue_position < len( queue ):
                                
                                self._import_queue_job_key.SetVariable( 'status', 'preparing ' + position_string )
                                
                                self._import_job_key.Begin()
                                
                                import_item = queue[ queue_position ]
                                
                                create_import_item = True
                                
                            else:
                                
                                if self._import_queue_builder_job_key.IsWorking(): self._import_queue_job_key.SetVariable( 'status', 'waiting for more items' )
                                else: self._import_queue_job_key.Finish()
                        
                    else:
                        
                        if self._import_queue_job_key.IsDone():
                            
                            if self._import_queue_job_key.IsCancelled(): status = 'cancelled at ' + position_string
                            else: status = 'done'
                            
                            self._import_queue_job_key = self._GetNewJobKey( 'import_queue' )
                            
                            self._import_queue_builder_job_key = self._GetNewJobKey( 'import_queue_builder' )
                            
                        else: status = ''
                        
                        self._import_queue_job_key.SetVariable( 'status', status )
                        
                        if len( self._pending_import_queue_jobs ) > 0:
                            
                            self._import_queue_job_key.Begin()
                            self._import_queue_builder_job_key.Begin()
                            
                            queue_item = self._pending_import_queue_jobs.pop( 0 )
                            
                            create_import_queue_item = True
                
                # This is outside the lock, as it may call wx-blocking stuff, and other wx bits will sometimes wait on the lock
                
                if create_import_item:
                    
                    args_generator = self._import_args_generator_factory( self._import_job_key, import_item )
                    
                    HydrusGlobals.client_controller.CallToThread( args_generator )
                
                if create_import_queue_item:
                    
                    queue_builder = self._import_queue_builder_factory( self._import_queue_builder_job_key, queue_item )
                    
                    # make it a daemon, not a thread job, as it has a loop!
                    threading.Thread( target = queue_builder ).start()
                
                time.sleep( 0.05 )
            
        except Exception as e:
            
            HydrusData.ShowException( e )
            
        finally:
            
            self._import_job_key.Cancel()
            self._import_queue_job_key.Cancel()
            self._import_queue_builder_job_key.Cancel()
    
    def StartDaemon( self ): threading.Thread( target = self.MainLoop ).start()

class ImportQueueBuilder( object ):
    
    def __init__( self, job_key, item ):
        
        self._job_key = job_key
        self._item = item
    
    def __call__( self ):
        
        queue = self._item
        
        self._job_key.SetVariable( 'queue', queue )
        
        self._job_key.Finish()

class ImportQueueBuilderGallery( ImportQueueBuilder ):
    
    def __init__( self, job_key, item, gallery_parsers_factory ):
        
        ImportQueueBuilder.__init__( self, job_key, item )
        
        self._gallery_parsers_factory = gallery_parsers_factory
    
    def __call__( self ):
        
        try:
            
            ( raw_query, self._get_tags_if_redundant, self._file_limit ) = self._item
            
            gallery_parsers = list( self._gallery_parsers_factory( raw_query ) )
            
            gallery_parsers[0].SetupGallerySearch() # for now this is cookie-based for hf, so only have to do it on one
            
            total_urls_found = 0
            num_pages_found = 0
            
            page_index = 0
            
            first_run = True
            
            while True:
                
                gallery_parsers_to_remove = []
                
                for gallery_parser in gallery_parsers:
                    
                    urls_in_pages = HydrusData.ConvertIntToPrettyString( total_urls_found ) + ' urls in ' + HydrusData.ConvertIntToPrettyString( num_pages_found ) + ' pages'
                    
                    while self._job_key.IsPaused():
                        
                        time.sleep( 0.1 )
                        
                        self._job_key.SetVariable( 'status', 'paused after finding ' + urls_in_pages )
                        
                        if HydrusGlobals.view_shutdown or self._job_key.IsDone(): break
                    
                    if HydrusGlobals.view_shutdown or self._job_key.IsDone(): break
                    
                    self._job_key.SetVariable( 'status', 'found ' + urls_in_pages + '.' )
                    
                    if first_run: first_run = False
                    else: time.sleep( HC.options[ 'website_download_polite_wait' ] )
                    
                    self._job_key.SetVariable( 'status', 'found ' + urls_in_pages + '. looking for next page' )
                    
                    page_of_urls = gallery_parser.GetPage( page_index )
                    
                    if len( page_of_urls ) == 0: gallery_parsers_to_remove.append( gallery_parser )
                    else:
                        
                        queue = self._job_key.GetVariable( 'queue' )
                        
                        queue = list( queue )
                        
                        if self._file_limit is not None:
                            
                            while len( page_of_urls ) > 0 and total_urls_found < self._file_limit:
                                
                                url = page_of_urls.pop( 0 )
                                
                                queue.append( url )
                                
                                total_urls_found += 1
                            
                        else:
                            
                            queue.extend( page_of_urls )
                            
                            total_urls_found += len( page_of_urls )
                        
                        self._job_key.SetVariable( 'queue', queue )
                        
                        num_pages_found += 1
                        
                        urls_in_pages = HydrusData.ConvertIntToPrettyString( total_urls_found ) + ' urls in ' + HydrusData.ConvertIntToPrettyString( num_pages_found ) + ' pages'
                
                for gallery_parser in gallery_parsers_to_remove: gallery_parsers.remove( gallery_parser )
                
                if len( gallery_parsers ) == 0: break
                
                while self._job_key.IsPaused():
                    
                    time.sleep( 0.1 )
                    
                    self._job_key.SetVariable( 'status', 'paused after finding ' + urls_in_pages )
                    
                    if HydrusGlobals.view_shutdown or self._job_key.IsDone(): break
                
                if HydrusGlobals.view_shutdown or self._job_key.IsDone(): break
                
                if self._file_limit is not None and total_urls_found >= self._file_limit: break
                
                page_index += 1
            
            self._job_key.SetVariable( 'status', 'finished. found ' + urls_in_pages )
            
            time.sleep( HC.options[ 'website_download_polite_wait' ] )
            
            self._job_key.SetVariable( 'status', '' )
            
        except Exception as e:
            
            self._job_key.SetVariable( 'status', HydrusData.ToString( e ) )
            
            HydrusData.ShowException( e )
            
            time.sleep( 5 )
            
        finally: self._job_key.Finish()

class ImportQueueBuilderURLs( ImportQueueBuilder ):
    
    def __call__( self ):
        
        try:
            
            ( url, get_tags_if_redundant, file_limit ) = self._item
            
            self._job_key.SetVariable( 'status', 'Connecting to address' )
            
            try: html = HydrusGlobals.client_controller.DoHTTP( HC.GET, url )
            except: raise Exception( 'Could not download that url' )
            
            self._job_key.SetVariable( 'status', 'parsing html' )
            
            try: urls = ParsePageForURLs( html, url )
            except: raise Exception( 'Could not parse that URL\'s html' )
            
            queue = urls
            
            self._job_key.SetVariable( 'queue', queue )
            
        except Exception as e:
            
            self._job_key.SetVariable( 'status', HydrusData.ToString( e ) )
            
            HydrusData.ShowException( e )
            
            time.sleep( 5 )
            
        finally: self._job_key.Finish()