import bs4
import collections
import httplib
import HydrusConstants as HC
import HydrusExceptions
import HydrusNetworking
import HydrusThreading
import json
import os
import pafy
import re
import sys
import threading
import time
import traceback
import urllib
import urlparse
import wx
import HydrusTags
import HydrusData
import HydrusFileHandling
import ClientData
import ClientConstants as CC
import HydrusGlobals

# This is fairly ugly, but it works for what I need it to do

URL_EXTRA_INFO = {}
URL_EXTRA_INFO_LOCK = threading.Lock()

def GetExtraURLInfo( url ):
    
    with URL_EXTRA_INFO_LOCK:
        
        if url in URL_EXTRA_INFO:
            
            return URL_EXTRA_INFO[ url ]
            
        else:
            
            return None
            
        
    

def SetExtraURLInfo( url, info ):
    
    with URL_EXTRA_INFO_LOCK:
        
        URL_EXTRA_INFO[ url ] = info
        
    
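# A minimal usage sketch of the extra-info cache above (urls hypothetical):
# a gallery parser stashes per-url info as it parses a gallery page, and a
# later GetTags call retrieves it without refetching anything.
#
#   SetExtraURLInfo( 'http://example.com/file/123.jpg', [ 'tag one', 'tag two' ] )
#   GetExtraURLInfo( 'http://example.com/file/123.jpg' ) # -> [ 'tag one', 'tag two' ]
#   GetExtraURLInfo( 'http://example.com/unseen.jpg' ) # -> None
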
def ConvertServiceKeysToTagsToServiceKeysToContentUpdates( hash, service_keys_to_tags ):
    
    hashes = set( ( hash, ) )
    
    service_keys_to_content_updates = {}
    
    for ( service_key, tags ) in service_keys_to_tags.items():
        
        if service_key == CC.LOCAL_TAG_SERVICE_KEY: action = HC.CONTENT_UPDATE_ADD
        else: action = HC.CONTENT_UPDATE_PEND
        
        content_updates = [ HydrusData.ContentUpdate( HC.CONTENT_DATA_TYPE_MAPPINGS, action, ( tag, hashes ) ) for tag in tags ]
        
        service_keys_to_content_updates[ service_key ] = content_updates
        
    
    return service_keys_to_content_updates
    
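# A sketch of the mapping this performs, with a hypothetical remote_key:
# local tags are added outright, while tags for any other (i.e. remote)
# service are only pended for upload.
#
#   service_keys_to_tags = { CC.LOCAL_TAG_SERVICE_KEY : { 'blue eyes' }, remote_key : { 'blue eyes' } }
#   -> { CC.LOCAL_TAG_SERVICE_KEY : [ ContentUpdate( MAPPINGS, CONTENT_UPDATE_ADD, ( 'blue eyes', { hash } ) ) ],
#        remote_key : [ ContentUpdate( MAPPINGS, CONTENT_UPDATE_PEND, ( 'blue eyes', { hash } ) ) ] }
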
def GetGalleryParser( site_type, *args ):
    
    if site_type == HC.SITE_TYPE_BOORU: c = GalleryParserBooru
    elif site_type == HC.SITE_TYPE_DEVIANT_ART: c = GalleryParserDeviantArt
    elif site_type == HC.SITE_TYPE_GIPHY: c = GalleryParserGiphy
    elif site_type == HC.SITE_TYPE_HENTAI_FOUNDRY: c = GalleryParserHentaiFoundry
    elif site_type == HC.SITE_TYPE_PIXIV: c = GalleryParserPixiv
    elif site_type == HC.SITE_TYPE_TUMBLR: c = GalleryParserTumblr
    elif site_type == HC.SITE_TYPE_NEWGROUNDS: c = GalleryParserNewgrounds
    
    return c( *args )
    
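# A usage sketch for the factory above; the booru name and tag list are
# hypothetical, but the argument shapes match the __init__ signatures of the
# parser classes defined below.
#
#   parser = GetGalleryParser( HC.SITE_TYPE_BOORU, 'some booru', [ 'blue_sky' ] )
#   urls = parser.GetPage( 0 )
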
def GetImageboardThreadURLs( thread_url ):
    
    try:
        
        parse_result = urlparse.urlparse( thread_url )
        
        host = parse_result.hostname
        request = parse_result.path
        
        if host is None or request is None: raise Exception()
        
    except: raise Exception( 'Could not understand that url!' )
    
    is_4chan = '4chan.org' in host
    is_8chan = '8chan.co' in host or '8ch.net' in host
    
    if not ( is_4chan or is_8chan ): raise Exception( 'This only works for 4chan and 8chan right now!' )
    
    try:
        
        # 4chan
        # /asp/thread/382059/post-your-favourite-martial-arts-video-if-martin
        # http://a.4cdn.org/asp/thread/382059.json
        # http://i.4cdn.org/asp/ for images
        
        # 8chan
        # /v/res/406061.html
        # http://8chan.co/v/res/406061.json
        # http://8chan.co/v/src/ for images
        
        if is_4chan:
            
            ( board, rest_of_request ) = request[1:].split( '/thread/', 1 )
            
            if '/' in rest_of_request: ( thread_id, gumpf ) = rest_of_request.split( '/', 1 )
            else: thread_id = rest_of_request
            
            json_url = 'http://a.4cdn.org/' + board + '/thread/' + thread_id + '.json'
            file_base = 'http://i.4cdn.org/' + board + '/'
            
        elif is_8chan:
            
            ( board, rest_of_request ) = request[1:].split( '/res/', 1 )
            
            json_url = thread_url[:-4] + 'json'
            file_base = 'http://8ch.net/' + board + '/src/'
            
        
    except: raise Exception( 'Could not understand the board or thread id!' )
    
    return ( json_url, file_base )
    
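# A sketch of the conversion this performs, using the example urls from the
# comments above (the 4chan hostname is a typical assumption):
#
#   GetImageboardThreadURLs( 'http://boards.4chan.org/asp/thread/382059/post-your-favourite-martial-arts-video-if-martin' )
#   -> ( 'http://a.4cdn.org/asp/thread/382059.json', 'http://i.4cdn.org/asp/' )
#
#   GetImageboardThreadURLs( 'http://8ch.net/v/res/406061.html' )
#   -> ( 'http://8ch.net/v/res/406061.json', 'http://8ch.net/v/src/' )
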
def ConvertTagsToServiceKeysToTags( tags, advanced_tag_options ):
    
    tags = [ tag for tag in tags if tag is not None ]
    
    service_keys_to_tags = {}
    
    siblings_manager = HydrusGlobals.client_controller.GetManager( 'tag_siblings' )
    parents_manager = HydrusGlobals.client_controller.GetManager( 'tag_parents' )
    
    for ( service_key, namespaces ) in advanced_tag_options.items():
        
        if len( namespaces ) > 0:
            
            tags_to_add_here = []
            
            for namespace in namespaces:
                
                if namespace == '': tags_to_add_here.extend( [ tag for tag in tags if ':' not in tag ] )
                else: tags_to_add_here.extend( [ tag for tag in tags if tag.startswith( namespace + ':' ) ] )
                
            
            tags_to_add_here = HydrusTags.CleanTags( tags_to_add_here )
            
            if len( tags_to_add_here ) > 0:
                
                tags_to_add_here = siblings_manager.CollapseTags( tags_to_add_here )
                tags_to_add_here = parents_manager.ExpandTags( service_key, tags_to_add_here )
                
                service_keys_to_tags[ service_key ] = tags_to_add_here
                
            
        
    
    return service_keys_to_tags
    
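# A sketch of how advanced_tag_options drives the filtering above; the
# service keys are hypothetical placeholders. An empty-string namespace means
# 'unnamespaced tags', and a service with an empty namespace list gets nothing.
#
#   tags = [ 'blue sky', 'creator:someone' ]
#   advanced_tag_options = { local_key : [ '', 'creator' ], remote_key : [ 'creator' ] }
#   -> roughly { local_key : { 'blue sky', 'creator:someone' }, remote_key : { 'creator:someone' } }
#
# (subject to whatever the sibling and parent managers then collapse or expand)
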
def GetYoutubeFormats( youtube_url ):
    
    try: p = pafy.Pafy( youtube_url )
    except Exception as e:
        
        raise Exception( 'Could not fetch video info from youtube!' + os.linesep + HydrusData.ToString( e ) )
        
    
    info = { ( s.extension, s.resolution ) : ( s.url, s.title ) for s in p.streams if s.extension in ( 'flv', 'mp4' ) }
    
    return info
    
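# The dict returned above keys on ( extension, resolution ) and maps to
# ( stream url, title ); e.g. (values hypothetical):
#
#   { ( 'mp4', '1280x720' ) : ( 'http://.../videoplayback?...', 'some video title' ),
#     ( 'flv', '640x360' ) : ( 'http://.../videoplayback?...', 'some video title' ) }
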
def THREADDownloadURL( job_key, url, url_string ):
    
    job_key.SetVariable( 'popup_text_1', url_string + ' - initialising' )
    
    def hook( gauge_range, gauge_value ):
        
        if gauge_range is None: text = url_string + ' - ' + HydrusData.ConvertIntToBytes( gauge_value )
        else: text = url_string + ' - ' + HydrusData.ConvertValueRangeToPrettyString( gauge_value, gauge_range )
        
        job_key.SetVariable( 'popup_text_1', text )
        job_key.SetVariable( 'popup_gauge_1', ( gauge_value, gauge_range ) )
        
    
    ( os_file_handle, temp_path ) = HydrusFileHandling.GetTempPath()
    
    try:
        
        HydrusGlobals.client_controller.DoHTTP( HC.GET, url, temp_path = temp_path, report_hooks = [ hook ] )
        
        job_key.DeleteVariable( 'popup_gauge_1' )
        job_key.SetVariable( 'popup_text_1', 'importing ' + url_string )
        
        ( result, hash ) = HydrusGlobals.client_controller.WriteSynchronous( 'import_file', temp_path )
        
    finally:
        
        HydrusFileHandling.CleanUpTempPath( os_file_handle, temp_path )
        
    
    if result in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
        
        if result == CC.STATUS_SUCCESSFUL:
            
            job_key.SetVariable( 'popup_text_1', url_string )
            
        else:
            
            job_key.SetVariable( 'popup_text_1', url_string + ' was already in the database!' )
            
        
        job_key.SetVariable( 'popup_files', { hash } )
        
    elif result == CC.STATUS_DELETED:
        
        job_key.SetVariable( 'popup_text_1', url_string + ' had already been deleted!' )
        
    
    job_key.Finish()
    
def Parse4chanPostScreen( html ):
    
    soup = bs4.BeautifulSoup( html )
    
    title_tag = soup.find( 'title' )
    
    if title_tag.string == 'Post successful!': return ( 'success', None )
    elif title_tag.string == '4chan - Banned':
        
        print( repr( soup ) )
        
        text = 'You are banned from this board! html written to log.'
        
        HydrusData.ShowText( text )
        
        return ( 'big error', text )
        
    else:
        
        try:
            
            problem_tag = soup.find( id = 'errmsg' )
            
            if problem_tag is None:
                
                try: print( repr( soup ) )
                except: pass
                
                text = 'Unknown problem; html written to log.'
                
                HydrusData.ShowText( text )
                
                return ( 'error', text )
                
            
            problem = HydrusData.ToString( problem_tag )
            
            if 'CAPTCHA' in problem: return ( 'captcha', None )
            elif 'seconds' in problem: return ( 'too quick', None )
            elif 'Duplicate' in problem: return ( 'error', 'duplicate file detected' )
            else: return ( 'error', problem )
            
        except: return ( 'error', 'unknown error' )
        
    
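# Parse4chanPostScreen returns a ( status, info ) pair; from the branches
# above, status is one of 'success', 'captcha', 'too quick', 'error' or
# 'big error', with info carrying the error text where there is any.
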
def ParsePageForURLs( html, starting_url ):
    
    soup = bs4.BeautifulSoup( html )
    
    all_links = soup.find_all( 'a' )
    
    links_with_images = [ link for link in all_links if len( link.find_all( 'img' ) ) > 0 ]
    
    urls = [ urlparse.urljoin( starting_url, link[ 'href' ] ) for link in links_with_images ]
    
    return urls
    
class GalleryParser( object ):
    
    def __init__( self ):
        
        self._we_are_done = False
        
        self._report_hooks = []
        
        self._all_urls_so_far = set()
        
    
    def _AddSessionCookies( self, request_headers ): pass
    
    def _FetchData( self, url, request_headers = None, report_hooks = None, temp_path = None ):
        
        if request_headers is None: request_headers = {}
        if report_hooks is None: report_hooks = []
        
        self._AddSessionCookies( request_headers )
        
        return HydrusGlobals.client_controller.DoHTTP( HC.GET, url, request_headers = request_headers, report_hooks = report_hooks, temp_path = temp_path )
        
    
    def _GetGalleryPageURL( self, page_index ):
        
        return ''
        
    
    def _GetGalleryPageURLs( self, page_index ):
        
        return ( self._GetGalleryPageURL( page_index ), )
        
    
    def _ParseGalleryPage( self, data, url ):
        
        raise NotImplementedError()
        
    
    def AddReportHook( self, hook ): self._report_hooks.append( hook )
    
    def ClearReportHooks( self ): self._report_hooks = []
    
    def GetFile( self, temp_path, url ): self._FetchData( url, report_hooks = self._report_hooks, temp_path = temp_path )
    
    def GetFileAndTags( self, temp_path, url ):
        
        self.GetFile( temp_path, url )
        
        tags = self.GetTags( url )
        
        return tags
        
    
    def GetPage( self, page_index ):
        
        if self._we_are_done: return []
        
        gallery_urls = self._GetGalleryPageURLs( page_index )
        
        all_urls = []
        
        for gallery_url in gallery_urls:
            
            data = self._FetchData( gallery_url )
            
            page_of_urls = self._ParseGalleryPage( data, gallery_url )
            
            # stop ourselves getting into an accidental infinite loop
            
            all_urls += [ url for url in page_of_urls if url not in self._all_urls_so_far ]
            
            self._all_urls_so_far.update( page_of_urls )
            
        
        return all_urls
        
    
    def GetTags( self, url ): pass
    
    def SetupGallerySearch( self ): pass
    
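# A minimal sketch of the subclass contract above: a concrete parser
# overrides _GetGalleryPageURL (or _GetGalleryPageURLs) and _ParseGalleryPage,
# and usually GetFile/GetTags too. A caller then pages through like this
# (the artist name is hypothetical; the hook just receives download progress):
#
#   parser = GalleryParserDeviantArt( 'some_artist' )
#   parser.AddReportHook( lambda gauge_range, gauge_value: None )
#   page_index = 0
#   while True:
#       urls = parser.GetPage( page_index )
#       if len( urls ) == 0: break
#       page_index += 1
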
class GalleryParserBooru( GalleryParser ):
    
    def __init__( self, booru_name, tags ):
        
        try:
            
            self._booru = HydrusGlobals.client_controller.Read( 'remote_booru', booru_name )
            
        except:
            
            raise HydrusExceptions.NotFoundException( 'Attempted to find booru "' + booru_name + '", but it was missing from the database!' )
            
        
        self._tags = tags
        
        self._gallery_advance_num = None
        
        ( self._search_url, self._advance_by_page_num, self._search_separator, self._thumb_classname ) = self._booru.GetGalleryParsingInfo()
        
        GalleryParser.__init__( self )
        
    
    def _GetGalleryPageURL( self, page_index ):
        
        if self._advance_by_page_num: url_index = page_index + 1
        else:
            
            if self._gallery_advance_num is None: url_index = 0
            else: url_index = page_index * self._gallery_advance_num
            
        
        tags_to_use = self._tags
        
        if 'e621' in self._search_url:
            
            tags_to_use = []
            
            for tag in self._tags:
                
                if '/' in tag:
                    
                    tag = tag.replace( '/', '%-2F' )
                    
                
                tags_to_use.append( tag )
                
            
        
        return self._search_url.replace( '%tags%', self._search_separator.join( [ urllib.quote( tag.encode( 'utf-8' ), '' ) for tag in tags_to_use ] ) ).replace( '%index%', HydrusData.ToString( url_index ) )
        
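    # A sketch of the template substitution above, with a hypothetical search
    # url: the booru supplies a template with %tags% and %index% placeholders,
    # and the method fills them in for the requested page.
    #
    #   self._search_url = 'http://booru.example.com/index.php?tags=%tags%&pid=%index%'
    #   self._tags = [ 'blue_sky', 'cloud' ], self._search_separator = '+'
    #   _GetGalleryPageURL( 0 ) # -> 'http://booru.example.com/index.php?tags=blue_sky+cloud&pid=0'
    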
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
        
        def starts_with_classname( classname ): return classname is not None and classname.startswith( self._thumb_classname )
        
        thumbnails = soup.find_all( class_ = starts_with_classname )
        
        # this is a sankaku thing
        
        popular_thumbnail_parent = soup.find( id = 'popular-preview' )
        
        if popular_thumbnail_parent is not None:
            
            popular_thumbnails = popular_thumbnail_parent.find_all( class_ = starts_with_classname )
            
            thumbnails = thumbnails[ len( popular_thumbnails ) : ]
            
        
        if self._gallery_advance_num is None:
            
            if len( thumbnails ) == 0: self._we_are_done = True
            else: self._gallery_advance_num = len( thumbnails )
            
        
        for thumbnail in thumbnails:
            
            links = thumbnail.find_all( 'a' )
            
            if thumbnail.name == 'a': links.append( thumbnail )
            
            for link in links:
                
                if link.string is not None and link.string == 'Image Only': continue # rule 34 @ paheal fix
                
                url = link[ 'href' ]
                
                url = urlparse.urljoin( url_base, url )
                
                if url not in urls_set:
                    
                    urls_set.add( url )
                    urls.append( url )
                    
                
            
        
        return urls
        
    
    def _ParseImagePage( self, html, url_base ):
        
        ( search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces ) = self._booru.GetData()
        
        soup = bs4.BeautifulSoup( html )
        
        image_base = None
        
        image_url = None
        
        try:
            
            if image_id is not None:
                
                image = soup.find( id = image_id )
                
                if image is None:
                    
                    image_string = soup.find( text = re.compile( 'Save this file' ) )
                    
                    if image_string is None: image_string = soup.find( text = re.compile( 'Save this video' ) )
                    
                    image = image_string.parent
                    
                    image_url = image[ 'href' ]
                    
                else:
                    
                    if image.name in ( 'img', 'video' ):
                        
                        image_url = image[ 'src' ]
                        
                        if 'sample/sample-' in image_url:
                            
                            # danbooru resized image
                            
                            image = soup.find( id = 'image-resize-link' )
                            
                            image_url = image[ 'href' ]
                            
                        
                    elif image.name == 'a':
                        
                        image_url = image[ 'href' ]
                        
                    
                
            
            if image_data is not None:
                
                links = soup.find_all( 'a' )
                
                ok_link = None
                better_link = None
                
                for link in links:
                    
                    if link.string is not None:
                        
                        if link.string.startswith( image_data ):
                            
                            ok_link = link[ 'href' ]
                            
                        
                        if link.string.startswith( 'Download PNG' ):
                            
                            better_link = link[ 'href' ]
                            
                            break
                            
                        
                    
                
                if better_link is not None:
                    
                    image_url = better_link
                    
                else:
                    
                    image_url = ok_link
                    
                
            
        except Exception as e:
            
            raise HydrusExceptions.NotFoundException( 'Could not parse a download link for ' + url_base + '!' + os.linesep + HydrusData.ToString( e ) )
            
        
        if image_url is None:
            
            raise HydrusExceptions.NotFoundException( 'Could not parse a download link for ' + url_base + '!' )
            
        
        image_url = urlparse.urljoin( url_base, image_url )
        
        tags = []
        
        for ( tag_classname, namespace ) in tag_classnames_to_namespaces.items():
            
            tag_list_entries = soup.find_all( class_ = tag_classname )
            
            for tag_list_entry in tag_list_entries:
                
                links = tag_list_entry.find_all( 'a' )
                
                if tag_list_entry.name == 'a': links.append( tag_list_entry )
                
                for link in links:
                    
                    if link.string not in ( '?', '-', '+' ):
                        
                        if namespace == '': tags.append( link.string )
                        else: tags.append( namespace + ':' + link.string )
                        
                    
                
            
        
        return ( image_url, tags )
        
    
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
        
    
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
    
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
        return tags
        
    
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
    
class GalleryParserDeviantArt( GalleryParser ):
    
    def __init__( self, artist ):
        
        self._gallery_url = 'http://' + artist + '.deviantart.com/gallery/?catpath=/&offset='
        self._artist = artist
        
        GalleryParser.__init__( self )
        
    
    def _GetGalleryPageURL( self, page_index ):
        
        return self._gallery_url + HydrusData.ToString( page_index * 24 )
        
    
    def _ParseGalleryPage( self, html, url_base ):
        
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbs_container = soup.find( class_ = 'zones-container' )
        
        links = thumbs_container.find_all( 'a', class_ = 'thumb' )
        
        for link in links:
            
            url = link[ 'href' ] # something in the form of blah.da.com/art/blah-123456
            
            urls.append( url )
            
            tags = []
            
            tags.append( 'creator:' + self._artist )
            
            try: # starts_with_thumb picks up some false positives, but they break
                
                raw_title = link[ 'title' ] # sweet dolls by AngeniaC, date, blah blah blah
                
                raw_title_reversed = raw_title[::-1] # trAtnaiveD no CainegnA yb sllod teews
                
                ( creator_and_gumpf_reversed, title_reversed ) = raw_title_reversed.split( ' yb ', 1 )
                
                title = title_reversed[::-1] # sweet dolls
                
                tags.append( 'title:' + title )
                
            except: pass
            
            SetExtraURLInfo( url, tags )
            
        
        return urls
        
    
    def _ParseImagePage( self, html ):
        
        soup = bs4.BeautifulSoup( html )
        
        img = soup.find( class_ = 'dev-content-full' )
        
        if img is None:
            
            # this probably means it is mature
            # DA hide the url pretty much everywhere except the tumblr share thing
            
            a_tumblr = soup.find( id = 'gmi-ResourceViewShareTumblr' )
            
            tumblr_url = a_tumblr[ 'href' ] # http://www.tumblr.com/share/photo?source=http%3A%2F%2Fimg09.deviantart.net%2Ff19a%2Fi%2F2015%2F054%2Fe%2Fd%2Fass_by_gmgkaiser-d8j7ija.png&caption=%3Ca+href%3D%22http%3A%2F%2Fgmgkaiser.deviantart.com%2Fart%2Fass-515992726%22%3Eass%3C%2Fa%3E+by+%3Ca+href%3D%22http%3A%2F%2Fgmgkaiser.deviantart.com%2F%22%3EGMGkaiser%3C%2Fa%3E&clickthru=http%3A%2F%2Fgmgkaiser.deviantart.com%2Fart%2Fass-515992726
            
            parse_result = urlparse.urlparse( tumblr_url )
            
            query_parse_result = urlparse.parse_qs( parse_result.query )
            
            img_url = query_parse_result[ 'source' ][0] # http://img09.deviantart.net/f19a/i/2015/054/e/d/ass_by_gmgkaiser-d8j7ija.png
            
        else:
            
            img_url = img[ 'src' ]
            
        
        return img_url
        
    
    def _GetFileURL( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html )
        
    
    def GetFile( self, temp_path, url ):
        
        file_url = self._GetFileURL( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
    
    def GetTags( self, url ):
        
        result = GetExtraURLInfo( url )
        
        if result is None:
            
            return []
            
        else:
            
            return result
            
        
    
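# A sketch of the title parsing in GalleryParserDeviantArt._ParseGalleryPage
# above, using the example from its comments: the ' by ' split is done on the
# reversed string so that a title which itself contains ' by ' does not break it.
#
#   raw_title = 'sweet dolls by AngeniaC, date, blah blah blah'
#   -> title 'sweet dolls', giving 'title:sweet dolls' (the creator tag comes
#      from the artist name the parser was constructed with)
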
class GalleryParserGiphy( GalleryParser ):
    
    def __init__( self, tag ):
        
        self._gallery_url = 'http://giphy.com/api/gifs?tag=' + urllib.quote( tag.encode( 'utf-8' ).replace( ' ', '+' ), '' ) + '&page='
        
        GalleryParser.__init__( self )
        
    
    def _GetGalleryPageURL( self, page_index ):
        
        return self._gallery_url + HydrusData.ToString( page_index + 1 )
        
    
    def _ParseGalleryPage( self, data, url_base ):
        
        json_dict = json.loads( data )
        
        urls = []
        
        if 'data' in json_dict:
            
            json_data = json_dict[ 'data' ]
            
            for d in json_data:
                
                url = d[ 'image_original_url' ]
                id = d[ 'id' ]
                
                SetExtraURLInfo( url, id )
                
                urls.append( url )
                
            
        
        return urls
        
    
    def GetTags( self, url ):
        
        id = GetExtraURLInfo( url )
        
        if id is None:
            
            return []
            
        else:
            
            url = 'http://giphy.com/api/gifs/' + HydrusData.ToString( id )
            
            try:
                
                raw_json = self._FetchData( url )
                
                json_dict = json.loads( raw_json )
                
                tags_data = json_dict[ 'data' ][ 'tags' ]
                
                return [ tag_data[ 'name' ] for tag_data in tags_data ]
                
            except Exception as e:
                
                HydrusData.ShowException( e )
                
                return []
                
            
        
    
class GalleryParserHentaiFoundry( GalleryParser ):
    
    def __init__( self, query_type, query, advanced_hentai_foundry_options ):
        
        self._query_type = query_type
        self._query = query
        self._advanced_hentai_foundry_options = advanced_hentai_foundry_options
        
        GalleryParser.__init__( self )
        
    
    def _AddSessionCookies( self, request_headers ):
        
        manager = HydrusGlobals.client_controller.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'hentai foundry' )
        
        HydrusNetworking.AddCookiesToHeaders( cookies, request_headers )
        
    
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
        
    
    def _GetGalleryPageURL( self, page_index ):
        
        if self._query_type in ( 'artist', 'artist pictures' ):
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist
            
            return gallery_url + '/page/' + HydrusData.ToString( page_index + 1 )
            
        elif self._query_type == 'artist scraps':
            
            artist = self._query
            
            gallery_url = 'http://www.hentai-foundry.com/pictures/user/' + artist + '/scraps'
            
            return gallery_url + '/page/' + HydrusData.ToString( page_index + 1 )
            
        elif self._query_type == 'tags':
            
            tags = self._query
            
            # scraps = 0 means hide, -1 means show both, 1 means scraps only. wetf
            
            return 'http://www.hentai-foundry.com/search/pictures?query=' + '+'.join( tags ) + '&search_in=all&scraps=-1&page=' + HydrusData.ToString( page_index + 1 )
            
        
    
    def _ParseGalleryPage( self, html, url_base ):
        
        urls_set = set()
        
        soup = bs4.BeautifulSoup( html )
        
        def correct_url( href ):
            
            if href is None: return False
            
            # a good url is in the form "/pictures/user/artist_name/file_id/title"
            
            if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):
                
                ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )
                
                # /pictures/user/artist_name/page/3 is a pagination link, not a file
                
                if file_id != 'page': return True
                
            
            return False
            
        
        urls = []
        
        links = soup.find_all( 'a', href = correct_url )
        
        for link in links:
            
            url = 'http://www.hentai-foundry.com' + link[ 'href' ]
            
            if url not in urls_set:
                
                urls_set.add( url )
                
                urls.append( url )
                
            
        
        # this is copied from old code. surely we can improve it?
        if 'class="next"' not in html: self._we_are_done = True
        
        return urls
        
    def _ParseImagePage( self, html, url_base ):
        
        # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
        # find http://pictures.hentai-foundry.com//
        # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144.jpg
        # the .jpg bit is what we really need, but whatever
        
        try:
            
            index = html.index( 'pictures.hentai-foundry.com' )
            
            image_url = html[ index : index + 256 ]
            
            if '"' in image_url: ( image_url, gumpf ) = image_url.split( '"', 1 )
            if "'" in image_url: ( image_url, gumpf ) = image_url.split( "'", 1 )
            
            image_url = 'http://' + image_url
            
        except Exception as e:
            
            raise Exception( 'Could not parse image url!' + os.linesep + HydrusData.ToString( e ) )
            
        
        soup = bs4.BeautifulSoup( html )
        
        tags = []
        
        try:
            
            title = soup.find( 'title' )
            
            ( data, nothing ) = HydrusData.ToString( title.string ).split( ' - Hentai Foundry' )
            
            data_reversed = data[::-1] # want to do it right-side first, because title might have ' by ' in it
            
            ( artist_reversed, title_reversed ) = data_reversed.split( ' yb ' )
            
            artist = artist_reversed[::-1]
            title = title_reversed[::-1]
            
            tags.append( 'creator:' + artist )
            tags.append( 'title:' + title )
            
        except: pass
        
        tag_links = soup.find_all( 'a', rel = 'tag' )
        
        for tag_link in tag_links: tags.append( tag_link.string )
        
        return ( image_url, tags )
        
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
    
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
        return tags
        
    
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
    
    def SetupGallerySearch( self ):
        
        manager = HydrusGlobals.client_controller.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'hentai foundry' )
        
        raw_csrf = cookies[ 'YII_CSRF_TOKEN' ] # 19b05b536885ec60b8b37650a32f8deb11c08cd1s%3A40%3A%222917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32%22%3B
        
        processed_csrf = urllib.unquote( raw_csrf ) # 19b05b536885ec60b8b37650a32f8deb11c08cd1s:40:"2917dcfbfbf2eda2c1fbe43f4d4c4ec4b6902b32";
        
        csrf_token = processed_csrf.split( '"' )[1] # the 2917... bit
        
        self._advanced_hentai_foundry_options[ 'YII_CSRF_TOKEN' ] = csrf_token
        
        body = urllib.urlencode( self._advanced_hentai_foundry_options )
        
        request_headers = {}
        request_headers[ 'Content-Type' ] = 'application/x-www-form-urlencoded'
        
        self._AddSessionCookies( request_headers )
        
        HydrusGlobals.client_controller.DoHTTP( HC.POST, 'http://www.hentai-foundry.com/site/filters', request_headers = request_headers, body = body )
        
    
class GalleryParserNewgrounds( GalleryParser ):
    
    def __init__( self, query ):
        
        self._query = query
        
        GalleryParser.__init__( self )
        
    
    def _GetFileURLAndTags( self, url ):
        
        html = self._FetchData( url )
        
        return self._ParseImagePage( html, url )
        
    
    def _GetGalleryPageURLs( self, page_index ):
        
        artist = self._query
        
        gallery_urls = []
        
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/games/' )
        gallery_urls.append( 'http://' + artist + '.newgrounds.com/movies/' )
        
        self._we_are_done = True
        
        return gallery_urls
        
    
    def _ParseGalleryPage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        fatcol = soup.find( 'div', class_ = 'fatcol' )
        
        links = fatcol.find_all( 'a' )
        
        urls_set = set()
        
        urls = []
        
        for link in links:
            
            try:
                
                url = link[ 'href' ]
                
                if url not in urls_set:
                    
                    if url.startswith( 'http://www.newgrounds.com/portal/view/' ):
                        
                        urls_set.add( url )
                        
                        urls.append( url )
                        
                    
                
            except: pass
            
        
        return urls
        
    
    def _ParseImagePage( self, html, url_base ):
        
        soup = bs4.BeautifulSoup( html )
        
        tags = set()
        
        author_links = soup.find( 'ul', class_ = 'authorlinks' )
        
        if author_links is not None:
            
            authors = set()
            
            links = author_links.find_all( 'a' )
            
            for link in links:
                
                try:
                    
                    href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com
                    
                    creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )
                    
                    tags.add( u'creator:' + creator )
                    
                except: pass
                
            
        
        try:
            
            title = soup.find( 'title' )
            
            tags.add( u'title:' + title.string )
            
        except: pass
        
        all_links = soup.find_all( 'a' )
        
        for link in all_links:
            
            try:
                
                href = link[ 'href' ]
                
                if '/browse/tag/' in href: tags.add( link.string )
                
            except: pass
            
        
        #
        
        try:
            
            components = html.split( '"http://uploads.ungrounded.net/' )
            
            # there is sometimes another bit of api flash earlier on that we don't want
            # it is called http://uploads.ungrounded.net/apiassets/sandbox.swf
            
            if len( components ) == 2: flash_url = components[1]
            else: flash_url = components[2]
            
            flash_url = flash_url.split( '"', 1 )[0]
            
            flash_url = 'http://uploads.ungrounded.net/' + flash_url
            
        except: raise Exception( 'Could not find the swf file! It was probably an mp4!' )
        
        return ( flash_url, tags )
        
    
    def GetFile( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
    
    def GetFileAndTags( self, temp_path, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        self._FetchData( file_url, report_hooks = self._report_hooks, temp_path = temp_path )
        
        return tags
        
    
    def GetTags( self, url ):
        
        ( file_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
    
class GalleryParserPixiv( GalleryParser ):
    
    def __init__( self, query_type, query ):
        
        self._query_type = query_type
        self._query = query
        
        GalleryParser.__init__( self )
        
    
    def _AddSessionCookies( self, request_headers ):
        
        manager = HydrusGlobals.client_controller.GetManager( 'web_sessions' )
        
        cookies = manager.GetCookies( 'pixiv' )
        
        HydrusNetworking.AddCookiesToHeaders( cookies, request_headers )
        
    
    def _GetGalleryPageURL( self, page_index ):
        
        if self._query_type == 'artist_id':
            
            artist_id = self._query
            
            gallery_url = 'http://www.pixiv.net/member_illust.php?id=' + HydrusData.ToString( artist_id )
            
        elif self._query_type == 'tags':
            
            tag = self._query
            
            gallery_url = 'http://www.pixiv.net/search.php?word=' + urllib.quote( tag.encode( 'utf-8' ), '' ) + '&s_mode=s_tag_full&order=date_d'
            
        
        return gallery_url + '&p=' + HydrusData.ToString( page_index + 1 )
        
    
    def _ParseGalleryPage( self, html, url_base ):
        
        urls = []
        
        soup = bs4.BeautifulSoup( html )
        
        thumbnail_links = soup.find_all( class_ = 'work' )
        
        for thumbnail_link in thumbnail_links:
            
            url = urlparse.urljoin( url_base, thumbnail_link[ 'href' ] ) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
            
            urls.append( url )
            
        
        return urls
        
    
    def _ParseImagePage( self, html, page_url ):
        
        if 'member_illust.php?mode=manga' in html:
            
            manga_url = page_url.replace( 'medium', 'manga' )
            
            raise Exception( page_url + ' was manga, not a single image, so could not be downloaded.' )
            
        
        soup = bs4.BeautifulSoup( html )
        
        #
        
        # this is the page that holds the full size of the image.
        # pixiv won't serve the image unless it thinks this page is the referrer
        #referral_url = page_url.replace( 'medium', 'big' ) # http://www.pixiv.net/member_illust.php?mode=big&illust_id=33500690
        
        #
        
        original_image = soup.find( class_ = 'original-image' )
        
        image_url = original_image[ 'data-src' ] # http://i3.pixiv.net/img-original/img/2014/01/25/19/21/56/41171994_p0.jpg
        
        #
        
        tags = soup.find( 'ul', class_ = 'tagCloud' )
        
        # <a href="/member_illust.php?id=5754629&tag=Ib">Ib<span class="cnt">(2)</span></a> -> Ib
        tags = [ a_item.contents[0] for a_item in tags.find_all( 'a' ) ]
        
        user = soup.find( 'h1', class_ = 'user' )
        
        tags.append( 'creator:' + user.string )
        
        title_parent = soup.find( 'section', class_ = re.compile( 'work-info' ) )
        
        title = title_parent.find( 'h1', class_ = 'title' )
        
        tags.append( 'title:' + title.string )
        
        return ( image_url, tags )
        
    
    def _GetFileURLAndTags( self, page_url ):
        
        html = self._FetchData( page_url )
        
        return self._ParseImagePage( html, page_url )
        
    
    def GetFile( self, temp_path, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        request_headers = { 'Referer' : url }
        
        self._FetchData( image_url, request_headers = request_headers, report_hooks = self._report_hooks, temp_path = temp_path )
        
    
    def GetFileAndTags( self, temp_path, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        request_headers = { 'Referer' : url }
        
        self._FetchData( image_url, request_headers = request_headers, report_hooks = self._report_hooks, temp_path = temp_path )
        
        return tags
        
    
    def GetTags( self, url ):
        
        ( image_url, tags ) = self._GetFileURLAndTags( url )
        
        return tags
        
    
class GalleryParserTumblr( GalleryParser ):
    
    def __init__( self, username ):
        
        self._gallery_url = 'http://' + username + '.tumblr.com/api/read/json?start=%start%&num=50'
        
        self._urls_to_tags = {}
        
        GalleryParser.__init__( self )
        
    
    def _GetGalleryPageURL( self, page_index ):
        
        return self._gallery_url.replace( '%start%', HydrusData.ToString( page_index * 50 ) )
        
    
    def _ParseGalleryPage( self, data, url_base ):
        
        processed_raw_json = data.split( 'var tumblr_api_read = ' )[1][:-2] # -2 takes a couple newline chars off at the end
        
        json_object = json.loads( processed_raw_json )
        
        urls = []
        
        if 'posts' in json_object:
            
            for post in json_object[ 'posts' ]:
                
                if 'tags' in post: tags = post[ 'tags' ]
                else: tags = []
                
                post_type = post[ 'type' ]
                
                if post_type == 'photo':
                    
                    if len( post[ 'photos' ] ) == 0:
                        
                        try:
                            
                            url = post[ 'photo-url-1280' ]
                            
                            SetExtraURLInfo( url, tags )
                            
                            urls.append( url )
                            
                        except: pass
                        
                    else:
                        
                        for photo in post[ 'photos' ]:
                            
                            try:
                                
                                url = photo[ 'photo-url-1280' ]
                                
                                SetExtraURLInfo( url, tags )
                                
                                urls.append( url )
                                
                            except: pass
                            
                        
                    
                
            
        
        return urls
        
    
    def GetTags( self, url ):
        
        result = GetExtraURLInfo( url )
        
        if result is None:
            
            return []
            
        else:
            
            return result
            
        
    
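# A sketch of the response _ParseGalleryPage above consumes. The old tumblr
# read api returns javascript rather than bare json, hence the
# 'var tumblr_api_read = ' split; the payload (values hypothetical) then
# looks roughly like:
#
#   var tumblr_api_read = { "posts" : [ { "type" : "photo", "tags" : [ "art" ],
#       "photos" : [], "photo-url-1280" : "http://.../tumblr_abc123_1280.jpg" } ] };
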
class ImportArgsGenerator( object ):
    
    def __init__( self, job_key, item, import_file_options ):
        
        self._job_key = job_key
        self._item = item
        self._import_file_options = import_file_options
        
    
    def __call__( self ):
        
        try:
            
            ( result, media_result ) = self._CheckCurrentStatus()
            
            if result == CC.STATUS_NEW:
                
                ( os_file_handle, temp_path ) = HydrusFileHandling.GetTempPath()
                
                try:
                    
                    ( name, service_keys_to_tags, url ) = self._GetArgs( temp_path )
                    
                    self._job_key.SetVariable( 'status', 'importing' )
                    
                    ( result, media_result ) = HydrusGlobals.client_controller.WriteSynchronous( 'import_file', temp_path, import_file_options = self._import_file_options, service_keys_to_tags = service_keys_to_tags, generate_media_result = True, url = url )
                    
                finally:
                    
                    HydrusFileHandling.CleanUpTempPath( os_file_handle, temp_path )
                    
                
            
            self._job_key.SetVariable( 'result', result )
            
            if result in ( CC.STATUS_SUCCESSFUL, CC.STATUS_REDUNDANT ):
                
                page_key = self._job_key.GetVariable( 'page_key' )
                
                if media_result is not None and page_key is not None:
                    
                    HydrusGlobals.client_controller.pub( 'add_media_results', page_key, ( media_result, ) )
                    
                
            
            self._job_key.SetVariable( 'status', '' )
            
            self._job_key.Finish()
            
            self._CleanUp()
            
        except Exception as e:
            
            self._job_key.SetVariable( 'result', CC.STATUS_FAILED )
            
            if 'name' in locals(): HydrusData.ShowText( 'There was a problem importing ' + name + '!' )
            
            HydrusData.ShowException( e )
            
            time.sleep( 5 )
            
            self._job_key.Cancel()
            
        
    
    def _CleanUp( self ): pass
    
    def _CheckCurrentStatus( self ): return ( CC.STATUS_NEW, None )
    
    def _GetArgs( self, temp_path ):
        
        raise NotImplementedError()
        
    
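# The subclass contract for ImportArgsGenerator, as used by __call__ above:
# _GetArgs downloads the file to temp_path and returns a
# ( name, service_keys_to_tags, url ) triple, while _CheckCurrentStatus may
# short-circuit the download by returning a non-NEW status (and, for
# redundant files, the existing media_result).
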
class ImportArgsGeneratorGallery( ImportArgsGenerator ):
    
    def __init__( self, job_key, item, import_file_options, advanced_tag_options, gallery_parsers_factory ):
        
        ImportArgsGenerator.__init__( self, job_key, item, import_file_options )
        
        self._advanced_tag_options = advanced_tag_options
        self._gallery_parsers_factory = gallery_parsers_factory
        
    
    def _GetArgs( self, temp_path ):
        
        url = self._item
        
        self._job_key.SetVariable( 'status', 'downloading' )
        
        gallery_parser = self._gallery_parsers_factory( 'example' )[0]
        
        def hook( gauge_range, gauge_value ):
            
            self._job_key.SetVariable( 'range', gauge_range )
            self._job_key.SetVariable( 'value', gauge_value )
            
        
        gallery_parser.AddReportHook( hook )
        
        do_tags = len( self._advanced_tag_options ) > 0
        
        if do_tags:
            
            tags = gallery_parser.GetFileAndTags( temp_path, url )
            
        else:
            
            gallery_parser.GetFile( temp_path, url )
            
            tags = []
            
        
        gallery_parser.ClearReportHooks()
        
        service_keys_to_tags = ConvertTagsToServiceKeysToTags( tags, self._advanced_tag_options )
        
        time.sleep( HC.options[ 'website_download_polite_wait' ] )
        
        return ( url, service_keys_to_tags, url )
        
    
    def _CheckCurrentStatus( self ):
        
        url = self._item
        
        self._job_key.SetVariable( 'status', 'checking url status' )
        
        gallery_parser = self._gallery_parsers_factory( 'example' )[0]
        
        ( status, hash ) = HydrusGlobals.client_controller.Read( 'url_status', url )
        
        if status == CC.STATUS_DELETED and not self._import_file_options[ 'exclude_deleted_files' ]: status = CC.STATUS_NEW
        
        if status == CC.STATUS_REDUNDANT:
            
            ( media_result, ) = HydrusGlobals.client_controller.Read( 'media_results', CC.LOCAL_FILE_SERVICE_KEY, ( hash, ) )
            
            do_tags = len( self._advanced_tag_options ) > 0
            
            if do_tags:
                
                tags = gallery_parser.GetTags( url )
                
                service_keys_to_tags = ConvertTagsToServiceKeysToTags( tags, self._advanced_tag_options )
                
                service_keys_to_content_updates = ConvertServiceKeysToTagsToServiceKeysToContentUpdates( hash, service_keys_to_tags )
                
                HydrusGlobals.client_controller.Write( 'content_updates', service_keys_to_content_updates )
                
                time.sleep( HC.options[ 'website_download_polite_wait' ] )
                
            
            return ( status, media_result )
            
        else: return ( status, None )
        
    

class ImportArgsGeneratorURLs( ImportArgsGenerator ):
    
    def _GetArgs( self, temp_path ):
        
        url = self._item
        
        self._job_key.SetVariable( 'status', 'downloading' )
        
        def hook( gauge_range, gauge_value ):
            
            self._job_key.SetVariable( 'range', gauge_range )
            self._job_key.SetVariable( 'value', gauge_value )
            
        
        HydrusGlobals.client_controller.DoHTTP( HC.GET, url, report_hooks = [ hook ], temp_path = temp_path )
        
        service_keys_to_tags = {}
        
        return ( url, service_keys_to_tags, url )
        
    
    def _CheckCurrentStatus( self ):
        
        url = self._item
        
        self._job_key.SetVariable( 'status', 'checking url status' )
        
        ( status, hash ) = HydrusGlobals.client_controller.Read( 'url_status', url )
        
        if status == CC.STATUS_DELETED and not self._import_file_options[ 'exclude_deleted_files' ]: status = CC.STATUS_NEW
        
        if status == CC.STATUS_REDUNDANT:
            
            ( media_result, ) = HydrusGlobals.client_controller.Read( 'media_results', CC.LOCAL_FILE_SERVICE_KEY, ( hash, ) )
            
            return ( status, media_result )
            
        else: return ( status, None )
        
    
class ImportController( object ):
|
|
|
|
|
|
|
|
def __init__( self, import_args_generator_factory, import_queue_builder_factory, page_key = None ):
|
|
|
|
|
|
|
|
self._controller_job_key = self._GetNewJobKey( 'controller' )
|
|
|
|
|
|
|
|
self._import_args_generator_factory = import_args_generator_factory
|
|
|
|
self._import_queue_builder_factory = import_queue_builder_factory
|
|
|
|
self._page_key = page_key
|
|
|
|
|
|
|
|
self._import_job_key = self._GetNewJobKey( 'import' )
|
|
|
|
self._import_queue_job_key = self._GetNewJobKey( 'import_queue' )
|
|
|
|
self._import_queue_builder_job_key = self._GetNewJobKey( 'import_queue_builder' )
|
|
|
|
self._pending_import_queue_jobs = []
|
|
|
|
|
|
|
|
self._lock = threading.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
def _GetNewJobKey( self, job_type ):
|
|
|
|
|
2015-09-16 18:11:00 +00:00
|
|
|
job_key = HydrusThreading.JobKey( pausable = True, cancellable = True )
|
2015-03-25 22:04:19 +00:00
|
|
|
|
|
|
|
if job_type == 'controller':
|
|
|
|
|
2015-06-03 21:05:13 +00:00
|
|
|
result_counts = {}
|
|
|
|
|
|
|
|
result_counts[ CC.STATUS_SUCCESSFUL ] = 0
|
|
|
|
result_counts[ CC.STATUS_FAILED ] = 0
|
|
|
|
result_counts[ CC.STATUS_DELETED ] = 0
|
|
|
|
result_counts[ CC.STATUS_REDUNDANT ] = 0
|
|
|
|
|
|
|
|
job_key.SetVariable( 'result_counts', result_counts )
|
2015-03-25 22:04:19 +00:00
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
job_key.SetVariable( 'status', '' )
|
|
|
|
|
|
|
|
if job_type == 'import':
|
|
|
|
|
|
|
|
job_key.SetVariable( 'page_key', self._page_key )
|
|
|
|
job_key.SetVariable( 'range', 1 )
|
|
|
|
job_key.SetVariable( 'value', 0 )
|
|
|
|
|
|
|
|
elif job_type == 'import_queue':
|
|
|
|
|
|
|
|
job_key.SetVariable( 'queue_position', 0 )
|
|
|
|
|
|
|
|
elif job_type == 'import_queue_builder':
|
|
|
|
|
|
|
|
job_key.SetVariable( 'queue', [] )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return job_key
|
|
|
|
|
|
|
|
|
|
|
|
def CleanBeforeDestroy( self ): self._controller_job_key.Cancel()
|
|
|
|
|
|
|
|
    def GetJobKey( self, job_type ):
        
        with self._lock:
            
            if job_type == 'controller': return self._controller_job_key
            elif job_type == 'import': return self._import_job_key
            elif job_type == 'import_queue': return self._import_queue_job_key
            elif job_type == 'import_queue_builder': return self._import_queue_builder_job_key
            
        
    
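    # The pending import queue jobs below are the queries waiting their turn;
    # the Pend/Remove/Move methods exist so the gui can reorder them before
    # MainLoop pops them off the front of the list.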
    def GetPendingImportQueueJobs( self ):
        
        with self._lock: return self._pending_import_queue_jobs
    
    def PendImportQueueJob( self, job ):
        
        with self._lock: self._pending_import_queue_jobs.append( job )
    
    def RemovePendingImportQueueJob( self, job ):
        
        with self._lock:
            
            if job in self._pending_import_queue_jobs: self._pending_import_queue_jobs.remove( job )
            
        
    
    def MovePendingImportQueueJobUp( self, job ):
        
        with self._lock:
            
            if job in self._pending_import_queue_jobs:
                
                index = self._pending_import_queue_jobs.index( job )
                
                if index > 0:
                    
                    self._pending_import_queue_jobs.remove( job )
                    self._pending_import_queue_jobs.insert( index - 1, job )
                    
                
            
        
    
    def MovePendingImportQueueJobDown( self, job ):
        
        with self._lock:
            
            if job in self._pending_import_queue_jobs:
                
                index = self._pending_import_queue_jobs.index( job )
                
                if index + 1 < len( self._pending_import_queue_jobs ):
                    
                    self._pending_import_queue_jobs.remove( job )
                    self._pending_import_queue_jobs.insert( index + 1, job )
                    
                
            
        
    
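    # MainLoop is the controller's heartbeat: roughly every 50ms it checks the
    # three job keys, rolls finished imports into the result counts, updates
    # the status strings, and kicks off new import and queue-builder work as
    # it becomes available.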
    def MainLoop( self ):
        
        try:
            
            while not self._controller_job_key.IsDone():
                
                create_import_item = False
                create_import_queue_item = False
                
                while self._controller_job_key.IsPaused():
                    
                    time.sleep( 0.1 )
                    
                    self._import_job_key.Pause()
                    self._import_queue_job_key.Pause()
                    self._import_queue_builder_job_key.Pause()
                    
                    if HydrusGlobals.view_shutdown or self._controller_job_key.IsDone(): break
                    
                
                if HydrusGlobals.view_shutdown or self._controller_job_key.IsDone(): break
                
                with self._lock:
                    
                    queue_position = self._import_queue_job_key.GetVariable( 'queue_position' )
                    queue = self._import_queue_builder_job_key.GetVariable( 'queue' )
                    
                    if self._import_job_key.IsDone():
                        
                        result = self._import_job_key.GetVariable( 'result' )
                        
                        result_counts = self._controller_job_key.GetVariable( 'result_counts' )
                        
                        result_counts[ result ] += 1
                        
                        self._import_job_key = self._GetNewJobKey( 'import' )
                        
                        queue_position += 1
                        
                        self._import_queue_job_key.SetVariable( 'queue_position', queue_position )
                        
                    
                    position_string = HydrusData.ConvertValueRangeToPrettyString( queue_position + 1, len( queue ) )
                    
                    if self._import_queue_job_key.IsPaused(): self._import_queue_job_key.SetVariable( 'status', 'paused at ' + position_string )
                    elif self._import_queue_job_key.IsWorking():
                        
                        if self._import_job_key.IsWorking():
                            
                            self._import_queue_job_key.SetVariable( 'status', 'processing ' + position_string )
                            
                        else:
                            
                            if queue_position < len( queue ):
                                
                                self._import_queue_job_key.SetVariable( 'status', 'preparing ' + position_string )
                                
                                self._import_job_key.Begin()
                                
                                import_item = queue[ queue_position ]
                                
                                create_import_item = True
                                
                            else:
                                
                                if self._import_queue_builder_job_key.IsWorking(): self._import_queue_job_key.SetVariable( 'status', 'waiting for more items' )
                                else: self._import_queue_job_key.Finish()
                                
                            
                        
                    else:
                        
                        if self._import_queue_job_key.IsDone():
                            
                            if self._import_queue_job_key.IsCancelled(): status = 'cancelled at ' + position_string
                            else: status = 'done'
                            
                            self._import_queue_job_key = self._GetNewJobKey( 'import_queue' )
                            
                            self._import_queue_builder_job_key = self._GetNewJobKey( 'import_queue_builder' )
                            
                        else: status = ''
                        
                        self._import_queue_job_key.SetVariable( 'status', status )
                        
                        if len( self._pending_import_queue_jobs ) > 0:
                            
                            self._import_queue_job_key.Begin()
                            self._import_queue_builder_job_key.Begin()
                            
                            queue_item = self._pending_import_queue_jobs.pop( 0 )
                            
                            create_import_queue_item = True
                            
                        
                    
                
                # This is outside the lock, as it may call wx-blocking stuff, and other wx bits will sometimes wait on the lock
                
                if create_import_item:
                    
                    args_generator = self._import_args_generator_factory( self._import_job_key, import_item )
                    
                    HydrusGlobals.client_controller.CallToThread( args_generator )
                    
                
                if create_import_queue_item:
                    
                    queue_builder = self._import_queue_builder_factory( self._import_queue_builder_job_key, queue_item )
                    
                    # make it a daemon, not a thread job, as it has a loop!
                    threading.Thread( target = queue_builder ).start()
                    
                
                time.sleep( 0.05 )
                
            
        except Exception as e:
            
            HydrusData.ShowException( e )
            
        finally:
            
            self._import_job_key.Cancel()
            self._import_queue_job_key.Cancel()
            self._import_queue_builder_job_key.Cancel()
            
        
    
    def StartDaemon( self ): threading.Thread( target = self.MainLoop ).start()
    
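# A minimal wiring sketch, assuming factory callables with the signatures used
# above (the MyImportArgsGenerator name is hypothetical):
#
#     controller = ImportController( MyImportArgsGenerator, ImportQueueBuilderURLs, page_key = page_key )
#     
#     controller.StartDaemon()
#     
#     controller.PendImportQueueJob( ( url, True, None ) ) # ( url, get_tags_if_redundant, file_limit )
#     
#     # ... and on shutdown:
#     
#     controller.CleanBeforeDestroy()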
class ImportQueueBuilder( object ):
    
    def __init__( self, job_key, item ):
        
        self._job_key = job_key
        self._item = item
        
    
    def __call__( self ):
        
        queue = self._item
        
        self._job_key.SetVariable( 'queue', queue )
        
        self._job_key.Finish()
        
    
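# The base builder just publishes its item as the finished queue. The
# subclasses below do real work: the gallery builder walks gallery pages for
# urls, and the url builder parses a single page.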
class ImportQueueBuilderGallery( ImportQueueBuilder ):
    
    def __init__( self, job_key, item, gallery_parsers_factory ):
        
        ImportQueueBuilder.__init__( self, job_key, item )
        
        self._gallery_parsers_factory = gallery_parsers_factory
        
    
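    # __call__ drives the pagination: each pass asks every remaining parser
    # for page page_index, retires parsers that return no more urls, and
    # stops once all parsers are exhausted or the file limit is hit.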
    def __call__( self ):
        
        try:
            
            ( raw_query, self._get_tags_if_redundant, self._file_limit ) = self._item
            
            gallery_parsers = list( self._gallery_parsers_factory( raw_query ) )
            
            gallery_parsers[0].SetupGallerySearch() # for now this is cookie-based for hf, so only have to do it on one
            
            total_urls_found = 0
            num_pages_found = 0
            
            page_index = 0
            
            first_run = True
            
            while True:
                
                gallery_parsers_to_remove = []
                
                for gallery_parser in gallery_parsers:
                    
                    urls_in_pages = HydrusData.ConvertIntToPrettyString( total_urls_found ) + ' urls in ' + HydrusData.ConvertIntToPrettyString( num_pages_found ) + ' pages'
                    
                    while self._job_key.IsPaused():
                        
                        time.sleep( 0.1 )
                        
                        self._job_key.SetVariable( 'status', 'paused after finding ' + urls_in_pages )
                        
                        if HydrusGlobals.view_shutdown or self._job_key.IsDone(): break
                        
                    
                    if HydrusGlobals.view_shutdown or self._job_key.IsDone(): break
                    
                    self._job_key.SetVariable( 'status', 'found ' + urls_in_pages + '.' )
                    
                    if first_run: first_run = False
                    else: time.sleep( HC.options[ 'website_download_polite_wait' ] )
                    
                    self._job_key.SetVariable( 'status', 'found ' + urls_in_pages + '. looking for next page' )
                    
                    page_of_urls = gallery_parser.GetPage( page_index )
                    
                    if len( page_of_urls ) == 0: gallery_parsers_to_remove.append( gallery_parser )
                    else:
                        
                        queue = self._job_key.GetVariable( 'queue' )
                        
                        queue = list( queue )
                        
                        if self._file_limit is not None:
                            
                            while len( page_of_urls ) > 0 and total_urls_found < self._file_limit:
                                
                                url = page_of_urls.pop( 0 )
                                
                                queue.append( url )
                                
                                total_urls_found += 1
                                
                            
                        else:
                            
                            queue.extend( page_of_urls )
                            
                            total_urls_found += len( page_of_urls )
                            
                        
                        self._job_key.SetVariable( 'queue', queue )
                        
                        num_pages_found += 1
                        
                    
                
                urls_in_pages = HydrusData.ConvertIntToPrettyString( total_urls_found ) + ' urls in ' + HydrusData.ConvertIntToPrettyString( num_pages_found ) + ' pages'
                
                for gallery_parser in gallery_parsers_to_remove: gallery_parsers.remove( gallery_parser )
                
                if len( gallery_parsers ) == 0: break
                
                while self._job_key.IsPaused():
                    
                    time.sleep( 0.1 )
                    
                    self._job_key.SetVariable( 'status', 'paused after finding ' + urls_in_pages )
                    
                    if HydrusGlobals.view_shutdown or self._job_key.IsDone(): break
                    
                
                if HydrusGlobals.view_shutdown or self._job_key.IsDone(): break
                
                if self._file_limit is not None and total_urls_found >= self._file_limit: break
                
                page_index += 1
                
            
            self._job_key.SetVariable( 'status', 'finished. found ' + urls_in_pages )
            
            time.sleep( HC.options[ 'website_download_polite_wait' ] )
            
            self._job_key.SetVariable( 'status', '' )
            
        except Exception as e:
            
            self._job_key.SetVariable( 'status', HydrusData.ToString( e ) )
            
            HydrusData.ShowException( e )
            
            time.sleep( 5 )
            
        finally: self._job_key.Finish()
    
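# ImportQueueBuilderURLs covers the simple case: download one page of html and
# queue every file url that ParsePageForURLs can find in it.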
class ImportQueueBuilderURLs( ImportQueueBuilder ):
    
    def __call__( self ):
        
        try:
            
            ( url, get_tags_if_redundant, file_limit ) = self._item
            
            self._job_key.SetVariable( 'status', 'connecting to address' )
            
            try: html = HydrusGlobals.client_controller.DoHTTP( HC.GET, url )
            except: raise Exception( 'Could not download that url!' )
            
            self._job_key.SetVariable( 'status', 'parsing html' )
            
            try: urls = ParsePageForURLs( html, url )
            except: raise Exception( 'Could not parse that URL\'s html!' )
            
            queue = urls
            
            self._job_key.SetVariable( 'queue', queue )
            
        except Exception as e:
            
            self._job_key.SetVariable( 'status', HydrusData.ToString( e ) )
            
            HydrusData.ShowException( e )
            
            time.sleep( 5 )
            
        finally: self._job_key.Finish()
    