hydrus/include/ClientDownloading.py

336 lines
12 KiB
Python
Raw Normal View History

2015-03-25 22:04:19 +00:00
import bs4
2019-01-09 22:59:03 +00:00
from . import ClientNetworkingDomain
from . import ClientNetworkingJobs
from . import ClientParsing
from . import HydrusConstants as HC
from . import HydrusExceptions
from . import HydrusPaths
from . import HydrusSerialisable
from . import HydrusTags
2015-03-25 22:04:19 +00:00
import json
import os
import re
2017-05-17 21:53:02 +00:00
import requests
2015-03-25 22:04:19 +00:00
import threading
2017-05-17 21:53:02 +00:00
import time
2019-01-09 22:59:03 +00:00
from . import HydrusData
from . import ClientConstants as CC
from . import HydrusGlobals as HG
2015-03-25 22:04:19 +00:00
2018-09-05 20:52:32 +00:00
def ConvertBooruToNewObjects( booru ):
    """Translate a legacy booru definition into the newer downloader objects.
    
    The booru's name and data tuple are used to build:
    
    - a gallery url generator for tag searches,
    - a gallery page parser that collects the hrefs of anchors found under
      elements carrying the booru's thumbnail class, and
    - a post page parser that collects file urls (by image id or by link
      text) and namespaced tags (by class name).
    
    Returns the tuple ( gug, gallery_parser, post_parser ).
    """
    
    def descending_rule( tag_name = None, tag_attributes = None, **extra_kwargs ):
        
        # every rule built here is a descending search with no index restriction
        return ClientParsing.ParseRuleHTML( rule_type = ClientParsing.HTML_RULE_TYPE_DESCENDING, tag_name = tag_name, tag_attributes = tag_attributes, tag_index = None, **extra_kwargs )
        
    
    def url_content_parser( parser_name, rules, attribute, priority ):
        
        # all url parsers fetch one attribute and report it as a 'desired' url
        attribute_formula = ClientParsing.ParseFormulaHTML( tag_rules = rules, content_to_fetch = ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch = attribute )
        
        return ClientParsing.ContentParser( name = parser_name, content_type = HC.CONTENT_TYPE_URLS, formula = attribute_formula, additional_info = ( HC.URL_TYPE_DESIRED, priority ) )
        
    
    new_name = 'zzz - auto-generated from legacy booru system - ' + booru.GetName()
    
    ( search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces ) = booru.GetData()
    
    # the '%index%' placeholder becomes '1' for boorus that advance by page number, '0' otherwise
    first_index = '1' if advance_by_page_num else '0'
    
    search_url = search_url.replace( '%index%', first_index )
    
    gug = ClientNetworkingDomain.GalleryURLGenerator( new_name + ' search', url_template = search_url, replacement_phrase = '%tags%', search_terms_separator = search_separator, initial_search_text = 'tag search', example_search_text = 'blonde_hair blue_eyes' )
    
    #
    
    thumb_rules = [
        descending_rule( tag_attributes = { 'class' : thumb_classname } ),
        descending_rule( tag_name = 'a' )
    ]
    
    thumb_content_parser = url_content_parser( 'get post urls (based on old booru thumb search)', thumb_rules, 'href', 50 )
    
    gallery_parser = ClientParsing.PageParser( new_name + ' gallery page parser', content_parsers = [ thumb_content_parser ], example_urls = [ gug.GetExampleURL() ] )
    
    #
    
    content_parsers = []
    
    if image_id is not None:
        
        # an anchor with the image id is tried at higher priority than the img src
        link_rules = [ descending_rule( tag_name = 'a', tag_attributes = { 'id' : image_id } ) ]
        
        content_parsers.append( url_content_parser( 'get image file link url (based on old booru parser)', link_rules, 'href', 75 ) )
        
        #
        
        src_rules = [ descending_rule( tag_name = 'img', tag_attributes = { 'id' : image_id } ) ]
        
        content_parsers.append( url_content_parser( 'get image file src url (based on old booru parser)', src_rules, 'src', 50 ) )
        
    elif image_data is not None:
        
        # no id to go on, so pick the anchor whose tag string matches the fixed image_data text
        string_match = ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = image_data, example_string = image_data )
        
        text_rules = [ descending_rule( tag_name = 'a', should_test_tag_string = True, tag_string_string_match = string_match ) ]
        
        content_parsers.append( url_content_parser( 'get image file url (based on old booru parser)', text_rules, 'href', 50 ) )
        
    
    for ( classname, namespace ) in tag_classnames_to_namespaces.items():
        
        namespace_rules = [
            descending_rule( tag_attributes = { 'class' : classname } ),
            descending_rule( tag_name = 'a' )
        ]
        
        string_formula = ClientParsing.ParseFormulaHTML( tag_rules = namespace_rules, content_to_fetch = ClientParsing.HTML_CONTENT_STRING )
        
        content_parsers.append( ClientParsing.ContentParser( name = 'get "' + namespace + '" tags', content_type = HC.CONTENT_TYPE_MAPPINGS, formula = string_formula, additional_info = namespace ) )
        
    
    post_parser = ClientParsing.PageParser( new_name + ' post page parser', content_parsers = content_parsers, example_urls = [] )
    
    #
    
    return ( gug, gallery_parser, post_parser )
    
def ConvertGalleryIdentifierToGUGKeyAndName( gallery_identifier ):
    """Return a ( gug_key, gug_name ) pair for a legacy gallery identifier.
    
    The identifier is first mapped to a gug name. If one of the default gugs
    has that name, its own key-and-name pair is returned; otherwise a freshly
    generated key is paired with the name.
    """
    
    wanted_name = ConvertGalleryIdentifierToGUGName( gallery_identifier )
    
    # NOTE(review): imported at call time rather than module level, presumably to dodge a circular import -- confirm
    from . import ClientDefaults
    
    matching_gug = next( ( default_gug for default_gug in ClientDefaults.GetDefaultGUGs() if default_gug.GetName() == wanted_name ), None )
    
    if matching_gug is None:
        
        return ( HydrusData.GenerateKey(), wanted_name )
        
    
    return matching_gug.GetGUGKeyAndName()
    
2015-08-19 21:48:21 +00:00
2018-09-05 20:52:32 +00:00
def ConvertGalleryIdentifierToGUGName( gallery_identifier ):
    """Map a legacy gallery identifier to the name of its gug.
    
    Non-booru site types map to a fixed name. Booru identifiers are looked up
    by the booru name held in the identifier's additional info; an unlisted
    booru name is returned unchanged. Any other site type gives 'unknown site'.
    """
    
    site_type = gallery_identifier.GetSiteType()
    
    # checked in order with ==, exactly like the elif chain this table replaces
    fixed_names = (
        ( HC.SITE_TYPE_DEVIANT_ART, 'deviant art artist lookup' ),
        ( HC.SITE_TYPE_TUMBLR, 'tumblr username lookup' ),
        ( HC.SITE_TYPE_NEWGROUNDS, 'newgrounds artist lookup' ),
        ( HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST, 'hentai foundry artist lookup' ),
        ( HC.SITE_TYPE_HENTAI_FOUNDRY_TAGS, 'hentai foundry tag search' ),
        ( HC.SITE_TYPE_PIXIV_ARTIST_ID, 'pixiv artist lookup' ),
        ( HC.SITE_TYPE_PIXIV_TAG, 'pixiv tag search' )
    )
    
    for ( candidate_site_type, gug_name ) in fixed_names:
        
        if site_type == candidate_site_type:
            
            return gug_name
            
        
    
    if site_type != HC.SITE_TYPE_BOORU:
        
        return 'unknown site'
        
    
    legacy_booru_names_to_gug_names = {
        'gelbooru' : 'gelbooru tag search',
        'safebooru' : 'safebooru tag search',
        'e621' : 'e621 tag search',
        'rule34@paheal' : 'rule34.paheal tag search',
        'danbooru' : 'danbooru tag search',
        'mishimmie' : 'mishimmie tag search',
        'rule34@booru.org' : 'rule34.xxx tag search',
        'furry@booru.org' : 'furry.booru.org tag search',
        'xbooru' : 'xbooru tag search',
        'konachan' : 'konachan tag search',
        'yande.re' : 'yande.re tag search',
        'tbib' : 'tbib tag search',
        'sankaku chan' : 'sankaku channel tag search',
        'sankaku idol' : 'sankaku idol tag search',
        'rule34hentai' : 'rule34hentai tag search'
    }
    
    booru_name = gallery_identifier.GetAdditionalInfo()
    
    # boorus without a listed gug keep their own name
    return legacy_booru_names_to_gug_names.get( booru_name, booru_name )
    
2015-10-07 21:56:22 +00:00
class GalleryIdentifier( HydrusSerialisable.SerialisableBase ):
    """Serialisable identifier for a legacy gallery download source.
    
    An identifier is the pair ( site_type, additional_info ). For booru site
    types, additional_info holds the booru name. Equality and hashing are
    both defined over that pair.
    """
    
    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IDENTIFIER
    SERIALISABLE_NAME = 'Gallery Identifier'
    SERIALISABLE_VERSION = 1
    
    def __init__( self, site_type = None, additional_info = None ):
        """Store the site type and any additional info (e.g. a booru name)."""
        
        HydrusSerialisable.SerialisableBase.__init__( self )
        
        self._site_type = site_type
        self._additional_info = additional_info
        
    
    def __eq__( self, other ):
        """Return True when other is a GalleryIdentifier with the same fields.
        
        The fields are compared directly rather than through their hashes, so
        a hash collision cannot make two different identifiers look equal.
        For any other type NotImplemented is returned, letting Python fall
        back to its default handling.
        """
        
        if not isinstance( other, GalleryIdentifier ):
            
            return NotImplemented
            
        
        return ( self._site_type, self._additional_info ) == ( other._site_type, other._additional_info )
        
    
    def __hash__( self ):
        """Hash the ( site_type, additional_info ) pair, consistent with __eq__."""
        
        return hash( ( self._site_type, self._additional_info ) )
        
    
    def __ne__( self, other ):
        """Return the negation of __eq__, passing NotImplemented through."""
        
        result = self.__eq__( other )
        
        if result is NotImplemented:
            
            return result
            
        
        return not result
        
    
    def __repr__( self ):
        """Return 'Gallery Identifier: <site type>', plus the booru name for boorus."""
        
        text = 'Gallery Identifier: ' + HC.site_type_string_lookup[ self._site_type ]
        
        if self._site_type == HC.SITE_TYPE_BOORU:
            
            text += ': ' + str( self._additional_info )
            
        
        return text
        
    
    def _GetSerialisableInfo( self ):
        """Return the ( site_type, additional_info ) pair for serialisation."""
        
        return ( self._site_type, self._additional_info )
        
    
    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore both fields from a pair produced by _GetSerialisableInfo."""
        
        ( self._site_type, self._additional_info ) = serialisable_info
        
    
    def GetAdditionalInfo( self ):
        """Return the additional info (the booru name for booru identifiers)."""
        
        return self._additional_info
        
    
    def GetSiteType( self ):
        """Return the site type constant."""
        
        return self._site_type
        
    
    def ToString( self ):
        """Return a display string: the booru name if set, else the site type string."""
        
        text = HC.site_type_string_lookup[ self._site_type ]
        
        if self._site_type == HC.SITE_TYPE_BOORU and self._additional_info is not None:
            
            booru_name = self._additional_info
            
            text = booru_name
            
        
        return text
        
    
2015-10-07 21:56:22 +00:00
# Register GalleryIdentifier under its serialisable type id in the shared type-to-class mapping.
# NOTE(review): presumably this is what lets HydrusSerialisable rebuild instances when loading -- confirm.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IDENTIFIER ] = GalleryIdentifier