# hydrus/include/ClientDownloading.py
#
# 336 lines
# 12 KiB
# Python

import bs4
from . import ClientNetworkingDomain
from . import ClientNetworkingJobs
from . import ClientParsing
from . import HydrusConstants as HC
from . import HydrusExceptions
from . import HydrusPaths
from . import HydrusSerialisable
from . import HydrusTags
import json
import os
import re
import requests
import threading
import time
from . import HydrusData
from . import ClientConstants as CC
from . import HydrusGlobals as HG
def _LegacyBooruDescendingRule( tag_name = None, tag_attributes = None, tag_string_string_match = None ):
    """Build one HTML_RULE_TYPE_DESCENDING parse rule with tag_index = None.

    When tag_string_string_match is given, the rule is also told to test the
    tag string against it (should_test_tag_string = True). Otherwise those two
    keyword arguments are not passed at all, so ParseRuleHTML's own defaults
    apply.
    """
    rule_kwargs = {
        'rule_type' : ClientParsing.HTML_RULE_TYPE_DESCENDING,
        'tag_name' : tag_name,
        'tag_attributes' : tag_attributes,
        'tag_index' : None
    }

    if tag_string_string_match is not None:
        rule_kwargs[ 'should_test_tag_string' ] = True
        rule_kwargs[ 'tag_string_string_match' ] = tag_string_string_match

    return ClientParsing.ParseRuleHTML( **rule_kwargs )


def _LegacyBooruURLContentParser( parser_name, tag_rules, attribute_to_fetch, priority ):
    """Build a URL content parser that reads one attribute from the tags matched by tag_rules.

    The parser's additional_info is ( HC.URL_TYPE_DESIRED, priority ).
    """
    formula = ClientParsing.ParseFormulaHTML( tag_rules = tag_rules, content_to_fetch = ClientParsing.HTML_CONTENT_ATTRIBUTE, attribute_to_fetch = attribute_to_fetch )

    additional_info = ( HC.URL_TYPE_DESIRED, priority )

    return ClientParsing.ContentParser( name = parser_name, content_type = HC.CONTENT_TYPE_URLS, formula = formula, additional_info = additional_info )


def ConvertBooruToNewObjects( booru ):
    """Translate a legacy booru definition into the newer downloader objects.

    booru must provide GetName() and GetData(); GetData() is unpacked as
    ( search_url, search_separator, advance_by_page_num, thumb_classname,
    image_id, image_data, tag_classnames_to_namespaces ).

    Returns a tuple ( gug, gallery_parser, post_parser ):
      - gug: a GalleryURLGenerator built from the booru's search url
      - gallery_parser: a PageParser that pulls post urls out of thumbnails
      - post_parser: a PageParser that pulls the file url(s) and tags from a post page
    """
    name = 'zzz - auto-generated from legacy booru system - ' + booru.GetName()

    ( search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces ) = booru.GetData()

    # the '%index%' placeholder is pinned to a starting value: '1' when the booru advances by page number, otherwise '0'
    starting_index = '1' if advance_by_page_num else '0'

    search_url = search_url.replace( '%index%', starting_index )

    gug = ClientNetworkingDomain.GalleryURLGenerator( name + ' search', url_template = search_url, replacement_phrase = '%tags%', search_terms_separator = search_separator, initial_search_text = 'tag search', example_search_text = 'blonde_hair blue_eyes' )

    #

    # gallery page: any tag with the thumbnail class, then the 'a' beneath it, reading its href
    thumb_rules = [
        _LegacyBooruDescendingRule( tag_attributes = { 'class' : thumb_classname } ),
        _LegacyBooruDescendingRule( tag_name = 'a' )
    ]

    thumb_content_parser = _LegacyBooruURLContentParser( 'get post urls (based on old booru thumb search)', thumb_rules, 'href', 50 )

    gallery_parser = ClientParsing.PageParser( name + ' gallery page parser', content_parsers = [ thumb_content_parser ], example_urls = [ gug.GetExampleURL() ] )

    #

    content_parsers = []

    if image_id is not None:

        # the same id is tried both as a link ( 'a' href, priority 75 ) and as an inline image ( 'img' src, priority 50 )
        # NOTE(review): presumably the higher number is preferred when both match - confirm against how ContentParser priorities are consumed
        link_rules = [ _LegacyBooruDescendingRule( tag_name = 'a', tag_attributes = { 'id' : image_id } ) ]

        content_parsers.append( _LegacyBooruURLContentParser( 'get image file link url (based on old booru parser)', link_rules, 'href', 75 ) )

        #

        src_rules = [ _LegacyBooruDescendingRule( tag_name = 'img', tag_attributes = { 'id' : image_id } ) ]

        content_parsers.append( _LegacyBooruURLContentParser( 'get image file src url (based on old booru parser)', src_rules, 'src', 50 ) )

    elif image_data is not None:

        # no id to go on, so find the 'a' whose tag string is tested against the fixed image_data text
        string_match = ClientParsing.StringMatch( match_type = ClientParsing.STRING_MATCH_FIXED, match_value = image_data, example_string = image_data )

        link_rules = [ _LegacyBooruDescendingRule( tag_name = 'a', tag_string_string_match = string_match ) ]

        content_parsers.append( _LegacyBooruURLContentParser( 'get image file url (based on old booru parser)', link_rules, 'href', 50 ) )

    # one mappings parser per ( css class -> namespace ) pair: text of every 'a' under a tag with that class
    for ( classname, namespace ) in tag_classnames_to_namespaces.items():

        tag_rules = [
            _LegacyBooruDescendingRule( tag_attributes = { 'class' : classname } ),
            _LegacyBooruDescendingRule( tag_name = 'a' )
        ]

        formula = ClientParsing.ParseFormulaHTML( tag_rules = tag_rules, content_to_fetch = ClientParsing.HTML_CONTENT_STRING )

        tag_content_parser = ClientParsing.ContentParser( name = 'get "' + namespace + '" tags', content_type = HC.CONTENT_TYPE_MAPPINGS, formula = formula, additional_info = namespace )

        content_parsers.append( tag_content_parser )

    post_parser = ClientParsing.PageParser( name + ' post page parser', content_parsers = content_parsers, example_urls = [] )

    #

    return ( gug, gallery_parser, post_parser )
def ConvertGalleryIdentifierToGUGKeyAndName( gallery_identifier ):
    """Return a ( gug_key, gug_name ) pair for a legacy gallery identifier.

    The identifier is first mapped to a GUG name. If one of the default GUGs
    carries that name, its own GetGUGKeyAndName() result is returned.
    Otherwise the name is paired with a newly generated key.
    """
    gug_name = ConvertGalleryIdentifierToGUGName( gallery_identifier )

    # kept as a function-level import, as the module does not import ClientDefaults at the top
    # NOTE(review): presumably this avoids a circular import - confirm before hoisting
    from . import ClientDefaults

    default_gugs = ClientDefaults.GetDefaultGUGs()

    # first default GUG with a matching name, or None when there is no match
    matching_gug = next( ( candidate for candidate in default_gugs if candidate.GetName() == gug_name ), None )

    if matching_gug is None:
        return ( HydrusData.GenerateKey(), gug_name )

    return matching_gug.GetGUGKeyAndName()
def ConvertGalleryIdentifierToGUGName( gallery_identifier ):
    """Map a legacy gallery identifier to the name of its GUG.

    Non-booru site types map to a fixed name. For HC.SITE_TYPE_BOORU the
    identifier's additional info is treated as the booru name and translated
    through a lookup table; a booru name missing from the table is returned
    unchanged. Any other site type gives 'unknown site'.
    """
    site_type = gallery_identifier.GetSiteType()

    # checked in order with ==, so the first matching entry wins
    fixed_site_names = (
        ( HC.SITE_TYPE_DEVIANT_ART, 'deviant art artist lookup' ),
        ( HC.SITE_TYPE_TUMBLR, 'tumblr username lookup' ),
        ( HC.SITE_TYPE_NEWGROUNDS, 'newgrounds artist lookup' ),
        ( HC.SITE_TYPE_HENTAI_FOUNDRY_ARTIST, 'hentai foundry artist lookup' ),
        ( HC.SITE_TYPE_HENTAI_FOUNDRY_TAGS, 'hentai foundry tag search' ),
        ( HC.SITE_TYPE_PIXIV_ARTIST_ID, 'pixiv artist lookup' ),
        ( HC.SITE_TYPE_PIXIV_TAG, 'pixiv tag search' )
    )

    for ( known_site_type, gug_name ) in fixed_site_names:
        if site_type == known_site_type:
            return gug_name

    if site_type == HC.SITE_TYPE_BOORU:

        # legacy booru name -> GUG name
        booru_name_converter = {
            'gelbooru' : 'gelbooru tag search',
            'safebooru' : 'safebooru tag search',
            'e621' : 'e621 tag search',
            'rule34@paheal' : 'rule34.paheal tag search',
            'danbooru' : 'danbooru tag search',
            'mishimmie' : 'mishimmie tag search',
            'rule34@booru.org' : 'rule34.xxx tag search',
            'furry@booru.org' : 'furry.booru.org tag search',
            'xbooru' : 'xbooru tag search',
            'konachan' : 'konachan tag search',
            'yande.re' : 'yande.re tag search',
            'tbib' : 'tbib tag search',
            'sankaku chan' : 'sankaku channel tag search',
            'sankaku idol' : 'sankaku idol tag search',
            'rule34hentai' : 'rule34hentai tag search'
        }

        booru_name = gallery_identifier.GetAdditionalInfo()

        # unrecognised booru names pass straight through
        return booru_name_converter.get( booru_name, booru_name )

    return 'unknown site'
class GalleryIdentifier( HydrusSerialisable.SerialisableBase ):
    """Serialisable identifier for a legacy gallery source.

    Holds a site type plus optional additional info. For HC.SITE_TYPE_BOORU
    the additional info is used as the booru name. Instances are hashable and
    compare equal when both fields are equal.
    """

    SERIALISABLE_TYPE = HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IDENTIFIER
    SERIALISABLE_NAME = 'Gallery Identifier'
    SERIALISABLE_VERSION = 1

    def __init__( self, site_type = None, additional_info = None ):
        """Store the site type and its optional additional info."""
        HydrusSerialisable.SerialisableBase.__init__( self )

        self._site_type = site_type
        self._additional_info = additional_info

    def __eq__( self, other ):
        """Return True when other is a GalleryIdentifier with equal fields.

        Fields are compared directly rather than via their hashes, so two
        different identifiers whose hashes happen to collide are not equal.
        Comparing against an unrelated (possibly unhashable) object returns
        NotImplemented instead of raising.
        """
        if not isinstance( other, GalleryIdentifier ):
            return NotImplemented

        return ( self._site_type, self._additional_info ) == ( other._site_type, other._additional_info )

    def __hash__( self ):
        """Hash of the ( site_type, additional_info ) tuple; consistent with __eq__."""
        return ( self._site_type, self._additional_info ).__hash__()

    def __ne__( self, other ):
        """Inverse of __eq__, passing NotImplemented through for foreign types."""
        result = self.__eq__( other )

        if result is NotImplemented:
            return result

        return not result

    def __repr__( self ):
        """Return 'Gallery Identifier: <site type string>', plus the additional info for boorus."""
        text = 'Gallery Identifier: ' + HC.site_type_string_lookup[ self._site_type ]

        if self._site_type == HC.SITE_TYPE_BOORU:
            text += ': ' + str( self._additional_info )

        return text

    def _GetSerialisableInfo( self ):
        """Return the ( site_type, additional_info ) tuple to be serialised."""
        return ( self._site_type, self._additional_info )

    def _InitialiseFromSerialisableInfo( self, serialisable_info ):
        """Restore both fields from a ( site_type, additional_info ) pair."""
        ( self._site_type, self._additional_info ) = serialisable_info

    def GetAdditionalInfo( self ):
        """Return the additional info (the booru name for booru identifiers)."""
        return self._additional_info

    def GetSiteType( self ):
        """Return the site type."""
        return self._site_type

    def ToString( self ):
        """Return a display string: the booru name for a named booru, else the site type string."""
        # the lookup is done first in every case, exactly as before, so an unknown site type still fails here
        text = HC.site_type_string_lookup[ self._site_type ]

        if self._site_type == HC.SITE_TYPE_BOORU and self._additional_info is not None:
            booru_name = self._additional_info

            text = booru_name

        return text
# Register GalleryIdentifier under its serialisable type constant in HydrusSerialisable's type -> class lookup.
# NOTE(review): presumably this lookup is what lets stored objects of this type be rebuilt on load - confirm in HydrusSerialisable.
HydrusSerialisable.SERIALISABLE_TYPES_TO_OBJECT_TYPES[ HydrusSerialisable.SERIALISABLE_TYPE_GALLERY_IDENTIFIER ] = GalleryIdentifier