import os
import sqlite3
import typing

from hydrus.core import HydrusData
from hydrus.core import HydrusDB
from hydrus.core import HydrusDBModule
from hydrus.core import HydrusExceptions
from hydrus.core import HydrusTags

from hydrus.client.networking import ClientNetworkingDomain


class ClientDBMasterHashes( HydrusDBModule.HydrusDBModule ):
    """Database module for the master `hashes` and `local_hashes` tables.
    
    `hashes` maps hash_id -> hash, and `local_hashes` stores the extra
    md5/sha1/sha512 hashes for a hash_id.
    """
    
    def __init__( self, cursor: sqlite3.Cursor ):
        """Initialise the module on the given cursor with an empty hash cache."""
        
        HydrusDBModule.HydrusDBModule.__init__( self, 'client hashes master', cursor )
        
        # hash_id -> hash; filled lazily by _PopulateHashIdsToHashesCache
        self._hash_ids_to_hashes_cache = {}
        
    
    def _GetIndexGenerationTuples( self ):
        """Return the index definitions for this module's tables.
        
        Each tuple is ( table name, column list, flag ).
        NOTE(review): the flag is presumably 'unique' -- confirm against HydrusDBModule.
        """
        
        return [
            ( 'external_master.local_hashes', [ 'md5' ], False ),
            ( 'external_master.local_hashes', [ 'sha1' ], False ),
            ( 'external_master.local_hashes', [ 'sha512' ], False )
        ]
        
    
    def _PopulateHashIdsToHashesCache( self, hash_ids, exception_on_error = False ):
        """Ensure every given hash_id has an entry in the hash cache.
        
        Missing ids are loaded from the `hashes` table. If an id has no row, then
        either DataMissing is raised (when exception_on_error is True) or the
        problem is logged, the user is notified once per call, and a placeholder
        hash is cached for that id.
        
        :param hash_ids: iterable of hash_ids to make available in the cache
        :param exception_on_error: raise instead of inventing a placeholder hash
        :raises HydrusExceptions.DataMissing: on a missing id, if exception_on_error
        """
        
        if len( self._hash_ids_to_hashes_cache ) > 100000:
            
            # cache has grown large: drop everything this call does not need
            if not isinstance( hash_ids, set ):
                
                hash_ids = set( hash_ids )
                
            
            self._hash_ids_to_hashes_cache = { hash_id : cached_hash for ( hash_id, cached_hash ) in self._hash_ids_to_hashes_cache.items() if hash_id in hash_ids }
            
        
        uncached_hash_ids = { hash_id for hash_id in hash_ids if hash_id not in self._hash_ids_to_hashes_cache }
        
        if len( uncached_hash_ids ) == 0:
            
            return
            
        
        pubbed_error = False
        
        if len( uncached_hash_ids ) == 1:
            
            ( uncached_hash_id, ) = uncached_hash_ids
            
            rows = self._c.execute( 'SELECT hash_id, hash FROM hashes WHERE hash_id = ?;', ( uncached_hash_id, ) ).fetchall()
            
        else:
            
            with HydrusDB.TemporaryIntegerTable( self._c, uncached_hash_ids, 'hash_id' ) as temp_table_name:
                
                # temp hash_ids to actual hashes
                rows = self._c.execute( 'SELECT hash_id, hash FROM {} CROSS JOIN hashes USING ( hash_id );'.format( temp_table_name ) ).fetchall()
                
            
        
        uncached_hash_ids_to_hashes = dict( rows )
        
        if len( uncached_hash_ids_to_hashes ) < len( uncached_hash_ids ):
            
            for hash_id in uncached_hash_ids:
                
                if hash_id in uncached_hash_ids_to_hashes:
                    
                    continue
                    
                
                if exception_on_error:
                    
                    raise HydrusExceptions.DataMissing( 'Did not find all entries for those hash ids!' )
                    
                
                HydrusData.DebugPrint( 'Database hash error: hash_id ' + str( hash_id ) + ' was missing!' )
                HydrusData.PrintException( Exception( 'Missing file identifier stack trace.' ) )
                
                if not pubbed_error:
                    
                    # only bother the user once per call, however many ids are missing
                    HydrusData.ShowText( 'A file identifier was missing! This is a serious error that means your client database has an orphan file id! Think about contacting hydrus dev!' )
                    
                    pubbed_error = True
                    
                
                # placeholder: fixed 'aa..' prefix plus random bytes, cached only (not written to the db)
                placeholder_hash = bytes.fromhex( 'aaaaaaaaaaaaaaaa' ) + os.urandom( 16 )
                
                uncached_hash_ids_to_hashes[ hash_id ] = placeholder_hash
                
            
        
        self._hash_ids_to_hashes_cache.update( uncached_hash_ids_to_hashes )
        
    
    def CreateTables( self ):
        """Create the `hashes` and `local_hashes` tables if they do not exist."""
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.hashes ( hash_id INTEGER PRIMARY KEY, hash BLOB_BYTES UNIQUE );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.local_hashes ( hash_id INTEGER PRIMARY KEY, md5 BLOB_BYTES, sha1 BLOB_BYTES, sha512 BLOB_BYTES );' )
        
    
    def GetExpectedTableNames( self ) -> typing.Collection[ str ]:
        """Return the names of the tables this module creates."""
        
        return [
            'external_master.hashes',
            'external_master.local_hashes'
        ]
        
    
    def GetExtraHash( self, hash_type, hash_id ):
        """Return the `hash_type` column of `local_hashes` for the given hash_id.
        
        hash_type is interpolated into the SQL as a column name, so it must be a
        trusted value naming a `local_hashes` column.
        
        :raises HydrusExceptions.DataMissing: if there is no row for hash_id
        """
        
        result = self._c.execute( 'SELECT {} FROM local_hashes WHERE hash_id = ?;'.format( hash_type ), ( hash_id, ) ).fetchone()
        
        if result is None:
            
            raise HydrusExceptions.DataMissing( '{} not available for file {}!'.format( hash_type, hash_id ) )
            
        
        ( extra_hash, ) = result
        
        return extra_hash
        
    
    def GetFileHashes( self, given_hashes, given_hash_type, desired_hash_type ):
        """Convert hashes of one type into the matching hashes of another type.
        
        'sha256' is resolved through the `hashes` table (via GetHashIds and
        GetHashes); any other type is read as a column of `local_hashes`. Both type
        names are interpolated into the SQL in that case, so they must be trusted
        column names. None entries in given_hashes are skipped, and given hashes
        with no `local_hashes` match are silently omitted.
        
        :return: list of hashes of desired_hash_type
        """
        
        if given_hash_type == 'sha256':
            
            hash_ids = self.GetHashIds( given_hashes )
            
        else:
            
            hash_ids = []
            
            # the query text does not depend on the individual hash, so build it once
            select_hash_id = 'SELECT hash_id FROM local_hashes WHERE {} = ?;'.format( given_hash_type )
            
            for given_hash in given_hashes:
                
                if given_hash is None:
                    
                    continue
                    
                
                result = self._c.execute( select_hash_id, ( sqlite3.Binary( given_hash ), ) ).fetchone()
                
                if result is not None:
                    
                    ( hash_id, ) = result
                    
                    hash_ids.append( hash_id )
                    
                
            
        
        if desired_hash_type == 'sha256':
            
            desired_hashes = self.GetHashes( hash_ids )
            
        else:
            
            desired_hashes = [ desired_hash for ( desired_hash, ) in self._c.execute( 'SELECT {} FROM local_hashes WHERE hash_id IN {};'.format( desired_hash_type, HydrusData.SplayListForDB( hash_ids ) ) ) ]
            
        
        return desired_hashes
        
    
    def GetHash( self, hash_id ):
        """Return the hash for a single hash_id, loading it into the cache if needed."""
        
        self._PopulateHashIdsToHashesCache( ( hash_id, ) )
        
        return self._hash_ids_to_hashes_cache[ hash_id ]
        
    
    def GetHashes( self, hash_ids ):
        """Return a list of hashes for the given hash_ids, in iteration order."""
        
        self._PopulateHashIdsToHashesCache( hash_ids )
        
        return [ self._hash_ids_to_hashes_cache[ hash_id ] for hash_id in hash_ids ]
        
    
    def GetHashId( self, hash ) -> int:
        """Return the hash_id for a hash, inserting a new `hashes` row if it is unknown."""
        
        result = self._c.execute( 'SELECT hash_id FROM hashes WHERE hash = ?;', ( sqlite3.Binary( hash ), ) ).fetchone()
        
        if result is None:
            
            self._c.execute( 'INSERT INTO hashes ( hash ) VALUES ( ? );', ( sqlite3.Binary( hash ), ) )
            
            return self._c.lastrowid
            
        
        ( hash_id, ) = result
        
        return hash_id
        
    
    def GetHashIdFromExtraHash( self, hash_type, hash ):
        """Return the hash_id whose `local_hashes` row has the given md5/sha1/sha512.
        
        :param hash_type: one of 'md5', 'sha1', 'sha512'
        :param hash: the hash bytes to look up
        :raises ValueError: if hash_type is not one of the supported types
        :raises HydrusExceptions.DataMissing: if no row matches
        """
        
        # Previously an unsupported hash_type left the query result unbound and the
        # method died with an UnboundLocalError; fail with a clear message instead.
        # The whitelist also makes it safe to interpolate hash_type as a column name.
        if hash_type not in ( 'md5', 'sha1', 'sha512' ):
            
            raise ValueError( 'Unsupported extra hash type "{}"!'.format( hash_type ) )
            
        
        result = self._c.execute( 'SELECT hash_id FROM local_hashes WHERE {} = ?;'.format( hash_type ), ( sqlite3.Binary( hash ), ) ).fetchone()
        
        if result is None:
            
            raise HydrusExceptions.DataMissing( 'Hash Id not found for {} hash {}!'.format( hash_type, hash.hex() ) )
            
        
        ( hash_id, ) = result
        
        return hash_id
        
    
    def GetHashIds( self, hashes ) -> typing.Set[ int ]:
        """Return the set of hash_ids for the given hashes, inserting unknown ones.
        
        None entries are skipped.
        """
        
        hash_ids = set()
        hashes_not_in_db = set()
        
        for hash in hashes:
            
            if hash is None:
                
                continue
                
            
            result = self._c.execute( 'SELECT hash_id FROM hashes WHERE hash = ?;', ( sqlite3.Binary( hash ), ) ).fetchone()
            
            if result is None:
                
                hashes_not_in_db.add( hash )
                
            else:
                
                ( hash_id, ) = result
                
                hash_ids.add( hash_id )
                
            
        
        if len( hashes_not_in_db ) > 0:
            
            self._c.executemany( 'INSERT INTO hashes ( hash ) VALUES ( ? );', ( ( sqlite3.Binary( hash ), ) for hash in hashes_not_in_db ) )
            
            # read the freshly assigned ids back, one select per new hash
            for hash in hashes_not_in_db:
                
                ( hash_id, ) = self._c.execute( 'SELECT hash_id FROM hashes WHERE hash = ?;', ( sqlite3.Binary( hash ), ) ).fetchone()
                
                hash_ids.add( hash_id )
                
            
        
        return hash_ids
        
    
    def GetHashIdsToHashes( self, hash_ids = None, hashes = None ):
        """Return a hash_id -> hash dict built from either hash_ids or hashes.
        
        hash_ids takes precedence if both are given. With hash_ids, a missing id
        raises DataMissing; with hashes, unknown hashes are inserted via GetHashId.
        If neither is given, an empty dict is returned.
        
        :raises HydrusExceptions.DataMissing: if a given hash_id has no hash
        """
        
        # default covers the 'neither argument given' case, which used to hit an
        # UnboundLocalError at the return
        hash_ids_to_hashes = {}
        
        if hash_ids is not None:
            
            self._PopulateHashIdsToHashesCache( hash_ids, exception_on_error = True )
            
            hash_ids_to_hashes = { hash_id : self._hash_ids_to_hashes_cache[ hash_id ] for hash_id in hash_ids }
            
        elif hashes is not None:
            
            hash_ids_to_hashes = { self.GetHashId( hash ) : hash for hash in hashes }
            
        
        return hash_ids_to_hashes
        
    
    def HasExtraHashes( self, hash_id ):
        """Return True if `local_hashes` has a row for the given hash_id."""
        
        result = self._c.execute( 'SELECT 1 FROM local_hashes WHERE hash_id = ?;', ( hash_id, ) ).fetchone()
        
        return result is not None
        
    
    def SetExtraHashes( self, hash_id, md5, sha1, sha512 ):
        """Store the extra hashes for a hash_id; an existing row is left untouched (INSERT OR IGNORE)."""
        
        self._c.execute( 'INSERT OR IGNORE INTO local_hashes ( hash_id, md5, sha1, sha512 ) VALUES ( ?, ?, ?, ? );', ( hash_id, sqlite3.Binary( md5 ), sqlite3.Binary( sha1 ), sqlite3.Binary( sha512 ) ) )
        
    

class ClientDBMasterTexts( HydrusDBModule.HydrusDBModule ):
    """Database module for the master `labels`, `notes` and `texts` tables."""
    
    def __init__( self, cursor: sqlite3.Cursor ):
        """Initialise the module on the given cursor."""
        
        HydrusDBModule.HydrusDBModule.__init__( self, 'client texts master', cursor )
        
    
    def _GetIndexGenerationTuples( self ):
        """Return the index definitions for this module's tables (there are none)."""
        
        return []
        
    
    def CreateTables( self ):
        """Create the `labels`, `notes` and `texts` tables if they do not exist."""
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.labels ( label_id INTEGER PRIMARY KEY, label TEXT UNIQUE );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.notes ( note_id INTEGER PRIMARY KEY, note TEXT UNIQUE );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.texts ( text_id INTEGER PRIMARY KEY, text TEXT UNIQUE );' )
        
    
    def GetExpectedTableNames( self ) -> typing.Collection[ str ]:
        """Return the names of the tables this module creates."""
        
        return [
            'external_master.labels',
            'external_master.notes',
            'external_master.texts'
        ]
        
    
    def GetLabelId( self, label ):
        """Return the label_id for a label, inserting a new row if it is unknown."""
        
        result = self._c.execute( 'SELECT label_id FROM labels WHERE label = ?;', ( label, ) ).fetchone()
        
        if result is None:
            
            self._c.execute( 'INSERT INTO labels ( label ) VALUES ( ? );', ( label, ) )
            
            return self._c.lastrowid
            
        
        ( label_id, ) = result
        
        return label_id
        
    
    def GetText( self, text_id ):
        """Return the text stored for a text_id.
        
        :raises HydrusExceptions.DataMissing: if there is no row for text_id
        """
        
        result = self._c.execute( 'SELECT text FROM texts WHERE text_id = ?;', ( text_id, ) ).fetchone()
        
        if result is None:
            
            raise HydrusExceptions.DataMissing( 'Text lookup error in database' )
            
        
        ( text, ) = result
        
        return text
        
    
    def GetTextId( self, text ):
        """Return the text_id for a text, inserting a new row if it is unknown."""
        
        result = self._c.execute( 'SELECT text_id FROM texts WHERE text = ?;', ( text, ) ).fetchone()
        
        if result is None:
            
            self._c.execute( 'INSERT INTO texts ( text ) VALUES ( ? );', ( text, ) )
            
            return self._c.lastrowid
            
        
        ( text_id, ) = result
        
        return text_id
        
    

class ClientDBMasterTags( HydrusDBModule.HydrusDBModule ):
    """Database module for the master `namespaces`, `subtags` and `tags` tables."""
    
    def __init__( self, cursor: sqlite3.Cursor ):
        """Initialise the module on the given cursor with empty caches."""
        
        # This was 'client master', the same name ClientDBMasterURLs used; give each
        # module its own name, following the 'client hashes master' / 'client texts master' pattern.
        HydrusDBModule.HydrusDBModule.__init__( self, 'client tags master', cursor )
        
        # namespace_id of the empty namespace, looked up on first use by GetNamespaceId
        self.null_namespace_id = None
        
        # tag_id -> combined tag string; filled lazily by _PopulateTagIdsToTagsCache
        self._tag_ids_to_tags_cache = {}
        
    
    def _GetIndexGenerationTuples( self ):
        """Return the index definitions for this module's tables.
        
        Each tuple is ( table name, column list, flag ).
        NOTE(review): the flag is presumably 'unique' -- confirm against HydrusDBModule.
        """
        
        return [
            ( 'external_master.tags', [ 'subtag_id' ], False ),
            ( 'external_master.tags', [ 'namespace_id', 'subtag_id' ], True )
        ]
        
    
    def _PopulateTagIdsToTagsCache( self, tag_ids ):
        """Ensure every given tag_id has an entry in the tag cache.
        
        Missing ids are loaded by joining `tags` with `namespaces` and `subtags`.
        An id with no resolvable row is repaired in place: a fresh
        'unknown tag:<random hex>' tag is written to the `tags` table under that
        tag_id and cached.
        """
        
        if len( self._tag_ids_to_tags_cache ) > 100000:
            
            # cache has grown large: drop everything this call does not need
            if not isinstance( tag_ids, set ):
                
                tag_ids = set( tag_ids )
                
            
            self._tag_ids_to_tags_cache = { tag_id : tag for ( tag_id, tag ) in self._tag_ids_to_tags_cache.items() if tag_id in tag_ids }
            
        
        uncached_tag_ids = { tag_id for tag_id in tag_ids if tag_id not in self._tag_ids_to_tags_cache }
        
        if len( uncached_tag_ids ) == 0:
            
            return
            
        
        if len( uncached_tag_ids ) == 1:
            
            ( uncached_tag_id, ) = uncached_tag_ids
            
            rows = self._c.execute( 'SELECT tag_id, namespace, subtag FROM tags NATURAL JOIN namespaces NATURAL JOIN subtags WHERE tag_id = ?;', ( uncached_tag_id, ) ).fetchall()
            
        else:
            
            with HydrusDB.TemporaryIntegerTable( self._c, uncached_tag_ids, 'tag_id' ) as temp_table_name:
                
                # temp tag_ids to tags to subtags and namespaces
                rows = self._c.execute( 'SELECT tag_id, namespace, subtag FROM {} CROSS JOIN tags USING ( tag_id ) CROSS JOIN subtags USING ( subtag_id ) CROSS JOIN namespaces USING ( namespace_id );'.format( temp_table_name ) ).fetchall()
                
            
        
        uncached_tag_ids_to_tags = { tag_id : HydrusTags.CombineTag( namespace, subtag ) for ( tag_id, namespace, subtag ) in rows }
        
        if len( uncached_tag_ids_to_tags ) < len( uncached_tag_ids ):
            
            for tag_id in uncached_tag_ids:
                
                if tag_id in uncached_tag_ids_to_tags:
                    
                    continue
                    
                
                # no definition found for this id: invent a unique tag and write it under the same tag_id
                tag = 'unknown tag:' + HydrusData.GenerateKey().hex()
                
                ( namespace, subtag ) = HydrusTags.SplitTag( tag )
                
                namespace_id = self.GetNamespaceId( namespace )
                subtag_id = self.GetSubtagId( subtag )
                
                self._c.execute( 'REPLACE INTO tags ( tag_id, namespace_id, subtag_id ) VALUES ( ?, ?, ? );', ( tag_id, namespace_id, subtag_id ) )
                
                uncached_tag_ids_to_tags[ tag_id ] = tag
                
            
        
        self._tag_ids_to_tags_cache.update( uncached_tag_ids_to_tags )
        
    
    def CreateTables( self ):
        """Create the `namespaces`, `subtags` and `tags` tables if they do not exist."""
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.namespaces ( namespace_id INTEGER PRIMARY KEY, namespace TEXT UNIQUE );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.subtags ( subtag_id INTEGER PRIMARY KEY, subtag TEXT UNIQUE );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.tags ( tag_id INTEGER PRIMARY KEY, namespace_id INTEGER, subtag_id INTEGER );' )
        
    
    def GetExpectedTableNames( self ) -> typing.Collection[ str ]:
        """Return the names of the tables this module creates."""
        
        return [
            'external_master.namespaces',
            'external_master.subtags',
            'external_master.tags'
        ]
        
    
    def GetNamespaceId( self, namespace ):
        """Return the namespace_id for a namespace, inserting a new row if it is unknown.
        
        The id of the empty namespace is remembered in self.null_namespace_id after
        the first lookup.
        """
        
        if namespace == '':
            
            if self.null_namespace_id is None:
                
                result = self._c.execute( 'SELECT namespace_id FROM namespaces WHERE namespace = ?;', ( '', ) ).fetchone()
                
                if result is None:
                    
                    # Unpacking a missing row used to raise a TypeError here; create the
                    # empty-namespace row instead, exactly as is done for any other namespace.
                    self._c.execute( 'INSERT INTO namespaces ( namespace ) VALUES ( ? );', ( '', ) )
                    
                    self.null_namespace_id = self._c.lastrowid
                    
                else:
                    
                    ( self.null_namespace_id, ) = result
                    
                
            
            return self.null_namespace_id
            
        
        result = self._c.execute( 'SELECT namespace_id FROM namespaces WHERE namespace = ?;', ( namespace, ) ).fetchone()
        
        if result is None:
            
            self._c.execute( 'INSERT INTO namespaces ( namespace ) VALUES ( ? );', ( namespace, ) )
            
            return self._c.lastrowid
            
        
        ( namespace_id, ) = result
        
        return namespace_id
        
    
    def GetSubtagId( self, subtag ):
        """Return the subtag_id for a subtag, inserting a new row if it is unknown."""
        
        result = self._c.execute( 'SELECT subtag_id FROM subtags WHERE subtag = ?;', ( subtag, ) ).fetchone()
        
        if result is None:
            
            self._c.execute( 'INSERT INTO subtags ( subtag ) VALUES ( ? );', ( subtag, ) )
            
            return self._c.lastrowid
            
        
        ( subtag_id, ) = result
        
        return subtag_id
        
    
    def GetTagId( self, tag ):
        """Return the tag_id for a tag, inserting namespace/subtag/tag rows as needed.
        
        The tag is passed through HydrusTags.CleanTag before lookup.
        
        :raises HydrusExceptions.TagSizeException: if HydrusTags.CheckTagNotEmpty rejects the cleaned tag
        """
        
        clean_tag = HydrusTags.CleanTag( tag )
        
        try:
            
            HydrusTags.CheckTagNotEmpty( clean_tag )
            
        except HydrusExceptions.TagSizeException:
            
            # re-raise with a message that names the original, uncleaned tag
            raise HydrusExceptions.TagSizeException( '"{}" tag seems not valid--when cleaned, it ends up with zero size!'.format( tag ) )
            
        
        ( namespace, subtag ) = HydrusTags.SplitTag( clean_tag )
        
        namespace_id = self.GetNamespaceId( namespace )
        subtag_id = self.GetSubtagId( subtag )
        
        result = self._c.execute( 'SELECT tag_id FROM tags WHERE namespace_id = ? AND subtag_id = ?;', ( namespace_id, subtag_id ) ).fetchone()
        
        if result is None:
            
            self._c.execute( 'INSERT INTO tags ( namespace_id, subtag_id ) VALUES ( ?, ? );', ( namespace_id, subtag_id ) )
            
            return self._c.lastrowid
            
        
        ( tag_id, ) = result
        
        return tag_id
        
    
    def GetTagIdsToTags( self, tag_ids = None, tags = None ):
        """Return a tag_id -> tag dict built from either tag_ids or tags.
        
        tag_ids takes precedence if both are given. With tags, unknown tags are
        inserted via GetTagId. If neither is given, an empty dict is returned.
        """
        
        # default covers the 'neither argument given' case, which used to hit an
        # UnboundLocalError at the return
        tag_ids_to_tags = {}
        
        if tag_ids is not None:
            
            self._PopulateTagIdsToTagsCache( tag_ids )
            
            tag_ids_to_tags = { tag_id : self._tag_ids_to_tags_cache[ tag_id ] for tag_id in tag_ids }
            
        elif tags is not None:
            
            tag_ids_to_tags = { self.GetTagId( tag ) : tag for tag in tags }
            
        
        return tag_ids_to_tags
        
    

class ClientDBMasterURLs( HydrusDBModule.HydrusDBModule ):
    """Database module for the master `url_domains` and `urls` tables."""
    
    def __init__( self, cursor: sqlite3.Cursor ):
        """Initialise the module on the given cursor."""
        
        # This was 'client master', the same name ClientDBMasterTags used; give each
        # module its own name, following the 'client hashes master' / 'client texts master' pattern.
        HydrusDBModule.HydrusDBModule.__init__( self, 'client urls master', cursor )
        
    
    def _GetIndexGenerationTuples( self ):
        """Return the index definitions for this module's tables.
        
        Each tuple is ( table name, column list, flag ).
        NOTE(review): the flag is presumably 'unique' -- confirm against HydrusDBModule.
        """
        
        return [
            ( 'external_master.urls', [ 'domain_id' ], False )
        ]
        
    
    def CreateTables( self ):
        """Create the `url_domains` and `urls` tables if they do not exist."""
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.url_domains ( domain_id INTEGER PRIMARY KEY, domain TEXT UNIQUE );' )
        
        self._c.execute( 'CREATE TABLE IF NOT EXISTS external_master.urls ( url_id INTEGER PRIMARY KEY, domain_id INTEGER, url TEXT UNIQUE );' )
        
    
    def GetExpectedTableNames( self ) -> typing.Collection[ str ]:
        """Return the names of the tables this module creates."""
        
        return [
            'external_master.url_domains',
            'external_master.urls'
        ]
        
    
    def GetURLDomainId( self, domain ):
        """Return the domain_id for a domain, inserting a new row if it is unknown."""
        
        result = self._c.execute( 'SELECT domain_id FROM url_domains WHERE domain = ?;', ( domain, ) ).fetchone()
        
        if result is None:
            
            self._c.execute( 'INSERT INTO url_domains ( domain ) VALUES ( ? );', ( domain, ) )
            
            return self._c.lastrowid
            
        
        ( domain_id, ) = result
        
        return domain_id
        
    
    def GetURLDomainAndSubdomainIds( self, domain, only_www_subdomains = False ):
        """Return the set of domain_ids for a domain and its stored subdomains.
        
        The domain is first passed through ClientNetworkingDomain.RemoveWWWFromDomain
        and its own id is always included (the row is created if missing).
        Subdomains are found with a LIKE search: 'www%.<domain>' when
        only_www_subdomains is True, otherwise '%.<domain>'.
        """
        
        domain = ClientNetworkingDomain.RemoveWWWFromDomain( domain )
        
        domain_ids = { self.GetURLDomainId( domain ) }
        
        if only_www_subdomains:
            
            search_phrase = 'www%.{}'.format( domain )
            
        else:
            
            search_phrase = '%.{}'.format( domain )
            
        
        subdomain_rows = self._c.execute( 'SELECT domain_id FROM url_domains WHERE domain LIKE ?;', ( search_phrase, ) )
        
        domain_ids.update( domain_id for ( domain_id, ) in subdomain_rows )
        
        return domain_ids
        
    
    def GetURLId( self, url ):
        """Return the url_id for a url, inserting a new row if it is unknown.
        
        A new url is stored against the domain returned by
        ClientNetworkingDomain.ConvertURLIntoDomain, or 'unknown.com' if that
        raises URLClassException.
        """
        
        result = self._c.execute( 'SELECT url_id FROM urls WHERE url = ?;', ( url, ) ).fetchone()
        
        if result is not None:
            
            ( url_id, ) = result
            
            return url_id
            
        
        try:
            
            domain = ClientNetworkingDomain.ConvertURLIntoDomain( url )
            
        except HydrusExceptions.URLClassException:
            
            domain = 'unknown.com'
            
        
        domain_id = self.GetURLDomainId( domain )
        
        self._c.execute( 'INSERT INTO urls ( domain_id, url ) VALUES ( ?, ? );', ( domain_id, url ) )
        
        return self._c.lastrowid
        
    