# hydrus/hydrus/client/db/ClientDBURLMap.py
#
# 204 lines
# 7.9 KiB
# Python

import re
import sqlite3
import typing
from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData
from hydrus.client.db import ClientDBMaster
from hydrus.client.db import ClientDBModule
class ClientDBURLMap( ClientDBModule.ClientDBModule ):
    """Database module for the file <-> URL mapping table.
    
    Owns 'main.url_map' ( hash_id, url_id ) and answers "which files have a URL like X?" queries by
    joining that table against the 'urls' table. URL text and domains are resolved to ids through
    the modules_urls object supplied at construction.
    """
    
    def __init__( self, cursor: sqlite3.Cursor, modules_urls: ClientDBMaster.ClientDBMasterURLs ):
        """Store the URL definitions module and initialise the base module.
        
        :param cursor: cursor handed through to the ClientDBModule base class.
        :param modules_urls: provides GetURLId and GetURLDomainAndSubdomainIds for this module.
        """
        
        # set before the base __init__ runs, in case base initialisation needs it
        self.modules_urls = modules_urls
        
        ClientDBModule.ClientDBModule.__init__( self, 'client urls mapping', cursor )
        
    
    def _GetInitialIndexGenerationDict( self ) -> dict:
        """Return the index definitions for this module's tables, keyed by table name.
        
        The tuple layout is defined by ClientDBModule; presumably ( columns, unique, version ) -- confirm
        against the base class.
        """
        
        return {
            'main.url_map' : [
                ( [ 'url_id' ], False, 485 )
            ]
        }
        
    
    def _GetInitialTableGenerationDict( self ) -> dict:
        """Return the CREATE TABLE templates for this module's tables, keyed by table name.
        
        The '{}' placeholder is filled in by the consumer of this dict; the integer alongside each
        statement is presumably the version the table was introduced in -- confirm against ClientDBModule.
        """
        
        return {
            'main.url_map' : ( 'CREATE TABLE IF NOT EXISTS {} ( hash_id INTEGER, url_id INTEGER, PRIMARY KEY ( hash_id, url_id ) );', 485 )
        }
        
    
    def AddMapping( self, hash_id: int, url: str ):
        """Associate url with the file hash_id. An already-present pair is left alone (INSERT OR IGNORE)."""
        
        url_id = self.modules_urls.GetURLId( url )
        
        self._Execute( 'INSERT OR IGNORE INTO url_map ( hash_id, url_id ) VALUES ( ?, ? );', ( hash_id, url_id ) )
        
    
    def DeleteMapping( self, hash_id: int, url: str ):
        """Remove the association between url and the file hash_id, if there is one."""
        
        url_id = self.modules_urls.GetURLId( url )
        
        self._Execute( 'DELETE FROM url_map WHERE hash_id = ? AND url_id = ?;', ( hash_id, url_id ) )
        
    
    def GetHashIds( self, search_url: str ):
        """Return the hash_ids mapped to exactly search_url, as collected by self._STS."""
        
        return self._STS( self._Execute( 'SELECT hash_id FROM url_map NATURAL JOIN urls WHERE url = ?;', ( search_url, ) ) )
        
    
    def GetHashIdsFromURLRule( self, rule_type, rule, hash_ids = None, hash_ids_table_name = None ):
        """Return the hash_ids of files that have at least one URL satisfying a search rule.
        
        :param rule_type: selects how rule is interpreted:
            'exact_match' -- rule is a URL string compared for equality.
            'url_class' / 'url_match' -- rule is a URL class object; this method calls its GetDomain(),
                MatchesSubdomains() and Matches( url ).
            'domain' -- rule is a domain; its subdomains are included.
            anything else -- rule is treated as a regex and applied with re.search.
        :param rule: the URL string, URL class object, domain or regex, per rule_type.
        :param hash_ids: optional collection of candidate hash_ids; only its length is read here.
        :param hash_ids_table_name: optional name of a table holding those candidates. It is only
            joined against when both it and hash_ids are given and hash_ids is small.
        :return: the matching hash_ids -- a set for the url class and regex rules, otherwise whatever
            self._STS produces.
        """
        
        # Past this many candidate files, driving the join from the candidate table is not expected to
        # pay off, so the search starts from the url/domain side instead.
        max_hash_ids_for_table_join = 50000
        
        search_from_hash_ids = (
            hash_ids_table_name is not None
            and hash_ids is not None
            and len( hash_ids ) < max_hash_ids_for_table_join
        )
        
        if rule_type == 'exact_match':
            
            url = rule
            
            table_name = 'url_map NATURAL JOIN urls'
            
            if search_from_hash_ids:
                
                table_name += ' NATURAL JOIN {}'.format( hash_ids_table_name )
                
            
            select = 'SELECT hash_id FROM {} WHERE url = ?;'.format( table_name )
            
            result_hash_ids = self._STS( self._Execute( select, ( url, ) ) )
            
            return result_hash_ids
            
        elif rule_type in ( 'url_class', 'url_match' ):
            
            url_class = rule
            
            domain = url_class.GetDomain()
            
            if url_class.MatchesSubdomains():
                
                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )
                
            else:
                
                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain, only_www_subdomains = True )
                
            
            result_hash_ids = set()
            
            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:
                
                if search_from_hash_ids:
                    
                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id );'.format( hash_ids_table_name, temp_domain_table_name )
                    
                else:
                    
                    # domains to urls to url map
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )
                    
                
                # the domain join only narrows the candidates; the url class itself decides each url
                for ( hash_id, url ) in self._Execute( select ):
                    
                    # this is actually insufficient, as more detailed url classes may match
                    # (the membership check first skips the Matches call for files already accepted)
                    if hash_id not in result_hash_ids and url_class.Matches( url ):
                        
                        result_hash_ids.add( hash_id )
                        
                    
                
            
            return result_hash_ids
            
        elif rule_type == 'domain':
            
            # This must be an equality test. The previous "rule_type in 'domain'" was a substring test on a
            # string, so '' or 'main' selected this branch and a non-string rule_type raised TypeError.
            
            domain = rule
            
            # if we search for site.com, we also want artist.site.com or www.site.com or cdn2.site.com
            domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )
            
            result_hash_ids = set()
            
            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:
                
                if search_from_hash_ids:
                    
                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id )'.format( hash_ids_table_name, temp_domain_table_name )
                    
                else:
                    
                    # domains to urls to url map
                    select = 'SELECT hash_id FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )
                    
                
                # read the rows while the temporary domain table still exists
                result_hash_ids = self._STS( self._Execute( select ) )
                
            
            return result_hash_ids
            
        else:
            
            # any other rule_type falls through to a regex search over url text
            regex = rule
            
            if search_from_hash_ids:
                
                # if we aren't gonk mode with the number of files, temp hashes to url map to urls
                # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on _TOTAL_ url count
                select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name )
                
            else:
                
                select = 'SELECT hash_id, url FROM url_map NATURAL JOIN urls;'
                
            
            result_hash_ids = set()
            
            for ( hash_id, url ) in self._Execute( select ):
                
                # membership check first, so a file that already matched is not regex-tested again
                if hash_id not in result_hash_ids and re.search( regex, url ) is not None:
                    
                    result_hash_ids.add( hash_id )
                    
                
            
            return result_hash_ids
            
        
    
    def GetHashIdsToURLs( self, hash_ids_table_name = None ):
        """Return a hash_id -> urls mapping for every file listed in hash_ids_table_name.
        
        :param hash_ids_table_name: name of a table with a hash_id column. When None, nothing is queried
            and an empty dict is returned.
        :return: the result of HydrusData.BuildKeyToSetDict over the ( hash_id, url ) rows, or {}.
        """
        
        if hash_ids_table_name is None:
            
            return {}
            
        
        select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name )
        
        return HydrusData.BuildKeyToSetDict( self._Execute( select ) )
        
    
    def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
        """Return the ( table, column ) pairs in this module that store ids of the given content type.
        
        Only HC.CONTENT_TYPE_FILES is recognised, which yields url_map's hash_id column; every other
        content type yields an empty list.
        """
        
        # if content type is a domain, then give urls? bleh
        
        tables_and_columns = []
        
        if content_type == HC.CONTENT_TYPE_FILES:
            
            tables_and_columns.append( ( 'main.url_map', 'hash_id' ) )
            
        
        return tables_and_columns
        
    