hydrus/hydrus/client/db/ClientDBURLMap.py

import re
import sqlite3
import typing

from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData

from hydrus.client.db import ClientDBMaster
from hydrus.client.db import ClientDBModule

class ClientDBURLMap( ClientDBModule.ClientDBModule ):
    
    def __init__( self, cursor: sqlite3.Cursor, modules_urls: ClientDBMaster.ClientDBMasterURLs ):
        """
        Set up the url mapping module.
        
        :param cursor: sqlite3 cursor that is handed on to the ClientDBModule base initialiser.
        :param modules_urls: the master urls module; the other methods of this class use it to turn urls and domains into ids.
        """
        
        # stored before the base initialiser is called
        # NOTE(review): presumably so anything the base initialiser triggers can already see it -- confirm before reordering
        self.modules_urls = modules_urls
        
        # 'client urls mapping' is the name this module passes to the base class
        ClientDBModule.ClientDBModule.__init__( self, 'client urls mapping', cursor )
        
    
    def _GetInitialIndexGenerationDict( self ) -> dict:
        """
        Describe the indices this module wants on its tables.
        
        The result maps a table name to a list of three-item tuples. Here there is one entry, for 'main.url_map':
        the column list [ 'url_id' ], the flag False, and the number 485.
        NOTE(review): the flag is presumably 'unique' and the number the version the index was added in -- confirm against ClientDBModule.
        
        :return: a fresh dict on every call.
        """
        
        # the primary key of url_map leads with hash_id, so url_id gets its own index
        url_map_indices = [ ( [ 'url_id' ], False, 485 ) ]
        
        return { 'main.url_map' : url_map_indices }
        
    
    def _GetInitialTableGenerationDict( self ) -> dict:
        """
        Describe the tables this module owns.
        
        The result maps a table name to a ( CREATE statement template, number ) pair. The template keeps a '{}'
        placeholder where the table name goes.
        NOTE(review): the number is presumably the version the table was added in -- confirm against ClientDBModule.
        
        :return: a fresh dict on every call.
        """
        
        # one row per ( file, url ) pair; the composite primary key keeps each pair unique
        create_url_map = 'CREATE TABLE IF NOT EXISTS {} ( hash_id INTEGER, url_id INTEGER, PRIMARY KEY ( hash_id, url_id ) );'
        
        table_generation_dict = {}
        
        table_generation_dict[ 'main.url_map' ] = ( create_url_map, 485 )
        
        return table_generation_dict
        
    
    def AddMapping( self, hash_id: int, url: str ):
        """
        Record that the given file has the given url.
        
        The url is turned into a url_id through modules_urls.GetURLId and the ( hash_id, url_id ) pair is inserted
        into url_map. A pair that is already present is left alone (INSERT OR IGNORE).
        
        :param hash_id: id of the file.
        :param url: the url text.
        """
        
        insert_sql = 'INSERT OR IGNORE INTO url_map ( hash_id, url_id ) VALUES ( ?, ? );'
        
        # the id lookup happens before the insert is issued
        row = ( hash_id, self.modules_urls.GetURLId( url ) )
        
        self._Execute( insert_sql, row )
        
    
    def DeleteMapping( self, hash_id: int, url: str ):
        """
        Remove the association between the given file and the given url.
        
        The url is turned into a url_id through modules_urls.GetURLId and the matching ( hash_id, url_id ) row, if
        any, is deleted from url_map.
        
        :param hash_id: id of the file.
        :param url: the url text.
        """
        
        delete_sql = 'DELETE FROM url_map WHERE hash_id = ? AND url_id = ?;'
        
        # the id lookup happens before the delete is issued
        row_key = ( hash_id, self.modules_urls.GetURLId( url ) )
        
        self._Execute( delete_sql, row_key )
        
    
    def GetHashIds( self, search_url: str ):
        """
        Fetch the hash_ids of the files mapped to exactly this url.
        
        :param search_url: url text, compared for equality against urls.url.
        :return: whatever self._STS builds from the result rows
            NOTE(review): presumably a set of hash_ids -- confirm against the base module.
        """
        
        # url_map and urls share the url_id column, which is what the NATURAL JOIN pairs on
        matching_rows = self._Execute( 'SELECT hash_id FROM url_map NATURAL JOIN urls WHERE url = ?;', ( search_url, ) )
        
        return self._STS( matching_rows )
        
    
    def GetHashIdsFromURLRule( self, rule_type, rule, hash_ids = None, hash_ids_table_name = None ):
        """
        Find the files that have at least one url satisfying the given rule.
        
        :param rule_type: selects how 'rule' is interpreted:
            'exact_match' -- rule is a url string, compared for equality.
            'url_class' or 'url_match' -- rule is a url class object; this method calls its GetDomain, MatchesSubdomains and Matches.
            'domain' -- rule is a domain string; its subdomains are included.
            anything else -- rule is handed to re.search as the pattern and tested against every candidate url.
        :param rule: the url, url class, domain or regex described above.
        :param hash_ids: optional collection of candidate hash_ids. Only its length is consulted here.
        :param hash_ids_table_name: optional name of a table holding the candidate hash_ids in a hash_id column.
        :return: a set of hash_ids for the url class and regex rules; for the exact match and domain rules, whatever
            self._STS builds from the rows (NOTE(review): presumably also a set -- confirm).
        
        The search is only narrowed to the candidate table when both hash_ids and hash_ids_table_name are given and there are
        fewer than 50000 candidates. Otherwise the query runs from the url side and the result can contain hash_ids outside the
        candidates, so callers that need the intersection must do it themselves.
        """
        
        # decided once up front; every branch below needs the same answer
        use_hash_ids_table = hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000
        
        if rule_type == 'exact_match':
            
            url = rule
            
            table_name = 'url_map NATURAL JOIN urls'
            
            if use_hash_ids_table:
                
                table_name += ' NATURAL JOIN {}'.format( hash_ids_table_name )
                
            
            select = 'SELECT hash_id FROM {} WHERE url = ?;'.format( table_name )
            
            return self._STS( self._Execute( select, ( url, ) ) )
            
        elif rule_type in ( 'url_class', 'url_match' ):
            
            url_class = rule
            
            domain = url_class.GetDomain()
            
            if url_class.MatchesSubdomains():
                
                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )
                
            else:
                
                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain, only_www_subdomains = True )
                
            
            result_hash_ids = set()
            
            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:
                
                # SQLite does not reorder tables joined with CROSS JOIN, so the join order written here is the order used
                if use_hash_ids_table:
                    
                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id );'.format( hash_ids_table_name, temp_domain_table_name )
                    
                else:
                    
                    # domains to urls to url map
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )
                    
                
                for ( hash_id, url ) in self._Execute( select ):
                    
                    # the membership test comes first so a file that already matched does not pay for another Matches call
                    # this is actually insufficient, as more detailed url classes may match
                    if hash_id not in result_hash_ids and url_class.Matches( url ):
                        
                        result_hash_ids.add( hash_id )
                        
                    
                
            
            return result_hash_ids
            
        elif rule_type == 'domain':
            
            # this used to be "rule_type in 'domain'", a substring test that also let '', 'd', 'main' and friends in here
            
            domain = rule
            
            # if we search for site.com, we also want artist.site.com or www.site.com or cdn2.site.com
            domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )
            
            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:
                
                if use_hash_ids_table:
                    
                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id )'.format( hash_ids_table_name, temp_domain_table_name )
                    
                else:
                    
                    # domains to urls to url map
                    select = 'SELECT hash_id FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )
                    
                
                # the rows are consumed while the temporary domain table still exists
                result_hash_ids = self._STS( self._Execute( select ) )
                
            
            return result_hash_ids
            
        else:
            
            # any rule type not handled above is treated as a regex search
            regex = rule
            
            if use_hash_ids_table:
                
                # if we aren't gonk mode with the number of files, temp hashes to url map to urls
                # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on _TOTAL_ url count
                select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name )
                
            else:
                
                # no usable candidate table, so every mapped url in the db is tested
                select = 'SELECT hash_id, url FROM url_map NATURAL JOIN urls;'
                
            
            result_hash_ids = set()
            
            for ( hash_id, url ) in self._Execute( select ):
                
                # skip the regex for files that have already matched on another url
                if hash_id not in result_hash_ids and re.search( regex, url ) is not None:
                    
                    result_hash_ids.add( hash_id )
                    
                
            
            return result_hash_ids
            
        
            
        
    
    def GetHashIdsToURLs( self, hash_ids_table_name = None ):
        """
        Gather the urls of every file listed in the given table.
        
        :param hash_ids_table_name: name of a table with a hash_id column. When None, nothing is queried.
        :return: an empty dict when no table name is given, otherwise whatever HydrusData.BuildKeyToSetDict builds from
            the ( hash_id, url ) rows (NOTE(review): presumably a dict of hash_id to set of urls -- confirm).
        """
        
        if hash_ids_table_name is None:
            
            return {}
            
        
        # start from the caller's table, then hop through url_map to the url text
        query = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name )
        
        hash_id_url_pairs = self._Execute( query )
        
        return HydrusData.BuildKeyToSetDict( hash_id_url_pairs )
    
    
    def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
        """
        List the ( table name, column name ) pairs in this module that hold ids for the given content type.
        
        :param content_type: compared against HC.CONTENT_TYPE_FILES; no other value is recognised here.
        :return: [ ( 'main.url_map', 'hash_id' ) ] for the files content type, otherwise an empty list. A new list is returned on every call.
        """
        
        # if content type is a domain, then give urls? bleh
        
        if content_type == HC.CONTENT_TYPE_FILES:
            
            return [ ( 'main.url_map', 'hash_id' ) ]
            
        
        return []
Version 485 2022-05-18 20:18:25 +00:00			`import re`
			`import sqlite3`
			`import typing`

			`from hydrus.core import HydrusConstants as HC`
			`from hydrus.core import HydrusData`

			`from hydrus.client.db import ClientDBMaster`
			`from hydrus.client.db import ClientDBModule`

			`class ClientDBURLMap( ClientDBModule.ClientDBModule ):`

			`def __init__( self, cursor: sqlite3.Cursor, modules_urls: ClientDBMaster.ClientDBMasterURLs ):`

			`self.modules_urls = modules_urls`

			`ClientDBModule.ClientDBModule.__init__( self, 'client urls mapping', cursor )`


			`def _GetInitialIndexGenerationDict( self ) -> dict:`

			`index_generation_dict = {}`

			`index_generation_dict[ 'main.url_map' ] = [`
			`( [ 'url_id' ], False, 485 )`
			`]`

			`return index_generation_dict`


			`def _GetInitialTableGenerationDict( self ) -> dict:`

			`return {`
			`'main.url_map' : ( 'CREATE TABLE IF NOT EXISTS {} ( hash_id INTEGER, url_id INTEGER, PRIMARY KEY ( hash_id, url_id ) );', 485 )`
			`}`


			`def AddMapping( self, hash_id: int, url: str ):`

			`url_id = self.modules_urls.GetURLId( url )`

			`self._Execute( 'INSERT OR IGNORE INTO url_map ( hash_id, url_id ) VALUES ( ?, ? );', ( hash_id, url_id ) )`


			`def DeleteMapping( self, hash_id: int, url: str ):`

			`url_id = self.modules_urls.GetURLId( url )`

			`self._Execute( 'DELETE FROM url_map WHERE hash_id = ? AND url_id = ?;', ( hash_id, url_id ) )`


			`def GetHashIds( self, search_url: str ):`

			`hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM url_map NATURAL JOIN urls WHERE url = ?;', ( search_url, ) ) )`

			`return hash_ids`


			`def GetHashIdsFromURLRule( self, rule_type, rule, hash_ids = None, hash_ids_table_name = None ):`

			`if rule_type == 'exact_match':`

			`url = rule`

			`table_name = 'url_map NATURAL JOIN urls'`

			`if hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000:`

			`table_name += ' NATURAL JOIN {}'.format( hash_ids_table_name )`


			`select = 'SELECT hash_id FROM {} WHERE url = ?;'.format( table_name )`

			`result_hash_ids = self._STS( self._Execute( select, ( url, ) ) )`

			`return result_hash_ids`

			`elif rule_type in ( 'url_class', 'url_match' ):`

			`url_class = rule`

			`domain = url_class.GetDomain()`

			`if url_class.MatchesSubdomains():`

			`domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )`

			`else:`

			`domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain, only_www_subdomains = True )`


			`result_hash_ids = set()`

			`with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:`

			`if hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000:`

			`# if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains`
			`# next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count`
			`select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id );'.format( hash_ids_table_name, temp_domain_table_name )`

			`else:`

			`# domains to urls to url map`
			`select = 'SELECT hash_id, url FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )`


			`for ( hash_id, url ) in self._Execute( select ):`

			`# this is actually insufficient, as more detailed url classes may match`
			`if hash_id not in result_hash_ids and url_class.Matches( url ):`

			`result_hash_ids.add( hash_id )`




			`return result_hash_ids`

			`elif rule_type in 'domain':`

			`domain = rule`

			`# if we search for site.com, we also want artist.site.com or www.site.com or cdn2.site.com`
			`domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )`

			`result_hash_ids = set()`

			`with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:`

			`if hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000:`

			`# if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains`
			`# next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count`
			`select = 'SELECT hash_id FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id )'.format( hash_ids_table_name, temp_domain_table_name )`

			`else:`

			`# domains to urls to url map`
			`select = 'SELECT hash_id FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )`


			`result_hash_ids = self._STS( self._Execute( select ) )`


			`return result_hash_ids`

			`else:`

			`regex = rule`

			`if hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000:`

			`# if we aren't gonk mode with the number of files, temp hashes to url map to urls`
			`# next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on _TOTAL_ url count`
			`select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name )`

			`else:`

			`select = 'SELECT hash_id, url FROM url_map NATURAL JOIN urls;'`


			`result_hash_ids = set()`

			`for ( hash_id, url ) in self._Execute( select ):`

			`if hash_id not in result_hash_ids and re.search( regex, url ) is not None:`

			`result_hash_ids.add( hash_id )`



			`return result_hash_ids`



			`def GetHashIdsToURLs( self, hash_ids_table_name = None ):`

			`hash_ids_to_urls = {}`

			`if hash_ids_table_name is not None:`

			`hash_ids_to_urls = HydrusData.BuildKeyToSetDict( self._Execute( 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name ) ) )`


			`return hash_ids_to_urls`


			`def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:`

			`# if content type is a domain, then give urls? bleh`

			`tables_and_columns = []`

Version 500 closes #1239, closes #1235 2022-09-28 17:15:23 +00:00			`if content_type == HC.CONTENT_TYPE_FILES:`
Version 485 2022-05-18 20:18:25 +00:00
			`tables_and_columns.append( ( 'main.url_map', 'hash_id' ) )`


			`return tables_and_columns`