# hydrus/hydrus/client/db/ClientDBURLMap.py

import re
import sqlite3
import typing

from hydrus.core import HydrusConstants as HC
from hydrus.core import HydrusData

from hydrus.client.db import ClientDBMaster
from hydrus.client.db import ClientDBModule

class ClientDBURLMap( ClientDBModule.ClientDBModule ):

    def __init__( self, cursor: sqlite3.Cursor, modules_urls: ClientDBMaster.ClientDBMasterURLs ):
        """
        Set up the url mapping module.
        
        cursor is passed on to the base module initialiser. modules_urls is the master urls module,
        which this module uses to turn url strings into url_ids and to look up domain ids.
        """
        
        # the reference is stored before the base initialiser runs, same as it always has been
        self.modules_urls = modules_urls
        
        super().__init__( 'client urls mapping', cursor )


    def _GetInitialIndexGenerationDict( self ) -> dict:

        index_generation_dict = {}

        index_generation_dict[ 'main.url_map' ] = [
            ( [ 'url_id' ], False, 485 )
        ]

        return index_generation_dict


    def _GetInitialTableGenerationDict( self ) -> dict:

        return {
            'main.url_map' : ( 'CREATE TABLE IF NOT EXISTS {} ( hash_id INTEGER, url_id INTEGER, PRIMARY KEY ( hash_id, url_id ) );', 485 )
        }


    def AddMapping( self, hash_id: int, url: str ):

        url_id = self.modules_urls.GetURLId( url )

        self._Execute( 'INSERT OR IGNORE INTO url_map ( hash_id, url_id ) VALUES ( ?, ? );', ( hash_id, url_id ) )


    def DeleteMapping( self, hash_id: int, url: str ):

        url_id = self.modules_urls.GetURLId( url )

        self._Execute( 'DELETE FROM url_map WHERE hash_id = ? AND url_id = ?;', ( hash_id, url_id ) )


    def GetHashIds( self, search_url: str ):

        hash_ids = self._STS( self._Execute( 'SELECT hash_id FROM url_map NATURAL JOIN urls WHERE url = ?;', ( search_url, ) ) )

        return hash_ids


    def GetHashIdsFromURLRule( self, rule_type, rule, hash_ids = None, hash_ids_table_name = None ):

        if rule_type == 'exact_match':

            url = rule

            table_name = 'url_map NATURAL JOIN urls'

            if hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000:

                table_name += ' NATURAL JOIN {}'.format( hash_ids_table_name )


            select = 'SELECT hash_id FROM {} WHERE url = ?;'.format( table_name )

            result_hash_ids = self._STS( self._Execute( select, ( url, ) ) )

            return result_hash_ids

        elif rule_type in ( 'url_class', 'url_match' ):

            url_class = rule

            domain = url_class.GetDomain()

            if url_class.MatchesSubdomains():

                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )

            else:

                domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain, only_www_subdomains = True )


            result_hash_ids = set()

            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:

                if hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000:

                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id );'.format( hash_ids_table_name, temp_domain_table_name )

                else:

                    # domains to urls to url map
                    select = 'SELECT hash_id, url FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )


                for ( hash_id, url ) in self._Execute( select ):

                    # this is actually insufficient, as more detailed url classes may match
                    if hash_id not in result_hash_ids and url_class.Matches( url ):

                        result_hash_ids.add( hash_id )


            return result_hash_ids

        elif rule_type in 'domain':

            domain = rule

            # if we search for site.com, we also want artist.site.com or www.site.com or cdn2.site.com
            domain_ids = self.modules_urls.GetURLDomainAndSubdomainIds( domain )

            result_hash_ids = set()

            with self._MakeTemporaryIntegerTable( domain_ids, 'domain_id' ) as temp_domain_table_name:

                if hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000:

                    # if we aren't gonk mode with the number of files, temp hashes to url map to urls to domains
                    # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on url domain count
                    select = 'SELECT hash_id FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id ) CROSS JOIN {} USING ( domain_id )'.format( hash_ids_table_name, temp_domain_table_name )

                else:

                    # domains to urls to url map
                    select = 'SELECT hash_id FROM {} CROSS JOIN urls USING ( domain_id ) CROSS JOIN url_map USING ( url_id );'.format( temp_domain_table_name )


                result_hash_ids = self._STS( self._Execute( select ) )


            return result_hash_ids

        else:

            regex = rule

            if hash_ids_table_name is not None and hash_ids is not None and len( hash_ids ) < 50000:

                # if we aren't gonk mode with the number of files, temp hashes to url map to urls
                # next step here is irl profiling and a domain->url_count cache so I can decide whether to do this or not based on _TOTAL_ url count
                select = 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name )

            else:

                select = 'SELECT hash_id, url FROM url_map NATURAL JOIN urls;'


            result_hash_ids = set()

            for ( hash_id, url ) in self._Execute( select ):

                if hash_id not in result_hash_ids and re.search( regex, url ) is not None:

                    result_hash_ids.add( hash_id )


            return result_hash_ids


    def GetHashIdsToURLs( self, hash_ids_table_name = None ):

        hash_ids_to_urls = {}

        if hash_ids_table_name is not None:

            hash_ids_to_urls = HydrusData.BuildKeyToSetDict( self._Execute( 'SELECT hash_id, url FROM {} CROSS JOIN url_map USING ( hash_id ) CROSS JOIN urls USING ( url_id );'.format( hash_ids_table_name ) ) )


        return hash_ids_to_urls


    def GetTablesAndColumnsThatUseDefinitions( self, content_type: int ) -> typing.List[ typing.Tuple[ str, str ] ]:
        """
        List the ( table name, column name ) pairs in this module that hold ids of the given content type.
        
        For HC.CONTENT_TYPE_FILES this is the hash_id column of main.url_map. Every other content
        type gets an empty list.
        """
        
        # if content type is a domain, then give urls? bleh
        
        if content_type == HC.CONTENT_TYPE_FILES:
            
            return [ ( 'main.url_map', 'hash_id' ) ]
            
        
        return []