hydrus/include/HydrusTags.py

import collections
import HydrusConstants as HC
import itertools
import os
import threading
import time
import traceback
import HydrusData
import HydrusExceptions
import re
import HydrusGlobals as HG

re_newlines = re.compile( '[\r\n]', re.UNICODE )
re_multiple_spaces = re.compile( '\\s+', re.UNICODE )
re_trailing_space = re.compile( '\\s$', re.UNICODE )
re_leading_space_or_garbage = re.compile( '^(\\s|-|system:)', re.UNICODE )
re_leading_single_colon = re.compile( '^:(?!:)', re.UNICODE )

def CensorshipMatch( tag, censorships ):

    for censorship in censorships:

        if censorship == '': # '' - all non namespaced tags

            ( namespace, subtag ) = SplitTag( tag )

            if namespace == '':

                return True


        elif censorship == ':': # ':' - all namespaced tags

            ( namespace, subtag ) = SplitTag( tag )

            if namespace != '':

                return True


        elif ':' in censorship:

            if censorship.endswith( ':' ): # 'series:' - namespaced tags

                ( namespace, subtag ) = SplitTag( tag )

                if namespace == censorship[:-1]:

                    return True


            else: # 'series:evangelion' - exact match with namespace

                if tag == censorship:

                    return True


        else:

            # 'table' - normal tag, or namespaced version of same

            ( namespace, subtag ) = SplitTag( tag )

            if subtag == censorship:

                return True


    return False

def ConvertTagToSortable( t ):

    if len( t ) > 0 and t[0].isdecimal():

        # We want to maintain that:
        # 0 < 0a < 0b < 1 ( lexicographic comparison )
        # -and-
        # 2 < 22 ( value comparison )
        # So, if the first bit can be turned into an int, split it into ( int, extra )

        int_component = ''

        i = 0

        for character in t:

            if character.isdecimal(): int_component += character
            else: break

            i += 1


        str_component = t[i:]

        number = int( int_component )


        return ( number, str_component )

    else:

        return t


def FilterNamespaces( tags, namespaces ):

    processed_tags = collections.defaultdict( set )

    for tag in tags:

        ( namespace, subtag ) = SplitTag( tag )

        processed_tags[ namespace ].add( tag )


    result = set()

    for namespace in namespaces:

        if namespace == None:

            result.update( processed_tags[ '' ] )

        else:

            result.update( processed_tags[ namespace ] )


    return result

def SortNumericTags( tags ):

    tags = list( tags )

    tags.sort( key = ConvertTagToSortable )

    return tags

def CheckTagNotEmpty( tag ):

    ( namespace, subtag ) = SplitTag( tag )

    if subtag == '':

        raise HydrusExceptions.SizeException( 'Received a zero-length tag!' )


def CleanTag( tag ):

    try:

        tag = tag[:1024]

        tag = tag.lower()

        tag = HydrusData.ToUnicode( tag )

        if tag.startswith( ':' ):

            tag = re_leading_single_colon.sub( '::', tag ) # Convert anything starting with one colon to start with two i.e. :D -> ::D

            tag = StripTextOfGumpf( tag )

        elif ':' in tag:

            tag = StripTextOfGumpf( tag ) # need to repeat here to catch 'system:' stuff

            ( namespace, subtag ) = SplitTag( tag )

            namespace = StripTextOfGumpf( namespace )
            subtag = StripTextOfGumpf( subtag )

            tag = CombineTag( namespace, subtag )

        else:

            tag = StripTextOfGumpf( tag )


    except Exception as e:

        text = 'Was unable to parse the tag: ' + HydrusData.ToUnicode( tag )
        text += os.linesep * 2
        text += HydrusData.ToUnicode( e )

        raise Exception( text )


    return tag

def CleanTags( tags ):

    clean_tags = set()

    for tag in tags:

        tag = CleanTag( tag )

        try: CheckTagNotEmpty( tag )
        except HydrusExceptions.SizeException: continue

        clean_tags.add( tag )


    return clean_tags

def CombineTag( namespace, subtag ):

    if namespace == '':

        if subtag.startswith( ':' ):

            return ':' + subtag

        else:

            return subtag


    else:

        return namespace + ':' + subtag


def SplitTag( tag ):

    if ':' in tag:

        return tag.split( ':', 1 )

    else:

        return ( '', tag )


def StripTextOfGumpf( t ):

    t = re_newlines.sub( '', t )

    t = re_multiple_spaces.sub( ' ', t )

    t = re_trailing_space.sub( '', t )

    while re_leading_space_or_garbage.search( t ) is not None:

        t = re_leading_space_or_garbage.sub( '', t )


    return t