# hydrus/include/HydrusTags.py

import collections
import HydrusConstants as HC
import itertools
import os
import threading
import time
import traceback
import HydrusData
import HydrusExceptions
import re
import HydrusGlobals

def CensorshipMatch( tag, censorship ):
    """
    Report whether a tag is covered by a censorship rule.
    
    The rule is read in one of four ways:
      ':'            - covers every tag that contains a ':' (all namespaced tags)
      ''             - covers every tag that contains no ':' (all non namespaced tags)
      contains a ':' - covers any tag that starts with the rule text
      anything else  - covers a tag whose text after its first ':' (or the whole
                       tag when it has no ':') equals the rule
    """
    
    if censorship == ':': return ':' in tag
    
    if censorship == '': return ':' not in tag
    
    if ':' in censorship: return tag.startswith( censorship )
    
    # 'table' - normal tag, or namespaced version of same
    subtag = tag.split( ':', 1 )[1] if ':' in tag else tag
    
    return subtag == censorship


def ConvertTagToSortable( t ):
    """
    Build a sort key for a tag so that a leading number compares by value.
    
    We want to maintain that:
      0 < 0a < 0b < 1 ( lexicographic comparison )
      -and-
      2 < 22 ( value comparison )
    So, if the first bit can be turned into an int, the tag is split into ( int, extra ).
    
    Returns ( int, str ) when the tag starts with digits that int() can parse,
    otherwise returns the tag itself unchanged.
    """
    
    # An empty tag has no first character to inspect, so it is its own key.
    if len( t ) == 0 or not t[0].isdigit(): return t
    
    prefix_length = 0
    
    for character in t:
        
        if not character.isdigit(): break
        
        prefix_length += 1
        
    
    try:
        
        return ( int( t[ : prefix_length ] ), t[ prefix_length : ] )
        
    except ValueError:
        
        # isdigit() accepts some characters (e.g. superscript digits) that int() rejects;
        # fall back to plain string ordering for those tags rather than failing the sort.
        return t

def FilterNamespaces( tags, namespaces ):
    """
    Return the set of tags whose namespace is one of the given namespaces.
    
    A tag's namespace is the text before its first ':'; a tag with no ':' is
    filed under ''. Asking for '' or None selects everything filed under ''.
    """
    
    tags_by_namespace = collections.defaultdict( set )
    
    for tag in tags:
        
        key = tag.split( ':', 1 )[0] if ':' in tag else ''
        
        tags_by_namespace[ key ].add( tag )
        
    
    matching_tags = set()
    
    for namespace in namespaces:
        
        if namespace in ( '', None ): matching_tags |= tags_by_namespace[ '' ]
        
        matching_tags |= tags_by_namespace[ namespace ]
        
    
    return matching_tags

def SortTags( tags ):
    """
    Return the given tags as a new list, ordered by their ConvertTagToSortable key.
    """
    
    return sorted( tags, key = ConvertTagToSortable )

def CheckTagNotEmpty( tag ):
    """
    Raise HydrusExceptions.SizeException when the tag carries no text.
    
    A tag counts as empty when it is '' or when everything after its first ':' is ''.
    Returns None otherwise.
    """
    
    if ':' in tag: subtag = tag.split( ':', 1 )[1]
    else: subtag = tag
    
    if subtag == '': raise HydrusExceptions.SizeException( 'Received a zero-length tag!' )

def CleanTag( tag ):
    """
    Normalise a raw tag and return the cleaned version.
    
    In order: truncates to length 1024, lowercases, passes the result through
    HydrusData.ToUnicode, removes carriage returns and line feeds, collapses
    each run of whitespace into one space, drops a trailing space, and strips
    every leading space, '-' or 'system:' prefix.
    
    Raises Exception, with the offending tag and the original error text in
    the message, if any of those steps fails.
    """
    
    try:
        
        tag = tag[:1024]
        
        tag = tag.lower()
        
        tag = HydrusData.ToUnicode( tag )
        
        # replace() returns a new string, so the result has to be assigned back
        # or the line breaks are never actually removed
        tag = tag.replace( '\r', '' )
        tag = tag.replace( '\n', '' )
        
        tag = re.sub( '[\\s]+', ' ', tag, flags = re.UNICODE ) # turns multiple spaces into single spaces
        
        tag = re.sub( '\\s\\Z', '', tag, flags = re.UNICODE ) # removes space at the end
        
        tag = re.sub( '\\A(\\s|-|system:)+', '', tag, flags = re.UNICODE ) # removes all spaces or garbage at the beginning in one pass
        
    except Exception as e:
        
        text = 'Was unable to parse the tag: ' + HydrusData.ToUnicode( tag )
        text += os.linesep * 2
        text += str( e )
        
        raise Exception( text )
        
    
    return tag

def CleanTags( tags ):
    """
    Run every tag through CleanTag and return the survivors as a set.
    
    A tag for which CheckTagNotEmpty raises HydrusExceptions.SizeException
    after cleaning is left out of the result.
    """
    
    surviving_tags = set()
    
    for raw_tag in tags:
        
        cleaned_tag = CleanTag( raw_tag )
        
        try:
            
            CheckTagNotEmpty( cleaned_tag )
            
        except HydrusExceptions.SizeException:
            
            pass
            
        else:
            
            surviving_tags.add( cleaned_tag )
            
        
    
    return surviving_tags