hydrus/include/HydrusTags.py

import collections
import HydrusConstants as HC
import itertools
import os
import threading
import time
import traceback
import HydrusData
import HydrusExceptions
import re
import HydrusGlobals as HG

# Compiled patterns used by the tag-cleaning functions in this module.

# A single carriage return or line feed character.
re_newlines = re.compile( '[\r\n]', re.UNICODE )
# A run of one or more whitespace characters.
re_multiple_spaces = re.compile( '\\s+', re.UNICODE )
# One whitespace character at the end of the text.
re_trailing_space = re.compile( '\\s$', re.UNICODE )
# One whitespace character, one hyphen, or the literal text 'system:' at the very start of the text.
re_leading_space_or_garbage = re.compile( '^(\\s|-|system:)', re.UNICODE )
# A colon at the very start of the text that is not immediately followed by a second colon.
re_leading_single_colon = re.compile( '^:(?!:)', re.UNICODE )

def CensorshipMatch( tag, censorships ):
    """
    Return True if tag is matched by at least one of the given censorship rules.
    
    Each rule in censorships is a string in one of these forms:
    
    ''                  - matches every tag that has no namespace
    ':'                 - matches every tag that has a namespace
    'series:'           - matches every tag whose namespace is 'series'
    'series:evangelion' - matches only that exact tag
    'table'             - matches any tag whose subtag is 'table', namespaced or not
    
    Returns False if no rule matches, including when censorships is empty.
    """
    
    # The split depends only on the tag, so do it once instead of once per rule.
    ( namespace, subtag ) = SplitTag( tag )
    
    for censorship in censorships:
        
        if censorship == '': # '' - all non namespaced tags
            
            matched = namespace == ''
            
        elif censorship == ':': # ':' - all namespaced tags
            
            matched = namespace != ''
            
        elif ':' in censorship:
            
            if censorship.endswith( ':' ): # 'series:' - namespaced tags
                
                matched = namespace == censorship[:-1]
                
            else: # 'series:evangelion' - exact match with namespace
                
                matched = tag == censorship
                
            
        else: # 'table' - normal tag, or namespaced version of same
            
            matched = subtag == censorship
            
        
        if matched:
            
            return True
            
        
    
    return False
    
def ConvertTagToSortable( t ):
    """
    Build a sort key for a tag.
    
    A tag that starts with a decimal digit becomes ( leading_number, remaining_text ), so that
    numbers compare by value ( 2 < 22 ) and ties fall back to the text that follows
    ( 0 < 0a < 0b < 1 ). Any other tag, including the empty string, is returned unchanged.
    """
    
    if len( t ) == 0 or not t[0].isdecimal():
        
        return t
        
    
    # Gather the unbroken run of decimal characters at the front of the tag.
    leading_digits = ''.join( itertools.takewhile( lambda character: character.isdecimal(), t ) )
    
    remainder = t[ len( leading_digits ) : ]
    
    return ( int( leading_digits ), remainder )
        

def FilterNamespaces( tags, namespaces ):
    """
    Return the set of tags whose namespace is one of the given namespaces.
    
    tags is an iterable of tag strings. namespaces is an iterable of namespace strings, where
    None is accepted as another way of asking for tags with no namespace ( namespace '' ).
    """
    
    tags_by_namespace = collections.defaultdict( set )
    
    for tag in tags:
        
        ( namespace, subtag ) = SplitTag( tag )
        
        tags_by_namespace[ namespace ].add( tag )
        
    
    result = set()
    
    for namespace in namespaces:
        
        # None is a singleton, so test it by identity rather than equality.
        wanted = '' if namespace is None else namespace
        
        # .get keeps a lookup for an absent namespace from adding an empty entry to the defaultdict.
        result.update( tags_by_namespace.get( wanted, () ) )
        
    
    return result
    
def SortNumericTags( tags ):
    """
    Return a new list of the given tags, ordered by their ConvertTagToSortable key.
    
    The iterable passed in is not modified.
    """
    
    return sorted( tags, key = ConvertTagToSortable )
    
def CheckTagNotEmpty( tag ):
    """
    Raise HydrusExceptions.SizeException if the subtag part of tag is the empty string.
    
    Returns None when the subtag is not empty.
    """
    
    subtag = SplitTag( tag )[1]
    
    if subtag != '':
        
        return
        
    
    raise HydrusExceptions.SizeException( 'Received a zero-length tag!' )
        
    
def CleanTag( tag ):
    """
    Normalise a single raw tag and return the cleaned text.
    
    The tag is cut to its first 1024 characters, lower-cased and passed through
    HydrusData.ToUnicode. What happens next depends on its shape:
    
    - starts with ':'  - a lone leading colon is doubled ( ':D' -> '::D' ), then the tag is stripped
    - contains ':'     - the tag is stripped, split into namespace and subtag, each part is
                         stripped separately, and the parts are recombined
    - anything else    - the tag is stripped
    
    The result can be an empty string; this function does not reject empty tags.
    
    Raises Exception, with a message holding the tag and the original error text, if any step fails.
    """
    
    try:
        
        if isinstance( tag, bytes ):
            
            # Convert byte strings to text before slicing and lower-casing. Otherwise the 1024
            # limit counts bytes and can cut through a multi-byte character, and lower() only
            # affects ASCII letters.
            # NOTE(review): assumes HydrusData.ToUnicode decodes byte strings to text - confirm.
            tag = HydrusData.ToUnicode( tag )
            
        
        tag = tag[:1024]
        
        tag = tag.lower()
        
        tag = HydrusData.ToUnicode( tag )
        
        if tag.startswith( ':' ):
            
            tag = re_leading_single_colon.sub( '::', tag ) # Convert anything starting with one colon to start with two i.e. :D -> ::D
            
            tag = StripTextOfGumpf( tag )
            
        elif ':' in tag:
            
            tag = StripTextOfGumpf( tag ) # need to repeat here to catch 'system:' stuff
            
            ( namespace, subtag ) = SplitTag( tag )
            
            namespace = StripTextOfGumpf( namespace )
            subtag = StripTextOfGumpf( subtag )
            
            tag = CombineTag( namespace, subtag )
            
        else:
            
            tag = StripTextOfGumpf( tag )
            
        
    except Exception as e:
        
        # Report the tag as it stood when the failure happened, followed by the original error.
        text = 'Was unable to parse the tag: ' + HydrusData.ToUnicode( tag )
        text += os.linesep * 2
        text += HydrusData.ToUnicode( e )
        
        raise Exception( text )
        
    
    return tag

def CleanTags( tags ):
    """
    Clean every tag in tags with CleanTag and return the results as a set.
    
    Tags that CheckTagNotEmpty rejects after cleaning are left out of the result.
    """
    
    cleaned = set()
    
    for raw_tag in tags:
        
        candidate = CleanTag( raw_tag )
        
        try:
            
            CheckTagNotEmpty( candidate )
            
        except HydrusExceptions.SizeException:
            
            # Nothing usable is left of this tag, so skip it.
            pass
            
        else:
            
            cleaned.add( candidate )
            
        
    
    return cleaned
    
def CombineTag( namespace, subtag ):
    """
    Join a namespace and a subtag into one tag string.
    
    With a non-empty namespace the result is 'namespace:subtag'. With an empty namespace the
    result is the subtag alone, except that a subtag starting with ':' gets one more ':' put
    in front of it.
    """
    
    if namespace != '':
        
        return namespace + ':' + subtag
        
    
    if subtag.startswith( ':' ):
        
        return ':' + subtag
        
    
    return subtag
        
    
def SplitTag( tag ):
    """
    Split a tag into ( namespace, subtag ) at its first colon.
    
    A tag with no colon gets the empty string as its namespace. Only the first colon
    separates the two parts, so any later colons stay in the subtag.
    
    Always returns a 2-tuple, whichever branch is taken.
    """
    
    if ':' in tag:
        
        # str.split gives a list; convert it so both branches return the same type.
        return tuple( tag.split( ':', 1 ) )
        
    else:
        
        return ( '', tag )
        
    
def StripTextOfGumpf( t ):
    """
    Tidy the whitespace in t and remove unwanted text from its start.
    
    Carriage returns and line feeds are deleted, every run of whitespace becomes a single
    space, and one trailing whitespace character is dropped. Leading whitespace, hyphens and
    'system:' prefixes are then removed one after another until none is left at the start.
    """
    
    without_newlines = re_newlines.sub( '', t )
    
    collapsed = re_multiple_spaces.sub( ' ', without_newlines )
    
    t = re_trailing_space.sub( '', collapsed )
    
    # The pattern is anchored to the start of the text, so remove one prefix per pass.
    leading = re_leading_space_or_garbage.match( t )
    
    while leading is not None:
        
        t = t[ leading.end() : ]
        
        leading = re_leading_space_or_garbage.match( t )
        
    
    return t
Version 77 2013-07-17 20:56:13 +00:00			`import collections`
			`import HydrusConstants as HC`
			`import itertools`
			`import os`
			`import threading`
			`import time`
			`import traceback`
Version 151 2015-03-25 22:04:19 +00:00			`import HydrusData`
			`import HydrusExceptions`
			`import re`
Version 255 2017-05-10 21:33:58 +00:00			`import HydrusGlobals as HG`
Version 77 2013-07-17 20:56:13 +00:00
Version 250 2017-04-05 21:16:40 +00:00			`re_newlines = re.compile( '[\r\n]', re.UNICODE )`
			`re_multiple_spaces = re.compile( '\\s+', re.UNICODE )`
			`re_trailing_space = re.compile( '\\s$', re.UNICODE )`
			`re_leading_space_or_garbage = re.compile( '^(\\s\|-\|system:)', re.UNICODE )`
			`re_leading_single_colon = re.compile( '^:(?!:)', re.UNICODE )`

Version 200 2016-04-06 19:52:45 +00:00			`def CensorshipMatch( tag, censorships ):`
Version 107 2014-03-12 22:08:23 +00:00
Version 200 2016-04-06 19:52:45 +00:00			`for censorship in censorships:`
Version 107 2014-03-12 22:08:23 +00:00
Version 200 2016-04-06 19:52:45 +00:00			`if censorship == '': # '' - all non namespaced tags`

Version 244 2017-02-08 22:27:00 +00:00			`( namespace, subtag ) = SplitTag( tag )`

			`if namespace == '':`
Version 200 2016-04-06 19:52:45 +00:00
			`return True`


			`elif censorship == ':': # ':' - all namespaced tags`

Version 244 2017-02-08 22:27:00 +00:00			`( namespace, subtag ) = SplitTag( tag )`

			`if namespace != '':`
Version 200 2016-04-06 19:52:45 +00:00
			`return True`

Version 107 2014-03-12 22:08:23 +00:00
Version 200 2016-04-06 19:52:45 +00:00			`elif ':' in censorship:`
Version 107 2014-03-12 22:08:23 +00:00
Version 200 2016-04-06 19:52:45 +00:00			`if censorship.endswith( ':' ): # 'series:' - namespaced tags`

Version 244 2017-02-08 22:27:00 +00:00			`( namespace, subtag ) = SplitTag( tag )`

Version 247 2017-03-15 20:13:04 +00:00			`if namespace == censorship[:-1]:`
Version 200 2016-04-06 19:52:45 +00:00
			`return True`


			`else: # 'series:evangelion' - exact match with namespace`

			`if tag == censorship:`

			`return True`



			`else:`

			`# 'table' - normal tag, or namespaced version of same`

Version 244 2017-02-08 22:27:00 +00:00			`( namespace, subtag ) = SplitTag( tag )`

			`if subtag == censorship:`
Version 200 2016-04-06 19:52:45 +00:00
Version 244 2017-02-08 22:27:00 +00:00			`return True`
Version 200 2016-04-06 19:52:45 +00:00
Version 107 2014-03-12 22:08:23 +00:00


Version 200 2016-04-06 19:52:45 +00:00			`return False`

Version 146 2015-02-03 20:40:21 +00:00			`def ConvertTagToSortable( t ):`
Version 236 2016-12-14 21:19:07 +00:00
Version 244 2017-02-08 22:27:00 +00:00			`if len( t ) > 0 and t[0].isdecimal():`
Version 146 2015-02-03 20:40:21 +00:00
			`# We want to maintain that:`
			`# 0 < 0a < 0b < 1 ( lexicographic comparison )`
			`# -and-`
			`# 2 < 22 ( value comparison )`
			`# So, if the first bit can be turned into an int, split it into ( int, extra )`

			`int_component = ''`

			`i = 0`

			`for character in t:`

Version 236 2016-12-14 21:19:07 +00:00			`if character.isdecimal(): int_component += character`
Version 146 2015-02-03 20:40:21 +00:00			`else: break`

			`i += 1`


			`str_component = t[i:]`

Version 236 2016-12-14 21:19:07 +00:00			`number = int( int_component )`


			`return ( number, str_component )`

			`else:`

			`return t`
Version 146 2015-02-03 20:40:21 +00:00

Version 136 2014-11-12 23:33:13 +00:00			`def FilterNamespaces( tags, namespaces ):`

			`processed_tags = collections.defaultdict( set )`

			`for tag in tags:`

Version 244 2017-02-08 22:27:00 +00:00			`( namespace, subtag ) = SplitTag( tag )`

			`processed_tags[ namespace ].add( tag )`
Version 136 2014-11-12 23:33:13 +00:00

			`result = set()`

			`for namespace in namespaces:`

Version 244 2017-02-08 22:27:00 +00:00			`if namespace == None:`

			`result.update( processed_tags[ '' ] )`

			`else:`

			`result.update( processed_tags[ namespace ] )`

Version 136 2014-11-12 23:33:13 +00:00

			`return result`

Version 203 2016-04-27 19:20:37 +00:00			`def SortNumericTags( tags ):`
Version 145 2015-01-21 22:49:58 +00:00
			`tags = list( tags )`

Version 146 2015-02-03 20:40:21 +00:00			`tags.sort( key = ConvertTagToSortable )`
Version 145 2015-01-21 22:49:58 +00:00
			`return tags`

Version 151 2015-03-25 22:04:19 +00:00			`def CheckTagNotEmpty( tag ):`

Version 244 2017-02-08 22:27:00 +00:00			`( namespace, subtag ) = SplitTag( tag )`
Version 151 2015-03-25 22:04:19 +00:00
Version 244 2017-02-08 22:27:00 +00:00			`if subtag == '':`
Version 151 2015-03-25 22:04:19 +00:00
Version 244 2017-02-08 22:27:00 +00:00			`raise HydrusExceptions.SizeException( 'Received a zero-length tag!' )`
Version 151 2015-03-25 22:04:19 +00:00
Version 250 2017-04-05 21:16:40 +00:00
Version 151 2015-03-25 22:04:19 +00:00			`def CleanTag( tag ):`

Version 165 2015-07-15 20:28:26 +00:00			`try:`

			`tag = tag[:1024]`

			`tag = tag.lower()`

Version 180 2015-11-04 22:30:28 +00:00			`tag = HydrusData.ToUnicode( tag )`
Version 165 2015-07-15 20:28:26 +00:00
Version 245 2017-03-02 02:14:56 +00:00			`if tag.startswith( ':' ):`
Version 165 2015-07-15 20:28:26 +00:00
Version 250 2017-04-05 21:16:40 +00:00			`tag = re_leading_single_colon.sub( '::', tag ) # Convert anything starting with one colon to start with two i.e. :D -> ::D`
Version 245 2017-03-02 02:14:56 +00:00
Version 250 2017-04-05 21:16:40 +00:00			`tag = StripTextOfGumpf( tag )`
Version 245 2017-03-02 02:14:56 +00:00
			`elif ':' in tag:`

Version 250 2017-04-05 21:16:40 +00:00			`tag = StripTextOfGumpf( tag ) # need to repeat here to catch 'system:' stuff`

Version 245 2017-03-02 02:14:56 +00:00			`( namespace, subtag ) = SplitTag( tag )`

Version 250 2017-04-05 21:16:40 +00:00			`namespace = StripTextOfGumpf( namespace )`
			`subtag = StripTextOfGumpf( subtag )`
Version 245 2017-03-02 02:14:56 +00:00
			`tag = CombineTag( namespace, subtag )`

			`else:`

Version 250 2017-04-05 21:16:40 +00:00			`tag = StripTextOfGumpf( tag )`
Version 165 2015-07-15 20:28:26 +00:00
Version 185 2015-12-09 23:16:41 +00:00
Version 165 2015-07-15 20:28:26 +00:00			`except Exception as e:`

Version 182 2015-11-18 22:44:07 +00:00			`text = 'Was unable to parse the tag: ' + HydrusData.ToUnicode( tag )`
Version 165 2015-07-15 20:28:26 +00:00			`text += os.linesep * 2`
Version 249 2017-03-29 19:39:34 +00:00			`text += HydrusData.ToUnicode( e )`
Version 151 2015-03-25 22:04:19 +00:00
Version 165 2015-07-15 20:28:26 +00:00			`raise Exception( text )`
Version 151 2015-03-25 22:04:19 +00:00

			`return tag`

			`def CleanTags( tags ):`

			`clean_tags = set()`

			`for tag in tags:`

			`tag = CleanTag( tag )`

			`try: CheckTagNotEmpty( tag )`
			`except HydrusExceptions.SizeException: continue`

			`clean_tags.add( tag )`


			`return clean_tags`

Version 244 2017-02-08 22:27:00 +00:00			`def CombineTag( namespace, subtag ):`
Version 185 2015-12-09 23:16:41 +00:00
			`if namespace == '':`

Version 244 2017-02-08 22:27:00 +00:00			`if subtag.startswith( ':' ):`
Version 185 2015-12-09 23:16:41 +00:00
Version 244 2017-02-08 22:27:00 +00:00			`return ':' + subtag`
Version 185 2015-12-09 23:16:41 +00:00
			`else:`

Version 244 2017-02-08 22:27:00 +00:00			`return subtag`
Version 185 2015-12-09 23:16:41 +00:00

			`else:`

Version 244 2017-02-08 22:27:00 +00:00			`return namespace + ':' + subtag`
Version 185 2015-12-09 23:16:41 +00:00

Version 244 2017-02-08 22:27:00 +00:00			`def SplitTag( tag ):`

			`if ':' in tag:`

			`return tag.split( ':', 1 )`

			`else:`

			`return ( '', tag )`


Version 250 2017-04-05 21:16:40 +00:00			`def StripTextOfGumpf( t ):`

			`t = re_newlines.sub( '', t )`

			`t = re_multiple_spaces.sub( ' ', t )`

			`t = re_trailing_space.sub( '', t )`

			`while re_leading_space_or_garbage.search( t ) is not None:`

			`t = re_leading_space_or_garbage.sub( '', t )`


			`return t`