From 426a70ad04a4179371937649c8e95df5ef9a0963 Mon Sep 17 00:00:00 2001 From: Hydrus Date: Wed, 5 Nov 2014 15:17:13 -0600 Subject: [PATCH] Adding HydrusTagArchive --- include/HydrusTagArchive.py | 239 ++++++++++++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 include/HydrusTagArchive.py diff --git a/include/HydrusTagArchive.py b/include/HydrusTagArchive.py new file mode 100644 index 00000000..a3eed7bb --- /dev/null +++ b/include/HydrusTagArchive.py @@ -0,0 +1,239 @@ +import os +import sqlite3 + +HASH_TYPE_MD5 = 0 # 16 bytes long +HASH_TYPE_SHA1 = 1 # 20 bytes long +HASH_TYPE_SHA256 = 2 # 32 bytes long +HASH_TYPE_SHA512 = 3 # 64 bytes long + +# Please feel free to use this file however you wish. +# None of this is thread-safe, though, so don't try to do anything clever. + + +# If you want to make a new tag archive for use in hydrus, you want to do something like: + +# import HydrusTagArchive +# hta = HydrusTagArchive.HydrusTagArchive( 'my_little_archive.db' ) +# hta.SetHashType( HydrusTagArchive.HASH_TYPE_MD5 ) +# hta.BeginBigJob() +# for ( hash, tags ) in my_complex_mappings_generator: hta.SetMappings( hash, tags ) + # -or- +# for ( hash, tag ) in my_simple_mapping_generator: hta.AddMapping( hash, tag ) +# hta.CommitBigJob() +# del hta + + +# If you are only adding a couple tags, you can exclude the BigJob stuff. It just makes millions of sequential writes more efficient. + + +# Also, this manages hashes as bytes, not hex, so if you have something like: + +# hash = ab156e87c5d6e215ab156e87c5d6e215 + +# Then go hash = hash.decode( 'hex' ) before you pass it to Add/Get/Has/SetMappings + + +# If you have tags that are namespaced like hydrus (e.g. series:ghost in the shell), then check out: +# GetNamespaces +# DeleteNamespaces +# and +# RebuildNamespaces + +# RebuildNamespaces takes namespaces_to_exclude, if you want to curate your namespaces a little better. + +# If your GetNamespaces gives garbage, then just hit DeleteNamespaces. 
# I'll be using the result of GetNamespaces to populate
# the advanced tag options widget when people sync with these archives.


# And also feel free to contact me directly at hydrus.admin@gmail.com if you need help.

class HydrusTagArchive( object ):
    """A hash -> tags archive stored in a single SQLite database file.

    The database holds five tables: hash_type (one row naming the hash
    algorithm), hashes, tags, mappings (hash_id/tag_id pairs) and namespaces
    (a cache of the 'namespace' part of 'namespace:subtag' tags).

    Hashes are passed in as raw bytes, not hex.  Every method shares one
    connection and one cursor, so an instance must not be used from several
    threads at once.
    """

    def __init__( self, path ):
        """Open the archive at path, creating the schema if the file is new.

        path -- filesystem path of the SQLite file.
        """

        self._path = path

        # This has to be decided before connecting: sqlite3.connect creates
        # the file when it is missing.
        create_db = not os.path.exists( self._path )

        self._InitDBCursor()

        if create_db:

            self._InitDB()

        # In-memory mirror of the namespaces table, kept in step by
        # _GetTagId, DeleteNamespaces and RebuildNamespaces.
        self._namespaces = { namespace for ( namespace, ) in self._c.execute( 'SELECT namespace FROM namespaces;' ) }

    def _InitDB( self ):
        """Create the tables and indices of an empty archive."""

        self._c.execute( 'CREATE TABLE hash_type ( hash_type INTEGER );' )

        self._c.execute( 'CREATE TABLE hashes ( hash_id INTEGER PRIMARY KEY, hash BLOB_BYTES );' )
        self._c.execute( 'CREATE UNIQUE INDEX hashes_hash_index ON hashes ( hash );' )

        self._c.execute( 'CREATE TABLE mappings ( hash_id INTEGER, tag_id INTEGER );' )
        self._c.execute( 'CREATE INDEX mappings_hash_id_index ON mappings ( hash_id );' )

        self._c.execute( 'CREATE TABLE namespaces ( namespace TEXT );' )

        self._c.execute( 'CREATE TABLE tags ( tag_id INTEGER PRIMARY KEY, tag TEXT );' )
        self._c.execute( 'CREATE UNIQUE INDEX tags_tag_index ON tags ( tag );' )

    def _InitDBCursor( self ):
        """Connect to the database file and create the shared cursor."""

        # isolation_level = None means the sqlite3 module opens no implicit
        # transactions; BeginBigJob/CommitBigJob issue BEGIN/COMMIT themselves.
        self._db = sqlite3.connect( self._path, isolation_level = None, detect_types = sqlite3.PARSE_DECLTYPES )

        self._c = self._db.cursor()

    @staticmethod
    def _ParseNamespace( tag ):
        """Return the namespace of 'namespace:subtag', or None if there is none.

        A tag without ':' or with an empty namespace (':subtag') has no
        namespace.
        """

        if ':' not in tag:

            return None

        ( namespace, subtag ) = tag.split( ':', 1 )

        if namespace == '':

            return None

        return namespace

    def _LookupHashId( self, hash ):
        """Return the hash_id stored for hash, or None.  Never writes."""

        result = self._c.execute( 'SELECT hash_id FROM hashes WHERE hash = ?;', ( sqlite3.Binary( hash ), ) ).fetchone()

        if result is None:

            return None

        ( hash_id, ) = result

        return hash_id

    def _LookupTagId( self, tag ):
        """Return the tag_id stored for tag, or None.  Never writes."""

        result = self._c.execute( 'SELECT tag_id FROM tags WHERE tag = ?;', ( tag, ) ).fetchone()

        if result is None:

            return None

        ( tag_id, ) = result

        return tag_id

    def _GetHashId( self, hash, read_only = False ):
        """Return the hash_id for hash, inserting a new row if needed.

        read_only -- when True, an unknown hash raises Exception instead of
                     being inserted.
        """

        hash_id = self._LookupHashId( hash )

        if hash_id is None:

            if read_only:

                raise Exception( 'That hash is not in this archive.' )

            self._c.execute( 'INSERT INTO hashes ( hash ) VALUES ( ? );', ( sqlite3.Binary( hash ), ) )

            hash_id = self._c.lastrowid

        return hash_id

    def _GetTagId( self, tag ):
        """Return the tag_id for tag, inserting the tag if needed.

        The tag's namespace, if it has one, is also recorded in the
        namespaces table and cache when it is not already there.
        """

        namespace = self._ParseNamespace( tag )

        if namespace is not None and namespace not in self._namespaces:

            self._c.execute( 'INSERT INTO namespaces ( namespace ) VALUES ( ? );', ( namespace, ) )

            self._namespaces.add( namespace )

        tag_id = self._LookupTagId( tag )

        if tag_id is None:

            self._c.execute( 'INSERT INTO tags ( tag ) VALUES ( ? );', ( tag, ) )

            tag_id = self._c.lastrowid

        return tag_id

    def BeginBigJob( self ):
        """Open one explicit transaction to wrap a long run of writes."""

        self._c.execute( 'BEGIN IMMEDIATE' )

    def CommitBigJob( self ):
        """Commit the transaction opened by BeginBigJob, then VACUUM the file."""

        self._c.execute( 'COMMIT' )
        self._c.execute( 'VACUUM' )

    def AddMapping( self, hash, tag ):
        """Record that hash has tag.  Adding an existing mapping is a no-op."""

        hash_id = self._GetHashId( hash )
        tag_id = self._GetTagId( tag )

        # The mappings table has no UNIQUE constraint, so OR IGNORE alone
        # cannot stop duplicate rows; check for the pair explicitly.
        existing = self._c.execute( 'SELECT 1 FROM mappings WHERE hash_id = ? AND tag_id = ?;', ( hash_id, tag_id ) ).fetchone()

        if existing is None:

            self._c.execute( 'INSERT OR IGNORE INTO mappings ( hash_id, tag_id ) VALUES ( ?, ? );', ( hash_id, tag_id ) )

    def DeleteMapping( self, hash, tag ):
        """Remove the hash/tag mapping if present.

        Unknown hashes and tags are left unknown: nothing is inserted.
        """

        hash_id = self._LookupHashId( hash )

        if hash_id is None:

            return

        tag_id = self._LookupTagId( tag )

        if tag_id is None:

            return

        self._c.execute( 'DELETE FROM mappings WHERE hash_id = ? AND tag_id = ?;', ( hash_id, tag_id ) )

    def DeleteNamespaces( self ):
        """Forget every recorded namespace, in the cache and in the table."""

        # Must stay a set: _GetTagId calls .add on it.
        self._namespaces = set()

        self._c.execute( 'DELETE FROM namespaces;' )

    def GetHashType( self ):
        """Return the stored hash type value.

        Raises Exception if no hash type has been set on this archive.
        """

        result = self._c.execute( 'SELECT hash_type FROM hash_type;' ).fetchone()

        if result is None:

            raise Exception( 'This archive has no hash type set.' )

        ( hash_type, ) = result

        return hash_type

    def GetMappings( self, hash ):
        """Return the set of tags mapped to hash.

        An unknown hash gives an empty list (kept as a list for
        compatibility with existing callers).
        """

        hash_id = self._LookupHashId( hash )

        if hash_id is None:

            return []

        return { tag for ( tag, ) in self._c.execute( 'SELECT tag FROM mappings, tags USING ( tag_id ) WHERE hash_id = ?;', ( hash_id, ) ) }

    def GetNamespaces( self ):
        """Return a copy of the set of recorded namespaces."""

        return set( self._namespaces )

    def HasHash( self, hash ):
        """Return True if hash is stored in the archive."""

        return self._LookupHashId( hash ) is not None

    def RebuildNamespaces( self, namespaces_to_exclude = None ):
        """Recompute the namespaces from the stored tags.

        namespaces_to_exclude -- optional collection of namespaces to leave
                                 out of the rebuilt list.
        """

        if namespaces_to_exclude is None:

            namespaces_to_exclude = set()

        rebuilt = set()

        for ( tag, ) in self._c.execute( 'SELECT tag FROM tags;' ):

            namespace = self._ParseNamespace( tag )

            if namespace is not None and namespace not in namespaces_to_exclude:

                rebuilt.add( namespace )

        self._namespaces = rebuilt

        self._c.execute( 'DELETE FROM namespaces;' )

        self._c.executemany( 'INSERT INTO namespaces ( namespace ) VALUES ( ? );', ( ( namespace, ) for namespace in self._namespaces ) )

    def SetHashType( self, hash_type ):
        """Replace the stored hash type (see the HASH_TYPE_* constants)."""

        self._c.execute( 'DELETE FROM hash_type;' )

        self._c.execute( 'INSERT INTO hash_type ( hash_type ) VALUES ( ? );', ( hash_type, ) )

    def SetMappings( self, hash, tags ):
        """Replace every mapping of hash with the given tags.

        tags -- any iterable of tag strings.  It is consumed exactly once, so
                generators work, and repeated tags are stored once.
        """

        hash_id = self._GetHashId( hash )

        self._c.execute( 'DELETE FROM mappings WHERE hash_id = ?;', ( hash_id, ) )

        tag_ids = sorted( { self._GetTagId( tag ) for tag in tags } )

        self._c.executemany( 'INSERT INTO mappings ( hash_id, tag_id ) VALUES ( ?, ? );', ( ( hash_id, tag_id ) for tag_id in tag_ids ) )