ceph/branches/sage/pgs/osbdb/OSBDB.h
sageweil 9213a23f14 eek
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1138 29311d96-e01e-0410-9327-a35deaab8ce9
2007-02-28 18:42:55 +00:00

508 lines
12 KiB
C++

/* OSBDB.h -- ObjectStore on Berkeley DB. -*- c++ -*-
Copyright (C) 2007 Casey Marshall <csm@soe.ucsc.edu>
Ceph - scalable distributed file system
This is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 2.1, as published by the Free Software
Foundation. See file COPYING. */
#include <db_cxx.h>
#include "osd/ObjectStore.h"
// Redefine this to use a different BDB access type. DB_BTREE is
// probably the only other one that makes sense.
#ifndef OSBDB_DB_TYPE
#define OSBDB_DB_TYPE DB_HASH
#endif // OSBDB_DB_TYPE
/*
* Maximum length of an attribute name. This is also the size, in
* bytes, of the name buffer inside attr_name.
*/
#define OSBDB_MAX_ATTR_LEN 256
// Version number of this implementation.
// NOTE(review): presumably the value written to
// stored_superblock::version -- confirm in the implementation file.
#define OSBDB_THIS_VERSION 1
// Key for the superblock record.
// NOTE(review): presumably only the single byte "s" is used as the key;
// confirm the key length at the call sites.
#define OSBDB_SUPERBLOCK_KEY ((void *) "s")
/*
 * The "superblock" of the BDB object store.  One of these is stored
 * in the DB to carry version and other information.  Nothing special
 * is recorded here, only the version number the database was written
 * with.
 *
 * In principle, this structure is variable-length, depending on the
 * software version writing the superblock.
 */
struct stored_superblock
{
  // Version number the database was written with.
  uint32_t version;
};

/*
 * Print a superblock as "osbdb.super(<version>)".  The text is
 * followed by endl, so a newline is written and the stream is flushed.
 */
inline ostream& operator<<(ostream& out, const stored_superblock sb)
{
  out << "osbdb.super(";
  out << sb.version;
  out << ")";
  out << endl;
  return out;
}
/**
* An object identifier; we define this so we can have a POD object to
* work with.
*/
struct oid_t // POD
{
// Raw identifier bytes: 16 bytes, i.e. 128 bits.
char id[16];
};
/*
 * Fill ID with the first sizeof (oid_t) bytes of OID's in-memory
 * representation.
 *
 * NOTE(review): this reads sizeof (oid_t) bytes starting at &oid, so it
 * assumes object_t is at least that large -- confirm against the
 * definition of object_t.
 */
inline void mkoid (oid_t& id, object_t& oid)
{
  // XXX byte order?
  const void *raw = &oid;
  memcpy (id.id, raw, sizeof (id));
}
/*
 * Print an object ID as 32 lower-case hex digits, two per byte, with a
 * ':' written after every fourth byte (including after the last one),
 * e.g. "00000001:00000002:00000003:00000004:".
 *
 * The stream's formatting flags and fill character are saved on entry
 * and restored on exit, so printing an ID does not leave the caller's
 * stream padding with '0' or switched to a different numeric base.
 */
inline ostream& operator<<(ostream& out, const oid_t id)
{
  const ios::fmtflags saved_flags = out.flags();
  const char saved_fill = out.fill('0');

  out << hex;
  for (int i = 0; i < 16; i++)
    {
      // Mask to 0..255 so a negative char is not sign-extended.
      out << setw(2) << (id.id[i] & 0xFF);
      if ((i & 3) == 3)
        out << ':';
    }

  out.fill(saved_fill);
  out.flags(saved_flags);
  return out;
}
/**
* An "inode" key. We map a 'stored_object' struct to this key for
* every object.
*/
struct object_inode_key // POD
{
// Identifier of the object this key refers to.
oid_t oid;
// Key-type tag; new_object_inode_key() sets it to 'i'.
char tag;
};
/**
 * "Constructor" for an object_inode_key: returns a key whose oid is
 * filled in from OID via mkoid() and whose tag is 'i'.
 */
inline object_inode_key new_object_inode_key (object_t& oid)
{
  object_inode_key result;
  // Clear the whole struct, padding included, before filling it in.
  memset(&result, 0, sizeof (result));
  result.tag = 'i';
  mkoid (result.oid, oid);
  return result;
}
/*
 * Size of the meaningful bytes of an object_inode_key.  We compute it
 * from the offset of the last member, instead of using sizeof(), to
 * try and guarantee that structure padding, if any, is not included.
 *
 * This *should* return 17: sizeof (oid_t) == 16; sizeof (char) == 1.
 */
inline size_t sizeof_object_inode_key()
{
  const size_t tag_offset = offsetof(object_inode_key, tag);
  return tag_offset + sizeof (char);
}
// Frank Poole: Unfortunately, that sounds a little
// like famous last words.
// -- 2001: A Space Odyssey
// Print an inode key as "<tag>/<oid>".
inline ostream& operator<<(ostream& out, const object_inode_key o)
{
  out << o.tag;
  out << "/";
  out << o.oid;
  return out;
}
/**
 * A stored object.  This is essentially the "inode" of the object,
 * containing things like the object's length.  The object itself is
 * stored as-is, mapped by the 128-bit object ID.
 */
struct stored_object
{
  // Length of the object.
  uint32_t length;
};

// Print a stored_object as "inode(l:<length>)".
inline ostream& operator<<(ostream& out, const stored_object s)
{
  out << "inode(l:";
  out << s.length;
  out << ")";
  return out;
}
/*
* Key referencing the list of attribute names for an object. This is
* simply the object's ID, with an additional character 'a' appended.
*/
struct attrs_id // POD
{
// Identifier of the object whose attribute list this key refers to.
oid_t oid;
// Key-type tag; new_attrs_id() sets it to 'a'.
char tag;
};
/*
* "Constructor" for attrs_id.
*/
inline struct attrs_id new_attrs_id (object_t& oid)
{
  attrs_id result;
  // Clear the whole struct, padding included, before filling it in.
  memset (&result, 0, sizeof (result));
  result.tag = 'a';
  // The key is the object's ID followed by the 'a' tag.
  mkoid(result.oid, oid);
  return result;
}
/*
* See explanation for sizeof_object_inode_key.
*/
inline size_t sizeof_attrs_id()
{
  // Offset of the last member plus its size: excludes trailing padding.
  const size_t tag_offset = offsetof(struct attrs_id, tag);
  return tag_offset + sizeof (char);
}
// Print an attribute-list key as "<tag>/<oid>".
inline ostream& operator<<(ostream& out, const attrs_id id)
{
  out << id.tag;
  out << "/";
  out << id.oid;
  return out;
}
/*
* Encapsulation of a single attribute name.
*/
struct attr_name // POD
{
// Name bytes. The comparison operators compare at most
// OSBDB_MAX_ATTR_LEN bytes of this buffer; the stream inserter prints
// it as a C string, so it must be NUL-terminated to be printed.
char name[OSBDB_MAX_ATTR_LEN];
};
// Print an attribute name.  The name buffer is written as a C string,
// so it has to contain a terminating NUL.
inline ostream& operator<<(ostream& out, const attr_name n)
{
  const char *text = n.name;
  out << text;
  return out;
}
// True when n1 sorts before n2, comparing at most OSBDB_MAX_ATTR_LEN
// bytes of the two names with strncmp.
inline bool operator<(const attr_name n1, const attr_name n2)
{
  const int order = strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN);
  return order < 0;
}
// True when n1 sorts after n2, comparing at most OSBDB_MAX_ATTR_LEN
// bytes of the two names with strncmp.
inline bool operator>(const attr_name n1, const attr_name n2)
{
  const int order = strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN);
  return order > 0;
}
// True when the two names are equal, comparing at most
// OSBDB_MAX_ATTR_LEN bytes with strncmp.
//
// This is a pure comparison: it produces no output.  (An earlier
// debugging trace to std::cerr was removed; it ran on every comparison
// and printed the names as C strings without any length bound.)
inline bool operator==(const attr_name n1, const attr_name n2)
{
  return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) == 0);
}
// Negation of operator== for attribute names.
inline bool operator!=(const attr_name n1, const attr_name n2)
{
  const bool same = (n1 == n2);
  return !same;
}
// n1 >= n2, defined as "not (n1 < n2)".
inline bool operator>=(const attr_name n1, const attr_name n2)
{
  const bool before = (n1 < n2);
  return !before;
}
// n1 <= n2, defined as "not (n1 > n2)".
inline bool operator<=(const attr_name n1, const attr_name n2)
{
  const bool after = (n1 > n2);
  return !after;
}
/*
* A list of an object or collection's attribute names.
*/
struct stored_attrs
{
// Number of entries in names.
uint32_t count;
// The attribute names. Declared with zero length; the entries follow
// the header in the same record.
attr_name names[0]; // actually variable-length
};
// Print an attribute-name list as "<count> [ a, b, c ]".  An empty
// list prints as "0 [ ]".  SA must point to a valid stored_attrs.
inline ostream& operator<<(ostream& out, const stored_attrs *sa)
{
  const uint32_t total = sa->count;
  out << total << " [ ";
  for (unsigned idx = 0; idx < total; ++idx)
    {
      out << sa->names[idx];
      // A single space after the last name, a comma separator otherwise.
      if (idx + 1 == total)
        out << " ";
      else
        out << ", ";
    }
  out << "]";
  return out;
}
/*
* An object attribute key. An object attribute is mapped simply by
* the object ID appended with the attribute name. Attribute names
* may not be empty, and must be less than 256 characters, in this
* implementation.
*/
struct attr_id // POD
{
// Identifier of the object the attribute belongs to.
oid_t oid;
// The attribute's name, in a fixed-size buffer.
attr_name name;
};
/*
 * "Constructor" for an attr_id: builds the key for attribute NAME of
 * object OID.
 *
 * NAME must be a non-null, NUL-terminated string.  At most
 * OSBDB_MAX_ATTR_LEN - 1 characters are copied, so the stored name is
 * always NUL-terminated and can be safely printed; a name that is too
 * long (which this implementation does not allow) is truncated.
 */
inline attr_id new_attr_id (object_t& oid, const char *name)
{
  attr_id aid;
  // Zero everything: unused name bytes and padding are all NUL.
  memset(&aid, 0, sizeof (attr_id));
  mkoid (aid.oid, oid);
  // Leave the final byte of the zeroed buffer untouched; strncpy does
  // not write a terminator when the source fills the whole count.
  strncpy (aid.name.name, name, OSBDB_MAX_ATTR_LEN - 1);
  return aid;
}
// Print an attribute key as "<oid>:<name>".
inline ostream& operator<<(ostream &out, const attr_id id)
{
  out << id.oid;
  out << ":";
  out << id.name;
  return out;
}
/*
* A key for a collection attributes list.
*/
struct coll_attrs_id // POD
{
// Identifier of the collection whose attribute list this key refers to.
coll_t cid;
// Key-type tag; new_coll_attrs_id() sets it to 'C'.
char tag;
};
// "Constructor" for a coll_attrs_id: the collection ID plus the tag 'C'.
inline coll_attrs_id new_coll_attrs_id (coll_t cid)
{
  coll_attrs_id result;
  // Clear the whole struct, padding included, before filling it in.
  memset(&result, 0, sizeof (result));
  result.tag = 'C';
  result.cid = cid;
  return result;
}
// Size of a coll_attrs_id up to and including its tag byte, i.e.
// without any trailing structure padding.
inline size_t sizeof_coll_attrs_id()
{
  const size_t tag_offset = offsetof(coll_attrs_id, tag);
  return tag_offset + sizeof (char);
}
// Print a collection attribute-list key as "<tag>/<cid>".
inline ostream& operator<<(ostream& out, coll_attrs_id id)
{
  out << id.tag;
  out << "/";
  out << id.cid;
  return out;
}
/*
* A collection attribute key. Similar to attr_id, but keyed by the
* collection ID instead of an object ID.
*/
struct coll_attr_id // POD
{
// Identifier of the collection the attribute belongs to.
coll_t cid;
// The attribute's name, in a fixed-size buffer.
attr_name name;
};
/*
 * "Constructor" for a coll_attr_id: builds the key for attribute NAME
 * of collection CID.
 *
 * NAME must be a non-null, NUL-terminated string.  At most
 * OSBDB_MAX_ATTR_LEN - 1 characters are copied, so the stored name is
 * always NUL-terminated and can be safely printed; a name that is too
 * long is truncated.
 */
inline coll_attr_id new_coll_attr_id (coll_t cid, const char *name)
{
  coll_attr_id catt;
  // Zero everything: unused name bytes and padding are all NUL.
  memset(&catt, 0, sizeof (coll_attr_id));
  catt.cid = cid;
  // Leave the final byte of the zeroed buffer untouched; strncpy does
  // not write a terminator when the source fills the whole count.
  strncpy (catt.name.name, name, OSBDB_MAX_ATTR_LEN - 1);
  return catt;
}
// Print a collection attribute key as "<cid>:<name>".
inline ostream& operator<<(ostream& out, coll_attr_id id)
{
  out << id.cid;
  out << ":";
  out << id.name;
  return out;
}
/*
* This is the key we store the master collections list under.
* NOTE(review): presumably only the single byte "c" is used as the
* key; confirm the key length at the call sites.
*/
#define COLLECTIONS_KEY ((void *) "c")
/*
* The master list of collections. There should be one of these per
* OSD. The sole reason for this structure is to have the ability
* to enumerate all collections stored on this OSD.
*/
struct stored_colls
{
// The number of collections.
uint32_t count;
// The collection identifiers. This is a sorted list of coll_t
// values. Declared with zero length; the entries follow the header
// in the same record.
coll_t colls[0]; // actually variable-length
};
// Print the master collection list as "<count> [ c1, c2, ... ]".  The
// count is written in the stream's current base, the collection IDs
// with the hex manipulator applied; the stream is switched to dec
// before returning.  C must point to a valid stored_colls.
inline ostream& operator<<(ostream& out, stored_colls *c)
{
  const uint32_t total = c->count;
  out << total << " [ ";
  for (unsigned idx = 0; idx != total; ++idx)
    {
      // Separator goes between entries, never after the last one.
      if (idx != 0)
        out << ", ";
      out << hex << c->colls[idx];
    }
  out << " ]" << dec;
  return out;
}
/*
* A stored collection (a bag of object IDs). These are referenced by
* the bare collection identifier type, a coll_t (thus, a 32-bit
* integer). Internally this is stored as a sorted list of object IDs.
*
* Note, this structure places all collection items in a single
* record; this may be a memory burden for large collections.
*/
struct stored_coll
{
// The size of this collection.
uint32_t count;
// The object IDs in this collection. This is a sorted list of all
// object ID's in this collection. Declared with zero length; the
// entries follow the header in the same record.
object_t objects[0]; // actually variable-length
};
// Print a collection as "<count> [ o1, o2, ... ]".  C must point to a
// valid stored_coll.
inline ostream& operator<<(ostream& out, stored_coll *c)
{
  const uint32_t total = c->count;
  out << total << " [ ";
  for (unsigned idx = 0; idx != total; ++idx)
    {
      // Separator goes between entries, never after the last one.
      if (idx != 0)
        out << ", ";
      out << c->objects[idx];
    }
  out << " ]";
  return out;
}
/*
 * The object store interface for Berkeley DB.
 *
 * An OSBDB owns its Berkeley DB environment handle: the destructor
 * closes and deletes it.  Instances therefore must not be copied, and
 * the copy operations are declared private and left undefined.
 *
 * Only the constructor and destructor are defined here; every other
 * member function is implemented outside this header.
 */
class OSBDB : public ObjectStore
{
 private:
  DbEnv *env;     // BDB environment; 0 until one is created.
  Db *db;         // BDB database handle; 0 until one is created.
  string device;  // The device string passed to the constructor.
  bool mounted;   // Starts false; the destructor calls umount() if set.
  bool opened;    // Starts false.
                  // NOTE(review): presumably maintained by opendb() -- confirm.

  // Not copyable: a copy would share env, and both destructors would
  // close and delete it.  Declared but intentionally not defined.
  OSBDB(const OSBDB&);
  OSBDB& operator=(const OSBDB&);

 public:
  /*
   * Create an unmounted store for DEV.  No Berkeley DB handle is
   * created here (the environment set-up below is disabled).  A null
   * DEV is treated as an empty device string.
   */
  OSBDB(const char *dev)
    : env(0), db (0), device (dev ? dev : ""), mounted(false), opened(false)
  {
    /*env = new DbEnv (DB_CXX_NO_EXCEPTIONS);
    env->set_error_stream (&std::cerr);
    // WTF? You can't open an env if you set this flag here, but BDB
    // says you also can't set it after you open the env.
    //env->set_flags (DB_LOG_INMEMORY, 1);
    char *p = strrchr (dev, '/');
    int env_flags = (DB_CREATE | DB_THREAD | DB_INIT_LOCK
    | DB_INIT_MPOOL | DB_INIT_TXN | DB_INIT_LOG);
    if (p != NULL)
    {
    *p = '\0';
    if (env->open (dev, env_flags, 0) != 0)
    {
    std::cerr << "failed to open environment: "
    << dev << std::endl;
    ::abort();
    }
    *p = '/';
    dev = p+1;
    }
    else
    {
    if (env->open (NULL, env_flags, 0) != 0)
    {
    std::cerr << "failed to open environment: ." << std::endl;
    ::abort();
    }
    }
    // Double WTF: if you remove the DB_LOG_INMEMORY bit, db->open
    // fails, inexplicably, with EINVAL!*/
    // env->set_flags (DB_DIRECT_DB | /*DB_AUTO_COMMIT |*/ DB_LOG_INMEMORY, 1);
  }

  /*
   * Unmount the store if it is still mounted, then close and delete
   * the environment if there is one.
   * NOTE(review): db is not released here; presumably umount() takes
   * care of it -- confirm in the implementation file.
   */
  ~OSBDB()
  {
    if (mounted)
      {
        umount();
      }
    if (env != NULL)
      {
        env->close (0);
        delete env;
      }
  }

  // Store lifecycle.
  int mount();
  int umount();
  int mkfs();
  int statfs(struct statfs *buf);

  // Objects.
  int pick_object_revision_lt(object_t& oid);
  bool exists(object_t oid);
  int stat(object_t oid, struct stat *st);
  int remove(object_t oid, Context *onsafe=0);
  int truncate(object_t oid, off_t size, Context *onsafe=0);
  int read(object_t oid, off_t offset, size_t len,
           bufferlist& bl);
  int write(object_t oid, off_t offset, size_t len,
            bufferlist& bl, Context *onsafe);

  // Object attributes.
  int setattr(object_t oid, const char *name,
              const void *value, size_t size, Context *onsafe=0);
  int setattrs(object_t oid, map<string,bufferptr>& aset,
               Context *onsafe=0);
  int getattr(object_t oid, const char *name,
              void *value, size_t size);
  int getattrs(object_t oid, map<string,bufferptr>& aset);
  int rmattr(object_t oid, const char *name,
             Context *onsafe=0);
  int listattr(object_t oid, char *attrs, size_t size);
  int clone(object_t oid, object_t noid);

  // Collections.
  int list_collections(list<coll_t>& ls);
  int create_collection(coll_t c, Context *onsafe=0);
  int destroy_collection(coll_t c, Context *onsafe=0);
  bool collection_exists(coll_t c);
  int collection_stat(coll_t c, struct stat *st);
  int collection_add(coll_t c, object_t o, Context *onsafe=0);
  int collection_remove(coll_t c, object_t o, Context *onsafe=0);
  int collection_list(coll_t c, list<object_t>& o);

  // Collection attributes.
  int collection_setattr(coll_t cid, const char *name,
                         const void *value, size_t size,
                         Context *onsafe=0);
  int collection_rmattr(coll_t cid, const char *name,
                        Context *onsafe=0);
  int collection_getattr(coll_t cid, const char *name,
                         void *value, size_t size);
  int collection_listattr(coll_t cid, char *attrs, size_t size);

  // Syncing.
  void sync(Context *onsync);
  void sync();

 private:
  // Internal helpers, implemented outside this header.
  int opendb (DBTYPE type=DB_UNKNOWN, int flags=0);
  int _setattr(object_t oid, const char *name, const void *value,
               size_t size, Context *onsync);
  int _getattr(object_t oid, const char *name, void *value, size_t size);
};