mirror of
https://github.com/ceph/ceph
synced 2025-02-19 08:57:27 +00:00
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1543 29311d96-e01e-0410-9327-a35deaab8ce9
226 lines
6.5 KiB
Plaintext
226 lines
6.5 KiB
Plaintext
|
|
|
|
SOME GENERAL REQUIREMENTS
|
|
|
|
- cluster expansion:
|
|
- any or all of the replicas may move to new OSDs.
|
|
|
|
- cluster map may change frequently
|
|
- map change should translate into pending replication/migration
|
|
state quickly (or better yet, instantly), so that we could push
|
|
through a series of (say, botched) maps quickly and be fine, so long
|
|
as the final map is correct.
|
|
|
|
- ideally, unordered osd<->osd, client<->osd communication
|
|
(mds<->mds, client<->mds communication is ordered, but file i/o
|
|
would be too slow that way?)
|
|
|
|
|
|
|
|
|
|
PRIMARY ONLY PICTURE
|
|
|
|
let's completely ignore replication for a while, and see how
|
|
complicated the picture needs to be to reliably support cluster expansion.
|
|
|
|
typedef __uint64_t version_t;
|
|
|
|
|
|
per-Object metadata:
|
|
- version #. incremented when an object is modified.
|
|
e.g. version_t version;
|
|
- on primary, keep list of stray replicas
|
|
e.g. map<int,version_t> stray_replicas; // osds w/ stray replicas
|
|
includes old primary osd(s), until deletion is confirmed. used while rg
|
|
is importing.
|
|
|
|
|
|
per-RG metadata
|
|
- object list. well, a method to fetch it by querying a collection or whatever.
|
|
- negative <object,version> list
|
|
e.g. map<object_t, version_t> deleted_objects;
|
|
- used to enumerate deleted objects, when in "importing" state.
|
|
- a RG "state" (enum/int)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Normal RG state:
|
|
- role=primary
|
|
clean - i am primary, all is well. no stray copies. i can
|
|
discard my negative object list, since my local
|
|
object store tells me everything.
|
|
|
|
|
|
After a map change:
|
|
- new primary
|
|
undef - initially; i don't know RG exists.
|
|
- old primary
|
|
homeless - i was primary, still have unmolested data. new primary is not yet migrating
|
|
(presumably its state=undef.) i need to contact new primary and tell them
|
|
this RG exists.
|
|
|
|
- new primary
|
|
importing - i am migrating data from old primary. keep negative dir entries for deletions.
|
|
write locally. proxy reads (force immediate migration). do whole objects
|
|
initially (on write, block until i migrate the object). later we can do
|
|
sub-object state (where "live" object data is spread across new/old primaries).
|
|
- old primary
|
|
exporting - primary is migrating my data.
|
|
undef - when it finishes. (i will forget this RG existed.)
|
|
|
|
|
|
After a second map change (scenario 1):
|
|
as above, if we were clean again.
|
|
|
|
After a second map change (scenario 2):
|
|
we weren't clean yet.
|
|
- new primary
|
|
undef - initially (until i learn RG exists)
|
|
- old primary
|
|
importing - i'm still migrating from old old primary
|
|
- old old primary
|
|
exporting - ...
|
|
- old primary
|
|
?? importing+exporting - proxy reads as before. continue migrating from old old primary.
|
|
|
|
|
|
After a second map change (scenario 3):
|
|
we weren't clean yet, and old old primary is also new primary
|
|
- new primary (and old old primary)
|
|
exporting - change state to importing. be sure to compare object versions, and neg dir
|
|
entries (as we always should do, really!).
|
|
- old primary
|
|
importing - note that the old import source matches new primary, and change
|
|
state to exporting, and stop importing. (unlike scenario 2)
|
|
|
|
-> this approach could mean that a series of fast map changes could
|
|
force data to migrate down a "chain" of old primaries to reach the
|
|
new one. maybe old primary should go from importing -> exporting,
|
|
and pass along old old primary id to new primary such that the
|
|
import is a many-to-one thing, instead of one-to-one. version
|
|
numbers and neg entries will make it easy to pick out correct versions.
|
|
|
|
|
|
|
|
For the importing process on a given RG:
|
|
|
|
- metadata for each source
|
|
- each source has a state:
|
|
'starting' - don't know anything about source yet. query source!
|
|
this probably induces the source to change from
|
|
'homeless' or something similar to 'exporting'.
|
|
'importing' - i've fetched the source's object list (and neg
|
|
object list). i'm busy reading them! these lists
|
|
will shrink as the process continues. after i fetch
|
|
an object, i will erase it from the source.
|
|
(object metadata will include stray copy info
|
|
until i confirm that it's removed.)
|
|
'finishing' - i've read all my data, and i'm telling the old person
|
|
to discard any remaining RG metadata (RG contents
|
|
should already be gone)
|
|
- unmigrated object list
|
|
- migrated but not deleted object list
|
|
- stray osd is also listed in per-object MD during this stage
|
|
- negative object list
|
|
- i can remove these items if i see a newer object version (say,
|
|
from a different import source or something).
|
|
- i can remove any local objects or ignore imported ones if they are
|
|
older than deleted version
|
|
|
|
- the lists should be sets or otherwise queryable so that while i'm
|
|
importing and a real op comes through I can quickly determine if a
|
|
given object_id is pending migration etc or if my local store is to
|
|
be trusted.
|
|
|
|
|
|
|
|
|
|
|
|
SOME CODE BITS
|
|
|
|
|
|
typedef __uint64_t version_t;
|
|
class Object {
|
|
version_t version;
|
|
map<int, version_t> stray_replicas;
|
|
};
|
|
|
|
|
|
class ReplicaGroup {
|
|
int enumerate_objects(list<object_t>& ls);
|
|
|
|
int state;
|
|
|
|
// for unstable states,
|
|
map<object_t, version_t> deleted_objects; // locally
|
|
map<int, RGExporter_t> exporters; // importing from these guys.
|
|
};
|
|
|
|
// primary
|
|
#define RG_STATE_CLEAN 1
|
|
#define RG_STATE_IMPORTING 2 // pulling data
|
|
|
|
// non-primary
|
|
#define RG_STATE_HOMELESS 5 // old primary; new primary not yet
|
|
// notified; not yet exporting.
|
|
#define RG_STATE_EXPORTING 6 // a newer primary is extracting my
|
|
// data.
|
|
|
|
|
|
struct RGExporter_t {
|
|
int import_state;
|
|
|
|
set<object_t> remaining_objects; // remote object list
|
|
set<object_t> stray_objects; // imported but not deleted.
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
----
|
|
all crap from here on down
|
|
|
|
|
|
|
|
|
|
REPLICAS
|
|
-
|
|
|
|
|
|
|
|
|
|
OSD STATES
|
|
- primary, up to date.
|
|
- replica, up to date.
|
|
|
|
- primary, proxy to old primary (primaries?)
|
|
|
|
- replica, not up to date.
|
|
|
|
|
|
REPLICATION STUFF
|
|
|
|
Per-RG metadata
|
|
- primary
|
|
- per-replica state: clean, catching up?
|
|
- replica
|
|
|
|
Per-object metadata
|
|
- primary and replica
|
|
- version number/mtime
|
|
- rg (reverse indexed)
|
|
- primary
|
|
- replication level and state.
|
|
- committed to memory and/or disk, on which replicas (#1, #2, etc.)
|
|
- replica
|
|
|
|
|
|
|
|
|
|
|
|
-> |