ceph/branches/riccardo/monitor2/ebofs/Allocator.cc
riccardo80 07ac5d3e74 creating branch for distributed monitor
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1068 29311d96-e01e-0410-9327-a35deaab8ce9
2007-02-01 05:43:23 +00:00

693 lines
18 KiB
C++

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#include "Allocator.h"
#include "Ebofs.h"
// Per-file debug output. dout(x) streams to cout, prefixed with
// "ebofs(<device name>).allocator.", only when x <= g_conf.debug_ebofs.
// The macro expands to a bare `if`, so put braces around any dout() that
// sits directly under an `if` which also has an `else`.
#undef dout
#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator."
/*
 * Debug helper: log (at dout level 0) every extent in the free buckets,
 * the limbo table and the alloc table, sanity-checking them on the way.
 *
 * Checks performed via assert():
 *  - every free/limbo extent has a non-zero length
 *  - no free/limbo extent is already contained in the extents seen so far
 *  - the lengths in the free buckets add up to fs->free_blocks
 *    (limbo extents are not counted)
 *  - every alloc_tab extent has a non-zero length
 */
void Allocator::dump_freelist()
{
  interval_set<block_t> free;   // union of everything seen so far; validate too
  block_t n = 0;                // blocks found in the free buckets only

  // index EBOFS_NUM_FREE_BUCKETS is a pseudo-bucket that stands for limbo_tab
  for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
    const bool is_free_bucket = (b < EBOFS_NUM_FREE_BUCKETS);

    Table<block_t,block_t> *tab;
    if (is_free_bucket) {
      tab = fs->free_tab[b];
      dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << endl;
    } else {
      tab = fs->limbo_tab;
      dout(0) << "dump limbo " << tab->get_num_keys() << endl;
    }

    if (tab->get_num_keys() <= 0)
      continue;   // empty table, nothing to walk

    Table<block_t,block_t>::Cursor cursor(tab);

    // Position the cursor *outside* of assert(): with NDEBUG the assert
    // expression is not evaluated, and the cursor must still be set up.
    int found = tab->find(0, cursor);
    assert(found >= 0);
    (void)found;

    while (1) {
      dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl;
      assert(cursor.current().value > 0);

      if (is_free_bucket)
        n += cursor.current().value;

      if (free.contains( cursor.current().key, cursor.current().value )) {
        dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << endl;
      }
      assert(!free.contains( cursor.current().key, cursor.current().value ));
      free.insert( cursor.current().key, cursor.current().value );

      if (cursor.move_right() <= 0) break;
    }
  }

  assert(n == fs->free_blocks);
  dout(0) << "dump combined freelist is " << free << endl;

  // alloc_tab
  if (fs->alloc_tab->get_num_keys() > 0) {
    Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);

    // same as above: the lookup has to happen even when asserts are off
    int found = fs->alloc_tab->find(0, cursor);
    assert(found >= 0);
    (void)found;

    while (1) {
      dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref "
              << cursor.current().value.second
              << endl;
      assert(cursor.current().value.first > 0);
      if (cursor.move_right() <= 0) break;
    }
  }
}
/*
 * Look in one free bucket for an extent of at least num blocks,
 * starting from position near.
 *
 * dir selects the search: DIR_FWD scans to the right only, DIR_BACK to
 * the left only, DIR_ANY tries the right first and then the left.
 *
 * On success ex is set to the *whole* extent found (it may be longer
 * than num) and 0 is returned; nothing is removed from the table.
 * Returns -1 if no suitable extent was found.
 */
int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir)
{
  Table<block_t,block_t> *tab = fs->free_tab[bucket];
  Table<block_t,block_t>::Cursor cursor(tab);

  const bool want_fwd  = (dir == DIR_ANY || dir == DIR_FWD);
  const bool want_back = (dir == DIR_ANY || dir == DIR_BACK);
  bool hit = false;

  // look to the right
  if (want_fwd && tab->find( near, cursor ) >= 0) {
    for (;;) {
      if (cursor.current().value >= num) {
        hit = true;
        break;
      }
      if (cursor.move_right() <= 0)
        break;
    }
  }

  // look to the left
  if (want_back && !hit) {
    tab->find( near, cursor );   // re-seat the cursor at near
    while (cursor.move_left() >= 0) {
      if (cursor.current().value >= num) {
        hit = true;
        break;
      }
    }
  }

  if (!hit)
    return -1;

  ex.start = cursor.current().key;
  ex.length = cursor.current().value;
  return 0;
}
/*
 * Allocate up to num blocks, preferably close to near.
 *
 * near may be a block position or one of the hints NEAR_LAST /
 * NEAR_LAST_FWD, both of which mean "around last_pos"; NEAR_LAST_FWD
 * additionally searches forward first and only then backward.
 *
 * First a contiguous extent of >= num blocks is looked for in every
 * bucket from pick_bucket(num) upward.  A larger extent is carved up and
 * the unused head/tail go straight back to the freelist.  If that fails,
 * progressively halved sizes are tried and a shorter extent is returned.
 *
 * On success ex holds the allocated extent, last_pos is moved to its end,
 * and (if g_conf.ebofs_cloneable) the extent is referenced in alloc_tab.
 * Returns the number of blocks allocated: num for a full allocation,
 * ex.length (< num) for a partial one.  If nothing at all is free the
 * function asserts, and returns -1 when asserts are compiled out.
 */
int Allocator::allocate(Extent& ex, block_t num, block_t near)
{
  //dump_freelist();

  // turn the symbolic hints into a position plus a search direction
  int dir = DIR_ANY;  // no dir
  if (near == NEAR_LAST_FWD) {
    dir = DIR_FWD;    // fwd
    near = last_pos;
  } else if (near == NEAR_LAST) {
    near = last_pos;
  }

  for (;;) {  // runs twice if we started out forward-only
    // look for contiguous extent
    for (int bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; ++bucket) {
      if (find(ex, bucket, num, near, dir) < 0)
        continue;

      // yay!  take the whole original extent off the freelist
      fs->free_tab[bucket]->remove( ex.start );
      fs->free_blocks -= ex.length;

      if (ex.length > num) {
        // too big: keep num blocks, hand the rest back
        if (!(ex.start < near)) {
          // extent lies to the right of near.  take left-most part.
          Extent tail;
          tail.start = ex.start + num;
          tail.length = ex.length - num;
          ex.length = num;
          _release_loner(tail);
        }
        else if (ex.start + ex.length - num <= near) {
          // extent lies to the left, by a lot.  take right-most portion.
          Extent head;
          head.start = ex.start;
          head.length = ex.length - num;
          ex.start += head.length;
          ex.length -= head.length;
          assert(ex.length == num);
          _release_loner(head);
        }
        else {
          // extent straddles near.  take the middle part, beginning at near.
          Extent head, tail;
          head.start = ex.start;
          head.length = near - ex.start;
          ex.start = near;
          tail.start = ex.start + num;
          tail.length = ex.length - head.length - num;
          ex.length = num;
          _release_loner(head);
          _release_loner(tail);
        }
      }

      dout(20) << "allocate " << ex << " near " << near << endl;
      last_pos = ex.end();
      //dump_freelist();
      if (g_conf.ebofs_cloneable) {
        alloc_inc(ex);
      }
      return num;
    }

    // only a forward-only search earns a second, backward pass
    if (dir == DIR_BACK || dir == DIR_ANY) break;
    dir = DIR_BACK;
  }

  // ok, find partial extent instead.
  for (block_t want = num/2; want >= 1; want /= 2) {
    const int b = pick_bucket(want);
    if (find(ex, b, want, near) < 0)
      continue;

    // yay!  take it as is, even though it is shorter than asked for
    assert(ex.length < num);
    fs->free_tab[b]->remove(ex.start);
    fs->free_blocks -= ex.length;
    last_pos = ex.end();
    dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << endl;
    //dump_freelist();
    if (g_conf.ebofs_cloneable) {
      alloc_inc(ex);
    }
    return ex.length;
  }

  dout(1) << "allocate failed, fs completely full! " << fs->free_blocks << endl;
  assert(0);
  //dump_freelist();
  return -1;
}
/*
 * Park a released extent in the in-memory limbo set and account for it
 * in fs->limbo_blocks.  The extent must not be empty.
 * Always returns 0.
 */
int Allocator::_release_into_limbo(Extent& ex)
{
  dout(10) << "_release_into_limbo " << ex << endl;
  dout(10) << "limbo is " << limbo << endl;

  assert(ex.length > 0);

  limbo.insert(ex.start, ex.length);
  fs->limbo_blocks += ex.length;

  return 0;
}
/*
 * Release an extent.
 *
 * Without g_conf.ebofs_cloneable the extent goes straight into limbo and
 * 0 is returned.  With it, one reference is dropped through alloc_dec()
 * and that function's result is returned.
 */
int Allocator::release(Extent& ex)
{
  if (!g_conf.ebofs_cloneable) {
    _release_into_limbo(ex);
    return 0;
  }
  return alloc_dec(ex);
}
int Allocator::commit_limbo()
{
dout(20) << "commit_limbo" << endl;
for (map<block_t,block_t>::iterator i = limbo.m.begin();
i != limbo.m.end();
i++) {
fs->limbo_tab->insert(i->first, i->second);
//fs->free_blocks += i->second;
}
limbo.clear();
//fs->limbo_blocks = 0;
//dump_freelist();
return 0;
}
/*
 * Move everything in fs->limbo_tab back onto the freelist.
 *
 * Each limbo extent is subtracted from fs->limbo_blocks and handed to
 * _release_merge(); afterwards limbo_tab is cleared.
 * Always returns 0.
 */
int Allocator::release_limbo()
{
  //dump_freelist();
  if (fs->limbo_tab->get_num_keys() > 0) {
    Table<block_t,block_t>::Cursor cursor(fs->limbo_tab);
    fs->limbo_tab->find(0, cursor);   // start from the lowest key

    do {
      Extent ex(cursor.current().key, cursor.current().value);
      dout(20) << "release_limbo ex " << ex << endl;

      fs->limbo_blocks -= ex.length;
      _release_merge(ex);
    } while (cursor.move_right() > 0);
  }

  fs->limbo_tab->clear();
  //dump_freelist();
  return 0;
}
/*
int Allocator::_alloc_loner_inc(Extent& ex)
{
Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
if (fs->alloc_tab->find( ex.start, cursor )
== Table<block_t,pair<block_t,int> >::Cursor::MATCH) {
assert(cursor.current().value.first == ex.length);
pair<block_t,int>& v = cursor.dirty_current_value();
v.second++;
dout(10) << "_alloc_loner_inc " << ex << " "
<< (v.second-1) << " -> " << v.second
<< endl;
} else {
// insert it, @1
fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << endl;
}
return 0;
}
int Allocator::_alloc_loner_dec(Extent& ex)
{
Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
if (fs->alloc_tab->find( ex.start, cursor )
== Table<block_t,pair<block_t,int> >::Cursor::MATCH) {
assert(cursor.current().value.first == ex.length);
if (cursor.current().value.second == 1) {
dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << endl;
fs->alloc_tab->remove( cursor.current().key );
} else {
pair<block_t,int>& v = cursor.dirty_current_value();
--v.second;
dout(10) << "_alloc_loner_dec " << ex << " "
<< (v.second+1) << " -> " << v.second
<< endl;
}
} else {
assert(0);
}
return 0;
}
*/
/*
 * Take one reference on every block of ex in fs->alloc_tab.
 *
 * alloc_tab maps an extent start to (length, refcount).  ex is a private
 * copy that is consumed front to back: ranges not yet in the table are
 * inserted with refcount 1, and existing entries that overlap ex are
 * split so that exactly the overlapped part gets refcount+1.
 *
 * Always returns 0.
 */
int Allocator::alloc_inc(Extent ex)
{
  dout(10) << "alloc_inc " << ex << endl;

  // empty table?
  if (fs->alloc_tab->get_num_keys() == 0) {
    // easy.
    fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
    dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << endl;
    return 0;
  }

  Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);

  // try to move to left (to check for overlap)
  int r = fs->alloc_tab->find( ex.start, cursor );
  if (r == Table<block_t,pair<block_t,int> >::Cursor::OOB ||
      cursor.current().key > ex.start) {
    r = cursor.move_left();
    dout(10) << "alloc_inc move_left r = " << r << endl;
  }

  while (1) {
    dout(10) << "alloc_inc loop at " << cursor.current().key
             << "~" << cursor.current().value.first
             << " ref " << cursor.current().value.second
             << endl;

    // too far left?  (entry ends at or before the start of ex)
    if (cursor.current().key < ex.start &&
        cursor.current().key + cursor.current().value.first <= ex.start) {
      // adjacent?  only an entry with refcount 1 can simply be extended.
      bool adjacent = false;
      if (cursor.current().key + cursor.current().value.first == ex.start &&
          cursor.current().value.second == 1)
        adjacent = true;

      // no overlap.
      r = cursor.move_right();
      dout(10) << "alloc_inc move_right r = " << r << endl;

      // at end?
      if (r <= 0) {
        // hmm!  nothing to our right, so all of ex is new.
        if (adjacent) {
          // adjust previous entry
          cursor.move_left();
          pair<block_t,int> &v = cursor.dirty_current_value();
          v.first += ex.length;  // yay!
          dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << endl;
        } else {
          // insert at end, finish.
          int ins = fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
          dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << ins << endl;
          //dump_freelist();
        }
        return 0;
      }
      // otherwise fall through and examine the entry we moved onto
    }

    if (cursor.current().key > ex.start) {
      // gap.
      //      oooooo
      // nnnnn.....
      // insert the uncovered front of ex (all of it if it ends first)
      block_t l = MIN(ex.length, cursor.current().key - ex.start);

      fs->alloc_tab->insert(ex.start, pair<block_t,int>(l,1));
      dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << endl;
      ex.start += l;
      ex.length -= l;
      if (ex.length == 0) break;
      fs->alloc_tab->find( ex.start, cursor );  // re-seat cursor after the insert
    }
    else if (cursor.current().key < ex.start) {
      // existing entry starts before ex and overlaps it
      block_t end = cursor.current().value.first + cursor.current().key;

      if (end <= ex.end()) {
        // single split
        // oooooo
        //    nnnnn
        pair<block_t,int>& v = cursor.dirty_current_value();
        v.first = ex.start - cursor.current().key;  // keep only the left part
        int ref = v.second;

        block_t l = end - ex.start;
        fs->alloc_tab->insert(ex.start, pair<block_t,int>(l, 1+ref));

        dout(10) << "alloc_inc   " << ex.start << "~" << l
                 << " " << ref << " -> " << ref+1
                 << " (right split)" << endl;

        ex.start += l;
        ex.length -= l;
        if (ex.length == 0) break;
        fs->alloc_tab->find( ex.start, cursor );  // re-seat cursor after the insert
      } else {
        // double split, finish.
        // -------------
        //    ------
        pair<block_t,int>& v = cursor.dirty_current_value();
        v.first = ex.start - cursor.current().key;  // left part keeps old ref
        int ref = v.second;

        fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length, 1+ref));

        // remainder to the right of ex keeps the old ref.  Held in block_t
        // so the length is never squeezed through an int.
        block_t rl = end - ex.end();
        fs->alloc_tab->insert(ex.end(), pair<block_t,int>(rl, ref));

        dout(10) << "alloc_inc   " << ex
                 << " " << ref << " -> " << ref+1
                 << " (double split finish)"
                 << endl;

        break;
      }
    }
    else {
      assert(cursor.current().key == ex.start);

      if (cursor.current().value.first <= ex.length) {
        // inc.
        // oooooo
        // nnnnnnnn
        pair<block_t,int>& v = cursor.dirty_current_value();
        v.second++;
        dout(10) << "alloc_inc   " << ex.start << "~" << cursor.current().value.first
                 << " " << cursor.current().value.second-1 << " -> "
                 << cursor.current().value.second
                 << " (left split)" << endl;
        ex.start += v.first;
        ex.length -= v.first;
        if (ex.length == 0) break;
        // NOTE(review): if this runs off the end of the table, the next
        // iteration reads cursor.current() -- confirm Cursor semantics.
        cursor.move_right();
      } else {
        // single split, finish.
        // oooooo
        // nnn
        block_t l = cursor.current().value.first - ex.length;
        int ref = cursor.current().value.second;

        pair<block_t,int>& v = cursor.dirty_current_value();
        v.first = ex.length;
        v.second++;

        // right-hand remainder keeps the old ref
        fs->alloc_tab->insert(ex.end(), pair<block_t,int>(l, ref));

        dout(10) << "alloc_inc   " << ex
                 << " " << ref << " -> " << ref+1
                 << " (left split finish)"
                 << endl;

        break;
      }
    }
  }

  return 0;
}
/*
 * Drop one reference on every block of ex in fs->alloc_tab.
 *
 * alloc_tab maps an extent start to (length, refcount).  ex is a private
 * copy that is consumed front to back.  Entries overlapping ex are split
 * as needed; the overlapped part is re-inserted with refcount-1, or, if
 * its refcount was 1, dropped from the table and released into limbo.
 *
 * All of ex must currently be present in alloc_tab: a gap trips an
 * assert.  Always returns 0.
 */
int Allocator::alloc_dec(Extent ex)
{
  dout(10) << "alloc_dec " << ex << endl;

  // NOTE(review): this check can only fail if get_num_keys() is negative;
  // "> 0" may have been intended -- confirm before tightening.
  assert(fs->alloc_tab->get_num_keys() >= 0);

  Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);

  // try to move to left (to check for overlap)
  int r = fs->alloc_tab->find( ex.start, cursor );
  dout(10) << "alloc_dec find r = " << r << endl;

  if (r == Table<block_t,pair<block_t,int> >::Cursor::OOB ||
      cursor.current().key > ex.start) {
    r = cursor.move_left();
    dout(10) << "alloc_dec move_left r = " << r << endl;

    // too far left?
    if (cursor.current().key < ex.start &&
        cursor.current().key + cursor.current().value.first <= ex.start) {
      // no overlap.  the start of ex is not in the table at all.
      dump_freelist();
      assert(0);
    }
  }

  while (1) {
    dout(10) << "alloc_dec ? " << cursor.current().key
             << "~" << cursor.current().value.first
             << " " << cursor.current().value.second
             << ", ex is " << ex
             << endl;

    assert(cursor.current().key <= ex.start);  // no gap allowed.

    if (cursor.current().key < ex.start) {
      // existing entry starts before ex and overlaps it
      block_t end = cursor.current().value.first + cursor.current().key;

      if (end <= ex.end()) {
        // single split
        // oooooo
        //    -----
        pair<block_t,int>& v = cursor.dirty_current_value();
        v.first = ex.start - cursor.current().key;  // keep only the left part
        int ref = v.second;
        dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first
                 << " " << ref
                 << " shortened left bit of single" << endl;

        block_t l = end - ex.start;
        if (ref > 1) {
          fs->alloc_tab->insert(ex.start, pair<block_t,int>(l, ref-1));
          dout(10) << "alloc_dec . " << ex.start << "~" << l
                   << " " << ref << " -> " << ref-1
                   << endl;
        } else {
          // last reference gone
          Extent freed(ex.start, l);
          _release_into_limbo(freed);
        }

        ex.start += l;
        ex.length -= l;
        if (ex.length == 0) break;
        fs->alloc_tab->find( ex.start, cursor );  // re-seat cursor
      } else {
        // double split, finish.
        // ooooooooooooo
        //    ------
        pair<block_t,int>& v = cursor.dirty_current_value();
        v.first = ex.start - cursor.current().key;  // left part keeps old ref
        int ref = v.second;
        dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first
                 << " " << ref
                 << " shorted left bit of double split" << endl;

        if (ref > 1) {
          fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length, ref-1));
          dout(10) << "alloc_dec s " << ex
                   << " " << ref << " -> " << ref-1
                   << " reinserted middle bit of double split"
                   << endl;
        } else {
          _release_into_limbo(ex);
        }

        // remainder to the right of ex keeps the old ref.  Held in block_t
        // so the length is never squeezed through an int.
        block_t rl = end - ex.end();
        fs->alloc_tab->insert(ex.end(), pair<block_t,int>(rl, ref));
        dout(10) << "alloc_dec s " << ex.end() << "~" << rl
                 << " " << ref
                 << " reinserted right bit of double split" << endl;
        break;
      }
    }
    else {
      assert(cursor.current().key == ex.start);

      if (cursor.current().value.first <= ex.length) {
        // dec the whole entry.
        // oooooo
        // nnnnnnnn
        if (cursor.current().value.second > 1) {
          pair<block_t,int>& v = cursor.dirty_current_value();
          v.second--;
          dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first
                   << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second
                   << endl;
          ex.start += v.first;
          ex.length -= v.first;
          if (ex.length == 0) break;
          // NOTE(review): if this runs off the end of the table, the next
          // iteration reads cursor.current() -- confirm Cursor semantics.
          cursor.move_right();
        } else {
          // last reference gone: release the entry and drop it
          Extent freed(cursor.current().key, cursor.current().value.first);
          _release_into_limbo(freed);

          ex.start += cursor.current().value.first;
          ex.length -= cursor.current().value.first;
          cursor.remove();

          if (ex.length == 0) break;
          fs->alloc_tab->find( ex.start, cursor );  // re-seat cursor
        }
      } else {
        // single split, finish.
        // oooooo
        // nnn
        block_t l = cursor.current().value.first - ex.length;
        int ref = cursor.current().value.second;

        if (ref > 1) {
          pair<block_t,int>& v = cursor.dirty_current_value();
          v.first = ex.length;
          v.second--;
          dout(10) << "alloc_dec . " << ex
                   << " " << ref << " -> " << ref-1
                   << endl;
        } else {
          _release_into_limbo(ex);
          cursor.remove();
        }

        // right-hand remainder keeps the old ref
        dout(10) << "alloc_dec s " << ex.end() << "~" << l
                 << " " << ref
                 << " reinserted right bit of single split" << endl;
        fs->alloc_tab->insert(ex.end(), pair<block_t,int>(l, ref));
        break;
      }
    }
  }

  return 0;
}
/*
 * Put an extent onto the freelist as is, without looking for neighbours.
 * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents
 *
 * The extent goes into the bucket chosen by pick_bucket() for its length
 * and is added to fs->free_blocks.  It must not be empty.
 * Always returns 0.
 */
int Allocator::_release_loner(Extent& ex)
{
  assert(ex.length > 0);

  const int bucket = pick_bucket(ex.length);
  Table<block_t,block_t> *tab = fs->free_tab[bucket];

  tab->insert(ex.start, ex.length);
  fs->free_blocks += ex.length;

  return 0;
}
/*
 * Put an extent onto the freelist, coalescing it with a free extent that
 * begins right where it ends and/or one that ends right where it begins.
 *
 * A neighbour that gets absorbed is removed from its bucket and from
 * fs->free_blocks; the combined extent is then inserted through
 * _release_loner(), which accounts for the full merged length.
 * orig itself is not modified.  Always returns 0.
 */
int Allocator::_release_merge(Extent& orig)
{
  dout(15) << "_release_merge " << orig << endl;
  assert(orig.length > 0);

  Extent merged = orig;

  // one after us?  it would be keyed exactly at our end.
  for (int b = 0; b < EBOFS_NUM_FREE_BUCKETS; ++b) {
    Table<block_t,block_t> *tab = fs->free_tab[b];
    Table<block_t,block_t>::Cursor cursor(tab);

    if (tab->find( merged.start+merged.length, cursor )
        != Table<block_t,block_t>::Cursor::MATCH)
      continue;

    // add following extent to ours
    merged.length += cursor.current().value;

    // remove it
    fs->free_blocks -= cursor.current().value;
    tab->remove( cursor.current().key );
    break;
  }

  // one before us?  step left from our end and see whether that extent
  // finishes exactly at our start.
  for (int b = 0; b < EBOFS_NUM_FREE_BUCKETS; ++b) {
    Table<block_t,block_t> *tab = fs->free_tab[b];
    Table<block_t,block_t>::Cursor cursor(tab);

    tab->find( merged.start+merged.length, cursor );
    if (cursor.move_left() < 0)
      continue;
    if (cursor.current().key + cursor.current().value != merged.start)
      continue;

    // merge
    merged.start = cursor.current().key;
    merged.length += cursor.current().value;

    // remove it
    fs->free_blocks -= cursor.current().value;
    tab->remove( cursor.current().key );
    break;
  }

  // ok, insert the (possibly grown) extent
  _release_loner(merged);
  return 0;
}