// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */


#include "BufferCache.h"
#include "Onode.h"


/*********** BufferHead **************/

#undef dout
#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.bh."


/************ ObjectCache **************/

#undef dout
#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.oc."

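/*
 * rx_finish -- read io completion.  marry the data in bl (read from disk
 * blocks start~length) into the bufferheads still waiting on it: rx bh's
 * become clean, partials are applied and written back out, and anything
 * overwritten in the meantime (dirty/tx/clean) is left alone.  finally,
 * kick any read waiters.
 */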
void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl)
{
  list<Context*> waiters;

  dout(10) << "rx_finish " << start << "~" << length << endl;
  for (map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
       p != data.end();
       p++) {
    BufferHead *bh = p->second;
    dout(10) << "rx_finish ?" << *bh << endl;
    assert(p->first == bh->start());

    // past?
    if (p->first >= start+length) break;
    if (bh->end() > start+length) break;  // past

    assert(p->first >= start);
    assert(bh->end() <= start+length);

    dout(10) << "rx_finish !" << *bh << endl;

    if (bh->rx_ioh == ioh)
      bh->rx_ioh = 0;

    if (bh->is_rx()) {
      assert(bh->get_version() == 0);
      assert(bh->end() <= start+length);
      assert(bh->start() >= start);
      dout(10) << "rx_finish rx -> clean on " << *bh << endl;
      bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE);
      bc->mark_clean(bh);
    }
    else if (bh->is_partial()) {
      dout(10) << "rx_finish partial -> tx on " << *bh << endl;

      if (1) {
        // double-check what block i am
        vector<Extent> exv;
        on->map_extents(bh->start(), 1, exv);
        assert(exv.size() == 1);
        block_t cur_block = exv[0].start;
        assert(cur_block == bh->partial_tx_to);
      }

      // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves)
      bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch );

      // apply partial to myself
      assert(bh->data.length() == 0);
      bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
      bh->data.push_back( bp );
      bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl);
      bh->apply_partial();

      // write "normally"
      bc->mark_dirty(bh);
      bc->bh_write(on, bh, bh->partial_tx_to);

      // clean up a bit
      bh->partial_tx_to = 0;
      bh->partial_tx_epoch = 0;
      bh->partial.clear();
    }
    else {
      dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << endl;
      assert(bh->is_dirty() ||  // was overwritten
             bh->is_tx() ||     // was overwritten and queued
             bh->is_clean());   // was overwritten, queued, _and_ flushed to disk
    }

    // trigger waiters
    for (map<block_t,list<Context*> >::iterator p = bh->waitfor_read.begin();
         p != bh->waitfor_read.end();
         p++) {
      assert(p->first >= bh->start() && p->first < bh->end());
      waiters.splice(waiters.begin(), p->second);
    }
    bh->waitfor_read.clear();
  }

  finish_contexts(waiters);
}

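/*
 * tx_finish -- write io completion.  mark matching tx bufferheads clean,
 * but only if the flushed version is still current; a bh rewritten since
 * the io was submitted keeps its tx state.
 */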
void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length,
                            version_t version, version_t epoch)
{
  dout(10) << "tx_finish " << start << "~" << length << " v" << version << endl;
  for (map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
       p != data.end();
       p++) {
    BufferHead *bh = p->second;
    dout(30) << "tx_finish ?bh " << *bh << endl;
    assert(p->first == bh->start());

    // past?
    if (p->first >= start+length) break;

    if (bh->tx_ioh == ioh)
      bh->tx_ioh = 0;

    if (!bh->is_tx()) {
      dout(10) << "tx_finish bh not marked tx, skipping" << endl;
      continue;
    }
    assert(bh->is_tx());

    if (version == bh->version) {
      dout(10) << "tx_finish tx -> clean on " << *bh << endl;
      assert(bh->end() <= start+length);
      bh->set_last_flushed(version);
      bc->mark_clean(bh);
    } else {
      dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version
               << " on " << *bh << endl;
      assert(bh->version > version);
    }
  }
}

/*
 * find_tx -- collect bh's in this range that are TX.  only bh's falling
 * _fully_ within start~len are returned; partial overlap at the edges is
 * deliberately ignored.
 */
int ObjectCache::find_tx(block_t start, block_t len,
                         list<BufferHead*>& tx)
{
  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);

  block_t cur = start;
  block_t left = len;

  /* don't care about overlap, we want things _fully_ in start~len.
  if (p != data.begin() &&
      (p == data.end() || p->first > cur)) {
    p--;  // might overlap!
    if (p->first + p->second->length() <= cur)
      p++;  // doesn't overlap.
  }
  */

  while (left > 0) {
    assert(cur+left == start+len);

    // at end?
    if (p == data.end())
      break;

    if (p->first <= cur) {
      // have it (or part of it)
      BufferHead *e = p->second;

      if (e->end() <= start+len &&
          e->is_tx())
        tx.push_back(e);

      block_t lenfromcur = MIN(e->end() - cur, left);
      cur += lenfromcur;
      left -= lenfromcur;
      p++;
      continue;  // more?
    } else if (p->first > cur) {
      // gap.. miss
      block_t next = p->first;
      left -= (next-cur);
      cur = next;
      continue;
    }
    else
      assert(0);
  }

  return 0;
}

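/*
 * try_map_read -- count how many pieces of start~len are not yet readable
 * (missing extents, in-flight rx, or partial bh's), without creating any
 * bufferheads or changing state.  returns 0 if the whole range could be
 * read from the cache right now.
 */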
int ObjectCache::try_map_read(block_t start, block_t len)
{
  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);

  block_t cur = start;
  block_t left = len;

  if (p != data.begin() &&
      (p == data.end() || p->first > cur)) {
    p--;  // might overlap!
    if (p->first + p->second->length() <= cur)
      p++;  // doesn't overlap.
  }

  int num_missing = 0;

  while (left > 0) {
    // at end?
    if (p == data.end()) {
      // rest is a miss.
      vector<Extent> exv;
      on->map_extents(cur,
                      left,  // no prefetch here!
                      exv);

      num_missing += exv.size();
      left = 0;
      cur = start+len;
      break;
    }

    if (p->first <= cur) {
      // have it (or part of it)
      BufferHead *e = p->second;

      if (e->is_clean() ||
          e->is_dirty() ||
          e->is_tx()) {
        dout(20) << "try_map_read hit " << *e << endl;
      }
      else if (e->is_rx()) {
        dout(20) << "try_map_read rx " << *e << endl;
        num_missing++;
      }
      else if (e->is_partial()) {
        dout(20) << "try_map_read partial " << *e << endl;
        num_missing++;
      }
      else {
        dout(0) << "try_map_read got unexpected " << *e << endl;
        assert(0);
      }

      block_t lenfromcur = MIN(e->end() - cur, left);
      cur += lenfromcur;
      left -= lenfromcur;
      p++;
      continue;  // more?
    } else if (p->first > cur) {
      // gap.. miss
      block_t next = p->first;
      vector<Extent> exv;
      on->map_extents(cur,
                      MIN(next-cur, left),  // no prefetch
                      exv);

      dout(20) << "try_map_read gap of " << p->first-cur << " blocks, "
               << exv.size() << " extents" << endl;
      num_missing += exv.size();
      left -= (p->first - cur);
      cur = p->first;
      continue;  // more?
    }
    else
      assert(0);
  }

  assert(left == 0);
  assert(cur == start+len);
  return num_missing;
}

/*
 * map a range of blocks into buffer_heads.
 * - create missing buffer_heads as necessary.
 * - fragment along disk extent boundaries.
 */
int ObjectCache::map_read(block_t start, block_t len,
                          map<block_t, BufferHead*>& hits,
                          map<block_t, BufferHead*>& missing,
                          map<block_t, BufferHead*>& rx,
                          map<block_t, BufferHead*>& partial) {

  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);

  block_t cur = start;
  block_t left = len;

  if (p != data.begin() &&
      (p == data.end() || p->first > cur)) {
    p--;  // might overlap!
    if (p->first + p->second->length() <= cur)
      p++;  // doesn't overlap.
  }

  while (left > 0) {
    // at end?
    if (p == data.end()) {
      // rest is a miss.
      vector<Extent> exv;
      //on->map_extents(cur, left, exv);  // we might consider some prefetch here.
      on->map_extents(cur,
                      //MIN(left + g_conf.ebofs_max_prefetch,  // prefetch
                      //    on->object_blocks-cur),
                      left,  // no prefetch
                      exv);
      for (unsigned i=0; i<exv.size() && left > 0; i++) {
        BufferHead *n = new BufferHead(this);
        n->set_start( cur );
        n->set_length( exv[i].length );
        bc->add_bh(n);
        missing[cur] = n;
        dout(20) << "map_read miss " << left << " left, " << *n << endl;
        cur += MIN(left, exv[i].length);
        left -= MIN(left, exv[i].length);
      }
      assert(left == 0);
      assert(cur == start+len);
      break;
    }

    if (p->first <= cur) {
      // have it (or part of it)
      BufferHead *e = p->second;

      if (e->is_clean() ||
          e->is_dirty() ||
          e->is_tx()) {
        hits[cur] = e;  // readable!
        dout(20) << "map_read hit " << *e << endl;
        bc->touch(e);
      }
      else if (e->is_rx()) {
        rx[cur] = e;  // missing, not readable.
        dout(20) << "map_read rx " << *e << endl;
      }
      else if (e->is_partial()) {
        partial[cur] = e;
        dout(20) << "map_read partial " << *e << endl;
      }
      else {
        dout(0) << "map_read ??? got unexpected " << *e << endl;
        assert(0);
      }

      block_t lenfromcur = MIN(e->end() - cur, left);
      cur += lenfromcur;
      left -= lenfromcur;
      p++;
      continue;  // more?
    } else if (p->first > cur) {
      // gap.. miss
      block_t next = p->first;
      vector<Extent> exv;
      on->map_extents(cur,
                      //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch,  // prefetch
                      //                  on->object_blocks-cur)),
                      MIN(next-cur, left),  // no prefetch
                      exv);

      for (unsigned i=0; i<exv.size() && left>0; i++) {
        BufferHead *n = new BufferHead(this);
        n->set_start( cur );
        n->set_length( exv[i].length );
        bc->add_bh(n);
        missing[cur] = n;
        cur += MIN(left, n->length());
        left -= MIN(left, n->length());
        dout(20) << "map_read gap " << *n << endl;
      }
      continue;  // more?
    }
    else
      assert(0);
  }

  assert(left == 0);
  assert(cur == start+len);
  return 0;
}

/*
 * map a range of blocks onto an object's buffer cache.
 *
 * - break up bufferheads that don't fall completely within the range.
 * - cancel rx ops we obsolete.
 *   - resubmit rx ops if we split bufferheads.
 *
 * - leave potentially obsoleted tx ops alone (for now).
 * - don't worry about disk extent boundaries (yet).
 */
int ObjectCache::map_write(block_t start, block_t len,
                           interval_set<block_t>& alloc,
                           map<block_t, BufferHead*>& hits,
                           version_t super_epoch)
{
  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);

  dout(10) << "map_write " << *on << " " << start << "~" << len << " ... alloc " << alloc << endl;
  // p->first >= start

  block_t cur = start;
  block_t left = len;

  if (p != data.begin() &&
      (p == data.end() || p->first > cur)) {
    p--;  // might overlap!
    if (p->first + p->second->length() <= cur)
      p++;  // doesn't overlap.
  }

  //dump();

  while (left > 0) {
    // max for this bh (bc of (re)alloc on disk)
    block_t max = left;
    bool newalloc = false;

    // based on alloc/no-alloc boundary ...
    if (alloc.contains(cur, left)) {
      if (alloc.contains(cur)) {
        block_t ends = alloc.end_after(cur);
        max = MIN(left, ends-cur);
        newalloc = true;
      } else {
        if (alloc.starts_after(cur)) {
          block_t st = alloc.start_after(cur);
          max = MIN(left, st-cur);
        }
      }
    }

    // based on disk extent boundary ...
    vector<Extent> exv;
    on->map_extents(cur, max, exv);
    if (exv.size() > 1)
      max = exv[0].length;

    if (newalloc) {
      dout(10) << "map_write " << cur << "~" << max << " is new alloc on disk" << endl;
    } else {
      dout(10) << "map_write " << cur << "~" << max << " keeps old alloc on disk" << endl;
    }

    // at end?
    if (p == data.end()) {
      BufferHead *n = new BufferHead(this);
      n->set_start( cur );
      n->set_length( max );
      bc->add_bh(n);
      hits[cur] = n;
      left -= max;
      cur += max;
      continue;
    }

    dout(10) << "p is " << *p->second << endl;

    if (p->first <= cur) {
      BufferHead *bh = p->second;
      dout(10) << "map_write bh " << *bh << " intersected" << endl;

      if (p->first < cur) {
        if (cur+max >= p->first+p->second->length()) {
          // we want the right bit (one splice)
          if (bh->is_rx() && bc->bh_cancel_read(bh)) {
            BufferHead *right = bc->split(bh, cur);
            bc->bh_read(on, bh);  // reread left bit
            bh = right;
          } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
            BufferHead *right = bc->split(bh, cur);
            bc->bh_write(on, bh);  // rewrite left bit
            bh = right;
          } else {
            bh = bc->split(bh, cur);  // just split it
          }
          p++;
          assert(p->second == bh);
        } else {
          // we want the middle bit (two splices)
          if (bh->is_rx() && bc->bh_cancel_read(bh)) {
            BufferHead *middle = bc->split(bh, cur);
            bc->bh_read(on, bh);  // reread left
            p++;
            assert(p->second == middle);
            BufferHead *right = bc->split(middle, cur+max);
            bc->bh_read(on, right);  // reread right
            bh = middle;
          } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
            BufferHead *middle = bc->split(bh, cur);
            bc->bh_write(on, bh);  // redo left
            p++;
            assert(p->second == middle);
            BufferHead *right = bc->split(middle, cur+max);
            bc->bh_write(on, right);  // redo right
            bh = middle;
          } else {
            BufferHead *middle = bc->split(bh, cur);
            p++;
            assert(p->second == middle);
            bc->split(middle, cur+max);
            bh = middle;
          }
        }
      } else if (p->first == cur) {
        if (bh->length() <= max) {
          // whole bufferhead, piece of cake.
        } else {
          // we want the left bit (one splice)
          if (bh->is_rx() && bc->bh_cancel_read(bh)) {
            BufferHead *right = bc->split(bh, cur+max);
            bc->bh_read(on, right);  // re-rx the right bit
          } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) {
            BufferHead *right = bc->split(bh, cur+max);
            bc->bh_write(on, right);  // re-tx the right bit
          } else {
            bc->split(bh, cur+max);  // just split
          }
        }
      }

      // try to cancel tx?
      if (bh->is_tx() && !newalloc) bc->bh_cancel_write(bh, super_epoch);

      // put in our map
      hits[cur] = bh;

      // keep going.
      block_t lenfromcur = bh->end() - cur;
      cur += lenfromcur;
      left -= lenfromcur;
      p++;
      continue;
    } else {
      // gap!
      block_t next = p->first;
      block_t glen = MIN(next-cur, max);
      dout(10) << "map_write gap " << cur << "~" << glen << endl;
      BufferHead *n = new BufferHead(this);
      n->set_start( cur );
      n->set_length( glen );
      bc->add_bh(n);
      hits[cur] = n;

      cur += glen;
      left -= glen;
      continue;  // more?
    }
  }

  assert(left == 0);
  assert(cur == start+len);
  return 0;
}

/* don't need this.
int ObjectCache::scan_versions(block_t start, block_t len,
                               version_t& low, version_t& high)
{
  map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
  // p->first >= start

  if (p != data.begin() && p->first > start) {
    p--;  // might overlap?
    if (p->first + p->second->length() <= start)
      p++;  // doesn't overlap.
  }
  if (p->first >= start+len)
    return -1;  // to the right. no hits.

  // start
  low = high = p->second->get_version();

  for (p++; p != data.end(); p++) {
    // past?
    if (p->first >= start+len) break;

    const version_t v = p->second->get_version();
    if (low > v) low = v;
    if (high < v) high = v;
  }

  return 0;
}
*/

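/*
 * touch_bottom -- demote bufferheads falling entirely within
 * bstart~blast to the bottom of the lru, making them first in line for
 * reclaim.
 */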
void ObjectCache::touch_bottom(block_t bstart, block_t blast)
{
  for (map<block_t, BufferHead*>::iterator p = data.lower_bound(bstart);
       p != data.end();
       ++p) {
    BufferHead *bh = p->second;

    // only demote bh's that fall entirely within our range
    if (bh->start() < bstart) continue;
    if (bh->end() > blast) break;

    dout(12) << "moving " << *bh << " to bottom of lru" << endl;
    bc->touch_bottom(bh);  // move to bottom of lru list
  }
}

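/*
 * truncate -- discard all bufferheads at or past the new size (in
 * blocks), splitting any bh that straddles the boundary and cancelling
 * pending io where possible.  read waiters on discarded bh's are failed
 * with -1.
 */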
void ObjectCache::truncate(block_t blocks, version_t super_epoch)
{
  dout(7) << "truncate " << object_id
          << " " << blocks << " blocks"
          << endl;

  while (!data.empty()) {
    block_t bhoff = data.rbegin()->first;
    BufferHead *bh = data.rbegin()->second;

    if (bh->end() <= blocks) break;

    bool uncom = on->uncommitted.contains(bh->start(), bh->length());
    dout(10) << "truncate " << *bh << " uncom " << uncom
             << " of " << on->uncommitted
             << endl;

    if (bhoff < blocks) {
      // bh straddles the boundary; split off the right bit to discard (one splice)
      if (bh->is_rx() && bc->bh_cancel_read(bh)) {
        BufferHead *right = bc->split(bh, blocks);
        bc->bh_read(on, bh);  // reread left bit
        bh = right;
      } else if (bh->is_tx() && uncom && bc->bh_cancel_write(bh, super_epoch)) {
        BufferHead *right = bc->split(bh, blocks);
        bc->bh_write(on, bh);  // rewrite left bit
        bh = right;
      } else {
        bh = bc->split(bh, blocks);  // just split it
      }
      // no worries about partials up here, they're always 1 block (and thus never split)
    } else {
      // whole thing
      // cancel any pending/queued io, if possible.
      if (bh->is_rx())
        bc->bh_cancel_read(bh);
      if (bh->is_tx() && uncom)
        bc->bh_cancel_write(bh, super_epoch);
      if (bh->shadow_of) {
        dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << endl;
        // shadow
        bh->shadow_of->remove_shadow(bh);
        if (bh->is_partial())
          bc->cancel_shadow_partial(bh->rx_from.start, bh);
      } else {
        // normal
        if (bh->is_partial() && uncom)
          bc->bh_cancel_partial_write(bh);
      }
    }

    for (map<block_t,list<Context*> >::iterator p = bh->waitfor_read.begin();
         p != bh->waitfor_read.end();
         p++) {
      finish_contexts(p->second, -1);
    }

    bc->remove_bh(bh);
    delete bh;
  }
}

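/*
 * clone_to -- duplicate any dirty, tx, or partial bufferheads into the
 * clone's object cache, sharing refs to the underlying buffers.  clean
 * copies register as shadows of the originals.
 */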
void ObjectCache::clone_to(Onode *other)
{
  ObjectCache *ton = 0;

  for (map<block_t, BufferHead*>::iterator p = data.begin();
       p != data.end();
       p++) {
    BufferHead *bh = p->second;
    dout(10) << "clone_to ? " << *bh << endl;
    if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) {
      // dup dirty or tx bh's
      if (!ton)
        ton = other->get_oc(bc);
      BufferHead *nbh = new BufferHead(ton);
      nbh->set_start( bh->start() );
      nbh->set_length( bh->length() );
      nbh->data = bh->data;  // just copy refs to underlying buffers.
      bc->add_bh(nbh);

      if (bh->is_partial()) {
        dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << endl;
        nbh->partial = bh->partial;
        bc->mark_partial(nbh);
        // register as shadow_partial
        bc->add_shadow_partial(bh->rx_from.start, nbh);
      } else {
        // clean buffer will shadow
        bh->add_shadow(nbh);
        bc->mark_clean(nbh);
      }

      dout(10) << "clone_to dup " << *bh << " -> " << *nbh << endl;
    }
  }
}



/************** BufferCache ***************/

#undef dout
#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.bc."

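/*
 * split -- split a bufferhead at 'after', returning the new right half.
 * version, state, shadows, cached data, rx_from, and read waiters are
 * all divided or duplicated to match.
 */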
BufferHead *BufferCache::split(BufferHead *orig, block_t after)
{
  dout(20) << "split " << *orig << " at " << after << endl;

  // split off right
  BufferHead *right = new BufferHead(orig->get_oc());
  right->set_version(orig->get_version());
  right->epoch_modified = orig->epoch_modified;
  right->last_flushed = orig->last_flushed;
  right->set_state(orig->get_state());

  block_t newleftlen = after - orig->start();
  right->set_start( after );
  right->set_length( orig->length() - newleftlen );

  // shorten left
  stat_sub(orig);
  orig->set_length( newleftlen );
  stat_add(orig);

  // add right
  add_bh(right);

  // adjust rx_from
  if (orig->is_rx()) {
    right->rx_from = orig->rx_from;
    orig->rx_from.length = newleftlen;
    right->rx_from.length -= newleftlen;
    right->rx_from.start += newleftlen;
  }

  // dup shadows
  for (set<BufferHead*>::iterator p = orig->shadows.begin();
       p != orig->shadows.end();
       ++p)
    right->add_shadow(*p);

  // split buffers too
  bufferlist bl;
  bl.claim(orig->data);
  if (bl.length()) {
    assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE);
    right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE);
    orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE);
  }

  // move read waiters
  if (!orig->waitfor_read.empty()) {
    map<block_t, list<Context*> >::iterator o, p = orig->waitfor_read.end();
    p--;
    while (p != orig->waitfor_read.begin()) {
      if (p->first < right->start()) break;
      dout(0) << "split moving waiters at block " << p->first << " to right bh" << endl;
      right->waitfor_read[p->first].swap( p->second );
      o = p;
      p--;
      orig->waitfor_read.erase(o);
    }
  }

  dout(20) << "split left is " << *orig << endl;
  dout(20) << "split right is " << *right << endl;
  return right;
}

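/*
 * bh_read -- submit a read io for a missing or partial bufferhead.
 * 'from', when nonzero, forces the source block; that is how partials
 * reread the old copy of their block.
 */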
void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from)
{
  dout(10) << "bh_read " << *on << " on " << *bh << endl;

  if (bh->is_missing()) {
    mark_rx(bh);
  } else {
    assert(bh->is_partial());
  }

  // get extent.  there should be only one!
  vector<Extent> exv;
  on->map_extents(bh->start(), bh->length(), exv);
  assert(exv.size() == 1);
  Extent ex = exv[0];

  if (from) {  // force behavior, used for reading partials
    dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << endl;
    ex.start = from;
    ex.length = 1;
  }

  // this should be empty!!
  assert(bh->rx_ioh == 0);

  dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << endl;

  C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc,
                                         bh->start(), bh->length(),
                                         ex.start);

  //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl);  // new buffers!
  fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );

  bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl,
                        fin);
  bh->rx_from = ex;
  on->oc->get();
}

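/*
 * bh_cancel_read -- try to cancel an in-flight read.  on success the bh
 * goes back to missing and the oc ref taken by bh_read is dropped;
 * returns false if the io could no longer be cancelled.
 */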
bool BufferCache::bh_cancel_read(BufferHead *bh)
{
  if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) {
    dout(10) << "bh_cancel_read on " << *bh << endl;
    bh->rx_ioh = 0;
    mark_missing(bh);
    int l = bh->oc->put();
    assert(l);
    return true;
  }
  return false;
}

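/*
 * bh_write -- submit a write io for a dirty bufferhead, moving it to tx
 * and bumping the epoch's unflushed count.  'shouldbe', when nonzero,
 * asserts which single disk block the bh must map to (used when flushing
 * partials).
 */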
void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe)
{
  dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << endl;
  assert(bh->get_version() > 0);

  assert(bh->is_dirty());
  mark_tx(bh);

  // get extents
  vector<Extent> exv;
  on->map_extents(bh->start(), bh->length(), exv);
  assert(exv.size() == 1);
  Extent ex = exv[0];

  if (shouldbe)
    assert(ex.length == 1 && ex.start == shouldbe);

  dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << endl;

  //assert(bh->tx_ioh == 0);

  assert(bh->get_last_flushed() < bh->get_version());

  bh->tx_block = ex.start;
  bh->tx_ioh = dev.write(ex.start, ex.length, bh->data,
                         new C_OC_TxFinish(ebofs_lock, on->oc,
                                           bh->start(), bh->length(),
                                           bh->get_version(),
                                           bh->epoch_modified),
                         "bh_write");

  on->oc->get();
  inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified );

  /*
  // assert: no partials on the same block
  // hose any partial write on the same block
  if (bh->partial_write.count(ex.start)) {
    dout(10) << "bh_write hosing partial write on same block " << ex.start << " " << *bh << endl;
    dec_unflushed( bh->partial_write[ex.start].epoch );
    bh->partial_write.erase(ex.start);
  }
  */
}

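/*
 * bh_cancel_write -- try to cancel an in-flight write from the current
 * epoch.  on success the bh goes back to dirty, the epoch's unflushed
 * count is decremented, and the oc ref is dropped.
 */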
bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch)
{
  if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) {
    dout(10) << "bh_cancel_write on " << *bh << endl;
    bh->tx_ioh = 0;
    mark_dirty(bh);

    assert(bh->epoch_modified == cur_epoch);
    assert(bh->epoch_modified > 0);
    dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified );  // assert.. this should be the same epoch!

    int l = bh->oc->put();
    assert(l);
    return true;
  }
  return false;
}

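/*
 * tx_finish -- completion callback for bh_write ios.  forwards to the
 * object cache (unless we held its last ref), then drops the epoch's
 * unflushed count.
 */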
void BufferCache::tx_finish(ObjectCache *oc,
                            ioh_t ioh, block_t start, block_t length,
                            version_t version, version_t epoch)
{
  ebofs_lock.Lock();

  // finish oc
  if (oc->put() == 0) {
    delete oc;
  } else
    oc->tx_finish(ioh, start, length, version, epoch);

  // update unflushed counter
  assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0);
  dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch);

  ebofs_lock.Unlock();
}

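/*
 * rx_finish -- completion callback for bh_read ios.  forwards the data
 * to the object cache, then applies and writes out any queued partial
 * writes and shadow partials that were waiting on these disk blocks.
 */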
void BufferCache::rx_finish(ObjectCache *oc,
                            ioh_t ioh, block_t start, block_t length,
                            block_t diskstart,
                            bufferlist& bl)
{
  ebofs_lock.Lock();
  dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length
           << ", at device block " << diskstart << endl;

  // oc
  if (oc->put() == 0)
    delete oc;
  else
    oc->rx_finish(ioh, start, length, bl);

  // finish any partials?
  // note: these are partials that were re-written after a commit,
  //       or for whom the OC was destroyed (eg truncated after a commit)
  map<block_t, map<block_t, PartialWrite> >::iterator sp = partial_write.lower_bound(diskstart);
  while (sp != partial_write.end()) {
    if (sp->first >= diskstart+length) break;
    assert(sp->first >= diskstart);

    block_t pblock = sp->first;
    map<block_t, PartialWrite> writes;
    writes.swap( sp->second );

    map<block_t, map<block_t, PartialWrite> >::iterator t = sp;
    sp++;
    partial_write.erase(t);

    for (map<block_t, PartialWrite>::iterator p = writes.begin();
         p != writes.end();
         p++) {
      dout(10) << "rx_finish partial from " << pblock << " -> " << p->first
               << " for epoch " << p->second.epoch
               << endl;
      // this had better be a past epoch
      //assert(p->epoch == epoch_modified - 1);  // ??

      // make the combined block
      bufferlist combined;
      bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
      combined.push_back( bp );
      combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl);
      BufferHead::apply_partial( combined, p->second.partial );

      // write it!
      dev.write( pblock, 1, combined,
                 new C_OC_PartialTxFinish( this, p->second.epoch ),
                 "finish_partials");
    }
  }

  // shadow partials?
  {
    list<Context*> waiters;
    map<block_t, set<BufferHead*> >::iterator sp = shadow_partials.lower_bound(diskstart);
    while (sp != shadow_partials.end()) {
      if (sp->first >= diskstart+length) break;
      assert(sp->first >= diskstart);

      block_t pblock = sp->first;
      set<BufferHead*> ls;
      ls.swap( sp->second );

      map<block_t, set<BufferHead*> >::iterator t = sp;
      sp++;
      shadow_partials.erase(t);

      for (set<BufferHead*>::iterator p = ls.begin();
           p != ls.end();
           ++p) {
        BufferHead *bh = *p;
        dout(10) << "rx_finish applying shadow_partial for " << pblock
                 << " to " << *bh << endl;
        bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
        bh->data.clear();
        bh->data.push_back( bp );
        bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE,
                         (pblock-diskstart+1)*EBOFS_BLOCK_SIZE,
                         bl);
        bh->apply_partial();
        bh->set_state(BufferHead::STATE_CLEAN);

        // trigger waiters
        for (map<block_t,list<Context*> >::iterator q = bh->waitfor_read.begin();
             q != bh->waitfor_read.end();
             q++) {
          assert(q->first >= bh->start() && q->first < bh->end());
          waiters.splice(waiters.begin(), q->second);
        }
        bh->waitfor_read.clear();
      }
    }

    // kick waiters
    finish_contexts(waiters);
  }

  // done.
  ebofs_lock.Unlock();
}

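/*
 * partial_tx_finish -- completion callback for the low-level partial
 * writes submitted above; just drops the epoch's unflushed count.
 */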
void BufferCache::partial_tx_finish(version_t epoch)
{
  ebofs_lock.Lock();

  dout(10) << "partial_tx_finish in epoch " << epoch << endl;

  // update unflushed counter
  assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0);
  dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch);

  ebofs_lock.Unlock();
}

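/*
 * bh_queue_partial_write -- queue a partial bufferhead's content map to
 * be applied and written out once its source block arrives (see
 * rx_finish above).
 */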
void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh)
{
  assert(bh->get_version() > 0);

  assert(bh->is_partial());
  assert(bh->length() == 1);

  // get the block no
  vector<Extent> exv;
  on->map_extents(bh->start(), bh->length(), exv);
  assert(exv.size() == 1);
  block_t b = exv[0].start;
  assert(exv[0].length == 1);
  bh->partial_tx_to = exv[0].start;
  bh->partial_tx_epoch = bh->epoch_modified;

  dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << endl;

  // copy map state, queue for this block
  assert(bh->rx_from.length == 1);
  queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch );
}

void BufferCache::bh_cancel_partial_write(BufferHead *bh)
{
  assert(bh->is_partial());
  assert(bh->length() == 1);

  cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch );
}

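/*
 * queue_partial -- remember a partial-content map keyed by source and
 * destination block, bumping the epoch's unflushed count.  queueing the
 * same from->to pair twice is a bug.
 */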
void BufferCache::queue_partial(block_t from, block_t to,
                                map<off_t, bufferlist>& partial, version_t epoch)
{
  dout(10) << "queue_partial " << from << " -> " << to
           << " in epoch " << epoch
           << endl;

  if (partial_write[from].count(to)) {
    // this should be in the same epoch.
    assert( partial_write[from][to].epoch == epoch);
    assert(0);  // actually.. no!
  } else {
    inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
  }

  partial_write[from][to].partial = partial;
  partial_write[from][to].epoch = epoch;
}

void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch)
{
  assert(partial_write.count(from));
  assert(partial_write[from].count(to));
  assert(partial_write[from][to].epoch == epoch);

  dout(10) << "cancel_partial " << from << " -> " << to
           << " (was epoch " << partial_write[from][to].epoch << ")"
           << endl;

  partial_write[from].erase(to);
  if (partial_write[from].empty())
    partial_write.erase(from);

  dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
}

void BufferCache::add_shadow_partial(block_t from, BufferHead *bh)
{
  dout(10) << "add_shadow_partial from " << from << " " << *bh << endl;
  shadow_partials[from].insert(bh);
}

void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh)
{
  dout(10) << "cancel_shadow_partial from " << from << " " << *bh << endl;
  shadow_partials[from].erase(bh);
}