Merge pull request #603 from dachary/wip-erasure-code-example

erasure code example cleanup
This commit is contained in:
Sage Weil 2013-09-23 10:26:48 -07:00
commit 10f291838e
3 changed files with 175 additions and 44 deletions

View File

@ -25,15 +25,15 @@
are systematic (i.e. the data is not mangled and can be
reconstructed by concatenating chunks ).
All methods returns **0** on success and a negative value on
All methods return **0** on success and a negative value on
error. If the value returned on error is not explained in
**ErasureCodeInterface**, the sources or the documentation of the
interface implementer must be read to figure out what it means. It
is recommended that each error code matches an *errno* value that
relates to the cause of the error.
interface implementer (i.e. the plugin ) must be read to figure
out what it means. It is recommended that each error code matches
an *errno* value that relates to the cause of the error.
Assuming the interface implementer provides three data chunks ( K
= 3 ) and two coding chunks ( M = 2 ), a buffer can be encoded as
= 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
follows:
~~~~~~~~~~~~~~~~{.c}
@ -50,16 +50,20 @@
encoded[4] // coding chunk 1
~~~~~~~~~~~~~~~~
If encoded[2] ( which contains **EF** ) is missing and accessing
encoded[3] ( the first coding chunk ) is more expensive than
accessing encoded[4] ( the second coding chunk ), the
**minimum_to_decode_with_cost** method can be called as follows:
The **minimum_to_decode_with_cost** method can be used to minimize
the cost of fetching the chunks necessary to retrieve a given
content. For instance, if encoded[2] (contained **EF**) is missing
and accessing encoded[3] (the first coding chunk) is more
expensive than accessing encoded[4] (the second coding chunk),
**minimum_to_decode_with_cost** is expected to chose the first
coding chunk.
~~~~~~~~~~~~~~~~{.c}
set<int> want_to_read(2); // want the chunk containing "EF"
map<int,int> available(
0 => 1, // data chunk 0 : available and costs 1
1 => 1, // data chunk 1 : available and costs 1
// data chunk 2 : missing
3 => 9, // coding chunk 1 : available and costs 9
4 => 1, // coding chunk 2 : available and costs 1
);
@ -67,14 +71,14 @@
minimum_to_decode_with_cost(want_to_read,
available,
&minimum);
minimum == set<int>(0, 1, 4);
minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
~~~~~~~~~~~~~~~~
It sets **minimum** with three chunks to reconstruct the desired
data chunk and will pick the second coding chunk ( 4 ) because it
is less expensive ( 1 < 9 ) to retrieve than the first coding
chunk ( 3 ). The caller is responsible for retrieving the chunks
and call **decode** to reconstruct the second data chunk content.
and call **decode** to reconstruct the second data chunk.
~~~~~~~~~~~~~~~~{.c}
map<int,bufferlist> chunks;
@ -85,6 +89,10 @@
decoded[2] == "EF"
~~~~~~~~~~~~~~~~
The semantic of the cost value is defined by the caller and must
be known to the implementer. For instance, it may be more
expensive to retrieve two chunks with cost 1 + 9 = 10 than two
chunks with cost 6 + 6 = 12.
*/
#include <map>
@ -113,7 +121,7 @@ namespace ceph {
*
* @param [in] want_to_read chunk indexes to be decoded
* @param [in] available chunk indexes containing valid data
* @param [out] minimum chunk indexes to retrieve for decode
* @param [out] minimum chunk indexes to retrieve
* @return **0** on success or a negative errno on error.
*/
virtual int minimum_to_decode(const set<int> &want_to_read,
@ -124,8 +132,8 @@ namespace ceph {
* Compute the smallest subset of **available** chunks that needs
* to be retrieved in order to successfully decode
* **want_to_read** chunks. If there are more than one possible
* subset, select the subset that contains the chunks with the
* lowest cost.
* subset, select the subset that minimizes the overall retrieval
* cost.
*
* The **available** parameter maps chunk indexes to their
* retrieval cost. The higher the cost value, the more costly it
@ -141,7 +149,7 @@ namespace ceph {
* @param [in] want_to_read chunk indexes to be decoded
* @param [in] available map chunk indexes containing valid data
* to their retrieval cost
* @param [out] minimum chunk indexes to retrieve for decode
* @param [out] minimum chunk indexes to retrieve
* @return **0** on success or a negative errno on error.
*/
virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
@ -150,15 +158,31 @@ namespace ceph {
/**
* Encode the content of **in** and store the result in
* **encoded**. The **encoded** map contains at least all
* chunk indexes found in the **want_to_encode** set.
* **encoded**. All buffers pointed to by **encoded** have the
* same size. The **encoded** map contains at least all chunk
* indexes found in the **want_to_encode** set.
*
* The **encoded** map is expected to be a pointer to an empty
* map.
*
* Assuming the **in** parameter is **length** bytes long,
* the concatenation of the first **length** bytes of the
* **encoded** buffers is equal to the content of the **in**
* parameter.
*
* The **encoded** map may contain more chunks than required by
* **want_to_encode** and the caller is expected to permanently
* store all of them, not just the chunks from **want_to_encode**.
* store all of them, not just the chunks listed in
* **want_to_encode**.
*
* The **encoded** map may contain pointers to data stored in
* the **in** parameter. If the caller modifies the content of
* **in** after calling the encode method, it may have a side
* effect on the content of **encoded**.
*
* The **encoded** map may contain pointers to buffers allocated
* by the encode method. They will be freed when **encoded** is
* freed. The allocation method is not specified.
*
* Returns 0 on success.
*
@ -172,24 +196,30 @@ namespace ceph {
map<int, bufferlist> *encoded) = 0;
/**
* Decode the **chunks** and store at least **want_to_read** chunks
* in **decoded**.
* Decode the **chunks** and store at least **want_to_read**
* chunks in **decoded**.
*
* The **decoded** map must be a pointer to an empty map.
*
* There must be enough **chunks** ( as returned by
* **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
* perform a successfull decoding of all chunks found in
* perform a successful decoding of all chunks listed in
* **want_to_read**.
*
* The **decoded** map is expected to be a pointer to an empty
* map.
* All buffers pointed by **in** must have the same size.
*
* The **decoded** map may contain more chunks than required by
* **want_to_read** and they can safely be used by the caller.
* On success, the **decoded** map may contain more chunks than
* required by **want_to_read** and they can safely be used by the
* caller.
*
* If a chunk is listed in **want_to_read** and there is
* corresponding **bufferlist** in **chunks**, it will be copied
* verbatim into **decoded**. If not it will be reconstructed from
* the existing chunks.
* If a chunk is listed in **want_to_read** and there is a
* corresponding **bufferlist** in **chunks**, it will be
* referenced in **decoded**. If not it will be reconstructed from
* the existing chunks.
*
* Because **decoded** may contain pointers to data found in
* **chunks**, modifying the content of **chunks** after calling
* decode may have a side effect on the content of **decoded**.
*
* Returns 0 on success.
*

View File

@ -19,12 +19,19 @@
#include <unistd.h>
#include <errno.h>
#include <algorithm>
#include <sstream>
#include "osd/ErasureCodeInterface.h"
#define FIRST_DATA_CHUNK 0
#define SECOND_DATA_CHUNK 1
#define DATA_CHUNKS 2u
#define CODING_CHUNK 2
#define CODING_CHUNKS 1u
#define MINIMUM_TO_RECOVER 2u
class ErasureCodeExample : public ErasureCodeInterface {
public:
useconds_t delay;
@ -43,21 +50,43 @@ public:
virtual int minimum_to_decode(const set<int> &want_to_read,
const set<int> &available_chunks,
set<int> *minimum) {
if (available_chunks.size() < DATA_CHUNKS)
if (includes(available_chunks.begin(), available_chunks.end(),
want_to_read.begin(), want_to_read.end())) {
*minimum = want_to_read;
return 0;
} else if (available_chunks.size() >= MINIMUM_TO_RECOVER) {
*minimum = available_chunks;
return 0;
} else {
return -EIO;
set<int>::iterator i;
unsigned j;
for (i = available_chunks.begin(), j = 0; j < DATA_CHUNKS; i++, j++)
minimum->insert(*i);
return 0;
}
}
virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
const map<int, int> &available,
set<int> *minimum) {
//
// If one chunk is more expensive to fetch than the others,
// recover it instead. For instance, if the cost reflects the
// time it takes for a chunk to be retrieved from a remote
// OSD and if CPU is cheap, it could make sense to recover
// instead of fetching the chunk.
//
map<int, int> c2c(available);
if (c2c.size() > DATA_CHUNKS) {
if (c2c[FIRST_DATA_CHUNK] > c2c[SECOND_DATA_CHUNK] &&
c2c[FIRST_DATA_CHUNK] > c2c[CODING_CHUNK])
c2c.erase(FIRST_DATA_CHUNK);
else if(c2c[SECOND_DATA_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
c2c[SECOND_DATA_CHUNK] > c2c[CODING_CHUNK])
c2c.erase(SECOND_DATA_CHUNK);
else if(c2c[CODING_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
c2c[CODING_CHUNK] > c2c[SECOND_DATA_CHUNK])
c2c.erase(CODING_CHUNK);
}
set <int> available_chunks;
for (map<int, int>::const_iterator i = available.begin();
i != available.end();
for (map<int, int>::const_iterator i = c2c.begin();
i != c2c.end();
i++)
available_chunks.insert(i->first);
return minimum_to_decode(want_to_read, available_chunks, minimum);
@ -66,16 +95,28 @@ public:
virtual int encode(const set<int> &want_to_encode,
const bufferlist &in,
map<int, bufferlist> *encoded) {
//
// make sure all data chunks have the same length, allocating
// padding if necessary.
//
unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1;
unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS );
bufferlist out(in);
bufferptr pad(length - in.length());
pad.zero(0, DATA_CHUNKS);
out.push_back(pad);
//
// compute the coding chunk with first chunk ^ second chunk
//
char *p = out.c_str();
for (unsigned i = 0; i < chunk_length * DATA_CHUNKS; i++)
p[i + 2 * chunk_length] =
p[i + 0 * chunk_length] ^ p[i + 1 * chunk_length];
for (unsigned i = 0; i < chunk_length; i++)
p[i + CODING_CHUNK * chunk_length] =
p[i + FIRST_DATA_CHUNK * chunk_length] ^
p[i + SECOND_DATA_CHUNK * chunk_length];
//
// populate the bufferlist with bufferptr pointing
// to chunk boundaries
//
const bufferptr ptr = out.buffers().front();
for (set<int>::iterator j = want_to_encode.begin();
j != want_to_encode.end();
@ -89,14 +130,30 @@ public:
virtual int decode(const set<int> &want_to_read,
const map<int, bufferlist> &chunks,
map<int, bufferlist> *decoded) {
//
// All chunks have the same size
//
unsigned chunk_length = (*chunks.begin()).second.length();
for (set<int>::iterator i = want_to_read.begin();
i != want_to_read.end();
i++) {
if (chunks.find(*i) != chunks.end())
if (chunks.find(*i) != chunks.end()) {
//
// If the chunk is available, just copy the bufferptr pointer
// to the decoded argument.
//
(*decoded)[*i] = chunks.find(*i)->second;
else {
} else if(chunks.size() != 2) {
//
// If a chunk is missing and there are not enough chunks
// to recover, abort.
//
return -ERANGE;
} else {
//
// No matter what the missing chunk is, XOR of the other
// two recovers it.
//
bufferptr chunk(chunk_length);
map<int, bufferlist>::const_iterator k = chunks.begin();
const char *a = k->second.buffers().front().c_str();

View File

@ -65,10 +65,54 @@ TEST(ErasureCodeExample, minimum_to_decode)
EXPECT_EQ(0, example.minimum_to_decode(want_to_read,
available_chunks,
&minimum));
EXPECT_EQ(1u, minimum.size());
EXPECT_EQ(1u, minimum.count(1));
}
}
TEST(ErasureCodeExample, minimum_to_decode_with_cost)
{
map<std::string,std::string> parameters;
ErasureCodeExample example(parameters);
map<int,int> available;
set<int> want_to_read;
want_to_read.insert(1);
{
set<int> minimum;
EXPECT_EQ(-EIO, example.minimum_to_decode_with_cost(want_to_read,
available,
&minimum));
}
available[0] = 1;
available[2] = 1;
{
set<int> minimum;
EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
available,
&minimum));
EXPECT_EQ(2u, minimum.size());
EXPECT_EQ(1u, minimum.count(0));
EXPECT_EQ(1u, minimum.count(2));
}
{
set<int> minimum;
available[1] = 1;
EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
available,
&minimum));
EXPECT_EQ(1u, minimum.size());
EXPECT_EQ(1u, minimum.count(1));
}
{
set<int> minimum;
available[1] = 2;
EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
available,
&minimum));
EXPECT_EQ(2u, minimum.size());
EXPECT_EQ(1u, minimum.count(0));
EXPECT_EQ(1u, minimum.count(2));
}
}
TEST(ErasureCodeExample, encode_decode)
@ -142,5 +186,5 @@ int main(int argc, char **argv) {
}
// Local Variables:
// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
// End: