mirror of
https://github.com/ceph/ceph
synced 2025-01-01 16:42:29 +00:00
Merge pull request #603 from dachary/wip-erasure-code-example
erasure code example cleanup
This commit is contained in:
commit
10f291838e
@ -25,15 +25,15 @@
|
||||
are systematic (i.e. the data is not mangled and can be
|
||||
reconstructed by concatenating chunks ).
|
||||
|
||||
All methods returns **0** on success and a negative value on
|
||||
All methods return **0** on success and a negative value on
|
||||
error. If the value returned on error is not explained in
|
||||
**ErasureCodeInterface**, the sources or the documentation of the
|
||||
interface implementer must be read to figure out what it means. It
|
||||
is recommended that each error code matches an *errno* value that
|
||||
relates to the cause of the error.
|
||||
interface implementer (i.e. the plugin ) must be read to figure
|
||||
out what it means. It is recommended that each error code matches
|
||||
an *errno* value that relates to the cause of the error.
|
||||
|
||||
Assuming the interface implementer provides three data chunks ( K
|
||||
= 3 ) and two coding chunks ( M = 2 ), a buffer can be encoded as
|
||||
= 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
|
||||
follows:
|
||||
|
||||
~~~~~~~~~~~~~~~~{.c}
|
||||
@ -50,16 +50,20 @@
|
||||
encoded[4] // coding chunk 1
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
If encoded[2] ( which contains **EF** ) is missing and accessing
|
||||
encoded[3] ( the first coding chunk ) is more expensive than
|
||||
accessing encoded[4] ( the second coding chunk ), the
|
||||
**minimum_to_decode_with_cost** method can be called as follows:
|
||||
The **minimum_to_decode_with_cost** method can be used to minimize
|
||||
the cost of fetching the chunks necessary to retrieve a given
|
||||
content. For instance, if encoded[2] (contained **EF**) is missing
|
||||
and accessing encoded[3] (the first coding chunk) is more
|
||||
expensive than accessing encoded[4] (the second coding chunk),
|
||||
**minimum_to_decode_with_cost** is expected to chose the first
|
||||
coding chunk.
|
||||
|
||||
~~~~~~~~~~~~~~~~{.c}
|
||||
set<int> want_to_read(2); // want the chunk containing "EF"
|
||||
map<int,int> available(
|
||||
0 => 1, // data chunk 0 : available and costs 1
|
||||
1 => 1, // data chunk 1 : available and costs 1
|
||||
// data chunk 2 : missing
|
||||
3 => 9, // coding chunk 1 : available and costs 9
|
||||
4 => 1, // coding chunk 2 : available and costs 1
|
||||
);
|
||||
@ -67,14 +71,14 @@
|
||||
minimum_to_decode_with_cost(want_to_read,
|
||||
available,
|
||||
&minimum);
|
||||
minimum == set<int>(0, 1, 4);
|
||||
minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
It sets **minimum** with three chunks to reconstruct the desired
|
||||
data chunk and will pick the second coding chunk ( 4 ) because it
|
||||
is less expensive ( 1 < 9 ) to retrieve than the first coding
|
||||
chunk ( 3 ). The caller is responsible for retrieving the chunks
|
||||
and call **decode** to reconstruct the second data chunk content.
|
||||
and call **decode** to reconstruct the second data chunk.
|
||||
|
||||
~~~~~~~~~~~~~~~~{.c}
|
||||
map<int,bufferlist> chunks;
|
||||
@ -85,6 +89,10 @@
|
||||
decoded[2] == "EF"
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
The semantic of the cost value is defined by the caller and must
|
||||
be known to the implementer. For instance, it may be more
|
||||
expensive to retrieve two chunks with cost 1 + 9 = 10 than two
|
||||
chunks with cost 6 + 6 = 12.
|
||||
*/
|
||||
|
||||
#include <map>
|
||||
@ -113,7 +121,7 @@ namespace ceph {
|
||||
*
|
||||
* @param [in] want_to_read chunk indexes to be decoded
|
||||
* @param [in] available chunk indexes containing valid data
|
||||
* @param [out] minimum chunk indexes to retrieve for decode
|
||||
* @param [out] minimum chunk indexes to retrieve
|
||||
* @return **0** on success or a negative errno on error.
|
||||
*/
|
||||
virtual int minimum_to_decode(const set<int> &want_to_read,
|
||||
@ -124,8 +132,8 @@ namespace ceph {
|
||||
* Compute the smallest subset of **available** chunks that needs
|
||||
* to be retrieved in order to successfully decode
|
||||
* **want_to_read** chunks. If there are more than one possible
|
||||
* subset, select the subset that contains the chunks with the
|
||||
* lowest cost.
|
||||
* subset, select the subset that minimizes the overall retrieval
|
||||
* cost.
|
||||
*
|
||||
* The **available** parameter maps chunk indexes to their
|
||||
* retrieval cost. The higher the cost value, the more costly it
|
||||
@ -141,7 +149,7 @@ namespace ceph {
|
||||
* @param [in] want_to_read chunk indexes to be decoded
|
||||
* @param [in] available map chunk indexes containing valid data
|
||||
* to their retrieval cost
|
||||
* @param [out] minimum chunk indexes to retrieve for decode
|
||||
* @param [out] minimum chunk indexes to retrieve
|
||||
* @return **0** on success or a negative errno on error.
|
||||
*/
|
||||
virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
|
||||
@ -150,15 +158,31 @@ namespace ceph {
|
||||
|
||||
/**
|
||||
* Encode the content of **in** and store the result in
|
||||
* **encoded**. The **encoded** map contains at least all
|
||||
* chunk indexes found in the **want_to_encode** set.
|
||||
* **encoded**. All buffers pointed to by **encoded** have the
|
||||
* same size. The **encoded** map contains at least all chunk
|
||||
* indexes found in the **want_to_encode** set.
|
||||
*
|
||||
* The **encoded** map is expected to be a pointer to an empty
|
||||
* map.
|
||||
*
|
||||
* Assuming the **in** parameter is **length** bytes long,
|
||||
* the concatenation of the first **length** bytes of the
|
||||
* **encoded** buffers is equal to the content of the **in**
|
||||
* parameter.
|
||||
*
|
||||
* The **encoded** map may contain more chunks than required by
|
||||
* **want_to_encode** and the caller is expected to permanently
|
||||
* store all of them, not just the chunks from **want_to_encode**.
|
||||
* store all of them, not just the chunks listed in
|
||||
* **want_to_encode**.
|
||||
*
|
||||
* The **encoded** map may contain pointers to data stored in
|
||||
* the **in** parameter. If the caller modifies the content of
|
||||
* **in** after calling the encode method, it may have a side
|
||||
* effect on the content of **encoded**.
|
||||
*
|
||||
* The **encoded** map may contain pointers to buffers allocated
|
||||
* by the encode method. They will be freed when **encoded** is
|
||||
* freed. The allocation method is not specified.
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*
|
||||
@ -172,24 +196,30 @@ namespace ceph {
|
||||
map<int, bufferlist> *encoded) = 0;
|
||||
|
||||
/**
|
||||
* Decode the **chunks** and store at least **want_to_read** chunks
|
||||
* in **decoded**.
|
||||
* Decode the **chunks** and store at least **want_to_read**
|
||||
* chunks in **decoded**.
|
||||
*
|
||||
* The **decoded** map must be a pointer to an empty map.
|
||||
*
|
||||
* There must be enough **chunks** ( as returned by
|
||||
* **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
|
||||
* perform a successfull decoding of all chunks found in
|
||||
* perform a successful decoding of all chunks listed in
|
||||
* **want_to_read**.
|
||||
*
|
||||
* The **decoded** map is expected to be a pointer to an empty
|
||||
* map.
|
||||
* All buffers pointed by **in** must have the same size.
|
||||
*
|
||||
* The **decoded** map may contain more chunks than required by
|
||||
* **want_to_read** and they can safely be used by the caller.
|
||||
* On success, the **decoded** map may contain more chunks than
|
||||
* required by **want_to_read** and they can safely be used by the
|
||||
* caller.
|
||||
*
|
||||
* If a chunk is listed in **want_to_read** and there is
|
||||
* corresponding **bufferlist** in **chunks**, it will be copied
|
||||
* verbatim into **decoded**. If not it will be reconstructed from
|
||||
* the existing chunks.
|
||||
* If a chunk is listed in **want_to_read** and there is a
|
||||
* corresponding **bufferlist** in **chunks**, it will be
|
||||
* referenced in **decoded**. If not it will be reconstructed from
|
||||
* the existing chunks.
|
||||
*
|
||||
* Because **decoded** may contain pointers to data found in
|
||||
* **chunks**, modifying the content of **chunks** after calling
|
||||
* decode may have a side effect on the content of **decoded**.
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*
|
||||
|
@ -19,12 +19,19 @@
|
||||
|
||||
#include <unistd.h>
|
||||
#include <errno.h>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include "osd/ErasureCodeInterface.h"
|
||||
|
||||
#define FIRST_DATA_CHUNK 0
|
||||
#define SECOND_DATA_CHUNK 1
|
||||
#define DATA_CHUNKS 2u
|
||||
|
||||
#define CODING_CHUNK 2
|
||||
#define CODING_CHUNKS 1u
|
||||
|
||||
#define MINIMUM_TO_RECOVER 2u
|
||||
|
||||
class ErasureCodeExample : public ErasureCodeInterface {
|
||||
public:
|
||||
useconds_t delay;
|
||||
@ -43,21 +50,43 @@ public:
|
||||
virtual int minimum_to_decode(const set<int> &want_to_read,
|
||||
const set<int> &available_chunks,
|
||||
set<int> *minimum) {
|
||||
if (available_chunks.size() < DATA_CHUNKS)
|
||||
if (includes(available_chunks.begin(), available_chunks.end(),
|
||||
want_to_read.begin(), want_to_read.end())) {
|
||||
*minimum = want_to_read;
|
||||
return 0;
|
||||
} else if (available_chunks.size() >= MINIMUM_TO_RECOVER) {
|
||||
*minimum = available_chunks;
|
||||
return 0;
|
||||
} else {
|
||||
return -EIO;
|
||||
set<int>::iterator i;
|
||||
unsigned j;
|
||||
for (i = available_chunks.begin(), j = 0; j < DATA_CHUNKS; i++, j++)
|
||||
minimum->insert(*i);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
|
||||
const map<int, int> &available,
|
||||
set<int> *minimum) {
|
||||
//
|
||||
// If one chunk is more expensive to fetch than the others,
|
||||
// recover it instead. For instance, if the cost reflects the
|
||||
// time it takes for a chunk to be retrieved from a remote
|
||||
// OSD and if CPU is cheap, it could make sense to recover
|
||||
// instead of fetching the chunk.
|
||||
//
|
||||
map<int, int> c2c(available);
|
||||
if (c2c.size() > DATA_CHUNKS) {
|
||||
if (c2c[FIRST_DATA_CHUNK] > c2c[SECOND_DATA_CHUNK] &&
|
||||
c2c[FIRST_DATA_CHUNK] > c2c[CODING_CHUNK])
|
||||
c2c.erase(FIRST_DATA_CHUNK);
|
||||
else if(c2c[SECOND_DATA_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
|
||||
c2c[SECOND_DATA_CHUNK] > c2c[CODING_CHUNK])
|
||||
c2c.erase(SECOND_DATA_CHUNK);
|
||||
else if(c2c[CODING_CHUNK] > c2c[FIRST_DATA_CHUNK] &&
|
||||
c2c[CODING_CHUNK] > c2c[SECOND_DATA_CHUNK])
|
||||
c2c.erase(CODING_CHUNK);
|
||||
}
|
||||
set <int> available_chunks;
|
||||
for (map<int, int>::const_iterator i = available.begin();
|
||||
i != available.end();
|
||||
for (map<int, int>::const_iterator i = c2c.begin();
|
||||
i != c2c.end();
|
||||
i++)
|
||||
available_chunks.insert(i->first);
|
||||
return minimum_to_decode(want_to_read, available_chunks, minimum);
|
||||
@ -66,16 +95,28 @@ public:
|
||||
virtual int encode(const set<int> &want_to_encode,
|
||||
const bufferlist &in,
|
||||
map<int, bufferlist> *encoded) {
|
||||
//
|
||||
// make sure all data chunks have the same length, allocating
|
||||
// padding if necessary.
|
||||
//
|
||||
unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1;
|
||||
unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS );
|
||||
bufferlist out(in);
|
||||
bufferptr pad(length - in.length());
|
||||
pad.zero(0, DATA_CHUNKS);
|
||||
out.push_back(pad);
|
||||
//
|
||||
// compute the coding chunk with first chunk ^ second chunk
|
||||
//
|
||||
char *p = out.c_str();
|
||||
for (unsigned i = 0; i < chunk_length * DATA_CHUNKS; i++)
|
||||
p[i + 2 * chunk_length] =
|
||||
p[i + 0 * chunk_length] ^ p[i + 1 * chunk_length];
|
||||
for (unsigned i = 0; i < chunk_length; i++)
|
||||
p[i + CODING_CHUNK * chunk_length] =
|
||||
p[i + FIRST_DATA_CHUNK * chunk_length] ^
|
||||
p[i + SECOND_DATA_CHUNK * chunk_length];
|
||||
//
|
||||
// populate the bufferlist with bufferptr pointing
|
||||
// to chunk boundaries
|
||||
//
|
||||
const bufferptr ptr = out.buffers().front();
|
||||
for (set<int>::iterator j = want_to_encode.begin();
|
||||
j != want_to_encode.end();
|
||||
@ -89,14 +130,30 @@ public:
|
||||
virtual int decode(const set<int> &want_to_read,
|
||||
const map<int, bufferlist> &chunks,
|
||||
map<int, bufferlist> *decoded) {
|
||||
|
||||
//
|
||||
// All chunks have the same size
|
||||
//
|
||||
unsigned chunk_length = (*chunks.begin()).second.length();
|
||||
for (set<int>::iterator i = want_to_read.begin();
|
||||
i != want_to_read.end();
|
||||
i++) {
|
||||
if (chunks.find(*i) != chunks.end())
|
||||
if (chunks.find(*i) != chunks.end()) {
|
||||
//
|
||||
// If the chunk is available, just copy the bufferptr pointer
|
||||
// to the decoded argument.
|
||||
//
|
||||
(*decoded)[*i] = chunks.find(*i)->second;
|
||||
else {
|
||||
} else if(chunks.size() != 2) {
|
||||
//
|
||||
// If a chunk is missing and there are not enough chunks
|
||||
// to recover, abort.
|
||||
//
|
||||
return -ERANGE;
|
||||
} else {
|
||||
//
|
||||
// No matter what the missing chunk is, XOR of the other
|
||||
// two recovers it.
|
||||
//
|
||||
bufferptr chunk(chunk_length);
|
||||
map<int, bufferlist>::const_iterator k = chunks.begin();
|
||||
const char *a = k->second.buffers().front().c_str();
|
||||
|
@ -65,10 +65,54 @@ TEST(ErasureCodeExample, minimum_to_decode)
|
||||
EXPECT_EQ(0, example.minimum_to_decode(want_to_read,
|
||||
available_chunks,
|
||||
&minimum));
|
||||
EXPECT_EQ(1u, minimum.size());
|
||||
EXPECT_EQ(1u, minimum.count(1));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ErasureCodeExample, minimum_to_decode_with_cost)
|
||||
{
|
||||
map<std::string,std::string> parameters;
|
||||
ErasureCodeExample example(parameters);
|
||||
map<int,int> available;
|
||||
set<int> want_to_read;
|
||||
want_to_read.insert(1);
|
||||
{
|
||||
set<int> minimum;
|
||||
EXPECT_EQ(-EIO, example.minimum_to_decode_with_cost(want_to_read,
|
||||
available,
|
||||
&minimum));
|
||||
}
|
||||
available[0] = 1;
|
||||
available[2] = 1;
|
||||
{
|
||||
set<int> minimum;
|
||||
EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
|
||||
available,
|
||||
&minimum));
|
||||
EXPECT_EQ(2u, minimum.size());
|
||||
EXPECT_EQ(1u, minimum.count(0));
|
||||
EXPECT_EQ(1u, minimum.count(2));
|
||||
}
|
||||
{
|
||||
set<int> minimum;
|
||||
available[1] = 1;
|
||||
EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
|
||||
available,
|
||||
&minimum));
|
||||
EXPECT_EQ(1u, minimum.size());
|
||||
EXPECT_EQ(1u, minimum.count(1));
|
||||
}
|
||||
{
|
||||
set<int> minimum;
|
||||
available[1] = 2;
|
||||
EXPECT_EQ(0, example.minimum_to_decode_with_cost(want_to_read,
|
||||
available,
|
||||
&minimum));
|
||||
EXPECT_EQ(2u, minimum.size());
|
||||
EXPECT_EQ(1u, minimum.count(0));
|
||||
EXPECT_EQ(1u, minimum.count(2));
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ErasureCodeExample, encode_decode)
|
||||
@ -142,5 +186,5 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
// Local Variables:
|
||||
// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
|
||||
// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
|
||||
// End:
|
||||
|
Loading…
Reference in New Issue
Block a user