Merge pull request #963 from dachary/wip-erasure-code-api

erasure code interface helpers

Reviewed-by: Samuel Just <sam.just@inktank.com>
This commit is contained in:
athanatos 2014-01-15 10:22:47 -08:00
commit 73e469c966
5 changed files with 224 additions and 32 deletions

View File

@ -25,12 +25,57 @@
are systematic (i.e. the data is not mangled and can be
reconstructed by concatenating chunks ).
All methods return **0** on success and a negative value on
error. If the value returned on error is not explained in
**ErasureCodeInterface**, the sources or the documentation of the
interface implementer (i.e. the plugin ) must be read to figure
out what it means. It is recommended that each error code matches
an *errno* value that relates to the cause of the error.
Methods returning an **int** return **0** on success and a
negative value on error. If the value returned on error is not
explained in **ErasureCodeInterface**, the sources or the
documentation of the interface implementer (i.e. the plugin ) must
be read to figure out what it means. It is recommended that each
error code matches an *errno* value that relates to the cause of
the error.
If an object is small enough, the caller can process it with
one call to the **encode** or **decode** method.
+---------------- coded object O -------------------------+
|+----------------+ +----------------+ +----------------+ |
|| chunk 0 | | chunk 1 | | chunk 2 | |
|| [0,N) | | [N,2N) | | [2N,3N) | |
|+----------------+ +----------------+ +----------------+ |
+------^--------------------------------------------------+
|
chunk B / C | offset B % C ( where C is the chunk size )
|
+-----^---- raw object O ----+------+
| B [0,X) | pad |
+----------------------------+------+
The object size is padded so that all chunks are of the same size.
In the example above, if the actual object size was X, then it
will be padded to 2N >= X assuming there are two data chunks (0
and 1) and one coding chunk (2).
For chunks of size C, byte B of the object is found in chunk number
B / C at offset B % C.
If an object is too large to be encoded in memory, the caller
should divide it into smaller units named **stripes**.
+---------------------- object O -------------------------+
|+----------------+ +----------------+ +----------------+ |
stripe || chunk 0 | | chunk 1 | | chunk 2 | |
0 || [0,N) | | [N,2N) | | [2N,3N) | |
|+----------------+ +----------------+ +----------------+ |
|+----------------+ +----------------+ +----------------+ |
stripe || chunk 0 | | chunk 1 | | chunk 2 | |
1 || [X,M) | | [X+M,X+2M) | | [X+2M,X+3M) | |
|| | | | | | |
|+----------------+ +----------------+ +----------------+ |
| ... |
+---------------------------------------------------------+
The interface does not concern itself with stripes nor does it
impose constraints on the size of each stripe. Variable names in
the interface always use **object** and never use **stripe**.
Assuming the interface implementer provides three data chunks ( K
= 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
@ -108,6 +153,48 @@ namespace ceph {
public:
virtual ~ErasureCodeInterface() {}
/**
* Return the number of chunks created by a call to the **encode**
* method.
*
* In the simplest case it can be K + M, i.e. the number
* of data chunks (K) plus the number of parity chunks
* (M). However, if the implementation provides local parity there
* could be an additional overhead.
*
* @return the number of chunks created by encode()
*/
virtual unsigned int get_chunk_count() const = 0;
/**
* Return the number of data chunks created by a call to the
* **encode** method. The data chunks contain the buffer provided
* to **encode**, verbatim, with padding at the end of the last
* chunk.
*
* @return the number of data chunks created by encode()
*/
virtual unsigned int get_data_chunk_count() const = 0;
/**
* Return the size (in bytes) of a single chunk created by a call
* to the **encode** method. The returned size multiplied by
* **get_data_chunk_count()** is greater or equal to **object_size**.
*
* If the object size is properly aligned, the chunk size is
* **object_size / get_data_chunk_count()**. However, if
* **object_size** is not a multiple of **get_data_chunk_count()**
* or if the implementation imposes additional alignment
* constraints, the chunk size may be larger.
*
* The byte found at offset **B** of the original object is mapped
* to chunk **B / get_chunk_size()** at offset **B % get_chunk_size()**.
*
* @param [in] object_size the number of bytes of the object to **encode()**
* @return the size (in bytes) of a single chunk created by **encode()**
*/
virtual unsigned int get_chunk_size(unsigned int object_size) const = 0;
/**
* Compute the smallest subset of **available** chunks that needs
* to be retrieved in order to successfully decode
@ -231,6 +318,29 @@ namespace ceph {
virtual int decode(const set<int> &want_to_read,
const map<int, bufferlist> &chunks,
map<int, bufferlist> *decoded) = 0;
/**
* Decode the first **get_data_chunk_count()** **chunks** and
* concatenate them them into **decoded**.
*
* Returns 0 on success.
*
* @param [in] chunks map chunk indexes to chunk data
* @param [out] decoded concatenante of the data chunks
* @return **0** on success or a negative errno on error.
*/
int decode_concat(const map<int, bufferlist> &chunks,
bufferlist *decoded) {
set<int> want_to_read;
for (unsigned int i = 0; i < get_data_chunk_count(); i++)
want_to_read.insert(i);
map<int, bufferlist> decoded_map;
int r = decode(want_to_read, chunks, &decoded_map);
if (r == 0)
for (unsigned int i = 0; i < get_data_chunk_count(); i++)
decoded->claim_append(decoded_map[i]);
return r;
}
};
typedef std::tr1::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;

View File

@ -43,6 +43,15 @@ void ErasureCodeJerasure::init(const map<std::string,std::string> &parameters)
prepare();
}
unsigned int ErasureCodeJerasure::get_chunk_size(unsigned int object_size) const
{
unsigned alignment = get_alignment();
unsigned tail = object_size % alignment;
unsigned padded_length = object_size + ( tail ? ( alignment - tail ) : 0 );
assert(padded_length % k == 0);
return padded_length / k;
}
int ErasureCodeJerasure::minimum_to_decode(const set<int> &want_to_read,
const set<int> &available_chunks,
set<int> *minimum)
@ -77,9 +86,8 @@ int ErasureCodeJerasure::encode(const set<int> &want_to_encode,
const bufferlist &in,
map<int, bufferlist> *encoded)
{
unsigned alignment = get_alignment();
unsigned tail = in.length() % alignment;
unsigned padded_length = in.length() + ( tail ? ( alignment - tail ) : 0 );
unsigned blocksize = get_chunk_size(in.length());
unsigned padded_length = blocksize * k;
dout(10) << "encode adjusted buffer length from " << in.length()
<< " to " << padded_length << dendl;
assert(padded_length % k == 0);
@ -90,7 +98,6 @@ int ErasureCodeJerasure::encode(const set<int> &want_to_encode,
out.push_back(pad);
out.rebuild_page_aligned();
}
unsigned blocksize = padded_length / k;
unsigned coding_length = blocksize * m;
bufferptr coding(buffer::create_page_aligned(coding_length));
out.push_back(coding);
@ -196,7 +203,7 @@ int ErasureCodeJerasureReedSolomonVandermonde::jerasure_decode(int *erasures,
erasures, data, coding, blocksize);
}
unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment()
unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment() const
{
unsigned alignment = k*w*sizeof(int);
if ( ((w*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
@ -240,7 +247,7 @@ int ErasureCodeJerasureReedSolomonRAID6::jerasure_decode(int *erasures,
return jerasure_matrix_decode(k, m, w, matrix, 1, erasures, data, coding, blocksize);
}
unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment()
unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment() const
{
unsigned alignment = k*w*sizeof(int);
if ( ((w*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
@ -285,7 +292,7 @@ int ErasureCodeJerasureCauchy::jerasure_decode(int *erasures,
erasures, data, coding, blocksize, packetsize, 1);
}
unsigned ErasureCodeJerasureCauchy::get_alignment()
unsigned ErasureCodeJerasureCauchy::get_alignment() const
{
unsigned alignment = k*w*packetsize*sizeof(int);
if ( ((w*packetsize*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
@ -355,7 +362,7 @@ int ErasureCodeJerasureLiberation::jerasure_decode(int *erasures,
coding, blocksize, packetsize, 1);
}
unsigned ErasureCodeJerasureLiberation::get_alignment()
unsigned ErasureCodeJerasureLiberation::get_alignment() const
{
unsigned alignment = k*w*packetsize*sizeof(int);
if ( ((w*packetsize*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )

View File

@ -32,6 +32,16 @@ public:
virtual ~ErasureCodeJerasure() {}
// Total number of chunks: the k data chunks plus the m coding chunks.
virtual unsigned int get_chunk_count() const {
return k + m;
}
// Number of data chunks, i.e. k.
virtual unsigned int get_data_chunk_count() const {
return k;
}
virtual unsigned int get_chunk_size(unsigned int object_size) const;
virtual int minimum_to_decode(const set<int> &want_to_read,
const set<int> &available_chunks,
set<int> *minimum);
@ -56,7 +66,7 @@ public:
char **data,
char **coding,
int blocksize) = 0;
virtual unsigned get_alignment() = 0;
virtual unsigned get_alignment() const = 0;
virtual void parse(const map<std::string,std::string> &parameters) = 0;
virtual void prepare() = 0;
static int to_int(const std::string &name,
@ -88,7 +98,7 @@ public:
char **data,
char **coding,
int blocksize);
virtual unsigned get_alignment();
virtual unsigned get_alignment() const;
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};
@ -115,7 +125,7 @@ public:
char **data,
char **coding,
int blocksize);
virtual unsigned get_alignment();
virtual unsigned get_alignment() const;
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};
@ -149,7 +159,7 @@ public:
char **data,
char **coding,
int blocksize);
virtual unsigned get_alignment();
virtual unsigned get_alignment() const;
virtual void parse(const map<std::string,std::string> &parameters);
void prepare_schedule(int *matrix);
};
@ -196,7 +206,7 @@ public:
char **data,
char **coding,
int blocksize);
virtual unsigned get_alignment();
virtual unsigned get_alignment() const;
virtual void parse(const map<std::string,std::string> &parameters);
virtual void prepare();
};

View File

@ -81,6 +81,18 @@ public:
return minimum_to_decode(want_to_read, available_chunks, minimum);
}
// Total number of chunks: DATA_CHUNKS plus CODING_CHUNKS.
virtual unsigned int get_chunk_count() const {
return DATA_CHUNKS + CODING_CHUNKS;
}
// Number of data chunks, i.e. DATA_CHUNKS.
virtual unsigned int get_data_chunk_count() const {
return DATA_CHUNKS;
}
// Size (in bytes) of each chunk for an object of object_size bytes.
// The extra byte is added unconditionally, so a size that divides
// evenly by DATA_CHUNKS still gets one more byte per chunk.
virtual unsigned int get_chunk_size(unsigned int object_size) const {
  const unsigned int share = object_size / DATA_CHUNKS;
  return share + 1;
}
virtual int encode(const set<int> &want_to_encode,
const bufferlist &in,
map<int, bufferlist> *encoded) {
@ -88,11 +100,11 @@ public:
// make sure all data chunks have the same length, allocating
// padding if necessary.
//
unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1;
unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS );
unsigned int chunk_length = get_chunk_size(in.length());
bufferlist out(in);
bufferptr pad(length - in.length());
pad.zero(0, DATA_CHUNKS);
unsigned int width = get_chunk_count() * get_chunk_size(in.length());
bufferptr pad(width - in.length());
pad.zero(0, get_data_chunk_count());
out.push_back(pad);
//
// compute the coding chunk with first chunk ^ second chunk

View File

@ -20,6 +20,13 @@
#include "global/global_context.h"
#include "gtest/gtest.h"
// Check the chunk geometry reported by the example plugin: the total
// number of chunks, the number of data chunks and the size of a chunk
// for a 20 byte object.
TEST(ErasureCodeExample, chunk_size)
{
  ErasureCodeExample example;
  EXPECT_EQ(3u, example.get_chunk_count());
  EXPECT_EQ(2u, example.get_data_chunk_count());
  // 20 bytes split between the data chunks, plus the one byte the
  // example plugin always adds to each chunk.
  EXPECT_EQ(11u, example.get_chunk_size(20));
}
TEST(ErasureCodeExample, minimum_to_decode)
{
ErasureCodeExample example;
@ -105,13 +112,13 @@ TEST(ErasureCodeExample, encode_decode)
bufferlist in;
in.append("ABCDE");
int want_to_encode[] = { 0, 1, 2 };
set<int> want_to_encode;
for(unsigned int i = 0; i < example.get_chunk_count(); i++)
want_to_encode.insert(i);
map<int, bufferlist> encoded;
EXPECT_EQ(0, example.encode(set<int>(want_to_encode, want_to_encode+3),
in,
&encoded));
EXPECT_EQ(3u, encoded.size());
EXPECT_EQ(3u, encoded[0].length());
EXPECT_EQ(0, example.encode(want_to_encode, in, &encoded));
EXPECT_EQ(example.get_chunk_count(), encoded.size());
EXPECT_EQ(example.get_chunk_size(in.length()), encoded[0].length());
EXPECT_EQ('A', encoded[0][0]);
EXPECT_EQ('B', encoded[0][1]);
EXPECT_EQ('C', encoded[0][2]);
@ -157,6 +164,43 @@ TEST(ErasureCodeExample, encode_decode)
}
}
// Encode a payload, rebuild it with decode_concat() and check that
// decode_concat() reports an error when too few chunks are provided.
TEST(ErasureCodeExample, decode)
{
  ErasureCodeExample example;

  // Capacity of the page aligned input buffer. A scoped constant is
  // used instead of a #define so the name does not leak into the rest
  // of the file.
  const unsigned int large_enough = 2048;
  bufferptr in_ptr(buffer::create_page_aligned(large_enough));
  in_ptr.zero();
  // Start from an empty buffer so that append() writes at offset 0.
  in_ptr.set_length(0);
  const char *payload =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  in_ptr.append(payload, strlen(payload));
  bufferlist in;
  in.push_front(in_ptr);

  // Ask for every chunk the plugin produces rather than hard-coding
  // their indexes.
  set<int> want_to_encode;
  for (unsigned int i = 0; i < example.get_chunk_count(); i++)
    want_to_encode.insert(i);
  map<int, bufferlist> encoded;
  EXPECT_EQ(0, example.encode(want_to_encode, in, &encoded));
  EXPECT_EQ(example.get_chunk_count(), encoded.size());

  // successful decode
  bufferlist out;
  EXPECT_EQ(0, example.decode_concat(encoded, &out));
  // The decoded data chunks may be longer than the input: only compare
  // the first in.length() bytes.
  bufferlist usable;
  usable.substr_of(out, 0, in.length());
  EXPECT_TRUE(usable == in);

  // cannot recover: only one chunk is available
  map<int, bufferlist> degraded;
  degraded[0] = encoded[0];
  EXPECT_EQ(-ERANGE, example.decode_concat(degraded, &out));
}
int main(int argc, char **argv) {
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);
@ -168,6 +212,15 @@ int main(int argc, char **argv) {
return RUN_ALL_TESTS();
}
// Local Variables:
// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
// End:
/*
* Local Variables:
* compile-command: "cd ../.. ;
* make -j4 &&
* make unittest_erasure_code_example &&
* valgrind --leak-check=full --tool=memcheck \
* ./unittest_erasure_code_example --gtest_filter=*.* \
* --log-to-stderr=true --debug-osd=20
* "
* End:
*/