mirror of
https://github.com/ceph/ceph
synced 2024-12-18 01:16:55 +00:00
Merge pull request #963 from dachary/wip-erasure-code-api
erasure code interface helpers Reviewed-by: Samuel Just <sam.just@inktank.com>
This commit is contained in:
commit
73e469c966
@ -25,12 +25,57 @@
|
||||
are systematic (i.e. the data is not mangled and can be
|
||||
reconstructed by concatenating chunks ).
|
||||
|
||||
All methods return **0** on success and a negative value on
|
||||
error. If the value returned on error is not explained in
|
||||
**ErasureCodeInterface**, the sources or the documentation of the
|
||||
interface implementer (i.e. the plugin ) must be read to figure
|
||||
out what it means. It is recommended that each error code matches
|
||||
an *errno* value that relates to the cause of the error.
|
||||
Methods returning an **int** return **0** on success and a
|
||||
negative value on error. If the value returned on error is not
|
||||
explained in **ErasureCodeInterface**, the sources or the
|
||||
documentation of the interface implementer (i.e. the plugin ) must
|
||||
be read to figure out what it means. It is recommended that each
|
||||
error code matches an *errno* value that relates to the cause of
|
||||
the error.
|
||||
|
||||
If an object is small enough, the caller can process it with
|
||||
one call to the **encode** or **decode** method.
|
||||
|
||||
+---------------- coded object O -------------------------+
|
||||
|+----------------+ +----------------+ +----------------+ |
|
||||
|| chunk 0 | | chunk 1 | | chunk 2 | |
|
||||
|| [0,N) | | [N,2N) | | [2N,3N) | |
|
||||
|+----------------+ +----------------+ +----------------+ |
|
||||
+------^--------------------------------------------------+
|
||||
|
|
||||
chunk B / C | offset B % C ( where C is the chunk size )
|
||||
|
|
||||
+-----^---- raw object O ----+------+
|
||||
| B [0,X) | pad |
|
||||
+----------------------------+------+
|
||||
|
||||
The object size is padded so that all chunks are of the same size.
|
||||
In the example above, if the actual object size was X, then it
|
||||
will be padded to 2N >= X assuming there are two data chunks (0
|
||||
and 1) and one coding chunk (2).
|
||||
|
||||
For chunks of size C, byte B of the object is found in chunk number
|
||||
B / C at offset B % C.
|
||||
|
||||
If an object is too large to be encoded in memory, the caller
|
||||
should divide it into smaller units named **stripes**.
|
||||
|
||||
+---------------------- object O -------------------------+
|
||||
|+----------------+ +----------------+ +----------------+ |
|
||||
stripe || chunk 0 | | chunk 1 | | chunk 2 | |
|
||||
0 || [0,N) | | [N,2N) | | [2N,3N) | |
|
||||
|+----------------+ +----------------+ +----------------+ |
|
||||
|+----------------+ +----------------+ +----------------+ |
|
||||
stripe || chunk 0 | | chunk 1 | | chunk 2 | |
|
||||
1 || [X,X+M) | | [X+M,X+2M) | | [X+2M,X+3M) | |
|
||||
|| | | | | | |
|
||||
|+----------------+ +----------------+ +----------------+ |
|
||||
| ... |
|
||||
+---------------------------------------------------------+
|
||||
|
||||
The interface does not concern itself with stripes nor does it
|
||||
impose constraints on the size of each stripe. Variable names in
|
||||
the interface always use **object** and never use **stripe**.
|
||||
|
||||
Assuming the interface implementer provides three data chunks ( K
|
||||
= 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
|
||||
@ -108,6 +153,48 @@ namespace ceph {
|
||||
public:
|
||||
virtual ~ErasureCodeInterface() {}
|
||||
|
||||
/**
|
||||
* Return the number of chunks created by a call to the **encode**
|
||||
* method.
|
||||
*
|
||||
* In the simplest case it can be K + M, i.e. the number
|
||||
* of data chunks (K) plus the number of parity chunks
|
||||
* (M). However, if the implementation provides local parity there
|
||||
* could be an additional overhead.
|
||||
*
|
||||
* @return the number of chunks created by encode()
|
||||
*/
|
||||
virtual unsigned int get_chunk_count() const = 0;
|
||||
|
||||
/**
|
||||
* Return the number of data chunks created by a call to the
|
||||
* **encode** method. The data chunks contain the buffer provided
|
||||
* to **encode**, verbatim, with padding at the end of the last
|
||||
* chunk.
|
||||
*
|
||||
* @return the number of data chunks created by encode()
|
||||
*/
|
||||
virtual unsigned int get_data_chunk_count() const = 0;
|
||||
|
||||
/**
|
||||
* Return the size (in bytes) of a single chunk created by a call
|
||||
* to the **encode** method. The returned size multiplied by
|
||||
* **get_data_chunk_count()** is greater than or equal to **object_size**.
|
||||
*
|
||||
* If the object size is properly aligned, the chunk size is
|
||||
* **object_size / get_data_chunk_count()**. However, if
|
||||
* **object_size** is not a multiple of **get_data_chunk_count()** or if
|
||||
* the implementation imposes additional alignment constraints,
|
||||
* the chunk size may be larger.
|
||||
*
|
||||
* The byte found at offset **B** of the original object is mapped
|
||||
* to chunk **B / get_chunk_size()** at offset **B % get_chunk_size()**.
|
||||
*
|
||||
* @param [in] object_size the number of bytes of the object to **encode()**
|
||||
* @return the size (in bytes) of a single chunk created by **encode()**
|
||||
*/
|
||||
virtual unsigned int get_chunk_size(unsigned int object_size) const = 0;
|
||||
|
||||
/**
|
||||
* Compute the smallest subset of **available** chunks that needs
|
||||
* to be retrieved in order to successfully decode
|
||||
@ -231,6 +318,29 @@ namespace ceph {
|
||||
virtual int decode(const set<int> &want_to_read,
|
||||
const map<int, bufferlist> &chunks,
|
||||
map<int, bufferlist> *decoded) = 0;
|
||||
|
||||
/**
|
||||
* Decode the first **get_data_chunk_count()** **chunks** and
|
||||
* concatenate them them into **decoded**.
|
||||
*
|
||||
* Returns 0 on success.
|
||||
*
|
||||
* @param [in] chunks map chunk indexes to chunk data
|
||||
* @param [out] decoded concatenante of the data chunks
|
||||
* @return **0** on success or a negative errno on error.
|
||||
*/
|
||||
int decode_concat(const map<int, bufferlist> &chunks,
|
||||
bufferlist *decoded) {
|
||||
set<int> want_to_read;
|
||||
for (unsigned int i = 0; i < get_data_chunk_count(); i++)
|
||||
want_to_read.insert(i);
|
||||
map<int, bufferlist> decoded_map;
|
||||
int r = decode(want_to_read, chunks, &decoded_map);
|
||||
if (r == 0)
|
||||
for (unsigned int i = 0; i < get_data_chunk_count(); i++)
|
||||
decoded->claim_append(decoded_map[i]);
|
||||
return r;
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::tr1::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;
|
||||
|
@ -43,6 +43,15 @@ void ErasureCodeJerasure::init(const map<std::string,std::string> ¶meters)
|
||||
prepare();
|
||||
}
|
||||
|
||||
unsigned int ErasureCodeJerasure::get_chunk_size(unsigned int object_size) const
|
||||
{
|
||||
unsigned alignment = get_alignment();
|
||||
unsigned tail = object_size % alignment;
|
||||
unsigned padded_length = object_size + ( tail ? ( alignment - tail ) : 0 );
|
||||
assert(padded_length % k == 0);
|
||||
return padded_length / k;
|
||||
}
|
||||
|
||||
int ErasureCodeJerasure::minimum_to_decode(const set<int> &want_to_read,
|
||||
const set<int> &available_chunks,
|
||||
set<int> *minimum)
|
||||
@ -77,9 +86,8 @@ int ErasureCodeJerasure::encode(const set<int> &want_to_encode,
|
||||
const bufferlist &in,
|
||||
map<int, bufferlist> *encoded)
|
||||
{
|
||||
unsigned alignment = get_alignment();
|
||||
unsigned tail = in.length() % alignment;
|
||||
unsigned padded_length = in.length() + ( tail ? ( alignment - tail ) : 0 );
|
||||
unsigned blocksize = get_chunk_size(in.length());
|
||||
unsigned padded_length = blocksize * k;
|
||||
dout(10) << "encode adjusted buffer length from " << in.length()
|
||||
<< " to " << padded_length << dendl;
|
||||
assert(padded_length % k == 0);
|
||||
@ -90,7 +98,6 @@ int ErasureCodeJerasure::encode(const set<int> &want_to_encode,
|
||||
out.push_back(pad);
|
||||
out.rebuild_page_aligned();
|
||||
}
|
||||
unsigned blocksize = padded_length / k;
|
||||
unsigned coding_length = blocksize * m;
|
||||
bufferptr coding(buffer::create_page_aligned(coding_length));
|
||||
out.push_back(coding);
|
||||
@ -196,7 +203,7 @@ int ErasureCodeJerasureReedSolomonVandermonde::jerasure_decode(int *erasures,
|
||||
erasures, data, coding, blocksize);
|
||||
}
|
||||
|
||||
unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment()
|
||||
unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment() const
|
||||
{
|
||||
unsigned alignment = k*w*sizeof(int);
|
||||
if ( ((w*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
|
||||
@ -240,7 +247,7 @@ int ErasureCodeJerasureReedSolomonRAID6::jerasure_decode(int *erasures,
|
||||
return jerasure_matrix_decode(k, m, w, matrix, 1, erasures, data, coding, blocksize);
|
||||
}
|
||||
|
||||
unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment()
|
||||
unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment() const
|
||||
{
|
||||
unsigned alignment = k*w*sizeof(int);
|
||||
if ( ((w*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
|
||||
@ -285,7 +292,7 @@ int ErasureCodeJerasureCauchy::jerasure_decode(int *erasures,
|
||||
erasures, data, coding, blocksize, packetsize, 1);
|
||||
}
|
||||
|
||||
unsigned ErasureCodeJerasureCauchy::get_alignment()
|
||||
unsigned ErasureCodeJerasureCauchy::get_alignment() const
|
||||
{
|
||||
unsigned alignment = k*w*packetsize*sizeof(int);
|
||||
if ( ((w*packetsize*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
|
||||
@ -355,7 +362,7 @@ int ErasureCodeJerasureLiberation::jerasure_decode(int *erasures,
|
||||
coding, blocksize, packetsize, 1);
|
||||
}
|
||||
|
||||
unsigned ErasureCodeJerasureLiberation::get_alignment()
|
||||
unsigned ErasureCodeJerasureLiberation::get_alignment() const
|
||||
{
|
||||
unsigned alignment = k*w*packetsize*sizeof(int);
|
||||
if ( ((w*packetsize*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
|
||||
|
@ -32,6 +32,16 @@ public:
|
||||
|
||||
virtual ~ErasureCodeJerasure() {}
|
||||
|
||||
virtual unsigned int get_chunk_count() const {
|
||||
return k + m;
|
||||
}
|
||||
|
||||
virtual unsigned int get_data_chunk_count() const {
|
||||
return k;
|
||||
}
|
||||
|
||||
virtual unsigned int get_chunk_size(unsigned int object_size) const;
|
||||
|
||||
virtual int minimum_to_decode(const set<int> &want_to_read,
|
||||
const set<int> &available_chunks,
|
||||
set<int> *minimum);
|
||||
@ -56,7 +66,7 @@ public:
|
||||
char **data,
|
||||
char **coding,
|
||||
int blocksize) = 0;
|
||||
virtual unsigned get_alignment() = 0;
|
||||
virtual unsigned get_alignment() const = 0;
|
||||
virtual void parse(const map<std::string,std::string> ¶meters) = 0;
|
||||
virtual void prepare() = 0;
|
||||
static int to_int(const std::string &name,
|
||||
@ -88,7 +98,7 @@ public:
|
||||
char **data,
|
||||
char **coding,
|
||||
int blocksize);
|
||||
virtual unsigned get_alignment();
|
||||
virtual unsigned get_alignment() const;
|
||||
virtual void parse(const map<std::string,std::string> ¶meters);
|
||||
virtual void prepare();
|
||||
};
|
||||
@ -115,7 +125,7 @@ public:
|
||||
char **data,
|
||||
char **coding,
|
||||
int blocksize);
|
||||
virtual unsigned get_alignment();
|
||||
virtual unsigned get_alignment() const;
|
||||
virtual void parse(const map<std::string,std::string> ¶meters);
|
||||
virtual void prepare();
|
||||
};
|
||||
@ -149,7 +159,7 @@ public:
|
||||
char **data,
|
||||
char **coding,
|
||||
int blocksize);
|
||||
virtual unsigned get_alignment();
|
||||
virtual unsigned get_alignment() const;
|
||||
virtual void parse(const map<std::string,std::string> ¶meters);
|
||||
void prepare_schedule(int *matrix);
|
||||
};
|
||||
@ -196,7 +206,7 @@ public:
|
||||
char **data,
|
||||
char **coding,
|
||||
int blocksize);
|
||||
virtual unsigned get_alignment();
|
||||
virtual unsigned get_alignment() const;
|
||||
virtual void parse(const map<std::string,std::string> ¶meters);
|
||||
virtual void prepare();
|
||||
};
|
||||
|
@ -81,6 +81,18 @@ public:
|
||||
return minimum_to_decode(want_to_read, available_chunks, minimum);
|
||||
}
|
||||
|
||||
virtual unsigned int get_chunk_count() const {
|
||||
return DATA_CHUNKS + CODING_CHUNKS;
|
||||
}
|
||||
|
||||
virtual unsigned int get_data_chunk_count() const {
|
||||
return DATA_CHUNKS;
|
||||
}
|
||||
|
||||
virtual unsigned int get_chunk_size(unsigned int object_size) const {
|
||||
return ( object_size / DATA_CHUNKS ) + 1;
|
||||
}
|
||||
|
||||
virtual int encode(const set<int> &want_to_encode,
|
||||
const bufferlist &in,
|
||||
map<int, bufferlist> *encoded) {
|
||||
@ -88,11 +100,11 @@ public:
|
||||
// make sure all data chunks have the same length, allocating
|
||||
// padding if necessary.
|
||||
//
|
||||
unsigned chunk_length = ( in.length() / DATA_CHUNKS ) + 1;
|
||||
unsigned length = chunk_length * ( DATA_CHUNKS + CODING_CHUNKS );
|
||||
unsigned int chunk_length = get_chunk_size(in.length());
|
||||
bufferlist out(in);
|
||||
bufferptr pad(length - in.length());
|
||||
pad.zero(0, DATA_CHUNKS);
|
||||
unsigned int width = get_chunk_count() * get_chunk_size(in.length());
|
||||
bufferptr pad(width - in.length());
|
||||
pad.zero(0, get_data_chunk_count());
|
||||
out.push_back(pad);
|
||||
//
|
||||
// compute the coding chunk with first chunk ^ second chunk
|
||||
|
@ -20,6 +20,13 @@
|
||||
#include "global/global_context.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
// Checks the chunk geometry reported by ErasureCodeExample: three chunks
// in total and an 11 byte chunk for a 20 byte object.
TEST(ErasureCodeExample, chunk_size)
|
||||
{
|
||||
ErasureCodeExample example;
|
||||
EXPECT_EQ(3u, example.get_chunk_count());
|
||||
EXPECT_EQ(11u, example.get_chunk_size(20));
|
||||
}
|
||||
|
||||
TEST(ErasureCodeExample, minimum_to_decode)
|
||||
{
|
||||
ErasureCodeExample example;
|
||||
@ -105,13 +112,13 @@ TEST(ErasureCodeExample, encode_decode)
|
||||
|
||||
bufferlist in;
|
||||
in.append("ABCDE");
|
||||
int want_to_encode[] = { 0, 1, 2 };
|
||||
set<int> want_to_encode;
|
||||
for(unsigned int i = 0; i < example.get_chunk_count(); i++)
|
||||
want_to_encode.insert(i);
|
||||
map<int, bufferlist> encoded;
|
||||
EXPECT_EQ(0, example.encode(set<int>(want_to_encode, want_to_encode+3),
|
||||
in,
|
||||
&encoded));
|
||||
EXPECT_EQ(3u, encoded.size());
|
||||
EXPECT_EQ(3u, encoded[0].length());
|
||||
EXPECT_EQ(0, example.encode(want_to_encode, in, &encoded));
|
||||
EXPECT_EQ(example.get_chunk_count(), encoded.size());
|
||||
EXPECT_EQ(example.get_chunk_size(in.length()), encoded[0].length());
|
||||
EXPECT_EQ('A', encoded[0][0]);
|
||||
EXPECT_EQ('B', encoded[0][1]);
|
||||
EXPECT_EQ('C', encoded[0][2]);
|
||||
@ -157,6 +164,43 @@ TEST(ErasureCodeExample, encode_decode)
|
||||
}
|
||||
}
|
||||
|
||||
// Exercises decode_concat(): encodes a 310 byte payload, checks that the
// concatenated data chunks start with the original bytes, then checks that
// decoding from a single chunk is reported as -ERANGE.
TEST(ErasureCodeExample, decode)
|
||||
{
|
||||
ErasureCodeExample example;
|
||||

||||
// Capacity of the input buffer; larger than the payload appended below.
#define LARGE_ENOUGH 2048
|
||||
bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH));
|
||||
in_ptr.zero();
|
||||
// NOTE(review): presumably resets the used length so that append() starts
// at offset 0 -- confirm against bufferptr semantics.
in_ptr.set_length(0);
|
||||
// Five repetitions of a 62 character alphabet: 310 bytes.
const char *payload =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
|
||||
in_ptr.append(payload, strlen(payload));
|
||||
bufferlist in;
|
||||
in.push_front(in_ptr);
|
||||
// Ask for chunks 0, 1 and 2 to be encoded.
int want_to_encode[] = { 0, 1, 2 };
|
||||
map<int, bufferlist> encoded;
|
||||
EXPECT_EQ(0, example.encode(set<int>(want_to_encode, want_to_encode+3),
|
||||
in,
|
||||
&encoded));
|
||||
EXPECT_EQ(3u, encoded.size());
|
||||

||||
// successful decode: only the first in.length() bytes of the output are
// compared with the input (presumably because the chunks carry trailing
// padding -- confirm against encode()).
|
||||
bufferlist out;
|
||||
EXPECT_EQ(0, example.decode_concat(encoded, &out));
|
||||
bufferlist usable;
|
||||
usable.substr_of(out, 0, in.length());
|
||||
EXPECT_TRUE(usable == in);
|
||||

||||
// cannot recover: only chunk 0 is supplied and -ERANGE is expected
|
||||
map<int, bufferlist> degraded;
|
||||
degraded[0] = encoded[0];
|
||||
EXPECT_EQ(-ERANGE, example.decode_concat(degraded, &out));
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
vector<const char*> args;
|
||||
argv_to_vec(argc, (const char **)argv, args);
|
||||
@ -168,6 +212,15 @@ int main(int argc, char **argv) {
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
|
||||
// Local Variables:
|
||||
// compile-command: "cd ../.. ; make -j4 && make unittest_erasure_code_example && valgrind --leak-check=full --tool=memcheck ./unittest_erasure_code_example --gtest_filter=*.* --log-to-stderr=true --debug-osd=20"
|
||||
// End:
|
||||
/*
|
||||
* Local Variables:
|
||||
* compile-command: "cd ../.. ;
|
||||
* make -j4 &&
|
||||
* make unittest_erasure_code_example &&
|
||||
* valgrind --leak-check=full --tool=memcheck \
|
||||
* ./unittest_erasure_code_example --gtest_filter=*.* \
|
||||
* --log-to-stderr=true --debug-osd=20
|
||||
* "
|
||||
* End:
|
||||
*/
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user