From 50d781af5a0979d2d531a5d9ca5d5432927764ef Mon Sep 17 00:00:00 2001 From: Andrew Solomon Date: Sun, 14 May 2017 04:52:11 +0000 Subject: [PATCH] crc32c: Add ppc64le fast zero optimized assembly. Allow faster calculation of crc32c when a NULL buffer is passed. Signed-off-by: Andrew Solomon --- src/CMakeLists.txt | 4 +- src/common/crc32c_ppc.c | 55 +++++++++++++++---- src/common/crc32c_ppc_constants.h | 71 ++++++++++++++++++++++++ src/common/crc32c_ppc_fast_zero_asm.S | 77 +++++++++++++++++++++++++++ src/common/hobject.h | 23 ++------ src/common/reverse.c | 42 +++++++++++++++ src/common/reverse.h | 31 +++++++++++ src/include/crc32c.h | 10 +++- src/test/common/test_crc32c.cc | 19 +++++-- 9 files changed, 298 insertions(+), 34 deletions(-) create mode 100644 src/common/crc32c_ppc_fast_zero_asm.S create mode 100644 src/common/reverse.c create mode 100644 src/common/reverse.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 746c4d97328..649a27944ac 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -471,6 +471,7 @@ set(libcommon_files ${async_rdma_common_srcs} ${dpdk_common_srcs} msg/msg_types.cc + common/reverse.c common/hobject.cc osd/OSDMap.cc osd/OSDMapMapping.cc @@ -544,7 +545,8 @@ if(HAVE_INTEL) elseif(HAVE_POWER8) list(APPEND libcommon_files common/crc32c_ppc.c - common/crc32c_ppc_asm.S) + common/crc32c_ppc_asm.S + common/crc32c_ppc_fast_zero_asm.S) endif(HAVE_INTEL) if(LINUX) diff --git a/src/common/crc32c_ppc.c b/src/common/crc32c_ppc.c index e113ad8e1a6..43756e24ef8 100644 --- a/src/common/crc32c_ppc.c +++ b/src/common/crc32c_ppc.c @@ -7,9 +7,12 @@ * 2 of the License, or (at your option) any later version. */ #define CRC_TABLE +#define FAST_ZERO_TABLE + #include "acconfig.h" #include "include/int_types.h" #include "crc32c_ppc_constants.h" +#include "reverse.h" #include #include @@ -35,8 +38,38 @@ static unsigned int crc32_align(unsigned int crc, unsigned char const *p, } #endif - #ifdef HAVE_POWER8 +static inline unsigned long polynomial_multiply(unsigned int a, unsigned int b) { + vector unsigned int va = {a, 0, 0, 0}; + vector unsigned int vb = {b, 0, 0, 0}; + vector unsigned long vt; + + __asm__("vpmsumw %0,%1,%2" : "=v"(vt) : "v"(va), "v"(vb)); + + return vt[0]; +} + +unsigned int barrett_reduction(unsigned long val); + +static inline unsigned int gf_multiply(unsigned int a, unsigned int b) { + return barrett_reduction(polynomial_multiply(a, b)); +} + +unsigned int append_zeros(unsigned int crc, unsigned long length) { + unsigned long i = 0; + + while (length) { + if (length & 1) { + crc = gf_multiply(crc, crc_zero[i]); + } + i++; + length /= 2; + } + + return crc; +} + + unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p, unsigned long len); @@ -79,19 +112,23 @@ out: } /* This wrapper function works around the fact that crc32_vpmsum - * does not gracefully handle the case where the data pointer is NULL. There - * may be room for performance improvement here. + * does not gracefully handle the case where the data pointer is NULL. */ uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len) { - unsigned char *buf2; - if (!data) { - buf2 = malloc(len); - bzero(buf2, len); - crc = crc32_vpmsum(crc, buf2, len); - free(buf2); + /* Handle the NULL buffer case. */ +#ifdef REFLECT + crc = reverse_bits(crc); +#endif + + crc = append_zeros(crc, len); + +#ifdef REFLECT + crc = reverse_bits(crc); +#endif } else { + /* Handle the valid buffer case. */ crc = crc32_vpmsum(crc, data, (unsigned long)len); } return crc; diff --git a/src/common/crc32c_ppc_constants.h b/src/common/crc32c_ppc_constants.h index 25864f1045e..12a1e1d51fa 100644 --- a/src/common/crc32c_ppc_constants.h +++ b/src/common/crc32c_ppc_constants.h @@ -78,6 +78,77 @@ static const unsigned int crc_table[] = { 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,}; #endif + +#ifdef FAST_ZERO_TABLE +/* fast zero table */ +unsigned int crc_zero[] = { + 0x100, + 0x10000, + 0x1edc6f41, + 0x3aab4576, + 0x18571d18, + 0x59a3508a, + 0xaa97d41d, + 0xe78dbf1d, + 0x4ef6a711, + 0x2506c32e, + 0x68d4e827, + 0x546ea6b0, + 0x465cebac, + 0x26a86214, + 0x964aa2fd, + 0x3b4c5747, + 0x6702ee7f, + 0xd086629f, + 0xf1f2043c, + 0xc761a1ca, + 0xa8964e9a, + 0x90cab2ce, + 0xc6e3583d, + 0x3344e0be, + 0x7d53914b, + 0x3d953297, + 0xfcf2eda0, + 0x42f878a5, + 0x2, + 0x4, + 0x10, + 0x100, + 0x10000, + 0x1edc6f41, + 0x3aab4576, + 0x18571d18, + 0x59a3508a, + 0xaa97d41d, + 0xe78dbf1d, + 0x4ef6a711, + 0x2506c32e, + 0x68d4e827, + 0x546ea6b0, + 0x465cebac, + 0x26a86214, + 0x964aa2fd, + 0x3b4c5747, + 0x6702ee7f, + 0xd086629f, + 0xf1f2043c, + 0xc761a1ca, + 0xa8964e9a, + 0x90cab2ce, + 0xc6e3583d, + 0x3344e0be, + 0x7d53914b, + 0x3d953297, + 0xfcf2eda0, + 0x42f878a5, + 0x2, + 0x4, + 0x10, + 0x100, + 0x10000 +}; +#endif + #else #define MAX_SIZE 32768 .constants: diff --git a/src/common/crc32c_ppc_fast_zero_asm.S b/src/common/crc32c_ppc_fast_zero_asm.S new file mode 100644 index 00000000000..a53df1deead --- /dev/null +++ b/src/common/crc32c_ppc_fast_zero_asm.S @@ -0,0 +1,77 @@ +/* + * Use the fixed point version of Barrett reduction to compute a mod n + * over GF(2) for given n using POWER8 instructions. We use k = 32. + * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard , IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of either: + * + * a) the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) + * any later version, or + * b) the Apache License, Version 2.0 + */ +#include +#include "common/ppc-opcode.h" + +#undef toc + +#ifndef r1 +#define r1 1 +#endif + +#ifndef r2 +#define r2 2 +#endif + + .section .data +.balign 16 + +.barrett_fz_constants: + /* Barrett constant m - (4^32)/n */ + .octa 0x0000000000000000000000011f91caf6 /* x^64 div p(x) */ + /* Barrett constant n */ + .octa 0x0000000000000000000000011edc6f41 + +.text +/* unsigned int barrett_reduction(unsigned long val) */ +FUNC_START(barrett_reduction) + addis r4,r2,.barrett_fz_constants@toc@ha + addi r4,r4,.barrett_fz_constants@toc@l + + li r5,16 + vxor v1,v1,v1 /* zero v1 */ + + /* Get a into v0 */ + MTVRD(v0, r3) + vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */ + + /* Load constants */ + lvx v2,0,r4 /* m */ + lvx v3,r5,r4 /* n */ + + /* + * Now for the actual algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v4,v0,v2) /* ma */ + vsldoi v4,v1,v4,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v4,v4,v3) /* qn */ + vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,v1,8 /* shift result into top 64 bits of v0 */ + MFVRD(r3, v0) + + blr +FUNC_END(barrett_reduction) + diff --git a/src/common/hobject.h b/src/common/hobject.h index 258d6a3b42b..9b3f38f4494 100644 --- a/src/common/hobject.h +++ b/src/common/hobject.h @@ -21,6 +21,8 @@ #include "json_spirit/json_spirit_value.h" #include "include/assert.h" // spirit clobbers it! +#include "reverse.h" + namespace ceph { class Formatter; } @@ -197,27 +199,10 @@ public: } static uint32_t _reverse_bits(uint32_t v) { - if (v == 0) - return v; - // reverse bits - // swap odd and even bits - v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); - // swap consecutive pairs - v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); - // swap nibbles ... - v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); - // swap bytes - v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); - // swap 2-byte long pairs - v = ( v >> 16 ) | ( v << 16); - return v; + return reverse_bits(v); } static uint32_t _reverse_nibbles(uint32_t retval) { - // reverse nibbles - retval = ((retval & 0x0f0f0f0f) << 4) | ((retval & 0xf0f0f0f0) >> 4); - retval = ((retval & 0x00ff00ff) << 8) | ((retval & 0xff00ff00) >> 8); - retval = ((retval & 0x0000ffff) << 16) | ((retval & 0xffff0000) >> 16); - return retval; + return reverse_nibbles(retval); } /** diff --git a/src/common/reverse.c b/src/common/reverse.c new file mode 100644 index 00000000000..f65540d54fa --- /dev/null +++ b/src/common/reverse.c @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "reverse.h" + +uint32_t reverse_bits(uint32_t v) { + if (v == 0) + return v; + + /* reverse bits + * swap odd and even bits + */ + v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); + /* swap consecutive pairs */ + v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); + /* swap nibbles ... */ + v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); + /* swap bytes */ + v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); + /* swap 2-byte long pairs */ + v = ( v >> 16 ) | ( v << 16); + return v; +} + +uint32_t reverse_nibbles(uint32_t retval) { + /* reverse nibbles */ + retval = ((retval & 0x0f0f0f0f) << 4) | ((retval & 0xf0f0f0f0) >> 4); + retval = ((retval & 0x00ff00ff) << 8) | ((retval & 0xff00ff00) >> 8); + retval = ((retval & 0x0000ffff) << 16) | ((retval & 0xffff0000) >> 16); + return retval; +} diff --git a/src/common/reverse.h b/src/common/reverse.h new file mode 100644 index 00000000000..9a199a8472b --- /dev/null +++ b/src/common/reverse.h @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __CEPH_OS_REVERSE_H +#define __CEPH_OS_REVERSE_H + +#include "include/int_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint32_t reverse_bits(uint32_t v); +extern uint32_t reverse_nibbles(uint32_t retval); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/crc32c.h b/src/include/crc32c.h index 86d9c8d229c..dd4ede666ec 100644 --- a/src/include/crc32c.h +++ b/src/include/crc32c.h @@ -20,7 +20,10 @@ extern ceph_crc32c_func_t ceph_choose_crc32(void); /** * calculate crc32c for data that is entirely 0 (ZERO) * - * Note: works the same as \ref ceph_crc32c for data == nullptr, but faster + * Note: works the same as ceph_crc32c_func for data == nullptr, + * but faster than the optimized assembly on certain architectures. + * This is faster than intel optimized assembly, but not as fast as + * ppc64le optimized assembly. * * @param crc initial value * @param length length of buffer @@ -39,9 +42,12 @@ uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length); */ static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length) { +#ifndef HAVE_POWER8 if (!data && length > 16) return ceph_crc32c_zeros(crc, length); - return ceph_crc32c_func(crc, data, length); +#endif /* HAVE_POWER8 */ + + return ceph_crc32c_func(crc, data, length); } #ifdef __cplusplus diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc index c51006732e8..7071728bb46 100644 --- a/src/test/common/test_crc32c.cc +++ b/src/test/common/test_crc32c.cc @@ -319,12 +319,23 @@ TEST(Crc32c, zeros_performance_compare) { pre_start = ceph_clock_now(); start = ceph_clock_now(); +#ifdef HAVE_POWER8 + uint32_t crc_b = ceph_crc32c_zeros(111, size); +#else uint32_t crc_b = ceph_crc32c_func(111, nullptr, size); +#endif end = ceph_clock_now(); time_adjusted = (end - start) - (start - pre_start); +#ifdef HAVE_POWER8 + std::cout << "ceph_crc32c_zeros method. size=" << size << " time=" + << (double)(end-start) << " at " << (double)size/(1024*1024)/(time_adjusted) + << " MB/sec" << " error=" << resolution / time_adjusted * 100 << "%" + << std::endl; +#else std::cout << "fallback method. size=" << size << " time=" << (double)(end-start) << " at " << (double)size/(1024*1024)/(time_adjusted) << " MB/sec" << " error=" << resolution / time_adjusted * 100 << "%" << std::endl; +#endif EXPECT_EQ(crc_a, crc_b); } } @@ -336,10 +347,12 @@ TEST(Crc32c, zeros_performance) { start = ceph_clock_now(); for (size_t i=0; i