crc32c: Add ppc64le fast zero optimized assembly.

Allow faster calculation of crc32c when a NULL
buffer is passed.

Signed-off-by: Andrew Solomon <asolomon@us.ibm.com>
This commit is contained in:
Andrew Solomon 2017-05-14 04:52:11 +00:00
parent d7742d2c2d
commit 50d781af5a
9 changed files with 298 additions and 34 deletions

View File

@ -471,6 +471,7 @@ set(libcommon_files
${async_rdma_common_srcs}
${dpdk_common_srcs}
msg/msg_types.cc
common/reverse.c
common/hobject.cc
osd/OSDMap.cc
osd/OSDMapMapping.cc
@ -544,7 +545,8 @@ if(HAVE_INTEL)
elseif(HAVE_POWER8)
list(APPEND libcommon_files
common/crc32c_ppc.c
common/crc32c_ppc_asm.S)
common/crc32c_ppc_asm.S
common/crc32c_ppc_fast_zero_asm.S)
endif(HAVE_INTEL)
if(LINUX)

View File

@ -7,9 +7,12 @@
* 2 of the License, or (at your option) any later version.
*/
#define CRC_TABLE
#define FAST_ZERO_TABLE
#include "acconfig.h"
#include "include/int_types.h"
#include "crc32c_ppc_constants.h"
#include "reverse.h"
#include <stdlib.h>
#include <strings.h>
@ -35,8 +38,38 @@ static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
}
#endif
#ifdef HAVE_POWER8
/*
 * Multiply a and b as polynomials over GF(2) (carry-less multiply) using the
 * POWER8 vpmsumw instruction and return the product.
 *
 * a, b: 32-bit polynomial operands.
 * Returns element 0 of the doubleword result vector.
 *
 * NOTE(review): relies on element 0 of the result holding the a*b product
 * for this target's vector element ordering -- confirm before reusing on a
 * different endianness.
 */
static inline unsigned long polynomial_multiply(unsigned int a, unsigned int b) {
/* Only element 0 of each operand is populated; the other words are zero. */
vector unsigned int va = {a, 0, 0, 0};
vector unsigned int vb = {b, 0, 0, 0};
vector unsigned long vt;
/* "v" constraints keep all three operands in vector registers. */
__asm__("vpmsumw %0,%1,%2" : "=v"(vt) : "v"(va), "v"(vb));
return vt[0];
}
unsigned int barrett_reduction(unsigned long val);
/*
 * Multiply a by b and reduce the result with barrett_reduction(), giving a
 * 32-bit product: first the full carry-less product, then the reduction.
 */
static inline unsigned int gf_multiply(unsigned int a, unsigned int b) {
  unsigned long product = polynomial_multiply(a, b);

  return barrett_reduction(product);
}
/*
 * Fold `length` into crc without reading any buffer: for every set bit i of
 * length, crc is multiplied (via gf_multiply) by crc_zero[i].
 *
 * crc:    running value to update.
 * length: count being appended; consumed bit by bit, lowest bit first.
 * Returns the updated crc (unchanged when length is 0).
 */
unsigned int append_zeros(unsigned int crc, unsigned long length) {
  unsigned long bit;

  for (bit = 0; length != 0; bit++, length >>= 1) {
    if ((length & 1) == 0)
      continue;
    crc = gf_multiply(crc, crc_zero[bit]);
  }
  return crc;
}
unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
unsigned long len);
@ -79,19 +112,23 @@ out:
}
/* This wrapper function works around the fact that crc32_vpmsum
* does not gracefully handle the case where the data pointer is NULL. There
* may be room for performance improvement here.
* does not gracefully handle the case where the data pointer is NULL.
*/
uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
{
unsigned char *buf2;
if (!data) {
buf2 = malloc(len);
bzero(buf2, len);
crc = crc32_vpmsum(crc, buf2, len);
free(buf2);
/* Handle the NULL buffer case. */
#ifdef REFLECT
crc = reverse_bits(crc);
#endif
crc = append_zeros(crc, len);
#ifdef REFLECT
crc = reverse_bits(crc);
#endif
} else {
/* Handle the valid buffer case. */
crc = crc32_vpmsum(crc, data, (unsigned long)len);
}
return crc;

View File

@ -78,6 +78,77 @@ static const unsigned int crc_table[] = {
0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,};
#endif
#ifdef FAST_ZERO_TABLE
/*
 * Fast zero table.
 *
 * Indexed by bit position of a length: append_zeros() multiplies the running
 * crc by crc_zero[i] whenever bit i of the length is set. The table has 64
 * entries, one per bit of a 64-bit length.
 *
 * The first two entries are 0x100 and 0x10000 (x^8 and x^16).
 * NOTE(review): presumably entry i is x^(8 * 2^i) reduced modulo the CRC
 * polynomial -- confirm against the generator used to build the table.
 *
 * The values repeat with period 31: entry 31 equals entry 0, entry 32 equals
 * entry 1, and so on.
 */
unsigned int crc_zero[] = {
0x100,
0x10000,
0x1edc6f41,
0x3aab4576,
0x18571d18,
0x59a3508a,
0xaa97d41d,
0xe78dbf1d,
0x4ef6a711,
0x2506c32e,
0x68d4e827,
0x546ea6b0,
0x465cebac,
0x26a86214,
0x964aa2fd,
0x3b4c5747,
0x6702ee7f,
0xd086629f,
0xf1f2043c,
0xc761a1ca,
0xa8964e9a,
0x90cab2ce,
0xc6e3583d,
0x3344e0be,
0x7d53914b,
0x3d953297,
0xfcf2eda0,
0x42f878a5,
0x2,
0x4,
0x10,
0x100,
0x10000,
0x1edc6f41,
0x3aab4576,
0x18571d18,
0x59a3508a,
0xaa97d41d,
0xe78dbf1d,
0x4ef6a711,
0x2506c32e,
0x68d4e827,
0x546ea6b0,
0x465cebac,
0x26a86214,
0x964aa2fd,
0x3b4c5747,
0x6702ee7f,
0xd086629f,
0xf1f2043c,
0xc761a1ca,
0xa8964e9a,
0x90cab2ce,
0xc6e3583d,
0x3344e0be,
0x7d53914b,
0x3d953297,
0xfcf2eda0,
0x42f878a5,
0x2,
0x4,
0x10,
0x100,
0x10000
};
#endif
#else
#define MAX_SIZE 32768
.constants:

View File

@ -0,0 +1,77 @@
/*
* Use the fixed point version of Barrett reduction to compute a mod n
* over GF(2) for given n using POWER8 instructions. We use k = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of either:
*
* a) the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version, or
* b) the Apache License, Version 2.0
*/
#include <ppc-asm.h>
#include "common/ppc-opcode.h"
#undef toc
#ifndef r1
#define r1 1
#endif
#ifndef r2
#define r2 2
#endif
.section .data
.balign 16
.barrett_fz_constants:
/* Barrett constant m - (4^32)/n */
.octa 0x0000000000000000000000011f91caf6 /* x^64 div p(x) */
/* Barrett constant n */
.octa 0x0000000000000000000000011edc6f41
.text
/*
 * unsigned int barrett_reduction(unsigned long val)
 *
 * Reduce val modulo n over GF(2), where m and n are the two 16-byte
 * constants stored at .barrett_fz_constants.
 *
 * In:  r3 = val
 * Out: r3 = reduced value
 * Uses r4 (constants address), r5 (offset 16) and v0-v4 as scratch.
 */
FUNC_START(barrett_reduction)
/* r4 = address of .barrett_fz_constants, formed relative to the TOC in r2 */
addis r4,r2,.barrett_fz_constants@toc@ha
addi r4,r4,.barrett_fz_constants@toc@l
li r5,16 /* byte offset of the second constant (n) */
vxor v1,v1,v1 /* zero v1 */
/* Get a into v0 */
MTVRD(v0, r3)
vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */
/* Load constants */
lvx v2,0,r4 /* m */
lvx v3,r5,r4 /* n */
/*
 * Now for the actual algorithm. The idea is to calculate q,
 * the multiple of our polynomial that we need to subtract. By
 * doing the computation 2x bits higher (ie 64 bits) and shifting the
 * result back down 2x bits, we round down to the nearest multiple.
 */
VPMSUMD(v4,v0,v2) /* ma */
vsldoi v4,v1,v4,8 /* q = floor(ma/(2^64)) */
VPMSUMD(v4,v4,v3) /* qn */
vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */
/*
 * Get the result into r3. We need to shift it left 8 bytes:
 * V0 [ 0 1 2 X ]
 * V0 [ 0 X 2 3 ]
 */
vsldoi v0,v0,v1,8 /* shift result into top 64 bits of v0 */
MFVRD(r3, v0)
blr
FUNC_END(barrett_reduction)

View File

@ -21,6 +21,8 @@
#include "json_spirit/json_spirit_value.h"
#include "include/assert.h" // spirit clobbers it!
#include "reverse.h"
namespace ceph {
class Formatter;
}
@ -197,27 +199,10 @@ public:
}
static uint32_t _reverse_bits(uint32_t v) {
  // Reverse the order of the 32 bits in v. The swap cascade lives in the
  // shared C helper reverse_bits() (reverse.h), so delegate to it rather
  // than keeping a second inline copy here.
  return reverse_bits(v);
}
static uint32_t _reverse_nibbles(uint32_t retval) {
  // Reverse the order of the eight 4-bit nibbles in retval. Delegates to
  // the shared C helper reverse_nibbles() (reverse.h) so only one copy of
  // the swap sequence exists.
  return reverse_nibbles(retval);
}
/**

42
src/common/reverse.c Normal file
View File

@ -0,0 +1,42 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#include "reverse.h"
/*
 * Return v with its 32 bits in reverse order (bit 0 becomes bit 31, etc.).
 *
 * Works by swapping neighbouring groups of 1, 2, 4, 8 and finally 16 bits;
 * each pass uses the mask that selects the lower group of every pair.
 */
uint32_t reverse_bits(uint32_t v) {
  static const uint32_t low_group[] = {
    0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
  };
  unsigned int width = 1;
  unsigned int pass;

  /* zero is its own reversal */
  if (!v)
    return 0;

  for (pass = 0; pass < sizeof(low_group) / sizeof(low_group[0]); pass++) {
    uint32_t upper = (v >> width) & low_group[pass];
    uint32_t lower = (v & low_group[pass]) << width;

    v = upper | lower;
    width <<= 1;
  }
  return v;
}
/*
 * Return retval with its eight 4-bit nibbles in reverse order
 * (e.g. 0x12345678 becomes 0x87654321). Bits inside a nibble keep
 * their order.
 */
uint32_t reverse_nibbles(uint32_t retval) {
  uint32_t flipped = 0;
  int i;

  /* peel nibbles off the low end and push them onto the result */
  for (i = 0; i < 8; i++) {
    flipped = (flipped << 4) | (retval & 0xf);
    retval >>= 4;
  }
  return flipped;
}

31
src/common/reverse.h Normal file
View File

@ -0,0 +1,31 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef __CEPH_OS_REVERSE_H
#define __CEPH_OS_REVERSE_H
#include "include/int_types.h"
#ifdef __cplusplus
extern "C" {
#endif
extern uint32_t reverse_bits(uint32_t v);
extern uint32_t reverse_nibbles(uint32_t retval);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -20,7 +20,10 @@ extern ceph_crc32c_func_t ceph_choose_crc32(void);
/**
* calculate crc32c for data that is entirely 0 (ZERO)
*
* Note: works the same as ceph_crc32c_func for data == nullptr, but is
* faster than the optimized assembly on certain architectures: it is
* faster than the Intel optimized assembly, but not as fast as the
* ppc64le optimized assembly.
*
* @param crc initial value
* @param length length of buffer
@ -39,9 +42,12 @@ uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length);
*/
static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
{
#ifndef HAVE_POWER8
  /*
   * A NULL buffer means "length bytes of zeros". Without POWER8 support,
   * runs longer than 16 bytes take the dedicated zero-run routine; POWER8
   * builds skip this and let ceph_crc32c_func handle the NULL buffer.
   */
  if (!data && length > 16)
    return ceph_crc32c_zeros(crc, length);
#endif /* HAVE_POWER8 */
  /* Single exit for every remaining case, on all architectures. */
  return ceph_crc32c_func(crc, data, length);
}
#ifdef __cplusplus

View File

@ -319,12 +319,23 @@ TEST(Crc32c, zeros_performance_compare) {
pre_start = ceph_clock_now();
start = ceph_clock_now();
#ifdef HAVE_POWER8
uint32_t crc_b = ceph_crc32c_zeros(111, size);
#else
uint32_t crc_b = ceph_crc32c_func(111, nullptr, size);
#endif
end = ceph_clock_now();
time_adjusted = (end - start) - (start - pre_start);
#ifdef HAVE_POWER8
std::cout << "ceph_crc32c_zeros method. size=" << size << " time="
<< (double)(end-start) << " at " << (double)size/(1024*1024)/(time_adjusted)
<< " MB/sec" << " error=" << resolution / time_adjusted * 100 << "%"
<< std::endl;
#else
std::cout << "fallback method. size=" << size << " time=" << (double)(end-start)
<< " at " << (double)size/(1024*1024)/(time_adjusted) << " MB/sec"
<< " error=" << resolution / time_adjusted * 100 << "%" << std::endl;
#endif
EXPECT_EQ(crc_a, crc_b);
}
}
@ -336,10 +347,12 @@ TEST(Crc32c, zeros_performance) {
start = ceph_clock_now();
for (size_t i=0; i<ITER; i++)
for (size_t scale=1; scale < 31; scale++)
{
size_t size = (1<<scale) + rand() % (1<<scale);
ceph_crc32c(rand(), nullptr, size);
for (size_t scale=1; scale < 31; scale++)
{
size_t size = (1<<scale) + rand() % (1<<scale);
ceph_crc32c(rand(), nullptr, size);
}
}
end = ceph_clock_now();
std::cout << "iterations="<< ITER*31 << " time=" << (double)(end-start) << std::endl;