mirror of
https://github.com/ceph/ceph
synced 2024-12-17 00:46:05 +00:00
crc32c: Add ppc64le fast zero optimized assembly.
Allow faster calculation of crc32c when a NULL buffer is passed. Signed-off-by: Andrew Solomon <asolomon@us.ibm.com>
This commit is contained in:
parent
d7742d2c2d
commit
50d781af5a
@ -471,6 +471,7 @@ set(libcommon_files
|
||||
${async_rdma_common_srcs}
|
||||
${dpdk_common_srcs}
|
||||
msg/msg_types.cc
|
||||
common/reverse.c
|
||||
common/hobject.cc
|
||||
osd/OSDMap.cc
|
||||
osd/OSDMapMapping.cc
|
||||
@ -544,7 +545,8 @@ if(HAVE_INTEL)
|
||||
elseif(HAVE_POWER8)
|
||||
list(APPEND libcommon_files
|
||||
common/crc32c_ppc.c
|
||||
common/crc32c_ppc_asm.S)
|
||||
common/crc32c_ppc_asm.S
|
||||
common/crc32c_ppc_fast_zero_asm.S)
|
||||
endif(HAVE_INTEL)
|
||||
|
||||
if(LINUX)
|
||||
|
@ -7,9 +7,12 @@
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
#define CRC_TABLE
|
||||
#define FAST_ZERO_TABLE
|
||||
|
||||
#include "acconfig.h"
|
||||
#include "include/int_types.h"
|
||||
#include "crc32c_ppc_constants.h"
|
||||
#include "reverse.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <strings.h>
|
||||
@ -35,8 +38,38 @@ static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef HAVE_POWER8
|
||||
/*
 * Multiply a and b with the POWER8 vpmsumw instruction and return
 * element 0 of the resulting vector.
 *
 * Each operand is placed in element 0 of a four-word vector whose other
 * three elements are zero, so only one word pair contributes a non-zero
 * product.
 *
 * NOTE(review): presumably this is the carry-less (GF(2)) polynomial
 * product of a and b, and reading vt[0] presumably relies on the
 * little-endian element numbering of ppc64le -- confirm against the
 * Power ISA description of vpmsumw.
 */
static inline unsigned long polynomial_multiply(unsigned int a, unsigned int b) {
|
||||
vector unsigned int va = {a, 0, 0, 0};
|
||||
vector unsigned int vb = {b, 0, 0, 0};
|
||||
vector unsigned long vt;
|
||||
|
||||
/* "v" places each operand in a vector register. */
__asm__("vpmsumw %0,%1,%2" : "=v"(vt) : "v"(va), "v"(vb));
|
||||
|
||||
return vt[0];
|
||||
}
|
||||
|
||||
/* Reduction routine; defined in assembly by FUNC_START(barrett_reduction). */
unsigned int barrett_reduction(unsigned long val);

/*
 * Multiply a by b with polynomial_multiply() and fold the wide product
 * back down to an unsigned int with barrett_reduction().
 */
static inline unsigned int gf_multiply(unsigned int a, unsigned int b) {
  const unsigned long wide_product = polynomial_multiply(a, b);

  return barrett_reduction(wide_product);
}
|
||||
|
||||
unsigned int append_zeros(unsigned int crc, unsigned long length) {
|
||||
unsigned long i = 0;
|
||||
|
||||
while (length) {
|
||||
if (length & 1) {
|
||||
crc = gf_multiply(crc, crc_zero[i]);
|
||||
}
|
||||
i++;
|
||||
length /= 2;
|
||||
}
|
||||
|
||||
return crc;
|
||||
}
|
||||
|
||||
|
||||
unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
|
||||
unsigned long len);
|
||||
|
||||
@ -79,19 +112,23 @@ out:
|
||||
}
|
||||
|
||||
/* This wrapper function works around the fact that crc32_vpmsum
|
||||
* does not gracefully handle the case where the data pointer is NULL. There
|
||||
* may be room for performance improvement here.
|
||||
* does not gracefully handle the case where the data pointer is NULL.
|
||||
*/
|
||||
uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
|
||||
{
|
||||
unsigned char *buf2;
|
||||
|
||||
if (!data) {
|
||||
buf2 = malloc(len);
|
||||
bzero(buf2, len);
|
||||
crc = crc32_vpmsum(crc, buf2, len);
|
||||
free(buf2);
|
||||
/* Handle the NULL buffer case. */
|
||||
#ifdef REFLECT
|
||||
crc = reverse_bits(crc);
|
||||
#endif
|
||||
|
||||
crc = append_zeros(crc, len);
|
||||
|
||||
#ifdef REFLECT
|
||||
crc = reverse_bits(crc);
|
||||
#endif
|
||||
} else {
|
||||
/* Handle the valid buffer case. */
|
||||
crc = crc32_vpmsum(crc, data, (unsigned long)len);
|
||||
}
|
||||
return crc;
|
||||
|
@ -78,6 +78,77 @@ static const unsigned int crc_table[] = {
|
||||
0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,};
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef FAST_ZERO_TABLE
|
||||
/* fast zero table */
|
||||
/*
 * Fast-zero multiplier table: entry k is the factor applied to the crc
 * when bit k of the zero-run length is set.
 *
 * The table has 64 entries, one per bit of a 64-bit length.  The values
 * form a 31-entry sequence that repeats (entries 0-30, 31-61, then the
 * first two again for 62-63).
 *
 * Declared static const, matching crc_table: the data is never written,
 * and internal linkage keeps a second inclusion of this header with
 * FAST_ZERO_TABLE defined from producing a duplicate symbol.
 */
static const unsigned int crc_zero[] = {
  /* entries 0-30 */
  0x100, 0x10000, 0x1edc6f41, 0x3aab4576,
  0x18571d18, 0x59a3508a, 0xaa97d41d, 0xe78dbf1d,
  0x4ef6a711, 0x2506c32e, 0x68d4e827, 0x546ea6b0,
  0x465cebac, 0x26a86214, 0x964aa2fd, 0x3b4c5747,
  0x6702ee7f, 0xd086629f, 0xf1f2043c, 0xc761a1ca,
  0xa8964e9a, 0x90cab2ce, 0xc6e3583d, 0x3344e0be,
  0x7d53914b, 0x3d953297, 0xfcf2eda0, 0x42f878a5,
  0x2, 0x4, 0x10,
  /* entries 31-61 */
  0x100, 0x10000, 0x1edc6f41, 0x3aab4576,
  0x18571d18, 0x59a3508a, 0xaa97d41d, 0xe78dbf1d,
  0x4ef6a711, 0x2506c32e, 0x68d4e827, 0x546ea6b0,
  0x465cebac, 0x26a86214, 0x964aa2fd, 0x3b4c5747,
  0x6702ee7f, 0xd086629f, 0xf1f2043c, 0xc761a1ca,
  0xa8964e9a, 0x90cab2ce, 0xc6e3583d, 0x3344e0be,
  0x7d53914b, 0x3d953297, 0xfcf2eda0, 0x42f878a5,
  0x2, 0x4, 0x10,
  /* entries 62-63 */
  0x100, 0x10000
};
|
||||
#endif
|
||||
|
||||
#else
|
||||
#define MAX_SIZE 32768
|
||||
.constants:
|
||||
|
77
src/common/crc32c_ppc_fast_zero_asm.S
Normal file
77
src/common/crc32c_ppc_fast_zero_asm.S
Normal file
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Use the fixed point version of Barrett reduction to compute a mod n
|
||||
* over GF(2) for given n using POWER8 instructions. We use k = 32.
|
||||
*
|
||||
* http://en.wikipedia.org/wiki/Barrett_reduction
|
||||
*
|
||||
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of either:
|
||||
*
|
||||
* a) the GNU General Public License as published by the Free Software
|
||||
* Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version, or
|
||||
* b) the Apache License, Version 2.0
|
||||
*/
|
||||
#include <ppc-asm.h>
|
||||
#include "common/ppc-opcode.h"
|
||||
|
||||
#undef toc
|
||||
|
||||
#ifndef r1
|
||||
#define r1 1
|
||||
#endif
|
||||
|
||||
#ifndef r2
|
||||
#define r2 2
|
||||
#endif
|
||||
|
||||
.section .data
|
||||
.balign 16
|
||||
|
||||
.barrett_fz_constants:
|
||||
/* Barrett constant m - (4^32)/n */
|
||||
.octa 0x0000000000000000000000011f91caf6 /* x^64 div p(x) */
|
||||
/* Barrett constant n */
|
||||
.octa 0x0000000000000000000000011edc6f41
|
||||
|
||||
.text
|
||||
/* unsigned int barrett_reduction(unsigned long val) */
|
||||
/*
 * barrett_reduction: reduce the 64-bit value passed in r3 and hand the
 * result back in r3.
 *
 * Register usage in this routine:
 *   r2  base for the @toc-relative address of .barrett_fz_constants
 *   r3  input value on entry, result on exit
 *   r4  address of .barrett_fz_constants
 *   r5  16, the byte offset of the second constant (n)
 *   v0  the value being reduced (a), later a xor qn
 *   v1  all zeroes, used as the fill for the vsldoi shifts
 *   v2  constant m, v3 constant n
 *   v4  scratch: ma, then q, then qn
 */
FUNC_START(barrett_reduction)
|
||||
addis r4,r2,.barrett_fz_constants@toc@ha
|
||||
addi r4,r4,.barrett_fz_constants@toc@l
|
||||
|
||||
li r5,16
|
||||
vxor v1,v1,v1 /* zero v1 */
|
||||
|
||||
/* Get a into v0 */
|
||||
MTVRD(v0, r3)
|
||||
vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */
|
||||
|
||||
/* Load constants */
|
||||
lvx v2,0,r4 /* m */
|
||||
lvx v3,r5,r4 /* n */
|
||||
|
||||
/*
|
||||
* Now for the actual algorithm. The idea is to calculate q,
|
||||
* the multiple of our polynomial that we need to subtract. By
|
||||
* doing the computation 2x bits higher (ie 64 bits) and shifting the
|
||||
* result back down 2x bits, we round down to the nearest multiple.
|
||||
*/
|
||||
VPMSUMD(v4,v0,v2) /* ma */
|
||||
vsldoi v4,v1,v4,8 /* q = floor(ma/(2^64)) */
|
||||
VPMSUMD(v4,v4,v3) /* qn */
|
||||
vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */
|
||||
|
||||
/*
|
||||
* Get the result into r3. We need to shift it left 8 bytes:
|
||||
* V0 [ 0 1 2 X ]
|
||||
* V0 [ 0 X 2 3 ]
|
||||
*/
|
||||
vsldoi v0,v0,v1,8 /* shift result into top 64 bits of v0 */
|
||||
MFVRD(r3, v0)
|
||||
|
||||
blr
|
||||
FUNC_END(barrett_reduction)
|
||||
|
@ -21,6 +21,8 @@
|
||||
#include "json_spirit/json_spirit_value.h"
|
||||
#include "include/assert.h" // spirit clobbers it!
|
||||
|
||||
#include "reverse.h"
|
||||
|
||||
namespace ceph {
|
||||
class Formatter;
|
||||
}
|
||||
@ -197,27 +199,10 @@ public:
|
||||
}
|
||||
|
||||
static uint32_t _reverse_bits(uint32_t v) {
|
||||
if (v == 0)
|
||||
return v;
|
||||
// reverse bits
|
||||
// swap odd and even bits
|
||||
v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
|
||||
// swap consecutive pairs
|
||||
v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
|
||||
// swap nibbles ...
|
||||
v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
|
||||
// swap bytes
|
||||
v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
|
||||
// swap 2-byte long pairs
|
||||
v = ( v >> 16 ) | ( v << 16);
|
||||
return v;
|
||||
return reverse_bits(v);
|
||||
}
|
||||
static uint32_t _reverse_nibbles(uint32_t retval) {
|
||||
// reverse nibbles
|
||||
retval = ((retval & 0x0f0f0f0f) << 4) | ((retval & 0xf0f0f0f0) >> 4);
|
||||
retval = ((retval & 0x00ff00ff) << 8) | ((retval & 0xff00ff00) >> 8);
|
||||
retval = ((retval & 0x0000ffff) << 16) | ((retval & 0xffff0000) >> 16);
|
||||
return retval;
|
||||
return reverse_nibbles(retval);
|
||||
}
|
||||
|
||||
/**
|
||||
|
42
src/common/reverse.c
Normal file
42
src/common/reverse.c
Normal file
@ -0,0 +1,42 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "reverse.h"
|
||||
|
||||
/*
 * Return v with the order of its 32 bits reversed (bit 0 trades places
 * with bit 31, bit 1 with bit 30, and so on).
 *
 * Adjacent groups of 1, 2, 4 and 8 bits are exchanged in turn, and the
 * two 16-bit halves are swapped last.
 */
uint32_t reverse_bits(uint32_t v) {
  /* Mask selecting the low group of each pair, per pass. */
  static const uint32_t low_group[4] = {
    0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF
  };
  unsigned int width = 1;
  int pass;

  /* Zero is its own mirror image. */
  if (v == 0)
    return 0;

  for (pass = 0; pass < 4; pass++) {
    v = ((v >> width) & low_group[pass]) | ((v & low_group[pass]) << width);
    width *= 2;
  }

  return (v << 16) | (v >> 16);
}
|
||||
|
||||
/*
 * Return retval with the order of its eight 4-bit nibbles reversed; the
 * bits inside each nibble keep their order (0x12345678 -> 0x87654321).
 */
uint32_t reverse_nibbles(uint32_t retval) {
  uint32_t flipped = 0;
  int nibble;

  /* Peel nibbles off the low end and push them onto the result. */
  for (nibble = 0; nibble < 8; nibble++) {
    flipped = (flipped << 4) | (retval & 0xf);
    retval >>= 4;
  }

  return flipped;
}
|
31
src/common/reverse.h
Normal file
31
src/common/reverse.h
Normal file
@ -0,0 +1,31 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __CEPH_OS_REVERSE_H
|
||||
#define __CEPH_OS_REVERSE_H
|
||||
|
||||
#include "include/int_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern uint32_t reverse_bits(uint32_t v);
|
||||
extern uint32_t reverse_nibbles(uint32_t retval);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@ -20,7 +20,10 @@ extern ceph_crc32c_func_t ceph_choose_crc32(void);
|
||||
/**
|
||||
* calculate crc32c for data that is entirely 0 (ZERO)
|
||||
*
|
||||
* Note: works the same as \ref ceph_crc32c for data == nullptr, but faster
|
||||
* Note: works the same as ceph_crc32c_func for data == nullptr,
|
||||
* but faster than the optimized assembly on certain architectures.
|
||||
* This is faster than intel optimized assembly, but not as fast as
|
||||
* ppc64le optimized assembly.
|
||||
*
|
||||
* @param crc initial value
|
||||
* @param length length of buffer
|
||||
@ -39,9 +42,12 @@ uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length);
|
||||
*/
|
||||
static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
|
||||
{
|
||||
#ifndef HAVE_POWER8
|
||||
if (!data && length > 16)
|
||||
return ceph_crc32c_zeros(crc, length);
|
||||
return ceph_crc32c_func(crc, data, length);
|
||||
#endif /* HAVE_POWER8 */
|
||||
|
||||
return ceph_crc32c_func(crc, data, length);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -319,12 +319,23 @@ TEST(Crc32c, zeros_performance_compare) {
|
||||
|
||||
pre_start = ceph_clock_now();
|
||||
start = ceph_clock_now();
|
||||
#ifdef HAVE_POWER8
|
||||
uint32_t crc_b = ceph_crc32c_zeros(111, size);
|
||||
#else
|
||||
uint32_t crc_b = ceph_crc32c_func(111, nullptr, size);
|
||||
#endif
|
||||
end = ceph_clock_now();
|
||||
time_adjusted = (end - start) - (start - pre_start);
|
||||
#ifdef HAVE_POWER8
|
||||
std::cout << "ceph_crc32c_zeros method. size=" << size << " time="
|
||||
<< (double)(end-start) << " at " << (double)size/(1024*1024)/(time_adjusted)
|
||||
<< " MB/sec" << " error=" << resolution / time_adjusted * 100 << "%"
|
||||
<< std::endl;
|
||||
#else
|
||||
std::cout << "fallback method. size=" << size << " time=" << (double)(end-start)
|
||||
<< " at " << (double)size/(1024*1024)/(time_adjusted) << " MB/sec"
|
||||
<< " error=" << resolution / time_adjusted * 100 << "%" << std::endl;
|
||||
#endif
|
||||
EXPECT_EQ(crc_a, crc_b);
|
||||
}
|
||||
}
|
||||
@ -336,10 +347,12 @@ TEST(Crc32c, zeros_performance) {
|
||||
|
||||
start = ceph_clock_now();
|
||||
for (size_t i=0; i<ITER; i++)
|
||||
for (size_t scale=1; scale < 31; scale++)
|
||||
{
|
||||
size_t size = (1<<scale) + rand() % (1<<scale);
|
||||
ceph_crc32c(rand(), nullptr, size);
|
||||
for (size_t scale=1; scale < 31; scale++)
|
||||
{
|
||||
size_t size = (1<<scale) + rand() % (1<<scale);
|
||||
ceph_crc32c(rand(), nullptr, size);
|
||||
}
|
||||
}
|
||||
end = ceph_clock_now();
|
||||
std::cout << "iterations="<< ITER*31 << " time=" << (double)(end-start) << std::endl;
|
||||
|
Loading…
Reference in New Issue
Block a user