From 50d781af5a0979d2d531a5d9ca5d5432927764ef Mon Sep 17 00:00:00 2001
From: Andrew Solomon <asolomon@us.ibm.com>
Date: Sun, 14 May 2017 04:52:11 +0000
Subject: [PATCH] crc32c: Add ppc64le fast zero optimized assembly.

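Allow faster calculation of crc32c on ppc64le when a NULL buffer is
passed: instead of allocating and zero-filling a temporary buffer, the
CRC is advanced using a table of precomputed multipliers.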

Signed-off-by: Andrew Solomon <asolomon@us.ibm.com>
---
 src/CMakeLists.txt                    |  4 +-
 src/common/crc32c_ppc.c               | 55 +++++++++++++++----
 src/common/crc32c_ppc_constants.h     | 71 ++++++++++++++++++++++++
 src/common/crc32c_ppc_fast_zero_asm.S | 77 +++++++++++++++++++++++++++
 src/common/hobject.h                  | 23 ++------
 src/common/reverse.c                  | 42 +++++++++++++++
 src/common/reverse.h                  | 31 +++++++++++
 src/include/crc32c.h                  | 10 +++-
 src/test/common/test_crc32c.cc        | 19 +++++--
 9 files changed, 298 insertions(+), 34 deletions(-)
 create mode 100644 src/common/crc32c_ppc_fast_zero_asm.S
 create mode 100644 src/common/reverse.c
 create mode 100644 src/common/reverse.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 746c4d97328..649a27944ac 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -471,6 +471,7 @@ set(libcommon_files
   ${async_rdma_common_srcs}
   ${dpdk_common_srcs}
   msg/msg_types.cc
+  common/reverse.c
   common/hobject.cc
   osd/OSDMap.cc
   osd/OSDMapMapping.cc
@@ -544,7 +545,8 @@ if(HAVE_INTEL)
 elseif(HAVE_POWER8)
   list(APPEND libcommon_files
     common/crc32c_ppc.c
-    common/crc32c_ppc_asm.S)
+    common/crc32c_ppc_asm.S
+    common/crc32c_ppc_fast_zero_asm.S)
 endif(HAVE_INTEL)
 
 if(LINUX)
diff --git a/src/common/crc32c_ppc.c b/src/common/crc32c_ppc.c
index e113ad8e1a6..43756e24ef8 100644
--- a/src/common/crc32c_ppc.c
+++ b/src/common/crc32c_ppc.c
@@ -7,9 +7,12 @@
  * 2 of the License, or (at your option) any later version.
  */
 #define CRC_TABLE
+#define FAST_ZERO_TABLE
+
 #include "acconfig.h"
 #include "include/int_types.h"
 #include "crc32c_ppc_constants.h"
+#include "reverse.h"
 
 #include <stdlib.h>
 #include <strings.h>
@@ -35,8 +38,38 @@ static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
 }
 #endif
 
-
 #ifdef HAVE_POWER8
+static inline unsigned long polynomial_multiply(unsigned int a, unsigned int b) {
+        /* Place each operand in element 0 of a vector; the other lanes stay 0. */
+        vector unsigned int lhs = {a, 0, 0, 0};
+        vector unsigned int rhs = {b, 0, 0, 0};
+        vector unsigned long product;
+
+        __asm__("vpmsumw %0,%1,%2" : "=v"(product) : "v"(lhs), "v"(rhs));
+        return product[0];
+}
+/* Implemented in assembly in crc32c_ppc_fast_zero_asm.S. */
+unsigned int barrett_reduction(unsigned long val);
+/* Multiply a and b as polynomials, then reduce the product with barrett_reduction(). */
+static inline unsigned int gf_multiply(unsigned int a, unsigned int b) {
+        return barrett_reduction(polynomial_multiply(a, b));
+}
+
+unsigned int append_zeros(unsigned int crc, unsigned long length) {
+        /* Walk length from its lowest bit up; set bit k multiplies crc by crc_zero[k]. */
+        unsigned long bit;
+
+        for (bit = 0; length != 0; bit++, length >>= 1) {
+                if ((length & 1) == 0) {
+                        continue;
+                }
+                crc = gf_multiply(crc, crc_zero[bit]);
+        }
+
+        return crc;
+}
+
+
 unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
                             unsigned long len);
 
@@ -79,19 +112,23 @@ out:
 }
 
 /* This wrapper function works around the fact that crc32_vpmsum 
- * does not gracefully handle the case where the data pointer is NULL.  There
- * may be room for performance improvement here.
+ * does not gracefully handle the case where the data pointer is NULL.
  */
 uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
 {
-  unsigned char *buf2;
-
   if (!data) {
-    buf2 = malloc(len);
-    bzero(buf2, len);
-    crc = crc32_vpmsum(crc, buf2, len);
-    free(buf2);
+    /* NULL buffer: treat it as len zero bytes, without allocating them. */
+#ifdef REFLECT
+    crc = reverse_bits(crc);
+#endif
+
+    crc = append_zeros(crc, len);
+
+#ifdef REFLECT
+    crc = reverse_bits(crc);
+#endif
   } else {
+    /* Handle the valid buffer case. */
     crc = crc32_vpmsum(crc, data, (unsigned long)len);
   }
   return crc;
diff --git a/src/common/crc32c_ppc_constants.h b/src/common/crc32c_ppc_constants.h
index 25864f1045e..12a1e1d51fa 100644
--- a/src/common/crc32c_ppc_constants.h
+++ b/src/common/crc32c_ppc_constants.h
@@ -78,6 +78,77 @@ static const unsigned int crc_table[] = {
 	0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,};
 
 #endif
+
+#ifdef FAST_ZERO_TABLE
+/* Fast zero table: append_zeros() multiplies by crc_zero[i] when bit i of the length is set. */
+static const unsigned int crc_zero[] = {
+	0x100,		/* [0]  x^8 */
+	0x10000,	/* [1]  x^16 */
+	0x1edc6f41,
+	0x3aab4576,
+	0x18571d18,
+	0x59a3508a,
+	0xaa97d41d,
+	0xe78dbf1d,
+	0x4ef6a711,
+	0x2506c32e,
+	0x68d4e827,
+	0x546ea6b0,
+	0x465cebac,
+	0x26a86214,
+	0x964aa2fd,
+	0x3b4c5747,
+	0x6702ee7f,
+	0xd086629f,
+	0xf1f2043c,
+	0xc761a1ca,
+	0xa8964e9a,
+	0x90cab2ce,
+	0xc6e3583d,
+	0x3344e0be,
+	0x7d53914b,
+	0x3d953297,
+	0xfcf2eda0,
+	0x42f878a5,
+	0x2,
+	0x4,
+	0x10,
+	0x100,		/* [31] same value as [0]; the entries repeat from here */
+	0x10000,
+	0x1edc6f41,
+	0x3aab4576,
+	0x18571d18,
+	0x59a3508a,
+	0xaa97d41d,
+	0xe78dbf1d,
+	0x4ef6a711,
+	0x2506c32e,
+	0x68d4e827,
+	0x546ea6b0,
+	0x465cebac,
+	0x26a86214,
+	0x964aa2fd,
+	0x3b4c5747,
+	0x6702ee7f,
+	0xd086629f,
+	0xf1f2043c,
+	0xc761a1ca,
+	0xa8964e9a,
+	0x90cab2ce,
+	0xc6e3583d,
+	0x3344e0be,
+	0x7d53914b,
+	0x3d953297,
+	0xfcf2eda0,
+	0x42f878a5,
+	0x2,
+	0x4,
+	0x10,
+	0x100,
+	0x10000		/* [63] last entry: one per bit of a 64-bit length */
+};
+#endif
+
 #else
 #define MAX_SIZE	32768
 .constants:
diff --git a/src/common/crc32c_ppc_fast_zero_asm.S b/src/common/crc32c_ppc_fast_zero_asm.S
new file mode 100644
index 00000000000..a53df1deead
--- /dev/null
+++ b/src/common/crc32c_ppc_fast_zero_asm.S
@@ -0,0 +1,77 @@
+/*
+ * Use the fixed point version of Barrett reduction to compute a mod n
+ * over GF(2) for given n using POWER8 instructions. We use k = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ *  a) the GNU General Public License as published by the Free Software
+ *     Foundation; either version 2 of the License, or (at your option)
+ *     any later version, or
+ *  b) the Apache License, Version 2.0
+ */
+#include <ppc-asm.h>
+#include "common/ppc-opcode.h"
+
+#undef toc
+
+#ifndef r1
+#define r1 1
+#endif
+
+#ifndef r2
+#define r2 2
+#endif
+
+	.section	.data
+.balign 16
+
+.barrett_fz_constants:
+	/* Barrett constant m = (4^32)/n */
+	.octa 0x0000000000000000000000011f91caf6	/* x^64 div p(x) */
+	/* Barrett constant n, i.e. p(x) */
+	.octa 0x0000000000000000000000011edc6f41
+
+.text
+/* unsigned int barrett_reduction(unsigned long val): val mod n over GF(2) */
+FUNC_START(barrett_reduction)
+	addis	r4,r2,.barrett_fz_constants@toc@ha
+	addi	r4,r4,.barrett_fz_constants@toc@l	/* r4 = address of the constants */
+
+	li	r5,16	/* byte offset of n, the second 16-byte constant */
+	vxor	v1,v1,v1	/* zero v1 */
+
+	/* Get a (the argument passed in r3) into v0 */
+	MTVRD(v0, r3)
+	vsldoi	v0,v1,v0,8	/* shift into bottom 64 bits, this is a */
+
+	/* Load constants */
+	lvx	v2,0,r4		/* m */
+	lvx	v3,r5,r4	/* n */
+
+	/*
+	 * Now for the actual algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v4,v0,v2)	/* ma */
+	vsldoi	v4,v1,v4,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v4,v4,v3)	/* qn */
+	vxor	v0,v0,v4	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result X into r3. We need to shift it left 8 bytes:
+	 * before: V0 [ 0 1 2 X ]
+	 * after:  V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,v1,8	/* shift result into top 64 bits of v0 */
+	MFVRD(r3, v0)
+
+	blr
+FUNC_END(barrett_reduction)
+	
diff --git a/src/common/hobject.h b/src/common/hobject.h
index 258d6a3b42b..9b3f38f4494 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -21,6 +21,8 @@
 #include "json_spirit/json_spirit_value.h"
 #include "include/assert.h"   // spirit clobbers it!
 
+#include "reverse.h"
+
 namespace ceph {
   class Formatter;
 }
@@ -197,27 +199,10 @@ public:
   }
 
   static uint32_t _reverse_bits(uint32_t v) {
-    if (v == 0)
-      return v;
-    // reverse bits
-    // swap odd and even bits
-    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
-    // swap consecutive pairs
-    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
-    // swap nibbles ...
-    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
-    // swap bytes
-    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
-    // swap 2-byte long pairs
-    v = ( v >> 16             ) | ( v               << 16);
-    return v;
+    return reverse_bits(v);  // shared implementation declared in reverse.h
   }
   static uint32_t _reverse_nibbles(uint32_t retval) {
-    // reverse nibbles
-    retval = ((retval & 0x0f0f0f0f) << 4) | ((retval & 0xf0f0f0f0) >> 4);
-    retval = ((retval & 0x00ff00ff) << 8) | ((retval & 0xff00ff00) >> 8);
-    retval = ((retval & 0x0000ffff) << 16) | ((retval & 0xffff0000) >> 16);
-    return retval;
+    return reverse_nibbles(retval);  // shared implementation declared in reverse.h
   }
 
   /**
diff --git a/src/common/reverse.c b/src/common/reverse.c
new file mode 100644
index 00000000000..f65540d54fa
--- /dev/null
+++ b/src/common/reverse.c
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "reverse.h"
+
+uint32_t reverse_bits(uint32_t v) {
+  /* Zero mirrors to zero, so there is nothing to swap. */
+  if (!v)
+    return 0;
+
+  /* Mirror the word by exchanging progressively wider groups,
+   * starting with adjacent bits. */
+  v = ((v & 0xAAAAAAAA) >> 1) | ((v & 0x55555555) << 1);
+  /* adjacent bit pairs */
+  v = ((v & 0xCCCCCCCC) >> 2) | ((v & 0x33333333) << 2);
+  /* adjacent nibbles */
+  v = ((v & 0xF0F0F0F0) >> 4) | ((v & 0x0F0F0F0F) << 4);
+  /* adjacent bytes */
+  v = ((v & 0xFF00FF00) >> 8) | ((v & 0x00FF00FF) << 8);
+  /* the two 16-bit halves */
+  v = (v << 16) | (v >> 16);
+  return v;
+}
+
+uint32_t reverse_nibbles(uint32_t retval) {
+  /* Swap the nibbles of each byte, then the bytes of each half, then the halves. */
+  retval = ((retval >> 4) & 0x0f0f0f0f) | ((retval << 4) & 0xf0f0f0f0);
+  retval = ((retval >> 8) & 0x00ff00ff) | ((retval << 8) & 0xff00ff00);
+  retval = (retval >> 16) | (retval << 16);
+  return retval;
+}
diff --git a/src/common/reverse.h b/src/common/reverse.h
new file mode 100644
index 00000000000..9a199a8472b
--- /dev/null
+++ b/src/common/reverse.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __CEPH_OS_REVERSE_H
+#define __CEPH_OS_REVERSE_H
+
+#include "include/int_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t reverse_bits(uint32_t v);            /* mirror the 32-bit value bit by bit */
+extern uint32_t reverse_nibbles(uint32_t retval);    /* reverse the order of the eight nibbles */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif    
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
index 86d9c8d229c..dd4ede666ec 100644
--- a/src/include/crc32c.h
+++ b/src/include/crc32c.h
@@ -20,7 +20,10 @@ extern ceph_crc32c_func_t ceph_choose_crc32(void);
 /**
  * calculate crc32c for data that is entirely 0 (ZERO)
  *
- * Note: works the same as \ref ceph_crc32c for data == nullptr, but faster
+ * Note: returns the same result as ceph_crc32c_func with data == nullptr,
+ * but is faster than the optimized assembly on certain architectures:
+ * it is faster than the Intel optimized assembly, but not as fast as
+ * the ppc64le optimized assembly.
  *
  * @param crc initial value
  * @param length length of buffer
@@ -39,9 +42,12 @@ uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length);
  */
 static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length)
 {
+#ifndef HAVE_POWER8 /* on POWER8 builds NULL data is left to ceph_crc32c_func */
   if (!data && length > 16)
     return ceph_crc32c_zeros(crc, length);
-	return ceph_crc32c_func(crc, data, length);
+#endif /* !HAVE_POWER8 */
+
+  return ceph_crc32c_func(crc, data, length);
 }
 
 #ifdef __cplusplus
diff --git a/src/test/common/test_crc32c.cc b/src/test/common/test_crc32c.cc
index c51006732e8..7071728bb46 100644
--- a/src/test/common/test_crc32c.cc
+++ b/src/test/common/test_crc32c.cc
@@ -319,12 +319,23 @@ TEST(Crc32c, zeros_performance_compare) {
 
     pre_start = ceph_clock_now();
     start = ceph_clock_now();
+#ifdef HAVE_POWER8
+    uint32_t crc_b = ceph_crc32c_zeros(111, size);
+#else
     uint32_t crc_b = ceph_crc32c_func(111, nullptr, size);
+#endif
     end = ceph_clock_now();
     time_adjusted = (end - start) - (start - pre_start);
+#ifdef HAVE_POWER8
+    std::cout << "ceph_crc32c_zeros method. size=" << size << " time=" 
+        << (double)(end-start) << " at " << (double)size/(1024*1024)/(time_adjusted) 
+        << " MB/sec" << " error=" << resolution / time_adjusted * 100 << "%" 
+        << std::endl;
+#else
     std::cout << "fallback method. size=" << size << " time=" << (double)(end-start)
         << " at " << (double)size/(1024*1024)/(time_adjusted) << " MB/sec"
         << " error=" << resolution / time_adjusted * 100 << "%" << std::endl;
+#endif
     EXPECT_EQ(crc_a, crc_b);
   }
 }
@@ -336,10 +347,12 @@ TEST(Crc32c, zeros_performance) {
 
   start = ceph_clock_now();
   for (size_t i=0; i<ITER; i++)
-  for (size_t scale=1; scale < 31; scale++)
   {
-    size_t size = (1<<scale) + rand() % (1<<scale);
-    ceph_crc32c(rand(), nullptr, size);
+    for (size_t scale=1; scale < 31; scale++)
+    {
+      size_t size = (1<<scale) + rand() % (1<<scale);
+      ceph_crc32c(rand(), nullptr, size);
+    }
   }
   end = ceph_clock_now();
   std::cout << "iterations="<< ITER*31 << " time=" << (double)(end-start) << std::endl;