btrfs-progs: crypto: add PCL based implementation for crc32c
Copy faster implementation of crc32c from linux kernel as of 6.5-rc7 (x86_64, arch/x86/crypto/crc32c-pcl-intel-asm_64.S). This needs assembler build support, so detect target architecture so cross-compilation still works. Add a special CPU flag so the old and new implementations can be benchmarked and verified separately. Sample benchmark: CPU flags: 0x1ff CPU features: SSE2 SSSE3 SSE41 SSE42 SHA AVX AVX2 CRC32C_PCL Block size: 4096 Iterations: 1000000 Implementation: builtin Units: CPU cycles NULL-NOP: cycles: 77177218, cycles/i 77 NULL-MEMCPY: cycles: 226313072, cycles/i 226, 62133.395 MiB/s CRC32C-ref: cycles: 24418596066, cycles/i 24418, 575.859 MiB/s CRC32C-NI: cycles: 1188335920, cycles/i 1188, 11833.073 MiB/s CRC32C-PCL: cycles: 463193456, cycles/i 463, 30358.037 MiB/s XXHASH: cycles: 851606646, cycles/i 851, 16511.916 MiB/s SHA256-ref: cycles: 74476234956, cycles/i 74476, 188.808 MiB/s SHA256-NI: cycles: 34198637428, cycles/i 34198, 411.177 MiB/s BLAKE2-ref: cycles: 14761411664, cycles/i 14761, 952.597 MiB/s BLAKE2-SSE2: cycles: 18101896796, cycles/i 18101, 776.807 MiB/s BLAKE2-SSE41: cycles: 12599091062, cycles/i 12599, 1116.087 MiB/s BLAKE2-AVX2: cycles: 9668247506, cycles/i 9668, 1454.418 MiB/s The new implementation is about 2.5x faster. Note: the new version does not work on musl because of linkage problems (relocations in .rodata), so it's still using the old implementation. Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
8461513b9a
commit
992be8b50a
17
Makefile
17
Makefile
|
@ -88,6 +88,8 @@ DISABLE_WARNING_FLAGS := $(call cc-disable-warning, format-truncation) \
|
|||
ENABLE_WARNING_FLAGS := $(call cc-option, -Wimplicit-fallthrough) \
|
||||
$(call cc-option, -Wmissing-prototypes)
|
||||
|
||||
ASFLAGS =
|
||||
|
||||
# Common build flags
|
||||
CFLAGS = $(SUBST_CFLAGS) \
|
||||
-std=gnu11 \
|
||||
|
@ -383,6 +385,14 @@ CRYPTO_OBJECTS = crypto/sha224-256.o crypto/blake2b-ref.o crypto/blake2b-sse2.o
|
|||
CRYPTO_CFLAGS = -DCRYPTOPROVIDER_BUILTIN=1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_CPU),x86_64)
|
||||
# FIXME: linkage is broken on musl for some reason
|
||||
ifeq ($(HAVE_GLIBC),1)
|
||||
CRYPTO_OBJECTS += crypto/crc32c-pcl-intel-asm_64.o
|
||||
ASFLAGS += -fPIC
|
||||
endif
|
||||
endif
|
||||
|
||||
CHECKER_FLAGS += $(btrfs_convert_cflags)
|
||||
|
||||
# collect values of the variables above
|
||||
|
@ -450,6 +460,13 @@ endif
|
|||
-MT $($(dir $@).deps/$(notdir $@):.o.d=.static.o) \
|
||||
-MT $(dir $@).deps/$(notdir $@) $(CFLAGS) $<
|
||||
|
||||
.S.o:
|
||||
@echo " [AS] $@"
|
||||
$(Q)$(CC) $(CFLAGS) $(ASFLAGS) -c $< -o $@
|
||||
|
||||
%.static.o: %.S
|
||||
@echo " [AS] $@"
|
||||
$(Q)$(CC) $(CFLAGS) $(ASFLAGS) -c $< -o $@
|
||||
#
|
||||
# Pick from per-file variables, btrfs_*_cflags
|
||||
#
|
||||
|
|
|
@ -28,6 +28,8 @@ HAVE_CFLAG_msse2 = @HAVE_CFLAG_msse2@
|
|||
HAVE_CFLAG_msse41 = @HAVE_CFLAG_msse41@
|
||||
HAVE_CFLAG_mavx2 = @HAVE_CFLAG_mavx2@
|
||||
HAVE_CFLAG_msha = @HAVE_CFLAG_msha@
|
||||
TARGET_CPU = @target_cpu@
|
||||
HAVE_GLIBC = @HAVE_GLIBC@
|
||||
|
||||
SUBST_CFLAGS = @CFLAGS@
|
||||
SUBST_LDFLAGS = @LDFLAGS@
|
||||
|
|
|
@ -58,6 +58,7 @@ void cpu_print_flags(void) {
|
|||
FLAG(SHA);
|
||||
FLAG(AVX);
|
||||
FLAG(AVX2);
|
||||
FLAG(CRC32C_PCL);
|
||||
putchar(10);
|
||||
}
|
||||
#undef FLAG
|
||||
|
@ -88,6 +89,7 @@ void cpu_detect_flags(void)
|
|||
if (b & (1UL << 29))
|
||||
__cpu_flags |= CPU_FLAG_SHA;
|
||||
|
||||
__cpu_flags |= CPU_FLAG_CRC32C_PCL;
|
||||
__cpu_flags_orig = __cpu_flags;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,6 +37,9 @@ enum cpu_feature {
|
|||
ENUM_CPU_BIT(CPU_FLAG_SHA),
|
||||
ENUM_CPU_BIT(CPU_FLAG_AVX),
|
||||
ENUM_CPU_BIT(CPU_FLAG_AVX2),
|
||||
|
||||
/* Special features */
|
||||
ENUM_CPU_BIT(CPU_FLAG_CRC32C_PCL),
|
||||
};
|
||||
|
||||
#undef ENUM_CPU_BIT
|
||||
|
|
|
@ -40,6 +40,7 @@ AC_PREFIX_DEFAULT([/usr/local])
|
|||
|
||||
AC_PROG_CC
|
||||
AC_CANONICAL_HOST
|
||||
AC_CANONICAL_TARGET
|
||||
AC_C_CONST
|
||||
AC_C_VOLATILE
|
||||
AC_C_BIGENDIAN
|
||||
|
@ -79,6 +80,9 @@ AC_CHECK_FUNCS([reallocarray])
|
|||
|
||||
AC_CHECK_FUNCS([clock_gettime])
|
||||
|
||||
AX_CHECK_DEFINE([features.h],[__GLIBC__],[HAVE_GLIBC=1],[HAVE_GLIBC=0])
|
||||
AC_SUBST([HAVE_GLIBC])
|
||||
|
||||
AX_GCC_BUILTIN([__builtin_add_overflow])
|
||||
AX_GCC_BUILTIN([__builtin_sub_overflow])
|
||||
AX_GCC_BUILTIN([__builtin_mul_overflow])
|
||||
|
|
|
@ -0,0 +1,475 @@
|
|||
/*
|
||||
* Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
|
||||
*
|
||||
* The white papers on CRC32C calculations with PCLMULQDQ instruction can be
|
||||
* downloaded from:
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
|
||||
*
|
||||
* Copyright (C) 2012 Intel Corporation.
|
||||
*
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* James Guilford <james.guilford@intel.com>
|
||||
* David Cote <david.m.cote@intel.com>
|
||||
* Tim Chen <tim.c.chen@linux.intel.com>
|
||||
*
|
||||
* This software is available to you under a choice of one of two
|
||||
* licenses. You may choose to be licensed under the terms of the GNU
|
||||
* General Public License (GPL) Version 2, available from the file
|
||||
* COPYING in the main directory of this source tree, or the
|
||||
* OpenIB.org BSD license below:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or
|
||||
* without modification, are permitted provided that the following
|
||||
* conditions are met:
|
||||
*
|
||||
* - Redistributions of source code must retain the above
|
||||
* copyright notice, this list of conditions and the following
|
||||
* disclaimer.
|
||||
*
|
||||
* - Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following
|
||||
* disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
##include "linkage.h"
|
||||
##include <asm/nospec-branch.h>
|
||||
|
||||
#define ENDBR
|
||||
|
||||
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
|
||||
|
||||
.macro LABEL prefix n
|
||||
.L\prefix\n\():
|
||||
.endm
|
||||
|
||||
.macro JMPTBL_ENTRY i
|
||||
.quad .Lcrc_\i
|
||||
.endm
|
||||
|
||||
.macro JNC_LESS_THAN j
|
||||
jnc .Lless_than_\j
|
||||
.endm
|
||||
|
||||
# Define threshold where buffers are considered "small" and routed to more
|
||||
# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
|
||||
# SMALL_SIZE can be no larger than 255.
|
||||
|
||||
#define SMALL_SIZE 200
|
||||
|
||||
.if (SMALL_SIZE > 255)
|
||||
.error "SMALL_ SIZE must be < 256"
|
||||
.endif
|
||||
|
||||
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
|
||||
|
||||
.text
|
||||
###SYM_FUNC_START(crc_pcl)
|
||||
.globl crc_pcl
|
||||
crc_pcl:
|
||||
###SYM_FUNC_START(crc_pcl)
|
||||
#define bufp rdi
|
||||
#define bufp_dw %edi
|
||||
#define bufp_w %di
|
||||
#define bufp_b %dil
|
||||
#define bufptmp %rcx
|
||||
#define block_0 %rcx
|
||||
#define block_1 %rdx
|
||||
#define block_2 %r11
|
||||
#define len %rsi
|
||||
#define len_dw %esi
|
||||
#define len_w %si
|
||||
#define len_b %sil
|
||||
#define crc_init_arg %rdx
|
||||
#define tmp %rbx
|
||||
#define crc_init %r8
|
||||
#define crc_init_dw %r8d
|
||||
#define crc1 %r9
|
||||
#define crc2 %r10
|
||||
|
||||
pushq %rbx
|
||||
pushq %rdi
|
||||
pushq %rsi
|
||||
|
||||
## Move crc_init for Linux to a different register
|
||||
mov crc_init_arg, crc_init
|
||||
|
||||
################################################################
|
||||
## 1) ALIGN:
|
||||
################################################################
|
||||
|
||||
mov %bufp, bufptmp # rdi = *buf
|
||||
neg %bufp
|
||||
and $7, %bufp # calculate the unalignment amount of
|
||||
# the address
|
||||
je .Lproc_block # Skip if aligned
|
||||
|
||||
## If len is less than 8 and we're unaligned, we need to jump
|
||||
## to special code to avoid reading beyond the end of the buffer
|
||||
cmp $8, len
|
||||
jae .Ldo_align
|
||||
# less_than_8 expects length in upper 3 bits of len_dw
|
||||
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
|
||||
shl $32-3+1, len_dw
|
||||
jmp .Lless_than_8_post_shl1
|
||||
|
||||
.Ldo_align:
|
||||
#### Calculate CRC of unaligned bytes of the buffer (if any)
|
||||
movq (bufptmp), tmp # load a quadward from the buffer
|
||||
add %bufp, bufptmp # align buffer pointer for quadword
|
||||
# processing
|
||||
sub %bufp, len # update buffer length
|
||||
.Lalign_loop:
|
||||
crc32b %bl, crc_init_dw # compute crc32 of 1-byte
|
||||
shr $8, tmp # get next byte
|
||||
dec %bufp
|
||||
jne .Lalign_loop
|
||||
|
||||
.Lproc_block:
|
||||
|
||||
################################################################
|
||||
## 2) PROCESS BLOCKS:
|
||||
################################################################
|
||||
|
||||
## compute num of bytes to be processed
|
||||
movq len, tmp # save num bytes in tmp
|
||||
|
||||
cmpq $128*24, len
|
||||
jae .Lfull_block
|
||||
|
||||
.Lcontinue_block:
|
||||
cmpq $SMALL_SIZE, len
|
||||
jb .Lsmall
|
||||
|
||||
## len < 128*24
|
||||
movq $2731, %rax # 2731 = ceil(2^16 / 24)
|
||||
mul len_dw
|
||||
shrq $16, %rax
|
||||
|
||||
## eax contains floor(bytes / 24) = num 24-byte chunks to do
|
||||
|
||||
## process rax 24-byte chunks (128 >= rax >= 0)
|
||||
|
||||
## compute end address of each block
|
||||
## block 0 (base addr + RAX * 8)
|
||||
## block 1 (base addr + RAX * 16)
|
||||
## block 2 (base addr + RAX * 24)
|
||||
lea (bufptmp, %rax, 8), block_0
|
||||
lea (block_0, %rax, 8), block_1
|
||||
lea (block_1, %rax, 8), block_2
|
||||
|
||||
xor crc1, crc1
|
||||
xor crc2, crc2
|
||||
|
||||
## branch into array
|
||||
leaq jump_table(%rip), %bufp
|
||||
mov (%bufp,%rax,8), %bufp
|
||||
## JMP_NOSPEC
|
||||
JMP *%bufp
|
||||
## JMP_NOSPEC
|
||||
|
||||
################################################################
|
||||
## 2a) PROCESS FULL BLOCKS:
|
||||
################################################################
|
||||
.Lfull_block:
|
||||
movl $128,%eax
|
||||
lea 128*8*2(block_0), block_1
|
||||
lea 128*8*3(block_0), block_2
|
||||
add $128*8*1, block_0
|
||||
|
||||
xor crc1,crc1
|
||||
xor crc2,crc2
|
||||
|
||||
# Fall through into top of crc array (crc_128)
|
||||
|
||||
################################################################
|
||||
## 3) CRC Array:
|
||||
################################################################
|
||||
|
||||
i=128
|
||||
.rept 128-1
|
||||
.altmacro
|
||||
LABEL crc_ %i
|
||||
.noaltmacro
|
||||
ENDBR
|
||||
crc32q -i*8(block_0), crc_init
|
||||
crc32q -i*8(block_1), crc1
|
||||
crc32q -i*8(block_2), crc2
|
||||
i=(i-1)
|
||||
.endr
|
||||
|
||||
.altmacro
|
||||
LABEL crc_ %i
|
||||
.noaltmacro
|
||||
ENDBR
|
||||
crc32q -i*8(block_0), crc_init
|
||||
crc32q -i*8(block_1), crc1
|
||||
# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
|
||||
|
||||
mov block_2, block_0
|
||||
|
||||
################################################################
|
||||
## 4) Combine three results:
|
||||
################################################################
|
||||
|
||||
lea (K_table-8)(%rip), %bufp # first entry is for idx 1
|
||||
shlq $3, %rax # rax *= 8
|
||||
pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
|
||||
leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
|
||||
subq %rax, tmp # tmp -= rax*24
|
||||
|
||||
movq crc_init, %xmm1 # CRC for block 1
|
||||
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
|
||||
|
||||
movq crc1, %xmm2 # CRC for block 2
|
||||
pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
|
||||
|
||||
pxor %xmm2,%xmm1
|
||||
movq %xmm1, %rax
|
||||
xor -i*8(block_2), %rax
|
||||
mov crc2, crc_init
|
||||
crc32 %rax, crc_init
|
||||
|
||||
################################################################
|
||||
## 5) Check for end:
|
||||
################################################################
|
||||
|
||||
LABEL crc_ 0
|
||||
ENDBR
|
||||
mov tmp, len
|
||||
cmp $128*24, tmp
|
||||
jae .Lfull_block
|
||||
cmp $24, tmp
|
||||
jae .Lcontinue_block
|
||||
|
||||
.Lless_than_24:
|
||||
shl $32-4, len_dw # less_than_16 expects length
|
||||
# in upper 4 bits of len_dw
|
||||
jnc .Lless_than_16
|
||||
crc32q (bufptmp), crc_init
|
||||
crc32q 8(bufptmp), crc_init
|
||||
jz .Ldo_return
|
||||
add $16, bufptmp
|
||||
# len is less than 8 if we got here
|
||||
# less_than_8 expects length in upper 3 bits of len_dw
|
||||
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
|
||||
shl $2, len_dw
|
||||
jmp .Lless_than_8_post_shl1
|
||||
|
||||
#######################################################################
|
||||
## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
|
||||
#######################################################################
|
||||
.Lsmall:
|
||||
shl $32-8, len_dw # Prepare len_dw for less_than_256
|
||||
j=256
|
||||
.rept 5 # j = {256, 128, 64, 32, 16}
|
||||
.altmacro
|
||||
LABEL less_than_ %j # less_than_j: Length should be in
|
||||
# upper lg(j) bits of len_dw
|
||||
j=(j/2)
|
||||
shl $1, len_dw # Get next MSB
|
||||
JNC_LESS_THAN %j
|
||||
.noaltmacro
|
||||
i=0
|
||||
.rept (j/8)
|
||||
crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
|
||||
i=i+8
|
||||
.endr
|
||||
jz .Ldo_return # Return if remaining length is zero
|
||||
add $j, bufptmp # Advance buf
|
||||
.endr
|
||||
|
||||
.Lless_than_8: # Length should be stored in
|
||||
# upper 3 bits of len_dw
|
||||
shl $1, len_dw
|
||||
.Lless_than_8_post_shl1:
|
||||
jnc .Lless_than_4
|
||||
crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
|
||||
jz .Ldo_return # return if remaining data is zero
|
||||
add $4, bufptmp
|
||||
.Lless_than_4: # Length should be stored in
|
||||
# upper 2 bits of len_dw
|
||||
shl $1, len_dw
|
||||
jnc .Lless_than_2
|
||||
crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
|
||||
jz .Ldo_return # return if remaining data is zero
|
||||
add $2, bufptmp
|
||||
.Lless_than_2: # Length should be stored in the MSB
|
||||
# of len_dw
|
||||
shl $1, len_dw
|
||||
jnc .Lless_than_1
|
||||
crc32b (bufptmp), crc_init_dw # CRC of 1 byte
|
||||
.Lless_than_1: # Length should be zero
|
||||
.Ldo_return:
|
||||
movq crc_init, %rax
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
popq %rbx
|
||||
RET
|
||||
###SYM_FUNC_END(crc_pcl)
|
||||
.size crc_pcl, .-crc_pcl
|
||||
###SYM_FUNC_END(crc_pcl)
|
||||
|
||||
.section .rodata, "a", @progbits
|
||||
################################################################
|
||||
## jump table Table is 129 entries x 8 bytes each
|
||||
################################################################
|
||||
.align 4
|
||||
jump_table:
|
||||
i=0
|
||||
.rept 129
|
||||
.altmacro
|
||||
JMPTBL_ENTRY %i
|
||||
.noaltmacro
|
||||
i=i+1
|
||||
.endr
|
||||
|
||||
|
||||
################################################################
|
||||
## PCLMULQDQ tables
|
||||
## Table is 128 entries x 2 words (8 bytes) each
|
||||
################################################################
|
||||
.align 8
|
||||
K_table:
|
||||
.long 0x493c7d27, 0x00000001
|
||||
.long 0xba4fc28e, 0x493c7d27
|
||||
.long 0xddc0152b, 0xf20c0dfe
|
||||
.long 0x9e4addf8, 0xba4fc28e
|
||||
.long 0x39d3b296, 0x3da6d0cb
|
||||
.long 0x0715ce53, 0xddc0152b
|
||||
.long 0x47db8317, 0x1c291d04
|
||||
.long 0x0d3b6092, 0x9e4addf8
|
||||
.long 0xc96cfdc0, 0x740eef02
|
||||
.long 0x878a92a7, 0x39d3b296
|
||||
.long 0xdaece73e, 0x083a6eec
|
||||
.long 0xab7aff2a, 0x0715ce53
|
||||
.long 0x2162d385, 0xc49f4f67
|
||||
.long 0x83348832, 0x47db8317
|
||||
.long 0x299847d5, 0x2ad91c30
|
||||
.long 0xb9e02b86, 0x0d3b6092
|
||||
.long 0x18b33a4e, 0x6992cea2
|
||||
.long 0xb6dd949b, 0xc96cfdc0
|
||||
.long 0x78d9ccb7, 0x7e908048
|
||||
.long 0xbac2fd7b, 0x878a92a7
|
||||
.long 0xa60ce07b, 0x1b3d8f29
|
||||
.long 0xce7f39f4, 0xdaece73e
|
||||
.long 0x61d82e56, 0xf1d0f55e
|
||||
.long 0xd270f1a2, 0xab7aff2a
|
||||
.long 0xc619809d, 0xa87ab8a8
|
||||
.long 0x2b3cac5d, 0x2162d385
|
||||
.long 0x65863b64, 0x8462d800
|
||||
.long 0x1b03397f, 0x83348832
|
||||
.long 0xebb883bd, 0x71d111a8
|
||||
.long 0xb3e32c28, 0x299847d5
|
||||
.long 0x064f7f26, 0xffd852c6
|
||||
.long 0xdd7e3b0c, 0xb9e02b86
|
||||
.long 0xf285651c, 0xdcb17aa4
|
||||
.long 0x10746f3c, 0x18b33a4e
|
||||
.long 0xc7a68855, 0xf37c5aee
|
||||
.long 0x271d9844, 0xb6dd949b
|
||||
.long 0x8e766a0c, 0x6051d5a2
|
||||
.long 0x93a5f730, 0x78d9ccb7
|
||||
.long 0x6cb08e5c, 0x18b0d4ff
|
||||
.long 0x6b749fb2, 0xbac2fd7b
|
||||
.long 0x1393e203, 0x21f3d99c
|
||||
.long 0xcec3662e, 0xa60ce07b
|
||||
.long 0x96c515bb, 0x8f158014
|
||||
.long 0xe6fc4e6a, 0xce7f39f4
|
||||
.long 0x8227bb8a, 0xa00457f7
|
||||
.long 0xb0cd4768, 0x61d82e56
|
||||
.long 0x39c7ff35, 0x8d6d2c43
|
||||
.long 0xd7a4825c, 0xd270f1a2
|
||||
.long 0x0ab3844b, 0x00ac29cf
|
||||
.long 0x0167d312, 0xc619809d
|
||||
.long 0xf6076544, 0xe9adf796
|
||||
.long 0x26f6a60a, 0x2b3cac5d
|
||||
.long 0xa741c1bf, 0x96638b34
|
||||
.long 0x98d8d9cb, 0x65863b64
|
||||
.long 0x49c3cc9c, 0xe0e9f351
|
||||
.long 0x68bce87a, 0x1b03397f
|
||||
.long 0x57a3d037, 0x9af01f2d
|
||||
.long 0x6956fc3b, 0xebb883bd
|
||||
.long 0x42d98888, 0x2cff42cf
|
||||
.long 0x3771e98f, 0xb3e32c28
|
||||
.long 0xb42ae3d9, 0x88f25a3a
|
||||
.long 0x2178513a, 0x064f7f26
|
||||
.long 0xe0ac139e, 0x4e36f0b0
|
||||
.long 0x170076fa, 0xdd7e3b0c
|
||||
.long 0x444dd413, 0xbd6f81f8
|
||||
.long 0x6f345e45, 0xf285651c
|
||||
.long 0x41d17b64, 0x91c9bd4b
|
||||
.long 0xff0dba97, 0x10746f3c
|
||||
.long 0xa2b73df1, 0x885f087b
|
||||
.long 0xf872e54c, 0xc7a68855
|
||||
.long 0x1e41e9fc, 0x4c144932
|
||||
.long 0x86d8e4d2, 0x271d9844
|
||||
.long 0x651bd98b, 0x52148f02
|
||||
.long 0x5bb8f1bc, 0x8e766a0c
|
||||
.long 0xa90fd27a, 0xa3c6f37a
|
||||
.long 0xb3af077a, 0x93a5f730
|
||||
.long 0x4984d782, 0xd7c0557f
|
||||
.long 0xca6ef3ac, 0x6cb08e5c
|
||||
.long 0x234e0b26, 0x63ded06a
|
||||
.long 0xdd66cbbb, 0x6b749fb2
|
||||
.long 0x4597456a, 0x4d56973c
|
||||
.long 0xe9e28eb4, 0x1393e203
|
||||
.long 0x7b3ff57a, 0x9669c9df
|
||||
.long 0xc9c8b782, 0xcec3662e
|
||||
.long 0x3f70cc6f, 0xe417f38a
|
||||
.long 0x93e106a4, 0x96c515bb
|
||||
.long 0x62ec6c6d, 0x4b9e0f71
|
||||
.long 0xd813b325, 0xe6fc4e6a
|
||||
.long 0x0df04680, 0xd104b8fc
|
||||
.long 0x2342001e, 0x8227bb8a
|
||||
.long 0x0a2a8d7e, 0x5b397730
|
||||
.long 0x6d9a4957, 0xb0cd4768
|
||||
.long 0xe8b6368b, 0xe78eb416
|
||||
.long 0xd2c3ed1a, 0x39c7ff35
|
||||
.long 0x995a5724, 0x61ff0e01
|
||||
.long 0x9ef68d35, 0xd7a4825c
|
||||
.long 0x0c139b31, 0x8d96551c
|
||||
.long 0xf2271e60, 0x0ab3844b
|
||||
.long 0x0b0bf8ca, 0x0bf80dd2
|
||||
.long 0x2664fd8b, 0x0167d312
|
||||
.long 0xed64812d, 0x8821abed
|
||||
.long 0x02ee03b2, 0xf6076544
|
||||
.long 0x8604ae0f, 0x6a45d2b2
|
||||
.long 0x363bd6b3, 0x26f6a60a
|
||||
.long 0x135c83fd, 0xd8d26619
|
||||
.long 0x5fabe670, 0xa741c1bf
|
||||
.long 0x35ec3279, 0xde87806c
|
||||
.long 0x00bcf5f6, 0x98d8d9cb
|
||||
.long 0x8ae00689, 0x14338754
|
||||
.long 0x17f27698, 0x49c3cc9c
|
||||
.long 0x58ca5f00, 0x5bd2011f
|
||||
.long 0xaa7c7ad5, 0x68bce87a
|
||||
.long 0xb5cfca28, 0xdd07448e
|
||||
.long 0xded288f8, 0x57a3d037
|
||||
.long 0x59f229bc, 0xdde8f5b9
|
||||
.long 0x6d390dec, 0x6956fc3b
|
||||
.long 0x37170390, 0xa3e3e02c
|
||||
.long 0x6353c1cc, 0x42d98888
|
||||
.long 0xc4584f5c, 0xd73c7bea
|
||||
.long 0xf48642e9, 0x3771e98f
|
||||
.long 0x531377e2, 0x80ff0093
|
||||
.long 0xdd35bc8d, 0xb42ae3d9
|
||||
.long 0xb25b29f2, 0x8fe4c34d
|
||||
.long 0x9a5ede41, 0x2178513a
|
||||
.long 0xa563905d, 0xdf99fc11
|
||||
.long 0x45cddf4e, 0xe0ac139e
|
||||
.long 0xacfa3103, 0x6c23e841
|
||||
.long 0xa51b6135, 0x170076fa
|
||||
|
||||
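## Empty .note.GNU-stack section: marks this object as not needing an
## executable stack, which avoids the linker warning about a missing note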
|
||||
.section .note.GNU-stack,"",@progbits
|
|
@ -9,6 +9,7 @@
|
|||
*/
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include "crypto/crc32c.h"
|
||||
#include "common/cpu-utils.h"
|
||||
|
||||
|
@ -17,6 +18,15 @@ static uint32_t (*crc_function)(uint32_t crc, unsigned char const *data, uint32_
|
|||
|
||||
#ifdef __x86_64__
|
||||
|
||||
#ifdef __GLIBC__
|
||||
|
||||
/* asmlinkage */ unsigned int crc_pcl(const unsigned char *buffer, int len, unsigned int crc_init);
|
||||
static unsigned int crc32c_pcl(uint32_t crc, unsigned char const *data, uint32_t len) {
|
||||
return crc_pcl(data, len, crc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
* Based on a posting to lkml by Austin Zhang <austin.zhang@intel.com>
|
||||
*
|
||||
|
@ -76,13 +86,28 @@ static uint32_t crc32c_intel(uint32_t crc, unsigned char const *data, uint32_t l
|
|||
return crc;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void crc32c_init_accel(void)
|
||||
{
|
||||
/* CRC32 is in SSE4.2 */
|
||||
if (cpu_has_feature(CPU_FLAG_SSE42))
|
||||
/*
|
||||
* Musl reports a problem with linkage, use the old implementation for
|
||||
* now.
|
||||
*/
|
||||
if (0) {
|
||||
#ifdef __GLIBC__
|
||||
} else if (cpu_has_feature(CPU_FLAG_CRC32C_PCL)) {
|
||||
/* printf("CRC32C: pcl\n"); */
|
||||
crc_function = crc32c_pcl;
|
||||
#else
|
||||
} else if (cpu_has_feature(CPU_FLAG_SSE42)) {
|
||||
/* printf("CRC32c: intel\n"); */
|
||||
crc_function = crc32c_intel;
|
||||
else
|
||||
#endif
|
||||
} else {
|
||||
/* printf("CRC32c: fallback\n"); */
|
||||
crc_function = __crc32c_le;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
|
|
@ -191,6 +191,8 @@ int main(int argc, char **argv) {
|
|||
.cpu_flag = CPU_FLAG_NONE },
|
||||
{ .name = "CRC32C-NI", .digest = hash_crc32c, .digest_size = 4,
|
||||
.cpu_flag = CPU_FLAG_SSE42 },
|
||||
{ .name = "CRC32C-PCL", .digest = hash_crc32c, .digest_size = 4,
|
||||
.cpu_flag = CPU_FLAG_CRC32C_PCL },
|
||||
{ .name = "XXHASH", .digest = hash_xxhash, .digest_size = 8 },
|
||||
{ .name = "SHA256-ref", .digest = hash_sha256, .digest_size = 32,
|
||||
.cpu_flag = CPU_FLAG_NONE, .backend = CRYPTOPROVIDER_BUILTIN + 1 },
|
||||
|
@ -282,7 +284,7 @@ int main(int argc, char **argv) {
|
|||
u64 total = 0;
|
||||
|
||||
if (c->cpu_flag != 0 && !cpu_has_feature(c->cpu_flag)) {
|
||||
printf("%12s: no CPU support\n", c->name);
|
||||
printf("%14s: no CPU support\n", c->name);
|
||||
continue;
|
||||
}
|
||||
/* Backend not compiled in */
|
||||
|
|
|
@ -444,6 +444,13 @@ static const struct hash_testspec test_spec[] = {
|
|||
.count = ARRAY_SIZE(crc32c_tv),
|
||||
.cpu_flag = CPU_FLAG_SSE42,
|
||||
.hash = hash_crc32c
|
||||
}, {
|
||||
.name = "CRC32C-PCL",
|
||||
.digest_size = 4,
|
||||
.testvec = crc32c_tv,
|
||||
.count = ARRAY_SIZE(crc32c_tv),
|
||||
.cpu_flag = CPU_FLAG_CRC32C_PCL,
|
||||
.hash = hash_crc32c
|
||||
}, {
|
||||
.name = "XXHASH",
|
||||
.digest_size = 8,
|
||||
|
|
Loading…
Reference in New Issue