mirror of
https://github.com/kdave/btrfs-progs
synced 2024-12-17 20:05:24 +00:00
btrfs-progs: crypto: add AVX2 implementation of BLAKE2
Copy AVX2 implementation from https://github.com/sneves/blake2-avx2 . Though this is marked experimental, libsodium uses this version. Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
d61739003d
commit
23cb9771bc
3
Makefile
3
Makefile
@ -132,6 +132,7 @@ CRYPTO_OBJECTS =
|
||||
ifeq ($(shell uname -m),x86_64)
|
||||
crypto_blake2b_sse2_cflags = -msse2
|
||||
crypto_blake2b_sse41_cflags = -msse4.1
|
||||
crypto_blake2b_avx2_cflags = -mavx2
|
||||
endif
|
||||
|
||||
LIBS = $(LIBS_BASE) $(LIBS_CRYPTO)
|
||||
@ -353,7 +354,7 @@ cmds_restore_cflags = -DCOMPRESSION_LZO=$(COMPRESSION_LZO) -DCOMPRESSION_ZSTD=$(
|
||||
|
||||
ifeq ($(CRYPTOPROVIDER_BUILTIN),1)
|
||||
CRYPTO_OBJECTS = crypto/sha224-256.o crypto/blake2b-ref.o crypto/blake2b-sse2.o \
|
||||
crypto/blake2b-sse41.o
|
||||
crypto/blake2b-sse41.o crypto/blake2b-avx2.o
|
||||
CRYPTO_CFLAGS = -DCRYPTOPROVIDER_BUILTIN=1
|
||||
endif
|
||||
|
||||
|
@ -32,6 +32,10 @@
|
||||
#define HAVE_AVX
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define HAVE_AVX2
|
||||
#endif
|
||||
|
||||
#if defined(__XOP__)
|
||||
#define HAVE_XOP
|
||||
#endif
|
||||
|
233
crypto/blake2b-avx2.c
Normal file
233
crypto/blake2b-avx2.c
Normal file
@ -0,0 +1,233 @@
|
||||
/*
|
||||
* Creative Commons Legal Code
|
||||
*
|
||||
* CC0 1.0 Universal
|
||||
*
|
||||
* CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
|
||||
* LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
|
||||
* ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
|
||||
* INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
|
||||
* REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
|
||||
* PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
|
||||
* THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
|
||||
* HEREUNDER.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "blake2.h"
|
||||
#include "blake2-impl.h"
|
||||
#include "blake2-config.h"
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
|
||||
/* Shuffles seem to give the best performance compared to gahter and simple */
|
||||
#define PERMUTE_WITH_SHUFFLES
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define LOAD128(p) _mm_load_si128( (__m128i *)(p) )
|
||||
#define STORE128(p,r) _mm_store_si128((__m128i *)(p), r)
|
||||
|
||||
#define LOADU128(p) _mm_loadu_si128( (__m128i *)(p) )
|
||||
#define STOREU128(p,r) _mm_storeu_si128((__m128i *)(p), r)
|
||||
|
||||
#define LOAD(p) _mm256_load_si256( (__m256i *)(p) )
|
||||
#define STORE(p,r) _mm256_store_si256((__m256i *)(p), r)
|
||||
|
||||
#define LOADU(p) _mm256_loadu_si256( (__m256i *)(p) )
|
||||
#define STOREU(p,r) _mm256_storeu_si256((__m256i *)(p), r)
|
||||
|
||||
static BLAKE2_INLINE uint64_t LOADU64(void const * p) {
|
||||
uint64_t v;
|
||||
memcpy(&v, p, sizeof v);
|
||||
return v;
|
||||
}
|
||||
|
||||
#define ROTATE16 _mm256_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, \
|
||||
2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 )
|
||||
|
||||
#define ROTATE24 _mm256_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, \
|
||||
3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 )
|
||||
|
||||
#define ADD(a, b) _mm256_add_epi64(a, b)
|
||||
#define SUB(a, b) _mm256_sub_epi64(a, b)
|
||||
|
||||
#define XOR(a, b) _mm256_xor_si256(a, b)
|
||||
#define AND(a, b) _mm256_and_si256(a, b)
|
||||
#define OR(a, b) _mm256_or_si256(a, b)
|
||||
|
||||
#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
|
||||
#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
|
||||
#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
|
||||
#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
|
||||
|
||||
#define ALIGN(x) __attribute__((aligned(x)))
|
||||
|
||||
ALIGN(64) static const uint64_t blake2b_IV[8] = {
|
||||
UINT64_C(0x6A09E667F3BCC908), UINT64_C(0xBB67AE8584CAA73B),
|
||||
UINT64_C(0x3C6EF372FE94F82B), UINT64_C(0xA54FF53A5F1D36F1),
|
||||
UINT64_C(0x510E527FADE682D1), UINT64_C(0x9B05688C2B3E6C1F),
|
||||
UINT64_C(0x1F83D9ABFB41BD6B), UINT64_C(0x5BE0CD19137E2179),
|
||||
};
|
||||
|
||||
#define BLAKE2B_G1_V1(a, b, c, d, m) do { \
|
||||
a = ADD(a, m); \
|
||||
a = ADD(a, b); d = XOR(d, a); d = ROT32(d); \
|
||||
c = ADD(c, d); b = XOR(b, c); b = ROT24(b); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_G2_V1(a, b, c, d, m) do { \
|
||||
a = ADD(a, m); \
|
||||
a = ADD(a, b); d = XOR(d, a); d = ROT16(d); \
|
||||
c = ADD(c, d); b = XOR(b, c); b = ROT63(b); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_DIAG_V1(a, b, c, d) do { \
|
||||
a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(2,1,0,3)); \
|
||||
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(1,0,3,2)); \
|
||||
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(0,3,2,1)); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_UNDIAG_V1(a, b, c, d) do { \
|
||||
a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(0,3,2,1)); \
|
||||
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(1,0,3,2)); \
|
||||
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(2,1,0,3)); \
|
||||
} while(0)
|
||||
|
||||
#if defined(PERMUTE_WITH_SHUFFLES)
|
||||
#include "blake2b-load-avx2.h"
|
||||
#elif defined(PERMUTE_WITH_GATHER)
|
||||
#else
|
||||
#include "blake2b-load-avx2-simple.h"
|
||||
#endif
|
||||
|
||||
#if defined(PERMUTE_WITH_GATHER)
|
||||
ALIGN(64) static const uint32_t indices[12][16] = {
|
||||
{ 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13},
|
||||
{14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7},
|
||||
{11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1},
|
||||
{ 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0},
|
||||
{ 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8},
|
||||
{ 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14},
|
||||
{12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2},
|
||||
{13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6},
|
||||
{ 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4},
|
||||
{10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12},
|
||||
{ 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13},
|
||||
{14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7},
|
||||
};
|
||||
|
||||
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
|
||||
__m256i b0; \
|
||||
b0 = _mm256_i32gather_epi64((void *)(m), LOAD128(&indices[r][ 0]), 8); \
|
||||
BLAKE2B_G1_V1(a, b, c, d, b0); \
|
||||
b0 = _mm256_i32gather_epi64((void *)(m), LOAD128(&indices[r][ 4]), 8); \
|
||||
BLAKE2B_G2_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_DIAG_V1(a, b, c, d); \
|
||||
b0 = _mm256_i32gather_epi64((void *)(m), LOAD128(&indices[r][ 8]), 8); \
|
||||
BLAKE2B_G1_V1(a, b, c, d, b0); \
|
||||
b0 = _mm256_i32gather_epi64((void *)(m), LOAD128(&indices[r][12]), 8); \
|
||||
BLAKE2B_G2_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_UNDIAG_V1(a, b, c, d); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \
|
||||
int i; \
|
||||
for(i = 0; i < 12; ++i) { \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, i, m); \
|
||||
} \
|
||||
} while(0)
|
||||
#else /* !PERMUTE_WITH_GATHER */
|
||||
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
|
||||
__m256i b0; \
|
||||
BLAKE2B_LOAD_MSG_ ##r ##_1(b0); \
|
||||
BLAKE2B_G1_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_LOAD_MSG_ ##r ##_2(b0); \
|
||||
BLAKE2B_G2_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_DIAG_V1(a, b, c, d); \
|
||||
BLAKE2B_LOAD_MSG_ ##r ##_3(b0); \
|
||||
BLAKE2B_G1_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_LOAD_MSG_ ##r ##_4(b0); \
|
||||
BLAKE2B_G2_V1(a, b, c, d, b0); \
|
||||
BLAKE2B_UNDIAG_V1(a, b, c, d); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
|
||||
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
#if defined(PERMUTE_WITH_GATHER)
|
||||
#define DECLARE_MESSAGE_WORDS(m)
|
||||
#elif defined(PERMUTE_WITH_SHUFFLES)
|
||||
#define DECLARE_MESSAGE_WORDS(m) \
|
||||
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
|
||||
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
|
||||
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
|
||||
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
|
||||
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
|
||||
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
|
||||
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
|
||||
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
|
||||
__m256i t0, t1;
|
||||
#else
|
||||
#define DECLARE_MESSAGE_WORDS(m) \
|
||||
const uint64_t m0 = LOADU64((m) + 0); \
|
||||
const uint64_t m1 = LOADU64((m) + 8); \
|
||||
const uint64_t m2 = LOADU64((m) + 16); \
|
||||
const uint64_t m3 = LOADU64((m) + 24); \
|
||||
const uint64_t m4 = LOADU64((m) + 32); \
|
||||
const uint64_t m5 = LOADU64((m) + 40); \
|
||||
const uint64_t m6 = LOADU64((m) + 48); \
|
||||
const uint64_t m7 = LOADU64((m) + 56); \
|
||||
const uint64_t m8 = LOADU64((m) + 64); \
|
||||
const uint64_t m9 = LOADU64((m) + 72); \
|
||||
const uint64_t m10 = LOADU64((m) + 80); \
|
||||
const uint64_t m11 = LOADU64((m) + 88); \
|
||||
const uint64_t m12 = LOADU64((m) + 96); \
|
||||
const uint64_t m13 = LOADU64((m) + 104); \
|
||||
const uint64_t m14 = LOADU64((m) + 112); \
|
||||
const uint64_t m15 = LOADU64((m) + 120);
|
||||
#endif
|
||||
|
||||
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) do { \
|
||||
DECLARE_MESSAGE_WORDS(m) \
|
||||
const __m256i iv0 = a; \
|
||||
const __m256i iv1 = b; \
|
||||
__m256i c = LOAD(&blake2b_IV[0]); \
|
||||
__m256i d = XOR( \
|
||||
LOAD(&blake2b_IV[4]), \
|
||||
_mm256_set_epi64x(f1, f0, t1, t0) \
|
||||
); \
|
||||
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
|
||||
a = XOR(a, c); \
|
||||
b = XOR(b, d); \
|
||||
a = XOR(a, iv0); \
|
||||
b = XOR(b, iv1); \
|
||||
} while(0)
|
||||
|
||||
void blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
|
||||
{
|
||||
__m256i a = LOADU(&S->h[0]);
|
||||
__m256i b = LOADU(&S->h[4]);
|
||||
BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]);
|
||||
STOREU(&S->h[0], a);
|
||||
STOREU(&S->h[4], b);
|
||||
}
|
||||
|
||||
#endif
|
54
crypto/blake2b-load-avx2-simple.h
Normal file
54
crypto/blake2b-load-avx2-simple.h
Normal file
@ -0,0 +1,54 @@
|
||||
#ifndef BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_SIMPLE_H
|
||||
#define BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_SIMPLE_H
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_1(b0) b0 = _mm256_set_epi64x(m6, m4, m2, m0);
|
||||
#define BLAKE2B_LOAD_MSG_0_2(b0) b0 = _mm256_set_epi64x(m7, m5, m3, m1);
|
||||
#define BLAKE2B_LOAD_MSG_0_3(b0) b0 = _mm256_set_epi64x(m12, m10, m8, m14);
|
||||
#define BLAKE2B_LOAD_MSG_0_4(b0) b0 = _mm256_set_epi64x(m13, m11, m9, m15);
|
||||
#define BLAKE2B_LOAD_MSG_1_1(b0) b0 = _mm256_set_epi64x(m13, m9, m4, m14);
|
||||
#define BLAKE2B_LOAD_MSG_1_2(b0) b0 = _mm256_set_epi64x(m6, m15, m8, m10);
|
||||
#define BLAKE2B_LOAD_MSG_1_3(b0) b0 = _mm256_set_epi64x(m11, m0, m1, m5);
|
||||
#define BLAKE2B_LOAD_MSG_1_4(b0) b0 = _mm256_set_epi64x(m7, m2, m12, m3);
|
||||
#define BLAKE2B_LOAD_MSG_2_1(b0) b0 = _mm256_set_epi64x(m15, m5, m12, m11);
|
||||
#define BLAKE2B_LOAD_MSG_2_2(b0) b0 = _mm256_set_epi64x(m13, m2, m0, m8);
|
||||
#define BLAKE2B_LOAD_MSG_2_3(b0) b0 = _mm256_set_epi64x(m7, m3, m10, m9);
|
||||
#define BLAKE2B_LOAD_MSG_2_4(b0) b0 = _mm256_set_epi64x(m1, m6, m14, m4);
|
||||
#define BLAKE2B_LOAD_MSG_3_1(b0) b0 = _mm256_set_epi64x(m11, m13, m3, m7);
|
||||
#define BLAKE2B_LOAD_MSG_3_2(b0) b0 = _mm256_set_epi64x(m14, m12, m1, m9);
|
||||
#define BLAKE2B_LOAD_MSG_3_3(b0) b0 = _mm256_set_epi64x(m4, m5, m2, m15);
|
||||
#define BLAKE2B_LOAD_MSG_3_4(b0) b0 = _mm256_set_epi64x(m0, m10, m6, m8);
|
||||
#define BLAKE2B_LOAD_MSG_4_1(b0) b0 = _mm256_set_epi64x(m10, m2, m5, m9);
|
||||
#define BLAKE2B_LOAD_MSG_4_2(b0) b0 = _mm256_set_epi64x(m15, m4, m7, m0);
|
||||
#define BLAKE2B_LOAD_MSG_4_3(b0) b0 = _mm256_set_epi64x(m6, m11, m14, m3);
|
||||
#define BLAKE2B_LOAD_MSG_4_4(b0) b0 = _mm256_set_epi64x(m8, m12, m1, m13);
|
||||
#define BLAKE2B_LOAD_MSG_5_1(b0) b0 = _mm256_set_epi64x(m8, m0, m6, m2);
|
||||
#define BLAKE2B_LOAD_MSG_5_2(b0) b0 = _mm256_set_epi64x(m3, m11, m10, m12);
|
||||
#define BLAKE2B_LOAD_MSG_5_3(b0) b0 = _mm256_set_epi64x(m15, m7, m4, m1);
|
||||
#define BLAKE2B_LOAD_MSG_5_4(b0) b0 = _mm256_set_epi64x(m14, m5, m13, m9);
|
||||
#define BLAKE2B_LOAD_MSG_6_1(b0) b0 = _mm256_set_epi64x(m4, m14, m1, m12);
|
||||
#define BLAKE2B_LOAD_MSG_6_2(b0) b0 = _mm256_set_epi64x(m10, m13, m15, m5);
|
||||
#define BLAKE2B_LOAD_MSG_6_3(b0) b0 = _mm256_set_epi64x(m9, m6, m0, m8);
|
||||
#define BLAKE2B_LOAD_MSG_6_4(b0) b0 = _mm256_set_epi64x(m2, m3, m7, m11);
|
||||
#define BLAKE2B_LOAD_MSG_7_1(b0) b0 = _mm256_set_epi64x(m3, m12, m7, m13);
|
||||
#define BLAKE2B_LOAD_MSG_7_2(b0) b0 = _mm256_set_epi64x(m9, m1, m14, m11);
|
||||
#define BLAKE2B_LOAD_MSG_7_3(b0) b0 = _mm256_set_epi64x(m8, m15, m5, m2);
|
||||
#define BLAKE2B_LOAD_MSG_7_4(b0) b0 = _mm256_set_epi64x(m6, m4, m0, m10);
|
||||
#define BLAKE2B_LOAD_MSG_8_1(b0) b0 = _mm256_set_epi64x(m0, m11, m14, m6);
|
||||
#define BLAKE2B_LOAD_MSG_8_2(b0) b0 = _mm256_set_epi64x(m8, m3, m9, m15);
|
||||
#define BLAKE2B_LOAD_MSG_8_3(b0) b0 = _mm256_set_epi64x(m1, m13, m12, m10);
|
||||
#define BLAKE2B_LOAD_MSG_8_4(b0) b0 = _mm256_set_epi64x(m4, m7, m2, m5);
|
||||
#define BLAKE2B_LOAD_MSG_9_1(b0) b0 = _mm256_set_epi64x(m1, m7, m8, m10);
|
||||
#define BLAKE2B_LOAD_MSG_9_2(b0) b0 = _mm256_set_epi64x(m5, m6, m4, m2);
|
||||
#define BLAKE2B_LOAD_MSG_9_3(b0) b0 = _mm256_set_epi64x(m3, m9, m15, m13);
|
||||
#define BLAKE2B_LOAD_MSG_9_4(b0) b0 = _mm256_set_epi64x(m12, m14, m11, m0);
|
||||
#define BLAKE2B_LOAD_MSG_10_1(b0) b0 = _mm256_set_epi64x(m6, m4, m2, m0);
|
||||
#define BLAKE2B_LOAD_MSG_10_2(b0) b0 = _mm256_set_epi64x(m7, m5, m3, m1);
|
||||
#define BLAKE2B_LOAD_MSG_10_3(b0) b0 = _mm256_set_epi64x(m12, m10, m8, m14);
|
||||
#define BLAKE2B_LOAD_MSG_10_4(b0) b0 = _mm256_set_epi64x(m13, m11, m9, m15);
|
||||
#define BLAKE2B_LOAD_MSG_11_1(b0) b0 = _mm256_set_epi64x(m13, m9, m4, m14);
|
||||
#define BLAKE2B_LOAD_MSG_11_2(b0) b0 = _mm256_set_epi64x(m6, m15, m8, m10);
|
||||
#define BLAKE2B_LOAD_MSG_11_3(b0) b0 = _mm256_set_epi64x(m11, m0, m1, m5);
|
||||
#define BLAKE2B_LOAD_MSG_11_4(b0) b0 = _mm256_set_epi64x(m7, m2, m12, m3);
|
||||
|
||||
#endif
|
||||
|
340
crypto/blake2b-load-avx2.h
Normal file
340
crypto/blake2b-load-avx2.h
Normal file
@ -0,0 +1,340 @@
|
||||
#ifndef BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_H
|
||||
#define BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_H
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_1(b0) do { \
|
||||
t0 = _mm256_unpacklo_epi64(m0, m1); \
|
||||
t1 = _mm256_unpacklo_epi64(m2, m3); \
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m0, m1);\
|
||||
t1 = _mm256_unpackhi_epi64(m2, m3);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m7, m4);\
|
||||
t1 = _mm256_unpacklo_epi64(m5, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_0_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m7, m4);\
|
||||
t1 = _mm256_unpackhi_epi64(m5, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_1_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m7, m2);\
|
||||
t1 = _mm256_unpackhi_epi64(m4, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_1_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m5, m4);\
|
||||
t1 = _mm256_alignr_epi8(m3, m7, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_1_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m2, m0);\
|
||||
t1 = _mm256_blend_epi32(m5, m0, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_1_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m6, m1, 8);\
|
||||
t1 = _mm256_blend_epi32(m3, m1, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_2_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m6, m5, 8);\
|
||||
t1 = _mm256_unpackhi_epi64(m2, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_2_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m4, m0);\
|
||||
t1 = _mm256_blend_epi32(m6, m1, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_2_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m5, m4, 8);\
|
||||
t1 = _mm256_unpackhi_epi64(m1, m3);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_2_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m2, m7);\
|
||||
t1 = _mm256_blend_epi32(m0, m3, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_3_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m3, m1);\
|
||||
t1 = _mm256_unpackhi_epi64(m6, m5);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_3_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m4, m0);\
|
||||
t1 = _mm256_unpacklo_epi64(m6, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_3_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m1, m7, 8);\
|
||||
t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE(1,0,3,2));\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_3_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m4, m3);\
|
||||
t1 = _mm256_unpacklo_epi64(m5, m0);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_4_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m4, m2);\
|
||||
t1 = _mm256_unpacklo_epi64(m1, m5);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_4_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m3, m0, 0x33);\
|
||||
t1 = _mm256_blend_epi32(m7, m2, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_4_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m7, m1, 8);\
|
||||
t1 = _mm256_alignr_epi8(m3, m5, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_4_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m6, m0);\
|
||||
t1 = _mm256_unpacklo_epi64(m6, m4);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_5_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m1, m3);\
|
||||
t1 = _mm256_unpacklo_epi64(m0, m4);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_5_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m6, m5);\
|
||||
t1 = _mm256_unpackhi_epi64(m5, m1);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_5_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m2, m0, 8);\
|
||||
t1 = _mm256_unpackhi_epi64(m3, m7);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_5_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m4, m6);\
|
||||
t1 = _mm256_alignr_epi8(m7, m2, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_6_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m0, m6, 0x33);\
|
||||
t1 = _mm256_unpacklo_epi64(m7, m2);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_6_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m2, m7);\
|
||||
t1 = _mm256_alignr_epi8(m5, m6, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_6_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m4, m0);\
|
||||
t1 = _mm256_blend_epi32(m4, m3, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_6_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m5, m3);\
|
||||
t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE(1,0,3,2));\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_7_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m6, m3);\
|
||||
t1 = _mm256_blend_epi32(m1, m6, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_7_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m7, m5, 8);\
|
||||
t1 = _mm256_unpackhi_epi64(m0, m4);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_7_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m2, m1, 0x33);\
|
||||
t1 = _mm256_alignr_epi8(m4, m7, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_7_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m5, m0);\
|
||||
t1 = _mm256_unpacklo_epi64(m2, m3);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_8_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m3, m7);\
|
||||
t1 = _mm256_alignr_epi8(m0, m5, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_8_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m7, m4);\
|
||||
t1 = _mm256_alignr_epi8(m4, m1, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_8_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m5, m6);\
|
||||
t1 = _mm256_unpackhi_epi64(m6, m0);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_8_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m1, m2, 8);\
|
||||
t1 = _mm256_alignr_epi8(m2, m3, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_9_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m5, m4);\
|
||||
t1 = _mm256_unpackhi_epi64(m3, m0);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_9_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m1, m2);\
|
||||
t1 = _mm256_blend_epi32(m2, m3, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_9_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m6, m7);\
|
||||
t1 = _mm256_unpackhi_epi64(m4, m1);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_9_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_blend_epi32(m5, m0, 0x33);\
|
||||
t1 = _mm256_unpacklo_epi64(m7, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_10_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m0, m1);\
|
||||
t1 = _mm256_unpacklo_epi64(m2, m3);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_10_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m0, m1);\
|
||||
t1 = _mm256_unpackhi_epi64(m2, m3);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_10_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m7, m4);\
|
||||
t1 = _mm256_unpacklo_epi64(m5, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_10_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m7, m4);\
|
||||
t1 = _mm256_unpackhi_epi64(m5, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_11_1(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m7, m2);\
|
||||
t1 = _mm256_unpackhi_epi64(m4, m6);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_11_2(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpacklo_epi64(m5, m4);\
|
||||
t1 = _mm256_alignr_epi8(m3, m7, 8);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_11_3(b0) \
|
||||
do { \
|
||||
t0 = _mm256_unpackhi_epi64(m2, m0);\
|
||||
t1 = _mm256_blend_epi32(m5, m0, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#define BLAKE2B_LOAD_MSG_11_4(b0) \
|
||||
do { \
|
||||
t0 = _mm256_alignr_epi8(m6, m1, 8);\
|
||||
t1 = _mm256_blend_epi32(m3, m1, 0x33);\
|
||||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
@ -222,9 +222,14 @@ static void blake2b_compress_ref( blake2b_state *S, const uint8_t block[BLAKE2B_
|
||||
|
||||
void blake2b_compress_sse2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
|
||||
void blake2b_compress_sse41( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
|
||||
void blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
|
||||
|
||||
static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
|
||||
{
|
||||
#if HAVE_AVX2
|
||||
if (cpu_has_feature(CPU_FLAG_AVX2))
|
||||
return blake2b_compress_avx2(S, block);
|
||||
#endif
|
||||
#if HAVE_SSE41
|
||||
if (cpu_has_feature(CPU_FLAG_SSE41))
|
||||
return blake2b_compress_sse41(S, block);
|
||||
|
Loading…
Reference in New Issue
Block a user