From 7e4fe5162ab94a413e04caae19193c5e7a4c6478 Mon Sep 17 00:00:00 2001 From: James Almer Date: Mon, 9 Sep 2013 20:16:39 -0300 Subject: [PATCH] lavu/sha: Fully unroll the transform function loops crypto_bench SHA-1 and SHA-256 results using an AMD Athlon X2 7750+, mingw32-w64 GCC 4.7.3 x86_64 Before: lavu SHA-1 size: 1048576 runs: 1024 time: 9.012 +- 0.162 lavu SHA-256 size: 1048576 runs: 1024 time: 19.625 +- 0.173 After: lavu SHA-1 size: 1048576 runs: 1024 time: 7.948 +- 0.154 lavu SHA-256 size: 1048576 runs: 1024 time: 17.841 +- 0.170 Signed-off-by: James Almer Signed-off-by: Michael Niedermayer --- libavutil/sha.c | 115 ++++++++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 48 deletions(-) diff --git a/libavutil/sha.c b/libavutil/sha.c index 8c4f8a0e43..0cf94245a9 100644 --- a/libavutil/sha.c +++ b/libavutil/sha.c @@ -98,39 +98,53 @@ static void sha1_transform(uint32_t state[5], const uint8_t buffer[64]) a = t; } #else - for (i = 0; i < 15; i += 5) { - R0(a, b, c, d, e, 0 + i); - R0(e, a, b, c, d, 1 + i); - R0(d, e, a, b, c, 2 + i); - R0(c, d, e, a, b, 3 + i); - R0(b, c, d, e, a, 4 + i); - } + +#define R1_0 \ + R0(a, b, c, d, e, 0 + i); \ + R0(e, a, b, c, d, 1 + i); \ + R0(d, e, a, b, c, 2 + i); \ + R0(c, d, e, a, b, 3 + i); \ + R0(b, c, d, e, a, 4 + i); \ + i += 5 + + i = 0; + R1_0; R1_0; R1_0; R0(a, b, c, d, e, 15); R1(e, a, b, c, d, 16); R1(d, e, a, b, c, 17); R1(c, d, e, a, b, 18); R1(b, c, d, e, a, 19); - for (i = 20; i < 40; i += 5) { - R2(a, b, c, d, e, 0 + i); - R2(e, a, b, c, d, 1 + i); - R2(d, e, a, b, c, 2 + i); - R2(c, d, e, a, b, 3 + i); - R2(b, c, d, e, a, 4 + i); - } - for (; i < 60; i += 5) { - R3(a, b, c, d, e, 0 + i); - R3(e, a, b, c, d, 1 + i); - R3(d, e, a, b, c, 2 + i); - R3(c, d, e, a, b, 3 + i); - R3(b, c, d, e, a, 4 + i); - } - for (; i < 80; i += 5) { - R4(a, b, c, d, e, 0 + i); - R4(e, a, b, c, d, 1 + i); - R4(d, e, a, b, c, 2 + i); - R4(c, d, e, a, b, 3 + i); - R4(b, c, d, e, a, 4 + i); - } + +#define R1_20 \ + R2(a, b, c, d, e, 0 + i); \ + R2(e, a, b, c, d, 1 + i); \ + R2(d, e, a, b, c, 2 + i); \ + R2(c, d, e, a, b, 3 + i); \ + R2(b, c, d, e, a, 4 + i); \ + i += 5 + + i = 20; + R1_20; R1_20; R1_20; R1_20; + +#define R1_40 \ + R3(a, b, c, d, e, 0 + i); \ + R3(e, a, b, c, d, 1 + i); \ + R3(d, e, a, b, c, 2 + i); \ + R3(c, d, e, a, b, 3 + i); \ + R3(b, c, d, e, a, 4 + i); \ + i += 5 + + R1_40; R1_40; R1_40; R1_40; + +#define R1_60 \ + R4(a, b, c, d, e, 0 + i); \ + R4(e, a, b, c, d, 1 + i); \ + R4(d, e, a, b, c, 2 + i); \ + R4(c, d, e, a, b, 3 + i); \ + R4(b, c, d, e, a, 4 + i); \ + i += 5 + + R1_60; R1_60; R1_60; R1_60; #endif state[0] += a; state[1] += b; @@ -218,27 +232,32 @@ static void sha256_transform(uint32_t *state, const uint8_t buffer[64]) a = T1 + T2; } #else - for (i = 0; i < 16 - 7;) { - ROUND256_0_TO_15(a, b, c, d, e, f, g, h); - ROUND256_0_TO_15(h, a, b, c, d, e, f, g); - ROUND256_0_TO_15(g, h, a, b, c, d, e, f); - ROUND256_0_TO_15(f, g, h, a, b, c, d, e); - ROUND256_0_TO_15(e, f, g, h, a, b, c, d); - ROUND256_0_TO_15(d, e, f, g, h, a, b, c); - ROUND256_0_TO_15(c, d, e, f, g, h, a, b); - ROUND256_0_TO_15(b, c, d, e, f, g, h, a); - } - for (; i < 64 - 7;) { - ROUND256_16_TO_63(a, b, c, d, e, f, g, h); - ROUND256_16_TO_63(h, a, b, c, d, e, f, g); - ROUND256_16_TO_63(g, h, a, b, c, d, e, f); - ROUND256_16_TO_63(f, g, h, a, b, c, d, e); - ROUND256_16_TO_63(e, f, g, h, a, b, c, d); - ROUND256_16_TO_63(d, e, f, g, h, a, b, c); - ROUND256_16_TO_63(c, d, e, f, g, h, a, b); - ROUND256_16_TO_63(b, c, d, e, f, g, h, a); - } + i = 0; +#define R256_0 \ + ROUND256_0_TO_15(a, b, c, d, e, f, g, h); \ + ROUND256_0_TO_15(h, a, b, c, d, e, f, g); \ + ROUND256_0_TO_15(g, h, a, b, c, d, e, f); \ + ROUND256_0_TO_15(f, g, h, a, b, c, d, e); \ + ROUND256_0_TO_15(e, f, g, h, a, b, c, d); \ + ROUND256_0_TO_15(d, e, f, g, h, a, b, c); \ + ROUND256_0_TO_15(c, d, e, f, g, h, a, b); \ + ROUND256_0_TO_15(b, c, d, e, f, g, h, a) + + R256_0; R256_0; + +#define R256_16 \ + ROUND256_16_TO_63(a, b, c, d, e, f, g, h); \ + ROUND256_16_TO_63(h, a, b, c, d, e, f, g); \ + ROUND256_16_TO_63(g, h, a, b, c, d, e, f); \ + ROUND256_16_TO_63(f, g, h, a, b, c, d, e); \ + ROUND256_16_TO_63(e, f, g, h, a, b, c, d); \ + ROUND256_16_TO_63(d, e, f, g, h, a, b, c); \ + ROUND256_16_TO_63(c, d, e, f, g, h, a, b); \ + ROUND256_16_TO_63(b, c, d, e, f, g, h, a) + + R256_16; R256_16; R256_16; + R256_16; R256_16; R256_16; #endif state[0] += a; state[1] += b;