lavu/ripemd: Fully unroll the transform function loops

crypto_bench RIPEMD-160 results using an AMD Athlon X2 7750+, mingw32-w64 GCC 4.8.1 x86_64

Before:
lavu       RIPEMD-160   size: 1048576  runs:   1024  time:   12.342 +- 0.199

After:
lavu       RIPEMD-160   size: 1048576  runs:   1024  time:   10.143 +- 0.192

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
James Almer, 2013-09-09 05:42:21 -03:00, committed by Michael Niedermayer
parent b4e1630d4d
commit 452ac2aaec
1 changed file with 70 additions and 59 deletions

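The patch replaces each fixed-count round loop with a helper macro that emits the loop body in place, so the round sequence runs straight-line with no loop counter compare-and-branch. A rough standalone sketch of the pattern follows; the TOY_* names and the toy round operation are invented for illustration and are not the actual FFmpeg ROUND128_x/ROUND160_x macros.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical round operation standing in for a real round macro:
   it mixes one message word into the state and advances the word index n. */
#define TOY_ROUND(a, b)  a += (b ^ block[n]); n++

/* Before: a counted loop, one compare-and-branch per iteration. */
static uint32_t toy_loop(const uint32_t block[16])
{
    uint32_t a = 1, b = 2;
    int n;
    for (n = 0; n < 16;) {
        TOY_ROUND(a, b);
        TOY_ROUND(b, a);
    }
    return a ^ b;
}

/* After: the same 16 rounds expanded in place through a helper macro,
   mirroring the R128_x/R160_x pattern in the patch. */
#define TOY_ROUND2 \
    TOY_ROUND(a, b); \
    TOY_ROUND(b, a)

static uint32_t toy_unrolled(const uint32_t block[16])
{
    uint32_t a = 1, b = 2;
    int n = 0;  /* explicit reset, since no loop initializes it any more */
    TOY_ROUND2; TOY_ROUND2; TOY_ROUND2; TOY_ROUND2;
    TOY_ROUND2; TOY_ROUND2; TOY_ROUND2; TOY_ROUND2;
    return a ^ b;
}

int main(void)
{
    const uint32_t block[16] = { 3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3 };
    /* Both variants perform the identical 16 rounds and must print the same values. */
    printf("loop: %08"PRIx32"  unrolled: %08"PRIx32"\n",
           toy_loop(block), toy_unrolled(block));
    return 0;
}

Unrolling by hand trades a larger text segment for the removal of every per-iteration counter check, which is presumably where the roughly 18% improvement in the benchmark above comes from on this CPU/compiler combination.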

@@ -128,37 +128,42 @@ static void ripemd128_transform(uint32_t *state, const uint8_t buffer[64], int ext)
     for (n = 0; n < 16; n++)
         block[n] = AV_RL32(buffer + 4 * n);
+    n = 0;
-    for (n = 0; n < 16;) {
-        ROUND128_0_TO_15(a,b,c,d,e,f,g,h);
-        ROUND128_0_TO_15(d,a,b,c,h,e,f,g);
-        ROUND128_0_TO_15(c,d,a,b,g,h,e,f);
-        ROUND128_0_TO_15(b,c,d,a,f,g,h,e);
-    }
+#define R128_0 \
+    ROUND128_0_TO_15(a,b,c,d,e,f,g,h); \
+    ROUND128_0_TO_15(d,a,b,c,h,e,f,g); \
+    ROUND128_0_TO_15(c,d,a,b,g,h,e,f); \
+    ROUND128_0_TO_15(b,c,d,a,f,g,h,e)
+    R128_0; R128_0; R128_0; R128_0;
     SWAP(a,e)
-    for (; n < 32;) {
-        ROUND128_16_TO_31(a,b,c,d,e,f,g,h);
-        ROUND128_16_TO_31(d,a,b,c,h,e,f,g);
-        ROUND128_16_TO_31(c,d,a,b,g,h,e,f);
-        ROUND128_16_TO_31(b,c,d,a,f,g,h,e);
-    }
+#define R128_16 \
+    ROUND128_16_TO_31(a,b,c,d,e,f,g,h); \
+    ROUND128_16_TO_31(d,a,b,c,h,e,f,g); \
+    ROUND128_16_TO_31(c,d,a,b,g,h,e,f); \
+    ROUND128_16_TO_31(b,c,d,a,f,g,h,e)
+    R128_16; R128_16; R128_16; R128_16;
     SWAP(b,f)
-    for (; n < 48;) {
-        ROUND128_32_TO_47(a,b,c,d,e,f,g,h);
-        ROUND128_32_TO_47(d,a,b,c,h,e,f,g);
-        ROUND128_32_TO_47(c,d,a,b,g,h,e,f);
-        ROUND128_32_TO_47(b,c,d,a,f,g,h,e);
-    }
+#define R128_32 \
+    ROUND128_32_TO_47(a,b,c,d,e,f,g,h); \
+    ROUND128_32_TO_47(d,a,b,c,h,e,f,g); \
+    ROUND128_32_TO_47(c,d,a,b,g,h,e,f); \
+    ROUND128_32_TO_47(b,c,d,a,f,g,h,e)
+    R128_32; R128_32; R128_32; R128_32;
     SWAP(c,g)
-    for (; n < 64;) {
-        ROUND128_48_TO_63(a,b,c,d,e,f,g,h);
-        ROUND128_48_TO_63(d,a,b,c,h,e,f,g);
-        ROUND128_48_TO_63(c,d,a,b,g,h,e,f);
-        ROUND128_48_TO_63(b,c,d,a,f,g,h,e);
-    }
+#define R128_48 \
+    ROUND128_48_TO_63(a,b,c,d,e,f,g,h); \
+    ROUND128_48_TO_63(d,a,b,c,h,e,f,g); \
+    ROUND128_48_TO_63(c,d,a,b,g,h,e,f); \
+    ROUND128_48_TO_63(b,c,d,a,f,g,h,e)
+    R128_48; R128_48; R128_48; R128_48;
     SWAP(d,h)
     if (ext) {
@@ -222,54 +227,60 @@ static void ripemd160_transform(uint32_t *state, const uint8_t buffer[64], int ext)
     for (n = 0; n < 16; n++)
         block[n] = AV_RL32(buffer + 4 * n);
+    n = 0;
-    for (n = 0; n < 16 - 1;) {
-        ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j);
-        ROUND160_0_TO_15(e,a,b,c,d,j,f,g,h,i);
-        ROUND160_0_TO_15(d,e,a,b,c,i,j,f,g,h);
-        ROUND160_0_TO_15(c,d,e,a,b,h,i,j,f,g);
-        ROUND160_0_TO_15(b,c,d,e,a,g,h,i,j,f);
-    }
+#define R160_0 \
+    ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j); \
+    ROUND160_0_TO_15(e,a,b,c,d,j,f,g,h,i); \
+    ROUND160_0_TO_15(d,e,a,b,c,i,j,f,g,h); \
+    ROUND160_0_TO_15(c,d,e,a,b,h,i,j,f,g); \
+    ROUND160_0_TO_15(b,c,d,e,a,g,h,i,j,f)
+    R160_0; R160_0; R160_0;
     ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j);
     SWAP(a,f)
-    for (; n < 32 - 1;) {
-        ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i);
-        ROUND160_16_TO_31(d,e,a,b,c,i,j,f,g,h);
-        ROUND160_16_TO_31(c,d,e,a,b,h,i,j,f,g);
-        ROUND160_16_TO_31(b,c,d,e,a,g,h,i,j,f);
-        ROUND160_16_TO_31(a,b,c,d,e,f,g,h,i,j);
-    }
+#define R160_16 \
+    ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i); \
+    ROUND160_16_TO_31(d,e,a,b,c,i,j,f,g,h); \
+    ROUND160_16_TO_31(c,d,e,a,b,h,i,j,f,g); \
+    ROUND160_16_TO_31(b,c,d,e,a,g,h,i,j,f); \
+    ROUND160_16_TO_31(a,b,c,d,e,f,g,h,i,j)
+    R160_16; R160_16; R160_16;
     ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i);
     SWAP(b,g)
-    for (; n < 48 - 1;) {
-        ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h);
-        ROUND160_32_TO_47(c,d,e,a,b,h,i,j,f,g);
-        ROUND160_32_TO_47(b,c,d,e,a,g,h,i,j,f);
-        ROUND160_32_TO_47(a,b,c,d,e,f,g,h,i,j);
-        ROUND160_32_TO_47(e,a,b,c,d,j,f,g,h,i);
-    }
+#define R160_32 \
+    ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h); \
+    ROUND160_32_TO_47(c,d,e,a,b,h,i,j,f,g); \
+    ROUND160_32_TO_47(b,c,d,e,a,g,h,i,j,f); \
+    ROUND160_32_TO_47(a,b,c,d,e,f,g,h,i,j); \
+    ROUND160_32_TO_47(e,a,b,c,d,j,f,g,h,i)
+    R160_32; R160_32; R160_32;
     ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h);
     SWAP(c,h)
-    for (; n < 64 - 1;) {
-        ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g);
-        ROUND160_48_TO_63(b,c,d,e,a,g,h,i,j,f);
-        ROUND160_48_TO_63(a,b,c,d,e,f,g,h,i,j);
-        ROUND160_48_TO_63(e,a,b,c,d,j,f,g,h,i);
-        ROUND160_48_TO_63(d,e,a,b,c,i,j,f,g,h);
-    }
+#define R160_48 \
+    ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g); \
+    ROUND160_48_TO_63(b,c,d,e,a,g,h,i,j,f); \
+    ROUND160_48_TO_63(a,b,c,d,e,f,g,h,i,j); \
+    ROUND160_48_TO_63(e,a,b,c,d,j,f,g,h,i); \
+    ROUND160_48_TO_63(d,e,a,b,c,i,j,f,g,h)
+    R160_48; R160_48; R160_48;
     ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g);
     SWAP(d,i)
-    for (; n < 75;) {
-        ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f);
-        ROUND160_64_TO_79(a,b,c,d,e,f,g,h,i,j);
-        ROUND160_64_TO_79(e,a,b,c,d,j,f,g,h,i);
-        ROUND160_64_TO_79(d,e,a,b,c,i,j,f,g,h);
-        ROUND160_64_TO_79(c,d,e,a,b,h,i,j,f,g);
-    }
+#define R160_64 \
+    ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f); \
+    ROUND160_64_TO_79(a,b,c,d,e,f,g,h,i,j); \
+    ROUND160_64_TO_79(e,a,b,c,d,j,f,g,h,i); \
+    ROUND160_64_TO_79(d,e,a,b,c,i,j,f,g,h); \
+    ROUND160_64_TO_79(c,d,e,a,b,h,i,j,f,g)
+    R160_64; R160_64; R160_64;
     ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f);
     SWAP(e,j)