Fossil SCM
substantive improvement to sha1 speed (especially on x86)
Commit
f2ede7da6d70851ab1e13f7d2238a94ef76f2370
Parent
f7f4a80ea0a421b…
1 file changed
+49
-34
+49
-34
| --- src/sha1.c | ||
| +++ src/sha1.c | ||
| @@ -27,60 +27,79 @@ | ||
| 27 | 27 | * blk0() and blk() perform the initial expand. |
| 28 | 28 | * I got the idea of expanding during the round function from SSLeay |
| 29 | 29 | * |
| 30 | 30 | * blk0le() for little-endian and blk0be() for big-endian. |
| 31 | 31 | */ |
| 32 | -#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) | |
| 33 | -#define blk0le(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ | |
| 34 | - |(rol(block->l[i],8)&0x00FF00FF)) | |
| 35 | -#define blk0be(i) block->l[i] | |
| 36 | -#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ | |
| 37 | - ^block->l[(i+2)&15]^block->l[i&15],1)) | |
| 32 | +#if __GNUC__ && (defined(__i386__) || defined(__x86_64__)) | |
| 33 | +/* | |
| 34 | + * GCC by itself only generates left rotates. Use right rotates if | |
| 35 | + * possible to be kinder to dinky implementations with iterative rotate | |
| 36 | + * instructions. | |
| 37 | + */ | |
| 38 | +#define SHA_ROT(op, x, k) \ | |
| 39 | + ({ unsigned int y; asm(op " %1,%0" : "=r" (y) : "I" (k), "0" (x)); y; }) | |
| 40 | +#define rol(x,k) SHA_ROT("roll", x, k) | |
| 41 | +#define ror(x,k) SHA_ROT("rorl", x, k) | |
| 42 | + | |
| 43 | +#else | |
| 44 | +/* Generic C equivalent */ | |
| 45 | +#define SHA_ROT(x,l,r) ((x) << (l) | (x) >> (r)) | |
| 46 | +#define rol(x,k) SHA_ROT(x,k,32-(k)) | |
| 47 | +#define ror(x,k) SHA_ROT(x,32-(k),k) | |
| 48 | +#endif | |
| 49 | + | |
| 50 | + | |
| 51 | +#define blk0le(i) (block[i] = (ror(block[i],8)&0xFF00FF00) \ | |
| 52 | + |(rol(block[i],8)&0x00FF00FF)) | |
| 53 | +#define blk0be(i) block[i] | |
| 54 | +#define blk(i) (block[i&15] = rol(block[(i+13)&15]^block[(i+8)&15] \ | |
| 55 | + ^block[(i+2)&15]^block[i&15],1)) | |
| 38 | 56 | |
| 39 | 57 | /* |
| 40 | 58 | * (R0+R1), R2, R3, R4 are the different operations (rounds) used in SHA1 |
| 41 | 59 | * |
| 42 | 60 | * Rl0() for little-endian and Rb0() for big-endian. Endianness is |
| 43 | 61 | * determined at run-time. |
| 44 | 62 | */ |
| 45 | 63 | #define Rl0(v,w,x,y,z,i) \ |
| 46 | - z+=((w&(x^y))^y)+blk0le(i)+0x5A827999+rol(v,5);w=rol(w,30); | |
| 64 | + z+=((w&(x^y))^y)+blk0le(i)+0x5A827999+rol(v,5);w=ror(w,2); | |
| 47 | 65 | #define Rb0(v,w,x,y,z,i) \ |
| 48 | - z+=((w&(x^y))^y)+blk0be(i)+0x5A827999+rol(v,5);w=rol(w,30); | |
| 66 | + z+=((w&(x^y))^y)+blk0be(i)+0x5A827999+rol(v,5);w=ror(w,2); | |
| 49 | 67 | #define R1(v,w,x,y,z,i) \ |
| 50 | - z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); | |
| 68 | + z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=ror(w,2); | |
| 51 | 69 | #define R2(v,w,x,y,z,i) \ |
| 52 | - z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); | |
| 70 | + z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=ror(w,2); | |
| 53 | 71 | #define R3(v,w,x,y,z,i) \ |
| 54 | - z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); | |
| 72 | + z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=ror(w,2); | |
| 55 | 73 | #define R4(v,w,x,y,z,i) \ |
| 56 | - z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); | |
| 57 | - | |
| 58 | -typedef union { | |
| 59 | - unsigned char c[64]; | |
| 60 | - unsigned int l[16]; | |
| 61 | -} CHAR64LONG16; | |
| 74 | + z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=ror(w,2); | |
| 62 | 75 | |
| 63 | 76 | /* |
| 64 | 77 | * Hash a single 512-bit block. This is the core of the algorithm. |
| 65 | 78 | */ |
| 79 | +#define a qq[0] | |
| 80 | +#define b qq[1] | |
| 81 | +#define c qq[2] | |
| 82 | +#define d qq[3] | |
| 83 | +#define e qq[4] | |
| 84 | + | |
| 66 | 85 | void SHA1Transform(unsigned int state[5], const unsigned char buffer[64]) |
| 67 | 86 | { |
| 68 | - unsigned int a, b, c, d, e; | |
| 69 | - CHAR64LONG16 *block; | |
| 87 | + unsigned int qq[5]; // a, b, c, d, e; | |
| 70 | 88 | static int one = 1; |
| 71 | - CHAR64LONG16 workspace; | |
| 72 | - | |
| 73 | - block = &workspace; | |
| 74 | - (void)memcpy(block, buffer, 64); | |
| 89 | + unsigned int block[16]; | |
| 90 | + memcpy(block, buffer, 64); | |
| 91 | + memcpy(qq,state,5*sizeof(unsigned int)); | |
| 75 | 92 | |
| 76 | 93 | /* Copy context->state[] to working vars */ |
| 94 | + /* | |
| 77 | 95 | a = state[0]; |
| 78 | 96 | b = state[1]; |
| 79 | 97 | c = state[2]; |
| 80 | 98 | d = state[3]; |
| 81 | 99 | e = state[4]; |
| 100 | + */ | |
| 82 | 101 | |
| 83 | 102 | /* 4 rounds of 20 operations each. Loop unrolled. */ |
| 84 | 103 | if( 1 == *(unsigned char*)&one ){ |
| 85 | 104 | Rl0(a,b,c,d,e, 0); Rl0(e,a,b,c,d, 1); Rl0(d,e,a,b,c, 2); Rl0(c,d,e,a,b, 3); |
| 86 | 105 | Rl0(b,c,d,e,a, 4); Rl0(a,b,c,d,e, 5); Rl0(e,a,b,c,d, 6); Rl0(d,e,a,b,c, 7); |
| @@ -113,13 +132,10 @@ | ||
| 113 | 132 | state[0] += a; |
| 114 | 133 | state[1] += b; |
| 115 | 134 | state[2] += c; |
| 116 | 135 | state[3] += d; |
| 117 | 136 | state[4] += e; |
| 118 | - | |
| 119 | - /* Wipe variables */ | |
| 120 | - a = b = c = d = e = 0; | |
| 121 | 137 | } |
| 122 | 138 | |
| 123 | 139 | |
| 124 | 140 | /* |
| 125 | 141 | * SHA1Init - Initialize new context |
| @@ -192,18 +208,17 @@ | ||
| 192 | 208 | ** digest is stored in the first 20 bytes. zBuf should |
| 193 | 209 | ** be "char zBuf[41]". |
| 194 | 210 | */ |
| 195 | 211 | static void DigestToBase16(unsigned char *digest, char *zBuf){ |
| 196 | 212 | static char const zEncode[] = "0123456789abcdef"; |
| 197 | - int i, j; | |
| 198 | - | |
| 199 | - for(j=i=0; i<20; i++){ | |
| 200 | - int a = digest[i]; | |
| 201 | - zBuf[j++] = zEncode[(a>>4)&0xf]; | |
| 202 | - zBuf[j++] = zEncode[a & 0xf]; | |
| 203 | - } | |
| 204 | - zBuf[j] = 0; | |
| 213 | + int ix; | |
| 214 | + | |
| 215 | + for(ix=0; ix<20; ix++){ | |
| 216 | + *zBuf++ = zEncode[(*digest>>4)&0xf]; | |
| 217 | + *zBuf++ = zEncode[*digest++ & 0xf]; | |
| 218 | + } | |
| 219 | + *zBuf = '\0'; | |
| 205 | 220 | } |
| 206 | 221 | |
| 207 | 222 | /* |
| 208 | 223 | ** The state of a incremental SHA1 checksum computation. Only one |
| 209 | 224 | ** such computation can be underway at a time, of course. |
| 210 | 225 |
| --- src/sha1.c | |
| +++ src/sha1.c | |
| @@ -27,60 +27,79 @@ | |
| 27 | * blk0() and blk() perform the initial expand. |
| 28 | * I got the idea of expanding during the round function from SSLeay |
| 29 | * |
| 30 | * blk0le() for little-endian and blk0be() for big-endian. |
| 31 | */ |
| 32 | #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) |
| 33 | #define blk0le(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ |
| 34 | |(rol(block->l[i],8)&0x00FF00FF)) |
| 35 | #define blk0be(i) block->l[i] |
| 36 | #define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ |
| 37 | ^block->l[(i+2)&15]^block->l[i&15],1)) |
| 38 | |
| 39 | /* |
| 40 | * (R0+R1), R2, R3, R4 are the different operations (rounds) used in SHA1 |
| 41 | * |
| 42 | * Rl0() for little-endian and Rb0() for big-endian. Endianness is |
| 43 | * determined at run-time. |
| 44 | */ |
| 45 | #define Rl0(v,w,x,y,z,i) \ |
| 46 | z+=((w&(x^y))^y)+blk0le(i)+0x5A827999+rol(v,5);w=rol(w,30); |
| 47 | #define Rb0(v,w,x,y,z,i) \ |
| 48 | z+=((w&(x^y))^y)+blk0be(i)+0x5A827999+rol(v,5);w=rol(w,30); |
| 49 | #define R1(v,w,x,y,z,i) \ |
| 50 | z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); |
| 51 | #define R2(v,w,x,y,z,i) \ |
| 52 | z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); |
| 53 | #define R3(v,w,x,y,z,i) \ |
| 54 | z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); |
| 55 | #define R4(v,w,x,y,z,i) \ |
| 56 | z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); |
| 57 | |
| 58 | typedef union { |
| 59 | unsigned char c[64]; |
| 60 | unsigned int l[16]; |
| 61 | } CHAR64LONG16; |
| 62 | |
| 63 | /* |
| 64 | * Hash a single 512-bit block. This is the core of the algorithm. |
| 65 | */ |
| 66 | void SHA1Transform(unsigned int state[5], const unsigned char buffer[64]) |
| 67 | { |
| 68 | unsigned int a, b, c, d, e; |
| 69 | CHAR64LONG16 *block; |
| 70 | static int one = 1; |
| 71 | CHAR64LONG16 workspace; |
| 72 | |
| 73 | block = &workspace; |
| 74 | (void)memcpy(block, buffer, 64); |
| 75 | |
| 76 | /* Copy context->state[] to working vars */ |
| 77 | a = state[0]; |
| 78 | b = state[1]; |
| 79 | c = state[2]; |
| 80 | d = state[3]; |
| 81 | e = state[4]; |
| 82 | |
| 83 | /* 4 rounds of 20 operations each. Loop unrolled. */ |
| 84 | if( 1 == *(unsigned char*)&one ){ |
| 85 | Rl0(a,b,c,d,e, 0); Rl0(e,a,b,c,d, 1); Rl0(d,e,a,b,c, 2); Rl0(c,d,e,a,b, 3); |
| 86 | Rl0(b,c,d,e,a, 4); Rl0(a,b,c,d,e, 5); Rl0(e,a,b,c,d, 6); Rl0(d,e,a,b,c, 7); |
| @@ -113,13 +132,10 @@ | |
| 113 | state[0] += a; |
| 114 | state[1] += b; |
| 115 | state[2] += c; |
| 116 | state[3] += d; |
| 117 | state[4] += e; |
| 118 | |
| 119 | /* Wipe variables */ |
| 120 | a = b = c = d = e = 0; |
| 121 | } |
| 122 | |
| 123 | |
| 124 | /* |
| 125 | * SHA1Init - Initialize new context |
| @@ -192,18 +208,17 @@ | |
| 192 | ** digest is stored in the first 20 bytes. zBuf should |
| 193 | ** be "char zBuf[41]". |
| 194 | */ |
| 195 | static void DigestToBase16(unsigned char *digest, char *zBuf){ |
| 196 | static char const zEncode[] = "0123456789abcdef"; |
| 197 | int i, j; |
| 198 | |
| 199 | for(j=i=0; i<20; i++){ |
| 200 | int a = digest[i]; |
| 201 | zBuf[j++] = zEncode[(a>>4)&0xf]; |
| 202 | zBuf[j++] = zEncode[a & 0xf]; |
| 203 | } |
| 204 | zBuf[j] = 0; |
| 205 | } |
| 206 | |
| 207 | /* |
| 208 | ** The state of a incremental SHA1 checksum computation. Only one |
| 209 | ** such computation can be underway at a time, of course. |
| 210 |
| --- src/sha1.c | |
| +++ src/sha1.c | |
| @@ -27,60 +27,79 @@ | |
| 27 | * blk0() and blk() perform the initial expand. |
| 28 | * I got the idea of expanding during the round function from SSLeay |
| 29 | * |
| 30 | * blk0le() for little-endian and blk0be() for big-endian. |
| 31 | */ |
| 32 | #if __GNUC__ && (defined(__i386__) || defined(__x86_64__)) |
| 33 | /* |
| 34 | * GCC by itself only generates left rotates. Use right rotates if |
| 35 | * possible to be kinder to dinky implementations with iterative rotate |
| 36 | * instructions. |
| 37 | */ |
| 38 | #define SHA_ROT(op, x, k) \ |
| 39 | ({ unsigned int y; asm(op " %1,%0" : "=r" (y) : "I" (k), "0" (x)); y; }) |
| 40 | #define rol(x,k) SHA_ROT("roll", x, k) |
| 41 | #define ror(x,k) SHA_ROT("rorl", x, k) |
| 42 | |
| 43 | #else |
| 44 | /* Generic C equivalent */ |
| 45 | #define SHA_ROT(x,l,r) ((x) << (l) | (x) >> (r)) |
| 46 | #define rol(x,k) SHA_ROT(x,k,32-(k)) |
| 47 | #define ror(x,k) SHA_ROT(x,32-(k),k) |
| 48 | #endif |
| 49 | |
| 50 | |
| 51 | #define blk0le(i) (block[i] = (ror(block[i],8)&0xFF00FF00) \ |
| 52 | |(rol(block[i],8)&0x00FF00FF)) |
| 53 | #define blk0be(i) block[i] |
| 54 | #define blk(i) (block[i&15] = rol(block[(i+13)&15]^block[(i+8)&15] \ |
| 55 | ^block[(i+2)&15]^block[i&15],1)) |
| 56 | |
| 57 | /* |
| 58 | * (R0+R1), R2, R3, R4 are the different operations (rounds) used in SHA1 |
| 59 | * |
| 60 | * Rl0() for little-endian and Rb0() for big-endian. Endianness is |
| 61 | * determined at run-time. |
| 62 | */ |
| 63 | #define Rl0(v,w,x,y,z,i) \ |
| 64 | z+=((w&(x^y))^y)+blk0le(i)+0x5A827999+rol(v,5);w=ror(w,2); |
| 65 | #define Rb0(v,w,x,y,z,i) \ |
| 66 | z+=((w&(x^y))^y)+blk0be(i)+0x5A827999+rol(v,5);w=ror(w,2); |
| 67 | #define R1(v,w,x,y,z,i) \ |
| 68 | z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=ror(w,2); |
| 69 | #define R2(v,w,x,y,z,i) \ |
| 70 | z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=ror(w,2); |
| 71 | #define R3(v,w,x,y,z,i) \ |
| 72 | z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=ror(w,2); |
| 73 | #define R4(v,w,x,y,z,i) \ |
| 74 | z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=ror(w,2); |
| 75 | |
| 76 | /* |
| 77 | * Hash a single 512-bit block. This is the core of the algorithm. |
| 78 | */ |
| 79 | #define a qq[0] |
| 80 | #define b qq[1] |
| 81 | #define c qq[2] |
| 82 | #define d qq[3] |
| 83 | #define e qq[4] |
| 84 | |
| 85 | void SHA1Transform(unsigned int state[5], const unsigned char buffer[64]) |
| 86 | { |
| 87 | unsigned int qq[5]; // a, b, c, d, e; |
| 88 | static int one = 1; |
| 89 | unsigned int block[16]; |
| 90 | memcpy(block, buffer, 64); |
| 91 | memcpy(qq,state,5*sizeof(unsigned int)); |
| 92 | |
| 93 | /* Copy context->state[] to working vars */ |
| 94 | /* |
| 95 | a = state[0]; |
| 96 | b = state[1]; |
| 97 | c = state[2]; |
| 98 | d = state[3]; |
| 99 | e = state[4]; |
| 100 | */ |
| 101 | |
| 102 | /* 4 rounds of 20 operations each. Loop unrolled. */ |
| 103 | if( 1 == *(unsigned char*)&one ){ |
| 104 | Rl0(a,b,c,d,e, 0); Rl0(e,a,b,c,d, 1); Rl0(d,e,a,b,c, 2); Rl0(c,d,e,a,b, 3); |
| 105 | Rl0(b,c,d,e,a, 4); Rl0(a,b,c,d,e, 5); Rl0(e,a,b,c,d, 6); Rl0(d,e,a,b,c, 7); |
| @@ -113,13 +132,10 @@ | |
| 132 | state[0] += a; |
| 133 | state[1] += b; |
| 134 | state[2] += c; |
| 135 | state[3] += d; |
| 136 | state[4] += e; |
| 137 | } |
| 138 | |
| 139 | |
| 140 | /* |
| 141 | * SHA1Init - Initialize new context |
| @@ -192,18 +208,17 @@ | |
| 208 | ** digest is stored in the first 20 bytes. zBuf should |
| 209 | ** be "char zBuf[41]". |
| 210 | */ |
| 211 | static void DigestToBase16(unsigned char *digest, char *zBuf){ |
| 212 | static char const zEncode[] = "0123456789abcdef"; |
| 213 | int ix; |
| 214 | |
| 215 | for(ix=0; ix<20; ix++){ |
| 216 | *zBuf++ = zEncode[(*digest>>4)&0xf]; |
| 217 | *zBuf++ = zEncode[*digest++ & 0xf]; |
| 218 | } |
| 219 | *zBuf = '\0'; |
| 220 | } |
| 221 | |
| 222 | /* |
| 223 | ** The state of a incremental SHA1 checksum computation. Only one |
| 224 | ** such computation can be underway at a time, of course. |
| 225 |