Fossil SCM

substantive improvement to sha1 speed (especially on x86)

ron 2011-09-01 14:31 trunk
Commit f2ede7da6d70851ab1e13f7d2238a94ef76f2370
1 file changed +49 -34
+49 -34
--- src/sha1.c
+++ src/sha1.c
@@ -27,60 +27,79 @@
2727
* blk0() and blk() perform the initial expand.
2828
* I got the idea of expanding during the round function from SSLeay
2929
*
3030
* blk0le() for little-endian and blk0be() for big-endian.
3131
*/
32
-#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
33
-#define blk0le(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
34
- |(rol(block->l[i],8)&0x00FF00FF))
35
-#define blk0be(i) block->l[i]
36
-#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
37
- ^block->l[(i+2)&15]^block->l[i&15],1))
32
+#if __GNUC__ && (defined(__i386__) || defined(__x86_64__))
33
+/*
34
+ * GCC by itself only generates left rotates. Use right rotates if
35
+ * possible to be kinder to dinky implementations with iterative rotate
36
+ * instructions.
37
+ */
38
+#define SHA_ROT(op, x, k) \
39
+ ({ unsigned int y; asm(op " %1,%0" : "=r" (y) : "I" (k), "0" (x)); y; })
40
+#define rol(x,k) SHA_ROT("roll", x, k)
41
+#define ror(x,k) SHA_ROT("rorl", x, k)
42
+
43
+#else
44
+/* Generic C equivalent */
45
+#define SHA_ROT(x,l,r) ((x) << (l) | (x) >> (r))
46
+#define rol(x,k) SHA_ROT(x,k,32-(k))
47
+#define ror(x,k) SHA_ROT(x,32-(k),k)
48
+#endif
49
+
50
+
51
+#define blk0le(i) (block[i] = (ror(block[i],8)&0xFF00FF00) \
52
+ |(rol(block[i],8)&0x00FF00FF))
53
+#define blk0be(i) block[i]
54
+#define blk(i) (block[i&15] = rol(block[(i+13)&15]^block[(i+8)&15] \
55
+ ^block[(i+2)&15]^block[i&15],1))
3856
3957
/*
4058
* (R0+R1), R2, R3, R4 are the different operations (rounds) used in SHA1
4159
*
4260
* Rl0() for little-endian and Rb0() for big-endian. Endianness is
4361
* determined at run-time.
4462
*/
4563
#define Rl0(v,w,x,y,z,i) \
46
- z+=((w&(x^y))^y)+blk0le(i)+0x5A827999+rol(v,5);w=rol(w,30);
64
+ z+=((w&(x^y))^y)+blk0le(i)+0x5A827999+rol(v,5);w=ror(w,2);
4765
#define Rb0(v,w,x,y,z,i) \
48
- z+=((w&(x^y))^y)+blk0be(i)+0x5A827999+rol(v,5);w=rol(w,30);
66
+ z+=((w&(x^y))^y)+blk0be(i)+0x5A827999+rol(v,5);w=ror(w,2);
4967
#define R1(v,w,x,y,z,i) \
50
- z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
68
+ z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=ror(w,2);
5169
#define R2(v,w,x,y,z,i) \
52
- z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
70
+ z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=ror(w,2);
5371
#define R3(v,w,x,y,z,i) \
54
- z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
72
+ z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=ror(w,2);
5573
#define R4(v,w,x,y,z,i) \
56
- z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
57
-
58
-typedef union {
59
- unsigned char c[64];
60
- unsigned int l[16];
61
-} CHAR64LONG16;
74
+ z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=ror(w,2);
6275
6376
/*
6477
* Hash a single 512-bit block. This is the core of the algorithm.
6578
*/
79
+#define a qq[0]
80
+#define b qq[1]
81
+#define c qq[2]
82
+#define d qq[3]
83
+#define e qq[4]
84
+
6685
void SHA1Transform(unsigned int state[5], const unsigned char buffer[64])
6786
{
68
- unsigned int a, b, c, d, e;
69
- CHAR64LONG16 *block;
87
+ unsigned int qq[5]; // a, b, c, d, e;
7088
static int one = 1;
71
- CHAR64LONG16 workspace;
72
-
73
- block = &workspace;
74
- (void)memcpy(block, buffer, 64);
89
+ unsigned int block[16];
90
+ memcpy(block, buffer, 64);
91
+ memcpy(qq,state,5*sizeof(unsigned int));
7592
7693
/* Copy context->state[] to working vars */
94
+ /*
7795
a = state[0];
7896
b = state[1];
7997
c = state[2];
8098
d = state[3];
8199
e = state[4];
100
+ */
82101
83102
/* 4 rounds of 20 operations each. Loop unrolled. */
84103
if( 1 == *(unsigned char*)&one ){
85104
Rl0(a,b,c,d,e, 0); Rl0(e,a,b,c,d, 1); Rl0(d,e,a,b,c, 2); Rl0(c,d,e,a,b, 3);
86105
Rl0(b,c,d,e,a, 4); Rl0(a,b,c,d,e, 5); Rl0(e,a,b,c,d, 6); Rl0(d,e,a,b,c, 7);
@@ -113,13 +132,10 @@
113132
state[0] += a;
114133
state[1] += b;
115134
state[2] += c;
116135
state[3] += d;
117136
state[4] += e;
118
-
119
- /* Wipe variables */
120
- a = b = c = d = e = 0;
121137
}
122138
123139
124140
/*
125141
* SHA1Init - Initialize new context
@@ -192,18 +208,17 @@
192208
** digest is stored in the first 20 bytes. zBuf should
193209
** be "char zBuf[41]".
194210
*/
195211
static void DigestToBase16(unsigned char *digest, char *zBuf){
196212
static char const zEncode[] = "0123456789abcdef";
197
- int i, j;
198
-
199
- for(j=i=0; i<20; i++){
200
- int a = digest[i];
201
- zBuf[j++] = zEncode[(a>>4)&0xf];
202
- zBuf[j++] = zEncode[a & 0xf];
203
- }
204
- zBuf[j] = 0;
213
+ int ix;
214
+
215
+ for(ix=0; ix<20; ix++){
216
+ *zBuf++ = zEncode[(*digest>>4)&0xf];
217
+ *zBuf++ = zEncode[*digest++ & 0xf];
218
+ }
219
+ *zBuf = '\0';
205220
}
206221
207222
/*
208223
** The state of a incremental SHA1 checksum computation. Only one
209224
** such computation can be underway at a time, of course.
210225
--- src/sha1.c
+++ src/sha1.c
@@ -27,60 +27,79 @@
27 * blk0() and blk() perform the initial expand.
28 * I got the idea of expanding during the round function from SSLeay
29 *
30 * blk0le() for little-endian and blk0be() for big-endian.
31 */
32 #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
33 #define blk0le(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
34 |(rol(block->l[i],8)&0x00FF00FF))
35 #define blk0be(i) block->l[i]
36 #define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
37 ^block->l[(i+2)&15]^block->l[i&15],1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
39 /*
40 * (R0+R1), R2, R3, R4 are the different operations (rounds) used in SHA1
41 *
42 * Rl0() for little-endian and Rb0() for big-endian. Endianness is
43 * determined at run-time.
44 */
45 #define Rl0(v,w,x,y,z,i) \
46 z+=((w&(x^y))^y)+blk0le(i)+0x5A827999+rol(v,5);w=rol(w,30);
47 #define Rb0(v,w,x,y,z,i) \
48 z+=((w&(x^y))^y)+blk0be(i)+0x5A827999+rol(v,5);w=rol(w,30);
49 #define R1(v,w,x,y,z,i) \
50 z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
51 #define R2(v,w,x,y,z,i) \
52 z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
53 #define R3(v,w,x,y,z,i) \
54 z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
55 #define R4(v,w,x,y,z,i) \
56 z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
57
58 typedef union {
59 unsigned char c[64];
60 unsigned int l[16];
61 } CHAR64LONG16;
62
63 /*
64 * Hash a single 512-bit block. This is the core of the algorithm.
65 */
 
 
 
 
 
 
66 void SHA1Transform(unsigned int state[5], const unsigned char buffer[64])
67 {
68 unsigned int a, b, c, d, e;
69 CHAR64LONG16 *block;
70 static int one = 1;
71 CHAR64LONG16 workspace;
72
73 block = &workspace;
74 (void)memcpy(block, buffer, 64);
75
76 /* Copy context->state[] to working vars */
 
77 a = state[0];
78 b = state[1];
79 c = state[2];
80 d = state[3];
81 e = state[4];
 
82
83 /* 4 rounds of 20 operations each. Loop unrolled. */
84 if( 1 == *(unsigned char*)&one ){
85 Rl0(a,b,c,d,e, 0); Rl0(e,a,b,c,d, 1); Rl0(d,e,a,b,c, 2); Rl0(c,d,e,a,b, 3);
86 Rl0(b,c,d,e,a, 4); Rl0(a,b,c,d,e, 5); Rl0(e,a,b,c,d, 6); Rl0(d,e,a,b,c, 7);
@@ -113,13 +132,10 @@
113 state[0] += a;
114 state[1] += b;
115 state[2] += c;
116 state[3] += d;
117 state[4] += e;
118
119 /* Wipe variables */
120 a = b = c = d = e = 0;
121 }
122
123
124 /*
125 * SHA1Init - Initialize new context
@@ -192,18 +208,17 @@
192 ** digest is stored in the first 20 bytes. zBuf should
193 ** be "char zBuf[41]".
194 */
195 static void DigestToBase16(unsigned char *digest, char *zBuf){
196 static char const zEncode[] = "0123456789abcdef";
197 int i, j;
198
199 for(j=i=0; i<20; i++){
200 int a = digest[i];
201 zBuf[j++] = zEncode[(a>>4)&0xf];
202 zBuf[j++] = zEncode[a & 0xf];
203 }
204 zBuf[j] = 0;
205 }
206
207 /*
208 ** The state of a incremental SHA1 checksum computation. Only one
209 ** such computation can be underway at a time, of course.
210
--- src/sha1.c
+++ src/sha1.c
@@ -27,60 +27,79 @@
27 * blk0() and blk() perform the initial expand.
28 * I got the idea of expanding during the round function from SSLeay
29 *
30 * blk0le() for little-endian and blk0be() for big-endian.
31 */
32 #if __GNUC__ && (defined(__i386__) || defined(__x86_64__))
33 /*
34 * GCC by itself only generates left rotates. Use right rotates if
35 * possible to be kinder to dinky implementations with iterative rotate
36 * instructions.
37 */
38 #define SHA_ROT(op, x, k) \
39 ({ unsigned int y; asm(op " %1,%0" : "=r" (y) : "I" (k), "0" (x)); y; })
40 #define rol(x,k) SHA_ROT("roll", x, k)
41 #define ror(x,k) SHA_ROT("rorl", x, k)
42
43 #else
44 /* Generic C equivalent */
45 #define SHA_ROT(x,l,r) ((x) << (l) | (x) >> (r))
46 #define rol(x,k) SHA_ROT(x,k,32-(k))
47 #define ror(x,k) SHA_ROT(x,32-(k),k)
48 #endif
49
50
51 #define blk0le(i) (block[i] = (ror(block[i],8)&0xFF00FF00) \
52 |(rol(block[i],8)&0x00FF00FF))
53 #define blk0be(i) block[i]
54 #define blk(i) (block[i&15] = rol(block[(i+13)&15]^block[(i+8)&15] \
55 ^block[(i+2)&15]^block[i&15],1))
56
57 /*
58 * (R0+R1), R2, R3, R4 are the different operations (rounds) used in SHA1
59 *
60 * Rl0() for little-endian and Rb0() for big-endian. Endianness is
61 * determined at run-time.
62 */
63 #define Rl0(v,w,x,y,z,i) \
64 z+=((w&(x^y))^y)+blk0le(i)+0x5A827999+rol(v,5);w=ror(w,2);
65 #define Rb0(v,w,x,y,z,i) \
66 z+=((w&(x^y))^y)+blk0be(i)+0x5A827999+rol(v,5);w=ror(w,2);
67 #define R1(v,w,x,y,z,i) \
68 z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=ror(w,2);
69 #define R2(v,w,x,y,z,i) \
70 z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=ror(w,2);
71 #define R3(v,w,x,y,z,i) \
72 z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=ror(w,2);
73 #define R4(v,w,x,y,z,i) \
74 z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=ror(w,2);
 
 
 
 
 
75
76 /*
77 * Hash a single 512-bit block. This is the core of the algorithm.
78 */
79 #define a qq[0]
80 #define b qq[1]
81 #define c qq[2]
82 #define d qq[3]
83 #define e qq[4]
84
85 void SHA1Transform(unsigned int state[5], const unsigned char buffer[64])
86 {
87 unsigned int qq[5]; // a, b, c, d, e;
 
88 static int one = 1;
89 unsigned int block[16];
90 memcpy(block, buffer, 64);
91 memcpy(qq,state,5*sizeof(unsigned int));
 
92
93 /* Copy context->state[] to working vars */
94 /*
95 a = state[0];
96 b = state[1];
97 c = state[2];
98 d = state[3];
99 e = state[4];
100 */
101
102 /* 4 rounds of 20 operations each. Loop unrolled. */
103 if( 1 == *(unsigned char*)&one ){
104 Rl0(a,b,c,d,e, 0); Rl0(e,a,b,c,d, 1); Rl0(d,e,a,b,c, 2); Rl0(c,d,e,a,b, 3);
105 Rl0(b,c,d,e,a, 4); Rl0(a,b,c,d,e, 5); Rl0(e,a,b,c,d, 6); Rl0(d,e,a,b,c, 7);
@@ -113,13 +132,10 @@
132 state[0] += a;
133 state[1] += b;
134 state[2] += c;
135 state[3] += d;
136 state[4] += e;
 
 
 
137 }
138
139
140 /*
141 * SHA1Init - Initialize new context
@@ -192,18 +208,17 @@
208 ** digest is stored in the first 20 bytes. zBuf should
209 ** be "char zBuf[41]".
210 */
211 static void DigestToBase16(unsigned char *digest, char *zBuf){
212 static char const zEncode[] = "0123456789abcdef";
213 int ix;
214
215 for(ix=0; ix<20; ix++){
216 *zBuf++ = zEncode[(*digest>>4)&0xf];
217 *zBuf++ = zEncode[*digest++ & 0xf];
218 }
219 *zBuf = '\0';
 
220 }
221
222 /*
223 ** The state of a incremental SHA1 checksum computation. Only one
224 ** such computation can be underway at a time, of course.
225

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button