Fossil SCM

fossil-scm / compat / zlib / contrib / crc32vx / crc32_vx.c

Blame History Raw 255 lines

1	`/*`
2	`* Hardware-accelerated CRC-32 variants for Linux on z Systems`
3	`*`
4	`* Use the z/Architecture Vector Extension Facility to accelerate the`
5	`* computing of bitreflected CRC-32 checksums.`
6	`*`
7	`* This CRC-32 implementation algorithm is bitreflected and processes`
8	`* the least-significant bit first (Little-Endian).`
9	`*`
10	`* This code was originally written by Hendrik Brueckner`
11	`* <[email protected]> for use in the Linux kernel and has been`
12	`* relicensed under the zlib license.`
13	`*/`
14	`#define Z_ONCE`
15	`#include "../../zutil.h"`
16	`#include "crc32_vx_hooks.h"`
17
18	`#include <stdint.h>`
19	`#include <stdio.h>`
20	`#include <vecintrin.h>`
21	`#include <sys/auxv.h>`
22
/*
 * Refuse to build with Clang releases in the range 18.0.0 <= version < 19.1.2,
 * for which the error message below reports a compiler bug affecting this code.
 */
#ifdef __clang__
#  if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2))))
#    error crc32_vx optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. \
Either disable the zlib crc32_vx optimization, or switch to another compiler/compiler version.
#  endif
#endif
29
/*
 * VX_MIN_LEN:    number of bytes crc32_le_vgfm_16() consumes up front; also
 *                part of the size threshold checked in s390_crc32_vx().
 * VX_ALIGNMENT:  buffer alignment (in bytes) established by s390_crc32_vx()
 *                before the vector routine is called.
 * VX_ALIGN_MASK: mask selecting the low address/length bits below
 *                VX_ALIGNMENT.
 */
#define VX_MIN_LEN 64
#define VX_ALIGNMENT 16L
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1L)

/* 16-byte vector types: 16 x byte, 4 x word, 2 x doubleword (all unsigned). */
typedef unsigned char uv16qi __attribute__((vector_size(16)));
typedef unsigned int uv4si __attribute__((vector_size(16)));
typedef unsigned long long uv2di __attribute__((vector_size(16)));
37
38	`local uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {`
39	`/*`
40	`* The CRC-32 constant block contains reduction constants to fold and`
41	`* process particular chunks of the input data stream in parallel.`
42	`*`
43	`* For the CRC-32 variants, the constants are precomputed according to`
44	`* these definitions:`
45	`*`
46	`* R1 = [(x4*128+32 mod P'(x) << 32)]' << 1`
47	`* R2 = [(x4*128-32 mod P'(x) << 32)]' << 1`
48	`* R3 = [(x128+32 mod P'(x) << 32)]' << 1`
49	`* R4 = [(x128-32 mod P'(x) << 32)]' << 1`
50	`* R5 = [(x64 mod P'(x) << 32)]' << 1`
51	`* R6 = [(x32 mod P'(x) << 32)]' << 1`
52	`*`
53	`* The bitreflected Barret reduction constant, u', is defined as`
54	`* the bit reversal of floor(x**64 / P(x)).`
55	`*`
56	`* where P(x) is the polynomial in the normal domain and the P'(x) is the`
57	`* polynomial in the reversed (bitreflected) domain.`
58	`*`
59	`* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:`
60	`*`
61	`* P(x) = 0x04C11DB7`
62	`* P'(x) = 0xEDB88320`
63	`*/`
64	`const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */`
65	`const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */`
66	`const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */`
67	`const uv2di r5 = {0, 0x163CD6124}; /* R5 */`
68	`const uv2di ru_poly = {0, 0x1F7011641}; /* u' */`
69	`const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */`
70
71	`/*`
72	`* Load the initial CRC value.`
73	`*`
74	`* The CRC value is loaded into the rightmost word of the`
75	`* vector register and is later XORed with the LSB portion`
76	`* of the loaded input data.`
77	`*/`
78	`uv2di v0 = {0, 0};`
79	`v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);`
80
81	`/* Load a 64-byte data chunk and XOR with CRC */`
82	`uv2di v1 = vec_perm(((uv2di )buf)[0], ((uv2di )buf)[0], perm_le2be);`
83	`uv2di v2 = vec_perm(((uv2di )buf)[1], ((uv2di )buf)[1], perm_le2be);`
84	`uv2di v3 = vec_perm(((uv2di )buf)[2], ((uv2di )buf)[2], perm_le2be);`
85	`uv2di v4 = vec_perm(((uv2di )buf)[3], ((uv2di )buf)[3], perm_le2be);`
86
87	`v1 ^= v0;`
88	`buf += 64;`
89	`len -= 64;`
90
91	`while (len >= 64) {`
92	`/* Load the next 64-byte data chunk */`
93	`uv16qi part1 = vec_perm(((uv16qi )buf)[0], ((uv16qi )buf)[0], perm_le2be);`
94	`uv16qi part2 = vec_perm(((uv16qi )buf)[1], ((uv16qi )buf)[1], perm_le2be);`
95	`uv16qi part3 = vec_perm(((uv16qi )buf)[2], ((uv16qi )buf)[2], perm_le2be);`
96	`uv16qi part4 = vec_perm(((uv16qi )buf)[3], ((uv16qi )buf)[3], perm_le2be);`
97
98	`/*`
99	`* Perform a GF(2) multiplication of the doublewords in V1 with`
100	`* the R1 and R2 reduction constants in V0. The intermediate result`
101	`* is then folded (accumulated) with the next data chunk in PART1 and`
102	`* stored in V1. Repeat this step for the register contents`
103	`* in V2, V3, and V4 respectively.`
104	`*/`
105	`v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);`
106	`v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);`
107	`v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);`
108	`v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);`
109
110	`buf += 64;`
111	`len -= 64;`
112	`}`
113
114	`/*`
115	`* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3`
116	`* and R4 and accumulating the next 128-bit chunk until a single 128-bit`
117	`* value remains.`
118	`*/`
119	`v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);`
120	`v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);`
121	`v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);`
122
123	`while (len >= 16) {`
124	`/* Load next data chunk */`
125	`v2 = vec_perm((uv2di )buf, (uv2di )buf, perm_le2be);`
126
127	`/* Fold next data chunk */`
128	`v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);`
129
130	`buf += 16;`
131	`len -= 16;`
132	`}`
133
134	`/*`
135	`* Set up a vector register for byte shifts. The shift value must`
136	`* be loaded in bits 1-4 in byte element 7 of a vector register.`
137	`* Shift by 8 bytes: 0x40`
138	`* Shift by 4 bytes: 0x20`
139	`*/`
140	`uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};`
141	`v9 = vec_insert((unsigned char)0x40, v9, 7);`
142
143	`/*`
144	`* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes`
145	`* to move R4 into the rightmost doubleword and set the leftmost`
146	`* doubleword to 0x1.`
147	`*/`
148	`v0 = vec_srb(r4r3, (uv2di)v9);`
149	`v0[0] = 1;`
150
151	`/*`
152	`* Compute GF(2) product of V1 and V0. The rightmost doubleword`
153	`* of V1 is multiplied with R4. The leftmost doubleword of V1 is`
154	`* multiplied by 0x1 and is then XORed with rightmost product.`
155	`* Implicitly, the intermediate leftmost product becomes padded`
156	`*/`
157	`v1 = (uv2di)vec_gfmsum_128(v0, v1);`
158
159	`/*`
160	`* Now do the final 32-bit fold by multiplying the rightmost word`
161	`* in V1 with R5 and XOR the result with the remaining bits in V1.`
162	`*`
163	`* To achieve this by a single VGFMAG, right shift V1 by a word`
164	`* and store the result in V2 which is then accumulated. Use the`
165	`* vector unpack instruction to load the rightmost half of the`
166	`* doubleword into the rightmost doubleword element of V1; the other`
167	`* half is loaded in the leftmost doubleword.`
168	`* The vector register with CONST_R5 contains the R5 constant in the`
169	`* rightmost doubleword and the leftmost doubleword is zero to ignore`
170	`* the leftmost product of V1.`
171	`*/`
172	`v9 = vec_insert((unsigned char)0x20, v9, 7);`
173	`v2 = vec_srb(v1, (uv2di)v9);`
174	`v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */`
175	`v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);`
176
177	`/*`
178	`* Apply a Barret reduction to compute the final 32-bit CRC value.`
179	`*`
180	`* The input values to the Barret reduction are the degree-63 polynomial`
181	`* in V1 (R(x)), degree-32 generator polynomial, and the reduction`
182	`* constant u. The Barret reduction result is the CRC value of R(x) mod`
183	`* P(x).`
184	`*`
185	`* The Barret reduction algorithm is defined as:`
186	`*`
187	`* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u`
188	`* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)`
189	`* 3. C(x) = R(x) XOR T2(x) mod x^32`
190	`*`
191	`* Note: The leftmost doubleword of vector register containing`
192	`* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product`
193	`* is zero and does not contribute to the final result.`
194	`*/`
195
196	`/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */`
197	`v2 = vec_unpackl((uv4si)v1);`
198	`v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);`
199
200	`/*`
201	`* Compute the GF(2) product of the CRC polynomial with T1(x) in`
202	`* V2 and XOR the intermediate result, T2(x), with the value in V1.`
203	`* The final result is stored in word element 2 of V2.`
204	`*/`
205	`v2 = vec_unpackl((uv4si)v2);`
206	`v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);`
207
208	`return ((uv4si)v2)[2];`
209	`}`
210
211
212	`local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)`
213	`{`
214	`uintptr_t prealign, aligned, remaining;`
215
216	`if (buf == Z_NULL) return 0UL;`
217
218	`if (len < VX_MIN_LEN + VX_ALIGN_MASK)`
219	`return crc32_z(crc, buf, len);`
220
221	`if ((uintptr_t)buf & VX_ALIGN_MASK) {`
222	`prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);`
223	`len -= prealign;`
224	`crc = crc32_z(crc, buf, prealign);`
225	`buf += prealign;`
226	`}`
227	`aligned = len & ~VX_ALIGN_MASK;`
228	`remaining = len & VX_ALIGN_MASK;`
229
230	`crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;`
231
232	`if (remaining)`
233	`crc = crc32_z(crc, buf + aligned, remaining);`
234
235	`return crc;`
236	`}`
237
238	`local z_once_t s390_crc32_made = Z_ONCE_INIT;`
239	`local void s390_crc32_setup() {`
240	`unsigned long hwcap = getauxval(AT_HWCAP);`
241
242	`if (hwcap & HWCAP_S390_VX)`
243	`crc32_z_hook = s390_crc32_vx;`
244	`else`
245	`crc32_z_hook = crc32_z;`
246	`}`
247
248	`local unsigned long s390_crc32_init(unsigned long crc, const unsigned char FAR *buf, z_size_t len)`
249	`{`
250	`z_once(&s390_crc32_made,s390_crc32_setup);`
251	`return crc32_z_hook(crc, buf, len);`
252	`}`
253
254	`ZLIB_INTERNAL unsigned long (crc32_z_hook)(unsigned long crc, const unsigned char FAR buf, z_size_t len) = s390_crc32_init;`
255

Fossil SCM

Keyboard Shortcuts