Fossil SCM

fossil-scm / compat / zlib / contrib / crc32vx / crc32_vx.c
Source Blame History 254 lines
6ea30fb… florian 1 /*
6ea30fb… florian 2 * Hardware-accelerated CRC-32 variants for Linux on z Systems
6ea30fb… florian 3 *
6ea30fb… florian 4 * Use the z/Architecture Vector Extension Facility to accelerate the
6ea30fb… florian 5 * computing of bitreflected CRC-32 checksums.
6ea30fb… florian 6 *
6ea30fb… florian 7 * This CRC-32 implementation algorithm is bitreflected and processes
6ea30fb… florian 8 * the least-significant bit first (Little-Endian).
6ea30fb… florian 9 *
6ea30fb… florian 10 * This code was originally written by Hendrik Brueckner
6ea30fb… florian 11 * <[email protected]> for use in the Linux kernel and has been
6ea30fb… florian 12 * relicensed under the zlib license.
6ea30fb… florian 13 */
6ea30fb… florian 14 #define Z_ONCE
6ea30fb… florian 15 #include "../../zutil.h"
6ea30fb… florian 16 #include "crc32_vx_hooks.h"
6ea30fb… florian 17
6ea30fb… florian 18 #include <stdint.h>
6ea30fb… florian 19 #include <stdio.h>
6ea30fb… florian 20 #include <vecintrin.h>
6ea30fb… florian 21 #include <sys/auxv.h>
6ea30fb… florian 22
6ea30fb… florian 23 #ifdef __clang__
6ea30fb… florian 24 # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2))))
6ea30fb… florian 25 # error crc32_vx optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. \
6ea30fb… florian 26 Either disable the zlib crc32_vx optimization, or switch to another compiler/compiler version.
6ea30fb… florian 27 # endif
6ea30fb… florian 28 #endif
6ea30fb… florian 29
6ea30fb… florian 30 #define VX_MIN_LEN 64
6ea30fb… florian 31 #define VX_ALIGNMENT 16L
6ea30fb… florian 32 #define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
6ea30fb… florian 33
6ea30fb… florian 34 typedef unsigned char uv16qi __attribute__((vector_size(16)));
6ea30fb… florian 35 typedef unsigned int uv4si __attribute__((vector_size(16)));
6ea30fb… florian 36 typedef unsigned long long uv2di __attribute__((vector_size(16)));
6ea30fb… florian 37
6ea30fb… florian 38 local uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
6ea30fb… florian 39 /*
6ea30fb… florian 40 * The CRC-32 constant block contains reduction constants to fold and
6ea30fb… florian 41 * process particular chunks of the input data stream in parallel.
6ea30fb… florian 42 *
6ea30fb… florian 43 * For the CRC-32 variants, the constants are precomputed according to
6ea30fb… florian 44 * these definitions:
6ea30fb… florian 45 *
6ea30fb… florian 46 * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
6ea30fb… florian 47 * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
6ea30fb… florian 48 * R3 = [(x128+32 mod P'(x) << 32)]' << 1
6ea30fb… florian 49 * R4 = [(x128-32 mod P'(x) << 32)]' << 1
6ea30fb… florian 50 * R5 = [(x64 mod P'(x) << 32)]' << 1
6ea30fb… florian 51 * R6 = [(x32 mod P'(x) << 32)]' << 1
6ea30fb… florian 52 *
6ea30fb… florian 53 * The bitreflected Barret reduction constant, u', is defined as
6ea30fb… florian 54 * the bit reversal of floor(x**64 / P(x)).
6ea30fb… florian 55 *
6ea30fb… florian 56 * where P(x) is the polynomial in the normal domain and the P'(x) is the
6ea30fb… florian 57 * polynomial in the reversed (bitreflected) domain.
6ea30fb… florian 58 *
6ea30fb… florian 59 * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
6ea30fb… florian 60 *
6ea30fb… florian 61 * P(x) = 0x04C11DB7
6ea30fb… florian 62 * P'(x) = 0xEDB88320
6ea30fb… florian 63 */
6ea30fb… florian 64 const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
6ea30fb… florian 65 const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
6ea30fb… florian 66 const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
6ea30fb… florian 67 const uv2di r5 = {0, 0x163CD6124}; /* R5 */
6ea30fb… florian 68 const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
6ea30fb… florian 69 const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
6ea30fb… florian 70
6ea30fb… florian 71 /*
6ea30fb… florian 72 * Load the initial CRC value.
6ea30fb… florian 73 *
6ea30fb… florian 74 * The CRC value is loaded into the rightmost word of the
6ea30fb… florian 75 * vector register and is later XORed with the LSB portion
6ea30fb… florian 76 * of the loaded input data.
6ea30fb… florian 77 */
6ea30fb… florian 78 uv2di v0 = {0, 0};
6ea30fb… florian 79 v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
6ea30fb… florian 80
6ea30fb… florian 81 /* Load a 64-byte data chunk and XOR with CRC */
6ea30fb… florian 82 uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
6ea30fb… florian 83 uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
6ea30fb… florian 84 uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
6ea30fb… florian 85 uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
6ea30fb… florian 86
6ea30fb… florian 87 v1 ^= v0;
6ea30fb… florian 88 buf += 64;
6ea30fb… florian 89 len -= 64;
6ea30fb… florian 90
6ea30fb… florian 91 while (len >= 64) {
6ea30fb… florian 92 /* Load the next 64-byte data chunk */
6ea30fb… florian 93 uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
6ea30fb… florian 94 uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
6ea30fb… florian 95 uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
6ea30fb… florian 96 uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
6ea30fb… florian 97
6ea30fb… florian 98 /*
6ea30fb… florian 99 * Perform a GF(2) multiplication of the doublewords in V1 with
6ea30fb… florian 100 * the R1 and R2 reduction constants in V0. The intermediate result
6ea30fb… florian 101 * is then folded (accumulated) with the next data chunk in PART1 and
6ea30fb… florian 102 * stored in V1. Repeat this step for the register contents
6ea30fb… florian 103 * in V2, V3, and V4 respectively.
6ea30fb… florian 104 */
6ea30fb… florian 105 v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
6ea30fb… florian 106 v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
6ea30fb… florian 107 v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
6ea30fb… florian 108 v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
6ea30fb… florian 109
6ea30fb… florian 110 buf += 64;
6ea30fb… florian 111 len -= 64;
6ea30fb… florian 112 }
6ea30fb… florian 113
6ea30fb… florian 114 /*
6ea30fb… florian 115 * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
6ea30fb… florian 116 * and R4 and accumulating the next 128-bit chunk until a single 128-bit
6ea30fb… florian 117 * value remains.
6ea30fb… florian 118 */
6ea30fb… florian 119 v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
6ea30fb… florian 120 v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
6ea30fb… florian 121 v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
6ea30fb… florian 122
6ea30fb… florian 123 while (len >= 16) {
6ea30fb… florian 124 /* Load next data chunk */
6ea30fb… florian 125 v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
6ea30fb… florian 126
6ea30fb… florian 127 /* Fold next data chunk */
6ea30fb… florian 128 v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
6ea30fb… florian 129
6ea30fb… florian 130 buf += 16;
6ea30fb… florian 131 len -= 16;
6ea30fb… florian 132 }
6ea30fb… florian 133
6ea30fb… florian 134 /*
6ea30fb… florian 135 * Set up a vector register for byte shifts. The shift value must
6ea30fb… florian 136 * be loaded in bits 1-4 in byte element 7 of a vector register.
6ea30fb… florian 137 * Shift by 8 bytes: 0x40
6ea30fb… florian 138 * Shift by 4 bytes: 0x20
6ea30fb… florian 139 */
6ea30fb… florian 140 uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
6ea30fb… florian 141 v9 = vec_insert((unsigned char)0x40, v9, 7);
6ea30fb… florian 142
6ea30fb… florian 143 /*
6ea30fb… florian 144 * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
6ea30fb… florian 145 * to move R4 into the rightmost doubleword and set the leftmost
6ea30fb… florian 146 * doubleword to 0x1.
6ea30fb… florian 147 */
6ea30fb… florian 148 v0 = vec_srb(r4r3, (uv2di)v9);
6ea30fb… florian 149 v0[0] = 1;
6ea30fb… florian 150
6ea30fb… florian 151 /*
6ea30fb… florian 152 * Compute GF(2) product of V1 and V0. The rightmost doubleword
6ea30fb… florian 153 * of V1 is multiplied with R4. The leftmost doubleword of V1 is
6ea30fb… florian 154 * multiplied by 0x1 and is then XORed with rightmost product.
6ea30fb… florian 155 * Implicitly, the intermediate leftmost product becomes padded
6ea30fb… florian 156 */
6ea30fb… florian 157 v1 = (uv2di)vec_gfmsum_128(v0, v1);
6ea30fb… florian 158
6ea30fb… florian 159 /*
6ea30fb… florian 160 * Now do the final 32-bit fold by multiplying the rightmost word
6ea30fb… florian 161 * in V1 with R5 and XOR the result with the remaining bits in V1.
6ea30fb… florian 162 *
6ea30fb… florian 163 * To achieve this by a single VGFMAG, right shift V1 by a word
6ea30fb… florian 164 * and store the result in V2 which is then accumulated. Use the
6ea30fb… florian 165 * vector unpack instruction to load the rightmost half of the
6ea30fb… florian 166 * doubleword into the rightmost doubleword element of V1; the other
6ea30fb… florian 167 * half is loaded in the leftmost doubleword.
6ea30fb… florian 168 * The vector register with CONST_R5 contains the R5 constant in the
6ea30fb… florian 169 * rightmost doubleword and the leftmost doubleword is zero to ignore
6ea30fb… florian 170 * the leftmost product of V1.
6ea30fb… florian 171 */
6ea30fb… florian 172 v9 = vec_insert((unsigned char)0x20, v9, 7);
6ea30fb… florian 173 v2 = vec_srb(v1, (uv2di)v9);
6ea30fb… florian 174 v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
6ea30fb… florian 175 v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
6ea30fb… florian 176
6ea30fb… florian 177 /*
6ea30fb… florian 178 * Apply a Barret reduction to compute the final 32-bit CRC value.
6ea30fb… florian 179 *
6ea30fb… florian 180 * The input values to the Barret reduction are the degree-63 polynomial
6ea30fb… florian 181 * in V1 (R(x)), degree-32 generator polynomial, and the reduction
6ea30fb… florian 182 * constant u. The Barret reduction result is the CRC value of R(x) mod
6ea30fb… florian 183 * P(x).
6ea30fb… florian 184 *
6ea30fb… florian 185 * The Barret reduction algorithm is defined as:
6ea30fb… florian 186 *
6ea30fb… florian 187 * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
6ea30fb… florian 188 * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
6ea30fb… florian 189 * 3. C(x) = R(x) XOR T2(x) mod x^32
6ea30fb… florian 190 *
6ea30fb… florian 191 * Note: The leftmost doubleword of vector register containing
6ea30fb… florian 192 * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
6ea30fb… florian 193 * is zero and does not contribute to the final result.
6ea30fb… florian 194 */
6ea30fb… florian 195
6ea30fb… florian 196 /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
6ea30fb… florian 197 v2 = vec_unpackl((uv4si)v1);
6ea30fb… florian 198 v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
6ea30fb… florian 199
6ea30fb… florian 200 /*
6ea30fb… florian 201 * Compute the GF(2) product of the CRC polynomial with T1(x) in
6ea30fb… florian 202 * V2 and XOR the intermediate result, T2(x), with the value in V1.
6ea30fb… florian 203 * The final result is stored in word element 2 of V2.
6ea30fb… florian 204 */
6ea30fb… florian 205 v2 = vec_unpackl((uv4si)v2);
6ea30fb… florian 206 v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
6ea30fb… florian 207
6ea30fb… florian 208 return ((uv4si)v2)[2];
6ea30fb… florian 209 }
6ea30fb… florian 210
6ea30fb… florian 211
6ea30fb… florian 212 local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
6ea30fb… florian 213 {
6ea30fb… florian 214 uintptr_t prealign, aligned, remaining;
6ea30fb… florian 215
6ea30fb… florian 216 if (buf == Z_NULL) return 0UL;
6ea30fb… florian 217
6ea30fb… florian 218 if (len < VX_MIN_LEN + VX_ALIGN_MASK)
6ea30fb… florian 219 return crc32_z(crc, buf, len);
6ea30fb… florian 220
6ea30fb… florian 221 if ((uintptr_t)buf & VX_ALIGN_MASK) {
6ea30fb… florian 222 prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
6ea30fb… florian 223 len -= prealign;
6ea30fb… florian 224 crc = crc32_z(crc, buf, prealign);
6ea30fb… florian 225 buf += prealign;
6ea30fb… florian 226 }
6ea30fb… florian 227 aligned = len & ~VX_ALIGN_MASK;
6ea30fb… florian 228 remaining = len & VX_ALIGN_MASK;
6ea30fb… florian 229
6ea30fb… florian 230 crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
6ea30fb… florian 231
6ea30fb… florian 232 if (remaining)
6ea30fb… florian 233 crc = crc32_z(crc, buf + aligned, remaining);
6ea30fb… florian 234
6ea30fb… florian 235 return crc;
6ea30fb… florian 236 }
6ea30fb… florian 237
6ea30fb… florian 238 local z_once_t s390_crc32_made = Z_ONCE_INIT;
6ea30fb… florian 239 local void s390_crc32_setup() {
6ea30fb… florian 240 unsigned long hwcap = getauxval(AT_HWCAP);
6ea30fb… florian 241
6ea30fb… florian 242 if (hwcap & HWCAP_S390_VX)
6ea30fb… florian 243 crc32_z_hook = s390_crc32_vx;
6ea30fb… florian 244 else
6ea30fb… florian 245 crc32_z_hook = crc32_z;
6ea30fb… florian 246 }
6ea30fb… florian 247
6ea30fb… florian 248 local unsigned long s390_crc32_init(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
6ea30fb… florian 249 {
6ea30fb… florian 250 z_once(&s390_crc32_made,s390_crc32_setup);
6ea30fb… florian 251 return crc32_z_hook(crc, buf, len);
6ea30fb… florian 252 }
6ea30fb… florian 253
6ea30fb… florian 254 ZLIB_INTERNAL unsigned long (*crc32_z_hook)(unsigned long crc, const unsigned char FAR *buf, z_size_t len) = s390_crc32_init;

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button