Fossil SCM

fossil-scm / compat / zlib / contrib / crc32vx / crc32_vx.c

Source Blame History 254 lines

6ea30fb…	florian	1	/*
6ea30fb…	florian	2	* Hardware-accelerated CRC-32 variants for Linux on z Systems
6ea30fb…	florian	3	*
6ea30fb…	florian	4	* Use the z/Architecture Vector Extension Facility to accelerate the
6ea30fb…	florian	5	* computing of bitreflected CRC-32 checksums.
6ea30fb…	florian	6	*
6ea30fb…	florian	7	* This CRC-32 implementation algorithm is bitreflected and processes
6ea30fb…	florian	8	* the least-significant bit first (Little-Endian).
6ea30fb…	florian	9	*
6ea30fb…	florian	10	* This code was originally written by Hendrik Brueckner
6ea30fb…	florian	11	* <[email protected]> for use in the Linux kernel and has been
6ea30fb…	florian	12	* relicensed under the zlib license.
6ea30fb…	florian	13	*/
6ea30fb…	florian	14	#define Z_ONCE
6ea30fb…	florian	15	#include "../../zutil.h"
6ea30fb…	florian	16	#include "crc32_vx_hooks.h"
6ea30fb…	florian	17
6ea30fb…	florian	18	#include <stdint.h>
6ea30fb…	florian	19	#include <stdio.h>
6ea30fb…	florian	20	#include <vecintrin.h>
6ea30fb…	florian	21	#include <sys/auxv.h>
6ea30fb…	florian	22
6ea30fb…	florian	23	#ifdef __clang__
6ea30fb…	florian	24	# if ((__clang_major__ == 18) \|\| (__clang_major__ == 19 && (__clang_minor__ < 1 \|\| (__clang_minor__ == 1 && __clang_patchlevel__ < 2))))
6ea30fb…	florian	25	# error crc32_vx optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. \
6ea30fb…	florian	26	Either disable the zlib crc32_vx optimization, or switch to another compiler/compiler version.
6ea30fb…	florian	27	# endif
6ea30fb…	florian	28	#endif
6ea30fb…	florian	29
6ea30fb…	florian	30	#define VX_MIN_LEN 64
6ea30fb…	florian	31	#define VX_ALIGNMENT 16L
6ea30fb…	florian	32	#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
6ea30fb…	florian	33
6ea30fb…	florian	34	typedef unsigned char uv16qi __attribute__((vector_size(16)));
6ea30fb…	florian	35	typedef unsigned int uv4si __attribute__((vector_size(16)));
6ea30fb…	florian	36	typedef unsigned long long uv2di __attribute__((vector_size(16)));
6ea30fb…	florian	37
6ea30fb…	florian	38	local uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
6ea30fb…	florian	39	/*
6ea30fb…	florian	40	* The CRC-32 constant block contains reduction constants to fold and
6ea30fb…	florian	41	* process particular chunks of the input data stream in parallel.
6ea30fb…	florian	42	*
6ea30fb…	florian	43	* For the CRC-32 variants, the constants are precomputed according to
6ea30fb…	florian	44	* these definitions:
6ea30fb…	florian	45	*
6ea30fb…	florian	46	* R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
6ea30fb…	florian	47	* R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
6ea30fb…	florian	48	* R3 = [(x128+32 mod P'(x) << 32)]' << 1
6ea30fb…	florian	49	* R4 = [(x128-32 mod P'(x) << 32)]' << 1
6ea30fb…	florian	50	* R5 = [(x64 mod P'(x) << 32)]' << 1
6ea30fb…	florian	51	* R6 = [(x32 mod P'(x) << 32)]' << 1
6ea30fb…	florian	52	*
6ea30fb…	florian	53	* The bitreflected Barret reduction constant, u', is defined as
6ea30fb…	florian	54	* the bit reversal of floor(x**64 / P(x)).
6ea30fb…	florian	55	*
6ea30fb…	florian	56	* where P(x) is the polynomial in the normal domain and the P'(x) is the
6ea30fb…	florian	57	* polynomial in the reversed (bitreflected) domain.
6ea30fb…	florian	58	*
6ea30fb…	florian	59	* CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
6ea30fb…	florian	60	*
6ea30fb…	florian	61	* P(x) = 0x04C11DB7
6ea30fb…	florian	62	* P'(x) = 0xEDB88320
6ea30fb…	florian	63	*/
6ea30fb…	florian	64	const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
6ea30fb…	florian	65	const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
6ea30fb…	florian	66	const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
6ea30fb…	florian	67	const uv2di r5 = {0, 0x163CD6124}; /* R5 */
6ea30fb…	florian	68	const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
6ea30fb…	florian	69	const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
6ea30fb…	florian	70
6ea30fb…	florian	71	/*
6ea30fb…	florian	72	* Load the initial CRC value.
6ea30fb…	florian	73	*
6ea30fb…	florian	74	* The CRC value is loaded into the rightmost word of the
6ea30fb…	florian	75	* vector register and is later XORed with the LSB portion
6ea30fb…	florian	76	* of the loaded input data.
6ea30fb…	florian	77	*/
6ea30fb…	florian	78	uv2di v0 = {0, 0};
6ea30fb…	florian	79	v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
6ea30fb…	florian	80
6ea30fb…	florian	81	/* Load a 64-byte data chunk and XOR with CRC */
6ea30fb…	florian	82	uv2di v1 = vec_perm(((uv2di )buf)[0], ((uv2di )buf)[0], perm_le2be);
6ea30fb…	florian	83	uv2di v2 = vec_perm(((uv2di )buf)[1], ((uv2di )buf)[1], perm_le2be);
6ea30fb…	florian	84	uv2di v3 = vec_perm(((uv2di )buf)[2], ((uv2di )buf)[2], perm_le2be);
6ea30fb…	florian	85	uv2di v4 = vec_perm(((uv2di )buf)[3], ((uv2di )buf)[3], perm_le2be);
6ea30fb…	florian	86
6ea30fb…	florian	87	v1 ^= v0;
6ea30fb…	florian	88	buf += 64;
6ea30fb…	florian	89	len -= 64;
6ea30fb…	florian	90
6ea30fb…	florian	91	while (len >= 64) {
6ea30fb…	florian	92	/* Load the next 64-byte data chunk */
6ea30fb…	florian	93	uv16qi part1 = vec_perm(((uv16qi )buf)[0], ((uv16qi )buf)[0], perm_le2be);
6ea30fb…	florian	94	uv16qi part2 = vec_perm(((uv16qi )buf)[1], ((uv16qi )buf)[1], perm_le2be);
6ea30fb…	florian	95	uv16qi part3 = vec_perm(((uv16qi )buf)[2], ((uv16qi )buf)[2], perm_le2be);
6ea30fb…	florian	96	uv16qi part4 = vec_perm(((uv16qi )buf)[3], ((uv16qi )buf)[3], perm_le2be);
6ea30fb…	florian	97
6ea30fb…	florian	98	/*
6ea30fb…	florian	99	* Perform a GF(2) multiplication of the doublewords in V1 with
6ea30fb…	florian	100	* the R1 and R2 reduction constants in V0. The intermediate result
6ea30fb…	florian	101	* is then folded (accumulated) with the next data chunk in PART1 and
6ea30fb…	florian	102	* stored in V1. Repeat this step for the register contents
6ea30fb…	florian	103	* in V2, V3, and V4 respectively.
6ea30fb…	florian	104	*/
6ea30fb…	florian	105	v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
6ea30fb…	florian	106	v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
6ea30fb…	florian	107	v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
6ea30fb…	florian	108	v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
6ea30fb…	florian	109
6ea30fb…	florian	110	buf += 64;
6ea30fb…	florian	111	len -= 64;
6ea30fb…	florian	112	}
6ea30fb…	florian	113
6ea30fb…	florian	114	/*
6ea30fb…	florian	115	* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
6ea30fb…	florian	116	* and R4 and accumulating the next 128-bit chunk until a single 128-bit
6ea30fb…	florian	117	* value remains.
6ea30fb…	florian	118	*/
6ea30fb…	florian	119	v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
6ea30fb…	florian	120	v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
6ea30fb…	florian	121	v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
6ea30fb…	florian	122
6ea30fb…	florian	123	while (len >= 16) {
6ea30fb…	florian	124	/* Load next data chunk */
6ea30fb…	florian	125	v2 = vec_perm((uv2di )buf, (uv2di )buf, perm_le2be);
6ea30fb…	florian	126
6ea30fb…	florian	127	/* Fold next data chunk */
6ea30fb…	florian	128	v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
6ea30fb…	florian	129
6ea30fb…	florian	130	buf += 16;
6ea30fb…	florian	131	len -= 16;
6ea30fb…	florian	132	}
6ea30fb…	florian	133
6ea30fb…	florian	134	/*
6ea30fb…	florian	135	* Set up a vector register for byte shifts. The shift value must
6ea30fb…	florian	136	* be loaded in bits 1-4 in byte element 7 of a vector register.
6ea30fb…	florian	137	* Shift by 8 bytes: 0x40
6ea30fb…	florian	138	* Shift by 4 bytes: 0x20
6ea30fb…	florian	139	*/
6ea30fb…	florian	140	uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
6ea30fb…	florian	141	v9 = vec_insert((unsigned char)0x40, v9, 7);
6ea30fb…	florian	142
6ea30fb…	florian	143	/*
6ea30fb…	florian	144	* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
6ea30fb…	florian	145	* to move R4 into the rightmost doubleword and set the leftmost
6ea30fb…	florian	146	* doubleword to 0x1.
6ea30fb…	florian	147	*/
6ea30fb…	florian	148	v0 = vec_srb(r4r3, (uv2di)v9);
6ea30fb…	florian	149	v0[0] = 1;
6ea30fb…	florian	150
6ea30fb…	florian	151	/*
6ea30fb…	florian	152	* Compute GF(2) product of V1 and V0. The rightmost doubleword
6ea30fb…	florian	153	* of V1 is multiplied with R4. The leftmost doubleword of V1 is
6ea30fb…	florian	154	* multiplied by 0x1 and is then XORed with rightmost product.
6ea30fb…	florian	155	* Implicitly, the intermediate leftmost product becomes padded
6ea30fb…	florian	156	*/
6ea30fb…	florian	157	v1 = (uv2di)vec_gfmsum_128(v0, v1);
6ea30fb…	florian	158
6ea30fb…	florian	159	/*
6ea30fb…	florian	160	* Now do the final 32-bit fold by multiplying the rightmost word
6ea30fb…	florian	161	* in V1 with R5 and XOR the result with the remaining bits in V1.
6ea30fb…	florian	162	*
6ea30fb…	florian	163	* To achieve this by a single VGFMAG, right shift V1 by a word
6ea30fb…	florian	164	* and store the result in V2 which is then accumulated. Use the
6ea30fb…	florian	165	* vector unpack instruction to load the rightmost half of the
6ea30fb…	florian	166	* doubleword into the rightmost doubleword element of V1; the other
6ea30fb…	florian	167	* half is loaded in the leftmost doubleword.
6ea30fb…	florian	168	* The vector register with CONST_R5 contains the R5 constant in the
6ea30fb…	florian	169	* rightmost doubleword and the leftmost doubleword is zero to ignore
6ea30fb…	florian	170	* the leftmost product of V1.
6ea30fb…	florian	171	*/
6ea30fb…	florian	172	v9 = vec_insert((unsigned char)0x20, v9, 7);
6ea30fb…	florian	173	v2 = vec_srb(v1, (uv2di)v9);
6ea30fb…	florian	174	v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
6ea30fb…	florian	175	v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
6ea30fb…	florian	176
6ea30fb…	florian	177	/*
6ea30fb…	florian	178	* Apply a Barret reduction to compute the final 32-bit CRC value.
6ea30fb…	florian	179	*
6ea30fb…	florian	180	* The input values to the Barret reduction are the degree-63 polynomial
6ea30fb…	florian	181	* in V1 (R(x)), degree-32 generator polynomial, and the reduction
6ea30fb…	florian	182	* constant u. The Barret reduction result is the CRC value of R(x) mod
6ea30fb…	florian	183	* P(x).
6ea30fb…	florian	184	*
6ea30fb…	florian	185	* The Barret reduction algorithm is defined as:
6ea30fb…	florian	186	*
6ea30fb…	florian	187	* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
6ea30fb…	florian	188	* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
6ea30fb…	florian	189	* 3. C(x) = R(x) XOR T2(x) mod x^32
6ea30fb…	florian	190	*
6ea30fb…	florian	191	* Note: The leftmost doubleword of vector register containing
6ea30fb…	florian	192	* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
6ea30fb…	florian	193	* is zero and does not contribute to the final result.
6ea30fb…	florian	194	*/
6ea30fb…	florian	195
6ea30fb…	florian	196	/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
6ea30fb…	florian	197	v2 = vec_unpackl((uv4si)v1);
6ea30fb…	florian	198	v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
6ea30fb…	florian	199
6ea30fb…	florian	200	/*
6ea30fb…	florian	201	* Compute the GF(2) product of the CRC polynomial with T1(x) in
6ea30fb…	florian	202	* V2 and XOR the intermediate result, T2(x), with the value in V1.
6ea30fb…	florian	203	* The final result is stored in word element 2 of V2.
6ea30fb…	florian	204	*/
6ea30fb…	florian	205	v2 = vec_unpackl((uv4si)v2);
6ea30fb…	florian	206	v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
6ea30fb…	florian	207
6ea30fb…	florian	208	return ((uv4si)v2)[2];
6ea30fb…	florian	209	}
6ea30fb…	florian	210
6ea30fb…	florian	211
6ea30fb…	florian	212	local unsigned long s390_crc32_vx(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
6ea30fb…	florian	213	{
6ea30fb…	florian	214	uintptr_t prealign, aligned, remaining;
6ea30fb…	florian	215
6ea30fb…	florian	216	if (buf == Z_NULL) return 0UL;
6ea30fb…	florian	217
6ea30fb…	florian	218	if (len < VX_MIN_LEN + VX_ALIGN_MASK)
6ea30fb…	florian	219	return crc32_z(crc, buf, len);
6ea30fb…	florian	220
6ea30fb…	florian	221	if ((uintptr_t)buf & VX_ALIGN_MASK) {
6ea30fb…	florian	222	prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
6ea30fb…	florian	223	len -= prealign;
6ea30fb…	florian	224	crc = crc32_z(crc, buf, prealign);
6ea30fb…	florian	225	buf += prealign;
6ea30fb…	florian	226	}
6ea30fb…	florian	227	aligned = len & ~VX_ALIGN_MASK;
6ea30fb…	florian	228	remaining = len & VX_ALIGN_MASK;
6ea30fb…	florian	229
6ea30fb…	florian	230	crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
6ea30fb…	florian	231
6ea30fb…	florian	232	if (remaining)
6ea30fb…	florian	233	crc = crc32_z(crc, buf + aligned, remaining);
6ea30fb…	florian	234
6ea30fb…	florian	235	return crc;
6ea30fb…	florian	236	}
6ea30fb…	florian	237
6ea30fb…	florian	238	local z_once_t s390_crc32_made = Z_ONCE_INIT;
6ea30fb…	florian	239	local void s390_crc32_setup() {
6ea30fb…	florian	240	unsigned long hwcap = getauxval(AT_HWCAP);
6ea30fb…	florian	241
6ea30fb…	florian	242	if (hwcap & HWCAP_S390_VX)
6ea30fb…	florian	243	crc32_z_hook = s390_crc32_vx;
6ea30fb…	florian	244	else
6ea30fb…	florian	245	crc32_z_hook = crc32_z;
6ea30fb…	florian	246	}
6ea30fb…	florian	247
6ea30fb…	florian	248	local unsigned long s390_crc32_init(unsigned long crc, const unsigned char FAR *buf, z_size_t len)
6ea30fb…	florian	249	{
6ea30fb…	florian	250	z_once(&s390_crc32_made,s390_crc32_setup);
6ea30fb…	florian	251	return crc32_z_hook(crc, buf, len);
6ea30fb…	florian	252	}
6ea30fb…	florian	253
6ea30fb…	florian	254	ZLIB_INTERNAL unsigned long (crc32_z_hook)(unsigned long crc, const unsigned char FAR buf, z_size_t len) = s390_crc32_init;

Fossil SCM

Keyboard Shortcuts