/* SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only */
/* Copyright (c) 2022-2025 Brett Sheffield <bacs@librecast.net> */

#include <gf256.h>

/*
 * Method adapted from the technique described in:
 * J. S. Plank and K. M. Greenan and E. L. Miller (2013)
 * "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions"
 * http://web.eecs.utk.edu/~jplank/plank/papers/FAST-2013-GF.html
 */
/* This function "belongs" here, but presently the only compilation unit that
 * uses it is matrix_ssse3.c, so it has been copied there as mul_128() for
 * performance and so we're not relying on -flto.
 * NOTE(review): the names matrix_ssse3.c / mul_128() read as if carried over
 * from a 128-bit SSSE3 variant of this file - confirm which unit actually
 * holds the copy of this 256-bit routine. */
/*
 * Transform all 32 bytes of A using the two 16-byte lookup tables selected by
 * y: each byte's low nibble indexes GF256LR[y][0], its high nibble indexes
 * GF256LR[y][1], and the two looked-up bytes are XORed together.
 * Presumably the tables hold the GF(256) products of y with every low/high
 * nibble value, making the result A * y per byte - verify against the table
 * generator.
 *
 * A: 32 input bytes
 * y: selects the table pair GF256LR[y]
 * returns: the 32 transformed bytes
 */
__m256i gf256_mul_256(__m256i A, uint8_t y)
{
	/* shuffle_epi8 operates within each 128-bit half, so the 16-byte
	 * table is replicated into both halves of the register */
	const __m256i lo_table = _mm256_broadcastsi128_si256(
			_mm_loadu_si128((const __m128i_u *)GF256LR[y][0]));
	const __m256i hi_table = _mm256_broadcastsi128_si256(
			_mm_loadu_si128((const __m128i_u *)GF256LR[y][1]));
	const __m256i nibble = _mm256_set1_epi8(0x0f);

	const __m256i lo_index = _mm256_and_si256(A, nibble);
	/* there is no per-byte shift: the 64-bit shift drags the low nibble of
	 * the neighbouring byte into each byte's top half, and the mask then
	 * throws that away, leaving only the original high nibble */
	const __m256i hi_index = _mm256_and_si256(_mm256_srli_epi64(A, 4), nibble);

	return _mm256_xor_si256(_mm256_shuffle_epi8(hi_table, hi_index),
				_mm256_shuffle_epi8(lo_table, lo_index));
}

void gf256_mul_256_inplace(uint8_t *d, uint8_t y)
{
	__m256i D = _mm256_loadu_si256((const __m256i_u *)d);
	D = gf256_mul_256(D, y);
	_mm256_storeu_si256((__m256i*)d, D);
}
