/*
 * Armv8-A Cryptographic Extension support functions for Aarch64
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Reconsider the above after https://reviews.llvm.org/D131064 is merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` is included by common.h, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_AESCE_HAVE_CODE)

/* Compiler version checks. */
#if defined(__clang__)
# if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
# error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
# elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
# error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
# endif
#elif defined(__GNUC__)
# if __GNUC__ < 6
# error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
# endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies that,
 * please update this check and the documentation of `MBEDTLS_AESCE_C` in
 * `mbedtls_config.h`. */
# if _MSC_VER < 1929
# error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
# endif
#elif defined(__ARMCC_VERSION)
# if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies that, please update this check and the documentation
 * of `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
# error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
# elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
# error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
# endif
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
# if defined(__ARMCOMPILER_VERSION)
# if __ARMCOMPILER_VERSION <= 6090000
# error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
# else
# pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
# define MBEDTLS_POP_TARGET_PRAGMA
# endif
# elif defined(__clang__)
# pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
# define MBEDTLS_POP_TARGET_PRAGMA
# elif defined(__GNUC__)
# pragma GCC push_options
# pragma GCC target ("+crypto")
# define MBEDTLS_POP_TARGET_PRAGMA
# elif defined(_MSC_VER)
# error "Required feature (__ARM_FEATURE_AES) is not enabled."
# endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <sys/auxv.h>
#if !defined(HWCAP_NEON)
#define HWCAP_NEON (1 << 12)
#endif
#if !defined(HWCAP2_AES)
#define HWCAP2_AES (1 << 0)
#endif
#if !defined(HWCAP_AES)
#define HWCAP_AES (1 << 3)
#endif
#if !defined(HWCAP_ASIMD)
#define HWCAP_ASIMD (1 << 1)
#endif
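
/* Note: these fallback definitions mirror the Linux kernel's uapi hwcap bit
 * positions: HWCAP_NEON and HWCAP2_AES are 32-bit Arm AT_HWCAP/AT_HWCAP2
 * bits, while HWCAP_AES and HWCAP_ASIMD are AArch64 AT_HWCAP bits, so each
 * pair is only meaningful on its own architecture, as tested below. */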

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be vulnerable to non-atomic updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
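
/* A note on the fall-through dispatch in aesce_encrypt_block() below:
 * AES-256 (14 rounds) enters at the top and runs 13 full AESE+AESMC rounds;
 * AES-192 (12 rounds) jumps to rounds_12 and runs 11; AES-128 (10 rounds)
 * jumps to rounds_10 and runs 9. In every case the last round is handled
 * separately, because FIPS-197 omits MixColumns from the final round. */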

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately, which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
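
/* The linearity argument above as a worked equation, where ^ is XOR
 * (addition in GF(2^8)):
 *
 *     InvMixColumns(state ^ rk) == InvMixColumns(state) ^ InvMixColumns(rk)
 *
 * so applying AESIMC straight to the state and consuming round keys that
 * were pre-processed with InvMixColumns (see mbedtls_aesce_inverse_key()
 * below) gives the same result as the textbook order of operations. */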

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
#endif

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    if (mode == MBEDTLS_AES_DECRYPT) {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    } else
#else
    (void) mode;
#endif
    {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
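/* Layout sketch (an illustration of the loop below, in 16-byte blocks):
 *
 *     invkey[0]  = fwdkey[nr]
 *     invkey[i]  = InvMixColumns(fwdkey[nr - i])   for 0 < i < nr
 *     invkey[nr] = fwdkey[0]
 *
 * i.e. the forward schedule reversed, with InvMixColumns folded into the
 * inner round keys as required by the decryption round ordering above. */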
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}
#endif

static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}
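
/* A quick sanity check of the trick above: with a zero first operand, the
 * initial XOR inside vaeseq_u8 is a no-op, and since the FIPS-197 S-box maps
 * 0x00 to 0x63, aes_sub_word(0x00000000) returns 0x63636363. */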

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const size_t key_len_in_words = key_bit_length / 32; /* Nk */
    const size_t round_key_len_in_words = 4;             /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;   /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);    /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;
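
    /* Concretely: AES-128 has Nk=4, Nr=10 and 44 round-key words (176 bytes);
     * AES-192 has Nk=6, Nr=12 and 52 words; AES-256 has Nk=8, Nr=14 and
     * 60 words. */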

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which
 * are listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1).
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below. */
#define COMMON_MISSING_INTRINSICS

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2          \n\t"
         "rev  r2, r2          \n\t"
         "rbit r3, r3          \n\t"
         "rev  r3, r3          \n\t"
         "rbit r4, r4          \n\t"
         "rev  r4, r4          \n\t"
         "rbit r5, r5          \n\t"
         "rev  r5, r5          \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}
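
/* For intuition: RBIT reverses the bit order of a whole 32-bit register and
 * REV reverses its byte order, so the pair reverses the bits within each
 * byte while keeping bytes in place, e.g. 0x01 (00000001b) becomes
 * 0x80 (10000000b). */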

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */

#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5, and Clang on 32-bit */

#define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;
}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible type error without the cast: it treats
 * poly64_t and poly64x1_t as distinct types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, while Clang accepts
 * it either way; there poly64_t and poly64x1_t are treated as the same type,
 * so no cast is needed. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128b polynomial multiplication on blocks in GF(2^128), defined
 * by `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need four 64b
 * multiplies to generate the full 128b product.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs the
 * 256b result represented by three 128b vectors, a layout chosen to keep
 * the code small.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high 128b   |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low 128b    |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1  */
    l = pmull_low(a, b);   /*   :  :l1:l0 = a0*b0  */
    c = vextq_u8(b, b, 8); /*       c1:c0 = b0:b1  */
    d = pmull_high(a, c);  /*   :d2:d1:00 = a1*b0  */
    e = pmull_low(a, c);   /*   :e2:e1:00 = a0*b1  */
    m = veorq_u8(d, e);    /*   :m2:m1:00 = d + e  */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}
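
/* The schoolbook identity behind poly_mult_128, written out: with
 * a = a1*x^64 + a0 and b = b1*x^64 + b0 (coefficients in GF(2)),
 *
 *     a*b = (a1*b1)*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
 *
 * which is exactly h, m and l above. Addition in GF(2) is XOR and produces
 * no carries, so the three partial products can be kept in separate vectors
 * until reduction. */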

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3:
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known
 * approach is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us
 * to write the 256-bit operand to be reduced as a(z) = h(z)*z^128 + l(z) ≡
 * h(z)*r(z) + l(z). That is, we simply multiply the higher part of the
 * operand by r(z) and add it to l(z). If the result is still larger than
 * 128 bits, we reduce again.
 */
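/* Note on the constant used below: r(z) = z^7 + z^2 + z + 1 corresponds to
 * the byte 0x87 (binary 10000111), which is why MODULO is built by
 * broadcasting 0x87 and shifting it into the low byte of each 64-bit lane. */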
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent the compiler from
     * loading MODULO from memory; this applies to GNUC-compatible
     * compilers. */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00                       */
    m = input.val[1];          /*   :m2:m1:00                       */
    l = input.val[2];          /*   :  :l1:l0                       */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3     */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2     */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2     */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00               */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0       */
    o = veorq_u8(n, f);        /*       o1:o0 = f1:f0 + n1:n0       */
    return veorq_u8(o, g);     /*             = o1:o0 + g1:00       */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}
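
/* The vrbitq_u8 calls above bridge bit orders: NIST SP 800-38D's GHASH
 * treats the most significant bit of each byte as the lowest-degree
 * coefficient, so inputs are bit-reflected before the carry-less multiply
 * and the result is reflected back afterwards. */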

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_AESCE_HAVE_CODE */

#endif /* MBEDTLS_AESCE_C */