/*
 *  Armv8-A Cryptographic Extension support functions for Aarch64
 *
 *  Copyright The Mbed TLS Contributors
 *  SPDX-License-Identifier: Apache-2.0
 *
 *  Licensed under the Apache License, Version 2.0 (the "License"); you may
 *  not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8 in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` could be included by any header file, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_ARCH_IS_ARMV8) && defined(__ARM_NEON)

/* Compiler version checks. */
#if defined(__clang__)
#if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#endif
#elif defined(__GNUC__)
#if __GNUC__ < 6
#error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verified that,
 * please update this and the documentation of `MBEDTLS_AESCE_C` in
 * `mbedtls_config.h`. */
#if _MSC_VER < 1929
#error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#endif
#elif defined(__ARMCC_VERSION)
#if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verified that, please update this and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#endif
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>

#if defined(MBEDTLS_ARCH_IS_ARM32)
#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which are
 * listed in [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1) */

#ifndef vreinterpretq_p64_u8
#define vreinterpretq_p64_u8 (poly64x2_t)
#endif
#ifndef vreinterpretq_u8_p128
#define vreinterpretq_u8_p128 (uint8x16_t)
#endif
#ifndef vreinterpretq_u64_p64
#define vreinterpretq_u64_p64 (uint64x2_t)
#endif

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    return (poly64x1_t) vget_low_u64(vreinterpretq_u64_p64(a));
}

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}
#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2 \n\t"
         "rev  r2, r2 \n\t"
         "rbit r3, r3 \n\t"
         "rev  r3, r3 \n\t"
         "rbit r4, r4 \n\t"
         "rev  r4, r4 \n\t"
         "rbit r5, r5 \n\t"
         "rev  r5, r5 \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}
#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#else
#error "Target does not support NEON instructions"
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#if defined(__ARMCOMPILER_VERSION)
#if __ARMCOMPILER_VERSION <= 6090000
#error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#else
#pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#define MBEDTLS_POP_TARGET_PRAGMA
#endif
#elif defined(__clang__)
#pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#define MBEDTLS_POP_TARGET_PRAGMA
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target ("+crypto")
#define MBEDTLS_POP_TARGET_PRAGMA
#elif defined(_MSC_VER)
#error "Required feature(__ARM_FEATURE_AES) is not enabled."
#endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <asm/hwcap.h>
#include <sys/auxv.h>

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char so cannot
     * be vulnerable to non-atomic updates.
     * It is possible that we could end up setting result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
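
/* Note on the round-key bookkeeping in the unrolled sequences below: an AES
 * cipher with Nr rounds consumes Nr + 1 round keys. The unrolled body issues
 * Nr - 1 vaeseq/vaesmcq rounds, then one more vaeseq (AddRoundKey plus the
 * final SubBytes/ShiftRows, no MixColumns) and a closing veorq with the last
 * round key. */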
MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
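/* Illustrative restatement of the argument above: writing InvMixCol for the
 * inverse MixColumns transform, linearity over GF(2^8) gives
 *     InvMixCol(state ^ rk) == InvMixCol(state) ^ InvMixCol(rk),
 * so applying vaesimcq_u8 before the key addition is compensated by storing
 * InvMixCol(rk) in the decryption key schedule (see mbedtls_aesce_inverse_key). */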
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND

static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

    if (mode == MBEDTLS_AES_ENCRYPT) {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    } else {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}
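
/* RotWord from FIPS 197: rotate the four bytes of a round-key word by one
 * byte position. */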
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}
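
/* SubWord from FIPS 197: apply the AES S-box to each byte of a 32-bit word. */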
static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;
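
    /* For example, a 256-bit key gives Nk = 8 and Nr = 14, so the expanded key
     * occupies Nb * (Nr + 1) = 4 * 15 = 60 32-bit words. */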

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];

        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words.*/
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
static inline poly64_t vget_low_p64(poly64x2_t __a)
{
    uint64x2_t tmp = (uint64x2_t) (__a);
    uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0));
    return (poly64_t) (lo);
}
#endif /* !__clang__ && __GNUC__ && __GNUC__ == 5*/

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(__GNUC__) && !defined(__clang__)
/* GCC reports an incompatible type error without the cast: it treats poly64_t
 * and poly64x1_t as different types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, while Clang accepts
 * it with or without the cast, so no cast is used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            vget_low_p64(vreinterpretq_p64_u8(a)),
            vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128b polynomial multiplication on block in GF(2^128) defined by
 * `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need four 64b
 * multiplies to multiply two 128b operands.
 *
 * `poly_mult_128` executes the polynomial multiplication and outputs a 256b
 * result represented by three 128b registers, a layout chosen for code size.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high   128b |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low    128b |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8); /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
 * z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well known approach is to
 * consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the 256-bit
 * operand to be reduced as a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z). That is, we
 * simply multiply the higher part of the operand by r(z) and add it to l(z). If
 * the result is still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. It is for GNUC compatible compilers.
     */
    asm ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00                       */
    m = input.val[1];          /*   :m2:m1:00                       */
    l = input.val[2];          /*   :  :l1:l0                       */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3     */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2     */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2     */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00               */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0       */
    o = veorq_u8(n, f);        /*       o1:o0 = f1:f0 + n1:n0       */
    return veorq_u8(o, g);     /*             = o1:o0 + g1:00       */
}
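
/* Note: GCM uses a bit-reflected representation of GF(2^128) elements, so the
 * inputs and the product below are passed through vrbitq_u8 (bit reversal
 * within each byte) to let the pmull-based multiplication operate on the
 * conventional bit order. */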
/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_ARCH_IS_ARMV8 */

#endif /* MBEDTLS_AESCE_C */