diff --git a/qemu/aarch64.h b/qemu/aarch64.h index fce5481f..3f0af216 100644 --- a/qemu/aarch64.h +++ b/qemu/aarch64.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_aarch64 #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_aarch64 #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_aarch64 +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_aarch64 +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_aarch64 +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_aarch64 +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_aarch64 #define helper_gvec_fmul_d helper_gvec_fmul_d_aarch64 #define helper_gvec_fmul_h helper_gvec_fmul_h_aarch64 #define helper_gvec_fmul_s helper_gvec_fmul_s_aarch64 diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h index 76815cd1..afe6f2d3 100644 --- a/qemu/aarch64eb.h +++ b/qemu/aarch64eb.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_aarch64eb #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_aarch64eb #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_aarch64eb +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_aarch64eb +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_aarch64eb +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_aarch64eb +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_aarch64eb #define helper_gvec_fmul_d helper_gvec_fmul_d_aarch64eb #define helper_gvec_fmul_h helper_gvec_fmul_h_aarch64eb #define helper_gvec_fmul_s helper_gvec_fmul_s_aarch64eb diff --git a/qemu/arm.h b/qemu/arm.h index 894a2b15..2721ccde 100644 --- a/qemu/arm.h +++ b/qemu/arm.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_arm #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_arm #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_arm +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_arm +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_arm +#define helper_gvec_fmlal_idx_a32 
helper_gvec_fmlal_idx_a32_arm +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_arm #define helper_gvec_fmul_d helper_gvec_fmul_d_arm #define helper_gvec_fmul_h helper_gvec_fmul_h_arm #define helper_gvec_fmul_s helper_gvec_fmul_s_arm diff --git a/qemu/armeb.h b/qemu/armeb.h index 2dedfd2d..80a18393 100644 --- a/qemu/armeb.h +++ b/qemu/armeb.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_armeb #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_armeb #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_armeb +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_armeb +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_armeb +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_armeb +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_armeb #define helper_gvec_fmul_d helper_gvec_fmul_d_armeb #define helper_gvec_fmul_h helper_gvec_fmul_h_armeb #define helper_gvec_fmul_s helper_gvec_fmul_s_armeb diff --git a/qemu/header_gen.py b/qemu/header_gen.py index 60d9c560..82bcd67d 100644 --- a/qemu/header_gen.py +++ b/qemu/header_gen.py @@ -1156,6 +1156,10 @@ symbols = ( 'helper_gvec_fmla_idx_d', 'helper_gvec_fmla_idx_h', 'helper_gvec_fmla_idx_s', + 'helper_gvec_fmlal_a32', + 'helper_gvec_fmlal_a64', + 'helper_gvec_fmlal_idx_a32', + 'helper_gvec_fmlal_idx_a64', 'helper_gvec_fmul_d', 'helper_gvec_fmul_h', 'helper_gvec_fmul_s', diff --git a/qemu/m68k.h b/qemu/m68k.h index 100b1ca8..fa802545 100644 --- a/qemu/m68k.h +++ b/qemu/m68k.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_m68k #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_m68k #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_m68k +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_m68k +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_m68k +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_m68k +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_m68k #define helper_gvec_fmul_d 
helper_gvec_fmul_d_m68k #define helper_gvec_fmul_h helper_gvec_fmul_h_m68k #define helper_gvec_fmul_s helper_gvec_fmul_s_m68k diff --git a/qemu/mips.h b/qemu/mips.h index 28f74421..bee595d3 100644 --- a/qemu/mips.h +++ b/qemu/mips.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_mips #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_mips #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_mips +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_mips +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_mips +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_mips +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_mips #define helper_gvec_fmul_d helper_gvec_fmul_d_mips #define helper_gvec_fmul_h helper_gvec_fmul_h_mips #define helper_gvec_fmul_s helper_gvec_fmul_s_mips diff --git a/qemu/mips64.h b/qemu/mips64.h index 26ed4126..62b7f566 100644 --- a/qemu/mips64.h +++ b/qemu/mips64.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_mips64 #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_mips64 #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_mips64 +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_mips64 +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_mips64 +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_mips64 +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_mips64 #define helper_gvec_fmul_d helper_gvec_fmul_d_mips64 #define helper_gvec_fmul_h helper_gvec_fmul_h_mips64 #define helper_gvec_fmul_s helper_gvec_fmul_s_mips64 diff --git a/qemu/mips64el.h b/qemu/mips64el.h index 0db313b7..fdfe192a 100644 --- a/qemu/mips64el.h +++ b/qemu/mips64el.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_mips64el #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_mips64el #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_mips64el +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_mips64el +#define helper_gvec_fmlal_a64 
helper_gvec_fmlal_a64_mips64el +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_mips64el +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_mips64el #define helper_gvec_fmul_d helper_gvec_fmul_d_mips64el #define helper_gvec_fmul_h helper_gvec_fmul_h_mips64el #define helper_gvec_fmul_s helper_gvec_fmul_s_mips64el diff --git a/qemu/mipsel.h b/qemu/mipsel.h index bc681380..83479f98 100644 --- a/qemu/mipsel.h +++ b/qemu/mipsel.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_mipsel #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_mipsel #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_mipsel +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_mipsel +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_mipsel +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_mipsel +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_mipsel #define helper_gvec_fmul_d helper_gvec_fmul_d_mipsel #define helper_gvec_fmul_h helper_gvec_fmul_h_mipsel #define helper_gvec_fmul_s helper_gvec_fmul_s_mipsel diff --git a/qemu/powerpc.h b/qemu/powerpc.h index 78775f1d..5782a984 100644 --- a/qemu/powerpc.h +++ b/qemu/powerpc.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_powerpc #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_powerpc #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_powerpc +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_powerpc +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_powerpc +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_powerpc +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_powerpc #define helper_gvec_fmul_d helper_gvec_fmul_d_powerpc #define helper_gvec_fmul_h helper_gvec_fmul_h_powerpc #define helper_gvec_fmul_s helper_gvec_fmul_s_powerpc diff --git a/qemu/sparc.h b/qemu/sparc.h index 2d280458..0f7d3048 100644 --- a/qemu/sparc.h +++ b/qemu/sparc.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d 
helper_gvec_fmla_idx_d_sparc #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_sparc #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_sparc +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_sparc +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_sparc +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_sparc +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_sparc #define helper_gvec_fmul_d helper_gvec_fmul_d_sparc #define helper_gvec_fmul_h helper_gvec_fmul_h_sparc #define helper_gvec_fmul_s helper_gvec_fmul_s_sparc diff --git a/qemu/sparc64.h b/qemu/sparc64.h index 95b8d658..5e4ae4ad 100644 --- a/qemu/sparc64.h +++ b/qemu/sparc64.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_sparc64 #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_sparc64 #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_sparc64 +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_sparc64 +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_sparc64 +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_sparc64 +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_sparc64 #define helper_gvec_fmul_d helper_gvec_fmul_d_sparc64 #define helper_gvec_fmul_h helper_gvec_fmul_h_sparc64 #define helper_gvec_fmul_s helper_gvec_fmul_s_sparc64 diff --git a/qemu/target/arm/helper.h b/qemu/target/arm/helper.h index 44dbf8d6..1f037e09 100644 --- a/qemu/target/arm/helper.h +++ b/qemu/target/arm/helper.h @@ -679,6 +679,15 @@ DEF_HELPER_FLAGS_5(gvec_sqsub_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(gvec_sqsub_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmlal_a32, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmlal_a64, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_a32, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_a64, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + #ifdef TARGET_ARM #define 
helper_clz helper_clz_arm #define gen_helper_clz gen_helper_clz_arm diff --git a/qemu/target/arm/vec_helper.c b/qemu/target/arm/vec_helper.c index b661e8fb..bec39188 100644 --- a/qemu/target/arm/vec_helper.c +++ b/qemu/target/arm/vec_helper.c @@ -900,3 +900,150 @@ void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, clear_tail(d, oprsz, simd_maxsz(desc)); } +/* + * Convert float16 to float32, raising no exceptions and + * preserving exceptional values, including SNaN. + * This is effectively an unpack+repack operation. + */ +static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) +{ + const int f16_bias = 15; + const int f32_bias = 127; + uint32_t sign = extract32(f16, 15, 1); + uint32_t exp = extract32(f16, 10, 5); + uint32_t frac = extract32(f16, 0, 10); + + if (exp == 0x1f) { + /* Inf or NaN */ + exp = 0xff; + } else if (exp == 0) { + /* Zero or denormal. */ + if (frac != 0) { + if (fz16) { + frac = 0; + } else { + /* + * Denormal; these are all normal float32. + * Shift the fraction so that the msb is at bit 10 (counting + * from 0), then remove bit 10 as the implicit bit of the + * normalized float32. Note that we still go through + * the shift for normal numbers below, to put the + * float32 fraction at the right place. + */ + int shift = clz32(frac) - 21; + frac = (frac << shift) & 0x3ff; + exp = f32_bias - f16_bias - shift + 1; + } + } + } else { + /* Normal number; adjust the bias. */ + exp += f32_bias - f16_bias; + } + sign <<= 31; + exp <<= 23; + frac <<= 23 - 10; + + return sign | exp | frac; +} + +static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) +{ + /* + * Branchless load of u32[0], u64[0], u32[1], or u64[1]. + * Load the 2nd qword iff is_q & is_2. + * Shift to the 2nd dword iff !is_q & is_2. + * For !is_q & !is_2, the upper bits of the result are garbage. + */ + return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); +} + +/* + * Note that FMLAL requires oprsz == 8 or oprsz == 16, + * as there are not yet SVE versions that might use blocking.
+ */ + +static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, + uint32_t desc, bool fz16) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + int is_q = oprsz == 16; + uint64_t n_4, m_4; + + /* Pre-load all of the f16 data, avoiding overlap issues. */ + n_4 = load4_f16(vn, is_q, is_2); + m_4 = load4_f16(vm, is_q, is_2); + + /* Negate all inputs for FMLSL at once. */ + if (is_s) { + n_4 ^= 0x8000800080008000ull; + } + + for (i = 0; i < oprsz / 4; i++) { + float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); + float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); + d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, + uint32_t desc, bool fz16) +{ + intptr_t i, oprsz = simd_oprsz(desc); + int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); + int is_q = oprsz == 16; + uint64_t n_4; + float32 m_1; + + /* Pre-load all of the f16 data, avoiding overlap issues. */ + n_4 = load4_f16(vn, is_q, is_2); + + /* Negate all inputs for FMLSL at once. 
*/ + if (is_s) { + n_4 ^= 0x8000800080008000ull; + } + + m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); + + for (i = 0; i < oprsz / 4; i++) { + float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); + d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); + } + clear_tail(d, oprsz, simd_maxsz(desc)); +} + +void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} + +void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, + void *venv, uint32_t desc) +{ + CPUARMState *env = venv; + do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, + get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); +} diff --git a/qemu/x86_64.h b/qemu/x86_64.h index 58894f6e..c828445f 100644 --- a/qemu/x86_64.h +++ b/qemu/x86_64.h @@ -1150,6 +1150,10 @@ #define helper_gvec_fmla_idx_d helper_gvec_fmla_idx_d_x86_64 #define helper_gvec_fmla_idx_h helper_gvec_fmla_idx_h_x86_64 #define helper_gvec_fmla_idx_s helper_gvec_fmla_idx_s_x86_64 +#define helper_gvec_fmlal_a32 helper_gvec_fmlal_a32_x86_64 +#define helper_gvec_fmlal_a64 helper_gvec_fmlal_a64_x86_64 +#define helper_gvec_fmlal_idx_a32 helper_gvec_fmlal_idx_a32_x86_64 +#define helper_gvec_fmlal_idx_a64 helper_gvec_fmlal_idx_a64_x86_64 #define helper_gvec_fmul_d helper_gvec_fmul_d_x86_64 #define helper_gvec_fmul_h helper_gvec_fmul_h_x86_64 #define helper_gvec_fmul_s helper_gvec_fmul_s_x86_64