From fcce8d4aa1a14e28d46ac11c44ab15a77e143a0f Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 21 Mar 2020 19:33:18 -0400 Subject: [PATCH] target/arm: Convert PMULL.8 to gvec We still need two different helpers, since NEON and SVE2 get the inputs from different locations within the source vector. However, we can convert both to the same internal form for computation. The sve2 helper is not used yet, but adding it with this patch helps illustrate why the neon changes are helpful. Backports commit e7e96fc5ec8c79dc77fef522d5226ac09f684ba5 from qemu --- qemu/aarch64.h | 3 +- qemu/aarch64eb.h | 3 +- qemu/arm.h | 2 +- qemu/armeb.h | 2 +- qemu/header_gen.py | 3 +- qemu/m68k.h | 2 +- qemu/mips.h | 2 +- qemu/mips64.h | 2 +- qemu/mips64el.h | 2 +- qemu/mipsel.h | 2 +- qemu/powerpc.h | 2 +- qemu/riscv32.h | 2 +- qemu/riscv64.h | 2 +- qemu/sparc.h | 2 +- qemu/sparc64.h | 2 +- qemu/target/arm/helper-sve.h | 2 ++ qemu/target/arm/helper.h | 3 +- qemu/target/arm/neon_helper.c | 33 ------------------ qemu/target/arm/translate-a64.c | 27 ++++++++++----- qemu/target/arm/translate.c | 26 +++++++------- qemu/target/arm/vec_helper.c | 60 +++++++++++++++++++++++++++++++++ qemu/x86_64.h | 2 +- 22 files changed, 114 insertions(+), 72 deletions(-) diff --git a/qemu/aarch64.h b/qemu/aarch64.h index 4273c459..2c6ac67a 100644 --- a/qemu/aarch64.h +++ b/qemu/aarch64.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_aarch64 #define helper_neon_mul_u16 helper_neon_mul_u16_aarch64 #define helper_neon_mul_u8 helper_neon_mul_u8_aarch64 -#define helper_neon_mull_p8 helper_neon_mull_p8_aarch64 #define helper_neon_mull_s16 helper_neon_mull_s16_aarch64 #define helper_neon_mull_s8 helper_neon_mull_s8_aarch64 #define helper_neon_mull_u16 helper_neon_mull_u16_aarch64 @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_aarch64 #define helper_neon_pmin_u16 helper_neon_pmin_u16_aarch64 #define helper_neon_pmin_u8 helper_neon_pmin_u8_aarch64 +#define helper_neon_pmull_h helper_neon_pmull_h_aarch64 #define helper_neon_qabs_s16 helper_neon_qabs_s16_aarch64 #define helper_neon_qabs_s32 helper_neon_qabs_s32_aarch64 #define helper_neon_qabs_s64 helper_neon_qabs_s64_aarch64 @@ -4401,6 +4401,7 @@ #define helper_sve_zip_h helper_sve_zip_h_aarch64 #define helper_sve_zip_s helper_sve_zip_s_aarch64 #define helper_sve_zip_p helper_sve_zip_p_aarch64 +#define helper_sve2_pmull_h helper_sve2_pmull_h_aarch64 #define helper_udiv64 helper_udiv64_aarch64 #define helper_vfp_cmpd_a64 helper_vfp_cmpd_a64_aarch64 #define helper_vfp_cmped_a64 helper_vfp_cmped_a64_aarch64 diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h index 18057d81..076ff7e3 100644 --- a/qemu/aarch64eb.h +++ b/qemu/aarch64eb.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_aarch64eb #define helper_neon_mul_u16 helper_neon_mul_u16_aarch64eb #define helper_neon_mul_u8 helper_neon_mul_u8_aarch64eb -#define helper_neon_mull_p8 helper_neon_mull_p8_aarch64eb #define helper_neon_mull_s16 helper_neon_mull_s16_aarch64eb #define helper_neon_mull_s8 helper_neon_mull_s8_aarch64eb #define helper_neon_mull_u16 helper_neon_mull_u16_aarch64eb @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_aarch64eb #define helper_neon_pmin_u16 helper_neon_pmin_u16_aarch64eb #define helper_neon_pmin_u8 helper_neon_pmin_u8_aarch64eb +#define helper_neon_pmull_h helper_neon_pmull_h_aarch64eb #define helper_neon_qabs_s16 helper_neon_qabs_s16_aarch64eb #define helper_neon_qabs_s32 helper_neon_qabs_s32_aarch64eb #define helper_neon_qabs_s64 helper_neon_qabs_s64_aarch64eb @@ -4401,6 +4401,7 @@ #define helper_sve_zip_h helper_sve_zip_h_aarch64eb #define helper_sve_zip_s helper_sve_zip_s_aarch64eb #define helper_sve_zip_p helper_sve_zip_p_aarch64eb +#define helper_sve2_pmull_h helper_sve2_pmull_h_aarch64eb #define helper_udiv64 helper_udiv64_aarch64eb #define helper_vfp_cmpd_a64 helper_vfp_cmpd_a64_aarch64eb #define helper_vfp_cmped_a64 helper_vfp_cmped_a64_aarch64eb diff --git a/qemu/arm.h b/qemu/arm.h index f6150cec..bfc64fb3 100644 --- a/qemu/arm.h +++ b/qemu/arm.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_arm #define helper_neon_mul_u16 helper_neon_mul_u16_arm #define helper_neon_mul_u8 helper_neon_mul_u8_arm -#define helper_neon_mull_p8 helper_neon_mull_p8_arm #define helper_neon_mull_s16 helper_neon_mull_s16_arm #define helper_neon_mull_s8 helper_neon_mull_s8_arm #define helper_neon_mull_u16 helper_neon_mull_u16_arm @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_arm #define helper_neon_pmin_u16 helper_neon_pmin_u16_arm #define helper_neon_pmin_u8 helper_neon_pmin_u8_arm +#define helper_neon_pmull_h helper_neon_pmull_h_arm #define helper_neon_qabs_s16 helper_neon_qabs_s16_arm #define helper_neon_qabs_s32 helper_neon_qabs_s32_arm #define helper_neon_qabs_s64 helper_neon_qabs_s64_arm diff --git a/qemu/armeb.h b/qemu/armeb.h index dd09f2e3..de1363ff 100644 --- a/qemu/armeb.h +++ b/qemu/armeb.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_armeb #define helper_neon_mul_u16 helper_neon_mul_u16_armeb #define helper_neon_mul_u8 helper_neon_mul_u8_armeb -#define helper_neon_mull_p8 helper_neon_mull_p8_armeb #define helper_neon_mull_s16 helper_neon_mull_s16_armeb #define helper_neon_mull_s8 helper_neon_mull_s8_armeb #define helper_neon_mull_u16 helper_neon_mull_u16_armeb @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_armeb #define helper_neon_pmin_u16 helper_neon_pmin_u16_armeb #define helper_neon_pmin_u8 helper_neon_pmin_u8_armeb +#define helper_neon_pmull_h helper_neon_pmull_h_armeb #define helper_neon_qabs_s16 helper_neon_qabs_s16_armeb #define helper_neon_qabs_s32 helper_neon_qabs_s32_armeb #define helper_neon_qabs_s64 helper_neon_qabs_s64_armeb diff --git a/qemu/header_gen.py b/qemu/header_gen.py index fd588eea..837fe4a6 100644 --- a/qemu/header_gen.py +++ b/qemu/header_gen.py @@ -1535,7 +1535,6 @@ symbols = ( 'helper_neon_min_u8', 'helper_neon_mul_u16', 'helper_neon_mul_u8', - 'helper_neon_mull_p8', 'helper_neon_mull_s16', 'helper_neon_mull_s8', 'helper_neon_mull_u16', @@ -1566,6 +1565,7 @@ symbols = ( 'helper_neon_pmin_s8', 'helper_neon_pmin_u16', 'helper_neon_pmin_u8', + 'helper_neon_pmull_h', 'helper_neon_qabs_s16', 'helper_neon_qabs_s32', 'helper_neon_qabs_s64', @@ -4463,6 +4463,7 @@ aarch64_symbols = ( 'helper_sve_zip_h', 'helper_sve_zip_s', 'helper_sve_zip_p', + 'helper_sve2_pmull_h', 'helper_udiv64', 'helper_vfp_cmpd_a64', 'helper_vfp_cmped_a64', diff --git a/qemu/m68k.h b/qemu/m68k.h index 5e5d870d..643d05f7 100644 --- a/qemu/m68k.h +++ b/qemu/m68k.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_m68k #define helper_neon_mul_u16 helper_neon_mul_u16_m68k #define helper_neon_mul_u8 helper_neon_mul_u8_m68k -#define helper_neon_mull_p8 helper_neon_mull_p8_m68k #define helper_neon_mull_s16 helper_neon_mull_s16_m68k #define helper_neon_mull_s8 helper_neon_mull_s8_m68k #define helper_neon_mull_u16 helper_neon_mull_u16_m68k @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_m68k #define helper_neon_pmin_u16 helper_neon_pmin_u16_m68k #define helper_neon_pmin_u8 helper_neon_pmin_u8_m68k +#define helper_neon_pmull_h helper_neon_pmull_h_m68k #define helper_neon_qabs_s16 helper_neon_qabs_s16_m68k #define helper_neon_qabs_s32 helper_neon_qabs_s32_m68k #define helper_neon_qabs_s64 helper_neon_qabs_s64_m68k diff --git a/qemu/mips.h b/qemu/mips.h index 578d7dd1..acc4146e 100644 --- a/qemu/mips.h +++ b/qemu/mips.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_mips #define helper_neon_mul_u16 helper_neon_mul_u16_mips #define helper_neon_mul_u8 helper_neon_mul_u8_mips -#define helper_neon_mull_p8 helper_neon_mull_p8_mips #define helper_neon_mull_s16 helper_neon_mull_s16_mips #define helper_neon_mull_s8 helper_neon_mull_s8_mips #define helper_neon_mull_u16 helper_neon_mull_u16_mips @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_mips #define helper_neon_pmin_u16 helper_neon_pmin_u16_mips #define helper_neon_pmin_u8 helper_neon_pmin_u8_mips +#define helper_neon_pmull_h helper_neon_pmull_h_mips #define helper_neon_qabs_s16 helper_neon_qabs_s16_mips #define helper_neon_qabs_s32 helper_neon_qabs_s32_mips #define helper_neon_qabs_s64 helper_neon_qabs_s64_mips diff --git a/qemu/mips64.h b/qemu/mips64.h index 8652d37e..4058f353 100644 --- a/qemu/mips64.h +++ b/qemu/mips64.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_mips64 #define helper_neon_mul_u16 helper_neon_mul_u16_mips64 #define helper_neon_mul_u8 helper_neon_mul_u8_mips64 -#define helper_neon_mull_p8 helper_neon_mull_p8_mips64 #define helper_neon_mull_s16 helper_neon_mull_s16_mips64 #define helper_neon_mull_s8 helper_neon_mull_s8_mips64 #define helper_neon_mull_u16 helper_neon_mull_u16_mips64 @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_mips64 #define helper_neon_pmin_u16 helper_neon_pmin_u16_mips64 #define helper_neon_pmin_u8 helper_neon_pmin_u8_mips64 +#define helper_neon_pmull_h helper_neon_pmull_h_mips64 #define helper_neon_qabs_s16 helper_neon_qabs_s16_mips64 #define helper_neon_qabs_s32 helper_neon_qabs_s32_mips64 #define helper_neon_qabs_s64 helper_neon_qabs_s64_mips64 diff --git a/qemu/mips64el.h b/qemu/mips64el.h index 051bb0af..581e41ef 100644 --- a/qemu/mips64el.h +++ b/qemu/mips64el.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_mips64el #define helper_neon_mul_u16 helper_neon_mul_u16_mips64el #define helper_neon_mul_u8 helper_neon_mul_u8_mips64el -#define helper_neon_mull_p8 helper_neon_mull_p8_mips64el #define helper_neon_mull_s16 helper_neon_mull_s16_mips64el #define helper_neon_mull_s8 helper_neon_mull_s8_mips64el #define helper_neon_mull_u16 helper_neon_mull_u16_mips64el @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_mips64el #define helper_neon_pmin_u16 helper_neon_pmin_u16_mips64el #define helper_neon_pmin_u8 helper_neon_pmin_u8_mips64el +#define helper_neon_pmull_h helper_neon_pmull_h_mips64el #define helper_neon_qabs_s16 helper_neon_qabs_s16_mips64el #define helper_neon_qabs_s32 helper_neon_qabs_s32_mips64el #define helper_neon_qabs_s64 helper_neon_qabs_s64_mips64el diff --git a/qemu/mipsel.h b/qemu/mipsel.h index f303f30a..e4ae3f39 100644 --- a/qemu/mipsel.h +++ b/qemu/mipsel.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_mipsel #define helper_neon_mul_u16 helper_neon_mul_u16_mipsel #define helper_neon_mul_u8 helper_neon_mul_u8_mipsel -#define helper_neon_mull_p8 helper_neon_mull_p8_mipsel #define helper_neon_mull_s16 helper_neon_mull_s16_mipsel #define helper_neon_mull_s8 helper_neon_mull_s8_mipsel #define helper_neon_mull_u16 helper_neon_mull_u16_mipsel @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_mipsel #define helper_neon_pmin_u16 helper_neon_pmin_u16_mipsel #define helper_neon_pmin_u8 helper_neon_pmin_u8_mipsel +#define helper_neon_pmull_h helper_neon_pmull_h_mipsel #define helper_neon_qabs_s16 helper_neon_qabs_s16_mipsel #define helper_neon_qabs_s32 helper_neon_qabs_s32_mipsel #define helper_neon_qabs_s64 helper_neon_qabs_s64_mipsel diff --git a/qemu/powerpc.h b/qemu/powerpc.h index 663ad749..5b712ba5 100644 --- a/qemu/powerpc.h +++ b/qemu/powerpc.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_powerpc #define helper_neon_mul_u16 helper_neon_mul_u16_powerpc #define helper_neon_mul_u8 helper_neon_mul_u8_powerpc -#define helper_neon_mull_p8 helper_neon_mull_p8_powerpc #define helper_neon_mull_s16 helper_neon_mull_s16_powerpc #define helper_neon_mull_s8 helper_neon_mull_s8_powerpc #define helper_neon_mull_u16 helper_neon_mull_u16_powerpc @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_powerpc #define helper_neon_pmin_u16 helper_neon_pmin_u16_powerpc #define helper_neon_pmin_u8 helper_neon_pmin_u8_powerpc +#define helper_neon_pmull_h helper_neon_pmull_h_powerpc #define helper_neon_qabs_s16 helper_neon_qabs_s16_powerpc #define helper_neon_qabs_s32 helper_neon_qabs_s32_powerpc #define helper_neon_qabs_s64 helper_neon_qabs_s64_powerpc diff --git a/qemu/riscv32.h b/qemu/riscv32.h index 5e66f042..56dd4025 100644 --- a/qemu/riscv32.h +++ b/qemu/riscv32.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_riscv32 #define helper_neon_mul_u16 helper_neon_mul_u16_riscv32 #define helper_neon_mul_u8 helper_neon_mul_u8_riscv32 -#define helper_neon_mull_p8 helper_neon_mull_p8_riscv32 #define helper_neon_mull_s16 helper_neon_mull_s16_riscv32 #define helper_neon_mull_s8 helper_neon_mull_s8_riscv32 #define helper_neon_mull_u16 helper_neon_mull_u16_riscv32 @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_riscv32 #define helper_neon_pmin_u16 helper_neon_pmin_u16_riscv32 #define helper_neon_pmin_u8 helper_neon_pmin_u8_riscv32 +#define helper_neon_pmull_h helper_neon_pmull_h_riscv32 #define helper_neon_qabs_s16 helper_neon_qabs_s16_riscv32 #define helper_neon_qabs_s32 helper_neon_qabs_s32_riscv32 #define helper_neon_qabs_s64 helper_neon_qabs_s64_riscv32 diff --git a/qemu/riscv64.h b/qemu/riscv64.h index ee9c4d34..070e5824 100644 --- a/qemu/riscv64.h +++ b/qemu/riscv64.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_riscv64 #define helper_neon_mul_u16 helper_neon_mul_u16_riscv64 #define helper_neon_mul_u8 helper_neon_mul_u8_riscv64 -#define helper_neon_mull_p8 helper_neon_mull_p8_riscv64 #define helper_neon_mull_s16 helper_neon_mull_s16_riscv64 #define helper_neon_mull_s8 helper_neon_mull_s8_riscv64 #define helper_neon_mull_u16 helper_neon_mull_u16_riscv64 @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_riscv64 #define helper_neon_pmin_u16 helper_neon_pmin_u16_riscv64 #define helper_neon_pmin_u8 helper_neon_pmin_u8_riscv64 +#define helper_neon_pmull_h helper_neon_pmull_h_riscv64 #define helper_neon_qabs_s16 helper_neon_qabs_s16_riscv64 #define helper_neon_qabs_s32 helper_neon_qabs_s32_riscv64 #define helper_neon_qabs_s64 helper_neon_qabs_s64_riscv64 diff --git a/qemu/sparc.h b/qemu/sparc.h index d0d6eb0f..ffb965be 100644 --- a/qemu/sparc.h +++ b/qemu/sparc.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_sparc #define helper_neon_mul_u16 helper_neon_mul_u16_sparc #define helper_neon_mul_u8 helper_neon_mul_u8_sparc -#define helper_neon_mull_p8 helper_neon_mull_p8_sparc #define helper_neon_mull_s16 helper_neon_mull_s16_sparc #define helper_neon_mull_s8 helper_neon_mull_s8_sparc #define helper_neon_mull_u16 helper_neon_mull_u16_sparc @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_sparc #define helper_neon_pmin_u16 helper_neon_pmin_u16_sparc #define helper_neon_pmin_u8 helper_neon_pmin_u8_sparc +#define helper_neon_pmull_h helper_neon_pmull_h_sparc #define helper_neon_qabs_s16 helper_neon_qabs_s16_sparc #define helper_neon_qabs_s32 helper_neon_qabs_s32_sparc #define helper_neon_qabs_s64 helper_neon_qabs_s64_sparc diff --git a/qemu/sparc64.h b/qemu/sparc64.h index 3454c034..b823f0c7 100644 --- a/qemu/sparc64.h +++ b/qemu/sparc64.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_sparc64 #define helper_neon_mul_u16 helper_neon_mul_u16_sparc64 #define helper_neon_mul_u8 helper_neon_mul_u8_sparc64 -#define helper_neon_mull_p8 helper_neon_mull_p8_sparc64 #define helper_neon_mull_s16 helper_neon_mull_s16_sparc64 #define helper_neon_mull_s8 helper_neon_mull_s8_sparc64 #define helper_neon_mull_u16 helper_neon_mull_u16_sparc64 @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_sparc64 #define helper_neon_pmin_u16 helper_neon_pmin_u16_sparc64 #define helper_neon_pmin_u8 helper_neon_pmin_u8_sparc64 +#define helper_neon_pmull_h helper_neon_pmull_h_sparc64 #define helper_neon_qabs_s16 helper_neon_qabs_s16_sparc64 #define helper_neon_qabs_s32 helper_neon_qabs_s32_sparc64 #define helper_neon_qabs_s64 helper_neon_qabs_s64_sparc64 diff --git a/qemu/target/arm/helper-sve.h b/qemu/target/arm/helper-sve.h index 9e79182a..2f472791 100644 --- a/qemu/target/arm/helper-sve.h +++ b/qemu/target/arm/helper-sve.h @@ -1574,3 +1574,5 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr, tl, i32) DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) diff --git a/qemu/target/arm/helper.h b/qemu/target/arm/helper.h index 0271d075..c7f89b0f 100644 --- a/qemu/target/arm/helper.h +++ b/qemu/target/arm/helper.h @@ -339,7 +339,6 @@ DEF_HELPER_2(neon_sub_u8, i32, i32, i32) DEF_HELPER_2(neon_sub_u16, i32, i32, i32) DEF_HELPER_2(neon_mul_u8, i32, i32, i32) DEF_HELPER_2(neon_mul_u16, i32, i32, i32) -DEF_HELPER_2(neon_mull_p8, i64, i32, i32) DEF_HELPER_2(neon_tst_u8, i32, i32, i32) DEF_HELPER_2(neon_tst_u16, i32, i32, i32) @@ -692,6 +691,8 @@ DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + #ifdef TARGET_ARM #define helper_clz helper_clz_arm #define gen_helper_clz gen_helper_clz_arm diff --git a/qemu/target/arm/neon_helper.c b/qemu/target/arm/neon_helper.c index 540cefa3..8cf09dd1 100644 --- a/qemu/target/arm/neon_helper.c +++ b/qemu/target/arm/neon_helper.c @@ -1141,39 +1141,6 @@ NEON_VOP(mul_u8, neon_u8, 4) NEON_VOP(mul_u16, neon_u16, 2) #undef NEON_FN -/* Polynomial multiplication is like integer multiplication except the - partial products are XORed, not added. */ - -uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2) -{ - uint64_t result = 0; - uint64_t mask; - uint64_t op2ex = op2; - op2ex = (op2ex & 0xff) | - ((op2ex & 0xff00) << 8) | - ((op2ex & 0xff0000) << 16) | - ((op2ex & 0xff000000) << 24); - while (op1) { - mask = 0; - if (op1 & 1) { - mask |= 0xffff; - } - if (op1 & (1 << 8)) { - mask |= (0xffffU << 16); - } - if (op1 & (1 << 16)) { - mask |= (0xffffULL << 32); - } - if (op1 & (1 << 24)) { - mask |= (0xffffULL << 48); - } - result ^= op2ex & mask; - op1 = (op1 >> 1) & 0x7f7f7f7f; - op2ex <<= 1; - } - return result; -} - #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 NEON_VOP(tst_u8, neon_u8, 4) NEON_VOP(tst_u16, neon_u16, 2) diff --git a/qemu/target/arm/translate-a64.c b/qemu/target/arm/translate-a64.c index 72fc409d..6121c1ed 100644 --- a/qemu/target/arm/translate-a64.c +++ b/qemu/target/arm/translate-a64.c @@ -10830,10 +10830,6 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size, gen_helper_neon_addl_saturate_s32(tcg_ctx, tcg_passres, tcg_ctx->cpu_env, tcg_passres, tcg_passres); break; - case 14: /* PMULL */ - assert(size == 0); - gen_helper_neon_mull_p8(tcg_ctx, tcg_passres, tcg_op1, tcg_op2); - break; default: g_assert_not_reached(); } @@ -10999,11 +10995,21 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm); break; case 14: /* PMULL, PMULL2 */ - if (is_u || size == 1 || size == 2) { + if (is_u) { unallocated_encoding(s); return; } - if (size == 3) { + switch (size) { + case 0: /* PMULL.P8 */ + if (!fp_access_check(s)) { + return; + } + /* The Q field specifies lo/hi half input for this insn. */ + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, + gen_helper_neon_pmull_h); + break; + + case 3: /* PMULL.P64 */ if (!dc_isar_feature(aa64_pmull, s)) { unallocated_encoding(s); return; @@ -11014,9 +11020,13 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) /* The Q field specifies lo/hi half input for this insn. */ gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, gen_helper_gvec_pmull_q); - return; + break; + + default: + unallocated_encoding(s); + break; } - goto is_widening; + return; case 9: /* SQDMLAL, SQDMLAL2 */ case 11: /* SQDMLSL, SQDMLSL2 */ case 13: /* SQDMULL, SQDMULL2 */ @@ -11037,7 +11047,6 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) unallocated_encoding(s); return; } - is_widening: if (!fp_access_check(s)) { return; } diff --git a/qemu/target/arm/translate.c b/qemu/target/arm/translate.c index 4135f3d6..a5d53ad5 100644 --- a/qemu/target/arm/translate.c +++ b/qemu/target/arm/translate.c @@ -5999,15 +5999,20 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) return 1; } - /* Handle VMULL.P64 (Polynomial 64x64 to 128 bit multiply) - * outside the loop below as it only performs a single pass. - */ - if (op == 14 && size == 2) { - if (!dc_isar_feature(aa32_pmull, s)) { - return 1; + /* Handle polynomial VMULL in a single pass. */ + if (op == 14) { + if (size == 0) { + /* VMULL.P8 */ + tcg_gen_gvec_3_ool(tcg_ctx, rd_ofs, rn_ofs, rm_ofs, 16, 16, + 0, gen_helper_neon_pmull_h); + } else { + /* VMULL.P64 */ + if (!dc_isar_feature(aa32_pmull, s)) { + return 1; + } + tcg_gen_gvec_3_ool(tcg_ctx, rd_ofs, rn_ofs, rm_ofs, 16, 16, + 0, gen_helper_gvec_pmull_q); } - tcg_gen_gvec_3_ool(tcg_ctx, rd_ofs, rn_ofs, rm_ofs, 16, 16, - 0, gen_helper_gvec_pmull_q); return 0; } @@ -6085,11 +6090,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) /* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */ gen_neon_mull(s, s->V0, tmp, tmp2, size, u); break; - case 14: /* Polynomial VMULL */ - gen_helper_neon_mull_p8(tcg_ctx, s->V0, tmp, tmp2); - tcg_temp_free_i32(tcg_ctx, tmp2); - tcg_temp_free_i32(tcg_ctx, tmp); - break; default: /* 15 is RESERVED: caught earlier */ abort(); } diff --git a/qemu/target/arm/vec_helper.c b/qemu/target/arm/vec_helper.c index 6bf1ec09..e2ca4a42 100644 --- a/qemu/target/arm/vec_helper.c +++ b/qemu/target/arm/vec_helper.c @@ -1198,3 +1198,63 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) } clear_tail(d, opr_sz, simd_maxsz(desc)); } + +/* + * 8x8->16 polynomial multiply. + * + * The byte inputs are expanded to (or extracted from) half-words. + * Note that neon and sve2 get the inputs from different positions. + * This allows 4 bytes to be processed in parallel with uint64_t. + */ + +static uint64_t expand_byte_to_half(uint64_t x) +{ + return (x & 0x000000ff) + | ((x & 0x0000ff00) << 8) + | ((x & 0x00ff0000) << 16) + | ((x & 0xff000000) << 24); +} + +static uint64_t pmull_h(uint64_t op1, uint64_t op2) +{ + uint64_t result = 0; + int i; + + for (i = 0; i < 8; ++i) { + uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; + result ^= op2 & mask; + op1 >>= 1; + op2 <<= 1; + } + return result; +} + +void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + int hi = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + uint64_t nn = n[hi], mm = m[hi]; + + d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + nn >>= 32; + mm >>= 32; + d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + + clear_tail(d, 16, simd_maxsz(desc)); +} + +#ifdef TARGET_AARCH64 +void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + int shift = simd_data(desc) * 8; + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; + uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; + + d[i] = pmull_h(nn, mm); + } +} +#endif diff --git a/qemu/x86_64.h b/qemu/x86_64.h index 5a154d5e..93c0b9f7 100644 --- a/qemu/x86_64.h +++ b/qemu/x86_64.h @@ -1529,7 +1529,6 @@ #define helper_neon_min_u8 helper_neon_min_u8_x86_64 #define helper_neon_mul_u16 helper_neon_mul_u16_x86_64 #define helper_neon_mul_u8 helper_neon_mul_u8_x86_64 -#define helper_neon_mull_p8 helper_neon_mull_p8_x86_64 #define helper_neon_mull_s16 helper_neon_mull_s16_x86_64 #define helper_neon_mull_s8 helper_neon_mull_s8_x86_64 #define helper_neon_mull_u16 helper_neon_mull_u16_x86_64 @@ -1560,6 +1559,7 @@ #define helper_neon_pmin_s8 helper_neon_pmin_s8_x86_64 #define helper_neon_pmin_u16 helper_neon_pmin_u16_x86_64 #define helper_neon_pmin_u8 helper_neon_pmin_u8_x86_64 +#define helper_neon_pmull_h helper_neon_pmull_h_x86_64 #define helper_neon_qabs_s16 helper_neon_qabs_s16_x86_64 #define helper_neon_qabs_s32 helper_neon_qabs_s32_x86_64 #define helper_neon_qabs_s64 helper_neon_qabs_s64_x86_64