diff --git a/qemu/target/arm/neon-dp.decode b/qemu/target/arm/neon-dp.decode
index d66c67ca..4c2f8c77 100644
--- a/qemu/target/arm/neon-dp.decode
+++ b/qemu/target/arm/neon-dp.decode
@@ -180,5 +180,8 @@ VADD_fp_3s       1111 001 0 0 . 0 . .... .... 1101 ... 0 .... @3same_fp
 VSUB_fp_3s       1111 001 0 0 . 1 . .... .... 1101 ... 0 .... @3same_fp
 VPADD_fp_3s      1111 001 1 0 . 0 . .... .... 1101 ... 0 .... @3same_fp_q0
 VABD_fp_3s       1111 001 1 0 . 1 . .... .... 1101 ... 0 .... @3same_fp
+VMLA_fp_3s       1111 001 0 0 . 0 . .... .... 1101 ... 1 .... @3same_fp
+VMLS_fp_3s       1111 001 0 0 . 1 . .... .... 1101 ... 1 .... @3same_fp
+VMUL_fp_3s       1111 001 1 0 . 0 . .... .... 1101 ... 1 .... @3same_fp
 VPMAX_fp_3s      1111 001 1 0 . 0 . .... .... 1111 ... 0 .... @3same_fp_q0
 VPMIN_fp_3s      1111 001 1 0 . 1 . .... .... 1111 ... 0 .... @3same_fp_q0
diff --git a/qemu/target/arm/translate-neon.inc.c b/qemu/target/arm/translate-neon.inc.c
index fb1c5a90..73c6fbd7 100644
--- a/qemu/target/arm/translate-neon.inc.c
+++ b/qemu/target/arm/translate-neon.inc.c
@@ -1040,6 +1040,56 @@ DO_3SAME_PAIR(VPADD, padd_u)
 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
 
+static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
+                        bool reads_vd)
+{
+    /*
+     * FP operations handled elementwise 32 bits at a time; fn is called
+     * once per pass. If reads_vd is true then the old value of Vd is
+     * loaded and passed as fn's first argument (used for multiply-
+     * accumulate type operations); otherwise fn writes over the Vn temp.
+     */
+    TCGv_i32 tmp, tmp2;
+    int pass;
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vn | a->vm) & 0x10)) {
+        return false;
+    }
+
+    if ((a->vn | a->vm | a->vd) & a->q) { /* q=1 with an odd reg number */
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true; /* no per-element code is emitted in this case */
+    }
+
+    TCGv_ptr fpstatus = get_fpstatus_ptr(tcg_ctx, 1);
+    for (pass = 0; pass < (a->q ? 4 : 2); pass++) { /* 4 passes if q, else 2 */
+        tmp = neon_load_reg(s, a->vn, pass);
+        tmp2 = neon_load_reg(s, a->vm, pass);
+        if (reads_vd) {
+            TCGv_i32 tmp_rd = neon_load_reg(s, a->vd, pass);
+            fn(tcg_ctx, tmp_rd, tmp, tmp2, fpstatus);
+            neon_store_reg(s, a->vd, pass, tmp_rd);
+            tcg_temp_free_i32(tcg_ctx, tmp); /* tmp is not stored: free it */
+        } else {
+            fn(tcg_ctx, tmp, tmp, tmp2, fpstatus);
+            neon_store_reg(s, a->vd, pass, tmp); /* presumably frees tmp */
+        }
+        tcg_temp_free_i32(tcg_ctx, tmp2);
+    }
+    tcg_temp_free_ptr(tcg_ctx, fpstatus);
+    return true;
+}
+
 /*
  * For all the functions using this macro, size == 1 means fp16,
  * which is an architecture extension we don't implement yet.
@@ -1067,6 +1117,38 @@ DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s)
 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s)
 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s)
+DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s)
+
+/*
+ * DO_3S_FP defines trans_<INSN>_fp_3s(): size != 0 (fp16, an extension
+ * not implemented yet) is rejected, else do_3same_fp() does the work.
+ */
+#define DO_3S_FP(INSN,FUNC,READS_VD)                                \
+    static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
+    {                                                               \
+        if (a->size != 0) {                                         \
+            /* TODO fp16 support */                                 \
+            return false;                                           \
+        }                                                           \
+        return do_3same_fp(s, a, FUNC, READS_VD);                   \
+    }
+
+static void gen_VMLA_fp_3s(TCGContext *s, TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
+                           TCGv_ptr fpstatus)
+{
+    gen_helper_vfp_muls(s, vn, vn, vm, fpstatus); /* product goes into vn */
+    gen_helper_vfp_adds(s, vd, vd, vn, fpstatus); /* vd = vd + product */
+}
+
+static void gen_VMLS_fp_3s(TCGContext *s, TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
+                           TCGv_ptr fpstatus)
+{
+    gen_helper_vfp_muls(s, vn, vn, vm, fpstatus); /* product goes into vn */
+    gen_helper_vfp_subs(s, vd, vd, vn, fpstatus); /* vd = vd - product */
+}
+
+DO_3S_FP(VMLA, gen_VMLA_fp_3s, true) /* READS_VD: old Vd is an input */
+DO_3S_FP(VMLS, gen_VMLS_fp_3s, true) /* READS_VD: old Vd is an input */
 
 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
 {
diff --git a/qemu/target/arm/translate.c b/qemu/target/arm/translate.c
index 2ac405c3..b13c010c 100644
--- a/qemu/target/arm/translate.c
+++ b/qemu/target/arm/translate.c
@@ -5559,6 +5559,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
         case NEON_3R_VPADD_VQRDMLAH:
         case NEON_3R_VQDMULH_VQRDMULH:
         case NEON_3R_FLOAT_ARITH:
+        case NEON_3R_FLOAT_MULTIPLY:
             /* Already handled by decodetree */
             return 1;
         }
@@ -5605,22 +5606,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
         tmp = neon_load_reg(s, rn, pass);
         tmp2 = neon_load_reg(s, rm, pass);
         switch (op) {
-        case NEON_3R_FLOAT_MULTIPLY:
-        {
-            TCGv_ptr fpstatus = get_fpstatus_ptr(tcg_ctx, 1);
-            gen_helper_vfp_muls(tcg_ctx, tmp, tmp, tmp2, fpstatus);
-            if (!u) {
-                tcg_temp_free_i32(tcg_ctx, tmp2);
-                tmp2 = neon_load_reg(s, rd, pass);
-                if (size == 0) {
-                    gen_helper_vfp_adds(tcg_ctx, tmp, tmp, tmp2, fpstatus);
-                } else {
-                    gen_helper_vfp_subs(tcg_ctx, tmp, tmp2, tmp, fpstatus);
-                }
-            }
-            tcg_temp_free_ptr(tcg_ctx, fpstatus);
-            break;
-        }
         case NEON_3R_FLOAT_CMP:
         {
             TCGv_ptr fpstatus = get_fpstatus_ptr(tcg_ctx, 1);