target/arm: Implement bfloat widening fma (indexed)

This is BFMLAL{B,T} for both AArch64 AdvSIMD and SVE,
and VFMA{B,T}.BF16 for AArch32 NEON.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210525225817.400336-11-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Richard Henderson 2021-05-25 15:58:15 -07:00 committed by Peter Maydell
parent 5693887f2e
commit 458d0ab683
7 changed files with 82 additions and 1 deletions

View file

@ -2528,3 +2528,25 @@ void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
void *va, void *stat, uint32_t desc)
{
intptr_t i, j, opr_sz = simd_oprsz(desc);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
intptr_t elements = opr_sz / 4;
intptr_t eltspersegment = MIN(16 / 4, elements);
float32 *d = vd, *a = va;
bfloat16 *n = vn, *m = vm;
for (i = 0; i < elements; i += eltspersegment) {
float32 m_idx = m[H2(2 * i + index)] << 16;
for (j = i; j < i + eltspersegment; j++) {
float32 n_j = n[H2(2 * j + sel)] << 16;
d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
}
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}