target/arm: Implement integer matrix multiply accumulate

This is {S,U,US}MMLA for both AArch64 AdvSIMD and SVE,
and V{S,U,US}MMLA.S8 for AArch32 NEON.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210525010358.152808-91-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Richard Henderson 2021-05-24 18:03:56 -07:00 committed by Peter Maydell
parent 51879c671b
commit 2323c5ffd4
7 changed files with 169 additions and 0 deletions

View file

@ -2335,3 +2335,80 @@ void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
}
clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}
/*
* Integer matrix-multiply accumulate
*/
/*
 * Signed x signed inner step of the byte matrix multiply-accumulate.
 *
 * Adds the eight pairwise products of the int8_t elements found at vn
 * and vm to sum and returns the updated total.  Each product is formed
 * in int (both operands are promoted) and then folded into the unsigned
 * 32-bit accumulator, so the running total wraps modulo 2^32.
 *
 * Element positions are passed through H1(), which is defined elsewhere
 * in the project -- presumably the host byte-order index adjustment;
 * confirm against its definition.
 */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *lhs = vn;
    int8_t *rhs = vm;
    intptr_t idx = 0;

    while (idx < 8) {
        int prod = lhs[H1(idx)] * rhs[H1(idx)];

        sum += prod;
        idx++;
    }
    return sum;
}
/*
 * Unsigned x unsigned inner step of the byte matrix multiply-accumulate.
 *
 * Adds the eight pairwise products of the uint8_t elements found at vn
 * and vm to sum and returns the updated total.  Each product is formed
 * in int after integer promotion (at most 255 * 255) and then added to
 * the unsigned 32-bit accumulator, which wraps modulo 2^32.
 *
 * Element positions are passed through H1(), which is defined elsewhere
 * in the project -- presumably the host byte-order index adjustment;
 * confirm against its definition.
 */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *lhs = vn;
    uint8_t *rhs = vm;
    intptr_t idx = 0;

    while (idx < 8) {
        int prod = lhs[H1(idx)] * rhs[H1(idx)];

        sum += prod;
        idx++;
    }
    return sum;
}
/*
 * Mixed-sign inner step of the byte matrix multiply-accumulate:
 * the elements at vn are read as uint8_t, those at vm as int8_t.
 *
 * Adds the eight pairwise products to sum and returns the updated
 * total.  Each product is formed in int after integer promotion and
 * then added to the unsigned 32-bit accumulator, which wraps modulo
 * 2^32.
 *
 * Element positions are passed through H1(), which is defined elsewhere
 * in the project -- presumably the host byte-order index adjustment;
 * confirm against its definition.
 */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *lhs = vn;      /* unsigned operand */
    int8_t *rhs = vm;       /* signed operand */
    intptr_t idx = 0;

    while (idx < 8) {
        int prod = lhs[H1(idx)] * rhs[H1(idx)];

        sum += prod;
        idx++;
    }
    return sum;
}
/*
 * Shared driver for the byte matrix multiply-accumulate helpers.
 *
 * The operation size taken from desc is walked in 16-byte segments.
 * Within a segment, vn and vm each provide two 8-byte rows (at byte
 * offsets 0 and 8), while va and vd each hold four 32-bit elements
 * treated as a 2x2 matrix in row-major order:
 *
 *     d[2 * i + j] = inner_loop(a[2 * i + j], n_row[i], m_row[j])
 *
 * inner_loop is the signedness-specific routine; the ones passed in
 * this file add an 8-element dot product to their first argument.
 *
 * 32-bit element indices go through H4(), defined elsewhere in the
 * project -- presumably the host byte-order index adjustment.  After
 * the loop, clear_tail() is invoked for the range from the operation
 * size up to simd_maxsz(desc); see its definition for exact effect.
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t opr_sz = simd_oprsz(desc);
    intptr_t seg;

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        void *n_row[2] = { vn + seg + 0, vn + seg + 8 };
        void *m_row[2] = { vm + seg + 0, vm + seg + 8 };
        uint32_t acc[4];
        int i, j, e;

        /*
         * Compute all four results into temporaries before storing any
         * of them, so that every input of the segment has been consumed
         * by the time the destination is written (presumably because
         * vd may overlap the source operands).
         */
        for (i = 0; i < 2; i++) {
            for (j = 0; j < 2; j++) {
                e = 2 * i + j;
                acc[e] = inner_loop(a[H4(e)], n_row[i], m_row[j]);
            }
        }

        for (e = 0; e < 4; e++) {
            d[H4(e)] = acc[e];
        }
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
/*
 * Emit one out-of-line helper entry point.  The generated function does
 * nothing but forward its arguments to do_mmla_b() together with the
 * inner routine DOT8_FN, which fixes how the byte operands are typed.
 */
#define DO_MMLA_B(HELPER_NAME, DOT8_FN)                                 \
    void HELPER(HELPER_NAME)(void *vd, void *vn, void *vm,              \
                             void *va, uint32_t desc)                   \
    {                                                                   \
        do_mmla_b(vd, vn, vm, va, desc, DOT8_FN);                       \
    }

/* int8_t x int8_t */
DO_MMLA_B(gvec_smmla_b, do_smmla_b)
/* uint8_t x uint8_t */
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
/* uint8_t (vn) x int8_t (vm) */
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)