mirror of
https://github.com/Motorhead1991/qemu.git
synced 2025-09-02 06:51:53 -06:00
arm/translate-a64: implement half-precision F(MIN|MAX)(V|NMV)
This implements the half-precision variants of the across-vector reduction operations. This involves a refactor of the reduction code so that it more closely matches the ARM ARM order (and handles 8-element reductions).

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20180227143852.11175-7-alex.bennee@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
parent
9b04991686
commit
807cdd5042
3 changed files with 109 additions and 53 deletions
|
@ -572,3 +572,21 @@ uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
|
||||||
{
|
{
|
||||||
return do_paired_cmpxchg64_be(env, addr, new_lo, new_hi, true, GETPC());
|
return do_paired_cmpxchg64_be(env, addr, new_lo, new_hi, true, GETPC());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* AdvSIMD half-precision
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
|
||||||
|
|
||||||
|
#define ADVSIMD_HALFOP(name) \
|
||||||
|
float16 ADVSIMD_HELPER(name, h)(float16 a, float16 b, void *fpstp) \
|
||||||
|
{ \
|
||||||
|
float_status *fpst = fpstp; \
|
||||||
|
return float16_ ## name(a, b, fpst); \
|
||||||
|
}
|
||||||
|
|
||||||
|
ADVSIMD_HALFOP(min)
|
||||||
|
ADVSIMD_HALFOP(max)
|
||||||
|
ADVSIMD_HALFOP(minnum)
|
||||||
|
ADVSIMD_HALFOP(maxnum)
|
||||||
|
|
|
@ -48,3 +48,7 @@ DEF_HELPER_FLAGS_4(paired_cmpxchg64_le_parallel, TCG_CALL_NO_WG,
|
||||||
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
|
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
|
||||||
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be_parallel, TCG_CALL_NO_WG,
|
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be_parallel, TCG_CALL_NO_WG,
|
||||||
i64, env, i64, i64, i64)
|
i64, env, i64, i64, i64)
|
||||||
|
DEF_HELPER_FLAGS_3(advsimd_maxh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
||||||
|
DEF_HELPER_FLAGS_3(advsimd_minh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
||||||
|
DEF_HELPER_FLAGS_3(advsimd_maxnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
||||||
|
DEF_HELPER_FLAGS_3(advsimd_minnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
||||||
|
|
|
@ -5741,26 +5741,75 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
|
||||||
tcg_temp_free_i64(tcg_resh);
|
tcg_temp_free_i64(tcg_resh);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void do_minmaxop(DisasContext *s, TCGv_i32 tcg_elt1, TCGv_i32 tcg_elt2,
|
/*
|
||||||
int opc, bool is_min, TCGv_ptr fpst)
|
* do_reduction_op helper
|
||||||
|
*
|
||||||
|
* This mirrors the Reduce() pseudocode in the ARM ARM. It is
|
||||||
|
* important for correct NaN propagation that we do these
|
||||||
|
* operations in exactly the order specified by the pseudocode.
|
||||||
|
*
|
||||||
|
* This is a recursive function, TCG temps should be freed by the
|
||||||
|
* calling function once it is done with the values.
|
||||||
|
*/
|
||||||
|
static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn,
|
||||||
|
int esize, int size, int vmap, TCGv_ptr fpst)
|
||||||
{
|
{
|
||||||
/* Helper function for disas_simd_across_lanes: do a single precision
|
if (esize == size) {
|
||||||
* min/max operation on the specified two inputs,
|
int element;
|
||||||
* and return the result in tcg_elt1.
|
TCGMemOp msize = esize == 16 ? MO_16 : MO_32;
|
||||||
*/
|
TCGv_i32 tcg_elem;
|
||||||
if (opc == 0xc) {
|
|
||||||
if (is_min) {
|
/* We should have one register left here */
|
||||||
gen_helper_vfp_minnums(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
|
assert(ctpop8(vmap) == 1);
|
||||||
} else {
|
element = ctz32(vmap);
|
||||||
gen_helper_vfp_maxnums(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
|
assert(element < 8);
|
||||||
}
|
|
||||||
|
tcg_elem = tcg_temp_new_i32();
|
||||||
|
read_vec_element_i32(s, tcg_elem, rn, element, msize);
|
||||||
|
return tcg_elem;
|
||||||
} else {
|
} else {
|
||||||
assert(opc == 0xf);
|
int bits = size / 2;
|
||||||
if (is_min) {
|
int shift = ctpop8(vmap) / 2;
|
||||||
gen_helper_vfp_mins(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
|
int vmap_lo = (vmap >> shift) & vmap;
|
||||||
} else {
|
int vmap_hi = (vmap & ~vmap_lo);
|
||||||
gen_helper_vfp_maxs(tcg_elt1, tcg_elt1, tcg_elt2, fpst);
|
TCGv_i32 tcg_hi, tcg_lo, tcg_res;
|
||||||
|
|
||||||
|
tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst);
|
||||||
|
tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst);
|
||||||
|
tcg_res = tcg_temp_new_i32();
|
||||||
|
|
||||||
|
switch (fpopcode) {
|
||||||
|
case 0x0c: /* fmaxnmv half-precision */
|
||||||
|
gen_helper_advsimd_maxnumh(tcg_res, tcg_lo, tcg_hi, fpst);
|
||||||
|
break;
|
||||||
|
case 0x0f: /* fmaxv half-precision */
|
||||||
|
gen_helper_advsimd_maxh(tcg_res, tcg_lo, tcg_hi, fpst);
|
||||||
|
break;
|
||||||
|
case 0x1c: /* fminnmv half-precision */
|
||||||
|
gen_helper_advsimd_minnumh(tcg_res, tcg_lo, tcg_hi, fpst);
|
||||||
|
break;
|
||||||
|
case 0x1f: /* fminv half-precision */
|
||||||
|
gen_helper_advsimd_minh(tcg_res, tcg_lo, tcg_hi, fpst);
|
||||||
|
break;
|
||||||
|
case 0x2c: /* fmaxnmv */
|
||||||
|
gen_helper_vfp_maxnums(tcg_res, tcg_lo, tcg_hi, fpst);
|
||||||
|
break;
|
||||||
|
case 0x2f: /* fmaxv */
|
||||||
|
gen_helper_vfp_maxs(tcg_res, tcg_lo, tcg_hi, fpst);
|
||||||
|
break;
|
||||||
|
case 0x3c: /* fminnmv */
|
||||||
|
gen_helper_vfp_minnums(tcg_res, tcg_lo, tcg_hi, fpst);
|
||||||
|
break;
|
||||||
|
case 0x3f: /* fminv */
|
||||||
|
gen_helper_vfp_mins(tcg_res, tcg_lo, tcg_hi, fpst);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
g_assert_not_reached();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tcg_temp_free_i32(tcg_hi);
|
||||||
|
tcg_temp_free_i32(tcg_lo);
|
||||||
|
return tcg_res;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5802,16 +5851,21 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
|
||||||
break;
|
break;
|
||||||
case 0xc: /* FMAXNMV, FMINNMV */
|
case 0xc: /* FMAXNMV, FMINNMV */
|
||||||
case 0xf: /* FMAXV, FMINV */
|
case 0xf: /* FMAXV, FMINV */
|
||||||
if (!is_u || !is_q || extract32(size, 0, 1)) {
|
/* Bit 1 of size field encodes min vs max and the actual size
|
||||||
unallocated_encoding(s);
|
* depends on the encoding of the U bit. If not set (and FP16
|
||||||
return;
|
* enabled) then we do half-precision float instead of single
|
||||||
}
|
* precision.
|
||||||
/* Bit 1 of size field encodes min vs max, and actual size is always
|
|
||||||
* 32 bits: adjust the size variable so following code can rely on it
|
|
||||||
*/
|
*/
|
||||||
is_min = extract32(size, 1, 1);
|
is_min = extract32(size, 1, 1);
|
||||||
is_fp = true;
|
is_fp = true;
|
||||||
size = 2;
|
if (!is_u && arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {
|
||||||
|
size = 1;
|
||||||
|
} else if (!is_u || !is_q || extract32(size, 0, 1)) {
|
||||||
|
unallocated_encoding(s);
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
size = 2;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
unallocated_encoding(s);
|
unallocated_encoding(s);
|
||||||
|
@ -5868,38 +5922,18 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
|
||||||
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* Floating point ops which work on 32 bit (single) intermediates.
|
/* Floating point vector reduction ops which work across 32
|
||||||
|
* bit (single) or 16 bit (half-precision) intermediates.
|
||||||
* Note that correct NaN propagation requires that we do these
|
* Note that correct NaN propagation requires that we do these
|
||||||
* operations in exactly the order specified by the pseudocode.
|
* operations in exactly the order specified by the pseudocode.
|
||||||
*/
|
*/
|
||||||
TCGv_i32 tcg_elt1 = tcg_temp_new_i32();
|
TCGv_ptr fpst = get_fpstatus_ptr(size == MO_16);
|
||||||
TCGv_i32 tcg_elt2 = tcg_temp_new_i32();
|
int fpopcode = opcode | is_min << 4 | is_u << 5;
|
||||||
TCGv_i32 tcg_elt3 = tcg_temp_new_i32();
|
int vmap = (1 << elements) - 1;
|
||||||
TCGv_ptr fpst = get_fpstatus_ptr(false);
|
TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize,
|
||||||
|
(is_q ? 128 : 64), vmap, fpst);
|
||||||
assert(esize == 32);
|
tcg_gen_extu_i32_i64(tcg_res, tcg_res32);
|
||||||
assert(elements == 4);
|
tcg_temp_free_i32(tcg_res32);
|
||||||
|
|
||||||
read_vec_element(s, tcg_elt, rn, 0, MO_32);
|
|
||||||
tcg_gen_extrl_i64_i32(tcg_elt1, tcg_elt);
|
|
||||||
read_vec_element(s, tcg_elt, rn, 1, MO_32);
|
|
||||||
tcg_gen_extrl_i64_i32(tcg_elt2, tcg_elt);
|
|
||||||
|
|
||||||
do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
|
|
||||||
|
|
||||||
read_vec_element(s, tcg_elt, rn, 2, MO_32);
|
|
||||||
tcg_gen_extrl_i64_i32(tcg_elt2, tcg_elt);
|
|
||||||
read_vec_element(s, tcg_elt, rn, 3, MO_32);
|
|
||||||
tcg_gen_extrl_i64_i32(tcg_elt3, tcg_elt);
|
|
||||||
|
|
||||||
do_minmaxop(s, tcg_elt2, tcg_elt3, opcode, is_min, fpst);
|
|
||||||
|
|
||||||
do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
|
|
||||||
|
|
||||||
tcg_gen_extu_i32_i64(tcg_res, tcg_elt1);
|
|
||||||
tcg_temp_free_i32(tcg_elt1);
|
|
||||||
tcg_temp_free_i32(tcg_elt2);
|
|
||||||
tcg_temp_free_i32(tcg_elt3);
|
|
||||||
tcg_temp_free_ptr(fpst);
|
tcg_temp_free_ptr(fpst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue