target/arm: Vectorize USHL and SSHL

These instructions shift left or right depending on the sign of the
shift count, which is taken as a signed byte from the low eight bits
of the shift operand.  This requires several masks and selects in
addition to the actual shifts to form the complete answer.

That said, the operation is still a small improvement even for
two 64-bit elements -- 13 vector operations instead of 2 * 7
integer operations.
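
For reference, the per-element operation being vectorized behaves roughly
as follows; this is a minimal C sketch of the architectural semantics for
64-bit lanes (the ref_* helpers are made up for illustration and are not
part of this patch):

#include <stdint.h>

/* Illustrative only: per-element USHL semantics, 64-bit lane. */
static uint64_t ref_ushl_d(uint64_t src, int8_t sh)
{
    if (sh <= -64 || sh >= 64) {
        return 0;                       /* everything shifted out */
    }
    return sh >= 0 ? src << sh : src >> -sh;
}

/* Illustrative only: per-element SSHL semantics, 64-bit lane. */
static int64_t ref_sshl_d(int64_t src, int8_t sh)
{
    if (sh >= 64) {
        return 0;                       /* everything shifted out */
    }
    if (sh <= -64) {
        return src < 0 ? -1 : 0;        /* only sign bits remain */
    }
    if (sh >= 0) {
        return (int64_t)((uint64_t)src << sh);  /* avoid signed-shift UB */
    }
    return src >> -sh;   /* assumes arithmetic >> on signed, as on gcc/clang */
}

For example, ref_sshl_d(-8, -2) yields -2, while ref_ushl_d(1, 70) yields 0
because the entire value is shifted out.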

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200216214232.4230-2-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

@@ -3575,13 +3575,13 @@ static inline void gen_neon_shift_narrow(int size, TCGv_i32 var, TCGv_i32 shift,
if (u) {
switch (size) {
case 1: gen_helper_neon_shl_u16(var, var, shift); break;
case 2: gen_helper_neon_shl_u32(var, var, shift); break;
case 2: gen_ushl_i32(var, var, shift); break;
default: abort();
}
} else {
switch (size) {
case 1: gen_helper_neon_shl_s16(var, var, shift); break;
case 2: gen_helper_neon_shl_s32(var, var, shift); break;
case 2: gen_sshl_i32(var, var, shift); break;
default: abort();
}
}
@@ -4384,6 +4384,280 @@ const GVecGen3 cmtst_op[4] = {
.vece = MO_64 },
};
void gen_ushl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
TCGv_i32 lval = tcg_temp_new_i32();
TCGv_i32 rval = tcg_temp_new_i32();
TCGv_i32 lsh = tcg_temp_new_i32();
TCGv_i32 rsh = tcg_temp_new_i32();
TCGv_i32 zero = tcg_const_i32(0);
TCGv_i32 max = tcg_const_i32(32);
/*
* Rely on the TCG guarantee that out of range shifts produce
* unspecified results, not undefined behaviour (i.e. no trap).
* Discard out-of-range results after the fact.
*/
tcg_gen_ext8s_i32(lsh, shift);
tcg_gen_neg_i32(rsh, lsh);
tcg_gen_shl_i32(lval, src, lsh);
tcg_gen_shr_i32(rval, src, rsh);
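/*
 * Keep the left-shift result only when lsh is in [0, 31], otherwise start
 * from zero; then overwrite with the right-shift result when rsh (the
 * negated count) is in [0, 31].
 */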
tcg_gen_movcond_i32(TCG_COND_LTU, dst, lsh, max, lval, zero);
tcg_gen_movcond_i32(TCG_COND_LTU, dst, rsh, max, rval, dst);
tcg_temp_free_i32(lval);
tcg_temp_free_i32(rval);
tcg_temp_free_i32(lsh);
tcg_temp_free_i32(rsh);
tcg_temp_free_i32(zero);
tcg_temp_free_i32(max);
}
void gen_ushl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
TCGv_i64 lval = tcg_temp_new_i64();
TCGv_i64 rval = tcg_temp_new_i64();
TCGv_i64 lsh = tcg_temp_new_i64();
TCGv_i64 rsh = tcg_temp_new_i64();
TCGv_i64 zero = tcg_const_i64(0);
TCGv_i64 max = tcg_const_i64(64);
/*
* Rely on the TCG guarantee that out of range shifts produce
* unspecified results, not undefined behaviour (i.e. no trap).
* Discard out-of-range results after the fact.
*/
tcg_gen_ext8s_i64(lsh, shift);
tcg_gen_neg_i64(rsh, lsh);
tcg_gen_shl_i64(lval, src, lsh);
tcg_gen_shr_i64(rval, src, rsh);
tcg_gen_movcond_i64(TCG_COND_LTU, dst, lsh, max, lval, zero);
tcg_gen_movcond_i64(TCG_COND_LTU, dst, rsh, max, rval, dst);
tcg_temp_free_i64(lval);
tcg_temp_free_i64(rval);
tcg_temp_free_i64(lsh);
tcg_temp_free_i64(rsh);
tcg_temp_free_i64(zero);
tcg_temp_free_i64(max);
}
static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
TCGv_vec src, TCGv_vec shift)
{
TCGv_vec lval = tcg_temp_new_vec_matching(dst);
TCGv_vec rval = tcg_temp_new_vec_matching(dst);
TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
TCGv_vec msk, max;
tcg_gen_neg_vec(vece, rsh, shift);
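/*
 * Only the low byte of each element holds the shift count: for MO_8 that
 * is the whole element, otherwise mask both counts down to 8 bits.
 */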
if (vece == MO_8) {
tcg_gen_mov_vec(lsh, shift);
} else {
msk = tcg_temp_new_vec_matching(dst);
tcg_gen_dupi_vec(vece, msk, 0xff);
tcg_gen_and_vec(vece, lsh, shift, msk);
tcg_gen_and_vec(vece, rsh, rsh, msk);
tcg_temp_free_vec(msk);
}
/*
* Rely on the TCG guarantee that out of range shifts produce
* unspecified results, not undefined behaviour (i.e. no trap).
* Discard out-of-range results after the fact.
*/
tcg_gen_shlv_vec(vece, lval, src, lsh);
tcg_gen_shrv_vec(vece, rval, src, rsh);
max = tcg_temp_new_vec_matching(dst);
tcg_gen_dupi_vec(vece, max, 8 << vece);
/*
* The choice of LT (signed) and GEU (unsigned) is biased toward
* the instructions of the x86_64 host. For MO_8, the whole byte
* is significant so we must use an unsigned compare; otherwise we
* have already masked to a byte and so a signed compare works.
* Other tcg hosts have a full set of comparisons and do not care.
*/
if (vece == MO_8) {
tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
tcg_gen_andc_vec(vece, lval, lval, lsh);
tcg_gen_andc_vec(vece, rval, rval, rsh);
} else {
tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
tcg_gen_and_vec(vece, lval, lval, lsh);
tcg_gen_and_vec(vece, rval, rval, rsh);
}
tcg_gen_or_vec(vece, dst, lval, rval);
tcg_temp_free_vec(max);
tcg_temp_free_vec(lval);
tcg_temp_free_vec(rval);
tcg_temp_free_vec(lsh);
tcg_temp_free_vec(rsh);
}
static const TCGOpcode ushl_list[] = {
INDEX_op_neg_vec, INDEX_op_shlv_vec,
INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
};
const GVecGen3 ushl_op[4] = {
{ .fniv = gen_ushl_vec,
.fno = gen_helper_gvec_ushl_b,
.opt_opc = ushl_list,
.vece = MO_8 },
{ .fniv = gen_ushl_vec,
.fno = gen_helper_gvec_ushl_h,
.opt_opc = ushl_list,
.vece = MO_16 },
{ .fni4 = gen_ushl_i32,
.fniv = gen_ushl_vec,
.opt_opc = ushl_list,
.vece = MO_32 },
{ .fni8 = gen_ushl_i64,
.fniv = gen_ushl_vec,
.opt_opc = ushl_list,
.vece = MO_64 },
};
void gen_sshl_i32(TCGv_i32 dst, TCGv_i32 src, TCGv_i32 shift)
{
TCGv_i32 lval = tcg_temp_new_i32();
TCGv_i32 rval = tcg_temp_new_i32();
TCGv_i32 lsh = tcg_temp_new_i32();
TCGv_i32 rsh = tcg_temp_new_i32();
TCGv_i32 zero = tcg_const_i32(0);
TCGv_i32 max = tcg_const_i32(31);
/*
* Rely on the TCG guarantee that out of range shifts produce
* unspecified results, not undefined behaviour (i.e. no trap).
* Discard out-of-range results after the fact.
*/
tcg_gen_ext8s_i32(lsh, shift);
tcg_gen_neg_i32(rsh, lsh);
tcg_gen_shl_i32(lval, src, lsh);
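/*
 * Bound rsh to 31 so that an out-of-range negative shift count still
 * produces all sign bits rather than an unspecified result.
 */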
tcg_gen_umin_i32(rsh, rsh, max);
tcg_gen_sar_i32(rval, src, rsh);
tcg_gen_movcond_i32(TCG_COND_LEU, lval, lsh, max, lval, zero);
tcg_gen_movcond_i32(TCG_COND_LT, dst, lsh, zero, rval, lval);
tcg_temp_free_i32(lval);
tcg_temp_free_i32(rval);
tcg_temp_free_i32(lsh);
tcg_temp_free_i32(rsh);
tcg_temp_free_i32(zero);
tcg_temp_free_i32(max);
}
void gen_sshl_i64(TCGv_i64 dst, TCGv_i64 src, TCGv_i64 shift)
{
TCGv_i64 lval = tcg_temp_new_i64();
TCGv_i64 rval = tcg_temp_new_i64();
TCGv_i64 lsh = tcg_temp_new_i64();
TCGv_i64 rsh = tcg_temp_new_i64();
TCGv_i64 zero = tcg_const_i64(0);
TCGv_i64 max = tcg_const_i64(63);
/*
* Rely on the TCG guarantee that out of range shifts produce
* unspecified results, not undefined behaviour (i.e. no trap).
* Discard out-of-range results after the fact.
*/
tcg_gen_ext8s_i64(lsh, shift);
tcg_gen_neg_i64(rsh, lsh);
tcg_gen_shl_i64(lval, src, lsh);
tcg_gen_umin_i64(rsh, rsh, max);
tcg_gen_sar_i64(rval, src, rsh);
tcg_gen_movcond_i64(TCG_COND_LEU, lval, lsh, max, lval, zero);
tcg_gen_movcond_i64(TCG_COND_LT, dst, lsh, zero, rval, lval);
tcg_temp_free_i64(lval);
tcg_temp_free_i64(rval);
tcg_temp_free_i64(lsh);
tcg_temp_free_i64(rsh);
tcg_temp_free_i64(zero);
tcg_temp_free_i64(max);
}
static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
TCGv_vec src, TCGv_vec shift)
{
TCGv_vec lval = tcg_temp_new_vec_matching(dst);
TCGv_vec rval = tcg_temp_new_vec_matching(dst);
TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
/*
* Rely on the TCG guarantee that out of range shifts produce
* unspecified results, not undefined behaviour (i.e. no trap).
* Discard out-of-range results after the fact.
*/
tcg_gen_neg_vec(vece, rsh, shift);
if (vece == MO_8) {
tcg_gen_mov_vec(lsh, shift);
} else {
tcg_gen_dupi_vec(vece, tmp, 0xff);
tcg_gen_and_vec(vece, lsh, shift, tmp);
tcg_gen_and_vec(vece, rsh, rsh, tmp);
}
/* Bound rsh so that an out-of-bound right shift yields the sign (0 or -1). */
tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
tcg_gen_umin_vec(vece, rsh, rsh, tmp);
tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);
tcg_gen_shlv_vec(vece, lval, src, lsh);
tcg_gen_sarv_vec(vece, rval, src, rsh);
/* Select in-bound left shift. */
tcg_gen_andc_vec(vece, lval, lval, tmp);
/* Select between left and right shift. */
if (vece == MO_8) {
tcg_gen_dupi_vec(vece, tmp, 0);
tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
} else {
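/*
 * lsh was masked to 8 bits above, so a negative shift count now appears
 * as a value >= 0x80; compare against 0x80 to select left vs right shift.
 */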
tcg_gen_dupi_vec(vece, tmp, 0x80);
tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
}
tcg_temp_free_vec(lval);
tcg_temp_free_vec(rval);
tcg_temp_free_vec(lsh);
tcg_temp_free_vec(rsh);
tcg_temp_free_vec(tmp);
}
static const TCGOpcode sshl_list[] = {
INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
};
const GVecGen3 sshl_op[4] = {
{ .fniv = gen_sshl_vec,
.fno = gen_helper_gvec_sshl_b,
.opt_opc = sshl_list,
.vece = MO_8 },
{ .fniv = gen_sshl_vec,
.fno = gen_helper_gvec_sshl_h,
.opt_opc = sshl_list,
.vece = MO_16 },
{ .fni4 = gen_sshl_i32,
.fniv = gen_sshl_vec,
.opt_opc = sshl_list,
.vece = MO_32 },
{ .fni8 = gen_sshl_i64,
.fniv = gen_sshl_vec,
.opt_opc = sshl_list,
.vece = MO_64 },
};
static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
TCGv_vec a, TCGv_vec b)
{
@@ -4787,6 +5061,12 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
vec_size, vec_size);
}
return 0;
case NEON_3R_VSHL:
/* Note the operation is vshl vd,vm,vn */
tcg_gen_gvec_3(rd_ofs, rm_ofs, rn_ofs, vec_size, vec_size,
u ? &ushl_op[size] : &sshl_op[size]);
return 0;
}
if (size == 3) {
@@ -4795,13 +5075,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
neon_load_reg64(cpu_V0, rn + pass);
neon_load_reg64(cpu_V1, rm + pass);
switch (op) {
case NEON_3R_VSHL:
if (u) {
gen_helper_neon_shl_u64(cpu_V0, cpu_V1, cpu_V0);
} else {
gen_helper_neon_shl_s64(cpu_V0, cpu_V1, cpu_V0);
}
break;
case NEON_3R_VQSHL:
if (u) {
gen_helper_neon_qshl_u64(cpu_V0, cpu_env,
@@ -4836,7 +5109,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
}
pairwise = 0;
switch (op) {
case NEON_3R_VSHL:
case NEON_3R_VQSHL:
case NEON_3R_VRSHL:
case NEON_3R_VQRSHL:
@@ -4916,9 +5188,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
case NEON_3R_VHSUB:
GEN_NEON_INTEGER_OP(hsub);
break;
case NEON_3R_VSHL:
GEN_NEON_INTEGER_OP(shl);
break;
case NEON_3R_VQSHL:
GEN_NEON_INTEGER_OP_ENV(qshl);
break;
@@ -5327,9 +5596,9 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
}
} else {
if (input_unsigned) {
gen_helper_neon_shl_u64(cpu_V0, in, tmp64);
gen_ushl_i64(cpu_V0, in, tmp64);
} else {
gen_helper_neon_shl_s64(cpu_V0, in, tmp64);
gen_sshl_i64(cpu_V0, in, tmp64);
}
}
tmp = tcg_temp_new_i32();