target/i386: reimplement 0x0f 0x10-0x17, add AVX

These are mostly moves, and yet are a total pain.  The main issue
is that:

1) some instructions are selected by mod==11 (register operand)
vs. mod=00/01/10 (memory operand)

2) stores to memory are two-operand operations, while the 3-register
and load-from-memory versions operate on the entire contents of the
destination; this makes it easier to separate the gen_* function for
the store case

3) it's inefficient to load into xmm_T0 only to move the value out
again, so the gen_* function for the load case is separated too

The manual also has various mistakes in the operands here, for example
the store case of MOVHPS operates on a 128-bit source (albeit discarding
the bottom 64 bits) and therefore should be Mq,Vdq rather than Mq,Vq.
Likewise for the destination and source of MOVHLPS.

VUNPCK?PS and VUNPCK?PD are the same as VUNPCK?DQ and VUNPCK?QDQ,
but encoded as prefixes rather than separate operands.  The helpers
can be reused however.

For MOVSLDUP, MOVSHDUP and MOVDDUP I chose to reimplement them as
helpers.  I named the helper for MOVDDUP "movdldup" in preparation
for possible future introduction of MOVDHDUP and to clarify the
similarity with MOVSLDUP.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
Paolo Bonzini 2022-09-17 23:22:36 +02:00
parent aba2b8ecb9
commit 7170a17ec3
5 changed files with 264 additions and 0 deletions

View file

@ -394,6 +394,7 @@ static inline void gen_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn
gen_illegal_opcode(s);
}
}
#define FP_SSE(uname, lname) \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{ \
@ -412,6 +413,20 @@ FP_SSE(VMIN, min)
FP_SSE(VDIV, div)
FP_SSE(VMAX, max)
#define FP_UNPACK_SSE(uname, lname) \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{ \
/* PS maps to the DQ integer instruction, PD maps to QDQ. */ \
gen_fp_sse(s, env, decode, \
gen_helper_##lname##qdq_xmm, \
gen_helper_##lname##dq_xmm, \
gen_helper_##lname##qdq_ymm, \
gen_helper_##lname##dq_ymm, \
NULL, NULL); \
}
FP_UNPACK_SSE(VUNPCKLPx, punpckl)
FP_UNPACK_SSE(VUNPCKHPx, punpckh)
/*
* 00 = v*ps Vps, Wpd
* f3 = v*ss Vss, Wps
@ -749,6 +764,10 @@ UNARY_INT_SSE(VPMOVZXWD, pmovzxwd)
UNARY_INT_SSE(VPMOVZXWQ, pmovzxwq)
UNARY_INT_SSE(VPMOVZXDQ, pmovzxdq)
UNARY_INT_SSE(VMOVSLDUP, pmovsldup)
UNARY_INT_SSE(VMOVSHDUP, pmovshdup)
UNARY_INT_SSE(VMOVDDUP, pmovdldup)
UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd)
UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq)
UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq)
@ -1808,6 +1827,114 @@ static void gen_VMASKMOVPS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn
gen_maskmov(s, env, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
}
static void gen_VMOVHPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
gen_ldq_env_A0(s, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
if (decode->op[0].offset != decode->op[1].offset) {
tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
}
}
static void gen_VMOVHPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
gen_stq_env_A0(s, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
}
static void gen_VMOVHPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
if (decode->op[0].offset != decode->op[2].offset) {
tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
}
if (decode->op[0].offset != decode->op[1].offset) {
tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
}
}
static void gen_VMOVHLPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
if (decode->op[0].offset != decode->op[1].offset) {
tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(1)));
tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
}
}
static void gen_VMOVLHPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset);
tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
if (decode->op[0].offset != decode->op[1].offset) {
tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
}
}
/*
* Note that MOVLPx supports 256-bit operation unlike MOVHLPx, MOVLHPx, MOXHPx.
* Use a gvec move to move everything above the bottom 64 bits.
*/
static void gen_VMOVLPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
int vec_len = vector_len(s, decode);
tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(0)));
tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
}
static void gen_VMOVLPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
int vec_len = vector_len(s, decode);
tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
}
static void gen_VMOVLPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
tcg_gen_ld_i64(s->tmp1_i64, OP_PTR2, offsetof(ZMMReg, ZMM_Q(0)));
tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
}
static void gen_VMOVSD_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
TCGv_i64 zero = tcg_constant_i64(0);
tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
tcg_gen_st_i64(zero, OP_PTR0, offsetof(ZMMReg, ZMM_Q(1)));
tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
}
static void gen_VMOVSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
int vec_len = vector_len(s, decode);
tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
}
static void gen_VMOVSS_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
int vec_len = vector_len(s, decode);
tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
}
static void gen_VMOVSS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
}
static void gen_VPMASKMOV_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
if (s->vex_w) {