tcg: Add add/sub with carry opcodes and infrastructure

Liveness needs to track carry-live state in order to
determine if the (hidden) output of the opcode is used.
Code generation needs to track carry-live state in order
to avoid clobbering cpu flags when loading constants.

So far, output routines and backends are unchanged.

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Richard Henderson 2025-01-14 13:58:39 -08:00
parent 3e3689df4e
commit 76f4278029
5 changed files with 235 additions and 10 deletions

View file

@ -593,6 +593,67 @@ Multiword arithmetic support
.. list-table::
* - addco *t0*, *t1*, *t2*
- | Compute *t0* = *t1* + *t2* and in addition output to the
carry bit provided by the host architecture.
* - addci *t0*, *t1*, *t2*
- | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
input carry bit provided by the host architecture.
The output carry bit need not be computed.
* - addcio *t0*, *t1*, *t2*
- | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
input carry bit provided by the host architecture,
and also compute the output carry bit.
* - addc1o *t0*, *t1*, *t2*
- | Compute *t0* = *t1* + *t2* + 1, and in addition output to the
carry bit provided by the host architecture. This is akin to
*addcio* with a fixed carry-in value of 1.
| This is intended to be used by the optimization pass,
intermediate to complete folding of the addition chain.
In some cases complete folding is not possible and this
opcode will remain until output. If this happens, the
code generator will use ``tcg_out_set_carry`` and then
the output routine for *addcio*.
* - subbo *t0*, *t1*, *t2*
- | Compute *t0* = *t1* - *t2* and in addition output to the
borrow bit provided by the host architecture.
| Depending on the host architecture, the carry bit may or may not be
identical to the borrow bit. Thus the addc\* and subb\*
opcodes must not be mixed.
* - subbi *t0*, *t1*, *t2*
- | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
input borrow bit provided by the host architecture.
The output borrow bit need not be computed.
* - subbio *t0*, *t1*, *t2*
- | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
input borrow bit provided by the host architecture,
and also compute the output borrow bit.
* - subb1o *t0*, *t1*, *t2*
- | Compute *t0* = *t1* - *t2* - 1, and in addition output to the
borrow bit provided by the host architecture. This is akin to
*subbio* with a fixed borrow-in value of 1.
| This is intended to be used by the optimization pass,
intermediate to complete folding of the subtraction chain.
In some cases complete folding is not possible and this
opcode will remain until output. If this happens, the
code generator will use ``tcg_out_set_borrow`` and then
the output routine for *subbio*.
* - add2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
sub2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*

View file

@ -82,6 +82,16 @@ DEF(shr, 1, 2, 0, TCG_OPF_INT)
DEF(sub, 1, 2, 0, TCG_OPF_INT)
DEF(xor, 1, 2, 0, TCG_OPF_INT)
DEF(addco, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
DEF(addc1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
DEF(addci, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN)
DEF(addcio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT)
DEF(subbo, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
DEF(subb1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
DEF(subbi, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN)
DEF(subbio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT)
/* load/store */
DEF(ld8u_i32, 1, 1, 1, 0)
DEF(ld8s_i32, 1, 1, 1, 0)

View file

@ -418,6 +418,11 @@ struct TCGContext {
MemOp riscv_cur_vsew;
TCGType riscv_cur_type;
#endif
/*
* During the tcg_reg_alloc_op loop, we are within a sequence of
* carry-using opcodes like addco+addci.
*/
bool carry_live;
GHashTable *const_table[TCG_TYPE_COUNT];
TCGTempSet free_temps[TCG_TYPE_COUNT];
@ -749,13 +754,17 @@ enum {
/* Instruction operands are vectors. */
TCG_OPF_VECTOR = 0x40,
/* Instruction is a conditional branch. */
TCG_OPF_COND_BRANCH = 0x80
TCG_OPF_COND_BRANCH = 0x80,
/* Instruction produces carry out. */
TCG_OPF_CARRY_OUT = 0x100,
/* Instruction consumes carry in. */
TCG_OPF_CARRY_IN = 0x200,
};
typedef struct TCGOpDef {
const char *name;
uint8_t nb_oargs, nb_iargs, nb_cargs, nb_args;
uint8_t flags;
uint16_t flags;
} TCGOpDef;
extern const TCGOpDef tcg_op_defs[];

View file

@ -1226,6 +1226,12 @@ static bool fold_add_vec(OptContext *ctx, TCGOp *op)
return finish_folding(ctx, op);
}
/*
 * Optimizer fold for the carry add opcodes (addci/addco/addcio).
 * Addition commutes in its two addend operands, so apply the generic
 * commutative canonicalization (presumably placing a constant in the
 * second operand slot, as with other fold_* helpers — confirm against
 * fold_commutative), then record the result via finish_folding.
 */
static bool fold_add_carry(OptContext *ctx, TCGOp *op)
{
fold_commutative(ctx, op);
return finish_folding(ctx, op);
}
static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
{
bool a_const = arg_is_const(op->args[2]) && arg_is_const(op->args[3]);
@ -2829,6 +2835,11 @@ void tcg_optimize(TCGContext *s)
case INDEX_op_add_vec:
done = fold_add_vec(&ctx, op);
break;
case INDEX_op_addci:
case INDEX_op_addco:
case INDEX_op_addcio:
done = fold_add_carry(&ctx, op);
break;
CASE_OP_32_64(add2):
done = fold_add2(&ctx, op);
break;

150
tcg/tcg.c
View file

@ -3914,6 +3914,17 @@ liveness_pass_0(TCGContext *s)
}
}
/*
 * Assert that no carry/borrow value is live (pending consumption).
 *
 * Carry operations can be separated by a few insns like mov,
 * load or store, but they should always be "close", and
 * carry-out operations should always be paired with carry-in.
 * At various boundaries (calls, insn_start, basic-block ends),
 * the carry must therefore already have been consumed.
 */
static void assert_carry_dead(TCGContext *s)
{
tcg_debug_assert(!s->carry_live);
}
/* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
@ -3933,17 +3944,19 @@ liveness_pass_1(TCGContext *s)
/* ??? Should be redundant with the exit_tb that ends the TB. */
la_func_end(s, nb_globals, nb_temps);
s->carry_live = false;
QTAILQ_FOREACH_REVERSE_SAFE(op, &s->ops, link, op_prev) {
int nb_iargs, nb_oargs;
TCGOpcode opc_new, opc_new2;
TCGLifeData arg_life = 0;
TCGTemp *ts;
TCGOpcode opc = op->opc;
const TCGOpDef *def = &tcg_op_defs[opc];
const TCGOpDef *def;
const TCGArgConstraint *args_ct;
switch (opc) {
case INDEX_op_call:
assert_carry_dead(s);
{
const TCGHelperInfo *info = tcg_call_info(op);
int call_flags = tcg_call_flags(op);
@ -4055,6 +4068,7 @@ liveness_pass_1(TCGContext *s)
}
break;
case INDEX_op_insn_start:
assert_carry_dead(s);
break;
case INDEX_op_discard:
/* mark the temporary as dead */
@ -4071,6 +4085,7 @@ liveness_pass_1(TCGContext *s)
case INDEX_op_sub2_i64:
opc_new = INDEX_op_sub;
do_addsub2:
assert_carry_dead(s);
/* Test if the high part of the operation is dead, but not
the low part. The result can be optimized to a simple
add or sub. This happens often for x86_64 guest when the
@ -4096,6 +4111,7 @@ liveness_pass_1(TCGContext *s)
opc_new = INDEX_op_mul;
opc_new2 = INDEX_op_muluh;
do_mul2:
assert_carry_dead(s);
if (arg_temp(op->args[1])->state == TS_DEAD) {
if (arg_temp(op->args[0])->state == TS_DEAD) {
/* Both parts of the operation are dead. */
@ -4118,10 +4134,89 @@ liveness_pass_1(TCGContext *s)
/* Mark the single-word operation live. */
goto do_not_remove;
case INDEX_op_addco:
if (s->carry_live) {
goto do_not_remove;
}
op->opc = opc = INDEX_op_add;
goto do_default;
case INDEX_op_addcio:
if (s->carry_live) {
goto do_not_remove;
}
op->opc = opc = INDEX_op_addci;
goto do_default;
case INDEX_op_subbo:
if (s->carry_live) {
goto do_not_remove;
}
/* Lower to sub, but this may also require canonicalization. */
op->opc = opc = INDEX_op_sub;
ts = arg_temp(op->args[2]);
if (ts->kind == TEMP_CONST) {
ts = tcg_constant_internal(ts->type, -ts->val);
if (ts->state_ptr == NULL) {
tcg_debug_assert(temp_idx(ts) == nb_temps);
nb_temps++;
ts->state_ptr = tcg_malloc(sizeof(TCGRegSet));
ts->state = TS_DEAD;
la_reset_pref(ts);
}
op->args[2] = temp_arg(ts);
op->opc = opc = INDEX_op_add;
}
goto do_default;
case INDEX_op_subbio:
if (s->carry_live) {
goto do_not_remove;
}
op->opc = opc = INDEX_op_subbi;
goto do_default;
case INDEX_op_addc1o:
if (s->carry_live) {
goto do_not_remove;
}
/* Lower to add, add +1. */
op_prev = tcg_op_insert_before(s, op, INDEX_op_add,
TCGOP_TYPE(op), 3);
op_prev->args[0] = op->args[0];
op_prev->args[1] = op->args[1];
op_prev->args[2] = op->args[2];
op->opc = opc = INDEX_op_add;
op->args[1] = op->args[0];
ts = arg_temp(op->args[0]);
ts = tcg_constant_internal(ts->type, 1);
op->args[2] = temp_arg(ts);
goto do_default;
case INDEX_op_subb1o:
if (s->carry_live) {
goto do_not_remove;
}
/* Lower to sub, add -1. */
op_prev = tcg_op_insert_before(s, op, INDEX_op_sub,
TCGOP_TYPE(op), 3);
op_prev->args[0] = op->args[0];
op_prev->args[1] = op->args[1];
op_prev->args[2] = op->args[2];
op->opc = opc = INDEX_op_add;
op->args[1] = op->args[0];
ts = arg_temp(op->args[0]);
ts = tcg_constant_internal(ts->type, -1);
op->args[2] = temp_arg(ts);
goto do_default;
default:
/* Test if the operation can be removed because all
its outputs are dead. We assume that nb_oargs == 0
implies side effects */
do_default:
/*
* Test if the operation can be removed because all
* its outputs are dead. We assume that nb_oargs == 0
* implies side effects.
*/
def = &tcg_op_defs[opc];
if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && def->nb_oargs != 0) {
for (int i = def->nb_oargs - 1; i >= 0; i--) {
@ -4163,12 +4258,16 @@ liveness_pass_1(TCGContext *s)
/* If end of basic block, update. */
if (def->flags & TCG_OPF_BB_EXIT) {
assert_carry_dead(s);
la_func_end(s, nb_globals, nb_temps);
} else if (def->flags & TCG_OPF_COND_BRANCH) {
assert_carry_dead(s);
la_bb_sync(s, nb_globals, nb_temps);
} else if (def->flags & TCG_OPF_BB_END) {
assert_carry_dead(s);
la_bb_end(s, nb_globals, nb_temps);
} else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
assert_carry_dead(s);
la_global_sync(s, nb_globals);
if (def->flags & TCG_OPF_CALL_CLOBBER) {
la_cross_call(s, nb_temps);
@ -4182,6 +4281,9 @@ liveness_pass_1(TCGContext *s)
arg_life |= DEAD_ARG << i;
}
}
if (def->flags & TCG_OPF_CARRY_OUT) {
s->carry_live = false;
}
/* Input arguments are live for preceding opcodes. */
for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
@ -4193,6 +4295,9 @@ liveness_pass_1(TCGContext *s)
ts->state &= ~TS_DEAD;
}
}
if (def->flags & TCG_OPF_CARRY_IN) {
s->carry_live = true;
}
/* Incorporate constraints for this operand. */
switch (opc) {
@ -4232,6 +4337,7 @@ liveness_pass_1(TCGContext *s)
}
op->life = arg_life;
}
assert_carry_dead(s);
}
/* Liveness analysis: Convert indirect regs to direct temporaries. */
@ -4820,9 +4926,8 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs)
all globals are stored at their canonical location. */
static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
{
int i;
for (i = s->nb_globals; i < s->nb_temps; i++) {
assert_carry_dead(s);
for (int i = s->nb_globals; i < s->nb_temps; i++) {
TCGTemp *ts = &s->temps[i];
switch (ts->kind) {
@ -4853,6 +4958,7 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
*/
static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs)
{
assert_carry_dead(s);
sync_globals(s, allocated_regs);
for (int i = s->nb_globals; i < s->nb_temps; i++) {
@ -5124,6 +5230,10 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
int const_args[TCG_MAX_OP_ARGS];
TCGCond op_cond;
if (def->flags & TCG_OPF_CARRY_IN) {
tcg_debug_assert(s->carry_live);
}
nb_oargs = def->nb_oargs;
nb_iargs = def->nb_iargs;
@ -5380,6 +5490,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
tcg_reg_alloc_bb_end(s, i_allocated_regs);
} else {
if (def->flags & TCG_OPF_CALL_CLOBBER) {
assert_carry_dead(s);
/* XXX: permit generic clobber register list ? */
for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
@ -5497,7 +5608,8 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
case INDEX_op_sub:
{
const TCGOutOpSubtract *out = &outop_sub;
const TCGOutOpSubtract *out =
container_of(all_outop[op->opc], TCGOutOpSubtract, base);
/*
* Constants should never appear in the second source operand.
@ -5512,6 +5624,16 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
}
break;
case INDEX_op_addco:
case INDEX_op_subbo:
case INDEX_op_addci:
case INDEX_op_subbi:
case INDEX_op_addcio:
case INDEX_op_subbio:
case INDEX_op_addc1o:
case INDEX_op_subb1o:
g_assert_not_reached();
case INDEX_op_bswap64:
case INDEX_op_ext_i32_i64:
case INDEX_op_extu_i32_i64:
@ -5700,6 +5822,13 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
break;
}
if (def->flags & TCG_OPF_CARRY_IN) {
s->carry_live = false;
}
if (def->flags & TCG_OPF_CARRY_OUT) {
s->carry_live = true;
}
/* move the outputs in the correct register if needed */
for(i = 0; i < nb_oargs; i++) {
ts = arg_temp(op->args[i]);
@ -6702,6 +6831,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
tcg_out_tb_start(s);
num_insns = -1;
s->carry_live = false;
QTAILQ_FOREACH(op, &s->ops, link) {
TCGOpcode opc = op->opc;
@ -6730,6 +6860,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
tcg_reg_alloc_dup(s, op);
break;
case INDEX_op_insn_start:
assert_carry_dead(s);
if (num_insns >= 0) {
size_t off = tcg_current_code_size(s);
s->gen_insn_end_off[num_insns] = off;
@ -6750,6 +6881,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
tcg_out_label(s, arg_label(op->args[0]));
break;
case INDEX_op_call:
assert_carry_dead(s);
tcg_reg_alloc_call(s, op);
break;
case INDEX_op_exit_tb:
@ -6786,6 +6918,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
return -2;
}
}
assert_carry_dead(s);
tcg_debug_assert(num_insns + 1 == s->gen_tb->icount);
s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);