Add SSE variant of pow/powi to interpreter.

Use SSE pow/powi helper functions from compiled code.
Cleanup use of helper functions.
Related cleanups of folding functions in x64 interpreter.
This commit is contained in:
Mike Pall
2009-12-25 23:12:30 +01:00
parent 6ce0c90ed6
commit 690760aa38
4 changed files with 949 additions and 740 deletions

View File

@@ -1991,9 +1991,19 @@ static int fpmjoin_pow(ASMState *as, IRIns *ir)
IRIns *irpp = IR(irp->op1);
if (irpp == ir-2 && irpp->o == IR_FPMATH &&
irpp->op2 == IRFPM_LOG2 && !ra_used(irpp)) {
emit_call(as, lj_vm_pow); /* st0 = lj_vm_pow(st1, st0) */
asm_x87load(as, irp->op2);
asm_x87load(as, irpp->op1);
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
IRIns *irx;
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop);
ra_destreg(as, ir, RID_XMM0);
emit_call(as, lj_vm_pow_sse);
irx = IR(irpp->op1);
if (ra_noreg(irx->r) && ra_gethint(irx->r) == RID_XMM1)
irx->r = RID_INIT; /* Avoid allocating xmm1 for x. */
ra_left(as, RID_XMM0, irpp->op1);
ra_left(as, RID_XMM1, irp->op2);
return 1;
}
}
@@ -2007,30 +2017,35 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
Reg dest = ra_dest(as, ir, RSET_FPR);
Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
emit_mrm(as, XO_SQRTSD, dest, left);
} else if ((as->flags & JIT_F_SSE4_1) && fpm <= IRFPM_TRUNC) {
Reg dest = ra_dest(as, ir, RSET_FPR);
Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
/* Round down/up/trunc == 1001/1010/1011. */
emit_i8(as, 0x09 + fpm);
/* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op. */
emit_mrm(as, XO_ROUNDSD, dest, left);
/* Let's pretend it's a 3-byte opcode, and compensate afterwards. */
/* This is atrocious, but the alternatives are much worse. */
if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */
}
*--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */
} else if (fpm <= IRFPM_TRUNC) {
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop);
ra_destreg(as, ir, RID_XMM0);
emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
ra_left(as, RID_XMM0, ir->op1);
} else {
if (as->flags & JIT_F_SSE4_1) { /* SSE4.1 has a rounding instruction. */
Reg dest = ra_dest(as, ir, RSET_FPR);
Reg left = asm_fuseload(as, ir->op1, RSET_FPR);
/* ROUNDSD has a 4-byte opcode which doesn't fit in x86Op.
** Let's pretend it's a 3-byte opcode, and compensate afterwards.
** This is atrocious, but the alternatives are much worse.
*/
/* Round down/up/trunc == 1001/1010/1011. */
emit_i8(as, 0x09 + fpm);
emit_mrm(as, XO_ROUNDSD, dest, left);
if (LJ_64 && as->mcp[1] != (MCode)(XO_ROUNDSD >> 16)) {
as->mcp[0] = as->mcp[1]; as->mcp[1] = 0x0f; /* Swap 0F and REX. */
}
*--as->mcp = 0x66; /* 1st byte of ROUNDSD opcode. */
} else { /* Call helper functions for SSE2 variant. */
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop);
ra_destreg(as, ir, RID_XMM0);
emit_call(as, fpm == IRFPM_FLOOR ? lj_vm_floor_sse :
fpm == IRFPM_CEIL ? lj_vm_ceil_sse : lj_vm_trunc_sse);
ra_left(as, RID_XMM0, ir->op1);
}
} else if (fpm == IRFPM_EXP2 && fpmjoin_pow(as, ir)) {
/* Rejoined to pow(). */
} else { /* Handle x87 ops. */
int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */
Reg dest = ir->r;
if (ra_hasreg(dest)) {
@@ -2040,14 +2055,8 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
}
emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
switch (fpm) { /* st0 = lj_vm_*(st0) */
case IRFPM_FLOOR: emit_call(as, lj_vm_floor); break;
case IRFPM_CEIL: emit_call(as, lj_vm_ceil); break;
case IRFPM_TRUNC: emit_call(as, lj_vm_trunc); break;
case IRFPM_EXP: emit_call(as, lj_vm_exp); break;
case IRFPM_EXP2:
if (fpmjoin_pow(as, ir)) return;
emit_call(as, lj_vm_exp2); /* st0 = lj_vm_exp2(st0) */
break;
case IRFPM_EXP2: emit_call(as, lj_vm_exp2); break;
case IRFPM_SIN: emit_x87op(as, XI_FSIN); break;
case IRFPM_COS: emit_x87op(as, XI_FCOS); break;
case IRFPM_TAN: emit_x87op(as, XI_FPOP); emit_x87op(as, XI_FPTAN); break;
@@ -2063,10 +2072,6 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
emit_x87op(as, XI_FPATAN); asm_x87load(as, ir->op2); break;
case IR_LDEXP:
emit_x87op(as, XI_FPOP1); emit_x87op(as, XI_FSCALE); break;
case IR_POWI:
emit_call(as, lj_vm_powi); /* st0 = lj_vm_powi(st0, [esp]) */
emit_rmro(as, XO_MOVto, ra_alloc1(as, ir->op2, RSET_GPR), RID_ESP, 0);
break;
default: lua_assert(0); break;
}
break;
@@ -2085,6 +2090,19 @@ static void asm_fpmath(ASMState *as, IRIns *ir)
}
}
/* Assemble IR_POWI (x^k with integer exponent k) via the lj_vm_powi_sse
** helper: op1 (x) is passed in xmm0, op2 (k) in eax, and the result is
** returned in xmm0. Note: code is emitted in reverse order (the assembler
** works backwards through the machine-code buffer), so reading top to
** bottom here corresponds to the instructions executing bottom to top.
*/
static void asm_powi(ASMState *as, IRIns *ir)
{
/* The modified regs must match with the *.dasc implementation. */
RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop); /* Evict everything the helper call clobbers. */
ra_destreg(as, ir, RID_XMM0); /* Result of the helper lands in xmm0. */
emit_call(as, lj_vm_powi_sse);
ra_left(as, RID_XMM0, ir->op1); /* 1st arg x in xmm0. */
ra_left(as, RID_EAX, ir->op2); /* 2nd arg k in eax. */
}
/* Find out whether swapping operands might be beneficial. */
static int swapops(ASMState *as, IRIns *ir)
{
@@ -3132,9 +3150,10 @@ static void asm_ir(ASMState *as, IRIns *ir)
case IR_MIN: asm_fparith(as, ir, XO_MINSD); break;
case IR_MAX: asm_fparith(as, ir, XO_MAXSD); break;
case IR_FPMATH: case IR_ATAN2: case IR_LDEXP: case IR_POWI:
case IR_FPMATH: case IR_ATAN2: case IR_LDEXP:
asm_fpmath(as, ir);
break;
case IR_POWI: asm_powi(as, ir); break;
/* Overflow-checking arithmetic ops. Note: don't use LEA here! */
case IR_ADDOV: asm_intarith(as, ir, XOg_ADD); break;
@@ -3285,8 +3304,22 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
if (inloop)
as->modset = RSET_SCRATCH;
break;
case IR_POWI:
ir->prev = REGSP_HINT(RID_XMM0);
if (inloop)
as->modset |= RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
continue;
case IR_FPMATH:
if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
if (ir->op2 == IRFPM_EXP2) { /* May be joined to lj_vm_pow_sse. */
ir->prev = REGSP_HINT(RID_XMM0);
#if !LJ_64
if (as->evenspill < 4) /* Leave room for 16 byte scratch area. */
as->evenspill = 4;
#endif
if (inloop)
as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
continue;
} else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
ir->prev = REGSP_HINT(RID_XMM0);
if (inloop)
as->modset |= RSET_RANGE(RID_XMM0, RID_XMM3+1)|RID2RSET(RID_EAX);