Fast forward to sync public repo.

Compile math.sinh(), math.cosh(), math.tanh() and math.random().
Compile various io.*() functions.
Drive the GC forward on string allocations in the parser.
Improve KNUM fuse vs. load heuristics.
Add abstract C call handling to IR.
Mike Pall
2009-12-08 20:35:29 +01:00
parent 5287b93264
commit 3f1f9e11f4
44 changed files with 1218 additions and 766 deletions

src/lj_asm.c
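The heart of the change is the generic call machinery added to this file (asm_gencall, asm_setupresult and asm_collectargs below), driven by per-function CCallInfo descriptors looked up through IRCALL_* indices in lj_ir_callinfo. A standalone toy of the descriptor-table idea; field layout and flag values are made up for the demo, only the names mirror the patch:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy CCallInfo: one descriptor per callable C function, indexed by an
** IRCALL_* constant stored in the call instruction's op2. NOT the real
** lj_ir_callinfo layout (there, nargs is packed into the flags).
*/
typedef struct CCallInfo {
  const char *name;   /* Demo only; the real table stores the address. */
  uint32_t nargs;
  uint32_t flags;
} CCallInfo;

enum { CCI_L = 0x0100, CCI_FASTCALL = 0x0200, CCI_NOFPRCLOBBER = 0x0400 };

static const CCallInfo demo_callinfo[] = {
  { "lj_str_new",     3, CCI_L },  /* (L, str, len) */
  { "lj_tab_len",     1, 0 },      /* (t) */
  { "lj_gc_step_jit", 2, CCI_L },  /* (L, steps) */
};

int main(void)
{
  for (size_t i = 0; i < sizeof(demo_callinfo)/sizeof(demo_callinfo[0]); i++)
    printf("%-16s nargs=%u flags=0x%04x\n", demo_callinfo[i].name,
           demo_callinfo[i].nargs, demo_callinfo[i].flags);
  return 0;
}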

@@ -13,6 +13,7 @@
#include "lj_gc.h"
#include "lj_str.h"
#include "lj_tab.h"
#include "lj_frame.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
@@ -81,6 +82,10 @@ typedef struct ASMState {
#define IR(ref) (&as->ir[(ref)])
#define ASMREF_TMP1 REF_TRUE /* Temp. register. */
#define ASMREF_TMP2 REF_FALSE /* Temp. register. */
#define ASMREF_L REF_NIL /* Stores register for L. */
/* Check for variant to invariant references. */
#define iscrossref(as, ref) ((ref) < as->sectref)
@@ -115,9 +120,11 @@ static LJ_NORET LJ_NOINLINE void asm_mclimit(ASMState *as)
{ MCode rex = 0x40 + (((rr)>>1)&4) + (((rb)>>3)&1); \
if (rex != 0x40) *--(p) = rex; }
#define FORCE_REX 0x200
#define REX_64 (FORCE_REX|0x080000)
#else
#define REXRB(p, rr, rb) ((void)0)
#define FORCE_REX 0
#define REX_64 0
#endif
#define emit_i8(as, i) (*--as->mcp = (MCode)(i))
@@ -144,6 +151,7 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
{
uint32_t rex = 0x40 + ((rr>>1)&(4+(FORCE_REX>>1)))+((rx>>2)&2)+((rb>>3)&1);
if (rex != 0x40) {
rex |= (rr >> 16);
if (n == -4) { *p = (MCode)rex; rex = (MCode)(xo >> 8); }
*--p = (MCode)rex;
}
@@ -451,14 +459,6 @@ static void emit_call_(ASMState *as, MCode *target)
#define emit_call(as, f) emit_call_(as, (MCode *)(void *)(f))
-/* Argument setup for C calls. Up to 3 args need no stack adjustment. */
-#define emit_setargr(as, narg, r) \
-emit_movtomro(as, (r), RID_ESP, ((narg)-1)*4);
-#define emit_setargi(as, narg, imm) \
-emit_movmroi(as, RID_ESP, ((narg)-1)*4, (imm))
-#define emit_setargp(as, narg, ptr) \
-emit_setargi(as, (narg), ptr2addr((ptr)))
/* -- Register allocator debugging ---------------------------------------- */
/* #define LUAJIT_DEBUG_RA */
@@ -578,10 +578,6 @@ static void ra_setup(ASMState *as)
memset(as->phireg, 0, sizeof(as->phireg));
memset(as->cost, 0, sizeof(as->cost));
as->cost[RID_ESP] = REGCOST(~0u, 0u);
-/* Start slots for spill slot allocation. */
-as->evenspill = (SPS_FIRST+1)&~1;
-as->oddspill = (SPS_FIRST&1) ? SPS_FIRST : 0;
}
/* Rematerialize constants. */
@@ -598,6 +594,9 @@ static Reg ra_rematk(ASMState *as, IRIns *ir)
} else if (ir->o == IR_BASE) {
ra_sethint(ir->r, RID_BASE); /* Restore BASE register hint. */
emit_getgl(as, r, jit_base);
} else if (ir->o == IR_KPRI) { /* REF_NIL stores ASMREF_L register. */
lua_assert(irt_isnil(ir->t));
emit_getgl(as, r, jit_L);
} else {
lua_assert(ir->o == IR_KINT || ir->o == IR_KGC ||
ir->o == IR_KPTR || ir->o == IR_KNULL);
@@ -629,6 +628,18 @@ static int32_t ra_spill(ASMState *as, IRIns *ir)
return sps_scale(slot);
}
/* Release the temporarily allocated register in ASMREF_TMP1/ASMREF_TMP2. */
static Reg ra_releasetmp(ASMState *as, IRRef ref)
{
IRIns *ir = IR(ref);
Reg r = ir->r;
lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
ra_free(as, r);
ra_modified(as, r);
ir->r = RID_INIT;
return r;
}
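The new helper pairs with the ASMREF_TMP1/ASMREF_TMP2 pseudo-refs defined above: a call passes the pseudo-ref as an argument, asm_gencall() allocates a register for it like for any other operand, and the caller then releases it and emits the instruction that defines the temporary. Because machine code is emitted backwards, that defining instruction precedes the call at run time. The pattern as it appears in asm_strto() further down:

args[1] = ASMREF_TMP1; /* Pass the temp as the TValue pointer argument. */
asm_gencall(as, ci, args); /* Allocates a register for ASMREF_TMP1. */
emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1),
RID_ESP, sps_scale(ir->s)); /* Define it: LEA reg, [esp+ofs]. */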
/* Restore a register (marked as free). Rematerialize or force a spill. */
static Reg ra_restore(ASMState *as, IRRef ref)
{
@@ -1008,7 +1019,7 @@ static void asm_guardcc(ASMState *as, int cc)
/* Arch-specific field offsets. */
static const uint8_t field_ofs[IRFL__MAX+1] = {
-#define FLOFS(name, type, field) (uint8_t)offsetof(type, field),
#define FLOFS(name, ofs) (uint8_t)(ofs),
IRFLDEF(FLOFS)
#undef FLOFS
0
@@ -1129,7 +1140,7 @@ static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
{
IRIns *irr;
lua_assert(ir->o == IR_STRREF);
-as->mrm.idx = as->mrm.base = RID_NONE;
as->mrm.base = as->mrm.idx = RID_NONE;
as->mrm.scale = XM_SCALE1;
as->mrm.ofs = sizeof(GCstr);
if (irref_isk(ir->op1)) {
@@ -1158,6 +1169,17 @@ static void asm_fusestrref(ASMState *as, IRIns *ir, RegSet allow)
}
}
static void asm_fusexref(ASMState *as, IRIns *ir, RegSet allow)
{
if (ir->o == IR_KPTR) {
as->mrm.ofs = ir->i;
as->mrm.base = as->mrm.idx = RID_NONE;
} else {
lua_assert(ir->o == IR_STRREF);
asm_fusestrref(as, ir, allow);
}
}
/* Fuse load into memory operand. */
static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
{
@@ -1172,8 +1194,9 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
return RID_MRM;
}
if (ir->o == IR_KNUM) {
RegSet avail = as->freeset & ~as->modset & RSET_FPR;
lua_assert(allow != RSET_EMPTY);
-if (!(as->freeset & ~as->modset & RSET_FPR)) {
if (!(avail & (avail-1))) { /* Fuse if less than two regs available. */
as->mrm.ofs = ptr2addr(ir_knum(ir));
as->mrm.base = as->mrm.idx = RID_NONE;
return RID_MRM;
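This is the improved KNUM fuse vs. load heuristic from the commit message: the constant is now loaded through a register unless FP registers are genuinely scarce. avail & (avail-1) clears the lowest set bit of the free-and-unmodified set, so the test is zero exactly when fewer than two registers remain. A standalone check of the bit trick (not part of the patch):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t RegSet;  /* One bit per register, as in this file. */

static int fewer_than_two(RegSet avail)
{
  return !(avail & (avail-1));  /* 0 after clearing lowest bit iff <= 1 bit set. */
}

int main(void)
{
  printf("%d\n", fewer_than_two(0x00));  /* no regs  -> 1: fuse the load */
  printf("%d\n", fewer_than_two(0x10));  /* one reg  -> 1: fuse the load */
  printf("%d\n", fewer_than_two(0x12));  /* two regs -> 0: load into a reg */
  return 0;
}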
@@ -1188,8 +1211,9 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
return RID_MRM;
}
} else if (ir->o == IR_FLOAD) {
-/* Generic fusion is only ok for IRT_INT operand (but see asm_comp). */
-if (irt_isint(ir->t) && noconflict(as, ref, IR_FSTORE)) {
/* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
if ((irt_isint(ir->t) || irt_isaddr(ir->t)) &&
noconflict(as, ref, IR_FSTORE)) {
asm_fusefref(as, ir, xallow);
return RID_MRM;
}
@@ -1199,11 +1223,11 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
return RID_MRM;
}
} else if (ir->o == IR_XLOAD) {
-/* Generic fusion is only ok for IRT_INT operand (but see asm_comp).
/* Generic fusion is only ok for 32 bit operand (but see asm_comp).
** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
*/
-if (irt_isint(ir->t)) {
-asm_fusestrref(as, IR(ir->op1), xallow);
if (irt_isint(ir->t) || irt_isaddr(ir->t)) {
asm_fusexref(as, IR(ir->op1), xallow);
return RID_MRM;
}
}
@@ -1214,6 +1238,137 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
return ra_allocref(as, ref, allow);
}
/* -- Calls --------------------------------------------------------------- */
/* Generate a call to a C function. */
static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args)
{
RegSet allow = RSET_ALL;
uint32_t n, nargs = CCI_NARGS(ci);
int32_t ofs = 0;
lua_assert(!(nargs > 2 && (ci->flags&CCI_FASTCALL))); /* Avoid stack adj. */
emit_call(as, ci->func);
for (n = 0; n < nargs; n++) { /* Setup args. */
#if LJ_64
#error "NYI: 64 bit mode call argument setup"
#endif
IRIns *ir = IR(args[n]);
if (irt_isnum(ir->t)) {
if ((ofs & 4) && irref_isk(args[n])) {
/* Split stores for unaligned FP consts. */
emit_movmroi(as, RID_ESP, ofs, (int32_t)ir_knum(ir)->u32.lo);
emit_movmroi(as, RID_ESP, ofs+4, (int32_t)ir_knum(ir)->u32.hi);
} else {
Reg r;
if ((allow & RSET_FPR) == RSET_EMPTY)
lj_trace_err(as->J, LJ_TRERR_NYICOAL);
r = ra_alloc1(as, args[n], allow & RSET_FPR);
allow &= ~RID2RSET(r);
emit_rmro(as, XO_MOVSDto, r, RID_ESP, ofs);
}
ofs += 8;
} else {
if ((ci->flags & CCI_FASTCALL) && n < 2) {
Reg r = n == 0 ? RID_ECX : RID_EDX;
if (args[n] < ASMREF_TMP1) {
emit_loadi(as, r, ir->i);
} else {
lua_assert(rset_test(as->freeset, r)); /* Must have been evicted. */
allow &= ~RID2RSET(r);
if (ra_hasreg(ir->r))
emit_movrr(as, r, ir->r);
else
ra_allocref(as, args[n], RID2RSET(r));
}
} else {
if (args[n] < ASMREF_TMP1) {
emit_movmroi(as, RID_ESP, ofs, ir->i);
} else {
Reg r;
if ((allow & RSET_GPR) == RSET_EMPTY)
lj_trace_err(as->J, LJ_TRERR_NYICOAL);
r = ra_alloc1(as, args[n], allow & RSET_GPR);
allow &= ~RID2RSET(r);
emit_movtomro(as, r, RID_ESP, ofs);
}
ofs += 4;
}
}
}
}
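On x86 the arguments are stored into a pre-reserved [esp+ofs] area rather than pushed, which is why no stack-pointer adjustment is needed around the call; doubles take 8 bytes, GPR arguments 4, and with CCI_FASTCALL the first two GPR arguments ride in ECX/EDX with no stack use at all. A standalone sketch of the resulting layout, with an assumed example signature (not LuaJIT code):

#include <stdio.h>

int main(void)
{
  int is_num[] = {0, 1, 0};  /* e.g. f(int, double, int) */
  int ofs = 0;
  for (int n = 0; n < 3; n++) {
    if (is_num[n]) {
      printf("arg %d: double at [esp+%d]\n", n, ofs);
      ofs += 8;  /* Unaligned FP consts get split into two 32 bit stores. */
    } else {
      printf("arg %d: int at [esp+%d]\n", n, ofs);
      ofs += 4;
    }
  }
  printf("outgoing argument area: %d bytes\n", ofs);
  return 0;
}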
/* Setup result reg/sp for call. Evict scratch regs. */
static void asm_setupresult(ASMState *as, IRIns *ir, const CCallInfo *ci)
{
RegSet drop = RSET_SCRATCH;
if ((ci->flags & CCI_NOFPRCLOBBER))
drop &= ~RSET_FPR;
if (ra_hasreg(ir->r))
rset_clear(drop, ir->r); /* Dest reg handled below. */
ra_evictset(as, drop); /* Evictions must be performed first. */
if (ra_used(ir)) {
if (irt_isnum(ir->t)) {
int32_t ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */
#if LJ_64
if ((ci->flags & CCI_CASTU64)) {
Reg dest = ir->r;
if (ra_hasreg(dest)) {
ra_free(as, dest);
ra_modified(as, dest);
emit_rr(as, XO_MOVD, dest|REX_64, RID_RET); /* Really MOVQ. */
} else {
emit_movrmro(as, RID_RET, RID_ESP, ofs);
}
} else {
ra_destreg(as, ir, RID_FPRET);
}
#else
/* Number result is in x87 st0 for x86 calling convention. */
Reg dest = ir->r;
if (ra_hasreg(dest)) {
ra_free(as, dest);
ra_modified(as, dest);
emit_rmro(as, XMM_MOVRM(as), dest, RID_ESP, ofs);
}
if ((ci->flags & CCI_CASTU64)) {
emit_movtomro(as, RID_RET, RID_ESP, ofs);
emit_movtomro(as, RID_RETHI, RID_ESP, ofs+4);
} else {
emit_rmro(as, XO_FSTPq, XOg_FSTPq, RID_ESP, ofs);
}
#endif
} else {
lua_assert(!irt_ispri(ir->t));
ra_destreg(as, ir, RID_RET);
}
}
}
/* Collect arguments from CALL* and ARG instructions. */
static void asm_collectargs(ASMState *as, IRIns *ir,
const CCallInfo *ci, IRRef *args)
{
uint32_t n = CCI_NARGS(ci);
lua_assert(n <= CCI_NARGS_MAX);
if ((ci->flags & CCI_L)) { *args++ = ASMREF_L; n--; }
while (n-- > 1) {
ir = IR(ir->op1);
lua_assert(ir->o == IR_CARG);
args[n] = ir->op2;
}
args[0] = ir->op1;
lua_assert(IR(ir->op1)->o != IR_CARG);
}
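A call's arguments form a left-leaning chain of IR_CARG nodes hanging off the call's op1, so the rightmost argument sits nearest the call and the array fills right to left; with CCI_L the implicit lua_State argument is prepended as ASMREF_L. A standalone toy of the unchaining, with plain structs instead of the real IRIns encoding:

#include <stdio.h>

typedef struct Ins { int op1, op2; } Ins;

int main(void)
{
  /* f(10, 20, 30):  CALL -> CARG(CARG(10, 20), 30) */
  Ins ir[3];
  ir[0].op1 = 10; ir[0].op2 = 20;  /* inner CARG: first two args */
  ir[1].op1 = 0;  ir[1].op2 = 30;  /* outer CARG: op1 = index of inner */
  ir[2].op1 = 1;  ir[2].op2 = 0;   /* CALL: op1 = index of outer CARG */
  int args[3], n = 3, ref = 2;
  while (n-- > 1) {      /* Unchain right to left, as above. */
    ref = ir[ref].op1;
    args[n] = ir[ref].op2;
  }
  args[0] = ir[ref].op1; /* Leftmost argument is a plain operand. */
  for (n = 0; n < 3; n++)
    printf("args[%d] = %d\n", n, args[n]);
  return 0;
}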
static void asm_call(ASMState *as, IRIns *ir)
{
IRRef args[CCI_NARGS_MAX];
const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
asm_collectargs(as, ir, ci, args);
asm_setupresult(as, ir, ci);
asm_gencall(as, ci, args);
}
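asm_setupresult() runs before asm_gencall() here because the assembler emits machine code backwards (cf. *--as->mcp in emit_i8 near the top): what is emitted first ends up last in execution order, so the result handling lands after the call and the argument stores before it. A tiny standalone demo of backward emission into a buffer:

#include <stdio.h>
#include <string.h>

int main(void)
{
  char buf[64], *p = buf + sizeof(buf);
  const char *emitted[] = { "result", "call", "args" };  /* emission order */
  for (int i = 0; i < 3; i++) {
    size_t n = strlen(emitted[i]) + 1;
    p -= n;  /* Pre-decrement, like *--as->mcp. */
    memcpy(p, emitted[i], n - 1);
    p[n-1] = ' ';
  }
  /* Prints "args call result ": execution order is the reverse. */
  printf("%.*s\n", (int)(buf + sizeof(buf) - p), p);
  return 0;
}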
/* -- Type conversions ---------------------------------------------------- */
static void asm_tonum(ASMState *as, IRIns *ir)
@@ -1260,48 +1415,41 @@ static void asm_tobit(ASMState *as, IRIns *ir)
static void asm_strto(ASMState *as, IRIns *ir)
{
-Reg str;
-int32_t ofs;
-RegSet drop = RSET_SCRATCH;
/* Force a spill slot for the destination register (if any). */
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_tonum];
IRRef args[2];
RegSet drop = RSET_SCRATCH;
if ((drop & RSET_FPR) != RSET_FPR && ra_hasreg(ir->r))
rset_set(drop, ir->r); /* WIN64 doesn't spill all FPRs. */
ra_evictset(as, drop);
asm_guardcc(as, CC_E);
emit_rr(as, XO_TEST, RID_RET, RID_RET);
-/* int lj_str_numconv(const char *s, TValue *n) */
-emit_call(as, lj_str_numconv);
-ofs = sps_scale(ir->s); /* Use spill slot or slots SPS_TEMP1/2. */
-if (ofs == 0) {
-emit_setargr(as, 2, RID_ESP);
-} else {
-emit_setargr(as, 2, RID_RET);
-emit_rmro(as, XO_LEA, RID_RET, RID_ESP, ofs);
-}
-emit_setargr(as, 1, RID_RET);
-str = ra_alloc1(as, ir->op1, RSET_GPR);
-emit_rmro(as, XO_LEA, RID_RET, str, sizeof(GCstr));
args[0] = ir->op1;
args[1] = ASMREF_TMP1;
asm_gencall(as, ci, args);
/* Store the result to the spill slot or slots SPS_TEMP1/2. */
emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1),
RID_ESP, sps_scale(ir->s));
}
static void asm_tostr(ASMState *as, IRIns *ir)
{
IRIns *irl = IR(ir->op1);
-ra_destreg(as, ir, RID_RET);
-ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
IRRef args[2];
args[0] = ASMREF_L;
as->gcsteps++;
if (irt_isnum(irl->t)) {
-/* GCstr *lj_str_fromnum(lua_State *L, const lua_Number *np) */
-emit_call(as, lj_str_fromnum);
-emit_setargr(as, 1, RID_RET);
-emit_getgl(as, RID_RET, jit_L);
-emit_setargr(as, 2, RID_RET);
-emit_rmro(as, XO_LEA, RID_RET, RID_ESP, ra_spill(as, irl));
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromnum];
args[1] = ASMREF_TMP1;
asm_setupresult(as, ir, ci);
asm_gencall(as, ci, args);
emit_rmro(as, XO_LEA, ra_releasetmp(as, ASMREF_TMP1),
RID_ESP, ra_spill(as, irl));
} else {
-/* GCstr *lj_str_fromint(lua_State *L, int32_t k) */
-emit_call(as, lj_str_fromint);
-emit_setargr(as, 1, RID_RET);
-emit_getgl(as, RID_RET, jit_L);
-emit_setargr(as, 2, ra_alloc1(as, ir->op1, RSET_GPR));
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_fromint];
args[1] = ir->op1;
asm_setupresult(as, ir, ci);
asm_gencall(as, ci, args);
}
}
@@ -1330,7 +1478,7 @@ static uint32_t ir_khash(IRIns *ir)
lua_assert(!irt_isnil(ir->t));
return irt_type(ir->t)-IRT_FALSE;
} else {
-lua_assert(irt_isaddr(ir->t));
lua_assert(irt_isgcv(ir->t));
lo = u32ptr(ir_kgc(ir));
hi = lo - 0x04c11db7;
}
@@ -1517,33 +1665,27 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
static void asm_newref(ASMState *as, IRIns *ir)
{
-IRRef keyref = ir->op2;
-IRIns *irkey = IR(keyref);
-RegSet allow = RSET_GPR;
-Reg tab, tmp;
-ra_destreg(as, ir, RID_RET);
-ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
-tab = ra_alloc1(as, ir->op1, allow);
-tmp = ra_scratch(as, rset_clear(allow, tab));
-/* TValue *lj_tab_newkey(lua_State *L, GCtab *t, cTValue *key) */
-emit_call(as, lj_tab_newkey);
-emit_setargr(as, 1, tmp);
-emit_setargr(as, 2, tab);
-emit_getgl(as, tmp, jit_L);
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_newkey];
IRRef args[3];
IRIns *irkey;
Reg tmp;
args[0] = ASMREF_L;
args[1] = ir->op1;
args[2] = ASMREF_TMP1;
asm_setupresult(as, ir, ci);
asm_gencall(as, ci, args);
tmp = ra_releasetmp(as, ASMREF_TMP1);
irkey = IR(ir->op2);
if (irt_isnum(irkey->t)) {
/* For numbers use the constant itself or a spill slot as a TValue. */
-if (irref_isk(keyref)) {
-emit_setargp(as, 3, ir_knum(irkey));
-} else {
-emit_setargr(as, 3, tmp);
if (irref_isk(ir->op2))
emit_loada(as, tmp, ir_knum(irkey));
else
emit_rmro(as, XO_LEA, tmp, RID_ESP, ra_spill(as, irkey));
-}
} else {
/* Otherwise use g->tmptv to hold the TValue. */
lua_assert(irt_ispri(irkey->t) || irt_isaddr(irkey->t));
-emit_setargr(as, 3, tmp);
-if (!irref_isk(keyref)) {
-Reg src = ra_alloc1(as, keyref, rset_exclude(allow, tmp));
if (!irref_isk(ir->op2)) {
Reg src = ra_alloc1(as, ir->op2, rset_exclude(RSET_GPR, tmp));
emit_movtomro(as, src, tmp, 0);
} else if (!irt_ispri(irkey->t)) {
emit_movmroi(as, tmp, 0, irkey->i);
@@ -1600,11 +1742,15 @@ static void asm_strref(ASMState *as, IRIns *ir)
/* -- Loads and stores ---------------------------------------------------- */
-static void asm_fload(ASMState *as, IRIns *ir)
static void asm_fxload(ASMState *as, IRIns *ir)
{
Reg dest = ra_dest(as, ir, RSET_GPR);
x86Op xo;
-asm_fusefref(as, ir, RSET_GPR);
if (ir->o == IR_FLOAD)
asm_fusefref(as, ir, RSET_GPR);
else
asm_fusexref(as, IR(ir->op1), RSET_GPR);
/* ir->op2 is ignored -- unaligned loads are ok on x86. */
switch (irt_type(ir->t)) {
case IRT_I8: xo = XO_MOVSXb; break;
case IRT_U8: xo = XO_MOVZXb; break;
@@ -1731,96 +1877,44 @@ static void asm_sload(ASMState *as, IRIns *ir)
}
}
-static void asm_xload(ASMState *as, IRIns *ir)
-{
-Reg dest = ra_dest(as, ir, RSET_GPR);
-x86Op xo;
-asm_fusestrref(as, IR(ir->op1), RSET_GPR); /* For now only support STRREF. */
-/* ir->op2 is ignored -- unaligned loads are ok on x86. */
-switch (irt_type(ir->t)) {
-case IRT_I8: xo = XO_MOVSXb; break;
-case IRT_U8: xo = XO_MOVZXb; break;
-case IRT_I16: xo = XO_MOVSXw; break;
-case IRT_U16: xo = XO_MOVZXw; break;
-default: lua_assert(irt_isint(ir->t)); xo = XO_MOV; break;
-}
-emit_mrm(as, xo, dest, RID_MRM);
-}
-/* -- String ops ---------------------------------------------------------- */
/* -- Allocations --------------------------------------------------------- */
static void asm_snew(ASMState *as, IRIns *ir)
{
-RegSet allow = RSET_GPR;
-Reg left, right;
-IRIns *irl;
-ra_destreg(as, ir, RID_RET);
-ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
-irl = IR(ir->op1);
-left = irl->r;
-right = IR(ir->op2)->r;
-if (ra_noreg(left)) {
-lua_assert(irl->o == IR_STRREF);
-/* Get register only for non-const STRREF. */
-if (!(irref_isk(irl->op1) && irref_isk(irl->op2))) {
-if (ra_hasreg(right)) rset_clear(allow, right);
-left = ra_allocref(as, ir->op1, allow);
-}
-}
-if (ra_noreg(right) && !irref_isk(ir->op2)) {
-if (ra_hasreg(left)) rset_clear(allow, left);
-right = ra_allocref(as, ir->op2, allow);
-}
-/* GCstr *lj_str_new(lua_State *L, const char *str, size_t len) */
-emit_call(as, lj_str_new);
-emit_setargr(as, 1, RID_RET);
-emit_getgl(as, RID_RET, jit_L);
-if (ra_noreg(left)) /* Use immediate for const STRREF. */
-emit_setargi(as, 2, IR(irl->op1)->i + IR(irl->op2)->i +
-(int32_t)sizeof(GCstr));
-else
-emit_setargr(as, 2, left);
-if (ra_noreg(right))
-emit_setargi(as, 3, IR(ir->op2)->i);
-else
-emit_setargr(as, 3, right);
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_str_new];
IRRef args[3];
args[0] = ASMREF_L;
args[1] = ir->op1;
args[2] = ir->op2;
as->gcsteps++;
asm_setupresult(as, ir, ci);
asm_gencall(as, ci, args);
}
-/* -- Table ops ----------------------------------------------------------- */
static void asm_tnew(ASMState *as, IRIns *ir)
{
-ra_destreg(as, ir, RID_RET);
-ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
-/* GCtab *lj_tab_new(lua_State *L, int32_t asize, uint32_t hbits) */
-emit_call(as, lj_tab_new);
-emit_setargr(as, 1, RID_RET);
-emit_setargi(as, 2, ir->op1);
-emit_setargi(as, 3, ir->op2);
-emit_getgl(as, RID_RET, jit_L);
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_new1];
IRRef args[2];
args[0] = ASMREF_L;
args[1] = ASMREF_TMP1;
as->gcsteps++;
asm_setupresult(as, ir, ci);
asm_gencall(as, ci, args);
emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), ir->op1 | (ir->op2 << 24));
}
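The tail of the new asm_tnew() packs both TNEW operands into a single 32 bit immediate for lj_tab_new1: the array size in the low 24 bits and the hash-size log2 in the top byte, as implied by the shift in the emit_loadi(). A worked example of the packing; the 24 bit split is read off the code, the field meanings off the old lj_tab_new signature:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint32_t asize = 16, hbits = 2;
  uint32_t packed = asize | (hbits << 24);  /* ir->op1 | (ir->op2 << 24) */
  printf("packed = 0x%08x\n", packed);      /* 0x02000010 */
  printf("asize  = %u\n", packed & 0x00ffffffu);
  printf("hbits  = %u\n", packed >> 24);
  return 0;
}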
static void asm_tdup(ASMState *as, IRIns *ir)
{
-ra_destreg(as, ir, RID_RET);
-ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
-/* GCtab *lj_tab_dup(lua_State *L, const GCtab *kt) */
-emit_call(as, lj_tab_dup);
-emit_setargr(as, 1, RID_RET);
-emit_setargp(as, 2, ir_kgc(IR(ir->op1)));
-emit_getgl(as, RID_RET, jit_L);
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_tab_dup];
IRRef args[2];
args[0] = ASMREF_L;
args[1] = ir->op1;
as->gcsteps++;
asm_setupresult(as, ir, ci);
asm_gencall(as, ci, args);
}
-static void asm_tlen(ASMState *as, IRIns *ir)
-{
-ra_destreg(as, ir, RID_RET);
-ra_evictset(as, rset_exclude(RSET_SCRATCH, RID_RET));
-emit_call(as, lj_tab_len); /* MSize lj_tab_len(GCtab *t) */
-emit_setargr(as, 1, ra_alloc1(as, ir->op1, RSET_GPR));
-}
/* -- Write barriers ------------------------------------------------------ */
static void asm_tbar(ASMState *as, IRIns *ir)
{
@@ -1839,51 +1933,31 @@ static void asm_tbar(ASMState *as, IRIns *ir)
static void asm_obar(ASMState *as, IRIns *ir)
{
-RegSet allow = RSET_GPR;
-Reg obj, val;
-GCobj *valp;
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_barrieruv];
IRRef args[2];
MCLabel l_end;
-int32_t ofs;
ra_evictset(as, RSET_SCRATCH);
-if (irref_isk(ir->op2)) {
-valp = ir_kgc(IR(ir->op2));
-val = RID_NONE;
-} else {
-valp = NULL;
-val = ra_alloc1(as, ir->op2, allow);
-rset_clear(allow, val);
-}
-obj = ra_alloc1(as, ir->op1, allow);
-l_end = emit_label(as);
Reg obj;
/* No need for other object barriers (yet). */
lua_assert(IR(ir->op1)->o == IR_UREFC);
-ofs = -(int32_t)offsetof(GCupval, tv);
-/* void lj_gc_barrieruv(global_State *g, GCobj *o, GCobj *v) */
-emit_call(as, lj_gc_barrieruv);
-if (ofs == 0) {
-emit_setargr(as, 2, obj);
-} else if (rset_test(RSET_SCRATCH, obj) && !(as->flags & JIT_F_LEA_AGU)) {
-emit_setargr(as, 2, obj);
-emit_gri(as, XG_ARITHi(XOg_ADD), obj, ofs);
-} else {
-emit_setargr(as, 2, RID_RET);
-emit_rmro(as, XO_LEA, RID_RET, obj, ofs);
-}
-emit_setargp(as, 1, J2G(as->J));
-if (valp)
-emit_setargp(as, 3, valp);
-else
-emit_setargr(as, 3, val);
l_end = emit_label(as);
args[0] = ASMREF_TMP1;
args[1] = ir->op1;
asm_gencall(as, ci, args);
emit_loada(as, ra_releasetmp(as, ASMREF_TMP1), J2G(as->J));
obj = IR(ir->op1)->r;
emit_sjcc(as, CC_Z, l_end);
emit_i8(as, LJ_GC_WHITES);
-if (valp)
-emit_rma(as, XO_GROUP3b, XOg_TEST, &valp->gch.marked);
-else
if (irref_isk(ir->op2)) {
GCobj *vp = ir_kgc(IR(ir->op2));
emit_rma(as, XO_GROUP3b, XOg_TEST, &vp->gch.marked);
} else {
Reg val = ra_alloc1(as, ir->op2, rset_exclude(RSET_SCRATCH&RSET_GPR, obj));
emit_rmro(as, XO_GROUP3b, XOg_TEST, val, (int32_t)offsetof(GChead, marked));
}
emit_sjcc(as, CC_Z, l_end);
emit_i8(as, LJ_GC_BLACK);
emit_rmro(as, XO_GROUP3b, XOg_TEST, obj,
-ofs + (int32_t)offsetof(GChead, marked));
(int32_t)offsetof(GCupval, marked)-(int32_t)offsetof(GCupval, tv));
}
/* -- FP/int arithmetic and logic operations ------------------------------ */
@@ -2260,10 +2334,10 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
}
}
emit_mrm(as, XO_UCOMISD, left, right);
-} else if (!(irt_isstr(ir->t) && (cc & 0xe) != CC_E)) {
} else {
IRRef lref = ir->op1, rref = ir->op2;
IROp leftop = (IROp)(IR(lref)->o);
-lua_assert(irt_isint(ir->t) || irt_isaddr(ir->t));
lua_assert(irt_isint(ir->t) || (irt_isaddr(ir->t) && (cc & 0xe) == CC_E));
/* Swap constants (only for ABC) and fusable loads to the right. */
if (irref_isk(lref) || (!irref_isk(rref) && opisfusableload(leftop))) {
if ((cc & 0xc) == 0xc) cc ^= 3; /* L <-> G, LE <-> GE */
@@ -2294,11 +2368,15 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
} else {
Reg left;
if (opisfusableload((IROp)irl->o) &&
-((irt_isi8(irl->t) && checki8(imm)) ||
-(irt_isu8(irl->t) && checku8(imm)))) {
-/* Only the IRT_INT case is fused by asm_fuseload. The IRT_I8/IRT_U8
-** loads are handled here. The IRT_I16/IRT_U16 loads should never be
-** fused, since cmp word [mem], imm16 has a length-changing prefix.
((irt_isu8(irl->t) && checku8(imm)) ||
((irt_isi8(irl->t) || irt_isi16(irl->t)) && checki8(imm)) ||
(irt_isu16(irl->t) && checku16(imm) && checki8((int16_t)imm)))) {
/* Only the IRT_INT case is fused by asm_fuseload.
** The IRT_I8/IRT_U8 loads and some IRT_I16/IRT_U16 loads
** are handled here.
** Note that cmp word [mem], imm16 should not be generated,
** since it has a length-changing prefix. Compares of a word
** against a sign-extended imm8 are ok, however.
*/
IRType1 origt = irl->t; /* Temporarily flip types. */
irl->t.irt = (irl->t.irt & ~IRT_TYPE) | IRT_INT;
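The comment rewrite is about x86 pre-decoding: with a 66h operand-size prefix, opcode 81 /7 switches its immediate from 32 to 16 bits, so the prefix changes the instruction's length (a length-changing prefix, which stalls the pre-decoder on many x86 cores). Opcode 83 /7 sign-extends an 8 bit immediate and keeps its length, so word compares against an imm8 are safe; that is the XO_ARITHiw8 form selected in the next hunk. A sketch of the two encodings for cmp word [eax], imm, assuming a plain ModRM byte with no SIB:

#include <stdio.h>

int main(void)
{
  const unsigned char cmp_imm16[] = {0x66,0x81,0x38,0x34,0x12}; /* cmp word [eax], 0x1234: LCP */
  const unsigned char cmp_imm8[]  = {0x66,0x83,0x38,0x12};      /* cmp word [eax], 0x12: ok */
  printf("imm16 form: %d bytes, imm8 form: %d bytes\n",
         (int)sizeof(cmp_imm16), (int)sizeof(cmp_imm8));
  return 0;
}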
@@ -2307,7 +2385,8 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
if (left == RID_MRM) { /* Fusion succeeded? */
asm_guardcc(as, cc);
emit_i8(as, imm);
-emit_mrm(as, XO_ARITHib, XOg_CMP, RID_MRM);
emit_mrm(as, (irt_isi8(origt) || irt_isu8(origt)) ?
XO_ARITHib : XO_ARITHiw8, XOg_CMP, RID_MRM);
return;
} /* Otherwise handle register case as usual. */
} else {
@@ -2337,26 +2416,6 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
asm_guardcc(as, cc);
emit_mrm(as, XO_CMP, left, right);
}
-} else { /* Handle ordered string compares. */
-RegSet allow = RSET_GPR;
-/* This assumes lj_str_cmp never uses any SSE registers. */
-ra_evictset(as, (RSET_SCRATCH & RSET_GPR));
-asm_guardcc(as, cc);
-emit_rr(as, XO_TEST, RID_RET, RID_RET);
-emit_call(as, lj_str_cmp); /* int32_t lj_str_cmp(GCstr *a, GCstr *b) */
-if (irref_isk(ir->op1)) {
-emit_setargi(as, 1, IR(ir->op1)->i);
-} else {
-Reg left = ra_alloc1(as, ir->op1, allow);
-rset_clear(allow, left);
-emit_setargr(as, 1, left);
-}
-if (irref_isk(ir->op2)) {
-emit_setargi(as, 2, IR(ir->op2)->i);
-} else {
-Reg right = ra_alloc1(as, ir->op2, allow);
-emit_setargr(as, 2, right);
-}
}
}
@@ -2366,8 +2425,14 @@ static void asm_comp_(ASMState *as, IRIns *ir, int cc)
/* -- GC handling --------------------------------------------------------- */
/* Sync all live GC values to Lua stack slots. */
-static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base, RegSet allow)
static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base)
{
/* Some care must be taken when allocating registers here, since this is
** not part of the fast path. All scratch registers are evicted in the
** fast path, so it's easiest to force allocation from scratch registers
** only. This avoids register allocation state unification.
*/
RegSet allow = rset_exclude(RSET_SCRATCH & RSET_GPR, base);
IRRef2 *map = &as->T->snapmap[snap->mapofs];
BCReg s, nslots = snap->nslots;
for (s = 0; s < nslots; s++) {
@@ -2392,27 +2457,36 @@ static void asm_gc_sync(ASMState *as, SnapShot *snap, Reg base, RegSet allow)
/* Check GC threshold and do one or more GC steps. */
static void asm_gc_check(ASMState *as, SnapShot *snap)
{
const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_gc_step_jit];
IRRef args[2];
MCLabel l_end;
-const BCIns *pc;
-Reg tmp, base;
Reg base, lstate, tmp;
RegSet drop = RSET_SCRATCH;
-/* Must evict BASE because the stack may be reallocated by the GC. */
-if (ra_hasreg(IR(REF_BASE)->r))
-drop |= RID2RSET(IR(REF_BASE)->r);
if (ra_hasreg(IR(REF_BASE)->r)) /* Stack may be reallocated by the GC. */
drop |= RID2RSET(IR(REF_BASE)->r); /* Need to evict BASE, too. */
ra_evictset(as, drop);
-base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_GPR, RID_RET));
l_end = emit_label(as);
-/* void lj_gc_step_jit(lua_State *L, const BCIns *pc, MSize steps) */
-emit_call(as, lj_gc_step_jit);
-emit_movtomro(as, base, RID_RET, offsetof(lua_State, base));
-emit_setargr(as, 1, RID_RET);
-emit_setargi(as, 3, (int32_t)as->gcsteps);
-emit_getgl(as, RID_RET, jit_L);
-pc = (const BCIns *)(uintptr_t)as->T->snapmap[snap->mapofs+snap->nslots];
-emit_setargp(as, 2, pc);
-asm_gc_sync(as, snap, base, rset_exclude(RSET_SCRATCH & RSET_GPR, base));
-if (as->curins == as->loopref) /* BASE gets restored by LOOP anyway. */
-ra_restore(as, REF_BASE); /* Better do it inside the slow path. */
args[0] = ASMREF_L;
args[1] = ASMREF_TMP1;
asm_gencall(as, ci, args);
tmp = ra_releasetmp(as, ASMREF_TMP1);
emit_loadi(as, tmp, (int32_t)as->gcsteps);
/* We don't know spadj yet, so get the C frame from L->cframe. */
emit_movmroi(as, tmp, CFRAME_OFS_PC,
(int32_t)as->T->snapmap[snap->mapofs+snap->nslots]);
emit_gri(as, XG_ARITHi(XOg_AND), tmp, CFRAME_RAWMASK);
lstate = IR(ASMREF_L)->r;
emit_movrmro(as, tmp, lstate, offsetof(lua_State, cframe));
/* It's ok if lstate is already in a non-scratch reg. But all allocations
** in the non-fast path must use a scratch reg. See comment above.
*/
base = ra_alloc1(as, REF_BASE, rset_exclude(RSET_SCRATCH & RSET_GPR, lstate));
emit_movtomro(as, base, lstate, offsetof(lua_State, base));
asm_gc_sync(as, snap, base);
/* BASE/L get restored anyway, better do it inside the slow path. */
if (as->parent || as->curins == as->loopref) ra_restore(as, REF_BASE);
if (rset_test(RSET_SCRATCH, lstate) && ra_hasreg(IR(ASMREF_L)->r))
ra_restore(as, ASMREF_L);
/* Jump around GC step if GC total < GC threshold. */
tmp = ra_scratch(as, RSET_SCRATCH & RSET_GPR);
emit_sjcc(as, CC_B, l_end);
@@ -2666,7 +2740,7 @@ static void asm_head_root(ASMState *as)
{
int32_t spadj;
emit_setgli(as, vmstate, (int32_t)as->J->curtrace);
-spadj = sps_adjust(as);
spadj = sps_adjust(as->evenspill);
as->T->spadjust = (uint16_t)spadj;
emit_addptr(as, RID_ESP, -spadj);
}
@@ -2676,11 +2750,13 @@ static void asm_head_base(ASMState *as)
{
IRIns *ir = IR(REF_BASE);
Reg r = ir->r;
-lua_assert(ra_hasreg(r) && !ra_hasspill(ir->s));
-ra_free(as, r);
-if (r != RID_BASE) {
-ra_scratch(as, RID2RSET(RID_BASE));
-emit_rr(as, XO_MOV, r, RID_BASE);
lua_assert(!ra_hasspill(ir->s));
if (ra_hasreg(r)) {
ra_free(as, r);
if (r != RID_BASE) {
ra_scratch(as, RID2RSET(RID_BASE));
emit_rr(as, XO_MOV, r, RID_BASE);
}
}
}
@@ -2749,7 +2825,7 @@ static void asm_head_side(ASMState *as)
}
/* Calculate stack frame adjustment. */
-spadj = sps_adjust(as);
spadj = sps_adjust(as->evenspill);
spdelta = spadj - (int32_t)as->parent->spadjust;
if (spdelta < 0) { /* Don't shrink the stack frame. */
spadj = (int32_t)as->parent->spadjust;
@@ -2877,9 +2953,11 @@ static void asm_tail_sync(ASMState *as)
GCfunc *fn = ir_kfunc(IR(ir->op2));
if (isluafunc(fn)) {
BCReg fs = s + funcproto(fn)->framesize;
-newbase = s;
-if (secondbase == ~(BCReg)0) secondbase = s;
if (fs > topslot) topslot = fs;
if (s != 0) {
newbase = s;
if (secondbase == ~(BCReg)0) secondbase = s;
}
}
}
}
@@ -3063,20 +3141,18 @@ static void asm_ir(ASMState *as, IRIns *ir)
/* Loads and stores. */
case IR_ALOAD: case IR_HLOAD: case IR_ULOAD: asm_ahuload(as, ir); break;
-case IR_FLOAD: asm_fload(as, ir); break;
case IR_FLOAD: case IR_XLOAD: asm_fxload(as, ir); break;
case IR_SLOAD: asm_sload(as, ir); break;
-case IR_XLOAD: asm_xload(as, ir); break;
case IR_ASTORE: case IR_HSTORE: case IR_USTORE: asm_ahustore(as, ir); break;
case IR_FSTORE: asm_fstore(as, ir); break;
-/* String ops. */
/* Allocations. */
case IR_SNEW: asm_snew(as, ir); break;
-/* Table ops. */
case IR_TNEW: asm_tnew(as, ir); break;
case IR_TDUP: asm_tdup(as, ir); break;
-case IR_TLEN: asm_tlen(as, ir); break;
/* Write barriers. */
case IR_TBAR: asm_tbar(as, ir); break;
case IR_OBAR: asm_obar(as, ir); break;
@@ -3092,6 +3168,10 @@ static void asm_ir(ASMState *as, IRIns *ir)
case IR_TOSTR: asm_tostr(as, ir); break;
case IR_STRTO: asm_strto(as, ir); break;
/* Calls. */
case IR_CALLN: case IR_CALLL: case IR_CALLS: asm_call(as, ir); break;
case IR_CARG: break;
default:
setintV(&as->J->errinfo, ir->o);
lj_trace_err_info(as->J, LJ_TRERR_NYIIR);
@@ -3123,6 +3203,8 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
IRRef i, nins;
int inloop;
ra_setup(as);
/* Clear reg/sp for constants. */
for (i = T->nk; i < REF_BIAS; i++)
IR(i)->prev = REGSP_INIT;
@@ -3144,6 +3226,7 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
as->curins = nins;
inloop = 0;
as->evenspill = SPS_FIRST;
for (i = REF_FIRST; i < nins; i++) {
IRIns *ir = IR(i);
switch (ir->o) {
@@ -3166,8 +3249,23 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
if (i == as->stopins+1 && ir->op1 == ir->op2)
as->stopins++;
break;
case IR_CALLN: case IR_CALLL: case IR_CALLS: {
const CCallInfo *ci = &lj_ir_callinfo[ir->op2];
/* NYI: not fastcall-aware, but doesn't matter (yet). */
if (CCI_NARGS(ci) > (uint32_t)as->evenspill) /* Leave room for args. */
as->evenspill = (int32_t)CCI_NARGS(ci);
#if LJ_64
ir->prev = REGSP_HINT(irt_isnum(ir->t) ? RID_FPRET : RID_RET);
#else
ir->prev = REGSP_HINT(RID_RET);
#endif
if (inloop)
as->modset |= (ci->flags & CCI_NOFPRCLOBBER) ?
(RSET_SCRATCH & ~RSET_FPR) : RSET_SCRATCH;
continue;
}
/* C calls evict all scratch regs and return results in RID_RET. */
-case IR_SNEW: case IR_TNEW: case IR_TDUP: case IR_TLEN: case IR_TOSTR:
case IR_SNEW: case IR_TNEW: case IR_TDUP: case IR_TOSTR:
case IR_NEWREF:
ir->prev = REGSP_HINT(RID_RET);
if (inloop)
@@ -3177,11 +3275,6 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
if (inloop)
as->modset = RSET_SCRATCH;
break;
-/* Ordered string compares evict all integer scratch registers. */
-case IR_LT: case IR_GE: case IR_LE: case IR_GT:
-if (irt_isstr(ir->t) && inloop)
-as->modset |= (RSET_SCRATCH & RSET_GPR);
-break;
/* Non-constant shift counts need to be in RID_ECX. */
case IR_BSHL: case IR_BSHR: case IR_BSAR: case IR_BROL: case IR_BROR:
if (!irref_isk(ir->op2) && !ra_hashint(IR(ir->op2)->r))
@@ -3200,6 +3293,10 @@ static void asm_setup_regsp(ASMState *as, Trace *T)
}
ir->prev = REGSP_INIT;
}
if ((as->evenspill & 1))
as->oddspill = as->evenspill++;
else
as->oddspill = 0;
}
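This is the spill bookkeeping that moved out of ra_setup() above: evenspill is first grown to cover the outgoing C call argument area, then an odd leftover slot is parked in oddspill so later 4 byte spills can fill the hole while 8 byte FP spills stay 8 byte aligned. A minimal standalone sketch of the parity logic, simplified from the code above:

#include <stdio.h>

int main(void)
{
  int evenspill = 5, oddspill;  /* e.g. five 4 byte slots reserved for args */
  if (evenspill & 1)
    oddspill = evenspill++;     /* Park slot 5; next even slot is 6. */
  else
    oddspill = 0;               /* No hole to fill. */
  printf("evenspill=%d oddspill=%d\n", evenspill, oddspill);
  return 0;
}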
/* -- Assembler core ------------------------------------------------------ */
@@ -3263,7 +3360,6 @@ void lj_asm_trace(jit_State *J, Trace *T)
as->fuseref = (as->flags & JIT_F_OPT_FUSE) ? as->loopref : FUSE_DISABLED;
/* Setup register allocation. */
-ra_setup(as);
asm_setup_regsp(as, T);
if (!as->loopref) {