x64: Various fixes for CET IBT.

Also add ELF notes. #1391
This commit is contained in:
Mike Pall
2025-10-28 00:27:15 +01:00
parent 25a61a1821
commit e34a78acf6
6 changed files with 101 additions and 38 deletions

View File

@@ -446,9 +446,13 @@ ifneq (,$(findstring LJ_ABI_PAUTH 1,$(TARGET_TESTARCH)))
DASM_AFLAGS+= -D PAUTH
TARGET_ARCH+= -DLJ_ABI_PAUTH=1
endif
ifneq (,$(findstring LJ_CET_BR 1,$(TARGET_TESTARCH)))
DASM_AFLAGS+= -D CET_BR
TARGET_ARCH+= -DLJ_CET_BR=1
ifneq (,$(findstring LJ_ABI_BRANCH_TRACK 1,$(TARGET_TESTARCH)))
DASM_AFLAGS+= -D BRANCH_TRACK
TARGET_ARCH+= -DLJ_ABI_BRANCH_TRACK=1
endif
ifneq (,$(findstring LJ_ABI_SHADOW_STACK 1,$(TARGET_TESTARCH)))
DASM_AFLAGS+= -D SHADOW_STACK
TARGET_ARCH+= -DLJ_ABI_SHADOW_STACK=1
endif
DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subst LJ_ARCH_VERSION ,LJ_ARCH_VERSION_,$(TARGET_TESTARCH))))
ifeq (Windows,$(TARGET_SYS))

View File

@@ -219,15 +219,27 @@
#error "macOS requires GC64 -- don't disable it"
#endif
#if (__CET__ & 1) && defined(LUAJIT_ENABLE_CET_BR)
#if !defined(LJ_ABI_BRANCH_TRACK) && (__CET__ & 1) && \
LJ_TARGET_GC64 && defined(LUAJIT_ENABLE_CET_BR)
/*
** Control-flow Enforcement Technology (CET) indirect branch tracking (IBT).
** This is not enabled by default because it causes a notable slowdown of
** the interpreter on all x64 CPUs, whether they have CET enabled or not.
** If your toolchain enables -fcf-protection=branch by default, you need
** to build with: make XCFLAGS=-DLUAJIT_ENABLE_CET_BR
** to build with: make amalg XCFLAGS=-DLUAJIT_ENABLE_CET_BR
*/
#define LJ_CET_BR 1
#define LJ_ABI_BRANCH_TRACK 1
#endif
#if !defined(LJ_ABI_SHADOW_STACK) && (__CET__ & 2)
/*
** Control-flow Enforcement Technology (CET) shadow stack (CET-SS).
** It has no code overhead and doesn't cause any slowdowns when unused.
** It can also be unconditionally enabled since all code already follows
** a strict CALL to RET correspondence for performance reasons (all modern
** CPUs use a (non-enforcing) shadow stack for return branch prediction).
*/
#define LJ_ABI_SHADOW_STACK 1
#endif
#elif LUAJIT_TARGET == LUAJIT_ARCH_ARM

View File

@@ -2586,8 +2586,8 @@ void lj_asm_trace(jit_State *J, GCtrace *T)
asm_head_side(as);
else
asm_head_root(as);
#if LJ_CET_BR
emit_endbr(as);
#if LJ_ABI_BRANCH_TRACK
emit_branch_track(as);
#endif
asm_phi_fixup(as);

View File

@@ -34,22 +34,29 @@
#elif LJ_TARGET_X86ORX64
#if LJ_ABI_BRANCH_TRACK
#define CALLBACK_MCODE_SLOTSZ 8
#else
#define CALLBACK_MCODE_SLOTSZ 4
#endif
#define CALLBACK_MCODE_NSLOT (128 / CALLBACK_MCODE_SLOTSZ)
#define CALLBACK_MCODE_HEAD (LJ_64 ? 8 : 0)
#define CALLBACK_MCODE_GROUP (-2+1+2+(LJ_GC64 ? 10 : 5)+(LJ_64 ? 6 : 5))
#define CALLBACK_SLOT2OFS(slot) \
(CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/32) + 4*(slot))
(CALLBACK_MCODE_HEAD + CALLBACK_MCODE_GROUP*((slot)/CALLBACK_MCODE_NSLOT) + CALLBACK_MCODE_SLOTSZ*(slot))
static MSize CALLBACK_OFS2SLOT(MSize ofs)
{
MSize group;
ofs -= CALLBACK_MCODE_HEAD;
group = ofs / (32*4 + CALLBACK_MCODE_GROUP);
return (ofs % (32*4 + CALLBACK_MCODE_GROUP))/4 + group*32;
group = ofs / (128 + CALLBACK_MCODE_GROUP);
return (ofs % (128 + CALLBACK_MCODE_GROUP))/CALLBACK_MCODE_SLOTSZ + group*CALLBACK_MCODE_NSLOT;
}
#define CALLBACK_MAX_SLOT \
(((CALLBACK_MCODE_SIZE-CALLBACK_MCODE_HEAD)/(CALLBACK_MCODE_GROUP+4*32))*32)
(((CALLBACK_MCODE_SIZE-CALLBACK_MCODE_HEAD)/(CALLBACK_MCODE_GROUP+128))*CALLBACK_MCODE_NSLOT)
#elif LJ_TARGET_ARM
@@ -118,9 +125,13 @@ static void *callback_mcode_init(global_State *g, uint8_t *page)
*(void **)p = target; p += 8;
#endif
for (slot = 0; slot < CALLBACK_MAX_SLOT; slot++) {
#if LJ_ABI_BRANCH_TRACK
*(uint32_t *)p = XI_ENDBR64; p += 4;
#endif
/* mov al, slot; jmp group */
*p++ = XI_MOVrib | RID_EAX; *p++ = (uint8_t)slot;
if ((slot & 31) == 31 || slot == CALLBACK_MAX_SLOT-1) {
if ((slot & (CALLBACK_MCODE_NSLOT-1)) == (CALLBACK_MCODE_NSLOT-1) ||
slot == CALLBACK_MAX_SLOT-1) {
/* push ebp/rbp; mov ah, slot>>8; mov ebp, &g. */
*p++ = XI_PUSH + RID_EBP;
*p++ = XI_MOVrib | (RID_EAX+4); *p++ = (uint8_t)(slot >> 8);
@@ -140,7 +151,8 @@ static void *callback_mcode_init(global_State *g, uint8_t *page)
*p++ = XI_JMP; *(int32_t *)p = target-(p+4); p += 4;
#endif
} else {
*p++ = XI_JMPs; *p++ = (uint8_t)((2+2)*(31-(slot&31)) - 2);
*p++ = XI_JMPs;
*p++ = (uint8_t)(CALLBACK_MCODE_SLOTSZ*(CALLBACK_MCODE_NSLOT-1-(slot&(CALLBACK_MCODE_NSLOT-1))) - 2);
}
}
return p;

View File

@@ -70,8 +70,8 @@ static LJ_AINLINE MCode *emit_op(x86Op xo, Reg rr, Reg rb, Reg rx,
return p;
}
#if LJ_CET_BR
static void emit_endbr(ASMState *as)
#if LJ_ABI_BRANCH_TRACK
static void emit_branch_track(ASMState *as)
{
emit_u32(as, XI_ENDBR64);
}

View File

@@ -191,7 +191,7 @@
|
|//-- Control-Flow Enforcement Technique (CET) ---------------------------
|
|.if CET_BR
|.if BRANCH_TRACK
|.macro endbr; endbr64; .endmacro
|.else
|.macro endbr; .endmacro
@@ -200,13 +200,13 @@
|//-----------------------------------------------------------------------
|
|// Instruction headers.
|.macro ins_A; endbr; .endmacro
|.macro ins_AD; endbr; .endmacro
|.macro ins_AJ; endbr; .endmacro
|.macro ins_ABC; endbr; movzx RBd, RCH; movzx RCd, RCL; .endmacro
|.macro ins_AB_; endbr; movzx RBd, RCH; .endmacro
|.macro ins_A_C; endbr; movzx RCd, RCL; .endmacro
|.macro ins_AND; endbr; not RD; .endmacro
|.macro ins_A; .endmacro
|.macro ins_AD; .endmacro
|.macro ins_AJ; .endmacro
|.macro ins_ABC; movzx RBd, RCH; movzx RCd, RCL; .endmacro
|.macro ins_AB_; movzx RBd, RCH; .endmacro
|.macro ins_A_C; movzx RCd, RCL; .endmacro
|.macro ins_AND; not RD; .endmacro
|
|// Instruction decode+dispatch. Carefully tuned (nope, lodsd is not faster).
|.macro ins_NEXT
@@ -487,13 +487,12 @@ static void build_subroutines(BuildCtx *ctx)
| jmp <3
|
|->vm_unwind_yield:
| endbr
| mov al, LUA_YIELD
| jmp ->vm_unwind_c_eh
|
|->vm_unwind_c: // Unwind C stack, return from vm_pcall.
| endbr
| // (void *cframe, int errcode)
| endbr
| mov eax, CARG2d // Error return status for vm_pcall.
| mov rsp, CARG1
|->vm_unwind_c_eh: // Landing pad for external unwinder.
@@ -513,8 +512,8 @@ static void build_subroutines(BuildCtx *ctx)
|.endif
|
|->vm_unwind_ff: // Unwind C stack, return from ff pcall.
| endbr
| // (void *cframe)
| endbr
| and CARG1, CFRAME_RAWMASK
| mov rsp, CARG1
|->vm_unwind_ff_eh: // Landing pad for external unwinder.
@@ -689,7 +688,6 @@ static void build_subroutines(BuildCtx *ctx)
|//-- Continuation dispatch ----------------------------------------------
|
|->cont_dispatch:
| endbr
| // BASE = meta base, RA = resultofs, RD = nresults+1 (also in MULTRES)
| add RA, BASE
| and PC, -8
@@ -2338,8 +2336,8 @@ static void build_subroutines(BuildCtx *ctx)
|
|->cont_stitch: // Trace stitching.
|.if JIT
| endbr
| // BASE = base, RC = result, RB = mbase
| endbr
| mov TRACE:ITYPE, [RB-40] // Save previous trace.
| cleartp TRACE:ITYPE
| mov TMPRd, MULTRES
@@ -2460,8 +2458,8 @@ static void build_subroutines(BuildCtx *ctx)
| jmp >1
|.endif
|->vm_exit_interp:
| endbr
| // RD = MULTRES or negated error code, BASE, PC and DISPATCH set.
| endbr
|.if JIT
| // Restore additional callee-save registers only used in compiled code.
|.if X64WIN
@@ -2849,6 +2847,26 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|=>defop:
switch (op) {
#if !LJ_HASJIT
case BC_FORL:
case BC_JFORI:
case BC_JFORL:
case BC_ITERL:
case BC_JITERL:
case BC_LOOP:
case BC_JLOOP:
case BC_FUNCF:
case BC_JFUNCF:
case BC_JFUNCV:
#endif
case BC_FUNCV: /* NYI: compiled vararg functions. */
break; /* Avoid redundant endbr instructions. */
default:
| endbr
break;
}
switch (op) {
/* -- Comparison ops ---------------------------------------------------- */
@@ -4119,7 +4137,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
case BC_ITERN:
|.if JIT
| endbr
| hotloop RBd
|.endif
|->vm_IITERN:
@@ -4299,7 +4316,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| jnz >7 // Not returning to a fixarg Lua func?
switch (op) {
case BC_RET:
| endbr
|->BC_RET_Z:
| mov KBASE, BASE // Use KBASE for result move.
| sub RDd, 1
@@ -4318,12 +4334,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| ja >6
break;
case BC_RET1:
| endbr
| mov RB, [BASE+RA]
| mov [BASE-16], RB
/* fallthrough */
case BC_RET0:
| endbr
|5:
| cmp PC_RB, RDL // More results expected?
| ja >6
@@ -4370,7 +4384,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
case BC_FORL:
|.if JIT
| endbr
| hotloop RBd
|.endif
| // Fall through. Assumes BC_IFORL follows and ins_AJ is a no-op.
@@ -4522,7 +4535,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
case BC_ITERL:
|.if JIT
| endbr
| hotloop RBd
|.endif
| // Fall through. Assumes BC_IITERL follows and ins_AJ is a no-op.
@@ -4616,7 +4628,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
case BC_FUNCF:
|.if JIT
| endbr
| hotcall RBd
|.endif
case BC_FUNCV: /* NYI: compiled vararg functions. */
@@ -4886,6 +4897,30 @@ static void emit_asm_debug(BuildCtx *ctx)
"\t.align 8\n"
".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
#endif
#endif
#if LJ_TARGET_LINUX && (LJ_ABI_BRANCH_TRACK || LJ_ABI_SHADOW_STACK)
fprintf(ctx->fp,
"\t.section .note.gnu.property,\"a\"\n"
"\t.align 8\n"
"\t.long 4\n"
"\t.long 16\n"
"\t.long 5\n"
"\t.long 0x00554e47\n"
"\t.long 0xc0000002\n"
"\t.long 4\n"
"\t.long %d\n"
"\t.long 0\n",
#if LJ_ABI_BRANCH_TRACK
1|
#else
0|
#endif
#if LJ_ABI_SHADOW_STACK
2
#else
0
#endif
);
#endif
break;
#if !LJ_NO_UNWIND