Skip to content

Commit 2fe99eb

Browse files
Alexei Starovoitov authored and anakryiko committed
bpf: Add x86-64 JIT support for PROBE_MEM32 pseudo instructions.
Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW] instructions. They are similar to PROBE_MEM instructions with the following differences: - PROBE_MEM has to check that the address is in the kernel range with src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE check - PROBE_MEM doesn't support store - PROBE_MEM32 relies on the verifier to clear upper 32-bit in the register - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in %r12 in the prologue) Due to bpf_arena constructions such %r12 + %reg + off16 access is guaranteed to be within arena virtual range, so no address check at run-time. - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When LDX faults the destination register is zeroed. Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> Link: https://lore.kernel.org/bpf/20240308010812.89848-4-alexei.starovoitov@gmail.com
1 parent 667a86a commit 2fe99eb

3 files changed

Lines changed: 194 additions & 1 deletion

File tree

arch/x86/net/bpf_jit_comp.c

Lines changed: 190 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
113113
/* Pick a register outside of BPF range for JIT internal work */
114114
#define AUX_REG (MAX_BPF_JIT_REG + 1)
115115
#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
116+
#define X86_REG_R12 (MAX_BPF_JIT_REG + 3)
116117

117118
/*
118119
* The following table maps BPF registers to x86-64 registers.
@@ -139,6 +140,7 @@ static const int reg2hex[] = {
139140
[BPF_REG_AX] = 2, /* R10 temp register */
140141
[AUX_REG] = 3, /* R11 temp register */
141142
[X86_REG_R9] = 1, /* R9 register, 6th function argument */
143+
[X86_REG_R12] = 4, /* R12 callee saved */
142144
};
143145

144146
static const int reg2pt_regs[] = {
@@ -167,6 +169,7 @@ static bool is_ereg(u32 reg)
167169
BIT(BPF_REG_8) |
168170
BIT(BPF_REG_9) |
169171
BIT(X86_REG_R9) |
172+
BIT(X86_REG_R12) |
170173
BIT(BPF_REG_AX));
171174
}
172175

@@ -205,6 +208,17 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
205208
return byte;
206209
}
207210

211+
/*
 * Fold REX.B/X/R extension bits into 'byte' for a reg + [base + index]
 * addressing form: r1 is the base (REX.B), 'index' the SIB index (REX.X),
 * r2 the reg field operand (REX.R).
 */
static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index)
{
	u8 rex = byte;

	rex |= is_ereg(r1) ? 1 : 0;	/* REX.B: extend ModRM r/m / SIB base */
	rex |= is_ereg(index) ? 2 : 0;	/* REX.X: extend SIB index */
	rex |= is_ereg(r2) ? 4 : 0;	/* REX.R: extend ModRM reg */
	return rex;
}
221+
208222
/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
209223
static u8 add_1reg(u8 byte, u32 dst_reg)
210224
{
@@ -645,6 +659,8 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
645659
pop_r12(&prog);
646660
} else {
647661
pop_callee_regs(&prog, callee_regs_used);
662+
if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
663+
pop_r12(&prog);
648664
}
649665

650666
EMIT1(0x58); /* pop rax */
@@ -704,6 +720,8 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
704720
pop_r12(&prog);
705721
} else {
706722
pop_callee_regs(&prog, callee_regs_used);
723+
if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
724+
pop_r12(&prog);
707725
}
708726

709727
EMIT1(0x58); /* pop rax */
@@ -887,6 +905,18 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
887905
*pprog = prog;
888906
}
889907

908+
/*
 * Emit the ModRM + SIB + displacement suffix for a [ptr_reg + index_reg + off]
 * memory operand.  ModRM r/m=100 selects a following SIB byte; 0x44 is
 * mod=01 (8-bit displacement), 0x84 is mod=10 (32-bit displacement).
 * The SIB byte encodes scale=1 with index_reg/ptr_reg as index/base.
 */
static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off)
{
	u8 *prog = *pprog;

	if (is_imm8(off)) {
		/* mod=01, r/m=100: SIB follows, disp8 */
		EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
	} else {
		/* mod=10, r/m=100: SIB follows, disp32 */
		EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
	}
	*pprog = prog;
}
919+
890920
/*
891921
* Emit a REX byte if it will be necessary to address these registers
892922
*/
@@ -968,6 +998,37 @@ static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
968998
*pprog = prog;
969999
}
9701000

1001+
/* LDX: dst_reg = *(size *)(src_reg + index_reg + off) */
static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
	u8 *prog = *pprog;

	switch (size) {
	case BPF_B:
		/* movzx rax, byte ptr [rax + r12 + off] */
		EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB6);
		break;
	case BPF_H:
		/* movzx rax, word ptr [rax + r12 + off] */
		EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB7);
		break;
	case BPF_W:
		/* mov eax, dword ptr [rax + r12 + off] */
		EMIT2(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x8B);
		break;
	case BPF_DW:
		/* mov rax, qword ptr [rax + r12 + off] */
		EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x8B);
		break;
	}
	/* ModRM/SIB/displacement for [src_reg + index_reg + off] */
	emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off);
	*pprog = prog;
}
1026+
1027+
/*
 * LDX with R12 as the implicit index register: R12 holds the arena
 * kern_vm_start base loaded in the prologue, so this emits
 * dst_reg = *(size *)(src_reg + r12 + off).
 */
static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
	emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
}
1031+
9711032
/* STX: *(u8*)(dst_reg + off) = src_reg */
9721033
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
9731034
{
@@ -1002,6 +1063,71 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
10021063
*pprog = prog;
10031064
}
10041065

1066+
/* STX: *(u8*)(dst_reg + index_reg + off) = src_reg */
1067+
/* STX: *(size *)(dst_reg + index_reg + off) = src_reg */
static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
	u8 *prog = *pprog;

	switch (size) {
	case BPF_B:
		/* mov byte ptr [rax + r12 + off], al */
		EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x88);
		break;
	case BPF_H:
		/* mov word ptr [rax + r12 + off], ax (0x66 operand-size prefix) */
		EMIT3(0x66, add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
		break;
	case BPF_W:
		/* mov dword ptr [rax + r12 + off], eax */
		EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
		break;
	case BPF_DW:
		/* mov qword ptr [rax + r12 + off], rax */
		EMIT2(add_3mod(0x48, dst_reg, src_reg, index_reg), 0x89);
		break;
	}
	/* ModRM/SIB/displacement for [dst_reg + index_reg + off] */
	emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
	*pprog = prog;
}
1092+
1093+
/*
 * STX with R12 (arena kern_vm_start base) as the implicit index register:
 * *(size *)(dst_reg + r12 + off) = src_reg.
 */
static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
	emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
}
1097+
1098+
/* ST: *(u8*)(dst_reg + index_reg + off) = imm32 */
1099+
static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm)
1100+
{
1101+
u8 *prog = *pprog;
1102+
1103+
switch (size) {
1104+
case BPF_B:
1105+
/* mov byte ptr [rax + r12 + off], imm8 */
1106+
EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC6);
1107+
break;
1108+
case BPF_H:
1109+
/* mov word ptr [rax + r12 + off], imm16 */
1110+
EMIT3(0x66, add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
1111+
break;
1112+
case BPF_W:
1113+
/* mov dword ptr [rax + r12 + 1], imm32 */
1114+
EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
1115+
break;
1116+
case BPF_DW:
1117+
/* mov qword ptr [rax + r12 + 1], imm32 */
1118+
EMIT2(add_3mod(0x48, dst_reg, 0, index_reg), 0xC7);
1119+
break;
1120+
}
1121+
emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off);
1122+
EMIT(imm, bpf_size_to_x86_bytes(size));
1123+
*pprog = prog;
1124+
}
1125+
1126+
/*
 * ST with R12 (arena kern_vm_start base) as the implicit index register:
 * *(size *)(dst_reg + r12 + off) = imm.
 */
static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
{
	emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm);
}
1130+
10051131
static int emit_atomic(u8 **pprog, u8 atomic_op,
10061132
u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
10071133
{
@@ -1043,12 +1169,15 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
10431169
return 0;
10441170
}
10451171

1172+
/*
 * Sentinel pt_regs "offset" in the extable fixup word meaning "do not
 * zero any register" — used for faulting arena stores, which are nops.
 * NOTE(review): presumably 1 is safe as a sentinel because real pt_regs
 * member offsets are word-aligned — confirm against reg2pt_regs[].
 */
#define DONT_CLEAR 1

bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
{
	/* fixup layout: bits 8+ = pt_regs byte offset of dest reg (or DONT_CLEAR),
	 * low 8 bits = length of the faulting insn to skip over.
	 */
	u32 reg = x->fixup >> 8;

	/* jump over faulting load and clear dest register */
	if (reg != DONT_CLEAR)
		*(unsigned long *)((void *)regs + reg) = 0;
	regs->ip += x->fixup & 0xff;
	return true;
}
@@ -1147,11 +1276,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
11471276
bool tail_call_seen = false;
11481277
bool seen_exit = false;
11491278
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
1279+
u64 arena_vm_start;
11501280
int i, excnt = 0;
11511281
int ilen, proglen = 0;
11521282
u8 *prog = temp;
11531283
int err;
11541284

1285+
arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
1286+
11551287
detect_reg_usage(insn, insn_cnt, callee_regs_used,
11561288
&tail_call_seen);
11571289

@@ -1172,8 +1304,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
11721304
push_r12(&prog);
11731305
push_callee_regs(&prog, all_callee_regs_used);
11741306
} else {
1307+
if (arena_vm_start)
1308+
push_r12(&prog);
11751309
push_callee_regs(&prog, callee_regs_used);
11761310
}
1311+
if (arena_vm_start)
1312+
emit_mov_imm64(&prog, X86_REG_R12,
1313+
arena_vm_start >> 32, (u32) arena_vm_start);
11771314

11781315
ilen = prog - temp;
11791316
if (rw_image)
@@ -1564,6 +1701,56 @@ st: if (is_imm8(insn->off))
15641701
emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
15651702
break;
15661703

1704+
case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
1705+
case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
1706+
case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
1707+
case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
1708+
start_of_ldx = prog;
1709+
emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm);
1710+
goto populate_extable;
1711+
1712+
/* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */
1713+
case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
1714+
case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
1715+
case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
1716+
case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
1717+
case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
1718+
case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
1719+
case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
1720+
case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
1721+
start_of_ldx = prog;
1722+
if (BPF_CLASS(insn->code) == BPF_LDX)
1723+
emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
1724+
else
1725+
emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
1726+
populate_extable:
1727+
{
1728+
struct exception_table_entry *ex;
1729+
u8 *_insn = image + proglen + (start_of_ldx - temp);
1730+
s64 delta;
1731+
1732+
if (!bpf_prog->aux->extable)
1733+
break;
1734+
1735+
if (excnt >= bpf_prog->aux->num_exentries) {
1736+
pr_err("mem32 extable bug\n");
1737+
return -EFAULT;
1738+
}
1739+
ex = &bpf_prog->aux->extable[excnt++];
1740+
1741+
delta = _insn - (u8 *)&ex->insn;
1742+
/* switch ex to rw buffer for writes */
1743+
ex = (void *)rw_image + ((void *)ex - (void *)image);
1744+
1745+
ex->insn = delta;
1746+
1747+
ex->data = EX_TYPE_BPF;
1748+
1749+
ex->fixup = (prog - start_of_ldx) |
1750+
((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8);
1751+
}
1752+
break;
1753+
15671754
/* LDX: dst_reg = *(u8*)(src_reg + off) */
15681755
case BPF_LDX | BPF_MEM | BPF_B:
15691756
case BPF_LDX | BPF_PROBE_MEM | BPF_B:
@@ -2036,6 +2223,8 @@ st: if (is_imm8(insn->off))
20362223
pop_r12(&prog);
20372224
} else {
20382225
pop_callee_regs(&prog, callee_regs_used);
2226+
if (arena_vm_start)
2227+
pop_r12(&prog);
20392228
}
20402229
EMIT1(0xC9); /* leave */
20412230
emit_return(&prog, image + addrs[i - 1] + (prog - temp));

include/linux/bpf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1458,6 +1458,7 @@ struct bpf_prog_aux {
14581458
bool xdp_has_frags;
14591459
bool exception_cb;
14601460
bool exception_boundary;
1461+
struct bpf_arena *arena;
14611462
/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
14621463
const struct btf_type *attach_func_proto;
14631464
/* function name for valid attach_btf_id */

include/linux/filter.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ struct ctl_table_header;
7272
/* unused opcode to mark special ldsx instruction. Same as BPF_IND */
7373
#define BPF_PROBE_MEMSX 0x40
7474

75+
/* unused opcode to mark special load instruction. Same as BPF_MSH */
76+
#define BPF_PROBE_MEM32 0xa0
77+
7578
/* unused opcode to mark call to interpreter with arguments */
7679
#define BPF_CALL_ARGS 0xe0
7780

0 commit comments

Comments
 (0)