
Commit a76ab57

Yonghong Song authored and Alexei Starovoitov committed
bpf: Find eligible subprogs for private stack support
bpf: Find eligible subprogs for private stack support

Private stack will be allocated with the percpu allocator at jit time. To avoid complexity at runtime, only one copy of the private stack is available per cpu per prog, so a runtime recursion check is necessary to avoid stack corruption. Currently, private stack only supports kprobe/perf_event/tp/raw_tp, which have recursion checks in the kernel, plus prog types that use the bpf trampoline recursion check. Among trampoline-related prog types, currently only tracing progs have recursion checking.

To avoid complexity, all async_cb subprogs use the normal kernel stack, including those subprogs used by both the main prog subtree and an async_cb subtree. Any prog having a tail call also uses the kernel stack.

To avoid a jit penalty with private stack support, a subprog stack size threshold is set such that private stack is supported only if the stack size is no less than the threshold. The current threshold is 64 bytes. This avoids the jit penalty when stack usage is small.

A useless 'continue' is also removed from a loop in check_max_stack_depth().

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112163907.2223839-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
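For context, a minimal sketch of what the 64-byte threshold means in practice, using a hypothetical kprobe prog (the function names, buffer sizes, and attach point are illustrative, not part of this patch): a subprog whose stack depth stays under BPF_PRIV_STACK_MIN_SIZE keeps the normal kernel stack, while one at or above it becomes eligible for private stack.

/* Illustrative sketch only; names and the attach point are hypothetical. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

static __noinline int small_stack(void *ctx)
{
	char buf[32];	/* < 64 bytes of stack: stays on the normal kernel stack */

	__builtin_memset(buf, 1, sizeof(buf));
	return buf[0];
}

static __noinline int big_stack(void *ctx)
{
	char buf[128];	/* >= 64 bytes of stack: eligible for private stack */

	__builtin_memset(buf, 1, sizeof(buf));
	return buf[0];
}

SEC("kprobe/do_nanosleep")
int prog(struct pt_regs *ctx)
{
	/* each static noinline function compiles to a bpf-to-bpf subprog,
	 * so the two callees get independent eligibility decisions
	 */
	return small_stack(ctx) + big_stack(ctx);
}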
1 parent c748a25 commit a76ab57

Showing 4 changed files with 99 additions and 10 deletions.


include/linux/bpf_verifier.h

Lines changed: 7 additions & 0 deletions
@@ -633,6 +633,12 @@ struct bpf_subprog_arg_info {
 	};
 };
 
+enum priv_stack_mode {
+	PRIV_STACK_UNKNOWN,
+	NO_PRIV_STACK,
+	PRIV_STACK_ADAPTIVE,
+};
+
 struct bpf_subprog_info {
 	/* 'start' has to be the first field otherwise find_subprog() won't work */
 	u32 start; /* insn idx of function entry point */
@@ -653,6 +659,7 @@ struct bpf_subprog_info {
 	/* true if bpf_fastcall stack region is used by functions that can't be inlined */
 	bool keep_fastcall_stack: 1;
 
+	enum priv_stack_mode priv_stack_mode;
 	u8 arg_cnt;
 	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
 };

include/linux/filter.h

Lines changed: 1 addition & 0 deletions
@@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
 bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
+bool bpf_jit_supports_private_stack(void);
 u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);

kernel/bpf/core.c

Lines changed: 5 additions & 0 deletions
@@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_private_stack(void)
+{
+	return false;
+}
+
 void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 }
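The __weak default keeps private stack disabled everywhere; an arch JIT opts in by providing a strong definition. A minimal sketch of such an override follows. It is hypothetical: this commit adds only the weak default, not any arch implementation.

/* Hypothetical strong definition, e.g. in arch/<arch>/net/bpf_jit_comp.c,
 * once that JIT actually emits private-stack frames; not part of this patch.
 */
bool bpf_jit_supports_private_stack(void)
{
	return true;
}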

kernel/bpf/verifier.c

Lines changed: 86 additions & 10 deletions
@@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
 
 #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
 
+#define BPF_PRIV_STACK_MIN_SIZE		64
+
 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
 static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
 static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -6090,6 +6092,34 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 			       strict);
 }
 
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+	if (!bpf_jit_supports_private_stack())
+		return NO_PRIV_STACK;
+
+	/* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+	 * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+	 * explicitly.
+	 */
+	switch (prog->type) {
+	case BPF_PROG_TYPE_KPROBE:
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		return PRIV_STACK_ADAPTIVE;
+	case BPF_PROG_TYPE_TRACING:
+	case BPF_PROG_TYPE_LSM:
+	case BPF_PROG_TYPE_STRUCT_OPS:
+		if (bpf_prog_check_recur(prog))
+			return PRIV_STACK_ADAPTIVE;
+		fallthrough;
+	default:
+		break;
+	}
+
+	return NO_PRIV_STACK;
+}
+
 static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
 {
 	if (env->prog->jit_requested)
@@ -6107,17 +6137,20 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
  * Since recursion is prevented by check_cfg() this algorithm
  * only needs a local stack of MAX_CALL_FRAMES to remember callsites
  */
-static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+					 bool priv_stack_supported)
 {
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int depth = 0, frame = 0, i, subprog_end;
+	int depth = 0, frame = 0, i, subprog_end, subprog_depth;
 	bool tail_call_reachable = false;
 	int ret_insn[MAX_CALL_FRAMES];
 	int ret_prog[MAX_CALL_FRAMES];
 	int j;
 
 	i = subprog[idx].start;
+	if (!priv_stack_supported)
+		subprog[idx].priv_stack_mode = NO_PRIV_STACK;
 process_func:
 	/* protect against potential stack overflow that might happen when
 	 * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -6144,11 +6177,31 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 			depth);
 		return -EACCES;
 	}
-	depth += round_up_stack_depth(env, subprog[idx].stack_depth);
-	if (depth > MAX_BPF_STACK) {
-		verbose(env, "combined stack size of %d calls is %d. Too large\n",
-			frame + 1, depth);
-		return -EACCES;
+
+	subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+	if (priv_stack_supported) {
+		/* Request private stack support only if the subprog stack
+		 * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+		 * avoid jit penalty if the stack usage is small.
+		 */
+		if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+		    subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+			subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+	}
+
+	if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+		if (subprog_depth > MAX_BPF_STACK) {
+			verbose(env, "stack size of subprog %d is %d. Too large\n",
+				idx, subprog_depth);
+			return -EACCES;
+		}
+	} else {
+		depth += subprog_depth;
+		if (depth > MAX_BPF_STACK) {
+			verbose(env, "combined stack size of %d calls is %d. Too large\n",
+				frame + 1, depth);
+			return -EACCES;
+		}
 	}
 continue_func:
 	subprog_end = subprog[idx + 1].start;
@@ -6205,6 +6258,8 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 		}
 		i = next_insn;
 		idx = sidx;
+		if (!priv_stack_supported)
+			subprog[idx].priv_stack_mode = NO_PRIV_STACK;
 
 		if (subprog[idx].has_tail_call)
 			tail_call_reachable = true;
@@ -6238,7 +6293,8 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 	 */
 	if (frame == 0)
 		return 0;
-	depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
+	if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+		depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
 	frame--;
 	i = ret_insn[frame];
 	idx = ret_prog[frame];
@@ -6247,16 +6303,36 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 
 static int check_max_stack_depth(struct bpf_verifier_env *env)
 {
+	enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
 	struct bpf_subprog_info *si = env->subprog_info;
+	bool priv_stack_supported;
 	int ret;
 
 	for (int i = 0; i < env->subprog_cnt; i++) {
+		if (si[i].has_tail_call) {
+			priv_stack_mode = NO_PRIV_STACK;
+			break;
+		}
+	}
+
+	if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+		priv_stack_mode = bpf_enable_priv_stack(env->prog);
+
+	/* All async_cb subprogs use normal kernel stack. If a particular
+	 * subprog appears in both main prog and async_cb subtree, that
+	 * subprog will use normal kernel stack to avoid potential nesting.
+	 * The reverse subprog traversal ensures when main prog subtree is
	 * checked, the subprogs appearing in async_cb subtrees are already
+	 * marked as using normal kernel stack, so stack size checking can
+	 * be done properly.
+	 */
+	for (int i = env->subprog_cnt - 1; i >= 0; i--) {
 		if (!i || si[i].is_async_cb) {
-			ret = check_max_stack_depth_subprog(env, i);
+			priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+			ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
 			if (ret < 0)
 				return ret;
 		}
-		continue;
 	}
 	return 0;
 }
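To illustrate the tail-call exclusion above: if any subprog has a tail call, the first loop in check_max_stack_depth() forces NO_PRIV_STACK for the entire prog, regardless of stack depth. A minimal sketch of a prog that would be excluded this way (the map name and attach point are hypothetical, not from this patch):

/* Illustrative sketch only; names are hypothetical. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(max_entries, 1);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

SEC("kprobe/do_nanosleep")
int tail_caller(struct pt_regs *ctx)
{
	char buf[128];	/* >= 64 bytes of stack, but the tail call below
			 * disables private stack for every subprog here
			 */

	__builtin_memset(buf, 1, sizeof(buf));
	bpf_tail_call(ctx, &jmp_table, 0);
	return buf[0];
}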
