eBPF verifier source code analysis


bpf_check

In bpf_check, the following checks are performed in order:

check_subprogs

First, look at check_subprogs:

static int check_subprogs(struct bpf_verifier_env *env)
{
int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;

/* Add entry function. */
ret = add_subprog(env, 0);
if (ret < 0)
return ret;

/* determine subprog starts. The end is one before the next starts */
for (i = 0; i < insn_cnt; i++) {
if (insn[i].code != (BPF_JMP | BPF_CALL))
continue;
if (insn[i].src_reg != BPF_PSEUDO_CALL)
continue;
if (!env->bpf_capable) {
verbose(env,
"function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
ret = add_subprog(env, i + insn[i].imm + 1); // if the same function is called from several sites, it would be analyzed once per call site, which is why state pruning is worth doing for efficiency
if (ret < 0)
return ret;
}

/* Add a fake 'exit' subprog which could simplify subprog iteration
* logic. 'subprog_cnt' should not be increased.
*/
subprog[env->subprog_cnt].start = insn_cnt;

if (env->log.level & BPF_LOG_LEVEL2)
for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);

/* now check that all jumps are within the same subprog */
subprog_start = subprog[cur_subprog].start;
subprog_end = subprog[cur_subprog + 1].start;
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;

if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
off = i + insn[i].off + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
}
next:
if (i == subprog_end - 1) {
/* to avoid fall-through from one subprog into another
* the last insn of the subprog should be either exit
* or unconditional jump back
*/
if (code != (BPF_JMP | BPF_EXIT) &&
code != (BPF_JMP | BPF_JA)) {
verbose(env, "last insn is not an exit or jmp\n");
return -EINVAL;
}
subprog_start = subprog_end;
cur_subprog++;
if (cur_subprog < env->subprog_cnt)
subprog_end = subprog[cur_subprog + 1].start;
}
}
return 0;
}

The first loop scans the whole program instruction by instruction; every BPF_JMP|BPF_CALL whose src_reg is BPF_PSEUDO_CALL triggers add_subprog for its target (note that entry 0 of the main program was added up front). In effect the program is partitioned into subprogs (per-function units) at these BPF-to-BPF call targets.

Then, taking each subprog as the unit, the second loop walks the instructions again: anything that is not a jump-class instruction (and also BPF_EXIT/BPF_CALL) is skipped and the next instruction is examined;

for the remaining jumps, the target offset is checked against the bounds of the current subprog;

finally, at the last instruction of each subprog, the code verifies that it is an exit or an unconditional jump, so that one subprog cannot fall through into the next.
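To make the boundary computation concrete, here is a tiny user-space sketch (not kernel code; the toy_insn encoding and all names are invented for illustration) of how the list of subprog starts is built: entry 0 first, then i + insn[i].imm + 1 for every pseudo-call, kept sorted so that consecutive starts delimit the subprogs.

/* Toy illustration only: a simplified, user-space model of how
 * check_subprogs()/add_subprog() collect subprog start offsets.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_insn {
    int is_pseudo_call;  /* stands in for BPF_JMP|BPF_CALL + BPF_PSEUDO_CALL */
    int imm;             /* relative call offset, as in insn[i].imm */
};

static int cmp_int(const void *a, const void *b)
{
    return *(const int *)a - *(const int *)b;
}

int main(void)
{
    struct toy_insn prog[] = {
        {0, 0},  /* 0: r1 = ...                          */
        {1, 1},  /* 1: call pc+1  -> subprog at insn 3   */
        {0, 0},  /* 2: exit       (last insn of func#0)  */
        {0, 0},  /* 3: r0 = 0     (first insn of func#1) */
        {0, 0},  /* 4: exit                              */
    };
    int insn_cnt = sizeof(prog) / sizeof(prog[0]);
    int starts[16], cnt = 0, i;

    starts[cnt++] = 0;                          /* entry function */
    for (i = 0; i < insn_cnt; i++)
        if (prog[i].is_pseudo_call)
            starts[cnt++] = i + prog[i].imm + 1;
    qsort(starts, cnt, sizeof(int), cmp_int);   /* add_subprog keeps them sorted */

    for (i = 0; i < cnt; i++)                   /* same shape as the BPF_LOG_LEVEL2 output */
        printf("func#%d @%d\n", i, starts[i]);  /* prints: func#0 @0, func#1 @3 */
    return 0;
}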

check_cfg

Starting from the first instruction, check_cfg maintains an insn_stack for an iterative DFS and an insn_state array recording the state of every instruction. The DFS repeatedly takes the element on top of the stack and, for each kind of jump instruction, pushes the successor(s) via push_insn; successors of calls and jumps are marked via init_explored_state as places where pruning may later be considered.

/* non-recursive depth-first-search to detect loops in BPF program
* loop == back-edge in directed graph
*/
static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
int ret = 0;
int i, t;

insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;

insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_stack) {
kvfree(insn_state);
return -ENOMEM;
}

insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;

peek_stack:
if (env->cfg.cur_stack == 0) // stack empty: go check that every instruction has been explored
goto check_state;
t = insn_stack[env->cfg.cur_stack - 1]; // take the element on top of the stack

if (BPF_CLASS(insns[t].code) == BPF_JMP ||
BPF_CLASS(insns[t].code) == BPF_JMP32) {
u8 opcode = BPF_OP(insns[t].code);

if (opcode == BPF_EXIT) {
goto mark_explored; // mark the current instruction as EXPLORED
} else if (opcode == BPF_CALL) {
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
if (t + 1 < insn_cnt) // ret == 0: the fall-through target was already visited; mark the insn after the call as a candidate pruning point
init_explored_state(env, t + 1);
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
init_explored_state(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
} else if (opcode == BPF_JA) {
if (BPF_SRC(insns[t].code) != BPF_K) {
ret = -EINVAL;
goto err_free;
}
/* unconditional jump with single edge */
ret = push_insn(t, t + insns[t].off + 1,
FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
/* unconditional jmp is not a good pruning point,
* but it's marked, since backtracking needs
* to record jmp history in is_state_visited().
*/
init_explored_state(env, t + insns[t].off + 1);
/* tell verifier to check for equivalent states
* after every call and jump
*/
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);
} else {
/* conditional jump with two edges */
init_explored_state(env, t); // why is this called unconditionally here?
ret = push_insn(t, t + 1, FALLTHROUGH, env, true); // fall-through edge (condition false)
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;

ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); // branch-taken edge (condition true)
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
} else {
/* all other non-branch instructions with single
* fall-through edge
*/
ret = push_insn(t, t + 1, FALLTHROUGH, env, false); // non-jump instruction: just push its fall-through successor
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}

mark_explored:
insn_state[t] = EXPLORED;
if (env->cfg.cur_stack-- <= 0) {
verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
goto peek_stack;

check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
}
ret = 0; /* cfg looks good */

err_free:
kvfree(insn_state);
kvfree(insn_stack);
env->cfg.insn_state = env->cfg.insn_stack = NULL;
return ret;
}

push_insn returns 1 when a new instruction has been pushed (the DFS continues from it), a negative error for an out-of-range jump or a forbidden back edge, and 0 when the edge leads to an instruction that has already been discovered or explored; in the BPF_CALL/BPF_JA cases the caller then calls init_explored_state to mark the successor as a candidate pruning point.
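For reference, push_insn itself looks roughly like the sketch below (paraphrased from kernel sources around v5.8; details may differ between releases): 1 = new instruction pushed, 0 = edge to an already-recorded/explored instruction (or a tolerated back edge for bpf_capable programs), negative = error.

/* abridged sketch of push_insn() (around Linux 5.8); exact code may differ */
static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
                     bool loop_ok)
{
    int *insn_stack = env->cfg.insn_stack;
    int *insn_state = env->cfg.insn_state;

    if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
        return 0;
    if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
        return 0;

    if (w < 0 || w >= env->prog->len) {
        verbose(env, "jump out of range from insn %d to %d\n", t, w);
        return -EINVAL;
    }

    if (e == BRANCH)
        /* mark branch target for state pruning */
        init_explored_state(env, w);

    if (insn_state[w] == 0) {
        /* tree-edge: first time we reach w, push it */
        insn_state[t] = DISCOVERED | e;
        insn_state[w] = DISCOVERED;
        if (env->cfg.cur_stack >= env->prog->len)
            return -E2BIG;
        insn_stack[env->cfg.cur_stack++] = w;
        return 1;
    } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
        /* back-edge: only tolerated for bounded loops (bpf_capable) */
        if (loop_ok && env->bpf_capable)
            return 0;
        verbose(env, "back-edge from insn %d to %d\n", t, w);
        return -EINVAL;
    } else if (insn_state[w] == EXPLORED) {
        /* forward- or cross-edge */
        insn_state[t] = DISCOVERED | e;
        return 0;
    } else {
        verbose(env, "insn state internal bug\n");
        return -EFAULT;
    }
}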

do_check_subprogs

static int do_check_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog_aux *aux = env->prog->aux;
int i, ret;

if (!aux->func_info)
return 0;

for (i = 1; i < env->subprog_cnt; i++) {
if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
continue;
env->insn_idx = env->subprog_info[i].start;
WARN_ON_ONCE(env->insn_idx == 0);
ret = do_check_common(env, i);
if (ret) {
return ret;
} else if (env->log.level & BPF_LOG_LEVEL) {
verbose(env,
"Func#%d is safe for any args that match its prototype\n",
i);
}
}
return 0;
}

do_check_common

It walks all subprogs (only the BTF global functions) and calls do_check_common on each:

static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;

env->prev_linfo = NULL;
env->pass_cnt++;

state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
if (!state)
return -ENOMEM;
state->curframe = 0;
state->speculative = false;
state->branches = 1;
state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
if (!state->frame[0]) {
kfree(state);
return -ENOMEM;
}
env->cur_state = state;
init_func_state(env, state->frame[0],
BPF_MAIN_FUNC /* callsite */,
0 /* frameno */,
subprog);

regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
ret = btf_prepare_func_args(env, subprog, regs);
if (ret)
goto out;
for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
if (regs[i].type == PTR_TO_CTX)
mark_reg_known_zero(env, regs, i);
else if (regs[i].type == SCALAR_VALUE)
mark_reg_unknown(env, regs, i); // argument passed in: its value could be anything
}
} else {
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
mark_reg_known_zero(env, regs, BPF_REG_1);
ret = btf_check_func_arg_match(env, subprog, regs);
if (ret == -EFAULT)
/* unlikely verifier bug. abort.
* ret == 0 and ret < 0 are sadly acceptable for
* main() function due to backward compatibility.
* Like socket filter program may be written as:
* int bpf_prog(struct pt_regs *ctx)
* and never dereference that ctx in the program.
* 'struct pt_regs' is a type mismatch for socket
* filter that should be using 'struct __sk_buff'.
*/
goto out;
}

ret = do_check(env);
out:
/* check for NULL is necessary, since cur_state can be freed inside
* do_check() under memory pressure.
*/
if (env->cur_state) {
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
}
while (!pop_stack(env, NULL, NULL, false));
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
if (ret)
/* clean aux data in case subprog was rejected */
sanitize_insn_aux_data(env);
return ret;
}

First a fresh verifier state is prepared; then the registers are initialized according to the program/function type and its argument types;
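The baseline register state comes from init_func_state(), which calls init_reg_state(); roughly (abridged, around v5.8), every register starts as NOT_INIT except R10, the read-only frame pointer:

/* abridged sketch of init_reg_state() (around Linux 5.8) */
static void init_reg_state(struct bpf_verifier_env *env,
                           struct bpf_func_state *state)
{
    struct bpf_reg_state *regs = state->regs;
    int i;

    for (i = 0; i < MAX_BPF_REG; i++) {
        mark_reg_not_init(env, regs, i);
        regs[i].live = REG_LIVE_NONE;
        regs[i].parent = NULL;
        regs[i].subreg_def = DEF_NOT_SUBREG;
    }

    /* frame pointer: read-only pointer into this frame's stack */
    regs[BPF_REG_FP].type = PTR_TO_STACK;
    mark_reg_known_zero(env, regs, BPF_REG_FP);
    regs[BPF_REG_FP].frameno = state->frameno;
}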

do_check

Then do_check is called:

static int do_check(struct bpf_verifier_env *env)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
bool do_print_state = false;
int prev_insn_idx = -1;

for (;;) {
struct bpf_insn *insn;
u8 class;
int err;

env->prev_insn_idx = prev_insn_idx;
if (env->insn_idx >= insn_cnt) {
verbose(env, "invalid insn idx %d insn_cnt %d\n",
env->insn_idx, insn_cnt);
return -EFAULT;
}

insn = &insns[env->insn_idx];
class = BPF_CLASS(insn->code);

if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
verbose(env,
"BPF program is too large. Processed %d insn\n",
env->insn_processed);
return -E2BIG;
}

err = is_state_visited(env, env->insn_idx); // if an equivalent state was already visited, this path is pruned
if (err < 0)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
if (env->log.level & BPF_LOG_LEVEL) {
if (do_print_state)
verbose(env, "\nfrom %d to %d%s: safe\n",
env->prev_insn_idx, env->insn_idx,
env->cur_state->speculative ?
" (speculative execution)" : "");
else
verbose(env, "%d: safe\n", env->insn_idx); /////
}
goto process_bpf_exit;
}

if (signal_pending(current))
return -EAGAIN;

if (need_resched())
cond_resched();

if (env->log.level & BPF_LOG_LEVEL2 ||
(env->log.level & BPF_LOG_LEVEL && do_print_state)) {
if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "%d:", env->insn_idx);
else
verbose(env, "\nfrom %d to %d%s:",
env->prev_insn_idx, env->insn_idx,
env->cur_state->speculative ?
" (speculative execution)" : "");
print_verifier_state(env, state->frame[state->curframe]);
do_print_state = false;
}

if (env->log.level & BPF_LOG_LEVEL) {
const struct bpf_insn_cbs cbs = {
.cb_print = verbose,
.private_data = env,
};

verbose_linfo(env, env->insn_idx, "; ");
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
}

if (bpf_prog_is_dev_bound(env->prog->aux)) {
err = bpf_prog_offload_verify_insn(env, env->insn_idx,
env->prev_insn_idx);
if (err)
return err;
}

regs = cur_regs(env);
env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
prev_insn_idx = env->insn_idx;

if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
return err;

} else if (class == BPF_LDX) {
enum bpf_reg_type *prev_src_type, src_reg_type;

/* check for reserved fields is already done */

/* check src operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;

err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;

src_reg_type = regs[insn->src_reg].type;

/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
err = check_mem_access(env, env->insn_idx, insn->src_reg,
insn->off, BPF_SIZE(insn->code),
BPF_READ, insn->dst_reg, false);
if (err)
return err;

prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;

if (*prev_src_type == NOT_INIT) {
/* saw a valid insn
* dst_reg = *(u32 *)(src_reg + off)
* save type to validate intersecting paths
*/
*prev_src_type = src_reg_type;

} else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
/* ABuser program is trying to use the same insn
* dst_reg = *(u32*) (src_reg + off)
* with different pointer types:
* src_reg == ctx in one branch and
* src_reg == stack|map in some other branch.
* Reject it.
*/
verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}

} else if (class == BPF_STX) {
enum bpf_reg_type *prev_dst_type, dst_reg_type;

if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, env->insn_idx, insn);
if (err)
return err;
env->insn_idx++;
continue;
}

/* check src1 operand */
err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;

dst_reg_type = regs[insn->dst_reg].type;

/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
BPF_WRITE, insn->src_reg, false);
if (err)
return err;

prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;

if (*prev_dst_type == NOT_INIT) {
*prev_dst_type = dst_reg_type;
} else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}

} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_ST uses reserved fields\n");
return -EINVAL;
}
/* check src operand */
err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;

if (is_ctx_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str[reg_state(env, insn->dst_reg)->type]);
return -EACCES;
}

/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
insn->off, BPF_SIZE(insn->code),
BPF_WRITE, -1, false);
if (err)
return err;

} else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);

env->jmps_processed++;
if (opcode == BPF_CALL) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->off != 0 ||
(insn->src_reg != BPF_REG_0 &&
insn->src_reg != BPF_PSEUDO_CALL) ||
insn->dst_reg != BPF_REG_0 ||
class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}

if (env->cur_state->active_spin_lock &&
(insn->src_reg == BPF_PSEUDO_CALL ||
insn->imm != BPF_FUNC_spin_unlock)) {
verbose(env, "function calls are not allowed while holding a lock\n");
return -EINVAL;
}
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else
err = check_helper_call(env, insn->imm, env->insn_idx);
if (err)
return err;

} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0 ||
class == BPF_JMP32) {
verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}

env->insn_idx += insn->off + 1;
continue;

} else if (opcode == BPF_EXIT) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0 ||
class == BPF_JMP32) {
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}

if (env->cur_state->active_spin_lock) {
verbose(env, "bpf_spin_unlock is missing\n");
return -EINVAL;
}

if (state->curframe) {
/* exit from nested function */
err = prepare_func_exit(env, &env->insn_idx);
if (err)
return err;
do_print_state = true;
continue;
}

err = check_reference_leak(env);
if (err)
return err;

err = check_return_code(env);
if (err)
return err;
process_bpf_exit:
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
&env->insn_idx, pop_log);
if (err < 0) {
if (err != -ENOENT)
return err;
break;
} else {
do_print_state = true;
continue;
}
} else {
err = check_cond_jmp_op(env, insn, &env->insn_idx);
if (err)
return err;
}
} else if (class == BPF_LD) {
u8 mode = BPF_MODE(insn->code);

if (mode == BPF_ABS || mode == BPF_IND) {
err = check_ld_abs(env, insn);
if (err)
return err;

} else if (mode == BPF_IMM) {
err = check_ld_imm(env, insn);
if (err)
return err;

env->insn_idx++;
env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
}
} else {
verbose(env, "unknown insn class %d\n", class);
return -EINVAL;
}

env->insn_idx++;
}

return 0;
}

do_check_common is called exactly once per subprog, and do_check is invoked from inside it, so do_check also runs once per subprog;

Inside do_check, the instructions are walked starting from the subprog entry. Each iteration first calls is_state_visited, which only does real work at instructions marked as prune points and returns immediately otherwise. The walk is not a plain linear scan: conditional jumps, unconditional jumps and exit all need their own handling;

env maintains a stack of branches still to be explored: env->stack_size is the number of elements, env->head points to the top, and the elements are chained into the stack as a singly linked list.
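The stack element and the push operation look roughly like this (abridged from kernel sources around v5.8; field layout may differ slightly): push_stack snapshots the current verifier state into a new element and links it at the head of env->head.

/* abridged sketch (around Linux 5.8) */
struct bpf_verifier_stack_elem {
    /* verifier state is 'st' before processing instruction 'insn_idx'
     * and after processing instruction 'prev_insn_idx'
     */
    struct bpf_verifier_state st;
    int insn_idx;
    int prev_insn_idx;
    struct bpf_verifier_stack_elem *next;
    /* length of verifier log at the time this state was pushed on stack */
    u32 log_pos;
};

static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
                                             int insn_idx, int prev_insn_idx,
                                             bool speculative)
{
    struct bpf_verifier_state *cur = env->cur_state;
    struct bpf_verifier_stack_elem *elem;
    int err;

    elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
    if (!elem)
        goto err;

    elem->insn_idx = insn_idx;
    elem->prev_insn_idx = prev_insn_idx;
    elem->next = env->head;                    /* push at the head of the list */
    elem->log_pos = env->log.len_used;
    env->head = elem;
    env->stack_size++;
    err = copy_verifier_state(&elem->st, cur); /* snapshot the current state */
    if (err)
        goto err;
    elem->st.speculative |= speculative;
    if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
        verbose(env, "The sequence of %d jumps is too complex.\n",
                env->stack_size);
        goto err;
    }
    return &elem->st;
err:
    free_verifier_state(env->cur_state, true);
    env->cur_state = NULL;
    /* pop all elements and return */
    while (!pop_stack(env, NULL, NULL, false));
    return NULL;
}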

When do_check hits a conditional jump it enters check_cond_jmp_op. If the outcome of the comparison can already be decided from the register state, mark_chain_precision is first called on the register(s) involved and only the feasible branch is followed; if both branches are possible, the branch-taken path is pushed onto the stack with push_stack and analysis continues along the fall-through path first.

In the code above, reaching an exit falls through to process_bpf_exit, which pops one saved state from the stack and continues from there; likewise, when is_state_visited returns 1 the code jumps to the same place and moves on to the next pending state. So once the fall-through branch of a conditional jump has been analyzed down to an exit, the state popped next is the branch-taken path.
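Correspondingly, pop_stack (abridged, around v5.8) restores the snapshot at the head of the list into env->cur_state and hands back the instruction indices to resume from; -ENOENT means there is nothing left to explore and do_check can finish.

/* abridged sketch (around Linux 5.8) */
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
                     int *insn_idx, bool pop_log)
{
    struct bpf_verifier_state *cur = env->cur_state;
    struct bpf_verifier_stack_elem *elem, *head = env->head;
    int err;

    if (env->head == NULL)
        return -ENOENT;                 /* nothing left to explore */

    if (cur) {
        err = copy_verifier_state(cur, &head->st);
        if (err)
            return err;
    }
    if (pop_log)
        bpf_vlog_reset(&env->log, head->log_pos);
    if (insn_idx)
        *insn_idx = head->insn_idx;
    if (prev_insn_idx)
        *prev_insn_idx = head->prev_insn_idx;
    elem = head->next;
    free_verifier_state(&head->st, false);
    kfree(head);
    env->head = elem;
    env->stack_size--;
    return 0;
}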

is_state_visited

static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
bool add_new_state = env->test_state_freq ? true : false;

cur->last_insn_idx = env->prev_insn_idx;
if (!env->insn_aux_data[insn_idx].prune_point) // only instructions marked as prune points get a state search
/* this 'insn_idx' instruction wasn't marked, so we will not
* be doing state search here
*/
return 0;

/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
* Do not add new state for future pruning if the verifier hasn't seen
* at least 2 jumps and at least 8 instructions.
* This heuristics helps decrease 'total_states' and 'peak_states' metric.
* In tests that amounts to up to 50% reduction into total verifier
* memory consumption and 20% verifier time speedup.
*/
if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;

pprev = explored_state(env, insn_idx); // fetch the bucket's state list; early on it may simply be empty
sl = *pprev;

clean_live_states(env, insn_idx, cur);

while (sl) {
states_cnt++;
if (sl->state.insn_idx != insn_idx)
goto next;
if (sl->state.branches) {
if (states_maybe_looping(&sl->state, cur) &&
states_equal(env, &sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
}
/* if the verifier is processing a loop, avoid adding new state
* too often, since different loop iterations have distinct
* states and may not help future pruning.
* This threshold shouldn't be too low to make sure that
* a loop with large bound will be rejected quickly.
* The most abusive loop will be:
* r1 += 1
* if r1 < 1000000 goto pc-2
* 1M insn_procssed limit / 100 == 10k peak states.
* This threshold shouldn't be too high either, since states
* at the end of the loop are likely to be useful in pruning.
*/
if (env->jmps_processed - env->prev_jmps_processed < 20 &&
env->insn_processed - env->prev_insn_processed < 100)
add_new_state = false;
goto miss;
}
if (states_equal(env, &sl->state, cur)) {
sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.
* Registers read by the continuation are read by us.
* If we have any write marks in env->cur_state, they
* will prevent corresponding reads in the continuation
* from reaching our parent (an explored_state). Our
* own state will get the read marks recorded, but
* they'll be immediately forgotten as we're pruning
* this state and will pop a new one.
*/
err = propagate_liveness(env, &sl->state, cur);

/* if previous state reached the exit with precision and
* current state is equivalent to it (except precsion marks)
* the precision needs to be propagated back in
* the current state.
*/
err = err ? : push_jmp_history(env, cur);
err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
return 1;
}
miss:
/* when new state is not going to be added do not increase miss count.
* Otherwise several loop iterations will remove the state
* recorded earlier. The goal of these heuristics is to have
* states from some iterations of the loop (some in the beginning
* and some at the end) to help pruning.
*/
if (add_new_state)
sl->miss_cnt++;
/* heuristic to determine whether this state is beneficial
* to keep checking from state equivalence point of view.
* Higher numbers increase max_states_per_insn and verification time,
* but do not meaningfully decrease insn_processed.
*/
if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
/* the state is unlikely to be useful. Remove it to
* speed up verification
*/
*pprev = sl->next;
if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
u32 br = sl->state.branches;

WARN_ONCE(br,
"BUG live_done but branches_to_explore %d\n",
br);
free_verifier_state(&sl->state, false);
kfree(sl);
env->peak_states--;
} else {
/* cannot free this state, since parentage chain may
* walk it later. Add it for free_list instead to
* be freed at the end of verification
*/
sl->next = env->free_list;
env->free_list = sl;
}
sl = *pprev;
continue;
}
next:
pprev = &sl->next;
sl = *pprev;
}

if (env->max_states_per_insn < states_cnt)
env->max_states_per_insn = states_cnt;

if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
return push_jmp_history(env, cur);

if (!add_new_state)
return push_jmp_history(env, cur);

/* There were no equivalent states, remember the current one.
* Technically the current state is not proven to be safe yet,
* but it will either reach outer most bpf_exit (which means it's safe)
* or it will be rejected. When there are no loops the verifier won't be
* seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
* again on the way to bpf_exit.
* When looping the sl->state.branches will be > 0 and this state
* will not be considered for equivalence until branches == 0.
*/
new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
return -ENOMEM;
env->total_states++;
env->peak_states++;
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;

/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur); // snapshot the current state into the new list entry
if (err) {
free_verifier_state(new, false);
kfree(new_sl);
return err;
}
new->insn_idx = insn_idx;
WARN_ONCE(new->branches != 1,
"BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);

cur->parent = new;
cur->first_insn_idx = insn_idx;
clear_jmp_history(cur);
new_sl->next = *explored_state(env, insn_idx); // look up the bucket again and insert the new entry at the head of the list
*explored_state(env, insn_idx) = new_sl;
/* connect new state to parentage chain. Current frame needs all
* registers connected. Only r6 - r9 of the callers are alive (pushed
* to the stack implicitly by JITs) so in callers' frames connect just
* r6 - r9 as an optimization. Callers will have r1 - r5 connected to
* the state of the call instruction (with WRITTEN set), and r0 comes
* from callee with its full parentage chain, anyway.
*/
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
* their parent and current state never has children yet. Only
* explored_states can get read marks.)
*/
for (j = 0; j <= cur->curframe; j++) {
for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
for (i = 0; i < BPF_REG_FP; i++)
cur->frame[j]->regs[i].live = REG_LIVE_NONE;
}

/* all stack frames are accessible from callee, clear them all */
for (j = 0; j <= cur->curframe; j++) {
struct bpf_func_state *frame = cur->frame[j];
struct bpf_func_state *newframe = new->frame[j];

for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
frame->stack[i].spilled_ptr.parent =
&newframe->stack[i].spilled_ptr;
}
}
return 0;
}

The key to understanding is_state_visited is the bpf_verifier_state_list structure. env holds an array named explored_states of bpf_verifier_state_list pointers; conceptually each entry corresponds to a (call site, instruction) pair, so the key space is huge and the entries are organized as a hash table.

In the very beginning the bucket yields an empty list, so the while loop is skipped entirely; looking at the add-new-state code at the bottom, a new list entry new_sl is allocated, the current state is copied into it, and it is linked into the bucket:

So on entering is_state_visited, the first step is to fetch such a list (which may of course be empty) and then walk every state in it, since those states all belong to the same call site and instruction.
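The bucket lookup is roughly the following (abridged, around v5.8): the key XORs the instruction index with the callsite of the current frame, and the table has one bucket per program instruction.

/* abridged sketch (around Linux 5.8) */
static u32 state_htab_size(struct bpf_verifier_env *env)
{
    return env->prog->len;
}

static struct bpf_verifier_state_list **explored_state(
                                        struct bpf_verifier_env *env,
                                        int idx)
{
    struct bpf_verifier_state *cur = env->cur_state;
    struct bpf_func_state *state = cur->frame[cur->curframe];

    return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
}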

copy

Let's look at how this new_sl is created and initialized:

new is passed in to be copied into:

static int copy_verifier_state(struct bpf_verifier_state *dst_state,
const struct bpf_verifier_state *src)
{
struct bpf_func_state *dst;
u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
int i, err;

if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
kfree(dst_state->jmp_history);
dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
if (!dst_state->jmp_history)
return -ENOMEM;
}
memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
dst_state->jmp_history_cnt = src->jmp_history_cnt;

/* if dst has more stack frames then src frame, free them */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
}
dst_state->speculative = src->speculative;
dst_state->curframe = src->curframe;
dst_state->active_spin_lock = src->active_spin_lock;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
dst = kzalloc(sizeof(*dst), GFP_KERNEL);
if (!dst)
return -ENOMEM;
dst_state->frame[i] = dst;
}
err = copy_func_state(dst, src->frame[i]);
if (err)
return err;
}
return 0;
}

Afterwards control returns to is_state_visited and continues;

The frame-related structures are defined as follows:
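An abridged sketch of the two structures (based on include/linux/bpf_verifier.h around v5.8; many fields elided): a bpf_verifier_state is essentially a call stack of bpf_func_state frames plus pruning/backtracking bookkeeping, and only bpf_func_state carries registers and stack slots.

/* abridged sketch (around Linux 5.8); fields elided */
struct bpf_func_state {
    struct bpf_reg_state regs[MAX_BPF_REG];
    /* index of the call instruction that called into this func */
    int callsite;
    /* frame number of this state within the call chain */
    u32 frameno;
    /* subprog number == index within subprog_info */
    u32 subprogno;
    /* ... reference tracking elided ... */
    int acquired_refs;
    struct bpf_reference_state *refs;
    int allocated_stack;
    struct bpf_stack_state *stack;
};

struct bpf_verifier_state {
    /* call stack tracking: one bpf_func_state per active frame */
    struct bpf_func_state *frame[MAX_CALL_FRAMES];
    struct bpf_verifier_state *parent;
    u32 branches;        /* branches of this state still left to explore */
    u32 insn_idx;
    u32 curframe;
    u32 active_spin_lock;
    bool speculative;
    /* first and last insn idx of this verifier state */
    u32 first_insn_idx;
    u32 last_insn_idx;
    /* jump history recorded between first/last insn, used for backtracking */
    struct bpf_idx_pair *jmp_history;
    u32 jmp_history_cnt;
};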

states_equal

Now let's see how two states are judged to be equal:

static int is_state_visited(env, env->insn_idx) {
    pprev = explored_state(env, insn_idx);
    sl = *pprev;
    // sl walks every stored state that shares the same callsite / insn bucket
    states_equal(env, &sl->state, cur) {
        // old = sl->state, cur = cur
        for (i = 0; i <= old->curframe; i++) {
            func_states_equal(old->frame[i], cur->frame[i]) {
                // old = old->frame[i], cur = cur->frame[i]
                regsafe(&old->regs[i], &cur->regs[i], idmap) {
                    // rold = &old->regs[i], rcur = &cur->regs[i]
                    switch (rold->type) {
                    case SCALAR_VALUE:
                        if (!rold->precise && !rcur->precise)
                            return true;
                    }
                }
            }
        }
    }
}

Summary

bpf_check() {
    check_subprogs();   // split the program into subprogs at BPF-to-BPF calls, record them in the subprog array
    check_cfg();        // DFS over all instructions: reject unreachable code and bad jumps, mark prune points
    do_check_common() { // verify one subprog
        do_check() {
            pprev = explored_state(env, insn_idx);
            sl = *pprev;
            is_state_visited() { // walk the list behind pprev and check whether an equivalent state was visited
                // first fetch the state_list: hash bucket keyed by callsite ^ insn_idx
                // then compare against each stored state
                while () {
                    states_equal(env, &sl->state, cur) {
                        // old = sl->state, cur = cur
                        for (i = 0; i <= old->curframe; i++) {
                            func_states_equal(old->frame[i], cur->frame[i]) {
                                // old = old->frame[i], cur = cur->frame[i]
                                regsafe(&old->regs[i], &cur->regs[i], idmap) {
                                    // rold = &old->regs[i], rcur = &cur->regs[i]
                                    switch (rold->type) {
                                    case SCALAR_VALUE:
                                        // sl->state->frame[i]->regs->precise
                                        if (!rold->precise && !rcur->precise)
                                            return true;
                                    }
                                }
                            }
                        }
                    }
                }
                // no equivalent state: copy cur into a new sl and add it to the list
                new = &new_sl->state;
                err = copy_verifier_state(new, cur) {
                    // dst_state = new, src = cur
                    dst = dst_state->frame[i];
                    if (!dst) {
                        dst = kzalloc(sizeof(*dst), GFP_KERNEL);
                        if (!dst)
                            return -ENOMEM;
                        dst_state->frame[i] = dst;
                    }
                    err = copy_func_state(dst, src->frame[i]) {
                        //...
                        memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
                        //...
                    }
                }
            }
            // then switch on the instruction class and check it:
            switch (op) {
            case JMP:
                // the various cases are handled here
                check_cond_jmp_op() { // conditional jump: may mark registers as precise
                    if () { // when the branch outcome can be determined
                        mark_chain_precision(env, insn->dst_reg); // take env->cur_state, go to its curframe, set regs[regno].precise, then backtrack
                        // env->cur_state->frame[st->curframe]->regs[regno]
                        //...
                        mark_chain_precision(env, insn->src_reg);
                    }
                }
            }
        }
    }
}

Understanding precise

In regsafe, the check first compares all of the register's tracked knowledge (the min/max estimates and var_off); if everything is identical the register is considered safe right away (PTR_TO_STACK additionally requires the same frameno). Only when they differ does the per-type check run, and the precise flag is only meaningful for scalars. So precise can be read as a strictness switch for scalars during pruning: precise == false means the exact value does not matter and the comparison need not be strict.
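The relevant part of regsafe (abridged, around v5.8) shows this ordering: registers the old state never read are always safe, an exact memcmp up to the parent pointer short-circuits the check, and the precise flag only comes into play in the SCALAR_VALUE case.

/* abridged sketch of regsafe() (around Linux 5.8) */
static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
                    struct idpair *idmap)
{
    bool equal;

    if (!(rold->live & REG_LIVE_READ))
        /* explored state didn't use this */
        return true;

    equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;

    if (rold->type == PTR_TO_STACK)
        /* two stack pointers are equal only if they're pointing to
         * the same stack frame, since fp-8 in foo != fp-8 in bar
         */
        return equal && rold->frameno == rcur->frameno;

    if (equal)
        return true;

    if (rold->type == NOT_INIT)
        /* explored state can't have used this */
        return true;
    if (rcur->type == NOT_INIT)
        return false;
    switch (rold->type) {
    case SCALAR_VALUE:
        if (rcur->type == SCALAR_VALUE) {
            if (!rold->precise && !rcur->precise)
                return true;
            /* new val must satisfy old val knowledge */
            return range_within(rold, rcur) &&
                   tnum_in(rold->var_off, rcur->var_off);
        }
        /* a scalar in the old state cannot be matched by a pointer */
        return false;
    /* ... pointer cases elided ... */
    }
    /* ... */
}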

When do_check_common -> do_check -> is_state_visited finds that nothing equivalent has been visited, the current state is added to the list; at that moment we have the state at this jump (right after the jmp, before any new instruction has been executed), saved into sl like a snapshot;

Later, while individual instructions are analyzed, a conditional jump for example marks src_reg as precise; when the walk then enters the new block that mark is carried along, the value of src_reg is checked strictly, and the path is not pruned unless the stored value's knowledge really covers the current one;

Understanding frames

First, see the official eBPF documentation:

https://www.kernel.org/doc/html/latest/bpf/verifier.html

The gist is: within one state, calling a function creates a new frame. frame0 belongs to the caller itself and the new frame1 to the callee; frame0's r1-r5 are copied into frame1 as the argument passing, while r0 and r6-r9 are uninitialized from frame1's point of view.

Look at the check_func_call function:
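An abridged sketch of check_func_call (around v5.8; error handling and BTF argument checks elided): a new bpf_func_state is allocated as the callee frame, r1-r5 are copied from the caller, the caller-saved registers are scratched, then curframe and insn_idx switch to the callee.

/* abridged sketch of check_func_call() (around Linux 5.8) */
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                           int *insn_idx)
{
    struct bpf_verifier_state *state = env->cur_state;
    struct bpf_func_state *caller, *callee;
    int i, subprog, target_insn;

    if (state->curframe + 1 >= MAX_CALL_FRAMES)
        return -E2BIG;

    target_insn = *insn_idx + insn->imm;
    subprog = find_subprog(env, target_insn + 1);
    /* ... error handling and BTF argument checks elided ... */

    caller = state->frame[state->curframe];
    callee = kzalloc(sizeof(*callee), GFP_KERNEL);
    if (!callee)
        return -ENOMEM;
    state->frame[state->curframe + 1] = callee;

    /* callee cannot access r0, r6 - r9 for reading and has to write
     * into its own stack before reading from it.
     * callee can read/write into caller's stack
     */
    init_func_state(env, callee,
                    *insn_idx /* callsite */,
                    state->curframe + 1 /* frameno within this callchain */,
                    subprog /* subprog number within this prog */);

    /* copy r1 - r5 args that the callee can access; the copy includes the
     * parent pointers, which connects us up to the liveness chain
     */
    for (i = BPF_REG_1; i <= BPF_REG_5; i++)
        callee->regs[i] = caller->regs[i];

    /* after the call, registers r0 - r5 of the caller are scratched */
    for (i = 0; i < CALLER_SAVED_REGS; i++) {
        mark_reg_not_init(env, caller->regs, caller_saved[i]);
        check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
    }

    /* switch to the callee frame and go analyze it */
    state->curframe++;
    *insn_idx = target_insn;
    return 0;
}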

After the new frame is created, the state's curframe is switched to the new frame and the instruction index moves into the called function;

A brand-new verifier state is only created per subprog (env->cur_state is swapped in do_check_common);

The criterion for a subprog boundary is an instruction with code (BPF_JMP | BPF_CALL) and src_reg == BPF_PSEUDO_CALL, i.e. a BPF-to-BPF function call as opposed to a helper call; so only these internal calls split the program into subprogs;

As for pruning: every jump branch target is also marked as a prune_point, so the prune points are a superset of the subprog entries;

So it becomes clear: the comparison in is_state_visited, the copy into new_sl when no match is found, and __mark_chain_precision all revolve around frames.

Only bpf_func_state holds regs; bpf_verifier_state contains the func_states (frames) but has no regs of its own;

Summary of mark_chain_precision call sites

check_stack_write: when an 8-byte scalar is spilled to the stack;

static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
int spi)
{
struct bpf_verifier_state *st = env->cur_state;
int first_idx = st->first_insn_idx;
int last_idx = env->insn_idx;
struct bpf_func_state *func;
struct bpf_reg_state *reg;
u32 reg_mask = regno >= 0 ? 1u << regno : 0;
u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
bool skip_first = true;
bool new_marks = false;
int i, err;

if (!env->bpf_capable)
return 0;
///=============== find the current func frame and the register that needs marking ===============
func = st->frame[st->curframe];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
WARN_ONCE(1, "backtracing misuse");
return -EFAULT;
}
if (!reg->precise)
new_marks = true;
else
reg_mask = 0;
reg->precise = true;
}
///=============== set the precise flag for the stack slot, if any ===============
while (spi >= 0) {
if (func->stack[spi].slot_type[0] != STACK_SPILL) {
stack_mask = 0;
break;
}
reg = &func->stack[spi].spilled_ptr;
if (reg->type != SCALAR_VALUE) {
stack_mask = 0;
break;
}
if (!reg->precise)
new_marks = true;
else
stack_mask = 0;
reg->precise = true;
break;
}

if (!new_marks)
return 0;
if (!reg_mask && !stack_mask)
return 0;
//=============================== start backtracking ===============================
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;

if (env->log.level & BPF_LOG_LEVEL)
verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
for (i = last_idx;;) {
if (skip_first) {
err = 0;
skip_first = false; // skip the first (current) instruction
} else {
err = backtrack_insn(env, i, &reg_mask, &stack_mask); // backtrack one instruction: setting precise = true here may implicate whatever earlier instructions produced this value, so walk backwards
}
if (err == -ENOTSUPP) {
mark_all_scalars_precise(env, st);
return 0;
} else if (err) {
return err;
}
if (!reg_mask && !stack_mask)
/* Found assignment(s) into tracked register in this state.
* Since this state is already marked, just return.
* Nothing to be tracked further in the parent state.
*/
return 0;
if (i == first_idx)
break;
i = get_prev_insn_idx(st, i, &history); // step back one instruction (following the recorded jump history)
if (i >= env->prog->len) {
/* This can happen if backtracking reached insn 0
* and there are still reg_mask or stack_mask
* to backtrack.
* It means the backtracking missed the spot where
* particular register was initialized with a constant.
*/
verbose(env, "BUG backtracking idx %d\n", i);
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
}
} // finished backtracking within this state
st = st->parent; // move on to the parent (earlier) state
if (!st)
break;

new_marks = false;
func = st->frame[st->curframe];
bitmap_from_u64(mask, reg_mask);
for_each_set_bit(i, mask, 32) {
reg = &func->regs[i];
if (reg->type != SCALAR_VALUE) {
reg_mask &= ~(1u << i);
continue;
}
if (!reg->precise)
new_marks = true;
reg->precise = true;
}

bitmap_from_u64(mask, stack_mask);
for_each_set_bit(i, mask, 64) {
if (i >= func->allocated_stack / BPF_REG_SIZE) {
/* the sequence of instructions:
* 2: (bf) r3 = r10
* 3: (7b) *(u64 *)(r3 -8) = r0
* 4: (79) r4 = *(u64 *)(r10 -8)
* doesn't contain jmps. It's backtracked
* as a single block.
* During backtracking insn 3 is not recognized as
* stack access, so at the end of backtracking
* stack slot fp-8 is still marked in stack_mask.
* However the parent state may not have accessed
* fp-8 and it's "unallocated" stack space.
* In such case fallback to conservative.
*/
mark_all_scalars_precise(env, st);
return 0;
}

if (func->stack[i].slot_type[0] != STACK_SPILL) {
stack_mask &= ~(1ull << i);
continue;
}
reg = &func->stack[i].spilled_ptr;
if (reg->type != SCALAR_VALUE) {
stack_mask &= ~(1ull << i);
continue;
}
if (!reg->precise)
new_marks = true;
reg->precise = true;
}
if (env->log.level & BPF_LOG_LEVEL) {
print_verifier_state(env, func);
verbose(env, "parent %s regs=%x stack=%llx marks\n",
new_marks ? "didn't have" : "already had",
reg_mask, stack_mask);
}

if (!reg_mask && !stack_mask)
break;
if (!new_marks)
break;

last_idx = st->last_insn_idx;
first_idx = st->first_insn_idx;
}
return 0;
} // keep backtracking: as long as reg_mask or stack_mask is non-zero, continue up the parent chain

Backtracking happens inside the current state first; once its first instruction is reached, the corresponding registers in the parent are marked, and if reg_mask or stack_mask is still non-zero the walk continues up into the parent state.

