pipapo_set删除elem过程浅析


前言

本篇文章仍然是服务于分析CVE-2023-4004.

netlink处理消息过程

重点函数是nfnetlink_rcv_batch: https://elixir.bootlin.com/linux/v5.15/source/net/netfilter/nfnetlink.c#L365

/*
 * nfnetlink_rcv_batch - process one nfnetlink batch (excerpt quoted from
 * Linux v5.15; part of the function is elided, see the marker below).
 *
 * As visible in this excerpt: every message in the cloned skb is validated,
 * its callback is looked up with nfnetlink_find_client() and invoked through
 * nc->call(). An ordinary callback error is queued on err_list at the "ack"
 * label and sets NFNL_BATCH_FAILURE, but the loop continues with the next
 * message. Only -EAGAIN leaves the loop immediately so the batch is replayed.
 * At "done", ss->commit() runs only when status is exactly NFNL_BATCH_DONE;
 * otherwise ss->abort() is called.
 */
static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
u16 subsys_id, u32 genid)
{
struct sk_buff *oskb = skb;
struct net *net = sock_net(skb->sk);
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
struct netlink_ext_ack extack;
LIST_HEAD(err_list);
u32 status;
int err;

if (subsys_id >= NFNL_SUBSYS_COUNT)
return netlink_ack(skb, nlh, -EINVAL, NULL);
replay:
status = 0;
replay_abort:
skb = netlink_skb_clone(oskb, GFP_KERNEL);
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM, NULL);

nfnl_lock(subsys_id);
ss = nfnl_dereference_protected(subsys_id);
if (!ss) {

// Some code is skipped here (elided by the article author), so the brace above is left open in this excerpt

while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;

if (fatal_signal_pending(current)) {
nfnl_err_reset(&err_list);
err = -EINTR;
status = NFNL_BATCH_FAILURE;
goto done;
}

memset(&extack, 0, sizeof(extack));
nlh = nlmsg_hdr(skb);
err = 0;

if (nlh->nlmsg_len < NLMSG_HDRLEN ||
skb->len < nlh->nlmsg_len ||
nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
nfnl_err_reset(&err_list);
status |= NFNL_BATCH_FAILURE;
goto done;
}

/* Only requests are handled by the kernel */
if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
err = -EINVAL;
goto ack;
}

type = nlh->nlmsg_type;
if (type == NFNL_MSG_BATCH_BEGIN) { // a batch opens with a BEGIN message; meeting another one inside the loop is malformed
/* Malformed: Batch begin twice */
nfnl_err_reset(&err_list);
status |= NFNL_BATCH_FAILURE;
goto done;
} else if (type == NFNL_MSG_BATCH_END) { // the last message of a batch is the END message: mark the batch done
status |= NFNL_BATCH_DONE;
goto done;
} else if (type < NLMSG_MIN_TYPE) {
err = -EINVAL;
goto ack;
}

/* We only accept a batch with messages for the same
* subsystem.
*/
if (NFNL_SUBSYS_ID(type) != subsys_id) {
err = -EINVAL;
goto ack;
}

nc = nfnetlink_find_client(type, ss); // messages between BEGIN and END get here: look up the callback for this type
if (!nc) {
err = -EINVAL;
goto ack;
}

if (nc->type != NFNL_CB_BATCH) {
err = -EINVAL;
goto ack;
}

{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
int attrlen = nlh->nlmsg_len - min_len;
struct nfnl_info info = {
.net = net,
.sk = nfnlnet->nfnl,
.nlh = nlh,
.nfmsg = nlmsg_data(nlh),
.extack = &extack,
};

/* Sanity-check NFTA_MAX_ATTR */
if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
err = -ENOMEM;
goto ack;
}

err = nla_parse_deprecated(cda,
ss->cb[cb_id].attr_count,
attr, attrlen,
ss->cb[cb_id].policy, NULL);
if (err < 0)
goto ack;

// The per-message callback is invoked here, e.g. nf_tables_newtable
err = nc->call(skb, &info, (const struct nlattr **)cda);

/* The lock was released to autoload some module, we
* have to abort and start from scratch using the
* original skb.
*/
if (err == -EAGAIN) { // only -EAGAIN jumps straight to done; NFNL_BATCH_REPLAY then makes done call ss->abort and replay. Other errors fall through to ack.
status |= NFNL_BATCH_REPLAY;
goto done;
}
}
ack:
if (nlh->nlmsg_flags & NLM_F_ACK || err) {
/* Errors are delivered once the full batch has been
* processed, this avoids that the same error is
* reported several times when replaying the batch.
*/
if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) {
/* We failed to enqueue an error, reset the
* list of errors and send OOM to userspace
* pointing to the batch header.
*/
nfnl_err_reset(&err_list);
netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM,
NULL);
status |= NFNL_BATCH_FAILURE;
goto done;
}
/* We don't stop processing the batch on errors, thus,
* userspace gets all the errors that the batch
* triggers.
*/
if (err)
status |= NFNL_BATCH_FAILURE;
}

msglen = NLMSG_ALIGN(nlh->nlmsg_len);
if (msglen > skb->len)
msglen = skb->len;
skb_pull(skb, msglen);
}
done:
if (status & NFNL_BATCH_REPLAY) {
ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); // replay path: undo what was done so far, then restart at the replay label
nfnl_err_reset(&err_list);
kfree_skb(skb);
module_put(ss->owner);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
err = ss->commit(net, oskb); // reached only when END was seen and no failure/replay bit is set
if (err == -EAGAIN) {
status |= NFNL_BATCH_REPLAY;
goto done;
} else if (err) {
ss->abort(net, oskb, NFNL_ABORT_NONE);
netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
}
} else {
enum nfnl_abort_action abort_action;

if (status & NFNL_BATCH_FAILURE)
abort_action = NFNL_ABORT_NONE;
else
abort_action = NFNL_ABORT_VALIDATE;

err = ss->abort(net, oskb, abort_action);
if (err == -EAGAIN) {
nfnl_err_reset(&err_list);
kfree_skb(skb);
module_put(ss->owner);
status |= NFNL_BATCH_FAILURE;
goto replay_abort;
}
}
if (ss->cleanup)
ss->cleanup(net);

nfnl_err_deliver(&err_list, oskb);
kfree_skb(skb);
module_put(ss->owner);
}

总结:

正常处理中间的iov的过程中,会通过其type寻找对应的nc,然后调用其中的回调函数;

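回调函数执行完之后,如果返回的是-EAGAIN,就会带着NFNL_BATCH_REPLAY直接跳到done,调用ss->abort后从头重放(replay)整个批处理;如果是其他错误,则会走到ack处记录错误并置上NFNL_BATCH_FAILURE,然后继续处理批处理中的下一条消息,等循环结束到达done时再调用ss->abort;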

如果全都正常执行完了,就会执行ss->commit;

删除一个setelem

首先会调用到nf_tables_delsetelem

https://elixir.bootlin.com/linux/v5.15/source/net/netfilter/nf_tables_api.c#L6270

/*
 * nf_tables_delsetelem - batch callback that handles a "delete set element"
 * message (quoted from Linux v5.15).
 *
 * Looks up the table and the set named by the netlink attributes, then either
 * flushes the whole set (when NFTA_SET_ELEM_LIST_ELEMENTS is absent) or calls
 * nft_del_setelem() once per nested element attribute, stopping at the first
 * failure.
 *
 * Returns 0 on success or a negative errno: the lookup error for a missing
 * table/set, -EBUSY for a bound constant set, or the first error returned by
 * nft_del_setelem().
 */
static int nf_tables_delsetelem(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const nla[])
{
struct netlink_ext_ack *extack = info->extack;
u8 genmask = nft_genmask_next(info->net);
u8 family = info->nfmsg->nfgen_family;
struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
int rem, err = 0;

/* Resolve the table; on failure point extack at the offending attribute. */
table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
genmask, NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
return PTR_ERR(table);
}

set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
/* A set that is both bound (non-empty bindings) and constant is refused. */
if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
return -EBUSY;

nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

/* No element list given: flush the set instead of deleting single elements. */
if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
return nft_set_flush(&ctx, set, genmask);

/* Delete each listed element; the first error ends the loop and is returned. */
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
if (err < 0)
break;
}
return err;
}

然后调用到nft_del_setelem: https://elixir.bootlin.com/linux/v5.15/source/net/netfilter/nf_tables_api.c#L6103

/*
 * nft_del_setelem - queue the deletion of one set element (quoted from
 * Linux v5.15).
 *
 * Parses the element attributes, builds a temporary element from the given
 * key (and optional end key), allocates an NFT_MSG_DELSETELEM transaction,
 * deactivates the matching element and appends the transaction to the commit
 * list. Nothing is removed from the set in this function.
 *
 * Returns 0 on success or a negative errno; on failure the objects allocated
 * so far are released through the fail_* labels, newest first.
 */
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
struct nft_set_ext_tmpl tmpl;
struct nft_set_elem elem;
struct nft_set_ext *ext;
struct nft_trans *trans;
u32 flags = 0;
int err;

err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
nft_set_elem_policy, NULL);
if (err < 0)
return err;

err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;

/* A key is mandatory unless the catch-all flag is set. */
if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
return -EINVAL;

nft_set_ext_prepare(&tmpl);

if (flags != 0)
nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);

if (nla[NFTA_SET_ELEM_KEY]) {
err = nft_setelem_parse_key(ctx, set, &elem.key.val,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
return err;

nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
}

if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
nla[NFTA_SET_ELEM_KEY_END]);
if (err < 0)
return err;

nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
}

/* Preset the error code for the two allocation failures below. */
err = -ENOMEM;
// Allocate a temporary elem built from the parsed key(s)
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
elem.key_end.val.data, NULL, 0, 0,
GFP_KERNEL);
if (elem.priv == NULL)
goto fail_elem;

ext = nft_set_elem_ext(set, elem.priv);
if (flags)
*nft_set_ext_flags(ext) = flags;

// Create the transaction object for this deletion
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
if (trans == NULL)
goto fail_trans;

/*
Article author's note on the call below (the helper bodies are not shown in
this excerpt, so treat this as a description to verify against the v5.15 source):
nft_setelem_deactivate() ends up in __nft_setelem_deactivate(net, set, elem), which does
priv = set->ops->deactivate(net, set, elem);
(for a pipapo set, pipapo_deactivate uses pipapo_get to find the stored elem and returns it)
kfree(elem->priv);
elem->priv = priv;
set->ndeact++;
Net effect: the temporary elem allocated above is freed and elem.priv now
refers to the element actually stored in the set.
*/
err = nft_setelem_deactivate(ctx->net, set, &elem, flags);
if (err < 0)
goto fail_ops;

// Per the article author: only has an effect when the set's elements carry data or an objref
nft_setelem_data_deactivate(ctx->net, set, &elem);

// The transaction stores a copy of elem and is appended to the commit list
nft_trans_elem(trans) = elem;
nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;

/* Error unwinding: each label releases one more object, then falls through. */
fail_ops:
kfree(trans);
fail_trans:
kfree(elem.priv);
fail_elem:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
return err;
}

进入到ss->commit,也就是nf_tables_commit函数:

https://elixir.bootlin.com/linux/v5.15/source/net/netfilter/nf_tables_api.c#L8529

case NFT_MSG_DELSETELEM:
te = (struct nft_trans_elem *)trans->data;

nf_tables_setelem_notify(&trans->ctx, te->set,
&te->elem,
NFT_MSG_DELSETELEM);
//最终会调用到set->ops->remove, 然后就是pipapo_remove了
nft_setelem_remove(net, te->set, &te->elem);
if (!nft_setelem_is_catchall(te->set, &te->elem)) {
atomic_dec(&te->set->nelems);
te->set->ndeact--;
}
break;
//......
nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
nf_tables_commit_audit_log(&adl, nft_net->base_seq);
nf_tables_commit_release(net);

nf_tables_commit_release:其主要任务是先把commit_list上的trans整体移到nf_tables_destroy_list上,然后通过schedule_work将trans_destroy_work提交上去:

/*
 * nf_tables_commit_release - hand the committed transactions over for deferred
 * release (quoted from Linux v5.15).
 *
 * With an empty commit list it only runs the module-autoload cleanup and drops
 * commit_mutex. Otherwise it takes a netns reference on behalf of the last
 * transaction, moves the whole commit list onto nf_tables_destroy_list under
 * nf_tables_destroy_list_lock, schedules trans_destroy_work and finally drops
 * commit_mutex.
 */
static void nf_tables_commit_release(struct net *net)
{
struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans;

/* all side effects have to be made visible.
* For example, if a chain named 'foo' has been deleted, a
* new transaction must not find it anymore.
*
* Memory reclaim happens asynchronously from work queue
* to prevent expensive synchronize_rcu() in commit phase.
*/
if (list_empty(&nft_net->commit_list)) {
nf_tables_module_autoload_cleanup(net);
mutex_unlock(&nft_net->commit_mutex);
return;
}

/* Only the last transaction carries the put_net marker for the reference taken here. */
trans = list_last_entry(&nft_net->commit_list,
struct nft_trans, list);
get_net(trans->ctx.net);
WARN_ON_ONCE(trans->put_net);

trans->put_net = true;
/* Move every committed transaction to the global destroy list; commit_list ends up empty. */
spin_lock(&nf_tables_destroy_list_lock);
list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list);
spin_unlock(&nf_tables_destroy_list_lock);

nf_tables_module_autoload_cleanup(net);
/* Queue the work item that will later release the spliced transactions. */
schedule_work(&trans_destroy_work);

mutex_unlock(&nft_net->commit_mutex);
}

关于schedule_work:https://zhuanlan.zhihu.com/p/363279693

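之后会在什么时机调用trans_destroy_work呢?schedule_work只是把它挂到内核工作队列上,之后由工作队列异步执行(即上面注释里说的"Memory reclaim happens asynchronously from work queue");从函数名看,其处理函数应当是下面的nf_tables_trans_destroy_work,它对每个trans调用nft_commit_release,再往下贴出的case NFT_MSG_DELSETELEM片段就是nft_commit_release中的对应分支: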

/*
 * nf_tables_trans_destroy_work - work handler that releases transactions
 * queued on nf_tables_destroy_list (quoted from Linux v5.15).
 *
 * Takes the whole destroy list onto a private list while holding
 * nf_tables_destroy_list_lock, returns early if there is nothing to do,
 * calls synchronize_rcu(), and then unlinks each transaction and passes it
 * to nft_commit_release().
 */
static void nf_tables_trans_destroy_work(struct work_struct *w)
{
struct nft_trans *trans, *next;
LIST_HEAD(head);

/* Detach the pending transactions so the lock is held only for the splice. */
spin_lock(&nf_tables_destroy_list_lock);
list_splice_init(&nf_tables_destroy_list, &head);
spin_unlock(&nf_tables_destroy_list_lock);

if (list_empty(&head))
return;

/* The release below happens only after synchronize_rcu() has returned. */
synchronize_rcu();

list_for_each_entry_safe(trans, next, &head, list) {
list_del(&trans->list);
nft_commit_release(trans);
}
}

case NFT_MSG_DELSETELEM:
nf_tables_set_elem_destroy(&trans->ctx,
nft_trans_elem_set(trans),
nft_trans_elem(trans).priv);
break;

最后在nf_tables_set_elem_destroy函数中释放掉elem:

/*
 * nf_tables_set_elem_destroy - final release of a set element (quoted from
 * Linux v5.15).
 *
 * If the element has an NFT_SET_EXT_EXPRESSIONS extension, its expressions
 * are destroyed first; the element memory is then freed with kfree().
 *
 * @ctx:  context passed through to nft_set_elem_expr_destroy()
 * @set:  set used to locate the element's extension area
 * @elem: the element to free
 */
static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
const struct nft_set *set, void *elem)
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);

if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));

kfree(elem);
}

总结

nfnetlink_rcv_batch:
begin...
nc->call ==> nf_tables_delsetelem
nft_del_setelem
//根据传入的参数先构造一个elem
//然后通过pipapo_get找到真的elem,并释放掉原来构造的elem
//set->ndeact++
end...
ss->commit ==> nf_tables_commit
//......
case NFT_MSG_DELSETELEM:
nft_setelem_remove
pipapo_remove //修改pipapo_match的各field的lt和mt,作用是将elem从set中清除
atomic_dec(&te->set->nelems);
te->set->ndeact--;
nf_tables_commit_release(net);
schedule_work(&trans_destroy_work);
//......

//......
trans_destroy_work
list_for_each_entry_safe(trans, next, &head, list) {
list_del(&trans->list);
nft_commit_release(trans);
case NFT_MSG_DELSETELEM:
nf_tables_set_elem_destroy
kfree(elem)

}

参考

https://196082.github.io/2024/09/03/nftables-CVEs1/

https://zhuanlan.zhihu.com/p/363279693


文章作者: q1ming
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 q1ming !
  目录