Linux Kernel TLS Part 1
- Part1: Linux Kernel TLS Part 1
- Part2: Linux Kernel TLS Part 2
Last week, I prepared an introduction to Linux kernel TLS for my study group. You can check out the slides for a general overview.
In this post, I will provide more details than the slides, offering a deep dive into how the Linux kernel’s TLS implementation works and highlighting some previously exploited vulnerabilities. Enjoy 🙂 !
1. Overview
Similar to other socket types, the Linux kernel’s TLS implementation defines its own packet-handling operations, making it a good entry point for understanding the architecture of the subsystem.
1.1. Initialization
The TLS subsystem registers its ULP ops during initialization.
static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
.name = "tls",
.owner = THIS_MODULE,
.init = tls_init,
.update = tls_update,
.get_info = tls_get_info,
.get_info_size = tls_get_info_size,
};
static int __init tls_register(void)
{
int err;
err = register_pernet_subsys(&tls_proc_ops);
// [...]
tcp_register_ulp(&tcp_tls_ulp_ops); // <-----------
// [...]
}
A TCP socket can set its ULP via SYS_setsockopt. If the given ULP name is “tls”, __tcp_ulp_find_autoload() will return &tcp_tls_ulp_ops as the ULP ops.
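For reference, attaching the ULP from userspace looks roughly like the following. This is a minimal sketch, assuming an already-connected TCP socket; TCP_ULP is 31 in include/uapi/linux/tcp.h in case older libc headers do not define it.
#include <sys/socket.h>
#include <netinet/tcp.h>

#ifndef TCP_ULP
#define TCP_ULP 31 /* include/uapi/linux/tcp.h */
#endif

/* Attach the "tls" ULP to a connected TCP socket. This ends up in
 * do_tcp_setsockopt() -> tcp_set_ulp() -> tls_init(), shown below. */
static int attach_tls_ulp(int sockfd)
{
    return setsockopt(sockfd, SOL_TCP, TCP_ULP, "tls", sizeof("tls"));
}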
int do_tcp_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
switch (optname) {
case TCP_ULP:
// [...]
sockopt_lock_sock(sk);
err = tcp_set_ulp(sk, name);
sockopt_release_sock(sk);
// [...]
}
// [...]
}
int tcp_set_ulp(struct sock *sk, const char *name)
{
const struct tcp_ulp_ops *ulp_ops;
ulp_ops = __tcp_ulp_find_autoload(name); // &tcp_tls_ulp_ops
return __tcp_set_ulp(sk, ulp_ops);
}
The __tcp_set_ulp() function then calls the TLS init handler to set the socket ULP [1].
static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int err;
// [...]
err = ulp_ops->init(sk); // [1]
icsk->icsk_ulp_ops = ulp_ops;
return 0;
}
The TLS init handler updates the sock's proto and creates a TLS context (tls_context) object.
static int tls_init(struct sock *sk)
{
struct tls_context *ctx;
// [...]
ctx = tls_ctx_create(sk);
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
update_sk_prot(sk, ctx);
// [...]
}
After initialization, the TLS socket object is as follows:
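Roughly, the state after tls_init() can be sketched like this (a simplified view inferred from the snippets above; tls_prots is assumed to be the kernel's per-configuration proto table):
/*
 * sk->sk_prot          = &tls_prots[..][TLS_BASE][TLS_BASE]   (via update_sk_prot)
 * icsk->icsk_ulp_ops   = &tcp_tls_ulp_ops
 * icsk->icsk_ulp_data  = ctx                                  (struct tls_context)
 * ctx->tx_conf         = TLS_BASE
 * ctx->rx_conf         = TLS_BASE
 * ctx->sk_proto        = original TCP proto, restored on close
 */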
1.2. Set TX/RX
Once a TLS socket is created, we can further call SYS_setsockopt to set its TLS_TX and TLS_RX options, which are the crypto configurations for transmission and reception.
static int do_tls_setsockopt(struct sock *sk, int optname,
sockptr_t optval, unsigned int optlen)
{
int rc = 0;
switch (optname) {
case TLS_TX:
case TLS_RX:
lock_sock(sk);
rc = do_tls_setsockopt_conf(sk, optval, optlen, optname == TLS_TX);
release_sock(sk);
break;
}
// [...]
}
We focus solely on the TLS_TX setting here because TLS_RX follows a similar process.
The do_tls_setsockopt_conf() function first attempts to offload TLS to hardware [1]. If that fails, it falls back to software TLS [2]. Most network devices do not support TLS offload, so TLS is usually handled in software. Finally, the function updates the tx conf [3] and sets the corresponding protocol ops [4].
static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
unsigned int optlen, int tx)
{
struct tls_context *ctx = tls_get_ctx(sk);
// [...]
if (tx) {
rc = tls_set_device_offload(sk, ctx); // [1]
conf = TLS_HW;
if (!rc) {
// [...]
} else {
rc = tls_set_sw_offload(sk, ctx, 1); // [2]
// [...]
conf = TLS_SW;
}
}
// [...]
ctx->tx_conf = conf; // [3]
update_sk_prot(sk, ctx); // [4]
// [...]
}
There are many cipher types for TLS; you can refer to the &tls_cipher_desc[] array for more details. Example code for configuring the AES_GCM_128 cipher is as follows:
#include <linux/tls.h>
#include <sys/socket.h>
#include <string.h>

struct tls12_crypto_info_aes_gcm_128 crypto_info;
memset(&crypto_info, 0, sizeof(crypto_info));
crypto_info.info.version = TLS_1_2_VERSION;
crypto_info.info.cipher_type = TLS_CIPHER_AES_GCM_128;
memcpy(crypto_info.key, "0123456789ABCDEF", TLS_CIPHER_AES_GCM_128_KEY_SIZE); // 16
memcpy(crypto_info.iv, "12345678", TLS_CIPHER_AES_GCM_128_IV_SIZE); // 8
memcpy(crypto_info.salt, "SALT", TLS_CIPHER_AES_GCM_128_SALT_SIZE); // 4
setsockopt(sockfd, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
setsockopt(sockfd, SOL_TLS, TLS_RX, &crypto_info, sizeof(crypto_info));
The tls_set_sw_offload() function initializes software TLS. First, it calls init_ctx_tx() to create a tls_sw_context_tx object [5]. After that, it initializes the TLS protocol information [6] and the cipher context [7] with the provided crypto parameters, such as the IV and salt.
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
const struct tls_cipher_desc *cipher_desc;
struct tls_sw_context_tx *sw_ctx_tx = NULL;
struct tls_prot_info *prot = &tls_ctx->prot_info;
struct cipher_context *cctx;
ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); // [5]
cctx = &ctx->tx;
// [6]
prot->version = crypto_info->version;
prot->cipher_type = crypto_info->cipher_type;
// [...]
// [7]
cctx->iv = kmalloc(cipher_desc->iv + cipher_desc->salt, GFP_KERNEL);
memcpy(cctx->iv, salt, cipher_desc->salt);
memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv);
cctx->rec_seq = kmemdup(rec_seq, cipher_desc->rec_seq, GFP_KERNEL);
if (!*aead) {
*aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0);
// [...]
}
// [...]
}
After software TLS is set up, the struct relationships look like this:
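As a rough, hedged text sketch of those relationships (field names taken from the snippets above; simplified and not exhaustive):
/*
 * icsk->icsk_ulp_data ---> struct tls_context
 *     .tx_conf / .rx_conf = TLS_SW
 *     .prot_info            (version, cipher_type, ...)
 *     .tx / .rx             (struct cipher_context: iv + salt, rec_seq)
 *     .priv_ctx_tx --------> struct tls_sw_context_tx (aead_send, tx_list, tx_work, ...)
 *     .priv_ctx_rx --------> struct tls_sw_context_rx (aead_recv, rx_list, ...)
 */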
The corresponding protocol handlers are defined in the build_protos() function. I have expanded some of the assignment operations to make them more straightforward.
static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
const struct proto *base)
{
// [...]
prot[TLS_SW][TLS_SW].setsockopt = tls_setsockopt;
prot[TLS_SW][TLS_SW].getsockopt = tls_getsockopt;
prot[TLS_SW][TLS_SW].sendmsg = tls_sw_sendmsg;
prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
prot[TLS_SW][TLS_SW].splice_eof = tls_sw_splice_eof;
prot[TLS_SW][TLS_SW].sock_is_readable = tls_sw_sock_is_readable;
prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
// [...]
}
1.3. Sendmsg
The send handler of TLS sockets is tls_sw_sendmsg(). This function first acquires two locks: the TLS transmission lock [1] and the socket lock [2].
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
int ret;
// [...]
ret = mutex_lock_interruptible(&tls_ctx->tx_lock); // [1]
lock_sock(sk); // [2]
ret = tls_sw_sendmsg_locked(sk, msg, size);
release_sock(sk);
mutex_unlock(&tls_ctx->tx_lock);
return ret;
}
When sending packets, the kernel maintains two sk_msg objects (used to store packet data): one for the plaintext packet and another for the encrypted packet. The tls_sw_sendmsg_locked() function first allocates the sk_msg used to store the encrypted packet [3] and copies data into the plaintext sk_msg [4]. It then calls bpf_exec_tx_verdict() [5] to push records and attempt transmission. A “record” can be considered equivalent to a TLS packet.
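From userspace, nothing special is needed at this point: once TLS_TX is configured, an ordinary send() enters tls_sw_sendmsg() and each call is emitted as encrypted TLS record(s) of type application data (23). A minimal sketch, assuming sockfd is the TLS socket configured earlier:
char buf[4096] = "hello over kTLS";
/* tls_sw_sendmsg() copies this into the plaintext sk_msg, encrypts it into
 * the encrypted sk_msg, and pushes the resulting record over TCP. */
ssize_t n = send(sockfd, buf, sizeof(buf), 0);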
static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
size_t size)
{
// [...]
while (msg_data_left(msg)) {
// [...]
ret = tls_alloc_encrypted_msg(sk, required_size); // [3]
if (/* ... */) {
// [...]
msg_pl = &rec->msg_plaintext;
msg_en = &rec->msg_encrypted;
// [...]
ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, // [4]
msg_pl, try_to_copy);
// [...]
ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, // [5]
record_type, &copied,
msg->msg_flags);
// [...]
continue;
}
}
// [...]
}
The tls_push_record() function is called internally. It begins by writing the TLS record header into the data page of the encrypted packet [6]. Next, it invokes the tls_do_encryption() function [7] to encrypt the packet and then calls the tls_tx_records() function [8] to transmit the record.
static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
bool full_record, u8 record_type,
ssize_t *copied, int flags)
{
// [...]
err = tls_push_record(sk, flags, record_type); // <-----------
// [...]
}
static int tls_push_record(struct sock *sk, int flags,
unsigned char record_type)
{
// [...]
tls_fill_prepend(tls_ctx, // [6]
page_address(sg_page(&msg_en->sg.data[i])) +
msg_en->sg.data[i].offset,
msg_pl->sg.size + prot->tail_size,
record_type);
// [...]
rc = tls_do_encryption(sk, tls_ctx, ctx, req, // [7]
msg_pl->sg.size + prot->tail_size, i);
// [...]
tls_tx_records(sk, flags); // [8]
}
The tls_tx_records() function iterates over ctx->tx_list, the transmission packet list, and invokes tls_push_sg() for each packet [9]. The tls_push_sg() function then calls tcp_sendmsg_locked() [10] to transmit a TLS packet using the TCP transmission API.
int tls_tx_records(struct sock *sk, int flags)
{
// [...]
list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
if (READ_ONCE(rec->tx_ready)) {
// [...]
msg_en = &rec->msg_encrypted;
rc = tls_push_sg(sk, tls_ctx, // [9]
&msg_en->sg.data[msg_en->sg.curr],
0, tx_flags);
// [...]
} // [...]
}
}
int tls_push_sg(struct sock *sk,
struct tls_context *ctx,
struct scatterlist *sg,
u16 first_offset,
int flags)
{
struct msghdr msg = {
.msg_flags = MSG_SPLICE_PAGES | flags,
};
// [...]
while (1) {
p = sg_page(sg);
bvec_set_page(&bvec, p, size, offset);
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
ret = tcp_sendmsg_locked(sk, &msg, size); // [10]
// [...]
}
}
1.4. Recvmsg
The receive handler of TLS sockets is tls_sw_recvmsg(). This function first calls process_rx_list() to process pending decrypted records [1] and then calls sock_rcvlowat() to determine how much data to wait for. Since packets can be decrypted asynchronously, the function decides whether to handle a packet asynchronously based on the packet type and the capability of the rx ctx [2].
Afterward, the tls_rx_one_record() function [3] is called to receive the packet. If it is processed asynchronously, the packet is also enqueued on ctx->rx_list [4]. During the next call to tls_sw_recvmsg(), process_rx_list() will handle those enqueued packets.
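On the userspace side, the matching receive path can be sketched as follows (an assumption-laden sketch: sockfd already has TLS_RX configured, and the control-message part only matters for non-data records):
#include <linux/tls.h>
#include <sys/socket.h>

char buf[16384];
char cbuf[CMSG_SPACE(sizeof(unsigned char))];
struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
struct msghdr msg = {
    .msg_iov = &iov, .msg_iovlen = 1,
    .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
};

/* tls_sw_recvmsg() returns decrypted plaintext; for non-data records it
 * reports the record type through a SOL_TLS / TLS_GET_RECORD_TYPE cmsg. */
ssize_t n = recvmsg(sockfd, &msg, 0);
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
if (cmsg && cmsg->cmsg_level == SOL_TLS &&
    cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
    unsigned char record_type = *CMSG_DATA(cmsg);
    /* e.g. an alert or handshake record; plain data arrives without a cmsg */
    (void)record_type;
}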
int tls_sw_recvmsg(struct sock *sk,
struct msghdr *msg,
size_t len,
int flags,
int *addr_len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
// [...]
err = process_rx_list(ctx, msg, &control, 0, len, is_peek, &rx_more); // [1]
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
while (/* ... */) {
// [...]
tlm = tls_msg(tls_strp_msg(ctx));
if (tlm->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled /* true */) // [2]
darg.async = ctx->async_capable;
else
darg.async = false;
err = tls_rx_one_record(sk, msg, &darg); // [3]
async |= darg.async;
// [...]
struct sk_buff *skb = darg.skb;
// [...]
if (async) {
// [...]
__skb_queue_tail(&ctx->rx_list, skb); // [4]
// [...]
continue;
}
}
}
The tls_decrypt_sg() function is called internally. It begins by allocating memory for an AEAD request [5] and then calls the tls_do_decryption() function [6] to prepare and submit the AEAD request.
static int tls_rx_one_record(struct sock *sk, struct msghdr *msg,
struct tls_decrypt_arg *darg)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
// [...]
err = tls_decrypt_sw(sk, tls_ctx, msg, darg); // <-----------
// [...]
}
static int
tls_decrypt_sw(struct sock *sk, struct tls_context *tls_ctx,
struct msghdr *msg, struct tls_decrypt_arg *darg)
{
err = tls_decrypt_sg(sk, &msg->msg_iter, NULL, darg); // <-----------
// [...]
}
static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov,
struct scatterlist *out_sg,
struct tls_decrypt_arg *darg)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct tls_decrypt_ctx *dctx;
struct aead_request *aead_req;
// [...]
aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv);
aead_size = ALIGN(aead_size, __alignof__(*dctx));
mem = kmalloc(aead_size + struct_size(dctx, sg, size_add(n_sgin, n_sgout)), // [5]
sk->sk_allocation);
aead_req = (struct aead_request *)mem;
dctx = (struct tls_decrypt_ctx *)(mem + aead_size);
dctx->sk = sk;
sgin = &dctx->sg[0]; // in data segment
sgout = &dctx->sg[n_sgin]; // out data segment
// [...]
err = tls_do_decryption(sk, sgin, sgout, dctx->iv, // [6]
data_len + prot->tail_size, aead_req, darg);
// [...]
}
The tls_do_decryption() function handles packet decryption differently depending on the value of darg->async. I have reordered the function code to make it easier to follow.
For synchronous handling, this function sets the decryption callback to crypto_req_done(). If the crypto_aead_decrypt() function returns an -EINPROGRESS or -EBUSY error [7], the function waits for the AEAD request to complete.
For asynchronous handling, this function sets the callback to tls_decrypt_done(); if crypto_aead_decrypt() returns -EINPROGRESS it simply returns, whereas on -EBUSY it waits for the pending requests to complete [8]. Additionally, the pending decryption count is updated both before and after submitting the request.
static int tls_do_decryption(struct sock *sk,
struct scatterlist *sgin,
struct scatterlist *sgout,
char *iv_recv,
size_t data_len,
struct aead_request *aead_req,
struct tls_decrypt_arg *darg)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
int ret;
// [...]
// ---------- synchronously ----------
if (!darg->async) {
// [...]
aead_request_set_callback(aead_req,
CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &wait);
ret = crypto_aead_decrypt(aead_req);
if (ret == -EINPROGRESS || ret == -EBUSY) // [7]
ret = crypto_wait_req(ret, &wait);
return ret;
}
// ---------- asynchronously ----------
aead_request_set_callback(aead_req,
CRYPTO_TFM_REQ_MAY_BACKLOG,
tls_decrypt_done, aead_req);
atomic_inc(&ctx->decrypt_pending);
ret = crypto_aead_decrypt(aead_req);
if (ret == -EINPROGRESS)
return 0;
if (ret == -EBUSY) { // [8]
ret = tls_decrypt_async_wait(ctx);
darg->async_done = true;
darg->async = false;
return ret;
}
atomic_dec(&ctx->decrypt_pending);
darg->async = false;
return ret;
}
1.5. Close
The close handler for TLS sockets is tls_sk_proto_close(). It first cancels the TX worker tx_work_handler() [1], which is responsible for transmitting encrypted records.
Next, it acquires the socket lock [2] and frees certain objects referenced by members of the TLS context object [3]. With the callback write lock held, the function restores the original TCP protocol ops [4]. It then releases both the RX context (tls_sw_context_rx) and the TX context (tls_sw_context_tx) [5].
Finally, it releases the tls_context object [6].
static void tls_sk_proto_close(struct sock *sk, long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tls_context *ctx = tls_get_ctx(sk);
bool free_ctx;
tls_sw_cancel_work_tx(ctx); // [1]
lock_sock(sk); // [2]
tls_sk_proto_cleanup(sk, ctx, timeo); // [3]
// [...]
rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
WRITE_ONCE(sk->sk_prot, ctx->sk_proto); // [4]
// [...]
release_sock(sk);
// [5]
tls_sw_free_ctx_tx(ctx);
tls_sw_free_ctx_rx(ctx);
ctx->sk_proto->close(sk, timeout);
// [6]
tls_ctx_free(sk, ctx);
}
2. Cryptography in TLS
2.1. Configuration
There are eight types of ciphers supported by TLS, and their descriptions are defined in the &tls_cipher_desc[] variable.
const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN] = {
TLS_CIPHER_AES_GCM_128, ..., "gcm(aes)"
TLS_CIPHER_AES_GCM_256, ..., "gcm(aes)"
TLS_CIPHER_AES_CCM_128, ..., "ccm(aes)"
TLS_CIPHER_CHACHA20_POLY1305, ..., "rfc7539(chacha20,poly1305)"
TLS_CIPHER_SM4_GCM, ..., "gcm(sm4)"
TLS_CIPHER_SM4_CCM, ..., "ccm(sm4)"
TLS_CIPHER_ARIA_GCM_128, ..., "gcm(aria)"
TLS_CIPHER_ARIA_GCM_256, ..., "gcm(aria)"
};
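For comparison, a hedged example for one of the other entries, TLS_CIPHER_CHACHA20_POLY1305 (key, iv and rec_seq below are assumed user-supplied buffers; the sizes come from include/uapi/linux/tls.h: 32-byte key, 12-byte IV, 8-byte record sequence and no salt):
struct tls12_crypto_info_chacha20_poly1305 crypto_info;

memset(&crypto_info, 0, sizeof(crypto_info));
crypto_info.info.version = TLS_1_2_VERSION;
crypto_info.info.cipher_type = TLS_CIPHER_CHACHA20_POLY1305;
memcpy(crypto_info.key, key, TLS_CIPHER_CHACHA20_POLY1305_KEY_SIZE);             /* 32 */
memcpy(crypto_info.iv, iv, TLS_CIPHER_CHACHA20_POLY1305_IV_SIZE);                /* 12 */
memcpy(crypto_info.rec_seq, rec_seq, TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE); /* 8 */
setsockopt(sockfd, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));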
The get_cipher_desc() function serves as a wrapper to retrieve the description for the corresponding cipher type.
static inline const struct tls_cipher_desc *get_cipher_desc(u16 cipher_type)
{
// [...] bound check
return &tls_cipher_desc[cipher_type - TLS_CIPHER_MIN];
}
When configuring TX/RX, the tls_set_sw_offload() function initializes the crypto metadata based on the description, including the AEAD transform object [1].
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
struct crypto_aead **aead;
// [...]
cipher_desc = get_cipher_desc(crypto_info->cipher_type);
// [...]
iv = crypto_info_iv(crypto_info, cipher_desc);
key = crypto_info_key(crypto_info, cipher_desc);
salt = crypto_info_salt(crypto_info, cipher_desc);
rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc);
// [...]
*aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0); // [1]
// [...]
}
The crypto_alloc_tfm_node() function is called internally. It first gets the algorithm object (crypto_alg) using the provided algorithm name, such as "gcm(aes)" or "ccm(aes)" [2]. It then allocates a transformation object [3] based on the algorithm object.
struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask)
{
return crypto_alloc_tfm(alg_name, &crypto_aead_type, type, mask); // <-----------
}
static inline void *crypto_alloc_tfm(const char *alg_name,
const struct crypto_type *frontend, u32 type, u32 mask)
{
return crypto_alloc_tfm_node(alg_name, frontend, type, mask, NUMA_NO_NODE); // <-----------
}
void *crypto_alloc_tfm_node(const char *alg_name,
const struct crypto_type *frontend, u32 type, u32 mask,
int node)
{
void *tfm;
struct crypto_alg *alg;
// [...]
alg = crypto_find_alg(alg_name, frontend, type, mask); // [2]
// [...]
tfm = crypto_create_tfm_node(alg, frontend, node); // [3]
// [...]
return tfm;
}
The crypto_alg_mod_lookup() function is responsible for locating the targeted algorithm and returning it. First, it calls crypto_larval_lookup() [4] to look up the algorithm in the linked list. If the target algorithm does not exist, it then invokes crypto_probing_notify() to send a CRYPTO_MSG_ALG_REQUEST request to the cryptomgr [5]. Finally, it waits for the probing to complete [6] and frees the larval object [7].
struct crypto_alg *crypto_find_alg(const char *alg_name,
const struct crypto_type *frontend,
u32 type, u32 mask)
{
if (frontend) {
// [...]
// update type & mask
}
return crypto_alg_mod_lookup(alg_name, type, mask); // <-----------
}
struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask)
{
struct crypto_alg *alg;
struct crypto_alg *larval;
larval = crypto_larval_lookup(name, type, mask); // [4]
if (/* ... */ !crypto_is_larval(larval))
return larval;
ok = crypto_probing_notify(CRYPTO_MSG_ALG_REQUEST, larval); // [5]
if (ok == NOTIFY_STOP)
alg = crypto_larval_wait(larval); // [6]
// [...]
crypto_larval_kill(larval); // [7]
// [...]
return alg;
}
2.1.1. Lookup
A larval is essentially a temporary placeholder for a cryptographic algorithm during its initialization phase, representing an “incomplete” or “not fully ready” state. During the lookup operation, if the targeted algorithm doesn’t exist, the newly created larval is linked to &crypto_alg_list [1].
static struct crypto_alg *crypto_larval_lookup(const char *name, u32 type,
u32 mask)
{
struct crypto_alg *alg;
// [...]
alg = crypto_alg_lookup(name, type, mask);
// [...]
else if (!alg)
alg = crypto_larval_add(name, type, mask); // <-----------
return alg;
}
static struct crypto_alg *crypto_larval_add(const char *name, u32 type,
u32 mask)
{
struct crypto_alg *alg;
struct crypto_larval *larval;
larval = crypto_larval_alloc(name, type, mask);
alg = __crypto_alg_lookup(name, type, mask);
if (!alg) {
alg = &larval->alg;
list_add(&alg->cra_list, &crypto_alg_list); // [1]
}
return alg;
}
The actual lookup operation is handled internally by __crypto_alg_lookup(). This function iterates over &crypto_alg_list [2] and compares algorithm names [3]. It returns either an exact match or the fuzzy match with the highest priority.
static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type,
u32 mask)
{
// [...]
alg = __crypto_alg_lookup(name, (type | test) & ~fips, // <-----------
(mask | test) & ~fips);
// [...]
}
static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type,
u32 mask)
{
struct crypto_alg *q, *alg = NULL;
int best = -2;
list_for_each_entry(q, &crypto_alg_list, cra_list) { // [2]
exact = !strcmp(q->cra_driver_name, name);
fuzzy = !strcmp(q->cra_name, name);
// [...]
if (!exact && !(fuzzy && q->cra_priority > best)) // [3]
continue;
best = q->cra_priority;
alg = q;
if (exact)
break;
}
return alg;
}
The following algorithms are registered during the boot process:
- static struct akcipher_alg rsa (rsa)
- static struct crypto_alg null_algs[] (cipher_null, compress_null, digest_null)
- static struct skcipher_alg skcipher_null (ecb(cipher_null))
- static struct shash_alg alg (md4)
- static struct shash_alg alg (md5)
- static struct shash_alg alg (sha1)
- static struct shash_alg alg (sha256)
- static struct shash_alg alg (sha224)
- static struct shash_alg sha512_algs[2] (sha512, sha384)
- static struct shash_alg algs[] (sha3-224, sha3-256, sha3-384, sha3-512)
- static struct crypto_alg des_algs[2] (des, des3_ede)
- static struct crypto_alg aes_alg (aes)
- static struct skcipher_alg arc4_alg (ecb(arc4))
- static struct crypto_alg alg (deflate)
- static struct scomp_alg scomp[] (deflate, zlib-deflate)
- static struct shash_alg alg (michael_mic)
- static struct shash_alg alg (crc32c)
- static struct shash_alg alg (crct10dif)
- static struct crypto_alg alg (lzo)
- static struct scomp_alg scomp (lzo)
- static struct crypto_alg alg (lzo-rle)
- static struct scomp_alg scomp (lzo-rle)
- static struct crypto_alg alg_lz4 (lz4)
- static struct scomp_alg scomp (lz4)
- static struct rng_alg rng_algs[] (stdrng)
- static struct shash_alg ghash_alg (ghash)
- …
2.1.2. Probe
To probe the targeted algorithm, the kernel dispatches a job to a kthread, whose handler is the cryptomgr_probe() function [1].
int crypto_probing_notify(unsigned long val, void *v)
{
int ok;
ok = blocking_notifier_call_chain(&crypto_chain, val, v); // <-----------
// [...]
}
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
unsigned long val, void *v)
{
// [...]
ret = notifier_call_chain(&nh->head, val, v, -1, NULL); // <-----------
// [...]
return ret;
}
static int notifier_call_chain(struct notifier_block **nl,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
while (nb && nr_to_call) {
ret = nb->notifier_call(nb, val, v); // <----------- cryptomgr_notify()
// [...]
if (ret & NOTIFY_STOP_MASK)
break;
}
return ret;
}
static int cryptomgr_notify(struct notifier_block *this, unsigned long msg,
void *data)
{
switch (msg) {
case CRYPTO_MSG_ALG_REQUEST:
return cryptomgr_schedule_probe(data); // <-----------
// [...]
}
}
static int cryptomgr_schedule_probe(struct crypto_larval *larval)
{
const char *name = larval->alg.cra_name;
// [...]
param = kzalloc(sizeof(*param), GFP_KERNEL);
memcpy(param->template, name, len);
// [...]
param->larval = larval;
thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe"); // [1]
// [...]
return NOTIFY_STOP;
}
The cryptomgr_probe() function first looks up the corresponding crypto template by name, which is derived from the targeted algorithm name. For instance, if the algorithm name is "gcm(aes)", the template name is the substring before the opening parenthesis, in this case "gcm" (see the short sketch after the function below).
Next, it calls the create handler of the found crypto template to generate a new algorithm object.
static int cryptomgr_probe(void *data)
{
struct cryptomgr_param *param = data;
struct crypto_template *tmpl;
tmpl = crypto_lookup_template(param->template);
tmpl->create(tmpl, param->tb);
// [...]
}
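To make the name-splitting step concrete, here is a purely illustrative userspace sketch (not kernel code; the kernel performs the equivalent parsing when it prepares the probe parameters):
#include <stdio.h>
#include <string.h>

/* Split an algorithm name such as "gcm(aes)" into a template name and the
 * parameter inside the outermost parentheses. */
static void split_alg_name(const char *name)
{
    const char *lparen = strchr(name, '(');
    const char *rparen = strrchr(name, ')');

    if (!lparen || !rparen || rparen < lparen) {
        printf("plain algorithm: %s\n", name);                             /* e.g. "aes" */
        return;
    }
    printf("template : %.*s\n", (int)(lparen - name), name);               /* "gcm" */
    printf("parameter: %.*s\n", (int)(rparen - lparen - 1), lparen + 1);   /* "aes" */
}

/* split_alg_name("gcm(aes)")         -> template "gcm",    parameter "aes"
 * split_alg_name("cryptd(gcm(aes))") -> template "cryptd", parameter "gcm(aes)" */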
The crypto template objects are linked into &crypto_template_list [2]. A subsystem can register its templates during the boot stage by calling crypto_register_template() [3].
static struct crypto_template *__crypto_lookup_template(const char *name)
{
struct crypto_template *q, *tmpl = NULL;
// [...]
list_for_each_entry(q, &crypto_template_list, list) { // [2]
if (strcmp(q->name, name))
continue;
}
// [...]
}
int crypto_register_template(struct crypto_template *tmpl)
{
// [...]
list_add(&tmpl->list, &crypto_template_list); // [3]
// [...]
}
In the kernelCTF environment, the following templates are registered by default:
- seqiv_tmpl (seqiv)
- echainiv_tmpl (echainiv)
- rsa_pkcs1pad_tmpl (pkcs1pad)
- crypto_cmac_tmpl (cmac)
- hmac_tmpl (hmac)
- crypto_ecb_tmpl (ecb)
- crypto_cbc_tmpl (cbc)
- crypto_cts_tmpl (cts)
- lrw_tmpl (lrw)
- xts_tmpl (xts)
- crypto_ctr_tmpls (ctr, rfc3686)
- crypto_gcm_tmpls (gcm_base, gcm, rfc4106, rfc4543)
- crypto_ccm_tmpls (cbcmac, ccm_base, ccm, rfc4309)
- cryptd_tmpl (cryptd)
- crypto_authenc_tmpl (authenc)
- crypto_authenc_esn_tmpl (authencesn)
- essiv_tmpl (essiv)
We take "gcm"
template as example. Its create handler is crypto_gcm_create()
function. First, this function gets cipher name [4], which is the "aes"
substring in the bracket. Then, it calls crypto_gcm_create_common()
to do creation operation [5].
static struct crypto_template crypto_gcm_tmpls[] = {
/* ... */ {
.name = "gcm",
.create = crypto_gcm_create,
.module = THIS_MODULE,
},
// [...]
};
static int crypto_gcm_create(struct crypto_template *tmpl, struct rtattr **tb)
{
const char *cipher_name;
char ctr_name[CRYPTO_MAX_ALG_NAME];
cipher_name = crypto_attr_alg_name(tb[1]); // [4]
snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)", cipher_name);
return crypto_gcm_create_common(tmpl, tb, ctr_name, "ghash"); // [5]
}
The crypto_gcm_create_common() function allocates an AEAD instance object and initializes it through various complex operations. The desired crypto algorithm object is a member of this AEAD instance object and is initialized by the aead_prepare_alg() function [6].
static int crypto_gcm_create_common(struct crypto_template *tmpl,
struct rtattr **tb,
const char *ctr_name,
const char *ghash_name)
{
struct aead_instance *inst;
inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
// [...]
// initialization
err = aead_register_instance(tmpl, inst); // <-----------
}
int aead_register_instance(struct crypto_template *tmpl,
struct aead_instance *inst)
{
int err;
err = aead_prepare_alg(&inst->alg); // [6]
return crypto_register_instance(tmpl, aead_crypto_instance(inst)); // <-----------
}
Finally, the crypto_register_instance() function is called, which links the algorithm into &crypto_alg_list [7] and links the instance object to the corresponding crypto template [8]. Additionally, the crypto_alg_finish_registration() function is called to find larvals with the same algorithm name [9] and update their adult field to the registered algorithm object [10].
int crypto_register_instance(struct crypto_template *tmpl,
struct crypto_instance *inst)
{
// [...]
larval = __crypto_register_alg(&inst->alg, &algs_to_put); // <-----------
hlist_add_head(&inst->list, &tmpl->instances); // [8]
inst->tmpl = tmpl;
// [...]
return 0;
}
static struct crypto_larval *
__crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put)
{
// [...]
list_add(&alg->cra_list, &crypto_alg_list); // [7]
// [...]
crypto_alg_finish_registration(alg, true, algs_to_put);
// [...]
}
static void crypto_alg_finish_registration(struct crypto_alg *alg,
bool fulfill_requests,
struct list_head *algs_to_put)
{
list_for_each_entry(q, &crypto_alg_list, cra_list) {
// [...]
if (crypto_is_larval(q)) {
struct crypto_larval *larval = (void *)q;
// [...]
if (strcmp(alg->cra_name, q->cra_name) && // [9]
strcmp(alg->cra_driver_name, q->cra_name))
continue;
// [...]
larval->adult = alg; // [10]
// [...]
}
}
// [...]
}
The hierarchy of algorithm, template, instance and spawn is shown below:
The simplified illustration of the relationship between each structure is as follows:
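In text form, a rough sketch of those relationships (simplified from the registration code above) is:
/*
 * struct crypto_template ("gcm", on &crypto_template_list)
 *     .instances ---> struct aead_instance / crypto_instance ("gcm(aes)")
 *                         .tmpl ---> back-pointer to the template
 *                         .alg  ---> embedded struct crypto_alg, linked into
 *                                    &crypto_alg_list by __crypto_register_alg()
 *                         spawns --> the algorithms it is built from
 *                                    (e.g. "ctr(aes)" and "ghash")
 *
 * struct crypto_larval ("gcm(aes)", temporary entry on &crypto_alg_list)
 *     .adult ---> the registered crypto_alg above
 */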
2.1.3. Wait
After dispatching the probing job to the kthread, the kernel calls crypto_larval_wait() to wait for the probing process to complete. Upon completion, larval->adult will point to the newly registered algorithm object [1].
static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg)
{
struct crypto_larval *larval = (void *)alg;
// [...]
timeout = wait_for_completion_killable_timeout(
&larval->completion, 60 * HZ);
alg = larval->adult; // [1]
// [...]
return alg;
}
Upon returning to the caller, the crypto_larval_kill() function [2] is called to unlink [3] and release this temporary algorithm object [4].
struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask)
{
// [...]
alg = crypto_larval_wait(larval);
crypto_larval_kill(larval); // [2]
return alg;
}
void crypto_larval_kill(struct crypto_alg *alg)
{
struct crypto_larval *larval = (void *)alg;
// [...]
list_del(&alg->cra_list); // [3]
// [...]
crypto_alg_put(alg); // [4]
}
2.2. Encryption
When configuring TX, the tls_set_sw_offload() function calls init_ctx_tx() to create a tls_sw_context_tx object.
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
// [...]
if (tx) {
ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); // <-----------
// [...]
} else {
// [...]
}
// [...]
}
This object contains several interesting fields. For example, it includes a worker object, sw_ctx_tx->tx_work [1], whose handler is set to the tx_work_handler() function.
static struct tls_sw_context_tx *init_ctx_tx(struct tls_context *ctx, struct sock *sk)
{
struct tls_sw_context_tx *sw_ctx_tx;
sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
crypto_init_wait(&sw_ctx_tx->async_wait);
atomic_set(&sw_ctx_tx->encrypt_pending, 1);
INIT_LIST_HEAD(&sw_ctx_tx->tx_list);
INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler); // [1]
sw_ctx_tx->tx_work.sk = sk;
return sw_ctx_tx;
}
During packet transmission, the tls_do_encryption() function is invoked internally by tls_sw_sendmsg(). This function first specifies tls_encrypt_done() as the encryption callback [2], which runs after the encryption process is finished. It then adds the current record to tx_list [3], updates the pending count [4], and calls the crypto_aead_encrypt() API [5] exposed by the crypto subsystem.
Two specific return values require special handling: -EBUSY and -EINPROGRESS. The -EINPROGRESS value indicates that the encryption job is currently in progress, and the user must wait for its completion. Conversely, the -EBUSY value signifies that the request cannot be processed at the moment.
static int tls_do_encryption(struct sock *sk,
struct tls_context *tls_ctx,
struct tls_sw_context_tx *ctx,
struct aead_request *aead_req,
size_t data_len, u32 start)
{
struct tls_rec *rec = ctx->open_rec;
struct sk_msg *msg_en = &rec->msg_encrypted;
struct scatterlist *sge = sk_msg_elem(msg_en, start);
// [...]
// do encryption
aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
tls_encrypt_done, rec); // [2]
list_add_tail((struct list_head *)&rec->list, &ctx->tx_list); // [3]
atomic_inc(&ctx->encrypt_pending); // [4]
rc = crypto_aead_encrypt(aead_req); // [5]
if (rc == -EBUSY) {
rc = tls_encrypt_async_wait(ctx);
rc = rc ?: -EINPROGRESS;
}
if (!rc || rc != -EINPROGRESS) {
atomic_dec(&ctx->encrypt_pending);
// [...]
}
if (!rc) {
WRITE_ONCE(rec->tx_ready, true);
} else if (rc != -EINPROGRESS) {
list_del(&rec->list);
return rc;
}
ctx->open_rec = NULL;
// [...]
return rc;
}
The crypto_gcm_encrypt() function serves as the handler for GCM encryption. It first invokes crypto_skcipher_encrypt() with the callback function gcm_encrypt_done() to perform symmetric encryption [6]. Then, it calls gcm_encrypt_continue() to generate an authentication tag [7].
int crypto_aead_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct aead_alg *alg = crypto_aead_alg(aead);
// [...]
ret = alg->encrypt(req); // &crypto_gcm_encrypt()
// [...]
}
static int crypto_gcm_encrypt(struct aead_request *req)
{
struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
struct skcipher_request *skreq = &pctx->u.skreq;
u32 flags = aead_request_flags(req);
// [...]
skcipher_request_set_callback(skreq, flags, gcm_encrypt_done, req);
return crypto_skcipher_encrypt(skreq) ?: // [6]
gcm_encrypt_continue(req, flags); // [7]
}
Due to the complexity of the operation, I only provide an illustration of the execution flow here.
If this request is dispatched to a background worker, the callback function (in this case, tls_encrypt_done()) is executed within the crypto_request_complete() function upon completion of the request.
static inline void aead_request_complete(struct aead_request *req, int err)
{
crypto_request_complete(&req->base, err); // <-----------
}
static inline void crypto_request_complete(struct crypto_async_request *req,
int err)
{
req->complete(req->data, err);
}
2.3. Decryption
When configuring RX, the tls_set_sw_offload() function calls init_ctx_rx() to create a tls_sw_context_rx object.
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
// [...]
if (tx) {
// [...]
} else {
ctx->priv_ctx_rx = init_ctx_rx(ctx); // <-----------
// [...]
}
// [...]
}
static struct tls_sw_context_rx *init_ctx_rx(struct tls_context *ctx)
{
struct tls_sw_context_rx *sw_ctx_rx;
sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
crypto_init_wait(&sw_ctx_rx->async_wait);
atomic_set(&sw_ctx_rx->decrypt_pending, 1);
init_waitqueue_head(&sw_ctx_rx->wq);
skb_queue_head_init(&sw_ctx_rx->rx_list);
skb_queue_head_init(&sw_ctx_rx->async_hold);
return sw_ctx_rx;
}
The tls_do_decryption() function is called internally during the process of receiving packets, as introduced in Section “1.4. Recvmsg.” Within tls_do_decryption(), the crypto_aead_decrypt() API from the crypto subsystem is invoked to decrypt TLS packets.
int crypto_aead_decrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct aead_alg *alg = crypto_aead_alg(aead);
// [...]
ret = alg->decrypt(req); // &crypto_gcm_decrypt
// [...]
}
The execution flow of decryption is similar to that of encryption but is simpler.
static int crypto_gcm_decrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct crypto_gcm_req_priv_ctx *pctx = crypto_gcm_reqctx(req);
struct crypto_gcm_ghash_ctx *gctx = &pctx->ghash_ctx;
// [...]
gctx->complete = gcm_dec_hash_continue;
return gcm_hash(req, flags);
}
2.4. Asynchronous
2.4.1. Transmit
For transmission, the kernel determines whether the encryption operation is handled asynchronously based on the return value of the tls_do_encryption() function. If the operation is asynchronous, it sets ctx->async_capable to 1 [1].
More specifically, each algorithm decides whether to dispatch requests to a kthread [2]. When it does, the return value will be -EINPROGRESS.
static int tls_push_record(struct sock *sk, int flags,
unsigned char record_type)
{
// [...]
rc = tls_do_encryption(sk, tls_ctx, ctx, req, // <-----------
msg_pl->sg.size + prot->tail_size, i);
if (rc < 0) {
// [...]
ctx->async_capable = 1; // [1]
return rc;
}
// [...]
}
static int tls_do_encryption(struct sock *sk,
struct tls_context *tls_ctx,
struct tls_sw_context_tx *ctx,
struct aead_request *aead_req,
size_t data_len, u32 start)
{
rc = crypto_aead_encrypt(aead_req); // <-----------
// [...]
return rc;
}
int crypto_aead_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct aead_alg *alg = crypto_aead_alg(aead);
// [...]
ret = alg->encrypt(req); // [2]
return crypto_aead_errstat(istat, ret);
}
2.4.2. Receive
For receiving, the tls_do_decryption() function processes packets differently if the parameter darg->async is set to 1 [1]. For example, it assigns the tls_decrypt_done() function as the decryption callback [2].
The value of darg->async is determined in the tls_sw_recvmsg() function. If the control field in the TLS message is TLS_RECORD_TYPE_DATA [3], the decryption argument darg.async is set to the asynchronous capability of the receive context [4].
int tls_sw_recvmsg(struct sock *sk,
struct msghdr *msg,
size_t len,
int flags,
int *addr_len)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct tls_msg *tlm;
// [...]
while (/* ... */) {
struct tls_decrypt_arg darg;
// [...]
err = tls_rx_rec_wait(sk, psock, flags & MSG_DONTWAIT,
released);
// [...]
tlm = tls_msg(tls_strp_msg(ctx)); // ctx->strp.anchor
// [...]
if (tlm->control == TLS_RECORD_TYPE_DATA /* 23 */ && /* ... true */) // [3]
darg.async = ctx->async_capable; // [4]
else
darg.async = false;
err = tls_rx_one_record(sk, msg, &darg); // <-----------
// [...]
}
}
static int tls_rx_one_record(struct sock *sk, struct msghdr *msg,
struct tls_decrypt_arg *darg)
{
// [...]
err = tls_decrypt_sw(sk, tls_ctx, msg, darg); // <-----------
if (err < 0)
return err;
// [...]
}
static int
tls_decrypt_sw(struct sock *sk, struct tls_context *tls_ctx,
struct msghdr *msg, struct tls_decrypt_arg *darg)
{
err = tls_decrypt_sg(sk, &msg->msg_iter, NULL, darg); // <-----------
// [...]
}
static int tls_decrypt_sg(struct sock *sk, /* ... */
struct tls_decrypt_arg *darg)
{
// [...]
err = tls_do_decryption(sk, sgin, sgout, dctx->iv, // <-----------
data_len + prot->tail_size, aead_req, darg);
// [...]
}
static int tls_do_decryption(struct sock *sk,
// [...]
struct aead_request *aead_req,
struct tls_decrypt_arg *darg)
{
// [...]
if (darg->async) { // [1]
aead_request_set_callback(aead_req,
CRYPTO_TFM_REQ_MAY_BACKLOG,
tls_decrypt_done, aead_req); // [2]
atomic_inc(&ctx->decrypt_pending);
}
ret = crypto_aead_decrypt(aead_req);
if (ret == -EINPROGRESS)
return 0;
// [...]
}
The control field is set within the tls_rx_rec_wait() function, which determines its value based on the strp (tls_strparser) object. The strp object is configured during reception. The execution flow is as follows:
Softirq
=> tcp_v4_rcv()
==> tcp_v4_do_rcv()
===> tcp_rcv_established()
====> tcp_data_queue()
=====> tls_data_ready() (sk->sk_data_ready())
======> tls_strp_check_rcv()
=======> tls_strp_read_sock() - Verifies the packet size and configures the tls_strparser object.
sys_recvmsg
=> tls_sw_recvmsg()
==> tls_rx_rec_wait()
===> tls_strp_msg_load() - Configures the TLS message (tlm) using the tls_strparser (strp) object.
The async capability is configured during RX setup. While the TLS version is user-controllable [5], only algorithms with the CRYPTO_ALG_ASYNC flag support asynchronous operations [6].
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
struct tls_sw_context_rx *sw_ctx_rx = NULL;
struct crypto_tfm *tfm;
// [...]
else {
sw_ctx_rx = ctx->priv_ctx_rx;
}
// [...]
if (sw_ctx_rx) {
tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); // &tfm->base
// [...]
sw_ctx_rx->async_capable =
crypto_info->version != TLS_1_3_VERSION && // [5]
!!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC); // [6]
}
// [...]
}
2.4.3. Recon
While reading /proc/crypto, the kernel iterates over the loaded algorithms and calls the crypto_aead_show() function to indicate whether an algorithm supports asynchronous operation [1]. However, since an algorithm can be dynamically loaded through probing, this information may not be reliable.
static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
{
struct aead_alg *aead = container_of(alg, struct aead_alg, base);
seq_printf(m, "type : aead\n");
seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? // [1]
"yes" : "no");
// [...]
}
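To check this from userspace, a quick sketch like the following (assuming /proc/crypto is readable in the target environment) lists the algorithms that advertise async support:
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/proc/crypto", "r");
    char line[256], name[256] = "";

    if (!f)
        return 1;
    while (fgets(line, sizeof(line), f)) {
        if (sscanf(line, "name : %255s", name) == 1)
            continue;
        /* the "async : yes/no" line printed by crypto_aead_show() and friends */
        if (strstr(line, "async") && strstr(line, "yes"))
            printf("%s\n", name);
    }
    fclose(f);
    return 0;
}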
Unfortunately, after reviewing the source code, I found that this flag is primarily set in vendor-specific drivers, which are disabled in the kernelCTF environment.
However, I unexpectedly discovered that some functions in the cryptd subsystem configure the given algorithm object with the CRYPTO_ALG_ASYNC flag [2].
static int cryptd_create_aead(struct crypto_template *tmpl,
struct rtattr **tb,
struct crypto_attr_type *algt,
struct cryptd_queue *queue)
{
struct aead_instance_ctx *ctx;
struct aead_instance *inst;
// [...]
inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
ctx = aead_instance_ctx(inst);
// [...]
inst->alg.base.cra_flags |= CRYPTO_ALG_ASYNC | // [2]
(alg->base.cra_flags & CRYPTO_ALG_INTERNAL);
// [...]
err = aead_register_instance(tmpl, inst);
// [...]
}
What is the “cryptd” subsystem, and how can we interact with it? We will answer these questions in the next section.
3. Cryptd
3.1. Introduction
If the kernel is compiled with the CONFIG_CRYPTO_CRYPTD option, it will start an asynchronous crypto daemon, also known as cryptd, which converts an arbitrary synchronous crypto algorithm into an asynchronous one that runs in a kernel thread.
The cryptd_tmpl object defines the create handler of the cryptd template, which is the cryptd_create() function [1]. Besides AEAD algorithms, the cryptd_create() function also handles other types of algorithms.
static struct crypto_template cryptd_tmpl = {
.name = "cryptd",
.create = cryptd_create, // [1]
.module = THIS_MODULE,
};
static int cryptd_create(struct crypto_template *tmpl, struct rtattr **tb)
{
struct crypto_attr_type *algt;
algt = crypto_get_attr_type(tb);
switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) {
case CRYPTO_ALG_TYPE_SKCIPHER:
return cryptd_create_skcipher(tmpl, tb, algt, &queue);
case CRYPTO_ALG_TYPE_HASH:
return cryptd_create_hash(tmpl, tb, algt, &queue);
case CRYPTO_ALG_TYPE_AEAD:
return cryptd_create_aead(tmpl, tb, algt, &queue);
}
return -EINVAL;
}
As introduced in the “2.1.2. Probe” section, the cryptomgr handles algorithm probing requests. To load an algorithm with cryptd, we first need to find a way to ask the cryptomgr to load an algorithm with an arbitrary name.
Let’s quickly review the probing flow. First, a crypto API is called with an algorithm name. Then, the cryptomgr splits the algorithm name into a template name and a cipher name. For example, the algorithm name "gcm(aes)" is split into "gcm" as the template name and "aes" as the cipher name.
Following this, a kthread running cryptomgr_probe() is created to locate the template object based on the provided name. It then calls the create handler of the template object to initialize the algorithm object.
Therefore, to create an algorithm using cryptd, the algorithm name should follow the format "cryptd(XXXX)". For instance, if we create an algorithm named "cryptd(gcm(aes))", the cryptd_create_aead() function, which is the create handler of the “cryptd” template, will be called. This function first invokes the crypto_grab_aead() function [2] to spawn an instance of the "gcm(aes)" algorithm. It then registers the newly created instance in the algorithm list [3].
static int cryptd_create_aead(struct crypto_template *tmpl,
struct rtattr **tb,
struct crypto_attr_type *algt,
struct cryptd_queue *queue)
{
// [...]
err = crypto_grab_aead(&ctx->aead_spawn, aead_crypto_instance(inst), // [2]
crypto_attr_alg_name(tb[1]) /* gcm(aes) */, type, mask);
// [...]
err = cryptd_init_instance(aead_crypto_instance(inst), &alg->base);
// [...]
err = aead_register_instance(tmpl, inst); // [3]
// [...]
}
After this process, there will be two "gcm(aes)" algorithm instances with different priorities in the list:
- Created by the “gcm” template, with a priority of 100.
- Created by the “cryptd” template, with a priority of 150 and the CRYPTO_ALG_ASYNC flag set.
The reason the instance created by “cryptd” has a higher priority is that cryptd_init_instance() increases the original priority by 50 [4].
static int cryptd_init_instance(struct crypto_instance *inst,
struct crypto_alg *alg)
{
// [...]
inst->alg.cra_priority = alg->cra_priority + 50; // [4]
// [...]
}
During an algorithm lookup, the __crypto_alg_lookup() function returns the algorithm instance with the highest priority [5].
static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type,
u32 mask)
{
list_for_each_entry(q, &crypto_alg_list, cra_list) {
// [...]
if (!exact && !(fuzzy && q->cra_priority > best)) // [5]
continue;
// [...]
}
}
Now, we know the algorithm name that needs to be used, but we still have no way to call the crypto API with a controllable algorithm name.
3.2. Registration
The algorithm socket (AF_ALG) is an interface to the kernel crypto API. When creating an ALG socket, the alg_create() function [1] is invoked. Only ALG sockets with a protocol value of 0 and a type of SOCK_SEQPACKET are permitted. The socket's ops is set to &alg_proto_ops [2].
static const struct net_proto_family alg_family = {
.family = PF_ALG,
.create = alg_create, // [1]
.owner = THIS_MODULE,
};
static int alg_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
// [..]
sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern);
sock->ops = &alg_proto_ops; // [2]
sock_init_data(sock, sk);
sk->sk_destruct = alg_sock_destruct;
return 0;
}
The bind handler is the alg_bind() function. This function retrieves the algorithm type ops [3] and invokes its bind handler [4].
static const struct proto_ops alg_proto_ops = {
.family = PF_ALG,
.owner = THIS_MODULE,
// [...]
.bind = alg_bind,
// [...]
};
static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
const struct af_alg_type *type;
// [...]
sa->salg_type[sizeof(sa->salg_type) - 1] = 0;
sa->salg_name[addr_len - sizeof(*sa) - 1] = 0;
type = alg_get_type(sa->salg_type); // [3]
private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask); // [4]
}
The alg_get_type() function iterates over the &alg_types linked list [5] and returns the corresponding algorithm type object based on the provided type name.
static const struct af_alg_type *alg_get_type(const char *name)
{
const struct af_alg_type *type = ERR_PTR(-ENOENT);
struct alg_type_list *node;
list_for_each_entry(node, &alg_types, list) { // [5]
if (strcmp(node->type->name, name))
continue;
break;
}
return type;
}
Subsystems can call af_alg_register_type() to register their algorithm type objects into the linked list [6].
int af_alg_register_type(const struct af_alg_type *type)
{
struct alg_type_list *node;
// [...]
node = kmalloc(sizeof(*node), GFP_KERNEL);
// [...]
node->type = type;
list_add(&node->list, &alg_types); // [6]
// [...]
return err;
}
The following three algorithm type objects are registered by default:
- algif_type_hash (hash)
- algif_type_skcipher (skcipher)
- algif_type_aead (aead)
The bind handler for “aead” algorithm is the aead_bind()
function. Surprisely, this function calls crypto_alloc_aead()
, the crypto probing API, using our parameters [7].
static const struct af_alg_type algif_type_aead = {
.bind = aead_bind,
// [...]
.name = "aead",
// [...]
};
static void *aead_bind(const char *name, u32 type, u32 mask)
{
struct aead_tfm *tfm;
struct crypto_aead *aead;
// [...]
tfm = kzalloc(sizeof(*tfm), GFP_KERNEL);
aead = crypto_alloc_aead(name, type, mask); // [7]
// [...]
tfm->aead = aead;
// [...]
return tfm;
}
As a result, we can create an ALG socket with an arbitrary .salg_name, and the bind handler of the “aead” type will invoke the crypto_alloc_aead() function with this name. For example, if we set .salg_name to "cryptd(gcm(aes))", a "gcm(aes)" algorithm instance will be created using the cryptd template.
#include <linux/if_alg.h>
#include <sys/socket.h>
int sock = socket(AF_ALG, SOCK_SEQPACKET, 0);
struct sockaddr_alg sa = {
.salg_family = AF_ALG,
.salg_type = "aead",
.salg_name = "cryptd(gcm(aes))",
};
bind(sock, (struct sockaddr *)&sa, sizeof(sa));
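After this bind() returns, a second "gcm(aes)" entry (created through the cryptd template, with priority 150 and CRYPTO_ALG_ASYNC set) is sitting in &crypto_alg_list, so a later TLS_RX setsockopt that looks up "gcm(aes)" resolves to it, making the receive context async-capable (and the TX path asynchronous as well, since cryptd returns -EINPROGRESS).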
3.3. Encryption
From the initialization performed by the cryptd_create_aead() function, we know that the encryption handler of the cryptd algorithm instance is the cryptd_aead_encrypt_enqueue() function [1].
static int cryptd_create_aead(struct crypto_template *tmpl,
struct rtattr **tb,
struct crypto_attr_type *algt,
struct cryptd_queue *queue)
{
// [...]
inst->alg.encrypt = cryptd_aead_encrypt_enqueue; // [1]
// [...]
}
Encryption requests are enqueued onto the per-CPU queue [1] and then dispatched to the cryptd_wq workqueue [2].
static int cryptd_aead_encrypt_enqueue(struct aead_request *req)
{
return cryptd_aead_enqueue(req, cryptd_aead_encrypt ); // <-----------
}
static int cryptd_aead_enqueue(struct aead_request *req,
crypto_completion_t compl)
{
struct cryptd_aead_request_ctx *rctx = aead_request_ctx(req);
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct cryptd_queue *queue = cryptd_get_queue(crypto_aead_tfm(tfm));
// [...]
// wrap the request
return cryptd_enqueue_request(queue, &req->base); // <-----------
}
static int cryptd_enqueue_request(struct cryptd_queue *queue,
struct crypto_async_request *request)
{
int err;
struct cryptd_cpu_queue *cpu_queue;
// [...]
cpu_queue = this_cpu_ptr(queue->cpu_queue);
err = crypto_enqueue_request(&cpu_queue->queue, request); // [1]
// [...]
queue_work_on(smp_processor_id(), cryptd_wq, &cpu_queue->work); // [2]
// [...]
return err;
}
The handler for the cryptd_wq workqueue is initialized to cryptd_queue_worker() during kernel boot [3].
static int cryptd_init_queue(struct cryptd_queue *queue,
unsigned int max_cpu_qlen /* cryptd_max_cpu_qlen, 1000 */)
{
int cpu;
struct cryptd_cpu_queue *cpu_queue;
queue->cpu_queue = alloc_percpu(struct cryptd_cpu_queue);
// [...]
for_each_possible_cpu(cpu) {
cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu);
crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
INIT_WORK(&cpu_queue->work, cryptd_queue_worker); // [3]
}
// [...]
return 0;
}
The cryptd_queue_worker() function dequeues a request from the CPU queue [5] and calls the complete handler [6].
static void cryptd_queue_worker(struct work_struct *work)
{
struct cryptd_cpu_queue *cpu_queue;
struct crypto_async_request *req, *backlog;
cpu_queue = container_of(work, struct cryptd_cpu_queue, work);
backlog = crypto_get_backlog(&cpu_queue->queue);
req = crypto_dequeue_request(&cpu_queue->queue); // [5]
// [...]
crypto_request_complete(req, 0);
if (cpu_queue->queue.qlen)
queue_work(cryptd_wq, &cpu_queue->work);
}
static inline void crypto_request_complete(struct crypto_async_request *req,
int err)
{
req->complete(req->data, err); // [6] &cryptd_aead_encrypt
}
static void cryptd_aead_encrypt(void *data, int err)
{
struct aead_request *req = data;
struct cryptd_aead_ctx *ctx;
struct crypto_aead *child;
ctx = crypto_aead_ctx(crypto_aead_reqtfm(req));
child = ctx->child;
cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->encrypt,
cryptd_aead_encrypt);
}
The cryptd_aead_crypt() function invokes the actual encryption handler [7], crypto_gcm_encrypt(), to encrypt packets. Afterward, it calls the complete callback registered in the tls_do_encryption() function [8].
static void cryptd_aead_crypt(struct aead_request *req,
struct crypto_aead *child, int err,
int (*crypt)(struct aead_request *req),
crypto_completion_t compl)
{
// [...]
tfm = crypto_aead_reqtfm(req);
// [...]
err = crypt(subreq); // [7] &crypto_gcm_encrypt
// [...]
aead_request_complete(req, err);
// [...]
}
static inline void aead_request_complete(struct aead_request *req, int err)
{
crypto_request_complete(&req->base, err); // [8] &tls_encrypt_done
}
After updating the socket metadata, the tls_encrypt_done() function updates the encryption pending count and wakes up the waiting process [9].
static void tls_encrypt_done(void *data, int err)
{
struct tls_sw_context_tx *ctx;
// [...]
if (err == -EINPROGRESS)
return;
// [...]
if (atomic_dec_and_test(&ctx->encrypt_pending))
complete(&ctx->async_wait.completion); // [9]
// [...]
}
3.4. Decryption
The decryption handler of the cryptd algorithm instance is the cryptd_aead_decrypt_enqueue() function [1], and its execution flow is the same as that of the encryption handler. Notably, encryption requests and decryption requests share the same pending queue.
static int cryptd_create_aead(struct crypto_template *tmpl,
struct rtattr **tb,
struct crypto_attr_type *algt,
struct cryptd_queue *queue)
{
// [...]
inst->alg.decrypt = cryptd_aead_decrypt_enqueue; // [1]
// [...]
}
static int cryptd_aead_decrypt_enqueue(struct aead_request *req)
{
return cryptd_aead_enqueue(req, cryptd_aead_decrypt );
}
4. Vulnerability
4.1. CVE-2024-26583
This analysis is based on the kernel v6.6.17.
4.1.1. Patch
The commit that fixes this vulnerability is “tls: fix race between async notify and socket close”.
This commit removes the spinlock fields from the TX and RX contexts.
@@ -97,9 +97,6 @@ struct tls_sw_context_tx {
// [...]
- spinlock_t encrypt_compl_lock;
- int async_notify;
u8 async_capable:1;
@@ -136,8 +133,6 @@ struct tls_sw_context_rx {
// [...]
- spinlock_t decrypt_compl_lock;
This lock is used during encryption and decryption. For decryption, the callback function tls_decrypt_done() acquires the decryption lock to prevent a race condition.
static void tls_decrypt_done(void *data, int err)
{
// [...]
spin_lock_bh(&ctx->decrypt_compl_lock);
if (!atomic_dec_return(&ctx->decrypt_pending))
complete(&ctx->async_wait.completion);
spin_unlock_bh(&ctx->decrypt_compl_lock);
}
A similar operation can be found in the encryption callback function tls_encrypt_done().
static void tls_encrypt_done(void *data, int err)
{
// [...]
spin_lock_bh(&ctx->encrypt_compl_lock);
pending = atomic_dec_return(&ctx->encrypt_pending);
if (!pending && ctx->async_notify)
complete(&ctx->async_wait.completion);
spin_unlock_bh(&ctx->encrypt_compl_lock);
// [...]
}
After the patch, ctx->decrypt_pending and ctx->encrypt_pending are now initialized to 1.
@@ -2601,7 +2578,7 @@ static struct tls_sw_context_tx *init_ctx_tx(struct tls_context *ctx, struct soc
// [...]
- spin_lock_init(&sw_ctx_tx->encrypt_compl_lock);
+ atomic_set(&sw_ctx_tx->encrypt_pending, 1);
@@ -2622,7 +2599,7 @@ static struct tls_sw_context_rx *init_ctx_rx(struct tls_context *ctx)
// [...]
- spin_lock_init(&sw_ctx_rx->decrypt_compl_lock);
+ atomic_set(&sw_ctx_rx->decrypt_pending, 1);
For decryption:
- tls_decrypt_done(): Decrements the pending count and calls complete() if it reaches zero.
- tls_decrypt_async_wait(): Decrements the pending count and calls crypto_wait_req() if it does not reach zero; afterward, it increments the count again.
- tls_do_decryption(): Increments the pending count if the operation is asynchronous.
For encryption:
- tls_encrypt_done(): Decrements the pending count and calls complete() if it reaches zero.
- tls_encrypt_async_wait(): Decrements the pending count and calls crypto_wait_req() if it does not reach zero; afterward, it increments the count again.
- tls_do_encryption(): Increments the pending count first and decrements it again if an error occurs.
4.1.2. Root Cause
If the tls_sw_recvmsg() function handles TLS records asynchronously, it first dispatches the decryption requests to the kthread. Then, it acquires the decryption completion lock [1], retrieves the decryption pending count [2], and finally waits for the requests to complete [3].
int tls_sw_recvmsg(struct sock *sk,
struct msghdr *msg,
size_t len,
int flags,
int *addr_len)
{
// [...]
if (async) {
int ret, pending;
spin_lock_bh(&ctx->decrypt_compl_lock); // [1]
reinit_completion(&ctx->async_wait.completion);
pending = atomic_read(&ctx->decrypt_pending); // [2]
spin_unlock_bh(&ctx->decrypt_compl_lock);
ret = 0;
if (pending)
ret = crypto_wait_req(-EINPROGRESS, &ctx->async_wait); // <-----------
// [...]
}
// [...]
}
static inline int crypto_wait_req(int err, struct crypto_wait *wait)
{
switch (err) {
case -EINPROGRESS:
// [...]
wait_for_completion(&wait->completion); // [3]
// [...]
break;
}
// [...]
}
The callback function tls_decrypt_done() is invoked after an asynchronous decryption completes. However, as soon as complete() is called, the waiting thread may exit and invoke the socket release handler tls_sk_proto_close(). Once the release handler is invoked, the RX context will be freed.
static void tls_sk_proto_close(struct sock *sk, long timeout)
{
// [...]
if (ctx->rx_conf == TLS_SW)
tls_sw_free_ctx_rx(ctx);
// [...]
}
void tls_sw_free_ctx_rx(struct tls_context *tls_ctx)
{
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
kfree(ctx); // [4]
}
That is, if we can extend the time window between complete() and spin_unlock_bh() in the tls_decrypt_done() function, it becomes possible to trigger the release handler within this window. Consequently, the RX context object (ctx) used by spin_unlock_bh() [5] would already be freed.
static void tls_decrypt_done(void *data, int err)
{
struct tls_sw_context_rx *ctx;
// [...]
ctx = tls_sw_ctx_rx(tls_ctx);
// [...]
spin_lock_bh(&ctx->decrypt_compl_lock);
if (!atomic_dec_return(&ctx->decrypt_pending))
complete(&ctx->async_wait.completion);
// ============== RACE !!!! ==============
spin_unlock_bh(&ctx->decrypt_compl_lock); // [5]
}
The encryption operation is also vulnerable to the same issue.
4.1.3. Exploitation
I did not exploit this bug, but I provide some analysis and potential exploitation paths here.
The decryption callback function, tls_decrypt_done(), appears unexploitable because it only performs an unlock operation.
static void tls_decrypt_done(void *data, int err)
{
// [...]
complete(&ctx->async_wait.completion);
spin_unlock_bh(&ctx->decrypt_compl_lock);
}
However, the encryption callback function tls_encrypt_done() sets a bit in a bitmask and schedules a work job [1], which is quite interesting.
static void tls_encrypt_done(void *data, int err)
{
// [...]
complete(&ctx->async_wait.completion);
spin_unlock_bh(&ctx->encrypt_compl_lock);
// [...]
if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
schedule_delayed_work(&ctx->tx_work.work, 1); // <-----------
}
static inline bool schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work(system_wq, dwork, delay); // <-----------
}
static inline bool queue_delayed_work(struct workqueue_struct *wq,
struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); // <-----------
}
bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
struct work_struct *work = &dwork->work; // UAF object
// [...]
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_delayed_work(cpu, wq, dwork, delay); // [1]
// [...]
}
// [...]
}
I manually set dwork->work.func
to 0x4141414141414141
using GDB, and the kernel crashed!
[ 17.407354] general protection fault: 0000 [#1] PREEMPT SMP
[ 17.407936] CPU: 0 PID: 81 Comm: kworker/0:2 Not tainted 6.6.17 #16
[ 17.407936] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
[ 17.407936] Workqueue: events 0x4141414141414141
[ 17.407936] RIP: 0010:0x4141414141414141
[ 17.407936] Code: Unable to access opcode bytes at 0x4141414141414117.
[ 17.407936] RSP: 0018:ffffc9000055fe70 EFLAGS: 00000246
# [...]
[ 17.407936] <TASK>
[ 17.407936] ? die_addr+0x32/0x80
[ 17.407936] ? exc_general_protection+0x14c/0x3c0
[ 17.407936] ? asm_exc_general_protection+0x22/0x30
[ 17.407936] ? process_one_work+0x14a/0x300
[ 17.407936] ? worker_thread+0x273/0x390
To release the socket object, we need to return to userspace after sending the packets, but tls_sw_sendmsg_locked() is blocked because the encryption pending count is not zero [2].
static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
size_t size)
{
while (msg_data_left(msg)) {
ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
record_type, &copied,
msg->msg_flags);
if (ret) {
if (ret == -EINPROGRESS)
num_async++;
// [...]
}
}
if (!num_async) {
goto send_end;
} else if (num_zc) {
spin_lock_bh(&ctx->encrypt_compl_lock);
ctx->async_notify = true;
pending = atomic_read(&ctx->encrypt_pending);
spin_unlock_bh(&ctx->encrypt_compl_lock);
if (pending) // [2]
crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
// [...]
}
}
This issue can be bypassed by sending two packets, with the data address of the later packet being invalid.
static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg,
size_t size)
{
// [...]
while (msg_data_left(msg)) {
// [...]
if (try_to_copy) {
ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter,
msg_pl, try_to_copy);
if (ret < 0)
goto trim_sgl;
}
// [...]
trim_sgl:
// [...]
goto send_end;
}
send_end:
// [...]
return copied > 0 ? copied : ret;
}
The example code snippet is as follows. Because the mapping is only TLS_MAX_PAYLOAD_SIZE bytes long, copying the data of the second record faults, so the send path bails out through trim_sgl to send_end and returns to userspace while the first record's encryption is still pending:
#define TLS_MAX_PAYLOAD_SIZE (1 << 14)
void *buffer = mmap(NULL, TLS_MAX_PAYLOAD_SIZE, PROT_READ | PROT_WRITE, MAP_POPULATE | MAP_ANON | MAP_PRIVATE, -1, 0);
write(sockfd, buffer, TLS_MAX_PAYLOAD_SIZE + 1);
Before the TX object is released [3], the tls_sw_release_resources_tx() function is called to wait for the pending encryption to complete [4].
static void tls_sk_proto_close(struct sock *sk, long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tls_context *ctx = tls_get_ctx(sk);
// [...]
if (ctx->tx_conf != TLS_BASE /* ... */)
tls_sk_proto_cleanup(sk, ctx, timeo); // <-----------
// [...]
if (ctx->tx_conf == TLS_SW)
tls_sw_free_ctx_tx(ctx); // [3]
// [...]
}
static void tls_sk_proto_cleanup(struct sock *sk,
struct tls_context *ctx, long timeo)
{
// [...]
if (ctx->tx_conf == TLS_SW) {
// [...]
tls_sw_release_resources_tx(sk); // <-----------
}
// [...]
}
void tls_sw_release_resources_tx(struct sock *sk)
{
// [...]
spin_lock_bh(&ctx->encrypt_compl_lock);
ctx->async_notify = true;
pending = atomic_read(&ctx->encrypt_pending);
spin_unlock_bh(&ctx->encrypt_compl_lock);
if (pending) // [4]
crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
// [...]
}
The exploitation flow might be as illustrated below:
I applied the following diff to the tls_encrypt_done() function to extend the race window for the PoC.
+ mdelay(2000);
spin_lock_bh(&ctx->encrypt_compl_lock);
pending = atomic_dec_return(&ctx->encrypt_pending);
if (!pending && ctx->async_notify)
complete(&ctx->async_wait.completion);
spin_unlock_bh(&ctx->encrypt_compl_lock);
+ mdelay(2000);
4.1.4. Others
The relationship between the request objects is a bit complex, so I have illustrated the structures and left them here for those who are interested. The data pointer is the parameter of the callback functions tls_encrypt_done() and tls_decrypt_done().
In the next post, I will explain four more vulnerabilities!