linux/drivers/nvme/target/tcp.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics TCP target.
   4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <linux/err.h>
  11#include <linux/nvme-tcp.h>
  12#include <net/sock.h>
  13#include <net/tcp.h>
  14#include <linux/inet.h>
  15#include <linux/llist.h>
  16#include <crypto/hash.h>
  17
  18#include "nvmet.h"
  19
  20#define NVMET_TCP_DEF_INLINE_DATA_SIZE  (4 * PAGE_SIZE)
  21
   22/* Define the socket priority to use for connections where it is desirable
   23 * that the NIC consider performing optimized packet processing or filtering.
   24 * A non-zero value is sufficient to indicate general consideration of any
   25 * possible optimization.  Making it a module param allows for alternative
   26 * values that may be unique for some NIC implementations.
   27 */
  28static int so_priority;
  29module_param(so_priority, int, 0644);
  30MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");
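/*
 * For example (illustrative, not part of the original source): assuming the
 * module is built as nvmet-tcp, a non-default priority could be requested
 * with "modprobe nvmet-tcp so_priority=1" or, since the parameter is 0644,
 * changed at runtime via /sys/module/nvmet_tcp/parameters/so_priority.
 */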
  31
   32/* Define a time period (in usecs) during which io_work() shall sample an
   33 * activated queue before determining it to be idle.  This optional module
   34 * behavior can enable NIC solutions that support socket optimized packet
   35 * processing using advanced interrupt moderation techniques.
   36 */
  37static int idle_poll_period_usecs;
  38module_param(idle_poll_period_usecs, int, 0644);
  39MODULE_PARM_DESC(idle_poll_period_usecs,
  40                "nvmet tcp io_work poll till idle time period in usecs");
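/*
 * For example (illustrative, not part of the original source): loading with
 * "modprobe nvmet-tcp idle_poll_period_usecs=1000" keeps io_work() polling
 * an active queue for roughly 1ms after its last recorded activity; the
 * default of 0 disables this idle-deadline behavior entirely.
 */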
  41
  42#define NVMET_TCP_RECV_BUDGET           8
  43#define NVMET_TCP_SEND_BUDGET           8
  44#define NVMET_TCP_IO_WORK_BUDGET        64
  45
  46enum nvmet_tcp_send_state {
  47        NVMET_TCP_SEND_DATA_PDU,
  48        NVMET_TCP_SEND_DATA,
  49        NVMET_TCP_SEND_R2T,
  50        NVMET_TCP_SEND_DDGST,
  51        NVMET_TCP_SEND_RESPONSE
  52};
  53
  54enum nvmet_tcp_recv_state {
  55        NVMET_TCP_RECV_PDU,
  56        NVMET_TCP_RECV_DATA,
  57        NVMET_TCP_RECV_DDGST,
  58        NVMET_TCP_RECV_ERR,
  59};
  60
  61enum {
  62        NVMET_TCP_F_INIT_FAILED = (1 << 0),
  63};
  64
  65struct nvmet_tcp_cmd {
  66        struct nvmet_tcp_queue          *queue;
  67        struct nvmet_req                req;
  68
  69        struct nvme_tcp_cmd_pdu         *cmd_pdu;
  70        struct nvme_tcp_rsp_pdu         *rsp_pdu;
  71        struct nvme_tcp_data_pdu        *data_pdu;
  72        struct nvme_tcp_r2t_pdu         *r2t_pdu;
  73
  74        u32                             rbytes_done;
  75        u32                             wbytes_done;
  76
  77        u32                             pdu_len;
  78        u32                             pdu_recv;
  79        int                             sg_idx;
  80        int                             nr_mapped;
  81        struct msghdr                   recv_msg;
  82        struct kvec                     *iov;
  83        u32                             flags;
  84
  85        struct list_head                entry;
  86        struct llist_node               lentry;
  87
  88        /* send state */
  89        u32                             offset;
  90        struct scatterlist              *cur_sg;
  91        enum nvmet_tcp_send_state       state;
  92
  93        __le32                          exp_ddgst;
  94        __le32                          recv_ddgst;
  95};
  96
  97enum nvmet_tcp_queue_state {
  98        NVMET_TCP_Q_CONNECTING,
  99        NVMET_TCP_Q_LIVE,
 100        NVMET_TCP_Q_DISCONNECTING,
 101};
 102
 103struct nvmet_tcp_queue {
 104        struct socket           *sock;
 105        struct nvmet_tcp_port   *port;
 106        struct work_struct      io_work;
 107        struct nvmet_cq         nvme_cq;
 108        struct nvmet_sq         nvme_sq;
 109
 110        /* send state */
 111        struct nvmet_tcp_cmd    *cmds;
 112        unsigned int            nr_cmds;
 113        struct list_head        free_list;
 114        struct llist_head       resp_list;
 115        struct list_head        resp_send_list;
 116        int                     send_list_len;
 117        struct nvmet_tcp_cmd    *snd_cmd;
 118
 119        /* recv state */
 120        int                     offset;
 121        int                     left;
 122        enum nvmet_tcp_recv_state rcv_state;
 123        struct nvmet_tcp_cmd    *cmd;
 124        union nvme_tcp_pdu      pdu;
 125
 126        /* digest state */
 127        bool                    hdr_digest;
 128        bool                    data_digest;
 129        struct ahash_request    *snd_hash;
 130        struct ahash_request    *rcv_hash;
 131
 132        unsigned long           poll_end;
 133
 134        spinlock_t              state_lock;
 135        enum nvmet_tcp_queue_state state;
 136
 137        struct sockaddr_storage sockaddr;
 138        struct sockaddr_storage sockaddr_peer;
 139        struct work_struct      release_work;
 140
 141        int                     idx;
 142        struct list_head        queue_list;
 143
 144        struct nvmet_tcp_cmd    connect;
 145
 146        struct page_frag_cache  pf_cache;
 147
 148        void (*data_ready)(struct sock *);
 149        void (*state_change)(struct sock *);
 150        void (*write_space)(struct sock *);
 151};
 152
 153struct nvmet_tcp_port {
 154        struct socket           *sock;
 155        struct work_struct      accept_work;
 156        struct nvmet_port       *nport;
 157        struct sockaddr_storage addr;
 158        void (*data_ready)(struct sock *);
 159};
 160
 161static DEFINE_IDA(nvmet_tcp_queue_ida);
 162static LIST_HEAD(nvmet_tcp_queue_list);
 163static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
 164
 165static struct workqueue_struct *nvmet_tcp_wq;
 166static const struct nvmet_fabrics_ops nvmet_tcp_ops;
 167static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
 168static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
 169
 170static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
 171                struct nvmet_tcp_cmd *cmd)
 172{
 173        if (unlikely(!queue->nr_cmds)) {
 174                /* We didn't allocate cmds yet, send 0xffff */
 175                return USHRT_MAX;
 176        }
 177
 178        return cmd - queue->cmds;
 179}
 180
 181static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
 182{
 183        return nvme_is_write(cmd->req.cmd) &&
 184                cmd->rbytes_done < cmd->req.transfer_len;
 185}
 186
 187static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
 188{
 189        return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
 190}
 191
 192static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
 193{
 194        return !nvme_is_write(cmd->req.cmd) &&
 195                cmd->req.transfer_len > 0 &&
 196                !cmd->req.cqe->status;
 197}
 198
 199static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
 200{
 201        return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
 202                !cmd->rbytes_done;
 203}
 204
 205static inline struct nvmet_tcp_cmd *
 206nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
 207{
 208        struct nvmet_tcp_cmd *cmd;
 209
 210        cmd = list_first_entry_or_null(&queue->free_list,
 211                                struct nvmet_tcp_cmd, entry);
 212        if (!cmd)
 213                return NULL;
 214        list_del_init(&cmd->entry);
 215
 216        cmd->rbytes_done = cmd->wbytes_done = 0;
 217        cmd->pdu_len = 0;
 218        cmd->pdu_recv = 0;
 219        cmd->iov = NULL;
 220        cmd->flags = 0;
 221        return cmd;
 222}
 223
 224static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
 225{
 226        if (unlikely(cmd == &cmd->queue->connect))
 227                return;
 228
 229        list_add_tail(&cmd->entry, &cmd->queue->free_list);
 230}
 231
 232static inline int queue_cpu(struct nvmet_tcp_queue *queue)
 233{
 234        return queue->sock->sk->sk_incoming_cpu;
 235}
 236
 237static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
 238{
 239        return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 240}
 241
 242static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
 243{
 244        return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 245}
 246
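/*
 * Compute the PDU header digest: run CRC32C over the first @len bytes of
 * @pdu using the queue's ahash transform and store the 4-byte result
 * directly behind them (at @pdu + @len), which is where the HDGST field
 * sits on the wire.
 */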
 247static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
 248                void *pdu, size_t len)
 249{
 250        struct scatterlist sg;
 251
 252        sg_init_one(&sg, pdu, len);
 253        ahash_request_set_crypt(hash, &sg, pdu + len, len);
 254        crypto_ahash_digest(hash);
 255}
 256
 257static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
 258        void *pdu, size_t len)
 259{
 260        struct nvme_tcp_hdr *hdr = pdu;
 261        __le32 recv_digest;
 262        __le32 exp_digest;
 263
 264        if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
 265                pr_err("queue %d: header digest enabled but no header digest\n",
 266                        queue->idx);
 267                return -EPROTO;
 268        }
 269
 270        recv_digest = *(__le32 *)(pdu + hdr->hlen);
 271        nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
 272        exp_digest = *(__le32 *)(pdu + hdr->hlen);
 273        if (recv_digest != exp_digest) {
 274                pr_err("queue %d: header digest error: recv %#x expected %#x\n",
 275                        queue->idx, le32_to_cpu(recv_digest),
 276                        le32_to_cpu(exp_digest));
 277                return -EPROTO;
 278        }
 279
 280        return 0;
 281}
 282
 283static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
 284{
 285        struct nvme_tcp_hdr *hdr = pdu;
 286        u8 digest_len = nvmet_tcp_hdgst_len(queue);
 287        u32 len;
 288
 289        len = le32_to_cpu(hdr->plen) - hdr->hlen -
 290                (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
 291
 292        if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
 293                pr_err("queue %d: data digest flag is cleared\n", queue->idx);
 294                return -EPROTO;
 295        }
 296
 297        return 0;
 298}
 299
 300static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
 301{
 302        struct scatterlist *sg;
 303        int i;
 304
 305        sg = &cmd->req.sg[cmd->sg_idx];
 306
 307        for (i = 0; i < cmd->nr_mapped; i++)
 308                kunmap(sg_page(&sg[i]));
 309}
 310
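/*
 * Build a kvec array over the command's (kmap'ed) scatterlist pages
 * covering the next cmd->pdu_len bytes starting at cmd->rbytes_done, and
 * point cmd->recv_msg at it so the PDU data portion is received straight
 * into the destination buffers.
 */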
 311static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
 312{
 313        struct kvec *iov = cmd->iov;
 314        struct scatterlist *sg;
 315        u32 length, offset, sg_offset;
 316
 317        length = cmd->pdu_len;
 318        cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
 319        offset = cmd->rbytes_done;
 320        cmd->sg_idx = offset / PAGE_SIZE;
 321        sg_offset = offset % PAGE_SIZE;
 322        sg = &cmd->req.sg[cmd->sg_idx];
 323
 324        while (length) {
 325                u32 iov_len = min_t(u32, length, sg->length - sg_offset);
 326
 327                iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
 328                iov->iov_len = iov_len;
 329
 330                length -= iov_len;
 331                sg = sg_next(sg);
 332                iov++;
 333                sg_offset = 0;
 334        }
 335
 336        iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
 337                cmd->nr_mapped, cmd->pdu_len);
 338}
 339
 340static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
 341{
 342        queue->rcv_state = NVMET_TCP_RECV_ERR;
 343        if (queue->nvme_sq.ctrl)
 344                nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
 345        else
 346                kernel_sock_shutdown(queue->sock, SHUT_RDWR);
 347}
 348
 349static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
 350{
 351        if (status == -EPIPE || status == -ECONNRESET)
 352                kernel_sock_shutdown(queue->sock, SHUT_RDWR);
 353        else
 354                nvmet_tcp_fatal_error(queue);
 355}
 356
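/*
 * Parse the command's SGL descriptor: for an in-capsule (offset) SGL,
 * verify the write direction and the inline data size limit, then
 * allocate the backing scatterlist and, for host-to-controller data, the
 * kvec array used to receive into it.  Returns an NVMe status on failure.
 */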
 357static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
 358{
 359        struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
 360        u32 len = le32_to_cpu(sgl->length);
 361
 362        if (!len)
 363                return 0;
 364
 365        if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
 366                          NVME_SGL_FMT_OFFSET)) {
 367                if (!nvme_is_write(cmd->req.cmd))
 368                        return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 369
 370                if (len > cmd->req.port->inline_data_size)
 371                        return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
 372                cmd->pdu_len = len;
 373        }
 374        cmd->req.transfer_len += len;
 375
 376        cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
 377        if (!cmd->req.sg)
 378                return NVME_SC_INTERNAL;
 379        cmd->cur_sg = cmd->req.sg;
 380
 381        if (nvmet_tcp_has_data_in(cmd)) {
 382                cmd->iov = kmalloc_array(cmd->req.sg_cnt,
 383                                sizeof(*cmd->iov), GFP_KERNEL);
 384                if (!cmd->iov)
 385                        goto err;
 386        }
 387
 388        return 0;
 389err:
 390        sgl_free(cmd->req.sg);
 391        return NVME_SC_INTERNAL;
 392}
 393
 394static void nvmet_tcp_send_ddgst(struct ahash_request *hash,
 395                struct nvmet_tcp_cmd *cmd)
 396{
 397        ahash_request_set_crypt(hash, cmd->req.sg,
 398                (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
 399        crypto_ahash_digest(hash);
 400}
 401
 402static void nvmet_tcp_recv_ddgst(struct ahash_request *hash,
 403                struct nvmet_tcp_cmd *cmd)
 404{
 405        struct scatterlist sg;
 406        struct kvec *iov;
 407        int i;
 408
 409        crypto_ahash_init(hash);
 410        for (i = 0, iov = cmd->iov; i < cmd->nr_mapped; i++, iov++) {
 411                sg_init_one(&sg, iov->iov_base, iov->iov_len);
 412                ahash_request_set_crypt(hash, &sg, NULL, iov->iov_len);
 413                crypto_ahash_update(hash);
 414        }
 415        ahash_request_set_crypt(hash, NULL, (void *)&cmd->exp_ddgst, 0);
 416        crypto_ahash_final(hash);
 417}
 418
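/*
 * Prepare a C2H_DATA PDU carrying the read payload: fill in command_id,
 * offset and length, pre-compute the data digest over the payload and the
 * header digest over the PDU header if those were negotiated, and move
 * the command into the SEND_DATA_PDU state.
 */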
 419static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
 420{
 421        struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
 422        struct nvmet_tcp_queue *queue = cmd->queue;
 423        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 424        u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
 425
 426        cmd->offset = 0;
 427        cmd->state = NVMET_TCP_SEND_DATA_PDU;
 428
 429        pdu->hdr.type = nvme_tcp_c2h_data;
 430        pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
 431                                                NVME_TCP_F_DATA_SUCCESS : 0);
 432        pdu->hdr.hlen = sizeof(*pdu);
 433        pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
 434        pdu->hdr.plen =
 435                cpu_to_le32(pdu->hdr.hlen + hdgst +
 436                                cmd->req.transfer_len + ddgst);
 437        pdu->command_id = cmd->req.cqe->command_id;
 438        pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
 439        pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
 440
 441        if (queue->data_digest) {
 442                pdu->hdr.flags |= NVME_TCP_F_DDGST;
 443                nvmet_tcp_send_ddgst(queue->snd_hash, cmd);
 444        }
 445
 446        if (cmd->queue->hdr_digest) {
 447                pdu->hdr.flags |= NVME_TCP_F_HDGST;
 448                nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 449        }
 450}
 451
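/*
 * Prepare an R2T PDU soliciting the remaining host-to-controller data
 * (transfer_len - rbytes_done) at offset rbytes_done, tagged with the
 * command's transfer tag, and move the command into the SEND_R2T state.
 */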
 452static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
 453{
 454        struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
 455        struct nvmet_tcp_queue *queue = cmd->queue;
 456        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 457
 458        cmd->offset = 0;
 459        cmd->state = NVMET_TCP_SEND_R2T;
 460
 461        pdu->hdr.type = nvme_tcp_r2t;
 462        pdu->hdr.flags = 0;
 463        pdu->hdr.hlen = sizeof(*pdu);
 464        pdu->hdr.pdo = 0;
 465        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
 466
 467        pdu->command_id = cmd->req.cmd->common.command_id;
 468        pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
 469        pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
 470        pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
 471        if (cmd->queue->hdr_digest) {
 472                pdu->hdr.flags |= NVME_TCP_F_HDGST;
 473                nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 474        }
 475}
 476
 477static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
 478{
 479        struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
 480        struct nvmet_tcp_queue *queue = cmd->queue;
 481        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 482
 483        cmd->offset = 0;
 484        cmd->state = NVMET_TCP_SEND_RESPONSE;
 485
 486        pdu->hdr.type = nvme_tcp_rsp;
 487        pdu->hdr.flags = 0;
 488        pdu->hdr.hlen = sizeof(*pdu);
 489        pdu->hdr.pdo = 0;
 490        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
 491        if (cmd->queue->hdr_digest) {
 492                pdu->hdr.flags |= NVME_TCP_F_HDGST;
 493                nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 494        }
 495}
 496
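/*
 * Drain the lockless resp_list (filled by nvmet_tcp_queue_response() from
 * completion context) into the io_work-private resp_send_list that the
 * send path actually consumes.
 */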
 497static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
 498{
 499        struct llist_node *node;
 500        struct nvmet_tcp_cmd *cmd;
 501
 502        for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
 503                cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
 504                list_add(&cmd->entry, &queue->resp_send_list);
 505                queue->send_list_len++;
 506        }
 507}
 508
 509static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
 510{
 511        queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
 512                                struct nvmet_tcp_cmd, entry);
 513        if (!queue->snd_cmd) {
 514                nvmet_tcp_process_resp_list(queue);
 515                queue->snd_cmd =
 516                        list_first_entry_or_null(&queue->resp_send_list,
 517                                        struct nvmet_tcp_cmd, entry);
 518                if (unlikely(!queue->snd_cmd))
 519                        return NULL;
 520        }
 521
 522        list_del_init(&queue->snd_cmd->entry);
 523        queue->send_list_len--;
 524
 525        if (nvmet_tcp_need_data_out(queue->snd_cmd))
 526                nvmet_setup_c2h_data_pdu(queue->snd_cmd);
 527        else if (nvmet_tcp_need_data_in(queue->snd_cmd))
 528                nvmet_setup_r2t_pdu(queue->snd_cmd);
 529        else
 530                nvmet_setup_response_pdu(queue->snd_cmd);
 531
 532        return queue->snd_cmd;
 533}
 534
 535static void nvmet_tcp_queue_response(struct nvmet_req *req)
 536{
 537        struct nvmet_tcp_cmd *cmd =
 538                container_of(req, struct nvmet_tcp_cmd, req);
 539        struct nvmet_tcp_queue  *queue = cmd->queue;
 540        struct nvme_sgl_desc *sgl;
 541        u32 len;
 542
 543        if (unlikely(cmd == queue->cmd)) {
 544                sgl = &cmd->req.cmd->common.dptr.sgl;
 545                len = le32_to_cpu(sgl->length);
 546
 547                /*
 548                 * Wait for inline data before processing the response.
 549                 * Avoid using helpers, this might happen before
 550                 * nvmet_req_init is completed.
 551                 */
 552                if (queue->rcv_state == NVMET_TCP_RECV_PDU &&
 553                    len && len <= cmd->req.port->inline_data_size &&
 554                    nvme_is_write(cmd->req.cmd))
 555                        return;
 556        }
 557
 558        llist_add(&cmd->lentry, &queue->resp_list);
 559        queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
 560}
 561
 562static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
 563{
 564        if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED))
 565                nvmet_tcp_queue_response(&cmd->req);
 566        else
 567                cmd->req.execute(&cmd->req);
 568}
 569
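/*
 * The nvmet_try_send_* helpers below each push one piece of a response
 * (data PDU header, data pages, data digest, R2T or capsule response)
 * over the socket in non-blocking mode.  They return 1 when the piece was
 * sent completely, -EAGAIN when the socket could only take part of it,
 * and 0 or a negative error when the send failed.
 */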
 570static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
 571{
 572        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 573        int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
 574        int ret;
 575
 576        ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
 577                        offset_in_page(cmd->data_pdu) + cmd->offset,
 578                        left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
 579        if (ret <= 0)
 580                return ret;
 581
 582        cmd->offset += ret;
 583        left -= ret;
 584
 585        if (left)
 586                return -EAGAIN;
 587
 588        cmd->state = NVMET_TCP_SEND_DATA;
 589        cmd->offset  = 0;
 590        return 1;
 591}
 592
 593static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 594{
 595        struct nvmet_tcp_queue *queue = cmd->queue;
 596        int ret;
 597
 598        while (cmd->cur_sg) {
 599                struct page *page = sg_page(cmd->cur_sg);
 600                u32 left = cmd->cur_sg->length - cmd->offset;
 601                int flags = MSG_DONTWAIT;
 602
 603                if ((!last_in_batch && cmd->queue->send_list_len) ||
 604                    cmd->wbytes_done + left < cmd->req.transfer_len ||
 605                    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
 606                        flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 607
 608                ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
 609                                        left, flags);
 610                if (ret <= 0)
 611                        return ret;
 612
 613                cmd->offset += ret;
 614                cmd->wbytes_done += ret;
 615
  616                /* Done with sg? */
 617                if (cmd->offset == cmd->cur_sg->length) {
 618                        cmd->cur_sg = sg_next(cmd->cur_sg);
 619                        cmd->offset = 0;
 620                }
 621        }
 622
 623        if (queue->data_digest) {
 624                cmd->state = NVMET_TCP_SEND_DDGST;
 625                cmd->offset = 0;
 626        } else {
 627                if (queue->nvme_sq.sqhd_disabled) {
 628                        cmd->queue->snd_cmd = NULL;
 629                        nvmet_tcp_put_cmd(cmd);
 630                } else {
 631                        nvmet_setup_response_pdu(cmd);
 632                }
 633        }
 634
 635        if (queue->nvme_sq.sqhd_disabled) {
 636                kfree(cmd->iov);
 637                sgl_free(cmd->req.sg);
 638        }
 639
  640        return 1;
  642}
 643
 644static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 645                bool last_in_batch)
 646{
 647        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 648        int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
 649        int flags = MSG_DONTWAIT;
 650        int ret;
 651
 652        if (!last_in_batch && cmd->queue->send_list_len)
 653                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 654        else
 655                flags |= MSG_EOR;
 656
 657        ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
 658                offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
 659        if (ret <= 0)
 660                return ret;
 661        cmd->offset += ret;
 662        left -= ret;
 663
 664        if (left)
 665                return -EAGAIN;
 666
 667        kfree(cmd->iov);
 668        sgl_free(cmd->req.sg);
 669        cmd->queue->snd_cmd = NULL;
 670        nvmet_tcp_put_cmd(cmd);
 671        return 1;
 672}
 673
 674static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 675{
 676        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 677        int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
 678        int flags = MSG_DONTWAIT;
 679        int ret;
 680
 681        if (!last_in_batch && cmd->queue->send_list_len)
 682                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 683        else
 684                flags |= MSG_EOR;
 685
 686        ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
 687                offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
 688        if (ret <= 0)
 689                return ret;
 690        cmd->offset += ret;
 691        left -= ret;
 692
 693        if (left)
 694                return -EAGAIN;
 695
 696        cmd->queue->snd_cmd = NULL;
 697        return 1;
 698}
 699
 700static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 701{
 702        struct nvmet_tcp_queue *queue = cmd->queue;
 703        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
 704        struct kvec iov = {
 705                .iov_base = &cmd->exp_ddgst + cmd->offset,
 706                .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
 707        };
 708        int ret;
 709
 710        if (!last_in_batch && cmd->queue->send_list_len)
 711                msg.msg_flags |= MSG_MORE;
 712        else
 713                msg.msg_flags |= MSG_EOR;
 714
 715        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
 716        if (unlikely(ret <= 0))
 717                return ret;
 718
 719        cmd->offset += ret;
 720
 721        if (queue->nvme_sq.sqhd_disabled) {
 722                cmd->queue->snd_cmd = NULL;
 723                nvmet_tcp_put_cmd(cmd);
 724        } else {
 725                nvmet_setup_response_pdu(cmd);
 726        }
 727        return 1;
 728}
 729
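/*
 * Advance the current send command one step through its state machine
 * (DATA_PDU -> DATA -> DDGST -> RESPONSE, or R2T), fetching the next
 * command from the response lists when none is in flight.  Returns 1 if
 * progress was made, 0 if there is nothing to send or the socket would
 * block, and a negative error on socket failure.
 */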
 730static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
 731                bool last_in_batch)
 732{
 733        struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
 734        int ret = 0;
 735
 736        if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
 737                cmd = nvmet_tcp_fetch_cmd(queue);
 738                if (unlikely(!cmd))
 739                        return 0;
 740        }
 741
 742        if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
 743                ret = nvmet_try_send_data_pdu(cmd);
 744                if (ret <= 0)
 745                        goto done_send;
 746        }
 747
 748        if (cmd->state == NVMET_TCP_SEND_DATA) {
 749                ret = nvmet_try_send_data(cmd, last_in_batch);
 750                if (ret <= 0)
 751                        goto done_send;
 752        }
 753
 754        if (cmd->state == NVMET_TCP_SEND_DDGST) {
 755                ret = nvmet_try_send_ddgst(cmd, last_in_batch);
 756                if (ret <= 0)
 757                        goto done_send;
 758        }
 759
 760        if (cmd->state == NVMET_TCP_SEND_R2T) {
 761                ret = nvmet_try_send_r2t(cmd, last_in_batch);
 762                if (ret <= 0)
 763                        goto done_send;
 764        }
 765
 766        if (cmd->state == NVMET_TCP_SEND_RESPONSE)
 767                ret = nvmet_try_send_response(cmd, last_in_batch);
 768
 769done_send:
 770        if (ret < 0) {
 771                if (ret == -EAGAIN)
 772                        return 0;
 773                return ret;
 774        }
 775
 776        return 1;
 777}
 778
 779static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
 780                int budget, int *sends)
 781{
 782        int i, ret = 0;
 783
 784        for (i = 0; i < budget; i++) {
 785                ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
 786                if (unlikely(ret < 0)) {
 787                        nvmet_tcp_socket_error(queue, ret);
 788                        goto done;
 789                } else if (ret == 0) {
 790                        break;
 791                }
 792                (*sends)++;
 793        }
 794done:
 795        return ret;
 796}
 797
 798static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
 799{
 800        queue->offset = 0;
 801        queue->left = sizeof(struct nvme_tcp_hdr);
 802        queue->cmd = NULL;
 803        queue->rcv_state = NVMET_TCP_RECV_PDU;
 804}
 805
 806static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
 807{
 808        struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
 809
 810        ahash_request_free(queue->rcv_hash);
 811        ahash_request_free(queue->snd_hash);
 812        crypto_free_ahash(tfm);
 813}
 814
 815static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
 816{
 817        struct crypto_ahash *tfm;
 818
 819        tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
 820        if (IS_ERR(tfm))
 821                return PTR_ERR(tfm);
 822
 823        queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
 824        if (!queue->snd_hash)
 825                goto free_tfm;
 826        ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
 827
 828        queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
 829        if (!queue->rcv_hash)
 830                goto free_snd_hash;
 831        ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
 832
 833        return 0;
 834free_snd_hash:
 835        ahash_request_free(queue->snd_hash);
 836free_tfm:
 837        crypto_free_ahash(tfm);
 838        return -ENOMEM;
 839}
 840
 841
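/*
 * Handle connection initialization: validate the ICReq PDU (length, PFV,
 * HPDA), record the negotiated header/data digest settings and allocate
 * the CRC32C contexts if needed, then answer with an ICResp and move the
 * queue to the LIVE state.
 */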
 842static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
 843{
 844        struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
 845        struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
 846        struct msghdr msg = {};
 847        struct kvec iov;
 848        int ret;
 849
  850        if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
  851                pr_err("bad nvme-tcp pdu length (%d)\n",
  852                        le32_to_cpu(icreq->hdr.plen));
  853                nvmet_tcp_fatal_error(queue);
                /* bail out instead of continuing to parse a bad ICReq */
                return -EPROTO;
  854        }
 855
 856        if (icreq->pfv != NVME_TCP_PFV_1_0) {
 857                pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
 858                return -EPROTO;
 859        }
 860
 861        if (icreq->hpda != 0) {
 862                pr_err("queue %d: unsupported hpda %d\n", queue->idx,
 863                        icreq->hpda);
 864                return -EPROTO;
 865        }
 866
 867        queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
 868        queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
 869        if (queue->hdr_digest || queue->data_digest) {
 870                ret = nvmet_tcp_alloc_crypto(queue);
 871                if (ret)
 872                        return ret;
 873        }
 874
 875        memset(icresp, 0, sizeof(*icresp));
 876        icresp->hdr.type = nvme_tcp_icresp;
 877        icresp->hdr.hlen = sizeof(*icresp);
 878        icresp->hdr.pdo = 0;
 879        icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
 880        icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
  881        icresp->maxdata = cpu_to_le32(0x400000); /* 4M arbitrary limit */
 882        icresp->cpda = 0;
 883        if (queue->hdr_digest)
 884                icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
 885        if (queue->data_digest)
 886                icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
 887
 888        iov.iov_base = icresp;
 889        iov.iov_len = sizeof(*icresp);
 890        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
 891        if (ret < 0)
 892                goto free_crypto;
 893
 894        queue->state = NVMET_TCP_Q_LIVE;
 895        nvmet_prepare_receive_pdu(queue);
 896        return 0;
 897free_crypto:
 898        if (queue->hdr_digest || queue->data_digest)
 899                nvmet_tcp_free_crypto(queue);
 900        return ret;
 901}
 902
 903static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
 904                struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
 905{
 906        size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
 907        int ret;
 908
 909        if (!nvme_is_write(cmd->req.cmd) ||
 910            data_len > cmd->req.port->inline_data_size) {
 911                nvmet_prepare_receive_pdu(queue);
 912                return;
 913        }
 914
 915        ret = nvmet_tcp_map_data(cmd);
 916        if (unlikely(ret)) {
 917                pr_err("queue %d: failed to map data\n", queue->idx);
 918                nvmet_tcp_fatal_error(queue);
 919                return;
 920        }
 921
 922        queue->rcv_state = NVMET_TCP_RECV_DATA;
 923        nvmet_tcp_map_pdu_iovec(cmd);
 924        cmd->flags |= NVMET_TCP_F_INIT_FAILED;
 925}
 926
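/*
 * An H2C_DATA PDU arrived: look up the command by its transfer tag,
 * verify that the data offset matches what has been received so far, and
 * switch the queue to RECV_DATA so the payload lands in the command's
 * mapped scatterlist.
 */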
 927static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
 928{
 929        struct nvme_tcp_data_pdu *data = &queue->pdu.data;
 930        struct nvmet_tcp_cmd *cmd;
 931
  932        if (likely(queue->nr_cmds)) {
                /* bounds-check the host-supplied ttag before indexing cmds[] */
                if (unlikely(data->ttag >= queue->nr_cmds)) {
                        pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
                                queue->idx, data->ttag, queue->nr_cmds);
                        nvmet_tcp_fatal_error(queue);
                        return -EPROTO;
                }
  933                cmd = &queue->cmds[data->ttag];
  934        } else {
  935                cmd = &queue->connect;
        }
 936
 937        if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
 938                pr_err("ttag %u unexpected data offset %u (expected %u)\n",
 939                        data->ttag, le32_to_cpu(data->data_offset),
 940                        cmd->rbytes_done);
 941                /* FIXME: use path and transport errors */
 942                nvmet_req_complete(&cmd->req,
 943                        NVME_SC_INVALID_FIELD | NVME_SC_DNR);
 944                return -EPROTO;
 945        }
 946
 947        cmd->pdu_len = le32_to_cpu(data->data_length);
 948        cmd->pdu_recv = 0;
 949        nvmet_tcp_map_pdu_iovec(cmd);
 950        queue->cmd = cmd;
 951        queue->rcv_state = NVMET_TCP_RECV_DATA;
 952
 953        return 0;
 954}
 955
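/*
 * A complete PDU header (plus header digest) has been received: dispatch
 * ICReq and H2C_DATA PDUs, otherwise treat it as a command capsule -
 * initialize the request, map its data and either start receiving inline
 * data, queue an R2T, or execute the command right away.
 */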
 956static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
 957{
 958        struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
 959        struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
 960        struct nvmet_req *req;
 961        int ret;
 962
 963        if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
 964                if (hdr->type != nvme_tcp_icreq) {
 965                        pr_err("unexpected pdu type (%d) before icreq\n",
 966                                hdr->type);
 967                        nvmet_tcp_fatal_error(queue);
 968                        return -EPROTO;
 969                }
 970                return nvmet_tcp_handle_icreq(queue);
 971        }
 972
 973        if (hdr->type == nvme_tcp_h2c_data) {
 974                ret = nvmet_tcp_handle_h2c_data_pdu(queue);
 975                if (unlikely(ret))
 976                        return ret;
 977                return 0;
 978        }
 979
 980        queue->cmd = nvmet_tcp_get_cmd(queue);
 981        if (unlikely(!queue->cmd)) {
 982                /* This should never happen */
  983                pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d\n",
 984                        queue->idx, queue->nr_cmds, queue->send_list_len,
 985                        nvme_cmd->common.opcode);
 986                nvmet_tcp_fatal_error(queue);
 987                return -ENOMEM;
 988        }
 989
 990        req = &queue->cmd->req;
 991        memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
 992
 993        if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
 994                        &queue->nvme_sq, &nvmet_tcp_ops))) {
 995                pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
 996                        req->cmd, req->cmd->common.command_id,
 997                        req->cmd->common.opcode,
 998                        le32_to_cpu(req->cmd->common.dptr.sgl.length));
 999
1000                nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
1001                return 0;
1002        }
1003
1004        ret = nvmet_tcp_map_data(queue->cmd);
1005        if (unlikely(ret)) {
1006                pr_err("queue %d: failed to map data\n", queue->idx);
1007                if (nvmet_tcp_has_inline_data(queue->cmd))
1008                        nvmet_tcp_fatal_error(queue);
1009                else
1010                        nvmet_req_complete(req, ret);
1011                ret = -EAGAIN;
1012                goto out;
1013        }
1014
1015        if (nvmet_tcp_need_data_in(queue->cmd)) {
1016                if (nvmet_tcp_has_inline_data(queue->cmd)) {
1017                        queue->rcv_state = NVMET_TCP_RECV_DATA;
1018                        nvmet_tcp_map_pdu_iovec(queue->cmd);
1019                        return 0;
1020                }
1021                /* send back R2T */
1022                nvmet_tcp_queue_response(&queue->cmd->req);
1023                goto out;
1024        }
1025
1026        queue->cmd->req.execute(&queue->cmd->req);
1027out:
1028        nvmet_prepare_receive_pdu(queue);
1029        return ret;
1030}
1031
1032static const u8 nvme_tcp_pdu_sizes[] = {
1033        [nvme_tcp_icreq]        = sizeof(struct nvme_tcp_icreq_pdu),
1034        [nvme_tcp_cmd]          = sizeof(struct nvme_tcp_cmd_pdu),
1035        [nvme_tcp_h2c_data]     = sizeof(struct nvme_tcp_data_pdu),
1036};
1037
1038static inline u8 nvmet_tcp_pdu_size(u8 type)
1039{
1040        size_t idx = type;
1041
1042        return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
1043                nvme_tcp_pdu_sizes[idx]) ?
1044                        nvme_tcp_pdu_sizes[idx] : 0;
1045}
1046
1047static inline bool nvmet_tcp_pdu_valid(u8 type)
1048{
1049        switch (type) {
1050        case nvme_tcp_icreq:
1051        case nvme_tcp_cmd:
1052        case nvme_tcp_h2c_data:
1053                /* fallthru */
1054                return true;
1055        }
1056
1057        return false;
1058}
1059
1060static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
1061{
1062        struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
1063        int len;
1064        struct kvec iov;
1065        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1066
1067recv:
1068        iov.iov_base = (void *)&queue->pdu + queue->offset;
1069        iov.iov_len = queue->left;
1070        len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1071                        iov.iov_len, msg.msg_flags);
1072        if (unlikely(len < 0))
1073                return len;
1074
1075        queue->offset += len;
1076        queue->left -= len;
1077        if (queue->left)
1078                return -EAGAIN;
1079
1080        if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
1081                u8 hdgst = nvmet_tcp_hdgst_len(queue);
1082
1083                if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
1084                        pr_err("unexpected pdu type %d\n", hdr->type);
1085                        nvmet_tcp_fatal_error(queue);
1086                        return -EIO;
1087                }
1088
1089                if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1090                        pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1091                        return -EIO;
1092                }
1093
1094                queue->left = hdr->hlen - queue->offset + hdgst;
1095                goto recv;
1096        }
1097
        /*
         * Verify over the header bytes only; queue->offset also counts the
         * received digest itself at this point.
         */
 1098        if (queue->hdr_digest &&
 1099            nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
1100                nvmet_tcp_fatal_error(queue); /* fatal */
1101                return -EPROTO;
1102        }
1103
1104        if (queue->data_digest &&
1105            nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1106                nvmet_tcp_fatal_error(queue); /* fatal */
1107                return -EPROTO;
1108        }
1109
1110        return nvmet_tcp_done_recv_pdu(queue);
1111}
1112
1113static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1114{
1115        struct nvmet_tcp_queue *queue = cmd->queue;
1116
1117        nvmet_tcp_recv_ddgst(queue->rcv_hash, cmd);
1118        queue->offset = 0;
1119        queue->left = NVME_TCP_DIGEST_LENGTH;
1120        queue->rcv_state = NVMET_TCP_RECV_DDGST;
1121}
1122
1123static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1124{
1125        struct nvmet_tcp_cmd  *cmd = queue->cmd;
1126        int ret;
1127
1128        while (msg_data_left(&cmd->recv_msg)) {
1129                ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1130                        cmd->recv_msg.msg_flags);
1131                if (ret <= 0)
1132                        return ret;
1133
1134                cmd->pdu_recv += ret;
1135                cmd->rbytes_done += ret;
1136        }
1137
1138        nvmet_tcp_unmap_pdu_iovec(cmd);
1139        if (queue->data_digest) {
1140                nvmet_tcp_prep_recv_ddgst(cmd);
1141                return 0;
1142        }
1143
1144        if (cmd->rbytes_done == cmd->req.transfer_len)
1145                nvmet_tcp_execute_request(cmd);
1146
1147        nvmet_prepare_receive_pdu(queue);
1148        return 0;
1149}
1150
1151static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1152{
1153        struct nvmet_tcp_cmd *cmd = queue->cmd;
1154        int ret;
1155        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1156        struct kvec iov = {
1157                .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1158                .iov_len = queue->left
1159        };
1160
1161        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1162                        iov.iov_len, msg.msg_flags);
1163        if (unlikely(ret < 0))
1164                return ret;
1165
1166        queue->offset += ret;
1167        queue->left -= ret;
1168        if (queue->left)
1169                return -EAGAIN;
1170
1171        if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1172                pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1173                        queue->idx, cmd->req.cmd->common.command_id,
1174                        queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1175                        le32_to_cpu(cmd->exp_ddgst));
1176                nvmet_tcp_finish_cmd(cmd);
1177                nvmet_tcp_fatal_error(queue);
1178                ret = -EPROTO;
1179                goto out;
1180        }
1181
1182        if (cmd->rbytes_done == cmd->req.transfer_len)
1183                nvmet_tcp_execute_request(cmd);
1184
1185        ret = 0;
1186out:
1187        nvmet_prepare_receive_pdu(queue);
1188        return ret;
1189}
1190
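/*
 * Drive the receive state machine one step: PDU header, data payload or
 * data digest depending on rcv_state.  Returns 1 on progress, 0 when the
 * socket has no more data (or the queue is in error), and a negative
 * error otherwise.
 */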
1191static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1192{
1193        int result = 0;
1194
1195        if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1196                return 0;
1197
1198        if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1199                result = nvmet_tcp_try_recv_pdu(queue);
1200                if (result != 0)
1201                        goto done_recv;
1202        }
1203
1204        if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1205                result = nvmet_tcp_try_recv_data(queue);
1206                if (result != 0)
1207                        goto done_recv;
1208        }
1209
1210        if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1211                result = nvmet_tcp_try_recv_ddgst(queue);
1212                if (result != 0)
1213                        goto done_recv;
1214        }
1215
1216done_recv:
1217        if (result < 0) {
1218                if (result == -EAGAIN)
1219                        return 0;
1220                return result;
1221        }
1222        return 1;
1223}
1224
1225static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1226                int budget, int *recvs)
1227{
1228        int i, ret = 0;
1229
1230        for (i = 0; i < budget; i++) {
1231                ret = nvmet_tcp_try_recv_one(queue);
1232                if (unlikely(ret < 0)) {
1233                        nvmet_tcp_socket_error(queue, ret);
1234                        goto done;
1235                } else if (ret == 0) {
1236                        break;
1237                }
1238                (*recvs)++;
1239        }
1240done:
1241        return ret;
1242}
1243
1244static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1245{
1246        spin_lock(&queue->state_lock);
1247        if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1248                queue->state = NVMET_TCP_Q_DISCONNECTING;
1249                schedule_work(&queue->release_work);
1250        }
1251        spin_unlock(&queue->state_lock);
1252}
1253
1254static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue)
1255{
1256        queue->poll_end = jiffies + usecs_to_jiffies(idle_poll_period_usecs);
1257}
1258
1259static bool nvmet_tcp_check_queue_deadline(struct nvmet_tcp_queue *queue,
1260                int ops)
1261{
1262        if (!idle_poll_period_usecs)
1263                return false;
1264
1265        if (ops)
1266                nvmet_tcp_arm_queue_deadline(queue);
1267
1268        return !time_after(jiffies, queue->poll_end);
1269}
1270
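/*
 * Per-queue worker: alternate bounded receive and send passes until
 * nothing is pending or the overall budget is exhausted, then requeue
 * itself if work remains or the optional idle polling deadline has not
 * expired yet.
 */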
1271static void nvmet_tcp_io_work(struct work_struct *w)
1272{
1273        struct nvmet_tcp_queue *queue =
1274                container_of(w, struct nvmet_tcp_queue, io_work);
1275        bool pending;
1276        int ret, ops = 0;
1277
1278        do {
1279                pending = false;
1280
1281                ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1282                if (ret > 0)
1283                        pending = true;
1284                else if (ret < 0)
1285                        return;
1286
1287                ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1288                if (ret > 0)
1289                        pending = true;
1290                else if (ret < 0)
1291                        return;
1292
1293        } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1294
1295        /*
1296         * Requeue the worker if idle deadline period is in progress or any
1297         * ops activity was recorded during the do-while loop above.
1298         */
1299        if (nvmet_tcp_check_queue_deadline(queue, ops) || pending)
1300                queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1301}
1302
1303static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1304                struct nvmet_tcp_cmd *c)
1305{
1306        u8 hdgst = nvmet_tcp_hdgst_len(queue);
1307
1308        c->queue = queue;
1309        c->req.port = queue->port->nport;
1310
1311        c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1312                        sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1313        if (!c->cmd_pdu)
1314                return -ENOMEM;
1315        c->req.cmd = &c->cmd_pdu->cmd;
1316
1317        c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1318                        sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1319        if (!c->rsp_pdu)
1320                goto out_free_cmd;
1321        c->req.cqe = &c->rsp_pdu->cqe;
1322
1323        c->data_pdu = page_frag_alloc(&queue->pf_cache,
1324                        sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1325        if (!c->data_pdu)
1326                goto out_free_rsp;
1327
1328        c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1329                        sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1330        if (!c->r2t_pdu)
1331                goto out_free_data;
1332
1333        c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1334
1335        list_add_tail(&c->entry, &queue->free_list);
1336
1337        return 0;
1338out_free_data:
1339        page_frag_free(c->data_pdu);
1340out_free_rsp:
1341        page_frag_free(c->rsp_pdu);
1342out_free_cmd:
1343        page_frag_free(c->cmd_pdu);
1344        return -ENOMEM;
1345}
1346
1347static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1348{
1349        page_frag_free(c->r2t_pdu);
1350        page_frag_free(c->data_pdu);
1351        page_frag_free(c->rsp_pdu);
1352        page_frag_free(c->cmd_pdu);
1353}
1354
1355static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1356{
1357        struct nvmet_tcp_cmd *cmds;
1358        int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1359
1360        cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1361        if (!cmds)
1362                goto out;
1363
1364        for (i = 0; i < nr_cmds; i++) {
1365                ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1366                if (ret)
1367                        goto out_free;
1368        }
1369
1370        queue->cmds = cmds;
1371
1372        return 0;
1373out_free:
1374        while (--i >= 0)
1375                nvmet_tcp_free_cmd(cmds + i);
1376        kfree(cmds);
1377out:
1378        return ret;
1379}
1380
1381static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1382{
1383        struct nvmet_tcp_cmd *cmds = queue->cmds;
1384        int i;
1385
1386        for (i = 0; i < queue->nr_cmds; i++)
1387                nvmet_tcp_free_cmd(cmds + i);
1388
1389        nvmet_tcp_free_cmd(&queue->connect);
1390        kfree(cmds);
1391}
1392
1393static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1394{
1395        struct socket *sock = queue->sock;
1396
1397        write_lock_bh(&sock->sk->sk_callback_lock);
1398        sock->sk->sk_data_ready =  queue->data_ready;
1399        sock->sk->sk_state_change = queue->state_change;
1400        sock->sk->sk_write_space = queue->write_space;
1401        sock->sk->sk_user_data = NULL;
1402        write_unlock_bh(&sock->sk->sk_callback_lock);
1403}
1404
1405static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1406{
1407        nvmet_req_uninit(&cmd->req);
1408        nvmet_tcp_unmap_pdu_iovec(cmd);
1409        kfree(cmd->iov);
1410        sgl_free(cmd->req.sg);
1411}
1412
1413static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1414{
1415        struct nvmet_tcp_cmd *cmd = queue->cmds;
1416        int i;
1417
1418        for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1419                if (nvmet_tcp_need_data_in(cmd))
1420                        nvmet_tcp_finish_cmd(cmd);
1421        }
1422
1423        if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1424                /* failed in connect */
1425                nvmet_tcp_finish_cmd(&queue->connect);
1426        }
1427}
1428
1429static void nvmet_tcp_release_queue_work(struct work_struct *w)
1430{
1431        struct nvmet_tcp_queue *queue =
1432                container_of(w, struct nvmet_tcp_queue, release_work);
1433
1434        mutex_lock(&nvmet_tcp_queue_mutex);
1435        list_del_init(&queue->queue_list);
1436        mutex_unlock(&nvmet_tcp_queue_mutex);
1437
1438        nvmet_tcp_restore_socket_callbacks(queue);
1439        flush_work(&queue->io_work);
1440
1441        nvmet_tcp_uninit_data_in_cmds(queue);
1442        nvmet_sq_destroy(&queue->nvme_sq);
1443        cancel_work_sync(&queue->io_work);
1444        sock_release(queue->sock);
1445        nvmet_tcp_free_cmds(queue);
1446        if (queue->hdr_digest || queue->data_digest)
1447                nvmet_tcp_free_crypto(queue);
1448        ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1449
1450        kfree(queue);
1451}
1452
1453static void nvmet_tcp_data_ready(struct sock *sk)
1454{
1455        struct nvmet_tcp_queue *queue;
1456
1457        read_lock_bh(&sk->sk_callback_lock);
1458        queue = sk->sk_user_data;
1459        if (likely(queue))
1460                queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1461        read_unlock_bh(&sk->sk_callback_lock);
1462}
1463
1464static void nvmet_tcp_write_space(struct sock *sk)
1465{
1466        struct nvmet_tcp_queue *queue;
1467
1468        read_lock_bh(&sk->sk_callback_lock);
1469        queue = sk->sk_user_data;
1470        if (unlikely(!queue))
1471                goto out;
1472
1473        if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1474                queue->write_space(sk);
1475                goto out;
1476        }
1477
1478        if (sk_stream_is_writeable(sk)) {
1479                clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1480                queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1481        }
1482out:
1483        read_unlock_bh(&sk->sk_callback_lock);
1484}
1485
1486static void nvmet_tcp_state_change(struct sock *sk)
1487{
1488        struct nvmet_tcp_queue *queue;
1489
1490        read_lock_bh(&sk->sk_callback_lock);
1491        queue = sk->sk_user_data;
1492        if (!queue)
1493                goto done;
1494
1495        switch (sk->sk_state) {
1496        case TCP_FIN_WAIT1:
1497        case TCP_CLOSE_WAIT:
1498        case TCP_CLOSE:
1499                /* FALLTHRU */
1500                nvmet_tcp_schedule_release_queue(queue);
1501                break;
1502        default:
1503                pr_warn("queue %d unhandled state %d\n",
1504                        queue->idx, sk->sk_state);
1505        }
1506done:
1507        read_unlock_bh(&sk->sk_callback_lock);
1508}
1509
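/*
 * Finalize an accepted socket: record local/peer addresses, apply socket
 * options (no linger, optional priority and type of service) and, if the
 * connection is still established, install the nvmet data_ready/
 * state_change/write_space callbacks and kick io_work for the first time.
 */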
1510static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1511{
1512        struct socket *sock = queue->sock;
1513        struct inet_sock *inet = inet_sk(sock->sk);
1514        int ret;
1515
1516        ret = kernel_getsockname(sock,
1517                (struct sockaddr *)&queue->sockaddr);
1518        if (ret < 0)
1519                return ret;
1520
1521        ret = kernel_getpeername(sock,
1522                (struct sockaddr *)&queue->sockaddr_peer);
1523        if (ret < 0)
1524                return ret;
1525
1526        /*
1527         * Cleanup whatever is sitting in the TCP transmit queue on socket
1528         * close. This is done to prevent stale data from being sent should
1529         * the network connection be restored before TCP times out.
1530         */
1531        sock_no_linger(sock->sk);
1532
1533        if (so_priority > 0)
1534                sock_set_priority(sock->sk, so_priority);
1535
1536        /* Set socket type of service */
1537        if (inet->rcv_tos > 0)
1538                ip_sock_set_tos(sock->sk, inet->rcv_tos);
1539
1540        ret = 0;
1541        write_lock_bh(&sock->sk->sk_callback_lock);
1542        if (sock->sk->sk_state != TCP_ESTABLISHED) {
1543                /*
1544                 * If the socket is already closing, don't even start
1545                 * consuming it
1546                 */
1547                ret = -ENOTCONN;
1548        } else {
1549                sock->sk->sk_user_data = queue;
1550                queue->data_ready = sock->sk->sk_data_ready;
1551                sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1552                queue->state_change = sock->sk->sk_state_change;
1553                sock->sk->sk_state_change = nvmet_tcp_state_change;
1554                queue->write_space = sock->sk->sk_write_space;
1555                sock->sk->sk_write_space = nvmet_tcp_write_space;
1556                if (idle_poll_period_usecs)
1557                        nvmet_tcp_arm_queue_deadline(queue);
1558                queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1559        }
1560        write_unlock_bh(&sock->sk->sk_callback_lock);
1561
1562        return ret;
1563}
1564
1565static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1566                struct socket *newsock)
1567{
1568        struct nvmet_tcp_queue *queue;
1569        int ret;
1570
1571        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1572        if (!queue)
1573                return -ENOMEM;
1574
1575        INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1576        INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1577        queue->sock = newsock;
1578        queue->port = port;
1579        queue->nr_cmds = 0;
1580        spin_lock_init(&queue->state_lock);
1581        queue->state = NVMET_TCP_Q_CONNECTING;
1582        INIT_LIST_HEAD(&queue->free_list);
1583        init_llist_head(&queue->resp_list);
1584        INIT_LIST_HEAD(&queue->resp_send_list);
1585
1586        queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1587        if (queue->idx < 0) {
1588                ret = queue->idx;
1589                goto out_free_queue;
1590        }
1591
1592        ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1593        if (ret)
1594                goto out_ida_remove;
1595
1596        ret = nvmet_sq_init(&queue->nvme_sq);
1597        if (ret)
1598                goto out_free_connect;
1599
1600        nvmet_prepare_receive_pdu(queue);
1601
1602        mutex_lock(&nvmet_tcp_queue_mutex);
1603        list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1604        mutex_unlock(&nvmet_tcp_queue_mutex);
1605
1606        ret = nvmet_tcp_set_queue_sock(queue);
1607        if (ret)
1608                goto out_destroy_sq;
1609
1610        return 0;
1611out_destroy_sq:
1612        mutex_lock(&nvmet_tcp_queue_mutex);
1613        list_del_init(&queue->queue_list);
1614        mutex_unlock(&nvmet_tcp_queue_mutex);
1615        nvmet_sq_destroy(&queue->nvme_sq);
1616out_free_connect:
1617        nvmet_tcp_free_cmd(&queue->connect);
1618out_ida_remove:
1619        ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1620out_free_queue:
1621        kfree(queue);
1622        return ret;
1623}
1624
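    /*
     * Accept loop for the listening socket: drain all pending connections
     * without blocking, allocating a queue for each accepted socket, and
     * return once kernel_accept() reports -EAGAIN (or any other error).
     */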
1625static void nvmet_tcp_accept_work(struct work_struct *w)
1626{
1627        struct nvmet_tcp_port *port =
1628                container_of(w, struct nvmet_tcp_port, accept_work);
1629        struct socket *newsock;
1630        int ret;
1631
1632        while (true) {
1633                ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1634                if (ret < 0) {
1635                        if (ret != -EAGAIN)
1636                                pr_warn("failed to accept err=%d\n", ret);
1637                        return;
1638                }
1639                ret = nvmet_tcp_alloc_queue(port, newsock);
1640                if (ret) {
1641                        pr_err("failed to allocate queue, error %d\n", ret);
1642                        sock_release(newsock);
1643                }
1644        }
1645}
1646
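    /*
     * data_ready callback of the listening socket: a completed handshake
     * shows up as readable data, so schedule the accept work as long as
     * the socket is still in the LISTEN state.
     */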
1647static void nvmet_tcp_listen_data_ready(struct sock *sk)
1648{
1649        struct nvmet_tcp_port *port;
1650
1651        read_lock_bh(&sk->sk_callback_lock);
1652        port = sk->sk_user_data;
1653        if (!port)
1654                goto out;
1655
1656        if (sk->sk_state == TCP_LISTEN)
1657                schedule_work(&port->accept_work);
1658out:
1659        read_unlock_bh(&sk->sk_callback_lock);
1660}
1661
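    /*
     * Bring up a listener for an nvmet port: translate the discovery
     * address into a sockaddr, create a TCP socket, take over its
     * data_ready callback to drive connection accepts, then bind and
     * listen with a backlog of 128.
     */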
1662static int nvmet_tcp_add_port(struct nvmet_port *nport)
1663{
1664        struct nvmet_tcp_port *port;
1665        __kernel_sa_family_t af;
1666        int ret;
1667
1668        port = kzalloc(sizeof(*port), GFP_KERNEL);
1669        if (!port)
1670                return -ENOMEM;
1671
1672        switch (nport->disc_addr.adrfam) {
1673        case NVMF_ADDR_FAMILY_IP4:
1674                af = AF_INET;
1675                break;
1676        case NVMF_ADDR_FAMILY_IP6:
1677                af = AF_INET6;
1678                break;
1679        default:
1680                pr_err("address family %d not supported\n",
1681                                nport->disc_addr.adrfam);
1682                ret = -EINVAL;
1683                goto err_port;
1684        }
1685
1686        ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1687                        nport->disc_addr.trsvcid, &port->addr);
1688        if (ret) {
1689                pr_err("malformed ip/port passed: %s:%s\n",
1690                        nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1691                goto err_port;
1692        }
1693
1694        port->nport = nport;
1695        INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1696        if (port->nport->inline_data_size < 0)
1697                port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1698
1699        ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1700                                IPPROTO_TCP, &port->sock);
1701        if (ret) {
1702                pr_err("failed to create a socket\n");
1703                goto err_port;
1704        }
1705
1706        port->sock->sk->sk_user_data = port;
1707        port->data_ready = port->sock->sk->sk_data_ready;
1708        port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1709        sock_set_reuseaddr(port->sock->sk);
1710        tcp_sock_set_nodelay(port->sock->sk);
1711        if (so_priority > 0)
1712                sock_set_priority(port->sock->sk, so_priority);
1713
1714        ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1715                        sizeof(port->addr));
1716        if (ret) {
1717                pr_err("failed to bind port socket %d\n", ret);
1718                goto err_sock;
1719        }
1720
1721        ret = kernel_listen(port->sock, 128);
1722        if (ret) {
1723                pr_err("failed to listen on port socket, error %d\n", ret);
1724                goto err_sock;
1725        }
1726
1727        nport->priv = port;
1728        pr_info("enabling port %d (%pISpc)\n",
1729                le16_to_cpu(nport->disc_addr.portid), &port->addr);
1730
1731        return 0;
1732
1733err_sock:
1734        sock_release(port->sock);
1735err_port:
1736        kfree(port);
1737        return ret;
1738}
1739
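    /*
     * Tear down a listener: restore the original data_ready callback so no
     * new accept work gets scheduled, cancel any accept work in flight,
     * then release the listening socket.
     */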
1740static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1741{
1742        struct nvmet_tcp_port *port = nport->priv;
1743
1744        write_lock_bh(&port->sock->sk->sk_callback_lock);
1745        port->sock->sk->sk_data_ready = port->data_ready;
1746        port->sock->sk->sk_user_data = NULL;
1747        write_unlock_bh(&port->sock->sk->sk_callback_lock);
1748        cancel_work_sync(&port->accept_work);
1749
1750        sock_release(port->sock);
1751        kfree(port);
1752}
1753
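    /*
     * Shut down the sockets of all queues owned by this controller; the
     * resulting state changes drive the normal queue release path.
     */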
1754static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1755{
1756        struct nvmet_tcp_queue *queue;
1757
1758        mutex_lock(&nvmet_tcp_queue_mutex);
1759        list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1760                if (queue->nvme_sq.ctrl == ctrl)
1761                        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1762        mutex_unlock(&nvmet_tcp_queue_mutex);
1763}
1764
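    /*
     * Called from the fabrics connect path when a queue is installed: for
     * the admin queue, wait for any in-flight controller teardown to
     * complete, then allocate the command array sized at twice the
     * negotiated submission queue depth.
     */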
1765static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1766{
1767        struct nvmet_tcp_queue *queue =
1768                container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1769
1770        if (sq->qid == 0) {
1771                /* Let inflight controller teardown complete */
1772                flush_scheduled_work();
1773        }
1774
1775        queue->nr_cmds = sq->size * 2;
1776        if (nvmet_tcp_alloc_cmds(queue))
1777                return NVME_SC_INTERNAL;
1778        return 0;
1779}
1780
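    /*
     * Report the transport address used for discovery log pages. If this
     * port listens on a wildcard address, return the queue's actual local
     * address rather than the configured one.
     */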
1781static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1782                struct nvmet_port *nport, char *traddr)
1783{
1784        struct nvmet_tcp_port *port = nport->priv;
1785
1786        if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1787                struct nvmet_tcp_cmd *cmd =
1788                        container_of(req, struct nvmet_tcp_cmd, req);
1789                struct nvmet_tcp_queue *queue = cmd->queue;
1790
1791                sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1792        } else {
1793                memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1794        }
1795}
1796
1797static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
1798        .owner                  = THIS_MODULE,
1799        .type                   = NVMF_TRTYPE_TCP,
1800        .msdbd                  = 1,
1801        .add_port               = nvmet_tcp_add_port,
1802        .remove_port            = nvmet_tcp_remove_port,
1803        .queue_response         = nvmet_tcp_queue_response,
1804        .delete_ctrl            = nvmet_tcp_delete_ctrl,
1805        .install_queue          = nvmet_tcp_install_queue,
1806        .disc_traddr            = nvmet_tcp_disc_port_addr,
1807};
1808
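    /*
     * Create the high-priority I/O workqueue and register the TCP
     * transport with the nvmet core.
     */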
1809static int __init nvmet_tcp_init(void)
1810{
1811        int ret;
1812
1813        nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
1814        if (!nvmet_tcp_wq)
1815                return -ENOMEM;
1816
1817        ret = nvmet_register_transport(&nvmet_tcp_ops);
1818        if (ret)
1819                goto err;
1820
1821        return 0;
1822err:
1823        destroy_workqueue(nvmet_tcp_wq);
1824        return ret;
1825}
1826
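    /*
     * Unregister the transport, shut down any queues that are still alive
     * and flush the system workqueue so their release work completes
     * before the I/O workqueue is destroyed.
     */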
1827static void __exit nvmet_tcp_exit(void)
1828{
1829        struct nvmet_tcp_queue *queue;
1830
1831        nvmet_unregister_transport(&nvmet_tcp_ops);
1832
1833        flush_scheduled_work();
1834        mutex_lock(&nvmet_tcp_queue_mutex);
1835        list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1836                kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1837        mutex_unlock(&nvmet_tcp_queue_mutex);
1838        flush_scheduled_work();
1839
1840        destroy_workqueue(nvmet_tcp_wq);
1841}
1842
1843module_init(nvmet_tcp_init);
1844module_exit(nvmet_tcp_exit);
1845
1846MODULE_LICENSE("GPL v2");
1847MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */
1848