linux/drivers/block/nbd.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Network block device - make block devices work over TCP
   4 *
   5 * Note that you can not swap over this thing, yet. Seems to work but
   6 * deadlocks sometimes - you can not swap over TCP in general.
   7 * 
   8 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
   9 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
  10 *
  11 * (part of code stolen from loop.c)
  12 */
  13
  14#include <linux/major.h>
  15
  16#include <linux/blkdev.h>
  17#include <linux/module.h>
  18#include <linux/init.h>
  19#include <linux/sched.h>
  20#include <linux/sched/mm.h>
  21#include <linux/fs.h>
  22#include <linux/bio.h>
  23#include <linux/stat.h>
  24#include <linux/errno.h>
  25#include <linux/file.h>
  26#include <linux/ioctl.h>
  27#include <linux/mutex.h>
  28#include <linux/compiler.h>
  29#include <linux/completion.h>
  30#include <linux/err.h>
  31#include <linux/kernel.h>
  32#include <linux/slab.h>
  33#include <net/sock.h>
  34#include <linux/net.h>
  35#include <linux/kthread.h>
  36#include <linux/types.h>
  37#include <linux/debugfs.h>
  38#include <linux/blk-mq.h>
  39
  40#include <linux/uaccess.h>
  41#include <asm/types.h>
  42
  43#include <linux/nbd.h>
  44#include <linux/nbd-netlink.h>
  45#include <net/genetlink.h>
  46
  47#define CREATE_TRACE_POINTS
  48#include <trace/events/nbd.h>
  49
  50static DEFINE_IDR(nbd_index_idr);
  51static DEFINE_MUTEX(nbd_index_mutex);
  52static int nbd_total_devices = 0;
  53
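     /*
      * Per-connection state. tx_lock serializes transmission on the
      * socket; pending/sent remember a request whose send was interrupted
      * part-way; fallback_index caches another live connection to retry
      * on; cookie is bumped whenever the socket is replaced so requests
      * sent on an older socket can be told apart from new ones.
      */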
  54struct nbd_sock {
  55        struct socket *sock;
  56        struct mutex tx_lock;
  57        struct request *pending;
  58        int sent;
  59        bool dead;
  60        int fallback_index;
  61        int cookie;
  62};
  63
  64struct recv_thread_args {
  65        struct work_struct work;
  66        struct nbd_device *nbd;
  67        int index;
  68};
  69
  70struct link_dead_args {
  71        struct work_struct work;
  72        int index;
  73};
  74
  75#define NBD_RT_TIMEDOUT                 0
  76#define NBD_RT_DISCONNECT_REQUESTED     1
  77#define NBD_RT_DISCONNECTED             2
  78#define NBD_RT_HAS_PID_FILE             3
  79#define NBD_RT_HAS_CONFIG_REF           4
  80#define NBD_RT_BOUND                    5
  81#define NBD_RT_DISCONNECT_ON_CLOSE      6
  82#define NBD_RT_HAS_BACKEND_FILE         7
  83
  84#define NBD_DESTROY_ON_DISCONNECT       0
  85#define NBD_DISCONNECT_REQUESTED        1
  86
  87struct nbd_config {
  88        u32 flags;
  89        unsigned long runtime_flags;
  90        u64 dead_conn_timeout;
  91
  92        struct nbd_sock **socks;
  93        int num_connections;
  94        atomic_t live_connections;
  95        wait_queue_head_t conn_wait;
  96
  97        atomic_t recv_threads;
  98        wait_queue_head_t recv_wq;
  99        loff_t blksize;
 100        loff_t bytesize;
 101#if IS_ENABLED(CONFIG_DEBUG_FS)
 102        struct dentry *dbg_dir;
 103#endif
 104};
 105
 106struct nbd_device {
 107        struct blk_mq_tag_set tag_set;
 108
 109        int index;
 110        refcount_t config_refs;
 111        refcount_t refs;
 112        struct nbd_config *config;
 113        struct mutex config_lock;
 114        struct gendisk *disk;
 115        struct workqueue_struct *recv_workq;
 116
 117        struct list_head list;
 118        struct task_struct *task_recv;
 119        struct task_struct *task_setup;
 120
 121        struct completion *destroy_complete;
 122        unsigned long flags;
 123
 124        char *backend;
 125};
 126
 127#define NBD_CMD_REQUEUED        1
 128
 129struct nbd_cmd {
 130        struct nbd_device *nbd;
 131        struct mutex lock;
 132        int index;
 133        int cookie;
 134        int retries;
 135        blk_status_t status;
 136        unsigned long flags;
 137        u32 cmd_cookie;
 138};
 139
 140#if IS_ENABLED(CONFIG_DEBUG_FS)
 141static struct dentry *nbd_dbg_dir;
 142#endif
 143
 144#define nbd_name(nbd) ((nbd)->disk->disk_name)
 145
 146#define NBD_MAGIC 0x68797548
 147
 148#define NBD_DEF_BLKSIZE 1024
 149
 150static unsigned int nbds_max = 16;
 151static int max_part = 16;
 152static int part_shift;
 153
 154static int nbd_dev_dbg_init(struct nbd_device *nbd);
 155static void nbd_dev_dbg_close(struct nbd_device *nbd);
 156static void nbd_config_put(struct nbd_device *nbd);
 157static void nbd_connect_reply(struct genl_info *info, int index);
 158static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
 159static void nbd_dead_link_work(struct work_struct *work);
 160static void nbd_disconnect_and_put(struct nbd_device *nbd);
 161
 162static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 163{
 164        return disk_to_dev(nbd->disk);
 165}
 166
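     /*
      * Requeue a command; the NBD_CMD_REQUEUED bit makes sure a request is
      * only requeued once even if several paths race to requeue it.
      */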
 167static void nbd_requeue_cmd(struct nbd_cmd *cmd)
 168{
 169        struct request *req = blk_mq_rq_from_pdu(cmd);
 170
 171        if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
 172                blk_mq_requeue_request(req, true);
 173}
 174
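     /*
      * The 64-bit handle sent on the wire packs the per-command cookie in
      * the upper 32 bits and the blk-mq unique tag in the lower 32 bits,
      * so a reply can be matched to its request and replies for an older
      * incarnation of the same tag can be rejected.
      */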
 175#define NBD_COOKIE_BITS 32
 176
 177static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
 178{
 179        struct request *req = blk_mq_rq_from_pdu(cmd);
 180        u32 tag = blk_mq_unique_tag(req);
 181        u64 cookie = cmd->cmd_cookie;
 182
 183        return (cookie << NBD_COOKIE_BITS) | tag;
 184}
 185
 186static u32 nbd_handle_to_tag(u64 handle)
 187{
 188        return (u32)handle;
 189}
 190
 191static u32 nbd_handle_to_cookie(u64 handle)
 192{
 193        return (u32)(handle >> NBD_COOKIE_BITS);
 194}
 195
 196static const char *nbdcmd_to_ascii(int cmd)
 197{
 198        switch (cmd) {
 199        case  NBD_CMD_READ: return "read";
 200        case NBD_CMD_WRITE: return "write";
 201        case  NBD_CMD_DISC: return "disconnect";
 202        case NBD_CMD_FLUSH: return "flush";
 203        case  NBD_CMD_TRIM: return "trim/discard";
 204        }
 205        return "invalid";
 206}
 207
 208static ssize_t pid_show(struct device *dev,
 209                        struct device_attribute *attr, char *buf)
 210{
 211        struct gendisk *disk = dev_to_disk(dev);
 212        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 213
 214        return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
 215}
 216
 217static const struct device_attribute pid_attr = {
 218        .attr = { .name = "pid", .mode = 0444},
 219        .show = pid_show,
 220};
 221
 222static ssize_t backend_show(struct device *dev,
 223                struct device_attribute *attr, char *buf)
 224{
 225        struct gendisk *disk = dev_to_disk(dev);
 226        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 227
 228        return sprintf(buf, "%s\n", nbd->backend ?: "");
 229}
 230
 231static const struct device_attribute backend_attr = {
 232        .attr = { .name = "backend", .mode = 0444},
 233        .show = backend_show,
 234};
 235
 236static void nbd_dev_remove(struct nbd_device *nbd)
 237{
 238        struct gendisk *disk = nbd->disk;
 239
 240        if (disk) {
 241                del_gendisk(disk);
 242                blk_cleanup_disk(disk);
 243                blk_mq_free_tag_set(&nbd->tag_set);
 244        }
 245
  246        /*
  247         * Keep this last, just before the nbd is freed, to make sure
  248         * that the disk and the related kobject have been completely
  249         * removed, so that duplicate creation of the same one is
  250         * avoided.
  251         */
 252        if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete)
 253                complete(nbd->destroy_complete);
 254
 255        kfree(nbd);
 256}
 257
 258static void nbd_put(struct nbd_device *nbd)
 259{
 260        if (refcount_dec_and_mutex_lock(&nbd->refs,
 261                                        &nbd_index_mutex)) {
 262                idr_remove(&nbd_index_idr, nbd->index);
 263                nbd_dev_remove(nbd);
 264                mutex_unlock(&nbd_index_mutex);
 265        }
 266}
 267
 268static int nbd_disconnected(struct nbd_config *config)
 269{
 270        return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
 271                test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
 272}
 273
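     /*
      * Mark a connection dead: shut its socket down, optionally schedule a
      * netlink dead-link notification, and flag the device as disconnected
      * when the last live connection goes away after a user-requested
      * disconnect.
      */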
 274static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 275                                int notify)
 276{
 277        if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
 278                struct link_dead_args *args;
 279                args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
 280                if (args) {
 281                        INIT_WORK(&args->work, nbd_dead_link_work);
 282                        args->index = nbd->index;
 283                        queue_work(system_wq, &args->work);
 284                }
 285        }
 286        if (!nsock->dead) {
 287                kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
 288                if (atomic_dec_return(&nbd->config->live_connections) == 0) {
 289                        if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
 290                                               &nbd->config->runtime_flags)) {
 291                                set_bit(NBD_RT_DISCONNECTED,
 292                                        &nbd->config->runtime_flags);
 293                                dev_info(nbd_to_dev(nbd),
 294                                        "Disconnected due to user request.\n");
 295                        }
 296                }
 297        }
 298        nsock->dead = true;
 299        nsock->pending = NULL;
 300        nsock->sent = 0;
 301}
 302
 303static void nbd_size_clear(struct nbd_device *nbd)
 304{
 305        if (nbd->config->bytesize) {
 306                set_capacity(nbd->disk, 0);
 307                kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 308        }
 309}
 310
 311static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
 312                loff_t blksize)
 313{
 314        if (!blksize)
 315                blksize = NBD_DEF_BLKSIZE;
 316        if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize))
 317                return -EINVAL;
 318
 319        nbd->config->bytesize = bytesize;
 320        nbd->config->blksize = blksize;
 321
 322        if (!nbd->task_recv)
 323                return 0;
 324
 325        if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
 326                nbd->disk->queue->limits.discard_granularity = blksize;
 327                nbd->disk->queue->limits.discard_alignment = blksize;
 328                blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
 329        }
 330        blk_queue_logical_block_size(nbd->disk->queue, blksize);
 331        blk_queue_physical_block_size(nbd->disk->queue, blksize);
 332
 333        if (max_part)
 334                set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
 335        if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
 336                kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 337        return 0;
 338}
 339
 340static void nbd_complete_rq(struct request *req)
 341{
 342        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 343
 344        dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
 345                cmd->status ? "failed" : "done");
 346
 347        blk_mq_end_request(req, cmd->status);
 348}
 349
  350/*
  351 * Forcibly shut down the sockets, causing all listeners to error out
  352 */
 353static void sock_shutdown(struct nbd_device *nbd)
 354{
 355        struct nbd_config *config = nbd->config;
 356        int i;
 357
 358        if (config->num_connections == 0)
 359                return;
 360        if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 361                return;
 362
 363        for (i = 0; i < config->num_connections; i++) {
 364                struct nbd_sock *nsock = config->socks[i];
 365                mutex_lock(&nsock->tx_lock);
 366                nbd_mark_nsock_dead(nbd, nsock, 0);
 367                mutex_unlock(&nsock->tx_lock);
 368        }
 369        dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
 370}
 371
 372static u32 req_to_nbd_cmd_type(struct request *req)
 373{
 374        switch (req_op(req)) {
 375        case REQ_OP_DISCARD:
 376                return NBD_CMD_TRIM;
 377        case REQ_OP_FLUSH:
 378                return NBD_CMD_FLUSH;
 379        case REQ_OP_WRITE:
 380                return NBD_CMD_WRITE;
 381        case REQ_OP_READ:
 382                return NBD_CMD_READ;
 383        default:
 384                return U32_MAX;
 385        }
 386}
 387
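     /*
      * blk-mq timeout handler. Where possible the request is requeued so
      * it can be retried on another (or a reconnected) socket; if
      * userspace disabled the timeout the timer is simply reset; otherwise
      * the sockets are shut down and the request fails with an I/O error.
      */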
 388static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 389                                                 bool reserved)
 390{
 391        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 392        struct nbd_device *nbd = cmd->nbd;
 393        struct nbd_config *config;
 394
 395        if (!mutex_trylock(&cmd->lock))
 396                return BLK_EH_RESET_TIMER;
 397
 398        if (!refcount_inc_not_zero(&nbd->config_refs)) {
 399                cmd->status = BLK_STS_TIMEOUT;
 400                mutex_unlock(&cmd->lock);
 401                goto done;
 402        }
 403        config = nbd->config;
 404
 405        if (config->num_connections > 1 ||
 406            (config->num_connections == 1 && nbd->tag_set.timeout)) {
 407                dev_err_ratelimited(nbd_to_dev(nbd),
 408                                    "Connection timed out, retrying (%d/%d alive)\n",
 409                                    atomic_read(&config->live_connections),
 410                                    config->num_connections);
  411                /*
  412                 * Hooray, we have more connections: requeue this IO and the
  413                 * submit path will put it on a live connection. If only one
  414                 * connection is configured, the submit path will wait until
  415                 * a new connection is reconfigured or the dead timeout expires.
  416                 */
 417                if (config->socks) {
 418                        if (cmd->index < config->num_connections) {
 419                                struct nbd_sock *nsock =
 420                                        config->socks[cmd->index];
 421                                mutex_lock(&nsock->tx_lock);
  422                                /* We can have multiple outstanding requests, and
  423                                 * we don't want to mark the nsock dead if we've
  424                                 * already reconnected with a new socket, so
  425                                 * only mark it dead if it's the same socket this
  426                                 * request was sent out on.
  427                                 */
 428                                if (cmd->cookie == nsock->cookie)
 429                                        nbd_mark_nsock_dead(nbd, nsock, 1);
 430                                mutex_unlock(&nsock->tx_lock);
 431                        }
 432                        mutex_unlock(&cmd->lock);
 433                        nbd_requeue_cmd(cmd);
 434                        nbd_config_put(nbd);
 435                        return BLK_EH_DONE;
 436                }
 437        }
 438
 439        if (!nbd->tag_set.timeout) {
 440                /*
 441                 * Userspace sets timeout=0 to disable socket disconnection,
 442                 * so just warn and reset the timer.
 443                 */
 444                struct nbd_sock *nsock = config->socks[cmd->index];
 445                cmd->retries++;
 446                dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
 447                        req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
 448                        (unsigned long long)blk_rq_pos(req) << 9,
 449                        blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
 450
 451                mutex_lock(&nsock->tx_lock);
 452                if (cmd->cookie != nsock->cookie) {
 453                        nbd_requeue_cmd(cmd);
 454                        mutex_unlock(&nsock->tx_lock);
 455                        mutex_unlock(&cmd->lock);
 456                        nbd_config_put(nbd);
 457                        return BLK_EH_DONE;
 458                }
 459                mutex_unlock(&nsock->tx_lock);
 460                mutex_unlock(&cmd->lock);
 461                nbd_config_put(nbd);
 462                return BLK_EH_RESET_TIMER;
 463        }
 464
 465        dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
 466        set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
 467        cmd->status = BLK_STS_IOERR;
 468        mutex_unlock(&cmd->lock);
 469        sock_shutdown(nbd);
 470        nbd_config_put(nbd);
 471done:
 472        blk_mq_complete_request(req);
 473        return BLK_EH_DONE;
 474}
 475
 476/*
 477 *  Send or receive packet.
 478 */
 479static int sock_xmit(struct nbd_device *nbd, int index, int send,
 480                     struct iov_iter *iter, int msg_flags, int *sent)
 481{
 482        struct nbd_config *config = nbd->config;
 483        struct socket *sock = config->socks[index]->sock;
 484        int result;
 485        struct msghdr msg;
 486        unsigned int noreclaim_flag;
 487
 488        if (unlikely(!sock)) {
 489                dev_err_ratelimited(disk_to_dev(nbd->disk),
 490                        "Attempted %s on closed socket in sock_xmit\n",
 491                        (send ? "send" : "recv"));
 492                return -EINVAL;
 493        }
 494
 495        msg.msg_iter = *iter;
 496
 497        noreclaim_flag = memalloc_noreclaim_save();
 498        do {
 499                sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 500                msg.msg_name = NULL;
 501                msg.msg_namelen = 0;
 502                msg.msg_control = NULL;
 503                msg.msg_controllen = 0;
 504                msg.msg_flags = msg_flags | MSG_NOSIGNAL;
 505
 506                if (send)
 507                        result = sock_sendmsg(sock, &msg);
 508                else
 509                        result = sock_recvmsg(sock, &msg, msg.msg_flags);
 510
 511                if (result <= 0) {
 512                        if (result == 0)
 513                                result = -EPIPE; /* short read */
 514                        break;
 515                }
 516                if (sent)
 517                        *sent += result;
 518        } while (msg_data_left(&msg));
 519
 520        memalloc_noreclaim_restore(noreclaim_flag);
 521
 522        return result;
 523}
 524
 525/*
 526 * Different settings for sk->sk_sndtimeo can result in different return values
 527 * if there is a signal pending when we enter sendmsg, because reasons?
 528 */
 529static inline int was_interrupted(int result)
 530{
 531        return result == -ERESTARTSYS || result == -EINTR;
 532}
 533
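     /*
      * Returns 0 once the whole request has gone out, BLK_STS_RESOURCE if
      * a pending signal interrupted the send so it must be finished later,
      * -EAGAIN on a link error so the caller can resubmit the request on
      * another connection, or another negative errno for invalid requests.
      */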
 534/* always call with the tx_lock held */
 535static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 536{
 537        struct request *req = blk_mq_rq_from_pdu(cmd);
 538        struct nbd_config *config = nbd->config;
 539        struct nbd_sock *nsock = config->socks[index];
 540        int result;
 541        struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
 542        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
 543        struct iov_iter from;
 544        unsigned long size = blk_rq_bytes(req);
 545        struct bio *bio;
 546        u64 handle;
 547        u32 type;
 548        u32 nbd_cmd_flags = 0;
 549        int sent = nsock->sent, skip = 0;
 550
 551        iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
 552
 553        type = req_to_nbd_cmd_type(req);
 554        if (type == U32_MAX)
 555                return -EIO;
 556
 557        if (rq_data_dir(req) == WRITE &&
 558            (config->flags & NBD_FLAG_READ_ONLY)) {
 559                dev_err_ratelimited(disk_to_dev(nbd->disk),
 560                                    "Write on read-only\n");
 561                return -EIO;
 562        }
 563
 564        if (req->cmd_flags & REQ_FUA)
 565                nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
 566
  567        /* We did a partial send previously. If at least the whole request
  568         * header was sent, skip it and send the remaining pages; otherwise
  569         * resume sending the header from where the previous send left off.
  570         */
 571        if (sent) {
 572                if (sent >= sizeof(request)) {
 573                        skip = sent - sizeof(request);
 574
 575                        /* initialize handle for tracing purposes */
 576                        handle = nbd_cmd_handle(cmd);
 577
 578                        goto send_pages;
 579                }
 580                iov_iter_advance(&from, sent);
 581        } else {
 582                cmd->cmd_cookie++;
 583        }
 584        cmd->index = index;
 585        cmd->cookie = nsock->cookie;
 586        cmd->retries = 0;
 587        request.type = htonl(type | nbd_cmd_flags);
 588        if (type != NBD_CMD_FLUSH) {
 589                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 590                request.len = htonl(size);
 591        }
 592        handle = nbd_cmd_handle(cmd);
 593        memcpy(request.handle, &handle, sizeof(handle));
 594
 595        trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
 596
 597        dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
 598                req, nbdcmd_to_ascii(type),
 599                (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 600        result = sock_xmit(nbd, index, 1, &from,
 601                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
 602        trace_nbd_header_sent(req, handle);
 603        if (result <= 0) {
 604                if (was_interrupted(result)) {
  605                        /* If we haven't sent anything we can just return BUSY;
  606                         * however, if we have sent something we need to make
  607                         * sure we only allow this req to be sent until we are
  608                         * completely done.
  609                         */
 610                        if (sent) {
 611                                nsock->pending = req;
 612                                nsock->sent = sent;
 613                        }
 614                        set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 615                        return BLK_STS_RESOURCE;
 616                }
 617                dev_err_ratelimited(disk_to_dev(nbd->disk),
 618                        "Send control failed (result %d)\n", result);
 619                return -EAGAIN;
 620        }
 621send_pages:
 622        if (type != NBD_CMD_WRITE)
 623                goto out;
 624
 625        bio = req->bio;
 626        while (bio) {
 627                struct bio *next = bio->bi_next;
 628                struct bvec_iter iter;
 629                struct bio_vec bvec;
 630
 631                bio_for_each_segment(bvec, bio, iter) {
 632                        bool is_last = !next && bio_iter_last(bvec, iter);
 633                        int flags = is_last ? 0 : MSG_MORE;
 634
 635                        dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 636                                req, bvec.bv_len);
 637                        iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
 638                        if (skip) {
 639                                if (skip >= iov_iter_count(&from)) {
 640                                        skip -= iov_iter_count(&from);
 641                                        continue;
 642                                }
 643                                iov_iter_advance(&from, skip);
 644                                skip = 0;
 645                        }
 646                        result = sock_xmit(nbd, index, 1, &from, flags, &sent);
 647                        if (result <= 0) {
 648                                if (was_interrupted(result)) {
 649                                        /* We've already sent the header, we
 650                                         * have no choice but to set pending and
 651                                         * return BUSY.
 652                                         */
 653                                        nsock->pending = req;
 654                                        nsock->sent = sent;
 655                                        set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 656                                        return BLK_STS_RESOURCE;
 657                                }
 658                                dev_err(disk_to_dev(nbd->disk),
 659                                        "Send data failed (result %d)\n",
 660                                        result);
 661                                return -EAGAIN;
 662                        }
 663                        /*
 664                         * The completion might already have come in,
 665                         * so break for the last one instead of letting
 666                         * the iterator do it. This prevents use-after-free
 667                         * of the bio.
 668                         */
 669                        if (is_last)
 670                                break;
 671                }
 672                bio = next;
 673        }
 674out:
 675        trace_nbd_payload_sent(req, handle);
 676        nsock->pending = NULL;
 677        nsock->sent = 0;
 678        return 0;
 679}
 680
  681/* An ERR_PTR returned = something went wrong, inform userspace */
 682static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 683{
 684        struct nbd_config *config = nbd->config;
 685        int result;
 686        struct nbd_reply reply;
 687        struct nbd_cmd *cmd;
 688        struct request *req = NULL;
 689        u64 handle;
 690        u16 hwq;
 691        u32 tag;
 692        struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
 693        struct iov_iter to;
 694        int ret = 0;
 695
 696        reply.magic = 0;
 697        iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
 698        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 699        if (result <= 0) {
 700                if (!nbd_disconnected(config))
 701                        dev_err(disk_to_dev(nbd->disk),
 702                                "Receive control failed (result %d)\n", result);
 703                return ERR_PTR(result);
 704        }
 705
 706        if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
 707                dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
 708                                (unsigned long)ntohl(reply.magic));
 709                return ERR_PTR(-EPROTO);
 710        }
 711
 712        memcpy(&handle, reply.handle, sizeof(handle));
 713        tag = nbd_handle_to_tag(handle);
 714        hwq = blk_mq_unique_tag_to_hwq(tag);
 715        if (hwq < nbd->tag_set.nr_hw_queues)
 716                req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
 717                                       blk_mq_unique_tag_to_tag(tag));
 718        if (!req || !blk_mq_request_started(req)) {
 719                dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
 720                        tag, req);
 721                return ERR_PTR(-ENOENT);
 722        }
 723        trace_nbd_header_received(req, handle);
 724        cmd = blk_mq_rq_to_pdu(req);
 725
 726        mutex_lock(&cmd->lock);
 727        if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
 728                dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
 729                        req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
 730                ret = -ENOENT;
 731                goto out;
 732        }
 733        if (cmd->status != BLK_STS_OK) {
 734                dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
 735                        req);
 736                ret = -ENOENT;
 737                goto out;
 738        }
 739        if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
 740                dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
 741                        req);
 742                ret = -ENOENT;
 743                goto out;
 744        }
 745        if (ntohl(reply.error)) {
 746                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 747                        ntohl(reply.error));
 748                cmd->status = BLK_STS_IOERR;
 749                goto out;
 750        }
 751
 752        dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
 753        if (rq_data_dir(req) != WRITE) {
 754                struct req_iterator iter;
 755                struct bio_vec bvec;
 756
 757                rq_for_each_segment(bvec, req, iter) {
 758                        iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
 759                        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 760                        if (result <= 0) {
 761                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 762                                        result);
 763                                /*
 764                                 * If we've disconnected, we need to make sure we
 765                                 * complete this request, otherwise error out
 766                                 * and let the timeout stuff handle resubmitting
 767                                 * this request onto another connection.
 768                                 */
 769                                if (nbd_disconnected(config)) {
 770                                        cmd->status = BLK_STS_IOERR;
 771                                        goto out;
 772                                }
 773                                ret = -EIO;
 774                                goto out;
 775                        }
 776                        dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
 777                                req, bvec.bv_len);
 778                }
 779        }
 780out:
 781        trace_nbd_payload_received(req, handle);
 782        mutex_unlock(&cmd->lock);
 783        return ret ? ERR_PTR(ret) : cmd;
 784}
 785
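     /*
      * Per-connection receive worker: complete replies read from the
      * socket until nbd_read_stat() fails, then mark the connection dead
      * and drop the config reference taken when the worker was queued.
      */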
 786static void recv_work(struct work_struct *work)
 787{
 788        struct recv_thread_args *args = container_of(work,
 789                                                     struct recv_thread_args,
 790                                                     work);
 791        struct nbd_device *nbd = args->nbd;
 792        struct nbd_config *config = nbd->config;
 793        struct nbd_cmd *cmd;
 794        struct request *rq;
 795
 796        while (1) {
 797                cmd = nbd_read_stat(nbd, args->index);
 798                if (IS_ERR(cmd)) {
 799                        struct nbd_sock *nsock = config->socks[args->index];
 800
 801                        mutex_lock(&nsock->tx_lock);
 802                        nbd_mark_nsock_dead(nbd, nsock, 1);
 803                        mutex_unlock(&nsock->tx_lock);
 804                        break;
 805                }
 806
 807                rq = blk_mq_rq_from_pdu(cmd);
 808                if (likely(!blk_should_fake_timeout(rq->q)))
 809                        blk_mq_complete_request(rq);
 810        }
 811        nbd_config_put(nbd);
 812        atomic_dec(&config->recv_threads);
 813        wake_up(&config->recv_wq);
 814        kfree(args);
 815}
 816
 817static bool nbd_clear_req(struct request *req, void *data, bool reserved)
 818{
 819        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 820
 821        /* don't abort one completed request */
 822        if (blk_mq_request_completed(req))
 823                return true;
 824
 825        mutex_lock(&cmd->lock);
 826        cmd->status = BLK_STS_IOERR;
 827        mutex_unlock(&cmd->lock);
 828
 829        blk_mq_complete_request(req);
 830        return true;
 831}
 832
 833static void nbd_clear_que(struct nbd_device *nbd)
 834{
 835        blk_mq_quiesce_queue(nbd->disk->queue);
 836        blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 837        blk_mq_unquiesce_queue(nbd->disk->queue);
 838        dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 839}
 840
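     /*
      * Pick another live connection to retry a request on once the socket
      * at @index has died. Returns the new index, or -1 if no live
      * connection is left or a disconnect is in progress.
      */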
 841static int find_fallback(struct nbd_device *nbd, int index)
 842{
 843        struct nbd_config *config = nbd->config;
 844        int new_index = -1;
 845        struct nbd_sock *nsock = config->socks[index];
 846        int fallback = nsock->fallback_index;
 847
 848        if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 849                return new_index;
 850
 851        if (config->num_connections <= 1) {
 852                dev_err_ratelimited(disk_to_dev(nbd->disk),
 853                                    "Dead connection, failed to find a fallback\n");
 854                return new_index;
 855        }
 856
 857        if (fallback >= 0 && fallback < config->num_connections &&
 858            !config->socks[fallback]->dead)
 859                return fallback;
 860
 861        if (nsock->fallback_index < 0 ||
 862            nsock->fallback_index >= config->num_connections ||
 863            config->socks[nsock->fallback_index]->dead) {
 864                int i;
 865                for (i = 0; i < config->num_connections; i++) {
 866                        if (i == index)
 867                                continue;
 868                        if (!config->socks[i]->dead) {
 869                                new_index = i;
 870                                break;
 871                        }
 872                }
 873                nsock->fallback_index = new_index;
 874                if (new_index < 0) {
 875                        dev_err_ratelimited(disk_to_dev(nbd->disk),
 876                                            "Dead connection, failed to find a fallback\n");
 877                        return new_index;
 878                }
 879        }
 880        new_index = nsock->fallback_index;
 881        return new_index;
 882}
 883
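     /*
      * If a dead connection timeout is configured, wait up to that long
      * for any connection to come back before the caller fails the request.
      */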
 884static int wait_for_reconnect(struct nbd_device *nbd)
 885{
 886        struct nbd_config *config = nbd->config;
 887        if (!config->dead_conn_timeout)
 888                return 0;
 889        if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 890                return 0;
 891        return wait_event_timeout(config->conn_wait,
 892                                  atomic_read(&config->live_connections) > 0,
 893                                  config->dead_conn_timeout) > 0;
 894}
 895
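     /*
      * Grab a config reference and send the command on the socket backing
      * this hardware queue, falling back to another live connection (or
      * waiting for a reconnect) if that socket has died.
      */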
 896static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 897{
 898        struct request *req = blk_mq_rq_from_pdu(cmd);
 899        struct nbd_device *nbd = cmd->nbd;
 900        struct nbd_config *config;
 901        struct nbd_sock *nsock;
 902        int ret;
 903
 904        if (!refcount_inc_not_zero(&nbd->config_refs)) {
 905                dev_err_ratelimited(disk_to_dev(nbd->disk),
 906                                    "Socks array is empty\n");
 907                blk_mq_start_request(req);
 908                return -EINVAL;
 909        }
 910        config = nbd->config;
 911
 912        if (index >= config->num_connections) {
 913                dev_err_ratelimited(disk_to_dev(nbd->disk),
 914                                    "Attempted send on invalid socket\n");
 915                nbd_config_put(nbd);
 916                blk_mq_start_request(req);
 917                return -EINVAL;
 918        }
 919        cmd->status = BLK_STS_OK;
 920again:
 921        nsock = config->socks[index];
 922        mutex_lock(&nsock->tx_lock);
 923        if (nsock->dead) {
 924                int old_index = index;
 925                index = find_fallback(nbd, index);
 926                mutex_unlock(&nsock->tx_lock);
 927                if (index < 0) {
 928                        if (wait_for_reconnect(nbd)) {
 929                                index = old_index;
 930                                goto again;
 931                        }
  932                        /* All the sockets should already be down at this point;
  933                         * we just want to make sure that DISCONNECTED is set so
  934                         * any requests that come in that were queued waiting
  935                         * for the reconnect timer don't trigger the timer again
  936                         * and instead just error out.
  937                         */
 938                        sock_shutdown(nbd);
 939                        nbd_config_put(nbd);
 940                        blk_mq_start_request(req);
 941                        return -EIO;
 942                }
 943                goto again;
 944        }
 945
 946        /* Handle the case that we have a pending request that was partially
 947         * transmitted that _has_ to be serviced first.  We need to call requeue
 948         * here so that it gets put _after_ the request that is already on the
 949         * dispatch list.
 950         */
 951        blk_mq_start_request(req);
 952        if (unlikely(nsock->pending && nsock->pending != req)) {
 953                nbd_requeue_cmd(cmd);
 954                ret = 0;
 955                goto out;
 956        }
 957        /*
 958         * Some failures are related to the link going down, so anything that
 959         * returns EAGAIN can be retried on a different socket.
 960         */
 961        ret = nbd_send_cmd(nbd, cmd, index);
 962        if (ret == -EAGAIN) {
 963                dev_err_ratelimited(disk_to_dev(nbd->disk),
 964                                    "Request send failed, requeueing\n");
 965                nbd_mark_nsock_dead(nbd, nsock, 1);
 966                nbd_requeue_cmd(cmd);
 967                ret = 0;
 968        }
 969out:
 970        mutex_unlock(&nsock->tx_lock);
 971        nbd_config_put(nbd);
 972        return ret;
 973}
 974
 975static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 976                        const struct blk_mq_queue_data *bd)
 977{
 978        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 979        int ret;
 980
  981        /*
  982         * Since we look at the bios to send the request over the network we
  983         * need to make sure the completion work doesn't mark this request done
  984         * before we are done doing our send.  This keeps us from dereferencing
  985         * freed data if we have particularly fast completions (i.e. we get the
  986         * completion before we exit sock_xmit on the last bvec) or in the case
  987         * that the server is misbehaving (or there was an error) before we're
  988         * done sending everything over the wire.
  989         */
 990        mutex_lock(&cmd->lock);
 991        clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
 992
 993        /* We can be called directly from the user space process, which means we
 994         * could possibly have signals pending so our sendmsg will fail.  In
 995         * this case we need to return that we are busy, otherwise error out as
 996         * appropriate.
 997         */
 998        ret = nbd_handle_cmd(cmd, hctx->queue_num);
 999        if (ret < 0)
1000                ret = BLK_STS_IOERR;
1001        else if (!ret)
1002                ret = BLK_STS_OK;
1003        mutex_unlock(&cmd->lock);
1004
1005        return ret;
1006}
1007
1008static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
1009                                     int *err)
1010{
1011        struct socket *sock;
1012
1013        *err = 0;
1014        sock = sockfd_lookup(fd, err);
1015        if (!sock)
1016                return NULL;
1017
1018        if (sock->ops->shutdown == sock_no_shutdown) {
1019                dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
1020                *err = -EINVAL;
1021                sockfd_put(sock);
1022                return NULL;
1023        }
1024
1025        return sock;
1026}
1027
1028static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
1029                          bool netlink)
1030{
1031        struct nbd_config *config = nbd->config;
1032        struct socket *sock;
1033        struct nbd_sock **socks;
1034        struct nbd_sock *nsock;
1035        int err;
1036
1037        sock = nbd_get_socket(nbd, arg, &err);
1038        if (!sock)
1039                return err;
1040
1041        /*
1042         * We need to make sure we don't get any errant requests while we're
1043         * reallocating the ->socks array.
1044         */
1045        blk_mq_freeze_queue(nbd->disk->queue);
1046
1047        if (!netlink && !nbd->task_setup &&
1048            !test_bit(NBD_RT_BOUND, &config->runtime_flags))
1049                nbd->task_setup = current;
1050
1051        if (!netlink &&
1052            (nbd->task_setup != current ||
1053             test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
1054                dev_err(disk_to_dev(nbd->disk),
1055                        "Device being setup by another task");
1056                err = -EBUSY;
1057                goto put_socket;
1058        }
1059
1060        nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
1061        if (!nsock) {
1062                err = -ENOMEM;
1063                goto put_socket;
1064        }
1065
1066        socks = krealloc(config->socks, (config->num_connections + 1) *
1067                         sizeof(struct nbd_sock *), GFP_KERNEL);
1068        if (!socks) {
1069                kfree(nsock);
1070                err = -ENOMEM;
1071                goto put_socket;
1072        }
1073
1074        config->socks = socks;
1075
1076        nsock->fallback_index = -1;
1077        nsock->dead = false;
1078        mutex_init(&nsock->tx_lock);
1079        nsock->sock = sock;
1080        nsock->pending = NULL;
1081        nsock->sent = 0;
1082        nsock->cookie = 0;
1083        socks[config->num_connections++] = nsock;
1084        atomic_inc(&config->live_connections);
1085        blk_mq_unfreeze_queue(nbd->disk->queue);
1086
1087        return 0;
1088
1089put_socket:
1090        blk_mq_unfreeze_queue(nbd->disk->queue);
1091        sockfd_put(sock);
1092        return err;
1093}
1094
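     /*
      * Swap a fresh socket in for the first dead connection found, bump
      * its cookie so stale requests can be recognized, and start a new
      * receive worker for it. Returns -ENOSPC if no connection is dead.
      */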
1095static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
1096{
1097        struct nbd_config *config = nbd->config;
1098        struct socket *sock, *old;
1099        struct recv_thread_args *args;
1100        int i;
1101        int err;
1102
1103        sock = nbd_get_socket(nbd, arg, &err);
1104        if (!sock)
1105                return err;
1106
1107        args = kzalloc(sizeof(*args), GFP_KERNEL);
1108        if (!args) {
1109                sockfd_put(sock);
1110                return -ENOMEM;
1111        }
1112
1113        for (i = 0; i < config->num_connections; i++) {
1114                struct nbd_sock *nsock = config->socks[i];
1115
1116                if (!nsock->dead)
1117                        continue;
1118
1119                mutex_lock(&nsock->tx_lock);
1120                if (!nsock->dead) {
1121                        mutex_unlock(&nsock->tx_lock);
1122                        continue;
1123                }
1124                sk_set_memalloc(sock->sk);
1125                if (nbd->tag_set.timeout)
1126                        sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
1127                atomic_inc(&config->recv_threads);
1128                refcount_inc(&nbd->config_refs);
1129                old = nsock->sock;
1130                nsock->fallback_index = -1;
1131                nsock->sock = sock;
1132                nsock->dead = false;
1133                INIT_WORK(&args->work, recv_work);
1134                args->index = i;
1135                args->nbd = nbd;
1136                nsock->cookie++;
1137                mutex_unlock(&nsock->tx_lock);
1138                sockfd_put(old);
1139
1140                clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
1141
 1142                /* We take the tx_lock in an error path in recv_work, so we
 1143                 * need to queue_work outside of the tx_lock.
 1144                 */
1145                queue_work(nbd->recv_workq, &args->work);
1146
1147                atomic_inc(&config->live_connections);
1148                wake_up(&config->conn_wait);
1149                return 0;
1150        }
1151        sockfd_put(sock);
1152        kfree(args);
1153        return -ENOSPC;
1154}
1155
1156static void nbd_bdev_reset(struct block_device *bdev)
1157{
1158        if (bdev->bd_openers > 1)
1159                return;
1160        set_capacity(bdev->bd_disk, 0);
1161}
1162
1163static void nbd_parse_flags(struct nbd_device *nbd)
1164{
1165        struct nbd_config *config = nbd->config;
1166        if (config->flags & NBD_FLAG_READ_ONLY)
1167                set_disk_ro(nbd->disk, true);
1168        else
1169                set_disk_ro(nbd->disk, false);
1170        if (config->flags & NBD_FLAG_SEND_TRIM)
1171                blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1172        if (config->flags & NBD_FLAG_SEND_FLUSH) {
1173                if (config->flags & NBD_FLAG_SEND_FUA)
1174                        blk_queue_write_cache(nbd->disk->queue, true, true);
1175                else
1176                        blk_queue_write_cache(nbd->disk->queue, true, false);
1177        }
1178        else
1179                blk_queue_write_cache(nbd->disk->queue, false, false);
1180}
1181
1182static void send_disconnects(struct nbd_device *nbd)
1183{
1184        struct nbd_config *config = nbd->config;
1185        struct nbd_request request = {
1186                .magic = htonl(NBD_REQUEST_MAGIC),
1187                .type = htonl(NBD_CMD_DISC),
1188        };
1189        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1190        struct iov_iter from;
1191        int i, ret;
1192
1193        for (i = 0; i < config->num_connections; i++) {
1194                struct nbd_sock *nsock = config->socks[i];
1195
1196                iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
1197                mutex_lock(&nsock->tx_lock);
1198                ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1199                if (ret <= 0)
1200                        dev_err(disk_to_dev(nbd->disk),
1201                                "Send disconnect failed %d\n", ret);
1202                mutex_unlock(&nsock->tx_lock);
1203        }
1204}
1205
1206static int nbd_disconnect(struct nbd_device *nbd)
1207{
1208        struct nbd_config *config = nbd->config;
1209
1210        dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1211        set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
1212        set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
1213        send_disconnects(nbd);
1214        return 0;
1215}
1216
1217static void nbd_clear_sock(struct nbd_device *nbd)
1218{
1219        sock_shutdown(nbd);
1220        nbd_clear_que(nbd);
1221        nbd->task_setup = NULL;
1222}
1223
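     /*
      * Drop a reference on the current configuration. The final put tears
      * everything down: sysfs attributes, sockets, the receive workqueue
      * and the config itself, and releases the module reference taken in
      * nbd_alloc_config().
      */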
1224static void nbd_config_put(struct nbd_device *nbd)
1225{
1226        if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1227                                        &nbd->config_lock)) {
1228                struct nbd_config *config = nbd->config;
1229                nbd_dev_dbg_close(nbd);
1230                nbd_size_clear(nbd);
1231                if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
1232                                       &config->runtime_flags))
1233                        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1234                nbd->task_recv = NULL;
1235                if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE,
1236                                       &config->runtime_flags)) {
1237                        device_remove_file(disk_to_dev(nbd->disk), &backend_attr);
1238                        kfree(nbd->backend);
1239                        nbd->backend = NULL;
1240                }
1241                nbd_clear_sock(nbd);
1242                if (config->num_connections) {
1243                        int i;
1244                        for (i = 0; i < config->num_connections; i++) {
1245                                sockfd_put(config->socks[i]->sock);
1246                                kfree(config->socks[i]);
1247                        }
1248                        kfree(config->socks);
1249                }
1250                kfree(nbd->config);
1251                nbd->config = NULL;
1252
1253                if (nbd->recv_workq)
1254                        destroy_workqueue(nbd->recv_workq);
1255                nbd->recv_workq = NULL;
1256
1257                nbd->tag_set.timeout = 0;
1258                nbd->disk->queue->limits.discard_granularity = 0;
1259                nbd->disk->queue->limits.discard_alignment = 0;
1260                blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
1261                blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1262
1263                mutex_unlock(&nbd->config_lock);
1264                nbd_put(nbd);
1265                module_put(THIS_MODULE);
1266        }
1267}
1268
1269static int nbd_start_device(struct nbd_device *nbd)
1270{
1271        struct nbd_config *config = nbd->config;
1272        int num_connections = config->num_connections;
1273        int error = 0, i;
1274
1275        if (nbd->task_recv)
1276                return -EBUSY;
1277        if (!config->socks)
1278                return -EINVAL;
1279        if (num_connections > 1 &&
1280            !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1281                dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1282                return -EINVAL;
1283        }
1284
1285        nbd->recv_workq = alloc_workqueue("knbd%d-recv",
1286                                          WQ_MEM_RECLAIM | WQ_HIGHPRI |
1287                                          WQ_UNBOUND, 0, nbd->index);
1288        if (!nbd->recv_workq) {
1289                dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
1290                return -ENOMEM;
1291        }
1292
1293        blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
1294        nbd->task_recv = current;
1295
1296        nbd_parse_flags(nbd);
1297
1298        error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1299        if (error) {
1300                dev_err(disk_to_dev(nbd->disk), "device_create_file failed for pid!\n");
1301                return error;
1302        }
1303        set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
1304
1305        nbd_dev_dbg_init(nbd);
1306        for (i = 0; i < num_connections; i++) {
1307                struct recv_thread_args *args;
1308
1309                args = kzalloc(sizeof(*args), GFP_KERNEL);
1310                if (!args) {
1311                        sock_shutdown(nbd);
 1312                        /*
 1313                         * If num_connections is m (m > 2) and the first n
 1314                         * kzallocs (1 < n < m) succeeded but kzalloc n + 1
 1315                         * failed, we still have n recv threads running. So,
 1316                         * add flush_workqueue here to prevent those threads
 1317                         * from dropping the last config_refs and trying to
 1318                         * destroy the workqueue from inside the workqueue.
 1319                         */
1320                        if (i)
1321                                flush_workqueue(nbd->recv_workq);
1322                        return -ENOMEM;
1323                }
1324                sk_set_memalloc(config->socks[i]->sock->sk);
1325                if (nbd->tag_set.timeout)
1326                        config->socks[i]->sock->sk->sk_sndtimeo =
1327                                nbd->tag_set.timeout;
1328                atomic_inc(&config->recv_threads);
1329                refcount_inc(&nbd->config_refs);
1330                INIT_WORK(&args->work, recv_work);
1331                args->nbd = nbd;
1332                args->index = i;
1333                queue_work(nbd->recv_workq, &args->work);
1334        }
1335        return nbd_set_size(nbd, config->bytesize, config->blksize);
1336}
1337
1338static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
1339{
1340        struct nbd_config *config = nbd->config;
1341        int ret;
1342
1343        ret = nbd_start_device(nbd);
1344        if (ret)
1345                return ret;
1346
1347        if (max_part)
1348                set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
1349        mutex_unlock(&nbd->config_lock);
1350        ret = wait_event_interruptible(config->recv_wq,
1351                                         atomic_read(&config->recv_threads) == 0);
1352        if (ret)
1353                sock_shutdown(nbd);
1354        flush_workqueue(nbd->recv_workq);
1355
1356        mutex_lock(&nbd->config_lock);
1357        nbd_bdev_reset(bdev);
1358        /* user requested, ignore socket errors */
1359        if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
1360                ret = 0;
1361        if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
1362                ret = -ETIMEDOUT;
1363        return ret;
1364}
1365
1366static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1367                                 struct block_device *bdev)
1368{
1369        sock_shutdown(nbd);
1370        __invalidate_device(bdev, true);
1371        nbd_bdev_reset(bdev);
1372        if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
1373                               &nbd->config->runtime_flags))
1374                nbd_config_put(nbd);
1375}
1376
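     /*
      * A timeout of 0 keeps the default 30 second request timer but tells
      * nbd_xmit_timeout() never to kill the connection; any other value is
      * used as the request timeout and, on connect, as the socket send
      * timeout.
      */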
1377static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
1378{
1379        nbd->tag_set.timeout = timeout * HZ;
1380        if (timeout)
1381                blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1382        else
1383                blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
1384}
1385
1386/* Must be called with config_lock held */
1387static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1388                       unsigned int cmd, unsigned long arg)
1389{
1390        struct nbd_config *config = nbd->config;
1391
1392        switch (cmd) {
1393        case NBD_DISCONNECT:
1394                return nbd_disconnect(nbd);
1395        case NBD_CLEAR_SOCK:
1396                nbd_clear_sock_ioctl(nbd, bdev);
1397                return 0;
1398        case NBD_SET_SOCK:
1399                return nbd_add_socket(nbd, arg, false);
1400        case NBD_SET_BLKSIZE:
1401                return nbd_set_size(nbd, config->bytesize, arg);
1402        case NBD_SET_SIZE:
1403                return nbd_set_size(nbd, arg, config->blksize);
1404        case NBD_SET_SIZE_BLOCKS:
1405                return nbd_set_size(nbd, arg * config->blksize,
1406                                    config->blksize);
1407        case NBD_SET_TIMEOUT:
1408                nbd_set_cmd_timeout(nbd, arg);
1409                return 0;
1410
1411        case NBD_SET_FLAGS:
1412                config->flags = arg;
1413                return 0;
1414        case NBD_DO_IT:
1415                return nbd_start_device_ioctl(nbd, bdev);
1416        case NBD_CLEAR_QUE:
1417                /*
1418                 * This is for compatibility only.  The queue is always cleared
1419                 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1420                 */
1421                return 0;
1422        case NBD_PRINT_DEBUG:
1423                /*
1424                 * For compatibility only, we no longer keep a list of
1425                 * outstanding requests.
1426                 */
1427                return 0;
1428        }
1429        return -ENOTTY;
1430}
1431
1432static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1433                     unsigned int cmd, unsigned long arg)
1434{
1435        struct nbd_device *nbd = bdev->bd_disk->private_data;
1436        struct nbd_config *config = nbd->config;
1437        int error = -EINVAL;
1438
1439        if (!capable(CAP_SYS_ADMIN))
1440                return -EPERM;
1441
 1442        /* The block layer will pass back some non-nbd ioctls in case we have
 1443         * special handling for them, but we don't, so just return an error.
 1444         */
1445        if (_IOC_TYPE(cmd) != 0xab)
1446                return -EINVAL;
1447
1448        mutex_lock(&nbd->config_lock);
1449
 1450        /* Don't allow ioctl operations on an nbd device that was created with
 1451         * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
 1452         */
1453        if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
1454            (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1455                error = __nbd_ioctl(bdev, nbd, cmd, arg);
1456        else
1457                dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1458        mutex_unlock(&nbd->config_lock);
1459        return error;
1460}
1461
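/*
 * Allocate a zeroed nbd_config and initialise its wait queues and
 * counters.  Also takes a module reference.
 */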
1462static struct nbd_config *nbd_alloc_config(void)
1463{
1464        struct nbd_config *config;
1465
1466        config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1467        if (!config)
1468                return NULL;
1469        atomic_set(&config->recv_threads, 0);
1470        init_waitqueue_head(&config->recv_wq);
1471        init_waitqueue_head(&config->conn_wait);
1472        config->blksize = NBD_DEF_BLKSIZE;
1473        atomic_set(&config->live_connections, 0);
1474        try_module_get(THIS_MODULE);
1475        return config;
1476}
1477
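/*
 * Open handler: take a device reference and a config reference,
 * allocating a fresh config on first open and requesting a partition
 * rescan when max_part is set.
 */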
1478static int nbd_open(struct block_device *bdev, fmode_t mode)
1479{
1480        struct nbd_device *nbd;
1481        int ret = 0;
1482
1483        mutex_lock(&nbd_index_mutex);
1484        nbd = bdev->bd_disk->private_data;
1485        if (!nbd) {
1486                ret = -ENXIO;
1487                goto out;
1488        }
1489        if (!refcount_inc_not_zero(&nbd->refs)) {
1490                ret = -ENXIO;
1491                goto out;
1492        }
1493        if (!refcount_inc_not_zero(&nbd->config_refs)) {
1494                struct nbd_config *config;
1495
1496                mutex_lock(&nbd->config_lock);
1497                if (refcount_inc_not_zero(&nbd->config_refs)) {
1498                        mutex_unlock(&nbd->config_lock);
1499                        goto out;
1500                }
1501                config = nbd->config = nbd_alloc_config();
1502                if (!config) {
1503                        ret = -ENOMEM;
1504                        mutex_unlock(&nbd->config_lock);
1505                        goto out;
1506                }
1507                refcount_set(&nbd->config_refs, 1);
1508                refcount_inc(&nbd->refs);
1509                mutex_unlock(&nbd->config_lock);
1510                if (max_part)
1511                        set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
1512        } else if (nbd_disconnected(nbd->config)) {
1513                if (max_part)
1514                        set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
1515        }
1516out:
1517        mutex_unlock(&nbd_index_mutex);
1518        return ret;
1519}
1520
1521static void nbd_release(struct gendisk *disk, fmode_t mode)
1522{
1523        struct nbd_device *nbd = disk->private_data;
1524
1525        if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
1526                        disk->part0->bd_openers == 0)
1527                nbd_disconnect_and_put(nbd);
1528
1529        nbd_config_put(nbd);
1530        nbd_put(nbd);
1531}
1532
1533static const struct block_device_operations nbd_fops =
1534{
1535        .owner =        THIS_MODULE,
1536        .open =         nbd_open,
1537        .release =      nbd_release,
1538        .ioctl =        nbd_ioctl,
1539        .compat_ioctl = nbd_ioctl,
1540};
1541
1542#if IS_ENABLED(CONFIG_DEBUG_FS)
1543
1544static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1545{
1546        struct nbd_device *nbd = s->private;
1547
1548        if (nbd->task_recv)
1549                seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
1550
1551        return 0;
1552}
1553
1554DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
1555
1556static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1557{
1558        struct nbd_device *nbd = s->private;
1559        u32 flags = nbd->config->flags;
1560
1561        seq_printf(s, "Hex: 0x%08x\n\n", flags);
1562
1563        seq_puts(s, "Known flags:\n");
1564
1565        if (flags & NBD_FLAG_HAS_FLAGS)
1566                seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1567        if (flags & NBD_FLAG_READ_ONLY)
1568                seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1569        if (flags & NBD_FLAG_SEND_FLUSH)
1570                seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1571        if (flags & NBD_FLAG_SEND_FUA)
1572                seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1573        if (flags & NBD_FLAG_SEND_TRIM)
1574                seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1575
1576        return 0;
1577}
1578
1579DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
1580
1581static int nbd_dev_dbg_init(struct nbd_device *nbd)
1582{
1583        struct dentry *dir;
1584        struct nbd_config *config = nbd->config;
1585
1586        if (!nbd_dbg_dir)
1587                return -EIO;
1588
1589        dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1590        if (!dir) {
1591                dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1592                        nbd_name(nbd));
1593                return -EIO;
1594        }
1595        config->dbg_dir = dir;
1596
1597        debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
1598        debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1599        debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1600        debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1601        debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
1602
1603        return 0;
1604}
1605
1606static void nbd_dev_dbg_close(struct nbd_device *nbd)
1607{
1608        debugfs_remove_recursive(nbd->config->dbg_dir);
1609}
1610
1611static int nbd_dbg_init(void)
1612{
1613        struct dentry *dbg_dir;
1614
1615        dbg_dir = debugfs_create_dir("nbd", NULL);
1616        if (!dbg_dir)
1617                return -EIO;
1618
1619        nbd_dbg_dir = dbg_dir;
1620
1621        return 0;
1622}
1623
1624static void nbd_dbg_close(void)
1625{
1626        debugfs_remove_recursive(nbd_dbg_dir);
1627}
1628
1629#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
1630
1631static int nbd_dev_dbg_init(struct nbd_device *nbd)
1632{
1633        return 0;
1634}
1635
1636static void nbd_dev_dbg_close(struct nbd_device *nbd)
1637{
1638}
1639
1640static int nbd_dbg_init(void)
1641{
1642        return 0;
1643}
1644
1645static void nbd_dbg_close(void)
1646{
1647}
1648
1649#endif
1650
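/* blk-mq init_request callback: set up the per-request nbd_cmd payload. */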
1651static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1652                            unsigned int hctx_idx, unsigned int numa_node)
1653{
1654        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1655        cmd->nbd = set->driver_data;
1656        cmd->flags = 0;
1657        mutex_init(&cmd->lock);
1658        return 0;
1659}
1660
1661static const struct blk_mq_ops nbd_mq_ops = {
1662        .queue_rq       = nbd_queue_rq,
1663        .complete       = nbd_complete_rq,
1664        .init_request   = nbd_init_request,
1665        .timeout        = nbd_xmit_timeout,
1666};
1667
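/*
 * Allocate an nbd_device together with its tag set and gendisk, insert it
 * into nbd_index_idr (at @index, or at the first free slot when @index is
 * negative) and add the disk.  Callers hold nbd_index_mutex.  Returns the
 * index used or a negative errno.
 */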
1668static int nbd_dev_add(int index)
1669{
1670        struct nbd_device *nbd;
1671        struct gendisk *disk;
1672        int err = -ENOMEM;
1673
1674        nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1675        if (!nbd)
1676                goto out;
1677
1678        nbd->tag_set.ops = &nbd_mq_ops;
1679        nbd->tag_set.nr_hw_queues = 1;
1680        nbd->tag_set.queue_depth = 128;
1681        nbd->tag_set.numa_node = NUMA_NO_NODE;
1682        nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1683        nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1684                BLK_MQ_F_BLOCKING;
1685        nbd->tag_set.driver_data = nbd;
1686        nbd->destroy_complete = NULL;
1687        nbd->backend = NULL;
1688
1689        err = blk_mq_alloc_tag_set(&nbd->tag_set);
1690        if (err)
1691                goto out_free_nbd;
1692
1693        if (index >= 0) {
1694                err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1695                                GFP_KERNEL);
1696                if (err == -ENOSPC)
1697                        err = -EEXIST;
1698        } else {
1699                err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1700                if (err >= 0)
1701                        index = err;
1702        }
1703        if (err < 0)
1704                goto out_free_tags;
1705        nbd->index = index;
1706
1707        disk = blk_mq_alloc_disk(&nbd->tag_set, NULL);
1708        if (IS_ERR(disk)) {
1709                err = PTR_ERR(disk);
1710                goto out_free_idr;
1711        }
1712        nbd->disk = disk;
1713
1714        /*
1715         * Tell the block layer that we are not a rotational device
1716         */
1717        blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1718        blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1719        disk->queue->limits.discard_granularity = 0;
1720        disk->queue->limits.discard_alignment = 0;
1721        blk_queue_max_discard_sectors(disk->queue, 0);
1722        blk_queue_max_segment_size(disk->queue, UINT_MAX);
1723        blk_queue_max_segments(disk->queue, USHRT_MAX);
1724        blk_queue_max_hw_sectors(disk->queue, 65536);
1725        disk->queue->limits.max_sectors = 256;
1726
1727        mutex_init(&nbd->config_lock);
1728        refcount_set(&nbd->config_refs, 0);
1729        refcount_set(&nbd->refs, 1);
1730        INIT_LIST_HEAD(&nbd->list);
1731        disk->major = NBD_MAJOR;
1732        disk->first_minor = index << part_shift;
1733        disk->minors = 1 << part_shift;
1734        disk->fops = &nbd_fops;
1735        disk->private_data = nbd;
1736        sprintf(disk->disk_name, "nbd%d", index);
1737        add_disk(disk);
1738        nbd_total_devices++;
1739        return index;
1740
1741out_free_idr:
1742        idr_remove(&nbd_index_idr, index);
1743out_free_tags:
1744        blk_mq_free_tag_set(&nbd->tag_set);
1745out_free_nbd:
1746        kfree(nbd);
1747out:
1748        return err;
1749}
1750
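/* idr_for_each() callback: report the first device with no active config. */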
1751static int find_free_cb(int id, void *ptr, void *data)
1752{
1753        struct nbd_device *nbd = ptr;
1754        struct nbd_device **found = data;
1755
1756        if (!refcount_read(&nbd->config_refs)) {
1757                *found = nbd;
1758                return 1;
1759        }
1760        return 0;
1761}
1762
1763/* Netlink interface. */
1764static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
1765        [NBD_ATTR_INDEX]                =       { .type = NLA_U32 },
1766        [NBD_ATTR_SIZE_BYTES]           =       { .type = NLA_U64 },
1767        [NBD_ATTR_BLOCK_SIZE_BYTES]     =       { .type = NLA_U64 },
1768        [NBD_ATTR_TIMEOUT]              =       { .type = NLA_U64 },
1769        [NBD_ATTR_SERVER_FLAGS]         =       { .type = NLA_U64 },
1770        [NBD_ATTR_CLIENT_FLAGS]         =       { .type = NLA_U64 },
1771        [NBD_ATTR_SOCKETS]              =       { .type = NLA_NESTED},
1772        [NBD_ATTR_DEAD_CONN_TIMEOUT]    =       { .type = NLA_U64 },
1773        [NBD_ATTR_DEVICE_LIST]          =       { .type = NLA_NESTED},
1774        [NBD_ATTR_BACKEND_IDENTIFIER]   =       { .type = NLA_STRING},
1775};
1776
1777static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
1778        [NBD_SOCK_FD]                   =       { .type = NLA_U32 },
1779};
1780
1781/* We don't use this right now since we don't parse the incoming list, but we
1782 * still want it here so userspace knows what to expect.
1783 */
1784static const struct nla_policy __attribute__((unused))
1785nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1786        [NBD_DEVICE_INDEX]              =       { .type = NLA_U32 },
1787        [NBD_DEVICE_CONNECTED]          =       { .type = NLA_U8 },
1788};
1789
1790static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
1791{
1792        struct nbd_config *config = nbd->config;
1793        u64 bsize = config->blksize;
1794        u64 bytes = config->bytesize;
1795
1796        if (info->attrs[NBD_ATTR_SIZE_BYTES])
1797                bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1798
1799        if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES])
1800                bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1801
1802        if (bytes != config->bytesize || bsize != config->blksize)
1803                return nbd_set_size(nbd, bytes, bsize);
1804        return 0;
1805}
1806
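/*
 * NBD_CMD_CONNECT handler: look up (or create) the requested device,
 * build its config from the netlink attributes, add the supplied sockets
 * and start the device.
 */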
1807static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1808{
1809        DECLARE_COMPLETION_ONSTACK(destroy_complete);
1810        struct nbd_device *nbd = NULL;
1811        struct nbd_config *config;
1812        int index = -1;
1813        int ret;
1814        bool put_dev = false;
1815
1816        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1817                return -EPERM;
1818
1819        if (info->attrs[NBD_ATTR_INDEX])
1820                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1821        if (!info->attrs[NBD_ATTR_SOCKETS]) {
1822                printk(KERN_ERR "nbd: must specify at least one socket\n");
1823                return -EINVAL;
1824        }
1825        if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1826                printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1827                return -EINVAL;
1828        }
1829again:
1830        mutex_lock(&nbd_index_mutex);
1831        if (index == -1) {
1832                ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1833                if (ret == 0) {
1834                        int new_index;
1835                        new_index = nbd_dev_add(-1);
1836                        if (new_index < 0) {
1837                                mutex_unlock(&nbd_index_mutex);
1838                                printk(KERN_ERR "nbd: failed to add new device\n");
1839                                return new_index;
1840                        }
1841                        nbd = idr_find(&nbd_index_idr, new_index);
1842                }
1843        } else {
1844                nbd = idr_find(&nbd_index_idr, index);
1845                if (!nbd) {
1846                        ret = nbd_dev_add(index);
1847                        if (ret < 0) {
1848                                mutex_unlock(&nbd_index_mutex);
1849                                printk(KERN_ERR "nbd: failed to add new device\n");
1850                                return ret;
1851                        }
1852                        nbd = idr_find(&nbd_index_idr, index);
1853                }
1854        }
1855        if (!nbd) {
1856                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1857                       index);
1858                mutex_unlock(&nbd_index_mutex);
1859                return -EINVAL;
1860        }
1861
1862        if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
1863            test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
1864                nbd->destroy_complete = &destroy_complete;
1865                mutex_unlock(&nbd_index_mutex);
1866
1867                /* Wait until the nbd device is totally destroyed */
1868                wait_for_completion(&destroy_complete);
1869                goto again;
1870        }
1871
1872        if (!refcount_inc_not_zero(&nbd->refs)) {
1873                mutex_unlock(&nbd_index_mutex);
1874                if (index == -1)
1875                        goto again;
1876                printk(KERN_ERR "nbd: device at index %d is going down\n",
1877                       index);
1878                return -EINVAL;
1879        }
1880        mutex_unlock(&nbd_index_mutex);
1881
1882        mutex_lock(&nbd->config_lock);
1883        if (refcount_read(&nbd->config_refs)) {
1884                mutex_unlock(&nbd->config_lock);
1885                nbd_put(nbd);
1886                if (index == -1)
1887                        goto again;
1888                printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1889                return -EBUSY;
1890        }
1891        if (WARN_ON(nbd->config)) {
1892                mutex_unlock(&nbd->config_lock);
1893                nbd_put(nbd);
1894                return -EINVAL;
1895        }
1896        config = nbd->config = nbd_alloc_config();
1897        if (!nbd->config) {
1898                mutex_unlock(&nbd->config_lock);
1899                nbd_put(nbd);
1900                printk(KERN_ERR "nbd: couldn't allocate config\n");
1901                return -ENOMEM;
1902        }
1903        refcount_set(&nbd->config_refs, 1);
1904        set_bit(NBD_RT_BOUND, &config->runtime_flags);
1905
1906        ret = nbd_genl_size_set(info, nbd);
1907        if (ret)
1908                goto out;
1909
1910        if (info->attrs[NBD_ATTR_TIMEOUT])
1911                nbd_set_cmd_timeout(nbd,
1912                                    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
1913        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1914                config->dead_conn_timeout =
1915                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1916                config->dead_conn_timeout *= HZ;
1917        }
1918        if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1919                config->flags =
1920                        nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
1921        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1922                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1923                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1924                        /*
1925                         * We have 1 ref to keep the device around, and then 1
1926                         * ref for our current operation here, which will be
1927                         * inherited by the config.  If we already have
1928                         * DESTROY_ON_DISCONNECT set then we know we don't have
1929                         * that extra ref already held so we don't need the
1930                         * put_dev.
1931                         */
1932                        if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
1933                                              &nbd->flags))
1934                                put_dev = true;
1935                } else {
1936                        if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
1937                                               &nbd->flags))
1938                                refcount_inc(&nbd->refs);
1939                }
1940                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1941                        set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
1942                                &config->runtime_flags);
1943                }
1944        }
1945
1946        if (info->attrs[NBD_ATTR_SOCKETS]) {
1947                struct nlattr *attr;
1948                int rem, fd;
1949
1950                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1951                                    rem) {
1952                        struct nlattr *socks[NBD_SOCK_MAX+1];
1953
1954                        if (nla_type(attr) != NBD_SOCK_ITEM) {
1955                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1956                                ret = -EINVAL;
1957                                goto out;
1958                        }
1959                        ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
1960                                                          attr,
1961                                                          nbd_sock_policy,
1962                                                          info->extack);
1963                        if (ret != 0) {
1964                                printk(KERN_ERR "nbd: error processing sock list\n");
1965                                ret = -EINVAL;
1966                                goto out;
1967                        }
1968                        if (!socks[NBD_SOCK_FD])
1969                                continue;
1970                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1971                        ret = nbd_add_socket(nbd, fd, true);
1972                        if (ret)
1973                                goto out;
1974                }
1975        }
1976        ret = nbd_start_device(nbd);
1977        if (ret)
1978                goto out;
1979        if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
1980                nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
1981                                          GFP_KERNEL);
1982                if (!nbd->backend) {
1983                        ret = -ENOMEM;
1984                        goto out;
1985                }
1986        }
1987        ret = device_create_file(disk_to_dev(nbd->disk), &backend_attr);
1988        if (ret) {
1989                dev_err(disk_to_dev(nbd->disk),
1990                        "device_create_file failed for backend!\n");
1991                goto out;
1992        }
1993        set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags);
1994out:
1995        mutex_unlock(&nbd->config_lock);
1996        if (!ret) {
1997                set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
1998                refcount_inc(&nbd->config_refs);
1999                nbd_connect_reply(info, nbd->index);
2000        }
2001        nbd_config_put(nbd);
2002        if (put_dev)
2003                nbd_put(nbd);
2004        return ret;
2005}
2006
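/*
 * Disconnect the device, shut down its sockets and flush the receive
 * workqueue so no requests remain in flight, then drop the config
 * reference if one was held.
 */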
2007static void nbd_disconnect_and_put(struct nbd_device *nbd)
2008{
2009        mutex_lock(&nbd->config_lock);
2010        nbd_disconnect(nbd);
2011        sock_shutdown(nbd);
2012        /*
2013         * Make sure recv thread has finished, so it does not drop the last
2014         * config ref and try to destroy the workqueue from inside the work
2015         * queue. This also ensures that we can safely call nbd_clear_que()
2016         * to cancel the inflight I/Os.
2017         */
2018        if (nbd->recv_workq)
2019                flush_workqueue(nbd->recv_workq);
2020        nbd_clear_que(nbd);
2021        nbd->task_setup = NULL;
2022        mutex_unlock(&nbd->config_lock);
2023
2024        if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
2025                               &nbd->config->runtime_flags))
2026                nbd_config_put(nbd);
2027}
2028
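/*
 * NBD_CMD_DISCONNECT handler: find the device by index and tear it down
 * via nbd_disconnect_and_put().
 */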
2029static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
2030{
2031        struct nbd_device *nbd;
2032        int index;
2033
2034        if (!netlink_capable(skb, CAP_SYS_ADMIN))
2035                return -EPERM;
2036
2037        if (!info->attrs[NBD_ATTR_INDEX]) {
2038                printk(KERN_ERR "nbd: must specify an index to disconnect\n");
2039                return -EINVAL;
2040        }
2041        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2042        mutex_lock(&nbd_index_mutex);
2043        nbd = idr_find(&nbd_index_idr, index);
2044        if (!nbd) {
2045                mutex_unlock(&nbd_index_mutex);
2046                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
2047                       index);
2048                return -EINVAL;
2049        }
2050        if (!refcount_inc_not_zero(&nbd->refs)) {
2051                mutex_unlock(&nbd_index_mutex);
2052                printk(KERN_ERR "nbd: device at index %d is going down\n",
2053                       index);
2054                return -EINVAL;
2055        }
2056        mutex_unlock(&nbd_index_mutex);
2057        if (!refcount_inc_not_zero(&nbd->config_refs))
2058                goto put_nbd;
2059        nbd_disconnect_and_put(nbd);
2060        nbd_config_put(nbd);
2061put_nbd:
2062        nbd_put(nbd);
2063        return 0;
2064}
2065
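/*
 * NBD_CMD_RECONFIGURE handler: update the timeouts and flags of an
 * already running device and reconnect any sockets supplied in
 * NBD_ATTR_SOCKETS.
 */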
2066static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
2067{
2068        struct nbd_device *nbd = NULL;
2069        struct nbd_config *config;
2070        int index;
2071        int ret = 0;
2072        bool put_dev = false;
2073
2074        if (!netlink_capable(skb, CAP_SYS_ADMIN))
2075                return -EPERM;
2076
2077        if (!info->attrs[NBD_ATTR_INDEX]) {
2078                printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
2079                return -EINVAL;
2080        }
2081        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2082        mutex_lock(&nbd_index_mutex);
2083        nbd = idr_find(&nbd_index_idr, index);
2084        if (!nbd) {
2085                mutex_unlock(&nbd_index_mutex);
2086                printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
2087                       index);
2088                return -EINVAL;
2089        }
2090        if (nbd->backend) {
2091                if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
2092                        if (nla_strcmp(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
2093                                       nbd->backend)) {
2094                                mutex_unlock(&nbd_index_mutex);
2095                                dev_err(nbd_to_dev(nbd),
2096                                        "backend image doesn't match with %s\n",
2097                                        nbd->backend);
2098                                return -EINVAL;
2099                        }
2100                } else {
2101                        mutex_unlock(&nbd_index_mutex);
2102                        dev_err(nbd_to_dev(nbd), "must specify backend\n");
2103                        return -EINVAL;
2104                }
2105        }
2106        if (!refcount_inc_not_zero(&nbd->refs)) {
2107                mutex_unlock(&nbd_index_mutex);
2108                printk(KERN_ERR "nbd: device at index %d is going down\n",
2109                       index);
2110                return -EINVAL;
2111        }
2112        mutex_unlock(&nbd_index_mutex);
2113
2114        if (!refcount_inc_not_zero(&nbd->config_refs)) {
2115                dev_err(nbd_to_dev(nbd),
2116                        "not configured, cannot reconfigure\n");
2117                nbd_put(nbd);
2118                return -EINVAL;
2119        }
2120
2121        mutex_lock(&nbd->config_lock);
2122        config = nbd->config;
2123        if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
2124            !nbd->task_recv) {
2125                dev_err(nbd_to_dev(nbd),
2126                        "not configured, cannot reconfigure\n");
2127                ret = -EINVAL;
2128                goto out;
2129        }
2130
2131        ret = nbd_genl_size_set(info, nbd);
2132        if (ret)
2133                goto out;
2134
2135        if (info->attrs[NBD_ATTR_TIMEOUT])
2136                nbd_set_cmd_timeout(nbd,
2137                                    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
2138        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
2139                config->dead_conn_timeout =
2140                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
2141                config->dead_conn_timeout *= HZ;
2142        }
2143        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
2144                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
2145                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
2146                        if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
2147                                              &nbd->flags))
2148                                put_dev = true;
2149                } else {
2150                        if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
2151                                               &nbd->flags))
2152                                refcount_inc(&nbd->refs);
2153                }
2154
2155                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
2156                        set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2157                                        &config->runtime_flags);
2158                } else {
2159                        clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2160                                        &config->runtime_flags);
2161                }
2162        }
2163
2164        if (info->attrs[NBD_ATTR_SOCKETS]) {
2165                struct nlattr *attr;
2166                int rem, fd;
2167
2168                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
2169                                    rem) {
2170                        struct nlattr *socks[NBD_SOCK_MAX+1];
2171
2172                        if (nla_type(attr) != NBD_SOCK_ITEM) {
2173                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
2174                                ret = -EINVAL;
2175                                goto out;
2176                        }
2177                        ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
2178                                                          attr,
2179                                                          nbd_sock_policy,
2180                                                          info->extack);
2181                        if (ret != 0) {
2182                                printk(KERN_ERR "nbd: error processing sock list\n");
2183                                ret = -EINVAL;
2184                                goto out;
2185                        }
2186                        if (!socks[NBD_SOCK_FD])
2187                                continue;
2188                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2189                        ret = nbd_reconnect_socket(nbd, fd);
2190                        if (ret) {
2191                                if (ret == -ENOSPC)
2192                                        ret = 0;
2193                                goto out;
2194                        }
2195                        dev_info(nbd_to_dev(nbd), "reconnected socket\n");
2196                }
2197        }
2198out:
2199        mutex_unlock(&nbd->config_lock);
2200        nbd_config_put(nbd);
2201        nbd_put(nbd);
2202        if (put_dev)
2203                nbd_put(nbd);
2204        return ret;
2205}
2206
2207static const struct genl_small_ops nbd_connect_genl_ops[] = {
2208        {
2209                .cmd    = NBD_CMD_CONNECT,
2210                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2211                .doit   = nbd_genl_connect,
2212        },
2213        {
2214                .cmd    = NBD_CMD_DISCONNECT,
2215                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2216                .doit   = nbd_genl_disconnect,
2217        },
2218        {
2219                .cmd    = NBD_CMD_RECONFIGURE,
2220                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2221                .doit   = nbd_genl_reconfigure,
2222        },
2223        {
2224                .cmd    = NBD_CMD_STATUS,
2225                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2226                .doit   = nbd_genl_status,
2227        },
2228};
2229
2230static const struct genl_multicast_group nbd_mcast_grps[] = {
2231        { .name = NBD_GENL_MCAST_GROUP_NAME, },
2232};
2233
2234static struct genl_family nbd_genl_family __ro_after_init = {
2235        .hdrsize        = 0,
2236        .name           = NBD_GENL_FAMILY_NAME,
2237        .version        = NBD_GENL_VERSION,
2238        .module         = THIS_MODULE,
2239        .small_ops      = nbd_connect_genl_ops,
2240        .n_small_ops    = ARRAY_SIZE(nbd_connect_genl_ops),
2241        .maxattr        = NBD_ATTR_MAX,
2242        .policy = nbd_attr_policy,
2243        .mcgrps         = nbd_mcast_grps,
2244        .n_mcgrps       = ARRAY_SIZE(nbd_mcast_grps),
2245};
2246
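/* Append one NBD_DEVICE_ITEM (index and connected state) to a status reply. */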
2247static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
2248{
2249        struct nlattr *dev_opt;
2250        u8 connected = 0;
2251        int ret;
2252
2253        /* This is a little racy, but for status it's ok.  The
2254         * reason we don't take a ref here is because we can't
2255         * take a ref in the index == -1 case, as we would need
2256         * to do the put under nbd_index_mutex, which could
2257         * deadlock if we are configured to remove ourselves
2258         * once we're disconnected.
2259         */
2260        if (refcount_read(&nbd->config_refs))
2261                connected = 1;
2262        dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
2263        if (!dev_opt)
2264                return -EMSGSIZE;
2265        ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
2266        if (ret)
2267                return -EMSGSIZE;
2268        ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
2269                         connected);
2270        if (ret)
2271                return -EMSGSIZE;
2272        nla_nest_end(reply, dev_opt);
2273        return 0;
2274}
2275
2276static int status_cb(int id, void *ptr, void *data)
2277{
2278        struct nbd_device *nbd = ptr;
2279        return populate_nbd_status(nbd, (struct sk_buff *)data);
2280}
2281
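/*
 * NBD_CMD_STATUS handler: reply with the status of a single device, or of
 * every device when no index is given.
 */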
2282static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2283{
2284        struct nlattr *dev_list;
2285        struct sk_buff *reply;
2286        void *reply_head;
2287        size_t msg_size;
2288        int index = -1;
2289        int ret = -ENOMEM;
2290
2291        if (info->attrs[NBD_ATTR_INDEX])
2292                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2293
2294        mutex_lock(&nbd_index_mutex);
2295
2296        msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2297                                  nla_attr_size(sizeof(u8)));
2298        msg_size *= (index == -1) ? nbd_total_devices : 1;
2299
2300        reply = genlmsg_new(msg_size, GFP_KERNEL);
2301        if (!reply)
2302                goto out;
2303        reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2304                                       NBD_CMD_STATUS);
2305        if (!reply_head) {
2306                nlmsg_free(reply);
2307                goto out;
2308        }
2309
2310        dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
2311        if (index == -1) {
2312                ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2313                if (ret) {
2314                        nlmsg_free(reply);
2315                        goto out;
2316                }
2317        } else {
2318                struct nbd_device *nbd;
2319                nbd = idr_find(&nbd_index_idr, index);
2320                if (nbd) {
2321                        ret = populate_nbd_status(nbd, reply);
2322                        if (ret) {
2323                                nlmsg_free(reply);
2324                                goto out;
2325                        }
2326                }
2327        }
2328        nla_nest_end(reply, dev_list);
2329        genlmsg_end(reply, reply_head);
2330        ret = genlmsg_reply(reply, info);
2331out:
2332        mutex_unlock(&nbd_index_mutex);
2333        return ret;
2334}
2335
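/* Send the allocated device index back to the NBD_CMD_CONNECT requester. */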
2336static void nbd_connect_reply(struct genl_info *info, int index)
2337{
2338        struct sk_buff *skb;
2339        void *msg_head;
2340        int ret;
2341
2342        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2343        if (!skb)
2344                return;
2345        msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2346                                     NBD_CMD_CONNECT);
2347        if (!msg_head) {
2348                nlmsg_free(skb);
2349                return;
2350        }
2351        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2352        if (ret) {
2353                nlmsg_free(skb);
2354                return;
2355        }
2356        genlmsg_end(skb, msg_head);
2357        genlmsg_reply(skb, info);
2358}
2359
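/*
 * Broadcast an NBD_CMD_LINK_DEAD notification carrying @index to the nbd
 * multicast group.
 */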
2360static void nbd_mcast_index(int index)
2361{
2362        struct sk_buff *skb;
2363        void *msg_head;
2364        int ret;
2365
2366        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2367        if (!skb)
2368                return;
2369        msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2370                                     NBD_CMD_LINK_DEAD);
2371        if (!msg_head) {
2372                nlmsg_free(skb);
2373                return;
2374        }
2375        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2376        if (ret) {
2377                nlmsg_free(skb);
2378                return;
2379        }
2380        genlmsg_end(skb, msg_head);
2381        genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2382}
2383
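/* Deferred work item: send the link-dead notification and free the args. */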
2384static void nbd_dead_link_work(struct work_struct *work)
2385{
2386        struct link_dead_args *args = container_of(work, struct link_dead_args,
2387                                                   work);
2388        nbd_mcast_index(args->index);
2389        kfree(args);
2390}
2391
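/*
 * Module init: validate max_part and nbds_max, register the block major
 * and the netlink family, then create the initial set of nbd devices.
 */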
2392static int __init nbd_init(void)
2393{
2394        int i;
2395
2396        BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2397
2398        if (max_part < 0) {
2399                printk(KERN_ERR "nbd: max_part must be >= 0\n");
2400                return -EINVAL;
2401        }
2402
2403        part_shift = 0;
2404        if (max_part > 0) {
2405                part_shift = fls(max_part);
2406
2407                /*
2408                 * Adjust max_part according to part_shift as it is exported
2409                 * to user space so that users know the maximum number of
2410                 * partitions the kernel is able to manage.
2411                 *
2412                 * Note that -1 is required because partition 0 is reserved
2413                 * for the whole disk.
2414                 */
2415                max_part = (1UL << part_shift) - 1;
2416        }
2417
2418        if ((1UL << part_shift) > DISK_MAX_PARTS)
2419                return -EINVAL;
2420
2421        if (nbds_max > 1UL << (MINORBITS - part_shift))
2422                return -EINVAL;
2423
2424        if (register_blkdev(NBD_MAJOR, "nbd"))
2425                return -EIO;
2426
2427        if (genl_register_family(&nbd_genl_family)) {
2428                unregister_blkdev(NBD_MAJOR, "nbd");
2429                return -EINVAL;
2430        }
2431        nbd_dbg_init();
2432
2433        mutex_lock(&nbd_index_mutex);
2434        for (i = 0; i < nbds_max; i++)
2435                nbd_dev_add(i);
2436        mutex_unlock(&nbd_index_mutex);
2437        return 0;
2438}
2439
2440static int nbd_exit_cb(int id, void *ptr, void *data)
2441{
2442        struct list_head *list = (struct list_head *)data;
2443        struct nbd_device *nbd = ptr;
2444
2445        list_add_tail(&nbd->list, list);
2446        return 0;
2447}
2448
2449static void __exit nbd_cleanup(void)
2450{
2451        struct nbd_device *nbd;
2452        LIST_HEAD(del_list);
2453
2454        nbd_dbg_close();
2455
2456        mutex_lock(&nbd_index_mutex);
2457        idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2458        mutex_unlock(&nbd_index_mutex);
2459
2460        while (!list_empty(&del_list)) {
2461                nbd = list_first_entry(&del_list, struct nbd_device, list);
2462                list_del_init(&nbd->list);
2463                if (refcount_read(&nbd->refs) != 1)
2464                        printk(KERN_ERR "nbd: possibly leaking a device\n");
2465                nbd_put(nbd);
2466        }
2467
2468        idr_destroy(&nbd_index_idr);
2469        genl_unregister_family(&nbd_genl_family);
2470        unregister_blkdev(NBD_MAJOR, "nbd");
2471}
2472
2473module_init(nbd_init);
2474module_exit(nbd_cleanup);
2475
2476MODULE_DESCRIPTION("Network Block Device");
2477MODULE_LICENSE("GPL");
2478
2479module_param(nbds_max, int, 0444);
2480MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2481module_param(max_part, int, 0444);
2482MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
2483