linux/drivers/block/nbd.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Network block device - make block devices work over TCP
   4 *
   5 * Note that you can not swap over this thing, yet. Seems to work but
   6 * deadlocks sometimes - you can not swap over TCP in general.
   7 * 
   8 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
   9 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
  10 *
  11 * (part of code stolen from loop.c)
  12 */
  13
  14#include <linux/major.h>
  15
  16#include <linux/blkdev.h>
  17#include <linux/module.h>
  18#include <linux/init.h>
  19#include <linux/sched.h>
  20#include <linux/sched/mm.h>
  21#include <linux/fs.h>
  22#include <linux/bio.h>
  23#include <linux/stat.h>
  24#include <linux/errno.h>
  25#include <linux/file.h>
  26#include <linux/ioctl.h>
  27#include <linux/mutex.h>
  28#include <linux/compiler.h>
  29#include <linux/completion.h>
  30#include <linux/err.h>
  31#include <linux/kernel.h>
  32#include <linux/slab.h>
  33#include <net/sock.h>
  34#include <linux/net.h>
  35#include <linux/kthread.h>
  36#include <linux/types.h>
  37#include <linux/debugfs.h>
  38#include <linux/blk-mq.h>
  39
  40#include <linux/uaccess.h>
  41#include <asm/types.h>
  42
  43#include <linux/nbd.h>
  44#include <linux/nbd-netlink.h>
  45#include <net/genetlink.h>
  46
  47#define CREATE_TRACE_POINTS
  48#include <trace/events/nbd.h>
  49
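     /*
      * nbd devices are kept in nbd_index_idr, keyed by device index;
      * lookups, insertions and removals are serialized by nbd_index_mutex.
      */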
  50static DEFINE_IDR(nbd_index_idr);
  51static DEFINE_MUTEX(nbd_index_mutex);
  52static int nbd_total_devices = 0;
  53
  54struct nbd_sock {
  55        struct socket *sock;
  56        struct mutex tx_lock;
  57        struct request *pending;
  58        int sent;
  59        bool dead;
  60        int fallback_index;
  61        int cookie;
  62};
  63
  64struct recv_thread_args {
  65        struct work_struct work;
  66        struct nbd_device *nbd;
  67        int index;
  68};
  69
  70struct link_dead_args {
  71        struct work_struct work;
  72        int index;
  73};
  74
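     /* Bits in nbd_config->runtime_flags. */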
  75#define NBD_RT_TIMEDOUT                 0
  76#define NBD_RT_DISCONNECT_REQUESTED     1
  77#define NBD_RT_DISCONNECTED             2
  78#define NBD_RT_HAS_PID_FILE             3
  79#define NBD_RT_HAS_CONFIG_REF           4
  80#define NBD_RT_BOUND                    5
  81#define NBD_RT_DISCONNECT_ON_CLOSE      6
  82
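     /* Bits in nbd_device->flags. */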
  83#define NBD_DESTROY_ON_DISCONNECT       0
  84#define NBD_DISCONNECT_REQUESTED        1
  85
  86struct nbd_config {
  87        u32 flags;
  88        unsigned long runtime_flags;
  89        u64 dead_conn_timeout;
  90
  91        struct nbd_sock **socks;
  92        int num_connections;
  93        atomic_t live_connections;
  94        wait_queue_head_t conn_wait;
  95
  96        atomic_t recv_threads;
  97        wait_queue_head_t recv_wq;
  98        loff_t blksize;
  99        loff_t bytesize;
 100#if IS_ENABLED(CONFIG_DEBUG_FS)
 101        struct dentry *dbg_dir;
 102#endif
 103};
 104
 105struct nbd_device {
 106        struct blk_mq_tag_set tag_set;
 107
 108        int index;
 109        refcount_t config_refs;
 110        refcount_t refs;
 111        struct nbd_config *config;
 112        struct mutex config_lock;
 113        struct gendisk *disk;
 114        struct workqueue_struct *recv_workq;
 115
 116        struct list_head list;
 117        struct task_struct *task_recv;
 118        struct task_struct *task_setup;
 119
 120        struct completion *destroy_complete;
 121        unsigned long flags;
 122};
 123
 124#define NBD_CMD_REQUEUED        1
 125
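     /* Per-request driver data, stored in the blk-mq request PDU. */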
 126struct nbd_cmd {
 127        struct nbd_device *nbd;
 128        struct mutex lock;
 129        int index;
 130        int cookie;
 131        int retries;
 132        blk_status_t status;
 133        unsigned long flags;
 134        u32 cmd_cookie;
 135};
 136
 137#if IS_ENABLED(CONFIG_DEBUG_FS)
 138static struct dentry *nbd_dbg_dir;
 139#endif
 140
 141#define nbd_name(nbd) ((nbd)->disk->disk_name)
 142
 143#define NBD_MAGIC 0x68797548
 144
 145#define NBD_DEF_BLKSIZE 1024
 146
 147static unsigned int nbds_max = 16;
 148static int max_part = 16;
 149static int part_shift;
 150
 151static int nbd_dev_dbg_init(struct nbd_device *nbd);
 152static void nbd_dev_dbg_close(struct nbd_device *nbd);
 153static void nbd_config_put(struct nbd_device *nbd);
 154static void nbd_connect_reply(struct genl_info *info, int index);
 155static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
 156static void nbd_dead_link_work(struct work_struct *work);
 157static void nbd_disconnect_and_put(struct nbd_device *nbd);
 158
 159static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 160{
 161        return disk_to_dev(nbd->disk);
 162}
 163
 164static void nbd_requeue_cmd(struct nbd_cmd *cmd)
 165{
 166        struct request *req = blk_mq_rq_from_pdu(cmd);
 167
 168        if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
 169                blk_mq_requeue_request(req, true);
 170}
 171
 172#define NBD_COOKIE_BITS 32
 173
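     /*
      * The 64-bit handle sent on the wire packs the command's reissue
      * cookie into the upper 32 bits and the blk-mq unique tag into the
      * lower 32 bits, so a reply to a request that has since been
      * requeued and resent can be detected and rejected.
      */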
 174static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
 175{
 176        struct request *req = blk_mq_rq_from_pdu(cmd);
 177        u32 tag = blk_mq_unique_tag(req);
 178        u64 cookie = cmd->cmd_cookie;
 179
 180        return (cookie << NBD_COOKIE_BITS) | tag;
 181}
 182
 183static u32 nbd_handle_to_tag(u64 handle)
 184{
 185        return (u32)handle;
 186}
 187
 188static u32 nbd_handle_to_cookie(u64 handle)
 189{
 190        return (u32)(handle >> NBD_COOKIE_BITS);
 191}
 192
 193static const char *nbdcmd_to_ascii(int cmd)
 194{
 195        switch (cmd) {
 196        case  NBD_CMD_READ: return "read";
 197        case NBD_CMD_WRITE: return "write";
 198        case  NBD_CMD_DISC: return "disconnect";
 199        case NBD_CMD_FLUSH: return "flush";
 200        case  NBD_CMD_TRIM: return "trim/discard";
 201        }
 202        return "invalid";
 203}
 204
 205static ssize_t pid_show(struct device *dev,
 206                        struct device_attribute *attr, char *buf)
 207{
 208        struct gendisk *disk = dev_to_disk(dev);
 209        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 210
 211        return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
 212}
 213
 214static const struct device_attribute pid_attr = {
 215        .attr = { .name = "pid", .mode = 0444},
 216        .show = pid_show,
 217};
 218
 219static void nbd_dev_remove(struct nbd_device *nbd)
 220{
 221        struct gendisk *disk = nbd->disk;
 222        struct request_queue *q;
 223
 224        if (disk) {
 225                q = disk->queue;
 226                del_gendisk(disk);
 227                blk_cleanup_queue(q);
 228                blk_mq_free_tag_set(&nbd->tag_set);
 229                disk->private_data = NULL;
 230                put_disk(disk);
 231        }
 232
  233        /*
  234         * Do this last, just before the nbd is freed, to make sure
  235         * that the disk and the related kobject have been completely
  236         * removed, so that a duplicate of the same device cannot be
  237         * created.
  238         */
 239        if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete)
 240                complete(nbd->destroy_complete);
 241
 242        kfree(nbd);
 243}
 244
 245static void nbd_put(struct nbd_device *nbd)
 246{
 247        if (refcount_dec_and_mutex_lock(&nbd->refs,
 248                                        &nbd_index_mutex)) {
 249                idr_remove(&nbd_index_idr, nbd->index);
 250                nbd_dev_remove(nbd);
 251                mutex_unlock(&nbd_index_mutex);
 252        }
 253}
 254
 255static int nbd_disconnected(struct nbd_config *config)
 256{
 257        return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
 258                test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
 259}
 260
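     /*
      * Mark a connection dead; if @notify is set, queue work so listeners
      * can be told about the dead link. Callers hold nsock->tx_lock.
      */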
 261static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 262                                int notify)
 263{
 264        if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
 265                struct link_dead_args *args;
 266                args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
 267                if (args) {
 268                        INIT_WORK(&args->work, nbd_dead_link_work);
 269                        args->index = nbd->index;
 270                        queue_work(system_wq, &args->work);
 271                }
 272        }
 273        if (!nsock->dead) {
 274                kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
 275                if (atomic_dec_return(&nbd->config->live_connections) == 0) {
 276                        if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
 277                                               &nbd->config->runtime_flags)) {
 278                                set_bit(NBD_RT_DISCONNECTED,
 279                                        &nbd->config->runtime_flags);
 280                                dev_info(nbd_to_dev(nbd),
 281                                        "Disconnected due to user request.\n");
 282                        }
 283                }
 284        }
 285        nsock->dead = true;
 286        nsock->pending = NULL;
 287        nsock->sent = 0;
 288}
 289
 290static void nbd_size_clear(struct nbd_device *nbd)
 291{
 292        if (nbd->config->bytesize) {
 293                set_capacity(nbd->disk, 0);
 294                kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 295        }
 296}
 297
 298static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
 299                loff_t blksize)
 300{
 301        if (!blksize)
 302                blksize = NBD_DEF_BLKSIZE;
 303        if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize))
 304                return -EINVAL;
 305
 306        nbd->config->bytesize = bytesize;
 307        nbd->config->blksize = blksize;
 308
 309        if (!nbd->task_recv)
 310                return 0;
 311
 312        if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
 313                nbd->disk->queue->limits.discard_granularity = blksize;
 314                nbd->disk->queue->limits.discard_alignment = blksize;
 315                blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
 316        }
 317        blk_queue_logical_block_size(nbd->disk->queue, blksize);
 318        blk_queue_physical_block_size(nbd->disk->queue, blksize);
 319
 320        if (max_part)
 321                set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
 322        if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
 323                kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 324        return 0;
 325}
 326
 327static void nbd_complete_rq(struct request *req)
 328{
 329        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 330
 331        dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
 332                cmd->status ? "failed" : "done");
 333
 334        blk_mq_end_request(req, cmd->status);
 335}
 336
 337/*
  338 * Forcibly shut down the sockets, causing all listeners to error out
 339 */
 340static void sock_shutdown(struct nbd_device *nbd)
 341{
 342        struct nbd_config *config = nbd->config;
 343        int i;
 344
 345        if (config->num_connections == 0)
 346                return;
 347        if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 348                return;
 349
 350        for (i = 0; i < config->num_connections; i++) {
 351                struct nbd_sock *nsock = config->socks[i];
 352                mutex_lock(&nsock->tx_lock);
 353                nbd_mark_nsock_dead(nbd, nsock, 0);
 354                mutex_unlock(&nsock->tx_lock);
 355        }
 356        dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
 357}
 358
 359static u32 req_to_nbd_cmd_type(struct request *req)
 360{
 361        switch (req_op(req)) {
 362        case REQ_OP_DISCARD:
 363                return NBD_CMD_TRIM;
 364        case REQ_OP_FLUSH:
 365                return NBD_CMD_FLUSH;
 366        case REQ_OP_WRITE:
 367                return NBD_CMD_WRITE;
 368        case REQ_OP_READ:
 369                return NBD_CMD_READ;
 370        default:
 371                return U32_MAX;
 372        }
 373}
 374
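     /*
      * blk-mq timeout handler: requeue the request onto another (or a
      * reconnected) socket when possible, warn and rearm the timer when the
      * user disabled the transport timeout, otherwise fail the request and
      * shut all sockets down.
      */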
 375static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 376                                                 bool reserved)
 377{
 378        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 379        struct nbd_device *nbd = cmd->nbd;
 380        struct nbd_config *config;
 381
 382        if (!mutex_trylock(&cmd->lock))
 383                return BLK_EH_RESET_TIMER;
 384
 385        if (!refcount_inc_not_zero(&nbd->config_refs)) {
 386                cmd->status = BLK_STS_TIMEOUT;
 387                mutex_unlock(&cmd->lock);
 388                goto done;
 389        }
 390        config = nbd->config;
 391
 392        if (config->num_connections > 1 ||
 393            (config->num_connections == 1 && nbd->tag_set.timeout)) {
 394                dev_err_ratelimited(nbd_to_dev(nbd),
 395                                    "Connection timed out, retrying (%d/%d alive)\n",
 396                                    atomic_read(&config->live_connections),
 397                                    config->num_connections);
  398                /*
  399                 * Hooray, we have more connections; requeue this IO and the submit
  400                 * path will put it on a real connection. Or, if only one connection
  401                 * is configured, the submit path will wait until a new connection is
  402                 * reconfigured or until the dead connection timeout expires.
  403                 */
 404                if (config->socks) {
 405                        if (cmd->index < config->num_connections) {
 406                                struct nbd_sock *nsock =
 407                                        config->socks[cmd->index];
 408                                mutex_lock(&nsock->tx_lock);
  409                                /* We can have multiple outstanding requests, so
  410                                 * we don't want to mark the nsock dead if we've
  411                                 * already reconnected with a new socket; only
  412                                 * mark it dead if it's the same socket this
  413                                 * request was sent out on.
  414                                 */
 415                                if (cmd->cookie == nsock->cookie)
 416                                        nbd_mark_nsock_dead(nbd, nsock, 1);
 417                                mutex_unlock(&nsock->tx_lock);
 418                        }
 419                        mutex_unlock(&cmd->lock);
 420                        nbd_requeue_cmd(cmd);
 421                        nbd_config_put(nbd);
 422                        return BLK_EH_DONE;
 423                }
 424        }
 425
 426        if (!nbd->tag_set.timeout) {
 427                /*
 428                 * Userspace sets timeout=0 to disable socket disconnection,
 429                 * so just warn and reset the timer.
 430                 */
 431                struct nbd_sock *nsock = config->socks[cmd->index];
 432                cmd->retries++;
 433                dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
 434                        req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
 435                        (unsigned long long)blk_rq_pos(req) << 9,
 436                        blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
 437
 438                mutex_lock(&nsock->tx_lock);
 439                if (cmd->cookie != nsock->cookie) {
 440                        nbd_requeue_cmd(cmd);
 441                        mutex_unlock(&nsock->tx_lock);
 442                        mutex_unlock(&cmd->lock);
 443                        nbd_config_put(nbd);
 444                        return BLK_EH_DONE;
 445                }
 446                mutex_unlock(&nsock->tx_lock);
 447                mutex_unlock(&cmd->lock);
 448                nbd_config_put(nbd);
 449                return BLK_EH_RESET_TIMER;
 450        }
 451
 452        dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
 453        set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
 454        cmd->status = BLK_STS_IOERR;
 455        mutex_unlock(&cmd->lock);
 456        sock_shutdown(nbd);
 457        nbd_config_put(nbd);
 458done:
 459        blk_mq_complete_request(req);
 460        return BLK_EH_DONE;
 461}
 462
 463/*
 464 *  Send or receive packet.
 465 */
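     /*
      * The socket is switched to GFP_NOIO | __GFP_MEMALLOC allocations and
      * the transfer runs under memalloc_noreclaim_save() so that sending
      * I/O for this device cannot recurse into memory reclaim and deadlock.
      */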
 466static int sock_xmit(struct nbd_device *nbd, int index, int send,
 467                     struct iov_iter *iter, int msg_flags, int *sent)
 468{
 469        struct nbd_config *config = nbd->config;
 470        struct socket *sock = config->socks[index]->sock;
 471        int result;
 472        struct msghdr msg;
 473        unsigned int noreclaim_flag;
 474
 475        if (unlikely(!sock)) {
 476                dev_err_ratelimited(disk_to_dev(nbd->disk),
 477                        "Attempted %s on closed socket in sock_xmit\n",
 478                        (send ? "send" : "recv"));
 479                return -EINVAL;
 480        }
 481
 482        msg.msg_iter = *iter;
 483
 484        noreclaim_flag = memalloc_noreclaim_save();
 485        do {
 486                sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 487                msg.msg_name = NULL;
 488                msg.msg_namelen = 0;
 489                msg.msg_control = NULL;
 490                msg.msg_controllen = 0;
 491                msg.msg_flags = msg_flags | MSG_NOSIGNAL;
 492
 493                if (send)
 494                        result = sock_sendmsg(sock, &msg);
 495                else
 496                        result = sock_recvmsg(sock, &msg, msg.msg_flags);
 497
 498                if (result <= 0) {
 499                        if (result == 0)
 500                                result = -EPIPE; /* short read */
 501                        break;
 502                }
 503                if (sent)
 504                        *sent += result;
 505        } while (msg_data_left(&msg));
 506
 507        memalloc_noreclaim_restore(noreclaim_flag);
 508
 509        return result;
 510}
 511
 512/*
 513 * Different settings for sk->sk_sndtimeo can result in different return values
 514 * if there is a signal pending when we enter sendmsg, because reasons?
 515 */
 516static inline int was_interrupted(int result)
 517{
 518        return result == -ERESTARTSYS || result == -EINTR;
 519}
 520
 521/* always call with the tx_lock held */
 522static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 523{
 524        struct request *req = blk_mq_rq_from_pdu(cmd);
 525        struct nbd_config *config = nbd->config;
 526        struct nbd_sock *nsock = config->socks[index];
 527        int result;
 528        struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
 529        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
 530        struct iov_iter from;
 531        unsigned long size = blk_rq_bytes(req);
 532        struct bio *bio;
 533        u64 handle;
 534        u32 type;
 535        u32 nbd_cmd_flags = 0;
 536        int sent = nsock->sent, skip = 0;
 537
 538        iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
 539
 540        type = req_to_nbd_cmd_type(req);
 541        if (type == U32_MAX)
 542                return -EIO;
 543
 544        if (rq_data_dir(req) == WRITE &&
 545            (config->flags & NBD_FLAG_READ_ONLY)) {
 546                dev_err_ratelimited(disk_to_dev(nbd->disk),
 547                                    "Write on read-only\n");
 548                return -EIO;
 549        }
 550
 551        if (req->cmd_flags & REQ_FUA)
 552                nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
 553
  554        /* We did a partial send previously. If at least the whole request
  555         * struct went out, just send the rest of the pages in the request;
  556         * otherwise resume sending the header from where we left off.
  557         */
 558        if (sent) {
 559                if (sent >= sizeof(request)) {
 560                        skip = sent - sizeof(request);
 561
 562                        /* initialize handle for tracing purposes */
 563                        handle = nbd_cmd_handle(cmd);
 564
 565                        goto send_pages;
 566                }
 567                iov_iter_advance(&from, sent);
 568        } else {
 569                cmd->cmd_cookie++;
 570        }
 571        cmd->index = index;
 572        cmd->cookie = nsock->cookie;
 573        cmd->retries = 0;
 574        request.type = htonl(type | nbd_cmd_flags);
 575        if (type != NBD_CMD_FLUSH) {
 576                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 577                request.len = htonl(size);
 578        }
 579        handle = nbd_cmd_handle(cmd);
 580        memcpy(request.handle, &handle, sizeof(handle));
 581
 582        trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
 583
 584        dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
 585                req, nbdcmd_to_ascii(type),
 586                (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 587        result = sock_xmit(nbd, index, 1, &from,
 588                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
 589        trace_nbd_header_sent(req, handle);
 590        if (result <= 0) {
 591                if (was_interrupted(result)) {
  592                        /* If we haven't sent anything we can just return BUSY,
  593                         * however if we have sent something we need to make
  594                         * sure we only allow this req to be sent on this
  595                         * socket until it is completely done.
  596                         */
 597                        if (sent) {
 598                                nsock->pending = req;
 599                                nsock->sent = sent;
 600                        }
 601                        set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 602                        return BLK_STS_RESOURCE;
 603                }
 604                dev_err_ratelimited(disk_to_dev(nbd->disk),
 605                        "Send control failed (result %d)\n", result);
 606                return -EAGAIN;
 607        }
 608send_pages:
 609        if (type != NBD_CMD_WRITE)
 610                goto out;
 611
 612        bio = req->bio;
 613        while (bio) {
 614                struct bio *next = bio->bi_next;
 615                struct bvec_iter iter;
 616                struct bio_vec bvec;
 617
 618                bio_for_each_segment(bvec, bio, iter) {
 619                        bool is_last = !next && bio_iter_last(bvec, iter);
 620                        int flags = is_last ? 0 : MSG_MORE;
 621
 622                        dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 623                                req, bvec.bv_len);
 624                        iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
 625                        if (skip) {
 626                                if (skip >= iov_iter_count(&from)) {
 627                                        skip -= iov_iter_count(&from);
 628                                        continue;
 629                                }
 630                                iov_iter_advance(&from, skip);
 631                                skip = 0;
 632                        }
 633                        result = sock_xmit(nbd, index, 1, &from, flags, &sent);
 634                        if (result <= 0) {
 635                                if (was_interrupted(result)) {
 636                                        /* We've already sent the header, we
 637                                         * have no choice but to set pending and
 638                                         * return BUSY.
 639                                         */
 640                                        nsock->pending = req;
 641                                        nsock->sent = sent;
 642                                        set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 643                                        return BLK_STS_RESOURCE;
 644                                }
 645                                dev_err(disk_to_dev(nbd->disk),
 646                                        "Send data failed (result %d)\n",
 647                                        result);
 648                                return -EAGAIN;
 649                        }
 650                        /*
 651                         * The completion might already have come in,
 652                         * so break for the last one instead of letting
 653                         * the iterator do it. This prevents use-after-free
 654                         * of the bio.
 655                         */
 656                        if (is_last)
 657                                break;
 658                }
 659                bio = next;
 660        }
 661out:
 662        trace_nbd_payload_sent(req, handle);
 663        nsock->pending = NULL;
 664        nsock->sent = 0;
 665        return 0;
 666}
 667
  668/* ERR_PTR returned = something went wrong, inform userspace */
 669static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 670{
 671        struct nbd_config *config = nbd->config;
 672        int result;
 673        struct nbd_reply reply;
 674        struct nbd_cmd *cmd;
 675        struct request *req = NULL;
 676        u64 handle;
 677        u16 hwq;
 678        u32 tag;
 679        struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
 680        struct iov_iter to;
 681        int ret = 0;
 682
 683        reply.magic = 0;
 684        iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
 685        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 686        if (result <= 0) {
 687                if (!nbd_disconnected(config))
 688                        dev_err(disk_to_dev(nbd->disk),
 689                                "Receive control failed (result %d)\n", result);
 690                return ERR_PTR(result);
 691        }
 692
 693        if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
 694                dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
 695                                (unsigned long)ntohl(reply.magic));
 696                return ERR_PTR(-EPROTO);
 697        }
 698
 699        memcpy(&handle, reply.handle, sizeof(handle));
 700        tag = nbd_handle_to_tag(handle);
 701        hwq = blk_mq_unique_tag_to_hwq(tag);
 702        if (hwq < nbd->tag_set.nr_hw_queues)
 703                req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
 704                                       blk_mq_unique_tag_to_tag(tag));
 705        if (!req || !blk_mq_request_started(req)) {
 706                dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
 707                        tag, req);
 708                return ERR_PTR(-ENOENT);
 709        }
 710        trace_nbd_header_received(req, handle);
 711        cmd = blk_mq_rq_to_pdu(req);
 712
 713        mutex_lock(&cmd->lock);
 714        if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
 715                dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
 716                        req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
 717                ret = -ENOENT;
 718                goto out;
 719        }
 720        if (cmd->status != BLK_STS_OK) {
 721                dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
 722                        req);
 723                ret = -ENOENT;
 724                goto out;
 725        }
 726        if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
 727                dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
 728                        req);
 729                ret = -ENOENT;
 730                goto out;
 731        }
 732        if (ntohl(reply.error)) {
 733                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 734                        ntohl(reply.error));
 735                cmd->status = BLK_STS_IOERR;
 736                goto out;
 737        }
 738
 739        dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
 740        if (rq_data_dir(req) != WRITE) {
 741                struct req_iterator iter;
 742                struct bio_vec bvec;
 743
 744                rq_for_each_segment(bvec, req, iter) {
 745                        iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
 746                        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 747                        if (result <= 0) {
 748                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 749                                        result);
 750                                /*
 751                                 * If we've disconnected, we need to make sure we
 752                                 * complete this request, otherwise error out
 753                                 * and let the timeout stuff handle resubmitting
 754                                 * this request onto another connection.
 755                                 */
 756                                if (nbd_disconnected(config)) {
 757                                        cmd->status = BLK_STS_IOERR;
 758                                        goto out;
 759                                }
 760                                ret = -EIO;
 761                                goto out;
 762                        }
 763                        dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
 764                                req, bvec.bv_len);
 765                }
 766        }
 767out:
 768        trace_nbd_payload_received(req, handle);
 769        mutex_unlock(&cmd->lock);
 770        return ret ? ERR_PTR(ret) : cmd;
 771}
 772
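     /*
      * Per-connection receive worker: read replies and complete requests
      * until the socket errors out, then mark the connection dead and drop
      * this worker's references.
      */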
 773static void recv_work(struct work_struct *work)
 774{
 775        struct recv_thread_args *args = container_of(work,
 776                                                     struct recv_thread_args,
 777                                                     work);
 778        struct nbd_device *nbd = args->nbd;
 779        struct nbd_config *config = nbd->config;
 780        struct nbd_cmd *cmd;
 781        struct request *rq;
 782
 783        while (1) {
 784                cmd = nbd_read_stat(nbd, args->index);
 785                if (IS_ERR(cmd)) {
 786                        struct nbd_sock *nsock = config->socks[args->index];
 787
 788                        mutex_lock(&nsock->tx_lock);
 789                        nbd_mark_nsock_dead(nbd, nsock, 1);
 790                        mutex_unlock(&nsock->tx_lock);
 791                        break;
 792                }
 793
 794                rq = blk_mq_rq_from_pdu(cmd);
 795                if (likely(!blk_should_fake_timeout(rq->q)))
 796                        blk_mq_complete_request(rq);
 797        }
 798        nbd_config_put(nbd);
 799        atomic_dec(&config->recv_threads);
 800        wake_up(&config->recv_wq);
 801        kfree(args);
 802}
 803
 804static bool nbd_clear_req(struct request *req, void *data, bool reserved)
 805{
 806        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 807
 808        mutex_lock(&cmd->lock);
 809        cmd->status = BLK_STS_IOERR;
 810        mutex_unlock(&cmd->lock);
 811
 812        blk_mq_complete_request(req);
 813        return true;
 814}
 815
 816static void nbd_clear_que(struct nbd_device *nbd)
 817{
 818        blk_mq_quiesce_queue(nbd->disk->queue);
 819        blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 820        blk_mq_unquiesce_queue(nbd->disk->queue);
 821        dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 822}
 823
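     /*
      * Pick a live connection to retry on after the one at @index died;
      * returns the new index, or -1 if no usable connection remains.
      */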
 824static int find_fallback(struct nbd_device *nbd, int index)
 825{
 826        struct nbd_config *config = nbd->config;
 827        int new_index = -1;
 828        struct nbd_sock *nsock = config->socks[index];
 829        int fallback = nsock->fallback_index;
 830
 831        if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 832                return new_index;
 833
 834        if (config->num_connections <= 1) {
 835                dev_err_ratelimited(disk_to_dev(nbd->disk),
 836                                    "Dead connection, failed to find a fallback\n");
 837                return new_index;
 838        }
 839
 840        if (fallback >= 0 && fallback < config->num_connections &&
 841            !config->socks[fallback]->dead)
 842                return fallback;
 843
 844        if (nsock->fallback_index < 0 ||
 845            nsock->fallback_index >= config->num_connections ||
 846            config->socks[nsock->fallback_index]->dead) {
 847                int i;
 848                for (i = 0; i < config->num_connections; i++) {
 849                        if (i == index)
 850                                continue;
 851                        if (!config->socks[i]->dead) {
 852                                new_index = i;
 853                                break;
 854                        }
 855                }
 856                nsock->fallback_index = new_index;
 857                if (new_index < 0) {
 858                        dev_err_ratelimited(disk_to_dev(nbd->disk),
 859                                            "Dead connection, failed to find a fallback\n");
 860                        return new_index;
 861                }
 862        }
 863        new_index = nsock->fallback_index;
 864        return new_index;
 865}
 866
 867static int wait_for_reconnect(struct nbd_device *nbd)
 868{
 869        struct nbd_config *config = nbd->config;
 870        if (!config->dead_conn_timeout)
 871                return 0;
 872        if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 873                return 0;
 874        return wait_event_timeout(config->conn_wait,
 875                                  atomic_read(&config->live_connections) > 0,
 876                                  config->dead_conn_timeout) > 0;
 877}
 878
 879static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 880{
 881        struct request *req = blk_mq_rq_from_pdu(cmd);
 882        struct nbd_device *nbd = cmd->nbd;
 883        struct nbd_config *config;
 884        struct nbd_sock *nsock;
 885        int ret;
 886
 887        if (!refcount_inc_not_zero(&nbd->config_refs)) {
 888                dev_err_ratelimited(disk_to_dev(nbd->disk),
 889                                    "Socks array is empty\n");
 890                blk_mq_start_request(req);
 891                return -EINVAL;
 892        }
 893        config = nbd->config;
 894
 895        if (index >= config->num_connections) {
 896                dev_err_ratelimited(disk_to_dev(nbd->disk),
 897                                    "Attempted send on invalid socket\n");
 898                nbd_config_put(nbd);
 899                blk_mq_start_request(req);
 900                return -EINVAL;
 901        }
 902        cmd->status = BLK_STS_OK;
 903again:
 904        nsock = config->socks[index];
 905        mutex_lock(&nsock->tx_lock);
 906        if (nsock->dead) {
 907                int old_index = index;
 908                index = find_fallback(nbd, index);
 909                mutex_unlock(&nsock->tx_lock);
 910                if (index < 0) {
 911                        if (wait_for_reconnect(nbd)) {
 912                                index = old_index;
 913                                goto again;
 914                        }
 915                        /* All the sockets should already be down at this point,
 916                         * we just want to make sure that DISCONNECTED is set so
  917                         * any requests that come in that were queued waiting
 918                         * for the reconnect timer don't trigger the timer again
 919                         * and instead just error out.
 920                         */
 921                        sock_shutdown(nbd);
 922                        nbd_config_put(nbd);
 923                        blk_mq_start_request(req);
 924                        return -EIO;
 925                }
 926                goto again;
 927        }
 928
 929        /* Handle the case that we have a pending request that was partially
 930         * transmitted that _has_ to be serviced first.  We need to call requeue
 931         * here so that it gets put _after_ the request that is already on the
 932         * dispatch list.
 933         */
 934        blk_mq_start_request(req);
 935        if (unlikely(nsock->pending && nsock->pending != req)) {
 936                nbd_requeue_cmd(cmd);
 937                ret = 0;
 938                goto out;
 939        }
 940        /*
 941         * Some failures are related to the link going down, so anything that
 942         * returns EAGAIN can be retried on a different socket.
 943         */
 944        ret = nbd_send_cmd(nbd, cmd, index);
 945        if (ret == -EAGAIN) {
 946                dev_err_ratelimited(disk_to_dev(nbd->disk),
 947                                    "Request send failed, requeueing\n");
 948                nbd_mark_nsock_dead(nbd, nsock, 1);
 949                nbd_requeue_cmd(cmd);
 950                ret = 0;
 951        }
 952out:
 953        mutex_unlock(&nsock->tx_lock);
 954        nbd_config_put(nbd);
 955        return ret;
 956}
 957
 958static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 959                        const struct blk_mq_queue_data *bd)
 960{
 961        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 962        int ret;
 963
 964        /*
 965         * Since we look at the bio's to send the request over the network we
 966         * need to make sure the completion work doesn't mark this request done
 967         * before we are done doing our send.  This keeps us from dereferencing
 968         * freed data if we have particularly fast completions (ie we get the
 969         * completion before we exit sock_xmit on the last bvec) or in the case
 970         * that the server is misbehaving (or there was an error) before we're
 971         * done sending everything over the wire.
 972         */
 973        mutex_lock(&cmd->lock);
 974        clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
 975
 976        /* We can be called directly from the user space process, which means we
 977         * could possibly have signals pending so our sendmsg will fail.  In
 978         * this case we need to return that we are busy, otherwise error out as
 979         * appropriate.
 980         */
 981        ret = nbd_handle_cmd(cmd, hctx->queue_num);
 982        if (ret < 0)
 983                ret = BLK_STS_IOERR;
 984        else if (!ret)
 985                ret = BLK_STS_OK;
 986        mutex_unlock(&cmd->lock);
 987
 988        return ret;
 989}
 990
 991static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
 992                                     int *err)
 993{
 994        struct socket *sock;
 995
 996        *err = 0;
 997        sock = sockfd_lookup(fd, err);
 998        if (!sock)
 999                return NULL;
1000
1001        if (sock->ops->shutdown == sock_no_shutdown) {
1002                dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
1003                *err = -EINVAL;
1004                sockfd_put(sock);
1005                return NULL;
1006        }
1007
1008        return sock;
1009}
1010
1011static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
1012                          bool netlink)
1013{
1014        struct nbd_config *config = nbd->config;
1015        struct socket *sock;
1016        struct nbd_sock **socks;
1017        struct nbd_sock *nsock;
1018        int err;
1019
1020        sock = nbd_get_socket(nbd, arg, &err);
1021        if (!sock)
1022                return err;
1023
1024        /*
1025         * We need to make sure we don't get any errant requests while we're
1026         * reallocating the ->socks array.
1027         */
1028        blk_mq_freeze_queue(nbd->disk->queue);
1029
1030        if (!netlink && !nbd->task_setup &&
1031            !test_bit(NBD_RT_BOUND, &config->runtime_flags))
1032                nbd->task_setup = current;
1033
1034        if (!netlink &&
1035            (nbd->task_setup != current ||
1036             test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
1037                dev_err(disk_to_dev(nbd->disk),
1038                        "Device being setup by another task");
1039                err = -EBUSY;
1040                goto put_socket;
1041        }
1042
1043        nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
1044        if (!nsock) {
1045                err = -ENOMEM;
1046                goto put_socket;
1047        }
1048
1049        socks = krealloc(config->socks, (config->num_connections + 1) *
1050                         sizeof(struct nbd_sock *), GFP_KERNEL);
1051        if (!socks) {
1052                kfree(nsock);
1053                err = -ENOMEM;
1054                goto put_socket;
1055        }
1056
1057        config->socks = socks;
1058
1059        nsock->fallback_index = -1;
1060        nsock->dead = false;
1061        mutex_init(&nsock->tx_lock);
1062        nsock->sock = sock;
1063        nsock->pending = NULL;
1064        nsock->sent = 0;
1065        nsock->cookie = 0;
1066        socks[config->num_connections++] = nsock;
1067        atomic_inc(&config->live_connections);
1068        blk_mq_unfreeze_queue(nbd->disk->queue);
1069
1070        return 0;
1071
1072put_socket:
1073        blk_mq_unfreeze_queue(nbd->disk->queue);
1074        sockfd_put(sock);
1075        return err;
1076}
1077
1078static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
1079{
1080        struct nbd_config *config = nbd->config;
1081        struct socket *sock, *old;
1082        struct recv_thread_args *args;
1083        int i;
1084        int err;
1085
1086        sock = nbd_get_socket(nbd, arg, &err);
1087        if (!sock)
1088                return err;
1089
1090        args = kzalloc(sizeof(*args), GFP_KERNEL);
1091        if (!args) {
1092                sockfd_put(sock);
1093                return -ENOMEM;
1094        }
1095
1096        for (i = 0; i < config->num_connections; i++) {
1097                struct nbd_sock *nsock = config->socks[i];
1098
1099                if (!nsock->dead)
1100                        continue;
1101
1102                mutex_lock(&nsock->tx_lock);
1103                if (!nsock->dead) {
1104                        mutex_unlock(&nsock->tx_lock);
1105                        continue;
1106                }
1107                sk_set_memalloc(sock->sk);
1108                if (nbd->tag_set.timeout)
1109                        sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
1110                atomic_inc(&config->recv_threads);
1111                refcount_inc(&nbd->config_refs);
1112                old = nsock->sock;
1113                nsock->fallback_index = -1;
1114                nsock->sock = sock;
1115                nsock->dead = false;
1116                INIT_WORK(&args->work, recv_work);
1117                args->index = i;
1118                args->nbd = nbd;
1119                nsock->cookie++;
1120                mutex_unlock(&nsock->tx_lock);
1121                sockfd_put(old);
1122
1123                clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
1124
 1125                /* We take the tx_lock in an error path in recv_work, so we
 1126                 * need to queue_work outside of the tx_lock.
 1127                 */
1128                queue_work(nbd->recv_workq, &args->work);
1129
1130                atomic_inc(&config->live_connections);
1131                wake_up(&config->conn_wait);
1132                return 0;
1133        }
1134        sockfd_put(sock);
1135        kfree(args);
1136        return -ENOSPC;
1137}
1138
1139static void nbd_bdev_reset(struct block_device *bdev)
1140{
1141        if (bdev->bd_openers > 1)
1142                return;
1143        set_capacity(bdev->bd_disk, 0);
1144}
1145
1146static void nbd_parse_flags(struct nbd_device *nbd)
1147{
1148        struct nbd_config *config = nbd->config;
1149        if (config->flags & NBD_FLAG_READ_ONLY)
1150                set_disk_ro(nbd->disk, true);
1151        else
1152                set_disk_ro(nbd->disk, false);
1153        if (config->flags & NBD_FLAG_SEND_TRIM)
1154                blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1155        if (config->flags & NBD_FLAG_SEND_FLUSH) {
1156                if (config->flags & NBD_FLAG_SEND_FUA)
1157                        blk_queue_write_cache(nbd->disk->queue, true, true);
1158                else
1159                        blk_queue_write_cache(nbd->disk->queue, true, false);
1160        }
1161        else
1162                blk_queue_write_cache(nbd->disk->queue, false, false);
1163}
1164
1165static void send_disconnects(struct nbd_device *nbd)
1166{
1167        struct nbd_config *config = nbd->config;
1168        struct nbd_request request = {
1169                .magic = htonl(NBD_REQUEST_MAGIC),
1170                .type = htonl(NBD_CMD_DISC),
1171        };
1172        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1173        struct iov_iter from;
1174        int i, ret;
1175
1176        for (i = 0; i < config->num_connections; i++) {
1177                struct nbd_sock *nsock = config->socks[i];
1178
1179                iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
1180                mutex_lock(&nsock->tx_lock);
1181                ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1182                if (ret <= 0)
1183                        dev_err(disk_to_dev(nbd->disk),
1184                                "Send disconnect failed %d\n", ret);
1185                mutex_unlock(&nsock->tx_lock);
1186        }
1187}
1188
1189static int nbd_disconnect(struct nbd_device *nbd)
1190{
1191        struct nbd_config *config = nbd->config;
1192
1193        dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1194        set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
1195        set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
1196        send_disconnects(nbd);
1197        return 0;
1198}
1199
1200static void nbd_clear_sock(struct nbd_device *nbd)
1201{
1202        sock_shutdown(nbd);
1203        nbd_clear_que(nbd);
1204        nbd->task_setup = NULL;
1205}
1206
1207static void nbd_config_put(struct nbd_device *nbd)
1208{
1209        if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1210                                        &nbd->config_lock)) {
1211                struct nbd_config *config = nbd->config;
1212                nbd_dev_dbg_close(nbd);
1213                nbd_size_clear(nbd);
1214                if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
1215                                       &config->runtime_flags))
1216                        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1217                nbd->task_recv = NULL;
1218                nbd_clear_sock(nbd);
1219                if (config->num_connections) {
1220                        int i;
1221                        for (i = 0; i < config->num_connections; i++) {
1222                                sockfd_put(config->socks[i]->sock);
1223                                kfree(config->socks[i]);
1224                        }
1225                        kfree(config->socks);
1226                }
1227                kfree(nbd->config);
1228                nbd->config = NULL;
1229
1230                if (nbd->recv_workq)
1231                        destroy_workqueue(nbd->recv_workq);
1232                nbd->recv_workq = NULL;
1233
1234                nbd->tag_set.timeout = 0;
1235                nbd->disk->queue->limits.discard_granularity = 0;
1236                nbd->disk->queue->limits.discard_alignment = 0;
1237                blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
1238                blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1239
1240                mutex_unlock(&nbd->config_lock);
1241                nbd_put(nbd);
1242                module_put(THIS_MODULE);
1243        }
1244}
1245
1246static int nbd_start_device(struct nbd_device *nbd)
1247{
1248        struct nbd_config *config = nbd->config;
1249        int num_connections = config->num_connections;
1250        int error = 0, i;
1251
1252        if (nbd->task_recv)
1253                return -EBUSY;
1254        if (!config->socks)
1255                return -EINVAL;
1256        if (num_connections > 1 &&
1257            !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1258                dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1259                return -EINVAL;
1260        }
1261
1262        nbd->recv_workq = alloc_workqueue("knbd%d-recv",
1263                                          WQ_MEM_RECLAIM | WQ_HIGHPRI |
1264                                          WQ_UNBOUND, 0, nbd->index);
1265        if (!nbd->recv_workq) {
1266                dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
1267                return -ENOMEM;
1268        }
1269
1270        blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
1271        nbd->task_recv = current;
1272
1273        nbd_parse_flags(nbd);
1274
1275        error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1276        if (error) {
1277                dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
1278                return error;
1279        }
1280        set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
1281
1282        nbd_dev_dbg_init(nbd);
1283        for (i = 0; i < num_connections; i++) {
1284                struct recv_thread_args *args;
1285
1286                args = kzalloc(sizeof(*args), GFP_KERNEL);
1287                if (!args) {
1288                        sock_shutdown(nbd);
 1289                        /*
 1290                         * If num_connections is m (m > 2) and the first n
 1291                         * (1 < n < m) kzallocs succeeded but allocation n + 1
 1292                         * failed, we still have n recv threads running.
 1293                         * Flush the workqueue here to prevent those threads
 1294                         * from dropping the last config ref and trying to
 1295                         * destroy the workqueue from inside the workqueue.
 1296                         */
1297                        if (i)
1298                                flush_workqueue(nbd->recv_workq);
1299                        return -ENOMEM;
1300                }
1301                sk_set_memalloc(config->socks[i]->sock->sk);
1302                if (nbd->tag_set.timeout)
1303                        config->socks[i]->sock->sk->sk_sndtimeo =
1304                                nbd->tag_set.timeout;
1305                atomic_inc(&config->recv_threads);
1306                refcount_inc(&nbd->config_refs);
1307                INIT_WORK(&args->work, recv_work);
1308                args->nbd = nbd;
1309                args->index = i;
1310                queue_work(nbd->recv_workq, &args->work);
1311        }
1312        return nbd_set_size(nbd, config->bytesize, config->blksize);
1313}
1314
1315static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
1316{
1317        struct nbd_config *config = nbd->config;
1318        int ret;
1319
1320        ret = nbd_start_device(nbd);
1321        if (ret)
1322                return ret;
1323
1324        if (max_part)
1325                set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
1326        mutex_unlock(&nbd->config_lock);
1327        ret = wait_event_interruptible(config->recv_wq,
1328                                         atomic_read(&config->recv_threads) == 0);
1329        if (ret)
1330                sock_shutdown(nbd);
1331        flush_workqueue(nbd->recv_workq);
1332
1333        mutex_lock(&nbd->config_lock);
1334        nbd_bdev_reset(bdev);
1335        /* user requested, ignore socket errors */
1336        if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
1337                ret = 0;
1338        if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
1339                ret = -ETIMEDOUT;
1340        return ret;
1341}
1342
1343static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1344                                 struct block_device *bdev)
1345{
1346        sock_shutdown(nbd);
1347        __invalidate_device(bdev, true);
1348        nbd_bdev_reset(bdev);
1349        if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
1350                               &nbd->config->runtime_flags))
1351                nbd_config_put(nbd);
1352}
1353
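     /*
      * A timeout of 0 disables socket disconnection on command timeouts;
      * keep a 30 second block-layer timer so nbd_xmit_timeout() still runs
      * to warn about possibly stuck requests.
      */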
1354static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
1355{
1356        nbd->tag_set.timeout = timeout * HZ;
1357        if (timeout)
1358                blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1359        else
1360                blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
1361}
1362
1363/* Must be called with config_lock held */
1364static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1365                       unsigned int cmd, unsigned long arg)
1366{
1367        struct nbd_config *config = nbd->config;
1368
1369        switch (cmd) {
1370        case NBD_DISCONNECT:
1371                return nbd_disconnect(nbd);
1372        case NBD_CLEAR_SOCK:
1373                nbd_clear_sock_ioctl(nbd, bdev);
1374                return 0;
1375        case NBD_SET_SOCK:
1376                return nbd_add_socket(nbd, arg, false);
1377        case NBD_SET_BLKSIZE:
1378                return nbd_set_size(nbd, config->bytesize, arg);
1379        case NBD_SET_SIZE:
1380                return nbd_set_size(nbd, arg, config->blksize);
1381        case NBD_SET_SIZE_BLOCKS:
1382                return nbd_set_size(nbd, arg * config->blksize,
1383                                    config->blksize);
1384        case NBD_SET_TIMEOUT:
1385                nbd_set_cmd_timeout(nbd, arg);
1386                return 0;
1387
1388        case NBD_SET_FLAGS:
1389                config->flags = arg;
1390                return 0;
1391        case NBD_DO_IT:
1392                return nbd_start_device_ioctl(nbd, bdev);
1393        case NBD_CLEAR_QUE:
1394                /*
1395                 * This is for compatibility only.  The queue is always cleared
1396                 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1397                 */
1398                return 0;
1399        case NBD_PRINT_DEBUG:
1400                /*
1401                 * For compatibility only, we no longer keep a list of
1402                 * outstanding requests.
1403                 */
1404                return 0;
1405        }
1406        return -ENOTTY;
1407}
1408
1409static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1410                     unsigned int cmd, unsigned long arg)
1411{
1412        struct nbd_device *nbd = bdev->bd_disk->private_data;
1413        struct nbd_config *config = nbd->config;
1414        int error = -EINVAL;
1415
1416        if (!capable(CAP_SYS_ADMIN))
1417                return -EPERM;
1418
1419        /* The block layer will pass back some non-nbd ioctls in case we have
 1420         * special handling for them, but we don't, so just return an error.
1421         */
1422        if (_IOC_TYPE(cmd) != 0xab)
1423                return -EINVAL;
1424
1425        mutex_lock(&nbd->config_lock);
1426
1427        /* Don't allow ioctl operations on a nbd device that was created with
1428         * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1429         */
1430        if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
1431            (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1432                error = __nbd_ioctl(bdev, nbd, cmd, arg);
1433        else
1434                dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1435        mutex_unlock(&nbd->config_lock);
1436        return error;
1437}
1438
1439static struct nbd_config *nbd_alloc_config(void)
1440{
1441        struct nbd_config *config;
1442
1443        config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1444        if (!config)
1445                return NULL;
1446        atomic_set(&config->recv_threads, 0);
1447        init_waitqueue_head(&config->recv_wq);
1448        init_waitqueue_head(&config->conn_wait);
1449        config->blksize = NBD_DEF_BLKSIZE;
1450        atomic_set(&config->live_connections, 0);
1451        try_module_get(THIS_MODULE);
1452        return config;
1453}
1454
1455static int nbd_open(struct block_device *bdev, fmode_t mode)
1456{
1457        struct nbd_device *nbd;
1458        int ret = 0;
1459
1460        mutex_lock(&nbd_index_mutex);
1461        nbd = bdev->bd_disk->private_data;
1462        if (!nbd) {
1463                ret = -ENXIO;
1464                goto out;
1465        }
1466        if (!refcount_inc_not_zero(&nbd->refs)) {
1467                ret = -ENXIO;
1468                goto out;
1469        }
1470        if (!refcount_inc_not_zero(&nbd->config_refs)) {
1471                struct nbd_config *config;
1472
1473                mutex_lock(&nbd->config_lock);
1474                if (refcount_inc_not_zero(&nbd->config_refs)) {
1475                        mutex_unlock(&nbd->config_lock);
1476                        goto out;
1477                }
1478                config = nbd->config = nbd_alloc_config();
1479                if (!config) {
1480                        ret = -ENOMEM;
1481                        mutex_unlock(&nbd->config_lock);
1482                        goto out;
1483                }
1484                refcount_set(&nbd->config_refs, 1);
1485                refcount_inc(&nbd->refs);
1486                mutex_unlock(&nbd->config_lock);
1487                if (max_part)
1488                        set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
1489        } else if (nbd_disconnected(nbd->config)) {
1490                if (max_part)
1491                        set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
1492        }
1493out:
1494        mutex_unlock(&nbd_index_mutex);
1495        return ret;
1496}
1497
1498static void nbd_release(struct gendisk *disk, fmode_t mode)
1499{
1500        struct nbd_device *nbd = disk->private_data;
1501
1502        if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
1503                        disk->part0->bd_openers == 0)
1504                nbd_disconnect_and_put(nbd);
1505
1506        nbd_config_put(nbd);
1507        nbd_put(nbd);
1508}
1509
1510static const struct block_device_operations nbd_fops =
1511{
1512        .owner =        THIS_MODULE,
1513        .open =         nbd_open,
1514        .release =      nbd_release,
1515        .ioctl =        nbd_ioctl,
1516        .compat_ioctl = nbd_ioctl,
1517};
1518
1519#if IS_ENABLED(CONFIG_DEBUG_FS)
1520
1521static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1522{
1523        struct nbd_device *nbd = s->private;
1524
1525        if (nbd->task_recv)
1526                seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
1527
1528        return 0;
1529}
1530
1531DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
1532
1533static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1534{
1535        struct nbd_device *nbd = s->private;
1536        u32 flags = nbd->config->flags;
1537
1538        seq_printf(s, "Hex: 0x%08x\n\n", flags);
1539
1540        seq_puts(s, "Known flags:\n");
1541
1542        if (flags & NBD_FLAG_HAS_FLAGS)
1543                seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1544        if (flags & NBD_FLAG_READ_ONLY)
1545                seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1546        if (flags & NBD_FLAG_SEND_FLUSH)
1547                seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1548        if (flags & NBD_FLAG_SEND_FUA)
1549                seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1550        if (flags & NBD_FLAG_SEND_TRIM)
1551                seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1552
1553        return 0;
1554}
1555
1556DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
1557
1558static int nbd_dev_dbg_init(struct nbd_device *nbd)
1559{
1560        struct dentry *dir;
1561        struct nbd_config *config = nbd->config;
1562
1563        if (!nbd_dbg_dir)
1564                return -EIO;
1565
1566        dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1567        if (!dir) {
1568                dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1569                        nbd_name(nbd));
1570                return -EIO;
1571        }
1572        config->dbg_dir = dir;
1573
1574        debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
1575        debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1576        debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1577        debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1578        debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
1579
1580        return 0;
1581}
1582
1583static void nbd_dev_dbg_close(struct nbd_device *nbd)
1584{
1585        debugfs_remove_recursive(nbd->config->dbg_dir);
1586}
1587
1588static int nbd_dbg_init(void)
1589{
1590        struct dentry *dbg_dir;
1591
1592        dbg_dir = debugfs_create_dir("nbd", NULL);
1593        if (!dbg_dir)
1594                return -EIO;
1595
1596        nbd_dbg_dir = dbg_dir;
1597
1598        return 0;
1599}
1600
1601static void nbd_dbg_close(void)
1602{
1603        debugfs_remove_recursive(nbd_dbg_dir);
1604}
1605
1606#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
1607
1608static int nbd_dev_dbg_init(struct nbd_device *nbd)
1609{
1610        return 0;
1611}
1612
1613static void nbd_dev_dbg_close(struct nbd_device *nbd)
1614{
1615}
1616
1617static int nbd_dbg_init(void)
1618{
1619        return 0;
1620}
1621
1622static void nbd_dbg_close(void)
1623{
1624}
1625
1626#endif
1627
1628static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1629                            unsigned int hctx_idx, unsigned int numa_node)
1630{
1631        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1632        cmd->nbd = set->driver_data;
1633        cmd->flags = 0;
1634        mutex_init(&cmd->lock);
1635        return 0;
1636}
1637
1638static const struct blk_mq_ops nbd_mq_ops = {
1639        .queue_rq       = nbd_queue_rq,
1640        .complete       = nbd_complete_rq,
1641        .init_request   = nbd_init_request,
1642        .timeout        = nbd_xmit_timeout,
1643};
1644
1645static int nbd_dev_add(int index)
1646{
1647        struct nbd_device *nbd;
1648        struct gendisk *disk;
1649        struct request_queue *q;
1650        int err = -ENOMEM;
1651
1652        nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1653        if (!nbd)
1654                goto out;
1655
1656        disk = alloc_disk(1 << part_shift);
1657        if (!disk)
1658                goto out_free_nbd;
1659
1660        if (index >= 0) {
1661                err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1662                                GFP_KERNEL);
1663                if (err == -ENOSPC)
1664                        err = -EEXIST;
1665        } else {
1666                err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1667                if (err >= 0)
1668                        index = err;
1669        }
1670        if (err < 0)
1671                goto out_free_disk;
1672
1673        nbd->index = index;
1674        nbd->disk = disk;
1675        nbd->tag_set.ops = &nbd_mq_ops;
1676        nbd->tag_set.nr_hw_queues = 1;
1677        nbd->tag_set.queue_depth = 128;
1678        nbd->tag_set.numa_node = NUMA_NO_NODE;
1679        nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1680        nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1681                BLK_MQ_F_BLOCKING;
1682        nbd->tag_set.driver_data = nbd;
1683        nbd->destroy_complete = NULL;
1684
1685        err = blk_mq_alloc_tag_set(&nbd->tag_set);
1686        if (err)
1687                goto out_free_idr;
1688
1689        q = blk_mq_init_queue(&nbd->tag_set);
1690        if (IS_ERR(q)) {
1691                err = PTR_ERR(q);
1692                goto out_free_tags;
1693        }
1694        disk->queue = q;
1695
1696        /*
1697         * Tell the block layer that we are not a rotational device
1698         */
1699        blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1700        blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1701        disk->queue->limits.discard_granularity = 0;
1702        disk->queue->limits.discard_alignment = 0;
1703        blk_queue_max_discard_sectors(disk->queue, 0);
1704        blk_queue_max_segment_size(disk->queue, UINT_MAX);
1705        blk_queue_max_segments(disk->queue, USHRT_MAX);
1706        blk_queue_max_hw_sectors(disk->queue, 65536);
1707        disk->queue->limits.max_sectors = 256;
1708
1709        mutex_init(&nbd->config_lock);
1710        refcount_set(&nbd->config_refs, 0);
1711        refcount_set(&nbd->refs, 1);
1712        INIT_LIST_HEAD(&nbd->list);
1713        disk->major = NBD_MAJOR;
1714        disk->first_minor = index << part_shift;
1715        disk->fops = &nbd_fops;
1716        disk->private_data = nbd;
1717        sprintf(disk->disk_name, "nbd%d", index);
1718        add_disk(disk);
1719        nbd_total_devices++;
1720        return index;
1721
1722out_free_tags:
1723        blk_mq_free_tag_set(&nbd->tag_set);
1724out_free_idr:
1725        idr_remove(&nbd_index_idr, index);
1726out_free_disk:
1727        put_disk(disk);
1728out_free_nbd:
1729        kfree(nbd);
1730out:
1731        return err;
1732}
1733
1734static int find_free_cb(int id, void *ptr, void *data)
1735{
1736        struct nbd_device *nbd = ptr;
1737        struct nbd_device **found = data;
1738
1739        if (!refcount_read(&nbd->config_refs)) {
1740                *found = nbd;
1741                return 1;
1742        }
1743        return 0;
1744}
1745
1746/* Netlink interface. */
1747static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
1748        [NBD_ATTR_INDEX]                =       { .type = NLA_U32 },
1749        [NBD_ATTR_SIZE_BYTES]           =       { .type = NLA_U64 },
1750        [NBD_ATTR_BLOCK_SIZE_BYTES]     =       { .type = NLA_U64 },
1751        [NBD_ATTR_TIMEOUT]              =       { .type = NLA_U64 },
1752        [NBD_ATTR_SERVER_FLAGS]         =       { .type = NLA_U64 },
1753        [NBD_ATTR_CLIENT_FLAGS]         =       { .type = NLA_U64 },
1754        [NBD_ATTR_SOCKETS]              =       { .type = NLA_NESTED},
1755        [NBD_ATTR_DEAD_CONN_TIMEOUT]    =       { .type = NLA_U64 },
1756        [NBD_ATTR_DEVICE_LIST]          =       { .type = NLA_NESTED},
1757};
1758
1759static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
1760        [NBD_SOCK_FD]                   =       { .type = NLA_U32 },
1761};
1762
1763/* We don't use this right now since we don't parse the incoming list, but we
1764 * still want it here so userspace knows what to expect.
1765 */
1766static const struct nla_policy __attribute__((unused))
1767nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1768        [NBD_DEVICE_INDEX]              =       { .type = NLA_U32 },
1769        [NBD_DEVICE_CONNECTED]          =       { .type = NLA_U8 },
1770};
1771
1772static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
1773{
1774        struct nbd_config *config = nbd->config;
1775        u64 bsize = config->blksize;
1776        u64 bytes = config->bytesize;
1777
1778        if (info->attrs[NBD_ATTR_SIZE_BYTES])
1779                bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1780
1781        if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES])
1782                bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1783
1784        if (bytes != config->bytesize || bsize != config->blksize)
1785                return nbd_set_size(nbd, bytes, bsize);
1786        return 0;
1787}
1788
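    /*
     * For reference, a rough sketch (not authoritative) of the NBD_CMD_CONNECT
     * message that nbd_genl_connect() below parses; attribute names come from
     * nbd-netlink.h and the nesting is inferred from the nla_for_each_nested()
     * loop:
     *
     *    NBD_CMD_CONNECT
     *      NBD_ATTR_INDEX                (u32, optional; omit to pick a free device)
     *      NBD_ATTR_SIZE_BYTES           (u64, required)
     *      NBD_ATTR_BLOCK_SIZE_BYTES     (u64, optional)
     *      NBD_ATTR_TIMEOUT              (u64, optional, seconds)
     *      NBD_ATTR_DEAD_CONN_TIMEOUT    (u64, optional, seconds)
     *      NBD_ATTR_SERVER_FLAGS         (u64, optional)
     *      NBD_ATTR_CLIENT_FLAGS         (u64, optional)
     *      NBD_ATTR_SOCKETS              (nested, required)
     *        NBD_SOCK_ITEM
     *          NBD_SOCK_FD               (u32, connected socket fd)
     *        NBD_SOCK_ITEM ...
     */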
1789static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1790{
1791        DECLARE_COMPLETION_ONSTACK(destroy_complete);
1792        struct nbd_device *nbd = NULL;
1793        struct nbd_config *config;
1794        int index = -1;
1795        int ret;
1796        bool put_dev = false;
1797
1798        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1799                return -EPERM;
1800
1801        if (info->attrs[NBD_ATTR_INDEX])
1802                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1803        if (!info->attrs[NBD_ATTR_SOCKETS]) {
1804                printk(KERN_ERR "nbd: must specify at least one socket\n");
1805                return -EINVAL;
1806        }
1807        if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1808                printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1809                return -EINVAL;
1810        }
1811again:
1812        mutex_lock(&nbd_index_mutex);
1813        if (index == -1) {
1814                ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1815                if (ret == 0) {
1816                        int new_index;
1817                        new_index = nbd_dev_add(-1);
1818                        if (new_index < 0) {
1819                                mutex_unlock(&nbd_index_mutex);
1820                                printk(KERN_ERR "nbd: failed to add new device\n");
1821                                return new_index;
1822                        }
1823                        nbd = idr_find(&nbd_index_idr, new_index);
1824                }
1825        } else {
1826                nbd = idr_find(&nbd_index_idr, index);
1827                if (!nbd) {
1828                        ret = nbd_dev_add(index);
1829                        if (ret < 0) {
1830                                mutex_unlock(&nbd_index_mutex);
1831                                printk(KERN_ERR "nbd: failed to add new device\n");
1832                                return ret;
1833                        }
1834                        nbd = idr_find(&nbd_index_idr, index);
1835                }
1836        }
1837        if (!nbd) {
1838                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1839                       index);
1840                mutex_unlock(&nbd_index_mutex);
1841                return -EINVAL;
1842        }
1843
1844        if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
1845            test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
1846                nbd->destroy_complete = &destroy_complete;
1847                mutex_unlock(&nbd_index_mutex);
1848
1849                /* Wait until the nbd stuff is totally destroyed */
1850                wait_for_completion(&destroy_complete);
1851                goto again;
1852        }
1853
1854        if (!refcount_inc_not_zero(&nbd->refs)) {
1855                mutex_unlock(&nbd_index_mutex);
1856                if (index == -1)
1857                        goto again;
1858                printk(KERN_ERR "nbd: device at index %d is going down\n",
1859                       index);
1860                return -EINVAL;
1861        }
1862        mutex_unlock(&nbd_index_mutex);
1863
1864        mutex_lock(&nbd->config_lock);
1865        if (refcount_read(&nbd->config_refs)) {
1866                mutex_unlock(&nbd->config_lock);
1867                nbd_put(nbd);
1868                if (index == -1)
1869                        goto again;
1870                printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1871                return -EBUSY;
1872        }
1873        if (WARN_ON(nbd->config)) {
1874                mutex_unlock(&nbd->config_lock);
1875                nbd_put(nbd);
1876                return -EINVAL;
1877        }
1878        config = nbd->config = nbd_alloc_config();
1879        if (!nbd->config) {
1880                mutex_unlock(&nbd->config_lock);
1881                nbd_put(nbd);
1882                printk(KERN_ERR "nbd: couldn't allocate config\n");
1883                return -ENOMEM;
1884        }
1885        refcount_set(&nbd->config_refs, 1);
1886        set_bit(NBD_RT_BOUND, &config->runtime_flags);
1887
1888        ret = nbd_genl_size_set(info, nbd);
1889        if (ret)
1890                goto out;
1891
1892        if (info->attrs[NBD_ATTR_TIMEOUT])
1893                nbd_set_cmd_timeout(nbd,
1894                                    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
1895        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1896                config->dead_conn_timeout =
1897                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1898                config->dead_conn_timeout *= HZ;
1899        }
1900        if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1901                config->flags =
1902                        nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
1903        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1904                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1905                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1906                        /*
1907                         * We have 1 ref to keep the device around, and then 1
1908                         * ref for our current operation here, which will be
1909                         * inherited by the config.  If we already have
1910                         * DESTROY_ON_DISCONNECT set then we know we don't have
1911                         * that extra ref already held so we don't need the
1912                         * put_dev.
1913                         */
1914                        if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
1915                                              &nbd->flags))
1916                                put_dev = true;
1917                } else {
1918                        if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
1919                                               &nbd->flags))
1920                                refcount_inc(&nbd->refs);
1921                }
1922                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1923                        set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
1924                                &config->runtime_flags);
1925                }
1926        }
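                /*
                 * Rough accounting at this point: nbd->refs holds the
                 * device-lifetime ref plus the ref taken for this operation,
                 * which the config inherits.  Newly setting
                 * NBD_DESTROY_ON_DISCONNECT drops one ref via put_dev at the
                 * end of this function, so tearing down the config can take
                 * the device with it; clearing the flag restores that ref
                 * above.
                 */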
1927
1928        if (info->attrs[NBD_ATTR_SOCKETS]) {
1929                struct nlattr *attr;
1930                int rem, fd;
1931
1932                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1933                                    rem) {
1934                        struct nlattr *socks[NBD_SOCK_MAX+1];
1935
1936                        if (nla_type(attr) != NBD_SOCK_ITEM) {
1937                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1938                                ret = -EINVAL;
1939                                goto out;
1940                        }
1941                        ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
1942                                                          attr,
1943                                                          nbd_sock_policy,
1944                                                          info->extack);
1945                        if (ret != 0) {
1946                                printk(KERN_ERR "nbd: error processing sock list\n");
1947                                ret = -EINVAL;
1948                                goto out;
1949                        }
1950                        if (!socks[NBD_SOCK_FD])
1951                                continue;
1952                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1953                        ret = nbd_add_socket(nbd, fd, true);
1954                        if (ret)
1955                                goto out;
1956                }
1957        }
1958        ret = nbd_start_device(nbd);
1959out:
1960        mutex_unlock(&nbd->config_lock);
1961        if (!ret) {
1962                set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
1963                refcount_inc(&nbd->config_refs);
1964                nbd_connect_reply(info, nbd->index);
1965        }
1966        nbd_config_put(nbd);
1967        if (put_dev)
1968                nbd_put(nbd);
1969        return ret;
1970}
1971
1972static void nbd_disconnect_and_put(struct nbd_device *nbd)
1973{
1974        mutex_lock(&nbd->config_lock);
1975        nbd_disconnect(nbd);
1976        nbd_clear_sock(nbd);
1977        mutex_unlock(&nbd->config_lock);
1978        /*
1979         * Make sure recv thread has finished, so it does not drop the last
1980         * config ref and try to destroy the workqueue from inside the work
1981         * queue.
1982         */
1983        if (nbd->recv_workq)
1984                flush_workqueue(nbd->recv_workq);
1985        if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
1986                               &nbd->config->runtime_flags))
1987                nbd_config_put(nbd);
1988}
1989
1990static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
1991{
1992        struct nbd_device *nbd;
1993        int index;
1994
1995        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1996                return -EPERM;
1997
1998        if (!info->attrs[NBD_ATTR_INDEX]) {
1999                printk(KERN_ERR "nbd: must specify an index to disconnect\n");
2000                return -EINVAL;
2001        }
2002        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2003        mutex_lock(&nbd_index_mutex);
2004        nbd = idr_find(&nbd_index_idr, index);
2005        if (!nbd) {
2006                mutex_unlock(&nbd_index_mutex);
2007                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
2008                       index);
2009                return -EINVAL;
2010        }
2011        if (!refcount_inc_not_zero(&nbd->refs)) {
2012                mutex_unlock(&nbd_index_mutex);
2013                printk(KERN_ERR "nbd: device at index %d is going down\n",
2014                       index);
2015                return -EINVAL;
2016        }
2017        mutex_unlock(&nbd_index_mutex);
2018        if (!refcount_inc_not_zero(&nbd->config_refs))
2019                goto put_nbd;
2020        nbd_disconnect_and_put(nbd);
2021        nbd_config_put(nbd);
2022put_nbd:
2023        nbd_put(nbd);
2024        return 0;
2025}
2026
2027static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
2028{
2029        struct nbd_device *nbd = NULL;
2030        struct nbd_config *config;
2031        int index;
2032        int ret = 0;
2033        bool put_dev = false;
2034
2035        if (!netlink_capable(skb, CAP_SYS_ADMIN))
2036                return -EPERM;
2037
2038        if (!info->attrs[NBD_ATTR_INDEX]) {
2039                printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
2040                return -EINVAL;
2041        }
2042        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2043        mutex_lock(&nbd_index_mutex);
2044        nbd = idr_find(&nbd_index_idr, index);
2045        if (!nbd) {
2046                mutex_unlock(&nbd_index_mutex);
2047                printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
2048                       index);
2049                return -EINVAL;
2050        }
2051        if (!refcount_inc_not_zero(&nbd->refs)) {
2052                mutex_unlock(&nbd_index_mutex);
2053                printk(KERN_ERR "nbd: device at index %d is going down\n",
2054                       index);
2055                return -EINVAL;
2056        }
2057        mutex_unlock(&nbd_index_mutex);
2058
2059        if (!refcount_inc_not_zero(&nbd->config_refs)) {
2060                dev_err(nbd_to_dev(nbd),
2061                        "not configured, cannot reconfigure\n");
2062                nbd_put(nbd);
2063                return -EINVAL;
2064        }
2065
2066        mutex_lock(&nbd->config_lock);
2067        config = nbd->config;
2068        if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
2069            !nbd->task_recv) {
2070                dev_err(nbd_to_dev(nbd),
2071                        "not configured, cannot reconfigure\n");
2072                ret = -EINVAL;
2073                goto out;
2074        }
2075
2076        ret = nbd_genl_size_set(info, nbd);
2077        if (ret)
2078                goto out;
2079
2080        if (info->attrs[NBD_ATTR_TIMEOUT])
2081                nbd_set_cmd_timeout(nbd,
2082                                    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
2083        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
2084                config->dead_conn_timeout =
2085                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
2086                config->dead_conn_timeout *= HZ;
2087        }
2088        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
2089                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
2090                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
2091                        if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
2092                                              &nbd->flags))
2093                                put_dev = true;
2094                } else {
2095                        if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
2096                                               &nbd->flags))
2097                                refcount_inc(&nbd->refs);
2098                }
2099
2100                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
2101                        set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2102                                        &config->runtime_flags);
2103                } else {
2104                        clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2105                                        &config->runtime_flags);
2106                }
2107        }
2108
2109        if (info->attrs[NBD_ATTR_SOCKETS]) {
2110                struct nlattr *attr;
2111                int rem, fd;
2112
2113                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
2114                                    rem) {
2115                        struct nlattr *socks[NBD_SOCK_MAX+1];
2116
2117                        if (nla_type(attr) != NBD_SOCK_ITEM) {
2118                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
2119                                ret = -EINVAL;
2120                                goto out;
2121                        }
2122                        ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
2123                                                          attr,
2124                                                          nbd_sock_policy,
2125                                                          info->extack);
2126                        if (ret != 0) {
2127                                printk(KERN_ERR "nbd: error processing sock list\n");
2128                                ret = -EINVAL;
2129                                goto out;
2130                        }
2131                        if (!socks[NBD_SOCK_FD])
2132                                continue;
2133                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2134                        ret = nbd_reconnect_socket(nbd, fd);
2135                        if (ret) {
2136                                if (ret == -ENOSPC)
2137                                        ret = 0;
2138                                goto out;
2139                        }
2140                        dev_info(nbd_to_dev(nbd), "reconnected socket\n");
2141                }
2142        }
2143out:
2144        mutex_unlock(&nbd->config_lock);
2145        nbd_config_put(nbd);
2146        nbd_put(nbd);
2147        if (put_dev)
2148                nbd_put(nbd);
2149        return ret;
2150}
2151
2152static const struct genl_small_ops nbd_connect_genl_ops[] = {
2153        {
2154                .cmd    = NBD_CMD_CONNECT,
2155                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2156                .doit   = nbd_genl_connect,
2157        },
2158        {
2159                .cmd    = NBD_CMD_DISCONNECT,
2160                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2161                .doit   = nbd_genl_disconnect,
2162        },
2163        {
2164                .cmd    = NBD_CMD_RECONFIGURE,
2165                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2166                .doit   = nbd_genl_reconfigure,
2167        },
2168        {
2169                .cmd    = NBD_CMD_STATUS,
2170                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2171                .doit   = nbd_genl_status,
2172        },
2173};
2174
2175static const struct genl_multicast_group nbd_mcast_grps[] = {
2176        { .name = NBD_GENL_MCAST_GROUP_NAME, },
2177};
2178
2179static struct genl_family nbd_genl_family __ro_after_init = {
2180        .hdrsize        = 0,
2181        .name           = NBD_GENL_FAMILY_NAME,
2182        .version        = NBD_GENL_VERSION,
2183        .module         = THIS_MODULE,
2184        .small_ops      = nbd_connect_genl_ops,
2185        .n_small_ops    = ARRAY_SIZE(nbd_connect_genl_ops),
2186        .maxattr        = NBD_ATTR_MAX,
2187        .policy = nbd_attr_policy,
2188        .mcgrps         = nbd_mcast_grps,
2189        .n_mcgrps       = ARRAY_SIZE(nbd_mcast_grps),
2190};
2191
2192static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
2193{
2194        struct nlattr *dev_opt;
2195        u8 connected = 0;
2196        int ret;
2197
2198        /* This is a little racy, but for status it's ok.  The
2199         * reason we don't take a ref here is that we can't
2200         * take a ref in the index == -1 case, as we would need
2201         * to drop it under the nbd_index_mutex, which could
2202         * deadlock if we are configured to remove ourselves
2203         * once we're disconnected.
2204         */
2205        if (refcount_read(&nbd->config_refs))
2206                connected = 1;
2207        dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
2208        if (!dev_opt)
2209                return -EMSGSIZE;
2210        ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
2211        if (ret)
2212                return -EMSGSIZE;
2213        ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
2214                         connected);
2215        if (ret)
2216                return -EMSGSIZE;
2217        nla_nest_end(reply, dev_opt);
2218        return 0;
2219}
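    /*
     * Roughly, the NBD_CMD_STATUS reply built in nbd_genl_status() below nests
     * one such NBD_DEVICE_ITEM per device:
     *
     *    NBD_ATTR_DEVICE_LIST
     *      NBD_DEVICE_ITEM
     *        NBD_DEVICE_INDEX        (u32)
     *        NBD_DEVICE_CONNECTED    (u8, 1 if a config is currently live)
     *      NBD_DEVICE_ITEM ...
     */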
2220
2221static int status_cb(int id, void *ptr, void *data)
2222{
2223        struct nbd_device *nbd = ptr;
2224        return populate_nbd_status(nbd, (struct sk_buff *)data);
2225}
2226
2227static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2228{
2229        struct nlattr *dev_list;
2230        struct sk_buff *reply;
2231        void *reply_head;
2232        size_t msg_size;
2233        int index = -1;
2234        int ret = -ENOMEM;
2235
2236        if (info->attrs[NBD_ATTR_INDEX])
2237                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2238
2239        mutex_lock(&nbd_index_mutex);
2240
2241        msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2242                                  nla_attr_size(sizeof(u8)));
2243        msg_size *= (index == -1) ? nbd_total_devices : 1;
2244
2245        reply = genlmsg_new(msg_size, GFP_KERNEL);
2246        if (!reply)
2247                goto out;
2248        reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2249                                       NBD_CMD_STATUS);
2250        if (!reply_head) {
2251                nlmsg_free(reply);
2252                goto out;
2253        }
2254
2255        dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
2256        if (index == -1) {
2257                ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2258                if (ret) {
2259                        nlmsg_free(reply);
2260                        goto out;
2261                }
2262        } else {
2263                struct nbd_device *nbd;
2264                nbd = idr_find(&nbd_index_idr, index);
2265                if (nbd) {
2266                        ret = populate_nbd_status(nbd, reply);
2267                        if (ret) {
2268                                nlmsg_free(reply);
2269                                goto out;
2270                        }
2271                }
2272        }
2273        nla_nest_end(reply, dev_list);
2274        genlmsg_end(reply, reply_head);
2275        ret = genlmsg_reply(reply, info);
2276out:
2277        mutex_unlock(&nbd_index_mutex);
2278        return ret;
2279}
2280
2281static void nbd_connect_reply(struct genl_info *info, int index)
2282{
2283        struct sk_buff *skb;
2284        void *msg_head;
2285        int ret;
2286
2287        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2288        if (!skb)
2289                return;
2290        msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2291                                     NBD_CMD_CONNECT);
2292        if (!msg_head) {
2293                nlmsg_free(skb);
2294                return;
2295        }
2296        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2297        if (ret) {
2298                nlmsg_free(skb);
2299                return;
2300        }
2301        genlmsg_end(skb, msg_head);
2302        genlmsg_reply(skb, info);
2303}
2304
2305static void nbd_mcast_index(int index)
2306{
2307        struct sk_buff *skb;
2308        void *msg_head;
2309        int ret;
2310
2311        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2312        if (!skb)
2313                return;
2314        msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2315                                     NBD_CMD_LINK_DEAD);
2316        if (!msg_head) {
2317                nlmsg_free(skb);
2318                return;
2319        }
2320        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2321        if (ret) {
2322                nlmsg_free(skb);
2323                return;
2324        }
2325        genlmsg_end(skb, msg_head);
2326        genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2327}
2328
2329static void nbd_dead_link_work(struct work_struct *work)
2330{
2331        struct link_dead_args *args = container_of(work, struct link_dead_args,
2332                                                   work);
2333        nbd_mcast_index(args->index);
2334        kfree(args);
2335}
2336
2337static int __init nbd_init(void)
2338{
2339        int i;
2340
2341        BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2342
2343        if (max_part < 0) {
2344                printk(KERN_ERR "nbd: max_part must be >= 0\n");
2345                return -EINVAL;
2346        }
2347
2348        part_shift = 0;
2349        if (max_part > 0) {
2350                part_shift = fls(max_part);
2351
2352                /*
2353                 * Adjust max_part according to part_shift as it is exported
2354                 * to user space so that the user can know the max number of
2355                 * partitions the kernel should be able to manage.
2356                 *
2357                 * Note that -1 is required because partition 0 is reserved
2358                 * for the whole disk.
2359                 */
2360                max_part = (1UL << part_shift) - 1;
2361        }
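        /*
         * Worked example (for illustration): with the default max_part=16,
         * fls(16) == 5, so part_shift becomes 5 and max_part is rewritten to
         * (1 << 5) - 1 = 31 usable partitions, with minor numbers spaced
         * 32 apart per device (partition 0 being the whole disk).
         */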
2362
2363        if ((1UL << part_shift) > DISK_MAX_PARTS)
2364                return -EINVAL;
2365
2366        if (nbds_max > 1UL << (MINORBITS - part_shift))
2367                return -EINVAL;
2368
2369        if (register_blkdev(NBD_MAJOR, "nbd"))
2370                return -EIO;
2371
2372        if (genl_register_family(&nbd_genl_family)) {
2373                unregister_blkdev(NBD_MAJOR, "nbd");
2374                return -EINVAL;
2375        }
2376        nbd_dbg_init();
2377
2378        mutex_lock(&nbd_index_mutex);
2379        for (i = 0; i < nbds_max; i++)
2380                nbd_dev_add(i);
2381        mutex_unlock(&nbd_index_mutex);
2382        return 0;
2383}
2384
2385static int nbd_exit_cb(int id, void *ptr, void *data)
2386{
2387        struct list_head *list = (struct list_head *)data;
2388        struct nbd_device *nbd = ptr;
2389
2390        list_add_tail(&nbd->list, list);
2391        return 0;
2392}
2393
2394static void __exit nbd_cleanup(void)
2395{
2396        struct nbd_device *nbd;
2397        LIST_HEAD(del_list);
2398
2399        nbd_dbg_close();
2400
2401        mutex_lock(&nbd_index_mutex);
2402        idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2403        mutex_unlock(&nbd_index_mutex);
2404
2405        while (!list_empty(&del_list)) {
2406                nbd = list_first_entry(&del_list, struct nbd_device, list);
2407                list_del_init(&nbd->list);
2408                if (refcount_read(&nbd->refs) != 1)
2409                        printk(KERN_ERR "nbd: possibly leaking a device\n");
2410                nbd_put(nbd);
2411        }
2412
2413        idr_destroy(&nbd_index_idr);
2414        genl_unregister_family(&nbd_genl_family);
2415        unregister_blkdev(NBD_MAJOR, "nbd");
2416}
2417
2418module_init(nbd_init);
2419module_exit(nbd_cleanup);
2420
2421MODULE_DESCRIPTION("Network Block Device");
2422MODULE_LICENSE("GPL");
2423
2424module_param(nbds_max, int, 0444);
2425MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2426module_param(max_part, int, 0444);
2427MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
2428