linux/drivers/block/nbd.c
   1/*
   2 * Network block device - make block devices work over TCP
   3 *
   4 * Note that you cannot swap over this device yet; it seems to work but
   5 * deadlocks sometimes - you cannot swap over TCP in general.
   6 * 
   7 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
   8 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
   9 *
  10 * This file is released under GPLv2 or later.
  11 *
  12 * (part of code stolen from loop.c)
  13 */
  14
  15#include <linux/major.h>
  16
  17#include <linux/blkdev.h>
  18#include <linux/module.h>
  19#include <linux/init.h>
  20#include <linux/sched.h>
  21#include <linux/fs.h>
  22#include <linux/bio.h>
  23#include <linux/stat.h>
  24#include <linux/errno.h>
  25#include <linux/file.h>
  26#include <linux/ioctl.h>
  27#include <linux/mutex.h>
  28#include <linux/compiler.h>
  29#include <linux/err.h>
  30#include <linux/kernel.h>
  31#include <linux/slab.h>
  32#include <net/sock.h>
  33#include <linux/net.h>
  34#include <linux/kthread.h>
  35#include <linux/types.h>
  36#include <linux/debugfs.h>
  37#include <linux/blk-mq.h>
  38
  39#include <linux/uaccess.h>
  40#include <asm/types.h>
  41
  42#include <linux/nbd.h>
  43
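     /*
      * One nbd_sock per server connection.  tx_lock serializes sending of a
      * request (header plus any payload) on that connection.
      */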
  44struct nbd_sock {
  45        struct socket *sock;
  46        struct mutex tx_lock;
  47};
  48
  49#define NBD_TIMEDOUT                    0
  50#define NBD_DISCONNECT_REQUESTED        1
  51#define NBD_DISCONNECTED                2
  52#define NBD_RUNNING                     3
  53
  54struct nbd_device {
  55        u32 flags;
  56        unsigned long runtime_flags;
  57        struct nbd_sock **socks;
  58        int magic;
  59
  60        struct blk_mq_tag_set tag_set;
  61
  62        struct mutex config_lock;
  63        struct gendisk *disk;
  64        int num_connections;
  65        atomic_t recv_threads;
  66        wait_queue_head_t recv_wq;
  67        loff_t blksize;
  68        loff_t bytesize;
  69
  70        struct task_struct *task_recv;
  71        struct task_struct *task_setup;
  72
  73#if IS_ENABLED(CONFIG_DEBUG_FS)
  74        struct dentry *dbg_dir;
  75#endif
  76};
  77
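     /*
      * Per-request driver data, allocated by blk-mq along with each request
      * (see tag_set.cmd_size).  send_complete lets the receive side wait for
      * the submitting side to finish walking the request's bios.
      */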
  78struct nbd_cmd {
  79        struct nbd_device *nbd;
  80        struct completion send_complete;
  81};
  82
  83#if IS_ENABLED(CONFIG_DEBUG_FS)
  84static struct dentry *nbd_dbg_dir;
  85#endif
  86
  87#define nbd_name(nbd) ((nbd)->disk->disk_name)
  88
  89#define NBD_MAGIC 0x68797548
  90
  91static unsigned int nbds_max = 16;
  92static struct nbd_device *nbd_dev;
  93static int max_part;
  94
  95static inline struct device *nbd_to_dev(struct nbd_device *nbd)
  96{
  97        return disk_to_dev(nbd->disk);
  98}
  99
 100static bool nbd_is_connected(struct nbd_device *nbd)
 101{
 102        return !!nbd->task_recv;
 103}
 104
 105static const char *nbdcmd_to_ascii(int cmd)
 106{
 107        switch (cmd) {
 108        case  NBD_CMD_READ: return "read";
 109        case NBD_CMD_WRITE: return "write";
 110        case  NBD_CMD_DISC: return "disconnect";
 111        case NBD_CMD_FLUSH: return "flush";
 112        case  NBD_CMD_TRIM: return "trim/discard";
 113        }
 114        return "invalid";
 115}
 116
 117static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
 118{
 119        bdev->bd_inode->i_size = 0;
 120        set_capacity(nbd->disk, 0);
 121        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 122
 123        return 0;
 124}
 125
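     /*
      * Propagate nbd->bytesize to the backing inode and the gendisk capacity
      * (set_capacity() takes 512-byte sectors, hence the >> 9), then emit a
      * uevent so user space notices the resize.
      */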
 126static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
 127{
 128        if (!nbd_is_connected(nbd))
 129                return;
 130
 131        bdev->bd_inode->i_size = nbd->bytesize;
 132        set_capacity(nbd->disk, nbd->bytesize >> 9);
 133        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 134}
 135
 136static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
 137                        loff_t blocksize, loff_t nr_blocks)
 138{
 139        int ret;
 140
 141        ret = set_blocksize(bdev, blocksize);
 142        if (ret)
 143                return ret;
 144
 145        nbd->blksize = blocksize;
 146        nbd->bytesize = blocksize * nr_blocks;
 147
 148        nbd_size_update(nbd, bdev);
 149
 150        return 0;
 151}
 152
 153static void nbd_end_request(struct nbd_cmd *cmd)
 154{
 155        struct nbd_device *nbd = cmd->nbd;
 156        struct request *req = blk_mq_rq_from_pdu(cmd);
 157        int error = req->errors ? -EIO : 0;
 158
 159        dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
 160                error ? "failed" : "done");
 161
 162        blk_mq_complete_request(req, error);
 163}
 164
 165/*
  166 * Forcibly shut down the sockets, causing all listeners to error out.
 167 */
 168static void sock_shutdown(struct nbd_device *nbd)
 169{
 170        int i;
 171
 172        if (nbd->num_connections == 0)
 173                return;
 174        if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
 175                return;
 176
 177        for (i = 0; i < nbd->num_connections; i++) {
 178                struct nbd_sock *nsock = nbd->socks[i];
 179                mutex_lock(&nsock->tx_lock);
 180                kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
 181                mutex_unlock(&nsock->tx_lock);
 182        }
 183        dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
 184}
 185
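     /*
      * blk-mq timeout handler: mark the request as failed and shut the sockets
      * down so the receive threads exit.  The shutdown is skipped for
      * driver-private requests (the disconnect path) to avoid deadlocking on
      * config_lock, as described below.
      */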
 186static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 187                                                 bool reserved)
 188{
 189        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 190        struct nbd_device *nbd = cmd->nbd;
 191
 192        dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
 193        set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
 194        req->errors++;
 195
 196        /*
 197         * If our disconnect packet times out then we're already holding the
 198         * config_lock and could deadlock here, so just set an error and return,
 199         * we'll handle shutting everything down later.
 200         */
 201        if (req->cmd_type == REQ_TYPE_DRV_PRIV)
 202                return BLK_EH_HANDLED;
 203        mutex_lock(&nbd->config_lock);
 204        sock_shutdown(nbd);
 205        mutex_unlock(&nbd->config_lock);
 206        return BLK_EH_HANDLED;
 207}
 208
 209/*
  210 * Send or receive a packet on the given connection.
 211 */
 212static int sock_xmit(struct nbd_device *nbd, int index, int send, void *buf,
 213                     int size, int msg_flags)
 214{
 215        struct socket *sock = nbd->socks[index]->sock;
 216        int result;
 217        struct msghdr msg;
 218        struct kvec iov;
 219        unsigned long pflags = current->flags;
 220
 221        if (unlikely(!sock)) {
 222                dev_err_ratelimited(disk_to_dev(nbd->disk),
 223                        "Attempted %s on closed socket in sock_xmit\n",
 224                        (send ? "send" : "recv"));
 225                return -EINVAL;
 226        }
 227
 228        current->flags |= PF_MEMALLOC;
 229        do {
 230                sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 231                iov.iov_base = buf;
 232                iov.iov_len = size;
 233                msg.msg_name = NULL;
 234                msg.msg_namelen = 0;
 235                msg.msg_control = NULL;
 236                msg.msg_controllen = 0;
 237                msg.msg_flags = msg_flags | MSG_NOSIGNAL;
 238
 239                if (send)
 240                        result = kernel_sendmsg(sock, &msg, &iov, 1, size);
 241                else
 242                        result = kernel_recvmsg(sock, &msg, &iov, 1, size,
 243                                                msg.msg_flags);
 244
 245                if (result <= 0) {
 246                        if (result == 0)
 247                                result = -EPIPE; /* short read */
 248                        break;
 249                }
 250                size -= result;
 251                buf += result;
 252        } while (size > 0);
 253
 254        tsk_restore_flags(current, pflags, PF_MEMALLOC);
 255
 256        return result;
 257}
 258
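     /*
      * Send the payload of a single bio_vec; the page is kmap()ed so this
      * also works for highmem pages.
      */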
 259static inline int sock_send_bvec(struct nbd_device *nbd, int index,
 260                                 struct bio_vec *bvec, int flags)
 261{
 262        int result;
 263        void *kaddr = kmap(bvec->bv_page);
 264        result = sock_xmit(nbd, index, 1, kaddr + bvec->bv_offset,
 265                           bvec->bv_len, flags);
 266        kunmap(bvec->bv_page);
 267        return result;
 268}
 269
 270/* always call with the tx_lock held */
 271static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 272{
 273        struct request *req = blk_mq_rq_from_pdu(cmd);
 274        int result;
 275        struct nbd_request request;
 276        unsigned long size = blk_rq_bytes(req);
 277        struct bio *bio;
 278        u32 type;
 279        u32 tag = blk_mq_unique_tag(req);
 280
 281        if (req_op(req) == REQ_OP_DISCARD)
 282                type = NBD_CMD_TRIM;
 283        else if (req_op(req) == REQ_OP_FLUSH)
 284                type = NBD_CMD_FLUSH;
 285        else if (rq_data_dir(req) == WRITE)
 286                type = NBD_CMD_WRITE;
 287        else
 288                type = NBD_CMD_READ;
 289
 290        memset(&request, 0, sizeof(request));
 291        request.magic = htonl(NBD_REQUEST_MAGIC);
 292        request.type = htonl(type);
 293        if (type != NBD_CMD_FLUSH) {
 294                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 295                request.len = htonl(size);
 296        }
 297        memcpy(request.handle, &tag, sizeof(tag));
 298
 299        dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
 300                cmd, nbdcmd_to_ascii(type),
 301                (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 302        result = sock_xmit(nbd, index, 1, &request, sizeof(request),
 303                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0);
 304        if (result <= 0) {
 305                dev_err_ratelimited(disk_to_dev(nbd->disk),
 306                        "Send control failed (result %d)\n", result);
 307                return -EIO;
 308        }
 309
 310        if (type != NBD_CMD_WRITE)
 311                return 0;
 312
 313        bio = req->bio;
 314        while (bio) {
 315                struct bio *next = bio->bi_next;
 316                struct bvec_iter iter;
 317                struct bio_vec bvec;
 318
 319                bio_for_each_segment(bvec, bio, iter) {
 320                        bool is_last = !next && bio_iter_last(bvec, iter);
 321                        int flags = is_last ? 0 : MSG_MORE;
 322
 323                        dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 324                                cmd, bvec.bv_len);
 325                        result = sock_send_bvec(nbd, index, &bvec, flags);
 326                        if (result <= 0) {
 327                                dev_err(disk_to_dev(nbd->disk),
 328                                        "Send data failed (result %d)\n",
 329                                        result);
 330                                return -EIO;
 331                        }
 332                        /*
 333                         * The completion might already have come in,
 334                         * so break for the last one instead of letting
 335                         * the iterator do it. This prevents use-after-free
 336                         * of the bio.
 337                         */
 338                        if (is_last)
 339                                break;
 340                }
 341                bio = next;
 342        }
 343        return 0;
 344}
 345
 346static inline int sock_recv_bvec(struct nbd_device *nbd, int index,
 347                                 struct bio_vec *bvec)
 348{
 349        int result;
 350        void *kaddr = kmap(bvec->bv_page);
 351        result = sock_xmit(nbd, index, 0, kaddr + bvec->bv_offset,
 352                           bvec->bv_len, MSG_WAITALL);
 353        kunmap(bvec->bv_page);
 354        return result;
 355}
 356
  357/* An ERR_PTR return means something went wrong; the caller shuts the sockets down. */
 358static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 359{
 360        int result;
 361        struct nbd_reply reply;
 362        struct nbd_cmd *cmd;
 363        struct request *req = NULL;
 364        u16 hwq;
 365        u32 tag;
 366
 367        reply.magic = 0;
 368        result = sock_xmit(nbd, index, 0, &reply, sizeof(reply), MSG_WAITALL);
 369        if (result <= 0) {
 370                if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
 371                    !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
 372                        dev_err(disk_to_dev(nbd->disk),
 373                                "Receive control failed (result %d)\n", result);
 374                return ERR_PTR(result);
 375        }
 376
 377        if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
 378                dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
 379                                (unsigned long)ntohl(reply.magic));
 380                return ERR_PTR(-EPROTO);
 381        }
 382
 383        memcpy(&tag, reply.handle, sizeof(u32));
 384
 385        hwq = blk_mq_unique_tag_to_hwq(tag);
 386        if (hwq < nbd->tag_set.nr_hw_queues)
 387                req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
 388                                       blk_mq_unique_tag_to_tag(tag));
 389        if (!req || !blk_mq_request_started(req)) {
 390                dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
 391                        tag, req);
 392                return ERR_PTR(-ENOENT);
 393        }
 394        cmd = blk_mq_rq_to_pdu(req);
 395        if (ntohl(reply.error)) {
 396                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 397                        ntohl(reply.error));
 398                req->errors++;
 399                return cmd;
 400        }
 401
 402        dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd);
 403        if (rq_data_dir(req) != WRITE) {
 404                struct req_iterator iter;
 405                struct bio_vec bvec;
 406
 407                rq_for_each_segment(bvec, req, iter) {
 408                        result = sock_recv_bvec(nbd, index, &bvec);
 409                        if (result <= 0) {
 410                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 411                                        result);
 412                                req->errors++;
 413                                return cmd;
 414                        }
 415                        dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
 416                                cmd, bvec.bv_len);
 417                }
 418        } else {
 419                /* See the comment in nbd_queue_rq. */
 420                wait_for_completion(&cmd->send_complete);
 421        }
 422        return cmd;
 423}
 424
 425static ssize_t pid_show(struct device *dev,
 426                        struct device_attribute *attr, char *buf)
 427{
 428        struct gendisk *disk = dev_to_disk(dev);
 429        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 430
 431        return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
 432}
 433
 434static struct device_attribute pid_attr = {
 435        .attr = { .name = "pid", .mode = S_IRUGO},
 436        .show = pid_show,
 437};
 438
 439struct recv_thread_args {
 440        struct work_struct work;
 441        struct nbd_device *nbd;
 442        int index;
 443};
 444
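     /*
      * One receive worker per connection, queued on system_long_wq by
      * NBD_DO_IT.  It completes replies until the socket errors out or is
      * shut down, then wakes the NBD_DO_IT thread waiting on recv_wq.
      */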
 445static void recv_work(struct work_struct *work)
 446{
 447        struct recv_thread_args *args = container_of(work,
 448                                                     struct recv_thread_args,
 449                                                     work);
 450        struct nbd_device *nbd = args->nbd;
 451        struct nbd_cmd *cmd;
 452        int ret = 0;
 453
 454        BUG_ON(nbd->magic != NBD_MAGIC);
 455        while (1) {
 456                cmd = nbd_read_stat(nbd, args->index);
 457                if (IS_ERR(cmd)) {
 458                        ret = PTR_ERR(cmd);
 459                        break;
 460                }
 461
 462                nbd_end_request(cmd);
 463        }
 464
 465        /*
  466         * We got an error; shut everybody down unless it was the result of a
 467         * disconnect request.
 468         */
 469        if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
 470                sock_shutdown(nbd);
 471        atomic_dec(&nbd->recv_threads);
 472        wake_up(&nbd->recv_wq);
 473}
 474
 475static void nbd_clear_req(struct request *req, void *data, bool reserved)
 476{
 477        struct nbd_cmd *cmd;
 478
 479        if (!blk_mq_request_started(req))
 480                return;
 481        cmd = blk_mq_rq_to_pdu(req);
 482        req->errors++;
 483        nbd_end_request(cmd);
 484}
 485
 486static void nbd_clear_que(struct nbd_device *nbd)
 487{
 488        BUG_ON(nbd->magic != NBD_MAGIC);
 489
 490        blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 491        dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 492}
 493
 494
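     /*
      * Validate and submit one request on the chosen connection.  Any failure
      * here ends the request with -EIO via nbd_end_request().
      */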
 495static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 496{
 497        struct request *req = blk_mq_rq_from_pdu(cmd);
 498        struct nbd_device *nbd = cmd->nbd;
 499        struct nbd_sock *nsock;
 500
 501        if (index >= nbd->num_connections) {
 502                dev_err_ratelimited(disk_to_dev(nbd->disk),
 503                                    "Attempted send on invalid socket\n");
 504                goto error_out;
 505        }
 506
 507        if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
 508                dev_err_ratelimited(disk_to_dev(nbd->disk),
 509                                    "Attempted send on closed socket\n");
 510                goto error_out;
 511        }
 512
 513        if (req->cmd_type != REQ_TYPE_FS &&
 514            req->cmd_type != REQ_TYPE_DRV_PRIV)
 515                goto error_out;
 516
 517        if (req->cmd_type == REQ_TYPE_FS &&
 518            rq_data_dir(req) == WRITE &&
 519            (nbd->flags & NBD_FLAG_READ_ONLY)) {
 520                dev_err_ratelimited(disk_to_dev(nbd->disk),
 521                                    "Write on read-only\n");
 522                goto error_out;
 523        }
 524
 525        req->errors = 0;
 526
 527        nsock = nbd->socks[index];
 528        mutex_lock(&nsock->tx_lock);
 529        if (unlikely(!nsock->sock)) {
 530                mutex_unlock(&nsock->tx_lock);
 531                dev_err_ratelimited(disk_to_dev(nbd->disk),
 532                                    "Attempted send on closed socket\n");
 533                goto error_out;
 534        }
 535
 536        if (nbd_send_cmd(nbd, cmd, index) != 0) {
 537                dev_err_ratelimited(disk_to_dev(nbd->disk),
 538                                    "Request send failed\n");
 539                req->errors++;
 540                nbd_end_request(cmd);
 541        }
 542
 543        mutex_unlock(&nsock->tx_lock);
 544
 545        return;
 546
 547error_out:
 548        req->errors++;
 549        nbd_end_request(cmd);
 550}
 551
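     /*
      * .queue_rq for the blk-mq tag set.  The hardware queue number selects
      * the connection, and BLK_MQ_F_BLOCKING lets us sleep on tx_lock and the
      * socket from this context.
      */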
 552static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 553                        const struct blk_mq_queue_data *bd)
 554{
 555        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 556
 557        /*
  558         * Since we look at the bios to send the request over the network, we
 559         * need to make sure the completion work doesn't mark this request done
 560         * before we are done doing our send.  This keeps us from dereferencing
  561         * freed data if we have particularly fast completions (i.e. we get the
 562         * completion before we exit sock_xmit on the last bvec) or in the case
 563         * that the server is misbehaving (or there was an error) before we're
 564         * done sending everything over the wire.
 565         */
 566        init_completion(&cmd->send_complete);
 567        blk_mq_start_request(bd->rq);
 568        nbd_handle_cmd(cmd, hctx->queue_num);
 569        complete(&cmd->send_complete);
 570
 571        return BLK_MQ_RQ_QUEUE_OK;
 572}
 573
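     /*
      * Add one connection (NBD_SET_SOCK).  All sockets for a device must be
      * added by the same task (task_setup) before NBD_DO_IT is run.
      */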
 574static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
 575{
 576        struct nbd_sock **socks;
 577        struct nbd_sock *nsock;
 578
 579        if (!nbd->task_setup)
 580                nbd->task_setup = current;
 581        if (nbd->task_setup != current) {
 582                dev_err(disk_to_dev(nbd->disk),
 583                        "Device being setup by another task");
 584                return -EINVAL;
 585        }
 586
 587        socks = krealloc(nbd->socks, (nbd->num_connections + 1) *
 588                         sizeof(struct nbd_sock *), GFP_KERNEL);
 589        if (!socks)
 590                return -ENOMEM;
 591        nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
 592        if (!nsock)
 593                return -ENOMEM;
 594
 595        nbd->socks = socks;
 596
 597        mutex_init(&nsock->tx_lock);
 598        nsock->sock = sock;
 599        socks[nbd->num_connections++] = nsock;
 600
 601        return 0;
 602}
 603
 604/* Reset all properties of an NBD device */
 605static void nbd_reset(struct nbd_device *nbd)
 606{
 607        int i;
 608
 609        for (i = 0; i < nbd->num_connections; i++)
 610                kfree(nbd->socks[i]);
 611        kfree(nbd->socks);
 612        nbd->socks = NULL;
 613        nbd->runtime_flags = 0;
 614        nbd->blksize = 1024;
 615        nbd->bytesize = 0;
 616        set_capacity(nbd->disk, 0);
 617        nbd->flags = 0;
 618        nbd->tag_set.timeout = 0;
 619        nbd->num_connections = 0;
 620        nbd->task_setup = NULL;
 621        queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 622}
 623
 624static void nbd_bdev_reset(struct block_device *bdev)
 625{
 626        set_device_ro(bdev, false);
 627        bdev->bd_inode->i_size = 0;
 628        if (max_part > 0) {
 629                blkdev_reread_part(bdev);
 630                bdev->bd_invalidated = 1;
 631        }
 632}
 633
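     /*
      * Apply the server-advertised NBD_FLAG_* bits to the block device:
      * read-only, discard/trim support and write-cache flushing.
      */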
 634static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
 635{
 636        if (nbd->flags & NBD_FLAG_READ_ONLY)
 637                set_device_ro(bdev, true);
 638        if (nbd->flags & NBD_FLAG_SEND_TRIM)
 639                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 640        if (nbd->flags & NBD_FLAG_SEND_FLUSH)
 641                blk_queue_write_cache(nbd->disk->queue, true, false);
 642        else
 643                blk_queue_write_cache(nbd->disk->queue, false, false);
 644}
 645
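     /* Send an NBD_CMD_DISC request on every connection. */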
 646static void send_disconnects(struct nbd_device *nbd)
 647{
 648        struct nbd_request request = {};
 649        int i, ret;
 650
 651        request.magic = htonl(NBD_REQUEST_MAGIC);
 652        request.type = htonl(NBD_CMD_DISC);
 653
 654        for (i = 0; i < nbd->num_connections; i++) {
 655                ret = sock_xmit(nbd, i, 1, &request, sizeof(request), 0);
 656                if (ret <= 0)
 657                        dev_err(disk_to_dev(nbd->disk),
 658                                "Send disconnect failed %d\n", ret);
 659        }
 660}
 661
 662static int nbd_dev_dbg_init(struct nbd_device *nbd);
 663static void nbd_dev_dbg_close(struct nbd_device *nbd);
 664
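     /*
      * The ioctls below are what user space (e.g. nbd-client) drives.  A
      * typical setup is roughly: connect a socket to the server, hand it over
      * with NBD_SET_SOCK (repeatable for multiple connections), set the
      * size/flags/timeout, then invoke NBD_DO_IT, which blocks until the
      * device is disconnected or its sockets die.
      */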
 665/* Must be called with config_lock held */
 666static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 667                       unsigned int cmd, unsigned long arg)
 668{
 669        switch (cmd) {
 670        case NBD_DISCONNECT: {
 671                dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
 672                if (!nbd->socks)
 673                        return -EINVAL;
 674
 675                mutex_unlock(&nbd->config_lock);
 676                fsync_bdev(bdev);
 677                mutex_lock(&nbd->config_lock);
 678
 679                /* Check again after getting mutex back.  */
 680                if (!nbd->socks)
 681                        return -EINVAL;
 682
 683                if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
 684                                      &nbd->runtime_flags))
 685                        send_disconnects(nbd);
 686                return 0;
 687        }
 688
 689        case NBD_CLEAR_SOCK:
 690                sock_shutdown(nbd);
 691                nbd_clear_que(nbd);
 692                kill_bdev(bdev);
 693                nbd_bdev_reset(bdev);
 694                /*
 695                 * We want to give the run thread a chance to wait for everybody
  696                 * to clean up and then do its own cleanup.
 697                 */
 698                if (!test_bit(NBD_RUNNING, &nbd->runtime_flags)) {
 699                        int i;
 700
 701                        for (i = 0; i < nbd->num_connections; i++)
 702                                kfree(nbd->socks[i]);
 703                        kfree(nbd->socks);
 704                        nbd->socks = NULL;
 705                        nbd->num_connections = 0;
 706                        nbd->task_setup = NULL;
 707                }
 708                return 0;
 709
 710        case NBD_SET_SOCK: {
 711                int err;
 712                struct socket *sock = sockfd_lookup(arg, &err);
 713
 714                if (!sock)
 715                        return err;
 716
 717                err = nbd_add_socket(nbd, sock);
 718                if (!err && max_part)
 719                        bdev->bd_invalidated = 1;
 720
 721                return err;
 722        }
 723
 724        case NBD_SET_BLKSIZE: {
 725                loff_t bsize = div_s64(nbd->bytesize, arg);
 726
 727                return nbd_size_set(nbd, bdev, arg, bsize);
 728        }
 729
 730        case NBD_SET_SIZE:
 731                return nbd_size_set(nbd, bdev, nbd->blksize,
 732                                        div_s64(arg, nbd->blksize));
 733
 734        case NBD_SET_SIZE_BLOCKS:
 735                return nbd_size_set(nbd, bdev, nbd->blksize, arg);
 736
 737        case NBD_SET_TIMEOUT:
 738                nbd->tag_set.timeout = arg * HZ;
 739                return 0;
 740
 741        case NBD_SET_FLAGS:
 742                nbd->flags = arg;
 743                return 0;
 744
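             /*
              * NBD_DO_IT: start one receive worker per connection and block
              * until they have all exited, then tear the device back down.
              */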
 745        case NBD_DO_IT: {
 746                struct recv_thread_args *args;
 747                int num_connections = nbd->num_connections;
 748                int error = 0, i;
 749
 750                if (nbd->task_recv)
 751                        return -EBUSY;
 752                if (!nbd->socks)
 753                        return -EINVAL;
 754                if (num_connections > 1 &&
 755                    !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
 756                        dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
 757                        error = -EINVAL;
 758                        goto out_err;
 759                }
 760
 761                set_bit(NBD_RUNNING, &nbd->runtime_flags);
 762                blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
 763                args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
 764                if (!args) {
 765                        error = -ENOMEM;
 766                        goto out_err;
 767                }
 768                nbd->task_recv = current;
 769                mutex_unlock(&nbd->config_lock);
 770
 771                nbd_parse_flags(nbd, bdev);
 772
 773                error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
 774                if (error) {
 775                        dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
 776                        goto out_recv;
 777                }
 778
 779                nbd_size_update(nbd, bdev);
 780
 781                nbd_dev_dbg_init(nbd);
 782                for (i = 0; i < num_connections; i++) {
 783                        sk_set_memalloc(nbd->socks[i]->sock->sk);
 784                        atomic_inc(&nbd->recv_threads);
 785                        INIT_WORK(&args[i].work, recv_work);
 786                        args[i].nbd = nbd;
 787                        args[i].index = i;
 788                        queue_work(system_long_wq, &args[i].work);
 789                }
 790                wait_event_interruptible(nbd->recv_wq,
 791                                         atomic_read(&nbd->recv_threads) == 0);
 792                for (i = 0; i < num_connections; i++)
 793                        flush_work(&args[i].work);
 794                nbd_dev_dbg_close(nbd);
 795                nbd_size_clear(nbd, bdev);
 796                device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
 797out_recv:
 798                mutex_lock(&nbd->config_lock);
 799                nbd->task_recv = NULL;
 800out_err:
 801                sock_shutdown(nbd);
 802                nbd_clear_que(nbd);
 803                kill_bdev(bdev);
 804                nbd_bdev_reset(bdev);
 805
 806                /* user requested, ignore socket errors */
 807                if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
 808                        error = 0;
 809                if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
 810                        error = -ETIMEDOUT;
 811
 812                nbd_reset(nbd);
 813                return error;
 814        }
 815
 816        case NBD_CLEAR_QUE:
 817                /*
 818                 * This is for compatibility only.  The queue is always cleared
 819                 * by NBD_DO_IT or NBD_CLEAR_SOCK.
 820                 */
 821                return 0;
 822
 823        case NBD_PRINT_DEBUG:
 824                /*
 825                 * For compatibility only, we no longer keep a list of
 826                 * outstanding requests.
 827                 */
 828                return 0;
 829        }
 830        return -ENOTTY;
 831}
 832
 833static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 834                     unsigned int cmd, unsigned long arg)
 835{
 836        struct nbd_device *nbd = bdev->bd_disk->private_data;
 837        int error;
 838
 839        if (!capable(CAP_SYS_ADMIN))
 840                return -EPERM;
 841
 842        BUG_ON(nbd->magic != NBD_MAGIC);
 843
 844        mutex_lock(&nbd->config_lock);
 845        error = __nbd_ioctl(bdev, nbd, cmd, arg);
 846        mutex_unlock(&nbd->config_lock);
 847
 848        return error;
 849}
 850
 851static const struct block_device_operations nbd_fops =
 852{
 853        .owner =        THIS_MODULE,
 854        .ioctl =        nbd_ioctl,
 855        .compat_ioctl = nbd_ioctl,
 856};
 857
 858#if IS_ENABLED(CONFIG_DEBUG_FS)
 859
 860static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
 861{
 862        struct nbd_device *nbd = s->private;
 863
 864        if (nbd->task_recv)
 865                seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
 866
 867        return 0;
 868}
 869
 870static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
 871{
 872        return single_open(file, nbd_dbg_tasks_show, inode->i_private);
 873}
 874
 875static const struct file_operations nbd_dbg_tasks_ops = {
 876        .open = nbd_dbg_tasks_open,
 877        .read = seq_read,
 878        .llseek = seq_lseek,
 879        .release = single_release,
 880};
 881
 882static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
 883{
 884        struct nbd_device *nbd = s->private;
 885        u32 flags = nbd->flags;
 886
 887        seq_printf(s, "Hex: 0x%08x\n\n", flags);
 888
 889        seq_puts(s, "Known flags:\n");
 890
 891        if (flags & NBD_FLAG_HAS_FLAGS)
 892                seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
 893        if (flags & NBD_FLAG_READ_ONLY)
 894                seq_puts(s, "NBD_FLAG_READ_ONLY\n");
 895        if (flags & NBD_FLAG_SEND_FLUSH)
 896                seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
 897        if (flags & NBD_FLAG_SEND_TRIM)
 898                seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
 899
 900        return 0;
 901}
 902
 903static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
 904{
 905        return single_open(file, nbd_dbg_flags_show, inode->i_private);
 906}
 907
 908static const struct file_operations nbd_dbg_flags_ops = {
 909        .open = nbd_dbg_flags_open,
 910        .read = seq_read,
 911        .llseek = seq_lseek,
 912        .release = single_release,
 913};
 914
 915static int nbd_dev_dbg_init(struct nbd_device *nbd)
 916{
 917        struct dentry *dir;
 918
 919        if (!nbd_dbg_dir)
 920                return -EIO;
 921
 922        dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
 923        if (!dir) {
 924                dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
 925                        nbd_name(nbd));
 926                return -EIO;
 927        }
 928        nbd->dbg_dir = dir;
 929
 930        debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
 931        debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
 932        debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
 933        debugfs_create_u64("blocksize", 0444, dir, &nbd->blksize);
 934        debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
 935
 936        return 0;
 937}
 938
 939static void nbd_dev_dbg_close(struct nbd_device *nbd)
 940{
 941        debugfs_remove_recursive(nbd->dbg_dir);
 942}
 943
 944static int nbd_dbg_init(void)
 945{
 946        struct dentry *dbg_dir;
 947
 948        dbg_dir = debugfs_create_dir("nbd", NULL);
 949        if (!dbg_dir)
 950                return -EIO;
 951
 952        nbd_dbg_dir = dbg_dir;
 953
 954        return 0;
 955}
 956
 957static void nbd_dbg_close(void)
 958{
 959        debugfs_remove_recursive(nbd_dbg_dir);
 960}
 961
 962#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
 963
 964static int nbd_dev_dbg_init(struct nbd_device *nbd)
 965{
 966        return 0;
 967}
 968
 969static void nbd_dev_dbg_close(struct nbd_device *nbd)
 970{
 971}
 972
 973static int nbd_dbg_init(void)
 974{
 975        return 0;
 976}
 977
 978static void nbd_dbg_close(void)
 979{
 980}
 981
 982#endif
 983
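     /* Called once per pre-allocated request: stash the owning device in the PDU. */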
 984static int nbd_init_request(void *data, struct request *rq,
 985                            unsigned int hctx_idx, unsigned int request_idx,
 986                            unsigned int numa_node)
 987{
 988        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
 989        cmd->nbd = data;
 990        return 0;
 991}
 992
 993static struct blk_mq_ops nbd_mq_ops = {
 994        .queue_rq       = nbd_queue_rq,
 995        .init_request   = nbd_init_request,
 996        .timeout        = nbd_xmit_timeout,
 997};
 998
 999/*
1000 * And here should be modules and kernel interface 
1001 *  (Just smiley confuses emacs :-)
1002 */
1003
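     /*
      * Each device gets (1 << part_shift) minors: minor 0 for the whole disk
      * plus max_part = (1 << part_shift) - 1 partitions.  As an illustrative
      * (non-default) example, max_part=15 yields part_shift=4, i.e. 16 minors
      * per device and at most 2^(MINORBITS - 4) devices.
      */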
1004static int __init nbd_init(void)
1005{
1006        int err = -ENOMEM;
1007        int i;
1008        int part_shift;
1009
1010        BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
1011
1012        if (max_part < 0) {
1013                printk(KERN_ERR "nbd: max_part must be >= 0\n");
1014                return -EINVAL;
1015        }
1016
1017        part_shift = 0;
1018        if (max_part > 0) {
1019                part_shift = fls(max_part);
1020
1021                /*
 1022                 * Adjust max_part according to part_shift as it is exported
 1023                 * to user space, so that users can know the maximum number
 1024                 * of partitions the kernel is able to manage.
1025                 *
1026                 * Note that -1 is required because partition 0 is reserved
1027                 * for the whole disk.
1028                 */
1029                max_part = (1UL << part_shift) - 1;
1030        }
1031
1032        if ((1UL << part_shift) > DISK_MAX_PARTS)
1033                return -EINVAL;
1034
1035        if (nbds_max > 1UL << (MINORBITS - part_shift))
1036                return -EINVAL;
1037
1038        nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
1039        if (!nbd_dev)
1040                return -ENOMEM;
1041
1042        for (i = 0; i < nbds_max; i++) {
1043                struct request_queue *q;
1044                struct gendisk *disk = alloc_disk(1 << part_shift);
1045                if (!disk)
1046                        goto out;
1047                nbd_dev[i].disk = disk;
1048
1049                nbd_dev[i].tag_set.ops = &nbd_mq_ops;
1050                nbd_dev[i].tag_set.nr_hw_queues = 1;
1051                nbd_dev[i].tag_set.queue_depth = 128;
1052                nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE;
1053                nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd);
1054                nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1055                        BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
1056                nbd_dev[i].tag_set.driver_data = &nbd_dev[i];
1057
1058                err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set);
1059                if (err) {
1060                        put_disk(disk);
1061                        goto out;
1062                }
1063
1064                /*
 1065                 * The block layer requires every gendisk to have its own
 1066                 * request_queue struct.  These structs are big, so we
 1067                 * allocate them dynamically.
1068                 */
1069                q = blk_mq_init_queue(&nbd_dev[i].tag_set);
1070                if (IS_ERR(q)) {
1071                        blk_mq_free_tag_set(&nbd_dev[i].tag_set);
1072                        put_disk(disk);
1073                        goto out;
1074                }
1075                disk->queue = q;
1076
1077                /*
1078                 * Tell the block layer that we are not a rotational device
1079                 */
1080                queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
1081                queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1082                disk->queue->limits.discard_granularity = 512;
1083                blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
1084                disk->queue->limits.discard_zeroes_data = 0;
1085                blk_queue_max_hw_sectors(disk->queue, 65536);
1086                disk->queue->limits.max_sectors = 256;
1087        }
1088
1089        if (register_blkdev(NBD_MAJOR, "nbd")) {
1090                err = -EIO;
1091                goto out;
1092        }
1093
1094        printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);
1095
1096        nbd_dbg_init();
1097
1098        for (i = 0; i < nbds_max; i++) {
1099                struct gendisk *disk = nbd_dev[i].disk;
1100                nbd_dev[i].magic = NBD_MAGIC;
1101                mutex_init(&nbd_dev[i].config_lock);
1102                disk->major = NBD_MAJOR;
1103                disk->first_minor = i << part_shift;
1104                disk->fops = &nbd_fops;
1105                disk->private_data = &nbd_dev[i];
1106                sprintf(disk->disk_name, "nbd%d", i);
1107                init_waitqueue_head(&nbd_dev[i].recv_wq);
1108                nbd_reset(&nbd_dev[i]);
1109                add_disk(disk);
1110        }
1111
1112        return 0;
1113out:
1114        while (i--) {
1115                blk_mq_free_tag_set(&nbd_dev[i].tag_set);
1116                blk_cleanup_queue(nbd_dev[i].disk->queue);
1117                put_disk(nbd_dev[i].disk);
1118        }
1119        kfree(nbd_dev);
1120        return err;
1121}
1122
1123static void __exit nbd_cleanup(void)
1124{
1125        int i;
1126
1127        nbd_dbg_close();
1128
1129        for (i = 0; i < nbds_max; i++) {
1130                struct gendisk *disk = nbd_dev[i].disk;
1131                nbd_dev[i].magic = 0;
1132                if (disk) {
1133                        del_gendisk(disk);
1134                        blk_cleanup_queue(disk->queue);
1135                        blk_mq_free_tag_set(&nbd_dev[i].tag_set);
1136                        put_disk(disk);
1137                }
1138        }
1139        unregister_blkdev(NBD_MAJOR, "nbd");
1140        kfree(nbd_dev);
1141        printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
1142}
1143
1144module_init(nbd_init);
1145module_exit(nbd_cleanup);
1146
1147MODULE_DESCRIPTION("Network Block Device");
1148MODULE_LICENSE("GPL");
1149
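     /*
      * Both parameters are read-only in sysfs (0444) and only take effect
      * when the module is loaded.
      */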
1150module_param(nbds_max, int, 0444);
1151MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
1152module_param(max_part, int, 0444);
1153MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");
1154