linux/drivers/block/xen-blkfront.c
/*
 * blkfront.c
 *
 * XenLinux virtual block device driver.
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 * Copyright (c) 2004, Christian Limpach
 * Copyright (c) 2004, Andrew Warfield
 * Copyright (c) 2005, Christopher Clark
 * Copyright (c) 2005, XenSource Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/interrupt.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/scatterlist.h>

#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/page.h>

#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>

#include <asm/xen/hypervisor.h>

enum blkif_state {
        BLKIF_STATE_DISCONNECTED,
        BLKIF_STATE_CONNECTED,
        BLKIF_STATE_SUSPENDED,
};

struct blk_shadow {
        struct blkif_request req;
        unsigned long request;
        unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};

static struct block_device_operations xlvbd_block_fops;

#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)

/*
 * We have one of these per vbd, whether ide, scsi or 'other'.  They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct blkfront_info
{
        struct xenbus_device *xbdev;
        struct gendisk *gd;
        int vdevice;
        blkif_vdev_t handle;
        enum blkif_state connected;
        int ring_ref;
        struct blkif_front_ring ring;
        struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int evtchn, irq;
        struct request_queue *rq;
        struct work_struct work;
        struct gnttab_free_callback callback;
        struct blk_shadow shadow[BLK_RING_SIZE];
        unsigned long shadow_free;
        int feature_barrier;
        int is_ready;

        /**
         * The number of people holding this device open.  We won't allow a
         * hot-unplug unless this is 0.
         */
        int users;
};

static DEFINE_SPINLOCK(blkif_io_lock);

#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
        (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
#define GRANT_INVALID_REF       0

#define PARTS_PER_DISK          16

#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

#define DEV_NAME        "xvd"   /* name in /dev */

/* Information about our VBDs. */
#define MAX_VBDS 64
static LIST_HEAD(vbds_list);

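/*
 * Shadow-ring free list: unused blk_shadow entries are chained through
 * their req.id fields.  get_id_from_freelist() pops an entry for a new
 * request; add_id_to_freelist() returns it when the request completes.
 */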
static int get_id_from_freelist(struct blkfront_info *info)
{
        unsigned long free = info->shadow_free;
        BUG_ON(free > BLK_RING_SIZE);
        info->shadow_free = info->shadow[free].req.id;
        info->shadow[free].req.id = 0x0fffffee; /* debug */
        return free;
}

static void add_id_to_freelist(struct blkfront_info *info,
                               unsigned long id)
{
        info->shadow[id].req.id  = info->shadow_free;
        info->shadow[id].request = 0;
        info->shadow_free = id;
}

static void blkif_restart_queue_callback(void *arg)
{
        struct blkfront_info *info = (struct blkfront_info *)arg;
        schedule_work(&info->work);
}

static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
        /* We don't have real geometry info, but let's at least return
           values consistent with the size of the device */
        sector_t nsect = get_capacity(bd->bd_disk);
        sector_t cylinders = nsect;

        hg->heads = 0xff;
        hg->sectors = 0x3f;
        sector_div(cylinders, hg->heads * hg->sectors);
        hg->cylinders = cylinders;
        if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
                hg->cylinders = 0xffff;
        return 0;
}

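/*
 * Minimal ioctl support: fake an empty CDROMMULTISESSION reply and report
 * CD-ROM capability for CD-flagged disks; everything else gets -EINVAL,
 * matching native Linux behaviour.
 */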
static int blkif_ioctl(struct inode *inode, struct file *filep,
                       unsigned command, unsigned long argument)
{
        struct blkfront_info *info =
                inode->i_bdev->bd_disk->private_data;
        int i;

        dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
                command, (long)argument);

        switch (command) {
        case CDROMMULTISESSION:
                dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
                for (i = 0; i < sizeof(struct cdrom_multisession); i++)
                        if (put_user(0, (char __user *)(argument + i)))
                                return -EFAULT;
                return 0;

        case CDROM_GET_CAPABILITY: {
                struct gendisk *gd = info->gd;
                if (gd->flags & GENHD_FL_CD)
                        return 0;
                return -EINVAL;
        }

        default:
                /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
                  command);*/
                return -EINVAL; /* same return as native Linux */
        }

        return 0;
}

/*
 * blkif_queue_request
 *
 * request block io
 *
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into. this should be a
 *   virtual address in the guest os.
 */
static int blkif_queue_request(struct request *req)
{
        struct blkfront_info *info = req->rq_disk->private_data;
        unsigned long buffer_mfn;
        struct blkif_request *ring_req;
        unsigned long id;
        unsigned int fsect, lsect;
        int i, ref;
        grant_ref_t gref_head;
        struct scatterlist *sg;

        if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
                return 1;

        if (gnttab_alloc_grant_references(
                BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
                gnttab_request_free_callback(
                        &info->callback,
                        blkif_restart_queue_callback,
                        info,
                        BLKIF_MAX_SEGMENTS_PER_REQUEST);
                return 1;
        }

        /* Fill out a communications ring structure. */
        ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
        id = get_id_from_freelist(info);
        info->shadow[id].request = (unsigned long)req;

        ring_req->id = id;
        ring_req->sector_number = (blkif_sector_t)req->sector;
        ring_req->handle = info->handle;

        ring_req->operation = rq_data_dir(req) ?
                BLKIF_OP_WRITE : BLKIF_OP_READ;
        if (blk_barrier_rq(req))
                ring_req->operation = BLKIF_OP_WRITE_BARRIER;

        ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
        BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);

        for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
                buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
                fsect = sg->offset >> 9;
                lsect = fsect + (sg->length >> 9) - 1;
                /* install a grant reference. */
                ref = gnttab_claim_grant_reference(&gref_head);
                BUG_ON(ref == -ENOSPC);

                gnttab_grant_foreign_access_ref(
                                ref,
                                info->xbdev->otherend_id,
                                buffer_mfn,
                                rq_data_dir(req) );

                info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
                ring_req->seg[i] =
                                (struct blkif_request_segment) {
                                        .gref       = ref,
                                        .first_sect = fsect,
                                        .last_sect  = lsect };
        }

        info->ring.req_prod_pvt++;

        /* Keep a private copy so we can reissue requests when recovering. */
        info->shadow[id].req = *ring_req;

        gnttab_free_grant_references(gref_head);

        return 0;
}


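/*
 * Push any requests queued on the ring to the backend and, if the backend
 * needs waking, notify it via the event channel.
 */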
static inline void flush_requests(struct blkfront_info *info)
{
        int notify;

        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);

        if (notify)
                notify_remote_via_irq(info->irq);
}

/*
 * do_blkif_request
 *  read a block; request is in a request queue
 */
static void do_blkif_request(struct request_queue *rq)
{
        struct blkfront_info *info = NULL;
        struct request *req;
        int queued;

        pr_debug("Entered do_blkif_request\n");

        queued = 0;

        while ((req = elv_next_request(rq)) != NULL) {
                info = req->rq_disk->private_data;
                if (!blk_fs_request(req)) {
                        end_request(req, 0);
                        continue;
                }

                if (RING_FULL(&info->ring))
                        goto wait;

                pr_debug("do_blk_req %p: cmd %p, sec %lx, "
                         "(%u/%li) buffer:%p [%s]\n",
                         req, req->cmd, (unsigned long)req->sector,
                         req->current_nr_sectors,
                         req->nr_sectors, req->buffer,
                         rq_data_dir(req) ? "write" : "read");


                blkdev_dequeue_request(req);
                if (blkif_queue_request(req)) {
                        blk_requeue_request(rq, req);
wait:
                        /* Avoid pointless unplugs. */
                        blk_stop_queue(rq);
                        break;
                }

                queued++;
        }

        if (queued != 0)
                flush_requests(info);
}

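/*
 * Set up the request queue for a vbd: noop elevator, queue limits sized so
 * a merged request always fits in a single blkif ring slot, and no bounce
 * buffering.
 */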
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
{
        struct request_queue *rq;

        rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
        if (rq == NULL)
                return -1;

        elevator_init(rq, "noop");

        /* Hard sector size and max sectors impersonate the equiv. hardware. */
        blk_queue_hardsect_size(rq, sector_size);
        blk_queue_max_sectors(rq, 512);

        /* Each segment in a request is up to an aligned page in size. */
        blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
        blk_queue_max_segment_size(rq, PAGE_SIZE);

        /* Ensure a merged request will fit in a single I/O ring slot. */
        blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
        blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);

        /* Make sure buffer addresses are sector-aligned. */
        blk_queue_dma_alignment(rq, 511);

        /* Make sure we don't use bounce buffers. */
        blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);

        gd->queue = rq;

        return 0;
}


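/*
 * Enable or disable ordered (barrier) writes on the queue according to
 * info->feature_barrier.
 */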
static int xlvbd_barrier(struct blkfront_info *info)
{
        int err;

        err = blk_queue_ordered(info->rq,
                                info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
                                NULL);

        if (err)
                return err;

        printk(KERN_INFO "blkfront: %s: barriers %s\n",
               info->gd->disk_name,
               info->feature_barrier ? "enabled" : "disabled");
        return 0;
}


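/*
 * Allocate and initialise the gendisk ("xvd*") for this vbd, including its
 * request queue, capacity and read-only/removable/CD flags.
 */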
static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity,
                               int vdevice, u16 vdisk_info, u16 sector_size,
                               struct blkfront_info *info)
{
        struct gendisk *gd;
        int nr_minors = 1;
        int err = -ENODEV;

        BUG_ON(info->gd != NULL);
        BUG_ON(info->rq != NULL);

        if ((minor % PARTS_PER_DISK) == 0)
                nr_minors = PARTS_PER_DISK;

        gd = alloc_disk(nr_minors);
        if (gd == NULL)
                goto out;

        if (nr_minors > 1)
                sprintf(gd->disk_name, "%s%c", DEV_NAME,
                        'a' + minor / PARTS_PER_DISK);
        else
                sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
                        'a' + minor / PARTS_PER_DISK,
                        minor % PARTS_PER_DISK);

        gd->major = XENVBD_MAJOR;
        gd->first_minor = minor;
        gd->fops = &xlvbd_block_fops;
        gd->private_data = info;
        gd->driverfs_dev = &(info->xbdev->dev);
        set_capacity(gd, capacity);

        if (xlvbd_init_blk_queue(gd, sector_size)) {
                del_gendisk(gd);
                goto out;
        }

        info->rq = gd->queue;
        info->gd = gd;

        if (info->feature_barrier)
                xlvbd_barrier(info);

        if (vdisk_info & VDISK_READONLY)
                set_disk_ro(gd, 1);

        if (vdisk_info & VDISK_REMOVABLE)
                gd->flags |= GENHD_FL_REMOVABLE;

        if (vdisk_info & VDISK_CDROM)
                gd->flags |= GENHD_FL_CD;

        return 0;

 out:
        return err;
}

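/*
 * Restart the request queue if there is room on the ring and issue any
 * requests already waiting.  Callers hold blkif_io_lock.
 */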
static void kick_pending_request_queues(struct blkfront_info *info)
{
        if (!RING_FULL(&info->ring)) {
                /* Re-enable calldowns. */
                blk_start_queue(info->rq);
                /* Kick things off immediately. */
                do_blkif_request(info->rq);
        }
}

static void blkif_restart_queue(struct work_struct *work)
{
        struct blkfront_info *info = container_of(work, struct blkfront_info, work);

        spin_lock_irq(&blkif_io_lock);
        if (info->connected == BLKIF_STATE_CONNECTED)
                kick_pending_request_queues(info);
        spin_unlock_irq(&blkif_io_lock);
}

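/*
 * Tear down the connection to the backend: stop the request queue, cancel
 * and flush pending gnttab callbacks, release the shared ring grant and
 * unbind the event-channel irq.  'suspend' selects whether we end up in the
 * SUSPENDED or DISCONNECTED state.
 */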
static void blkif_free(struct blkfront_info *info, int suspend)
{
        /* Prevent new requests being issued until we fix things up. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = suspend ?
                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
        /* No more blkif_request(). */
        if (info->rq)
                blk_stop_queue(info->rq);
        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
        spin_unlock_irq(&blkif_io_lock);

        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_scheduled_work();

        /* Free resources associated with old device channel. */
        if (info->ring_ref != GRANT_INVALID_REF) {
                gnttab_end_foreign_access(info->ring_ref, 0,
                                          (unsigned long)info->ring.sring);
                info->ring_ref = GRANT_INVALID_REF;
                info->ring.sring = NULL;
        }
        if (info->irq)
                unbind_from_irqhandler(info->irq, info);
        info->evtchn = info->irq = 0;

}

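/*
 * A request has completed: revoke the grant references covering its data
 * segments.
 */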
static void blkif_completion(struct blk_shadow *s)
{
        int i;
        for (i = 0; i < s->req.nr_segments; i++)
                gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
}

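/*
 * Event-channel interrupt handler: consume responses from the ring, complete
 * the corresponding requests, recycle their shadow entries and restart the
 * request queue.
 */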
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
{
        struct request *req;
        struct blkif_response *bret;
        RING_IDX i, rp;
        unsigned long flags;
        struct blkfront_info *info = (struct blkfront_info *)dev_id;
        int error;

        spin_lock_irqsave(&blkif_io_lock, flags);

        if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
                spin_unlock_irqrestore(&blkif_io_lock, flags);
                return IRQ_HANDLED;
        }

 again:
        rp = info->ring.sring->rsp_prod;
        rmb(); /* Ensure we see queued responses up to 'rp'. */

        for (i = info->ring.rsp_cons; i != rp; i++) {
                unsigned long id;
                int ret;

                bret = RING_GET_RESPONSE(&info->ring, i);
                id   = bret->id;
                req  = (struct request *)info->shadow[id].request;

                blkif_completion(&info->shadow[id]);

                add_id_to_freelist(info, id);

                error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
                switch (bret->operation) {
                case BLKIF_OP_WRITE_BARRIER:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
                                       info->gd->disk_name);
                                error = -EOPNOTSUPP;
                                info->feature_barrier = 0;
                                xlvbd_barrier(info);
                        }
                        /* fall through */
                case BLKIF_OP_READ:
                case BLKIF_OP_WRITE:
                        if (unlikely(bret->status != BLKIF_RSP_OKAY))
                                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
                                        "request: %x\n", bret->status);

                        ret = __blk_end_request(req, error, blk_rq_bytes(req));
                        BUG_ON(ret);
                        break;
                default:
                        BUG();
                }
        }

        info->ring.rsp_cons = i;

        if (i != info->ring.req_prod_pvt) {
                int more_to_do;
                RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
                if (more_to_do)
                        goto again;
        } else
                info->ring.sring->rsp_event = i + 1;

        kick_pending_request_queues(info);

        spin_unlock_irqrestore(&blkif_io_lock, flags);

        return IRQ_HANDLED;
}


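/*
 * Allocate the shared ring page, grant the backend access to it and bind an
 * irq to a freshly allocated event channel.
 */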
static int setup_blkring(struct xenbus_device *dev,
                         struct blkfront_info *info)
{
        struct blkif_sring *sring;
        int err;

        info->ring_ref = GRANT_INVALID_REF;

        sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
        if (!sring) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);

        sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);

        err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
        if (err < 0) {
                free_page((unsigned long)sring);
                info->ring.sring = NULL;
                goto fail;
        }
        info->ring_ref = err;

        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
                goto fail;

        err = bind_evtchn_to_irqhandler(info->evtchn,
                                        blkif_interrupt,
                                        IRQF_SAMPLE_RANDOM, "blkif", info);
        if (err <= 0) {
                xenbus_dev_fatal(dev, err,
                                 "bind_evtchn_to_irqhandler failed");
                goto fail;
        }
        info->irq = err;

        return 0;
fail:
        blkif_free(info, 0);
        return err;
}


/* Common code used when first setting up, and when resuming. */
static int talk_to_backend(struct xenbus_device *dev,
                           struct blkfront_info *info)
{
        const char *message = NULL;
        struct xenbus_transaction xbt;
        int err;

        /* Create shared ring, alloc event channel. */
        err = setup_blkring(dev, info);
        if (err)
                goto out;

again:
        err = xenbus_transaction_start(&xbt);
        if (err) {
                xenbus_dev_fatal(dev, err, "starting transaction");
                goto destroy_blkring;
        }

        err = xenbus_printf(xbt, dev->nodename,
                            "ring-ref", "%u", info->ring_ref);
        if (err) {
                message = "writing ring-ref";
                goto abort_transaction;
        }
        err = xenbus_printf(xbt, dev->nodename,
                            "event-channel", "%u", info->evtchn);
        if (err) {
                message = "writing event-channel";
                goto abort_transaction;
        }
        err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
                            XEN_IO_PROTO_ABI_NATIVE);
        if (err) {
                message = "writing protocol";
                goto abort_transaction;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err) {
                if (err == -EAGAIN)
                        goto again;
                xenbus_dev_fatal(dev, err, "completing transaction");
                goto destroy_blkring;
        }

        xenbus_switch_state(dev, XenbusStateInitialised);

        return 0;

 abort_transaction:
        xenbus_transaction_end(xbt, 1);
        if (message)
                xenbus_dev_fatal(dev, err, "%s", message);
 destroy_blkring:
        blkif_free(info, 0);
 out:
        return err;
}


/**
 * Entry point to this code when a new device is created.  Allocate the basic
 * structures and the ring buffer for communication with the backend, and
 * inform the backend of the appropriate details for those.  Switch to
 * Initialised state.
 */
static int blkfront_probe(struct xenbus_device *dev,
                          const struct xenbus_device_id *id)
{
        int err, vdevice, i;
        struct blkfront_info *info;

        /* FIXME: Use dynamic device id if this is not set. */
        err = xenbus_scanf(XBT_NIL, dev->nodename,
                           "virtual-device", "%i", &vdevice);
        if (err != 1) {
                xenbus_dev_fatal(dev, err, "reading virtual-device");
                return err;
        }

        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
                return -ENOMEM;
        }

        info->xbdev = dev;
        info->vdevice = vdevice;
        info->connected = BLKIF_STATE_DISCONNECTED;
        INIT_WORK(&info->work, blkif_restart_queue);

        for (i = 0; i < BLK_RING_SIZE; i++)
                info->shadow[i].req.id = i+1;
        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
        dev->dev.driver_data = info;

        err = talk_to_backend(dev, info);
        if (err) {
                kfree(info);
                dev->dev.driver_data = NULL;
                return err;
        }

        return 0;
}


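/*
 * After a resume, replay the requests recorded in the shadow ring: rebuild
 * the free list, re-grant each segment's frames to the (possibly new)
 * backend and push the requests onto the new shared ring.
 */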
static int blkif_recover(struct blkfront_info *info)
{
        int i;
        struct blkif_request *req;
        struct blk_shadow *copy;
        int j;

        /* Stage 1: Make a safe copy of the shadow state. */
        copy = kmalloc(sizeof(info->shadow),
                       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
        if (!copy)
                return -ENOMEM;
        memcpy(copy, info->shadow, sizeof(info->shadow));

        /* Stage 2: Set up free list. */
        memset(&info->shadow, 0, sizeof(info->shadow));
        for (i = 0; i < BLK_RING_SIZE; i++)
                info->shadow[i].req.id = i+1;
        info->shadow_free = info->ring.req_prod_pvt;
        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;

        /* Stage 3: Find pending requests and requeue them. */
        for (i = 0; i < BLK_RING_SIZE; i++) {
                /* Not in use? */
                if (copy[i].request == 0)
                        continue;

                /* Grab a request slot and copy shadow state into it. */
                req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
                *req = copy[i].req;

                /* We get a new request id, and must reset the shadow state. */
                req->id = get_id_from_freelist(info);
                memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));

                /* Rewrite any grant references invalidated by susp/resume. */
                for (j = 0; j < req->nr_segments; j++)
                        gnttab_grant_foreign_access_ref(
                                req->seg[j].gref,
                                info->xbdev->otherend_id,
                                pfn_to_mfn(info->shadow[req->id].frame[j]),
                                rq_data_dir(
                                        (struct request *)
                                        info->shadow[req->id].request));
                info->shadow[req->id].req = *req;

                info->ring.req_prod_pvt++;
        }

        kfree(copy);

        xenbus_switch_state(info->xbdev, XenbusStateConnected);

        spin_lock_irq(&blkif_io_lock);

        /* Now safe for us to use the shared ring */
        info->connected = BLKIF_STATE_CONNECTED;

        /* Send off requeued requests */
        flush_requests(info);

        /* Kick any other new requests queued since we resumed */
        kick_pending_request_queues(info);

        spin_unlock_irq(&blkif_io_lock);

        return 0;
}

/**
 * We are reconnecting to the backend, due to a suspend/resume, or a backend
 * driver restart.  We tear down our blkif structure and recreate it, but
 * leave the device-layer structures intact so that this is transparent to the
 * rest of the kernel.
 */
static int blkfront_resume(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev->dev.driver_data;
        int err;

        dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);

        blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);

        err = talk_to_backend(dev, info);
        if (info->connected == BLKIF_STATE_SUSPENDED && !err)
                err = blkif_recover(info);

        return err;
}


/*
 * Invoked when the backend is finally 'ready' (and has told us the
 * details about the physical device - #sectors, size, etc).
 */
static void blkfront_connect(struct blkfront_info *info)
{
        unsigned long long sectors;
        unsigned long sector_size;
        unsigned int binfo;
        int err;

        if ((info->connected == BLKIF_STATE_CONNECTED) ||
            (info->connected == BLKIF_STATE_SUSPENDED) )
                return;

        dev_dbg(&info->xbdev->dev, "%s:%s.\n",
                __func__, info->xbdev->otherend);

        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
                            "sectors", "%llu", &sectors,
                            "info", "%u", &binfo,
                            "sector-size", "%lu", &sector_size,
                            NULL);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err,
                                 "reading backend fields at %s",
                                 info->xbdev->otherend);
                return;
        }

        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
                            "feature-barrier", "%lu", &info->feature_barrier,
                            NULL);
        if (err)
                info->feature_barrier = 0;

        err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice),
                                  sectors, info->vdevice,
                                  binfo, sector_size, info);
        if (err) {
                xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
                                 info->xbdev->otherend);
                return;
        }

        xenbus_switch_state(info->xbdev, XenbusStateConnected);

        /* Kick pending requests. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = BLKIF_STATE_CONNECTED;
        kick_pending_request_queues(info);
        spin_unlock_irq(&blkif_io_lock);

        add_disk(info->gd);

        info->is_ready = 1;
}

/**
 * Handle the change of state of the backend to Closing.  We must delete our
 * device-layer structures now, to ensure that writes are flushed through to
 * the backend.  Once this is done, we can switch to Closed in
 * acknowledgement.
 */
static void blkfront_closing(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev->dev.driver_data;
        unsigned long flags;

        dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);

        if (info->rq == NULL)
                goto out;

        spin_lock_irqsave(&blkif_io_lock, flags);

        del_gendisk(info->gd);

        /* No more blkif_request(). */
        blk_stop_queue(info->rq);

        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
        spin_unlock_irqrestore(&blkif_io_lock, flags);

        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_scheduled_work();

        blk_cleanup_queue(info->rq);
        info->rq = NULL;

 out:
        xenbus_frontend_closed(dev);
}

/**
 * Callback received when the backend's state changes.
 */
static void backend_changed(struct xenbus_device *dev,
                            enum xenbus_state backend_state)
{
        struct blkfront_info *info = dev->dev.driver_data;
        struct block_device *bd;

        dev_dbg(&dev->dev, "blkfront:backend_changed.\n");

        switch (backend_state) {
        case XenbusStateInitialising:
        case XenbusStateInitWait:
        case XenbusStateInitialised:
        case XenbusStateUnknown:
        case XenbusStateClosed:
                break;

        case XenbusStateConnected:
                blkfront_connect(info);
                break;

        case XenbusStateClosing:
                bd = bdget_disk(info->gd, 0);
                if (bd == NULL)
                        xenbus_dev_fatal(dev, -ENODEV, "bdget failed");

                mutex_lock(&bd->bd_mutex);
                if (info->users > 0)
                        xenbus_dev_error(dev, -EBUSY,
                                         "Device in use; refusing to close");
                else
                        blkfront_closing(dev);
                mutex_unlock(&bd->bd_mutex);
                bdput(bd);
                break;
        }
}

static int blkfront_remove(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev->dev.driver_data;

        dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);

        blkif_free(info, 0);

        kfree(info);

        return 0;
}

static int blkfront_is_ready(struct xenbus_device *dev)
{
        struct blkfront_info *info = dev->dev.driver_data;

        return info->is_ready;
}

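/*
 * Open/release simply track the number of users so that hot-unplug can be
 * deferred while the device is in use; the last release completes a close
 * request the backend issued while the device was still open.
 */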
static int blkif_open(struct inode *inode, struct file *filep)
{
        struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
        info->users++;
        return 0;
}

static int blkif_release(struct inode *inode, struct file *filep)
{
        struct blkfront_info *info = inode->i_bdev->bd_disk->private_data;
        info->users--;
        if (info->users == 0) {
                /* Check whether we have been instructed to close.  We will
                   have ignored this request initially, as the device was
                   still mounted. */
                struct xenbus_device *dev = info->xbdev;
                enum xenbus_state state = xenbus_read_driver_state(dev->otherend);

                if (state == XenbusStateClosing && info->is_ready)
                        blkfront_closing(dev);
        }
        return 0;
}

static struct block_device_operations xlvbd_block_fops =
{
        .owner = THIS_MODULE,
        .open = blkif_open,
        .release = blkif_release,
        .getgeo = blkif_getgeo,
        .ioctl = blkif_ioctl,
};


static struct xenbus_device_id blkfront_ids[] = {
        { "vbd" },
        { "" }
};

static struct xenbus_driver blkfront = {
        .name = "vbd",
        .owner = THIS_MODULE,
        .ids = blkfront_ids,
        .probe = blkfront_probe,
        .remove = blkfront_remove,
        .resume = blkfront_resume,
        .otherend_changed = backend_changed,
        .is_ready = blkfront_is_ready,
};

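/*
 * Module init: reserve the XENVBD_MAJOR block major and register the
 * frontend with xenbus so that probe runs for each "vbd" device.
 */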
static int __init xlblk_init(void)
{
        if (!is_running_on_xen())
                return -ENODEV;

        if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
                printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
                       XENVBD_MAJOR, DEV_NAME);
                return -ENODEV;
        }

        return xenbus_register_frontend(&blkfront);
}
module_init(xlblk_init);


static void __exit xlblk_exit(void)
{
        return xenbus_unregister_driver(&blkfront);
}
module_exit(xlblk_exit);

MODULE_DESCRIPTION("Xen virtual block device frontend");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
MODULE_ALIAS("xen:vbd");
MODULE_ALIAS("xenblk");