linux/drivers/block/xen-blkfront.c
   1/*
   2 * blkfront.c
   3 *
   4 * XenLinux virtual block device driver.
   5 *
   6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
   7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
   8 * Copyright (c) 2004, Christian Limpach
   9 * Copyright (c) 2004, Andrew Warfield
  10 * Copyright (c) 2005, Christopher Clark
  11 * Copyright (c) 2005, XenSource Ltd
  12 *
  13 * This program is free software; you can redistribute it and/or
  14 * modify it under the terms of the GNU General Public License version 2
  15 * as published by the Free Software Foundation; or, when distributed
  16 * separately from the Linux kernel or incorporated into other
  17 * software packages, subject to the following license:
  18 *
  19 * Permission is hereby granted, free of charge, to any person obtaining a copy
  20 * of this source file (the "Software"), to deal in the Software without
  21 * restriction, including without limitation the rights to use, copy, modify,
  22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
  23 * and to permit persons to whom the Software is furnished to do so, subject to
  24 * the following conditions:
  25 *
  26 * The above copyright notice and this permission notice shall be included in
  27 * all copies or substantial portions of the Software.
  28 *
  29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  35 * IN THE SOFTWARE.
  36 */
  37
  38#include <linux/interrupt.h>
  39#include <linux/blkdev.h>
  40#include <linux/hdreg.h>
  41#include <linux/cdrom.h>
  42#include <linux/module.h>
  43#include <linux/scatterlist.h>
  44
  45#include <xen/xenbus.h>
  46#include <xen/grant_table.h>
  47#include <xen/events.h>
  48#include <xen/page.h>
  49
  50#include <xen/interface/grant_table.h>
  51#include <xen/interface/io/blkif.h>
  52#include <xen/interface/io/protocols.h>
  53
  54#include <asm/xen/hypervisor.h>
  55
  56enum blkif_state {
  57        BLKIF_STATE_DISCONNECTED,
  58        BLKIF_STATE_CONNECTED,
  59        BLKIF_STATE_SUSPENDED,
  60};
  61
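     /*
      * Per-request bookkeeping: a copy of the ring request so it can be
      * reissued after suspend/resume, the originating struct request
      * pointer, and the pfn of each data page so grant references can be
      * re-established on recovery.
      */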
  62struct blk_shadow {
  63        struct blkif_request req;
  64        unsigned long request;
  65        unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  66};
  67
  68static struct block_device_operations xlvbd_block_fops;
  69
  70#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
  71
  72/*
  73 * We have one of these per vbd, whether ide, scsi or 'other'.  They
  74 * hang in private_data off the gendisk structure. We may end up
  75 * putting all kinds of interesting stuff here :-)
  76 */
  77struct blkfront_info
  78{
  79        struct xenbus_device *xbdev;
  80        struct gendisk *gd;
  81        int vdevice;
  82        blkif_vdev_t handle;
  83        enum blkif_state connected;
  84        int ring_ref;
  85        struct blkif_front_ring ring;
  86        struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  87        unsigned int evtchn, irq;
  88        struct request_queue *rq;
  89        struct work_struct work;
  90        struct gnttab_free_callback callback;
  91        struct blk_shadow shadow[BLK_RING_SIZE];
  92        unsigned long shadow_free;
  93        int feature_barrier;
  94        int is_ready;
  95
  96        /**
  97         * The number of people holding this device open.  We won't allow a
  98         * hot-unplug unless this is 0.
  99         */
 100        int users;
 101};
 102
 103static DEFINE_SPINLOCK(blkif_io_lock);
 104
 105#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
 106        (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
 107#define GRANT_INVALID_REF       0
 108
 109#define PARTS_PER_DISK          16
 110#define PARTS_PER_EXT_DISK      256
 111
 112#define BLKIF_MAJOR(dev) ((dev)>>8)
 113#define BLKIF_MINOR(dev) ((dev) & 0xff)
 114
 115#define EXT_SHIFT 28
 116#define EXTENDED (1<<EXT_SHIFT)
 117#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
 118#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
 119
 120#define DEV_NAME        "xvd"   /* name in /dev */
 121
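     /*
      * The shadow entries double as a free list: the req.id field of an
      * unused entry holds the index of the next free entry, with
      * info->shadow_free pointing at the head.  get_id_from_freelist()
      * pops an index for a new request; add_id_to_freelist() returns it
      * once the response has been processed.
      */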
 122static int get_id_from_freelist(struct blkfront_info *info)
 123{
 124        unsigned long free = info->shadow_free;
  125        BUG_ON(free >= BLK_RING_SIZE);
 126        info->shadow_free = info->shadow[free].req.id;
 127        info->shadow[free].req.id = 0x0fffffee; /* debug */
 128        return free;
 129}
 130
 131static void add_id_to_freelist(struct blkfront_info *info,
 132                               unsigned long id)
 133{
 134        info->shadow[id].req.id  = info->shadow_free;
 135        info->shadow[id].request = 0;
 136        info->shadow_free = id;
 137}
 138
 139static void blkif_restart_queue_callback(void *arg)
 140{
 141        struct blkfront_info *info = (struct blkfront_info *)arg;
 142        schedule_work(&info->work);
 143}
 144
 145static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
 146{
 147        /* We don't have real geometry info, but let's at least return
 148           values consistent with the size of the device */
 149        sector_t nsect = get_capacity(bd->bd_disk);
 150        sector_t cylinders = nsect;
 151
 152        hg->heads = 0xff;
 153        hg->sectors = 0x3f;
 154        sector_div(cylinders, hg->heads * hg->sectors);
 155        hg->cylinders = cylinders;
 156        if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
 157                hg->cylinders = 0xffff;
 158        return 0;
 159}
 160
 161static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
 162                       unsigned command, unsigned long argument)
 163{
 164        struct blkfront_info *info = bdev->bd_disk->private_data;
 165        int i;
 166
 167        dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
 168                command, (long)argument);
 169
 170        switch (command) {
 171        case CDROMMULTISESSION:
 172                dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
 173                for (i = 0; i < sizeof(struct cdrom_multisession); i++)
 174                        if (put_user(0, (char __user *)(argument + i)))
 175                                return -EFAULT;
 176                return 0;
 177
 178        case CDROM_GET_CAPABILITY: {
 179                struct gendisk *gd = info->gd;
 180                if (gd->flags & GENHD_FL_CD)
 181                        return 0;
 182                return -EINVAL;
 183        }
 184
 185        default:
 186                /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
 187                  command);*/
 188                return -EINVAL; /* same return as native Linux */
 189        }
 190
 191        return 0;
 192}
 193
  194/*
  195 * blkif_queue_request
  196 *
  197 * Queue a block request on the shared ring.
  198 *
  199 * The request's pages are granted to the backend and a blkif_request
  200 * describing them is placed in the next free ring slot; a shadow copy
  201 * is kept so the request can be reissued after suspend/resume.
  202 * Returns 0 on success, 1 if not connected or out of grant references.
  203 */
 204static int blkif_queue_request(struct request *req)
 205{
 206        struct blkfront_info *info = req->rq_disk->private_data;
 207        unsigned long buffer_mfn;
 208        struct blkif_request *ring_req;
 209        unsigned long id;
 210        unsigned int fsect, lsect;
 211        int i, ref;
 212        grant_ref_t gref_head;
 213        struct scatterlist *sg;
 214
 215        if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 216                return 1;
 217
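             /*
              * Reserve enough grant references for a worst-case request.
              * If none are available, ask the grant-table code to invoke
              * blkif_restart_queue_callback() once enough references are
              * free again, and report the temporary failure to the caller.
              */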
 218        if (gnttab_alloc_grant_references(
 219                BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
 220                gnttab_request_free_callback(
 221                        &info->callback,
 222                        blkif_restart_queue_callback,
 223                        info,
 224                        BLKIF_MAX_SEGMENTS_PER_REQUEST);
 225                return 1;
 226        }
 227
 228        /* Fill out a communications ring structure. */
 229        ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
 230        id = get_id_from_freelist(info);
 231        info->shadow[id].request = (unsigned long)req;
 232
 233        ring_req->id = id;
 234        ring_req->sector_number = (blkif_sector_t)req->sector;
 235        ring_req->handle = info->handle;
 236
 237        ring_req->operation = rq_data_dir(req) ?
 238                BLKIF_OP_WRITE : BLKIF_OP_READ;
 239        if (blk_barrier_rq(req))
 240                ring_req->operation = BLKIF_OP_WRITE_BARRIER;
 241
 242        ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
 243        BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
 244
 245        for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
 246                buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
 247                fsect = sg->offset >> 9;
 248                lsect = fsect + (sg->length >> 9) - 1;
 249                /* install a grant reference. */
 250                ref = gnttab_claim_grant_reference(&gref_head);
 251                BUG_ON(ref == -ENOSPC);
 252
 253                gnttab_grant_foreign_access_ref(
 254                                ref,
 255                                info->xbdev->otherend_id,
 256                                buffer_mfn,
 257                                rq_data_dir(req) );
 258
 259                info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
 260                ring_req->seg[i] =
 261                                (struct blkif_request_segment) {
 262                                        .gref       = ref,
 263                                        .first_sect = fsect,
 264                                        .last_sect  = lsect };
 265        }
 266
 267        info->ring.req_prod_pvt++;
 268
 269        /* Keep a private copy so we can reissue requests when recovering. */
 270        info->shadow[id].req = *ring_req;
 271
 272        gnttab_free_grant_references(gref_head);
 273
 274        return 0;
 275}
 276
 277
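     /*
      * Push requests queued by blkif_queue_request() onto the shared ring
      * and notify the backend over the event channel if it needs a kick.
      */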
 278static inline void flush_requests(struct blkfront_info *info)
 279{
 280        int notify;
 281
 282        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
 283
 284        if (notify)
 285                notify_remote_via_irq(info->irq);
 286}
 287
 288/*
 289 * do_blkif_request
  290 *  process requests from the block-layer request queue
 291 */
 292static void do_blkif_request(struct request_queue *rq)
 293{
 294        struct blkfront_info *info = NULL;
 295        struct request *req;
 296        int queued;
 297
 298        pr_debug("Entered do_blkif_request\n");
 299
 300        queued = 0;
 301
 302        while ((req = elv_next_request(rq)) != NULL) {
 303                info = req->rq_disk->private_data;
 304                if (!blk_fs_request(req)) {
 305                        end_request(req, 0);
 306                        continue;
 307                }
 308
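                     /*
                      * If the ring is full (or, below, if we run out of
                      * grant references) stop the queue; it is restarted
                      * from the interrupt handler or the gnttab callback
                      * once resources are available again.
                      */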
 309                if (RING_FULL(&info->ring))
 310                        goto wait;
 311
 312                pr_debug("do_blk_req %p: cmd %p, sec %lx, "
 313                         "(%u/%li) buffer:%p [%s]\n",
 314                         req, req->cmd, (unsigned long)req->sector,
 315                         req->current_nr_sectors,
 316                         req->nr_sectors, req->buffer,
 317                         rq_data_dir(req) ? "write" : "read");
 318
 319
 320                blkdev_dequeue_request(req);
 321                if (blkif_queue_request(req)) {
 322                        blk_requeue_request(rq, req);
 323wait:
 324                        /* Avoid pointless unplugs. */
 325                        blk_stop_queue(rq);
 326                        break;
 327                }
 328
 329                queued++;
 330        }
 331
 332        if (queued != 0)
 333                flush_requests(info);
 334}
 335
 336static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 337{
 338        struct request_queue *rq;
 339
 340        rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
 341        if (rq == NULL)
 342                return -1;
 343
 344        queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 345
 346        /* Hard sector size and max sectors impersonate the equiv. hardware. */
 347        blk_queue_hardsect_size(rq, sector_size);
 348        blk_queue_max_sectors(rq, 512);
 349
 350        /* Each segment in a request is up to an aligned page in size. */
 351        blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
 352        blk_queue_max_segment_size(rq, PAGE_SIZE);
 353
 354        /* Ensure a merged request will fit in a single I/O ring slot. */
 355        blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
 356        blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
 357
 358        /* Make sure buffer addresses are sector-aligned. */
 359        blk_queue_dma_alignment(rq, 511);
 360
 361        /* Make sure we don't use bounce buffers. */
 362        blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
 363
 364        gd->queue = rq;
 365
 366        return 0;
 367}
 368
 369
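     /*
      * Advertise or withdraw barrier support on the request queue
      * according to info->feature_barrier, and log the result.
      */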
 370static int xlvbd_barrier(struct blkfront_info *info)
 371{
 372        int err;
 373
 374        err = blk_queue_ordered(info->rq,
 375                                info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
 376                                NULL);
 377
 378        if (err)
 379                return err;
 380
 381        printk(KERN_INFO "blkfront: %s: barriers %s\n",
 382               info->gd->disk_name,
 383               info->feature_barrier ? "enabled" : "disabled");
 384        return 0;
 385}
 386
 387
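     /*
      * Allocate and set up the gendisk for this virtual device.  The name
      * is derived from the device number: plain xvda, xvdb, ... when the
      * minor is a multiple of the per-disk partition count (a whole disk),
      * otherwise the partition number is appended.
      */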
 388static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 389                               struct blkfront_info *info,
 390                               u16 vdisk_info, u16 sector_size)
 391{
 392        struct gendisk *gd;
 393        int nr_minors = 1;
 394        int err = -ENODEV;
 395        unsigned int offset;
 396        int minor;
 397        int nr_parts;
 398
 399        BUG_ON(info->gd != NULL);
 400        BUG_ON(info->rq != NULL);
 401
 402        if ((info->vdevice>>EXT_SHIFT) > 1) {
 403                /* this is above the extended range; something is wrong */
 404                printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
 405                return -ENODEV;
 406        }
 407
 408        if (!VDEV_IS_EXTENDED(info->vdevice)) {
 409                minor = BLKIF_MINOR(info->vdevice);
 410                nr_parts = PARTS_PER_DISK;
 411        } else {
 412                minor = BLKIF_MINOR_EXT(info->vdevice);
 413                nr_parts = PARTS_PER_EXT_DISK;
 414        }
 415
 416        if ((minor % nr_parts) == 0)
 417                nr_minors = nr_parts;
 418
 419        gd = alloc_disk(nr_minors);
 420        if (gd == NULL)
 421                goto out;
 422
 423        offset = minor / nr_parts;
 424
 425        if (nr_minors > 1) {
 426                if (offset < 26)
 427                        sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
 428                else
 429                        sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
 430                                'a' + ((offset / 26)-1), 'a' + (offset % 26));
 431        } else {
 432                if (offset < 26)
 433                        sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
 434                                'a' + offset,
 435                                minor & (nr_parts - 1));
 436                else
 437                        sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
 438                                'a' + ((offset / 26) - 1),
 439                                'a' + (offset % 26),
 440                                minor & (nr_parts - 1));
 441        }
 442
 443        gd->major = XENVBD_MAJOR;
 444        gd->first_minor = minor;
 445        gd->fops = &xlvbd_block_fops;
 446        gd->private_data = info;
 447        gd->driverfs_dev = &(info->xbdev->dev);
 448        set_capacity(gd, capacity);
 449
 450        if (xlvbd_init_blk_queue(gd, sector_size)) {
 451                del_gendisk(gd);
 452                goto out;
 453        }
 454
 455        info->rq = gd->queue;
 456        info->gd = gd;
 457
 458        if (info->feature_barrier)
 459                xlvbd_barrier(info);
 460
 461        if (vdisk_info & VDISK_READONLY)
 462                set_disk_ro(gd, 1);
 463
 464        if (vdisk_info & VDISK_REMOVABLE)
 465                gd->flags |= GENHD_FL_REMOVABLE;
 466
 467        if (vdisk_info & VDISK_CDROM)
 468                gd->flags |= GENHD_FL_CD;
 469
 470        return 0;
 471
 472 out:
 473        return err;
 474}
 475
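     /*
      * If there is room on the ring again, restart the request queue and
      * poll it immediately to issue any waiting requests.
      */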
 476static void kick_pending_request_queues(struct blkfront_info *info)
 477{
 478        if (!RING_FULL(&info->ring)) {
 479                /* Re-enable calldowns. */
 480                blk_start_queue(info->rq);
 481                /* Kick things off immediately. */
 482                do_blkif_request(info->rq);
 483        }
 484}
 485
 486static void blkif_restart_queue(struct work_struct *work)
 487{
 488        struct blkfront_info *info = container_of(work, struct blkfront_info, work);
 489
 490        spin_lock_irq(&blkif_io_lock);
 491        if (info->connected == BLKIF_STATE_CONNECTED)
 492                kick_pending_request_queues(info);
 493        spin_unlock_irq(&blkif_io_lock);
 494}
 495
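     /*
      * Disconnect from the backend: stop the queue, drop the shared ring
      * grant and the event channel.  With @suspend set, the connection is
      * marked BLKIF_STATE_SUSPENDED so blkif_recover() can reissue the
      * outstanding requests after resume.
      */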
 496static void blkif_free(struct blkfront_info *info, int suspend)
 497{
 498        /* Prevent new requests being issued until we fix things up. */
 499        spin_lock_irq(&blkif_io_lock);
 500        info->connected = suspend ?
 501                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
 502        /* No more blkif_request(). */
 503        if (info->rq)
 504                blk_stop_queue(info->rq);
 505        /* No more gnttab callback work. */
 506        gnttab_cancel_free_callback(&info->callback);
 507        spin_unlock_irq(&blkif_io_lock);
 508
 509        /* Flush gnttab callback work. Must be done with no locks held. */
 510        flush_scheduled_work();
 511
 512        /* Free resources associated with old device channel. */
 513        if (info->ring_ref != GRANT_INVALID_REF) {
 514                gnttab_end_foreign_access(info->ring_ref, 0,
 515                                          (unsigned long)info->ring.sring);
 516                info->ring_ref = GRANT_INVALID_REF;
 517                info->ring.sring = NULL;
 518        }
 519        if (info->irq)
 520                unbind_from_irqhandler(info->irq, info);
 521        info->evtchn = info->irq = 0;
 522
 523}
 524
 525static void blkif_completion(struct blk_shadow *s)
 526{
 527        int i;
 528        for (i = 0; i < s->req.nr_segments; i++)
 529                gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
 530}
 531
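     /*
      * Event channel interrupt handler: consume responses from the shared
      * ring, end the corresponding block-layer requests, return the shadow
      * entries to the free list, and restart the queue if it was stopped
      * because the ring was full.
      */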
 532static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 533{
 534        struct request *req;
 535        struct blkif_response *bret;
 536        RING_IDX i, rp;
 537        unsigned long flags;
 538        struct blkfront_info *info = (struct blkfront_info *)dev_id;
 539        int error;
 540
 541        spin_lock_irqsave(&blkif_io_lock, flags);
 542
 543        if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
 544                spin_unlock_irqrestore(&blkif_io_lock, flags);
 545                return IRQ_HANDLED;
 546        }
 547
 548 again:
 549        rp = info->ring.sring->rsp_prod;
 550        rmb(); /* Ensure we see queued responses up to 'rp'. */
 551
 552        for (i = info->ring.rsp_cons; i != rp; i++) {
 553                unsigned long id;
 554                int ret;
 555
 556                bret = RING_GET_RESPONSE(&info->ring, i);
 557                id   = bret->id;
 558                req  = (struct request *)info->shadow[id].request;
 559
 560                blkif_completion(&info->shadow[id]);
 561
 562                add_id_to_freelist(info, id);
 563
 564                error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
 565                switch (bret->operation) {
 566                case BLKIF_OP_WRITE_BARRIER:
 567                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
 568                                printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
 569                                       info->gd->disk_name);
 570                                error = -EOPNOTSUPP;
 571                                info->feature_barrier = 0;
 572                                xlvbd_barrier(info);
 573                        }
 574                        /* fall through */
 575                case BLKIF_OP_READ:
 576                case BLKIF_OP_WRITE:
 577                        if (unlikely(bret->status != BLKIF_RSP_OKAY))
 578                                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
 579                                        "request: %x\n", bret->status);
 580
 581                        ret = __blk_end_request(req, error, blk_rq_bytes(req));
 582                        BUG_ON(ret);
 583                        break;
 584                default:
 585                        BUG();
 586                }
 587        }
 588
 589        info->ring.rsp_cons = i;
 590
 591        if (i != info->ring.req_prod_pvt) {
 592                int more_to_do;
 593                RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
 594                if (more_to_do)
 595                        goto again;
 596        } else
 597                info->ring.sring->rsp_event = i + 1;
 598
 599        kick_pending_request_queues(info);
 600
 601        spin_unlock_irqrestore(&blkif_io_lock, flags);
 602
 603        return IRQ_HANDLED;
 604}
 605
 606
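     /*
      * Allocate the shared ring page, grant the backend access to it, and
      * bind an event channel (with blkif_interrupt as its handler) for
      * ring notifications.
      */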
 607static int setup_blkring(struct xenbus_device *dev,
 608                         struct blkfront_info *info)
 609{
 610        struct blkif_sring *sring;
 611        int err;
 612
 613        info->ring_ref = GRANT_INVALID_REF;
 614
 615        sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
 616        if (!sring) {
 617                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
 618                return -ENOMEM;
 619        }
 620        SHARED_RING_INIT(sring);
 621        FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
 622
 623        sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
 624
 625        err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
 626        if (err < 0) {
 627                free_page((unsigned long)sring);
 628                info->ring.sring = NULL;
 629                goto fail;
 630        }
 631        info->ring_ref = err;
 632
 633        err = xenbus_alloc_evtchn(dev, &info->evtchn);
 634        if (err)
 635                goto fail;
 636
 637        err = bind_evtchn_to_irqhandler(info->evtchn,
 638                                        blkif_interrupt,
 639                                        IRQF_SAMPLE_RANDOM, "blkif", info);
 640        if (err <= 0) {
 641                xenbus_dev_fatal(dev, err,
 642                                 "bind_evtchn_to_irqhandler failed");
 643                goto fail;
 644        }
 645        info->irq = err;
 646
 647        return 0;
 648fail:
 649        blkif_free(info, 0);
 650        return err;
 651}
 652
 653
 654/* Common code used when first setting up, and when resuming. */
 655static int talk_to_backend(struct xenbus_device *dev,
 656                           struct blkfront_info *info)
 657{
 658        const char *message = NULL;
 659        struct xenbus_transaction xbt;
 660        int err;
 661
 662        /* Create shared ring, alloc event channel. */
 663        err = setup_blkring(dev, info);
 664        if (err)
 665                goto out;
 666
 667again:
 668        err = xenbus_transaction_start(&xbt);
 669        if (err) {
 670                xenbus_dev_fatal(dev, err, "starting transaction");
 671                goto destroy_blkring;
 672        }
 673
 674        err = xenbus_printf(xbt, dev->nodename,
 675                            "ring-ref", "%u", info->ring_ref);
 676        if (err) {
 677                message = "writing ring-ref";
 678                goto abort_transaction;
 679        }
 680        err = xenbus_printf(xbt, dev->nodename,
 681                            "event-channel", "%u", info->evtchn);
 682        if (err) {
 683                message = "writing event-channel";
 684                goto abort_transaction;
 685        }
 686        err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
 687                            XEN_IO_PROTO_ABI_NATIVE);
 688        if (err) {
 689                message = "writing protocol";
 690                goto abort_transaction;
 691        }
 692
 693        err = xenbus_transaction_end(xbt, 0);
 694        if (err) {
 695                if (err == -EAGAIN)
 696                        goto again;
 697                xenbus_dev_fatal(dev, err, "completing transaction");
 698                goto destroy_blkring;
 699        }
 700
 701        xenbus_switch_state(dev, XenbusStateInitialised);
 702
 703        return 0;
 704
 705 abort_transaction:
 706        xenbus_transaction_end(xbt, 1);
 707        if (message)
 708                xenbus_dev_fatal(dev, err, "%s", message);
 709 destroy_blkring:
 710        blkif_free(info, 0);
 711 out:
 712        return err;
 713}
 714
 715
 716/**
 717 * Entry point to this code when a new device is created.  Allocate the basic
 718 * structures and the ring buffer for communication with the backend, and
 719 * inform the backend of the appropriate details for those.  Switch to
 720 * Initialised state.
 721 */
 722static int blkfront_probe(struct xenbus_device *dev,
 723                          const struct xenbus_device_id *id)
 724{
 725        int err, vdevice, i;
 726        struct blkfront_info *info;
 727
 728        /* FIXME: Use dynamic device id if this is not set. */
 729        err = xenbus_scanf(XBT_NIL, dev->nodename,
 730                           "virtual-device", "%i", &vdevice);
 731        if (err != 1) {
 732                /* go looking in the extended area instead */
 733                err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
 734                                   "%i", &vdevice);
 735                if (err != 1) {
 736                        xenbus_dev_fatal(dev, err, "reading virtual-device");
 737                        return err;
 738                }
 739        }
 740
 741        info = kzalloc(sizeof(*info), GFP_KERNEL);
 742        if (!info) {
 743                xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
 744                return -ENOMEM;
 745        }
 746
 747        info->xbdev = dev;
 748        info->vdevice = vdevice;
 749        info->connected = BLKIF_STATE_DISCONNECTED;
 750        INIT_WORK(&info->work, blkif_restart_queue);
 751
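             /* Build the initial shadow free list: entry i links to i+1,
                with the last entry as an end-of-list marker. */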
 752        for (i = 0; i < BLK_RING_SIZE; i++)
 753                info->shadow[i].req.id = i+1;
 754        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
 755
 756        /* Front end dir is a number, which is used as the id. */
 757        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
 758        dev->dev.driver_data = info;
 759
 760        err = talk_to_backend(dev, info);
 761        if (err) {
 762                kfree(info);
 763                dev->dev.driver_data = NULL;
 764                return err;
 765        }
 766
 767        return 0;
 768}
 769
 770
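     /*
      * Called from blkfront_resume(): rebuild the shadow free list and
      * replay every request that was outstanding at suspend time,
      * re-granting its data pages to the (possibly new) backend before
      * the queue is restarted.
      */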
 771static int blkif_recover(struct blkfront_info *info)
 772{
 773        int i;
 774        struct blkif_request *req;
 775        struct blk_shadow *copy;
 776        int j;
 777
 778        /* Stage 1: Make a safe copy of the shadow state. */
 779        copy = kmalloc(sizeof(info->shadow),
 780                       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
 781        if (!copy)
 782                return -ENOMEM;
 783        memcpy(copy, info->shadow, sizeof(info->shadow));
 784
 785        /* Stage 2: Set up free list. */
 786        memset(&info->shadow, 0, sizeof(info->shadow));
 787        for (i = 0; i < BLK_RING_SIZE; i++)
 788                info->shadow[i].req.id = i+1;
 789        info->shadow_free = info->ring.req_prod_pvt;
 790        info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
 791
 792        /* Stage 3: Find pending requests and requeue them. */
 793        for (i = 0; i < BLK_RING_SIZE; i++) {
 794                /* Not in use? */
 795                if (copy[i].request == 0)
 796                        continue;
 797
 798                /* Grab a request slot and copy shadow state into it. */
 799                req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
 800                *req = copy[i].req;
 801
 802                /* We get a new request id, and must reset the shadow state. */
 803                req->id = get_id_from_freelist(info);
 804                memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
 805
 806                /* Rewrite any grant references invalidated by susp/resume. */
 807                for (j = 0; j < req->nr_segments; j++)
 808                        gnttab_grant_foreign_access_ref(
 809                                req->seg[j].gref,
 810                                info->xbdev->otherend_id,
 811                                pfn_to_mfn(info->shadow[req->id].frame[j]),
 812                                rq_data_dir(
 813                                        (struct request *)
 814                                        info->shadow[req->id].request));
 815                info->shadow[req->id].req = *req;
 816
 817                info->ring.req_prod_pvt++;
 818        }
 819
 820        kfree(copy);
 821
 822        xenbus_switch_state(info->xbdev, XenbusStateConnected);
 823
 824        spin_lock_irq(&blkif_io_lock);
 825
 826        /* Now safe for us to use the shared ring */
 827        info->connected = BLKIF_STATE_CONNECTED;
 828
 829        /* Send off requeued requests */
 830        flush_requests(info);
 831
 832        /* Kick any other new requests queued since we resumed */
 833        kick_pending_request_queues(info);
 834
 835        spin_unlock_irq(&blkif_io_lock);
 836
 837        return 0;
 838}
 839
 840/**
 841 * We are reconnecting to the backend, due to a suspend/resume, or a backend
 842 * driver restart.  We tear down our blkif structure and recreate it, but
 843 * leave the device-layer structures intact so that this is transparent to the
 844 * rest of the kernel.
 845 */
 846static int blkfront_resume(struct xenbus_device *dev)
 847{
 848        struct blkfront_info *info = dev->dev.driver_data;
 849        int err;
 850
 851        dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
 852
 853        blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
 854
 855        err = talk_to_backend(dev, info);
 856        if (info->connected == BLKIF_STATE_SUSPENDED && !err)
 857                err = blkif_recover(info);
 858
 859        return err;
 860}
 861
 862
 863/*
  864 * Invoked when the backend is finally 'ready' (and has provided
 865 * the details about the physical device - #sectors, size, etc).
 866 */
 867static void blkfront_connect(struct blkfront_info *info)
 868{
 869        unsigned long long sectors;
 870        unsigned long sector_size;
 871        unsigned int binfo;
 872        int err;
 873
 874        if ((info->connected == BLKIF_STATE_CONNECTED) ||
 875            (info->connected == BLKIF_STATE_SUSPENDED) )
 876                return;
 877
 878        dev_dbg(&info->xbdev->dev, "%s:%s.\n",
 879                __func__, info->xbdev->otherend);
 880
 881        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
 882                            "sectors", "%llu", &sectors,
 883                            "info", "%u", &binfo,
 884                            "sector-size", "%lu", &sector_size,
 885                            NULL);
 886        if (err) {
 887                xenbus_dev_fatal(info->xbdev, err,
 888                                 "reading backend fields at %s",
 889                                 info->xbdev->otherend);
 890                return;
 891        }
 892
 893        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
  894                            "feature-barrier", "%d", &info->feature_barrier,
 895                            NULL);
 896        if (err)
 897                info->feature_barrier = 0;
 898
 899        err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 900        if (err) {
 901                xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
 902                                 info->xbdev->otherend);
 903                return;
 904        }
 905
 906        xenbus_switch_state(info->xbdev, XenbusStateConnected);
 907
 908        /* Kick pending requests. */
 909        spin_lock_irq(&blkif_io_lock);
 910        info->connected = BLKIF_STATE_CONNECTED;
 911        kick_pending_request_queues(info);
 912        spin_unlock_irq(&blkif_io_lock);
 913
 914        add_disk(info->gd);
 915
 916        info->is_ready = 1;
 917}
 918
 919/**
 920 * Handle the change of state of the backend to Closing.  We must delete our
 921 * device-layer structures now, to ensure that writes are flushed through to
  922 * the backend.  Once this is done, we can switch to Closed in
 923 * acknowledgement.
 924 */
 925static void blkfront_closing(struct xenbus_device *dev)
 926{
 927        struct blkfront_info *info = dev->dev.driver_data;
 928        unsigned long flags;
 929
 930        dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
 931
 932        if (info->rq == NULL)
 933                goto out;
 934
 935        spin_lock_irqsave(&blkif_io_lock, flags);
 936
 937        /* No more blkif_request(). */
 938        blk_stop_queue(info->rq);
 939
 940        /* No more gnttab callback work. */
 941        gnttab_cancel_free_callback(&info->callback);
 942        spin_unlock_irqrestore(&blkif_io_lock, flags);
 943
 944        /* Flush gnttab callback work. Must be done with no locks held. */
 945        flush_scheduled_work();
 946
 947        blk_cleanup_queue(info->rq);
 948        info->rq = NULL;
 949
 950        del_gendisk(info->gd);
 951
 952 out:
 953        xenbus_frontend_closed(dev);
 954}
 955
 956/**
 957 * Callback received when the backend's state changes.
 958 */
 959static void backend_changed(struct xenbus_device *dev,
 960                            enum xenbus_state backend_state)
 961{
 962        struct blkfront_info *info = dev->dev.driver_data;
 963        struct block_device *bd;
 964
 965        dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
 966
 967        switch (backend_state) {
 968        case XenbusStateInitialising:
 969        case XenbusStateInitWait:
 970        case XenbusStateInitialised:
 971        case XenbusStateUnknown:
 972        case XenbusStateClosed:
 973                break;
 974
 975        case XenbusStateConnected:
 976                blkfront_connect(info);
 977                break;
 978
 979        case XenbusStateClosing:
 980                if (info->gd == NULL) {
 981                        xenbus_frontend_closed(dev);
 982                        break;
 983                }
 984                bd = bdget_disk(info->gd, 0);
  985                if (bd == NULL) {
  986                        xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
                             break;
                     }
 987
 988                mutex_lock(&bd->bd_mutex);
 989                if (info->users > 0)
 990                        xenbus_dev_error(dev, -EBUSY,
 991                                         "Device in use; refusing to close");
 992                else
 993                        blkfront_closing(dev);
 994                mutex_unlock(&bd->bd_mutex);
 995                bdput(bd);
 996                break;
 997        }
 998}
 999
1000static int blkfront_remove(struct xenbus_device *dev)
1001{
1002        struct blkfront_info *info = dev->dev.driver_data;
1003
1004        dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
1005
1006        blkif_free(info, 0);
1007
1008        kfree(info);
1009
1010        return 0;
1011}
1012
1013static int blkfront_is_ready(struct xenbus_device *dev)
1014{
1015        struct blkfront_info *info = dev->dev.driver_data;
1016
1017        return info->is_ready;
1018}
1019
1020static int blkif_open(struct block_device *bdev, fmode_t mode)
1021{
1022        struct blkfront_info *info = bdev->bd_disk->private_data;
1023        info->users++;
1024        return 0;
1025}
1026
1027static int blkif_release(struct gendisk *disk, fmode_t mode)
1028{
1029        struct blkfront_info *info = disk->private_data;
1030        info->users--;
1031        if (info->users == 0) {
1032                /* Check whether we have been instructed to close.  We will
1033                   have ignored this request initially, as the device was
1034                   still mounted. */
1035                struct xenbus_device *dev = info->xbdev;
1036                enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
1037
1038                if (state == XenbusStateClosing && info->is_ready)
1039                        blkfront_closing(dev);
1040        }
1041        return 0;
1042}
1043
1044static struct block_device_operations xlvbd_block_fops =
1045{
1046        .owner = THIS_MODULE,
1047        .open = blkif_open,
1048        .release = blkif_release,
1049        .getgeo = blkif_getgeo,
1050        .locked_ioctl = blkif_ioctl,
1051};
1052
1053
1054static struct xenbus_device_id blkfront_ids[] = {
1055        { "vbd" },
1056        { "" }
1057};
1058
1059static struct xenbus_driver blkfront = {
1060        .name = "vbd",
1061        .owner = THIS_MODULE,
1062        .ids = blkfront_ids,
1063        .probe = blkfront_probe,
1064        .remove = blkfront_remove,
1065        .resume = blkfront_resume,
1066        .otherend_changed = backend_changed,
1067        .is_ready = blkfront_is_ready,
1068};
1069
1070static int __init xlblk_init(void)
1071{
1072        if (!xen_domain())
1073                return -ENODEV;
1074
1075        if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
1076                printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
1077                       XENVBD_MAJOR, DEV_NAME);
1078                return -ENODEV;
1079        }
1080
1081        return xenbus_register_frontend(&blkfront);
1082}
1083module_init(xlblk_init);
1084
1085
1086static void __exit xlblk_exit(void)
1087{
 1088        xenbus_unregister_driver(&blkfront);
1089}
1090module_exit(xlblk_exit);
1091
1092MODULE_DESCRIPTION("Xen virtual block device frontend");
1093MODULE_LICENSE("GPL");
1094MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
1095MODULE_ALIAS("xen:vbd");
1096MODULE_ALIAS("xenblk");
1097