linux/virt/kvm/eventfd.c
<<
>>
Prefs
   1/*
   2 * kvm eventfd support - use eventfd objects to signal various KVM events
   3 *
   4 * Copyright 2009 Novell.  All Rights Reserved.
   5 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   6 *
   7 * Author:
   8 *      Gregory Haskins <ghaskins@novell.com>
   9 *
  10 * This file is free software; you can redistribute it and/or modify
  11 * it under the terms of version 2 of the GNU General Public License
  12 * as published by the Free Software Foundation.
  13 *
  14 * This program is distributed in the hope that it will be useful,
  15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 * GNU General Public License for more details.
  18 *
  19 * You should have received a copy of the GNU General Public License
  20 * along with this program; if not, write to the Free Software Foundation,
  21 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
  22 */
  23
  24#include <linux/kvm_host.h>
  25#include <linux/kvm.h>
  26#include <linux/workqueue.h>
  27#include <linux/syscalls.h>
  28#include <linux/wait.h>
  29#include <linux/poll.h>
  30#include <linux/file.h>
  31#include <linux/list.h>
  32#include <linux/eventfd.h>
  33#include <linux/kernel.h>
  34#include <linux/slab.h>
  35
  36#include "iodev.h"
  37
  38/*
  39 * --------------------------------------------------------------------
  40 * irqfd: Allows an fd to be used to inject an interrupt to the guest
  41 *
  42 * Credit goes to Avi Kivity for the original idea.
  43 * --------------------------------------------------------------------
  44 */
  45
/*
 * State binding one eventfd to one guest interrupt line (gsi) of a VM.
 * Active instances are linked on kvm->irqfds.items through ->list.
 */
struct _irqfd {
        /* Used for MSI fast-path */
        struct kvm *kvm;        /* VM that receives the interrupt */
        wait_queue_t wait;      /* hooked on the eventfd wait-queue, runs irqfd_wakeup() */
        /* Update side is protected by irqfds.lock */
        /* Set by irqfd_update(); NULL unless the gsi is routed as MSI */
        struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
        /* Used for level IRQ fast-path */
        int gsi;                /* guest interrupt line pulsed by irqfd_inject() */
        struct work_struct inject;      /* runs irqfd_inject() */
        /* Used for setup/shutdown */
        struct eventfd_ctx *eventfd;    /* reference dropped in irqfd_shutdown() */
        struct list_head list;  /* empty once the irqfd has been deactivated */
        poll_table pt;          /* carries irqfd_ptable_queue_proc() into ->poll() */
        struct work_struct shutdown;    /* runs irqfd_shutdown() on irqfd_cleanup_wq */
};
  61
/*
 * Workqueue that runs the irqfd_shutdown() jobs queued by irqfd_deactivate().
 * Created in irqfd_module_init() and destroyed in irqfd_module_exit().
 */
static struct workqueue_struct *irqfd_cleanup_wq;
  63
  64static void
  65irqfd_inject(struct work_struct *work)
  66{
  67        struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
  68        struct kvm *kvm = irqfd->kvm;
  69
  70        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
  71        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
  72}
  73
  74/*
  75 * Race-free decouple logic (ordering is critical)
  76 */
  77static void
  78irqfd_shutdown(struct work_struct *work)
  79{
  80        struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
  81        u64 cnt;
  82
  83        /*
  84         * Synchronize with the wait-queue and unhook ourselves to prevent
  85         * further events.
  86         */
  87        eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
  88
  89        /*
  90         * We know no new events will be scheduled at this point, so block
  91         * until all previously outstanding events have completed
  92         */
  93        flush_work_sync(&irqfd->inject);
  94
  95        /*
  96         * It is now safe to release the object's resources
  97         */
  98        eventfd_ctx_put(irqfd->eventfd);
  99        kfree(irqfd);
 100}
 101
 102
 103/* assumes kvm->irqfds.lock is held */
 104static bool
 105irqfd_is_active(struct _irqfd *irqfd)
 106{
 107        return list_empty(&irqfd->list) ? false : true;
 108}
 109
 110/*
 111 * Mark the irqfd as inactive and schedule it for removal
 112 *
 113 * assumes kvm->irqfds.lock is held
 114 */
 115static void
 116irqfd_deactivate(struct _irqfd *irqfd)
 117{
 118        BUG_ON(!irqfd_is_active(irqfd));
 119
 120        list_del_init(&irqfd->list);
 121
 122        queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
 123}
 124
 125/*
 126 * Called with wqh->lock held and interrupts disabled
 127 */
 128static int
 129irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 130{
 131        struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
 132        unsigned long flags = (unsigned long)key;
 133        struct kvm_kernel_irq_routing_entry *irq;
 134        struct kvm *kvm = irqfd->kvm;
 135
 136        if (flags & POLLIN) {
 137                rcu_read_lock();
 138                irq = rcu_dereference(irqfd->irq_entry);
 139                /* An event has been signaled, inject an interrupt */
 140                if (irq)
 141                        kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
 142                else
 143                        schedule_work(&irqfd->inject);
 144                rcu_read_unlock();
 145        }
 146
 147        if (flags & POLLHUP) {
 148                /* The eventfd is closing, detach from KVM */
 149                unsigned long flags;
 150
 151                spin_lock_irqsave(&kvm->irqfds.lock, flags);
 152
 153                /*
 154                 * We must check if someone deactivated the irqfd before
 155                 * we could acquire the irqfds.lock since the item is
 156                 * deactivated from the KVM side before it is unhooked from
 157                 * the wait-queue.  If it is already deactivated, we can
 158                 * simply return knowing the other side will cleanup for us.
 159                 * We cannot race against the irqfd going away since the
 160                 * other side is required to acquire wqh->lock, which we hold
 161                 */
 162                if (irqfd_is_active(irqfd))
 163                        irqfd_deactivate(irqfd);
 164
 165                spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
 166        }
 167
 168        return 0;
 169}
 170
 171static void
 172irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
 173                        poll_table *pt)
 174{
 175        struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
 176        add_wait_queue(wqh, &irqfd->wait);
 177}
 178
 179/* Must be called under irqfds.lock */
 180static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
 181                         struct kvm_irq_routing_table *irq_rt)
 182{
 183        struct kvm_kernel_irq_routing_entry *e;
 184        struct hlist_node *n;
 185
 186        if (irqfd->gsi >= irq_rt->nr_rt_entries) {
 187                rcu_assign_pointer(irqfd->irq_entry, NULL);
 188                return;
 189        }
 190
 191        hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
 192                /* Only fast-path MSI. */
 193                if (e->type == KVM_IRQ_ROUTING_MSI)
 194                        rcu_assign_pointer(irqfd->irq_entry, e);
 195                else
 196                        rcu_assign_pointer(irqfd->irq_entry, NULL);
 197        }
 198}
 199
 200static int
 201kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
 202{
 203        struct kvm_irq_routing_table *irq_rt;
 204        struct _irqfd *irqfd, *tmp;
 205        struct file *file = NULL;
 206        struct eventfd_ctx *eventfd = NULL;
 207        int ret;
 208        unsigned int events;
 209
 210        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
 211        if (!irqfd)
 212                return -ENOMEM;
 213
 214        irqfd->kvm = kvm;
 215        irqfd->gsi = gsi;
 216        INIT_LIST_HEAD(&irqfd->list);
 217        INIT_WORK(&irqfd->inject, irqfd_inject);
 218        INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
 219
 220        file = eventfd_fget(fd);
 221        if (IS_ERR(file)) {
 222                ret = PTR_ERR(file);
 223                goto fail;
 224        }
 225
 226        eventfd = eventfd_ctx_fileget(file);
 227        if (IS_ERR(eventfd)) {
 228                ret = PTR_ERR(eventfd);
 229                goto fail;
 230        }
 231
 232        irqfd->eventfd = eventfd;
 233
 234        /*
 235         * Install our own custom wake-up handling so we are notified via
 236         * a callback whenever someone signals the underlying eventfd
 237         */
 238        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
 239        init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
 240
 241        spin_lock_irq(&kvm->irqfds.lock);
 242
 243        ret = 0;
 244        list_for_each_entry(tmp, &kvm->irqfds.items, list) {
 245                if (irqfd->eventfd != tmp->eventfd)
 246                        continue;
 247                /* This fd is used for another irq already. */
 248                ret = -EBUSY;
 249                spin_unlock_irq(&kvm->irqfds.lock);
 250                goto fail;
 251        }
 252
 253        irq_rt = rcu_dereference_protected(kvm->irq_routing,
 254                                           lockdep_is_held(&kvm->irqfds.lock));
 255        irqfd_update(kvm, irqfd, irq_rt);
 256
 257        events = file->f_op->poll(file, &irqfd->pt);
 258
 259        list_add_tail(&irqfd->list, &kvm->irqfds.items);
 260
 261        /*
 262         * Check if there was an event already pending on the eventfd
 263         * before we registered, and trigger it as if we didn't miss it.
 264         */
 265        if (events & POLLIN)
 266                schedule_work(&irqfd->inject);
 267
 268        spin_unlock_irq(&kvm->irqfds.lock);
 269
 270        /*
 271         * do not drop the file until the irqfd is fully initialized, otherwise
 272         * we might race against the POLLHUP
 273         */
 274        fput(file);
 275
 276        return 0;
 277
 278fail:
 279        if (eventfd && !IS_ERR(eventfd))
 280                eventfd_ctx_put(eventfd);
 281
 282        if (!IS_ERR(file))
 283                fput(file);
 284
 285        kfree(irqfd);
 286        return ret;
 287}
 288
/*
 * Initialize the eventfd-related state of @kvm: the irqfd lock, the (empty)
 * list of irqfds and the (empty) list of ioeventfds.
 */
void
kvm_eventfd_init(struct kvm *kvm)
{
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
        INIT_LIST_HEAD(&kvm->ioeventfds);
}
 296
 297/*
 298 * shutdown any irqfd's that match fd+gsi
 299 */
 300static int
 301kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
 302{
 303        struct _irqfd *irqfd, *tmp;
 304        struct eventfd_ctx *eventfd;
 305
 306        eventfd = eventfd_ctx_fdget(fd);
 307        if (IS_ERR(eventfd))
 308                return PTR_ERR(eventfd);
 309
 310        spin_lock_irq(&kvm->irqfds.lock);
 311
 312        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
 313                if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
 314                        /*
 315                         * This rcu_assign_pointer is needed for when
 316                         * another thread calls kvm_irq_routing_update before
 317                         * we flush workqueue below (we synchronize with
 318                         * kvm_irq_routing_update using irqfds.lock).
 319                         * It is paired with synchronize_rcu done by caller
 320                         * of that function.
 321                         */
 322                        rcu_assign_pointer(irqfd->irq_entry, NULL);
 323                        irqfd_deactivate(irqfd);
 324                }
 325        }
 326
 327        spin_unlock_irq(&kvm->irqfds.lock);
 328        eventfd_ctx_put(eventfd);
 329
 330        /*
 331         * Block until we know all outstanding shutdown jobs have completed
 332         * so that we guarantee there will not be any more interrupts on this
 333         * gsi once this deassign function returns.
 334         */
 335        flush_workqueue(irqfd_cleanup_wq);
 336
 337        return 0;
 338}
 339
 340int
 341kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
 342{
 343        if (flags & KVM_IRQFD_FLAG_DEASSIGN)
 344                return kvm_irqfd_deassign(kvm, fd, gsi);
 345
 346        return kvm_irqfd_assign(kvm, fd, gsi);
 347}
 348
 349/*
 350 * This function is called as the kvm VM fd is being released. Shutdown all
 351 * irqfds that still remain open
 352 */
 353void
 354kvm_irqfd_release(struct kvm *kvm)
 355{
 356        struct _irqfd *irqfd, *tmp;
 357
 358        spin_lock_irq(&kvm->irqfds.lock);
 359
 360        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
 361                irqfd_deactivate(irqfd);
 362
 363        spin_unlock_irq(&kvm->irqfds.lock);
 364
 365        /*
 366         * Block until we know all outstanding shutdown jobs have completed
 367         * since we do not take a kvm* reference.
 368         */
 369        flush_workqueue(irqfd_cleanup_wq);
 370
 371}
 372
 373/*
 374 * Change irq_routing and irqfd.
 375 * Caller must invoke synchronize_rcu afterwards.
 376 */
 377void kvm_irq_routing_update(struct kvm *kvm,
 378                            struct kvm_irq_routing_table *irq_rt)
 379{
 380        struct _irqfd *irqfd;
 381
 382        spin_lock_irq(&kvm->irqfds.lock);
 383
 384        rcu_assign_pointer(kvm->irq_routing, irq_rt);
 385
 386        list_for_each_entry(irqfd, &kvm->irqfds.items, list)
 387                irqfd_update(kvm, irqfd, irq_rt);
 388
 389        spin_unlock_irq(&kvm->irqfds.lock);
 390}
 391
 392/*
 393 * create a host-wide workqueue for issuing deferred shutdown requests
 394 * aggregated from all vm* instances. We need our own isolated single-thread
 395 * queue to prevent deadlock against flushing the normal work-queue.
 396 */
 397static int __init irqfd_module_init(void)
 398{
 399        irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
 400        if (!irqfd_cleanup_wq)
 401                return -ENOMEM;
 402
 403        return 0;
 404}
 405
/* Destroy the cleanup workqueue created by irqfd_module_init(). */
static void __exit irqfd_module_exit(void)
{
        destroy_workqueue(irqfd_cleanup_wq);
}
 410
/* Register the irqfd workqueue setup/teardown as module init/exit hooks. */
module_init(irqfd_module_init);
module_exit(irqfd_module_exit);
 413
 414/*
 415 * --------------------------------------------------------------------
 416 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 417 *
 418 * userspace can register a PIO/MMIO address with an eventfd for receiving
 419 * notification when the memory has been touched.
 420 * --------------------------------------------------------------------
 421 */
 422
 423struct _ioeventfd {
 424        struct list_head     list;
 425        u64                  addr;
 426        int                  length;
 427        struct eventfd_ctx  *eventfd;
 428        u64                  datamatch;
 429        struct kvm_io_device dev;
 430        bool                 wildcard;
 431};
 432
 433static inline struct _ioeventfd *
 434to_ioeventfd(struct kvm_io_device *dev)
 435{
 436        return container_of(dev, struct _ioeventfd, dev);
 437}
 438
 439static void
 440ioeventfd_release(struct _ioeventfd *p)
 441{
 442        eventfd_ctx_put(p->eventfd);
 443        list_del(&p->list);
 444        kfree(p);
 445}
 446
 447static bool
 448ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 449{
 450        u64 _val;
 451
 452        if (!(addr == p->addr && len == p->length))
 453                /* address-range must be precise for a hit */
 454                return false;
 455
 456        if (p->wildcard)
 457                /* all else equal, wildcard is always a hit */
 458                return true;
 459
 460        /* otherwise, we have to actually compare the data */
 461
 462        BUG_ON(!IS_ALIGNED((unsigned long)val, len));
 463
 464        switch (len) {
 465        case 1:
 466                _val = *(u8 *)val;
 467                break;
 468        case 2:
 469                _val = *(u16 *)val;
 470                break;
 471        case 4:
 472                _val = *(u32 *)val;
 473                break;
 474        case 8:
 475                _val = *(u64 *)val;
 476                break;
 477        default:
 478                return false;
 479        }
 480
 481        return _val == p->datamatch ? true : false;
 482}
 483
 484/* MMIO/PIO writes trigger an event if the addr/val match */
 485static int
 486ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
 487                const void *val)
 488{
 489        struct _ioeventfd *p = to_ioeventfd(this);
 490
 491        if (!ioeventfd_in_range(p, addr, len, val))
 492                return -EOPNOTSUPP;
 493
 494        eventfd_signal(p->eventfd, 1);
 495        return 0;
 496}
 497
 498/*
 499 * This function is called as KVM is completely shutting down.  We do not
 500 * need to worry about locking just nuke anything we have as quickly as possible
 501 */
 502static void
 503ioeventfd_destructor(struct kvm_io_device *this)
 504{
 505        struct _ioeventfd *p = to_ioeventfd(this);
 506
 507        ioeventfd_release(p);
 508}
 509
 510static const struct kvm_io_device_ops ioeventfd_ops = {
 511        .write      = ioeventfd_write,
 512        .destructor = ioeventfd_destructor,
 513};
 514
 515/* assumes kvm->slots_lock held */
 516static bool
 517ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
 518{
 519        struct _ioeventfd *_p;
 520
 521        list_for_each_entry(_p, &kvm->ioeventfds, list)
 522                if (_p->addr == p->addr && _p->length == p->length &&
 523                    (_p->wildcard || p->wildcard ||
 524                     _p->datamatch == p->datamatch))
 525                        return true;
 526
 527        return false;
 528}
 529
 530static int
 531kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 532{
 533        int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
 534        enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
 535        struct _ioeventfd        *p;
 536        struct eventfd_ctx       *eventfd;
 537        int                       ret;
 538
 539        /* must be natural-word sized */
 540        switch (args->len) {
 541        case 1:
 542        case 2:
 543        case 4:
 544        case 8:
 545                break;
 546        default:
 547                return -EINVAL;
 548        }
 549
 550        /* check for range overflow */
 551        if (args->addr + args->len < args->addr)
 552                return -EINVAL;
 553
 554        /* check for extra flags that we don't understand */
 555        if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
 556                return -EINVAL;
 557
 558        eventfd = eventfd_ctx_fdget(args->fd);
 559        if (IS_ERR(eventfd))
 560                return PTR_ERR(eventfd);
 561
 562        p = kzalloc(sizeof(*p), GFP_KERNEL);
 563        if (!p) {
 564                ret = -ENOMEM;
 565                goto fail;
 566        }
 567
 568        INIT_LIST_HEAD(&p->list);
 569        p->addr    = args->addr;
 570        p->length  = args->len;
 571        p->eventfd = eventfd;
 572
 573        /* The datamatch feature is optional, otherwise this is a wildcard */
 574        if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
 575                p->datamatch = args->datamatch;
 576        else
 577                p->wildcard = true;
 578
 579        mutex_lock(&kvm->slots_lock);
 580
 581        /* Verify that there isn't a match already */
 582        if (ioeventfd_check_collision(kvm, p)) {
 583                ret = -EEXIST;
 584                goto unlock_fail;
 585        }
 586
 587        kvm_iodevice_init(&p->dev, &ioeventfd_ops);
 588
 589        ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
 590                                      &p->dev);
 591        if (ret < 0)
 592                goto unlock_fail;
 593
 594        list_add_tail(&p->list, &kvm->ioeventfds);
 595
 596        mutex_unlock(&kvm->slots_lock);
 597
 598        return 0;
 599
 600unlock_fail:
 601        mutex_unlock(&kvm->slots_lock);
 602
 603fail:
 604        kfree(p);
 605        eventfd_ctx_put(eventfd);
 606
 607        return ret;
 608}
 609
 610static int
 611kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 612{
 613        int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
 614        enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
 615        struct _ioeventfd        *p, *tmp;
 616        struct eventfd_ctx       *eventfd;
 617        int                       ret = -ENOENT;
 618
 619        eventfd = eventfd_ctx_fdget(args->fd);
 620        if (IS_ERR(eventfd))
 621                return PTR_ERR(eventfd);
 622
 623        mutex_lock(&kvm->slots_lock);
 624
 625        list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
 626                bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
 627
 628                if (p->eventfd != eventfd  ||
 629                    p->addr != args->addr  ||
 630                    p->length != args->len ||
 631                    p->wildcard != wildcard)
 632                        continue;
 633
 634                if (!p->wildcard && p->datamatch != args->datamatch)
 635                        continue;
 636
 637                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
 638                ioeventfd_release(p);
 639                ret = 0;
 640                break;
 641        }
 642
 643        mutex_unlock(&kvm->slots_lock);
 644
 645        eventfd_ctx_put(eventfd);
 646
 647        return ret;
 648}
 649
 650int
 651kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 652{
 653        if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
 654                return kvm_deassign_ioeventfd(kvm, args);
 655
 656        return kvm_assign_ioeventfd(kvm, args);
 657}
 658
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.