linux/virt/kvm/eventfd.c
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell.  All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *      Gregory Haskins <ghaskins@novell.com>
 *
 * This file is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/slab.h>

#include "iodev.h"

/*
 * --------------------------------------------------------------------
 * irqfd: Allows an fd to be used to inject an interrupt to the guest
 *
 * Credit goes to Avi Kivity for the original idea.
 * --------------------------------------------------------------------
 */
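
/*
 * Illustrative userspace usage (an added sketch, not part of the original
 * file): it assumes a VM fd ("vm_fd") obtained via KVM_CREATE_VM, plus
 * <sys/eventfd.h>, <sys/ioctl.h> and <linux/kvm.h>; the GSI number is
 * arbitrary.  Writing any nonzero count to the eventfd injects the bound
 * interrupt, and KVM_IRQFD_FLAG_DEASSIGN tears the binding down again:
 *
 *      int efd = eventfd(0, 0);
 *      struct kvm_irqfd irqfd = { .fd = efd, .gsi = 5 };
 *
 *      ioctl(vm_fd, KVM_IRQFD, &irqfd);                 // bind eventfd to GSI 5
 *      write(efd, &(uint64_t){ 1 }, sizeof(uint64_t));  // inject GSI 5
 *
 *      irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
 *      ioctl(vm_fd, KVM_IRQFD, &irqfd);                 // unbind
 */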

struct _irqfd {
        /* Used for MSI fast-path */
        struct kvm *kvm;
        wait_queue_t wait;
        /* Update side is protected by irqfds.lock */
        struct kvm_kernel_irq_routing_entry __rcu *irq_entry;
        /* Used for level IRQ fast-path */
        int gsi;
        struct work_struct inject;
        /* Used for setup/shutdown */
        struct eventfd_ctx *eventfd;
        struct list_head list;
        poll_table pt;
        struct work_struct shutdown;
};

static struct workqueue_struct *irqfd_cleanup_wq;

static void
irqfd_inject(struct work_struct *work)
{
        struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
        struct kvm *kvm = irqfd->kvm;

        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
        struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
        u64 cnt;

        /*
         * Synchronize with the wait-queue and unhook ourselves to prevent
         * further events.
         */
        eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

        /*
         * We know no new events will be scheduled at this point, so block
         * until all previously outstanding events have completed
         */
        flush_work_sync(&irqfd->inject);

        /*
         * It is now safe to release the object's resources
         */
        eventfd_ctx_put(irqfd->eventfd);
        kfree(irqfd);
}


/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct _irqfd *irqfd)
{
        return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct _irqfd *irqfd)
{
        BUG_ON(!irqfd_is_active(irqfd));

        list_del_init(&irqfd->list);

        queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
        struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
        unsigned long flags = (unsigned long)key;
        struct kvm_kernel_irq_routing_entry *irq;
        struct kvm *kvm = irqfd->kvm;

        if (flags & POLLIN) {
                rcu_read_lock();
                irq = rcu_dereference(irqfd->irq_entry);
                /* An event has been signaled, inject an interrupt */
                if (irq)
                        kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
                else
                        schedule_work(&irqfd->inject);
                rcu_read_unlock();
        }

        if (flags & POLLHUP) {
                /* The eventfd is closing, detach from KVM */
                unsigned long flags;

                spin_lock_irqsave(&kvm->irqfds.lock, flags);

                /*
                 * We must check if someone deactivated the irqfd before
                 * we could acquire the irqfds.lock, since the item is
                 * deactivated from the KVM side before it is unhooked from
                 * the wait-queue.  If it is already deactivated, we can
                 * simply return knowing the other side will clean up for us.
                 * We cannot race against the irqfd going away, since the
                 * other side is required to acquire wqh->lock, which we hold.
                 */
                if (irqfd_is_active(irqfd))
                        irqfd_deactivate(irqfd);

                spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
        }

        return 0;
}

static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
                        poll_table *pt)
{
        struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
        add_wait_queue(wqh, &irqfd->wait);
}

/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd,
                         struct kvm_irq_routing_table *irq_rt)
{
        struct kvm_kernel_irq_routing_entry *e;
        struct hlist_node *n;

        if (irqfd->gsi >= irq_rt->nr_rt_entries) {
                rcu_assign_pointer(irqfd->irq_entry, NULL);
                return;
        }

        hlist_for_each_entry(e, n, &irq_rt->map[irqfd->gsi], link) {
                /* Only fast-path MSI. */
                if (e->type == KVM_IRQ_ROUTING_MSI)
                        rcu_assign_pointer(irqfd->irq_entry, e);
                else
                        rcu_assign_pointer(irqfd->irq_entry, NULL);
        }
}

static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_irq_routing_table *irq_rt;
        struct _irqfd *irqfd, *tmp;
        struct file *file = NULL;
        struct eventfd_ctx *eventfd = NULL;
        int ret;
        unsigned int events;

        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
        if (!irqfd)
                return -ENOMEM;

        irqfd->kvm = kvm;
        irqfd->gsi = args->gsi;
        INIT_LIST_HEAD(&irqfd->list);
        INIT_WORK(&irqfd->inject, irqfd_inject);
        INIT_WORK(&irqfd->shutdown, irqfd_shutdown);

        file = eventfd_fget(args->fd);
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
                goto fail;
        }

        eventfd = eventfd_ctx_fileget(file);
        if (IS_ERR(eventfd)) {
                ret = PTR_ERR(eventfd);
                goto fail;
        }

        irqfd->eventfd = eventfd;

        /*
         * Install our own custom wake-up handling so we are notified via
         * a callback whenever someone signals the underlying eventfd
         */
        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
        init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

        spin_lock_irq(&kvm->irqfds.lock);

        ret = 0;
        list_for_each_entry(tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd != tmp->eventfd)
                        continue;
                /* This fd is used for another irq already. */
                ret = -EBUSY;
                spin_unlock_irq(&kvm->irqfds.lock);
                goto fail;
        }

        irq_rt = rcu_dereference_protected(kvm->irq_routing,
                                           lockdep_is_held(&kvm->irqfds.lock));
        irqfd_update(kvm, irqfd, irq_rt);

        events = file->f_op->poll(file, &irqfd->pt);

        list_add_tail(&irqfd->list, &kvm->irqfds.items);

        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
        if (events & POLLIN)
                schedule_work(&irqfd->inject);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * do not drop the file until the irqfd is fully initialized, otherwise
         * we might race against the POLLHUP
         */
        fput(file);

        return 0;

fail:
        if (eventfd && !IS_ERR(eventfd))
                eventfd_ctx_put(eventfd);

        if (!IS_ERR(file))
                fput(file);

        kfree(irqfd);
        return ret;
}

void
kvm_eventfd_init(struct kvm *kvm)
{
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
        INIT_LIST_HEAD(&kvm->ioeventfds);
}

/*
 * Shut down any irqfds that match fd+gsi
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct _irqfd *irqfd, *tmp;
        struct eventfd_ctx *eventfd;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
                        /*
                         * This rcu_assign_pointer is needed for when
                         * another thread calls kvm_irq_routing_update before
                         * we flush the workqueue below (we synchronize with
                         * kvm_irq_routing_update using irqfds.lock).
                         * It is paired with the synchronize_rcu done by the
                         * caller of that function.
                         */
                        rcu_assign_pointer(irqfd->irq_entry, NULL);
                        irqfd_deactivate(irqfd);
                }
        }

        spin_unlock_irq(&kvm->irqfds.lock);
        eventfd_ctx_put(eventfd);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * so that we guarantee there will not be any more interrupts on this
         * gsi once this deassign function returns.
         */
        flush_workqueue(irqfd_cleanup_wq);

        return 0;
}

int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
        if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
                return -EINVAL;

        if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
                return kvm_irqfd_deassign(kvm, args);

        return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released. Shut down all
 * irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
        struct _irqfd *irqfd, *tmp;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
                irqfd_deactivate(irqfd);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * since we do not take a kvm* reference.
         */
        flush_workqueue(irqfd_cleanup_wq);

}

/*
 * Change irq_routing and irqfd.
 * Caller must invoke synchronize_rcu afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm,
                            struct kvm_irq_routing_table *irq_rt)
{
        struct _irqfd *irqfd;

        spin_lock_irq(&kvm->irqfds.lock);

        rcu_assign_pointer(kvm->irq_routing, irq_rt);

        list_for_each_entry(irqfd, &kvm->irqfds.items, list)
                irqfd_update(kvm, irqfd, irq_rt);

        spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated single-thread
 * queue to prevent deadlock against flushing the normal work-queue.
 */
static int __init irqfd_module_init(void)
{
        irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
        if (!irqfd_cleanup_wq)
                return -ENOMEM;

        return 0;
}

static void __exit irqfd_module_exit(void)
{
        destroy_workqueue(irqfd_cleanup_wq);
}

module_init(irqfd_module_init);
module_exit(irqfd_module_exit);

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */
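
/*
 * Illustrative userspace usage (an added sketch, not part of the original
 * file): it assumes a VM fd ("vm_fd") and an arbitrary MMIO address; the
 * same headers as in the irqfd sketch above apply.  After registration, a
 * 4-byte guest write of the value 0x1 to that address signals the eventfd
 * in the kernel instead of causing an exit to userspace; without
 * KVM_IOEVENTFD_FLAG_DATAMATCH, any 4-byte write to the address matches:
 *
 *      int efd = eventfd(0, 0);
 *      struct kvm_ioeventfd ioefd = {
 *              .addr      = 0xfeed0000,
 *              .len       = 4,
 *              .fd        = efd,
 *              .datamatch = 0x1,
 *              .flags     = KVM_IOEVENTFD_FLAG_DATAMATCH,
 *      };
 *
 *      ioctl(vm_fd, KVM_IOEVENTFD, &ioefd);
 */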

struct _ioeventfd {
        struct list_head     list;
        u64                  addr;
        int                  length;
        struct eventfd_ctx  *eventfd;
        u64                  datamatch;
        struct kvm_io_device dev;
        bool                 wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
        return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
        eventfd_ctx_put(p->eventfd);
        list_del(&p->list);
        kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
        u64 _val;

        if (!(addr == p->addr && len == p->length))
                /* address-range must be precise for a hit */
                return false;

        if (p->wildcard)
                /* all else equal, wildcard is always a hit */
                return true;

        /* otherwise, we have to actually compare the data */

        BUG_ON(!IS_ALIGNED((unsigned long)val, len));

        switch (len) {
        case 1:
                _val = *(u8 *)val;
                break;
        case 2:
                _val = *(u16 *)val;
                break;
        case 4:
                _val = *(u32 *)val;
                break;
        case 8:
                _val = *(u64 *)val;
                break;
        default:
                return false;
        }

        return _val == p->datamatch ? true : false;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
                const void *val)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        if (!ioeventfd_in_range(p, addr, len, val))
                return -EOPNOTSUPP;

        eventfd_signal(p->eventfd, 1);
        return 0;
}

/*
 * This function is called as KVM is completely shutting down.  We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
        .write      = ioeventfd_write,
        .destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
        struct _ioeventfd *_p;

        list_for_each_entry(_p, &kvm->ioeventfds, list)
                if (_p->addr == p->addr && _p->length == p->length &&
                    (_p->wildcard || p->wildcard ||
                     _p->datamatch == p->datamatch))
                        return true;

        return false;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
        enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
        struct _ioeventfd        *p;
        struct eventfd_ctx       *eventfd;
        int                       ret;

        /* must be natural-word sized */
        switch (args->len) {
        case 1:
        case 2:
        case 4:
        case 8:
                break;
        default:
                return -EINVAL;
        }

        /* check for range overflow */
        if (args->addr + args->len < args->addr)
                return -EINVAL;

        /* check for extra flags that we don't understand */
        if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
                return -EINVAL;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p) {
                ret = -ENOMEM;
                goto fail;
        }

        INIT_LIST_HEAD(&p->list);
        p->addr    = args->addr;
        p->length  = args->len;
        p->eventfd = eventfd;

        /* The datamatch feature is optional, otherwise this is a wildcard */
        if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
                p->datamatch = args->datamatch;
        else
                p->wildcard = true;

        mutex_lock(&kvm->slots_lock);

        /* Verify that there isn't a match already */
        if (ioeventfd_check_collision(kvm, p)) {
                ret = -EEXIST;
                goto unlock_fail;
        }

        kvm_iodevice_init(&p->dev, &ioeventfd_ops);

        ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
                                      &p->dev);
        if (ret < 0)
                goto unlock_fail;

        list_add_tail(&p->list, &kvm->ioeventfds);

        mutex_unlock(&kvm->slots_lock);

        return 0;

unlock_fail:
        mutex_unlock(&kvm->slots_lock);

fail:
        kfree(p);
        eventfd_ctx_put(eventfd);

        return ret;
}

static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
        enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
        struct _ioeventfd        *p, *tmp;
        struct eventfd_ctx       *eventfd;
        int                       ret = -ENOENT;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        mutex_lock(&kvm->slots_lock);

        list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
                bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

                if (p->eventfd != eventfd  ||
                    p->addr != args->addr  ||
                    p->length != args->len ||
                    p->wildcard != wildcard)
                        continue;

                if (!p->wildcard && p->datamatch != args->datamatch)
                        continue;

                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
                ioeventfd_release(p);
                ret = 0;
                break;
        }

        mutex_unlock(&kvm->slots_lock);

        eventfd_ctx_put(eventfd);

        return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
                return kvm_deassign_ioeventfd(kvm, args);

        return kvm_assign_ioeventfd(kvm, args);
}