linux/drivers/vfio/vfio_iommu_type1.c
<<
>>
Prefs
   1/*
   2 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   3 *
   4 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5 *     Author: Alex Williamson <alex.williamson@redhat.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio:
  12 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13 * Author: Tom Lyon, pugs@cisco.com
  14 *
  15 * We arbitrarily define a Type1 IOMMU as one matching the below code.
  16 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  17 * VT-d, but that makes it harder to re-use as theoretically anyone
  18 * implementing a similar IOMMU could make use of this.  We expect the
  19 * IOMMU to support the IOMMU API and have few to no restrictions around
  20 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  21 * optimized for relatively static mappings of a userspace process with
  22 * userspace pages pinned into memory.  We also assume devices and IOMMU
  23 * domains are PCI based as the IOMMU API is still centered around a
  24 * device/bus interface rather than a group interface.
  25 */
  26
  27#include <linux/compat.h>
  28#include <linux/device.h>
  29#include <linux/fs.h>
  30#include <linux/iommu.h>
  31#include <linux/module.h>
  32#include <linux/mm.h>
  33#include <linux/pci.h>          /* pci_bus_type */
  34#include <linux/rbtree.h>
  35#include <linux/sched.h>
  36#include <linux/slab.h>
  37#include <linux/uaccess.h>
  38#include <linux/vfio.h>
  39#include <linux/workqueue.h>
  40
  41#define DRIVER_VERSION  "0.2"
  42#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  43#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
  44
  45static bool allow_unsafe_interrupts;
  46module_param_named(allow_unsafe_interrupts,
  47                   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
  48MODULE_PARM_DESC(allow_unsafe_interrupts,
  49                 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
  50
  51static bool disable_hugepages;
  52module_param_named(disable_hugepages,
  53                   disable_hugepages, bool, S_IRUGO | S_IWUSR);
  54MODULE_PARM_DESC(disable_hugepages,
  55                 "Disable VFIO IOMMU support for IOMMU hugepages.");
  56
  57struct vfio_iommu {
  58        struct iommu_domain     *domain;
  59        struct mutex            lock;
  60        struct rb_root          dma_list;
  61        struct list_head        group_list;
  62        bool                    cache;
  63};
  64
  65struct vfio_dma {
  66        struct rb_node          node;
  67        dma_addr_t              iova;           /* Device address */
  68        unsigned long           vaddr;          /* Process virtual addr */
  69        size_t                  size;           /* Map size (bytes) */
  70        int                     prot;           /* IOMMU_READ/WRITE */
  71};
  72
  73struct vfio_group {
  74        struct iommu_group      *iommu_group;
  75        struct list_head        next;
  76};
  77
  78/*
  79 * This code handles mapping and unmapping of user data buffers
  80 * into DMA'ble space using the IOMMU
  81 */
  82
  83static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
  84                                      dma_addr_t start, size_t size)
  85{
  86        struct rb_node *node = iommu->dma_list.rb_node;
  87
  88        while (node) {
  89                struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
  90
  91                if (start + size <= dma->iova)
  92                        node = node->rb_left;
  93                else if (start >= dma->iova + dma->size)
  94                        node = node->rb_right;
  95                else
  96                        return dma;
  97        }
  98
  99        return NULL;
 100}
 101
 102static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 103{
 104        struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 105        struct vfio_dma *dma;
 106
 107        while (*link) {
 108                parent = *link;
 109                dma = rb_entry(parent, struct vfio_dma, node);
 110
 111                if (new->iova + new->size <= dma->iova)
 112                        link = &(*link)->rb_left;
 113                else
 114                        link = &(*link)->rb_right;
 115        }
 116
 117        rb_link_node(&new->node, parent, link);
 118        rb_insert_color(&new->node, &iommu->dma_list);
 119}
 120
 121static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 122{
 123        rb_erase(&old->node, &iommu->dma_list);
 124}
 125
 126struct vwork {
 127        struct mm_struct        *mm;
 128        long                    npage;
 129        struct work_struct      work;
 130};
 131
 132/* delayed decrement/increment for locked_vm */
 133static void vfio_lock_acct_bg(struct work_struct *work)
 134{
 135        struct vwork *vwork = container_of(work, struct vwork, work);
 136        struct mm_struct *mm;
 137
 138        mm = vwork->mm;
 139        down_write(&mm->mmap_sem);
 140        mm->locked_vm += vwork->npage;
 141        up_write(&mm->mmap_sem);
 142        mmput(mm);
 143        kfree(vwork);
 144}
 145
 146static void vfio_lock_acct(long npage)
 147{
 148        struct vwork *vwork;
 149        struct mm_struct *mm;
 150
 151        if (!current->mm || !npage)
 152                return; /* process exited or nothing to do */
 153
 154        if (down_write_trylock(&current->mm->mmap_sem)) {
 155                current->mm->locked_vm += npage;
 156                up_write(&current->mm->mmap_sem);
 157                return;
 158        }
 159
 160        /*
 161         * Couldn't get mmap_sem lock, so must setup to update
 162         * mm->locked_vm later. If locked_vm were atomic, we
 163         * wouldn't need this silliness
 164         */
 165        vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 166        if (!vwork)
 167                return;
 168        mm = get_task_mm(current);
 169        if (!mm) {
 170                kfree(vwork);
 171                return;
 172        }
 173        INIT_WORK(&vwork->work, vfio_lock_acct_bg);
 174        vwork->mm = mm;
 175        vwork->npage = npage;
 176        schedule_work(&vwork->work);
 177}
 178
 179/*
 180 * Some mappings aren't backed by a struct page, for example an mmap'd
 181 * MMIO range for our own or another device.  These use a different
 182 * pfn conversion and shouldn't be tracked as locked pages.
 183 */
 184static bool is_invalid_reserved_pfn(unsigned long pfn)
 185{
 186        if (pfn_valid(pfn)) {
 187                bool reserved;
 188                struct page *tail = pfn_to_page(pfn);
 189                struct page *head = compound_trans_head(tail);
 190                reserved = !!(PageReserved(head));
 191                if (head != tail) {
 192                        /*
 193                         * "head" is not a dangling pointer
 194                         * (compound_trans_head takes care of that)
 195                         * but the hugepage may have been split
 196                         * from under us (and we may not hold a
 197                         * reference count on the head page so it can
 198                         * be reused before we run PageReferenced), so
 199                         * we've to check PageTail before returning
 200                         * what we just read.
 201                         */
 202                        smp_rmb();
 203                        if (PageTail(tail))
 204                                return reserved;
 205                }
 206                return PageReserved(tail);
 207        }
 208
 209        return true;
 210}
 211
 212static int put_pfn(unsigned long pfn, int prot)
 213{
 214        if (!is_invalid_reserved_pfn(pfn)) {
 215                struct page *page = pfn_to_page(pfn);
 216                if (prot & IOMMU_WRITE)
 217                        SetPageDirty(page);
 218                put_page(page);
 219                return 1;
 220        }
 221        return 0;
 222}
 223
 224static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 225{
 226        struct page *page[1];
 227        struct vm_area_struct *vma;
 228        int ret = -EFAULT;
 229
 230        if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
 231                *pfn = page_to_pfn(page[0]);
 232                return 0;
 233        }
 234
 235        down_read(&current->mm->mmap_sem);
 236
 237        vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
 238
 239        if (vma && vma->vm_flags & VM_PFNMAP) {
 240                *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 241                if (is_invalid_reserved_pfn(*pfn))
 242                        ret = 0;
 243        }
 244
 245        up_read(&current->mm->mmap_sem);
 246
 247        return ret;
 248}
 249
 250/*
 251 * Attempt to pin pages.  We really don't want to track all the pfns and
 252 * the iommu can only map chunks of consecutive pfns anyway, so get the
 253 * first page and all consecutive pages with the same locking.
 254 */
 255static long vfio_pin_pages(unsigned long vaddr, long npage,
 256                           int prot, unsigned long *pfn_base)
 257{
 258        unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 259        bool lock_cap = capable(CAP_IPC_LOCK);
 260        long ret, i;
 261
 262        if (!current->mm)
 263                return -ENODEV;
 264
 265        ret = vaddr_get_pfn(vaddr, prot, pfn_base);
 266        if (ret)
 267                return ret;
 268
 269        if (is_invalid_reserved_pfn(*pfn_base))
 270                return 1;
 271
 272        if (!lock_cap && current->mm->locked_vm + 1 > limit) {
 273                put_pfn(*pfn_base, prot);
 274                pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 275                        limit << PAGE_SHIFT);
 276                return -ENOMEM;
 277        }
 278
 279        if (unlikely(disable_hugepages)) {
 280                vfio_lock_acct(1);
 281                return 1;
 282        }
 283
 284        /* Lock all the consecutive pages from pfn_base */
 285        for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 286                unsigned long pfn = 0;
 287
 288                ret = vaddr_get_pfn(vaddr, prot, &pfn);
 289                if (ret)
 290                        break;
 291
 292                if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
 293                        put_pfn(pfn, prot);
 294                        break;
 295                }
 296
 297                if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
 298                        put_pfn(pfn, prot);
 299                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 300                                __func__, limit << PAGE_SHIFT);
 301                        break;
 302                }
 303        }
 304
 305        vfio_lock_acct(i);
 306
 307        return i;
 308}
 309
 310static long vfio_unpin_pages(unsigned long pfn, long npage,
 311                             int prot, bool do_accounting)
 312{
 313        unsigned long unlocked = 0;
 314        long i;
 315
 316        for (i = 0; i < npage; i++)
 317                unlocked += put_pfn(pfn++, prot);
 318
 319        if (do_accounting)
 320                vfio_lock_acct(-unlocked);
 321
 322        return unlocked;
 323}
 324
 325static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 326                            dma_addr_t iova, size_t *size)
 327{
 328        dma_addr_t start = iova, end = iova + *size;
 329        long unlocked = 0;
 330
 331        while (iova < end) {
 332                size_t unmapped;
 333                phys_addr_t phys;
 334
 335                /*
 336                 * We use the IOMMU to track the physical address.  This
 337                 * saves us from having a lot more entries in our mapping
 338                 * tree.  The downside is that we don't track the size
 339                 * used to do the mapping.  We request unmap of a single
 340                 * page, but expect IOMMUs that support large pages to
 341                 * unmap a larger chunk.
 342                 */
 343                phys = iommu_iova_to_phys(iommu->domain, iova);
 344                if (WARN_ON(!phys)) {
 345                        iova += PAGE_SIZE;
 346                        continue;
 347                }
 348
 349                unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 350                if (!unmapped)
 351                        break;
 352
 353                unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
 354                                             unmapped >> PAGE_SHIFT,
 355                                             dma->prot, false);
 356                iova += unmapped;
 357        }
 358
 359        vfio_lock_acct(-unlocked);
 360
 361        *size = iova - start;
 362
 363        return 0;
 364}
 365
 366static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
 367                                   size_t *size, struct vfio_dma *dma)
 368{
 369        size_t offset, overlap, tmp;
 370        struct vfio_dma *split;
 371        int ret;
 372
 373        if (!*size)
 374                return 0;
 375
 376        /*
 377         * Existing dma region is completely covered, unmap all.  This is
 378         * the likely case since userspace tends to map and unmap buffers
 379         * in one shot rather than multiple mappings within a buffer.
 380         */
 381        if (likely(start <= dma->iova &&
 382                   start + *size >= dma->iova + dma->size)) {
 383                *size = dma->size;
 384                ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
 385                if (ret)
 386                        return ret;
 387
 388                /*
 389                 * Did we remove more than we have?  Should never happen
 390                 * since a vfio_dma is contiguous in iova and vaddr.
 391                 */
 392                WARN_ON(*size != dma->size);
 393
 394                vfio_remove_dma(iommu, dma);
 395                kfree(dma);
 396                return 0;
 397        }
 398
 399        /* Overlap low address of existing range */
 400        if (start <= dma->iova) {
 401                overlap = start + *size - dma->iova;
 402                ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
 403                if (ret)
 404                        return ret;
 405
 406                vfio_remove_dma(iommu, dma);
 407
 408                /*
 409                 * Check, we may have removed to whole vfio_dma.  If not
 410                 * fixup and re-insert.
 411                 */
 412                if (overlap < dma->size) {
 413                        dma->iova += overlap;
 414                        dma->vaddr += overlap;
 415                        dma->size -= overlap;
 416                        vfio_insert_dma(iommu, dma);
 417                } else
 418                        kfree(dma);
 419
 420                *size = overlap;
 421                return 0;
 422        }
 423
 424        /* Overlap high address of existing range */
 425        if (start + *size >= dma->iova + dma->size) {
 426                offset = start - dma->iova;
 427                overlap = dma->size - offset;
 428
 429                ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
 430                if (ret)
 431                        return ret;
 432
 433                dma->size -= overlap;
 434                *size = overlap;
 435                return 0;
 436        }
 437
 438        /* Split existing */
 439
 440        /*
 441         * Allocate our tracking structure early even though it may not
 442         * be used.  An Allocation failure later loses track of pages and
 443         * is more difficult to unwind.
 444         */
 445        split = kzalloc(sizeof(*split), GFP_KERNEL);
 446        if (!split)
 447                return -ENOMEM;
 448
 449        offset = start - dma->iova;
 450
 451        ret = vfio_unmap_unpin(iommu, dma, start, size);
 452        if (ret || !*size) {
 453                kfree(split);
 454                return ret;
 455        }
 456
 457        tmp = dma->size;
 458
 459        /* Resize the lower vfio_dma in place, before the below insert */
 460        dma->size = offset;
 461
 462        /* Insert new for remainder, assuming it didn't all get unmapped */
 463        if (likely(offset + *size < tmp)) {
 464                split->size = tmp - offset - *size;
 465                split->iova = dma->iova + offset + *size;
 466                split->vaddr = dma->vaddr + offset + *size;
 467                split->prot = dma->prot;
 468                vfio_insert_dma(iommu, split);
 469        } else
 470                kfree(split);
 471
 472        return 0;
 473}
 474
 475static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 476                             struct vfio_iommu_type1_dma_unmap *unmap)
 477{
 478        uint64_t mask;
 479        struct vfio_dma *dma;
 480        size_t unmapped = 0, size;
 481        int ret = 0;
 482
 483        mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 484
 485        if (unmap->iova & mask)
 486                return -EINVAL;
 487        if (!unmap->size || unmap->size & mask)
 488                return -EINVAL;
 489
 490        WARN_ON(mask & PAGE_MASK);
 491
 492        mutex_lock(&iommu->lock);
 493
 494        while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 495                size = unmap->size;
 496                ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
 497                if (ret || !size)
 498                        break;
 499                unmapped += size;
 500        }
 501
 502        mutex_unlock(&iommu->lock);
 503
 504        /*
 505         * We may unmap more than requested, update the unmap struct so
 506         * userspace can know.
 507         */
 508        unmap->size = unmapped;
 509
 510        return ret;
 511}
 512
 513/*
 514 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 515 * to a region that previously mapped smaller pages.  This should be fixed
 516 * soon, so this is just a temporary workaround to break mappings down into
 517 * PAGE_SIZE.  Better to map smaller pages than nothing.
 518 */
 519static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
 520                          unsigned long pfn, long npage, int prot)
 521{
 522        long i;
 523        int ret;
 524
 525        for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 526                ret = iommu_map(iommu->domain, iova,
 527                                (phys_addr_t)pfn << PAGE_SHIFT,
 528                                PAGE_SIZE, prot);
 529                if (ret)
 530                        break;
 531        }
 532
 533        for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
 534                iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 535
 536        return ret;
 537}
 538
 539static int vfio_dma_do_map(struct vfio_iommu *iommu,
 540                           struct vfio_iommu_type1_dma_map *map)
 541{
 542        dma_addr_t end, iova;
 543        unsigned long vaddr = map->vaddr;
 544        size_t size = map->size;
 545        long npage;
 546        int ret = 0, prot = 0;
 547        uint64_t mask;
 548        struct vfio_dma *dma = NULL;
 549        unsigned long pfn;
 550
 551        end = map->iova + map->size;
 552
 553        mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 554
 555        /* READ/WRITE from device perspective */
 556        if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
 557                prot |= IOMMU_WRITE;
 558        if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 559                prot |= IOMMU_READ;
 560
 561        if (!prot)
 562                return -EINVAL; /* No READ/WRITE? */
 563
 564        if (iommu->cache)
 565                prot |= IOMMU_CACHE;
 566
 567        if (vaddr & mask)
 568                return -EINVAL;
 569        if (map->iova & mask)
 570                return -EINVAL;
 571        if (!map->size || map->size & mask)
 572                return -EINVAL;
 573
 574        WARN_ON(mask & PAGE_MASK);
 575
 576        /* Don't allow IOVA wrap */
 577        if (end && end < map->iova)
 578                return -EINVAL;
 579
 580        /* Don't allow virtual address wrap */
 581        if (vaddr + map->size && vaddr + map->size < vaddr)
 582                return -EINVAL;
 583
 584        mutex_lock(&iommu->lock);
 585
 586        if (vfio_find_dma(iommu, map->iova, map->size)) {
 587                mutex_unlock(&iommu->lock);
 588                return -EEXIST;
 589        }
 590
 591        for (iova = map->iova; iova < end; iova += size, vaddr += size) {
 592                long i;
 593
 594                /* Pin a contiguous chunk of memory */
 595                npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
 596                                       prot, &pfn);
 597                if (npage <= 0) {
 598                        WARN_ON(!npage);
 599                        ret = (int)npage;
 600                        goto out;
 601                }
 602
 603                /* Verify pages are not already mapped */
 604                for (i = 0; i < npage; i++) {
 605                        if (iommu_iova_to_phys(iommu->domain,
 606                                               iova + (i << PAGE_SHIFT))) {
 607                                ret = -EBUSY;
 608                                goto out_unpin;
 609                        }
 610                }
 611
 612                ret = iommu_map(iommu->domain, iova,
 613                                (phys_addr_t)pfn << PAGE_SHIFT,
 614                                npage << PAGE_SHIFT, prot);
 615                if (ret) {
 616                        if (ret != -EBUSY ||
 617                            map_try_harder(iommu, iova, pfn, npage, prot)) {
 618                                goto out_unpin;
 619                        }
 620                }
 621
 622                size = npage << PAGE_SHIFT;
 623
 624                /*
 625                 * Check if we abut a region below - nothing below 0.
 626                 * This is the most likely case when mapping chunks of
 627                 * physically contiguous regions within a virtual address
 628                 * range.  Update the abutting entry in place since iova
 629                 * doesn't change.
 630                 */
 631                if (likely(iova)) {
 632                        struct vfio_dma *tmp;
 633                        tmp = vfio_find_dma(iommu, iova - 1, 1);
 634                        if (tmp && tmp->prot == prot &&
 635                            tmp->vaddr + tmp->size == vaddr) {
 636                                tmp->size += size;
 637                                iova = tmp->iova;
 638                                size = tmp->size;
 639                                vaddr = tmp->vaddr;
 640                                dma = tmp;
 641                        }
 642                }
 643
 644                /*
 645                 * Check if we abut a region above - nothing above ~0 + 1.
 646                 * If we abut above and below, remove and free.  If only
 647                 * abut above, remove, modify, reinsert.
 648                 */
 649                if (likely(iova + size)) {
 650                        struct vfio_dma *tmp;
 651                        tmp = vfio_find_dma(iommu, iova + size, 1);
 652                        if (tmp && tmp->prot == prot &&
 653                            tmp->vaddr == vaddr + size) {
 654                                vfio_remove_dma(iommu, tmp);
 655                                if (dma) {
 656                                        dma->size += tmp->size;
 657                                        kfree(tmp);
 658                                } else {
 659                                        size += tmp->size;
 660                                        tmp->size = size;
 661                                        tmp->iova = iova;
 662                                        tmp->vaddr = vaddr;
 663                                        vfio_insert_dma(iommu, tmp);
 664                                        dma = tmp;
 665                                }
 666                        }
 667                }
 668
 669                if (!dma) {
 670                        dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 671                        if (!dma) {
 672                                iommu_unmap(iommu->domain, iova, size);
 673                                ret = -ENOMEM;
 674                                goto out_unpin;
 675                        }
 676
 677                        dma->size = size;
 678                        dma->iova = iova;
 679                        dma->vaddr = vaddr;
 680                        dma->prot = prot;
 681                        vfio_insert_dma(iommu, dma);
 682                }
 683        }
 684
 685        WARN_ON(ret);
 686        mutex_unlock(&iommu->lock);
 687        return ret;
 688
 689out_unpin:
 690        vfio_unpin_pages(pfn, npage, prot, true);
 691
 692out:
 693        iova = map->iova;
 694        size = map->size;
 695        while ((dma = vfio_find_dma(iommu, iova, size))) {
 696                int r = vfio_remove_dma_overlap(iommu, iova,
 697                                                &size, dma);
 698                if (WARN_ON(r || !size))
 699                        break;
 700        }
 701
 702        mutex_unlock(&iommu->lock);
 703        return ret;
 704}
 705
 706static int vfio_iommu_type1_attach_group(void *iommu_data,
 707                                         struct iommu_group *iommu_group)
 708{
 709        struct vfio_iommu *iommu = iommu_data;
 710        struct vfio_group *group, *tmp;
 711        int ret;
 712
 713        group = kzalloc(sizeof(*group), GFP_KERNEL);
 714        if (!group)
 715                return -ENOMEM;
 716
 717        mutex_lock(&iommu->lock);
 718
 719        list_for_each_entry(tmp, &iommu->group_list, next) {
 720                if (tmp->iommu_group == iommu_group) {
 721                        mutex_unlock(&iommu->lock);
 722                        kfree(group);
 723                        return -EINVAL;
 724                }
 725        }
 726
 727        /*
 728         * TODO: Domain have capabilities that might change as we add
 729         * groups (see iommu->cache, currently never set).  Check for
 730         * them and potentially disallow groups to be attached when it
 731         * would change capabilities (ugh).
 732         */
 733        ret = iommu_attach_group(iommu->domain, iommu_group);
 734        if (ret) {
 735                mutex_unlock(&iommu->lock);
 736                kfree(group);
 737                return ret;
 738        }
 739
 740        group->iommu_group = iommu_group;
 741        list_add(&group->next, &iommu->group_list);
 742
 743        mutex_unlock(&iommu->lock);
 744
 745        return 0;
 746}
 747
 748static void vfio_iommu_type1_detach_group(void *iommu_data,
 749                                          struct iommu_group *iommu_group)
 750{
 751        struct vfio_iommu *iommu = iommu_data;
 752        struct vfio_group *group;
 753
 754        mutex_lock(&iommu->lock);
 755
 756        list_for_each_entry(group, &iommu->group_list, next) {
 757                if (group->iommu_group == iommu_group) {
 758                        iommu_detach_group(iommu->domain, iommu_group);
 759                        list_del(&group->next);
 760                        kfree(group);
 761                        break;
 762                }
 763        }
 764
 765        mutex_unlock(&iommu->lock);
 766}
 767
 768static void *vfio_iommu_type1_open(unsigned long arg)
 769{
 770        struct vfio_iommu *iommu;
 771
 772        if (arg != VFIO_TYPE1_IOMMU)
 773                return ERR_PTR(-EINVAL);
 774
 775        iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 776        if (!iommu)
 777                return ERR_PTR(-ENOMEM);
 778
 779        INIT_LIST_HEAD(&iommu->group_list);
 780        iommu->dma_list = RB_ROOT;
 781        mutex_init(&iommu->lock);
 782
 783        /*
 784         * Wish we didn't have to know about bus_type here.
 785         */
 786        iommu->domain = iommu_domain_alloc(&pci_bus_type);
 787        if (!iommu->domain) {
 788                kfree(iommu);
 789                return ERR_PTR(-EIO);
 790        }
 791
 792        /*
 793         * Wish we could specify required capabilities rather than create
 794         * a domain, see what comes out and hope it doesn't change along
 795         * the way.  Fortunately we know interrupt remapping is global for
 796         * our iommus.
 797         */
 798        if (!allow_unsafe_interrupts &&
 799            !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
 800                pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
 801                       __func__);
 802                iommu_domain_free(iommu->domain);
 803                kfree(iommu);
 804                return ERR_PTR(-EPERM);
 805        }
 806
 807        return iommu;
 808}
 809
 810static void vfio_iommu_type1_release(void *iommu_data)
 811{
 812        struct vfio_iommu *iommu = iommu_data;
 813        struct vfio_group *group, *group_tmp;
 814        struct rb_node *node;
 815
 816        list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
 817                iommu_detach_group(iommu->domain, group->iommu_group);
 818                list_del(&group->next);
 819                kfree(group);
 820        }
 821
 822        while ((node = rb_first(&iommu->dma_list))) {
 823                struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 824                size_t size = dma->size;
 825                vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
 826                if (WARN_ON(!size))
 827                        break;
 828        }
 829
 830        iommu_domain_free(iommu->domain);
 831        iommu->domain = NULL;
 832        kfree(iommu);
 833}
 834
 835static long vfio_iommu_type1_ioctl(void *iommu_data,
 836                                   unsigned int cmd, unsigned long arg)
 837{
 838        struct vfio_iommu *iommu = iommu_data;
 839        unsigned long minsz;
 840
 841        if (cmd == VFIO_CHECK_EXTENSION) {
 842                switch (arg) {
 843                case VFIO_TYPE1_IOMMU:
 844                        return 1;
 845                default:
 846                        return 0;
 847                }
 848        } else if (cmd == VFIO_IOMMU_GET_INFO) {
 849                struct vfio_iommu_type1_info info;
 850
 851                minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 852
 853                if (copy_from_user(&info, (void __user *)arg, minsz))
 854                        return -EFAULT;
 855
 856                if (info.argsz < minsz)
 857                        return -EINVAL;
 858
 859                info.flags = 0;
 860
 861                info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;
 862
 863                return copy_to_user((void __user *)arg, &info, minsz);
 864
 865        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
 866                struct vfio_iommu_type1_dma_map map;
 867                uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
 868                                VFIO_DMA_MAP_FLAG_WRITE;
 869
 870                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 871
 872                if (copy_from_user(&map, (void __user *)arg, minsz))
 873                        return -EFAULT;
 874
 875                if (map.argsz < minsz || map.flags & ~mask)
 876                        return -EINVAL;
 877
 878                return vfio_dma_do_map(iommu, &map);
 879
 880        } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 881                struct vfio_iommu_type1_dma_unmap unmap;
 882                long ret;
 883
 884                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 885
 886                if (copy_from_user(&unmap, (void __user *)arg, minsz))
 887                        return -EFAULT;
 888
 889                if (unmap.argsz < minsz || unmap.flags)
 890                        return -EINVAL;
 891
 892                ret = vfio_dma_do_unmap(iommu, &unmap);
 893                if (ret)
 894                        return ret;
 895
 896                return copy_to_user((void __user *)arg, &unmap, minsz);
 897        }
 898
 899        return -ENOTTY;
 900}
 901
 902static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 903        .name           = "vfio-iommu-type1",
 904        .owner          = THIS_MODULE,
 905        .open           = vfio_iommu_type1_open,
 906        .release        = vfio_iommu_type1_release,
 907        .ioctl          = vfio_iommu_type1_ioctl,
 908        .attach_group   = vfio_iommu_type1_attach_group,
 909        .detach_group   = vfio_iommu_type1_detach_group,
 910};
 911
 912static int __init vfio_iommu_type1_init(void)
 913{
 914        if (!iommu_present(&pci_bus_type))
 915                return -ENODEV;
 916
 917        return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
 918}
 919
 920static void __exit vfio_iommu_type1_cleanup(void)
 921{
 922        vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
 923}
 924
 925module_init(vfio_iommu_type1_init);
 926module_exit(vfio_iommu_type1_cleanup);
 927
 928MODULE_VERSION(DRIVER_VERSION);
 929MODULE_LICENSE("GPL v2");
 930MODULE_AUTHOR(DRIVER_AUTHOR);
 931MODULE_DESCRIPTION(DRIVER_DESC);
 932
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.