linux/drivers/iommu/intel-iommu.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/cacheflush.h>
  46#include <asm/iommu.h>
  47
  48#define ROOT_SIZE               VTD_PAGE_SIZE
  49#define CONTEXT_SIZE            VTD_PAGE_SIZE
  50
  51#define IS_BRIDGE_HOST_DEVICE(pdev) \
  52                            ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
  53#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  54#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  55#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  56
  57#define IOAPIC_RANGE_START      (0xfee00000)
  58#define IOAPIC_RANGE_END        (0xfeefffff)
  59#define IOVA_START_ADDR         (0x1000)
  60
  61#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  62
  63#define MAX_AGAW_WIDTH 64
  64
  65#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  66#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  67
  68/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  69   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  70#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  71                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  72#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  73
  74#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  75#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  76#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  77
  78/* page table handling */
  79#define LEVEL_STRIDE            (9)
  80#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  81
  82/*
  83 * This bitmap is used to advertise the page sizes our hardware support
  84 * to the IOMMU core, which will then use this information to split
  85 * physically contiguous memory regions it is mapping into page sizes
  86 * that we support.
  87 *
  88 * Traditionally the IOMMU core just handed us the mappings directly,
  89 * after making sure the size is an order of a 4KiB page and that the
  90 * mapping has natural alignment.
  91 *
  92 * To retain this behavior, we currently advertise that we support
  93 * all page sizes that are an order of 4KiB.
  94 *
  95 * If at some point we'd like to utilize the IOMMU core's new behavior,
  96 * we could change this to advertise the real page sizes we support.
  97 */
  98#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
  99
 100static inline int agaw_to_level(int agaw)
 101{
 102        return agaw + 2;
 103}
 104
 105static inline int agaw_to_width(int agaw)
 106{
 107        return 30 + agaw * LEVEL_STRIDE;
 108}
 109
 110static inline int width_to_agaw(int width)
 111{
 112        return (width - 30) / LEVEL_STRIDE;
 113}
 114
 115static inline unsigned int level_to_offset_bits(int level)
 116{
 117        return (level - 1) * LEVEL_STRIDE;
 118}
 119
 120static inline int pfn_level_offset(unsigned long pfn, int level)
 121{
 122        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 123}
 124
 125static inline unsigned long level_mask(int level)
 126{
 127        return -1UL << level_to_offset_bits(level);
 128}
 129
 130static inline unsigned long level_size(int level)
 131{
 132        return 1UL << level_to_offset_bits(level);
 133}
 134
 135static inline unsigned long align_to_level(unsigned long pfn, int level)
 136{
 137        return (pfn + level_size(level) - 1) & level_mask(level);
 138}
 139
 140static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 141{
 142        return  1 << ((lvl - 1) * LEVEL_STRIDE);
 143}
 144
 145/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 146   are never going to work. */
 147static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 148{
 149        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 150}
 151
 152static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 153{
 154        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 155}
 156static inline unsigned long page_to_dma_pfn(struct page *pg)
 157{
 158        return mm_to_dma_pfn(page_to_pfn(pg));
 159}
 160static inline unsigned long virt_to_dma_pfn(void *p)
 161{
 162        return page_to_dma_pfn(virt_to_page(p));
 163}
 164
 165/* global iommu list, set NULL for ignored DMAR units */
 166static struct intel_iommu **g_iommus;
 167
 168static void __init check_tylersburg_isoch(void);
 169static int rwbf_quirk;
 170
 171/*
 172 * set to 1 to panic kernel if can't successfully enable VT-d
 173 * (used when kernel is launched w/ TXT)
 174 */
 175static int force_on = 0;
 176
 177/*
 178 * 0: Present
 179 * 1-11: Reserved
 180 * 12-63: Context Ptr (12 - (haw-1))
 181 * 64-127: Reserved
 182 */
 183struct root_entry {
 184        u64     val;
 185        u64     rsvd1;
 186};
 187#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 188static inline bool root_present(struct root_entry *root)
 189{
 190        return (root->val & 1);
 191}
 192static inline void set_root_present(struct root_entry *root)
 193{
 194        root->val |= 1;
 195}
 196static inline void set_root_value(struct root_entry *root, unsigned long value)
 197{
 198        root->val |= value & VTD_PAGE_MASK;
 199}
 200
 201static inline struct context_entry *
 202get_context_addr_from_root(struct root_entry *root)
 203{
 204        return (struct context_entry *)
 205                (root_present(root)?phys_to_virt(
 206                root->val & VTD_PAGE_MASK) :
 207                NULL);
 208}
 209
 210/*
 211 * low 64 bits:
 212 * 0: present
 213 * 1: fault processing disable
 214 * 2-3: translation type
 215 * 12-63: address space root
 216 * high 64 bits:
 217 * 0-2: address width
 218 * 3-6: aval
 219 * 8-23: domain id
 220 */
 221struct context_entry {
 222        u64 lo;
 223        u64 hi;
 224};
 225
 226static inline bool context_present(struct context_entry *context)
 227{
 228        return (context->lo & 1);
 229}
 230static inline void context_set_present(struct context_entry *context)
 231{
 232        context->lo |= 1;
 233}
 234
 235static inline void context_set_fault_enable(struct context_entry *context)
 236{
 237        context->lo &= (((u64)-1) << 2) | 1;
 238}
 239
 240static inline void context_set_translation_type(struct context_entry *context,
 241                                                unsigned long value)
 242{
 243        context->lo &= (((u64)-1) << 4) | 3;
 244        context->lo |= (value & 3) << 2;
 245}
 246
 247static inline void context_set_address_root(struct context_entry *context,
 248                                            unsigned long value)
 249{
 250        context->lo |= value & VTD_PAGE_MASK;
 251}
 252
 253static inline void context_set_address_width(struct context_entry *context,
 254                                             unsigned long value)
 255{
 256        context->hi |= value & 7;
 257}
 258
 259static inline void context_set_domain_id(struct context_entry *context,
 260                                         unsigned long value)
 261{
 262        context->hi |= (value & ((1 << 16) - 1)) << 8;
 263}
 264
 265static inline void context_clear_entry(struct context_entry *context)
 266{
 267        context->lo = 0;
 268        context->hi = 0;
 269}
 270
 271/*
 272 * 0: readable
 273 * 1: writable
 274 * 2-6: reserved
 275 * 7: super page
 276 * 8-10: available
 277 * 11: snoop behavior
 278 * 12-63: Host physcial address
 279 */
 280struct dma_pte {
 281        u64 val;
 282};
 283
 284static inline void dma_clear_pte(struct dma_pte *pte)
 285{
 286        pte->val = 0;
 287}
 288
 289static inline void dma_set_pte_readable(struct dma_pte *pte)
 290{
 291        pte->val |= DMA_PTE_READ;
 292}
 293
 294static inline void dma_set_pte_writable(struct dma_pte *pte)
 295{
 296        pte->val |= DMA_PTE_WRITE;
 297}
 298
 299static inline void dma_set_pte_snp(struct dma_pte *pte)
 300{
 301        pte->val |= DMA_PTE_SNP;
 302}
 303
 304static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 305{
 306        pte->val = (pte->val & ~3) | (prot & 3);
 307}
 308
 309static inline u64 dma_pte_addr(struct dma_pte *pte)
 310{
 311#ifdef CONFIG_64BIT
 312        return pte->val & VTD_PAGE_MASK;
 313#else
 314        /* Must have a full atomic 64-bit read */
 315        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 316#endif
 317}
 318
 319static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 320{
 321        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 322}
 323
 324static inline bool dma_pte_present(struct dma_pte *pte)
 325{
 326        return (pte->val & 3) != 0;
 327}
 328
 329static inline bool dma_pte_superpage(struct dma_pte *pte)
 330{
 331        return (pte->val & (1 << 7));
 332}
 333
 334static inline int first_pte_in_page(struct dma_pte *pte)
 335{
 336        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 337}
 338
 339/*
 340 * This domain is a statically identity mapping domain.
 341 *      1. This domain creats a static 1:1 mapping to all usable memory.
 342 *      2. It maps to each iommu if successful.
 343 *      3. Each iommu mapps to this domain if successful.
 344 */
 345static struct dmar_domain *si_domain;
 346static int hw_pass_through = 1;
 347
 348/* devices under the same p2p bridge are owned in one domain */
 349#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 350
 351/* domain represents a virtual machine, more than one devices
 352 * across iommus may be owned in one domain, e.g. kvm guest.
 353 */
 354#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 355
 356/* si_domain contains mulitple devices */
 357#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 358
 359struct dmar_domain {
 360        int     id;                     /* domain id */
 361        int     nid;                    /* node id */
 362        unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
 363
 364        struct list_head devices;       /* all devices' list */
 365        struct iova_domain iovad;       /* iova's that belong to this domain */
 366
 367        struct dma_pte  *pgd;           /* virtual address */
 368        int             gaw;            /* max guest address width */
 369
 370        /* adjusted guest address width, 0 is level 2 30-bit */
 371        int             agaw;
 372
 373        int             flags;          /* flags to find out type of domain */
 374
 375        int             iommu_coherency;/* indicate coherency of iommu access */
 376        int             iommu_snooping; /* indicate snooping control feature*/
 377        int             iommu_count;    /* reference count of iommu */
 378        int             iommu_superpage;/* Level of superpages supported:
 379                                           0 == 4KiB (no superpages), 1 == 2MiB,
 380                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 381        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 382        u64             max_addr;       /* maximum mapped address */
 383};
 384
 385/* PCI domain-device relationship */
 386struct device_domain_info {
 387        struct list_head link;  /* link to domain siblings */
 388        struct list_head global; /* link to global list */
 389        int segment;            /* PCI domain */
 390        u8 bus;                 /* PCI bus number */
 391        u8 devfn;               /* PCI devfn number */
 392        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 393        struct intel_iommu *iommu; /* IOMMU used by this device */
 394        struct dmar_domain *domain; /* pointer to domain */
 395};
 396
 397static void flush_unmaps_timeout(unsigned long data);
 398
 399DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 400
 401#define HIGH_WATER_MARK 250
 402struct deferred_flush_tables {
 403        int next;
 404        struct iova *iova[HIGH_WATER_MARK];
 405        struct dmar_domain *domain[HIGH_WATER_MARK];
 406};
 407
 408static struct deferred_flush_tables *deferred_flush;
 409
 410/* bitmap for indexing intel_iommus */
 411static int g_num_of_iommus;
 412
 413static DEFINE_SPINLOCK(async_umap_flush_lock);
 414static LIST_HEAD(unmaps_to_do);
 415
 416static int timer_on;
 417static long list_size;
 418
 419static void domain_remove_dev_info(struct dmar_domain *domain);
 420
 421#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 422int dmar_disabled = 0;
 423#else
 424int dmar_disabled = 1;
 425#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 426
 427int intel_iommu_enabled = 0;
 428EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 429
 430static int dmar_map_gfx = 1;
 431static int dmar_forcedac;
 432static int intel_iommu_strict;
 433static int intel_iommu_superpage = 1;
 434
 435int intel_iommu_gfx_mapped;
 436EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 437
 438#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 439static DEFINE_SPINLOCK(device_domain_lock);
 440static LIST_HEAD(device_domain_list);
 441
 442static struct iommu_ops intel_iommu_ops;
 443
 444static int __init intel_iommu_setup(char *str)
 445{
 446        if (!str)
 447                return -EINVAL;
 448        while (*str) {
 449                if (!strncmp(str, "on", 2)) {
 450                        dmar_disabled = 0;
 451                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 452                } else if (!strncmp(str, "off", 3)) {
 453                        dmar_disabled = 1;
 454                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 455                } else if (!strncmp(str, "igfx_off", 8)) {
 456                        dmar_map_gfx = 0;
 457                        printk(KERN_INFO
 458                                "Intel-IOMMU: disable GFX device mapping\n");
 459                } else if (!strncmp(str, "forcedac", 8)) {
 460                        printk(KERN_INFO
 461                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 462                        dmar_forcedac = 1;
 463                } else if (!strncmp(str, "strict", 6)) {
 464                        printk(KERN_INFO
 465                                "Intel-IOMMU: disable batched IOTLB flush\n");
 466                        intel_iommu_strict = 1;
 467                } else if (!strncmp(str, "sp_off", 6)) {
 468                        printk(KERN_INFO
 469                                "Intel-IOMMU: disable supported super page\n");
 470                        intel_iommu_superpage = 0;
 471                }
 472
 473                str += strcspn(str, ",");
 474                while (*str == ',')
 475                        str++;
 476        }
 477        return 0;
 478}
 479__setup("intel_iommu=", intel_iommu_setup);
 480
 481static struct kmem_cache *iommu_domain_cache;
 482static struct kmem_cache *iommu_devinfo_cache;
 483static struct kmem_cache *iommu_iova_cache;
 484
 485static inline void *alloc_pgtable_page(int node)
 486{
 487        struct page *page;
 488        void *vaddr = NULL;
 489
 490        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 491        if (page)
 492                vaddr = page_address(page);
 493        return vaddr;
 494}
 495
 496static inline void free_pgtable_page(void *vaddr)
 497{
 498        free_page((unsigned long)vaddr);
 499}
 500
 501static inline void *alloc_domain_mem(void)
 502{
 503        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 504}
 505
 506static void free_domain_mem(void *vaddr)
 507{
 508        kmem_cache_free(iommu_domain_cache, vaddr);
 509}
 510
 511static inline void * alloc_devinfo_mem(void)
 512{
 513        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 514}
 515
 516static inline void free_devinfo_mem(void *vaddr)
 517{
 518        kmem_cache_free(iommu_devinfo_cache, vaddr);
 519}
 520
 521struct iova *alloc_iova_mem(void)
 522{
 523        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 524}
 525
 526void free_iova_mem(struct iova *iova)
 527{
 528        kmem_cache_free(iommu_iova_cache, iova);
 529}
 530
 531
 532static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 533{
 534        unsigned long sagaw;
 535        int agaw = -1;
 536
 537        sagaw = cap_sagaw(iommu->cap);
 538        for (agaw = width_to_agaw(max_gaw);
 539             agaw >= 0; agaw--) {
 540                if (test_bit(agaw, &sagaw))
 541                        break;
 542        }
 543
 544        return agaw;
 545}
 546
 547/*
 548 * Calculate max SAGAW for each iommu.
 549 */
 550int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 551{
 552        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 553}
 554
 555/*
 556 * calculate agaw for each iommu.
 557 * "SAGAW" may be different across iommus, use a default agaw, and
 558 * get a supported less agaw for iommus that don't support the default agaw.
 559 */
 560int iommu_calculate_agaw(struct intel_iommu *iommu)
 561{
 562        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 563}
 564
 565/* This functionin only returns single iommu in a domain */
 566static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 567{
 568        int iommu_id;
 569
 570        /* si_domain and vm domain should not get here. */
 571        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 572        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 573
 574        iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
 575        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 576                return NULL;
 577
 578        return g_iommus[iommu_id];
 579}
 580
 581static void domain_update_iommu_coherency(struct dmar_domain *domain)
 582{
 583        int i;
 584
 585        domain->iommu_coherency = 1;
 586
 587        for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
 588                if (!ecap_coherent(g_iommus[i]->ecap)) {
 589                        domain->iommu_coherency = 0;
 590                        break;
 591                }
 592        }
 593}
 594
 595static void domain_update_iommu_snooping(struct dmar_domain *domain)
 596{
 597        int i;
 598
 599        domain->iommu_snooping = 1;
 600
 601        for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
 602                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 603                        domain->iommu_snooping = 0;
 604                        break;
 605                }
 606        }
 607}
 608
 609static void domain_update_iommu_superpage(struct dmar_domain *domain)
 610{
 611        struct dmar_drhd_unit *drhd;
 612        struct intel_iommu *iommu = NULL;
 613        int mask = 0xf;
 614
 615        if (!intel_iommu_superpage) {
 616                domain->iommu_superpage = 0;
 617                return;
 618        }
 619
 620        /* set iommu_superpage to the smallest common denominator */
 621        for_each_active_iommu(iommu, drhd) {
 622                mask &= cap_super_page_val(iommu->cap);
 623                if (!mask) {
 624                        break;
 625                }
 626        }
 627        domain->iommu_superpage = fls(mask);
 628}
 629
 630/* Some capabilities may be different across iommus */
 631static void domain_update_iommu_cap(struct dmar_domain *domain)
 632{
 633        domain_update_iommu_coherency(domain);
 634        domain_update_iommu_snooping(domain);
 635        domain_update_iommu_superpage(domain);
 636}
 637
 638static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 639{
 640        struct dmar_drhd_unit *drhd = NULL;
 641        int i;
 642
 643        for_each_drhd_unit(drhd) {
 644                if (drhd->ignored)
 645                        continue;
 646                if (segment != drhd->segment)
 647                        continue;
 648
 649                for (i = 0; i < drhd->devices_cnt; i++) {
 650                        if (drhd->devices[i] &&
 651                            drhd->devices[i]->bus->number == bus &&
 652                            drhd->devices[i]->devfn == devfn)
 653                                return drhd->iommu;
 654                        if (drhd->devices[i] &&
 655                            drhd->devices[i]->subordinate &&
 656                            drhd->devices[i]->subordinate->number <= bus &&
 657                            drhd->devices[i]->subordinate->subordinate >= bus)
 658                                return drhd->iommu;
 659                }
 660
 661                if (drhd->include_all)
 662                        return drhd->iommu;
 663        }
 664
 665        return NULL;
 666}
 667
 668static void domain_flush_cache(struct dmar_domain *domain,
 669                               void *addr, int size)
 670{
 671        if (!domain->iommu_coherency)
 672                clflush_cache_range(addr, size);
 673}
 674
 675/* Gets context entry for a given bus and devfn */
 676static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 677                u8 bus, u8 devfn)
 678{
 679        struct root_entry *root;
 680        struct context_entry *context;
 681        unsigned long phy_addr;
 682        unsigned long flags;
 683
 684        spin_lock_irqsave(&iommu->lock, flags);
 685        root = &iommu->root_entry[bus];
 686        context = get_context_addr_from_root(root);
 687        if (!context) {
 688                context = (struct context_entry *)
 689                                alloc_pgtable_page(iommu->node);
 690                if (!context) {
 691                        spin_unlock_irqrestore(&iommu->lock, flags);
 692                        return NULL;
 693                }
 694                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 695                phy_addr = virt_to_phys((void *)context);
 696                set_root_value(root, phy_addr);
 697                set_root_present(root);
 698                __iommu_flush_cache(iommu, root, sizeof(*root));
 699        }
 700        spin_unlock_irqrestore(&iommu->lock, flags);
 701        return &context[devfn];
 702}
 703
 704static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 705{
 706        struct root_entry *root;
 707        struct context_entry *context;
 708        int ret;
 709        unsigned long flags;
 710
 711        spin_lock_irqsave(&iommu->lock, flags);
 712        root = &iommu->root_entry[bus];
 713        context = get_context_addr_from_root(root);
 714        if (!context) {
 715                ret = 0;
 716                goto out;
 717        }
 718        ret = context_present(&context[devfn]);
 719out:
 720        spin_unlock_irqrestore(&iommu->lock, flags);
 721        return ret;
 722}
 723
 724static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 725{
 726        struct root_entry *root;
 727        struct context_entry *context;
 728        unsigned long flags;
 729
 730        spin_lock_irqsave(&iommu->lock, flags);
 731        root = &iommu->root_entry[bus];
 732        context = get_context_addr_from_root(root);
 733        if (context) {
 734                context_clear_entry(&context[devfn]);
 735                __iommu_flush_cache(iommu, &context[devfn], \
 736                        sizeof(*context));
 737        }
 738        spin_unlock_irqrestore(&iommu->lock, flags);
 739}
 740
 741static void free_context_table(struct intel_iommu *iommu)
 742{
 743        struct root_entry *root;
 744        int i;
 745        unsigned long flags;
 746        struct context_entry *context;
 747
 748        spin_lock_irqsave(&iommu->lock, flags);
 749        if (!iommu->root_entry) {
 750                goto out;
 751        }
 752        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 753                root = &iommu->root_entry[i];
 754                context = get_context_addr_from_root(root);
 755                if (context)
 756                        free_pgtable_page(context);
 757        }
 758        free_pgtable_page(iommu->root_entry);
 759        iommu->root_entry = NULL;
 760out:
 761        spin_unlock_irqrestore(&iommu->lock, flags);
 762}
 763
 764static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 765                                      unsigned long pfn, int target_level)
 766{
 767        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 768        struct dma_pte *parent, *pte = NULL;
 769        int level = agaw_to_level(domain->agaw);
 770        int offset;
 771
 772        BUG_ON(!domain->pgd);
 773        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 774        parent = domain->pgd;
 775
 776        while (level > 0) {
 777                void *tmp_page;
 778
 779                offset = pfn_level_offset(pfn, level);
 780                pte = &parent[offset];
 781                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 782                        break;
 783                if (level == target_level)
 784                        break;
 785
 786                if (!dma_pte_present(pte)) {
 787                        uint64_t pteval;
 788
 789                        tmp_page = alloc_pgtable_page(domain->nid);
 790
 791                        if (!tmp_page)
 792                                return NULL;
 793
 794                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 795                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 796                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 797                                /* Someone else set it while we were thinking; use theirs. */
 798                                free_pgtable_page(tmp_page);
 799                        } else {
 800                                dma_pte_addr(pte);
 801                                domain_flush_cache(domain, pte, sizeof(*pte));
 802                        }
 803                }
 804                parent = phys_to_virt(dma_pte_addr(pte));
 805                level--;
 806        }
 807
 808        return pte;
 809}
 810
 811
 812/* return address's pte at specific level */
 813static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 814                                         unsigned long pfn,
 815                                         int level, int *large_page)
 816{
 817        struct dma_pte *parent, *pte = NULL;
 818        int total = agaw_to_level(domain->agaw);
 819        int offset;
 820
 821        parent = domain->pgd;
 822        while (level <= total) {
 823                offset = pfn_level_offset(pfn, total);
 824                pte = &parent[offset];
 825                if (level == total)
 826                        return pte;
 827
 828                if (!dma_pte_present(pte)) {
 829                        *large_page = total;
 830                        break;
 831                }
 832
 833                if (pte->val & DMA_PTE_LARGE_PAGE) {
 834                        *large_page = total;
 835                        return pte;
 836                }
 837
 838                parent = phys_to_virt(dma_pte_addr(pte));
 839                total--;
 840        }
 841        return NULL;
 842}
 843
 844/* clear last level pte, a tlb flush should be followed */
 845static int dma_pte_clear_range(struct dmar_domain *domain,
 846                                unsigned long start_pfn,
 847                                unsigned long last_pfn)
 848{
 849        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 850        unsigned int large_page = 1;
 851        struct dma_pte *first_pte, *pte;
 852        int order;
 853
 854        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 855        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 856        BUG_ON(start_pfn > last_pfn);
 857
 858        /* we don't need lock here; nobody else touches the iova range */
 859        do {
 860                large_page = 1;
 861                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 862                if (!pte) {
 863                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 864                        continue;
 865                }
 866                do {
 867                        dma_clear_pte(pte);
 868                        start_pfn += lvl_to_nr_pages(large_page);
 869                        pte++;
 870                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 871
 872                domain_flush_cache(domain, first_pte,
 873                                   (void *)pte - (void *)first_pte);
 874
 875        } while (start_pfn && start_pfn <= last_pfn);
 876
 877        order = (large_page - 1) * 9;
 878        return order;
 879}
 880
 881/* free page table pages. last level pte should already be cleared */
 882static void dma_pte_free_pagetable(struct dmar_domain *domain,
 883                                   unsigned long start_pfn,
 884                                   unsigned long last_pfn)
 885{
 886        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 887        struct dma_pte *first_pte, *pte;
 888        int total = agaw_to_level(domain->agaw);
 889        int level;
 890        unsigned long tmp;
 891        int large_page = 2;
 892
 893        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 894        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 895        BUG_ON(start_pfn > last_pfn);
 896
 897        /* We don't need lock here; nobody else touches the iova range */
 898        level = 2;
 899        while (level <= total) {
 900                tmp = align_to_level(start_pfn, level);
 901
 902                /* If we can't even clear one PTE at this level, we're done */
 903                if (tmp + level_size(level) - 1 > last_pfn)
 904                        return;
 905
 906                do {
 907                        large_page = level;
 908                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
 909                        if (large_page > level)
 910                                level = large_page + 1;
 911                        if (!pte) {
 912                                tmp = align_to_level(tmp + 1, level + 1);
 913                                continue;
 914                        }
 915                        do {
 916                                if (dma_pte_present(pte)) {
 917                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 918                                        dma_clear_pte(pte);
 919                                }
 920                                pte++;
 921                                tmp += level_size(level);
 922                        } while (!first_pte_in_page(pte) &&
 923                                 tmp + level_size(level) - 1 <= last_pfn);
 924
 925                        domain_flush_cache(domain, first_pte,
 926                                           (void *)pte - (void *)first_pte);
 927                        
 928                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 929                level++;
 930        }
 931        /* free pgd */
 932        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 933                free_pgtable_page(domain->pgd);
 934                domain->pgd = NULL;
 935        }
 936}
 937
 938/* iommu handling */
 939static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 940{
 941        struct root_entry *root;
 942        unsigned long flags;
 943
 944        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 945        if (!root)
 946                return -ENOMEM;
 947
 948        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 949
 950        spin_lock_irqsave(&iommu->lock, flags);
 951        iommu->root_entry = root;
 952        spin_unlock_irqrestore(&iommu->lock, flags);
 953
 954        return 0;
 955}
 956
 957static void iommu_set_root_entry(struct intel_iommu *iommu)
 958{
 959        void *addr;
 960        u32 sts;
 961        unsigned long flag;
 962
 963        addr = iommu->root_entry;
 964
 965        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 966        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 967
 968        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 969
 970        /* Make sure hardware complete it */
 971        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 972                      readl, (sts & DMA_GSTS_RTPS), sts);
 973
 974        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 975}
 976
 977static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 978{
 979        u32 val;
 980        unsigned long flag;
 981
 982        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 983                return;
 984
 985        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 986        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 987
 988        /* Make sure hardware complete it */
 989        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 990                      readl, (!(val & DMA_GSTS_WBFS)), val);
 991
 992        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 993}
 994
 995/* return value determine if we need a write buffer flush */
 996static void __iommu_flush_context(struct intel_iommu *iommu,
 997                                  u16 did, u16 source_id, u8 function_mask,
 998                                  u64 type)
 999{
1000        u64 val = 0;
1001        unsigned long flag;
1002
1003        switch (type) {
1004        case DMA_CCMD_GLOBAL_INVL:
1005                val = DMA_CCMD_GLOBAL_INVL;
1006                break;
1007        case DMA_CCMD_DOMAIN_INVL:
1008                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1009                break;
1010        case DMA_CCMD_DEVICE_INVL:
1011                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1012                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1013                break;
1014        default:
1015                BUG();
1016        }
1017        val |= DMA_CCMD_ICC;
1018
1019        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1020        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1021
1022        /* Make sure hardware complete it */
1023        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1024                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1025
1026        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1027}
1028
1029/* return value determine if we need a write buffer flush */
1030static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1031                                u64 addr, unsigned int size_order, u64 type)
1032{
1033        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1034        u64 val = 0, val_iva = 0;
1035        unsigned long flag;
1036
1037        switch (type) {
1038        case DMA_TLB_GLOBAL_FLUSH:
1039                /* global flush doesn't need set IVA_REG */
1040                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1041                break;
1042        case DMA_TLB_DSI_FLUSH:
1043                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1044                break;
1045        case DMA_TLB_PSI_FLUSH:
1046                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1047                /* Note: always flush non-leaf currently */
1048                val_iva = size_order | addr;
1049                break;
1050        default:
1051                BUG();
1052        }
1053        /* Note: set drain read/write */
1054#if 0
1055        /*
1056         * This is probably to be super secure.. Looks like we can
1057         * ignore it without any impact.
1058         */
1059        if (cap_read_drain(iommu->cap))
1060                val |= DMA_TLB_READ_DRAIN;
1061#endif
1062        if (cap_write_drain(iommu->cap))
1063                val |= DMA_TLB_WRITE_DRAIN;
1064
1065        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1066        /* Note: Only uses first TLB reg currently */
1067        if (val_iva)
1068                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1069        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1070
1071        /* Make sure hardware complete it */
1072        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1073                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1074
1075        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1076
1077        /* check IOTLB invalidation granularity */
1078        if (DMA_TLB_IAIG(val) == 0)
1079                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1080        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1081                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1082                        (unsigned long long)DMA_TLB_IIRG(type),
1083                        (unsigned long long)DMA_TLB_IAIG(val));
1084}
1085
1086static struct device_domain_info *iommu_support_dev_iotlb(
1087        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1088{
1089        int found = 0;
1090        unsigned long flags;
1091        struct device_domain_info *info;
1092        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1093
1094        if (!ecap_dev_iotlb_support(iommu->ecap))
1095                return NULL;
1096
1097        if (!iommu->qi)
1098                return NULL;
1099
1100        spin_lock_irqsave(&device_domain_lock, flags);
1101        list_for_each_entry(info, &domain->devices, link)
1102                if (info->bus == bus && info->devfn == devfn) {
1103                        found = 1;
1104                        break;
1105                }
1106        spin_unlock_irqrestore(&device_domain_lock, flags);
1107
1108        if (!found || !info->dev)
1109                return NULL;
1110
1111        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1112                return NULL;
1113
1114        if (!dmar_find_matched_atsr_unit(info->dev))
1115                return NULL;
1116
1117        info->iommu = iommu;
1118
1119        return info;
1120}
1121
1122static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1123{
1124        if (!info)
1125                return;
1126
1127        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1128}
1129
1130static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1131{
1132        if (!info->dev || !pci_ats_enabled(info->dev))
1133                return;
1134
1135        pci_disable_ats(info->dev);
1136}
1137
1138static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1139                                  u64 addr, unsigned mask)
1140{
1141        u16 sid, qdep;
1142        unsigned long flags;
1143        struct device_domain_info *info;
1144
1145        spin_lock_irqsave(&device_domain_lock, flags);
1146        list_for_each_entry(info, &domain->devices, link) {
1147                if (!info->dev || !pci_ats_enabled(info->dev))
1148                        continue;
1149
1150                sid = info->bus << 8 | info->devfn;
1151                qdep = pci_ats_queue_depth(info->dev);
1152                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1153        }
1154        spin_unlock_irqrestore(&device_domain_lock, flags);
1155}
1156
1157static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1158                                  unsigned long pfn, unsigned int pages, int map)
1159{
1160        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1161        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1162
1163        BUG_ON(pages == 0);
1164
1165        /*
1166         * Fallback to domain selective flush if no PSI support or the size is
1167         * too big.
1168         * PSI requires page size to be 2 ^ x, and the base address is naturally
1169         * aligned to the size
1170         */
1171        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1172                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1173                                                DMA_TLB_DSI_FLUSH);
1174        else
1175                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1176                                                DMA_TLB_PSI_FLUSH);
1177
1178        /*
1179         * In caching mode, changes of pages from non-present to present require
1180         * flush. However, device IOTLB doesn't need to be flushed in this case.
1181         */
1182        if (!cap_caching_mode(iommu->cap) || !map)
1183                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1184}
1185
1186static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1187{
1188        u32 pmen;
1189        unsigned long flags;
1190
1191        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1192        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1193        pmen &= ~DMA_PMEN_EPM;
1194        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1195
1196        /* wait for the protected region status bit to clear */
1197        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1198                readl, !(pmen & DMA_PMEN_PRS), pmen);
1199
1200        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1201}
1202
1203static int iommu_enable_translation(struct intel_iommu *iommu)
1204{
1205        u32 sts;
1206        unsigned long flags;
1207
1208        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1209        iommu->gcmd |= DMA_GCMD_TE;
1210        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1211
1212        /* Make sure hardware complete it */
1213        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1214                      readl, (sts & DMA_GSTS_TES), sts);
1215
1216        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1217        return 0;
1218}
1219
1220static int iommu_disable_translation(struct intel_iommu *iommu)
1221{
1222        u32 sts;
1223        unsigned long flag;
1224
1225        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1226        iommu->gcmd &= ~DMA_GCMD_TE;
1227        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1228
1229        /* Make sure hardware complete it */
1230        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1231                      readl, (!(sts & DMA_GSTS_TES)), sts);
1232
1233        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1234        return 0;
1235}
1236
1237
1238static int iommu_init_domains(struct intel_iommu *iommu)
1239{
1240        unsigned long ndomains;
1241        unsigned long nlongs;
1242
1243        ndomains = cap_ndoms(iommu->cap);
1244        pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1245                        ndomains);
1246        nlongs = BITS_TO_LONGS(ndomains);
1247
1248        spin_lock_init(&iommu->lock);
1249
1250        /* TBD: there might be 64K domains,
1251         * consider other allocation for future chip
1252         */
1253        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1254        if (!iommu->domain_ids) {
1255                printk(KERN_ERR "Allocating domain id array failed\n");
1256                return -ENOMEM;
1257        }
1258        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1259                        GFP_KERNEL);
1260        if (!iommu->domains) {
1261                printk(KERN_ERR "Allocating domain array failed\n");
1262                return -ENOMEM;
1263        }
1264
1265        /*
1266         * if Caching mode is set, then invalid translations are tagged
1267         * with domainid 0. Hence we need to pre-allocate it.
1268         */
1269        if (cap_caching_mode(iommu->cap))
1270                set_bit(0, iommu->domain_ids);
1271        return 0;
1272}
1273
1274
1275static void domain_exit(struct dmar_domain *domain);
1276static void vm_domain_exit(struct dmar_domain *domain);
1277
1278void free_dmar_iommu(struct intel_iommu *iommu)
1279{
1280        struct dmar_domain *domain;
1281        int i;
1282        unsigned long flags;
1283
1284        if ((iommu->domains) && (iommu->domain_ids)) {
1285                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1286                        domain = iommu->domains[i];
1287                        clear_bit(i, iommu->domain_ids);
1288
1289                        spin_lock_irqsave(&domain->iommu_lock, flags);
1290                        if (--domain->iommu_count == 0) {
1291                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1292                                        vm_domain_exit(domain);
1293                                else
1294                                        domain_exit(domain);
1295                        }
1296                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1297                }
1298        }
1299
1300        if (iommu->gcmd & DMA_GCMD_TE)
1301                iommu_disable_translation(iommu);
1302
1303        if (iommu->irq) {
1304                irq_set_handler_data(iommu->irq, NULL);
1305                /* This will mask the irq */
1306                free_irq(iommu->irq, iommu);
1307                destroy_irq(iommu->irq);
1308        }
1309
1310        kfree(iommu->domains);
1311        kfree(iommu->domain_ids);
1312
1313        g_iommus[iommu->seq_id] = NULL;
1314
1315        /* if all iommus are freed, free g_iommus */
1316        for (i = 0; i < g_num_of_iommus; i++) {
1317                if (g_iommus[i])
1318                        break;
1319        }
1320
1321        if (i == g_num_of_iommus)
1322                kfree(g_iommus);
1323
1324        /* free context mapping */
1325        free_context_table(iommu);
1326}
1327
1328static struct dmar_domain *alloc_domain(void)
1329{
1330        struct dmar_domain *domain;
1331
1332        domain = alloc_domain_mem();
1333        if (!domain)
1334                return NULL;
1335
1336        domain->nid = -1;
1337        memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1338        domain->flags = 0;
1339
1340        return domain;
1341}
1342
1343static int iommu_attach_domain(struct dmar_domain *domain,
1344                               struct intel_iommu *iommu)
1345{
1346        int num;
1347        unsigned long ndomains;
1348        unsigned long flags;
1349
1350        ndomains = cap_ndoms(iommu->cap);
1351
1352        spin_lock_irqsave(&iommu->lock, flags);
1353
1354        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1355        if (num >= ndomains) {
1356                spin_unlock_irqrestore(&iommu->lock, flags);
1357                printk(KERN_ERR "IOMMU: no free domain ids\n");
1358                return -ENOMEM;
1359        }
1360
1361        domain->id = num;
1362        set_bit(num, iommu->domain_ids);
1363        set_bit(iommu->seq_id, &domain->iommu_bmp);
1364        iommu->domains[num] = domain;
1365        spin_unlock_irqrestore(&iommu->lock, flags);
1366
1367        return 0;
1368}
1369
1370static void iommu_detach_domain(struct dmar_domain *domain,
1371                                struct intel_iommu *iommu)
1372{
1373        unsigned long flags;
1374        int num, ndomains;
1375        int found = 0;
1376
1377        spin_lock_irqsave(&iommu->lock, flags);
1378        ndomains = cap_ndoms(iommu->cap);
1379        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1380                if (iommu->domains[num] == domain) {
1381                        found = 1;
1382                        break;
1383                }
1384        }
1385
1386        if (found) {
1387                clear_bit(num, iommu->domain_ids);
1388                clear_bit(iommu->seq_id, &domain->iommu_bmp);
1389                iommu->domains[num] = NULL;
1390        }
1391        spin_unlock_irqrestore(&iommu->lock, flags);
1392}
1393
1394static struct iova_domain reserved_iova_list;
1395static struct lock_class_key reserved_rbtree_key;
1396
1397static int dmar_init_reserved_ranges(void)
1398{
1399        struct pci_dev *pdev = NULL;
1400        struct iova *iova;
1401        int i;
1402
1403        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1404
1405        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1406                &reserved_rbtree_key);
1407
1408        /* IOAPIC ranges shouldn't be accessed by DMA */
1409        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1410                IOVA_PFN(IOAPIC_RANGE_END));
1411        if (!iova) {
1412                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1413                return -ENODEV;
1414        }
1415
1416        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1417        for_each_pci_dev(pdev) {
1418                struct resource *r;
1419
1420                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1421                        r = &pdev->resource[i];
1422                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1423                                continue;
1424                        iova = reserve_iova(&reserved_iova_list,
1425                                            IOVA_PFN(r->start),
1426                                            IOVA_PFN(r->end));
1427                        if (!iova) {
1428                                printk(KERN_ERR "Reserve iova failed\n");
1429                                return -ENODEV;
1430                        }
1431                }
1432        }
1433        return 0;
1434}
1435
1436static void domain_reserve_special_ranges(struct dmar_domain *domain)
1437{
1438        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1439}
1440
/*
 * Round a guest address width up to the next value of the form 12 + 9 * k
 * and cap the result at 64.  Widths already of that form are returned
 * unchanged.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int excess = (gaw - 12) % 9;
        int width = excess ? gaw + 9 - excess : gaw;

        return width > 64 ? 64 : width;
}
1454
1455static int domain_init(struct dmar_domain *domain, int guest_width)
1456{
1457        struct intel_iommu *iommu;
1458        int adjust_width, agaw;
1459        unsigned long sagaw;
1460
1461        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1462        spin_lock_init(&domain->iommu_lock);
1463
1464        domain_reserve_special_ranges(domain);
1465
1466        /* calculate AGAW */
1467        iommu = domain_get_iommu(domain);
1468        if (guest_width > cap_mgaw(iommu->cap))
1469                guest_width = cap_mgaw(iommu->cap);
1470        domain->gaw = guest_width;
1471        adjust_width = guestwidth_to_adjustwidth(guest_width);
1472        agaw = width_to_agaw(adjust_width);
1473        sagaw = cap_sagaw(iommu->cap);
1474        if (!test_bit(agaw, &sagaw)) {
1475                /* hardware doesn't support it, choose a bigger one */
1476                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1477                agaw = find_next_bit(&sagaw, 5, agaw);
1478                if (agaw >= 5)
1479                        return -ENODEV;
1480        }
1481        domain->agaw = agaw;
1482        INIT_LIST_HEAD(&domain->devices);
1483
1484        if (ecap_coherent(iommu->ecap))
1485                domain->iommu_coherency = 1;
1486        else
1487                domain->iommu_coherency = 0;
1488
1489        if (ecap_sc_support(iommu->ecap))
1490                domain->iommu_snooping = 1;
1491        else
1492                domain->iommu_snooping = 0;
1493
1494        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1495        domain->iommu_count = 1;
1496        domain->nid = iommu->node;
1497
1498        /* always allocate the top pgd */
1499        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1500        if (!domain->pgd)
1501                return -ENOMEM;
1502        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1503        return 0;
1504}
1505
1506static void domain_exit(struct dmar_domain *domain)
1507{
1508        struct dmar_drhd_unit *drhd;
1509        struct intel_iommu *iommu;
1510
1511        /* Domain 0 is reserved, so dont process it */
1512        if (!domain)
1513                return;
1514
1515        /* Flush any lazy unmaps that may reference this domain */
1516        if (!intel_iommu_strict)
1517                flush_unmaps_timeout(0);
1518
1519        domain_remove_dev_info(domain);
1520        /* destroy iovas */
1521        put_iova_domain(&domain->iovad);
1522
1523        /* clear ptes */
1524        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1525
1526        /* free page tables */
1527        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1528
1529        for_each_active_iommu(iommu, drhd)
1530                if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1531                        iommu_detach_domain(domain, iommu);
1532
1533        free_domain_mem(domain);
1534}
1535
1536static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1537                                 u8 bus, u8 devfn, int translation)
1538{
1539        struct context_entry *context;
1540        unsigned long flags;
1541        struct intel_iommu *iommu;
1542        struct dma_pte *pgd;
1543        unsigned long num;
1544        unsigned long ndomains;
1545        int id;
1546        int agaw;
1547        struct device_domain_info *info = NULL;
1548
1549        pr_debug("Set context mapping for %02x:%02x.%d\n",
1550                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1551
1552        BUG_ON(!domain->pgd);
1553        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1554               translation != CONTEXT_TT_MULTI_LEVEL);
1555
1556        iommu = device_to_iommu(segment, bus, devfn);
1557        if (!iommu)
1558                return -ENODEV;
1559
1560        context = device_to_context_entry(iommu, bus, devfn);
1561        if (!context)
1562                return -ENOMEM;
1563        spin_lock_irqsave(&iommu->lock, flags);
1564        if (context_present(context)) {
1565                spin_unlock_irqrestore(&iommu->lock, flags);
1566                return 0;
1567        }
1568
1569        id = domain->id;
1570        pgd = domain->pgd;
1571
1572        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1573            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1574                int found = 0;
1575
1576                /* find an available domain id for this device in iommu */
1577                ndomains = cap_ndoms(iommu->cap);
1578                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1579                        if (iommu->domains[num] == domain) {
1580                                id = num;
1581                                found = 1;
1582                                break;
1583                        }
1584                }
1585
1586                if (found == 0) {
1587                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1588                        if (num >= ndomains) {
1589                                spin_unlock_irqrestore(&iommu->lock, flags);
1590                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1591                                return -EFAULT;
1592                        }
1593
1594                        set_bit(num, iommu->domain_ids);
1595                        iommu->domains[num] = domain;
1596                        id = num;
1597                }
1598
1599                /* Skip top levels of page tables for
1600                 * iommu which has less agaw than default.
1601                 * Unnecessary for PT mode.
1602                 */
1603                if (translation != CONTEXT_TT_PASS_THROUGH) {
1604                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1605                                pgd = phys_to_virt(dma_pte_addr(pgd));
1606                                if (!dma_pte_present(pgd)) {
1607                                        spin_unlock_irqrestore(&iommu->lock, flags);
1608                                        return -ENOMEM;
1609                                }
1610                        }
1611                }
1612        }
1613
1614        context_set_domain_id(context, id);
1615
1616        if (translation != CONTEXT_TT_PASS_THROUGH) {
1617                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1618                translation = info ? CONTEXT_TT_DEV_IOTLB :
1619                                     CONTEXT_TT_MULTI_LEVEL;
1620        }
1621        /*
1622         * In pass through mode, AW must be programmed to indicate the largest
1623         * AGAW value supported by hardware. And ASR is ignored by hardware.
1624         */
1625        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1626                context_set_address_width(context, iommu->msagaw);
1627        else {
1628                context_set_address_root(context, virt_to_phys(pgd));
1629                context_set_address_width(context, iommu->agaw);
1630        }
1631
1632        context_set_translation_type(context, translation);
1633        context_set_fault_enable(context);
1634        context_set_present(context);
1635        domain_flush_cache(domain, context, sizeof(*context));
1636
1637        /*
1638         * It's a non-present to present mapping. If hardware doesn't cache
1639         * non-present entry we only need to flush the write-buffer. If the
1640         * _does_ cache non-present entries, then it does so in the special
1641         * domain #0, which we have to flush:
1642         */
1643        if (cap_caching_mode(iommu->cap)) {
1644                iommu->flush.flush_context(iommu, 0,
1645                                           (((u16)bus) << 8) | devfn,
1646                                           DMA_CCMD_MASK_NOBIT,
1647                                           DMA_CCMD_DEVICE_INVL);
1648                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1649        } else {
1650                iommu_flush_write_buffer(iommu);
1651        }
1652        iommu_enable_dev_iotlb(info);
1653        spin_unlock_irqrestore(&iommu->lock, flags);
1654
1655        spin_lock_irqsave(&domain->iommu_lock, flags);
1656        if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1657                domain->iommu_count++;
1658                if (domain->iommu_count == 1)
1659                        domain->nid = iommu->node;
1660                domain_update_iommu_cap(domain);
1661        }
1662        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1663        return 0;
1664}
1665
1666static int
1667domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1668                        int translation)
1669{
1670        int ret;
1671        struct pci_dev *tmp, *parent;
1672
1673        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1674                                         pdev->bus->number, pdev->devfn,
1675                                         translation);
1676        if (ret)
1677                return ret;
1678
1679        /* dependent device mapping */
1680        tmp = pci_find_upstream_pcie_bridge(pdev);
1681        if (!tmp)
1682                return 0;
1683        /* Secondary interface's bus number and devfn 0 */
1684        parent = pdev->bus->self;
1685        while (parent != tmp) {
1686                ret = domain_context_mapping_one(domain,
1687                                                 pci_domain_nr(parent->bus),
1688                                                 parent->bus->number,
1689                                                 parent->devfn, translation);
1690                if (ret)
1691                        return ret;
1692                parent = parent->bus->self;
1693        }
1694        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1695                return domain_context_mapping_one(domain,
1696                                        pci_domain_nr(tmp->subordinate),
1697                                        tmp->subordinate->number, 0,
1698                                        translation);
1699        else /* this is a legacy PCI bridge */
1700                return domain_context_mapping_one(domain,
1701                                                  pci_domain_nr(tmp->bus),
1702                                                  tmp->bus->number,
1703                                                  tmp->devfn,
1704                                                  translation);
1705}
1706
1707static int domain_context_mapped(struct pci_dev *pdev)
1708{
1709        int ret;
1710        struct pci_dev *tmp, *parent;
1711        struct intel_iommu *iommu;
1712
1713        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1714                                pdev->devfn);
1715        if (!iommu)
1716                return -ENODEV;
1717
1718        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1719        if (!ret)
1720                return ret;
1721        /* dependent device mapping */
1722        tmp = pci_find_upstream_pcie_bridge(pdev);
1723        if (!tmp)
1724                return ret;
1725        /* Secondary interface's bus number and devfn 0 */
1726        parent = pdev->bus->self;
1727        while (parent != tmp) {
1728                ret = device_context_mapped(iommu, parent->bus->number,
1729                                            parent->devfn);
1730                if (!ret)
1731                        return ret;
1732                parent = parent->bus->self;
1733        }
1734        if (pci_is_pcie(tmp))
1735                return device_context_mapped(iommu, tmp->subordinate->number,
1736                                             0);
1737        else
1738                return device_context_mapped(iommu, tmp->bus->number,
1739                                             tmp->devfn);
1740}
1741
1742/* Returns a number of VTD pages, but aligned to MM page size */
1743static inline unsigned long aligned_nrpages(unsigned long host_addr,
1744                                            size_t size)
1745{
1746        host_addr &= ~PAGE_MASK;
1747        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1748}
1749
1750/* Return largest possible superpage level for a given mapping */
1751static inline int hardware_largepage_caps(struct dmar_domain *domain,
1752                                          unsigned long iov_pfn,
1753                                          unsigned long phy_pfn,
1754                                          unsigned long pages)
1755{
1756        int support, level = 1;
1757        unsigned long pfnmerge;
1758
1759        support = domain->iommu_superpage;
1760
1761        /* To use a large page, the virtual *and* physical addresses
1762           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1763           of them will mean we have to use smaller pages. So just
1764           merge them and check both at once. */
1765        pfnmerge = iov_pfn | phy_pfn;
1766
1767        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1768                pages >>= VTD_STRIDE_SHIFT;
1769                if (!pages)
1770                        break;
1771                pfnmerge >>= VTD_STRIDE_SHIFT;
1772                level++;
1773                support--;
1774        }
1775        return level;
1776}
1777
1778static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1779                            struct scatterlist *sg, unsigned long phys_pfn,
1780                            unsigned long nr_pages, int prot)
1781{
1782        struct dma_pte *first_pte = NULL, *pte = NULL;
1783        phys_addr_t uninitialized_var(pteval);
1784        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1785        unsigned long sg_res;
1786        unsigned int largepage_lvl = 0;
1787        unsigned long lvl_pages = 0;
1788
1789        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1790
1791        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1792                return -EINVAL;
1793
1794        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1795
1796        if (sg)
1797                sg_res = 0;
1798        else {
1799                sg_res = nr_pages + 1;
1800                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1801        }
1802
1803        while (nr_pages > 0) {
1804                uint64_t tmp;
1805
1806                if (!sg_res) {
1807                        sg_res = aligned_nrpages(sg->offset, sg->length);
1808                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1809                        sg->dma_length = sg->length;
1810                        pteval = page_to_phys(sg_page(sg)) | prot;
1811                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1812                }
1813
1814                if (!pte) {
1815                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1816
1817                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1818                        if (!pte)
1819                                return -ENOMEM;
1820                        /* It is large page*/
1821                        if (largepage_lvl > 1)
1822                                pteval |= DMA_PTE_LARGE_PAGE;
1823                        else
1824                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1825
1826                }
1827                /* We don't need lock here, nobody else
1828                 * touches the iova range
1829                 */
1830                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1831                if (tmp) {
1832                        static int dumps = 5;
1833                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1834                               iov_pfn, tmp, (unsigned long long)pteval);
1835                        if (dumps) {
1836                                dumps--;
1837                                debug_dma_dump_mappings(NULL);
1838                        }
1839                        WARN_ON(1);
1840                }
1841
1842                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1843
1844                BUG_ON(nr_pages < lvl_pages);
1845                BUG_ON(sg_res < lvl_pages);
1846
1847                nr_pages -= lvl_pages;
1848                iov_pfn += lvl_pages;
1849                phys_pfn += lvl_pages;
1850                pteval += lvl_pages * VTD_PAGE_SIZE;
1851                sg_res -= lvl_pages;
1852
1853                /* If the next PTE would be the first in a new page, then we
1854                   need to flush the cache on the entries we've just written.
1855                   And then we'll need to recalculate 'pte', so clear it and
1856                   let it get set again in the if (!pte) block above.
1857
1858                   If we're done (!nr_pages) we need to flush the cache too.
1859
1860                   Also if we've been setting superpages, we may need to
1861                   recalculate 'pte' and switch back to smaller pages for the
1862                   end of the mapping, if the trailing size is not enough to
1863                   use another superpage (i.e. sg_res < lvl_pages). */
1864                pte++;
1865                if (!nr_pages || first_pte_in_page(pte) ||
1866                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1867                        domain_flush_cache(domain, first_pte,
1868                                           (void *)pte - (void *)first_pte);
1869                        pte = NULL;
1870                }
1871
1872                if (!sg_res && nr_pages)
1873                        sg = sg_next(sg);
1874        }
1875        return 0;
1876}
1877
/*
 * Map the scatterlist @sg into @domain at @iov_pfn.  Thin wrapper around
 * __domain_mapping() with the unused physical PFN argument set to 0.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain,
				    unsigned long iov_pfn,
				    struct scatterlist *sg,
				    unsigned long nr_pages,
				    int prot)
{
	const unsigned long unused_phys_pfn = 0;

	return __domain_mapping(domain, iov_pfn, sg, unused_phys_pfn,
				nr_pages, prot);
}
1884
1885static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1886                                     unsigned long phys_pfn, unsigned long nr_pages,
1887                                     int prot)
1888{
1889        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1890}
1891
1892static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1893{
1894        if (!iommu)
1895                return;
1896
1897        clear_context_table(iommu, bus, devfn);
1898        iommu->flush.flush_context(iommu, 0, 0, 0,
1899                                           DMA_CCMD_GLOBAL_INVL);
1900        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1901}
1902
1903static void domain_remove_dev_info(struct dmar_domain *domain)
1904{
1905        struct device_domain_info *info;
1906        unsigned long flags;
1907        struct intel_iommu *iommu;
1908
1909        spin_lock_irqsave(&device_domain_lock, flags);
1910        while (!list_empty(&domain->devices)) {
1911                info = list_entry(domain->devices.next,
1912                        struct device_domain_info, link);
1913                list_del(&info->link);
1914                list_del(&info->global);
1915                if (info->dev)
1916                        info->dev->dev.archdata.iommu = NULL;
1917                spin_unlock_irqrestore(&device_domain_lock, flags);
1918
1919                iommu_disable_dev_iotlb(info);
1920                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1921                iommu_detach_dev(iommu, info->bus, info->devfn);
1922                free_devinfo_mem(info);
1923
1924                spin_lock_irqsave(&device_domain_lock, flags);
1925        }
1926        spin_unlock_irqrestore(&device_domain_lock, flags);
1927}
1928
1929/*
1930 * find_domain
1931 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1932 */
1933static struct dmar_domain *
1934find_domain(struct pci_dev *pdev)
1935{
1936        struct device_domain_info *info;
1937
1938        /* No lock here, assumes no domain exit in normal case */
1939        info = pdev->dev.archdata.iommu;
1940        if (info)
1941                return info->domain;
1942        return NULL;
1943}
1944
1945/* domain is initialized */
1946static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1947{
1948        struct dmar_domain *domain, *found = NULL;
1949        struct intel_iommu *iommu;
1950        struct dmar_drhd_unit *drhd;
1951        struct device_domain_info *info, *tmp;
1952        struct pci_dev *dev_tmp;
1953        unsigned long flags;
1954        int bus = 0, devfn = 0;
1955        int segment;
1956        int ret;
1957
1958        domain = find_domain(pdev);
1959        if (domain)
1960                return domain;
1961
1962        segment = pci_domain_nr(pdev->bus);
1963
1964        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1965        if (dev_tmp) {
1966                if (pci_is_pcie(dev_tmp)) {
1967                        bus = dev_tmp->subordinate->number;
1968                        devfn = 0;
1969                } else {
1970                        bus = dev_tmp->bus->number;
1971                        devfn = dev_tmp->devfn;
1972                }
1973                spin_lock_irqsave(&device_domain_lock, flags);
1974                list_for_each_entry(info, &device_domain_list, global) {
1975                        if (info->segment == segment &&
1976                            info->bus == bus && info->devfn == devfn) {
1977                                found = info->domain;
1978                                break;
1979                        }
1980                }
1981                spin_unlock_irqrestore(&device_domain_lock, flags);
1982                /* pcie-pci bridge already has a domain, uses it */
1983                if (found) {
1984                        domain = found;
1985                        goto found_domain;
1986                }
1987        }
1988
1989        domain = alloc_domain();
1990        if (!domain)
1991                goto error;
1992
1993        /* Allocate new domain for the device */
1994        drhd = dmar_find_matched_drhd_unit(pdev);
1995        if (!drhd) {
1996                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1997                        pci_name(pdev));
1998                return NULL;
1999        }
2000        iommu = drhd->iommu;
2001
2002        ret = iommu_attach_domain(domain, iommu);
2003        if (ret) {
2004                free_domain_mem(domain);
2005                goto error;
2006        }
2007
2008        if (domain_init(domain, gaw)) {
2009                domain_exit(domain);
2010                goto error;
2011        }
2012
2013        /* register pcie-to-pci device */
2014        if (dev_tmp) {
2015                info = alloc_devinfo_mem();
2016                if (!info) {
2017                        domain_exit(domain);
2018                        goto error;
2019                }
2020                info->segment = segment;
2021                info->bus = bus;
2022                info->devfn = devfn;
2023                info->dev = NULL;
2024                info->domain = domain;
2025                /* This domain is shared by devices under p2p bridge */
2026                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2027
2028                /* pcie-to-pci bridge already has a domain, uses it */
2029                found = NULL;
2030                spin_lock_irqsave(&device_domain_lock, flags);
2031                list_for_each_entry(tmp, &device_domain_list, global) {
2032                        if (tmp->segment == segment &&
2033                            tmp->bus == bus && tmp->devfn == devfn) {
2034                                found = tmp->domain;
2035                                break;
2036                        }
2037                }
2038                if (found) {
2039                        spin_unlock_irqrestore(&device_domain_lock, flags);
2040                        free_devinfo_mem(info);
2041                        domain_exit(domain);
2042                        domain = found;
2043                } else {
2044                        list_add(&info->link, &domain->devices);
2045                        list_add(&info->global, &device_domain_list);
2046                        spin_unlock_irqrestore(&device_domain_lock, flags);
2047                }
2048        }
2049
2050found_domain:
2051        info = alloc_devinfo_mem();
2052        if (!info)
2053                goto error;
2054        info->segment = segment;
2055        info->bus = pdev->bus->number;
2056        info->devfn = pdev->devfn;
2057        info->dev = pdev;
2058        info->domain = domain;
2059        spin_lock_irqsave(&device_domain_lock, flags);
2060        /* somebody is fast */
2061        found = find_domain(pdev);
2062        if (found != NULL) {
2063                spin_unlock_irqrestore(&device_domain_lock, flags);
2064                if (found != domain) {
2065                        domain_exit(domain);
2066                        domain = found;
2067                }
2068                free_devinfo_mem(info);
2069                return domain;
2070        }
2071        list_add(&info->link, &domain->devices);
2072        list_add(&info->global, &device_domain_list);
2073        pdev->dev.archdata.iommu = info;
2074        spin_unlock_irqrestore(&device_domain_lock, flags);
2075        return domain;
2076error:
2077        /* recheck it here, maybe others set it */
2078        return find_domain(pdev);
2079}
2080
/*
 * Bitmask of IDENTMAP_* flags selecting which devices are placed in the
 * static identity (1:1) domain.
 */
static int iommu_identity_mapping;
#define IDENTMAP_ALL		(1 << 0)	/* every eligible device */
#define IDENTMAP_GFX		(1 << 1)	/* devices matching IS_GFX_DEVICE() */
#define IDENTMAP_AZALIA		(1 << 2)	/* devices matching IS_AZALIA() */
2085
2086static int iommu_domain_identity_map(struct dmar_domain *domain,
2087                                     unsigned long long start,
2088                                     unsigned long long end)
2089{
2090        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2091        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2092
2093        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2094                          dma_to_mm_pfn(last_vpfn))) {
2095                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2096                return -ENOMEM;
2097        }
2098
2099        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2100                 start, end, domain->id);
2101        /*
2102         * RMRR range might have overlap with physical memory range,
2103         * clear it first
2104         */
2105        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2106
2107        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2108                                  last_vpfn - first_vpfn + 1,
2109                                  DMA_PTE_READ|DMA_PTE_WRITE);
2110}
2111
2112static int iommu_prepare_identity_map(struct pci_dev *pdev,
2113                                      unsigned long long start,
2114                                      unsigned long long end)
2115{
2116        struct dmar_domain *domain;
2117        int ret;
2118
2119        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2120        if (!domain)
2121                return -ENOMEM;
2122
2123        /* For _hardware_ passthrough, don't bother. But for software
2124           passthrough, we do it anyway -- it may indicate a memory
2125           range which is reserved in E820, so which didn't get set
2126           up to start with in si_domain */
2127        if (domain == si_domain && hw_pass_through) {
2128                printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2129                       pci_name(pdev), start, end);
2130                return 0;
2131        }
2132
2133        printk(KERN_INFO
2134               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2135               pci_name(pdev), start, end);
2136        
2137        if (end < start) {
2138                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2139                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2140                        dmi_get_system_info(DMI_BIOS_VENDOR),
2141                        dmi_get_system_info(DMI_BIOS_VERSION),
2142                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2143                ret = -EIO;
2144                goto error;
2145        }
2146
2147        if (end >> agaw_to_width(domain->agaw)) {
2148                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2149                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2150                     agaw_to_width(domain->agaw),
2151                     dmi_get_system_info(DMI_BIOS_VENDOR),
2152                     dmi_get_system_info(DMI_BIOS_VERSION),
2153                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2154                ret = -EIO;
2155                goto error;
2156        }
2157
2158        ret = iommu_domain_identity_map(domain, start, end);
2159        if (ret)
2160                goto error;
2161
2162        /* context entry init */
2163        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2164        if (ret)
2165                goto error;
2166
2167        return 0;
2168
2169 error:
2170        domain_exit(domain);
2171        return ret;
2172}
2173
2174static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2175        struct pci_dev *pdev)
2176{
2177        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2178                return 0;
2179        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2180                rmrr->end_address);
2181}
2182
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: identity-map the first 16MiB for the ISA bridge, if
 * the system has one.  A failure is only logged.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	/* pci_get_class() returns the device with a reference held. */
	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

	/* Drop the reference taken by pci_get_class() above. */
	pci_dev_put(pdev);
}
#else
/* Workaround not configured: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2207
2208static int md_domain_init(struct dmar_domain *domain, int guest_width);
2209
2210static int __init si_domain_init(int hw)
2211{
2212        struct dmar_drhd_unit *drhd;
2213        struct intel_iommu *iommu;
2214        int nid, ret = 0;
2215
2216        si_domain = alloc_domain();
2217        if (!si_domain)
2218                return -EFAULT;
2219
2220        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2221
2222        for_each_active_iommu(iommu, drhd) {
2223                ret = iommu_attach_domain(si_domain, iommu);
2224                if (ret) {
2225                        domain_exit(si_domain);
2226                        return -EFAULT;
2227                }
2228        }
2229
2230        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2231                domain_exit(si_domain);
2232                return -EFAULT;
2233        }
2234
2235        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2236
2237        if (hw)
2238                return 0;
2239
2240        for_each_online_node(nid) {
2241                unsigned long start_pfn, end_pfn;
2242                int i;
2243
2244                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2245                        ret = iommu_domain_identity_map(si_domain,
2246                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2247                        if (ret)
2248                                return ret;
2249                }
2250        }
2251
2252        return 0;
2253}
2254
2255static void domain_remove_one_dev_info(struct dmar_domain *domain,
2256                                          struct pci_dev *pdev);
2257static int identity_mapping(struct pci_dev *pdev)
2258{
2259        struct device_domain_info *info;
2260
2261        if (likely(!iommu_identity_mapping))
2262                return 0;
2263
2264        info = pdev->dev.archdata.iommu;
2265        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2266                return (info->domain == si_domain);
2267
2268        return 0;
2269}
2270
2271static int domain_add_dev_info(struct dmar_domain *domain,
2272                               struct pci_dev *pdev,
2273                               int translation)
2274{
2275        struct device_domain_info *info;
2276        unsigned long flags;
2277        int ret;
2278
2279        info = alloc_devinfo_mem();
2280        if (!info)
2281                return -ENOMEM;
2282
2283        ret = domain_context_mapping(domain, pdev, translation);
2284        if (ret) {
2285                free_devinfo_mem(info);
2286                return ret;
2287        }
2288
2289        info->segment = pci_domain_nr(pdev->bus);
2290        info->bus = pdev->bus->number;
2291        info->devfn = pdev->devfn;
2292        info->dev = pdev;
2293        info->domain = domain;
2294
2295        spin_lock_irqsave(&device_domain_lock, flags);
2296        list_add(&info->link, &domain->devices);
2297        list_add(&info->global, &device_domain_list);
2298        pdev->dev.archdata.iommu = info;
2299        spin_unlock_irqrestore(&device_domain_lock, flags);
2300
2301        return 0;
2302}
2303
2304static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2305{
2306        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2307                return 1;
2308
2309        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2310                return 1;
2311
2312        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2313                return 0;
2314
2315        /*
2316         * We want to start off with all devices in the 1:1 domain, and
2317         * take them out later if we find they can't access all of memory.
2318         *
2319         * However, we can't do this for PCI devices behind bridges,
2320         * because all PCI devices behind the same bridge will end up
2321         * with the same source-id on their transactions.
2322         *
2323         * Practically speaking, we can't change things around for these
2324         * devices at run-time, because we can't be sure there'll be no
2325         * DMA transactions in flight for any of their siblings.
2326         * 
2327         * So PCI devices (unless they're on the root bus) as well as
2328         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2329         * the 1:1 domain, just in _case_ one of their siblings turns out
2330         * not to be able to map all of memory.
2331         */
2332        if (!pci_is_pcie(pdev)) {
2333                if (!pci_is_root_bus(pdev->bus))
2334                        return 0;
2335                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2336                        return 0;
2337        } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2338                return 0;
2339
2340        /* 
2341         * At boot time, we don't yet know if devices will be 64-bit capable.
2342         * Assume that they will -- if they turn out not to be, then we can 
2343         * take them out of the 1:1 domain later.
2344         */
2345        if (!startup) {
2346                /*
2347                 * If the device's dma_mask is less than the system's memory
2348                 * size then this is not a candidate for identity mapping.
2349                 */
2350                u64 dma_mask = pdev->dma_mask;
2351
2352                if (pdev->dev.coherent_dma_mask &&
2353                    pdev->dev.coherent_dma_mask < dma_mask)
2354                        dma_mask = pdev->dev.coherent_dma_mask;
2355
2356                return dma_mask >= dma_get_required_mask(&pdev->dev);
2357        }
2358
2359        return 1;
2360}
2361
2362static int __init iommu_prepare_static_identity_mapping(int hw)
2363{
2364        struct pci_dev *pdev = NULL;
2365        int ret;
2366
2367        ret = si_domain_init(hw);
2368        if (ret)
2369                return -EFAULT;
2370
2371        for_each_pci_dev(pdev) {
2372                /* Skip Host/PCI Bridge devices */
2373                if (IS_BRIDGE_HOST_DEVICE(pdev))
2374                        continue;
2375                if (iommu_should_identity_map(pdev, 1)) {
2376                        printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2377                               hw ? "hardware" : "software", pci_name(pdev));
2378
2379                        ret = domain_add_dev_info(si_domain, pdev,
2380                                                     hw ? CONTEXT_TT_PASS_THROUGH :
2381                                                     CONTEXT_TT_MULTI_LEVEL);
2382                        if (ret)
2383                                return ret;
2384                }
2385        }
2386
2387        return 0;
2388}
2389
2390static int __init init_dmars(void)
2391{
2392        struct dmar_drhd_unit *drhd;
2393        struct dmar_rmrr_unit *rmrr;
2394        struct pci_dev *pdev;
2395        struct intel_iommu *iommu;
2396        int i, ret;
2397
2398        /*
2399         * for each drhd
2400         *    allocate root
2401         *    initialize and program root entry to not present
2402         * endfor
2403         */
2404        for_each_drhd_unit(drhd) {
2405                g_num_of_iommus++;
2406                /*
2407                 * lock not needed as this is only incremented in the single
2408                 * threaded kernel __init code path all other access are read
2409                 * only
2410                 */
2411        }
2412
2413        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2414                        GFP_KERNEL);
2415        if (!g_iommus) {
2416                printk(KERN_ERR "Allocating global iommu array failed\n");
2417                ret = -ENOMEM;
2418                goto error;
2419        }
2420
2421        deferred_flush = kzalloc(g_num_of_iommus *
2422                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2423        if (!deferred_flush) {
2424                ret = -ENOMEM;
2425                goto error;
2426        }
2427
2428        for_each_drhd_unit(drhd) {
2429                if (drhd->ignored)
2430                        continue;
2431
2432                iommu = drhd->iommu;
2433                g_iommus[iommu->seq_id] = iommu;
2434
2435                ret = iommu_init_domains(iommu);
2436                if (ret)
2437                        goto error;
2438
2439                /*
2440                 * TBD:
2441                 * we could share the same root & context tables
2442                 * among all IOMMU's. Need to Split it later.
2443                 */
2444                ret = iommu_alloc_root_entry(iommu);
2445                if (ret) {
2446                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2447                        goto error;
2448                }
2449                if (!ecap_pass_through(iommu->ecap))
2450                        hw_pass_through = 0;
2451        }
2452
2453        /*
2454         * Start from the sane iommu hardware state.
2455         */
2456        for_each_drhd_unit(drhd) {
2457                if (drhd->ignored)
2458                        continue;
2459
2460                iommu = drhd->iommu;
2461
2462                /*
2463                 * If the queued invalidation is already initialized by us
2464                 * (for example, while enabling interrupt-remapping) then
2465                 * we got the things already rolling from a sane state.
2466                 */
2467                if (iommu->qi)
2468                        continue;
2469
2470                /*
2471                 * Clear any previous faults.
2472                 */
2473                dmar_fault(-1, iommu);
2474                /*
2475                 * Disable queued invalidation if supported and already enabled
2476                 * before OS handover.
2477                 */
2478                dmar_disable_qi(iommu);
2479        }
2480
2481        for_each_drhd_unit(drhd) {
2482                if (drhd->ignored)
2483                        continue;
2484
2485                iommu = drhd->iommu;
2486
2487                if (dmar_enable_qi(iommu)) {
2488                        /*
2489                         * Queued Invalidate not enabled, use Register Based
2490                         * Invalidate
2491                         */
2492                        iommu->flush.flush_context = __iommu_flush_context;
2493                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2494                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2495                               "invalidation\n",
2496                                iommu->seq_id,
2497                               (unsigned long long)drhd->reg_base_addr);
2498                } else {
2499                        iommu->flush.flush_context = qi_flush_context;
2500                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2501                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2502                               "invalidation\n",
2503                                iommu->seq_id,
2504                               (unsigned long long)drhd->reg_base_addr);
2505                }
2506        }
2507
2508        if (iommu_pass_through)
2509                iommu_identity_mapping |= IDENTMAP_ALL;
2510
2511#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2512        iommu_identity_mapping |= IDENTMAP_GFX;
2513#endif
2514
2515        check_tylersburg_isoch();
2516
2517        /*
2518         * If pass through is not set or not enabled, setup context entries for
2519         * identity mappings for rmrr, gfx, and isa and may fall back to static
2520         * identity mapping if iommu_identity_mapping is set.
2521         */
2522        if (iommu_identity_mapping) {
2523                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2524                if (ret) {
2525                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2526                        goto error;
2527                }
2528        }
2529        /*
2530         * For each rmrr
2531         *   for each dev attached to rmrr
2532         *   do
2533         *     locate drhd for dev, alloc domain for dev
2534         *     allocate free domain
2535         *     allocate page table entries for rmrr
2536         *     if context not allocated for bus
2537         *           allocate and init context
2538         *           set present in root table for this bus
2539         *     init context with domain, translation etc
2540         *    endfor
2541         * endfor
2542         */
2543        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2544        for_each_rmrr_units(rmrr) {
2545                for (i = 0; i < rmrr->devices_cnt; i++) {
2546                        pdev = rmrr->devices[i];
2547                        /*
2548                         * some BIOS lists non-exist devices in DMAR
2549                         * table.
2550                         */
2551                        if (!pdev)
2552                                continue;
2553                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2554                        if (ret)
2555                                printk(KERN_ERR
2556                                       "IOMMU: mapping reserved region failed\n");
2557                }
2558        }
2559
2560        iommu_prepare_isa();
2561
2562        /*
2563         * for each drhd
2564         *   enable fault log
2565         *   global invalidate context cache
2566         *   global invalidate iotlb
2567         *   enable translation
2568         */
2569        for_each_drhd_unit(drhd) {
2570                if (drhd->ignored) {
2571                        /*
2572                         * we always have to disable PMRs or DMA may fail on
2573                         * this device
2574                         */
2575                        if (force_on)
2576                                iommu_disable_protect_mem_regions(drhd->iommu);
2577                        continue;
2578                }
2579                iommu = drhd->iommu;
2580
2581                iommu_flush_write_buffer(iommu);
2582
2583                ret = dmar_set_interrupt(iommu);
2584                if (ret)
2585                        goto error;
2586
2587                iommu_set_root_entry(iommu);
2588
2589                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2590                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2591
2592                ret = iommu_enable_translation(iommu);
2593                if (ret)
2594                        goto error;
2595
2596                iommu_disable_protect_mem_regions(iommu);
2597        }
2598
2599        return 0;
2600error:
2601        for_each_drhd_unit(drhd) {
2602                if (drhd->ignored)
2603                        continue;
2604                iommu = drhd->iommu;
2605                free_iommu(iommu);
2606        }
2607        kfree(g_iommus);
2608        return ret;
2609}
2610
2611/* This takes a number of _MM_ pages, not VTD pages */
2612static struct iova *intel_alloc_iova(struct device *dev,
2613                                     struct dmar_domain *domain,
2614                                     unsigned long nrpages, uint64_t dma_mask)
2615{
2616        struct pci_dev *pdev = to_pci_dev(dev);
2617        struct iova *iova = NULL;
2618
2619        /* Restrict dma_mask to the width that the iommu can handle */
2620        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2621
2622        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2623                /*
2624                 * First try to allocate an io virtual address in
2625                 * DMA_BIT_MASK(32) and if that fails then try allocating
2626                 * from higher range
2627                 */
2628                iova = alloc_iova(&domain->iovad, nrpages,
2629                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2630                if (iova)
2631                        return iova;
2632        }
2633        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2634        if (unlikely(!iova)) {
2635                printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2636                       nrpages, pci_name(pdev));
2637                return NULL;
2638        }
2639
2640        return iova;
2641}
2642
2643static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2644{
2645        struct dmar_domain *domain;
2646        int ret;
2647
2648        domain = get_domain_for_dev(pdev,
2649                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2650        if (!domain) {
2651                printk(KERN_ERR
2652                        "Allocating domain for %s failed", pci_name(pdev));
2653                return NULL;
2654        }
2655
2656        /* make sure context mapping is ok */
2657        if (unlikely(!domain_context_mapped(pdev))) {
2658                ret = domain_context_mapping(domain, pdev,
2659                                             CONTEXT_TT_MULTI_LEVEL);
2660                if (ret) {
2661                        printk(KERN_ERR
2662                                "Domain context map for %s failed",
2663                                pci_name(pdev));
2664                        return NULL;
2665                }
2666        }
2667
2668        return domain;
2669}
2670
2671static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2672{
2673        struct device_domain_info *info;
2674
2675        /* No lock here, assumes no domain exit in normal case */
2676        info = dev->dev.archdata.iommu;
2677        if (likely(info))
2678                return info->domain;
2679
2680        return __get_valid_domain_for_dev(dev);
2681}
2682
2683static int iommu_dummy(struct pci_dev *pdev)
2684{
2685        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2686}
2687
2688/* Check if the pdev needs to go through non-identity map and unmap process.*/
2689static int iommu_no_mapping(struct device *dev)
2690{
2691        struct pci_dev *pdev;
2692        int found;
2693
2694        if (unlikely(dev->bus != &pci_bus_type))
2695                return 1;
2696
2697        pdev = to_pci_dev(dev);
2698        if (iommu_dummy(pdev))
2699                return 1;
2700
2701        if (!iommu_identity_mapping)
2702                return 0;
2703
2704        found = identity_mapping(pdev);
2705        if (found) {
2706                if (iommu_should_identity_map(pdev, 0))
2707                        return 1;
2708                else {
2709                        /*
2710                         * 32 bit DMA is removed from si_domain and fall back
2711                         * to non-identity mapping.
2712                         */
2713                        domain_remove_one_dev_info(si_domain, pdev);
2714                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2715                               pci_name(pdev));
2716                        return 0;
2717                }
2718        } else {
2719                /*
2720                 * In case of a detached 64 bit DMA device from vm, the device
2721                 * is put into si_domain for identity mapping.
2722                 */
2723                if (iommu_should_identity_map(pdev, 0)) {
2724                        int ret;
2725                        ret = domain_add_dev_info(si_domain, pdev,
2726                                                  hw_pass_through ?
2727                                                  CONTEXT_TT_PASS_THROUGH :
2728                                                  CONTEXT_TT_MULTI_LEVEL);
2729                        if (!ret) {
2730                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2731                                       pci_name(pdev));
2732                                return 1;
2733                        }
2734                }
2735        }
2736
2737        return 0;
2738}
2739
2740static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2741                                     size_t size, int dir, u64 dma_mask)
2742{
2743        struct pci_dev *pdev = to_pci_dev(hwdev);
2744        struct dmar_domain *domain;
2745        phys_addr_t start_paddr;
2746        struct iova *iova;
2747        int prot = 0;
2748        int ret;
2749        struct intel_iommu *iommu;
2750        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2751
2752        BUG_ON(dir == DMA_NONE);
2753
2754        if (iommu_no_mapping(hwdev))
2755                return paddr;
2756
2757        domain = get_valid_domain_for_dev(pdev);
2758        if (!domain)
2759                return 0;
2760
2761        iommu = domain_get_iommu(domain);
2762        size = aligned_nrpages(paddr, size);
2763
2764        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2765        if (!iova)
2766                goto error;
2767
2768        /*
2769         * Check if DMAR supports zero-length reads on write only
2770         * mappings..
2771         */
2772        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2773                        !cap_zlr(iommu->cap))
2774                prot |= DMA_PTE_READ;
2775        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2776                prot |= DMA_PTE_WRITE;
2777        /*
2778         * paddr - (paddr + size) might be partial page, we should map the whole
2779         * page.  Note: if two part of one page are separately mapped, we
2780         * might have two guest_addr mapping to the same host paddr, but this
2781         * is not a big problem
2782         */
2783        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2784                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2785        if (ret)
2786                goto error;
2787
2788        /* it's a non-present to present mapping. Only flush if caching mode */
2789        if (cap_caching_mode(iommu->cap))
2790                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2791        else
2792                iommu_flush_write_buffer(iommu);
2793
2794        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2795        start_paddr += paddr & ~PAGE_MASK;
2796        return start_paddr;
2797
2798error:
2799        if (iova)
2800                __free_iova(&domain->iovad, iova);
2801        printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2802                pci_name(pdev), size, (unsigned long long)paddr, dir);
2803        return 0;
2804}
2805
2806static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2807                                 unsigned long offset, size_t size,
2808                                 enum dma_data_direction dir,
2809                                 struct dma_attrs *attrs)
2810{
2811        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2812                                  dir, to_pci_dev(dev)->dma_mask);
2813}
2814
2815static void flush_unmaps(void)
2816{
2817        int i, j;
2818
2819        timer_on = 0;
2820
2821        /* just flush them all */
2822        for (i = 0; i < g_num_of_iommus; i++) {
2823                struct intel_iommu *iommu = g_iommus[i];
2824                if (!iommu)
2825                        continue;
2826
2827                if (!deferred_flush[i].next)
2828                        continue;
2829
2830                /* In caching mode, global flushes turn emulation expensive */
2831                if (!cap_caching_mode(iommu->cap))
2832                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2833                                         DMA_TLB_GLOBAL_FLUSH);
2834                for (j = 0; j < deferred_flush[i].next; j++) {
2835                        unsigned long mask;
2836                        struct iova *iova = deferred_flush[i].iova[j];
2837                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2838
2839                        /* On real hardware multiple invalidations are expensive */
2840                        if (cap_caching_mode(iommu->cap))
2841                                iommu_flush_iotlb_psi(iommu, domain->id,
2842                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2843                        else {
2844                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2845                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2846                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2847                        }
2848                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2849                }
2850                deferred_flush[i].next = 0;
2851        }
2852
2853        list_size = 0;
2854}
2855
2856static void flush_unmaps_timeout(unsigned long data)
2857{
2858        unsigned long flags;
2859
2860        spin_lock_irqsave(&async_umap_flush_lock, flags);
2861        flush_unmaps();
2862        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2863}
2864
2865static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2866{
2867        unsigned long flags;
2868        int next, iommu_id;
2869        struct intel_iommu *iommu;
2870
2871        spin_lock_irqsave(&async_umap_flush_lock, flags);
2872        if (list_size == HIGH_WATER_MARK)
2873                flush_unmaps();
2874
2875        iommu = domain_get_iommu(dom);
2876        iommu_id = iommu->seq_id;
2877
2878        next = deferred_flush[iommu_id].next;
2879        deferred_flush[iommu_id].domain[next] = dom;
2880        deferred_flush[iommu_id].iova[next] = iova;
2881        deferred_flush[iommu_id].next++;
2882
2883        if (!timer_on) {
2884                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2885                timer_on = 1;
2886        }
2887        list_size++;
2888        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2889}
2890
2891static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2892                             size_t size, enum dma_data_direction dir,
2893                             struct dma_attrs *attrs)
2894{
2895        struct pci_dev *pdev = to_pci_dev(dev);
2896        struct dmar_domain *domain;
2897        unsigned long start_pfn, last_pfn;
2898        struct iova *iova;
2899        struct intel_iommu *iommu;
2900
2901        if (iommu_no_mapping(dev))
2902                return;
2903
2904        domain = find_domain(pdev);
2905        BUG_ON(!domain);
2906
2907        iommu = domain_get_iommu(domain);
2908
2909        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2910        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2911                      (unsigned long long)dev_addr))
2912                return;
2913
2914        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2915        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2916
2917        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2918                 pci_name(pdev), start_pfn, last_pfn);
2919
2920        /*  clear the whole page */
2921        dma_pte_clear_range(domain, start_pfn, last_pfn);
2922
2923        /* free page tables */
2924        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2925
2926        if (intel_iommu_strict) {
2927                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2928                                      last_pfn - start_pfn + 1, 0);
2929                /* free iova */
2930                __free_iova(&domain->iovad, iova);
2931        } else {
2932                add_unmap(domain, iova);
2933                /*
2934                 * queue up the release of the unmap to save the 1/6th of the
2935                 * cpu used up by the iotlb flush operation...
2936                 */
2937        }
2938}
2939
2940static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2941                                  dma_addr_t *dma_handle, gfp_t flags)
2942{
2943        void *vaddr;
2944        int order;
2945
2946        size = PAGE_ALIGN(size);
2947        order = get_order(size);
2948
2949        if (!iommu_no_mapping(hwdev))
2950                flags &= ~(GFP_DMA | GFP_DMA32);
2951        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2952                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2953                        flags |= GFP_DMA;
2954                else
2955                        flags |= GFP_DMA32;
2956        }
2957
2958        vaddr = (void *)__get_free_pages(flags, order);
2959        if (!vaddr)
2960                return NULL;
2961        memset(vaddr, 0, size);
2962
2963        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2964                                         DMA_BIDIRECTIONAL,
2965                                         hwdev->coherent_dma_mask);
2966        if (*dma_handle)
2967                return vaddr;
2968        free_pages((unsigned long)vaddr, order);
2969        return NULL;
2970}
2971
2972static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2973                                dma_addr_t dma_handle)
2974{
2975        int order;
2976
2977        size = PAGE_ALIGN(size);
2978        order = get_order(size);
2979
2980        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2981        free_pages((unsigned long)vaddr, order);
2982}
2983
2984static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2985                           int nelems, enum dma_data_direction dir,
2986                           struct dma_attrs *attrs)
2987{
2988        struct pci_dev *pdev = to_pci_dev(hwdev);
2989        struct dmar_domain *domain;
2990        unsigned long start_pfn, last_pfn;
2991        struct iova *iova;
2992        struct intel_iommu *iommu;
2993
2994        if (iommu_no_mapping(hwdev))
2995                return;
2996
2997        domain = find_domain(pdev);
2998        BUG_ON(!domain);
2999
3000        iommu = domain_get_iommu(domain);
3001
3002        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3003        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3004                      (unsigned long long)sglist[0].dma_address))
3005                return;
3006
3007        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3008        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3009
3010        /*  clear the whole page */
3011        dma_pte_clear_range(domain, start_pfn, last_pfn);
3012
3013        /* free page tables */
3014        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3015
3016        if (intel_iommu_strict) {
3017                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3018                                      last_pfn - start_pfn + 1, 0);
3019                /* free iova */
3020                __free_iova(&domain->iovad, iova);
3021        } else {
3022                add_unmap(domain, iova);
3023                /*
3024                 * queue up the release of the unmap to save the 1/6th of the
3025                 * cpu used up by the iotlb flush operation...
3026                 */
3027        }
3028}
3029
3030static int intel_nontranslate_map_sg(struct device *hddev,
3031        struct scatterlist *sglist, int nelems, int dir)
3032{
3033        int i;
3034        struct scatterlist *sg;
3035
3036        for_each_sg(sglist, sg, nelems, i) {
3037                BUG_ON(!sg_page(sg));
3038                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3039                sg->dma_length = sg->length;
3040        }
3041        return nelems;
3042}
3043
3044static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3045                        enum dma_data_direction dir, struct dma_attrs *attrs)
3046{
3047        int i;
3048        struct pci_dev *pdev = to_pci_dev(hwdev);
3049        struct dmar_domain *domain;
3050        size_t size = 0;
3051        int prot = 0;
3052        struct iova *iova = NULL;
3053        int ret;
3054        struct scatterlist *sg;
3055        unsigned long start_vpfn;
3056        struct intel_iommu *iommu;
3057
3058        BUG_ON(dir == DMA_NONE);
3059        if (iommu_no_mapping(hwdev))
3060                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3061
3062        domain = get_valid_domain_for_dev(pdev);
3063        if (!domain)
3064                return 0;
3065
3066        iommu = domain_get_iommu(domain);
3067
3068        for_each_sg(sglist, sg, nelems, i)
3069                size += aligned_nrpages(sg->offset, sg->length);
3070
3071        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3072                                pdev->dma_mask);
3073        if (!iova) {
3074                sglist->dma_length = 0;
3075                return 0;
3076        }
3077
3078        /*
3079         * Check if DMAR supports zero-length reads on write only
3080         * mappings..
3081         */
3082        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3083                        !cap_zlr(iommu->cap))
3084                prot |= DMA_PTE_READ;
3085        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3086                prot |= DMA_PTE_WRITE;
3087
3088        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3089
3090        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3091        if (unlikely(ret)) {
3092                /*  clear the page */
3093                dma_pte_clear_range(domain, start_vpfn,
3094                                    start_vpfn + size - 1);
3095                /* free page tables */
3096                dma_pte_free_pagetable(domain, start_vpfn,
3097                                       start_vpfn + size - 1);
3098                /* free iova */
3099                __free_iova(&domain->iovad, iova);
3100                return 0;
3101        }
3102
3103        /* it's a non-present to present mapping. Only flush if caching mode */
3104        if (cap_caching_mode(iommu->cap))
3105                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3106        else
3107                iommu_flush_write_buffer(iommu);
3108
3109        return nelems;
3110}
3111
3112static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3113{
3114        return !dma_addr;
3115}
3116
3117struct dma_map_ops intel_dma_ops = {
3118        .alloc_coherent = intel_alloc_coherent,
3119        .free_coherent = intel_free_coherent,
3120        .map_sg = intel_map_sg,
3121        .unmap_sg = intel_unmap_sg,
3122        .map_page = intel_map_page,
3123        .unmap_page = intel_unmap_page,
3124        .mapping_error = intel_mapping_error,
3125};
3126
3127static inline int iommu_domain_cache_init(void)
3128{
3129        int ret = 0;
3130
3131        iommu_domain_cache = kmem_cache_create("iommu_domain",
3132                                         sizeof(struct dmar_domain),
3133                                         0,
3134                                         SLAB_HWCACHE_ALIGN,
3135
3136                                         NULL);
3137        if (!iommu_domain_cache) {
3138                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3139                ret = -ENOMEM;
3140        }
3141
3142        return ret;
3143}
3144
3145static inline int iommu_devinfo_cache_init(void)
3146{
3147        int ret = 0;
3148
3149        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3150                                         sizeof(struct device_domain_info),
3151                                         0,
3152                                         SLAB_HWCACHE_ALIGN,
3153                                         NULL);
3154        if (!iommu_devinfo_cache) {
3155                printk(KERN_ERR "Couldn't create devinfo cache\n");
3156                ret = -ENOMEM;
3157        }
3158
3159        return ret;
3160}
3161
3162static inline int iommu_iova_cache_init(void)
3163{
3164        int ret = 0;
3165
3166        iommu_iova_cache = kmem_cache_create("iommu_iova",
3167                                         sizeof(struct iova),
3168                                         0,
3169                                         SLAB_HWCACHE_ALIGN,
3170                                         NULL);
3171        if (!iommu_iova_cache) {
3172                printk(KERN_ERR "Couldn't create iova cache\n");
3173                ret = -ENOMEM;
3174        }
3175
3176        return ret;
3177}
3178
3179static int __init iommu_init_mempool(void)
3180{
3181        int ret;
3182        ret = iommu_iova_cache_init();
3183        if (ret)
3184                return ret;
3185
3186        ret = iommu_domain_cache_init();
3187        if (ret)
3188                goto domain_error;
3189
3190        ret = iommu_devinfo_cache_init();
3191        if (!ret)
3192                return ret;
3193
3194        kmem_cache_destroy(iommu_domain_cache);
3195domain_error:
3196        kmem_cache_destroy(iommu_iova_cache);
3197
3198        return -ENOMEM;
3199}
3200
3201static void __init iommu_exit_mempool(void)
3202{
3203        kmem_cache_destroy(iommu_devinfo_cache);
3204        kmem_cache_destroy(iommu_domain_cache);
3205        kmem_cache_destroy(iommu_iova_cache);
3206
3207}
3208
3209static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3210{
3211        struct dmar_drhd_unit *drhd;
3212        u32 vtbar;
3213        int rc;
3214
3215        /* We know that this device on this chipset has its own IOMMU.
3216         * If we find it under a different IOMMU, then the BIOS is lying
3217         * to us. Hope that the IOMMU for this device is actually
3218         * disabled, and it needs no translation...
3219         */
3220        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3221        if (rc) {
3222                /* "can't" happen */
3223                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3224                return;
3225        }
3226        vtbar &= 0xffff0000;
3227
3228        /* we know that the this iommu should be at offset 0xa000 from vtbar */
3229        drhd = dmar_find_matched_drhd_unit(pdev);
3230        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3231                            TAINT_FIRMWARE_WORKAROUND,
3232                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3233                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3234}
3235DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3236
3237static void __init init_no_remapping_devices(void)
3238{
3239        struct dmar_drhd_unit *drhd;
3240
3241        for_each_drhd_unit(drhd) {
3242                if (!drhd->include_all) {
3243                        int i;
3244                        for (i = 0; i < drhd->devices_cnt; i++)
3245                                if (drhd->devices[i] != NULL)
3246                                        break;
3247                        /* ignore DMAR unit if no pci devices exist */
3248                        if (i == drhd->devices_cnt)
3249                                drhd->ignored = 1;
3250                }
3251        }
3252
3253        for_each_drhd_unit(drhd) {
3254                int i;
3255                if (drhd->ignored || drhd->include_all)
3256                        continue;
3257
3258                for (i = 0; i < drhd->devices_cnt; i++)
3259                        if (drhd->devices[i] &&
3260                            !IS_GFX_DEVICE(drhd->devices[i]))
3261                                break;
3262
3263                if (i < drhd->devices_cnt)
3264                        continue;
3265
3266                /* This IOMMU has *only* gfx devices. Either bypass it or
3267                   set the gfx_mapped flag, as appropriate */
3268                if (dmar_map_gfx) {
3269                        intel_iommu_gfx_mapped = 1;
3270                } else {
3271                        drhd->ignored = 1;
3272                        for (i = 0; i < drhd->devices_cnt; i++) {
3273                                if (!drhd->devices[i])
3274                                        continue;
3275                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3276                        }
3277                }
3278        }
3279}
3280
3281#ifdef CONFIG_SUSPEND
3282static int init_iommu_hw(void)
3283{
3284        struct dmar_drhd_unit *drhd;
3285        struct intel_iommu *iommu = NULL;
3286
3287        for_each_active_iommu(iommu, drhd)
3288                if (iommu->qi)
3289                        dmar_reenable_qi(iommu);
3290
3291        for_each_iommu(iommu, drhd) {
3292                if (drhd->ignored) {
3293                        /*
3294                         * we always have to disable PMRs or DMA may fail on
3295                         * this device
3296                         */
3297                        if (force_on)
3298                                iommu_disable_protect_mem_regions(iommu);
3299                        continue;
3300                }
3301        
3302                iommu_flush_write_buffer(iommu);
3303
3304                iommu_set_root_entry(iommu);
3305
3306                iommu->flush.flush_context(iommu, 0, 0, 0,
3307                                           DMA_CCMD_GLOBAL_INVL);
3308                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3309                                         DMA_TLB_GLOBAL_FLUSH);
3310                if (iommu_enable_translation(iommu))
3311                        return 1;
3312                iommu_disable_protect_mem_regions(iommu);
3313        }
3314
3315        return 0;
3316}
3317
3318static void iommu_flush_all(void)
3319{
3320        struct dmar_drhd_unit *drhd;
3321        struct intel_iommu *iommu;
3322
3323        for_each_active_iommu(iommu, drhd) {
3324                iommu->flush.flush_context(iommu, 0, 0, 0,
3325                                           DMA_CCMD_GLOBAL_INVL);
3326                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3327                                         DMA_TLB_GLOBAL_FLUSH);
3328        }
3329}
3330
3331static int iommu_suspend(void)
3332{
3333        struct dmar_drhd_unit *drhd;
3334        struct intel_iommu *iommu = NULL;
3335        unsigned long flag;
3336
3337        for_each_active_iommu(iommu, drhd) {
3338                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3339                                                 GFP_ATOMIC);
3340                if (!iommu->iommu_state)
3341                        goto nomem;
3342        }
3343
3344        iommu_flush_all();
3345
3346        for_each_active_iommu(iommu, drhd) {
3347                iommu_disable_translation(iommu);
3348
3349                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3350
3351                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3352                        readl(iommu->reg + DMAR_FECTL_REG);
3353                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3354                        readl(iommu->reg + DMAR_FEDATA_REG);
3355                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3356                        readl(iommu->reg + DMAR_FEADDR_REG);
3357                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3358                        readl(iommu->reg + DMAR_FEUADDR_REG);
3359
3360                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3361        }
3362        return 0;
3363
3364nomem:
3365        for_each_active_iommu(iommu, drhd)
3366                kfree(iommu->iommu_state);
3367
3368        return -ENOMEM;
3369}
3370
3371static void iommu_resume(void)
3372{
3373        struct dmar_drhd_unit *drhd;
3374        struct intel_iommu *iommu = NULL;
3375        unsigned long flag;
3376
3377        if (init_iommu_hw()) {
3378                if (force_on)
3379                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3380                else
3381                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3382                return;
3383        }
3384
3385        for_each_active_iommu(iommu, drhd) {
3386
3387                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3388
3389                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3390                        iommu->reg + DMAR_FECTL_REG);
3391                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3392                        iommu->reg + DMAR_FEDATA_REG);
3393                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3394                        iommu->reg + DMAR_FEADDR_REG);
3395                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3396                        iommu->reg + DMAR_FEUADDR_REG);
3397
3398                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3399        }
3400
3401        for_each_active_iommu(iommu, drhd)
3402                kfree(iommu->iommu_state);
3403}
3404
3405static struct syscore_ops iommu_syscore_ops = {
3406        .resume         = iommu_resume,
3407        .suspend        = iommu_suspend,
3408};
3409
3410static void __init init_iommu_pm_ops(void)
3411{
3412        register_syscore_ops(&iommu_syscore_ops);
3413}
3414
3415#else
/* Power management is compiled out: there is nothing to register. */
static inline void init_iommu_pm_ops(void)
{
}
3417#endif  /* CONFIG_PM */
3418
3419LIST_HEAD(dmar_rmrr_units);
3420
3421static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3422{
3423        list_add(&rmrr->list, &dmar_rmrr_units);
3424}
3425
3426
3427int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3428{
3429        struct acpi_dmar_reserved_memory *rmrr;
3430        struct dmar_rmrr_unit *rmrru;
3431
3432        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3433        if (!rmrru)
3434                return -ENOMEM;
3435
3436        rmrru->hdr = header;
3437        rmrr = (struct acpi_dmar_reserved_memory *)header;
3438        rmrru->base_address = rmrr->base_address;
3439        rmrru->end_address = rmrr->end_address;
3440
3441        dmar_register_rmrr_unit(rmrru);
3442        return 0;
3443}
3444
3445static int __init
3446rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3447{
3448        struct acpi_dmar_reserved_memory *rmrr;
3449        int ret;
3450
3451        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3452        ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3453                ((void *)rmrr) + rmrr->header.length,
3454                &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3455
3456        if (ret || (rmrru->devices_cnt == 0)) {
3457                list_del(&rmrru->list);
3458                kfree(rmrru);
3459        }
3460        return ret;
3461}
3462
3463static LIST_HEAD(dmar_atsr_units);
3464
3465int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3466{
3467        struct acpi_dmar_atsr *atsr;
3468        struct dmar_atsr_unit *atsru;
3469
3470        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3471        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3472        if (!atsru)
3473                return -ENOMEM;
3474
3475        atsru->hdr = hdr;
3476        atsru->include_all = atsr->flags & 0x1;
3477
3478        list_add(&atsru->list, &dmar_atsr_units);
3479
3480        return 0;
3481}
3482
3483static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3484{
3485        int rc;
3486        struct acpi_dmar_atsr *atsr;
3487
3488        if (atsru->include_all)
3489                return 0;
3490
3491        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3492        rc = dmar_parse_dev_scope((void *)(atsr + 1),
3493                                (void *)atsr + atsr->header.length,
3494                                &atsru->devices_cnt, &atsru->devices,
3495                                atsr->segment);
3496        if (rc || !atsru->devices_cnt) {
3497                list_del(&atsru->list);
3498                kfree(atsru);
3499        }
3500
3501        return rc;
3502}
3503
3504int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3505{
3506        int i;
3507        struct pci_bus *bus;
3508        struct acpi_dmar_atsr *atsr;
3509        struct dmar_atsr_unit *atsru;
3510
3511        dev = pci_physfn(dev);
3512
3513        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3514                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3515                if (atsr->segment == pci_domain_nr(dev->bus))
3516                        goto found;
3517        }
3518
3519        return 0;
3520
3521found:
3522        for (bus = dev->bus; bus; bus = bus->parent) {
3523                struct pci_dev *bridge = bus->self;
3524
3525                if (!bridge || !pci_is_pcie(bridge) ||
3526                    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3527                        return 0;
3528
3529                if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3530                        for (i = 0; i < atsru->devices_cnt; i++)
3531                                if (atsru->devices[i] == bridge)
3532                                        return 1;
3533                        break;
3534                }
3535        }
3536
3537        if (atsru->include_all)
3538                return 1;
3539
3540        return 0;
3541}
3542
3543int __init dmar_parse_rmrr_atsr_dev(void)
3544{
3545        struct dmar_rmrr_unit *rmrr, *rmrr_n;
3546        struct dmar_atsr_unit *atsr, *atsr_n;
3547        int ret = 0;
3548
3549        list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3550                ret = rmrr_parse_dev(rmrr);
3551                if (ret)
3552                        return ret;
3553        }
3554
3555        list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3556                ret = atsr_parse_dev(atsr);
3557                if (ret)
3558                        return ret;
3559        }
3560
3561        return ret;
3562}
3563
3564/*
3565 * Here we only respond to action of unbound device from driver.
3566 *
3567 * Added device is not attached to its DMAR domain here yet. That will happen
3568 * when mapping the device to iova.
3569 */
3570static int device_notifier(struct notifier_block *nb,
3571                                  unsigned long action, void *data)
3572{
3573        struct device *dev = data;
3574        struct pci_dev *pdev = to_pci_dev(dev);
3575        struct dmar_domain *domain;
3576
3577        if (iommu_no_mapping(dev))
3578                return 0;
3579
3580        domain = find_domain(pdev);
3581        if (!domain)
3582                return 0;
3583
3584        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3585                domain_remove_one_dev_info(domain, pdev);
3586
3587                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3588                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3589                    list_empty(&domain->devices))
3590                        domain_exit(domain);
3591        }
3592
3593        return 0;
3594}
3595
3596static struct notifier_block device_nb = {
3597        .notifier_call = device_notifier,
3598};
3599
3600int __init intel_iommu_init(void)
3601{
3602        int ret = 0;
3603
3604        /* VT-d is required for a TXT/tboot launch, so enforce that */
3605        force_on = tboot_force_iommu();
3606
3607        if (dmar_table_init()) {
3608                if (force_on)
3609                        panic("tboot: Failed to initialize DMAR table\n");
3610                return  -ENODEV;
3611        }
3612
3613        if (dmar_dev_scope_init() < 0) {
3614                if (force_on)
3615                        panic("tboot: Failed to initialize DMAR device scope\n");
3616                return  -ENODEV;
3617        }
3618
3619        if (no_iommu || dmar_disabled)
3620                return -ENODEV;
3621
3622        if (iommu_init_mempool()) {
3623                if (force_on)
3624                        panic("tboot: Failed to initialize iommu memory\n");
3625                return  -ENODEV;
3626        }
3627
3628        if (list_empty(&dmar_rmrr_units))
3629                printk(KERN_INFO "DMAR: No RMRR found\n");
3630
3631        if (list_empty(&dmar_atsr_units))
3632                printk(KERN_INFO "DMAR: No ATSR found\n");
3633
3634        if (dmar_init_reserved_ranges()) {
3635                if (force_on)
3636                        panic("tboot: Failed to reserve iommu ranges\n");
3637                return  -ENODEV;
3638        }
3639
3640        init_no_remapping_devices();
3641
3642        ret = init_dmars();
3643        if (ret) {
3644                if (force_on)
3645                        panic("tboot: Failed to initialize DMARs\n");
3646                printk(KERN_ERR "IOMMU: dmar init failed\n");
3647                put_iova_domain(&reserved_iova_list);
3648                iommu_exit_mempool();
3649                return ret;
3650        }
3651        printk(KERN_INFO
3652        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3653
3654        init_timer(&unmap_timer);
3655#ifdef CONFIG_SWIOTLB
3656        swiotlb = 0;
3657#endif
3658        dma_ops = &intel_dma_ops;
3659
3660        init_iommu_pm_ops();
3661
3662        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3663
3664        bus_register_notifier(&pci_bus_type, &device_nb);
3665
3666        intel_iommu_enabled = 1;
3667
3668        return 0;
3669}
3670
3671static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3672                                           struct pci_dev *pdev)
3673{
3674        struct pci_dev *tmp, *parent;
3675
3676        if (!iommu || !pdev)
3677                return;
3678
3679        /* dependent device detach */
3680        tmp = pci_find_upstream_pcie_bridge(pdev);
3681        /* Secondary interface's bus number and devfn 0 */
3682        if (tmp) {
3683                parent = pdev->bus->self;
3684                while (parent != tmp) {
3685                        iommu_detach_dev(iommu, parent->bus->number,
3686                                         parent->devfn);
3687                        parent = parent->bus->self;
3688                }
3689                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3690                        iommu_detach_dev(iommu,
3691                                tmp->subordinate->number, 0);
3692                else /* this is a legacy PCI bridge */
3693                        iommu_detach_dev(iommu, tmp->bus->number,
3694                                         tmp->devfn);
3695        }
3696}
3697
3698static void domain_remove_one_dev_info(struct dmar_domain *domain,
3699                                          struct pci_dev *pdev)
3700{
3701        struct device_domain_info *info;
3702        struct intel_iommu *iommu;
3703        unsigned long flags;
3704        int found = 0;
3705        struct list_head *entry, *tmp;
3706
3707        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3708                                pdev->devfn);
3709        if (!iommu)
3710                return;
3711
3712        spin_lock_irqsave(&device_domain_lock, flags);
3713        list_for_each_safe(entry, tmp, &domain->devices) {
3714                info = list_entry(entry, struct device_domain_info, link);
3715                if (info->segment == pci_domain_nr(pdev->bus) &&
3716                    info->bus == pdev->bus->number &&
3717                    info->devfn == pdev->devfn) {
3718                        list_del(&info->link);
3719                        list_del(&info->global);
3720                        if (info->dev)
3721                                info->dev->dev.archdata.iommu = NULL;
3722                        spin_unlock_irqrestore(&device_domain_lock, flags);
3723
3724                        iommu_disable_dev_iotlb(info);
3725                        iommu_detach_dev(iommu, info->bus, info->devfn);
3726                        iommu_detach_dependent_devices(iommu, pdev);
3727                        free_devinfo_mem(info);
3728
3729                        spin_lock_irqsave(&device_domain_lock, flags);
3730
3731                        if (found)
3732                                break;
3733                        else
3734                                continue;
3735                }
3736
3737                /* if there is no other devices under the same iommu
3738                 * owned by this domain, clear this iommu in iommu_bmp
3739                 * update iommu count and coherency
3740                 */
3741                if (iommu == device_to_iommu(info->segment, info->bus,
3742                                            info->devfn))
3743                        found = 1;
3744        }
3745
3746        spin_unlock_irqrestore(&device_domain_lock, flags);
3747
3748        if (found == 0) {
3749                unsigned long tmp_flags;
3750                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3751                clear_bit(iommu->seq_id, &domain->iommu_bmp);
3752                domain->iommu_count--;
3753                domain_update_iommu_cap(domain);
3754                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3755
3756                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3757                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3758                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3759                        clear_bit(domain->id, iommu->domain_ids);
3760                        iommu->domains[domain->id] = NULL;
3761                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3762                }
3763        }
3764}
3765
3766static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3767{
3768        struct device_domain_info *info;
3769        struct intel_iommu *iommu;
3770        unsigned long flags1, flags2;
3771
3772        spin_lock_irqsave(&device_domain_lock, flags1);
3773        while (!list_empty(&domain->devices)) {
3774                info = list_entry(domain->devices.next,
3775                        struct device_domain_info, link);
3776                list_del(&info->link);
3777                list_del(&info->global);
3778                if (info->dev)
3779                        info->dev->dev.archdata.iommu = NULL;
3780
3781                spin_unlock_irqrestore(&device_domain_lock, flags1);
3782
3783                iommu_disable_dev_iotlb(info);
3784                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3785                iommu_detach_dev(iommu, info->bus, info->devfn);
3786                iommu_detach_dependent_devices(iommu, info->dev);
3787
3788                /* clear this iommu in iommu_bmp, update iommu count
3789                 * and capabilities
3790                 */
3791                spin_lock_irqsave(&domain->iommu_lock, flags2);
3792                if (test_and_clear_bit(iommu->seq_id,
3793                                       &domain->iommu_bmp)) {
3794                        domain->iommu_count--;
3795                        domain_update_iommu_cap(domain);
3796                }
3797                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3798
3799                free_devinfo_mem(info);
3800                spin_lock_irqsave(&device_domain_lock, flags1);
3801        }
3802        spin_unlock_irqrestore(&device_domain_lock, flags1);
3803}
3804
3805/* domain id for virtual machine, it won't be set in context */
3806static unsigned long vm_domid;
3807
3808static struct dmar_domain *iommu_alloc_vm_domain(void)
3809{
3810        struct dmar_domain *domain;
3811
3812        domain = alloc_domain_mem();
3813        if (!domain)
3814                return NULL;
3815
3816        domain->id = vm_domid++;
3817        domain->nid = -1;
3818        memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3819        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3820
3821        return domain;
3822}
3823
3824static int md_domain_init(struct dmar_domain *domain, int guest_width)
3825{
3826        int adjust_width;
3827
3828        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3829        spin_lock_init(&domain->iommu_lock);
3830
3831        domain_reserve_special_ranges(domain);
3832
3833        /* calculate AGAW */
3834        domain->gaw = guest_width;
3835        adjust_width = guestwidth_to_adjustwidth(guest_width);
3836        domain->agaw = width_to_agaw(adjust_width);
3837
3838        INIT_LIST_HEAD(&domain->devices);
3839
3840        domain->iommu_count = 0;
3841        domain->iommu_coherency = 0;
3842        domain->iommu_snooping = 0;
3843        domain->iommu_superpage = 0;
3844        domain->max_addr = 0;
3845        domain->nid = -1;
3846
3847        /* always allocate the top pgd */
3848        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3849        if (!domain->pgd)
3850                return -ENOMEM;
3851        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3852        return 0;
3853}
3854
3855static void iommu_free_vm_domain(struct dmar_domain *domain)
3856{
3857        unsigned long flags;
3858        struct dmar_drhd_unit *drhd;
3859        struct intel_iommu *iommu;
3860        unsigned long i;
3861        unsigned long ndomains;
3862
3863        for_each_drhd_unit(drhd) {
3864                if (drhd->ignored)
3865                        continue;
3866                iommu = drhd->iommu;
3867
3868                ndomains = cap_ndoms(iommu->cap);
3869                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3870                        if (iommu->domains[i] == domain) {
3871                                spin_lock_irqsave(&iommu->lock, flags);
3872                                clear_bit(i, iommu->domain_ids);
3873                                iommu->domains[i] = NULL;
3874                                spin_unlock_irqrestore(&iommu->lock, flags);
3875                                break;
3876                        }
3877                }
3878        }
3879}
3880
3881static void vm_domain_exit(struct dmar_domain *domain)
3882{
3883        /* Domain 0 is reserved, so dont process it */
3884        if (!domain)
3885                return;
3886
3887        vm_domain_remove_all_dev_info(domain);
3888        /* destroy iovas */
3889        put_iova_domain(&domain->iovad);
3890
3891        /* clear ptes */
3892        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3893
3894        /* free page tables */
3895        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3896
3897        iommu_free_vm_domain(domain);
3898        free_domain_mem(domain);
3899}
3900
3901static int intel_iommu_domain_init(struct iommu_domain *domain)
3902{
3903        struct dmar_domain *dmar_domain;
3904
3905        dmar_domain = iommu_alloc_vm_domain();
3906        if (!dmar_domain) {
3907                printk(KERN_ERR
3908                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3909                return -ENOMEM;
3910        }
3911        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3912                printk(KERN_ERR
3913                        "intel_iommu_domain_init() failed\n");
3914                vm_domain_exit(dmar_domain);
3915                return -ENOMEM;
3916        }
3917        domain_update_iommu_cap(dmar_domain);
3918        domain->priv = dmar_domain;
3919
3920        return 0;
3921}
3922
3923static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3924{
3925        struct dmar_domain *dmar_domain = domain->priv;
3926
3927        domain->priv = NULL;
3928        vm_domain_exit(dmar_domain);
3929}
3930
3931static int intel_iommu_attach_device(struct iommu_domain *domain,
3932                                     struct device *dev)
3933{
3934        struct dmar_domain *dmar_domain = domain->priv;
3935        struct pci_dev *pdev = to_pci_dev(dev);
3936        struct intel_iommu *iommu;
3937        int addr_width;
3938
3939        /* normally pdev is not mapped */
3940        if (unlikely(domain_context_mapped(pdev))) {
3941                struct dmar_domain *old_domain;
3942
3943                old_domain = find_domain(pdev);
3944                if (old_domain) {
3945                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3946                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3947                                domain_remove_one_dev_info(old_domain, pdev);
3948                        else
3949                                domain_remove_dev_info(old_domain);
3950                }
3951        }
3952
3953        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3954                                pdev->devfn);
3955        if (!iommu)
3956                return -ENODEV;
3957
3958        /* check if this iommu agaw is sufficient for max mapped address */
3959        addr_width = agaw_to_width(iommu->agaw);
3960        if (addr_width > cap_mgaw(iommu->cap))
3961                addr_width = cap_mgaw(iommu->cap);
3962
3963        if (dmar_domain->max_addr > (1LL << addr_width)) {
3964                printk(KERN_ERR "%s: iommu width (%d) is not "
3965                       "sufficient for the mapped address (%llx)\n",
3966                       __func__, addr_width, dmar_domain->max_addr);
3967                return -EFAULT;
3968        }
3969        dmar_domain->gaw = addr_width;
3970
3971        /*
3972         * Knock out extra levels of page tables if necessary
3973         */
3974        while (iommu->agaw < dmar_domain->agaw) {
3975                struct dma_pte *pte;
3976
3977                pte = dmar_domain->pgd;
3978                if (dma_pte_present(pte)) {
3979                        dmar_domain->pgd = (struct dma_pte *)
3980                                phys_to_virt(dma_pte_addr(pte));
3981                        free_pgtable_page(pte);
3982                }
3983                dmar_domain->agaw--;
3984        }
3985
3986        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3987}
3988
3989static void intel_iommu_detach_device(struct iommu_domain *domain,
3990                                      struct device *dev)
3991{
3992        struct dmar_domain *dmar_domain = domain->priv;
3993        struct pci_dev *pdev = to_pci_dev(dev);
3994
3995        domain_remove_one_dev_info(dmar_domain, pdev);
3996}
3997
3998static int intel_iommu_map(struct iommu_domain *domain,
3999                           unsigned long iova, phys_addr_t hpa,
4000                           size_t size, int iommu_prot)
4001{
4002        struct dmar_domain *dmar_domain = domain->priv;
4003        u64 max_addr;
4004        int prot = 0;
4005        int ret;
4006
4007        if (iommu_prot & IOMMU_READ)
4008                prot |= DMA_PTE_READ;
4009        if (iommu_prot & IOMMU_WRITE)
4010                prot |= DMA_PTE_WRITE;
4011        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4012                prot |= DMA_PTE_SNP;
4013
4014        max_addr = iova + size;
4015        if (dmar_domain->max_addr < max_addr) {
4016                u64 end;
4017
4018                /* check if minimum agaw is sufficient for mapped address */
4019                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4020                if (end < max_addr) {
4021                        printk(KERN_ERR "%s: iommu width (%d) is not "
4022                               "sufficient for the mapped address (%llx)\n",
4023                               __func__, dmar_domain->gaw, max_addr);
4024                        return -EFAULT;
4025                }
4026                dmar_domain->max_addr = max_addr;
4027        }
4028        /* Round up size to next multiple of PAGE_SIZE, if it and
4029           the low bits of hpa would take us onto the next page */
4030        size = aligned_nrpages(hpa, size);
4031        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4032                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4033        return ret;
4034}
4035
4036static size_t intel_iommu_unmap(struct iommu_domain *domain,
4037                             unsigned long iova, size_t size)
4038{
4039        struct dmar_domain *dmar_domain = domain->priv;
4040        int order;
4041
4042        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4043                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4044
4045        if (dmar_domain->max_addr == iova + size)
4046                dmar_domain->max_addr = iova;
4047
4048        return PAGE_SIZE << order;
4049}
4050
4051static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4052                                            unsigned long iova)
4053{
4054        struct dmar_domain *dmar_domain = domain->priv;
4055        struct dma_pte *pte;
4056        u64 phys = 0;
4057
4058        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4059        if (pte)
4060                phys = dma_pte_addr(pte);
4061
4062        return phys;
4063}
4064
4065static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4066                                      unsigned long cap)
4067{
4068        struct dmar_domain *dmar_domain = domain->priv;
4069
4070        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4071                return dmar_domain->iommu_snooping;
4072        if (cap == IOMMU_CAP_INTR_REMAP)
4073                return intr_remapping_enabled;
4074
4075        return 0;
4076}
4077
4078/*
4079 * Group numbers are arbitrary.  Device with the same group number
4080 * indicate the iommu cannot differentiate between them.  To avoid
4081 * tracking used groups we just use the seg|bus|devfn of the lowest
4082 * level we're able to differentiate devices
4083 */
4084static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
4085{
4086        struct pci_dev *pdev = to_pci_dev(dev);
4087        struct pci_dev *bridge;
4088        union {
4089                struct {
4090                        u8 devfn;
4091                        u8 bus;
4092                        u16 segment;
4093                } pci;
4094                u32 group;
4095        } id;
4096
4097        if (iommu_no_mapping(dev))
4098                return -ENODEV;
4099
4100        id.pci.segment = pci_domain_nr(pdev->bus);
4101        id.pci.bus = pdev->bus->number;
4102        id.pci.devfn = pdev->devfn;
4103
4104        if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
4105                return -ENODEV;
4106
4107        bridge = pci_find_upstream_pcie_bridge(pdev);
4108        if (bridge) {
4109                if (pci_is_pcie(bridge)) {
4110                        id.pci.bus = bridge->subordinate->number;
4111                        id.pci.devfn = 0;
4112                } else {
4113                        id.pci.bus = bridge->bus->number;
4114                        id.pci.devfn = bridge->devfn;
4115                }
4116        }
4117
4118        if (!pdev->is_virtfn && iommu_group_mf)
4119                id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);
4120
4121        *groupid = id.group;
4122
4123        return 0;
4124}
4125
4126static struct iommu_ops intel_iommu_ops = {
4127        .domain_init    = intel_iommu_domain_init,
4128        .domain_destroy = intel_iommu_domain_destroy,
4129        .attach_dev     = intel_iommu_attach_device,
4130        .detach_dev     = intel_iommu_detach_device,
4131        .map            = intel_iommu_map,
4132        .unmap          = intel_iommu_unmap,
4133        .iova_to_phys   = intel_iommu_iova_to_phys,
4134        .domain_has_cap = intel_iommu_domain_has_cap,
4135        .device_group   = intel_iommu_device_group,
4136        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4137};
4138
4139static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4140{
4141        /*
4142         * Mobile 4 Series Chipset neglects to set RWBF capability,
4143         * but needs it:
4144         */
4145        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4146        rwbf_quirk = 1;
4147
4148        /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4149        if (dev->revision == 0x07) {
4150                printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4151                dmar_map_gfx = 0;
4152        }
4153}
4154
4155DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4156
4157#define GGC 0x52
4158#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4159#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4160#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4161#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4162#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4163#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4164#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4165#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4166
4167static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4168{
4169        unsigned short ggc;
4170
4171        if (pci_read_config_word(dev, GGC, &ggc))
4172                return;
4173
4174        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4175                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4176                dmar_map_gfx = 0;
4177        } else if (dmar_map_gfx) {
4178                /* we have to ensure the gfx device is idle before we flush */
4179                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4180                intel_iommu_strict = 1;
4181       }
4182}
4183DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4184DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4185DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4186DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4187
4188/* On Tylersburg chipsets, some BIOSes have been known to enable the
4189   ISOCH DMAR unit for the Azalia sound device, but not give it any
4190   TLB entries, which causes it to deadlock. Check for that.  We do
4191   this in a function called from init_dmars(), instead of in a PCI
4192   quirk, because we don't want to print the obnoxious "BIOS broken"
4193   message if VT-d is actually disabled.
4194*/
4195static void __init check_tylersburg_isoch(void)
4196{
4197        struct pci_dev *pdev;
4198        uint32_t vtisochctrl;
4199
4200        /* If there's no Azalia in the system anyway, forget it. */
4201        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4202        if (!pdev)
4203                return;
4204        pci_dev_put(pdev);
4205
4206        /* System Management Registers. Might be hidden, in which case
4207           we can't do the sanity check. But that's OK, because the
4208           known-broken BIOSes _don't_ actually hide it, so far. */
4209        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4210        if (!pdev)
4211                return;
4212
4213        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4214                pci_dev_put(pdev);
4215                return;
4216        }
4217
4218        pci_dev_put(pdev);
4219
4220        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4221        if (vtisochctrl & 1)
4222                return;
4223
4224        /* Drop all bits other than the number of TLB entries */
4225        vtisochctrl &= 0x1c;
4226
4227        /* If we have the recommended number of TLB entries (16), fine. */
4228        if (vtisochctrl == 0x10)
4229                return;
4230
4231        /* Zero TLB entries? You get to ride the short bus to school. */
4232        if (!vtisochctrl) {
4233                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4234                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4235                     dmi_get_system_info(DMI_BIOS_VENDOR),
4236                     dmi_get_system_info(DMI_BIOS_VERSION),
4237                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4238                iommu_identity_mapping |= IDENTMAP_AZALIA;
4239                return;
4240        }
4241        
4242        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4243               vtisochctrl);
4244}
4245
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.