linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#define ROOT_SIZE               VTD_PAGE_SIZE
  50#define CONTEXT_SIZE            VTD_PAGE_SIZE
  51
  52#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  53#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  54#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  55
  56#define IOAPIC_RANGE_START      (0xfee00000)
  57#define IOAPIC_RANGE_END        (0xfeefffff)
  58#define IOVA_START_ADDR         (0x1000)
  59
  60#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  61
  62#define MAX_AGAW_WIDTH 64
  63
  64#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  65#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  66
  67/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  68   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  69#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  70                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  71#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
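/*
 * Worked example (added comment, not in the original source): with the
 * default 48-bit guest address width and VTD_PAGE_SHIFT == 12,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xFFFFFFFFF.  On 64-bit
 * kernels that fits in an unsigned long, so DOMAIN_MAX_PFN(48) is the same
 * value and DOMAIN_MAX_ADDR(48) == 0xFFFFFFFFF000.  On 32-bit kernels the
 * min_t() clamps DOMAIN_MAX_PFN to ULONG_MAX (0xFFFFFFFF).
 */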
  72
  73#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  74#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  75#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  76
  77/* page table handling */
  78#define LEVEL_STRIDE            (9)
  79#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  80
  81/*
  82 * This bitmap is used to advertise the page sizes our hardware supports
  83 * to the IOMMU core, which will then use this information to split
  84 * physically contiguous memory regions it is mapping into page sizes
  85 * that we support.
  86 *
  87 * Traditionally the IOMMU core just handed us the mappings directly,
  88 * after making sure the size is an order of a 4KiB page and that the
  89 * mapping has natural alignment.
  90 *
  91 * To retain this behavior, we currently advertise that we support
  92 * all page sizes that are an order of 4KiB.
  93 *
  94 * If at some point we'd like to utilize the IOMMU core's new behavior,
  95 * we could change this to advertise the real page sizes we support.
  96 */
  97#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
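/*
 * Illustration (added comment): ~0xFFFUL clears bits 0-11 and sets every
 * higher bit, i.e. bit 12 (4KiB), bit 13 (8KiB), bit 14 (16KiB) and so on up
 * to BITS_PER_LONG - 1.  That is exactly "every power-of-two size that is a
 * multiple of 4KiB", matching the behaviour described above.
 */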
  98
  99static inline int agaw_to_level(int agaw)
 100{
 101        return agaw + 2;
 102}
 103
 104static inline int agaw_to_width(int agaw)
 105{
 106        return 30 + agaw * LEVEL_STRIDE;
 107}
 108
 109static inline int width_to_agaw(int width)
 110{
 111        return (width - 30) / LEVEL_STRIDE;
 112}
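/*
 * Worked example (added comment): the default domain address width of 48
 * bits gives width_to_agaw(48) == (48 - 30) / 9 == 2, which agaw_to_level()
 * turns into a 4-level page table and agaw_to_width() maps back to
 * 30 + 2 * 9 == 48 bits.  A 39-bit width would give agaw 1 and a 3-level
 * table.
 */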
 113
 114static inline unsigned int level_to_offset_bits(int level)
 115{
 116        return (level - 1) * LEVEL_STRIDE;
 117}
 118
 119static inline int pfn_level_offset(unsigned long pfn, int level)
 120{
 121        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 122}
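/*
 * Worked example (added comment): each level indexes 9 bits of the DMA pfn.
 * For pfn 0x12345, pfn_level_offset(0x12345, 1) == 0x12345 & 0x1ff == 0x145,
 * level 2 gives (0x12345 >> 9) & 0x1ff == 0x91, and level 3 gives
 * (0x12345 >> 18) & 0x1ff == 0.
 */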
 123
 124static inline unsigned long level_mask(int level)
 125{
 126        return -1UL << level_to_offset_bits(level);
 127}
 128
 129static inline unsigned long level_size(int level)
 130{
 131        return 1UL << level_to_offset_bits(level);
 132}
 133
 134static inline unsigned long align_to_level(unsigned long pfn, int level)
 135{
 136        return (pfn + level_size(level) - 1) & level_mask(level);
 137}
 138
 139static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 140{
 141        return  1 << ((lvl - 1) * LEVEL_STRIDE);
 142}
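/*
 * Example values (added comment): lvl_to_nr_pages(1) == 1 (a 4KiB page),
 * lvl_to_nr_pages(2) == 512 (a 2MiB superpage) and
 * lvl_to_nr_pages(3) == 512 * 512 == 262144 (a 1GiB superpage).
 */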
 143
 144/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 145   are never going to work. */
 146static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 147{
 148        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 149}
 150
 151static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 152{
 153        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 154}
 155static inline unsigned long page_to_dma_pfn(struct page *pg)
 156{
 157        return mm_to_dma_pfn(page_to_pfn(pg));
 158}
 159static inline unsigned long virt_to_dma_pfn(void *p)
 160{
 161        return page_to_dma_pfn(virt_to_page(p));
 162}
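/*
 * Note (added comment): on x86 PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so
 * the shifts above are by zero and MM pfns and DMA pfns are numerically the
 * same; the helpers only matter if the MM page size ever exceeds 4KiB.
 */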
 163
 164/* global iommu list, set NULL for ignored DMAR units */
 165static struct intel_iommu **g_iommus;
 166
 167static void __init check_tylersburg_isoch(void);
 168static int rwbf_quirk;
 169
 170/*
 171 * set to 1 to panic kernel if can't successfully enable VT-d
 172 * (used when kernel is launched w/ TXT)
 173 */
 174static int force_on = 0;
 175
 176/*
 177 * 0: Present
 178 * 1-11: Reserved
 179 * 12-63: Context Ptr (12 - (haw-1))
 180 * 64-127: Reserved
 181 */
 182struct root_entry {
 183        u64     val;
 184        u64     rsvd1;
 185};
 186#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
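/*
 * Example (added comment): with a 4KiB root table and a 16-byte root_entry
 * this works out to 4096 / 16 == 256 entries, one per possible PCI bus
 * number.
 */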
 187static inline bool root_present(struct root_entry *root)
 188{
 189        return (root->val & 1);
 190}
 191static inline void set_root_present(struct root_entry *root)
 192{
 193        root->val |= 1;
 194}
 195static inline void set_root_value(struct root_entry *root, unsigned long value)
 196{
 197        root->val |= value & VTD_PAGE_MASK;
 198}
 199
 200static inline struct context_entry *
 201get_context_addr_from_root(struct root_entry *root)
 202{
 203        return (struct context_entry *)
 204                (root_present(root)?phys_to_virt(
 205                root->val & VTD_PAGE_MASK) :
 206                NULL);
 207}
 208
 209/*
 210 * low 64 bits:
 211 * 0: present
 212 * 1: fault processing disable
 213 * 2-3: translation type
 214 * 12-63: address space root
 215 * high 64 bits:
 216 * 0-2: address width
  217 * 3-6: avail
 218 * 8-23: domain id
 219 */
 220struct context_entry {
 221        u64 lo;
 222        u64 hi;
 223};
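/*
 * Illustrative sketch (added comment, not part of the original code): the
 * helpers below are meant to be combined roughly the way
 * domain_context_mapping_one() does later in this file, e.g.:
 *
 *	struct context_entry ce = { 0, 0 };
 *
 *	context_set_domain_id(&ce, 42);			// example domain id
 *	context_set_address_width(&ce, 2);		// AGAW 2, 4-level table
 *	context_set_address_root(&ce, virt_to_phys(pgd));
 *	context_set_translation_type(&ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(&ce);
 *	context_set_present(&ce);			// mark valid last
 */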
 224
 225static inline bool context_present(struct context_entry *context)
 226{
 227        return (context->lo & 1);
 228}
 229static inline void context_set_present(struct context_entry *context)
 230{
 231        context->lo |= 1;
 232}
 233
 234static inline void context_set_fault_enable(struct context_entry *context)
 235{
 236        context->lo &= (((u64)-1) << 2) | 1;
 237}
 238
 239static inline void context_set_translation_type(struct context_entry *context,
 240                                                unsigned long value)
 241{
 242        context->lo &= (((u64)-1) << 4) | 3;
 243        context->lo |= (value & 3) << 2;
 244}
 245
 246static inline void context_set_address_root(struct context_entry *context,
 247                                            unsigned long value)
 248{
 249        context->lo |= value & VTD_PAGE_MASK;
 250}
 251
 252static inline void context_set_address_width(struct context_entry *context,
 253                                             unsigned long value)
 254{
 255        context->hi |= value & 7;
 256}
 257
 258static inline void context_set_domain_id(struct context_entry *context,
 259                                         unsigned long value)
 260{
 261        context->hi |= (value & ((1 << 16) - 1)) << 8;
 262}
 263
 264static inline void context_clear_entry(struct context_entry *context)
 265{
 266        context->lo = 0;
 267        context->hi = 0;
 268}
 269
 270/*
 271 * 0: readable
 272 * 1: writable
 273 * 2-6: reserved
 274 * 7: super page
 275 * 8-10: available
 276 * 11: snoop behavior
  277 * 12-63: Host physical address
 278 */
 279struct dma_pte {
 280        u64 val;
 281};
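/*
 * Example encoding (added comment): mapping DMA pfn 0x12345 read/write gives
 * val == (0x12345ULL << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE
 *     == 0x12345003,
 * i.e. the host physical address in bits 12-63 and the permission bits 0-1
 * set.
 */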
 282
 283static inline void dma_clear_pte(struct dma_pte *pte)
 284{
 285        pte->val = 0;
 286}
 287
 288static inline void dma_set_pte_readable(struct dma_pte *pte)
 289{
 290        pte->val |= DMA_PTE_READ;
 291}
 292
 293static inline void dma_set_pte_writable(struct dma_pte *pte)
 294{
 295        pte->val |= DMA_PTE_WRITE;
 296}
 297
 298static inline void dma_set_pte_snp(struct dma_pte *pte)
 299{
 300        pte->val |= DMA_PTE_SNP;
 301}
 302
 303static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 304{
 305        pte->val = (pte->val & ~3) | (prot & 3);
 306}
 307
 308static inline u64 dma_pte_addr(struct dma_pte *pte)
 309{
 310#ifdef CONFIG_64BIT
 311        return pte->val & VTD_PAGE_MASK;
 312#else
 313        /* Must have a full atomic 64-bit read */
 314        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 315#endif
 316}
 317
 318static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 319{
 320        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 321}
 322
 323static inline bool dma_pte_present(struct dma_pte *pte)
 324{
 325        return (pte->val & 3) != 0;
 326}
 327
 328static inline bool dma_pte_superpage(struct dma_pte *pte)
 329{
 330        return (pte->val & (1 << 7));
 331}
 332
 333static inline int first_pte_in_page(struct dma_pte *pte)
 334{
 335        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 336}
 337
 338/*
 339 * This domain is a statically identity mapping domain.
  340 *      1. This domain creates a static 1:1 mapping to all usable memory.
  341 *      2. It maps to each iommu if successful.
  342 *      3. Each iommu maps to this domain if successful.
 343 */
 344static struct dmar_domain *si_domain;
 345static int hw_pass_through = 1;
 346
 347/* devices under the same p2p bridge are owned in one domain */
 348#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 349
  350/* domain represents a virtual machine; more than one device
  351 * across iommus may be owned by one domain, e.g. a kvm guest.
 352 */
 353#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 354
  355/* si_domain contains multiple devices */
 356#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 357
 358/* define the limit of IOMMUs supported in each domain */
 359#ifdef  CONFIG_X86
 360# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 361#else
 362# define        IOMMU_UNITS_SUPPORTED   64
 363#endif
 364
 365struct dmar_domain {
 366        int     id;                     /* domain id */
 367        int     nid;                    /* node id */
 368        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
 369                                        /* bitmap of iommus this domain uses*/
 370
 371        struct list_head devices;       /* all devices' list */
 372        struct iova_domain iovad;       /* iova's that belong to this domain */
 373
 374        struct dma_pte  *pgd;           /* virtual address */
 375        int             gaw;            /* max guest address width */
 376
 377        /* adjusted guest address width, 0 is level 2 30-bit */
 378        int             agaw;
 379
 380        int             flags;          /* flags to find out type of domain */
 381
 382        int             iommu_coherency;/* indicate coherency of iommu access */
 383        int             iommu_snooping; /* indicate snooping control feature*/
 384        int             iommu_count;    /* reference count of iommu */
 385        int             iommu_superpage;/* Level of superpages supported:
 386                                           0 == 4KiB (no superpages), 1 == 2MiB,
 387                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 388        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 389        u64             max_addr;       /* maximum mapped address */
 390};
 391
 392/* PCI domain-device relationship */
 393struct device_domain_info {
 394        struct list_head link;  /* link to domain siblings */
 395        struct list_head global; /* link to global list */
 396        int segment;            /* PCI domain */
 397        u8 bus;                 /* PCI bus number */
 398        u8 devfn;               /* PCI devfn number */
 399        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 400        struct intel_iommu *iommu; /* IOMMU used by this device */
 401        struct dmar_domain *domain; /* pointer to domain */
 402};
 403
 404static void flush_unmaps_timeout(unsigned long data);
 405
 406DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 407
 408#define HIGH_WATER_MARK 250
 409struct deferred_flush_tables {
 410        int next;
 411        struct iova *iova[HIGH_WATER_MARK];
 412        struct dmar_domain *domain[HIGH_WATER_MARK];
 413};
 414
 415static struct deferred_flush_tables *deferred_flush;
 416
 417/* bitmap for indexing intel_iommus */
 418static int g_num_of_iommus;
 419
 420static DEFINE_SPINLOCK(async_umap_flush_lock);
 421static LIST_HEAD(unmaps_to_do);
 422
 423static int timer_on;
 424static long list_size;
 425
 426static void domain_remove_dev_info(struct dmar_domain *domain);
 427
 428#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 429int dmar_disabled = 0;
 430#else
 431int dmar_disabled = 1;
 432#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 433
 434int intel_iommu_enabled = 0;
 435EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 436
 437static int dmar_map_gfx = 1;
 438static int dmar_forcedac;
 439static int intel_iommu_strict;
 440static int intel_iommu_superpage = 1;
 441
 442int intel_iommu_gfx_mapped;
 443EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 444
 445#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 446static DEFINE_SPINLOCK(device_domain_lock);
 447static LIST_HEAD(device_domain_list);
 448
 449static struct iommu_ops intel_iommu_ops;
 450
 451static int __init intel_iommu_setup(char *str)
 452{
 453        if (!str)
 454                return -EINVAL;
 455        while (*str) {
 456                if (!strncmp(str, "on", 2)) {
 457                        dmar_disabled = 0;
 458                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 459                } else if (!strncmp(str, "off", 3)) {
 460                        dmar_disabled = 1;
 461                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 462                } else if (!strncmp(str, "igfx_off", 8)) {
 463                        dmar_map_gfx = 0;
 464                        printk(KERN_INFO
 465                                "Intel-IOMMU: disable GFX device mapping\n");
 466                } else if (!strncmp(str, "forcedac", 8)) {
 467                        printk(KERN_INFO
 468                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 469                        dmar_forcedac = 1;
 470                } else if (!strncmp(str, "strict", 6)) {
 471                        printk(KERN_INFO
 472                                "Intel-IOMMU: disable batched IOTLB flush\n");
 473                        intel_iommu_strict = 1;
 474                } else if (!strncmp(str, "sp_off", 6)) {
 475                        printk(KERN_INFO
 476                                "Intel-IOMMU: disable supported super page\n");
 477                        intel_iommu_superpage = 0;
 478                }
 479
 480                str += strcspn(str, ",");
 481                while (*str == ',')
 482                        str++;
 483        }
 484        return 0;
 485}
 486__setup("intel_iommu=", intel_iommu_setup);
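/*
 * Usage note (added comment): the options parsed above combine on the kernel
 * command line as a comma-separated list, for example
 * "intel_iommu=on,strict,sp_off" or "intel_iommu=igfx_off,forcedac".
 */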
 487
 488static struct kmem_cache *iommu_domain_cache;
 489static struct kmem_cache *iommu_devinfo_cache;
 490static struct kmem_cache *iommu_iova_cache;
 491
 492static inline void *alloc_pgtable_page(int node)
 493{
 494        struct page *page;
 495        void *vaddr = NULL;
 496
 497        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 498        if (page)
 499                vaddr = page_address(page);
 500        return vaddr;
 501}
 502
 503static inline void free_pgtable_page(void *vaddr)
 504{
 505        free_page((unsigned long)vaddr);
 506}
 507
 508static inline void *alloc_domain_mem(void)
 509{
 510        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 511}
 512
 513static void free_domain_mem(void *vaddr)
 514{
 515        kmem_cache_free(iommu_domain_cache, vaddr);
 516}
 517
 518static inline void * alloc_devinfo_mem(void)
 519{
 520        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 521}
 522
 523static inline void free_devinfo_mem(void *vaddr)
 524{
 525        kmem_cache_free(iommu_devinfo_cache, vaddr);
 526}
 527
 528struct iova *alloc_iova_mem(void)
 529{
 530        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 531}
 532
 533void free_iova_mem(struct iova *iova)
 534{
 535        kmem_cache_free(iommu_iova_cache, iova);
 536}
 537
 538
 539static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 540{
 541        unsigned long sagaw;
 542        int agaw = -1;
 543
 544        sagaw = cap_sagaw(iommu->cap);
 545        for (agaw = width_to_agaw(max_gaw);
 546             agaw >= 0; agaw--) {
 547                if (test_bit(agaw, &sagaw))
 548                        break;
 549        }
 550
 551        return agaw;
 552}
 553
 554/*
 555 * Calculate max SAGAW for each iommu.
 556 */
 557int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 558{
 559        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 560}
 561
 562/*
 563 * calculate agaw for each iommu.
 564 * "SAGAW" may be different across iommus, use a default agaw, and
 565 * get a supported less agaw for iommus that don't support the default agaw.
 566 */
 567int iommu_calculate_agaw(struct intel_iommu *iommu)
 568{
 569        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 570}
 571
  572/* This function only returns a single iommu in a domain */
 573static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 574{
 575        int iommu_id;
 576
 577        /* si_domain and vm domain should not get here. */
 578        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 579        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 580
 581        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 582        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 583                return NULL;
 584
 585        return g_iommus[iommu_id];
 586}
 587
 588static void domain_update_iommu_coherency(struct dmar_domain *domain)
 589{
 590        int i;
 591
 592        i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 593
 594        domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
 595
 596        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 597                if (!ecap_coherent(g_iommus[i]->ecap)) {
 598                        domain->iommu_coherency = 0;
 599                        break;
 600                }
 601        }
 602}
 603
 604static void domain_update_iommu_snooping(struct dmar_domain *domain)
 605{
 606        int i;
 607
 608        domain->iommu_snooping = 1;
 609
 610        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 611                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 612                        domain->iommu_snooping = 0;
 613                        break;
 614                }
 615        }
 616}
 617
 618static void domain_update_iommu_superpage(struct dmar_domain *domain)
 619{
 620        struct dmar_drhd_unit *drhd;
 621        struct intel_iommu *iommu = NULL;
 622        int mask = 0xf;
 623
 624        if (!intel_iommu_superpage) {
 625                domain->iommu_superpage = 0;
 626                return;
 627        }
 628
 629        /* set iommu_superpage to the smallest common denominator */
 630        for_each_active_iommu(iommu, drhd) {
 631                mask &= cap_super_page_val(iommu->cap);
 632                if (!mask) {
 633                        break;
 634                }
 635        }
 636        domain->iommu_superpage = fls(mask);
 637}
 638
 639/* Some capabilities may be different across iommus */
 640static void domain_update_iommu_cap(struct dmar_domain *domain)
 641{
 642        domain_update_iommu_coherency(domain);
 643        domain_update_iommu_snooping(domain);
 644        domain_update_iommu_superpage(domain);
 645}
 646
 647static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 648{
 649        struct dmar_drhd_unit *drhd = NULL;
 650        int i;
 651
 652        for_each_drhd_unit(drhd) {
 653                if (drhd->ignored)
 654                        continue;
 655                if (segment != drhd->segment)
 656                        continue;
 657
 658                for (i = 0; i < drhd->devices_cnt; i++) {
 659                        if (drhd->devices[i] &&
 660                            drhd->devices[i]->bus->number == bus &&
 661                            drhd->devices[i]->devfn == devfn)
 662                                return drhd->iommu;
 663                        if (drhd->devices[i] &&
 664                            drhd->devices[i]->subordinate &&
 665                            drhd->devices[i]->subordinate->number <= bus &&
 666                            drhd->devices[i]->subordinate->busn_res.end >= bus)
 667                                return drhd->iommu;
 668                }
 669
 670                if (drhd->include_all)
 671                        return drhd->iommu;
 672        }
 673
 674        return NULL;
 675}
 676
 677static void domain_flush_cache(struct dmar_domain *domain,
 678                               void *addr, int size)
 679{
 680        if (!domain->iommu_coherency)
 681                clflush_cache_range(addr, size);
 682}
 683
 684/* Gets context entry for a given bus and devfn */
 685static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 686                u8 bus, u8 devfn)
 687{
 688        struct root_entry *root;
 689        struct context_entry *context;
 690        unsigned long phy_addr;
 691        unsigned long flags;
 692
 693        spin_lock_irqsave(&iommu->lock, flags);
 694        root = &iommu->root_entry[bus];
 695        context = get_context_addr_from_root(root);
 696        if (!context) {
 697                context = (struct context_entry *)
 698                                alloc_pgtable_page(iommu->node);
 699                if (!context) {
 700                        spin_unlock_irqrestore(&iommu->lock, flags);
 701                        return NULL;
 702                }
 703                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 704                phy_addr = virt_to_phys((void *)context);
 705                set_root_value(root, phy_addr);
 706                set_root_present(root);
 707                __iommu_flush_cache(iommu, root, sizeof(*root));
 708        }
 709        spin_unlock_irqrestore(&iommu->lock, flags);
 710        return &context[devfn];
 711}
 712
 713static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 714{
 715        struct root_entry *root;
 716        struct context_entry *context;
 717        int ret;
 718        unsigned long flags;
 719
 720        spin_lock_irqsave(&iommu->lock, flags);
 721        root = &iommu->root_entry[bus];
 722        context = get_context_addr_from_root(root);
 723        if (!context) {
 724                ret = 0;
 725                goto out;
 726        }
 727        ret = context_present(&context[devfn]);
 728out:
 729        spin_unlock_irqrestore(&iommu->lock, flags);
 730        return ret;
 731}
 732
 733static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 734{
 735        struct root_entry *root;
 736        struct context_entry *context;
 737        unsigned long flags;
 738
 739        spin_lock_irqsave(&iommu->lock, flags);
 740        root = &iommu->root_entry[bus];
 741        context = get_context_addr_from_root(root);
 742        if (context) {
 743                context_clear_entry(&context[devfn]);
 744                __iommu_flush_cache(iommu, &context[devfn], \
 745                        sizeof(*context));
 746        }
 747        spin_unlock_irqrestore(&iommu->lock, flags);
 748}
 749
 750static void free_context_table(struct intel_iommu *iommu)
 751{
 752        struct root_entry *root;
 753        int i;
 754        unsigned long flags;
 755        struct context_entry *context;
 756
 757        spin_lock_irqsave(&iommu->lock, flags);
 758        if (!iommu->root_entry) {
 759                goto out;
 760        }
 761        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 762                root = &iommu->root_entry[i];
 763                context = get_context_addr_from_root(root);
 764                if (context)
 765                        free_pgtable_page(context);
 766        }
 767        free_pgtable_page(iommu->root_entry);
 768        iommu->root_entry = NULL;
 769out:
 770        spin_unlock_irqrestore(&iommu->lock, flags);
 771}
 772
 773static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 774                                      unsigned long pfn, int target_level)
 775{
 776        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 777        struct dma_pte *parent, *pte = NULL;
 778        int level = agaw_to_level(domain->agaw);
 779        int offset;
 780
 781        BUG_ON(!domain->pgd);
 782        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 783        parent = domain->pgd;
 784
 785        while (level > 0) {
 786                void *tmp_page;
 787
 788                offset = pfn_level_offset(pfn, level);
 789                pte = &parent[offset];
 790                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 791                        break;
 792                if (level == target_level)
 793                        break;
 794
 795                if (!dma_pte_present(pte)) {
 796                        uint64_t pteval;
 797
 798                        tmp_page = alloc_pgtable_page(domain->nid);
 799
 800                        if (!tmp_page)
 801                                return NULL;
 802
 803                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 804                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 805                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 806                                /* Someone else set it while we were thinking; use theirs. */
 807                                free_pgtable_page(tmp_page);
 808                        } else {
 809                                dma_pte_addr(pte);
 810                                domain_flush_cache(domain, pte, sizeof(*pte));
 811                        }
 812                }
 813                parent = phys_to_virt(dma_pte_addr(pte));
 814                level--;
 815        }
 816
 817        return pte;
 818}
 819
 820
 821/* return address's pte at specific level */
 822static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 823                                         unsigned long pfn,
 824                                         int level, int *large_page)
 825{
 826        struct dma_pte *parent, *pte = NULL;
 827        int total = agaw_to_level(domain->agaw);
 828        int offset;
 829
 830        parent = domain->pgd;
 831        while (level <= total) {
 832                offset = pfn_level_offset(pfn, total);
 833                pte = &parent[offset];
 834                if (level == total)
 835                        return pte;
 836
 837                if (!dma_pte_present(pte)) {
 838                        *large_page = total;
 839                        break;
 840                }
 841
 842                if (pte->val & DMA_PTE_LARGE_PAGE) {
 843                        *large_page = total;
 844                        return pte;
 845                }
 846
 847                parent = phys_to_virt(dma_pte_addr(pte));
 848                total--;
 849        }
 850        return NULL;
 851}
 852
 853/* clear last level pte, a tlb flush should be followed */
 854static int dma_pte_clear_range(struct dmar_domain *domain,
 855                                unsigned long start_pfn,
 856                                unsigned long last_pfn)
 857{
 858        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 859        unsigned int large_page = 1;
 860        struct dma_pte *first_pte, *pte;
 861        int order;
 862
 863        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 864        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 865        BUG_ON(start_pfn > last_pfn);
 866
 867        /* we don't need lock here; nobody else touches the iova range */
 868        do {
 869                large_page = 1;
 870                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 871                if (!pte) {
 872                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 873                        continue;
 874                }
 875                do {
 876                        dma_clear_pte(pte);
 877                        start_pfn += lvl_to_nr_pages(large_page);
 878                        pte++;
 879                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 880
 881                domain_flush_cache(domain, first_pte,
 882                                   (void *)pte - (void *)first_pte);
 883
 884        } while (start_pfn && start_pfn <= last_pfn);
 885
 886        order = (large_page - 1) * 9;
 887        return order;
 888}
 889
 890/* free page table pages. last level pte should already be cleared */
 891static void dma_pte_free_pagetable(struct dmar_domain *domain,
 892                                   unsigned long start_pfn,
 893                                   unsigned long last_pfn)
 894{
 895        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 896        struct dma_pte *first_pte, *pte;
 897        int total = agaw_to_level(domain->agaw);
 898        int level;
 899        unsigned long tmp;
 900        int large_page = 2;
 901
 902        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 903        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 904        BUG_ON(start_pfn > last_pfn);
 905
 906        /* We don't need lock here; nobody else touches the iova range */
 907        level = 2;
 908        while (level <= total) {
 909                tmp = align_to_level(start_pfn, level);
 910
 911                /* If we can't even clear one PTE at this level, we're done */
 912                if (tmp + level_size(level) - 1 > last_pfn)
 913                        return;
 914
 915                do {
 916                        large_page = level;
 917                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
 918                        if (large_page > level)
 919                                level = large_page + 1;
 920                        if (!pte) {
 921                                tmp = align_to_level(tmp + 1, level + 1);
 922                                continue;
 923                        }
 924                        do {
 925                                if (dma_pte_present(pte)) {
 926                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 927                                        dma_clear_pte(pte);
 928                                }
 929                                pte++;
 930                                tmp += level_size(level);
 931                        } while (!first_pte_in_page(pte) &&
 932                                 tmp + level_size(level) - 1 <= last_pfn);
 933
 934                        domain_flush_cache(domain, first_pte,
 935                                           (void *)pte - (void *)first_pte);
 936                        
 937                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 938                level++;
 939        }
 940        /* free pgd */
 941        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 942                free_pgtable_page(domain->pgd);
 943                domain->pgd = NULL;
 944        }
 945}
 946
 947/* iommu handling */
 948static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 949{
 950        struct root_entry *root;
 951        unsigned long flags;
 952
 953        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 954        if (!root)
 955                return -ENOMEM;
 956
 957        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 958
 959        spin_lock_irqsave(&iommu->lock, flags);
 960        iommu->root_entry = root;
 961        spin_unlock_irqrestore(&iommu->lock, flags);
 962
 963        return 0;
 964}
 965
 966static void iommu_set_root_entry(struct intel_iommu *iommu)
 967{
 968        void *addr;
 969        u32 sts;
 970        unsigned long flag;
 971
 972        addr = iommu->root_entry;
 973
 974        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 975        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 976
 977        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 978
 979        /* Make sure hardware completes it */
 980        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 981                      readl, (sts & DMA_GSTS_RTPS), sts);
 982
 983        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 984}
 985
 986static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 987{
 988        u32 val;
 989        unsigned long flag;
 990
 991        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 992                return;
 993
 994        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 995        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 996
 997        /* Make sure hardware completes it */
 998        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 999                      readl, (!(val & DMA_GSTS_WBFS)), val);
1000
1001        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1002}
1003
1004/* return value determines if we need a write buffer flush */
1005static void __iommu_flush_context(struct intel_iommu *iommu,
1006                                  u16 did, u16 source_id, u8 function_mask,
1007                                  u64 type)
1008{
1009        u64 val = 0;
1010        unsigned long flag;
1011
1012        switch (type) {
1013        case DMA_CCMD_GLOBAL_INVL:
1014                val = DMA_CCMD_GLOBAL_INVL;
1015                break;
1016        case DMA_CCMD_DOMAIN_INVL:
1017                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1018                break;
1019        case DMA_CCMD_DEVICE_INVL:
1020                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1021                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1022                break;
1023        default:
1024                BUG();
1025        }
1026        val |= DMA_CCMD_ICC;
1027
1028        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1029        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1030
1031        /* Make sure hardware completes it */
1032        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1033                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1034
1035        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1036}
1037
1038/* return value determines if we need a write buffer flush */
1039static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1040                                u64 addr, unsigned int size_order, u64 type)
1041{
1042        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1043        u64 val = 0, val_iva = 0;
1044        unsigned long flag;
1045
1046        switch (type) {
1047        case DMA_TLB_GLOBAL_FLUSH:
1048                /* global flush doesn't need to set IVA_REG */
1049                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1050                break;
1051        case DMA_TLB_DSI_FLUSH:
1052                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053                break;
1054        case DMA_TLB_PSI_FLUSH:
1055                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056                /* Note: always flush non-leaf currently */
1057                val_iva = size_order | addr;
1058                break;
1059        default:
1060                BUG();
1061        }
1062        /* Note: set drain read/write */
1063#if 0
1064        /*
1065         * This is probably meant to be extra safe. Looks like we can
1066         * ignore it without any impact.
1067         */
1068        if (cap_read_drain(iommu->cap))
1069                val |= DMA_TLB_READ_DRAIN;
1070#endif
1071        if (cap_write_drain(iommu->cap))
1072                val |= DMA_TLB_WRITE_DRAIN;
1073
1074        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1075        /* Note: Only uses first TLB reg currently */
1076        if (val_iva)
1077                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1078        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1079
1080        /* Make sure hardware completes it */
1081        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1082                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1083
1084        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085
1086        /* check IOTLB invalidation granularity */
1087        if (DMA_TLB_IAIG(val) == 0)
1088                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1089        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1090                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1091                        (unsigned long long)DMA_TLB_IIRG(type),
1092                        (unsigned long long)DMA_TLB_IAIG(val));
1093}
1094
1095static struct device_domain_info *iommu_support_dev_iotlb(
1096        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1097{
1098        int found = 0;
1099        unsigned long flags;
1100        struct device_domain_info *info;
1101        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1102
1103        if (!ecap_dev_iotlb_support(iommu->ecap))
1104                return NULL;
1105
1106        if (!iommu->qi)
1107                return NULL;
1108
1109        spin_lock_irqsave(&device_domain_lock, flags);
1110        list_for_each_entry(info, &domain->devices, link)
1111                if (info->bus == bus && info->devfn == devfn) {
1112                        found = 1;
1113                        break;
1114                }
1115        spin_unlock_irqrestore(&device_domain_lock, flags);
1116
1117        if (!found || !info->dev)
1118                return NULL;
1119
1120        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1121                return NULL;
1122
1123        if (!dmar_find_matched_atsr_unit(info->dev))
1124                return NULL;
1125
1126        info->iommu = iommu;
1127
1128        return info;
1129}
1130
1131static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1132{
1133        if (!info)
1134                return;
1135
1136        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1137}
1138
1139static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1140{
1141        if (!info->dev || !pci_ats_enabled(info->dev))
1142                return;
1143
1144        pci_disable_ats(info->dev);
1145}
1146
1147static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1148                                  u64 addr, unsigned mask)
1149{
1150        u16 sid, qdep;
1151        unsigned long flags;
1152        struct device_domain_info *info;
1153
1154        spin_lock_irqsave(&device_domain_lock, flags);
1155        list_for_each_entry(info, &domain->devices, link) {
1156                if (!info->dev || !pci_ats_enabled(info->dev))
1157                        continue;
1158
1159                sid = info->bus << 8 | info->devfn;
1160                qdep = pci_ats_queue_depth(info->dev);
1161                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1162        }
1163        spin_unlock_irqrestore(&device_domain_lock, flags);
1164}
1165
1166static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1167                                  unsigned long pfn, unsigned int pages, int map)
1168{
1169        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1170        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1171
1172        BUG_ON(pages == 0);
1173
1174        /*
1175         * Fallback to domain selective flush if no PSI support or the size is
1176         * too big.
1177         * PSI requires page size to be 2 ^ x, and the base address is naturally
1178         * aligned to the size
1179         */
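        /*
         * Worked example (added comment): pages == 5 rounds up to 8, so
         * mask == 3 and the PSI flush covers 8 pages naturally aligned at
         * addr; if mask were larger than cap_max_amask_val() we would fall
         * back to the domain selective flush below.
         */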
1180        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1181                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1182                                                DMA_TLB_DSI_FLUSH);
1183        else
1184                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1185                                                DMA_TLB_PSI_FLUSH);
1186
1187        /*
1188         * In caching mode, changes of pages from non-present to present require
1189         * flush. However, device IOTLB doesn't need to be flushed in this case.
1190         */
1191        if (!cap_caching_mode(iommu->cap) || !map)
1192                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1193}
1194
1195static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1196{
1197        u32 pmen;
1198        unsigned long flags;
1199
1200        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1201        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1202        pmen &= ~DMA_PMEN_EPM;
1203        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1204
1205        /* wait for the protected region status bit to clear */
1206        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1207                readl, !(pmen & DMA_PMEN_PRS), pmen);
1208
1209        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1210}
1211
1212static int iommu_enable_translation(struct intel_iommu *iommu)
1213{
1214        u32 sts;
1215        unsigned long flags;
1216
1217        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218        iommu->gcmd |= DMA_GCMD_TE;
1219        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1220
1221        /* Make sure hardware completes it */
1222        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                      readl, (sts & DMA_GSTS_TES), sts);
1224
1225        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226        return 0;
1227}
1228
1229static int iommu_disable_translation(struct intel_iommu *iommu)
1230{
1231        u32 sts;
1232        unsigned long flag;
1233
1234        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1235        iommu->gcmd &= ~DMA_GCMD_TE;
1236        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
1238        /* Make sure hardware completes it */
1239        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240                      readl, (!(sts & DMA_GSTS_TES)), sts);
1241
1242        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243        return 0;
1244}
1245
1246
1247static int iommu_init_domains(struct intel_iommu *iommu)
1248{
1249        unsigned long ndomains;
1250        unsigned long nlongs;
1251
1252        ndomains = cap_ndoms(iommu->cap);
1253        pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1254                        ndomains);
1255        nlongs = BITS_TO_LONGS(ndomains);
1256
1257        spin_lock_init(&iommu->lock);
1258
1259        /* TBD: there might be 64K domains,
1260         * consider other allocation for future chip
1261         */
1262        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1263        if (!iommu->domain_ids) {
1264                printk(KERN_ERR "Allocating domain id array failed\n");
1265                return -ENOMEM;
1266        }
1267        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1268                        GFP_KERNEL);
1269        if (!iommu->domains) {
1270                printk(KERN_ERR "Allocating domain array failed\n");
1271                return -ENOMEM;
1272        }
1273
1274        /*
1275         * if Caching mode is set, then invalid translations are tagged
1276         * with domainid 0. Hence we need to pre-allocate it.
1277         */
1278        if (cap_caching_mode(iommu->cap))
1279                set_bit(0, iommu->domain_ids);
1280        return 0;
1281}
1282
1283
1284static void domain_exit(struct dmar_domain *domain);
1285static void vm_domain_exit(struct dmar_domain *domain);
1286
1287void free_dmar_iommu(struct intel_iommu *iommu)
1288{
1289        struct dmar_domain *domain;
1290        int i;
1291        unsigned long flags;
1292
1293        if ((iommu->domains) && (iommu->domain_ids)) {
1294                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1295                        domain = iommu->domains[i];
1296                        clear_bit(i, iommu->domain_ids);
1297
1298                        spin_lock_irqsave(&domain->iommu_lock, flags);
1299                        if (--domain->iommu_count == 0) {
1300                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1301                                        vm_domain_exit(domain);
1302                                else
1303                                        domain_exit(domain);
1304                        }
1305                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1306                }
1307        }
1308
1309        if (iommu->gcmd & DMA_GCMD_TE)
1310                iommu_disable_translation(iommu);
1311
1312        if (iommu->irq) {
1313                irq_set_handler_data(iommu->irq, NULL);
1314                /* This will mask the irq */
1315                free_irq(iommu->irq, iommu);
1316                destroy_irq(iommu->irq);
1317        }
1318
1319        kfree(iommu->domains);
1320        kfree(iommu->domain_ids);
1321
1322        g_iommus[iommu->seq_id] = NULL;
1323
1324        /* if all iommus are freed, free g_iommus */
1325        for (i = 0; i < g_num_of_iommus; i++) {
1326                if (g_iommus[i])
1327                        break;
1328        }
1329
1330        if (i == g_num_of_iommus)
1331                kfree(g_iommus);
1332
1333        /* free context mapping */
1334        free_context_table(iommu);
1335}
1336
1337static struct dmar_domain *alloc_domain(void)
1338{
1339        struct dmar_domain *domain;
1340
1341        domain = alloc_domain_mem();
1342        if (!domain)
1343                return NULL;
1344
1345        domain->nid = -1;
1346        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1347        domain->flags = 0;
1348
1349        return domain;
1350}
1351
1352static int iommu_attach_domain(struct dmar_domain *domain,
1353                               struct intel_iommu *iommu)
1354{
1355        int num;
1356        unsigned long ndomains;
1357        unsigned long flags;
1358
1359        ndomains = cap_ndoms(iommu->cap);
1360
1361        spin_lock_irqsave(&iommu->lock, flags);
1362
1363        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1364        if (num >= ndomains) {
1365                spin_unlock_irqrestore(&iommu->lock, flags);
1366                printk(KERN_ERR "IOMMU: no free domain ids\n");
1367                return -ENOMEM;
1368        }
1369
1370        domain->id = num;
1371        set_bit(num, iommu->domain_ids);
1372        set_bit(iommu->seq_id, domain->iommu_bmp);
1373        iommu->domains[num] = domain;
1374        spin_unlock_irqrestore(&iommu->lock, flags);
1375
1376        return 0;
1377}
1378
1379static void iommu_detach_domain(struct dmar_domain *domain,
1380                                struct intel_iommu *iommu)
1381{
1382        unsigned long flags;
1383        int num, ndomains;
1384        int found = 0;
1385
1386        spin_lock_irqsave(&iommu->lock, flags);
1387        ndomains = cap_ndoms(iommu->cap);
1388        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1389                if (iommu->domains[num] == domain) {
1390                        found = 1;
1391                        break;
1392                }
1393        }
1394
1395        if (found) {
1396                clear_bit(num, iommu->domain_ids);
1397                clear_bit(iommu->seq_id, domain->iommu_bmp);
1398                iommu->domains[num] = NULL;
1399        }
1400        spin_unlock_irqrestore(&iommu->lock, flags);
1401}
1402
1403static struct iova_domain reserved_iova_list;
1404static struct lock_class_key reserved_rbtree_key;
1405
1406static int dmar_init_reserved_ranges(void)
1407{
1408        struct pci_dev *pdev = NULL;
1409        struct iova *iova;
1410        int i;
1411
1412        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1413
1414        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1415                &reserved_rbtree_key);
1416
1417        /* IOAPIC ranges shouldn't be accessed by DMA */
1418        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1419                IOVA_PFN(IOAPIC_RANGE_END));
1420        if (!iova) {
1421                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1422                return -ENODEV;
1423        }
1424
1425        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1426        for_each_pci_dev(pdev) {
1427                struct resource *r;
1428
1429                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1430                        r = &pdev->resource[i];
1431                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1432                                continue;
1433                        iova = reserve_iova(&reserved_iova_list,
1434                                            IOVA_PFN(r->start),
1435                                            IOVA_PFN(r->end));
1436                        if (!iova) {
1437                                printk(KERN_ERR "Reserve iova failed\n");
1438                                return -ENODEV;
1439                        }
1440                }
1441        }
1442        return 0;
1443}
1444
1445static void domain_reserve_special_ranges(struct dmar_domain *domain)
1446{
1447        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1448}
1449
1450static inline int guestwidth_to_adjustwidth(int gaw)
1451{
1452        int agaw;
1453        int r = (gaw - 12) % 9;
1454
1455        if (r == 0)
1456                agaw = gaw;
1457        else
1458                agaw = gaw + 9 - r;
1459        if (agaw > 64)
1460                agaw = 64;
1461        return agaw;
1462}
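/*
 * Worked examples (added comment): a guest width of 48 gives r == 0 and the
 * returned width stays 48; a guest width of 36 gives r == (36 - 12) % 9 == 6,
 * so the result is 36 + 9 - 6 == 39; anything that rounds up past 64 is
 * clamped to 64.
 */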
1463
1464static int domain_init(struct dmar_domain *domain, int guest_width)
1465{
1466        struct intel_iommu *iommu;
1467        int adjust_width, agaw;
1468        unsigned long sagaw;
1469
1470        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1471        spin_lock_init(&domain->iommu_lock);
1472
1473        domain_reserve_special_ranges(domain);
1474
1475        /* calculate AGAW */
1476        iommu = domain_get_iommu(domain);
1477        if (guest_width > cap_mgaw(iommu->cap))
1478                guest_width = cap_mgaw(iommu->cap);
1479        domain->gaw = guest_width;
1480        adjust_width = guestwidth_to_adjustwidth(guest_width);
1481        agaw = width_to_agaw(adjust_width);
1482        sagaw = cap_sagaw(iommu->cap);
1483        if (!test_bit(agaw, &sagaw)) {
1484                /* hardware doesn't support it, choose a bigger one */
1485                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1486                agaw = find_next_bit(&sagaw, 5, agaw);
1487                if (agaw >= 5)
1488                        return -ENODEV;
1489        }
1490        domain->agaw = agaw;
1491        INIT_LIST_HEAD(&domain->devices);
1492
1493        if (ecap_coherent(iommu->ecap))
1494                domain->iommu_coherency = 1;
1495        else
1496                domain->iommu_coherency = 0;
1497
1498        if (ecap_sc_support(iommu->ecap))
1499                domain->iommu_snooping = 1;
1500        else
1501                domain->iommu_snooping = 0;
1502
1503        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1504        domain->iommu_count = 1;
1505        domain->nid = iommu->node;
1506
1507        /* always allocate the top pgd */
1508        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1509        if (!domain->pgd)
1510                return -ENOMEM;
1511        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1512        return 0;
1513}
1514
1515static void domain_exit(struct dmar_domain *domain)
1516{
1517        struct dmar_drhd_unit *drhd;
1518        struct intel_iommu *iommu;
1519
1520        /* Domain 0 is reserved, so don't process it */
1521        if (!domain)
1522                return;
1523
1524        /* Flush any lazy unmaps that may reference this domain */
1525        if (!intel_iommu_strict)
1526                flush_unmaps_timeout(0);
1527
1528        domain_remove_dev_info(domain);
1529        /* destroy iovas */
1530        put_iova_domain(&domain->iovad);
1531
1532        /* clear ptes */
1533        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534
1535        /* free page tables */
1536        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537
1538        for_each_active_iommu(iommu, drhd)
1539                if (test_bit(iommu->seq_id, domain->iommu_bmp))
1540                        iommu_detach_domain(domain, iommu);
1541
1542        free_domain_mem(domain);
1543}
1544
1545static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1546                                 u8 bus, u8 devfn, int translation)
1547{
1548        struct context_entry *context;
1549        unsigned long flags;
1550        struct intel_iommu *iommu;
1551        struct dma_pte *pgd;
1552        unsigned long num;
1553        unsigned long ndomains;
1554        int id;
1555        int agaw;
1556        struct device_domain_info *info = NULL;
1557
1558        pr_debug("Set context mapping for %02x:%02x.%d\n",
1559                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560
1561        BUG_ON(!domain->pgd);
1562        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1563               translation != CONTEXT_TT_MULTI_LEVEL);
1564
1565        iommu = device_to_iommu(segment, bus, devfn);
1566        if (!iommu)
1567                return -ENODEV;
1568
1569        context = device_to_context_entry(iommu, bus, devfn);
1570        if (!context)
1571                return -ENOMEM;
1572        spin_lock_irqsave(&iommu->lock, flags);
1573        if (context_present(context)) {
1574                spin_unlock_irqrestore(&iommu->lock, flags);
1575                return 0;
1576        }
1577
1578        id = domain->id;
1579        pgd = domain->pgd;
1580
1581        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1582            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1583                int found = 0;
1584
1585                /* find an available domain id for this device in iommu */
1586                ndomains = cap_ndoms(iommu->cap);
1587                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1588                        if (iommu->domains[num] == domain) {
1589                                id = num;
1590                                found = 1;
1591                                break;
1592                        }
1593                }
1594
1595                if (found == 0) {
1596                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1597                        if (num >= ndomains) {
1598                                spin_unlock_irqrestore(&iommu->lock, flags);
1599                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1600                                return -EFAULT;
1601                        }
1602
1603                        set_bit(num, iommu->domain_ids);
1604                        iommu->domains[num] = domain;
1605                        id = num;
1606                }
1607
1608                /* Skip top levels of page tables for an iommu
1609                 * whose agaw is less than the default.
1610                 * Unnecessary for pass-through (PT) mode.
1611                 */
1612                if (translation != CONTEXT_TT_PASS_THROUGH) {
1613                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1614                                pgd = phys_to_virt(dma_pte_addr(pgd));
1615                                if (!dma_pte_present(pgd)) {
1616                                        spin_unlock_irqrestore(&iommu->lock, flags);
1617                                        return -ENOMEM;
1618                                }
1619                        }
1620                }
1621        }
1622
1623        context_set_domain_id(context, id);
1624
1625        if (translation != CONTEXT_TT_PASS_THROUGH) {
1626                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1627                translation = info ? CONTEXT_TT_DEV_IOTLB :
1628                                     CONTEXT_TT_MULTI_LEVEL;
1629        }
1630        /*
1631         * In pass through mode, AW must be programmed to indicate the largest
1632         * AGAW value supported by hardware. And ASR is ignored by hardware.
1633         */
1634        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1635                context_set_address_width(context, iommu->msagaw);
1636        else {
1637                context_set_address_root(context, virt_to_phys(pgd));
1638                context_set_address_width(context, iommu->agaw);
1639        }
1640
1641        context_set_translation_type(context, translation);
1642        context_set_fault_enable(context);
1643        context_set_present(context);
1644        domain_flush_cache(domain, context, sizeof(*context));
1645
1646        /*
1647         * It's a non-present to present mapping. If hardware doesn't cache
1648         * non-present entries, we only need to flush the write-buffer. If it
1649         * _does_ cache non-present entries, then it does so in the special
1650         * domain #0, which we have to flush:
1651         */
1652        if (cap_caching_mode(iommu->cap)) {
1653                iommu->flush.flush_context(iommu, 0,
1654                                           (((u16)bus) << 8) | devfn,
1655                                           DMA_CCMD_MASK_NOBIT,
1656                                           DMA_CCMD_DEVICE_INVL);
1657                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1658        } else {
1659                iommu_flush_write_buffer(iommu);
1660        }
1661        iommu_enable_dev_iotlb(info);
1662        spin_unlock_irqrestore(&iommu->lock, flags);
1663
1664        spin_lock_irqsave(&domain->iommu_lock, flags);
1665        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1666                domain->iommu_count++;
1667                if (domain->iommu_count == 1)
1668                        domain->nid = iommu->node;
1669                domain_update_iommu_cap(domain);
1670        }
1671        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1672        return 0;
1673}
1674
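/*
 * Map the context for @pdev and for every bridge on the path up to the
 * topmost PCIe-to-PCI bridge, since all devices behind that bridge share
 * the bridge's source-id.  For a PCIe-to-PCI bridge the secondary bus
 * number with devfn 0 is used; for a legacy PCI bridge its own bus number
 * and devfn are used.
 */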
1675static int
1676domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1677                        int translation)
1678{
1679        int ret;
1680        struct pci_dev *tmp, *parent;
1681
1682        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1683                                         pdev->bus->number, pdev->devfn,
1684                                         translation);
1685        if (ret)
1686                return ret;
1687
1688        /* dependent device mapping */
1689        tmp = pci_find_upstream_pcie_bridge(pdev);
1690        if (!tmp)
1691                return 0;
1692        /* Secondary interface's bus number and devfn 0 */
1693        parent = pdev->bus->self;
1694        while (parent != tmp) {
1695                ret = domain_context_mapping_one(domain,
1696                                                 pci_domain_nr(parent->bus),
1697                                                 parent->bus->number,
1698                                                 parent->devfn, translation);
1699                if (ret)
1700                        return ret;
1701                parent = parent->bus->self;
1702        }
1703        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1704                return domain_context_mapping_one(domain,
1705                                        pci_domain_nr(tmp->subordinate),
1706                                        tmp->subordinate->number, 0,
1707                                        translation);
1708        else /* this is a legacy PCI bridge */
1709                return domain_context_mapping_one(domain,
1710                                                  pci_domain_nr(tmp->bus),
1711                                                  tmp->bus->number,
1712                                                  tmp->devfn,
1713                                                  translation);
1714}
1715
1716static int domain_context_mapped(struct pci_dev *pdev)
1717{
1718        int ret;
1719        struct pci_dev *tmp, *parent;
1720        struct intel_iommu *iommu;
1721
1722        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1723                                pdev->devfn);
1724        if (!iommu)
1725                return -ENODEV;
1726
1727        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1728        if (!ret)
1729                return ret;
1730        /* dependent device mapping */
1731        tmp = pci_find_upstream_pcie_bridge(pdev);
1732        if (!tmp)
1733                return ret;
1734        /* Secondary interface's bus number and devfn 0 */
1735        parent = pdev->bus->self;
1736        while (parent != tmp) {
1737                ret = device_context_mapped(iommu, parent->bus->number,
1738                                            parent->devfn);
1739                if (!ret)
1740                        return ret;
1741                parent = parent->bus->self;
1742        }
1743        if (pci_is_pcie(tmp))
1744                return device_context_mapped(iommu, tmp->subordinate->number,
1745                                             0);
1746        else
1747                return device_context_mapped(iommu, tmp->bus->number,
1748                                             tmp->devfn);
1749}
1750
1751/* Return the number of VT-d pages needed, rounded up to whole MM pages */
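/*
 * Example, assuming 4KiB pages on both sides (illustrative numbers): an
 * offset of 0x800 into a page with size 0x1000 spans two pages, so
 * PAGE_ALIGN(0x800 + 0x1000) >> VTD_PAGE_SHIFT = 0x2000 >> 12 = 2.
 */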
1752static inline unsigned long aligned_nrpages(unsigned long host_addr,
1753                                            size_t size)
1754{
1755        host_addr &= ~PAGE_MASK;
1756        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1757}
1758
1759/* Return largest possible superpage level for a given mapping */
1760static inline int hardware_largepage_caps(struct dmar_domain *domain,
1761                                          unsigned long iov_pfn,
1762                                          unsigned long phy_pfn,
1763                                          unsigned long pages)
1764{
1765        int support, level = 1;
1766        unsigned long pfnmerge;
1767
1768        support = domain->iommu_superpage;
1769
1770        /* To use a large page, the virtual *and* physical addresses
1771           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1772           of them will mean we have to use smaller pages. So just
1773           merge them and check both at once. */
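        /* For example (illustrative values): iov_pfn 0x200 and phy_pfn
           0x1400 merge to pfnmerge 0x1600, whose low 9 bits are clear, so
           both addresses are 2MiB-aligned; with at least 512 pages left and
           hardware superpage support, level 2 (a 2MiB page) can be used. */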
1774        pfnmerge = iov_pfn | phy_pfn;
1775
1776        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1777                pages >>= VTD_STRIDE_SHIFT;
1778                if (!pages)
1779                        break;
1780                pfnmerge >>= VTD_STRIDE_SHIFT;
1781                level++;
1782                support--;
1783        }
1784        return level;
1785}
1786
1787static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1788                            struct scatterlist *sg, unsigned long phys_pfn,
1789                            unsigned long nr_pages, int prot)
1790{
1791        struct dma_pte *first_pte = NULL, *pte = NULL;
1792        phys_addr_t uninitialized_var(pteval);
1793        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1794        unsigned long sg_res;
1795        unsigned int largepage_lvl = 0;
1796        unsigned long lvl_pages = 0;
1797
1798        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1799
1800        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1801                return -EINVAL;
1802
1803        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1804
1805        if (sg)
1806                sg_res = 0;
1807        else {
1808                sg_res = nr_pages + 1;
1809                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1810        }
1811
1812        while (nr_pages > 0) {
1813                uint64_t tmp;
1814
1815                if (!sg_res) {
1816                        sg_res = aligned_nrpages(sg->offset, sg->length);
1817                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1818                        sg->dma_length = sg->length;
1819                        pteval = page_to_phys(sg_page(sg)) | prot;
1820                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1821                }
1822
1823                if (!pte) {
1824                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1825
1826                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1827                        if (!pte)
1828                                return -ENOMEM;
1829                        /* It is a large page */
1830                        if (largepage_lvl > 1) {
1831                                pteval |= DMA_PTE_LARGE_PAGE;
1832                                /* Ensure that old small page tables are removed to make room
1833                                   for superpage, if they exist. */
1834                                dma_pte_clear_range(domain, iov_pfn,
1835                                                    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1836                                dma_pte_free_pagetable(domain, iov_pfn,
1837                                                       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838                        } else {
1839                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1840                        }
1841
1842                }
1843                /* We don't need lock here, nobody else
1844                 * touches the iova range
1845                 */
1846                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1847                if (tmp) {
1848                        static int dumps = 5;
1849                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1850                               iov_pfn, tmp, (unsigned long long)pteval);
1851                        if (dumps) {
1852                                dumps--;
1853                                debug_dma_dump_mappings(NULL);
1854                        }
1855                        WARN_ON(1);
1856                }
1857
1858                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1859
1860                BUG_ON(nr_pages < lvl_pages);
1861                BUG_ON(sg_res < lvl_pages);
1862
1863                nr_pages -= lvl_pages;
1864                iov_pfn += lvl_pages;
1865                phys_pfn += lvl_pages;
1866                pteval += lvl_pages * VTD_PAGE_SIZE;
1867                sg_res -= lvl_pages;
1868
1869                /* If the next PTE would be the first in a new page, then we
1870                   need to flush the cache on the entries we've just written.
1871                   And then we'll need to recalculate 'pte', so clear it and
1872                   let it get set again in the if (!pte) block above.
1873
1874                   If we're done (!nr_pages) we need to flush the cache too.
1875
1876                   Also if we've been setting superpages, we may need to
1877                   recalculate 'pte' and switch back to smaller pages for the
1878                   end of the mapping, if the trailing size is not enough to
1879                   use another superpage (i.e. sg_res < lvl_pages). */
1880                pte++;
1881                if (!nr_pages || first_pte_in_page(pte) ||
1882                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1883                        domain_flush_cache(domain, first_pte,
1884                                           (void *)pte - (void *)first_pte);
1885                        pte = NULL;
1886                }
1887
1888                if (!sg_res && nr_pages)
1889                        sg = sg_next(sg);
1890        }
1891        return 0;
1892}
1893
1894static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1895                                    struct scatterlist *sg, unsigned long nr_pages,
1896                                    int prot)
1897{
1898        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1899}
1900
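/*
 * Identity (1:1) mappings are installed by calling this with
 * iov_pfn == phys_pfn; see iommu_domain_identity_map() below.
 */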
1901static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1902                                     unsigned long phys_pfn, unsigned long nr_pages,
1903                                     int prot)
1904{
1905        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1906}
1907
1908static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1909{
1910        if (!iommu)
1911                return;
1912
1913        clear_context_table(iommu, bus, devfn);
1914        iommu->flush.flush_context(iommu, 0, 0, 0,
1915                                           DMA_CCMD_GLOBAL_INVL);
1916        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1917}
1918
1919static inline void unlink_domain_info(struct device_domain_info *info)
1920{
1921        assert_spin_locked(&device_domain_lock);
1922        list_del(&info->link);
1923        list_del(&info->global);
1924        if (info->dev)
1925                info->dev->dev.archdata.iommu = NULL;
1926}
1927
1928static void domain_remove_dev_info(struct dmar_domain *domain)
1929{
1930        struct device_domain_info *info;
1931        unsigned long flags;
1932        struct intel_iommu *iommu;
1933
1934        spin_lock_irqsave(&device_domain_lock, flags);
1935        while (!list_empty(&domain->devices)) {
1936                info = list_entry(domain->devices.next,
1937                        struct device_domain_info, link);
1938                unlink_domain_info(info);
1939                spin_unlock_irqrestore(&device_domain_lock, flags);
1940
1941                iommu_disable_dev_iotlb(info);
1942                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1943                iommu_detach_dev(iommu, info->bus, info->devfn);
1944                free_devinfo_mem(info);
1945
1946                spin_lock_irqsave(&device_domain_lock, flags);
1947        }
1948        spin_unlock_irqrestore(&device_domain_lock, flags);
1949}
1950
1951/*
1952 * find_domain
1953 * Note: struct pci_dev->dev.archdata.iommu stores the info
1954 */
1955static struct dmar_domain *
1956find_domain(struct pci_dev *pdev)
1957{
1958        struct device_domain_info *info;
1959
1960        /* No lock here, assumes no domain exit in normal case */
1961        info = pdev->dev.archdata.iommu;
1962        if (info)
1963                return info->domain;
1964        return NULL;
1965}
1966
1967/* domain is initialized */
1968static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1969{
1970        struct dmar_domain *domain, *found = NULL;
1971        struct intel_iommu *iommu;
1972        struct dmar_drhd_unit *drhd;
1973        struct device_domain_info *info, *tmp;
1974        struct pci_dev *dev_tmp;
1975        unsigned long flags;
1976        int bus = 0, devfn = 0;
1977        int segment;
1978        int ret;
1979
1980        domain = find_domain(pdev);
1981        if (domain)
1982                return domain;
1983
1984        segment = pci_domain_nr(pdev->bus);
1985
1986        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1987        if (dev_tmp) {
1988                if (pci_is_pcie(dev_tmp)) {
1989                        bus = dev_tmp->subordinate->number;
1990                        devfn = 0;
1991                } else {
1992                        bus = dev_tmp->bus->number;
1993                        devfn = dev_tmp->devfn;
1994                }
1995                spin_lock_irqsave(&device_domain_lock, flags);
1996                list_for_each_entry(info, &device_domain_list, global) {
1997                        if (info->segment == segment &&
1998                            info->bus == bus && info->devfn == devfn) {
1999                                found = info->domain;
2000                                break;
2001                        }
2002                }
2003                spin_unlock_irqrestore(&device_domain_lock, flags);
2004                /* the PCIe-to-PCI bridge already has a domain, use it */
2005                if (found) {
2006                        domain = found;
2007                        goto found_domain;
2008                }
2009        }
2010
2011        domain = alloc_domain();
2012        if (!domain)
2013                goto error;
2014
2015        /* Allocate new domain for the device */
2016        drhd = dmar_find_matched_drhd_unit(pdev);
2017        if (!drhd) {
2018                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2019                        pci_name(pdev));
2020                free_domain_mem(domain);
2021                return NULL;
2022        }
2023        iommu = drhd->iommu;
2024
2025        ret = iommu_attach_domain(domain, iommu);
2026        if (ret) {
2027                free_domain_mem(domain);
2028                goto error;
2029        }
2030
2031        if (domain_init(domain, gaw)) {
2032                domain_exit(domain);
2033                goto error;
2034        }
2035
2036        /* register pcie-to-pci device */
2037        if (dev_tmp) {
2038                info = alloc_devinfo_mem();
2039                if (!info) {
2040                        domain_exit(domain);
2041                        goto error;
2042                }
2043                info->segment = segment;
2044                info->bus = bus;
2045                info->devfn = devfn;
2046                info->dev = NULL;
2047                info->domain = domain;
2048                /* This domain is shared by devices under p2p bridge */
2049                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2050
2051                /* the PCIe-to-PCI bridge already has a domain, use it */
2052                found = NULL;
2053                spin_lock_irqsave(&device_domain_lock, flags);
2054                list_for_each_entry(tmp, &device_domain_list, global) {
2055                        if (tmp->segment == segment &&
2056                            tmp->bus == bus && tmp->devfn == devfn) {
2057                                found = tmp->domain;
2058                                break;
2059                        }
2060                }
2061                if (found) {
2062                        spin_unlock_irqrestore(&device_domain_lock, flags);
2063                        free_devinfo_mem(info);
2064                        domain_exit(domain);
2065                        domain = found;
2066                } else {
2067                        list_add(&info->link, &domain->devices);
2068                        list_add(&info->global, &device_domain_list);
2069                        spin_unlock_irqrestore(&device_domain_lock, flags);
2070                }
2071        }
2072
2073found_domain:
2074        info = alloc_devinfo_mem();
2075        if (!info)
2076                goto error;
2077        info->segment = segment;
2078        info->bus = pdev->bus->number;
2079        info->devfn = pdev->devfn;
2080        info->dev = pdev;
2081        info->domain = domain;
2082        spin_lock_irqsave(&device_domain_lock, flags);
2083        /* somebody else was faster and already set it up */
2084        found = find_domain(pdev);
2085        if (found != NULL) {
2086                spin_unlock_irqrestore(&device_domain_lock, flags);
2087                if (found != domain) {
2088                        domain_exit(domain);
2089                        domain = found;
2090                }
2091                free_devinfo_mem(info);
2092                return domain;
2093        }
2094        list_add(&info->link, &domain->devices);
2095        list_add(&info->global, &device_domain_list);
2096        pdev->dev.archdata.iommu = info;
2097        spin_unlock_irqrestore(&device_domain_lock, flags);
2098        return domain;
2099error:
2100        /* recheck here, maybe somebody else set it in the meantime */
2101        return find_domain(pdev);
2102}
2103
2104static int iommu_identity_mapping;
2105#define IDENTMAP_ALL            1
2106#define IDENTMAP_GFX            2
2107#define IDENTMAP_AZALIA         4
2108
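/*
 * Identity-map [start, end] into @domain: reserve the matching IOVA range
 * and install 1:1 page table entries for it.  For example (illustrative
 * values), an RMRR of 0xe8000000..0xe80fffff becomes vpfns
 * 0xe8000..0xe80ff, i.e. 256 pages mapped to the same physical pfns.
 */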
2109static int iommu_domain_identity_map(struct dmar_domain *domain,
2110                                     unsigned long long start,
2111                                     unsigned long long end)
2112{
2113        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2114        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2115
2116        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2117                          dma_to_mm_pfn(last_vpfn))) {
2118                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2119                return -ENOMEM;
2120        }
2121
2122        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2123                 start, end, domain->id);
2124        /*
2125         * RMRR range might have overlap with physical memory range,
2126         * clear it first
2127         */
2128        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2129
2130        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2131                                  last_vpfn - first_vpfn + 1,
2132                                  DMA_PTE_READ|DMA_PTE_WRITE);
2133}
2134
2135static int iommu_prepare_identity_map(struct pci_dev *pdev,
2136                                      unsigned long long start,
2137                                      unsigned long long end)
2138{
2139        struct dmar_domain *domain;
2140        int ret;
2141
2142        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2143        if (!domain)
2144                return -ENOMEM;
2145
2146        /* For _hardware_ passthrough, don't bother. But for software
2147           passthrough, we do it anyway -- it may indicate a memory
2148           range which is reserved in E820, and so didn't get set
2149           up to start with in si_domain */
2150        if (domain == si_domain && hw_pass_through) {
2151                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2152                       pci_name(pdev), start, end);
2153                return 0;
2154        }
2155
2156        printk(KERN_INFO
2157               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2158               pci_name(pdev), start, end);
2159
2160        if (end < start) {
2161                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2162                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2163                        dmi_get_system_info(DMI_BIOS_VENDOR),
2164                        dmi_get_system_info(DMI_BIOS_VERSION),
2165                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2166                ret = -EIO;
2167                goto error;
2168        }
2169
2170        if (end >> agaw_to_width(domain->agaw)) {
2171                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2172                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2173                     agaw_to_width(domain->agaw),
2174                     dmi_get_system_info(DMI_BIOS_VENDOR),
2175                     dmi_get_system_info(DMI_BIOS_VERSION),
2176                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2177                ret = -EIO;
2178                goto error;
2179        }
2180
2181        ret = iommu_domain_identity_map(domain, start, end);
2182        if (ret)
2183                goto error;
2184
2185        /* context entry init */
2186        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2187        if (ret)
2188                goto error;
2189
2190        return 0;
2191
2192 error:
2193        domain_exit(domain);
2194        return ret;
2195}
2196
2197static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2198        struct pci_dev *pdev)
2199{
2200        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2201                return 0;
2202        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2203                rmrr->end_address);
2204}
2205
2206#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2207static inline void iommu_prepare_isa(void)
2208{
2209        struct pci_dev *pdev;
2210        int ret;
2211
2212        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2213        if (!pdev)
2214                return;
2215
2216        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2217        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2218
2219        if (ret)
2220                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2221                       "floppy might not work\n");
2222
2223}
2224#else
2225static inline void iommu_prepare_isa(void)
2226{
2227        return;
2228}
2229#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2230
2231static int md_domain_init(struct dmar_domain *domain, int guest_width);
2232
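/*
 * Build the static identity (si) domain: attach it to every active IOMMU
 * and, unless hardware pass-through is in use, pre-map every usable RAM
 * range 1:1 so devices placed in this domain can DMA anywhere in memory.
 */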
2233static int __init si_domain_init(int hw)
2234{
2235        struct dmar_drhd_unit *drhd;
2236        struct intel_iommu *iommu;
2237        int nid, ret = 0;
2238
2239        si_domain = alloc_domain();
2240        if (!si_domain)
2241                return -EFAULT;
2242
2243        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2244
2245        for_each_active_iommu(iommu, drhd) {
2246                ret = iommu_attach_domain(si_domain, iommu);
2247                if (ret) {
2248                        domain_exit(si_domain);
2249                        return -EFAULT;
2250                }
2251        }
2252
2253        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2254                domain_exit(si_domain);
2255                return -EFAULT;
2256        }
2257
2258        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2259
2260        if (hw)
2261                return 0;
2262
2263        for_each_online_node(nid) {
2264                unsigned long start_pfn, end_pfn;
2265                int i;
2266
2267                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2268                        ret = iommu_domain_identity_map(si_domain,
2269                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2270                        if (ret)
2271                                return ret;
2272                }
2273        }
2274
2275        return 0;
2276}
2277
2278static void domain_remove_one_dev_info(struct dmar_domain *domain,
2279                                          struct pci_dev *pdev);
2280static int identity_mapping(struct pci_dev *pdev)
2281{
2282        struct device_domain_info *info;
2283
2284        if (likely(!iommu_identity_mapping))
2285                return 0;
2286
2287        info = pdev->dev.archdata.iommu;
2288        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2289                return (info->domain == si_domain);
2290
2291        return 0;
2292}
2293
2294static int domain_add_dev_info(struct dmar_domain *domain,
2295                               struct pci_dev *pdev,
2296                               int translation)
2297{
2298        struct device_domain_info *info;
2299        unsigned long flags;
2300        int ret;
2301
2302        info = alloc_devinfo_mem();
2303        if (!info)
2304                return -ENOMEM;
2305
2306        info->segment = pci_domain_nr(pdev->bus);
2307        info->bus = pdev->bus->number;
2308        info->devfn = pdev->devfn;
2309        info->dev = pdev;
2310        info->domain = domain;
2311
2312        spin_lock_irqsave(&device_domain_lock, flags);
2313        list_add(&info->link, &domain->devices);
2314        list_add(&info->global, &device_domain_list);
2315        pdev->dev.archdata.iommu = info;
2316        spin_unlock_irqrestore(&device_domain_lock, flags);
2317
2318        ret = domain_context_mapping(domain, pdev, translation);
2319        if (ret) {
2320                spin_lock_irqsave(&device_domain_lock, flags);
2321                unlink_domain_info(info);
2322                spin_unlock_irqrestore(&device_domain_lock, flags);
2323                free_devinfo_mem(info);
2324                return ret;
2325        }
2326
2327        return 0;
2328}
2329
2330static bool device_has_rmrr(struct pci_dev *dev)
2331{
2332        struct dmar_rmrr_unit *rmrr;
2333        int i;
2334
2335        for_each_rmrr_units(rmrr) {
2336                for (i = 0; i < rmrr->devices_cnt; i++) {
2337                        /*
2338                         * Return TRUE if this RMRR contains the device that
2339                         * is passed in.
2340                         */
2341                        if (rmrr->devices[i] == dev)
2342                                return true;
2343                }
2344        }
2345        return false;
2346}
2347
2348static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2349{
2350
2351        /*
2352         * We want to prevent any device associated with an RMRR from
2353         * getting placed into the SI Domain. This is done because
2354         * problems exist when devices are moved in and out of domains
2355         * and their respective RMRR info is lost. We exempt USB devices
2356         * from this process due to their usage of RMRRs that are known
2357         * to not be needed after BIOS hand-off to OS.
2358         */
2359        if (device_has_rmrr(pdev) &&
2360            (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2361                return 0;
2362
2363        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2364                return 1;
2365
2366        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2367                return 1;
2368
2369        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2370                return 0;
2371
2372        /*
2373         * We want to start off with all devices in the 1:1 domain, and
2374         * take them out later if we find they can't access all of memory.
2375         *
2376         * However, we can't do this for PCI devices behind bridges,
2377         * because all PCI devices behind the same bridge will end up
2378         * with the same source-id on their transactions.
2379         *
2380         * Practically speaking, we can't change things around for these
2381         * devices at run-time, because we can't be sure there'll be no
2382         * DMA transactions in flight for any of their siblings.
2383         * 
2384         * So PCI devices (unless they're on the root bus) as well as
2385         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2386         * the 1:1 domain, just in _case_ one of their siblings turns out
2387         * not to be able to map all of memory.
2388         */
2389        if (!pci_is_pcie(pdev)) {
2390                if (!pci_is_root_bus(pdev->bus))
2391                        return 0;
2392                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2393                        return 0;
2394        } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2395                return 0;
2396
2397        /* 
2398         * At boot time, we don't yet know if devices will be 64-bit capable.
2399         * Assume that they will -- if they turn out not to be, then we can 
2400         * take them out of the 1:1 domain later.
2401         */
2402        if (!startup) {
2403                /*
2404                 * If the device's dma_mask is less than the system's memory
2405                 * size then this is not a candidate for identity mapping.
2406                 */
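                /*
                 * Illustrative example: a device with a 32-bit dma_mask on
                 * a machine with 8GiB of RAM sees dma_get_required_mask()
                 * return a 33-bit mask; 0xffffffff < 0x1ffffffff, so the
                 * device is not (or no longer) identity mapped.
                 */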
2407                u64 dma_mask = pdev->dma_mask;
2408
2409                if (pdev->dev.coherent_dma_mask &&
2410                    pdev->dev.coherent_dma_mask < dma_mask)
2411                        dma_mask = pdev->dev.coherent_dma_mask;
2412
2413                return dma_mask >= dma_get_required_mask(&pdev->dev);
2414        }
2415
2416        return 1;
2417}
2418
2419static int __init iommu_prepare_static_identity_mapping(int hw)
2420{
2421        struct pci_dev *pdev = NULL;
2422        int ret;
2423
2424        ret = si_domain_init(hw);
2425        if (ret)
2426                return -EFAULT;
2427
2428        for_each_pci_dev(pdev) {
2429                if (iommu_should_identity_map(pdev, 1)) {
2430                        ret = domain_add_dev_info(si_domain, pdev,
2431                                             hw ? CONTEXT_TT_PASS_THROUGH :
2432                                                  CONTEXT_TT_MULTI_LEVEL);
2433                        if (ret) {
2434                                /* device not associated with an iommu */
2435                                if (ret == -ENODEV)
2436                                        continue;
2437                                return ret;
2438                        }
2439                        pr_info("IOMMU: %s identity mapping for device %s\n",
2440                                hw ? "hardware" : "software", pci_name(pdev));
2441                }
2442        }
2443
2444        return 0;
2445}
2446
2447static int __init init_dmars(void)
2448{
2449        struct dmar_drhd_unit *drhd;
2450        struct dmar_rmrr_unit *rmrr;
2451        struct pci_dev *pdev;
2452        struct intel_iommu *iommu;
2453        int i, ret;
2454
2455        /*
2456         * for each drhd
2457         *    allocate root
2458         *    initialize and program root entry to not present
2459         * endfor
2460         */
2461        for_each_drhd_unit(drhd) {
2462                /*
2463                 * lock not needed as this is only incremented in the
2464                 * single-threaded kernel __init code path; all other
2465                 * accesses are read-only
2466                 */
2467                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2468                        g_num_of_iommus++;
2469                        continue;
2470                }
2471                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2472                          IOMMU_UNITS_SUPPORTED);
2473        }
2474
2475        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2476                        GFP_KERNEL);
2477        if (!g_iommus) {
2478                printk(KERN_ERR "Allocating global iommu array failed\n");
2479                ret = -ENOMEM;
2480                goto error;
2481        }
2482
2483        deferred_flush = kzalloc(g_num_of_iommus *
2484                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2485        if (!deferred_flush) {
2486                ret = -ENOMEM;
2487                goto error;
2488        }
2489
2490        for_each_drhd_unit(drhd) {
2491                if (drhd->ignored)
2492                        continue;
2493
2494                iommu = drhd->iommu;
2495                g_iommus[iommu->seq_id] = iommu;
2496
2497                ret = iommu_init_domains(iommu);
2498                if (ret)
2499                        goto error;
2500
2501                /*
2502                 * TBD:
2503                 * we could share the same root & context tables
2504                 * among all IOMMUs. Need to split this out later.
2505                 */
2506                ret = iommu_alloc_root_entry(iommu);
2507                if (ret) {
2508                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2509                        goto error;
2510                }
2511                if (!ecap_pass_through(iommu->ecap))
2512                        hw_pass_through = 0;
2513        }
2514
2515        /*
2516         * Start from a sane IOMMU hardware state.
2517         */
2518        for_each_drhd_unit(drhd) {
2519                if (drhd->ignored)
2520                        continue;
2521
2522                iommu = drhd->iommu;
2523
2524                /*
2525                 * If queued invalidation was already initialized by us
2526                 * (for example, while enabling interrupt-remapping) then
2527                 * we already have things rolling from a sane state.
2528                 */
2529                if (iommu->qi)
2530                        continue;
2531
2532                /*
2533                 * Clear any previous faults.
2534                 */
2535                dmar_fault(-1, iommu);
2536                /*
2537                 * Disable queued invalidation if supported and already enabled
2538                 * before OS handover.
2539                 */
2540                dmar_disable_qi(iommu);
2541        }
2542
2543        for_each_drhd_unit(drhd) {
2544                if (drhd->ignored)
2545                        continue;
2546
2547                iommu = drhd->iommu;
2548
2549                if (dmar_enable_qi(iommu)) {
2550                        /*
2551                         * Queued Invalidate not enabled, use Register Based
2552                         * Invalidate
2553                         */
2554                        iommu->flush.flush_context = __iommu_flush_context;
2555                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2556                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2557                               "invalidation\n",
2558                                iommu->seq_id,
2559                               (unsigned long long)drhd->reg_base_addr);
2560                } else {
2561                        iommu->flush.flush_context = qi_flush_context;
2562                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2563                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2564                               "invalidation\n",
2565                                iommu->seq_id,
2566                               (unsigned long long)drhd->reg_base_addr);
2567                }
2568        }
2569
2570        if (iommu_pass_through)
2571                iommu_identity_mapping |= IDENTMAP_ALL;
2572
2573#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2574        iommu_identity_mapping |= IDENTMAP_GFX;
2575#endif
2576
2577        check_tylersburg_isoch();
2578
2579        /*
2580         * If any identity mapping mode is enabled (pass-through, or 1:1
2581         * maps for gfx/azalia/all devices), set up the static identity
2582         * mapping now, using hardware pass-through where available.
2583         */
2584        if (iommu_identity_mapping) {
2585                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2586                if (ret) {
2587                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2588                        goto error;
2589                }
2590        }
2591        /*
2592         * For each rmrr
2593         *   for each dev attached to rmrr
2594         *   do
2595         *     locate drhd for dev, alloc domain for dev
2596         *     allocate free domain
2597         *     allocate page table entries for rmrr
2598         *     if context not allocated for bus
2599         *           allocate and init context
2600         *           set present in root table for this bus
2601         *     init context with domain, translation etc
2602         *    endfor
2603         * endfor
2604         */
2605        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2606        for_each_rmrr_units(rmrr) {
2607                for (i = 0; i < rmrr->devices_cnt; i++) {
2608                        pdev = rmrr->devices[i];
2609                        /*
2610                         * some BIOSes list non-existent devices in the
2611                         * DMAR table.
2612                         */
2613                        if (!pdev)
2614                                continue;
2615                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2616                        if (ret)
2617                                printk(KERN_ERR
2618                                       "IOMMU: mapping reserved region failed\n");
2619                }
2620        }
2621
2622        iommu_prepare_isa();
2623
2624        /*
2625         * for each drhd
2626         *   enable fault log
2627         *   global invalidate context cache
2628         *   global invalidate iotlb
2629         *   enable translation
2630         */
2631        for_each_drhd_unit(drhd) {
2632                if (drhd->ignored) {
2633                        /*
2634                         * we always have to disable PMRs or DMA may fail on
2635                         * this device
2636                         */
2637                        if (force_on)
2638                                iommu_disable_protect_mem_regions(drhd->iommu);
2639                        continue;
2640                }
2641                iommu = drhd->iommu;
2642
2643                iommu_flush_write_buffer(iommu);
2644
2645                ret = dmar_set_interrupt(iommu);
2646                if (ret)
2647                        goto error;
2648
2649                iommu_set_root_entry(iommu);
2650
2651                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2652                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2653
2654                ret = iommu_enable_translation(iommu);
2655                if (ret)
2656                        goto error;
2657
2658                iommu_disable_protect_mem_regions(iommu);
2659        }
2660
2661        return 0;
2662error:
2663        for_each_drhd_unit(drhd) {
2664                if (drhd->ignored)
2665                        continue;
2666                iommu = drhd->iommu;
2667                free_iommu(iommu);
2668        }
2669        kfree(g_iommus);
2670        return ret;
2671}
2672
2673/* This takes a number of _MM_ pages, not VTD pages */
2674static struct iova *intel_alloc_iova(struct device *dev,
2675                                     struct dmar_domain *domain,
2676                                     unsigned long nrpages, uint64_t dma_mask)
2677{
2678        struct pci_dev *pdev = to_pci_dev(dev);
2679        struct iova *iova = NULL;
2680
2681        /* Restrict dma_mask to the width that the iommu can handle */
2682        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2683
2684        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2685                /*
2686                 * First try to allocate an io virtual address in
2687                 * DMA_BIT_MASK(32) and if that fails then try allocating
2688                 * from higher range
2689                 */
2690                iova = alloc_iova(&domain->iovad, nrpages,
2691                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2692                if (iova)
2693                        return iova;
2694        }
2695        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2696        if (unlikely(!iova)) {
2697                printk(KERN_ERR "Allocating %lu-page iova for %s failed\n",
2698                       nrpages, pci_name(pdev));
2699                return NULL;
2700        }
2701
2702        return iova;
2703}
2704
2705static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2706{
2707        struct dmar_domain *domain;
2708        int ret;
2709
2710        domain = get_domain_for_dev(pdev,
2711                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2712        if (!domain) {
2713                printk(KERN_ERR
2714                        "Allocating domain for %s failed\n", pci_name(pdev));
2715                return NULL;
2716        }
2717
2718        /* make sure context mapping is ok */
2719        if (unlikely(!domain_context_mapped(pdev))) {
2720                ret = domain_context_mapping(domain, pdev,
2721                                             CONTEXT_TT_MULTI_LEVEL);
2722                if (ret) {
2723                        printk(KERN_ERR
2724                                "Domain context map for %s failed\n",
2725                                pci_name(pdev));
2726                        return NULL;
2727                }
2728        }
2729
2730        return domain;
2731}
2732
2733static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2734{
2735        struct device_domain_info *info;
2736
2737        /* No lock here, assumes no domain exit in normal case */
2738        info = dev->dev.archdata.iommu;
2739        if (likely(info))
2740                return info->domain;
2741
2742        return __get_valid_domain_for_dev(dev);
2743}
2744
2745static int iommu_dummy(struct pci_dev *pdev)
2746{
2747        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2748}
2749
2750/* Check whether the pdev needs to go through the non-identity map/unmap process. */
2751static int iommu_no_mapping(struct device *dev)
2752{
2753        struct pci_dev *pdev;
2754        int found;
2755
2756        if (unlikely(dev->bus != &pci_bus_type))
2757                return 1;
2758
2759        pdev = to_pci_dev(dev);
2760        if (iommu_dummy(pdev))
2761                return 1;
2762
2763        if (!iommu_identity_mapping)
2764                return 0;
2765
2766        found = identity_mapping(pdev);
2767        if (found) {
2768                if (iommu_should_identity_map(pdev, 0))
2769                        return 1;
2770                else {
2771                        /*
2772                         * The device only does 32-bit DMA: remove it from
2773                         * si_domain and fall back to non-identity mapping.
2774                         */
2775                        domain_remove_one_dev_info(si_domain, pdev);
2776                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2777                               pci_name(pdev));
2778                        return 0;
2779                }
2780        } else {
2781                /*
2782                 * In case a 64-bit DMA device was detached from a VM, put
2783                 * the device back into si_domain for identity mapping.
2784                 */
2785                if (iommu_should_identity_map(pdev, 0)) {
2786                        int ret;
2787                        ret = domain_add_dev_info(si_domain, pdev,
2788                                                  hw_pass_through ?
2789                                                  CONTEXT_TT_PASS_THROUGH :
2790                                                  CONTEXT_TT_MULTI_LEVEL);
2791                        if (!ret) {
2792                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2793                                       pci_name(pdev));
2794                                return 1;
2795                        }
2796                }
2797        }
2798
2799        return 0;
2800}
2801
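/*
 * Core of the DMA-API map path: identity-mapped devices simply get the
 * physical address back; otherwise an IOVA is allocated, PTEs are written
 * with domain_pfn_mapping(), and the IOTLB is flushed with a PSI in
 * caching mode or the write buffer is flushed otherwise.
 */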
2802static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2803                                     size_t size, int dir, u64 dma_mask)
2804{
2805        struct pci_dev *pdev = to_pci_dev(hwdev);
2806        struct dmar_domain *domain;
2807        phys_addr_t start_paddr;
2808        struct iova *iova;
2809        int prot = 0;
2810        int ret;
2811        struct intel_iommu *iommu;
2812        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2813
2814        BUG_ON(dir == DMA_NONE);
2815
2816        if (iommu_no_mapping(hwdev))
2817                return paddr;
2818
2819        domain = get_valid_domain_for_dev(pdev);
2820        if (!domain)
2821                return 0;
2822
2823        iommu = domain_get_iommu(domain);
2824        size = aligned_nrpages(paddr, size);
2825
2826        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2827        if (!iova)
2828                goto error;
2829
2830        /*
2831         * Check if DMAR supports zero-length reads on write-only
2832         * mappings.
2833         */
2834        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2835                        !cap_zlr(iommu->cap))
2836                prot |= DMA_PTE_READ;
2837        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2838                prot |= DMA_PTE_WRITE;
2839        /*
2840         * The range paddr .. paddr + size might cover partial pages; map
2841         * whole pages.  Note: if two parts of one page are mapped
2842         * separately, we might have two guest addresses mapping to the
2843         * same host paddr, but this is not a big problem
2844         */
2845        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2846                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2847        if (ret)
2848                goto error;
2849
2850        /* it's a non-present to present mapping. Only flush if caching mode */
2851        if (cap_caching_mode(iommu->cap))
2852                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2853        else
2854                iommu_flush_write_buffer(iommu);
2855
2856        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2857        start_paddr += paddr & ~PAGE_MASK;
2858        return start_paddr;
2859
2860error:
2861        if (iova)
2862                __free_iova(&domain->iovad, iova);
2863        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2864                pci_name(pdev), size, (unsigned long long)paddr, dir);
2865        return 0;
2866}
2867
2868static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2869                                 unsigned long offset, size_t size,
2870                                 enum dma_data_direction dir,
2871                                 struct dma_attrs *attrs)
2872{
2873        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2874                                  dir, to_pci_dev(dev)->dma_mask);
2875}
2876
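/*
 * Deferred ("lazy") unmap machinery: freed IOVAs are queued per IOMMU in
 * deferred_flush[] and released in batches, either once HIGH_WATER_MARK
 * entries have accumulated or when the 10ms unmap_timer fires, so that a
 * single IOTLB flush per IOMMU can cover many unmaps (caching mode uses
 * per-range flushes instead, as global flushes are costly to emulate).
 */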
2877static void flush_unmaps(void)
2878{
2879        int i, j;
2880
2881        timer_on = 0;
2882
2883        /* just flush them all */
2884        for (i = 0; i < g_num_of_iommus; i++) {
2885                struct intel_iommu *iommu = g_iommus[i];
2886                if (!iommu)
2887                        continue;
2888
2889                if (!deferred_flush[i].next)
2890                        continue;
2891
2892                /* In caching mode, global flushes make emulation expensive */
2893                if (!cap_caching_mode(iommu->cap))
2894                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2895                                         DMA_TLB_GLOBAL_FLUSH);
2896                for (j = 0; j < deferred_flush[i].next; j++) {
2897                        unsigned long mask;
2898                        struct iova *iova = deferred_flush[i].iova[j];
2899                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2900
2901                        /* On real hardware multiple invalidations are expensive */
2902                        if (cap_caching_mode(iommu->cap))
2903                                iommu_flush_iotlb_psi(iommu, domain->id,
2904                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2905                        else {
2906                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2907                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2908                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2909                        }
2910                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2911                }
2912                deferred_flush[i].next = 0;
2913        }
2914
2915        list_size = 0;
2916}
2917
2918static void flush_unmaps_timeout(unsigned long data)
2919{
2920        unsigned long flags;
2921
2922        spin_lock_irqsave(&async_umap_flush_lock, flags);
2923        flush_unmaps();
2924        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2925}
2926
2927static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2928{
2929        unsigned long flags;
2930        int next, iommu_id;
2931        struct intel_iommu *iommu;
2932
2933        spin_lock_irqsave(&async_umap_flush_lock, flags);
2934        if (list_size == HIGH_WATER_MARK)
2935                flush_unmaps();
2936
2937        iommu = domain_get_iommu(dom);
2938        iommu_id = iommu->seq_id;
2939
2940        next = deferred_flush[iommu_id].next;
2941        deferred_flush[iommu_id].domain[next] = dom;
2942        deferred_flush[iommu_id].iova[next] = iova;
2943        deferred_flush[iommu_id].next++;
2944
2945        if (!timer_on) {
2946                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2947                timer_on = 1;
2948        }
2949        list_size++;
2950        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2951}
2952
2953static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2954                             size_t size, enum dma_data_direction dir,
2955                             struct dma_attrs *attrs)
2956{
2957        struct pci_dev *pdev = to_pci_dev(dev);
2958        struct dmar_domain *domain;
2959        unsigned long start_pfn, last_pfn;
2960        struct iova *iova;
2961        struct intel_iommu *iommu;
2962
2963        if (iommu_no_mapping(dev))
2964                return;
2965
2966        domain = find_domain(pdev);
2967        BUG_ON(!domain);
2968
2969        iommu = domain_get_iommu(domain);
2970
2971        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2972        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2973                      (unsigned long long)dev_addr))
2974                return;
2975
2976        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2977        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2978
2979        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2980                 pci_name(pdev), start_pfn, last_pfn);
2981
2982        /*  clear the whole page */
2983        dma_pte_clear_range(domain, start_pfn, last_pfn);
2984
2985        /* free page tables */
2986        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2987
2988        if (intel_iommu_strict) {
2989                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2990                                      last_pfn - start_pfn + 1, 0);
2991                /* free iova */
2992                __free_iova(&domain->iovad, iova);
2993        } else {
2994                add_unmap(domain, iova);
2995                /*
2996                 * queue up the release of the unmap to save the roughly 1/6
2997                 * of the CPU time used up by the iotlb flush operation...
2998                 */
2999        }
3000}
3001
3002static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3003                                  dma_addr_t *dma_handle, gfp_t flags,
3004                                  struct dma_attrs *attrs)
3005{
3006        void *vaddr;
3007        int order;
3008
3009        size = PAGE_ALIGN(size);
3010        order = get_order(size);
3011
3012        if (!iommu_no_mapping(hwdev))
3013                flags &= ~(GFP_DMA | GFP_DMA32);
3014        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3015                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3016                        flags |= GFP_DMA;
3017                else
3018                        flags |= GFP_DMA32;
3019        }
3020
3021        vaddr = (void *)__get_free_pages(flags, order);
3022        if (!vaddr)
3023                return NULL;
3024        memset(vaddr, 0, size);
3025
3026        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3027                                         DMA_BIDIRECTIONAL,
3028                                         hwdev->coherent_dma_mask);
3029        if (*dma_handle)
3030                return vaddr;
3031        free_pages((unsigned long)vaddr, order);
3032        return NULL;
3033}
3034
3035static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3036                                dma_addr_t dma_handle, struct dma_attrs *attrs)
3037{
3038        int order;
3039
3040        size = PAGE_ALIGN(size);
3041        order = get_order(size);
3042
3043        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3044        free_pages((unsigned long)vaddr, order);
3045}
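/*
 * Illustrative caller (a hypothetical driver snippet, not taken from
 * this file): intel_alloc_coherent()/intel_free_coherent() above back
 * the coherent DMA API, e.g.
 *
 *	void *desc = dma_alloc_coherent(&pdev->dev, desc_bytes,
 *					&desc_dma, GFP_KERNEL);
 *	if (!desc)
 *		return -ENOMEM;
 *	...
 *	dma_free_coherent(&pdev->dev, desc_bytes, desc, desc_dma);
 *
 * where desc_bytes and desc_dma are the driver's own variables.
 */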
3046
3047static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3048                           int nelems, enum dma_data_direction dir,
3049                           struct dma_attrs *attrs)
3050{
3051        struct pci_dev *pdev = to_pci_dev(hwdev);
3052        struct dmar_domain *domain;
3053        unsigned long start_pfn, last_pfn;
3054        struct iova *iova;
3055        struct intel_iommu *iommu;
3056
3057        if (iommu_no_mapping(hwdev))
3058                return;
3059
3060        domain = find_domain(pdev);
3061        BUG_ON(!domain);
3062
3063        iommu = domain_get_iommu(domain);
3064
3065        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3066        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3067                      (unsigned long long)sglist[0].dma_address))
3068                return;
3069
3070        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3071        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3072
3073        /*  clear the whole page */
3074        dma_pte_clear_range(domain, start_pfn, last_pfn);
3075
3076        /* free page tables */
3077        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3078
3079        if (intel_iommu_strict) {
3080                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3081                                      last_pfn - start_pfn + 1, 0);
3082                /* free iova */
3083                __free_iova(&domain->iovad, iova);
3084        } else {
3085                add_unmap(domain, iova);
3086                /*
3087                 * Queue the release of the unmap instead, saving the roughly
3088                 * 1/6th of CPU time otherwise spent in the IOTLB flush...
3089                 */
3090        }
3091}
3092
3093static int intel_nontranslate_map_sg(struct device *hwdev,
3094        struct scatterlist *sglist, int nelems, int dir)
3095{
3096        int i;
3097        struct scatterlist *sg;
3098
3099        for_each_sg(sglist, sg, nelems, i) {
3100                BUG_ON(!sg_page(sg));
3101                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3102                sg->dma_length = sg->length;
3103        }
3104        return nelems;
3105}
3106
3107static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3108                        enum dma_data_direction dir, struct dma_attrs *attrs)
3109{
3110        int i;
3111        struct pci_dev *pdev = to_pci_dev(hwdev);
3112        struct dmar_domain *domain;
3113        size_t size = 0;
3114        int prot = 0;
3115        struct iova *iova = NULL;
3116        int ret;
3117        struct scatterlist *sg;
3118        unsigned long start_vpfn;
3119        struct intel_iommu *iommu;
3120
3121        BUG_ON(dir == DMA_NONE);
3122        if (iommu_no_mapping(hwdev))
3123                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3124
3125        domain = get_valid_domain_for_dev(pdev);
3126        if (!domain)
3127                return 0;
3128
3129        iommu = domain_get_iommu(domain);
3130
3131        for_each_sg(sglist, sg, nelems, i)
3132                size += aligned_nrpages(sg->offset, sg->length);
3133
3134        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3135                                pdev->dma_mask);
3136        if (!iova) {
3137                sglist->dma_length = 0;
3138                return 0;
3139        }
3140
3141        /*
3142         * Check if DMAR supports zero-length reads on write only
3143         * mappings..
3144         */
3145        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3146                        !cap_zlr(iommu->cap))
3147                prot |= DMA_PTE_READ;
3148        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3149                prot |= DMA_PTE_WRITE;
3150
3151        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3152
3153        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3154        if (unlikely(ret)) {
3155                /*  clear the page */
3156                dma_pte_clear_range(domain, start_vpfn,
3157                                    start_vpfn + size - 1);
3158                /* free page tables */
3159                dma_pte_free_pagetable(domain, start_vpfn,
3160                                       start_vpfn + size - 1);
3161                /* free iova */
3162                __free_iova(&domain->iovad, iova);
3163                return 0;
3164        }
3165
3166        /* it's a non-present to present mapping. Only flush if caching mode */
3167        if (cap_caching_mode(iommu->cap))
3168                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3169        else
3170                iommu_flush_write_buffer(iommu);
3171
3172        return nelems;
3173}
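/*
 * Illustrative caller (a hypothetical driver snippet, not taken from
 * this file): intel_map_sg()/intel_unmap_sg() above are reached through
 * the scatterlist DMA API, e.g.
 *
 *	int count = dma_map_sg(&pdev->dev, sglist, nents, DMA_TO_DEVICE);
 *	if (!count)
 *		return -EIO;
 *	... program the device using sg_dma_address()/sg_dma_len() ...
 *	dma_unmap_sg(&pdev->dev, sglist, nents, DMA_TO_DEVICE);
 */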
3174
3175static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3176{
3177        return !dma_addr;
3178}
3179
3180struct dma_map_ops intel_dma_ops = {
3181        .alloc = intel_alloc_coherent,
3182        .free = intel_free_coherent,
3183        .map_sg = intel_map_sg,
3184        .unmap_sg = intel_unmap_sg,
3185        .map_page = intel_map_page,
3186        .unmap_page = intel_unmap_page,
3187        .mapping_error = intel_mapping_error,
3188};
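/*
 * Note: intel_iommu_init() below installs this structure as the global
 * x86 dma_ops, so generic helpers such as dma_map_page() and
 * dma_map_sg() end up dispatching into the intel_* callbacks above
 * via get_dma_ops(dev) and the common dma-mapping glue.
 */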
3189
3190static inline int iommu_domain_cache_init(void)
3191{
3192        int ret = 0;
3193
3194        iommu_domain_cache = kmem_cache_create("iommu_domain",
3195                                         sizeof(struct dmar_domain),
3196                                         0,
3197                                         SLAB_HWCACHE_ALIGN,
3199                                         NULL);
3200        if (!iommu_domain_cache) {
3201                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3202                ret = -ENOMEM;
3203        }
3204
3205        return ret;
3206}
3207
3208static inline int iommu_devinfo_cache_init(void)
3209{
3210        int ret = 0;
3211
3212        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3213                                         sizeof(struct device_domain_info),
3214                                         0,
3215                                         SLAB_HWCACHE_ALIGN,
3216                                         NULL);
3217        if (!iommu_devinfo_cache) {
3218                printk(KERN_ERR "Couldn't create devinfo cache\n");
3219                ret = -ENOMEM;
3220        }
3221
3222        return ret;
3223}
3224
3225static inline int iommu_iova_cache_init(void)
3226{
3227        int ret = 0;
3228
3229        iommu_iova_cache = kmem_cache_create("iommu_iova",
3230                                         sizeof(struct iova),
3231                                         0,
3232                                         SLAB_HWCACHE_ALIGN,
3233                                         NULL);
3234        if (!iommu_iova_cache) {
3235                printk(KERN_ERR "Couldn't create iova cache\n");
3236                ret = -ENOMEM;
3237        }
3238
3239        return ret;
3240}
3241
3242static int __init iommu_init_mempool(void)
3243{
3244        int ret;
3245        ret = iommu_iova_cache_init();
3246        if (ret)
3247                return ret;
3248
3249        ret = iommu_domain_cache_init();
3250        if (ret)
3251                goto domain_error;
3252
3253        ret = iommu_devinfo_cache_init();
3254        if (!ret)
3255                return ret;
3256
3257        kmem_cache_destroy(iommu_domain_cache);
3258domain_error:
3259        kmem_cache_destroy(iommu_iova_cache);
3260
3261        return -ENOMEM;
3262}
3263
3264static void __init iommu_exit_mempool(void)
3265{
3266        kmem_cache_destroy(iommu_devinfo_cache);
3267        kmem_cache_destroy(iommu_domain_cache);
3268        kmem_cache_destroy(iommu_iova_cache);
3270}
3271
3272static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3273{
3274        struct dmar_drhd_unit *drhd;
3275        u32 vtbar;
3276        int rc;
3277
3278        /* We know that this device on this chipset has its own IOMMU.
3279         * If we find it under a different IOMMU, then the BIOS is lying
3280         * to us. Hope that the IOMMU for this device is actually
3281         * disabled, and it needs no translation...
3282         */
3283        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3284        if (rc) {
3285                /* "can't" happen */
3286                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3287                return;
3288        }
3289        vtbar &= 0xffff0000;
3290
3291        /* we know that this iommu should be at offset 0xa000 from vtbar */
3292        drhd = dmar_find_matched_drhd_unit(pdev);
3293        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3294                            TAINT_FIRMWARE_WORKAROUND,
3295                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3296                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3297}
3298DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
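/*
 * The ENABLE fixup above runs from pci_enable_device() rather than at
 * bus scan time, i.e. only when a driver actually brings up the
 * QuickData (IOAT) device.
 */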
3299
3300static void __init init_no_remapping_devices(void)
3301{
3302        struct dmar_drhd_unit *drhd;
3303
3304        for_each_drhd_unit(drhd) {
3305                if (!drhd->include_all) {
3306                        int i;
3307                        for (i = 0; i < drhd->devices_cnt; i++)
3308                                if (drhd->devices[i] != NULL)
3309                                        break;
3310                        /* ignore DMAR unit if no pci devices exist */
3311                        if (i == drhd->devices_cnt)
3312                                drhd->ignored = 1;
3313                }
3314        }
3315
3316        for_each_drhd_unit(drhd) {
3317                int i;
3318                if (drhd->ignored || drhd->include_all)
3319                        continue;
3320
3321                for (i = 0; i < drhd->devices_cnt; i++)
3322                        if (drhd->devices[i] &&
3323                            !IS_GFX_DEVICE(drhd->devices[i]))
3324                                break;
3325
3326                if (i < drhd->devices_cnt)
3327                        continue;
3328
3329                /* This IOMMU has *only* gfx devices. Either bypass it or
3330                   set the gfx_mapped flag, as appropriate */
3331                if (dmar_map_gfx) {
3332                        intel_iommu_gfx_mapped = 1;
3333                } else {
3334                        drhd->ignored = 1;
3335                        for (i = 0; i < drhd->devices_cnt; i++) {
3336                                if (!drhd->devices[i])
3337                                        continue;
3338                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3339                        }
3340                }
3341        }
3342}
3343
3344#ifdef CONFIG_SUSPEND
3345static int init_iommu_hw(void)
3346{
3347        struct dmar_drhd_unit *drhd;
3348        struct intel_iommu *iommu = NULL;
3349
3350        for_each_active_iommu(iommu, drhd)
3351                if (iommu->qi)
3352                        dmar_reenable_qi(iommu);
3353
3354        for_each_iommu(iommu, drhd) {
3355                if (drhd->ignored) {
3356                        /*
3357                         * we always have to disable PMRs or DMA may fail on
3358                         * this device
3359                         */
3360                        if (force_on)
3361                                iommu_disable_protect_mem_regions(iommu);
3362                        continue;
3363                }
3364
3365                iommu_flush_write_buffer(iommu);
3366
3367                iommu_set_root_entry(iommu);
3368
3369                iommu->flush.flush_context(iommu, 0, 0, 0,
3370                                           DMA_CCMD_GLOBAL_INVL);
3371                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3372                                         DMA_TLB_GLOBAL_FLUSH);
3373                if (iommu_enable_translation(iommu))
3374                        return 1;
3375                iommu_disable_protect_mem_regions(iommu);
3376        }
3377
3378        return 0;
3379}
3380
3381static void iommu_flush_all(void)
3382{
3383        struct dmar_drhd_unit *drhd;
3384        struct intel_iommu *iommu;
3385
3386        for_each_active_iommu(iommu, drhd) {
3387                iommu->flush.flush_context(iommu, 0, 0, 0,
3388                                           DMA_CCMD_GLOBAL_INVL);
3389                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3390                                         DMA_TLB_GLOBAL_FLUSH);
3391        }
3392}
3393
3394static int iommu_suspend(void)
3395{
3396        struct dmar_drhd_unit *drhd;
3397        struct intel_iommu *iommu = NULL;
3398        unsigned long flag;
3399
3400        for_each_active_iommu(iommu, drhd) {
3401                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3402                                                 GFP_ATOMIC);
3403                if (!iommu->iommu_state)
3404                        goto nomem;
3405        }
3406
3407        iommu_flush_all();
3408
3409        for_each_active_iommu(iommu, drhd) {
3410                iommu_disable_translation(iommu);
3411
3412                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3413
3414                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3415                        readl(iommu->reg + DMAR_FECTL_REG);
3416                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3417                        readl(iommu->reg + DMAR_FEDATA_REG);
3418                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3419                        readl(iommu->reg + DMAR_FEADDR_REG);
3420                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3421                        readl(iommu->reg + DMAR_FEUADDR_REG);
3422
3423                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3424        }
3425        return 0;
3426
3427nomem:
3428        for_each_active_iommu(iommu, drhd)
3429                kfree(iommu->iommu_state);
3430
3431        return -ENOMEM;
3432}
3433
3434static void iommu_resume(void)
3435{
3436        struct dmar_drhd_unit *drhd;
3437        struct intel_iommu *iommu = NULL;
3438        unsigned long flag;
3439
3440        if (init_iommu_hw()) {
3441                if (force_on)
3442                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3443                else
3444                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3445                return;
3446        }
3447
3448        for_each_active_iommu(iommu, drhd) {
3449
3450                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3451
3452                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3453                        iommu->reg + DMAR_FECTL_REG);
3454                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3455                        iommu->reg + DMAR_FEDATA_REG);
3456                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3457                        iommu->reg + DMAR_FEADDR_REG);
3458                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3459                        iommu->reg + DMAR_FEUADDR_REG);
3460
3461                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3462        }
3463
3464        for_each_active_iommu(iommu, drhd)
3465                kfree(iommu->iommu_state);
3466}
3467
3468static struct syscore_ops iommu_syscore_ops = {
3469        .resume         = iommu_resume,
3470        .suspend        = iommu_suspend,
3471};
3472
3473static void __init init_iommu_pm_ops(void)
3474{
3475        register_syscore_ops(&iommu_syscore_ops);
3476}
3477
3478#else
3479static inline void init_iommu_pm_ops(void) {}
3480#endif  /* CONFIG_SUSPEND */
3481
3482LIST_HEAD(dmar_rmrr_units);
3483
3484static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3485{
3486        list_add(&rmrr->list, &dmar_rmrr_units);
3487}
3488
3490int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3491{
3492        struct acpi_dmar_reserved_memory *rmrr;
3493        struct dmar_rmrr_unit *rmrru;
3494
3495        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3496        if (!rmrru)
3497                return -ENOMEM;
3498
3499        rmrru->hdr = header;
3500        rmrr = (struct acpi_dmar_reserved_memory *)header;
3501        rmrru->base_address = rmrr->base_address;
3502        rmrru->end_address = rmrr->end_address;
3503
3504        dmar_register_rmrr_unit(rmrru);
3505        return 0;
3506}
3507
3508static int __init
3509rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3510{
3511        struct acpi_dmar_reserved_memory *rmrr;
3512        int ret;
3513
3514        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3515        ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3516                ((void *)rmrr) + rmrr->header.length,
3517                &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3518
3519        if (ret || (rmrru->devices_cnt == 0)) {
3520                list_del(&rmrru->list);
3521                kfree(rmrru);
3522        }
3523        return ret;
3524}
3525
3526static LIST_HEAD(dmar_atsr_units);
3527
3528int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3529{
3530        struct acpi_dmar_atsr *atsr;
3531        struct dmar_atsr_unit *atsru;
3532
3533        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3534        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3535        if (!atsru)
3536                return -ENOMEM;
3537
3538        atsru->hdr = hdr;
3539        atsru->include_all = atsr->flags & 0x1;
3540
3541        list_add(&atsru->list, &dmar_atsr_units);
3542
3543        return 0;
3544}
3545
3546static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3547{
3548        int rc;
3549        struct acpi_dmar_atsr *atsr;
3550
3551        if (atsru->include_all)
3552                return 0;
3553
3554        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3555        rc = dmar_parse_dev_scope((void *)(atsr + 1),
3556                                (void *)atsr + atsr->header.length,
3557                                &atsru->devices_cnt, &atsru->devices,
3558                                atsr->segment);
3559        if (rc || !atsru->devices_cnt) {
3560                list_del(&atsru->list);
3561                kfree(atsru);
3562        }
3563
3564        return rc;
3565}
3566
3567int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3568{
3569        int i;
3570        struct pci_bus *bus;
3571        struct acpi_dmar_atsr *atsr;
3572        struct dmar_atsr_unit *atsru;
3573
3574        dev = pci_physfn(dev);
3575
3576        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3577                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3578                if (atsr->segment == pci_domain_nr(dev->bus))
3579                        goto found;
3580        }
3581
3582        return 0;
3583
3584found:
3585        for (bus = dev->bus; bus; bus = bus->parent) {
3586                struct pci_dev *bridge = bus->self;
3587
3588                if (!bridge || !pci_is_pcie(bridge) ||
3589                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3590                        return 0;
3591
3592                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3593                        for (i = 0; i < atsru->devices_cnt; i++)
3594                                if (atsru->devices[i] == bridge)
3595                                        return 1;
3596                        break;
3597                }
3598        }
3599
3600        if (atsru->include_all)
3601                return 1;
3602
3603        return 0;
3604}
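/*
 * In short: dmar_find_matched_atsr_unit() answers whether the ACPI ATSR
 * tables permit Address Translation Services for @dev, by walking up
 * the bridge chain to the root port and checking whether that root port
 * (or an "include all" ATSR) is listed for the device's segment.
 */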
3605
3606int __init dmar_parse_rmrr_atsr_dev(void)
3607{
3608        struct dmar_rmrr_unit *rmrr, *rmrr_n;
3609        struct dmar_atsr_unit *atsr, *atsr_n;
3610        int ret = 0;
3611
3612        list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3613                ret = rmrr_parse_dev(rmrr);
3614                if (ret)
3615                        return ret;
3616        }
3617
3618        list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3619                ret = atsr_parse_dev(atsr);
3620                if (ret)
3621                        return ret;
3622        }
3623
3624        return ret;
3625}
3626
3627/*
3628 * Here we only respond to the driver-unbind action.
3629 *
3630 * A newly added device is not attached to its DMAR domain here yet; that
3631 * happens when the device is first mapped to an iova.
3632 */
3633static int device_notifier(struct notifier_block *nb,
3634                                  unsigned long action, void *data)
3635{
3636        struct device *dev = data;
3637        struct pci_dev *pdev = to_pci_dev(dev);
3638        struct dmar_domain *domain;
3639
3640        if (iommu_no_mapping(dev))
3641                return 0;
3642
3643        domain = find_domain(pdev);
3644        if (!domain)
3645                return 0;
3646
3647        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3648                domain_remove_one_dev_info(domain, pdev);
3649
3650                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3651                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3652                    list_empty(&domain->devices))
3653                        domain_exit(domain);
3654        }
3655
3656        return 0;
3657}
3658
3659static struct notifier_block device_nb = {
3660        .notifier_call = device_notifier,
3661};
3662
3663int __init intel_iommu_init(void)
3664{
3665        int ret = 0;
3666
3667        /* VT-d is required for a TXT/tboot launch, so enforce that */
3668        force_on = tboot_force_iommu();
3669
3670        if (dmar_table_init()) {
3671                if (force_on)
3672                        panic("tboot: Failed to initialize DMAR table\n");
3673                return  -ENODEV;
3674        }
3675
3676        if (dmar_dev_scope_init() < 0) {
3677                if (force_on)
3678                        panic("tboot: Failed to initialize DMAR device scope\n");
3679                return  -ENODEV;
3680        }
3681
3682        if (no_iommu || dmar_disabled)
3683                return -ENODEV;
3684
3685        if (iommu_init_mempool()) {
3686                if (force_on)
3687                        panic("tboot: Failed to initialize iommu memory\n");
3688                return  -ENODEV;
3689        }
3690
3691        if (list_empty(&dmar_rmrr_units))
3692                printk(KERN_INFO "DMAR: No RMRR found\n");
3693
3694        if (list_empty(&dmar_atsr_units))
3695                printk(KERN_INFO "DMAR: No ATSR found\n");
3696
3697        if (dmar_init_reserved_ranges()) {
3698                if (force_on)
3699                        panic("tboot: Failed to reserve iommu ranges\n");
3700                return  -ENODEV;
3701        }
3702
3703        init_no_remapping_devices();
3704
3705        ret = init_dmars();
3706        if (ret) {
3707                if (force_on)
3708                        panic("tboot: Failed to initialize DMARs\n");
3709                printk(KERN_ERR "IOMMU: dmar init failed\n");
3710                put_iova_domain(&reserved_iova_list);
3711                iommu_exit_mempool();
3712                return ret;
3713        }
3714        printk(KERN_INFO
3715        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3716
3717        init_timer(&unmap_timer);
3718#ifdef CONFIG_SWIOTLB
3719        swiotlb = 0;
3720#endif
3721        dma_ops = &intel_dma_ops;
3722
3723        init_iommu_pm_ops();
3724
3725        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3726
3727        bus_register_notifier(&pci_bus_type, &device_nb);
3728
3729        intel_iommu_enabled = 1;
3730
3731        return 0;
3732}
3733
3734static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3735                                           struct pci_dev *pdev)
3736{
3737        struct pci_dev *tmp, *parent;
3738
3739        if (!iommu || !pdev)
3740                return;
3741
3742        /* dependent device detach */
3743        tmp = pci_find_upstream_pcie_bridge(pdev);
3744        /* Secondary interface's bus number and devfn 0 */
3745        if (tmp) {
3746                parent = pdev->bus->self;
3747                while (parent != tmp) {
3748                        iommu_detach_dev(iommu, parent->bus->number,
3749                                         parent->devfn);
3750                        parent = parent->bus->self;
3751                }
3752                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3753                        iommu_detach_dev(iommu,
3754                                tmp->subordinate->number, 0);
3755                else /* this is a legacy PCI bridge */
3756                        iommu_detach_dev(iommu, tmp->bus->number,
3757                                         tmp->devfn);
3758        }
3759}
3760
3761static void domain_remove_one_dev_info(struct dmar_domain *domain,
3762                                          struct pci_dev *pdev)
3763{
3764        struct device_domain_info *info;
3765        struct intel_iommu *iommu;
3766        unsigned long flags;
3767        int found = 0;
3768        struct list_head *entry, *tmp;
3769
3770        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3771                                pdev->devfn);
3772        if (!iommu)
3773                return;
3774
3775        spin_lock_irqsave(&device_domain_lock, flags);
3776        list_for_each_safe(entry, tmp, &domain->devices) {
3777                info = list_entry(entry, struct device_domain_info, link);
3778                if (info->segment == pci_domain_nr(pdev->bus) &&
3779                    info->bus == pdev->bus->number &&
3780                    info->devfn == pdev->devfn) {
3781                        unlink_domain_info(info);
3782                        spin_unlock_irqrestore(&device_domain_lock, flags);
3783
3784                        iommu_disable_dev_iotlb(info);
3785                        iommu_detach_dev(iommu, info->bus, info->devfn);
3786                        iommu_detach_dependent_devices(iommu, pdev);
3787                        free_devinfo_mem(info);
3788
3789                        spin_lock_irqsave(&device_domain_lock, flags);
3790
3791                        if (found)
3792                                break;
3793                        else
3794                                continue;
3795                }
3796
3797                /* if there are no other devices under the same iommu
3798                 * owned by this domain, clear this iommu in iommu_bmp,
3799                 * update the iommu count and coherency
3800                 */
3801                if (iommu == device_to_iommu(info->segment, info->bus,
3802                                            info->devfn))
3803                        found = 1;
3804        }
3805
3806        spin_unlock_irqrestore(&device_domain_lock, flags);
3807
3808        if (found == 0) {
3809                unsigned long tmp_flags;
3810                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3811                clear_bit(iommu->seq_id, domain->iommu_bmp);
3812                domain->iommu_count--;
3813                domain_update_iommu_cap(domain);
3814                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3815
3816                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3817                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3818                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3819                        clear_bit(domain->id, iommu->domain_ids);
3820                        iommu->domains[domain->id] = NULL;
3821                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3822                }
3823        }
3824}
3825
3826static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3827{
3828        struct device_domain_info *info;
3829        struct intel_iommu *iommu;
3830        unsigned long flags1, flags2;
3831
3832        spin_lock_irqsave(&device_domain_lock, flags1);
3833        while (!list_empty(&domain->devices)) {
3834                info = list_entry(domain->devices.next,
3835                        struct device_domain_info, link);
3836                unlink_domain_info(info);
3837                spin_unlock_irqrestore(&device_domain_lock, flags1);
3838
3839                iommu_disable_dev_iotlb(info);
3840                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3841                iommu_detach_dev(iommu, info->bus, info->devfn);
3842                iommu_detach_dependent_devices(iommu, info->dev);
3843
3844                /* clear this iommu in iommu_bmp, update iommu count
3845                 * and capabilities
3846                 */
3847                spin_lock_irqsave(&domain->iommu_lock, flags2);
3848                if (test_and_clear_bit(iommu->seq_id,
3849                                       domain->iommu_bmp)) {
3850                        domain->iommu_count--;
3851                        domain_update_iommu_cap(domain);
3852                }
3853                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3854
3855                free_devinfo_mem(info);
3856                spin_lock_irqsave(&device_domain_lock, flags1);
3857        }
3858        spin_unlock_irqrestore(&device_domain_lock, flags1);
3859}
3860
3861/* domain ids for virtual machine domains; they won't be set in context entries */
3862static unsigned long vm_domid;
3863
3864static struct dmar_domain *iommu_alloc_vm_domain(void)
3865{
3866        struct dmar_domain *domain;
3867
3868        domain = alloc_domain_mem();
3869        if (!domain)
3870                return NULL;
3871
3872        domain->id = vm_domid++;
3873        domain->nid = -1;
3874        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3875        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3876
3877        return domain;
3878}
3879
3880static int md_domain_init(struct dmar_domain *domain, int guest_width)
3881{
3882        int adjust_width;
3883
3884        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3885        spin_lock_init(&domain->iommu_lock);
3886
3887        domain_reserve_special_ranges(domain);
3888
3889        /* calculate AGAW */
3890        domain->gaw = guest_width;
3891        adjust_width = guestwidth_to_adjustwidth(guest_width);
3892        domain->agaw = width_to_agaw(adjust_width);
3893
3894        INIT_LIST_HEAD(&domain->devices);
3895
3896        domain->iommu_count = 0;
3897        domain->iommu_coherency = 0;
3898        domain->iommu_snooping = 0;
3899        domain->iommu_superpage = 0;
3900        domain->max_addr = 0;
3901        domain->nid = -1;
3902
3903        /* always allocate the top pgd */
3904        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3905        if (!domain->pgd)
3906                return -ENOMEM;
3907        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3908        return 0;
3909}
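/*
 * Worked example for the AGAW math above, assuming the usual VT-d page
 * table geometry (12-bit page offset plus 9 bits per level): a
 * guest_width of DEFAULT_DOMAIN_ADDRESS_WIDTH (48) already sits on a
 * level boundary, so adjust_width stays 48 and width_to_agaw() selects
 * a 4-level page table; a 36-bit guest width would be rounded up to 39
 * bits and use 3 levels.
 */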
3910
3911static void iommu_free_vm_domain(struct dmar_domain *domain)
3912{
3913        unsigned long flags;
3914        struct dmar_drhd_unit *drhd;
3915        struct intel_iommu *iommu;
3916        unsigned long i;
3917        unsigned long ndomains;
3918
3919        for_each_drhd_unit(drhd) {
3920                if (drhd->ignored)
3921                        continue;
3922                iommu = drhd->iommu;
3923
3924                ndomains = cap_ndoms(iommu->cap);
3925                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3926                        if (iommu->domains[i] == domain) {
3927                                spin_lock_irqsave(&iommu->lock, flags);
3928                                clear_bit(i, iommu->domain_ids);
3929                                iommu->domains[i] = NULL;
3930                                spin_unlock_irqrestore(&iommu->lock, flags);
3931                                break;
3932                        }
3933                }
3934        }
3935}
3936
3937static void vm_domain_exit(struct dmar_domain *domain)
3938{
3939        /* Domain 0 is reserved, so don't process it */
3940        if (!domain)
3941                return;
3942
3943        vm_domain_remove_all_dev_info(domain);
3944        /* destroy iovas */
3945        put_iova_domain(&domain->iovad);
3946
3947        /* clear ptes */
3948        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3949
3950        /* free page tables */
3951        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3952
3953        iommu_free_vm_domain(domain);
3954        free_domain_mem(domain);
3955}
3956
3957static int intel_iommu_domain_init(struct iommu_domain *domain)
3958{
3959        struct dmar_domain *dmar_domain;
3960
3961        dmar_domain = iommu_alloc_vm_domain();
3962        if (!dmar_domain) {
3963                printk(KERN_ERR
3964                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3965                return -ENOMEM;
3966        }
3967        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3968                printk(KERN_ERR
3969                        "intel_iommu_domain_init() failed\n");
3970                vm_domain_exit(dmar_domain);
3971                return -ENOMEM;
3972        }
3973        domain_update_iommu_cap(dmar_domain);
3974        domain->priv = dmar_domain;
3975
3976        domain->geometry.aperture_start = 0;
3977        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3978        domain->geometry.force_aperture = true;
3979
3980        return 0;
3981}
3982
3983static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3984{
3985        struct dmar_domain *dmar_domain = domain->priv;
3986
3987        domain->priv = NULL;
3988        vm_domain_exit(dmar_domain);
3989}
3990
3991static int intel_iommu_attach_device(struct iommu_domain *domain,
3992                                     struct device *dev)
3993{
3994        struct dmar_domain *dmar_domain = domain->priv;
3995        struct pci_dev *pdev = to_pci_dev(dev);
3996        struct intel_iommu *iommu;
3997        int addr_width;
3998
3999        /* normally pdev is not mapped */
4000        if (unlikely(domain_context_mapped(pdev))) {
4001                struct dmar_domain *old_domain;
4002
4003                old_domain = find_domain(pdev);
4004                if (old_domain) {
4005                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4006                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4007                                domain_remove_one_dev_info(old_domain, pdev);
4008                        else
4009                                domain_remove_dev_info(old_domain);
4010                }
4011        }
4012
4013        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4014                                pdev->devfn);
4015        if (!iommu)
4016                return -ENODEV;
4017
4018        /* check if this iommu agaw is sufficient for max mapped address */
4019        addr_width = agaw_to_width(iommu->agaw);
4020        if (addr_width > cap_mgaw(iommu->cap))
4021                addr_width = cap_mgaw(iommu->cap);
4022
4023        if (dmar_domain->max_addr > (1LL << addr_width)) {
4024                printk(KERN_ERR "%s: iommu width (%d) is not "
4025                       "sufficient for the mapped address (%llx)\n",
4026                       __func__, addr_width, dmar_domain->max_addr);
4027                return -EFAULT;
4028        }
4029        dmar_domain->gaw = addr_width;
4030
4031        /*
4032         * Knock out extra levels of page tables if necessary
4033         */
4034        while (iommu->agaw < dmar_domain->agaw) {
4035                struct dma_pte *pte;
4036
4037                pte = dmar_domain->pgd;
4038                if (dma_pte_present(pte)) {
4039                        dmar_domain->pgd = (struct dma_pte *)
4040                                phys_to_virt(dma_pte_addr(pte));
4041                        free_pgtable_page(pte);
4042                }
4043                dmar_domain->agaw--;
4044        }
4045
4046        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4047}
4048
4049static void intel_iommu_detach_device(struct iommu_domain *domain,
4050                                      struct device *dev)
4051{
4052        struct dmar_domain *dmar_domain = domain->priv;
4053        struct pci_dev *pdev = to_pci_dev(dev);
4054
4055        domain_remove_one_dev_info(dmar_domain, pdev);
4056}
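/*
 * The attach/detach callbacks above are reached through the generic
 * IOMMU API (iommu_attach_device()/iommu_detach_device()); typical
 * users in this era are device-assignment paths such as KVM and VFIO,
 * attaching a PCI device to a VM domain created via
 * intel_iommu_domain_init().
 */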
4057
4058static int intel_iommu_map(struct iommu_domain *domain,
4059                           unsigned long iova, phys_addr_t hpa,
4060                           size_t size, int iommu_prot)
4061{
4062        struct dmar_domain *dmar_domain = domain->priv;
4063        u64 max_addr;
4064        int prot = 0;
4065        int ret;
4066
4067        if (iommu_prot & IOMMU_READ)
4068                prot |= DMA_PTE_READ;
4069        if (iommu_prot & IOMMU_WRITE)
4070                prot |= DMA_PTE_WRITE;
4071        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4072                prot |= DMA_PTE_SNP;
4073
4074        max_addr = iova + size;
4075        if (dmar_domain->max_addr < max_addr) {
4076                u64 end;
4077
4078                /* check if minimum agaw is sufficient for mapped address */
4079                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4080                if (end < max_addr) {
4081                        printk(KERN_ERR "%s: iommu width (%d) is not "
4082                               "sufficient for the mapped address (%llx)\n",
4083                               __func__, dmar_domain->gaw, max_addr);
4084                        return -EFAULT;
4085                }
4086                dmar_domain->max_addr = max_addr;
4087        }
4088        /* Round up size to the next multiple of VTD_PAGE_SIZE if it plus
4089           the low bits of hpa would take us onto the next page */
4090        size = aligned_nrpages(hpa, size);
4091        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4092                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4093        return ret;
4094}
4095
4096static size_t intel_iommu_unmap(struct iommu_domain *domain,
4097                             unsigned long iova, size_t size)
4098{
4099        struct dmar_domain *dmar_domain = domain->priv;
4100        int order;
4101
4102        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4103                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4104
4105        if (dmar_domain->max_addr == iova + size)
4106                dmar_domain->max_addr = iova;
4107
4108        return PAGE_SIZE << order;
4109}
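/*
 * Illustrative use of the map/unmap callbacks above through the generic
 * IOMMU API (a hypothetical snippet, not taken from this file; iova,
 * phys and pdev are the caller's own):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, SZ_4K, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap(dom, iova, SZ_4K);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */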
4110
4111static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4112                                            unsigned long iova)
4113{
4114        struct dmar_domain *dmar_domain = domain->priv;
4115        struct dma_pte *pte;
4116        u64 phys = 0;
4117
4118        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4119        if (pte)
4120                phys = dma_pte_addr(pte);
4121
4122        return phys;
4123}
4124
4125static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4126                                      unsigned long cap)
4127{
4128        struct dmar_domain *dmar_domain = domain->priv;
4129
4130        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4131                return dmar_domain->iommu_snooping;
4132        if (cap == IOMMU_CAP_INTR_REMAP)
4133                return irq_remapping_enabled;
4134
4135        return 0;
4136}
4137
4138static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4139{
4140        pci_dev_put(*from);
4141        *from = to;
4142}
4143
4144#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4145
4146static int intel_iommu_add_device(struct device *dev)
4147{
4148        struct pci_dev *pdev = to_pci_dev(dev);
4149        struct pci_dev *bridge, *dma_pdev = NULL;
4150        struct iommu_group *group;
4151        int ret;
4152
4153        if (!device_to_iommu(pci_domain_nr(pdev->bus),
4154                             pdev->bus->number, pdev->devfn))
4155                return -ENODEV;
4156
4157        bridge = pci_find_upstream_pcie_bridge(pdev);
4158        if (bridge) {
4159                if (pci_is_pcie(bridge))
4160                        dma_pdev = pci_get_domain_bus_and_slot(
4161                                                pci_domain_nr(pdev->bus),
4162                                                bridge->subordinate->number, 0);
4163                if (!dma_pdev)
4164                        dma_pdev = pci_dev_get(bridge);
4165        } else
4166                dma_pdev = pci_dev_get(pdev);
4167
4168        /* Account for quirked devices */
4169        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4170
4171        /*
4172         * If it's a multifunction device that does not support our
4173         * required ACS flags, add to the same group as function 0.
4174         */
4175        if (dma_pdev->multifunction &&
4176            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4177                swap_pci_ref(&dma_pdev,
4178                             pci_get_slot(dma_pdev->bus,
4179                                          PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4180                                          0)));
4181
4182        /*
4183         * Devices on the root bus go through the iommu.  If that's not us,
4184         * find the next upstream device and test ACS up to the root bus.
4185         * Finding the next device may require skipping virtual buses.
4186         */
4187        while (!pci_is_root_bus(dma_pdev->bus)) {
4188                struct pci_bus *bus = dma_pdev->bus;
4189
4190                while (!bus->self) {
4191                        if (!pci_is_root_bus(bus))
4192                                bus = bus->parent;
4193                        else
4194                                goto root_bus;
4195                }
4196
4197                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4198                        break;
4199
4200                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4201        }
4202
4203root_bus:
4204        group = iommu_group_get(&dma_pdev->dev);
4205        pci_dev_put(dma_pdev);
4206        if (!group) {
4207                group = iommu_group_alloc();
4208                if (IS_ERR(group))
4209                        return PTR_ERR(group);
4210        }
4211
4212        ret = iommu_group_add_device(group, dev);
4213
4214        iommu_group_put(group);
4215        return ret;
4216}
4217
4218static void intel_iommu_remove_device(struct device *dev)
4219{
4220        iommu_group_remove_device(dev);
4221}
4222
4223static struct iommu_ops intel_iommu_ops = {
4224        .domain_init    = intel_iommu_domain_init,
4225        .domain_destroy = intel_iommu_domain_destroy,
4226        .attach_dev     = intel_iommu_attach_device,
4227        .detach_dev     = intel_iommu_detach_device,
4228        .map            = intel_iommu_map,
4229        .unmap          = intel_iommu_unmap,
4230        .iova_to_phys   = intel_iommu_iova_to_phys,
4231        .domain_has_cap = intel_iommu_domain_has_cap,
4232        .add_device     = intel_iommu_add_device,
4233        .remove_device  = intel_iommu_remove_device,
4234        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4235};
4236
4237static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4238{
4239        /* G4x/GM45 integrated gfx dmar support is totally busted. */
4240        printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4241        dmar_map_gfx = 0;
4242}
4243
4244DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4245DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4246DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4247DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4248DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4249DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4250DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4251
4252static void quirk_iommu_rwbf(struct pci_dev *dev)
4253{
4254        /*
4255         * Mobile 4 Series Chipset neglects to set RWBF capability,
4256         * but needs it. Same seems to hold for the desktop versions.
4257         */
4258        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4259        rwbf_quirk = 1;
4260}
4261
4262DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4263DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4264DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4265DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4266DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4267DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4268DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4269
4270#define GGC 0x52
4271#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4272#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4273#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4274#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4275#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4276#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4277#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4278#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4279
4280static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4281{
4282        unsigned short ggc;
4283
4284        if (pci_read_config_word(dev, GGC, &ggc))
4285                return;
4286
4287        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4288                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4289                dmar_map_gfx = 0;
4290        } else if (dmar_map_gfx) {
4291                /* we have to ensure the gfx device is idle before we flush */
4292                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4293                intel_iommu_strict = 1;
4294        }
4295}
4296DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4297DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4298DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4299DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4300
4301/* On Tylersburg chipsets, some BIOSes have been known to enable the
4302   ISOCH DMAR unit for the Azalia sound device, but not give it any
4303   TLB entries, which causes it to deadlock. Check for that.  We do
4304   this in a function called from init_dmars(), instead of in a PCI
4305   quirk, because we don't want to print the obnoxious "BIOS broken"
4306   message if VT-d is actually disabled.
4307*/
4308static void __init check_tylersburg_isoch(void)
4309{
4310        struct pci_dev *pdev;
4311        uint32_t vtisochctrl;
4312
4313        /* If there's no Azalia in the system anyway, forget it. */
4314        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4315        if (!pdev)
4316                return;
4317        pci_dev_put(pdev);
4318
4319        /* System Management Registers. Might be hidden, in which case
4320           we can't do the sanity check. But that's OK, because the
4321           known-broken BIOSes _don't_ actually hide it, so far. */
4322        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4323        if (!pdev)
4324                return;
4325
4326        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4327                pci_dev_put(pdev);
4328                return;
4329        }
4330
4331        pci_dev_put(pdev);
4332
4333        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4334        if (vtisochctrl & 1)
4335                return;
4336
4337        /* Drop all bits other than the number of TLB entries */
4338        vtisochctrl &= 0x1c;
4339
4340        /* If we have the recommended number of TLB entries (16), fine. */
4341        if (vtisochctrl == 0x10)
4342                return;
4343
4344        /* Zero TLB entries? That is a thoroughly broken BIOS. */
4345        if (!vtisochctrl) {
4346                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4347                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4348                     dmi_get_system_info(DMI_BIOS_VENDOR),
4349                     dmi_get_system_info(DMI_BIOS_VERSION),
4350                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4351                iommu_identity_mapping |= IDENTMAP_AZALIA;
4352                return;
4353        }
4354
4355        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4356               vtisochctrl);
4357}
4358