linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#define ROOT_SIZE               VTD_PAGE_SIZE
  50#define CONTEXT_SIZE            VTD_PAGE_SIZE
  51
  52#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  53#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  54#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  55
  56#define IOAPIC_RANGE_START      (0xfee00000)
  57#define IOAPIC_RANGE_END        (0xfeefffff)
  58#define IOVA_START_ADDR         (0x1000)
  59
  60#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  61
  62#define MAX_AGAW_WIDTH 64
  63
  64#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  65#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  66
  67/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  68   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  69#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  70                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  71#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  72
  73#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  74#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  75#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  76
  77/* page table handling */
  78#define LEVEL_STRIDE            (9)
  79#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  80
  81/*
   82 * This bitmap is used to advertise the page sizes our hardware supports
  83 * to the IOMMU core, which will then use this information to split
  84 * physically contiguous memory regions it is mapping into page sizes
  85 * that we support.
  86 *
  87 * Traditionally the IOMMU core just handed us the mappings directly,
  88 * after making sure the size is an order of a 4KiB page and that the
  89 * mapping has natural alignment.
  90 *
  91 * To retain this behavior, we currently advertise that we support
  92 * all page sizes that are an order of 4KiB.
  93 *
  94 * If at some point we'd like to utilize the IOMMU core's new behavior,
  95 * we could change this to advertise the real page sizes we support.
  96 */
  97#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
  98
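/*
 * AGAW (adjusted guest address width) encoding used by the helpers below:
 * agaw 0 corresponds to a 2-level page table covering 30 address bits, and
 * each increment adds one level and LEVEL_STRIDE (9) more bits.  For
 * example, the default 48-bit width gives agaw = (48 - 30) / 9 = 2, i.e. a
 * 4-level table.
 */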
  99static inline int agaw_to_level(int agaw)
 100{
 101        return agaw + 2;
 102}
 103
 104static inline int agaw_to_width(int agaw)
 105{
 106        return 30 + agaw * LEVEL_STRIDE;
 107}
 108
 109static inline int width_to_agaw(int width)
 110{
 111        return (width - 30) / LEVEL_STRIDE;
 112}
 113
 114static inline unsigned int level_to_offset_bits(int level)
 115{
 116        return (level - 1) * LEVEL_STRIDE;
 117}
 118
 119static inline int pfn_level_offset(unsigned long pfn, int level)
 120{
 121        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 122}
 123
 124static inline unsigned long level_mask(int level)
 125{
 126        return -1UL << level_to_offset_bits(level);
 127}
 128
 129static inline unsigned long level_size(int level)
 130{
 131        return 1UL << level_to_offset_bits(level);
 132}
 133
 134static inline unsigned long align_to_level(unsigned long pfn, int level)
 135{
 136        return (pfn + level_size(level) - 1) & level_mask(level);
 137}
 138
 139static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 140{
 141        return  1 << ((lvl - 1) * LEVEL_STRIDE);
 142}
 143
  144/* VT-d pages must never be _larger_ than MM pages. Otherwise things
 145   are never going to work. */
 146static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 147{
 148        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 149}
 150
 151static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 152{
 153        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 154}
 155static inline unsigned long page_to_dma_pfn(struct page *pg)
 156{
 157        return mm_to_dma_pfn(page_to_pfn(pg));
 158}
 159static inline unsigned long virt_to_dma_pfn(void *p)
 160{
 161        return page_to_dma_pfn(virt_to_page(p));
 162}
 163
 164/* global iommu list, set NULL for ignored DMAR units */
 165static struct intel_iommu **g_iommus;
 166
 167static void __init check_tylersburg_isoch(void);
 168static int rwbf_quirk;
 169
 170/*
  171 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
  172 * (used when the kernel is launched w/ TXT)
 173 */
 174static int force_on = 0;
 175
 176/*
 177 * 0: Present
 178 * 1-11: Reserved
 179 * 12-63: Context Ptr (12 - (haw-1))
 180 * 64-127: Reserved
 181 */
 182struct root_entry {
 183        u64     val;
 184        u64     rsvd1;
 185};
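/* One root entry per PCI bus: a 4KiB root table holds 256 16-byte entries. */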
 186#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 187static inline bool root_present(struct root_entry *root)
 188{
 189        return (root->val & 1);
 190}
 191static inline void set_root_present(struct root_entry *root)
 192{
 193        root->val |= 1;
 194}
 195static inline void set_root_value(struct root_entry *root, unsigned long value)
 196{
 197        root->val |= value & VTD_PAGE_MASK;
 198}
 199
 200static inline struct context_entry *
 201get_context_addr_from_root(struct root_entry *root)
 202{
 203        return (struct context_entry *)
 204                (root_present(root)?phys_to_virt(
 205                root->val & VTD_PAGE_MASK) :
 206                NULL);
 207}
 208
 209/*
 210 * low 64 bits:
 211 * 0: present
 212 * 1: fault processing disable
 213 * 2-3: translation type
 214 * 12-63: address space root
 215 * high 64 bits:
 216 * 0-2: address width
  217 * 3-6: avail
 218 * 8-23: domain id
 219 */
 220struct context_entry {
 221        u64 lo;
 222        u64 hi;
 223};
 224
 225static inline bool context_present(struct context_entry *context)
 226{
 227        return (context->lo & 1);
 228}
 229static inline void context_set_present(struct context_entry *context)
 230{
 231        context->lo |= 1;
 232}
 233
 234static inline void context_set_fault_enable(struct context_entry *context)
 235{
 236        context->lo &= (((u64)-1) << 2) | 1;
 237}
 238
 239static inline void context_set_translation_type(struct context_entry *context,
 240                                                unsigned long value)
 241{
 242        context->lo &= (((u64)-1) << 4) | 3;
 243        context->lo |= (value & 3) << 2;
 244}
 245
 246static inline void context_set_address_root(struct context_entry *context,
 247                                            unsigned long value)
 248{
 249        context->lo |= value & VTD_PAGE_MASK;
 250}
 251
 252static inline void context_set_address_width(struct context_entry *context,
 253                                             unsigned long value)
 254{
 255        context->hi |= value & 7;
 256}
 257
 258static inline void context_set_domain_id(struct context_entry *context,
 259                                         unsigned long value)
 260{
 261        context->hi |= (value & ((1 << 16) - 1)) << 8;
 262}
 263
 264static inline void context_clear_entry(struct context_entry *context)
 265{
 266        context->lo = 0;
 267        context->hi = 0;
 268}
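/*
 * A context entry is normally built with the setters above in the order used
 * by domain_context_mapping_one() below: domain id, address root (or just the
 * address width in pass-through mode), translation type, fault enable, and
 * finally the present bit, followed by a cache flush.
 */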
 269
 270/*
 271 * 0: readable
 272 * 1: writable
 273 * 2-6: reserved
 274 * 7: super page
 275 * 8-10: available
 276 * 11: snoop behavior
  277 * 12-63: Host physical address
 278 */
 279struct dma_pte {
 280        u64 val;
 281};
 282
 283static inline void dma_clear_pte(struct dma_pte *pte)
 284{
 285        pte->val = 0;
 286}
 287
 288static inline void dma_set_pte_readable(struct dma_pte *pte)
 289{
 290        pte->val |= DMA_PTE_READ;
 291}
 292
 293static inline void dma_set_pte_writable(struct dma_pte *pte)
 294{
 295        pte->val |= DMA_PTE_WRITE;
 296}
 297
 298static inline void dma_set_pte_snp(struct dma_pte *pte)
 299{
 300        pte->val |= DMA_PTE_SNP;
 301}
 302
 303static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 304{
 305        pte->val = (pte->val & ~3) | (prot & 3);
 306}
 307
 308static inline u64 dma_pte_addr(struct dma_pte *pte)
 309{
 310#ifdef CONFIG_64BIT
 311        return pte->val & VTD_PAGE_MASK;
 312#else
 313        /* Must have a full atomic 64-bit read */
 314        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 315#endif
 316}
 317
 318static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 319{
 320        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 321}
 322
 323static inline bool dma_pte_present(struct dma_pte *pte)
 324{
 325        return (pte->val & 3) != 0;
 326}
 327
 328static inline bool dma_pte_superpage(struct dma_pte *pte)
 329{
 330        return (pte->val & (1 << 7));
 331}
 332
 333static inline int first_pte_in_page(struct dma_pte *pte)
 334{
 335        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 336}
 337
 338/*
  339 * This domain is a static identity mapping domain.
  340 *      1. This domain creates a static 1:1 mapping to all usable memory.
  341 *      2. It maps to each iommu if successful.
  342 *      3. Each iommu maps to this domain if successful.
 343 */
 344static struct dmar_domain *si_domain;
 345static int hw_pass_through = 1;
 346
 347/* devices under the same p2p bridge are owned in one domain */
 348#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 349
  350/* domain represents a virtual machine; more than one device
  351 * across iommus may be owned in one domain, e.g. a kvm guest.
 352 */
 353#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 354
  355/* si_domain contains multiple devices */
 356#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 357
 358/* define the limit of IOMMUs supported in each domain */
 359#ifdef  CONFIG_X86
 360# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 361#else
 362# define        IOMMU_UNITS_SUPPORTED   64
 363#endif
 364
 365struct dmar_domain {
 366        int     id;                     /* domain id */
 367        int     nid;                    /* node id */
 368        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
 369                                        /* bitmap of iommus this domain uses*/
 370
 371        struct list_head devices;       /* all devices' list */
 372        struct iova_domain iovad;       /* iova's that belong to this domain */
 373
 374        struct dma_pte  *pgd;           /* virtual address */
 375        int             gaw;            /* max guest address width */
 376
 377        /* adjusted guest address width, 0 is level 2 30-bit */
 378        int             agaw;
 379
 380        int             flags;          /* flags to find out type of domain */
 381
 382        int             iommu_coherency;/* indicate coherency of iommu access */
 383        int             iommu_snooping; /* indicate snooping control feature*/
 384        int             iommu_count;    /* reference count of iommu */
 385        int             iommu_superpage;/* Level of superpages supported:
 386                                           0 == 4KiB (no superpages), 1 == 2MiB,
 387                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 388        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 389        u64             max_addr;       /* maximum mapped address */
 390};
 391
 392/* PCI domain-device relationship */
 393struct device_domain_info {
 394        struct list_head link;  /* link to domain siblings */
 395        struct list_head global; /* link to global list */
 396        int segment;            /* PCI domain */
 397        u8 bus;                 /* PCI bus number */
 398        u8 devfn;               /* PCI devfn number */
 399        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 400        struct intel_iommu *iommu; /* IOMMU used by this device */
 401        struct dmar_domain *domain; /* pointer to domain */
 402};
 403
 404static void flush_unmaps_timeout(unsigned long data);
 405
 406DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 407
 408#define HIGH_WATER_MARK 250
 409struct deferred_flush_tables {
 410        int next;
 411        struct iova *iova[HIGH_WATER_MARK];
 412        struct dmar_domain *domain[HIGH_WATER_MARK];
 413};
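/*
 * Bookkeeping for deferred ("lazy") unmaps: freed IOVAs are queued in these
 * tables and released in batches by flush_unmaps_timeout(); the
 * intel_iommu=strict option disables this batching and flushes the IOTLB on
 * every unmap instead.
 */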
 414
 415static struct deferred_flush_tables *deferred_flush;
 416
  417/* number of registered IOMMUs (sizes g_iommus and domain iommu bitmaps) */
 418static int g_num_of_iommus;
 419
 420static DEFINE_SPINLOCK(async_umap_flush_lock);
 421static LIST_HEAD(unmaps_to_do);
 422
 423static int timer_on;
 424static long list_size;
 425
 426static void domain_remove_dev_info(struct dmar_domain *domain);
 427
 428#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 429int dmar_disabled = 0;
 430#else
 431int dmar_disabled = 1;
 432#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 433
 434int intel_iommu_enabled = 0;
 435EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 436
 437static int dmar_map_gfx = 1;
 438static int dmar_forcedac;
 439static int intel_iommu_strict;
 440static int intel_iommu_superpage = 1;
 441
 442int intel_iommu_gfx_mapped;
 443EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 444
 445#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 446static DEFINE_SPINLOCK(device_domain_lock);
 447static LIST_HEAD(device_domain_list);
 448
 449static struct iommu_ops intel_iommu_ops;
 450
 451static int __init intel_iommu_setup(char *str)
 452{
 453        if (!str)
 454                return -EINVAL;
 455        while (*str) {
 456                if (!strncmp(str, "on", 2)) {
 457                        dmar_disabled = 0;
 458                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 459                } else if (!strncmp(str, "off", 3)) {
 460                        dmar_disabled = 1;
 461                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 462                } else if (!strncmp(str, "igfx_off", 8)) {
 463                        dmar_map_gfx = 0;
 464                        printk(KERN_INFO
 465                                "Intel-IOMMU: disable GFX device mapping\n");
 466                } else if (!strncmp(str, "forcedac", 8)) {
 467                        printk(KERN_INFO
 468                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 469                        dmar_forcedac = 1;
 470                } else if (!strncmp(str, "strict", 6)) {
 471                        printk(KERN_INFO
 472                                "Intel-IOMMU: disable batched IOTLB flush\n");
 473                        intel_iommu_strict = 1;
 474                } else if (!strncmp(str, "sp_off", 6)) {
 475                        printk(KERN_INFO
 476                                "Intel-IOMMU: disable supported super page\n");
 477                        intel_iommu_superpage = 0;
 478                }
 479
 480                str += strcspn(str, ",");
 481                while (*str == ',')
 482                        str++;
 483        }
 484        return 0;
 485}
 486__setup("intel_iommu=", intel_iommu_setup);
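/*
 * The option string is a comma-separated list, so for example booting with
 *
 *     intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables superpage
 * support.
 */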
 487
 488static struct kmem_cache *iommu_domain_cache;
 489static struct kmem_cache *iommu_devinfo_cache;
 490static struct kmem_cache *iommu_iova_cache;
 491
 492static inline void *alloc_pgtable_page(int node)
 493{
 494        struct page *page;
 495        void *vaddr = NULL;
 496
 497        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 498        if (page)
 499                vaddr = page_address(page);
 500        return vaddr;
 501}
 502
 503static inline void free_pgtable_page(void *vaddr)
 504{
 505        free_page((unsigned long)vaddr);
 506}
 507
 508static inline void *alloc_domain_mem(void)
 509{
 510        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 511}
 512
 513static void free_domain_mem(void *vaddr)
 514{
 515        kmem_cache_free(iommu_domain_cache, vaddr);
 516}
 517
 518static inline void * alloc_devinfo_mem(void)
 519{
 520        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 521}
 522
 523static inline void free_devinfo_mem(void *vaddr)
 524{
 525        kmem_cache_free(iommu_devinfo_cache, vaddr);
 526}
 527
 528struct iova *alloc_iova_mem(void)
 529{
 530        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 531}
 532
 533void free_iova_mem(struct iova *iova)
 534{
 535        kmem_cache_free(iommu_iova_cache, iova);
 536}
 537
 538
 539static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 540{
 541        unsigned long sagaw;
 542        int agaw = -1;
 543
 544        sagaw = cap_sagaw(iommu->cap);
 545        for (agaw = width_to_agaw(max_gaw);
 546             agaw >= 0; agaw--) {
 547                if (test_bit(agaw, &sagaw))
 548                        break;
 549        }
 550
 551        return agaw;
 552}
 553
 554/*
 555 * Calculate max SAGAW for each iommu.
 556 */
 557int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 558{
 559        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 560}
 561
 562/*
 563 * calculate agaw for each iommu.
 564 * "SAGAW" may be different across iommus, use a default agaw, and
 565 * get a supported less agaw for iommus that don't support the default agaw.
 566 */
 567int iommu_calculate_agaw(struct intel_iommu *iommu)
 568{
 569        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 570}
 571
  572/* This function only returns a single iommu in a domain */
 573static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 574{
 575        int iommu_id;
 576
 577        /* si_domain and vm domain should not get here. */
 578        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 579        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 580
 581        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 582        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 583                return NULL;
 584
 585        return g_iommus[iommu_id];
 586}
 587
 588static void domain_update_iommu_coherency(struct dmar_domain *domain)
 589{
 590        int i;
 591
 592        domain->iommu_coherency = 1;
 593
 594        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 595                if (!ecap_coherent(g_iommus[i]->ecap)) {
 596                        domain->iommu_coherency = 0;
 597                        break;
 598                }
 599        }
 600}
 601
 602static void domain_update_iommu_snooping(struct dmar_domain *domain)
 603{
 604        int i;
 605
 606        domain->iommu_snooping = 1;
 607
 608        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 609                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 610                        domain->iommu_snooping = 0;
 611                        break;
 612                }
 613        }
 614}
 615
 616static void domain_update_iommu_superpage(struct dmar_domain *domain)
 617{
 618        struct dmar_drhd_unit *drhd;
 619        struct intel_iommu *iommu = NULL;
 620        int mask = 0xf;
 621
 622        if (!intel_iommu_superpage) {
 623                domain->iommu_superpage = 0;
 624                return;
 625        }
 626
  627        /* set iommu_superpage to the largest level supported by all iommus */
 628        for_each_active_iommu(iommu, drhd) {
 629                mask &= cap_super_page_val(iommu->cap);
 630                if (!mask) {
 631                        break;
 632                }
 633        }
 634        domain->iommu_superpage = fls(mask);
 635}
 636
 637/* Some capabilities may be different across iommus */
 638static void domain_update_iommu_cap(struct dmar_domain *domain)
 639{
 640        domain_update_iommu_coherency(domain);
 641        domain_update_iommu_snooping(domain);
 642        domain_update_iommu_superpage(domain);
 643}
 644
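/*
 * Find the IOMMU that covers the PCI device (segment, bus, devfn) by walking
 * each DRHD unit's device scope: an exact match, or a bridge whose secondary
 * bus range contains @bus, selects that unit, and an include_all unit catches
 * any remaining device on its segment.
 */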
 645static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 646{
 647        struct dmar_drhd_unit *drhd = NULL;
 648        int i;
 649
 650        for_each_drhd_unit(drhd) {
 651                if (drhd->ignored)
 652                        continue;
 653                if (segment != drhd->segment)
 654                        continue;
 655
 656                for (i = 0; i < drhd->devices_cnt; i++) {
 657                        if (drhd->devices[i] &&
 658                            drhd->devices[i]->bus->number == bus &&
 659                            drhd->devices[i]->devfn == devfn)
 660                                return drhd->iommu;
 661                        if (drhd->devices[i] &&
 662                            drhd->devices[i]->subordinate &&
 663                            drhd->devices[i]->subordinate->number <= bus &&
 664                            drhd->devices[i]->subordinate->busn_res.end >= bus)
 665                                return drhd->iommu;
 666                }
 667
 668                if (drhd->include_all)
 669                        return drhd->iommu;
 670        }
 671
 672        return NULL;
 673}
 674
 675static void domain_flush_cache(struct dmar_domain *domain,
 676                               void *addr, int size)
 677{
 678        if (!domain->iommu_coherency)
 679                clflush_cache_range(addr, size);
 680}
 681
  682/* Get the context entry for a given bus and devfn, allocating if needed */
 683static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 684                u8 bus, u8 devfn)
 685{
 686        struct root_entry *root;
 687        struct context_entry *context;
 688        unsigned long phy_addr;
 689        unsigned long flags;
 690
 691        spin_lock_irqsave(&iommu->lock, flags);
 692        root = &iommu->root_entry[bus];
 693        context = get_context_addr_from_root(root);
 694        if (!context) {
 695                context = (struct context_entry *)
 696                                alloc_pgtable_page(iommu->node);
 697                if (!context) {
 698                        spin_unlock_irqrestore(&iommu->lock, flags);
 699                        return NULL;
 700                }
 701                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 702                phy_addr = virt_to_phys((void *)context);
 703                set_root_value(root, phy_addr);
 704                set_root_present(root);
 705                __iommu_flush_cache(iommu, root, sizeof(*root));
 706        }
 707        spin_unlock_irqrestore(&iommu->lock, flags);
 708        return &context[devfn];
 709}
 710
 711static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 712{
 713        struct root_entry *root;
 714        struct context_entry *context;
 715        int ret;
 716        unsigned long flags;
 717
 718        spin_lock_irqsave(&iommu->lock, flags);
 719        root = &iommu->root_entry[bus];
 720        context = get_context_addr_from_root(root);
 721        if (!context) {
 722                ret = 0;
 723                goto out;
 724        }
 725        ret = context_present(&context[devfn]);
 726out:
 727        spin_unlock_irqrestore(&iommu->lock, flags);
 728        return ret;
 729}
 730
 731static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 732{
 733        struct root_entry *root;
 734        struct context_entry *context;
 735        unsigned long flags;
 736
 737        spin_lock_irqsave(&iommu->lock, flags);
 738        root = &iommu->root_entry[bus];
 739        context = get_context_addr_from_root(root);
 740        if (context) {
 741                context_clear_entry(&context[devfn]);
 742                __iommu_flush_cache(iommu, &context[devfn], \
 743                        sizeof(*context));
 744        }
 745        spin_unlock_irqrestore(&iommu->lock, flags);
 746}
 747
 748static void free_context_table(struct intel_iommu *iommu)
 749{
 750        struct root_entry *root;
 751        int i;
 752        unsigned long flags;
 753        struct context_entry *context;
 754
 755        spin_lock_irqsave(&iommu->lock, flags);
 756        if (!iommu->root_entry) {
 757                goto out;
 758        }
 759        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 760                root = &iommu->root_entry[i];
 761                context = get_context_addr_from_root(root);
 762                if (context)
 763                        free_pgtable_page(context);
 764        }
 765        free_pgtable_page(iommu->root_entry);
 766        iommu->root_entry = NULL;
 767out:
 768        spin_unlock_irqrestore(&iommu->lock, flags);
 769}
 770
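/*
 * Walk the page table to the entry covering @pfn.  With target_level == 0 the
 * walk descends until it hits a superpage or a non-present entry; with a
 * non-zero target_level it stops at that level, allocating any missing
 * intermediate tables on the way down.
 */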
 771static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 772                                      unsigned long pfn, int target_level)
 773{
 774        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 775        struct dma_pte *parent, *pte = NULL;
 776        int level = agaw_to_level(domain->agaw);
 777        int offset;
 778
 779        BUG_ON(!domain->pgd);
 780        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 781        parent = domain->pgd;
 782
 783        while (level > 0) {
 784                void *tmp_page;
 785
 786                offset = pfn_level_offset(pfn, level);
 787                pte = &parent[offset];
 788                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 789                        break;
 790                if (level == target_level)
 791                        break;
 792
 793                if (!dma_pte_present(pte)) {
 794                        uint64_t pteval;
 795
 796                        tmp_page = alloc_pgtable_page(domain->nid);
 797
 798                        if (!tmp_page)
 799                                return NULL;
 800
 801                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 802                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 803                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 804                                /* Someone else set it while we were thinking; use theirs. */
 805                                free_pgtable_page(tmp_page);
 806                        } else {
 807                                dma_pte_addr(pte);
 808                                domain_flush_cache(domain, pte, sizeof(*pte));
 809                        }
 810                }
 811                parent = phys_to_virt(dma_pte_addr(pte));
 812                level--;
 813        }
 814
 815        return pte;
 816}
 817
 818
 819/* return address's pte at specific level */
 820static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 821                                         unsigned long pfn,
 822                                         int level, int *large_page)
 823{
 824        struct dma_pte *parent, *pte = NULL;
 825        int total = agaw_to_level(domain->agaw);
 826        int offset;
 827
 828        parent = domain->pgd;
 829        while (level <= total) {
 830                offset = pfn_level_offset(pfn, total);
 831                pte = &parent[offset];
 832                if (level == total)
 833                        return pte;
 834
 835                if (!dma_pte_present(pte)) {
 836                        *large_page = total;
 837                        break;
 838                }
 839
 840                if (pte->val & DMA_PTE_LARGE_PAGE) {
 841                        *large_page = total;
 842                        return pte;
 843                }
 844
 845                parent = phys_to_virt(dma_pte_addr(pte));
 846                total--;
 847        }
 848        return NULL;
 849}
 850
 851/* clear last level pte, a tlb flush should be followed */
 852static int dma_pte_clear_range(struct dmar_domain *domain,
 853                                unsigned long start_pfn,
 854                                unsigned long last_pfn)
 855{
 856        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 857        unsigned int large_page = 1;
 858        struct dma_pte *first_pte, *pte;
 859        int order;
 860
 861        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 862        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 863        BUG_ON(start_pfn > last_pfn);
 864
 865        /* we don't need lock here; nobody else touches the iova range */
 866        do {
 867                large_page = 1;
 868                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 869                if (!pte) {
 870                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 871                        continue;
 872                }
 873                do {
 874                        dma_clear_pte(pte);
 875                        start_pfn += lvl_to_nr_pages(large_page);
 876                        pte++;
 877                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 878
 879                domain_flush_cache(domain, first_pte,
 880                                   (void *)pte - (void *)first_pte);
 881
 882        } while (start_pfn && start_pfn <= last_pfn);
 883
 884        order = (large_page - 1) * 9;
 885        return order;
 886}
 887
 888/* free page table pages. last level pte should already be cleared */
 889static void dma_pte_free_pagetable(struct dmar_domain *domain,
 890                                   unsigned long start_pfn,
 891                                   unsigned long last_pfn)
 892{
 893        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 894        struct dma_pte *first_pte, *pte;
 895        int total = agaw_to_level(domain->agaw);
 896        int level;
 897        unsigned long tmp;
 898        int large_page = 2;
 899
 900        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 901        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 902        BUG_ON(start_pfn > last_pfn);
 903
 904        /* We don't need lock here; nobody else touches the iova range */
 905        level = 2;
 906        while (level <= total) {
 907                tmp = align_to_level(start_pfn, level);
 908
 909                /* If we can't even clear one PTE at this level, we're done */
 910                if (tmp + level_size(level) - 1 > last_pfn)
 911                        return;
 912
 913                do {
 914                        large_page = level;
 915                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
 916                        if (large_page > level)
 917                                level = large_page + 1;
 918                        if (!pte) {
 919                                tmp = align_to_level(tmp + 1, level + 1);
 920                                continue;
 921                        }
 922                        do {
 923                                if (dma_pte_present(pte)) {
 924                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 925                                        dma_clear_pte(pte);
 926                                }
 927                                pte++;
 928                                tmp += level_size(level);
 929                        } while (!first_pte_in_page(pte) &&
 930                                 tmp + level_size(level) - 1 <= last_pfn);
 931
 932                        domain_flush_cache(domain, first_pte,
 933                                           (void *)pte - (void *)first_pte);
 934                        
 935                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 936                level++;
 937        }
 938        /* free pgd */
 939        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 940                free_pgtable_page(domain->pgd);
 941                domain->pgd = NULL;
 942        }
 943}
 944
 945/* iommu handling */
 946static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 947{
 948        struct root_entry *root;
 949        unsigned long flags;
 950
 951        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 952        if (!root)
 953                return -ENOMEM;
 954
 955        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 956
 957        spin_lock_irqsave(&iommu->lock, flags);
 958        iommu->root_entry = root;
 959        spin_unlock_irqrestore(&iommu->lock, flags);
 960
 961        return 0;
 962}
 963
 964static void iommu_set_root_entry(struct intel_iommu *iommu)
 965{
 966        void *addr;
 967        u32 sts;
 968        unsigned long flag;
 969
 970        addr = iommu->root_entry;
 971
 972        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 973        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 974
 975        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 976
  977        /* Make sure hardware completes it */
 978        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 979                      readl, (sts & DMA_GSTS_RTPS), sts);
 980
 981        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 982}
 983
 984static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 985{
 986        u32 val;
 987        unsigned long flag;
 988
 989        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 990                return;
 991
 992        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 993        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 994
  995        /* Make sure hardware completes it */
 996        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 997                      readl, (!(val & DMA_GSTS_WBFS)), val);
 998
 999        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1000}
1001
 1002/* return value determines if we need a write buffer flush */
1003static void __iommu_flush_context(struct intel_iommu *iommu,
1004                                  u16 did, u16 source_id, u8 function_mask,
1005                                  u64 type)
1006{
1007        u64 val = 0;
1008        unsigned long flag;
1009
1010        switch (type) {
1011        case DMA_CCMD_GLOBAL_INVL:
1012                val = DMA_CCMD_GLOBAL_INVL;
1013                break;
1014        case DMA_CCMD_DOMAIN_INVL:
1015                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1016                break;
1017        case DMA_CCMD_DEVICE_INVL:
1018                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1019                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1020                break;
1021        default:
1022                BUG();
1023        }
1024        val |= DMA_CCMD_ICC;
1025
1026        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1027        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1028
 1029        /* Make sure hardware completes it */
1030        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1031                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1032
1033        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1034}
1035
 1036/* return value determines if we need a write buffer flush */
1037static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1038                                u64 addr, unsigned int size_order, u64 type)
1039{
1040        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1041        u64 val = 0, val_iva = 0;
1042        unsigned long flag;
1043
1044        switch (type) {
1045        case DMA_TLB_GLOBAL_FLUSH:
 1046                /* global flush doesn't need to set IVA_REG */
1047                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1048                break;
1049        case DMA_TLB_DSI_FLUSH:
1050                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1051                break;
1052        case DMA_TLB_PSI_FLUSH:
1053                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054                /* Note: always flush non-leaf currently */
1055                val_iva = size_order | addr;
1056                break;
1057        default:
1058                BUG();
1059        }
1060        /* Note: set drain read/write */
1061#if 0
1062        /*
 1063         * This is probably just to be extra safe; it looks like we can
1064         * ignore it without any impact.
1065         */
1066        if (cap_read_drain(iommu->cap))
1067                val |= DMA_TLB_READ_DRAIN;
1068#endif
1069        if (cap_write_drain(iommu->cap))
1070                val |= DMA_TLB_WRITE_DRAIN;
1071
1072        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073        /* Note: Only uses first TLB reg currently */
1074        if (val_iva)
1075                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1076        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1077
 1078        /* Make sure hardware completes it */
1079        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1080                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1081
1082        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1083
1084        /* check IOTLB invalidation granularity */
1085        if (DMA_TLB_IAIG(val) == 0)
1086                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1087        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1088                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1089                        (unsigned long long)DMA_TLB_IIRG(type),
1090                        (unsigned long long)DMA_TLB_IAIG(val));
1091}
1092
1093static struct device_domain_info *iommu_support_dev_iotlb(
1094        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1095{
1096        int found = 0;
1097        unsigned long flags;
1098        struct device_domain_info *info;
1099        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1100
1101        if (!ecap_dev_iotlb_support(iommu->ecap))
1102                return NULL;
1103
1104        if (!iommu->qi)
1105                return NULL;
1106
1107        spin_lock_irqsave(&device_domain_lock, flags);
1108        list_for_each_entry(info, &domain->devices, link)
1109                if (info->bus == bus && info->devfn == devfn) {
1110                        found = 1;
1111                        break;
1112                }
1113        spin_unlock_irqrestore(&device_domain_lock, flags);
1114
1115        if (!found || !info->dev)
1116                return NULL;
1117
1118        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1119                return NULL;
1120
1121        if (!dmar_find_matched_atsr_unit(info->dev))
1122                return NULL;
1123
1124        info->iommu = iommu;
1125
1126        return info;
1127}
1128
1129static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1130{
1131        if (!info)
1132                return;
1133
1134        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1135}
1136
1137static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1138{
1139        if (!info->dev || !pci_ats_enabled(info->dev))
1140                return;
1141
1142        pci_disable_ats(info->dev);
1143}
1144
1145static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1146                                  u64 addr, unsigned mask)
1147{
1148        u16 sid, qdep;
1149        unsigned long flags;
1150        struct device_domain_info *info;
1151
1152        spin_lock_irqsave(&device_domain_lock, flags);
1153        list_for_each_entry(info, &domain->devices, link) {
1154                if (!info->dev || !pci_ats_enabled(info->dev))
1155                        continue;
1156
1157                sid = info->bus << 8 | info->devfn;
1158                qdep = pci_ats_queue_depth(info->dev);
1159                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1160        }
1161        spin_unlock_irqrestore(&device_domain_lock, flags);
1162}
1163
1164static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1165                                  unsigned long pfn, unsigned int pages, int map)
1166{
1167        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1168        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1169
1170        BUG_ON(pages == 0);
1171
1172        /*
 1173         * Fall back to domain-selective flush if there is no PSI support or the
 1174         * size is too big.
 1175         * PSI requires the page size to be a power of two and the base address to
 1176         * be naturally aligned to the size.
1177         */
1178        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1179                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1180                                                DMA_TLB_DSI_FLUSH);
1181        else
1182                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1183                                                DMA_TLB_PSI_FLUSH);
1184
1185        /*
1186         * In caching mode, changes of pages from non-present to present require
1187         * flush. However, device IOTLB doesn't need to be flushed in this case.
1188         */
1189        if (!cap_caching_mode(iommu->cap) || !map)
1190                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1191}
1192
1193static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1194{
1195        u32 pmen;
1196        unsigned long flags;
1197
1198        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1199        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1200        pmen &= ~DMA_PMEN_EPM;
1201        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1202
1203        /* wait for the protected region status bit to clear */
1204        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1205                readl, !(pmen & DMA_PMEN_PRS), pmen);
1206
1207        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1208}
1209
1210static int iommu_enable_translation(struct intel_iommu *iommu)
1211{
1212        u32 sts;
1213        unsigned long flags;
1214
1215        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1216        iommu->gcmd |= DMA_GCMD_TE;
1217        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1218
 1219        /* Make sure hardware completes it */
1220        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1221                      readl, (sts & DMA_GSTS_TES), sts);
1222
1223        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1224        return 0;
1225}
1226
1227static int iommu_disable_translation(struct intel_iommu *iommu)
1228{
1229        u32 sts;
1230        unsigned long flag;
1231
1232        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233        iommu->gcmd &= ~DMA_GCMD_TE;
1234        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1235
 1236        /* Make sure hardware completes it */
1237        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238                      readl, (!(sts & DMA_GSTS_TES)), sts);
1239
1240        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241        return 0;
1242}
1243
1244
1245static int iommu_init_domains(struct intel_iommu *iommu)
1246{
1247        unsigned long ndomains;
1248        unsigned long nlongs;
1249
1250        ndomains = cap_ndoms(iommu->cap);
1251        pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1252                        ndomains);
1253        nlongs = BITS_TO_LONGS(ndomains);
1254
1255        spin_lock_init(&iommu->lock);
1256
1257        /* TBD: there might be 64K domains,
1258         * consider other allocation for future chip
1259         */
1260        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1261        if (!iommu->domain_ids) {
1262                printk(KERN_ERR "Allocating domain id array failed\n");
1263                return -ENOMEM;
1264        }
1265        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1266                        GFP_KERNEL);
1267        if (!iommu->domains) {
1268                printk(KERN_ERR "Allocating domain array failed\n");
1269                return -ENOMEM;
1270        }
1271
1272        /*
1273         * if Caching mode is set, then invalid translations are tagged
 1274         * with domain id 0. Hence we need to pre-allocate it.
1275         */
1276        if (cap_caching_mode(iommu->cap))
1277                set_bit(0, iommu->domain_ids);
1278        return 0;
1279}
1280
1281
1282static void domain_exit(struct dmar_domain *domain);
1283static void vm_domain_exit(struct dmar_domain *domain);
1284
1285void free_dmar_iommu(struct intel_iommu *iommu)
1286{
1287        struct dmar_domain *domain;
1288        int i;
1289        unsigned long flags;
1290
1291        if ((iommu->domains) && (iommu->domain_ids)) {
1292                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1293                        domain = iommu->domains[i];
1294                        clear_bit(i, iommu->domain_ids);
1295
1296                        spin_lock_irqsave(&domain->iommu_lock, flags);
1297                        if (--domain->iommu_count == 0) {
1298                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1299                                        vm_domain_exit(domain);
1300                                else
1301                                        domain_exit(domain);
1302                        }
1303                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1304                }
1305        }
1306
1307        if (iommu->gcmd & DMA_GCMD_TE)
1308                iommu_disable_translation(iommu);
1309
1310        if (iommu->irq) {
1311                irq_set_handler_data(iommu->irq, NULL);
1312                /* This will mask the irq */
1313                free_irq(iommu->irq, iommu);
1314                destroy_irq(iommu->irq);
1315        }
1316
1317        kfree(iommu->domains);
1318        kfree(iommu->domain_ids);
1319
1320        g_iommus[iommu->seq_id] = NULL;
1321
1322        /* if all iommus are freed, free g_iommus */
1323        for (i = 0; i < g_num_of_iommus; i++) {
1324                if (g_iommus[i])
1325                        break;
1326        }
1327
1328        if (i == g_num_of_iommus)
1329                kfree(g_iommus);
1330
1331        /* free context mapping */
1332        free_context_table(iommu);
1333}
1334
1335static struct dmar_domain *alloc_domain(void)
1336{
1337        struct dmar_domain *domain;
1338
1339        domain = alloc_domain_mem();
1340        if (!domain)
1341                return NULL;
1342
1343        domain->nid = -1;
1344        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1345        domain->flags = 0;
1346
1347        return domain;
1348}
1349
1350static int iommu_attach_domain(struct dmar_domain *domain,
1351                               struct intel_iommu *iommu)
1352{
1353        int num;
1354        unsigned long ndomains;
1355        unsigned long flags;
1356
1357        ndomains = cap_ndoms(iommu->cap);
1358
1359        spin_lock_irqsave(&iommu->lock, flags);
1360
1361        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1362        if (num >= ndomains) {
1363                spin_unlock_irqrestore(&iommu->lock, flags);
1364                printk(KERN_ERR "IOMMU: no free domain ids\n");
1365                return -ENOMEM;
1366        }
1367
1368        domain->id = num;
1369        set_bit(num, iommu->domain_ids);
1370        set_bit(iommu->seq_id, domain->iommu_bmp);
1371        iommu->domains[num] = domain;
1372        spin_unlock_irqrestore(&iommu->lock, flags);
1373
1374        return 0;
1375}
1376
1377static void iommu_detach_domain(struct dmar_domain *domain,
1378                                struct intel_iommu *iommu)
1379{
1380        unsigned long flags;
1381        int num, ndomains;
1382        int found = 0;
1383
1384        spin_lock_irqsave(&iommu->lock, flags);
1385        ndomains = cap_ndoms(iommu->cap);
1386        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1387                if (iommu->domains[num] == domain) {
1388                        found = 1;
1389                        break;
1390                }
1391        }
1392
1393        if (found) {
1394                clear_bit(num, iommu->domain_ids);
1395                clear_bit(iommu->seq_id, domain->iommu_bmp);
1396                iommu->domains[num] = NULL;
1397        }
1398        spin_unlock_irqrestore(&iommu->lock, flags);
1399}
1400
1401static struct iova_domain reserved_iova_list;
1402static struct lock_class_key reserved_rbtree_key;
1403
1404static int dmar_init_reserved_ranges(void)
1405{
1406        struct pci_dev *pdev = NULL;
1407        struct iova *iova;
1408        int i;
1409
1410        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1411
1412        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1413                &reserved_rbtree_key);
1414
1415        /* IOAPIC ranges shouldn't be accessed by DMA */
1416        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1417                IOVA_PFN(IOAPIC_RANGE_END));
1418        if (!iova) {
1419                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1420                return -ENODEV;
1421        }
1422
1423        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1424        for_each_pci_dev(pdev) {
1425                struct resource *r;
1426
1427                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1428                        r = &pdev->resource[i];
1429                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1430                                continue;
1431                        iova = reserve_iova(&reserved_iova_list,
1432                                            IOVA_PFN(r->start),
1433                                            IOVA_PFN(r->end));
1434                        if (!iova) {
1435                                printk(KERN_ERR "Reserve iova failed\n");
1436                                return -ENODEV;
1437                        }
1438                }
1439        }
1440        return 0;
1441}
1442
1443static void domain_reserve_special_ranges(struct dmar_domain *domain)
1444{
1445        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1446}
1447
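/*
 * Round a guest address width up to the next width the page-table format can
 * represent: 12 bits of page offset plus a whole number of 9-bit levels.
 * E.g. gaw = 36 gives r = (36 - 12) % 9 = 6 and an adjusted width of
 * 36 + 9 - 6 = 39, while 39 and 48 are returned unchanged.
 */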
1448static inline int guestwidth_to_adjustwidth(int gaw)
1449{
1450        int agaw;
1451        int r = (gaw - 12) % 9;
1452
1453        if (r == 0)
1454                agaw = gaw;
1455        else
1456                agaw = gaw + 9 - r;
1457        if (agaw > 64)
1458                agaw = 64;
1459        return agaw;
1460}
1461
1462static int domain_init(struct dmar_domain *domain, int guest_width)
1463{
1464        struct intel_iommu *iommu;
1465        int adjust_width, agaw;
1466        unsigned long sagaw;
1467
1468        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1469        spin_lock_init(&domain->iommu_lock);
1470
1471        domain_reserve_special_ranges(domain);
1472
1473        /* calculate AGAW */
1474        iommu = domain_get_iommu(domain);
1475        if (guest_width > cap_mgaw(iommu->cap))
1476                guest_width = cap_mgaw(iommu->cap);
1477        domain->gaw = guest_width;
1478        adjust_width = guestwidth_to_adjustwidth(guest_width);
1479        agaw = width_to_agaw(adjust_width);
1480        sagaw = cap_sagaw(iommu->cap);
1481        if (!test_bit(agaw, &sagaw)) {
1482                /* hardware doesn't support it, choose a bigger one */
1483                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1484                agaw = find_next_bit(&sagaw, 5, agaw);
1485                if (agaw >= 5)
1486                        return -ENODEV;
1487        }
1488        domain->agaw = agaw;
1489        INIT_LIST_HEAD(&domain->devices);
1490
1491        if (ecap_coherent(iommu->ecap))
1492                domain->iommu_coherency = 1;
1493        else
1494                domain->iommu_coherency = 0;
1495
1496        if (ecap_sc_support(iommu->ecap))
1497                domain->iommu_snooping = 1;
1498        else
1499                domain->iommu_snooping = 0;
1500
1501        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1502        domain->iommu_count = 1;
1503        domain->nid = iommu->node;
1504
1505        /* always allocate the top pgd */
1506        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1507        if (!domain->pgd)
1508                return -ENOMEM;
1509        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1510        return 0;
1511}
1512
1513static void domain_exit(struct dmar_domain *domain)
1514{
1515        struct dmar_drhd_unit *drhd;
1516        struct intel_iommu *iommu;
1517
 1518        /* Domain 0 is reserved, so don't process it */
1519        if (!domain)
1520                return;
1521
1522        /* Flush any lazy unmaps that may reference this domain */
1523        if (!intel_iommu_strict)
1524                flush_unmaps_timeout(0);
1525
1526        domain_remove_dev_info(domain);
1527        /* destroy iovas */
1528        put_iova_domain(&domain->iovad);
1529
1530        /* clear ptes */
1531        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1532
1533        /* free page tables */
1534        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536        for_each_active_iommu(iommu, drhd)
1537                if (test_bit(iommu->seq_id, domain->iommu_bmp))
1538                        iommu_detach_domain(domain, iommu);
1539
1540        free_domain_mem(domain);
1541}
1542
1543static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1544                                 u8 bus, u8 devfn, int translation)
1545{
1546        struct context_entry *context;
1547        unsigned long flags;
1548        struct intel_iommu *iommu;
1549        struct dma_pte *pgd;
1550        unsigned long num;
1551        unsigned long ndomains;
1552        int id;
1553        int agaw;
1554        struct device_domain_info *info = NULL;
1555
1556        pr_debug("Set context mapping for %02x:%02x.%d\n",
1557                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1558
1559        BUG_ON(!domain->pgd);
1560        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1561               translation != CONTEXT_TT_MULTI_LEVEL);
1562
1563        iommu = device_to_iommu(segment, bus, devfn);
1564        if (!iommu)
1565                return -ENODEV;
1566
1567        context = device_to_context_entry(iommu, bus, devfn);
1568        if (!context)
1569                return -ENOMEM;
1570        spin_lock_irqsave(&iommu->lock, flags);
1571        if (context_present(context)) {
1572                spin_unlock_irqrestore(&iommu->lock, flags);
1573                return 0;
1574        }
1575
1576        id = domain->id;
1577        pgd = domain->pgd;
1578
1579        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1580            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1581                int found = 0;
1582
1583                /* find an available domain id for this device in iommu */
1584                ndomains = cap_ndoms(iommu->cap);
1585                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1586                        if (iommu->domains[num] == domain) {
1587                                id = num;
1588                                found = 1;
1589                                break;
1590                        }
1591                }
1592
1593                if (found == 0) {
1594                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1595                        if (num >= ndomains) {
1596                                spin_unlock_irqrestore(&iommu->lock, flags);
1597                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1598                                return -EFAULT;
1599                        }
1600
1601                        set_bit(num, iommu->domain_ids);
1602                        iommu->domains[num] = domain;
1603                        id = num;
1604                }
1605
1606                /* Skip top levels of the page tables for an iommu
1607                 * which has a smaller agaw than the default.
1608                 * Unnecessary for PT mode.
1609                 */
1610                if (translation != CONTEXT_TT_PASS_THROUGH) {
1611                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1612                                pgd = phys_to_virt(dma_pte_addr(pgd));
1613                                if (!dma_pte_present(pgd)) {
1614                                        spin_unlock_irqrestore(&iommu->lock, flags);
1615                                        return -ENOMEM;
1616                                }
1617                        }
1618                }
1619        }
1620
1621        context_set_domain_id(context, id);
1622
1623        if (translation != CONTEXT_TT_PASS_THROUGH) {
1624                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1625                translation = info ? CONTEXT_TT_DEV_IOTLB :
1626                                     CONTEXT_TT_MULTI_LEVEL;
1627        }
1628        /*
1629         * In pass through mode, AW must be programmed to indicate the largest
1630         * AGAW value supported by hardware. And ASR is ignored by hardware.
1631         */
1632        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1633                context_set_address_width(context, iommu->msagaw);
1634        else {
1635                context_set_address_root(context, virt_to_phys(pgd));
1636                context_set_address_width(context, iommu->agaw);
1637        }
1638
1639        context_set_translation_type(context, translation);
1640        context_set_fault_enable(context);
1641        context_set_present(context);
1642        domain_flush_cache(domain, context, sizeof(*context));
1643
1644        /*
1645         * It's a non-present to present mapping. If hardware doesn't cache
1646         * non-present entries we only need to flush the write-buffer. If it
1647         * _does_ cache non-present entries, then it does so in the special
1648         * domain #0, which we have to flush:
1649         */
1650        if (cap_caching_mode(iommu->cap)) {
1651                iommu->flush.flush_context(iommu, 0,
1652                                           (((u16)bus) << 8) | devfn,
1653                                           DMA_CCMD_MASK_NOBIT,
1654                                           DMA_CCMD_DEVICE_INVL);
1655                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1656        } else {
1657                iommu_flush_write_buffer(iommu);
1658        }
1659        iommu_enable_dev_iotlb(info);
1660        spin_unlock_irqrestore(&iommu->lock, flags);
1661
1662        spin_lock_irqsave(&domain->iommu_lock, flags);
1663        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1664                domain->iommu_count++;
1665                if (domain->iommu_count == 1)
1666                        domain->nid = iommu->node;
1667                domain_update_iommu_cap(domain);
1668        }
1669        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1670        return 0;
1671}
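/*
 * Note on the function above: a context entry ties one (bus, devfn) on one
 * IOMMU to a domain id, an address width and a page-table root.  For
 * pass-through mode the root (ASR) is ignored by hardware, so only the
 * widest supported AGAW is programmed; otherwise the entry points at the
 * domain's pgd, after skipping top levels when this IOMMU supports a
 * smaller AGAW than the one the domain was built with.
 */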
1672
1673static int
1674domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1675                        int translation)
1676{
1677        int ret;
1678        struct pci_dev *tmp, *parent;
1679
1680        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1681                                         pdev->bus->number, pdev->devfn,
1682                                         translation);
1683        if (ret)
1684                return ret;
1685
1686        /* dependent device mapping */
1687        tmp = pci_find_upstream_pcie_bridge(pdev);
1688        if (!tmp)
1689                return 0;
1690        /* Secondary interface's bus number and devfn 0 */
1691        parent = pdev->bus->self;
1692        while (parent != tmp) {
1693                ret = domain_context_mapping_one(domain,
1694                                                 pci_domain_nr(parent->bus),
1695                                                 parent->bus->number,
1696                                                 parent->devfn, translation);
1697                if (ret)
1698                        return ret;
1699                parent = parent->bus->self;
1700        }
1701        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1702                return domain_context_mapping_one(domain,
1703                                        pci_domain_nr(tmp->subordinate),
1704                                        tmp->subordinate->number, 0,
1705                                        translation);
1706        else /* this is a legacy PCI bridge */
1707                return domain_context_mapping_one(domain,
1708                                                  pci_domain_nr(tmp->bus),
1709                                                  tmp->bus->number,
1710                                                  tmp->devfn,
1711                                                  translation);
1712}
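/*
 * Example of the walk above (hypothetical topology): for an endpoint
 * 0000:05:00.0 sitting behind a PCIe-to-PCI bridge, context entries are
 * programmed for the device itself, for every bridge between it and the
 * upstream PCIe bridge, and finally for (secondary bus of the bridge,
 * devfn 0), since requests from conventional PCI devices behind a
 * PCIe-to-PCI bridge are all tagged with that source-id.  For a legacy
 * PCI bridge the bridge's own bus/devfn is used instead.
 */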
1713
1714static int domain_context_mapped(struct pci_dev *pdev)
1715{
1716        int ret;
1717        struct pci_dev *tmp, *parent;
1718        struct intel_iommu *iommu;
1719
1720        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1721                                pdev->devfn);
1722        if (!iommu)
1723                return -ENODEV;
1724
1725        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1726        if (!ret)
1727                return ret;
1728        /* dependent device mapping */
1729        tmp = pci_find_upstream_pcie_bridge(pdev);
1730        if (!tmp)
1731                return ret;
1732        /* Secondary interface's bus number and devfn 0 */
1733        parent = pdev->bus->self;
1734        while (parent != tmp) {
1735                ret = device_context_mapped(iommu, parent->bus->number,
1736                                            parent->devfn);
1737                if (!ret)
1738                        return ret;
1739                parent = parent->bus->self;
1740        }
1741        if (pci_is_pcie(tmp))
1742                return device_context_mapped(iommu, tmp->subordinate->number,
1743                                             0);
1744        else
1745                return device_context_mapped(iommu, tmp->bus->number,
1746                                             tmp->devfn);
1747}
1748
1749/* Returns a number of VTD pages, but aligned to MM page size */
1750static inline unsigned long aligned_nrpages(unsigned long host_addr,
1751                                            size_t size)
1752{
1753        host_addr &= ~PAGE_MASK;
1754        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1755}
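/*
 * Worked example (assuming 4KiB MM pages and 4KiB VTD pages): for
 * host_addr 0x12345234 and size 0x2000, the in-page offset is 0x234,
 * PAGE_ALIGN(0x234 + 0x2000) is 0x3000, so three VTD pages are needed
 * even though the size alone would fit in two.
 */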
1756
1757/* Return largest possible superpage level for a given mapping */
1758static inline int hardware_largepage_caps(struct dmar_domain *domain,
1759                                          unsigned long iov_pfn,
1760                                          unsigned long phy_pfn,
1761                                          unsigned long pages)
1762{
1763        int support, level = 1;
1764        unsigned long pfnmerge;
1765
1766        support = domain->iommu_superpage;
1767
1768        /* To use a large page, the virtual *and* physical addresses
1769           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1770           of them will mean we have to use smaller pages. So just
1771           merge them and check both at once. */
1772        pfnmerge = iov_pfn | phy_pfn;
1773
1774        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1775                pages >>= VTD_STRIDE_SHIFT;
1776                if (!pages)
1777                        break;
1778                pfnmerge >>= VTD_STRIDE_SHIFT;
1779                level++;
1780                support--;
1781        }
1782        return level;
1783}
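/*
 * Worked example: with domain->iommu_superpage == 1 (2MiB superpages
 * supported), iov_pfn and phy_pfn both multiples of 512 and pages >= 512,
 * the low nine bits of pfnmerge are clear, so the loop runs once and
 * returns level 2 (one 2MiB superpage covers 512 contiguous 4KiB pages).
 * Any misalignment in either address, or fewer than 512 pages, keeps the
 * mapping at level 1.
 */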
1784
1785static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1786                            struct scatterlist *sg, unsigned long phys_pfn,
1787                            unsigned long nr_pages, int prot)
1788{
1789        struct dma_pte *first_pte = NULL, *pte = NULL;
1790        phys_addr_t uninitialized_var(pteval);
1791        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1792        unsigned long sg_res;
1793        unsigned int largepage_lvl = 0;
1794        unsigned long lvl_pages = 0;
1795
1796        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1797
1798        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1799                return -EINVAL;
1800
1801        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1802
1803        if (sg)
1804                sg_res = 0;
1805        else {
1806                sg_res = nr_pages + 1;
1807                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1808        }
1809
1810        while (nr_pages > 0) {
1811                uint64_t tmp;
1812
1813                if (!sg_res) {
1814                        sg_res = aligned_nrpages(sg->offset, sg->length);
1815                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1816                        sg->dma_length = sg->length;
1817                        pteval = page_to_phys(sg_page(sg)) | prot;
1818                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1819                }
1820
1821                if (!pte) {
1822                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1823
1824                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1825                        if (!pte)
1826                                return -ENOMEM;
1827                        /* It is a large page */
1828                        if (largepage_lvl > 1)
1829                                pteval |= DMA_PTE_LARGE_PAGE;
1830                        else
1831                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1832
1833                }
1834                /* We don't need a lock here; nobody else
1835                 * touches this iova range.
1836                 */
1837                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1838                if (tmp) {
1839                        static int dumps = 5;
1840                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1841                               iov_pfn, tmp, (unsigned long long)pteval);
1842                        if (dumps) {
1843                                dumps--;
1844                                debug_dma_dump_mappings(NULL);
1845                        }
1846                        WARN_ON(1);
1847                }
1848
1849                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1850
1851                BUG_ON(nr_pages < lvl_pages);
1852                BUG_ON(sg_res < lvl_pages);
1853
1854                nr_pages -= lvl_pages;
1855                iov_pfn += lvl_pages;
1856                phys_pfn += lvl_pages;
1857                pteval += lvl_pages * VTD_PAGE_SIZE;
1858                sg_res -= lvl_pages;
1859
1860                /* If the next PTE would be the first in a new page, then we
1861                   need to flush the cache on the entries we've just written.
1862                   And then we'll need to recalculate 'pte', so clear it and
1863                   let it get set again in the if (!pte) block above.
1864
1865                   If we're done (!nr_pages) we need to flush the cache too.
1866
1867                   Also if we've been setting superpages, we may need to
1868                   recalculate 'pte' and switch back to smaller pages for the
1869                   end of the mapping, if the trailing size is not enough to
1870                   use another superpage (i.e. sg_res < lvl_pages). */
1871                pte++;
1872                if (!nr_pages || first_pte_in_page(pte) ||
1873                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1874                        domain_flush_cache(domain, first_pte,
1875                                           (void *)pte - (void *)first_pte);
1876                        pte = NULL;
1877                }
1878
1879                if (!sg_res && nr_pages)
1880                        sg = sg_next(sg);
1881        }
1882        return 0;
1883}
1884
1885static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1886                                    struct scatterlist *sg, unsigned long nr_pages,
1887                                    int prot)
1888{
1889        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1890}
1891
1892static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1893                                     unsigned long phys_pfn, unsigned long nr_pages,
1894                                     int prot)
1895{
1896        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1897}
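/*
 * Minimal usage sketch for the wrappers above (hypothetical values): to
 * identity-map 16 pages starting at pfn 0x1000 with read/write access, a
 * caller would do
 *
 *      domain_pfn_mapping(domain, 0x1000, 0x1000, 16,
 *                         DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * which is the pattern iommu_domain_identity_map() below uses for RMRR
 * and 1:1 ranges.
 */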
1898
1899static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1900{
1901        if (!iommu)
1902                return;
1903
1904        clear_context_table(iommu, bus, devfn);
1905        iommu->flush.flush_context(iommu, 0, 0, 0,
1906                                           DMA_CCMD_GLOBAL_INVL);
1907        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1908}
1909
1910static inline void unlink_domain_info(struct device_domain_info *info)
1911{
1912        assert_spin_locked(&device_domain_lock);
1913        list_del(&info->link);
1914        list_del(&info->global);
1915        if (info->dev)
1916                info->dev->dev.archdata.iommu = NULL;
1917}
1918
1919static void domain_remove_dev_info(struct dmar_domain *domain)
1920{
1921        struct device_domain_info *info;
1922        unsigned long flags;
1923        struct intel_iommu *iommu;
1924
1925        spin_lock_irqsave(&device_domain_lock, flags);
1926        while (!list_empty(&domain->devices)) {
1927                info = list_entry(domain->devices.next,
1928                        struct device_domain_info, link);
1929                unlink_domain_info(info);
1930                spin_unlock_irqrestore(&device_domain_lock, flags);
1931
1932                iommu_disable_dev_iotlb(info);
1933                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1934                iommu_detach_dev(iommu, info->bus, info->devfn);
1935                free_devinfo_mem(info);
1936
1937                spin_lock_irqsave(&device_domain_lock, flags);
1938        }
1939        spin_unlock_irqrestore(&device_domain_lock, flags);
1940}
1941
1942/*
1943 * find_domain
1944 * Note: the per-device info is stored in struct pci_dev->dev.archdata.iommu
1945 */
1946static struct dmar_domain *
1947find_domain(struct pci_dev *pdev)
1948{
1949        struct device_domain_info *info;
1950
1951        /* No lock here, assumes no domain exit in normal case */
1952        info = pdev->dev.archdata.iommu;
1953        if (info)
1954                return info->domain;
1955        return NULL;
1956}
1957
1958/* domain is initialized */
1959static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1960{
1961        struct dmar_domain *domain, *found = NULL;
1962        struct intel_iommu *iommu;
1963        struct dmar_drhd_unit *drhd;
1964        struct device_domain_info *info, *tmp;
1965        struct pci_dev *dev_tmp;
1966        unsigned long flags;
1967        int bus = 0, devfn = 0;
1968        int segment;
1969        int ret;
1970
1971        domain = find_domain(pdev);
1972        if (domain)
1973                return domain;
1974
1975        segment = pci_domain_nr(pdev->bus);
1976
1977        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1978        if (dev_tmp) {
1979                if (pci_is_pcie(dev_tmp)) {
1980                        bus = dev_tmp->subordinate->number;
1981                        devfn = 0;
1982                } else {
1983                        bus = dev_tmp->bus->number;
1984                        devfn = dev_tmp->devfn;
1985                }
1986                spin_lock_irqsave(&device_domain_lock, flags);
1987                list_for_each_entry(info, &device_domain_list, global) {
1988                        if (info->segment == segment &&
1989                            info->bus == bus && info->devfn == devfn) {
1990                                found = info->domain;
1991                                break;
1992                        }
1993                }
1994                spin_unlock_irqrestore(&device_domain_lock, flags);
1995                /* the pcie-pci bridge already has a domain; use it */
1996                if (found) {
1997                        domain = found;
1998                        goto found_domain;
1999                }
2000        }
2001
2002        domain = alloc_domain();
2003        if (!domain)
2004                goto error;
2005
2006        /* Allocate new domain for the device */
2007        drhd = dmar_find_matched_drhd_unit(pdev);
2008        if (!drhd) {
2009                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2010                        pci_name(pdev));
2011                free_domain_mem(domain);
2012                return NULL;
2013        }
2014        iommu = drhd->iommu;
2015
2016        ret = iommu_attach_domain(domain, iommu);
2017        if (ret) {
2018                free_domain_mem(domain);
2019                goto error;
2020        }
2021
2022        if (domain_init(domain, gaw)) {
2023                domain_exit(domain);
2024                goto error;
2025        }
2026
2027        /* register pcie-to-pci device */
2028        if (dev_tmp) {
2029                info = alloc_devinfo_mem();
2030                if (!info) {
2031                        domain_exit(domain);
2032                        goto error;
2033                }
2034                info->segment = segment;
2035                info->bus = bus;
2036                info->devfn = devfn;
2037                info->dev = NULL;
2038                info->domain = domain;
2039                /* This domain is shared by devices under the p2p bridge */
2040                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2041
2042                /* the pcie-to-pci bridge already has a domain; use it */
2043                found = NULL;
2044                spin_lock_irqsave(&device_domain_lock, flags);
2045                list_for_each_entry(tmp, &device_domain_list, global) {
2046                        if (tmp->segment == segment &&
2047                            tmp->bus == bus && tmp->devfn == devfn) {
2048                                found = tmp->domain;
2049                                break;
2050                        }
2051                }
2052                if (found) {
2053                        spin_unlock_irqrestore(&device_domain_lock, flags);
2054                        free_devinfo_mem(info);
2055                        domain_exit(domain);
2056                        domain = found;
2057                } else {
2058                        list_add(&info->link, &domain->devices);
2059                        list_add(&info->global, &device_domain_list);
2060                        spin_unlock_irqrestore(&device_domain_lock, flags);
2061                }
2062        }
2063
2064found_domain:
2065        info = alloc_devinfo_mem();
2066        if (!info)
2067                goto error;
2068        info->segment = segment;
2069        info->bus = pdev->bus->number;
2070        info->devfn = pdev->devfn;
2071        info->dev = pdev;
2072        info->domain = domain;
2073        spin_lock_irqsave(&device_domain_lock, flags);
2074        /* somebody else was faster and already set it up */
2075        found = find_domain(pdev);
2076        if (found != NULL) {
2077                spin_unlock_irqrestore(&device_domain_lock, flags);
2078                if (found != domain) {
2079                        domain_exit(domain);
2080                        domain = found;
2081                }
2082                free_devinfo_mem(info);
2083                return domain;
2084        }
2085        list_add(&info->link, &domain->devices);
2086        list_add(&info->global, &device_domain_list);
2087        pdev->dev.archdata.iommu = info;
2088        spin_unlock_irqrestore(&device_domain_lock, flags);
2089        return domain;
2090error:
2091        /* recheck it here; someone else may have set it meanwhile */
2092        return find_domain(pdev);
2093}
2094
2095static int iommu_identity_mapping;
2096#define IDENTMAP_ALL            1
2097#define IDENTMAP_GFX            2
2098#define IDENTMAP_AZALIA         4
2099
2100static int iommu_domain_identity_map(struct dmar_domain *domain,
2101                                     unsigned long long start,
2102                                     unsigned long long end)
2103{
2104        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2105        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2106
2107        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2108                          dma_to_mm_pfn(last_vpfn))) {
2109                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2110                return -ENOMEM;
2111        }
2112
2113        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2114                 start, end, domain->id);
2115        /*
2116         * The RMRR range might overlap with a physical memory range,
2117         * so clear it first.
2118         */
2119        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2120
2121        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2122                                  last_vpfn - first_vpfn + 1,
2123                                  DMA_PTE_READ|DMA_PTE_WRITE);
2124}
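/*
 * Worked example (4KiB VTD pages): for start = 0 and end = 0xffffff (the
 * 0-16MiB ISA range used below), first_vpfn = 0 and last_vpfn = 0xfff, so
 * 0x1000 pfns are reserved in the iova allocator and then mapped 1:1.
 */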
2125
2126static int iommu_prepare_identity_map(struct pci_dev *pdev,
2127                                      unsigned long long start,
2128                                      unsigned long long end)
2129{
2130        struct dmar_domain *domain;
2131        int ret;
2132
2133        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2134        if (!domain)
2135                return -ENOMEM;
2136
2137        /* For _hardware_ passthrough, don't bother. But for software
2138           passthrough, we do it anyway -- it may indicate a memory
2139           range which is reserved in E820 and so didn't get set
2140           up in si_domain to start with */
2141        if (domain == si_domain && hw_pass_through) {
2142                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2143                       pci_name(pdev), start, end);
2144                return 0;
2145        }
2146
2147        printk(KERN_INFO
2148               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2149               pci_name(pdev), start, end);
2150
2151        if (end < start) {
2152                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2153                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2154                        dmi_get_system_info(DMI_BIOS_VENDOR),
2155                        dmi_get_system_info(DMI_BIOS_VERSION),
2156                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2157                ret = -EIO;
2158                goto error;
2159        }
2160
2161        if (end >> agaw_to_width(domain->agaw)) {
2162                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2163                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2164                     agaw_to_width(domain->agaw),
2165                     dmi_get_system_info(DMI_BIOS_VENDOR),
2166                     dmi_get_system_info(DMI_BIOS_VERSION),
2167                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2168                ret = -EIO;
2169                goto error;
2170        }
2171
2172        ret = iommu_domain_identity_map(domain, start, end);
2173        if (ret)
2174                goto error;
2175
2176        /* context entry init */
2177        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2178        if (ret)
2179                goto error;
2180
2181        return 0;
2182
2183 error:
2184        domain_exit(domain);
2185        return ret;
2186}
2187
2188static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2189        struct pci_dev *pdev)
2190{
2191        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2192                return 0;
2193        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2194                rmrr->end_address);
2195}
2196
2197#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2198static inline void iommu_prepare_isa(void)
2199{
2200        struct pci_dev *pdev;
2201        int ret;
2202
2203        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2204        if (!pdev)
2205                return;
2206
2207        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2208        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2209
2210        if (ret)
2211                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2212                       "floppy might not work\n");
2213
2214}
2215#else
2216static inline void iommu_prepare_isa(void)
2217{
2218        return;
2219}
2220#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2221
2222static int md_domain_init(struct dmar_domain *domain, int guest_width);
2223
2224static int __init si_domain_init(int hw)
2225{
2226        struct dmar_drhd_unit *drhd;
2227        struct intel_iommu *iommu;
2228        int nid, ret = 0;
2229
2230        si_domain = alloc_domain();
2231        if (!si_domain)
2232                return -EFAULT;
2233
2234        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2235
2236        for_each_active_iommu(iommu, drhd) {
2237                ret = iommu_attach_domain(si_domain, iommu);
2238                if (ret) {
2239                        domain_exit(si_domain);
2240                        return -EFAULT;
2241                }
2242        }
2243
2244        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2245                domain_exit(si_domain);
2246                return -EFAULT;
2247        }
2248
2249        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2250
2251        if (hw)
2252                return 0;
2253
2254        for_each_online_node(nid) {
2255                unsigned long start_pfn, end_pfn;
2256                int i;
2257
2258                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2259                        ret = iommu_domain_identity_map(si_domain,
2260                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2261                        if (ret)
2262                                return ret;
2263                }
2264        }
2265
2266        return 0;
2267}
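/*
 * Note on the loop above (software passthrough only): every usable memory
 * range of every online node is mapped 1:1 into si_domain.  For example,
 * a range starting at pfn 0x100 passes PFN_PHYS(0x100) == 0x100000 (with
 * 4KiB pages) as the start address to iommu_domain_identity_map().
 */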
2268
2269static void domain_remove_one_dev_info(struct dmar_domain *domain,
2270                                          struct pci_dev *pdev);
2271static int identity_mapping(struct pci_dev *pdev)
2272{
2273        struct device_domain_info *info;
2274
2275        if (likely(!iommu_identity_mapping))
2276                return 0;
2277
2278        info = pdev->dev.archdata.iommu;
2279        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2280                return (info->domain == si_domain);
2281
2282        return 0;
2283}
2284
2285static int domain_add_dev_info(struct dmar_domain *domain,
2286                               struct pci_dev *pdev,
2287                               int translation)
2288{
2289        struct device_domain_info *info;
2290        unsigned long flags;
2291        int ret;
2292
2293        info = alloc_devinfo_mem();
2294        if (!info)
2295                return -ENOMEM;
2296
2297        info->segment = pci_domain_nr(pdev->bus);
2298        info->bus = pdev->bus->number;
2299        info->devfn = pdev->devfn;
2300        info->dev = pdev;
2301        info->domain = domain;
2302
2303        spin_lock_irqsave(&device_domain_lock, flags);
2304        list_add(&info->link, &domain->devices);
2305        list_add(&info->global, &device_domain_list);
2306        pdev->dev.archdata.iommu = info;
2307        spin_unlock_irqrestore(&device_domain_lock, flags);
2308
2309        ret = domain_context_mapping(domain, pdev, translation);
2310        if (ret) {
2311                spin_lock_irqsave(&device_domain_lock, flags);
2312                unlink_domain_info(info);
2313                spin_unlock_irqrestore(&device_domain_lock, flags);
2314                free_devinfo_mem(info);
2315                return ret;
2316        }
2317
2318        return 0;
2319}
2320
2321static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2322{
2323        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2324                return 1;
2325
2326        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2327                return 1;
2328
2329        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2330                return 0;
2331
2332        /*
2333         * We want to start off with all devices in the 1:1 domain, and
2334         * take them out later if we find they can't access all of memory.
2335         *
2336         * However, we can't do this for PCI devices behind bridges,
2337         * because all PCI devices behind the same bridge will end up
2338         * with the same source-id on their transactions.
2339         *
2340         * Practically speaking, we can't change things around for these
2341         * devices at run-time, because we can't be sure there'll be no
2342         * DMA transactions in flight for any of their siblings.
2343         *
2344         * So PCI devices (unless they're on the root bus) as well as
2345         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2346         * the 1:1 domain, just in _case_ one of their siblings turns out
2347         * not to be able to map all of memory.
2348         */
2349        if (!pci_is_pcie(pdev)) {
2350                if (!pci_is_root_bus(pdev->bus))
2351                        return 0;
2352                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2353                        return 0;
2354        } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2355                return 0;
2356
2357        /*
2358         * At boot time, we don't yet know if devices will be 64-bit capable.
2359         * Assume that they will -- if they turn out not to be, then we can
2360         * take them out of the 1:1 domain later.
2361         */
2362        if (!startup) {
2363                /*
2364                 * If the device's dma_mask is less than the system's memory
2365                 * size then this is not a candidate for identity mapping.
2366                 */
2367                u64 dma_mask = pdev->dma_mask;
2368
2369                if (pdev->dev.coherent_dma_mask &&
2370                    pdev->dev.coherent_dma_mask < dma_mask)
2371                        dma_mask = pdev->dev.coherent_dma_mask;
2372
2373                return dma_mask >= dma_get_required_mask(&pdev->dev);
2374        }
2375
2376        return 1;
2377}
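/*
 * Example of the run-time check above: a device whose effective dma_mask
 * only covers 32 bits on a machine with more than 4GiB of RAM has
 * dma_mask < dma_get_required_mask(), so it is not (or no longer) a
 * candidate for the 1:1 domain and gets a private DMA domain instead.
 */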
2378
2379static int __init iommu_prepare_static_identity_mapping(int hw)
2380{
2381        struct pci_dev *pdev = NULL;
2382        int ret;
2383
2384        ret = si_domain_init(hw);
2385        if (ret)
2386                return -EFAULT;
2387
2388        for_each_pci_dev(pdev) {
2389                if (iommu_should_identity_map(pdev, 1)) {
2390                        ret = domain_add_dev_info(si_domain, pdev,
2391                                             hw ? CONTEXT_TT_PASS_THROUGH :
2392                                                  CONTEXT_TT_MULTI_LEVEL);
2393                        if (ret) {
2394                                /* device not associated with an iommu */
2395                                if (ret == -ENODEV)
2396                                        continue;
2397                                return ret;
2398                        }
2399                        pr_info("IOMMU: %s identity mapping for device %s\n",
2400                                hw ? "hardware" : "software", pci_name(pdev));
2401                }
2402        }
2403
2404        return 0;
2405}
2406
2407static int __init init_dmars(void)
2408{
2409        struct dmar_drhd_unit *drhd;
2410        struct dmar_rmrr_unit *rmrr;
2411        struct pci_dev *pdev;
2412        struct intel_iommu *iommu;
2413        int i, ret;
2414
2415        /*
2416         * for each drhd
2417         *    allocate root
2418         *    initialize and program root entry to not present
2419         * endfor
2420         */
2421        for_each_drhd_unit(drhd) {
2422                /*
2423                 * lock not needed as this is only incremented in the single
2424                 * threaded kernel __init code path; all other accesses are
2425                 * read only
2426                 */
2427                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2428                        g_num_of_iommus++;
2429                        continue;
2430                }
2431                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2432                          IOMMU_UNITS_SUPPORTED);
2433        }
2434
2435        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2436                        GFP_KERNEL);
2437        if (!g_iommus) {
2438                printk(KERN_ERR "Allocating global iommu array failed\n");
2439                ret = -ENOMEM;
2440                goto error;
2441        }
2442
2443        deferred_flush = kzalloc(g_num_of_iommus *
2444                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2445        if (!deferred_flush) {
2446                ret = -ENOMEM;
2447                goto error;
2448        }
2449
2450        for_each_drhd_unit(drhd) {
2451                if (drhd->ignored)
2452                        continue;
2453
2454                iommu = drhd->iommu;
2455                g_iommus[iommu->seq_id] = iommu;
2456
2457                ret = iommu_init_domains(iommu);
2458                if (ret)
2459                        goto error;
2460
2461                /*
2462                 * TBD:
2463                 * we could share the same root & context tables
2464                 * among all IOMMU's. Need to Split it later.
2465                 * among all IOMMUs. Needs to be split out later.
2466                ret = iommu_alloc_root_entry(iommu);
2467                if (ret) {
2468                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2469                        goto error;
2470                }
2471                if (!ecap_pass_through(iommu->ecap))
2472                        hw_pass_through = 0;
2473        }
2474
2475        /*
2476         * Start from a sane iommu hardware state.
2477         */
2478        for_each_drhd_unit(drhd) {
2479                if (drhd->ignored)
2480                        continue;
2481
2482                iommu = drhd->iommu;
2483
2484                /*
2485                 * If the queued invalidation is already initialized by us
2486                 * (for example, while enabling interrupt-remapping) then
2487                 * things are already rolling from a sane state.
2488                 */
2489                if (iommu->qi)
2490                        continue;
2491
2492                /*
2493                 * Clear any previous faults.
2494                 */
2495                dmar_fault(-1, iommu);
2496                /*
2497                 * Disable queued invalidation if supported and already enabled
2498                 * before OS handover.
2499                 */
2500                dmar_disable_qi(iommu);
2501        }
2502
2503        for_each_drhd_unit(drhd) {
2504                if (drhd->ignored)
2505                        continue;
2506
2507                iommu = drhd->iommu;
2508
2509                if (dmar_enable_qi(iommu)) {
2510                        /*
2511                         * Queued Invalidate not enabled, use Register Based
2512                         * Invalidate
2513                         */
2514                        iommu->flush.flush_context = __iommu_flush_context;
2515                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2516                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2517                               "invalidation\n",
2518                                iommu->seq_id,
2519                               (unsigned long long)drhd->reg_base_addr);
2520                } else {
2521                        iommu->flush.flush_context = qi_flush_context;
2522                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2523                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2524                               "invalidation\n",
2525                                iommu->seq_id,
2526                               (unsigned long long)drhd->reg_base_addr);
2527                }
2528        }
2529
2530        if (iommu_pass_through)
2531                iommu_identity_mapping |= IDENTMAP_ALL;
2532
2533#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2534        iommu_identity_mapping |= IDENTMAP_GFX;
2535#endif
2536
2537        check_tylersburg_isoch();
2538
2539        /*
2540         * If pass-through is not set or not enabled, set up context entries
2541         * for identity mappings for rmrr, gfx and isa, and possibly fall back
2542         * to static identity mapping if iommu_identity_mapping is set.
2543         */
2544        if (iommu_identity_mapping) {
2545                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2546                if (ret) {
2547                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2548                        goto error;
2549                }
2550        }
2551        /*
2552         * For each rmrr
2553         *   for each dev attached to rmrr
2554         *   do
2555         *     locate drhd for dev, alloc domain for dev
2556         *     allocate free domain
2557         *     allocate page table entries for rmrr
2558         *     if context not allocated for bus
2559         *           allocate and init context
2560         *           set present in root table for this bus
2561         *     init context with domain, translation etc
2562         *    endfor
2563         * endfor
2564         */
2565        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2566        for_each_rmrr_units(rmrr) {
2567                for (i = 0; i < rmrr->devices_cnt; i++) {
2568                        pdev = rmrr->devices[i];
2569                        /*
2570                         * some BIOSes list non-existent devices in the
2571                         * DMAR table.
2572                         */
2573                        if (!pdev)
2574                                continue;
2575                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2576                        if (ret)
2577                                printk(KERN_ERR
2578                                       "IOMMU: mapping reserved region failed\n");
2579                }
2580        }
2581
2582        iommu_prepare_isa();
2583
2584        /*
2585         * for each drhd
2586         *   enable fault log
2587         *   global invalidate context cache
2588         *   global invalidate iotlb
2589         *   enable translation
2590         */
2591        for_each_drhd_unit(drhd) {
2592                if (drhd->ignored) {
2593                        /*
2594                         * we always have to disable PMRs or DMA may fail on
2595                         * this device
2596                         */
2597                        if (force_on)
2598                                iommu_disable_protect_mem_regions(drhd->iommu);
2599                        continue;
2600                }
2601                iommu = drhd->iommu;
2602
2603                iommu_flush_write_buffer(iommu);
2604
2605                ret = dmar_set_interrupt(iommu);
2606                if (ret)
2607                        goto error;
2608
2609                iommu_set_root_entry(iommu);
2610
2611                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2612                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2613
2614                ret = iommu_enable_translation(iommu);
2615                if (ret)
2616                        goto error;
2617
2618                iommu_disable_protect_mem_regions(iommu);
2619        }
2620
2621        return 0;
2622error:
2623        for_each_drhd_unit(drhd) {
2624                if (drhd->ignored)
2625                        continue;
2626                iommu = drhd->iommu;
2627                free_iommu(iommu);
2628        }
2629        kfree(g_iommus);
2630        return ret;
2631}
2632
2633/* This takes a number of _MM_ pages, not VTD pages */
2634static struct iova *intel_alloc_iova(struct device *dev,
2635                                     struct dmar_domain *domain,
2636                                     unsigned long nrpages, uint64_t dma_mask)
2637{
2638        struct pci_dev *pdev = to_pci_dev(dev);
2639        struct iova *iova = NULL;
2640
2641        /* Restrict dma_mask to the width that the iommu can handle */
2642        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2643
2644        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2645                /*
2646                 * First try to allocate an io virtual address in
2647                 * DMA_BIT_MASK(32) and if that fails then try allocating
2648                 * from higher range
2649                 */
2650                iova = alloc_iova(&domain->iovad, nrpages,
2651                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2652                if (iova)
2653                        return iova;
2654        }
2655        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2656        if (unlikely(!iova)) {
2657                printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2658                       nrpages, pci_name(pdev));
2659                return NULL;
2660        }
2661
2662        return iova;
2663}
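/*
 * Note on the allocation strategy above: unless "forcedac" was given on
 * the command line, a device with a wider-than-32-bit dma_mask first
 * tries for an iova below 4GiB, avoiding dual-address-cycle transactions
 * on PCI, and only falls back to the full range when the 32-bit space is
 * exhausted.
 */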
2664
2665static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2666{
2667        struct dmar_domain *domain;
2668        int ret;
2669
2670        domain = get_domain_for_dev(pdev,
2671                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2672        if (!domain) {
2673                printk(KERN_ERR
2674                        "Allocating domain for %s failed\n", pci_name(pdev));
2675                return NULL;
2676        }
2677
2678        /* make sure context mapping is ok */
2679        if (unlikely(!domain_context_mapped(pdev))) {
2680                ret = domain_context_mapping(domain, pdev,
2681                                             CONTEXT_TT_MULTI_LEVEL);
2682                if (ret) {
2683                        printk(KERN_ERR
2684                                "Domain context map for %s failed\n",
2685                                pci_name(pdev));
2686                        return NULL;
2687                }
2688        }
2689
2690        return domain;
2691}
2692
2693static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2694{
2695        struct device_domain_info *info;
2696
2697        /* No lock here, assumes no domain exit in normal case */
2698        info = dev->dev.archdata.iommu;
2699        if (likely(info))
2700                return info->domain;
2701
2702        return __get_valid_domain_for_dev(dev);
2703}
2704
2705static int iommu_dummy(struct pci_dev *pdev)
2706{
2707        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2708}
2709
2710/* Check if the pdev needs to go through the non-identity map and unmap process. */
2711static int iommu_no_mapping(struct device *dev)
2712{
2713        struct pci_dev *pdev;
2714        int found;
2715
2716        if (unlikely(dev->bus != &pci_bus_type))
2717                return 1;
2718
2719        pdev = to_pci_dev(dev);
2720        if (iommu_dummy(pdev))
2721                return 1;
2722
2723        if (!iommu_identity_mapping)
2724                return 0;
2725
2726        found = identity_mapping(pdev);
2727        if (found) {
2728                if (iommu_should_identity_map(pdev, 0))
2729                        return 1;
2730                else {
2731                        /*
2732                         * A 32-bit DMA device is removed from si_domain and
2733                         * falls back to non-identity mapping.
2734                         */
2735                        domain_remove_one_dev_info(si_domain, pdev);
2736                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2737                               pci_name(pdev));
2738                        return 0;
2739                }
2740        } else {
2741                /*
2742                 * If a 64-bit DMA device has been detached from a VM, the
2743                 * device is put back into si_domain for identity mapping.
2744                 */
2745                if (iommu_should_identity_map(pdev, 0)) {
2746                        int ret;
2747                        ret = domain_add_dev_info(si_domain, pdev,
2748                                                  hw_pass_through ?
2749                                                  CONTEXT_TT_PASS_THROUGH :
2750                                                  CONTEXT_TT_MULTI_LEVEL);
2751                        if (!ret) {
2752                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2753                                       pci_name(pdev));
2754                                return 1;
2755                        }
2756                }
2757        }
2758
2759        return 0;
2760}
2761
2762static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2763                                     size_t size, int dir, u64 dma_mask)
2764{
2765        struct pci_dev *pdev = to_pci_dev(hwdev);
2766        struct dmar_domain *domain;
2767        phys_addr_t start_paddr;
2768        struct iova *iova;
2769        int prot = 0;
2770        int ret;
2771        struct intel_iommu *iommu;
2772        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2773
2774        BUG_ON(dir == DMA_NONE);
2775
2776        if (iommu_no_mapping(hwdev))
2777                return paddr;
2778
2779        domain = get_valid_domain_for_dev(pdev);
2780        if (!domain)
2781                return 0;
2782
2783        iommu = domain_get_iommu(domain);
2784        size = aligned_nrpages(paddr, size);
2785
2786        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2787        if (!iova)
2788                goto error;
2789
2790        /*
2791         * Check if DMAR supports zero-length reads on write only
2792         * mappings.
2793         */
2794        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2795                        !cap_zlr(iommu->cap))
2796                prot |= DMA_PTE_READ;
2797        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2798                prot |= DMA_PTE_WRITE;
2799        /*
2800         * The range paddr .. paddr + size might cover partial pages, so map
2801         * whole pages.  Note: if two parts of one page are mapped separately,
2802         * two guest addresses might map to the same host paddr, but this
2803         * is not a big problem.
2804         */
2805        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2806                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2807        if (ret)
2808                goto error;
2809
2810        /* It's a non-present to present mapping. Only flush if in caching mode. */
2811        if (cap_caching_mode(iommu->cap))
2812                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2813        else
2814                iommu_flush_write_buffer(iommu);
2815
2816        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2817        start_paddr += paddr & ~PAGE_MASK;
2818        return start_paddr;
2819
2820error:
2821        if (iova)
2822                __free_iova(&domain->iovad, iova);
2823        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2824                pci_name(pdev), size, (unsigned long long)paddr, dir);
2825        return 0;
2826}
2827
2828static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2829                                 unsigned long offset, size_t size,
2830                                 enum dma_data_direction dir,
2831                                 struct dma_attrs *attrs)
2832{
2833        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2834                                  dir, to_pci_dev(dev)->dma_mask);
2835}
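/*
 * Sketch of how the code above is reached: a driver's dma_map_single() or
 * dma_map_page() call goes through the dma_map_ops installed for the
 * device and lands in intel_map_page(), which resolves the device's
 * domain, allocates an iova, builds the PTEs and hands the resulting bus
 * address back to the driver.
 */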
2836
2837static void flush_unmaps(void)
2838{
2839        int i, j;
2840
2841        timer_on = 0;
2842
2843        /* just flush them all */
2844        for (i = 0; i < g_num_of_iommus; i++) {
2845                struct intel_iommu *iommu = g_iommus[i];
2846                if (!iommu)
2847                        continue;
2848
2849                if (!deferred_flush[i].next)
2850                        continue;
2851
2852                /* In caching mode, global flushes make emulation expensive */
2853                if (!cap_caching_mode(iommu->cap))
2854                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2855                                         DMA_TLB_GLOBAL_FLUSH);
2856                for (j = 0; j < deferred_flush[i].next; j++) {
2857                        unsigned long mask;
2858                        struct iova *iova = deferred_flush[i].iova[j];
2859                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2860
2861                        /* On real hardware multiple invalidations are expensive */
2862                        if (cap_caching_mode(iommu->cap))
2863                                iommu_flush_iotlb_psi(iommu, domain->id,
2864                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2865                        else {
2866                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2867                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2868                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2869                        }
2870                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2871                }
2872                deferred_flush[i].next = 0;
2873        }
2874
2875        list_size = 0;
2876}
2877
2878static void flush_unmaps_timeout(unsigned long data)
2879{
2880        unsigned long flags;
2881
2882        spin_lock_irqsave(&async_umap_flush_lock, flags);
2883        flush_unmaps();
2884        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2885}
2886
2887static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2888{
2889        unsigned long flags;
2890        int next, iommu_id;
2891        struct intel_iommu *iommu;
2892
2893        spin_lock_irqsave(&async_umap_flush_lock, flags);
2894        if (list_size == HIGH_WATER_MARK)
2895                flush_unmaps();
2896
2897        iommu = domain_get_iommu(dom);
2898        iommu_id = iommu->seq_id;
2899
2900        next = deferred_flush[iommu_id].next;
2901        deferred_flush[iommu_id].domain[next] = dom;
2902        deferred_flush[iommu_id].iova[next] = iova;
2903        deferred_flush[iommu_id].next++;
2904
2905        if (!timer_on) {
2906                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2907                timer_on = 1;
2908        }
2909        list_size++;
2910        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2911}
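/*
 * Design note on the deferred-unmap machinery above: in the default
 * (non-strict) mode, freed iovas are batched per IOMMU and the IOTLB is
 * only flushed once HIGH_WATER_MARK entries have accumulated or the 10ms
 * unmap_timer fires, trading a short window of stale IOTLB entries for
 * far fewer invalidations.
 */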
2912
2913static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2914                             size_t size, enum dma_data_direction dir,
2915                             struct dma_attrs *attrs)
2916{
2917        struct pci_dev *pdev = to_pci_dev(dev);
2918        struct dmar_domain *domain;
2919        unsigned long start_pfn, last_pfn;
2920        struct iova *iova;
2921        struct intel_iommu *iommu;
2922
2923        if (iommu_no_mapping(dev))
2924                return;
2925
2926        domain = find_domain(pdev);
2927        BUG_ON(!domain);
2928
2929        iommu = domain_get_iommu(domain);
2930
2931        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2932        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2933                      (unsigned long long)dev_addr))
2934                return;
2935
2936        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2937        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2938
2939        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2940                 pci_name(pdev), start_pfn, last_pfn);
2941
2942        /* clear the PTEs for the whole range */
2943        dma_pte_clear_range(domain, start_pfn, last_pfn);
2944
2945        /* free page tables */
2946        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2947
2948        if (intel_iommu_strict) {
2949                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2950                                      last_pfn - start_pfn + 1, 0);
2951                /* free iova */
2952                __free_iova(&domain->iovad, iova);
2953        } else {
2954                add_unmap(domain, iova);
2955                /*
2956                 * queue up the release of the unmap to save roughly 1/6th of
2957                 * the cpu time used up by the iotlb flush operation...
2958                 */
2959        }
2960}
2961
2962static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2963                                  dma_addr_t *dma_handle, gfp_t flags,
2964                                  struct dma_attrs *attrs)
2965{
2966        void *vaddr;
2967        int order;
2968
2969        size = PAGE_ALIGN(size);
2970        order = get_order(size);
2971
2972        if (!iommu_no_mapping(hwdev))
2973                flags &= ~(GFP_DMA | GFP_DMA32);
2974        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2975                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2976                        flags |= GFP_DMA;
2977                else
2978                        flags |= GFP_DMA32;
2979        }
2980
2981        vaddr = (void *)__get_free_pages(flags, order);
2982        if (!vaddr)
2983                return NULL;
2984        memset(vaddr, 0, size);
2985
2986        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2987                                         DMA_BIDIRECTIONAL,
2988                                         hwdev->coherent_dma_mask);
2989        if (*dma_handle)
2990                return vaddr;
2991        free_pages((unsigned long)vaddr, order);
2992        return NULL;
2993}
2994
2995static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2996                                dma_addr_t dma_handle, struct dma_attrs *attrs)
2997{
2998        int order;
2999
3000        size = PAGE_ALIGN(size);
3001        order = get_order(size);
3002
3003        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3004        free_pages((unsigned long)vaddr, order);
3005}
3006
3007static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3008                           int nelems, enum dma_data_direction dir,
3009                           struct dma_attrs *attrs)
3010{
3011        struct pci_dev *pdev = to_pci_dev(hwdev);
3012        struct dmar_domain *domain;
3013        unsigned long start_pfn, last_pfn;
3014        struct iova *iova;
3015        struct intel_iommu *iommu;
3016
3017        if (iommu_no_mapping(hwdev))
3018                return;
3019
3020        domain = find_domain(pdev);
3021        BUG_ON(!domain);
3022
3023        iommu = domain_get_iommu(domain);
3024
3025        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3026        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at address %llx\n",
3027                      (unsigned long long)sglist[0].dma_address))
3028                return;
3029
3030        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3031        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3032
3033        /* clear the whole mapped range */
3034        dma_pte_clear_range(domain, start_pfn, last_pfn);
3035
3036        /* free page tables */
3037        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3038
3039        if (intel_iommu_strict) {
3040                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3041                                      last_pfn - start_pfn + 1, 0);
3042                /* free iova */
3043                __free_iova(&domain->iovad, iova);
3044        } else {
3045                add_unmap(domain, iova);
3046                /*
3047                 * Queue up the release of the unmap to save the ~1/6th of a
3048                 * CPU otherwise used up by the per-unmap iotlb flush.
3049                 */
3050        }
3051}
3052
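/*
 * Fallback used when the device bypasses DMA remapping (identity-mapped /
 * pass-through, see iommu_no_mapping()): no IOVA is allocated and each
 * scatterlist entry's dma_address is simply the physical address of its
 * page plus offset.
 */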
3053static int intel_nontranslate_map_sg(struct device *hwdev,
3054        struct scatterlist *sglist, int nelems, int dir)
3055{
3056        int i;
3057        struct scatterlist *sg;
3058
3059        for_each_sg(sglist, sg, nelems, i) {
3060                BUG_ON(!sg_page(sg));
3061                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3062                sg->dma_length = sg->length;
3063        }
3064        return nelems;
3065}
3066
3067static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3068                        enum dma_data_direction dir, struct dma_attrs *attrs)
3069{
3070        int i;
3071        struct pci_dev *pdev = to_pci_dev(hwdev);
3072        struct dmar_domain *domain;
3073        size_t size = 0;
3074        int prot = 0;
3075        struct iova *iova = NULL;
3076        int ret;
3077        struct scatterlist *sg;
3078        unsigned long start_vpfn;
3079        struct intel_iommu *iommu;
3080
3081        BUG_ON(dir == DMA_NONE);
3082        if (iommu_no_mapping(hwdev))
3083                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3084
3085        domain = get_valid_domain_for_dev(pdev);
3086        if (!domain)
3087                return 0;
3088
3089        iommu = domain_get_iommu(domain);
3090
3091        for_each_sg(sglist, sg, nelems, i)
3092                size += aligned_nrpages(sg->offset, sg->length);
3093
3094        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3095                                pdev->dma_mask);
3096        if (!iova) {
3097                sglist->dma_length = 0;
3098                return 0;
3099        }
3100
3101        /*
3102         * Check if DMAR supports zero-length reads on write-only
3103         * mappings.
3104         */
3105        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3106                        !cap_zlr(iommu->cap))
3107                prot |= DMA_PTE_READ;
3108        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3109                prot |= DMA_PTE_WRITE;
3110
3111        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3112
3113        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3114        if (unlikely(ret)) {
3115                /* clear the mapped range */
3116                dma_pte_clear_range(domain, start_vpfn,
3117                                    start_vpfn + size - 1);
3118                /* free page tables */
3119                dma_pte_free_pagetable(domain, start_vpfn,
3120                                       start_vpfn + size - 1);
3121                /* free iova */
3122                __free_iova(&domain->iovad, iova);
3123                return 0;
3124        }
3125
3126        /* it's a non-present to present mapping. Only flush if caching mode */
3127        if (cap_caching_mode(iommu->cap))
3128                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3129        else
3130                iommu_flush_write_buffer(iommu);
3131
3132        return nelems;
3133}
3134
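/*
 * A zero DMA handle is used throughout this file to signal a failed
 * mapping (intel_alloc_coherent() above, for example, only succeeds when
 * *dma_handle is non-zero), so the generic DMA API treats 0 as the error
 * value here.
 */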
3135static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3136{
3137        return !dma_addr;
3138}
3139
3140struct dma_map_ops intel_dma_ops = {
3141        .alloc = intel_alloc_coherent,
3142        .free = intel_free_coherent,
3143        .map_sg = intel_map_sg,
3144        .unmap_sg = intel_unmap_sg,
3145        .map_page = intel_map_page,
3146        .unmap_page = intel_unmap_page,
3147        .mapping_error = intel_mapping_error,
3148};
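
/*
 * Drivers never call these hooks directly: intel_iommu_init() installs
 * this structure as the global dma_ops, so the generic DMA API routes
 * here.  A rough driver-side sketch, with made-up values and 'pdev'
 * standing in for some driver's struct pci_dev:
 *
 *        void *cpu;
 *        dma_addr_t bus;
 *
 *        cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &bus, GFP_KERNEL);
 *        if (!cpu)
 *                return -ENOMEM;
 *        ... program 'bus' into the device and run the DMA ...
 *        dma_free_coherent(&pdev->dev, PAGE_SIZE, cpu, bus);
 *
 * dma_alloc_coherent() ends up in intel_alloc_coherent() above and
 * dma_free_coherent() in intel_free_coherent(); dma_map_sg() and friends
 * reach the corresponding entries in the same way.
 */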
3149
3150static inline int iommu_domain_cache_init(void)
3151{
3152        int ret = 0;
3153
3154        iommu_domain_cache = kmem_cache_create("iommu_domain",
3155                                         sizeof(struct dmar_domain),
3156                                         0,
3157                                         SLAB_HWCACHE_ALIGN,
3159                                         NULL);
3160        if (!iommu_domain_cache) {
3161                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3162                ret = -ENOMEM;
3163        }
3164
3165        return ret;
3166}
3167
3168static inline int iommu_devinfo_cache_init(void)
3169{
3170        int ret = 0;
3171
3172        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3173                                         sizeof(struct device_domain_info),
3174                                         0,
3175                                         SLAB_HWCACHE_ALIGN,
3176                                         NULL);
3177        if (!iommu_devinfo_cache) {
3178                printk(KERN_ERR "Couldn't create devinfo cache\n");
3179                ret = -ENOMEM;
3180        }
3181
3182        return ret;
3183}
3184
3185static inline int iommu_iova_cache_init(void)
3186{
3187        int ret = 0;
3188
3189        iommu_iova_cache = kmem_cache_create("iommu_iova",
3190                                         sizeof(struct iova),
3191                                         0,
3192                                         SLAB_HWCACHE_ALIGN,
3193                                         NULL);
3194        if (!iommu_iova_cache) {
3195                printk(KERN_ERR "Couldn't create iova cache\n");
3196                ret = -ENOMEM;
3197        }
3198
3199        return ret;
3200}
3201
3202static int __init iommu_init_mempool(void)
3203{
3204        int ret;
3205        ret = iommu_iova_cache_init();
3206        if (ret)
3207                return ret;
3208
3209        ret = iommu_domain_cache_init();
3210        if (ret)
3211                goto domain_error;
3212
3213        ret = iommu_devinfo_cache_init();
3214        if (!ret)
3215                return ret;
3216
3217        kmem_cache_destroy(iommu_domain_cache);
3218domain_error:
3219        kmem_cache_destroy(iommu_iova_cache);
3220
3221        return -ENOMEM;
3222}
3223
3224static void __init iommu_exit_mempool(void)
3225{
3226        kmem_cache_destroy(iommu_devinfo_cache);
3227        kmem_cache_destroy(iommu_domain_cache);
3228        kmem_cache_destroy(iommu_iova_cache);
3229
3230}
3231
3232static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3233{
3234        struct dmar_drhd_unit *drhd;
3235        u32 vtbar;
3236        int rc;
3237
3238        /* We know that this device on this chipset has its own IOMMU.
3239         * If we find it under a different IOMMU, then the BIOS is lying
3240         * to us. Hope that the IOMMU for this device is actually
3241         * disabled, and it needs no translation...
3242         */
3243        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3244        if (rc) {
3245                /* "can't" happen */
3246                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3247                return;
3248        }
3249        vtbar &= 0xffff0000;
3250
3251        /* we know that this iommu should be at offset 0xa000 from vtbar */
3252        drhd = dmar_find_matched_drhd_unit(pdev);
3253        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3254                            TAINT_FIRMWARE_WORKAROUND,
3255                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3256                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3257}
3258DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3259
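/*
 * Mark DMAR units that cover no PCI devices at all -- or, when dmar_map_gfx
 * is clear, only graphics devices -- as ignored, so that no translation is
 * set up through them.
 */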
3260static void __init init_no_remapping_devices(void)
3261{
3262        struct dmar_drhd_unit *drhd;
3263
3264        for_each_drhd_unit(drhd) {
3265                if (!drhd->include_all) {
3266                        int i;
3267                        for (i = 0; i < drhd->devices_cnt; i++)
3268                                if (drhd->devices[i] != NULL)
3269                                        break;
3270                        /* ignore DMAR unit if no pci devices exist */
3271                        if (i == drhd->devices_cnt)
3272                                drhd->ignored = 1;
3273                }
3274        }
3275
3276        for_each_drhd_unit(drhd) {
3277                int i;
3278                if (drhd->ignored || drhd->include_all)
3279                        continue;
3280
3281                for (i = 0; i < drhd->devices_cnt; i++)
3282                        if (drhd->devices[i] &&
3283                            !IS_GFX_DEVICE(drhd->devices[i]))
3284                                break;
3285
3286                if (i < drhd->devices_cnt)
3287                        continue;
3288
3289                /* This IOMMU has *only* gfx devices. Either bypass it or
3290                   set the gfx_mapped flag, as appropriate */
3291                if (dmar_map_gfx) {
3292                        intel_iommu_gfx_mapped = 1;
3293                } else {
3294                        drhd->ignored = 1;
3295                        for (i = 0; i < drhd->devices_cnt; i++) {
3296                                if (!drhd->devices[i])
3297                                        continue;
3298                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3299                        }
3300                }
3301        }
3302}
3303
3304#ifdef CONFIG_SUSPEND
3305static int init_iommu_hw(void)
3306{
3307        struct dmar_drhd_unit *drhd;
3308        struct intel_iommu *iommu = NULL;
3309
3310        for_each_active_iommu(iommu, drhd)
3311                if (iommu->qi)
3312                        dmar_reenable_qi(iommu);
3313
3314        for_each_iommu(iommu, drhd) {
3315                if (drhd->ignored) {
3316                        /*
3317                         * we always have to disable PMRs or DMA may fail on
3318                         * this device
3319                         */
3320                        if (force_on)
3321                                iommu_disable_protect_mem_regions(iommu);
3322                        continue;
3323                }
3324
3325                iommu_flush_write_buffer(iommu);
3326
3327                iommu_set_root_entry(iommu);
3328
3329                iommu->flush.flush_context(iommu, 0, 0, 0,
3330                                           DMA_CCMD_GLOBAL_INVL);
3331                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3332                                         DMA_TLB_GLOBAL_FLUSH);
3333                if (iommu_enable_translation(iommu))
3334                        return 1;
3335                iommu_disable_protect_mem_regions(iommu);
3336        }
3337
3338        return 0;
3339}
3340
3341static void iommu_flush_all(void)
3342{
3343        struct dmar_drhd_unit *drhd;
3344        struct intel_iommu *iommu;
3345
3346        for_each_active_iommu(iommu, drhd) {
3347                iommu->flush.flush_context(iommu, 0, 0, 0,
3348                                           DMA_CCMD_GLOBAL_INVL);
3349                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3350                                         DMA_TLB_GLOBAL_FLUSH);
3351        }
3352}
3353
3354static int iommu_suspend(void)
3355{
3356        struct dmar_drhd_unit *drhd;
3357        struct intel_iommu *iommu = NULL;
3358        unsigned long flag;
3359
3360        for_each_active_iommu(iommu, drhd) {
3361                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3362                                                 GFP_ATOMIC);
3363                if (!iommu->iommu_state)
3364                        goto nomem;
3365        }
3366
3367        iommu_flush_all();
3368
3369        for_each_active_iommu(iommu, drhd) {
3370                iommu_disable_translation(iommu);
3371
3372                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3373
3374                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3375                        readl(iommu->reg + DMAR_FECTL_REG);
3376                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3377                        readl(iommu->reg + DMAR_FEDATA_REG);
3378                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3379                        readl(iommu->reg + DMAR_FEADDR_REG);
3380                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3381                        readl(iommu->reg + DMAR_FEUADDR_REG);
3382
3383                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3384        }
3385        return 0;
3386
3387nomem:
3388        for_each_active_iommu(iommu, drhd)
3389                kfree(iommu->iommu_state);
3390
3391        return -ENOMEM;
3392}
3393
3394static void iommu_resume(void)
3395{
3396        struct dmar_drhd_unit *drhd;
3397        struct intel_iommu *iommu = NULL;
3398        unsigned long flag;
3399
3400        if (init_iommu_hw()) {
3401                if (force_on)
3402                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3403                else
3404                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3405                return;
3406        }
3407
3408        for_each_active_iommu(iommu, drhd) {
3409
3410                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3411
3412                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3413                        iommu->reg + DMAR_FECTL_REG);
3414                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3415                        iommu->reg + DMAR_FEDATA_REG);
3416                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3417                        iommu->reg + DMAR_FEADDR_REG);
3418                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3419                        iommu->reg + DMAR_FEUADDR_REG);
3420
3421                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3422        }
3423
3424        for_each_active_iommu(iommu, drhd)
3425                kfree(iommu->iommu_state);
3426}
3427
3428static struct syscore_ops iommu_syscore_ops = {
3429        .resume         = iommu_resume,
3430        .suspend        = iommu_suspend,
3431};
3432
3433static void __init init_iommu_pm_ops(void)
3434{
3435        register_syscore_ops(&iommu_syscore_ops);
3436}
3437
3438#else
3439static inline void init_iommu_pm_ops(void) {}
3440#endif  /* CONFIG_SUSPEND */
3441
3442LIST_HEAD(dmar_rmrr_units);
3443
3444static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3445{
3446        list_add(&rmrr->list, &dmar_rmrr_units);
3447}
3448
3450int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3451{
3452        struct acpi_dmar_reserved_memory *rmrr;
3453        struct dmar_rmrr_unit *rmrru;
3454
3455        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3456        if (!rmrru)
3457                return -ENOMEM;
3458
3459        rmrru->hdr = header;
3460        rmrr = (struct acpi_dmar_reserved_memory *)header;
3461        rmrru->base_address = rmrr->base_address;
3462        rmrru->end_address = rmrr->end_address;
3463
3464        dmar_register_rmrr_unit(rmrru);
3465        return 0;
3466}
3467
3468static int __init
3469rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3470{
3471        struct acpi_dmar_reserved_memory *rmrr;
3472        int ret;
3473
3474        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3475        ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3476                ((void *)rmrr) + rmrr->header.length,
3477                &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3478
3479        if (ret || (rmrru->devices_cnt == 0)) {
3480                list_del(&rmrru->list);
3481                kfree(rmrru);
3482        }
3483        return ret;
3484}
3485
3486static LIST_HEAD(dmar_atsr_units);
3487
3488int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3489{
3490        struct acpi_dmar_atsr *atsr;
3491        struct dmar_atsr_unit *atsru;
3492
3493        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3494        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3495        if (!atsru)
3496                return -ENOMEM;
3497
3498        atsru->hdr = hdr;
3499        atsru->include_all = atsr->flags & 0x1;
3500
3501        list_add(&atsru->list, &dmar_atsr_units);
3502
3503        return 0;
3504}
3505
3506static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3507{
3508        int rc;
3509        struct acpi_dmar_atsr *atsr;
3510
3511        if (atsru->include_all)
3512                return 0;
3513
3514        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3515        rc = dmar_parse_dev_scope((void *)(atsr + 1),
3516                                (void *)atsr + atsr->header.length,
3517                                &atsru->devices_cnt, &atsru->devices,
3518                                atsr->segment);
3519        if (rc || !atsru->devices_cnt) {
3520                list_del(&atsru->list);
3521                kfree(atsru);
3522        }
3523
3524        return rc;
3525}
3526
3527int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3528{
3529        int i;
3530        struct pci_bus *bus;
3531        struct acpi_dmar_atsr *atsr;
3532        struct dmar_atsr_unit *atsru;
3533
3534        dev = pci_physfn(dev);
3535
3536        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3537                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3538                if (atsr->segment == pci_domain_nr(dev->bus))
3539                        goto found;
3540        }
3541
3542        return 0;
3543
3544found:
3545        for (bus = dev->bus; bus; bus = bus->parent) {
3546                struct pci_dev *bridge = bus->self;
3547
3548                if (!bridge || !pci_is_pcie(bridge) ||
3549                    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3550                        return 0;
3551
3552                if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3553                        for (i = 0; i < atsru->devices_cnt; i++)
3554                                if (atsru->devices[i] == bridge)
3555                                        return 1;
3556                        break;
3557                }
3558        }
3559
3560        if (atsru->include_all)
3561                return 1;
3562
3563        return 0;
3564}
3565
3566int __init dmar_parse_rmrr_atsr_dev(void)
3567{
3568        struct dmar_rmrr_unit *rmrr, *rmrr_n;
3569        struct dmar_atsr_unit *atsr, *atsr_n;
3570        int ret = 0;
3571
3572        list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3573                ret = rmrr_parse_dev(rmrr);
3574                if (ret)
3575                        return ret;
3576        }
3577
3578        list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3579                ret = atsr_parse_dev(atsr);
3580                if (ret)
3581                        return ret;
3582        }
3583
3584        return ret;
3585}
3586
3587/*
3588 * Here we only respond to a device being unbound from its driver.
3589 *
3590 * A newly added device is not attached to its DMAR domain here yet; that
3591 * happens when the device is first mapped to an iova.
3592 */
3593static int device_notifier(struct notifier_block *nb,
3594                                  unsigned long action, void *data)
3595{
3596        struct device *dev = data;
3597        struct pci_dev *pdev = to_pci_dev(dev);
3598        struct dmar_domain *domain;
3599
3600        if (iommu_no_mapping(dev))
3601                return 0;
3602
3603        domain = find_domain(pdev);
3604        if (!domain)
3605                return 0;
3606
3607        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3608                domain_remove_one_dev_info(domain, pdev);
3609
3610                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3611                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3612                    list_empty(&domain->devices))
3613                        domain_exit(domain);
3614        }
3615
3616        return 0;
3617}
3618
3619static struct notifier_block device_nb = {
3620        .notifier_call = device_notifier,
3621};
3622
3623int __init intel_iommu_init(void)
3624{
3625        int ret = 0;
3626
3627        /* VT-d is required for a TXT/tboot launch, so enforce that */
3628        force_on = tboot_force_iommu();
3629
3630        if (dmar_table_init()) {
3631                if (force_on)
3632                        panic("tboot: Failed to initialize DMAR table\n");
3633                return  -ENODEV;
3634        }
3635
3636        if (dmar_dev_scope_init() < 0) {
3637                if (force_on)
3638                        panic("tboot: Failed to initialize DMAR device scope\n");
3639                return  -ENODEV;
3640        }
3641
3642        if (no_iommu || dmar_disabled)
3643                return -ENODEV;
3644
3645        if (iommu_init_mempool()) {
3646                if (force_on)
3647                        panic("tboot: Failed to initialize iommu memory\n");
3648                return  -ENODEV;
3649        }
3650
3651        if (list_empty(&dmar_rmrr_units))
3652                printk(KERN_INFO "DMAR: No RMRR found\n");
3653
3654        if (list_empty(&dmar_atsr_units))
3655                printk(KERN_INFO "DMAR: No ATSR found\n");
3656
3657        if (dmar_init_reserved_ranges()) {
3658                if (force_on)
3659                        panic("tboot: Failed to reserve iommu ranges\n");
3660                return  -ENODEV;
3661        }
3662
3663        init_no_remapping_devices();
3664
3665        ret = init_dmars();
3666        if (ret) {
3667                if (force_on)
3668                        panic("tboot: Failed to initialize DMARs\n");
3669                printk(KERN_ERR "IOMMU: dmar init failed\n");
3670                put_iova_domain(&reserved_iova_list);
3671                iommu_exit_mempool();
3672                return ret;
3673        }
3674        printk(KERN_INFO
3675        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3676
3677        init_timer(&unmap_timer);
3678#ifdef CONFIG_SWIOTLB
3679        swiotlb = 0;
3680#endif
3681        dma_ops = &intel_dma_ops;
3682
3683        init_iommu_pm_ops();
3684
3685        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3686
3687        bus_register_notifier(&pci_bus_type, &device_nb);
3688
3689        intel_iommu_enabled = 1;
3690
3691        return 0;
3692}
3693
3694static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3695                                           struct pci_dev *pdev)
3696{
3697        struct pci_dev *tmp, *parent;
3698
3699        if (!iommu || !pdev)
3700                return;
3701
3702        /* dependent device detach */
3703        tmp = pci_find_upstream_pcie_bridge(pdev);
3704        /* Secondary interface's bus number and devfn 0 */
3705        if (tmp) {
3706                parent = pdev->bus->self;
3707                while (parent != tmp) {
3708                        iommu_detach_dev(iommu, parent->bus->number,
3709                                         parent->devfn);
3710                        parent = parent->bus->self;
3711                }
3712                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3713                        iommu_detach_dev(iommu,
3714                                tmp->subordinate->number, 0);
3715                else /* this is a legacy PCI bridge */
3716                        iommu_detach_dev(iommu, tmp->bus->number,
3717                                         tmp->devfn);
3718        }
3719}
3720
3721static void domain_remove_one_dev_info(struct dmar_domain *domain,
3722                                          struct pci_dev *pdev)
3723{
3724        struct device_domain_info *info;
3725        struct intel_iommu *iommu;
3726        unsigned long flags;
3727        int found = 0;
3728        struct list_head *entry, *tmp;
3729
3730        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3731                                pdev->devfn);
3732        if (!iommu)
3733                return;
3734
3735        spin_lock_irqsave(&device_domain_lock, flags);
3736        list_for_each_safe(entry, tmp, &domain->devices) {
3737                info = list_entry(entry, struct device_domain_info, link);
3738                if (info->segment == pci_domain_nr(pdev->bus) &&
3739                    info->bus == pdev->bus->number &&
3740                    info->devfn == pdev->devfn) {
3741                        unlink_domain_info(info);
3742                        spin_unlock_irqrestore(&device_domain_lock, flags);
3743
3744                        iommu_disable_dev_iotlb(info);
3745                        iommu_detach_dev(iommu, info->bus, info->devfn);
3746                        iommu_detach_dependent_devices(iommu, pdev);
3747                        free_devinfo_mem(info);
3748
3749                        spin_lock_irqsave(&device_domain_lock, flags);
3750
3751                        if (found)
3752                                break;
3753                        else
3754                                continue;
3755                }
3756
3757                /* if there are no other devices under the same iommu
3758                 * owned by this domain, clear this iommu in iommu_bmp and
3759                 * update the iommu count and coherency
3760                 */
3761                if (iommu == device_to_iommu(info->segment, info->bus,
3762                                            info->devfn))
3763                        found = 1;
3764        }
3765
3766        spin_unlock_irqrestore(&device_domain_lock, flags);
3767
3768        if (found == 0) {
3769                unsigned long tmp_flags;
3770                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3771                clear_bit(iommu->seq_id, domain->iommu_bmp);
3772                domain->iommu_count--;
3773                domain_update_iommu_cap(domain);
3774                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3775
3776                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3777                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3778                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3779                        clear_bit(domain->id, iommu->domain_ids);
3780                        iommu->domains[domain->id] = NULL;
3781                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3782                }
3783        }
3784}
3785
3786static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3787{
3788        struct device_domain_info *info;
3789        struct intel_iommu *iommu;
3790        unsigned long flags1, flags2;
3791
3792        spin_lock_irqsave(&device_domain_lock, flags1);
3793        while (!list_empty(&domain->devices)) {
3794                info = list_entry(domain->devices.next,
3795                        struct device_domain_info, link);
3796                unlink_domain_info(info);
3797                spin_unlock_irqrestore(&device_domain_lock, flags1);
3798
3799                iommu_disable_dev_iotlb(info);
3800                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3801                iommu_detach_dev(iommu, info->bus, info->devfn);
3802                iommu_detach_dependent_devices(iommu, info->dev);
3803
3804                /* clear this iommu in iommu_bmp, update iommu count
3805                 * and capabilities
3806                 */
3807                spin_lock_irqsave(&domain->iommu_lock, flags2);
3808                if (test_and_clear_bit(iommu->seq_id,
3809                                       domain->iommu_bmp)) {
3810                        domain->iommu_count--;
3811                        domain_update_iommu_cap(domain);
3812                }
3813                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3814
3815                free_devinfo_mem(info);
3816                spin_lock_irqsave(&device_domain_lock, flags1);
3817        }
3818        spin_unlock_irqrestore(&device_domain_lock, flags1);
3819}
3820
3821/* domain id for virtual machine; it won't be set in context entries */
3822static unsigned long vm_domid;
3823
3824static struct dmar_domain *iommu_alloc_vm_domain(void)
3825{
3826        struct dmar_domain *domain;
3827
3828        domain = alloc_domain_mem();
3829        if (!domain)
3830                return NULL;
3831
3832        domain->id = vm_domid++;
3833        domain->nid = -1;
3834        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3835        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3836
3837        return domain;
3838}
3839
3840static int md_domain_init(struct dmar_domain *domain, int guest_width)
3841{
3842        int adjust_width;
3843
3844        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3845        spin_lock_init(&domain->iommu_lock);
3846
3847        domain_reserve_special_ranges(domain);
3848
3849        /* calculate AGAW */
3850        domain->gaw = guest_width;
3851        adjust_width = guestwidth_to_adjustwidth(guest_width);
3852        domain->agaw = width_to_agaw(adjust_width);
3853
3854        INIT_LIST_HEAD(&domain->devices);
3855
3856        domain->iommu_count = 0;
3857        domain->iommu_coherency = 0;
3858        domain->iommu_snooping = 0;
3859        domain->iommu_superpage = 0;
3860        domain->max_addr = 0;
3861        domain->nid = -1;
3862
3863        /* always allocate the top pgd */
3864        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3865        if (!domain->pgd)
3866                return -ENOMEM;
3867        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3868        return 0;
3869}
3870
3871static void iommu_free_vm_domain(struct dmar_domain *domain)
3872{
3873        unsigned long flags;
3874        struct dmar_drhd_unit *drhd;
3875        struct intel_iommu *iommu;
3876        unsigned long i;
3877        unsigned long ndomains;
3878
3879        for_each_drhd_unit(drhd) {
3880                if (drhd->ignored)
3881                        continue;
3882                iommu = drhd->iommu;
3883
3884                ndomains = cap_ndoms(iommu->cap);
3885                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3886                        if (iommu->domains[i] == domain) {
3887                                spin_lock_irqsave(&iommu->lock, flags);
3888                                clear_bit(i, iommu->domain_ids);
3889                                iommu->domains[i] = NULL;
3890                                spin_unlock_irqrestore(&iommu->lock, flags);
3891                                break;
3892                        }
3893                }
3894        }
3895}
3896
3897static void vm_domain_exit(struct dmar_domain *domain)
3898{
3899        /* Domain 0 is reserved, so don't process it */
3900        if (!domain)
3901                return;
3902
3903        vm_domain_remove_all_dev_info(domain);
3904        /* destroy iovas */
3905        put_iova_domain(&domain->iovad);
3906
3907        /* clear ptes */
3908        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3909
3910        /* free page tables */
3911        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3912
3913        iommu_free_vm_domain(domain);
3914        free_domain_mem(domain);
3915}
3916
3917static int intel_iommu_domain_init(struct iommu_domain *domain)
3918{
3919        struct dmar_domain *dmar_domain;
3920
3921        dmar_domain = iommu_alloc_vm_domain();
3922        if (!dmar_domain) {
3923                printk(KERN_ERR
3924                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3925                return -ENOMEM;
3926        }
3927        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3928                printk(KERN_ERR
3929                        "intel_iommu_domain_init() failed\n");
3930                vm_domain_exit(dmar_domain);
3931                return -ENOMEM;
3932        }
3933        domain_update_iommu_cap(dmar_domain);
3934        domain->priv = dmar_domain;
3935
3936        domain->geometry.aperture_start = 0;
3937        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3938        domain->geometry.force_aperture = true;
3939
3940        return 0;
3941}
3942
3943static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3944{
3945        struct dmar_domain *dmar_domain = domain->priv;
3946
3947        domain->priv = NULL;
3948        vm_domain_exit(dmar_domain);
3949}
3950
3951static int intel_iommu_attach_device(struct iommu_domain *domain,
3952                                     struct device *dev)
3953{
3954        struct dmar_domain *dmar_domain = domain->priv;
3955        struct pci_dev *pdev = to_pci_dev(dev);
3956        struct intel_iommu *iommu;
3957        int addr_width;
3958
3959        /* normally pdev is not mapped */
3960        if (unlikely(domain_context_mapped(pdev))) {
3961                struct dmar_domain *old_domain;
3962
3963                old_domain = find_domain(pdev);
3964                if (old_domain) {
3965                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3966                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3967                                domain_remove_one_dev_info(old_domain, pdev);
3968                        else
3969                                domain_remove_dev_info(old_domain);
3970                }
3971        }
3972
3973        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3974                                pdev->devfn);
3975        if (!iommu)
3976                return -ENODEV;
3977
3978        /* check if this iommu agaw is sufficient for max mapped address */
3979        addr_width = agaw_to_width(iommu->agaw);
3980        if (addr_width > cap_mgaw(iommu->cap))
3981                addr_width = cap_mgaw(iommu->cap);
3982
3983        if (dmar_domain->max_addr > (1LL << addr_width)) {
3984                printk(KERN_ERR "%s: iommu width (%d) is not "
3985                       "sufficient for the mapped address (%llx)\n",
3986                       __func__, addr_width, dmar_domain->max_addr);
3987                return -EFAULT;
3988        }
3989        dmar_domain->gaw = addr_width;
3990
3991        /*
3992         * Knock out extra levels of page tables if necessary
3993         */
3994        while (iommu->agaw < dmar_domain->agaw) {
3995                struct dma_pte *pte;
3996
3997                pte = dmar_domain->pgd;
3998                if (dma_pte_present(pte)) {
3999                        dmar_domain->pgd = (struct dma_pte *)
4000                                phys_to_virt(dma_pte_addr(pte));
4001                        free_pgtable_page(pte);
4002                }
4003                dmar_domain->agaw--;
4004        }
4005
4006        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4007}
4008
4009static void intel_iommu_detach_device(struct iommu_domain *domain,
4010                                      struct device *dev)
4011{
4012        struct dmar_domain *dmar_domain = domain->priv;
4013        struct pci_dev *pdev = to_pci_dev(dev);
4014
4015        domain_remove_one_dev_info(dmar_domain, pdev);
4016}
4017
4018static int intel_iommu_map(struct iommu_domain *domain,
4019                           unsigned long iova, phys_addr_t hpa,
4020                           size_t size, int iommu_prot)
4021{
4022        struct dmar_domain *dmar_domain = domain->priv;
4023        u64 max_addr;
4024        int prot = 0;
4025        int ret;
4026
4027        if (iommu_prot & IOMMU_READ)
4028                prot |= DMA_PTE_READ;
4029        if (iommu_prot & IOMMU_WRITE)
4030                prot |= DMA_PTE_WRITE;
4031        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4032                prot |= DMA_PTE_SNP;
4033
4034        max_addr = iova + size;
4035        if (dmar_domain->max_addr < max_addr) {
4036                u64 end;
4037
4038                /* check if minimum agaw is sufficient for mapped address */
4039                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4040                if (end < max_addr) {
4041                        printk(KERN_ERR "%s: iommu width (%d) is not "
4042                               "sufficient for the mapped address (%llx)\n",
4043                               __func__, dmar_domain->gaw, max_addr);
4044                        return -EFAULT;
4045                }
4046                dmar_domain->max_addr = max_addr;
4047        }
4048        /* Round up size to next multiple of PAGE_SIZE, if it and
4049           the low bits of hpa would take us onto the next page */
4050        size = aligned_nrpages(hpa, size);
4051        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4052                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4053        return ret;
4054}
4055
4056static size_t intel_iommu_unmap(struct iommu_domain *domain,
4057                             unsigned long iova, size_t size)
4058{
4059        struct dmar_domain *dmar_domain = domain->priv;
4060        int order;
4061
4062        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4063                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4064
4065        if (dmar_domain->max_addr == iova + size)
4066                dmar_domain->max_addr = iova;
4067
4068        return PAGE_SIZE << order;
4069}
4070
4071static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4072                                            unsigned long iova)
4073{
4074        struct dmar_domain *dmar_domain = domain->priv;
4075        struct dma_pte *pte;
4076        u64 phys = 0;
4077
4078        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4079        if (pte)
4080                phys = dma_pte_addr(pte);
4081
4082        return phys;
4083}
4084
4085static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4086                                      unsigned long cap)
4087{
4088        struct dmar_domain *dmar_domain = domain->priv;
4089
4090        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4091                return dmar_domain->iommu_snooping;
4092        if (cap == IOMMU_CAP_INTR_REMAP)
4093                return irq_remapping_enabled;
4094
4095        return 0;
4096}
4097
4098static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4099{
4100        pci_dev_put(*from);
4101        *from = to;
4102}
4103
4104#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4105
4106static int intel_iommu_add_device(struct device *dev)
4107{
4108        struct pci_dev *pdev = to_pci_dev(dev);
4109        struct pci_dev *bridge, *dma_pdev;
4110        struct iommu_group *group;
4111        int ret;
4112
4113        if (!device_to_iommu(pci_domain_nr(pdev->bus),
4114                             pdev->bus->number, pdev->devfn))
4115                return -ENODEV;
4116
4117        bridge = pci_find_upstream_pcie_bridge(pdev);
4118        if (bridge) {
4119                if (pci_is_pcie(bridge))
4120                        dma_pdev = pci_get_domain_bus_and_slot(
4121                                                pci_domain_nr(pdev->bus),
4122                                                bridge->subordinate->number, 0);
4123                else
4124                        dma_pdev = pci_dev_get(bridge);
4125        } else
4126                dma_pdev = pci_dev_get(pdev);
4127
4128        /* Account for quirked devices */
4129        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4130
4131        /*
4132         * If it's a multifunction device that does not support our
4133         * required ACS flags, add to the same group as function 0.
4134         */
4135        if (dma_pdev->multifunction &&
4136            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4137                swap_pci_ref(&dma_pdev,
4138                             pci_get_slot(dma_pdev->bus,
4139                                          PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4140                                          0)));
4141
4142        /*
4143         * Devices on the root bus go through the iommu.  If that's not us,
4144         * find the next upstream device and test ACS up to the root bus.
4145         * Finding the next device may require skipping virtual buses.
4146         */
4147        while (!pci_is_root_bus(dma_pdev->bus)) {
4148                struct pci_bus *bus = dma_pdev->bus;
4149
4150                while (!bus->self) {
4151                        if (!pci_is_root_bus(bus))
4152                                bus = bus->parent;
4153                        else
4154                                goto root_bus;
4155                }
4156
4157                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4158                        break;
4159
4160                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4161        }
4162
4163root_bus:
4164        group = iommu_group_get(&dma_pdev->dev);
4165        pci_dev_put(dma_pdev);
4166        if (!group) {
4167                group = iommu_group_alloc();
4168                if (IS_ERR(group))
4169                        return PTR_ERR(group);
4170        }
4171
4172        ret = iommu_group_add_device(group, dev);
4173
4174        iommu_group_put(group);
4175        return ret;
4176}
4177
4178static void intel_iommu_remove_device(struct device *dev)
4179{
4180        iommu_group_remove_device(dev);
4181}
4182
4183static struct iommu_ops intel_iommu_ops = {
4184        .domain_init    = intel_iommu_domain_init,
4185        .domain_destroy = intel_iommu_domain_destroy,
4186        .attach_dev     = intel_iommu_attach_device,
4187        .detach_dev     = intel_iommu_detach_device,
4188        .map            = intel_iommu_map,
4189        .unmap          = intel_iommu_unmap,
4190        .iova_to_phys   = intel_iommu_iova_to_phys,
4191        .domain_has_cap = intel_iommu_domain_has_cap,
4192        .add_device     = intel_iommu_add_device,
4193        .remove_device  = intel_iommu_remove_device,
4194        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4195};
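
/*
 * These callbacks back the generic IOMMU API once intel_iommu_init() has
 * registered them with bus_set_iommu().  A rough sketch of a caller (for
 * instance a device-assignment path), with made-up iova/size values and
 * 'dev'/'page' standing in for a real device and page:
 *
 *        struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *        if (!dom)
 *                return -ENOMEM;
 *        if (iommu_attach_device(dom, dev))
 *                goto err_free;
 *        iommu_map(dom, 0x100000, page_to_phys(page), PAGE_SIZE,
 *                  IOMMU_READ | IOMMU_WRITE);
 *        ...
 *        iommu_unmap(dom, 0x100000, PAGE_SIZE);
 *        iommu_detach_device(dom, dev);
 *        iommu_domain_free(dom);
 *
 * iommu_domain_alloc() lands in intel_iommu_domain_init(), iommu_map() in
 * intel_iommu_map(), and so on through the table above.
 */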
4196
4197static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4198{
4199        /*
4200         * Mobile 4 Series Chipset neglects to set RWBF capability,
4201         * but needs it:
4202         */
4203        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4204        rwbf_quirk = 1;
4205
4206        /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4207        if (dev->revision == 0x07) {
4208                printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4209                dmar_map_gfx = 0;
4210        }
4211}
4212
4213DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4214
4215#define GGC 0x52
4216#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4217#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4218#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4219#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4220#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4221#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4222#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4223#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4224
4225static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4226{
4227        unsigned short ggc;
4228
4229        if (pci_read_config_word(dev, GGC, &ggc))
4230                return;
4231
4232        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4233                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4234                dmar_map_gfx = 0;
4235        } else if (dmar_map_gfx) {
4236                /* we have to ensure the gfx device is idle before we flush */
4237                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4238                intel_iommu_strict = 1;
4239        }
4240}
4241DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4242DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4243DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4244DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4245
4246/* On Tylersburg chipsets, some BIOSes have been known to enable the
4247   ISOCH DMAR unit for the Azalia sound device, but not give it any
4248   TLB entries, which causes it to deadlock. Check for that.  We do
4249   this in a function called from init_dmars(), instead of in a PCI
4250   quirk, because we don't want to print the obnoxious "BIOS broken"
4251   message if VT-d is actually disabled.
4252*/
4253static void __init check_tylersburg_isoch(void)
4254{
4255        struct pci_dev *pdev;
4256        uint32_t vtisochctrl;
4257
4258        /* If there's no Azalia in the system anyway, forget it. */
4259        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4260        if (!pdev)
4261                return;
4262        pci_dev_put(pdev);
4263
4264        /* System Management Registers. Might be hidden, in which case
4265           we can't do the sanity check. But that's OK, because the
4266           known-broken BIOSes _don't_ actually hide it, so far. */
4267        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4268        if (!pdev)
4269                return;
4270
4271        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4272                pci_dev_put(pdev);
4273                return;
4274        }
4275
4276        pci_dev_put(pdev);
4277
4278        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4279        if (vtisochctrl & 1)
4280                return;
4281
4282        /* Drop all bits other than the number of TLB entries */
4283        vtisochctrl &= 0x1c;
4284
4285        /* If we have the recommended number of TLB entries (16), fine. */
4286        if (vtisochctrl == 0x10)
4287                return;
4288
4289        /* Zero TLB entries? You get to ride the short bus to school. */
4290        if (!vtisochctrl) {
4291                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4292                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4293                     dmi_get_system_info(DMI_BIOS_VENDOR),
4294                     dmi_get_system_info(DMI_BIOS_VERSION),
4295                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4296                iommu_identity_mapping |= IDENTMAP_AZALIA;
4297                return;
4298        }
4299
4300        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4301               vtisochctrl);
4302}
4303