linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#define ROOT_SIZE               VTD_PAGE_SIZE
  50#define CONTEXT_SIZE            VTD_PAGE_SIZE
  51
  52#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  53#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  54#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  55
  56#define IOAPIC_RANGE_START      (0xfee00000)
  57#define IOAPIC_RANGE_END        (0xfeefffff)
  58#define IOVA_START_ADDR         (0x1000)
  59
  60#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  61
  62#define MAX_AGAW_WIDTH 64
  63
  64#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  65#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  66
  67/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  68   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  69#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  70                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  71#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  72
  73#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  74#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  75#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  76
  77/* page table handling */
  78#define LEVEL_STRIDE            (9)
  79#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  80
  81/*
   82 * This bitmap is used to advertise the page sizes our hardware supports
  83 * to the IOMMU core, which will then use this information to split
  84 * physically contiguous memory regions it is mapping into page sizes
  85 * that we support.
  86 *
  87 * Traditionally the IOMMU core just handed us the mappings directly,
  88 * after making sure the size is an order of a 4KiB page and that the
  89 * mapping has natural alignment.
  90 *
  91 * To retain this behavior, we currently advertise that we support
  92 * all page sizes that are an order of 4KiB.
  93 *
  94 * If at some point we'd like to utilize the IOMMU core's new behavior,
  95 * we could change this to advertise the real page sizes we support.
  96 */
  97#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
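/*
 * Editor's illustration (not part of the driver): the IOMMU core treats
 * this value as a bitmap in which bit n set means "a 2^n byte page size
 * is supported".  ~0xFFFUL sets every bit from 12 upwards, so it
 * advertises 4KiB, 8KiB, 16KiB, ... i.e. every power-of-two multiple of
 * 4KiB, matching the "all page sizes that are an order of 4KiB"
 * behaviour described in the comment above.
 */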
  98
  99static inline int agaw_to_level(int agaw)
 100{
 101        return agaw + 2;
 102}
 103
 104static inline int agaw_to_width(int agaw)
 105{
 106        return 30 + agaw * LEVEL_STRIDE;
 107}
 108
 109static inline int width_to_agaw(int width)
 110{
 111        return (width - 30) / LEVEL_STRIDE;
 112}
 113
 114static inline unsigned int level_to_offset_bits(int level)
 115{
 116        return (level - 1) * LEVEL_STRIDE;
 117}
 118
 119static inline int pfn_level_offset(unsigned long pfn, int level)
 120{
 121        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 122}
 123
 124static inline unsigned long level_mask(int level)
 125{
 126        return -1UL << level_to_offset_bits(level);
 127}
 128
 129static inline unsigned long level_size(int level)
 130{
 131        return 1UL << level_to_offset_bits(level);
 132}
 133
 134static inline unsigned long align_to_level(unsigned long pfn, int level)
 135{
 136        return (pfn + level_size(level) - 1) & level_mask(level);
 137}
 138
 139static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 140{
 141        return  1 << ((lvl - 1) * LEVEL_STRIDE);
 142}
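/*
 * Editor's worked example (illustrative only): with the default 48-bit
 * domain width, width_to_agaw(48) = (48 - 30) / 9 = 2 and
 * agaw_to_level(2) = 4, i.e. a four-level page table.  Each level then
 * decodes one 9-bit slice of the page frame number:
 * level_to_offset_bits() gives 27/18/9/0 for levels 4/3/2/1, and
 * pfn_level_offset(pfn, level) extracts the 512-entry table index for
 * that level.  lvl_to_nr_pages(2) = 512, so one level-2 (2MiB) entry
 * spans 512 4KiB VT-d pages.
 */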
 143
  144/* VT-d pages must never be _larger_ than MM pages. Otherwise things
  145   are never going to work. */
 146static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 147{
 148        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 149}
 150
 151static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 152{
 153        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 154}
 155static inline unsigned long page_to_dma_pfn(struct page *pg)
 156{
 157        return mm_to_dma_pfn(page_to_pfn(pg));
 158}
 159static inline unsigned long virt_to_dma_pfn(void *p)
 160{
 161        return page_to_dma_pfn(virt_to_page(p));
 162}
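/*
 * Editor's note (illustrative only): on a kernel built with 4KiB pages,
 * PAGE_SHIFT == VTD_PAGE_SHIFT == 12 and these conversions are the
 * identity.  On a hypothetical 16KiB-page kernel, one mm pfn would
 * correspond to four consecutive VT-d pfns, e.g. mm_to_dma_pfn(3) == 12.
 */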
 163
 164/* global iommu list, set NULL for ignored DMAR units */
 165static struct intel_iommu **g_iommus;
 166
 167static void __init check_tylersburg_isoch(void);
 168static int rwbf_quirk;
 169
 170/*
  171 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
 172 * (used when kernel is launched w/ TXT)
 173 */
 174static int force_on = 0;
 175
 176/*
 177 * 0: Present
 178 * 1-11: Reserved
 179 * 12-63: Context Ptr (12 - (haw-1))
 180 * 64-127: Reserved
 181 */
 182struct root_entry {
 183        u64     val;
 184        u64     rsvd1;
 185};
 186#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 187static inline bool root_present(struct root_entry *root)
 188{
 189        return (root->val & 1);
 190}
 191static inline void set_root_present(struct root_entry *root)
 192{
 193        root->val |= 1;
 194}
 195static inline void set_root_value(struct root_entry *root, unsigned long value)
 196{
 197        root->val |= value & VTD_PAGE_MASK;
 198}
 199
 200static inline struct context_entry *
 201get_context_addr_from_root(struct root_entry *root)
 202{
 203        return (struct context_entry *)
 204                (root_present(root)?phys_to_virt(
 205                root->val & VTD_PAGE_MASK) :
 206                NULL);
 207}
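/*
 * Editor's sketch of the lookup path (illustrative, not driver code):
 * ROOT_ENTRY_NR is 4096 / 16 = 256, so the root table has one entry per
 * PCI bus number, and each present root entry points to a 256-entry
 * context table indexed by devfn.  Finding the context entry for device
 * 1a:03.0 therefore looks roughly like:
 *
 *	root    = &iommu->root_entry[0x1a];
 *	context = get_context_addr_from_root(root);
 *	entry   = &context[PCI_DEVFN(3, 0)];	/- devfn 0x18 -/
 */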
 208
 209/*
 210 * low 64 bits:
 211 * 0: present
 212 * 1: fault processing disable
 213 * 2-3: translation type
 214 * 12-63: address space root
 215 * high 64 bits:
 216 * 0-2: address width
  217 * 3-6: avail
 218 * 8-23: domain id
 219 */
 220struct context_entry {
 221        u64 lo;
 222        u64 hi;
 223};
 224
 225static inline bool context_present(struct context_entry *context)
 226{
 227        return (context->lo & 1);
 228}
 229static inline void context_set_present(struct context_entry *context)
 230{
 231        context->lo |= 1;
 232}
 233
 234static inline void context_set_fault_enable(struct context_entry *context)
 235{
 236        context->lo &= (((u64)-1) << 2) | 1;
 237}
 238
 239static inline void context_set_translation_type(struct context_entry *context,
 240                                                unsigned long value)
 241{
 242        context->lo &= (((u64)-1) << 4) | 3;
 243        context->lo |= (value & 3) << 2;
 244}
 245
 246static inline void context_set_address_root(struct context_entry *context,
 247                                            unsigned long value)
 248{
 249        context->lo |= value & VTD_PAGE_MASK;
 250}
 251
 252static inline void context_set_address_width(struct context_entry *context,
 253                                             unsigned long value)
 254{
 255        context->hi |= value & 7;
 256}
 257
 258static inline void context_set_domain_id(struct context_entry *context,
 259                                         unsigned long value)
 260{
 261        context->hi |= (value & ((1 << 16) - 1)) << 8;
 262}
 263
 264static inline void context_clear_entry(struct context_entry *context)
 265{
 266        context->lo = 0;
 267        context->hi = 0;
 268}
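/*
 * Editor's worked example (illustrative only): building a present
 * multi-level entry for domain id 42 with a 4-level (agaw 2) table at
 * physical address pgd_phys would, via the helpers above, end up as
 * roughly:
 *
 *	lo = (pgd_phys & VTD_PAGE_MASK) | 1;	bits: address root + present
 *	hi = (42 << 8) | 2;			bits: domain id + address width
 *
 * The translation type field (bits 2-3 of lo) stays 0 here, i.e.
 * CONTEXT_TT_MULTI_LEVEL, for ordinary multi-level translation.
 */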
 269
 270/*
 271 * 0: readable
 272 * 1: writable
 273 * 2-6: reserved
 274 * 7: super page
 275 * 8-10: available
 276 * 11: snoop behavior
  277 * 12-63: Host physical address
 278 */
 279struct dma_pte {
 280        u64 val;
 281};
 282
 283static inline void dma_clear_pte(struct dma_pte *pte)
 284{
 285        pte->val = 0;
 286}
 287
 288static inline void dma_set_pte_readable(struct dma_pte *pte)
 289{
 290        pte->val |= DMA_PTE_READ;
 291}
 292
 293static inline void dma_set_pte_writable(struct dma_pte *pte)
 294{
 295        pte->val |= DMA_PTE_WRITE;
 296}
 297
 298static inline void dma_set_pte_snp(struct dma_pte *pte)
 299{
 300        pte->val |= DMA_PTE_SNP;
 301}
 302
 303static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 304{
 305        pte->val = (pte->val & ~3) | (prot & 3);
 306}
 307
 308static inline u64 dma_pte_addr(struct dma_pte *pte)
 309{
 310#ifdef CONFIG_64BIT
 311        return pte->val & VTD_PAGE_MASK;
 312#else
 313        /* Must have a full atomic 64-bit read */
 314        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 315#endif
 316}
 317
 318static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 319{
 320        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 321}
 322
 323static inline bool dma_pte_present(struct dma_pte *pte)
 324{
 325        return (pte->val & 3) != 0;
 326}
 327
 328static inline bool dma_pte_superpage(struct dma_pte *pte)
 329{
 330        return (pte->val & (1 << 7));
 331}
 332
 333static inline int first_pte_in_page(struct dma_pte *pte)
 334{
 335        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 336}
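/*
 * Editor's worked example (illustrative only): a leaf PTE mapping host
 * pfn 0x12345 read/write is val = (0x12345 << 12) | DMA_PTE_READ |
 * DMA_PTE_WRITE = 0x12345003.  dma_pte_present() tests the R/W bits,
 * dma_pte_superpage() tests bit 7, and first_pte_in_page() is true when
 * the pte pointer is 4KiB aligned, i.e. it is entry 0 of its page-table
 * page.
 */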
 337
 338/*
 339 * This domain is a statically identity mapping domain.
  340 *      1. This domain creates a static 1:1 mapping to all usable memory.
  341 *      2. It maps to each iommu if successful.
  342 *      3. Each iommu maps to this domain if successful.
 343 */
 344static struct dmar_domain *si_domain;
 345static int hw_pass_through = 1;
 346
 347/* devices under the same p2p bridge are owned in one domain */
 348#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 349
  350/* domain represents a virtual machine, more than one device
 351 * across iommus may be owned in one domain, e.g. kvm guest.
 352 */
 353#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 354
  355/* si_domain contains multiple devices */
 356#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 357
 358/* define the limit of IOMMUs supported in each domain */
 359#ifdef  CONFIG_X86
 360# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 361#else
 362# define        IOMMU_UNITS_SUPPORTED   64
 363#endif
 364
 365struct dmar_domain {
 366        int     id;                     /* domain id */
 367        int     nid;                    /* node id */
 368        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
 369                                        /* bitmap of iommus this domain uses*/
 370
 371        struct list_head devices;       /* all devices' list */
 372        struct iova_domain iovad;       /* iova's that belong to this domain */
 373
 374        struct dma_pte  *pgd;           /* virtual address */
 375        int             gaw;            /* max guest address width */
 376
 377        /* adjusted guest address width, 0 is level 2 30-bit */
 378        int             agaw;
 379
 380        int             flags;          /* flags to find out type of domain */
 381
 382        int             iommu_coherency;/* indicate coherency of iommu access */
 383        int             iommu_snooping; /* indicate snooping control feature*/
 384        int             iommu_count;    /* reference count of iommu */
 385        int             iommu_superpage;/* Level of superpages supported:
 386                                           0 == 4KiB (no superpages), 1 == 2MiB,
 387                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 388        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 389        u64             max_addr;       /* maximum mapped address */
 390};
 391
 392/* PCI domain-device relationship */
 393struct device_domain_info {
 394        struct list_head link;  /* link to domain siblings */
 395        struct list_head global; /* link to global list */
 396        int segment;            /* PCI domain */
 397        u8 bus;                 /* PCI bus number */
 398        u8 devfn;               /* PCI devfn number */
 399        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 400        struct intel_iommu *iommu; /* IOMMU used by this device */
 401        struct dmar_domain *domain; /* pointer to domain */
 402};
 403
 404static void flush_unmaps_timeout(unsigned long data);
 405
 406DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 407
 408#define HIGH_WATER_MARK 250
 409struct deferred_flush_tables {
 410        int next;
 411        struct iova *iova[HIGH_WATER_MARK];
 412        struct dmar_domain *domain[HIGH_WATER_MARK];
 413};
 414
 415static struct deferred_flush_tables *deferred_flush;
 416
  417/* number of registered intel_iommus; bounds the iommu-indexing bitmaps */
 418static int g_num_of_iommus;
 419
 420static DEFINE_SPINLOCK(async_umap_flush_lock);
 421static LIST_HEAD(unmaps_to_do);
 422
 423static int timer_on;
 424static long list_size;
 425
 426static void domain_remove_dev_info(struct dmar_domain *domain);
 427
 428#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 429int dmar_disabled = 0;
 430#else
 431int dmar_disabled = 1;
 432#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 433
 434int intel_iommu_enabled = 0;
 435EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 436
 437static int dmar_map_gfx = 1;
 438static int dmar_forcedac;
 439static int intel_iommu_strict;
 440static int intel_iommu_superpage = 1;
 441
 442int intel_iommu_gfx_mapped;
 443EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 444
 445#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 446static DEFINE_SPINLOCK(device_domain_lock);
 447static LIST_HEAD(device_domain_list);
 448
 449static struct iommu_ops intel_iommu_ops;
 450
 451static int __init intel_iommu_setup(char *str)
 452{
 453        if (!str)
 454                return -EINVAL;
 455        while (*str) {
 456                if (!strncmp(str, "on", 2)) {
 457                        dmar_disabled = 0;
 458                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 459                } else if (!strncmp(str, "off", 3)) {
 460                        dmar_disabled = 1;
 461                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 462                } else if (!strncmp(str, "igfx_off", 8)) {
 463                        dmar_map_gfx = 0;
 464                        printk(KERN_INFO
 465                                "Intel-IOMMU: disable GFX device mapping\n");
 466                } else if (!strncmp(str, "forcedac", 8)) {
 467                        printk(KERN_INFO
 468                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 469                        dmar_forcedac = 1;
 470                } else if (!strncmp(str, "strict", 6)) {
 471                        printk(KERN_INFO
 472                                "Intel-IOMMU: disable batched IOTLB flush\n");
 473                        intel_iommu_strict = 1;
 474                } else if (!strncmp(str, "sp_off", 6)) {
 475                        printk(KERN_INFO
 476                                "Intel-IOMMU: disable supported super page\n");
 477                        intel_iommu_superpage = 0;
 478                }
 479
 480                str += strcspn(str, ",");
 481                while (*str == ',')
 482                        str++;
 483        }
 484        return 0;
 485}
 486__setup("intel_iommu=", intel_iommu_setup);
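/*
 * Editor's usage note (illustrative only): options are comma separated
 * and parsed in order by the loop above, so a boot command line such as
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage use in one go.
 */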
 487
 488static struct kmem_cache *iommu_domain_cache;
 489static struct kmem_cache *iommu_devinfo_cache;
 490static struct kmem_cache *iommu_iova_cache;
 491
 492static inline void *alloc_pgtable_page(int node)
 493{
 494        struct page *page;
 495        void *vaddr = NULL;
 496
 497        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 498        if (page)
 499                vaddr = page_address(page);
 500        return vaddr;
 501}
 502
 503static inline void free_pgtable_page(void *vaddr)
 504{
 505        free_page((unsigned long)vaddr);
 506}
 507
 508static inline void *alloc_domain_mem(void)
 509{
 510        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 511}
 512
 513static void free_domain_mem(void *vaddr)
 514{
 515        kmem_cache_free(iommu_domain_cache, vaddr);
 516}
 517
 518static inline void * alloc_devinfo_mem(void)
 519{
 520        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 521}
 522
 523static inline void free_devinfo_mem(void *vaddr)
 524{
 525        kmem_cache_free(iommu_devinfo_cache, vaddr);
 526}
 527
 528struct iova *alloc_iova_mem(void)
 529{
 530        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 531}
 532
 533void free_iova_mem(struct iova *iova)
 534{
 535        kmem_cache_free(iommu_iova_cache, iova);
 536}
 537
 538
 539static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 540{
 541        unsigned long sagaw;
 542        int agaw = -1;
 543
 544        sagaw = cap_sagaw(iommu->cap);
 545        for (agaw = width_to_agaw(max_gaw);
 546             agaw >= 0; agaw--) {
 547                if (test_bit(agaw, &sagaw))
 548                        break;
 549        }
 550
 551        return agaw;
 552}
 553
 554/*
 555 * Calculate max SAGAW for each iommu.
 556 */
 557int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 558{
 559        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 560}
 561
 562/*
 563 * calculate agaw for each iommu.
 564 * "SAGAW" may be different across iommus, use a default agaw, and
 565 * get a supported less agaw for iommus that don't support the default agaw.
 566 */
 567int iommu_calculate_agaw(struct intel_iommu *iommu)
 568{
 569        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 570}
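/*
 * Editor's worked example (illustrative only): cap_sagaw() yields a
 * 5-bit field with one bit per supported adjusted guest address width.
 * If it reads 0b00100 (only agaw 2, i.e. 4-level / 48-bit, supported),
 * then iommu_calculate_agaw() starts at width_to_agaw(48) = 2, finds
 * bit 2 set and returns 2, while iommu_calculate_max_sagaw() starts at
 * width_to_agaw(64) = 3 and walks down until it also lands on 2.
 */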
 571
  572/* This function only returns a single iommu in a domain */
 573static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 574{
 575        int iommu_id;
 576
 577        /* si_domain and vm domain should not get here. */
 578        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 579        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 580
 581        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 582        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 583                return NULL;
 584
 585        return g_iommus[iommu_id];
 586}
 587
 588static void domain_update_iommu_coherency(struct dmar_domain *domain)
 589{
 590        int i;
 591
 592        i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 593
 594        domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
 595
 596        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 597                if (!ecap_coherent(g_iommus[i]->ecap)) {
 598                        domain->iommu_coherency = 0;
 599                        break;
 600                }
 601        }
 602}
 603
 604static void domain_update_iommu_snooping(struct dmar_domain *domain)
 605{
 606        int i;
 607
 608        domain->iommu_snooping = 1;
 609
 610        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 611                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 612                        domain->iommu_snooping = 0;
 613                        break;
 614                }
 615        }
 616}
 617
 618static void domain_update_iommu_superpage(struct dmar_domain *domain)
 619{
 620        struct dmar_drhd_unit *drhd;
 621        struct intel_iommu *iommu = NULL;
 622        int mask = 0xf;
 623
 624        if (!intel_iommu_superpage) {
 625                domain->iommu_superpage = 0;
 626                return;
 627        }
 628
 629        /* set iommu_superpage to the smallest common denominator */
 630        for_each_active_iommu(iommu, drhd) {
 631                mask &= cap_super_page_val(iommu->cap);
 632                if (!mask) {
 633                        break;
 634                }
 635        }
 636        domain->iommu_superpage = fls(mask);
 637}
 638
 639/* Some capabilities may be different across iommus */
 640static void domain_update_iommu_cap(struct dmar_domain *domain)
 641{
 642        domain_update_iommu_coherency(domain);
 643        domain_update_iommu_snooping(domain);
 644        domain_update_iommu_superpage(domain);
 645}
 646
 647static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 648{
 649        struct dmar_drhd_unit *drhd = NULL;
 650        int i;
 651
 652        for_each_drhd_unit(drhd) {
 653                if (drhd->ignored)
 654                        continue;
 655                if (segment != drhd->segment)
 656                        continue;
 657
 658                for (i = 0; i < drhd->devices_cnt; i++) {
 659                        if (drhd->devices[i] &&
 660                            drhd->devices[i]->bus->number == bus &&
 661                            drhd->devices[i]->devfn == devfn)
 662                                return drhd->iommu;
 663                        if (drhd->devices[i] &&
 664                            drhd->devices[i]->subordinate &&
 665                            drhd->devices[i]->subordinate->number <= bus &&
 666                            drhd->devices[i]->subordinate->busn_res.end >= bus)
 667                                return drhd->iommu;
 668                }
 669
 670                if (drhd->include_all)
 671                        return drhd->iommu;
 672        }
 673
 674        return NULL;
 675}
 676
 677static void domain_flush_cache(struct dmar_domain *domain,
 678                               void *addr, int size)
 679{
 680        if (!domain->iommu_coherency)
 681                clflush_cache_range(addr, size);
 682}
 683
 684/* Gets context entry for a given bus and devfn */
 685static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 686                u8 bus, u8 devfn)
 687{
 688        struct root_entry *root;
 689        struct context_entry *context;
 690        unsigned long phy_addr;
 691        unsigned long flags;
 692
 693        spin_lock_irqsave(&iommu->lock, flags);
 694        root = &iommu->root_entry[bus];
 695        context = get_context_addr_from_root(root);
 696        if (!context) {
 697                context = (struct context_entry *)
 698                                alloc_pgtable_page(iommu->node);
 699                if (!context) {
 700                        spin_unlock_irqrestore(&iommu->lock, flags);
 701                        return NULL;
 702                }
 703                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 704                phy_addr = virt_to_phys((void *)context);
 705                set_root_value(root, phy_addr);
 706                set_root_present(root);
 707                __iommu_flush_cache(iommu, root, sizeof(*root));
 708        }
 709        spin_unlock_irqrestore(&iommu->lock, flags);
 710        return &context[devfn];
 711}
 712
 713static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 714{
 715        struct root_entry *root;
 716        struct context_entry *context;
 717        int ret;
 718        unsigned long flags;
 719
 720        spin_lock_irqsave(&iommu->lock, flags);
 721        root = &iommu->root_entry[bus];
 722        context = get_context_addr_from_root(root);
 723        if (!context) {
 724                ret = 0;
 725                goto out;
 726        }
 727        ret = context_present(&context[devfn]);
 728out:
 729        spin_unlock_irqrestore(&iommu->lock, flags);
 730        return ret;
 731}
 732
 733static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 734{
 735        struct root_entry *root;
 736        struct context_entry *context;
 737        unsigned long flags;
 738
 739        spin_lock_irqsave(&iommu->lock, flags);
 740        root = &iommu->root_entry[bus];
 741        context = get_context_addr_from_root(root);
 742        if (context) {
 743                context_clear_entry(&context[devfn]);
 744                __iommu_flush_cache(iommu, &context[devfn], \
 745                        sizeof(*context));
 746        }
 747        spin_unlock_irqrestore(&iommu->lock, flags);
 748}
 749
 750static void free_context_table(struct intel_iommu *iommu)
 751{
 752        struct root_entry *root;
 753        int i;
 754        unsigned long flags;
 755        struct context_entry *context;
 756
 757        spin_lock_irqsave(&iommu->lock, flags);
 758        if (!iommu->root_entry) {
 759                goto out;
 760        }
 761        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 762                root = &iommu->root_entry[i];
 763                context = get_context_addr_from_root(root);
 764                if (context)
 765                        free_pgtable_page(context);
 766        }
 767        free_pgtable_page(iommu->root_entry);
 768        iommu->root_entry = NULL;
 769out:
 770        spin_unlock_irqrestore(&iommu->lock, flags);
 771}
 772
 773static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 774                                      unsigned long pfn, int target_level)
 775{
 776        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 777        struct dma_pte *parent, *pte = NULL;
 778        int level = agaw_to_level(domain->agaw);
 779        int offset;
 780
 781        BUG_ON(!domain->pgd);
 782        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 783        parent = domain->pgd;
 784
 785        while (level > 0) {
 786                void *tmp_page;
 787
 788                offset = pfn_level_offset(pfn, level);
 789                pte = &parent[offset];
 790                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 791                        break;
 792                if (level == target_level)
 793                        break;
 794
 795                if (!dma_pte_present(pte)) {
 796                        uint64_t pteval;
 797
 798                        tmp_page = alloc_pgtable_page(domain->nid);
 799
 800                        if (!tmp_page)
 801                                return NULL;
 802
 803                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 804                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 805                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 806                                /* Someone else set it while we were thinking; use theirs. */
 807                                free_pgtable_page(tmp_page);
 808                        } else {
 809                                dma_pte_addr(pte);
 810                                domain_flush_cache(domain, pte, sizeof(*pte));
 811                        }
 812                }
 813                parent = phys_to_virt(dma_pte_addr(pte));
 814                level--;
 815        }
 816
 817        return pte;
 818}
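/*
 * Editor's note on the walk above (illustrative only): for a 4-level
 * domain the 9-bit slices pfn[35:27], pfn[26:18], pfn[17:9] and pfn[8:0]
 * index levels 4 down to 1.  target_level 1 returns a 4KiB leaf slot and
 * target_level 2 a 2MiB superpage slot, while target_level 0 descends as
 * far as existing tables go, stopping at a superpage or non-present
 * entry without allocating anything.  Missing intermediate tables are
 * installed with cmpxchg64() so that a racing mapper's page is reused
 * rather than leaked.
 */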
 819
 820
 821/* return address's pte at specific level */
 822static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 823                                         unsigned long pfn,
 824                                         int level, int *large_page)
 825{
 826        struct dma_pte *parent, *pte = NULL;
 827        int total = agaw_to_level(domain->agaw);
 828        int offset;
 829
 830        parent = domain->pgd;
 831        while (level <= total) {
 832                offset = pfn_level_offset(pfn, total);
 833                pte = &parent[offset];
 834                if (level == total)
 835                        return pte;
 836
 837                if (!dma_pte_present(pte)) {
 838                        *large_page = total;
 839                        break;
 840                }
 841
 842                if (pte->val & DMA_PTE_LARGE_PAGE) {
 843                        *large_page = total;
 844                        return pte;
 845                }
 846
 847                parent = phys_to_virt(dma_pte_addr(pte));
 848                total--;
 849        }
 850        return NULL;
 851}
 852
  853/* clear last level pte; a tlb flush should follow */
 854static int dma_pte_clear_range(struct dmar_domain *domain,
 855                                unsigned long start_pfn,
 856                                unsigned long last_pfn)
 857{
 858        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 859        unsigned int large_page = 1;
 860        struct dma_pte *first_pte, *pte;
 861        int order;
 862
 863        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 864        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 865        BUG_ON(start_pfn > last_pfn);
 866
 867        /* we don't need lock here; nobody else touches the iova range */
 868        do {
 869                large_page = 1;
 870                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 871                if (!pte) {
 872                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 873                        continue;
 874                }
 875                do {
 876                        dma_clear_pte(pte);
 877                        start_pfn += lvl_to_nr_pages(large_page);
 878                        pte++;
 879                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 880
 881                domain_flush_cache(domain, first_pte,
 882                                   (void *)pte - (void *)first_pte);
 883
 884        } while (start_pfn && start_pfn <= last_pfn);
 885
 886        order = (large_page - 1) * 9;
 887        return order;
 888}
 889
 890/* free page table pages. last level pte should already be cleared */
 891static void dma_pte_free_pagetable(struct dmar_domain *domain,
 892                                   unsigned long start_pfn,
 893                                   unsigned long last_pfn)
 894{
 895        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 896        struct dma_pte *first_pte, *pte;
 897        int total = agaw_to_level(domain->agaw);
 898        int level;
 899        unsigned long tmp;
 900        int large_page = 2;
 901
 902        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 903        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 904        BUG_ON(start_pfn > last_pfn);
 905
 906        /* We don't need lock here; nobody else touches the iova range */
 907        level = 2;
 908        while (level <= total) {
 909                tmp = align_to_level(start_pfn, level);
 910
 911                /* If we can't even clear one PTE at this level, we're done */
 912                if (tmp + level_size(level) - 1 > last_pfn)
 913                        return;
 914
 915                do {
 916                        large_page = level;
 917                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
 918                        if (large_page > level)
 919                                level = large_page + 1;
 920                        if (!pte) {
 921                                tmp = align_to_level(tmp + 1, level + 1);
 922                                continue;
 923                        }
 924                        do {
 925                                if (dma_pte_present(pte)) {
 926                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 927                                        dma_clear_pte(pte);
 928                                }
 929                                pte++;
 930                                tmp += level_size(level);
 931                        } while (!first_pte_in_page(pte) &&
 932                                 tmp + level_size(level) - 1 <= last_pfn);
 933
 934                        domain_flush_cache(domain, first_pte,
 935                                           (void *)pte - (void *)first_pte);
 936                        
 937                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 938                level++;
 939        }
 940        /* free pgd */
 941        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 942                free_pgtable_page(domain->pgd);
 943                domain->pgd = NULL;
 944        }
 945}
 946
 947/* iommu handling */
 948static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 949{
 950        struct root_entry *root;
 951        unsigned long flags;
 952
 953        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 954        if (!root)
 955                return -ENOMEM;
 956
 957        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 958
 959        spin_lock_irqsave(&iommu->lock, flags);
 960        iommu->root_entry = root;
 961        spin_unlock_irqrestore(&iommu->lock, flags);
 962
 963        return 0;
 964}
 965
 966static void iommu_set_root_entry(struct intel_iommu *iommu)
 967{
 968        void *addr;
 969        u32 sts;
 970        unsigned long flag;
 971
 972        addr = iommu->root_entry;
 973
 974        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 975        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 976
 977        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 978
  979        /* Make sure hardware completes it */
 980        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 981                      readl, (sts & DMA_GSTS_RTPS), sts);
 982
 983        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 984}
 985
 986static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 987{
 988        u32 val;
 989        unsigned long flag;
 990
 991        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 992                return;
 993
 994        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 995        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 996
  997        /* Make sure hardware completes it */
 998        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 999                      readl, (!(val & DMA_GSTS_WBFS)), val);
1000
1001        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1002}
1003
 1004/* return value determines whether we need a write buffer flush */
1005static void __iommu_flush_context(struct intel_iommu *iommu,
1006                                  u16 did, u16 source_id, u8 function_mask,
1007                                  u64 type)
1008{
1009        u64 val = 0;
1010        unsigned long flag;
1011
1012        switch (type) {
1013        case DMA_CCMD_GLOBAL_INVL:
1014                val = DMA_CCMD_GLOBAL_INVL;
1015                break;
1016        case DMA_CCMD_DOMAIN_INVL:
1017                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1018                break;
1019        case DMA_CCMD_DEVICE_INVL:
1020                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1021                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1022                break;
1023        default:
1024                BUG();
1025        }
1026        val |= DMA_CCMD_ICC;
1027
1028        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1029        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1030
 1031        /* Make sure hardware completes it */
1032        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1033                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1034
1035        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1036}
1037
 1038/* return value determines whether we need a write buffer flush */
1039static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1040                                u64 addr, unsigned int size_order, u64 type)
1041{
1042        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1043        u64 val = 0, val_iva = 0;
1044        unsigned long flag;
1045
1046        switch (type) {
1047        case DMA_TLB_GLOBAL_FLUSH:
 1048                /* global flush doesn't need to set IVA_REG */
1049                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1050                break;
1051        case DMA_TLB_DSI_FLUSH:
1052                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053                break;
1054        case DMA_TLB_PSI_FLUSH:
1055                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1056                /* Note: always flush non-leaf currently */
1057                val_iva = size_order | addr;
1058                break;
1059        default:
1060                BUG();
1061        }
1062        /* Note: set drain read/write */
1063#if 0
1064        /*
 1065         * This is probably meant to be super secure. Looks like we can
1066         * ignore it without any impact.
1067         */
1068        if (cap_read_drain(iommu->cap))
1069                val |= DMA_TLB_READ_DRAIN;
1070#endif
1071        if (cap_write_drain(iommu->cap))
1072                val |= DMA_TLB_WRITE_DRAIN;
1073
1074        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1075        /* Note: Only uses first TLB reg currently */
1076        if (val_iva)
1077                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1078        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1079
 1080        /* Make sure hardware completes it */
1081        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1082                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1083
1084        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085
1086        /* check IOTLB invalidation granularity */
1087        if (DMA_TLB_IAIG(val) == 0)
1088                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1089        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1090                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1091                        (unsigned long long)DMA_TLB_IIRG(type),
1092                        (unsigned long long)DMA_TLB_IAIG(val));
1093}
1094
1095static struct device_domain_info *iommu_support_dev_iotlb(
1096        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1097{
1098        int found = 0;
1099        unsigned long flags;
1100        struct device_domain_info *info;
1101        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1102
1103        if (!ecap_dev_iotlb_support(iommu->ecap))
1104                return NULL;
1105
1106        if (!iommu->qi)
1107                return NULL;
1108
1109        spin_lock_irqsave(&device_domain_lock, flags);
1110        list_for_each_entry(info, &domain->devices, link)
1111                if (info->bus == bus && info->devfn == devfn) {
1112                        found = 1;
1113                        break;
1114                }
1115        spin_unlock_irqrestore(&device_domain_lock, flags);
1116
1117        if (!found || !info->dev)
1118                return NULL;
1119
1120        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1121                return NULL;
1122
1123        if (!dmar_find_matched_atsr_unit(info->dev))
1124                return NULL;
1125
1126        info->iommu = iommu;
1127
1128        return info;
1129}
1130
1131static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1132{
1133        if (!info)
1134                return;
1135
1136        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1137}
1138
1139static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1140{
1141        if (!info->dev || !pci_ats_enabled(info->dev))
1142                return;
1143
1144        pci_disable_ats(info->dev);
1145}
1146
1147static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1148                                  u64 addr, unsigned mask)
1149{
1150        u16 sid, qdep;
1151        unsigned long flags;
1152        struct device_domain_info *info;
1153
1154        spin_lock_irqsave(&device_domain_lock, flags);
1155        list_for_each_entry(info, &domain->devices, link) {
1156                if (!info->dev || !pci_ats_enabled(info->dev))
1157                        continue;
1158
1159                sid = info->bus << 8 | info->devfn;
1160                qdep = pci_ats_queue_depth(info->dev);
1161                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1162        }
1163        spin_unlock_irqrestore(&device_domain_lock, flags);
1164}
1165
1166static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1167                                  unsigned long pfn, unsigned int pages, int map)
1168{
1169        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1170        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1171
1172        BUG_ON(pages == 0);
1173
1174        /*
 1175         * Fall back to domain-selective flush if there is no PSI support or
 1176         * the size is too big.
 1177         * PSI requires the page count to be a power of two, and the base
 1178         * address to be naturally aligned to the size.
1179         */
1180        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1181                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1182                                                DMA_TLB_DSI_FLUSH);
1183        else
1184                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1185                                                DMA_TLB_PSI_FLUSH);
1186
1187        /*
1188         * In caching mode, changes of pages from non-present to present require
1189         * flush. However, device IOTLB doesn't need to be flushed in this case.
1190         */
1191        if (!cap_caching_mode(iommu->cap) || !map)
1192                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1193}
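/*
 * Editor's worked example (illustrative only): for pages = 9 the mask
 * above is ilog2(__roundup_pow_of_two(9)) = ilog2(16) = 4, so the PSI
 * request invalidates a naturally aligned block of 16 VT-d pages
 * containing pfn.  If that mask exceeds cap_max_amask_val(), the code
 * falls back to a domain-selective flush instead.
 */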
1194
1195static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1196{
1197        u32 pmen;
1198        unsigned long flags;
1199
1200        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1201        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1202        pmen &= ~DMA_PMEN_EPM;
1203        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1204
1205        /* wait for the protected region status bit to clear */
1206        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1207                readl, !(pmen & DMA_PMEN_PRS), pmen);
1208
1209        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1210}
1211
1212static int iommu_enable_translation(struct intel_iommu *iommu)
1213{
1214        u32 sts;
1215        unsigned long flags;
1216
1217        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218        iommu->gcmd |= DMA_GCMD_TE;
1219        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1220
 1221        /* Make sure hardware completes it */
1222        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                      readl, (sts & DMA_GSTS_TES), sts);
1224
1225        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226        return 0;
1227}
1228
1229static int iommu_disable_translation(struct intel_iommu *iommu)
1230{
1231        u32 sts;
1232        unsigned long flag;
1233
1234        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1235        iommu->gcmd &= ~DMA_GCMD_TE;
1236        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
 1238        /* Make sure hardware completes it */
1239        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240                      readl, (!(sts & DMA_GSTS_TES)), sts);
1241
1242        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243        return 0;
1244}
1245
1246
1247static int iommu_init_domains(struct intel_iommu *iommu)
1248{
1249        unsigned long ndomains;
1250        unsigned long nlongs;
1251
1252        ndomains = cap_ndoms(iommu->cap);
1253        pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1254                        ndomains);
1255        nlongs = BITS_TO_LONGS(ndomains);
1256
1257        spin_lock_init(&iommu->lock);
1258
1259        /* TBD: there might be 64K domains,
1260         * consider other allocation for future chip
1261         */
1262        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1263        if (!iommu->domain_ids) {
1264                printk(KERN_ERR "Allocating domain id array failed\n");
1265                return -ENOMEM;
1266        }
1267        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1268                        GFP_KERNEL);
1269        if (!iommu->domains) {
1270                printk(KERN_ERR "Allocating domain array failed\n");
1271                return -ENOMEM;
1272        }
1273
1274        /*
1275         * if Caching mode is set, then invalid translations are tagged
1276         * with domainid 0. Hence we need to pre-allocate it.
1277         */
1278        if (cap_caching_mode(iommu->cap))
1279                set_bit(0, iommu->domain_ids);
1280        return 0;
1281}
1282
1283
1284static void domain_exit(struct dmar_domain *domain);
1285static void vm_domain_exit(struct dmar_domain *domain);
1286
1287void free_dmar_iommu(struct intel_iommu *iommu)
1288{
1289        struct dmar_domain *domain;
1290        int i;
1291        unsigned long flags;
1292
1293        if ((iommu->domains) && (iommu->domain_ids)) {
1294                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1295                        domain = iommu->domains[i];
1296                        clear_bit(i, iommu->domain_ids);
1297
1298                        spin_lock_irqsave(&domain->iommu_lock, flags);
1299                        if (--domain->iommu_count == 0) {
1300                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1301                                        vm_domain_exit(domain);
1302                                else
1303                                        domain_exit(domain);
1304                        }
1305                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1306                }
1307        }
1308
1309        if (iommu->gcmd & DMA_GCMD_TE)
1310                iommu_disable_translation(iommu);
1311
1312        if (iommu->irq) {
1313                irq_set_handler_data(iommu->irq, NULL);
1314                /* This will mask the irq */
1315                free_irq(iommu->irq, iommu);
1316                destroy_irq(iommu->irq);
1317        }
1318
1319        kfree(iommu->domains);
1320        kfree(iommu->domain_ids);
1321
1322        g_iommus[iommu->seq_id] = NULL;
1323
1324        /* if all iommus are freed, free g_iommus */
1325        for (i = 0; i < g_num_of_iommus; i++) {
1326                if (g_iommus[i])
1327                        break;
1328        }
1329
1330        if (i == g_num_of_iommus)
1331                kfree(g_iommus);
1332
1333        /* free context mapping */
1334        free_context_table(iommu);
1335}
1336
1337static struct dmar_domain *alloc_domain(void)
1338{
1339        struct dmar_domain *domain;
1340
1341        domain = alloc_domain_mem();
1342        if (!domain)
1343                return NULL;
1344
1345        domain->nid = -1;
1346        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1347        domain->flags = 0;
1348
1349        return domain;
1350}
1351
1352static int iommu_attach_domain(struct dmar_domain *domain,
1353                               struct intel_iommu *iommu)
1354{
1355        int num;
1356        unsigned long ndomains;
1357        unsigned long flags;
1358
1359        ndomains = cap_ndoms(iommu->cap);
1360
1361        spin_lock_irqsave(&iommu->lock, flags);
1362
1363        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1364        if (num >= ndomains) {
1365                spin_unlock_irqrestore(&iommu->lock, flags);
1366                printk(KERN_ERR "IOMMU: no free domain ids\n");
1367                return -ENOMEM;
1368        }
1369
1370        domain->id = num;
1371        set_bit(num, iommu->domain_ids);
1372        set_bit(iommu->seq_id, domain->iommu_bmp);
1373        iommu->domains[num] = domain;
1374        spin_unlock_irqrestore(&iommu->lock, flags);
1375
1376        return 0;
1377}
1378
1379static void iommu_detach_domain(struct dmar_domain *domain,
1380                                struct intel_iommu *iommu)
1381{
1382        unsigned long flags;
1383        int num, ndomains;
1384        int found = 0;
1385
1386        spin_lock_irqsave(&iommu->lock, flags);
1387        ndomains = cap_ndoms(iommu->cap);
1388        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1389                if (iommu->domains[num] == domain) {
1390                        found = 1;
1391                        break;
1392                }
1393        }
1394
1395        if (found) {
1396                clear_bit(num, iommu->domain_ids);
1397                clear_bit(iommu->seq_id, domain->iommu_bmp);
1398                iommu->domains[num] = NULL;
1399        }
1400        spin_unlock_irqrestore(&iommu->lock, flags);
1401}
1402
1403static struct iova_domain reserved_iova_list;
1404static struct lock_class_key reserved_rbtree_key;
1405
1406static int dmar_init_reserved_ranges(void)
1407{
1408        struct pci_dev *pdev = NULL;
1409        struct iova *iova;
1410        int i;
1411
1412        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1413
1414        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1415                &reserved_rbtree_key);
1416
1417        /* IOAPIC ranges shouldn't be accessed by DMA */
1418        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1419                IOVA_PFN(IOAPIC_RANGE_END));
1420        if (!iova) {
1421                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1422                return -ENODEV;
1423        }
1424
1425        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1426        for_each_pci_dev(pdev) {
1427                struct resource *r;
1428
1429                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1430                        r = &pdev->resource[i];
1431                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1432                                continue;
1433                        iova = reserve_iova(&reserved_iova_list,
1434                                            IOVA_PFN(r->start),
1435                                            IOVA_PFN(r->end));
1436                        if (!iova) {
1437                                printk(KERN_ERR "Reserve iova failed\n");
1438                                return -ENODEV;
1439                        }
1440                }
1441        }
1442        return 0;
1443}
1444
1445static void domain_reserve_special_ranges(struct dmar_domain *domain)
1446{
1447        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1448}
1449
1450static inline int guestwidth_to_adjustwidth(int gaw)
1451{
1452        int agaw;
1453        int r = (gaw - 12) % 9;
1454
1455        if (r == 0)
1456                agaw = gaw;
1457        else
1458                agaw = gaw + 9 - r;
1459        if (agaw > 64)
1460                agaw = 64;
1461        return agaw;
1462}
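/*
 * Editor's worked example (illustrative only): the adjusted width is the
 * guest width rounded up so that (width - 12) is a multiple of 9, i.e.
 * so it fits an integral number of 9-bit page-table levels above the
 * 4KiB page offset.  guestwidth_to_adjustwidth(48) = 48 (already a
 * multiple), while guestwidth_to_adjustwidth(40) = 40 + 9 - 1 = 48.
 */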
1463
1464static int domain_init(struct dmar_domain *domain, int guest_width)
1465{
1466        struct intel_iommu *iommu;
1467        int adjust_width, agaw;
1468        unsigned long sagaw;
1469
1470        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1471        spin_lock_init(&domain->iommu_lock);
1472
1473        domain_reserve_special_ranges(domain);
1474
1475        /* calculate AGAW */
1476        iommu = domain_get_iommu(domain);
1477        if (guest_width > cap_mgaw(iommu->cap))
1478                guest_width = cap_mgaw(iommu->cap);
1479        domain->gaw = guest_width;
1480        adjust_width = guestwidth_to_adjustwidth(guest_width);
1481        agaw = width_to_agaw(adjust_width);
1482        sagaw = cap_sagaw(iommu->cap);
1483        if (!test_bit(agaw, &sagaw)) {
1484                /* hardware doesn't support it, choose a bigger one */
1485                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1486                agaw = find_next_bit(&sagaw, 5, agaw);
1487                if (agaw >= 5)
1488                        return -ENODEV;
1489        }
1490        domain->agaw = agaw;
1491        INIT_LIST_HEAD(&domain->devices);
1492
1493        if (ecap_coherent(iommu->ecap))
1494                domain->iommu_coherency = 1;
1495        else
1496                domain->iommu_coherency = 0;
1497
1498        if (ecap_sc_support(iommu->ecap))
1499                domain->iommu_snooping = 1;
1500        else
1501                domain->iommu_snooping = 0;
1502
1503        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1504        domain->iommu_count = 1;
1505        domain->nid = iommu->node;
1506
1507        /* always allocate the top pgd */
1508        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1509        if (!domain->pgd)
1510                return -ENOMEM;
1511        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1512        return 0;
1513}
1514
1515static void domain_exit(struct dmar_domain *domain)
1516{
1517        struct dmar_drhd_unit *drhd;
1518        struct intel_iommu *iommu;
1519
 1520        /* Domain 0 is reserved, so don't process it */
1521        if (!domain)
1522                return;
1523
1524        /* Flush any lazy unmaps that may reference this domain */
1525        if (!intel_iommu_strict)
1526                flush_unmaps_timeout(0);
1527
1528        domain_remove_dev_info(domain);
1529        /* destroy iovas */
1530        put_iova_domain(&domain->iovad);
1531
1532        /* clear ptes */
1533        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534
1535        /* free page tables */
1536        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1537
1538        for_each_active_iommu(iommu, drhd)
1539                if (test_bit(iommu->seq_id, domain->iommu_bmp))
1540                        iommu_detach_domain(domain, iommu);
1541
1542        free_domain_mem(domain);
1543}
1544
1545static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1546                                 u8 bus, u8 devfn, int translation)
1547{
1548        struct context_entry *context;
1549        unsigned long flags;
1550        struct intel_iommu *iommu;
1551        struct dma_pte *pgd;
1552        unsigned long num;
1553        unsigned long ndomains;
1554        int id;
1555        int agaw;
1556        struct device_domain_info *info = NULL;
1557
1558        pr_debug("Set context mapping for %02x:%02x.%d\n",
1559                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1560
1561        BUG_ON(!domain->pgd);
1562        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1563               translation != CONTEXT_TT_MULTI_LEVEL);
1564
1565        iommu = device_to_iommu(segment, bus, devfn);
1566        if (!iommu)
1567                return -ENODEV;
1568
1569        context = device_to_context_entry(iommu, bus, devfn);
1570        if (!context)
1571                return -ENOMEM;
1572        spin_lock_irqsave(&iommu->lock, flags);
1573        if (context_present(context)) {
1574                spin_unlock_irqrestore(&iommu->lock, flags);
1575                return 0;
1576        }
1577
1578        id = domain->id;
1579        pgd = domain->pgd;
1580
1581        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1582            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1583                int found = 0;
1584
1585                /* find an available domain id for this device in iommu */
1586                ndomains = cap_ndoms(iommu->cap);
1587                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1588                        if (iommu->domains[num] == domain) {
1589                                id = num;
1590                                found = 1;
1591                                break;
1592                        }
1593                }
1594
1595                if (found == 0) {
1596                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1597                        if (num >= ndomains) {
1598                                spin_unlock_irqrestore(&iommu->lock, flags);
1599                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1600                                return -EFAULT;
1601                        }
1602
1603                        set_bit(num, iommu->domain_ids);
1604                        iommu->domains[num] = domain;
1605                        id = num;
1606                }
1607
1608                /* Skip top levels of page tables for
1609                 * iommus whose agaw is smaller than the default.
1610                 * Unnecessary for PT mode.
1611                 */
1612                if (translation != CONTEXT_TT_PASS_THROUGH) {
1613                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1614                                pgd = phys_to_virt(dma_pte_addr(pgd));
1615                                if (!dma_pte_present(pgd)) {
1616                                        spin_unlock_irqrestore(&iommu->lock, flags);
1617                                        return -ENOMEM;
1618                                }
1619                        }
1620                }
1621        }
1622
1623        context_set_domain_id(context, id);
1624
1625        if (translation != CONTEXT_TT_PASS_THROUGH) {
1626                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1627                translation = info ? CONTEXT_TT_DEV_IOTLB :
1628                                     CONTEXT_TT_MULTI_LEVEL;
1629        }
1630        /*
1631         * In pass through mode, AW must be programmed to indicate the largest
1632         * AGAW value supported by hardware. And ASR is ignored by hardware.
1633         */
1634        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1635                context_set_address_width(context, iommu->msagaw);
1636        else {
1637                context_set_address_root(context, virt_to_phys(pgd));
1638                context_set_address_width(context, iommu->agaw);
1639        }
1640
1641        context_set_translation_type(context, translation);
1642        context_set_fault_enable(context);
1643        context_set_present(context);
1644        domain_flush_cache(domain, context, sizeof(*context));
1645
1646        /*
1647         * It's a non-present to present mapping. If hardware doesn't cache
1648         * non-present entries we only need to flush the write-buffer. If it
1649         * _does_ cache non-present entries, then it does so in the special
1650         * domain #0, which we have to flush:
1651         */
1652        if (cap_caching_mode(iommu->cap)) {
1653                iommu->flush.flush_context(iommu, 0,
1654                                           (((u16)bus) << 8) | devfn,
1655                                           DMA_CCMD_MASK_NOBIT,
1656                                           DMA_CCMD_DEVICE_INVL);
1657                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1658        } else {
1659                iommu_flush_write_buffer(iommu);
1660        }
1661        iommu_enable_dev_iotlb(info);
1662        spin_unlock_irqrestore(&iommu->lock, flags);
1663
1664        spin_lock_irqsave(&domain->iommu_lock, flags);
1665        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1666                domain->iommu_count++;
1667                if (domain->iommu_count == 1)
1668                        domain->nid = iommu->node;
1669                domain_update_iommu_cap(domain);
1670        }
1671        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1672        return 0;
1673}
1674
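/*
 * Set up context entries for @pdev and, if it sits behind a PCIe-to-PCI
 * bridge, for every bridge on the upstream path as well, since requests
 * from such devices carry the bridge's source-id.
 */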
1675static int
1676domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1677                        int translation)
1678{
1679        int ret;
1680        struct pci_dev *tmp, *parent;
1681
1682        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1683                                         pdev->bus->number, pdev->devfn,
1684                                         translation);
1685        if (ret)
1686                return ret;
1687
1688        /* dependent device mapping */
1689        tmp = pci_find_upstream_pcie_bridge(pdev);
1690        if (!tmp)
1691                return 0;
1692        /* Secondary interface's bus number and devfn 0 */
1693        parent = pdev->bus->self;
1694        while (parent != tmp) {
1695                ret = domain_context_mapping_one(domain,
1696                                                 pci_domain_nr(parent->bus),
1697                                                 parent->bus->number,
1698                                                 parent->devfn, translation);
1699                if (ret)
1700                        return ret;
1701                parent = parent->bus->self;
1702        }
1703        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1704                return domain_context_mapping_one(domain,
1705                                        pci_domain_nr(tmp->subordinate),
1706                                        tmp->subordinate->number, 0,
1707                                        translation);
1708        else /* this is a legacy PCI bridge */
1709                return domain_context_mapping_one(domain,
1710                                                  pci_domain_nr(tmp->bus),
1711                                                  tmp->bus->number,
1712                                                  tmp->devfn,
1713                                                  translation);
1714}
1715
1716static int domain_context_mapped(struct pci_dev *pdev)
1717{
1718        int ret;
1719        struct pci_dev *tmp, *parent;
1720        struct intel_iommu *iommu;
1721
1722        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1723                                pdev->devfn);
1724        if (!iommu)
1725                return -ENODEV;
1726
1727        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1728        if (!ret)
1729                return ret;
1730        /* dependent device mapping */
1731        tmp = pci_find_upstream_pcie_bridge(pdev);
1732        if (!tmp)
1733                return ret;
1734        /* Secondary interface's bus number and devfn 0 */
1735        parent = pdev->bus->self;
1736        while (parent != tmp) {
1737                ret = device_context_mapped(iommu, parent->bus->number,
1738                                            parent->devfn);
1739                if (!ret)
1740                        return ret;
1741                parent = parent->bus->self;
1742        }
1743        if (pci_is_pcie(tmp))
1744                return device_context_mapped(iommu, tmp->subordinate->number,
1745                                             0);
1746        else
1747                return device_context_mapped(iommu, tmp->bus->number,
1748                                             tmp->devfn);
1749}
1750
1751/* Return the number of VT-d pages needed, rounded up to cover whole MM pages */
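/*
 * Example (assuming the usual 4KiB MM and VT-d page size): host_addr = 0x1ffc
 * and size = 8 straddle an MM page boundary, so PAGE_ALIGN(0xffc + 8) >> 12
 * gives 2 VT-d pages even though only 8 bytes are mapped.
 */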
1752static inline unsigned long aligned_nrpages(unsigned long host_addr,
1753                                            size_t size)
1754{
1755        host_addr &= ~PAGE_MASK;
1756        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1757}
1758
1759/* Return largest possible superpage level for a given mapping */
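/*
 * Example (assuming the usual 9-bit stride, i.e. 512 4KiB pages per 2MiB
 * superpage): if iov_pfn and phy_pfn are both 512-page aligned, at least 512
 * pages are being mapped and the IOMMU advertises one level of superpage
 * support, this returns level 2 and a single 2MiB PTE can be used.
 */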
1760static inline int hardware_largepage_caps(struct dmar_domain *domain,
1761                                          unsigned long iov_pfn,
1762                                          unsigned long phy_pfn,
1763                                          unsigned long pages)
1764{
1765        int support, level = 1;
1766        unsigned long pfnmerge;
1767
1768        support = domain->iommu_superpage;
1769
1770        /* To use a large page, the virtual *and* physical addresses
1771           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1772           of them will mean we have to use smaller pages. So just
1773           merge them and check both at once. */
1774        pfnmerge = iov_pfn | phy_pfn;
1775
1776        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1777                pages >>= VTD_STRIDE_SHIFT;
1778                if (!pages)
1779                        break;
1780                pfnmerge >>= VTD_STRIDE_SHIFT;
1781                level++;
1782                support--;
1783        }
1784        return level;
1785}
1786
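/*
 * Install PTEs for @nr_pages starting at @iov_pfn, taking the physical pages
 * either from @sg or from the contiguous range starting at @phys_pfn. Uses
 * superpage PTEs when both addresses and the remaining length allow it, and
 * flushes the new PTEs to memory one page of entries at a time.
 */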
1787static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1788                            struct scatterlist *sg, unsigned long phys_pfn,
1789                            unsigned long nr_pages, int prot)
1790{
1791        struct dma_pte *first_pte = NULL, *pte = NULL;
1792        phys_addr_t uninitialized_var(pteval);
1793        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1794        unsigned long sg_res;
1795        unsigned int largepage_lvl = 0;
1796        unsigned long lvl_pages = 0;
1797
1798        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1799
1800        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1801                return -EINVAL;
1802
1803        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1804
1805        if (sg)
1806                sg_res = 0;
1807        else {
1808                sg_res = nr_pages + 1;
1809                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1810        }
1811
1812        while (nr_pages > 0) {
1813                uint64_t tmp;
1814
1815                if (!sg_res) {
1816                        sg_res = aligned_nrpages(sg->offset, sg->length);
1817                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1818                        sg->dma_length = sg->length;
1819                        pteval = page_to_phys(sg_page(sg)) | prot;
1820                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1821                }
1822
1823                if (!pte) {
1824                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1825
1826                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1827                        if (!pte)
1828                                return -ENOMEM;
1829                        /* It is a large page */
1830                        if (largepage_lvl > 1) {
1831                                pteval |= DMA_PTE_LARGE_PAGE;
1832                                /* Ensure that old small page tables are removed to make room
1833                                   for the superpage, if they exist. */
1834                                dma_pte_clear_range(domain, iov_pfn,
1835                                                    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1836                                dma_pte_free_pagetable(domain, iov_pfn,
1837                                                       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838                        } else {
1839                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1840                        }
1841
1842                }
1843                /* We don't need a lock here; nobody else
1844                 * touches this iova range
1845                 */
1846                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1847                if (tmp) {
1848                        static int dumps = 5;
1849                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1850                               iov_pfn, tmp, (unsigned long long)pteval);
1851                        if (dumps) {
1852                                dumps--;
1853                                debug_dma_dump_mappings(NULL);
1854                        }
1855                        WARN_ON(1);
1856                }
1857
1858                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1859
1860                BUG_ON(nr_pages < lvl_pages);
1861                BUG_ON(sg_res < lvl_pages);
1862
1863                nr_pages -= lvl_pages;
1864                iov_pfn += lvl_pages;
1865                phys_pfn += lvl_pages;
1866                pteval += lvl_pages * VTD_PAGE_SIZE;
1867                sg_res -= lvl_pages;
1868
1869                /* If the next PTE would be the first in a new page, then we
1870                   need to flush the cache on the entries we've just written.
1871                   And then we'll need to recalculate 'pte', so clear it and
1872                   let it get set again in the if (!pte) block above.
1873
1874                   If we're done (!nr_pages) we need to flush the cache too.
1875
1876                   Also if we've been setting superpages, we may need to
1877                   recalculate 'pte' and switch back to smaller pages for the
1878                   end of the mapping, if the trailing size is not enough to
1879                   use another superpage (i.e. sg_res < lvl_pages). */
1880                pte++;
1881                if (!nr_pages || first_pte_in_page(pte) ||
1882                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1883                        domain_flush_cache(domain, first_pte,
1884                                           (void *)pte - (void *)first_pte);
1885                        pte = NULL;
1886                }
1887
1888                if (!sg_res && nr_pages)
1889                        sg = sg_next(sg);
1890        }
1891        return 0;
1892}
1893
1894static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1895                                    struct scatterlist *sg, unsigned long nr_pages,
1896                                    int prot)
1897{
1898        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1899}
1900
1901static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1902                                     unsigned long phys_pfn, unsigned long nr_pages,
1903                                     int prot)
1904{
1905        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1906}
1907
1908static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1909{
1910        if (!iommu)
1911                return;
1912
1913        clear_context_table(iommu, bus, devfn);
1914        iommu->flush.flush_context(iommu, 0, 0, 0,
1915                                           DMA_CCMD_GLOBAL_INVL);
1916        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1917}
1918
1919static inline void unlink_domain_info(struct device_domain_info *info)
1920{
1921        assert_spin_locked(&device_domain_lock);
1922        list_del(&info->link);
1923        list_del(&info->global);
1924        if (info->dev)
1925                info->dev->dev.archdata.iommu = NULL;
1926}
1927
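/*
 * Detach every device from @domain: unlink each device_domain_info, disable
 * its device-IOTLB and clear its context entry on the owning IOMMU.
 */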
1928static void domain_remove_dev_info(struct dmar_domain *domain)
1929{
1930        struct device_domain_info *info;
1931        unsigned long flags;
1932        struct intel_iommu *iommu;
1933
1934        spin_lock_irqsave(&device_domain_lock, flags);
1935        while (!list_empty(&domain->devices)) {
1936                info = list_entry(domain->devices.next,
1937                        struct device_domain_info, link);
1938                unlink_domain_info(info);
1939                spin_unlock_irqrestore(&device_domain_lock, flags);
1940
1941                iommu_disable_dev_iotlb(info);
1942                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1943                iommu_detach_dev(iommu, info->bus, info->devfn);
1944                free_devinfo_mem(info);
1945
1946                spin_lock_irqsave(&device_domain_lock, flags);
1947        }
1948        spin_unlock_irqrestore(&device_domain_lock, flags);
1949}
1950
1951/*
1952 * find_domain
1953 * Note: the device_domain_info is stored in struct pci_dev->dev.archdata.iommu
1954 */
1955static struct dmar_domain *
1956find_domain(struct pci_dev *pdev)
1957{
1958        struct device_domain_info *info;
1959
1960        /* No lock here, assumes no domain exit in normal case */
1961        info = pdev->dev.archdata.iommu;
1962        if (info)
1963                return info->domain;
1964        return NULL;
1965}
1966
1967/* Find or allocate an initialized domain for @pdev */
1968static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1969{
1970        struct dmar_domain *domain, *found = NULL;
1971        struct intel_iommu *iommu;
1972        struct dmar_drhd_unit *drhd;
1973        struct device_domain_info *info, *tmp;
1974        struct pci_dev *dev_tmp;
1975        unsigned long flags;
1976        int bus = 0, devfn = 0;
1977        int segment;
1978        int ret;
1979
1980        domain = find_domain(pdev);
1981        if (domain)
1982                return domain;
1983
1984        segment = pci_domain_nr(pdev->bus);
1985
1986        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1987        if (dev_tmp) {
1988                if (pci_is_pcie(dev_tmp)) {
1989                        bus = dev_tmp->subordinate->number;
1990                        devfn = 0;
1991                } else {
1992                        bus = dev_tmp->bus->number;
1993                        devfn = dev_tmp->devfn;
1994                }
1995                spin_lock_irqsave(&device_domain_lock, flags);
1996                list_for_each_entry(info, &device_domain_list, global) {
1997                        if (info->segment == segment &&
1998                            info->bus == bus && info->devfn == devfn) {
1999                                found = info->domain;
2000                                break;
2001                        }
2002                }
2003                spin_unlock_irqrestore(&device_domain_lock, flags);
2004                /* pcie-pci bridge already has a domain, use it */
2005                if (found) {
2006                        domain = found;
2007                        goto found_domain;
2008                }
2009        }
2010
2011        domain = alloc_domain();
2012        if (!domain)
2013                goto error;
2014
2015        /* Allocate new domain for the device */
2016        drhd = dmar_find_matched_drhd_unit(pdev);
2017        if (!drhd) {
2018                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2019                        pci_name(pdev));
2020                free_domain_mem(domain);
2021                return NULL;
2022        }
2023        iommu = drhd->iommu;
2024
2025        ret = iommu_attach_domain(domain, iommu);
2026        if (ret) {
2027                free_domain_mem(domain);
2028                goto error;
2029        }
2030
2031        if (domain_init(domain, gaw)) {
2032                domain_exit(domain);
2033                goto error;
2034        }
2035
2036        /* register pcie-to-pci device */
2037        if (dev_tmp) {
2038                info = alloc_devinfo_mem();
2039                if (!info) {
2040                        domain_exit(domain);
2041                        goto error;
2042                }
2043                info->segment = segment;
2044                info->bus = bus;
2045                info->devfn = devfn;
2046                info->dev = NULL;
2047                info->domain = domain;
2048                /* This domain is shared by devices under the p2p bridge */
2049                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2050
2051                /* pcie-to-pci bridge already has a domain, use it */
2052                found = NULL;
2053                spin_lock_irqsave(&device_domain_lock, flags);
2054                list_for_each_entry(tmp, &device_domain_list, global) {
2055                        if (tmp->segment == segment &&
2056                            tmp->bus == bus && tmp->devfn == devfn) {
2057                                found = tmp->domain;
2058                                break;
2059                        }
2060                }
2061                if (found) {
2062                        spin_unlock_irqrestore(&device_domain_lock, flags);
2063                        free_devinfo_mem(info);
2064                        domain_exit(domain);
2065                        domain = found;
2066                } else {
2067                        list_add(&info->link, &domain->devices);
2068                        list_add(&info->global, &device_domain_list);
2069                        spin_unlock_irqrestore(&device_domain_lock, flags);
2070                }
2071        }
2072
2073found_domain:
2074        info = alloc_devinfo_mem();
2075        if (!info)
2076                goto error;
2077        info->segment = segment;
2078        info->bus = pdev->bus->number;
2079        info->devfn = pdev->devfn;
2080        info->dev = pdev;
2081        info->domain = domain;
2082        spin_lock_irqsave(&device_domain_lock, flags);
2083        /* somebody else raced us and already set up a domain */
2084        found = find_domain(pdev);
2085        if (found != NULL) {
2086                spin_unlock_irqrestore(&device_domain_lock, flags);
2087                if (found != domain) {
2088                        domain_exit(domain);
2089                        domain = found;
2090                }
2091                free_devinfo_mem(info);
2092                return domain;
2093        }
2094        list_add(&info->link, &domain->devices);
2095        list_add(&info->global, &device_domain_list);
2096        pdev->dev.archdata.iommu = info;
2097        spin_unlock_irqrestore(&device_domain_lock, flags);
2098        return domain;
2099error:
2100        /* recheck here; somebody else may have set it meanwhile */
2101        return find_domain(pdev);
2102}
2103
2104static int iommu_identity_mapping;
2105#define IDENTMAP_ALL            1
2106#define IDENTMAP_GFX            2
2107#define IDENTMAP_AZALIA         4
2108
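/*
 * Reserve the IOVA range and install a 1:1 (identity) mapping for physical
 * addresses [start, end] in @domain, clearing any existing PTEs first.
 */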
2109static int iommu_domain_identity_map(struct dmar_domain *domain,
2110                                     unsigned long long start,
2111                                     unsigned long long end)
2112{
2113        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2114        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2115
2116        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2117                          dma_to_mm_pfn(last_vpfn))) {
2118                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2119                return -ENOMEM;
2120        }
2121
2122        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2123                 start, end, domain->id);
2124        /*
2125         * RMRR range might have overlap with physical memory range,
2126         * clear it first
2127         */
2128        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2129
2130        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2131                                  last_vpfn - first_vpfn + 1,
2132                                  DMA_PTE_READ|DMA_PTE_WRITE);
2133}
2134
2135static int iommu_prepare_identity_map(struct pci_dev *pdev,
2136                                      unsigned long long start,
2137                                      unsigned long long end)
2138{
2139        struct dmar_domain *domain;
2140        int ret;
2141
2142        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2143        if (!domain)
2144                return -ENOMEM;
2145
2146        /* For _hardware_ passthrough, don't bother. But for software
2147           passthrough, we do it anyway -- it may indicate a memory
2148           range which is reserved in E820, and so didn't get set
2149           up to start with in si_domain */
2150        if (domain == si_domain && hw_pass_through) {
2151                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2152                       pci_name(pdev), start, end);
2153                return 0;
2154        }
2155
2156        printk(KERN_INFO
2157               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2158               pci_name(pdev), start, end);
2159
2160        if (end < start) {
2161                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2162                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2163                        dmi_get_system_info(DMI_BIOS_VENDOR),
2164                        dmi_get_system_info(DMI_BIOS_VERSION),
2165                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2166                ret = -EIO;
2167                goto error;
2168        }
2169
2170        if (end >> agaw_to_width(domain->agaw)) {
2171                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2172                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2173                     agaw_to_width(domain->agaw),
2174                     dmi_get_system_info(DMI_BIOS_VENDOR),
2175                     dmi_get_system_info(DMI_BIOS_VERSION),
2176                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2177                ret = -EIO;
2178                goto error;
2179        }
2180
2181        ret = iommu_domain_identity_map(domain, start, end);
2182        if (ret)
2183                goto error;
2184
2185        /* context entry init */
2186        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2187        if (ret)
2188                goto error;
2189
2190        return 0;
2191
2192 error:
2193        domain_exit(domain);
2194        return ret;
2195}
2196
2197static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2198        struct pci_dev *pdev)
2199{
2200        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2201                return 0;
2202        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2203                rmrr->end_address);
2204}
2205
2206#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2207static inline void iommu_prepare_isa(void)
2208{
2209        struct pci_dev *pdev;
2210        int ret;
2211
2212        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2213        if (!pdev)
2214                return;
2215
2216        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2217        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2218
2219        if (ret)
2220                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2221                       "floppy might not work\n");
2222
2223}
2224#else
2225static inline void iommu_prepare_isa(void)
2226{
2227        return;
2228}
2229#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2230
2231static int md_domain_init(struct dmar_domain *domain, int guest_width);
2232
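/*
 * Set up the static identity (si) domain: attach it to every active IOMMU
 * and, unless hardware pass-through is used, install 1:1 mappings for every
 * usable RAM range of every online node.
 */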
2233static int __init si_domain_init(int hw)
2234{
2235        struct dmar_drhd_unit *drhd;
2236        struct intel_iommu *iommu;
2237        int nid, ret = 0;
2238
2239        si_domain = alloc_domain();
2240        if (!si_domain)
2241                return -EFAULT;
2242
2243        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2244
2245        for_each_active_iommu(iommu, drhd) {
2246                ret = iommu_attach_domain(si_domain, iommu);
2247                if (ret) {
2248                        domain_exit(si_domain);
2249                        return -EFAULT;
2250                }
2251        }
2252
2253        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2254                domain_exit(si_domain);
2255                return -EFAULT;
2256        }
2257
2258        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2259
2260        if (hw)
2261                return 0;
2262
2263        for_each_online_node(nid) {
2264                unsigned long start_pfn, end_pfn;
2265                int i;
2266
2267                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2268                        ret = iommu_domain_identity_map(si_domain,
2269                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2270                        if (ret)
2271                                return ret;
2272                }
2273        }
2274
2275        return 0;
2276}
2277
2278static void domain_remove_one_dev_info(struct dmar_domain *domain,
2279                                          struct pci_dev *pdev);
2280static int identity_mapping(struct pci_dev *pdev)
2281{
2282        struct device_domain_info *info;
2283
2284        if (likely(!iommu_identity_mapping))
2285                return 0;
2286
2287        info = pdev->dev.archdata.iommu;
2288        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2289                return (info->domain == si_domain);
2290
2291        return 0;
2292}
2293
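/*
 * Bind @pdev to @domain: allocate and link its device_domain_info and program
 * the context entry with the requested @translation type, unwinding on error.
 */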
2294static int domain_add_dev_info(struct dmar_domain *domain,
2295                               struct pci_dev *pdev,
2296                               int translation)
2297{
2298        struct device_domain_info *info;
2299        unsigned long flags;
2300        int ret;
2301
2302        info = alloc_devinfo_mem();
2303        if (!info)
2304                return -ENOMEM;
2305
2306        info->segment = pci_domain_nr(pdev->bus);
2307        info->bus = pdev->bus->number;
2308        info->devfn = pdev->devfn;
2309        info->dev = pdev;
2310        info->domain = domain;
2311
2312        spin_lock_irqsave(&device_domain_lock, flags);
2313        list_add(&info->link, &domain->devices);
2314        list_add(&info->global, &device_domain_list);
2315        pdev->dev.archdata.iommu = info;
2316        spin_unlock_irqrestore(&device_domain_lock, flags);
2317
2318        ret = domain_context_mapping(domain, pdev, translation);
2319        if (ret) {
2320                spin_lock_irqsave(&device_domain_lock, flags);
2321                unlink_domain_info(info);
2322                spin_unlock_irqrestore(&device_domain_lock, flags);
2323                free_devinfo_mem(info);
2324                return ret;
2325        }
2326
2327        return 0;
2328}
2329
2330static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2331{
2332        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2333                return 1;
2334
2335        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2336                return 1;
2337
2338        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2339                return 0;
2340
2341        /*
2342         * We want to start off with all devices in the 1:1 domain, and
2343         * take them out later if we find they can't access all of memory.
2344         *
2345         * However, we can't do this for PCI devices behind bridges,
2346         * because all PCI devices behind the same bridge will end up
2347         * with the same source-id on their transactions.
2348         *
2349         * Practically speaking, we can't change things around for these
2350         * devices at run-time, because we can't be sure there'll be no
2351         * DMA transactions in flight for any of their siblings.
2352         * 
2353         * So PCI devices (unless they're on the root bus) as well as
2354         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2355         * the 1:1 domain, just in _case_ one of their siblings turns out
2356         * not to be able to map all of memory.
2357         */
2358        if (!pci_is_pcie(pdev)) {
2359                if (!pci_is_root_bus(pdev->bus))
2360                        return 0;
2361                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2362                        return 0;
2363        } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2364                return 0;
2365
2366        /* 
2367         * At boot time, we don't yet know if devices will be 64-bit capable.
2368         * Assume that they will -- if they turn out not to be, then we can 
2369         * take them out of the 1:1 domain later.
2370         */
2371        if (!startup) {
2372                /*
2373                 * If the device's dma_mask is less than the system's memory
2374                 * size then this is not a candidate for identity mapping.
2375                 */
2376                u64 dma_mask = pdev->dma_mask;
2377
2378                if (pdev->dev.coherent_dma_mask &&
2379                    pdev->dev.coherent_dma_mask < dma_mask)
2380                        dma_mask = pdev->dev.coherent_dma_mask;
2381
2382                return dma_mask >= dma_get_required_mask(&pdev->dev);
2383        }
2384
2385        return 1;
2386}
2387
2388static int __init iommu_prepare_static_identity_mapping(int hw)
2389{
2390        struct pci_dev *pdev = NULL;
2391        int ret;
2392
2393        ret = si_domain_init(hw);
2394        if (ret)
2395                return -EFAULT;
2396
2397        for_each_pci_dev(pdev) {
2398                if (iommu_should_identity_map(pdev, 1)) {
2399                        ret = domain_add_dev_info(si_domain, pdev,
2400                                             hw ? CONTEXT_TT_PASS_THROUGH :
2401                                                  CONTEXT_TT_MULTI_LEVEL);
2402                        if (ret) {
2403                                /* device not associated with an iommu */
2404                                if (ret == -ENODEV)
2405                                        continue;
2406                                return ret;
2407                        }
2408                        pr_info("IOMMU: %s identity mapping for device %s\n",
2409                                hw ? "hardware" : "software", pci_name(pdev));
2410                }
2411        }
2412
2413        return 0;
2414}
2415
2416static int __init init_dmars(void)
2417{
2418        struct dmar_drhd_unit *drhd;
2419        struct dmar_rmrr_unit *rmrr;
2420        struct pci_dev *pdev;
2421        struct intel_iommu *iommu;
2422        int i, ret;
2423
2424        /*
2425         * for each drhd
2426         *    allocate root
2427         *    initialize and program root entry to not present
2428         * endfor
2429         */
2430        for_each_drhd_unit(drhd) {
2431                /*
2432                 * lock not needed as this is only incremented in the
2433                 * single-threaded kernel __init code path; all other
2434                 * accesses are read only
2435                 */
2436                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2437                        g_num_of_iommus++;
2438                        continue;
2439                }
2440                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2441                          IOMMU_UNITS_SUPPORTED);
2442        }
2443
2444        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2445                        GFP_KERNEL);
2446        if (!g_iommus) {
2447                printk(KERN_ERR "Allocating global iommu array failed\n");
2448                ret = -ENOMEM;
2449                goto error;
2450        }
2451
2452        deferred_flush = kzalloc(g_num_of_iommus *
2453                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2454        if (!deferred_flush) {
2455                ret = -ENOMEM;
2456                goto error;
2457        }
2458
2459        for_each_drhd_unit(drhd) {
2460                if (drhd->ignored)
2461                        continue;
2462
2463                iommu = drhd->iommu;
2464                g_iommus[iommu->seq_id] = iommu;
2465
2466                ret = iommu_init_domains(iommu);
2467                if (ret)
2468                        goto error;
2469
2470                /*
2471                 * TBD:
2472                 * we could share the same root & context tables
2473                 * among all IOMMUs. This needs to be split out later.
2474                 */
2475                ret = iommu_alloc_root_entry(iommu);
2476                if (ret) {
2477                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2478                        goto error;
2479                }
2480                if (!ecap_pass_through(iommu->ecap))
2481                        hw_pass_through = 0;
2482        }
2483
2484        /*
2485         * Start from a sane IOMMU hardware state.
2486         */
2487        for_each_drhd_unit(drhd) {
2488                if (drhd->ignored)
2489                        continue;
2490
2491                iommu = drhd->iommu;
2492
2493                /*
2494                 * If the queued invalidation is already initialized by us
2495                 * (for example, while enabling interrupt-remapping) then
2496                 * things are already rolling from a sane state.
2497                 */
2498                if (iommu->qi)
2499                        continue;
2500
2501                /*
2502                 * Clear any previous faults.
2503                 */
2504                dmar_fault(-1, iommu);
2505                /*
2506                 * Disable queued invalidation if supported and already enabled
2507                 * before OS handover.
2508                 */
2509                dmar_disable_qi(iommu);
2510        }
2511
2512        for_each_drhd_unit(drhd) {
2513                if (drhd->ignored)
2514                        continue;
2515
2516                iommu = drhd->iommu;
2517
2518                if (dmar_enable_qi(iommu)) {
2519                        /*
2520                         * Queued Invalidate not enabled, use Register Based
2521                         * Invalidate
2522                         */
2523                        iommu->flush.flush_context = __iommu_flush_context;
2524                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2525                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2526                               "invalidation\n",
2527                                iommu->seq_id,
2528                               (unsigned long long)drhd->reg_base_addr);
2529                } else {
2530                        iommu->flush.flush_context = qi_flush_context;
2531                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2532                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2533                               "invalidation\n",
2534                                iommu->seq_id,
2535                               (unsigned long long)drhd->reg_base_addr);
2536                }
2537        }
2538
2539        if (iommu_pass_through)
2540                iommu_identity_mapping |= IDENTMAP_ALL;
2541
2542#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2543        iommu_identity_mapping |= IDENTMAP_GFX;
2544#endif
2545
2546        check_tylersburg_isoch();
2547
2548        /*
2549         * If any identity mapping is requested (pass-through, or the gfx
2550         * and azalia quirks), set up the static identity domain now and
2551         * add the qualifying devices to it; RMRR and ISA ranges follow below.
2552         */
2553        if (iommu_identity_mapping) {
2554                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2555                if (ret) {
2556                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2557                        goto error;
2558                }
2559        }
2560        /*
2561         * For each rmrr
2562         *   for each dev attached to rmrr
2563         *   do
2564         *     locate drhd for dev, alloc domain for dev
2565         *     allocate free domain
2566         *     allocate page table entries for rmrr
2567         *     if context not allocated for bus
2568         *           allocate and init context
2569         *           set present in root table for this bus
2570         *     init context with domain, translation etc
2571         *    endfor
2572         * endfor
2573         */
2574        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2575        for_each_rmrr_units(rmrr) {
2576                for (i = 0; i < rmrr->devices_cnt; i++) {
2577                        pdev = rmrr->devices[i];
2578                        /*
2579                         * some BIOSes list non-existent devices in the
2580                         * DMAR table.
2581                         */
2582                        if (!pdev)
2583                                continue;
2584                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2585                        if (ret)
2586                                printk(KERN_ERR
2587                                       "IOMMU: mapping reserved region failed\n");
2588                }
2589        }
2590
2591        iommu_prepare_isa();
2592
2593        /*
2594         * for each drhd
2595         *   enable fault log
2596         *   global invalidate context cache
2597         *   global invalidate iotlb
2598         *   enable translation
2599         */
2600        for_each_drhd_unit(drhd) {
2601                if (drhd->ignored) {
2602                        /*
2603                         * we always have to disable PMRs or DMA may fail on
2604                         * this device
2605                         */
2606                        if (force_on)
2607                                iommu_disable_protect_mem_regions(drhd->iommu);
2608                        continue;
2609                }
2610                iommu = drhd->iommu;
2611
2612                iommu_flush_write_buffer(iommu);
2613
2614                ret = dmar_set_interrupt(iommu);
2615                if (ret)
2616                        goto error;
2617
2618                iommu_set_root_entry(iommu);
2619
2620                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2621                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2622
2623                ret = iommu_enable_translation(iommu);
2624                if (ret)
2625                        goto error;
2626
2627                iommu_disable_protect_mem_regions(iommu);
2628        }
2629
2630        return 0;
2631error:
2632        for_each_drhd_unit(drhd) {
2633                if (drhd->ignored)
2634                        continue;
2635                iommu = drhd->iommu;
2636                free_iommu(iommu);
2637        }
2638        kfree(g_iommus);
2639        return ret;
2640}
2641
2642/* This takes a number of _MM_ pages, not VTD pages */
2643static struct iova *intel_alloc_iova(struct device *dev,
2644                                     struct dmar_domain *domain,
2645                                     unsigned long nrpages, uint64_t dma_mask)
2646{
2647        struct pci_dev *pdev = to_pci_dev(dev);
2648        struct iova *iova = NULL;
2649
2650        /* Restrict dma_mask to the width that the iommu can handle */
2651        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2652
2653        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2654                /*
2655                 * First try to allocate an io virtual address in
2656                 * DMA_BIT_MASK(32) and if that fails then try allocating
2657                 * from higher range
2658                 */
2659                iova = alloc_iova(&domain->iovad, nrpages,
2660                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2661                if (iova)
2662                        return iova;
2663        }
2664        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2665        if (unlikely(!iova)) {
2666                printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2667                       nrpages, pci_name(pdev));
2668                return NULL;
2669        }
2670
2671        return iova;
2672}
2673
2674static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2675{
2676        struct dmar_domain *domain;
2677        int ret;
2678
2679        domain = get_domain_for_dev(pdev,
2680                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2681        if (!domain) {
2682                printk(KERN_ERR
2683                        "Allocating domain for %s failed\n", pci_name(pdev));
2684                return NULL;
2685        }
2686
2687        /* make sure context mapping is ok */
2688        if (unlikely(!domain_context_mapped(pdev))) {
2689                ret = domain_context_mapping(domain, pdev,
2690                                             CONTEXT_TT_MULTI_LEVEL);
2691                if (ret) {
2692                        printk(KERN_ERR
2693                                "Domain context map for %s failed\n",
2694                                pci_name(pdev));
2695                        return NULL;
2696                }
2697        }
2698
2699        return domain;
2700}
2701
2702static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2703{
2704        struct device_domain_info *info;
2705
2706        /* No lock here, assumes no domain exit in normal case */
2707        info = dev->dev.archdata.iommu;
2708        if (likely(info))
2709                return info->domain;
2710
2711        return __get_valid_domain_for_dev(dev);
2712}
2713
2714static int iommu_dummy(struct pci_dev *pdev)
2715{
2716        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2717}
2718
2719/* Check whether the pdev needs to go through the non-identity map/unmap path. */
2720static int iommu_no_mapping(struct device *dev)
2721{
2722        struct pci_dev *pdev;
2723        int found;
2724
2725        if (unlikely(dev->bus != &pci_bus_type))
2726                return 1;
2727
2728        pdev = to_pci_dev(dev);
2729        if (iommu_dummy(pdev))
2730                return 1;
2731
2732        if (!iommu_identity_mapping)
2733                return 0;
2734
2735        found = identity_mapping(pdev);
2736        if (found) {
2737                if (iommu_should_identity_map(pdev, 0))
2738                        return 1;
2739                else {
2740                        /*
2741                         * The device is only 32 bit DMA capable: remove it
2742                         * from si_domain and fall back to non-identity mapping.
2743                         */
2744                        domain_remove_one_dev_info(si_domain, pdev);
2745                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2746                               pci_name(pdev));
2747                        return 0;
2748                }
2749        } else {
2750                /*
2751                 * A 64 bit DMA capable device detached from a VM domain
2752                 * is put back into si_domain for identity mapping.
2753                 */
2754                if (iommu_should_identity_map(pdev, 0)) {
2755                        int ret;
2756                        ret = domain_add_dev_info(si_domain, pdev,
2757                                                  hw_pass_through ?
2758                                                  CONTEXT_TT_PASS_THROUGH :
2759                                                  CONTEXT_TT_MULTI_LEVEL);
2760                        if (!ret) {
2761                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2762                                       pci_name(pdev));
2763                                return 1;
2764                        }
2765                }
2766        }
2767
2768        return 0;
2769}
2770
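/*
 * Map @size bytes at physical address @paddr for DMA: allocate an IOVA range
 * below @dma_mask, install the PTEs and flush either the IOTLB (in caching
 * mode) or the write buffer. Returns the bus address, or 0 on failure;
 * identity-mapped devices get @paddr back unchanged.
 */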
2771static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2772                                     size_t size, int dir, u64 dma_mask)
2773{
2774        struct pci_dev *pdev = to_pci_dev(hwdev);
2775        struct dmar_domain *domain;
2776        phys_addr_t start_paddr;
2777        struct iova *iova;
2778        int prot = 0;
2779        int ret;
2780        struct intel_iommu *iommu;
2781        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2782
2783        BUG_ON(dir == DMA_NONE);
2784
2785        if (iommu_no_mapping(hwdev))
2786                return paddr;
2787
2788        domain = get_valid_domain_for_dev(pdev);
2789        if (!domain)
2790                return 0;
2791
2792        iommu = domain_get_iommu(domain);
2793        size = aligned_nrpages(paddr, size);
2794
2795        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2796        if (!iova)
2797                goto error;
2798
2799        /*
2800         * Check if DMAR supports zero-length reads on write only
2801         * mappings.
2802         */
2803        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2804                        !cap_zlr(iommu->cap))
2805                prot |= DMA_PTE_READ;
2806        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2807                prot |= DMA_PTE_WRITE;
2808        /*
2809         * [paddr, paddr + size) might cover only part of a page; map the
2810         * whole page.  Note: if two parts of one page are mapped separately,
2811         * we might end up with two guest addresses mapping to the same host
2812         * paddr, but this is not a big problem
2813         */
2814        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2815                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2816        if (ret)
2817                goto error;
2818
2819        /* it's a non-present to present mapping. Only flush if caching mode */
2820        if (cap_caching_mode(iommu->cap))
2821                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2822        else
2823                iommu_flush_write_buffer(iommu);
2824
2825        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2826        start_paddr += paddr & ~PAGE_MASK;
2827        return start_paddr;
2828
2829error:
2830        if (iova)
2831                __free_iova(&domain->iovad, iova);
2832        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2833                pci_name(pdev), size, (unsigned long long)paddr, dir);
2834        return 0;
2835}
2836
2837static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2838                                 unsigned long offset, size_t size,
2839                                 enum dma_data_direction dir,
2840                                 struct dma_attrs *attrs)
2841{
2842        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2843                                  dir, to_pci_dev(dev)->dma_mask);
2844}
2845
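/*
 * Flush all deferred unmaps: invalidate the IOTLB on each IOMMU (per entry in
 * caching mode; otherwise one global flush plus a device-IOTLB flush per
 * entry) and only then free the queued IOVAs back to their domains.
 */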
2846static void flush_unmaps(void)
2847{
2848        int i, j;
2849
2850        timer_on = 0;
2851
2852        /* just flush them all */
2853        for (i = 0; i < g_num_of_iommus; i++) {
2854                struct intel_iommu *iommu = g_iommus[i];
2855                if (!iommu)
2856                        continue;
2857
2858                if (!deferred_flush[i].next)
2859                        continue;
2860
2861                /* In caching mode, global flushes make emulation expensive */
2862                if (!cap_caching_mode(iommu->cap))
2863                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2864                                         DMA_TLB_GLOBAL_FLUSH);
2865                for (j = 0; j < deferred_flush[i].next; j++) {
2866                        unsigned long mask;
2867                        struct iova *iova = deferred_flush[i].iova[j];
2868                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2869
2870                        /* On real hardware multiple invalidations are expensive */
2871                        if (cap_caching_mode(iommu->cap))
2872                                iommu_flush_iotlb_psi(iommu, domain->id,
2873                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2874                        else {
2875                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2876                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2877                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2878                        }
2879                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2880                }
2881                deferred_flush[i].next = 0;
2882        }
2883
2884        list_size = 0;
2885}
2886
2887static void flush_unmaps_timeout(unsigned long data)
2888{
2889        unsigned long flags;
2890
2891        spin_lock_irqsave(&async_umap_flush_lock, flags);
2892        flush_unmaps();
2893        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2894}
2895
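/*
 * Queue an IOVA for deferred freeing on its domain's IOMMU; the batch is
 * flushed once HIGH_WATER_MARK entries accumulate or when unmap_timer fires
 * about 10ms later.
 */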
2896static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2897{
2898        unsigned long flags;
2899        int next, iommu_id;
2900        struct intel_iommu *iommu;
2901
2902        spin_lock_irqsave(&async_umap_flush_lock, flags);
2903        if (list_size == HIGH_WATER_MARK)
2904                flush_unmaps();
2905
2906        iommu = domain_get_iommu(dom);
2907        iommu_id = iommu->seq_id;
2908
2909        next = deferred_flush[iommu_id].next;
2910        deferred_flush[iommu_id].domain[next] = dom;
2911        deferred_flush[iommu_id].iova[next] = iova;
2912        deferred_flush[iommu_id].next++;
2913
2914        if (!timer_on) {
2915                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2916                timer_on = 1;
2917        }
2918        list_size++;
2919        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2920}
2921
2922static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2923                             size_t size, enum dma_data_direction dir,
2924                             struct dma_attrs *attrs)
2925{
2926        struct pci_dev *pdev = to_pci_dev(dev);
2927        struct dmar_domain *domain;
2928        unsigned long start_pfn, last_pfn;
2929        struct iova *iova;
2930        struct intel_iommu *iommu;
2931
2932        if (iommu_no_mapping(dev))
2933                return;
2934
2935        domain = find_domain(pdev);
2936        BUG_ON(!domain);
2937
2938        iommu = domain_get_iommu(domain);
2939
2940        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2941        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2942                      (unsigned long long)dev_addr))
2943                return;
2944
2945        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2946        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2947
2948        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2949                 pci_name(pdev), start_pfn, last_pfn);
2950
2951        /* clear the PTEs for the whole range */
2952        dma_pte_clear_range(domain, start_pfn, last_pfn);
2953
2954        /* free page tables */
2955        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2956
2957        if (intel_iommu_strict) {
2958                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2959                                      last_pfn - start_pfn + 1, 0);
2960                /* free iova */
2961                __free_iova(&domain->iovad, iova);
2962        } else {
2963                add_unmap(domain, iova);
2964                /*
2965                 * queue up the release of the IOVA to save the 1/6th of the
2966                 * CPU time used up by an immediate IOTLB flush...
2967                 */
2968        }
2969}
2970
2971static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2972                                  dma_addr_t *dma_handle, gfp_t flags,
2973                                  struct dma_attrs *attrs)
2974{
2975        void *vaddr;
2976        int order;
2977
2978        size = PAGE_ALIGN(size);
2979        order = get_order(size);
2980
2981        if (!iommu_no_mapping(hwdev))
2982                flags &= ~(GFP_DMA | GFP_DMA32);
2983        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2984                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2985                        flags |= GFP_DMA;
2986                else
2987                        flags |= GFP_DMA32;
2988        }
2989
2990        vaddr = (void *)__get_free_pages(flags, order);
2991        if (!vaddr)
2992                return NULL;
2993        memset(vaddr, 0, size);
2994
2995        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2996                                         DMA_BIDIRECTIONAL,
2997                                         hwdev->coherent_dma_mask);
2998        if (*dma_handle)
2999                return vaddr;
3000        free_pages((unsigned long)vaddr, order);
3001        return NULL;
3002}
3003
3004static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3005                                dma_addr_t dma_handle, struct dma_attrs *attrs)
3006{
3007        int order;
3008
3009        size = PAGE_ALIGN(size);
3010        order = get_order(size);
3011
3012        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3013        free_pages((unsigned long)vaddr, order);
3014}
3015
3016static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3017                           int nelems, enum dma_data_direction dir,
3018                           struct dma_attrs *attrs)
3019{
3020        struct pci_dev *pdev = to_pci_dev(hwdev);
3021        struct dmar_domain *domain;
3022        unsigned long start_pfn, last_pfn;
3023        struct iova *iova;
3024        struct intel_iommu *iommu;
3025
3026        if (iommu_no_mapping(hwdev))
3027                return;
3028
3029        domain = find_domain(pdev);
3030        BUG_ON(!domain);
3031
3032        iommu = domain_get_iommu(domain);
3033
3034        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3035        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3036                      (unsigned long long)sglist[0].dma_address))
3037                return;
3038
3039        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3040        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3041
3042        /*  clear the whole page */
3043        dma_pte_clear_range(domain, start_pfn, last_pfn);
3044
3045        /* free page tables */
3046        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3047
3048        if (intel_iommu_strict) {
3049                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3050                                      last_pfn - start_pfn + 1, 0);
3051                /* free iova */
3052                __free_iova(&domain->iovad, iova);
3053        } else {
3054                add_unmap(domain, iova);
3055                /*
3056                 * Queue up the release of the mapping; batching avoids the
3057                 * iotlb flush that otherwise eats roughly 1/6th of the CPU...
3058                 */
3059        }
3060}
3061
3062static int intel_nontranslate_map_sg(struct device *hwdev,
3063        struct scatterlist *sglist, int nelems, int dir)
3064{
3065        int i;
3066        struct scatterlist *sg;
3067
3068        for_each_sg(sglist, sg, nelems, i) {
3069                BUG_ON(!sg_page(sg));
3070                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3071                sg->dma_length = sg->length;
3072        }
3073        return nelems;
3074}
3075
3076static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3077                        enum dma_data_direction dir, struct dma_attrs *attrs)
3078{
3079        int i;
3080        struct pci_dev *pdev = to_pci_dev(hwdev);
3081        struct dmar_domain *domain;
3082        size_t size = 0;
3083        int prot = 0;
3084        struct iova *iova = NULL;
3085        int ret;
3086        struct scatterlist *sg;
3087        unsigned long start_vpfn;
3088        struct intel_iommu *iommu;
3089
3090        BUG_ON(dir == DMA_NONE);
3091        if (iommu_no_mapping(hwdev))
3092                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3093
3094        domain = get_valid_domain_for_dev(pdev);
3095        if (!domain)
3096                return 0;
3097
3098        iommu = domain_get_iommu(domain);
3099
3100        for_each_sg(sglist, sg, nelems, i)
3101                size += aligned_nrpages(sg->offset, sg->length);
3102
3103        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3104                                pdev->dma_mask);
3105        if (!iova) {
3106                sglist->dma_length = 0;
3107                return 0;
3108        }
3109
3110        /*
3111         * Check if DMAR supports zero-length reads on write-only
3112         * mappings.
3113         */
3114        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3115                        !cap_zlr(iommu->cap))
3116                prot |= DMA_PTE_READ;
3117        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3118                prot |= DMA_PTE_WRITE;
3119
3120        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3121
3122        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3123        if (unlikely(ret)) {
3124                /*  clear the page */
3125                dma_pte_clear_range(domain, start_vpfn,
3126                                    start_vpfn + size - 1);
3127                /* free page tables */
3128                dma_pte_free_pagetable(domain, start_vpfn,
3129                                       start_vpfn + size - 1);
3130                /* free iova */
3131                __free_iova(&domain->iovad, iova);
3132                return 0;
3133        }
3134
3135        /* it's a non-present to present mapping. Only flush if caching mode */
3136        if (cap_caching_mode(iommu->cap))
3137                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3138        else
3139                iommu_flush_write_buffer(iommu);
3140
3141        return nelems;
3142}
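
/*
 * intel_map_sg() allocates a single iova range covering the whole scatterlist
 * and maps every segment into it; on failure it returns 0 and clears the
 * first segment's dma_length.  Roughly, from a driver's point of view:
 *
 *        int count = dma_map_sg(dev, sglist, nents, DMA_FROM_DEVICE);
 *
 *        if (!count)
 *                return -ENOMEM;
 *        ...
 *        dma_unmap_sg(dev, sglist, nents, DMA_FROM_DEVICE);
 */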
3143
3144static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3145{
3146        return !dma_addr;
3147}
3148
3149struct dma_map_ops intel_dma_ops = {
3150        .alloc = intel_alloc_coherent,
3151        .free = intel_free_coherent,
3152        .map_sg = intel_map_sg,
3153        .unmap_sg = intel_unmap_sg,
3154        .map_page = intel_map_page,
3155        .unmap_page = intel_unmap_page,
3156        .mapping_error = intel_mapping_error,
3157};
3158
3159static inline int iommu_domain_cache_init(void)
3160{
3161        int ret = 0;
3162
3163        iommu_domain_cache = kmem_cache_create("iommu_domain",
3164                                         sizeof(struct dmar_domain),
3165                                         0,
3166                                         SLAB_HWCACHE_ALIGN,
3168                                         NULL);
3169        if (!iommu_domain_cache) {
3170                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3171                ret = -ENOMEM;
3172        }
3173
3174        return ret;
3175}
3176
3177static inline int iommu_devinfo_cache_init(void)
3178{
3179        int ret = 0;
3180
3181        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3182                                         sizeof(struct device_domain_info),
3183                                         0,
3184                                         SLAB_HWCACHE_ALIGN,
3185                                         NULL);
3186        if (!iommu_devinfo_cache) {
3187                printk(KERN_ERR "Couldn't create devinfo cache\n");
3188                ret = -ENOMEM;
3189        }
3190
3191        return ret;
3192}
3193
3194static inline int iommu_iova_cache_init(void)
3195{
3196        int ret = 0;
3197
3198        iommu_iova_cache = kmem_cache_create("iommu_iova",
3199                                         sizeof(struct iova),
3200                                         0,
3201                                         SLAB_HWCACHE_ALIGN,
3202                                         NULL);
3203        if (!iommu_iova_cache) {
3204                printk(KERN_ERR "Couldn't create iova cache\n");
3205                ret = -ENOMEM;
3206        }
3207
3208        return ret;
3209}
3210
3211static int __init iommu_init_mempool(void)
3212{
3213        int ret;
3214        ret = iommu_iova_cache_init();
3215        if (ret)
3216                return ret;
3217
3218        ret = iommu_domain_cache_init();
3219        if (ret)
3220                goto domain_error;
3221
3222        ret = iommu_devinfo_cache_init();
3223        if (!ret)
3224                return ret;
3225
3226        kmem_cache_destroy(iommu_domain_cache);
3227domain_error:
3228        kmem_cache_destroy(iommu_iova_cache);
3229
3230        return -ENOMEM;
3231}
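
/*
 * The three slab caches above back the struct iova, struct dmar_domain and
 * struct device_domain_info allocations used throughout this driver.
 * iommu_exit_mempool() destroys them in reverse creation order and is also
 * the fallback in intel_iommu_init() when a later init step fails.
 */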
3232
3233static void __init iommu_exit_mempool(void)
3234{
3235        kmem_cache_destroy(iommu_devinfo_cache);
3236        kmem_cache_destroy(iommu_domain_cache);
3237        kmem_cache_destroy(iommu_iova_cache);
3238
3239}
3240
3241static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3242{
3243        struct dmar_drhd_unit *drhd;
3244        u32 vtbar;
3245        int rc;
3246
3247        /* We know that this device on this chipset has its own IOMMU.
3248         * If we find it under a different IOMMU, then the BIOS is lying
3249         * to us. Hope that the IOMMU for this device is actually
3250         * disabled, and it needs no translation...
3251         */
3252        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3253        if (rc) {
3254                /* "can't" happen */
3255                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3256                return;
3257        }
3258        vtbar &= 0xffff0000;
3259
3260        /* we know that this iommu should be at offset 0xa000 from vtbar */
3261        drhd = dmar_find_matched_drhd_unit(pdev);
3262        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3263                            TAINT_FIRMWARE_WORKAROUND,
3264                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3265                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3266}
3267DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3268
3269static void __init init_no_remapping_devices(void)
3270{
3271        struct dmar_drhd_unit *drhd;
3272
3273        for_each_drhd_unit(drhd) {
3274                if (!drhd->include_all) {
3275                        int i;
3276                        for (i = 0; i < drhd->devices_cnt; i++)
3277                                if (drhd->devices[i] != NULL)
3278                                        break;
3279                        /* ignore DMAR unit if no pci devices exist */
3280                        if (i == drhd->devices_cnt)
3281                                drhd->ignored = 1;
3282                }
3283        }
3284
3285        for_each_drhd_unit(drhd) {
3286                int i;
3287                if (drhd->ignored || drhd->include_all)
3288                        continue;
3289
3290                for (i = 0; i < drhd->devices_cnt; i++)
3291                        if (drhd->devices[i] &&
3292                            !IS_GFX_DEVICE(drhd->devices[i]))
3293                                break;
3294
3295                if (i < drhd->devices_cnt)
3296                        continue;
3297
3298                /* This IOMMU has *only* gfx devices. Either bypass it or
3299                   set the gfx_mapped flag, as appropriate */
3300                if (dmar_map_gfx) {
3301                        intel_iommu_gfx_mapped = 1;
3302                } else {
3303                        drhd->ignored = 1;
3304                        for (i = 0; i < drhd->devices_cnt; i++) {
3305                                if (!drhd->devices[i])
3306                                        continue;
3307                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3308                        }
3309                }
3310        }
3311}
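
/*
 * init_no_remapping_devices() makes two passes over the DRHD units: the first
 * marks as ignored any unit whose device scope matched no PCI device at all,
 * the second handles units that cover *only* graphics devices -- either
 * keeping them and noting intel_iommu_gfx_mapped, or, when dmar_map_gfx is
 * off, ignoring the unit and tagging its devices with
 * DUMMY_DEVICE_DOMAIN_INFO so they bypass translation.
 */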
3312
3313#ifdef CONFIG_SUSPEND
3314static int init_iommu_hw(void)
3315{
3316        struct dmar_drhd_unit *drhd;
3317        struct intel_iommu *iommu = NULL;
3318
3319        for_each_active_iommu(iommu, drhd)
3320                if (iommu->qi)
3321                        dmar_reenable_qi(iommu);
3322
3323        for_each_iommu(iommu, drhd) {
3324                if (drhd->ignored) {
3325                        /*
3326                         * we always have to disable PMRs or DMA may fail on
3327                         * this device
3328                         */
3329                        if (force_on)
3330                                iommu_disable_protect_mem_regions(iommu);
3331                        continue;
3332                }
3333
3334                iommu_flush_write_buffer(iommu);
3335
3336                iommu_set_root_entry(iommu);
3337
3338                iommu->flush.flush_context(iommu, 0, 0, 0,
3339                                           DMA_CCMD_GLOBAL_INVL);
3340                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3341                                         DMA_TLB_GLOBAL_FLUSH);
3342                if (iommu_enable_translation(iommu))
3343                        return 1;
3344                iommu_disable_protect_mem_regions(iommu);
3345        }
3346
3347        return 0;
3348}
3349
3350static void iommu_flush_all(void)
3351{
3352        struct dmar_drhd_unit *drhd;
3353        struct intel_iommu *iommu;
3354
3355        for_each_active_iommu(iommu, drhd) {
3356                iommu->flush.flush_context(iommu, 0, 0, 0,
3357                                           DMA_CCMD_GLOBAL_INVL);
3358                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3359                                         DMA_TLB_GLOBAL_FLUSH);
3360        }
3361}
3362
3363static int iommu_suspend(void)
3364{
3365        struct dmar_drhd_unit *drhd;
3366        struct intel_iommu *iommu = NULL;
3367        unsigned long flag;
3368
3369        for_each_active_iommu(iommu, drhd) {
3370                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3371                                                 GFP_ATOMIC);
3372                if (!iommu->iommu_state)
3373                        goto nomem;
3374        }
3375
3376        iommu_flush_all();
3377
3378        for_each_active_iommu(iommu, drhd) {
3379                iommu_disable_translation(iommu);
3380
3381                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3382
3383                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3384                        readl(iommu->reg + DMAR_FECTL_REG);
3385                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3386                        readl(iommu->reg + DMAR_FEDATA_REG);
3387                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3388                        readl(iommu->reg + DMAR_FEADDR_REG);
3389                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3390                        readl(iommu->reg + DMAR_FEUADDR_REG);
3391
3392                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3393        }
3394        return 0;
3395
3396nomem:
3397        for_each_active_iommu(iommu, drhd)
3398                kfree(iommu->iommu_state);
3399
3400        return -ENOMEM;
3401}
3402
3403static void iommu_resume(void)
3404{
3405        struct dmar_drhd_unit *drhd;
3406        struct intel_iommu *iommu = NULL;
3407        unsigned long flag;
3408
3409        if (init_iommu_hw()) {
3410                if (force_on)
3411                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3412                else
3413                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3414                return;
3415        }
3416
3417        for_each_active_iommu(iommu, drhd) {
3418
3419                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3420
3421                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3422                        iommu->reg + DMAR_FECTL_REG);
3423                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3424                        iommu->reg + DMAR_FEDATA_REG);
3425                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3426                        iommu->reg + DMAR_FEADDR_REG);
3427                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3428                        iommu->reg + DMAR_FEUADDR_REG);
3429
3430                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3431        }
3432
3433        for_each_active_iommu(iommu, drhd)
3434                kfree(iommu->iommu_state);
3435}
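
/*
 * Across suspend/resume only the fault-event registers (FECTL, FEDATA,
 * FEADDR, FEUADDR) are saved and restored by software; the root table,
 * context cache and IOTLB are simply reprogrammed and re-flushed by
 * init_iommu_hw() before the saved registers are written back.
 */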
3436
3437static struct syscore_ops iommu_syscore_ops = {
3438        .resume         = iommu_resume,
3439        .suspend        = iommu_suspend,
3440};
3441
3442static void __init init_iommu_pm_ops(void)
3443{
3444        register_syscore_ops(&iommu_syscore_ops);
3445}
3446
3447#else
3448static inline void init_iommu_pm_ops(void) {}
3449#endif  /* CONFIG_SUSPEND */
3450
3451LIST_HEAD(dmar_rmrr_units);
3452
3453static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3454{
3455        list_add(&rmrr->list, &dmar_rmrr_units);
3456}
3457
3458
3459int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3460{
3461        struct acpi_dmar_reserved_memory *rmrr;
3462        struct dmar_rmrr_unit *rmrru;
3463
3464        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3465        if (!rmrru)
3466                return -ENOMEM;
3467
3468        rmrru->hdr = header;
3469        rmrr = (struct acpi_dmar_reserved_memory *)header;
3470        rmrru->base_address = rmrr->base_address;
3471        rmrru->end_address = rmrr->end_address;
3472
3473        dmar_register_rmrr_unit(rmrru);
3474        return 0;
3475}
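
/*
 * RMRRs describe memory regions that the BIOS expects particular devices to
 * keep using for DMA even after translation is enabled (typically USB legacy
 * keyboard or management-controller buffers).  Here the ACPI entry is only
 * recorded; its device scope is resolved later by rmrr_parse_dev(), called
 * from dmar_parse_rmrr_atsr_dev() below.
 */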
3476
3477static int __init
3478rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3479{
3480        struct acpi_dmar_reserved_memory *rmrr;
3481        int ret;
3482
3483        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3484        ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3485                ((void *)rmrr) + rmrr->header.length,
3486                &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3487
3488        if (ret || (rmrru->devices_cnt == 0)) {
3489                list_del(&rmrru->list);
3490                kfree(rmrru);
3491        }
3492        return ret;
3493}
3494
3495static LIST_HEAD(dmar_atsr_units);
3496
3497int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3498{
3499        struct acpi_dmar_atsr *atsr;
3500        struct dmar_atsr_unit *atsru;
3501
3502        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3503        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3504        if (!atsru)
3505                return -ENOMEM;
3506
3507        atsru->hdr = hdr;
3508        atsru->include_all = atsr->flags & 0x1;
3509
3510        list_add(&atsru->list, &dmar_atsr_units);
3511
3512        return 0;
3513}
3514
3515static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3516{
3517        int rc;
3518        struct acpi_dmar_atsr *atsr;
3519
3520        if (atsru->include_all)
3521                return 0;
3522
3523        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3524        rc = dmar_parse_dev_scope((void *)(atsr + 1),
3525                                (void *)atsr + atsr->header.length,
3526                                &atsru->devices_cnt, &atsru->devices,
3527                                atsr->segment);
3528        if (rc || !atsru->devices_cnt) {
3529                list_del(&atsru->list);
3530                kfree(atsru);
3531        }
3532
3533        return rc;
3534}
3535
3536int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3537{
3538        int i;
3539        struct pci_bus *bus;
3540        struct acpi_dmar_atsr *atsr;
3541        struct dmar_atsr_unit *atsru;
3542
3543        dev = pci_physfn(dev);
3544
3545        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3546                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3547                if (atsr->segment == pci_domain_nr(dev->bus))
3548                        goto found;
3549        }
3550
3551        return 0;
3552
3553found:
3554        for (bus = dev->bus; bus; bus = bus->parent) {
3555                struct pci_dev *bridge = bus->self;
3556
3557                if (!bridge || !pci_is_pcie(bridge) ||
3558                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3559                        return 0;
3560
3561                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3562                        for (i = 0; i < atsru->devices_cnt; i++)
3563                                if (atsru->devices[i] == bridge)
3564                                        return 1;
3565                        break;
3566                }
3567        }
3568
3569        if (atsru->include_all)
3570                return 1;
3571
3572        return 0;
3573}
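
/*
 * dmar_find_matched_atsr_unit() is used when deciding whether a device may
 * use a device IOTLB (ATS): it walks up from the device through its parent
 * bridges and returns 1 only if the root port above it appears in an ATSR
 * device scope for the same PCI segment, or if that segment's ATSR is marked
 * include-all.
 */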
3574
3575int __init dmar_parse_rmrr_atsr_dev(void)
3576{
3577        struct dmar_rmrr_unit *rmrr, *rmrr_n;
3578        struct dmar_atsr_unit *atsr, *atsr_n;
3579        int ret = 0;
3580
3581        list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3582                ret = rmrr_parse_dev(rmrr);
3583                if (ret)
3584                        return ret;
3585        }
3586
3587        list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3588                ret = atsr_parse_dev(atsr);
3589                if (ret)
3590                        return ret;
3591        }
3592
3593        return ret;
3594}
3595
3596/*
3597 * Here we only respond to a device being unbound from its driver.
3598 *
3599 * A newly added device is not attached to its DMAR domain here yet; that
3600 * happens when the device is first mapped to an iova.
3601 */
3602static int device_notifier(struct notifier_block *nb,
3603                                  unsigned long action, void *data)
3604{
3605        struct device *dev = data;
3606        struct pci_dev *pdev = to_pci_dev(dev);
3607        struct dmar_domain *domain;
3608
3609        if (iommu_no_mapping(dev))
3610                return 0;
3611
3612        domain = find_domain(pdev);
3613        if (!domain)
3614                return 0;
3615
3616        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3617                domain_remove_one_dev_info(domain, pdev);
3618
3619                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3620                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3621                    list_empty(&domain->devices))
3622                        domain_exit(domain);
3623        }
3624
3625        return 0;
3626}
3627
3628static struct notifier_block device_nb = {
3629        .notifier_call = device_notifier,
3630};
3631
3632int __init intel_iommu_init(void)
3633{
3634        int ret = 0;
3635
3636        /* VT-d is required for a TXT/tboot launch, so enforce that */
3637        force_on = tboot_force_iommu();
3638
3639        if (dmar_table_init()) {
3640                if (force_on)
3641                        panic("tboot: Failed to initialize DMAR table\n");
3642                return  -ENODEV;
3643        }
3644
3645        if (dmar_dev_scope_init() < 0) {
3646                if (force_on)
3647                        panic("tboot: Failed to initialize DMAR device scope\n");
3648                return  -ENODEV;
3649        }
3650
3651        if (no_iommu || dmar_disabled)
3652                return -ENODEV;
3653
3654        if (iommu_init_mempool()) {
3655                if (force_on)
3656                        panic("tboot: Failed to initialize iommu memory\n");
3657                return  -ENODEV;
3658        }
3659
3660        if (list_empty(&dmar_rmrr_units))
3661                printk(KERN_INFO "DMAR: No RMRR found\n");
3662
3663        if (list_empty(&dmar_atsr_units))
3664                printk(KERN_INFO "DMAR: No ATSR found\n");
3665
3666        if (dmar_init_reserved_ranges()) {
3667                if (force_on)
3668                        panic("tboot: Failed to reserve iommu ranges\n");
3669                return  -ENODEV;
3670        }
3671
3672        init_no_remapping_devices();
3673
3674        ret = init_dmars();
3675        if (ret) {
3676                if (force_on)
3677                        panic("tboot: Failed to initialize DMARs\n");
3678                printk(KERN_ERR "IOMMU: dmar init failed\n");
3679                put_iova_domain(&reserved_iova_list);
3680                iommu_exit_mempool();
3681                return ret;
3682        }
3683        printk(KERN_INFO
3684        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3685
3686        init_timer(&unmap_timer);
3687#ifdef CONFIG_SWIOTLB
3688        swiotlb = 0;
3689#endif
3690        dma_ops = &intel_dma_ops;
3691
3692        init_iommu_pm_ops();
3693
3694        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3695
3696        bus_register_notifier(&pci_bus_type, &device_nb);
3697
3698        intel_iommu_enabled = 1;
3699
3700        return 0;
3701}
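
/*
 * After a successful intel_iommu_init() the plumbing is: swiotlb is turned
 * off (when configured in), dma_ops points at intel_dma_ops for the DMA API,
 * intel_iommu_ops is registered on the PCI bus for the IOMMU API, and
 * device_nb cleans up a device's domain linkage when its driver is unbound.
 */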
3702
3703static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3704                                           struct pci_dev *pdev)
3705{
3706        struct pci_dev *tmp, *parent;
3707
3708        if (!iommu || !pdev)
3709                return;
3710
3711        /* dependent device detach */
3712        tmp = pci_find_upstream_pcie_bridge(pdev);
3713        /* Secondary interface's bus number and devfn 0 */
3714        if (tmp) {
3715                parent = pdev->bus->self;
3716                while (parent != tmp) {
3717                        iommu_detach_dev(iommu, parent->bus->number,
3718                                         parent->devfn);
3719                        parent = parent->bus->self;
3720                }
3721                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3722                        iommu_detach_dev(iommu,
3723                                tmp->subordinate->number, 0);
3724                else /* this is a legacy PCI bridge */
3725                        iommu_detach_dev(iommu, tmp->bus->number,
3726                                         tmp->devfn);
3727        }
3728}
3729
3730static void domain_remove_one_dev_info(struct dmar_domain *domain,
3731                                          struct pci_dev *pdev)
3732{
3733        struct device_domain_info *info;
3734        struct intel_iommu *iommu;
3735        unsigned long flags;
3736        int found = 0;
3737        struct list_head *entry, *tmp;
3738
3739        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3740                                pdev->devfn);
3741        if (!iommu)
3742                return;
3743
3744        spin_lock_irqsave(&device_domain_lock, flags);
3745        list_for_each_safe(entry, tmp, &domain->devices) {
3746                info = list_entry(entry, struct device_domain_info, link);
3747                if (info->segment == pci_domain_nr(pdev->bus) &&
3748                    info->bus == pdev->bus->number &&
3749                    info->devfn == pdev->devfn) {
3750                        unlink_domain_info(info);
3751                        spin_unlock_irqrestore(&device_domain_lock, flags);
3752
3753                        iommu_disable_dev_iotlb(info);
3754                        iommu_detach_dev(iommu, info->bus, info->devfn);
3755                        iommu_detach_dependent_devices(iommu, pdev);
3756                        free_devinfo_mem(info);
3757
3758                        spin_lock_irqsave(&device_domain_lock, flags);
3759
3760                        if (found)
3761                                break;
3762                        else
3763                                continue;
3764                }
3765
3766                /* If there are no other devices under the same iommu
3767                 * owned by this domain, clear this iommu in iommu_bmp and
3768                 * update the iommu count and coherency.
3769                 */
3770                if (iommu == device_to_iommu(info->segment, info->bus,
3771                                            info->devfn))
3772                        found = 1;
3773        }
3774
3775        spin_unlock_irqrestore(&device_domain_lock, flags);
3776
3777        if (found == 0) {
3778                unsigned long tmp_flags;
3779                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3780                clear_bit(iommu->seq_id, domain->iommu_bmp);
3781                domain->iommu_count--;
3782                domain_update_iommu_cap(domain);
3783                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3784
3785                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3786                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3787                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3788                        clear_bit(domain->id, iommu->domain_ids);
3789                        iommu->domains[domain->id] = NULL;
3790                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3791                }
3792        }
3793}
3794
3795static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3796{
3797        struct device_domain_info *info;
3798        struct intel_iommu *iommu;
3799        unsigned long flags1, flags2;
3800
3801        spin_lock_irqsave(&device_domain_lock, flags1);
3802        while (!list_empty(&domain->devices)) {
3803                info = list_entry(domain->devices.next,
3804                        struct device_domain_info, link);
3805                unlink_domain_info(info);
3806                spin_unlock_irqrestore(&device_domain_lock, flags1);
3807
3808                iommu_disable_dev_iotlb(info);
3809                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3810                iommu_detach_dev(iommu, info->bus, info->devfn);
3811                iommu_detach_dependent_devices(iommu, info->dev);
3812
3813                /* clear this iommu in iommu_bmp, update iommu count
3814                 * and capabilities
3815                 */
3816                spin_lock_irqsave(&domain->iommu_lock, flags2);
3817                if (test_and_clear_bit(iommu->seq_id,
3818                                       domain->iommu_bmp)) {
3819                        domain->iommu_count--;
3820                        domain_update_iommu_cap(domain);
3821                }
3822                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3823
3824                free_devinfo_mem(info);
3825                spin_lock_irqsave(&device_domain_lock, flags1);
3826        }
3827        spin_unlock_irqrestore(&device_domain_lock, flags1);
3828}
3829
3830/* Domain id for a virtual machine; it won't be set in a context entry. */
3831static unsigned long vm_domid;
3832
3833static struct dmar_domain *iommu_alloc_vm_domain(void)
3834{
3835        struct dmar_domain *domain;
3836
3837        domain = alloc_domain_mem();
3838        if (!domain)
3839                return NULL;
3840
3841        domain->id = vm_domid++;
3842        domain->nid = -1;
3843        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3844        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3845
3846        return domain;
3847}
3848
3849static int md_domain_init(struct dmar_domain *domain, int guest_width)
3850{
3851        int adjust_width;
3852
3853        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3854        spin_lock_init(&domain->iommu_lock);
3855
3856        domain_reserve_special_ranges(domain);
3857
3858        /* calculate AGAW */
3859        domain->gaw = guest_width;
3860        adjust_width = guestwidth_to_adjustwidth(guest_width);
3861        domain->agaw = width_to_agaw(adjust_width);
3862
3863        INIT_LIST_HEAD(&domain->devices);
3864
3865        domain->iommu_count = 0;
3866        domain->iommu_coherency = 0;
3867        domain->iommu_snooping = 0;
3868        domain->iommu_superpage = 0;
3869        domain->max_addr = 0;
3870        domain->nid = -1;
3871
3872        /* always allocate the top pgd */
3873        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3874        if (!domain->pgd)
3875                return -ENOMEM;
3876        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3877        return 0;
3878}
3879
3880static void iommu_free_vm_domain(struct dmar_domain *domain)
3881{
3882        unsigned long flags;
3883        struct dmar_drhd_unit *drhd;
3884        struct intel_iommu *iommu;
3885        unsigned long i;
3886        unsigned long ndomains;
3887
3888        for_each_drhd_unit(drhd) {
3889                if (drhd->ignored)
3890                        continue;
3891                iommu = drhd->iommu;
3892
3893                ndomains = cap_ndoms(iommu->cap);
3894                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3895                        if (iommu->domains[i] == domain) {
3896                                spin_lock_irqsave(&iommu->lock, flags);
3897                                clear_bit(i, iommu->domain_ids);
3898                                iommu->domains[i] = NULL;
3899                                spin_unlock_irqrestore(&iommu->lock, flags);
3900                                break;
3901                        }
3902                }
3903        }
3904}
3905
3906static void vm_domain_exit(struct dmar_domain *domain)
3907{
3908        /* Domain 0 is reserved, so don't process it */
3909        if (!domain)
3910                return;
3911
3912        vm_domain_remove_all_dev_info(domain);
3913        /* destroy iovas */
3914        put_iova_domain(&domain->iovad);
3915
3916        /* clear ptes */
3917        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3918
3919        /* free page tables */
3920        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3921
3922        iommu_free_vm_domain(domain);
3923        free_domain_mem(domain);
3924}
3925
3926static int intel_iommu_domain_init(struct iommu_domain *domain)
3927{
3928        struct dmar_domain *dmar_domain;
3929
3930        dmar_domain = iommu_alloc_vm_domain();
3931        if (!dmar_domain) {
3932                printk(KERN_ERR
3933                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3934                return -ENOMEM;
3935        }
3936        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3937                printk(KERN_ERR
3938                        "intel_iommu_domain_init() failed\n");
3939                vm_domain_exit(dmar_domain);
3940                return -ENOMEM;
3941        }
3942        domain_update_iommu_cap(dmar_domain);
3943        domain->priv = dmar_domain;
3944
3945        domain->geometry.aperture_start = 0;
3946        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3947        domain->geometry.force_aperture = true;
3948
3949        return 0;
3950}
3951
3952static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3953{
3954        struct dmar_domain *dmar_domain = domain->priv;
3955
3956        domain->priv = NULL;
3957        vm_domain_exit(dmar_domain);
3958}
3959
3960static int intel_iommu_attach_device(struct iommu_domain *domain,
3961                                     struct device *dev)
3962{
3963        struct dmar_domain *dmar_domain = domain->priv;
3964        struct pci_dev *pdev = to_pci_dev(dev);
3965        struct intel_iommu *iommu;
3966        int addr_width;
3967
3968        /* normally pdev is not mapped */
3969        if (unlikely(domain_context_mapped(pdev))) {
3970                struct dmar_domain *old_domain;
3971
3972                old_domain = find_domain(pdev);
3973                if (old_domain) {
3974                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3975                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3976                                domain_remove_one_dev_info(old_domain, pdev);
3977                        else
3978                                domain_remove_dev_info(old_domain);
3979                }
3980        }
3981
3982        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3983                                pdev->devfn);
3984        if (!iommu)
3985                return -ENODEV;
3986
3987        /* check if this iommu agaw is sufficient for max mapped address */
3988        addr_width = agaw_to_width(iommu->agaw);
3989        if (addr_width > cap_mgaw(iommu->cap))
3990                addr_width = cap_mgaw(iommu->cap);
3991
3992        if (dmar_domain->max_addr > (1LL << addr_width)) {
3993                printk(KERN_ERR "%s: iommu width (%d) is not "
3994                       "sufficient for the mapped address (%llx)\n",
3995                       __func__, addr_width, dmar_domain->max_addr);
3996                return -EFAULT;
3997        }
3998        dmar_domain->gaw = addr_width;
3999
4000        /*
4001         * Knock out extra levels of page tables if necessary
4002         */
4003        while (iommu->agaw < dmar_domain->agaw) {
4004                struct dma_pte *pte;
4005
4006                pte = dmar_domain->pgd;
4007                if (dma_pte_present(pte)) {
4008                        dmar_domain->pgd = (struct dma_pte *)
4009                                phys_to_virt(dma_pte_addr(pte));
4010                        free_pgtable_page(pte);
4011                }
4012                dmar_domain->agaw--;
4013        }
4014
4015        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4016}
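
/*
 * The agaw loop above shrinks the domain's page-table depth to what this
 * IOMMU can walk: while the domain's agaw exceeds the unit's, the table
 * referenced by the first entry of the old top level becomes the new pgd and
 * the old top page is freed.  From the IOMMU API side the attach looks
 * roughly like this, assuming a PCI device *pdev:
 *
 *        struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *        if (dom && !iommu_attach_device(dom, &pdev->dev))
 *                ...use dom, now backed by a dmar_domain...
 */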
4017
4018static void intel_iommu_detach_device(struct iommu_domain *domain,
4019                                      struct device *dev)
4020{
4021        struct dmar_domain *dmar_domain = domain->priv;
4022        struct pci_dev *pdev = to_pci_dev(dev);
4023
4024        domain_remove_one_dev_info(dmar_domain, pdev);
4025}
4026
4027static int intel_iommu_map(struct iommu_domain *domain,
4028                           unsigned long iova, phys_addr_t hpa,
4029                           size_t size, int iommu_prot)
4030{
4031        struct dmar_domain *dmar_domain = domain->priv;
4032        u64 max_addr;
4033        int prot = 0;
4034        int ret;
4035
4036        if (iommu_prot & IOMMU_READ)
4037                prot |= DMA_PTE_READ;
4038        if (iommu_prot & IOMMU_WRITE)
4039                prot |= DMA_PTE_WRITE;
4040        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4041                prot |= DMA_PTE_SNP;
4042
4043        max_addr = iova + size;
4044        if (dmar_domain->max_addr < max_addr) {
4045                u64 end;
4046
4047                /* check if minimum agaw is sufficient for mapped address */
4048                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4049                if (end < max_addr) {
4050                        printk(KERN_ERR "%s: iommu width (%d) is not "
4051                               "sufficient for the mapped address (%llx)\n",
4052                               __func__, dmar_domain->gaw, max_addr);
4053                        return -EFAULT;
4054                }
4055                dmar_domain->max_addr = max_addr;
4056        }
4057        /* Round up size to next multiple of PAGE_SIZE, if it and
4058           the low bits of hpa would take us onto the next page */
4059        size = aligned_nrpages(hpa, size);
4060        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4061                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4062        return ret;
4063}
4064
4065static size_t intel_iommu_unmap(struct iommu_domain *domain,
4066                             unsigned long iova, size_t size)
4067{
4068        struct dmar_domain *dmar_domain = domain->priv;
4069        int order;
4070
4071        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4072                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4073
4074        if (dmar_domain->max_addr == iova + size)
4075                dmar_domain->max_addr = iova;
4076
4077        return PAGE_SIZE << order;
4078}
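
/*
 * intel_iommu_map()/intel_iommu_unmap() back the generic IOMMU API for this
 * driver: sizes are rounded up to VT-d page granularity, and unmap returns
 * how much address space was actually cleared.  A minimal usage sketch,
 * assuming a domain "dom" from iommu_domain_alloc():
 *
 *        int ret = iommu_map(dom, iova, phys, PAGE_SIZE,
 *                            IOMMU_READ | IOMMU_WRITE);
 *        ...
 *        if (!ret)
 *                iommu_unmap(dom, iova, PAGE_SIZE);
 */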
4079
4080static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4081                                            unsigned long iova)
4082{
4083        struct dmar_domain *dmar_domain = domain->priv;
4084        struct dma_pte *pte;
4085        u64 phys = 0;
4086
4087        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4088        if (pte)
4089                phys = dma_pte_addr(pte);
4090
4091        return phys;
4092}
4093
4094static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4095                                      unsigned long cap)
4096{
4097        struct dmar_domain *dmar_domain = domain->priv;
4098
4099        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4100                return dmar_domain->iommu_snooping;
4101        if (cap == IOMMU_CAP_INTR_REMAP)
4102                return irq_remapping_enabled;
4103
4104        return 0;
4105}
4106
4107static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4108{
4109        pci_dev_put(*from);
4110        *from = to;
4111}
4112
4113#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4114
4115static int intel_iommu_add_device(struct device *dev)
4116{
4117        struct pci_dev *pdev = to_pci_dev(dev);
4118        struct pci_dev *bridge, *dma_pdev = NULL;
4119        struct iommu_group *group;
4120        int ret;
4121
4122        if (!device_to_iommu(pci_domain_nr(pdev->bus),
4123                             pdev->bus->number, pdev->devfn))
4124                return -ENODEV;
4125
4126        bridge = pci_find_upstream_pcie_bridge(pdev);
4127        if (bridge) {
4128                if (pci_is_pcie(bridge))
4129                        dma_pdev = pci_get_domain_bus_and_slot(
4130                                                pci_domain_nr(pdev->bus),
4131                                                bridge->subordinate->number, 0);
4132                if (!dma_pdev)
4133                        dma_pdev = pci_dev_get(bridge);
4134        } else
4135                dma_pdev = pci_dev_get(pdev);
4136
4137        /* Account for quirked devices */
4138        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4139
4140        /*
4141         * If it's a multifunction device that does not support our
4142         * required ACS flags, add to the same group as function 0.
4143         */
4144        if (dma_pdev->multifunction &&
4145            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4146                swap_pci_ref(&dma_pdev,
4147                             pci_get_slot(dma_pdev->bus,
4148                                          PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4149                                          0)));
4150
4151        /*
4152         * Devices on the root bus go through the iommu.  If that's not us,
4153         * find the next upstream device and test ACS up to the root bus.
4154         * Finding the next device may require skipping virtual buses.
4155         */
4156        while (!pci_is_root_bus(dma_pdev->bus)) {
4157                struct pci_bus *bus = dma_pdev->bus;
4158
4159                while (!bus->self) {
4160                        if (!pci_is_root_bus(bus))
4161                                bus = bus->parent;
4162                        else
4163                                goto root_bus;
4164                }
4165
4166                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4167                        break;
4168
4169                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4170        }
4171
4172root_bus:
4173        group = iommu_group_get(&dma_pdev->dev);
4174        pci_dev_put(dma_pdev);
4175        if (!group) {
4176                group = iommu_group_alloc();
4177                if (IS_ERR(group))
4178                        return PTR_ERR(group);
4179        }
4180
4181        ret = iommu_group_add_device(group, dev);
4182
4183        iommu_group_put(group);
4184        return ret;
4185}
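
/*
 * Group construction in intel_iommu_add_device(), in short: start from the
 * device (or the PCIe bridge it sits behind), substitute any DMA-source
 * quirk alias, fold multifunction devices without the required ACS flags
 * onto function 0, then walk towards the root bus until
 * pci_acs_path_enabled() says the path is isolated; the device we end up at
 * either lends its existing iommu_group or gets a freshly allocated one.
 */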
4186
4187static void intel_iommu_remove_device(struct device *dev)
4188{
4189        iommu_group_remove_device(dev);
4190}
4191
4192static struct iommu_ops intel_iommu_ops = {
4193        .domain_init    = intel_iommu_domain_init,
4194        .domain_destroy = intel_iommu_domain_destroy,
4195        .attach_dev     = intel_iommu_attach_device,
4196        .detach_dev     = intel_iommu_detach_device,
4197        .map            = intel_iommu_map,
4198        .unmap          = intel_iommu_unmap,
4199        .iova_to_phys   = intel_iommu_iova_to_phys,
4200        .domain_has_cap = intel_iommu_domain_has_cap,
4201        .add_device     = intel_iommu_add_device,
4202        .remove_device  = intel_iommu_remove_device,
4203        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4204};
4205
4206static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4207{
4208        /*
4209         * Mobile 4 Series Chipset neglects to set RWBF capability,
4210         * but needs it:
4211         */
4212        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4213        rwbf_quirk = 1;
4214
4215        /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4216        if (dev->revision == 0x07) {
4217                printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4218                dmar_map_gfx = 0;
4219        }
4220}
4221
4222DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4223
4224#define GGC 0x52
4225#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4226#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4227#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4228#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4229#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4230#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4231#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4232#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4233
4234static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4235{
4236        unsigned short ggc;
4237
4238        if (pci_read_config_word(dev, GGC, &ggc))
4239                return;
4240
4241        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4242                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4243                dmar_map_gfx = 0;
4244        } else if (dmar_map_gfx) {
4245                /* we have to ensure the gfx device is idle before we flush */
4246                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4247                intel_iommu_strict = 1;
4248        }
4249}
4250DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4251DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4252DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4253DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4254
4255/* On Tylersburg chipsets, some BIOSes have been known to enable the
4256   ISOCH DMAR unit for the Azalia sound device, but not give it any
4257   TLB entries, which causes it to deadlock. Check for that.  We do
4258   this in a function called from init_dmars(), instead of in a PCI
4259   quirk, because we don't want to print the obnoxious "BIOS broken"
4260   message if VT-d is actually disabled.
4261*/
4262static void __init check_tylersburg_isoch(void)
4263{
4264        struct pci_dev *pdev;
4265        uint32_t vtisochctrl;
4266
4267        /* If there's no Azalia in the system anyway, forget it. */
4268        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4269        if (!pdev)
4270                return;
4271        pci_dev_put(pdev);
4272
4273        /* System Management Registers. Might be hidden, in which case
4274           we can't do the sanity check. But that's OK, because the
4275           known-broken BIOSes _don't_ actually hide it, so far. */
4276        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4277        if (!pdev)
4278                return;
4279
4280        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4281                pci_dev_put(pdev);
4282                return;
4283        }
4284
4285        pci_dev_put(pdev);
4286
4287        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4288        if (vtisochctrl & 1)
4289                return;
4290
4291        /* Drop all bits other than the number of TLB entries */
4292        vtisochctrl &= 0x1c;
4293
4294        /* If we have the recommended number of TLB entries (16), fine. */
4295        if (vtisochctrl == 0x10)
4296                return;
4297
4298        /* Zero TLB entries? You get to ride the short bus to school. */
4299        if (!vtisochctrl) {
4300                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4301                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4302                     dmi_get_system_info(DMI_BIOS_VENDOR),
4303                     dmi_get_system_info(DMI_BIOS_VERSION),
4304                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4305                iommu_identity_mapping |= IDENTMAP_AZALIA;
4306                return;
4307        }
4308
4309        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4310               vtisochctrl);
4311}
4312