linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#include "irq_remapping.h"
  50#include "pci.h"
  51
  52#define ROOT_SIZE               VTD_PAGE_SIZE
  53#define CONTEXT_SIZE            VTD_PAGE_SIZE
  54
  55#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  56#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  57#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  58
  59#define IOAPIC_RANGE_START      (0xfee00000)
  60#define IOAPIC_RANGE_END        (0xfeefffff)
  61#define IOVA_START_ADDR         (0x1000)
  62
  63#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  64
  65#define MAX_AGAW_WIDTH 64
  66
  67#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  68#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  69
  70/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  71   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  72#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  73                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  74#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  75
  76#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  77#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  78#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  79
  80/* page table handling */
  81#define LEVEL_STRIDE            (9)
  82#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  83
  84/*
   85 * This bitmap is used to advertise the page sizes our hardware supports
  86 * to the IOMMU core, which will then use this information to split
  87 * physically contiguous memory regions it is mapping into page sizes
  88 * that we support.
  89 *
  90 * Traditionally the IOMMU core just handed us the mappings directly,
   91 * after making sure the size is a power-of-two multiple of 4KiB and that the
  92 * mapping has natural alignment.
  93 *
  94 * To retain this behavior, we currently advertise that we support
   95 * all page sizes that are a power-of-two multiple of 4KiB.
  96 *
  97 * If at some point we'd like to utilize the IOMMU core's new behavior,
  98 * we could change this to advertise the real page sizes we support.
  99 */
 100#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
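     /*
      * For illustration (in terms of the SZ_* constants from <linux/sizes.h>):
      * ~0xFFFUL clears bits 0-11 and sets every higher bit, so this bitmap
      * advertises every power-of-two size from 4KiB upwards, e.g.
      *
      *   INTEL_IOMMU_PGSIZES & SZ_4K   bit 12 set -> 4KiB supported
      *   INTEL_IOMMU_PGSIZES & SZ_2M   bit 21 set -> 2MiB supported
      *   INTEL_IOMMU_PGSIZES & SZ_1G   bit 30 set -> 1GiB supported
      */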
 101
 102static inline int agaw_to_level(int agaw)
 103{
 104        return agaw + 2;
 105}
 106
 107static inline int agaw_to_width(int agaw)
 108{
 109        return 30 + agaw * LEVEL_STRIDE;
 110}
 111
 112static inline int width_to_agaw(int width)
 113{
 114        return (width - 30) / LEVEL_STRIDE;
 115}
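     /*
      * Worked example of the agaw/width/level helpers above (illustrative):
      * each page-table level resolves LEVEL_STRIDE = 9 bits on top of the
      * 12-bit page offset, so
      *
      *   agaw 0 -> 30-bit width, 2-level table (1GiB of IOVA space)
      *   agaw 1 -> 39-bit width, 3-level table
      *   agaw 2 -> 48-bit width, 4-level table (DEFAULT_DOMAIN_ADDRESS_WIDTH)
      *   agaw 3 -> 57-bit width, 5-level table
      */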
 116
 117static inline unsigned int level_to_offset_bits(int level)
 118{
 119        return (level - 1) * LEVEL_STRIDE;
 120}
 121
 122static inline int pfn_level_offset(unsigned long pfn, int level)
 123{
 124        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 125}
 126
 127static inline unsigned long level_mask(int level)
 128{
 129        return -1UL << level_to_offset_bits(level);
 130}
 131
 132static inline unsigned long level_size(int level)
 133{
 134        return 1UL << level_to_offset_bits(level);
 135}
 136
 137static inline unsigned long align_to_level(unsigned long pfn, int level)
 138{
 139        return (pfn + level_size(level) - 1) & level_mask(level);
 140}
 141
 142static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 143{
 144        return  1 << ((lvl - 1) * LEVEL_STRIDE);
 145}
 146
  147/* VT-d pages must never be _larger_ than MM pages. Otherwise the pfn
  148   conversions below are never going to work. */
 149static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 150{
 151        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 152}
 153
 154static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 155{
 156        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 157}
 158static inline unsigned long page_to_dma_pfn(struct page *pg)
 159{
 160        return mm_to_dma_pfn(page_to_pfn(pg));
 161}
 162static inline unsigned long virt_to_dma_pfn(void *p)
 163{
 164        return page_to_dma_pfn(virt_to_page(p));
 165}
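     /*
      * Illustrative note: on x86 with 4KiB kernel pages, PAGE_SHIFT and
      * VTD_PAGE_SHIFT are both 12, so the conversions above shift by zero
      * and DMA pfns equal MM pfns; the helpers only do real work when the
      * kernel page size is larger than the 4KiB VT-d page size.
      */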
 166
 167/* global iommu list, set NULL for ignored DMAR units */
 168static struct intel_iommu **g_iommus;
 169
 170static void __init check_tylersburg_isoch(void);
 171static int rwbf_quirk;
 172
 173/*
  174 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
  175 * (used when the kernel is launched with TXT)
 176 */
 177static int force_on = 0;
 178
 179/*
 180 * 0: Present
 181 * 1-11: Reserved
 182 * 12-63: Context Ptr (12 - (haw-1))
 183 * 64-127: Reserved
 184 */
 185struct root_entry {
 186        u64     val;
 187        u64     rsvd1;
 188};
 189#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
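     /*
      * For illustration: a root entry is 16 bytes, so one 4KiB root table
      * holds ROOT_ENTRY_NR = 4096 / 16 = 256 entries, one per PCI bus
      * number; device_to_context_entry() below indexes it as
      * iommu->root_entry[bus].
      */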
 190static inline bool root_present(struct root_entry *root)
 191{
 192        return (root->val & 1);
 193}
 194static inline void set_root_present(struct root_entry *root)
 195{
 196        root->val |= 1;
 197}
 198static inline void set_root_value(struct root_entry *root, unsigned long value)
 199{
 200        root->val |= value & VTD_PAGE_MASK;
 201}
 202
 203static inline struct context_entry *
 204get_context_addr_from_root(struct root_entry *root)
 205{
 206        return (struct context_entry *)
 207                (root_present(root)?phys_to_virt(
 208                root->val & VTD_PAGE_MASK) :
 209                NULL);
 210}
 211
 212/*
 213 * low 64 bits:
 214 * 0: present
 215 * 1: fault processing disable
 216 * 2-3: translation type
 217 * 12-63: address space root
 218 * high 64 bits:
 219 * 0-2: address width
  220 * 3-6: avail
 221 * 8-23: domain id
 222 */
 223struct context_entry {
 224        u64 lo;
 225        u64 hi;
 226};
 227
 228static inline bool context_present(struct context_entry *context)
 229{
 230        return (context->lo & 1);
 231}
 232static inline void context_set_present(struct context_entry *context)
 233{
 234        context->lo |= 1;
 235}
 236
 237static inline void context_set_fault_enable(struct context_entry *context)
 238{
 239        context->lo &= (((u64)-1) << 2) | 1;
 240}
 241
 242static inline void context_set_translation_type(struct context_entry *context,
 243                                                unsigned long value)
 244{
 245        context->lo &= (((u64)-1) << 4) | 3;
 246        context->lo |= (value & 3) << 2;
 247}
 248
 249static inline void context_set_address_root(struct context_entry *context,
 250                                            unsigned long value)
 251{
 252        context->lo |= value & VTD_PAGE_MASK;
 253}
 254
 255static inline void context_set_address_width(struct context_entry *context,
 256                                             unsigned long value)
 257{
 258        context->hi |= value & 7;
 259}
 260
 261static inline void context_set_domain_id(struct context_entry *context,
 262                                         unsigned long value)
 263{
 264        context->hi |= (value & ((1 << 16) - 1)) << 8;
 265}
 266
 267static inline void context_clear_entry(struct context_entry *context)
 268{
 269        context->lo = 0;
 270        context->hi = 0;
 271}
 272
 273/*
 274 * 0: readable
 275 * 1: writable
 276 * 2-6: reserved
 277 * 7: super page
 278 * 8-10: available
 279 * 11: snoop behavior
  280 * 12-63: Host physical address
 281 */
 282struct dma_pte {
 283        u64 val;
 284};
 285
 286static inline void dma_clear_pte(struct dma_pte *pte)
 287{
 288        pte->val = 0;
 289}
 290
 291static inline void dma_set_pte_readable(struct dma_pte *pte)
 292{
 293        pte->val |= DMA_PTE_READ;
 294}
 295
 296static inline void dma_set_pte_writable(struct dma_pte *pte)
 297{
 298        pte->val |= DMA_PTE_WRITE;
 299}
 300
 301static inline void dma_set_pte_snp(struct dma_pte *pte)
 302{
 303        pte->val |= DMA_PTE_SNP;
 304}
 305
 306static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 307{
 308        pte->val = (pte->val & ~3) | (prot & 3);
 309}
 310
 311static inline u64 dma_pte_addr(struct dma_pte *pte)
 312{
 313#ifdef CONFIG_64BIT
 314        return pte->val & VTD_PAGE_MASK;
 315#else
 316        /* Must have a full atomic 64-bit read */
 317        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 318#endif
 319}
 320
 321static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 322{
 323        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 324}
 325
 326static inline bool dma_pte_present(struct dma_pte *pte)
 327{
 328        return (pte->val & 3) != 0;
 329}
 330
 331static inline bool dma_pte_superpage(struct dma_pte *pte)
 332{
 333        return (pte->val & (1 << 7));
 334}
 335
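     /*
      * A page-table page holds 512 dma_pte entries (4KiB / 8 bytes); a pte
      * pointer whose offset within its VT-d page is zero is therefore the
      * first pte of that page-table page.
      */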
 336static inline int first_pte_in_page(struct dma_pte *pte)
 337{
 338        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 339}
 340
 341/*
  342 * This domain is a static identity-mapping domain.
  343 *      1. This domain creates a static 1:1 mapping to all usable memory.
  344 *      2. It maps to each iommu if successful.
  345 *      3. Each iommu maps to this domain if successful.
 346 */
 347static struct dmar_domain *si_domain;
 348static int hw_pass_through = 1;
 349
 350/* devices under the same p2p bridge are owned in one domain */
 351#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 352
  353/* domain represents a virtual machine; more than one device
  354 * across iommus may be owned by one domain, e.g. a kvm guest.
 355 */
 356#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 357
  358/* si_domain contains multiple devices */
 359#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 360
 361/* define the limit of IOMMUs supported in each domain */
 362#ifdef  CONFIG_X86
 363# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 364#else
 365# define        IOMMU_UNITS_SUPPORTED   64
 366#endif
 367
 368struct dmar_domain {
 369        int     id;                     /* domain id */
 370        int     nid;                    /* node id */
 371        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
  372                                        /* bitmap of iommus this domain uses */
 373
 374        struct list_head devices;       /* all devices' list */
 375        struct iova_domain iovad;       /* iova's that belong to this domain */
 376
 377        struct dma_pte  *pgd;           /* virtual address */
 378        int             gaw;            /* max guest address width */
 379
 380        /* adjusted guest address width, 0 is level 2 30-bit */
 381        int             agaw;
 382
 383        int             flags;          /* flags to find out type of domain */
 384
 385        int             iommu_coherency;/* indicate coherency of iommu access */
  386        int             iommu_snooping; /* indicate snooping control feature */
 387        int             iommu_count;    /* reference count of iommu */
 388        int             iommu_superpage;/* Level of superpages supported:
 389                                           0 == 4KiB (no superpages), 1 == 2MiB,
  390                                           2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
 391        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 392        u64             max_addr;       /* maximum mapped address */
 393};
 394
 395/* PCI domain-device relationship */
 396struct device_domain_info {
 397        struct list_head link;  /* link to domain siblings */
 398        struct list_head global; /* link to global list */
 399        int segment;            /* PCI domain */
 400        u8 bus;                 /* PCI bus number */
 401        u8 devfn;               /* PCI devfn number */
 402        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 403        struct intel_iommu *iommu; /* IOMMU used by this device */
 404        struct dmar_domain *domain; /* pointer to domain */
 405};
 406
 407static void flush_unmaps_timeout(unsigned long data);
 408
 409DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 410
 411#define HIGH_WATER_MARK 250
 412struct deferred_flush_tables {
 413        int next;
 414        struct iova *iova[HIGH_WATER_MARK];
 415        struct dmar_domain *domain[HIGH_WATER_MARK];
 416};
 417
 418static struct deferred_flush_tables *deferred_flush;
 419
  420/* number of iommus, used for sizing and indexing g_iommus */
 421static int g_num_of_iommus;
 422
 423static DEFINE_SPINLOCK(async_umap_flush_lock);
 424static LIST_HEAD(unmaps_to_do);
 425
 426static int timer_on;
 427static long list_size;
 428
 429static void domain_remove_dev_info(struct dmar_domain *domain);
 430
 431#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 432int dmar_disabled = 0;
 433#else
 434int dmar_disabled = 1;
 435#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 436
 437int intel_iommu_enabled = 0;
 438EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 439
 440static int dmar_map_gfx = 1;
 441static int dmar_forcedac;
 442static int intel_iommu_strict;
 443static int intel_iommu_superpage = 1;
 444
 445int intel_iommu_gfx_mapped;
 446EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 447
 448#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 449static DEFINE_SPINLOCK(device_domain_lock);
 450static LIST_HEAD(device_domain_list);
 451
 452static struct iommu_ops intel_iommu_ops;
 453
 454static int __init intel_iommu_setup(char *str)
 455{
 456        if (!str)
 457                return -EINVAL;
 458        while (*str) {
 459                if (!strncmp(str, "on", 2)) {
 460                        dmar_disabled = 0;
 461                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 462                } else if (!strncmp(str, "off", 3)) {
 463                        dmar_disabled = 1;
 464                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 465                } else if (!strncmp(str, "igfx_off", 8)) {
 466                        dmar_map_gfx = 0;
 467                        printk(KERN_INFO
 468                                "Intel-IOMMU: disable GFX device mapping\n");
 469                } else if (!strncmp(str, "forcedac", 8)) {
 470                        printk(KERN_INFO
 471                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 472                        dmar_forcedac = 1;
 473                } else if (!strncmp(str, "strict", 6)) {
 474                        printk(KERN_INFO
 475                                "Intel-IOMMU: disable batched IOTLB flush\n");
 476                        intel_iommu_strict = 1;
 477                } else if (!strncmp(str, "sp_off", 6)) {
 478                        printk(KERN_INFO
 479                                "Intel-IOMMU: disable supported super page\n");
 480                        intel_iommu_superpage = 0;
 481                }
 482
 483                str += strcspn(str, ",");
 484                while (*str == ',')
 485                        str++;
 486        }
 487        return 0;
 488}
 489__setup("intel_iommu=", intel_iommu_setup);
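     /*
      * Usage sketch based on the options parsed above; the parser splits on
      * commas, so several options can be combined on the kernel command line:
      *
      *   intel_iommu=on,strict            enable VT-d, unbatched IOTLB flushes
      *   intel_iommu=on,igfx_off,sp_off   enable VT-d, skip the GFX device,
      *                                    don't use superpages
      */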
 490
 491static struct kmem_cache *iommu_domain_cache;
 492static struct kmem_cache *iommu_devinfo_cache;
 493static struct kmem_cache *iommu_iova_cache;
 494
 495static inline void *alloc_pgtable_page(int node)
 496{
 497        struct page *page;
 498        void *vaddr = NULL;
 499
 500        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 501        if (page)
 502                vaddr = page_address(page);
 503        return vaddr;
 504}
 505
 506static inline void free_pgtable_page(void *vaddr)
 507{
 508        free_page((unsigned long)vaddr);
 509}
 510
 511static inline void *alloc_domain_mem(void)
 512{
 513        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 514}
 515
 516static void free_domain_mem(void *vaddr)
 517{
 518        kmem_cache_free(iommu_domain_cache, vaddr);
 519}
 520
  521static inline void *alloc_devinfo_mem(void)
 522{
 523        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 524}
 525
 526static inline void free_devinfo_mem(void *vaddr)
 527{
 528        kmem_cache_free(iommu_devinfo_cache, vaddr);
 529}
 530
 531struct iova *alloc_iova_mem(void)
 532{
 533        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 534}
 535
 536void free_iova_mem(struct iova *iova)
 537{
 538        kmem_cache_free(iommu_iova_cache, iova);
 539}
 540
 541
 542static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 543{
 544        unsigned long sagaw;
 545        int agaw = -1;
 546
 547        sagaw = cap_sagaw(iommu->cap);
 548        for (agaw = width_to_agaw(max_gaw);
 549             agaw >= 0; agaw--) {
 550                if (test_bit(agaw, &sagaw))
 551                        break;
 552        }
 553
 554        return agaw;
 555}
 556
 557/*
 558 * Calculate max SAGAW for each iommu.
 559 */
 560int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 561{
 562        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 563}
 564
 565/*
  566 * Calculate agaw for each iommu.
  567 * "SAGAW" may be different across iommus; use a default agaw, and fall
  568 * back to a smaller supported agaw for iommus that don't support it.
 569 */
 570int iommu_calculate_agaw(struct intel_iommu *iommu)
 571{
 572        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 573}
 574
  575/* This function only returns a single iommu in a domain */
 576static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 577{
 578        int iommu_id;
 579
 580        /* si_domain and vm domain should not get here. */
 581        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 582        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 583
 584        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 585        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 586                return NULL;
 587
 588        return g_iommus[iommu_id];
 589}
 590
 591static void domain_update_iommu_coherency(struct dmar_domain *domain)
 592{
 593        int i;
 594
 595        i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 596
 597        domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
 598
 599        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 600                if (!ecap_coherent(g_iommus[i]->ecap)) {
 601                        domain->iommu_coherency = 0;
 602                        break;
 603                }
 604        }
 605}
 606
 607static void domain_update_iommu_snooping(struct dmar_domain *domain)
 608{
 609        int i;
 610
 611        domain->iommu_snooping = 1;
 612
 613        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 614                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 615                        domain->iommu_snooping = 0;
 616                        break;
 617                }
 618        }
 619}
 620
 621static void domain_update_iommu_superpage(struct dmar_domain *domain)
 622{
 623        struct dmar_drhd_unit *drhd;
 624        struct intel_iommu *iommu = NULL;
 625        int mask = 0xf;
 626
 627        if (!intel_iommu_superpage) {
 628                domain->iommu_superpage = 0;
 629                return;
 630        }
 631
 632        /* set iommu_superpage to the smallest common denominator */
 633        for_each_active_iommu(iommu, drhd) {
 634                mask &= cap_super_page_val(iommu->cap);
 635                if (!mask) {
 636                        break;
 637                }
 638        }
 639        domain->iommu_superpage = fls(mask);
 640}
 641
 642/* Some capabilities may be different across iommus */
 643static void domain_update_iommu_cap(struct dmar_domain *domain)
 644{
 645        domain_update_iommu_coherency(domain);
 646        domain_update_iommu_snooping(domain);
 647        domain_update_iommu_superpage(domain);
 648}
 649
 650static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 651{
 652        struct dmar_drhd_unit *drhd = NULL;
 653        int i;
 654
 655        for_each_drhd_unit(drhd) {
 656                if (drhd->ignored)
 657                        continue;
 658                if (segment != drhd->segment)
 659                        continue;
 660
 661                for (i = 0; i < drhd->devices_cnt; i++) {
 662                        if (drhd->devices[i] &&
 663                            drhd->devices[i]->bus->number == bus &&
 664                            drhd->devices[i]->devfn == devfn)
 665                                return drhd->iommu;
 666                        if (drhd->devices[i] &&
 667                            drhd->devices[i]->subordinate &&
 668                            drhd->devices[i]->subordinate->number <= bus &&
 669                            drhd->devices[i]->subordinate->busn_res.end >= bus)
 670                                return drhd->iommu;
 671                }
 672
 673                if (drhd->include_all)
 674                        return drhd->iommu;
 675        }
 676
 677        return NULL;
 678}
 679
 680static void domain_flush_cache(struct dmar_domain *domain,
 681                               void *addr, int size)
 682{
 683        if (!domain->iommu_coherency)
 684                clflush_cache_range(addr, size);
 685}
 686
 687/* Gets context entry for a given bus and devfn */
 688static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 689                u8 bus, u8 devfn)
 690{
 691        struct root_entry *root;
 692        struct context_entry *context;
 693        unsigned long phy_addr;
 694        unsigned long flags;
 695
 696        spin_lock_irqsave(&iommu->lock, flags);
 697        root = &iommu->root_entry[bus];
 698        context = get_context_addr_from_root(root);
 699        if (!context) {
 700                context = (struct context_entry *)
 701                                alloc_pgtable_page(iommu->node);
 702                if (!context) {
 703                        spin_unlock_irqrestore(&iommu->lock, flags);
 704                        return NULL;
 705                }
 706                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 707                phy_addr = virt_to_phys((void *)context);
 708                set_root_value(root, phy_addr);
 709                set_root_present(root);
 710                __iommu_flush_cache(iommu, root, sizeof(*root));
 711        }
 712        spin_unlock_irqrestore(&iommu->lock, flags);
 713        return &context[devfn];
 714}
 715
 716static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 717{
 718        struct root_entry *root;
 719        struct context_entry *context;
 720        int ret;
 721        unsigned long flags;
 722
 723        spin_lock_irqsave(&iommu->lock, flags);
 724        root = &iommu->root_entry[bus];
 725        context = get_context_addr_from_root(root);
 726        if (!context) {
 727                ret = 0;
 728                goto out;
 729        }
 730        ret = context_present(&context[devfn]);
 731out:
 732        spin_unlock_irqrestore(&iommu->lock, flags);
 733        return ret;
 734}
 735
 736static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 737{
 738        struct root_entry *root;
 739        struct context_entry *context;
 740        unsigned long flags;
 741
 742        spin_lock_irqsave(&iommu->lock, flags);
 743        root = &iommu->root_entry[bus];
 744        context = get_context_addr_from_root(root);
 745        if (context) {
 746                context_clear_entry(&context[devfn]);
  747                __iommu_flush_cache(iommu, &context[devfn],
  748                        sizeof(*context));
 749        }
 750        spin_unlock_irqrestore(&iommu->lock, flags);
 751}
 752
 753static void free_context_table(struct intel_iommu *iommu)
 754{
 755        struct root_entry *root;
 756        int i;
 757        unsigned long flags;
 758        struct context_entry *context;
 759
 760        spin_lock_irqsave(&iommu->lock, flags);
 761        if (!iommu->root_entry) {
 762                goto out;
 763        }
 764        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 765                root = &iommu->root_entry[i];
 766                context = get_context_addr_from_root(root);
 767                if (context)
 768                        free_pgtable_page(context);
 769        }
 770        free_pgtable_page(iommu->root_entry);
 771        iommu->root_entry = NULL;
 772out:
 773        spin_unlock_irqrestore(&iommu->lock, flags);
 774}
 775
 776static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 777                                      unsigned long pfn, int target_level)
 778{
 779        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 780        struct dma_pte *parent, *pte = NULL;
 781        int level = agaw_to_level(domain->agaw);
 782        int offset;
 783
 784        BUG_ON(!domain->pgd);
 785        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 786        parent = domain->pgd;
 787
 788        while (level > 0) {
 789                void *tmp_page;
 790
 791                offset = pfn_level_offset(pfn, level);
 792                pte = &parent[offset];
 793                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 794                        break;
 795                if (level == target_level)
 796                        break;
 797
 798                if (!dma_pte_present(pte)) {
 799                        uint64_t pteval;
 800
 801                        tmp_page = alloc_pgtable_page(domain->nid);
 802
 803                        if (!tmp_page)
 804                                return NULL;
 805
 806                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 807                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 808                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 809                                /* Someone else set it while we were thinking; use theirs. */
 810                                free_pgtable_page(tmp_page);
 811                        } else {
 812                                dma_pte_addr(pte);
 813                                domain_flush_cache(domain, pte, sizeof(*pte));
 814                        }
 815                }
 816                parent = phys_to_virt(dma_pte_addr(pte));
 817                level--;
 818        }
 819
 820        return pte;
 821}
 822
 823
 824/* return address's pte at specific level */
 825static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 826                                         unsigned long pfn,
 827                                         int level, int *large_page)
 828{
 829        struct dma_pte *parent, *pte = NULL;
 830        int total = agaw_to_level(domain->agaw);
 831        int offset;
 832
 833        parent = domain->pgd;
 834        while (level <= total) {
 835                offset = pfn_level_offset(pfn, total);
 836                pte = &parent[offset];
 837                if (level == total)
 838                        return pte;
 839
 840                if (!dma_pte_present(pte)) {
 841                        *large_page = total;
 842                        break;
 843                }
 844
 845                if (pte->val & DMA_PTE_LARGE_PAGE) {
 846                        *large_page = total;
 847                        return pte;
 848                }
 849
 850                parent = phys_to_virt(dma_pte_addr(pte));
 851                total--;
 852        }
 853        return NULL;
 854}
 855
  856/* clear last level pte; a tlb flush should follow */
 857static int dma_pte_clear_range(struct dmar_domain *domain,
 858                                unsigned long start_pfn,
 859                                unsigned long last_pfn)
 860{
 861        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 862        unsigned int large_page = 1;
 863        struct dma_pte *first_pte, *pte;
 864        int order;
 865
 866        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 867        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 868        BUG_ON(start_pfn > last_pfn);
 869
 870        /* we don't need lock here; nobody else touches the iova range */
 871        do {
 872                large_page = 1;
 873                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 874                if (!pte) {
 875                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 876                        continue;
 877                }
 878                do {
 879                        dma_clear_pte(pte);
 880                        start_pfn += lvl_to_nr_pages(large_page);
 881                        pte++;
 882                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 883
 884                domain_flush_cache(domain, first_pte,
 885                                   (void *)pte - (void *)first_pte);
 886
 887        } while (start_pfn && start_pfn <= last_pfn);
 888
 889        order = (large_page - 1) * 9;
 890        return order;
 891}
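     /*
      * For illustration: the returned order reflects the last page size seen
      * while clearing.  With ordinary 4KiB ptes large_page stays 1 and the
      * order is 0; if a 2MiB superpage pte was cleared last (large_page == 2)
      * the order is 9, i.e. 512 VT-d pages.
      */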
 892
 893static void dma_pte_free_level(struct dmar_domain *domain, int level,
 894                               struct dma_pte *pte, unsigned long pfn,
 895                               unsigned long start_pfn, unsigned long last_pfn)
 896{
 897        pfn = max(start_pfn, pfn);
 898        pte = &pte[pfn_level_offset(pfn, level)];
 899
 900        do {
 901                unsigned long level_pfn;
 902                struct dma_pte *level_pte;
 903
 904                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
 905                        goto next;
 906
 907                level_pfn = pfn & level_mask(level - 1);
 908                level_pte = phys_to_virt(dma_pte_addr(pte));
 909
 910                if (level > 2)
 911                        dma_pte_free_level(domain, level - 1, level_pte,
 912                                           level_pfn, start_pfn, last_pfn);
 913
 914                /* If range covers entire pagetable, free it */
 915                if (!(start_pfn > level_pfn ||
 916                      last_pfn < level_pfn + level_size(level))) {
 917                        dma_clear_pte(pte);
 918                        domain_flush_cache(domain, pte, sizeof(*pte));
 919                        free_pgtable_page(level_pte);
 920                }
 921next:
 922                pfn += level_size(level);
 923        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
 924}
 925
 926/* free page table pages. last level pte should already be cleared */
 927static void dma_pte_free_pagetable(struct dmar_domain *domain,
 928                                   unsigned long start_pfn,
 929                                   unsigned long last_pfn)
 930{
 931        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 932
 933        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 934        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 935        BUG_ON(start_pfn > last_pfn);
 936
 937        /* We don't need lock here; nobody else touches the iova range */
 938        dma_pte_free_level(domain, agaw_to_level(domain->agaw),
 939                           domain->pgd, 0, start_pfn, last_pfn);
 940
 941        /* free pgd */
 942        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 943                free_pgtable_page(domain->pgd);
 944                domain->pgd = NULL;
 945        }
 946}
 947
 948/* iommu handling */
 949static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 950{
 951        struct root_entry *root;
 952        unsigned long flags;
 953
 954        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 955        if (!root)
 956                return -ENOMEM;
 957
 958        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 959
 960        spin_lock_irqsave(&iommu->lock, flags);
 961        iommu->root_entry = root;
 962        spin_unlock_irqrestore(&iommu->lock, flags);
 963
 964        return 0;
 965}
 966
 967static void iommu_set_root_entry(struct intel_iommu *iommu)
 968{
 969        void *addr;
 970        u32 sts;
 971        unsigned long flag;
 972
 973        addr = iommu->root_entry;
 974
 975        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 976        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 977
 978        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 979
  980        /* Make sure the hardware completes it */
 981        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 982                      readl, (sts & DMA_GSTS_RTPS), sts);
 983
 984        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 985}
 986
 987static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 988{
 989        u32 val;
 990        unsigned long flag;
 991
 992        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 993                return;
 994
 995        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 996        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 997
  998        /* Make sure the hardware completes it */
 999        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1000                      readl, (!(val & DMA_GSTS_WBFS)), val);
1001
1002        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1003}
1004
 1005/* return value determines if we need a write buffer flush */
1006static void __iommu_flush_context(struct intel_iommu *iommu,
1007                                  u16 did, u16 source_id, u8 function_mask,
1008                                  u64 type)
1009{
1010        u64 val = 0;
1011        unsigned long flag;
1012
1013        switch (type) {
1014        case DMA_CCMD_GLOBAL_INVL:
1015                val = DMA_CCMD_GLOBAL_INVL;
1016                break;
1017        case DMA_CCMD_DOMAIN_INVL:
1018                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1019                break;
1020        case DMA_CCMD_DEVICE_INVL:
1021                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1022                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1023                break;
1024        default:
1025                BUG();
1026        }
1027        val |= DMA_CCMD_ICC;
1028
1029        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1030        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1031
 1032        /* Make sure the hardware completes it */
1033        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1034                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1035
1036        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1037}
1038
 1039/* return value determines if we need a write buffer flush */
1040static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1041                                u64 addr, unsigned int size_order, u64 type)
1042{
1043        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1044        u64 val = 0, val_iva = 0;
1045        unsigned long flag;
1046
1047        switch (type) {
1048        case DMA_TLB_GLOBAL_FLUSH:
 1049                /* global flush doesn't need to set IVA_REG */
1050                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1051                break;
1052        case DMA_TLB_DSI_FLUSH:
1053                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054                break;
1055        case DMA_TLB_PSI_FLUSH:
1056                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1057                /* Note: always flush non-leaf currently */
1058                val_iva = size_order | addr;
1059                break;
1060        default:
1061                BUG();
1062        }
1063        /* Note: set drain read/write */
1064#if 0
1065        /*
 1066         * This is probably just to be extra safe; it looks like we can
 1067         * ignore it without any impact.
1068         */
1069        if (cap_read_drain(iommu->cap))
1070                val |= DMA_TLB_READ_DRAIN;
1071#endif
1072        if (cap_write_drain(iommu->cap))
1073                val |= DMA_TLB_WRITE_DRAIN;
1074
1075        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1076        /* Note: Only uses first TLB reg currently */
1077        if (val_iva)
1078                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1079        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1080
 1081        /* Make sure the hardware completes it */
1082        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1083                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1084
1085        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086
1087        /* check IOTLB invalidation granularity */
1088        if (DMA_TLB_IAIG(val) == 0)
 1089                printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1090        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1091                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1092                        (unsigned long long)DMA_TLB_IIRG(type),
1093                        (unsigned long long)DMA_TLB_IAIG(val));
1094}
1095
1096static struct device_domain_info *iommu_support_dev_iotlb(
1097        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1098{
1099        int found = 0;
1100        unsigned long flags;
1101        struct device_domain_info *info;
1102        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1103
1104        if (!ecap_dev_iotlb_support(iommu->ecap))
1105                return NULL;
1106
1107        if (!iommu->qi)
1108                return NULL;
1109
1110        spin_lock_irqsave(&device_domain_lock, flags);
1111        list_for_each_entry(info, &domain->devices, link)
1112                if (info->bus == bus && info->devfn == devfn) {
1113                        found = 1;
1114                        break;
1115                }
1116        spin_unlock_irqrestore(&device_domain_lock, flags);
1117
1118        if (!found || !info->dev)
1119                return NULL;
1120
1121        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1122                return NULL;
1123
1124        if (!dmar_find_matched_atsr_unit(info->dev))
1125                return NULL;
1126
1127        info->iommu = iommu;
1128
1129        return info;
1130}
1131
1132static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1133{
1134        if (!info)
1135                return;
1136
1137        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1138}
1139
1140static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1141{
1142        if (!info->dev || !pci_ats_enabled(info->dev))
1143                return;
1144
1145        pci_disable_ats(info->dev);
1146}
1147
1148static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1149                                  u64 addr, unsigned mask)
1150{
1151        u16 sid, qdep;
1152        unsigned long flags;
1153        struct device_domain_info *info;
1154
1155        spin_lock_irqsave(&device_domain_lock, flags);
1156        list_for_each_entry(info, &domain->devices, link) {
1157                if (!info->dev || !pci_ats_enabled(info->dev))
1158                        continue;
1159
1160                sid = info->bus << 8 | info->devfn;
1161                qdep = pci_ats_queue_depth(info->dev);
1162                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1163        }
1164        spin_unlock_irqrestore(&device_domain_lock, flags);
1165}
1166
1167static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1168                                  unsigned long pfn, unsigned int pages, int map)
1169{
1170        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1171        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1172
1173        BUG_ON(pages == 0);
1174
1175        /*
 1176         * Fall back to domain-selective flush if there is no PSI support or
 1177         * the size is too big.
 1178         * PSI requires the page size to be 2^x, and the base address to be
 1179         * naturally aligned to that size.
1180         */
1181        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1182                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1183                                                DMA_TLB_DSI_FLUSH);
1184        else
1185                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1186                                                DMA_TLB_PSI_FLUSH);
1187
1188        /*
 1189         * In caching mode, changes of pages from non-present to present require
 1190         * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1191         */
1192        if (!cap_caching_mode(iommu->cap) || !map)
1193                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1194}
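     /*
      * Worked example for the PSI path above (illustrative): flushing
      * pages = 3 gives mask = ilog2(__roundup_pow_of_two(3)) = 2, i.e. an
      * aligned block of four VT-d pages; if mask exceeds
      * cap_max_amask_val(), the code falls back to the domain-selective
      * flush instead.
      */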
1195
1196static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1197{
1198        u32 pmen;
1199        unsigned long flags;
1200
1201        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1202        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1203        pmen &= ~DMA_PMEN_EPM;
1204        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1205
1206        /* wait for the protected region status bit to clear */
1207        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1208                readl, !(pmen & DMA_PMEN_PRS), pmen);
1209
1210        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1211}
1212
1213static int iommu_enable_translation(struct intel_iommu *iommu)
1214{
1215        u32 sts;
1216        unsigned long flags;
1217
1218        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1219        iommu->gcmd |= DMA_GCMD_TE;
1220        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1221
 1222        /* Make sure the hardware completes it */
1223        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1224                      readl, (sts & DMA_GSTS_TES), sts);
1225
1226        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1227        return 0;
1228}
1229
1230static int iommu_disable_translation(struct intel_iommu *iommu)
1231{
1232        u32 sts;
1233        unsigned long flag;
1234
1235        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1236        iommu->gcmd &= ~DMA_GCMD_TE;
1237        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1238
 1239        /* Make sure the hardware completes it */
1240        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1241                      readl, (!(sts & DMA_GSTS_TES)), sts);
1242
1243        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1244        return 0;
1245}
1246
1247
1248static int iommu_init_domains(struct intel_iommu *iommu)
1249{
1250        unsigned long ndomains;
1251        unsigned long nlongs;
1252
1253        ndomains = cap_ndoms(iommu->cap);
1254        pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1255                        ndomains);
1256        nlongs = BITS_TO_LONGS(ndomains);
1257
1258        spin_lock_init(&iommu->lock);
1259
1260        /* TBD: there might be 64K domains,
 1261         * consider a different allocation scheme for future chips
1262         */
1263        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1264        if (!iommu->domain_ids) {
1265                printk(KERN_ERR "Allocating domain id array failed\n");
1266                return -ENOMEM;
1267        }
1268        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1269                        GFP_KERNEL);
1270        if (!iommu->domains) {
1271                printk(KERN_ERR "Allocating domain array failed\n");
1272                return -ENOMEM;
1273        }
1274
1275        /*
 1276         * If caching mode is set, then invalid translations are tagged
 1277         * with domain id 0. Hence we need to pre-allocate it.
1278         */
1279        if (cap_caching_mode(iommu->cap))
1280                set_bit(0, iommu->domain_ids);
1281        return 0;
1282}
1283
1284
1285static void domain_exit(struct dmar_domain *domain);
1286static void vm_domain_exit(struct dmar_domain *domain);
1287
1288void free_dmar_iommu(struct intel_iommu *iommu)
1289{
1290        struct dmar_domain *domain;
1291        int i;
1292        unsigned long flags;
1293
1294        if ((iommu->domains) && (iommu->domain_ids)) {
1295                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1296                        domain = iommu->domains[i];
1297                        clear_bit(i, iommu->domain_ids);
1298
1299                        spin_lock_irqsave(&domain->iommu_lock, flags);
1300                        if (--domain->iommu_count == 0) {
1301                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1302                                        vm_domain_exit(domain);
1303                                else
1304                                        domain_exit(domain);
1305                        }
1306                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1307                }
1308        }
1309
1310        if (iommu->gcmd & DMA_GCMD_TE)
1311                iommu_disable_translation(iommu);
1312
1313        if (iommu->irq) {
1314                irq_set_handler_data(iommu->irq, NULL);
1315                /* This will mask the irq */
1316                free_irq(iommu->irq, iommu);
1317                destroy_irq(iommu->irq);
1318        }
1319
1320        kfree(iommu->domains);
1321        kfree(iommu->domain_ids);
1322
1323        g_iommus[iommu->seq_id] = NULL;
1324
1325        /* if all iommus are freed, free g_iommus */
1326        for (i = 0; i < g_num_of_iommus; i++) {
1327                if (g_iommus[i])
1328                        break;
1329        }
1330
1331        if (i == g_num_of_iommus)
1332                kfree(g_iommus);
1333
1334        /* free context mapping */
1335        free_context_table(iommu);
1336}
1337
1338static struct dmar_domain *alloc_domain(void)
1339{
1340        struct dmar_domain *domain;
1341
1342        domain = alloc_domain_mem();
1343        if (!domain)
1344                return NULL;
1345
1346        domain->nid = -1;
1347        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1348        domain->flags = 0;
1349
1350        return domain;
1351}
1352
1353static int iommu_attach_domain(struct dmar_domain *domain,
1354                               struct intel_iommu *iommu)
1355{
1356        int num;
1357        unsigned long ndomains;
1358        unsigned long flags;
1359
1360        ndomains = cap_ndoms(iommu->cap);
1361
1362        spin_lock_irqsave(&iommu->lock, flags);
1363
1364        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1365        if (num >= ndomains) {
1366                spin_unlock_irqrestore(&iommu->lock, flags);
1367                printk(KERN_ERR "IOMMU: no free domain ids\n");
1368                return -ENOMEM;
1369        }
1370
1371        domain->id = num;
1372        set_bit(num, iommu->domain_ids);
1373        set_bit(iommu->seq_id, domain->iommu_bmp);
1374        iommu->domains[num] = domain;
1375        spin_unlock_irqrestore(&iommu->lock, flags);
1376
1377        return 0;
1378}
1379
1380static void iommu_detach_domain(struct dmar_domain *domain,
1381                                struct intel_iommu *iommu)
1382{
1383        unsigned long flags;
1384        int num, ndomains;
1385        int found = 0;
1386
1387        spin_lock_irqsave(&iommu->lock, flags);
1388        ndomains = cap_ndoms(iommu->cap);
1389        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1390                if (iommu->domains[num] == domain) {
1391                        found = 1;
1392                        break;
1393                }
1394        }
1395
1396        if (found) {
1397                clear_bit(num, iommu->domain_ids);
1398                clear_bit(iommu->seq_id, domain->iommu_bmp);
1399                iommu->domains[num] = NULL;
1400        }
1401        spin_unlock_irqrestore(&iommu->lock, flags);
1402}
1403
1404static struct iova_domain reserved_iova_list;
1405static struct lock_class_key reserved_rbtree_key;
1406
1407static int dmar_init_reserved_ranges(void)
1408{
1409        struct pci_dev *pdev = NULL;
1410        struct iova *iova;
1411        int i;
1412
1413        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1414
1415        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1416                &reserved_rbtree_key);
1417
1418        /* IOAPIC ranges shouldn't be accessed by DMA */
1419        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1420                IOVA_PFN(IOAPIC_RANGE_END));
1421        if (!iova) {
1422                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1423                return -ENODEV;
1424        }
1425
1426        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1427        for_each_pci_dev(pdev) {
1428                struct resource *r;
1429
1430                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1431                        r = &pdev->resource[i];
1432                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1433                                continue;
1434                        iova = reserve_iova(&reserved_iova_list,
1435                                            IOVA_PFN(r->start),
1436                                            IOVA_PFN(r->end));
1437                        if (!iova) {
1438                                printk(KERN_ERR "Reserve iova failed\n");
1439                                return -ENODEV;
1440                        }
1441                }
1442        }
1443        return 0;
1444}
1445
1446static void domain_reserve_special_ranges(struct dmar_domain *domain)
1447{
1448        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1449}
1450
1451static inline int guestwidth_to_adjustwidth(int gaw)
1452{
1453        int agaw;
1454        int r = (gaw - 12) % 9;
1455
1456        if (r == 0)
1457                agaw = gaw;
1458        else
1459                agaw = gaw + 9 - r;
1460        if (agaw > 64)
1461                agaw = 64;
1462        return agaw;
1463}
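     /*
      * Worked examples for the adjustment above (illustrative): the adjusted
      * width must satisfy (gaw - 12) % 9 == 0, i.e. land on a 12 + 9*n
      * boundary:
      *
      *   gaw 48: r = (48 - 12) % 9 = 0 -> agaw 48 (used unchanged)
      *   gaw 40: r = (40 - 12) % 9 = 1 -> agaw 40 + 9 - 1 = 48
      *   gaw 62: r = (62 - 12) % 9 = 5 -> 62 + 9 - 5 = 66, clamped to 64
      */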
1464
1465static int domain_init(struct dmar_domain *domain, int guest_width)
1466{
1467        struct intel_iommu *iommu;
1468        int adjust_width, agaw;
1469        unsigned long sagaw;
1470
1471        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1472        spin_lock_init(&domain->iommu_lock);
1473
1474        domain_reserve_special_ranges(domain);
1475
1476        /* calculate AGAW */
1477        iommu = domain_get_iommu(domain);
1478        if (guest_width > cap_mgaw(iommu->cap))
1479                guest_width = cap_mgaw(iommu->cap);
1480        domain->gaw = guest_width;
1481        adjust_width = guestwidth_to_adjustwidth(guest_width);
1482        agaw = width_to_agaw(adjust_width);
1483        sagaw = cap_sagaw(iommu->cap);
1484        if (!test_bit(agaw, &sagaw)) {
1485                /* hardware doesn't support it, choose a bigger one */
1486                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1487                agaw = find_next_bit(&sagaw, 5, agaw);
1488                if (agaw >= 5)
1489                        return -ENODEV;
1490        }
1491        domain->agaw = agaw;
1492        INIT_LIST_HEAD(&domain->devices);
1493
1494        if (ecap_coherent(iommu->ecap))
1495                domain->iommu_coherency = 1;
1496        else
1497                domain->iommu_coherency = 0;
1498
1499        if (ecap_sc_support(iommu->ecap))
1500                domain->iommu_snooping = 1;
1501        else
1502                domain->iommu_snooping = 0;
1503
1504        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1505        domain->iommu_count = 1;
1506        domain->nid = iommu->node;
1507
1508        /* always allocate the top pgd */
1509        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1510        if (!domain->pgd)
1511                return -ENOMEM;
1512        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1513        return 0;
1514}
1515
1516static void domain_exit(struct dmar_domain *domain)
1517{
1518        struct dmar_drhd_unit *drhd;
1519        struct intel_iommu *iommu;
1520
 1521        /* Domain 0 is reserved, so don't process it */
1522        if (!domain)
1523                return;
1524
1525        /* Flush any lazy unmaps that may reference this domain */
1526        if (!intel_iommu_strict)
1527                flush_unmaps_timeout(0);
1528
1529        domain_remove_dev_info(domain);
1530        /* destroy iovas */
1531        put_iova_domain(&domain->iovad);
1532
1533        /* clear ptes */
1534        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536        /* free page tables */
1537        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1538
1539        for_each_active_iommu(iommu, drhd)
1540                if (test_bit(iommu->seq_id, domain->iommu_bmp))
1541                        iommu_detach_domain(domain, iommu);
1542
1543        free_domain_mem(domain);
1544}
1545
1546static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1547                                 u8 bus, u8 devfn, int translation)
1548{
1549        struct context_entry *context;
1550        unsigned long flags;
1551        struct intel_iommu *iommu;
1552        struct dma_pte *pgd;
1553        unsigned long num;
1554        unsigned long ndomains;
1555        int id;
1556        int agaw;
1557        struct device_domain_info *info = NULL;
1558
1559        pr_debug("Set context mapping for %02x:%02x.%d\n",
1560                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1561
1562        BUG_ON(!domain->pgd);
1563        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1564               translation != CONTEXT_TT_MULTI_LEVEL);
1565
1566        iommu = device_to_iommu(segment, bus, devfn);
1567        if (!iommu)
1568                return -ENODEV;
1569
1570        context = device_to_context_entry(iommu, bus, devfn);
1571        if (!context)
1572                return -ENOMEM;
1573        spin_lock_irqsave(&iommu->lock, flags);
1574        if (context_present(context)) {
1575                spin_unlock_irqrestore(&iommu->lock, flags);
1576                return 0;
1577        }
1578
1579        id = domain->id;
1580        pgd = domain->pgd;
1581
1582        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1583            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1584                int found = 0;
1585
1586                /* find an available domain id for this device in iommu */
1587                ndomains = cap_ndoms(iommu->cap);
1588                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1589                        if (iommu->domains[num] == domain) {
1590                                id = num;
1591                                found = 1;
1592                                break;
1593                        }
1594                }
1595
1596                if (found == 0) {
1597                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1598                        if (num >= ndomains) {
1599                                spin_unlock_irqrestore(&iommu->lock, flags);
1600                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1601                                return -EFAULT;
1602                        }
1603
1604                        set_bit(num, iommu->domain_ids);
1605                        iommu->domains[num] = domain;
1606                        id = num;
1607                }
1608
1609                /* Skip the top levels of the page tables for an
1610                 * iommu which has a smaller agaw than the default.
1611                 * Unnecessary for PT mode.
1612                 */
1613                if (translation != CONTEXT_TT_PASS_THROUGH) {
1614                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1615                                pgd = phys_to_virt(dma_pte_addr(pgd));
1616                                if (!dma_pte_present(pgd)) {
1617                                        spin_unlock_irqrestore(&iommu->lock, flags);
1618                                        return -ENOMEM;
1619                                }
1620                        }
1621                }
1622        }
1623
1624        context_set_domain_id(context, id);
1625
1626        if (translation != CONTEXT_TT_PASS_THROUGH) {
1627                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1628                translation = info ? CONTEXT_TT_DEV_IOTLB :
1629                                     CONTEXT_TT_MULTI_LEVEL;
1630        }
1631        /*
1632         * In pass through mode, AW must be programmed to indicate the largest
1633         * AGAW value supported by hardware. And ASR is ignored by hardware.
1634         */
1635        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1636                context_set_address_width(context, iommu->msagaw);
1637        else {
1638                context_set_address_root(context, virt_to_phys(pgd));
1639                context_set_address_width(context, iommu->agaw);
1640        }
1641
1642        context_set_translation_type(context, translation);
1643        context_set_fault_enable(context);
1644        context_set_present(context);
1645        domain_flush_cache(domain, context, sizeof(*context));
1646
1647        /*
1648         * It's a non-present to present mapping. If hardware doesn't cache
1649         * non-present entries we only need to flush the write-buffer. If it
1650         * _does_ cache non-present entries, then it does so in the special
1651         * domain #0, which we have to flush:
1652         */
1653        if (cap_caching_mode(iommu->cap)) {
1654                iommu->flush.flush_context(iommu, 0,
1655                                           (((u16)bus) << 8) | devfn,
1656                                           DMA_CCMD_MASK_NOBIT,
1657                                           DMA_CCMD_DEVICE_INVL);
1658                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1659        } else {
1660                iommu_flush_write_buffer(iommu);
1661        }
1662        iommu_enable_dev_iotlb(info);
1663        spin_unlock_irqrestore(&iommu->lock, flags);
1664
1665        spin_lock_irqsave(&domain->iommu_lock, flags);
1666        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1667                domain->iommu_count++;
1668                if (domain->iommu_count == 1)
1669                        domain->nid = iommu->node;
1670                domain_update_iommu_cap(domain);
1671        }
1672        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1673        return 0;
1674}
1675
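/*
 * Set up context entries for @pdev and, if it sits behind a PCIe-to-PCI
 * bridge, for every bridge on the path up to and including that bridge,
 * since all devices behind it share the bridge's source-id.
 */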
1676static int
1677domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1678                        int translation)
1679{
1680        int ret;
1681        struct pci_dev *tmp, *parent;
1682
1683        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1684                                         pdev->bus->number, pdev->devfn,
1685                                         translation);
1686        if (ret)
1687                return ret;
1688
1689        /* dependent device mapping */
1690        tmp = pci_find_upstream_pcie_bridge(pdev);
1691        if (!tmp)
1692                return 0;
1693        /* Secondary interface's bus number and devfn 0 */
1694        parent = pdev->bus->self;
1695        while (parent != tmp) {
1696                ret = domain_context_mapping_one(domain,
1697                                                 pci_domain_nr(parent->bus),
1698                                                 parent->bus->number,
1699                                                 parent->devfn, translation);
1700                if (ret)
1701                        return ret;
1702                parent = parent->bus->self;
1703        }
1704        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1705                return domain_context_mapping_one(domain,
1706                                        pci_domain_nr(tmp->subordinate),
1707                                        tmp->subordinate->number, 0,
1708                                        translation);
1709        else /* this is a legacy PCI bridge */
1710                return domain_context_mapping_one(domain,
1711                                                  pci_domain_nr(tmp->bus),
1712                                                  tmp->bus->number,
1713                                                  tmp->devfn,
1714                                                  translation);
1715}
1716
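/*
 * Check whether context entries already exist for @pdev and for every
 * bridge between it and its upstream PCIe-to-PCI bridge, mirroring the
 * walk done by domain_context_mapping().
 */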
1717static int domain_context_mapped(struct pci_dev *pdev)
1718{
1719        int ret;
1720        struct pci_dev *tmp, *parent;
1721        struct intel_iommu *iommu;
1722
1723        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1724                                pdev->devfn);
1725        if (!iommu)
1726                return -ENODEV;
1727
1728        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1729        if (!ret)
1730                return ret;
1731        /* dependent device mapping */
1732        tmp = pci_find_upstream_pcie_bridge(pdev);
1733        if (!tmp)
1734                return ret;
1735        /* Secondary interface's bus number and devfn 0 */
1736        parent = pdev->bus->self;
1737        while (parent != tmp) {
1738                ret = device_context_mapped(iommu, parent->bus->number,
1739                                            parent->devfn);
1740                if (!ret)
1741                        return ret;
1742                parent = parent->bus->self;
1743        }
1744        if (pci_is_pcie(tmp))
1745                return device_context_mapped(iommu, tmp->subordinate->number,
1746                                             0);
1747        else
1748                return device_context_mapped(iommu, tmp->bus->number,
1749                                             tmp->devfn);
1750}
1751
1752/* Returns a number of VTD pages, but aligned to MM page size */
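/*
 * Illustrative example (assuming 4KiB MM and VT-d pages): host_addr 0x1fff
 * with size 2 touches bytes 0x1fff and 0x2000, which span two pages, so
 * this returns 2.
 */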
1753static inline unsigned long aligned_nrpages(unsigned long host_addr,
1754                                            size_t size)
1755{
1756        host_addr &= ~PAGE_MASK;
1757        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1758}
1759
1760/* Return largest possible superpage level for a given mapping */
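/*
 * Illustrative example (assuming VTD_STRIDE_SHIFT == 9 and at least one
 * level of superpage support in domain->iommu_superpage): iov_pfn 0x200
 * and phy_pfn 0x400 are both 512-pfn aligned, so with pages >= 512 this
 * returns level 2 (a 2MiB superpage); with pages < 512 it stays at
 * level 1 (4KiB).
 */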
1761static inline int hardware_largepage_caps(struct dmar_domain *domain,
1762                                          unsigned long iov_pfn,
1763                                          unsigned long phy_pfn,
1764                                          unsigned long pages)
1765{
1766        int support, level = 1;
1767        unsigned long pfnmerge;
1768
1769        support = domain->iommu_superpage;
1770
1771        /* To use a large page, the virtual *and* physical addresses
1772           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1773           of them will mean we have to use smaller pages. So just
1774           merge them and check both at once. */
1775        pfnmerge = iov_pfn | phy_pfn;
1776
1777        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1778                pages >>= VTD_STRIDE_SHIFT;
1779                if (!pages)
1780                        break;
1781                pfnmerge >>= VTD_STRIDE_SHIFT;
1782                level++;
1783                support--;
1784        }
1785        return level;
1786}
1787
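/*
 * Core mapping routine: install PTEs for @nr_pages starting at @iov_pfn,
 * taking the physical pages either from @sg (scatterlist mapping) or from
 * the contiguous range starting at @phys_pfn.  Superpage PTEs are used
 * whenever hardware_largepage_caps() allows, and the CPU cache is flushed
 * for each page-table page of PTEs written.
 */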
1788static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1789                            struct scatterlist *sg, unsigned long phys_pfn,
1790                            unsigned long nr_pages, int prot)
1791{
1792        struct dma_pte *first_pte = NULL, *pte = NULL;
1793        phys_addr_t uninitialized_var(pteval);
1794        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1795        unsigned long sg_res;
1796        unsigned int largepage_lvl = 0;
1797        unsigned long lvl_pages = 0;
1798
1799        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1800
1801        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1802                return -EINVAL;
1803
1804        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1805
1806        if (sg)
1807                sg_res = 0;
1808        else {
1809                sg_res = nr_pages + 1;
1810                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1811        }
1812
1813        while (nr_pages > 0) {
1814                uint64_t tmp;
1815
1816                if (!sg_res) {
1817                        sg_res = aligned_nrpages(sg->offset, sg->length);
1818                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1819                        sg->dma_length = sg->length;
1820                        pteval = page_to_phys(sg_page(sg)) | prot;
1821                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1822                }
1823
1824                if (!pte) {
1825                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1826
1827                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1828                        if (!pte)
1829                                return -ENOMEM;
1830                        /* It is a large page */
1831                        if (largepage_lvl > 1) {
1832                                pteval |= DMA_PTE_LARGE_PAGE;
1833                                /* Ensure that old small page tables are removed to make room
1834                                   for superpage, if they exist. */
1835                                dma_pte_clear_range(domain, iov_pfn,
1836                                                    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1837                                dma_pte_free_pagetable(domain, iov_pfn,
1838                                                       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1839                        } else {
1840                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1841                        }
1842
1843                }
1844                /* We don't need a lock here; nobody else
1845                 * touches this iova range.
1846                 */
1847                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1848                if (tmp) {
1849                        static int dumps = 5;
1850                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1851                               iov_pfn, tmp, (unsigned long long)pteval);
1852                        if (dumps) {
1853                                dumps--;
1854                                debug_dma_dump_mappings(NULL);
1855                        }
1856                        WARN_ON(1);
1857                }
1858
1859                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1860
1861                BUG_ON(nr_pages < lvl_pages);
1862                BUG_ON(sg_res < lvl_pages);
1863
1864                nr_pages -= lvl_pages;
1865                iov_pfn += lvl_pages;
1866                phys_pfn += lvl_pages;
1867                pteval += lvl_pages * VTD_PAGE_SIZE;
1868                sg_res -= lvl_pages;
1869
1870                /* If the next PTE would be the first in a new page, then we
1871                   need to flush the cache on the entries we've just written.
1872                   And then we'll need to recalculate 'pte', so clear it and
1873                   let it get set again in the if (!pte) block above.
1874
1875                   If we're done (!nr_pages) we need to flush the cache too.
1876
1877                   Also if we've been setting superpages, we may need to
1878                   recalculate 'pte' and switch back to smaller pages for the
1879                   end of the mapping, if the trailing size is not enough to
1880                   use another superpage (i.e. sg_res < lvl_pages). */
1881                pte++;
1882                if (!nr_pages || first_pte_in_page(pte) ||
1883                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1884                        domain_flush_cache(domain, first_pte,
1885                                           (void *)pte - (void *)first_pte);
1886                        pte = NULL;
1887                }
1888
1889                if (!sg_res && nr_pages)
1890                        sg = sg_next(sg);
1891        }
1892        return 0;
1893}
1894
1895static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1896                                    struct scatterlist *sg, unsigned long nr_pages,
1897                                    int prot)
1898{
1899        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1900}
1901
1902static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1903                                     unsigned long phys_pfn, unsigned long nr_pages,
1904                                     int prot)
1905{
1906        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1907}
1908
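/*
 * Tear down the context entry for (bus, devfn) and invalidate the context
 * cache and IOTLB globally on that IOMMU.
 */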
1909static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1910{
1911        if (!iommu)
1912                return;
1913
1914        clear_context_table(iommu, bus, devfn);
1915        iommu->flush.flush_context(iommu, 0, 0, 0,
1916                                           DMA_CCMD_GLOBAL_INVL);
1917        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1918}
1919
1920static inline void unlink_domain_info(struct device_domain_info *info)
1921{
1922        assert_spin_locked(&device_domain_lock);
1923        list_del(&info->link);
1924        list_del(&info->global);
1925        if (info->dev)
1926                info->dev->dev.archdata.iommu = NULL;
1927}
1928
1929static void domain_remove_dev_info(struct dmar_domain *domain)
1930{
1931        struct device_domain_info *info;
1932        unsigned long flags;
1933        struct intel_iommu *iommu;
1934
1935        spin_lock_irqsave(&device_domain_lock, flags);
1936        while (!list_empty(&domain->devices)) {
1937                info = list_entry(domain->devices.next,
1938                        struct device_domain_info, link);
1939                unlink_domain_info(info);
1940                spin_unlock_irqrestore(&device_domain_lock, flags);
1941
1942                iommu_disable_dev_iotlb(info);
1943                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1944                iommu_detach_dev(iommu, info->bus, info->devfn);
1945                free_devinfo_mem(info);
1946
1947                spin_lock_irqsave(&device_domain_lock, flags);
1948        }
1949        spin_unlock_irqrestore(&device_domain_lock, flags);
1950}
1951
1952/*
1953 * find_domain
1954 * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1955 */
1956static struct dmar_domain *
1957find_domain(struct pci_dev *pdev)
1958{
1959        struct device_domain_info *info;
1960
1961        /* No lock here, assumes no domain exit in normal case */
1962        info = pdev->dev.archdata.iommu;
1963        if (info)
1964                return info->domain;
1965        return NULL;
1966}
1967
1968/* Find the domain for a device, allocating and initializing one if necessary */
1969static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1970{
1971        struct dmar_domain *domain, *found = NULL;
1972        struct intel_iommu *iommu;
1973        struct dmar_drhd_unit *drhd;
1974        struct device_domain_info *info, *tmp;
1975        struct pci_dev *dev_tmp;
1976        unsigned long flags;
1977        int bus = 0, devfn = 0;
1978        int segment;
1979        int ret;
1980
1981        domain = find_domain(pdev);
1982        if (domain)
1983                return domain;
1984
1985        segment = pci_domain_nr(pdev->bus);
1986
1987        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1988        if (dev_tmp) {
1989                if (pci_is_pcie(dev_tmp)) {
1990                        bus = dev_tmp->subordinate->number;
1991                        devfn = 0;
1992                } else {
1993                        bus = dev_tmp->bus->number;
1994                        devfn = dev_tmp->devfn;
1995                }
1996                spin_lock_irqsave(&device_domain_lock, flags);
1997                list_for_each_entry(info, &device_domain_list, global) {
1998                        if (info->segment == segment &&
1999                            info->bus == bus && info->devfn == devfn) {
2000                                found = info->domain;
2001                                break;
2002                        }
2003                }
2004                spin_unlock_irqrestore(&device_domain_lock, flags);
2005                /* pcie-pci bridge already has a domain, use it */
2006                if (found) {
2007                        domain = found;
2008                        goto found_domain;
2009                }
2010        }
2011
2012        domain = alloc_domain();
2013        if (!domain)
2014                goto error;
2015
2016        /* Allocate new domain for the device */
2017        drhd = dmar_find_matched_drhd_unit(pdev);
2018        if (!drhd) {
2019                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2020                        pci_name(pdev));
2021                free_domain_mem(domain);
2022                return NULL;
2023        }
2024        iommu = drhd->iommu;
2025
2026        ret = iommu_attach_domain(domain, iommu);
2027        if (ret) {
2028                free_domain_mem(domain);
2029                goto error;
2030        }
2031
2032        if (domain_init(domain, gaw)) {
2033                domain_exit(domain);
2034                goto error;
2035        }
2036
2037        /* register the upstream pcie-to-pci bridge */
2038        if (dev_tmp) {
2039                info = alloc_devinfo_mem();
2040                if (!info) {
2041                        domain_exit(domain);
2042                        goto error;
2043                }
2044                info->segment = segment;
2045                info->bus = bus;
2046                info->devfn = devfn;
2047                info->dev = NULL;
2048                info->domain = domain;
2049                /* This domain is shared by devices under p2p bridge */
2050                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2051
2052                /* pcie-to-pci bridge already has a domain, use it */
2053                found = NULL;
2054                spin_lock_irqsave(&device_domain_lock, flags);
2055                list_for_each_entry(tmp, &device_domain_list, global) {
2056                        if (tmp->segment == segment &&
2057                            tmp->bus == bus && tmp->devfn == devfn) {
2058                                found = tmp->domain;
2059                                break;
2060                        }
2061                }
2062                if (found) {
2063                        spin_unlock_irqrestore(&device_domain_lock, flags);
2064                        free_devinfo_mem(info);
2065                        domain_exit(domain);
2066                        domain = found;
2067                } else {
2068                        list_add(&info->link, &domain->devices);
2069                        list_add(&info->global, &device_domain_list);
2070                        spin_unlock_irqrestore(&device_domain_lock, flags);
2071                }
2072        }
2073
2074found_domain:
2075        info = alloc_devinfo_mem();
2076        if (!info)
2077                goto error;
2078        info->segment = segment;
2079        info->bus = pdev->bus->number;
2080        info->devfn = pdev->devfn;
2081        info->dev = pdev;
2082        info->domain = domain;
2083        spin_lock_irqsave(&device_domain_lock, flags);
2084        /* somebody else was faster and already attached a domain to this device */
2085        found = find_domain(pdev);
2086        if (found != NULL) {
2087                spin_unlock_irqrestore(&device_domain_lock, flags);
2088                if (found != domain) {
2089                        domain_exit(domain);
2090                        domain = found;
2091                }
2092                free_devinfo_mem(info);
2093                return domain;
2094        }
2095        list_add(&info->link, &domain->devices);
2096        list_add(&info->global, &device_domain_list);
2097        pdev->dev.archdata.iommu = info;
2098        spin_unlock_irqrestore(&device_domain_lock, flags);
2099        return domain;
2100error:
2101        /* recheck here; another thread may have set up a domain meanwhile */
2102        return find_domain(pdev);
2103}
2104
2105static int iommu_identity_mapping;
2106#define IDENTMAP_ALL            1
2107#define IDENTMAP_GFX            2
2108#define IDENTMAP_AZALIA         4
2109
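/*
 * Reserve the iova range covering [start, end] and install a 1:1
 * (identity) mapping for it in @domain, clearing any PTEs that already
 * cover the range first.
 */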
2110static int iommu_domain_identity_map(struct dmar_domain *domain,
2111                                     unsigned long long start,
2112                                     unsigned long long end)
2113{
2114        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2115        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2116
2117        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2118                          dma_to_mm_pfn(last_vpfn))) {
2119                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2120                return -ENOMEM;
2121        }
2122
2123        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2124                 start, end, domain->id);
2125        /*
2126         * RMRR range might have overlap with physical memory range,
2127         * clear it first
2128         */
2129        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2130
2131        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2132                                  last_vpfn - first_vpfn + 1,
2133                                  DMA_PTE_READ|DMA_PTE_WRITE);
2134}
2135
2136static int iommu_prepare_identity_map(struct pci_dev *pdev,
2137                                      unsigned long long start,
2138                                      unsigned long long end)
2139{
2140        struct dmar_domain *domain;
2141        int ret;
2142
2143        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2144        if (!domain)
2145                return -ENOMEM;
2146
2147        /* For _hardware_ passthrough, don't bother. But for software
2148           passthrough, we do it anyway -- it may indicate a memory
2149           range which is reserved in E820, and so didn't get set
2150           up to start with in the si_domain */
2151        if (domain == si_domain && hw_pass_through) {
2152                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2153                       pci_name(pdev), start, end);
2154                return 0;
2155        }
2156
2157        printk(KERN_INFO
2158               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2159               pci_name(pdev), start, end);
2160
2161        if (end < start) {
2162                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2163                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2164                        dmi_get_system_info(DMI_BIOS_VENDOR),
2165                        dmi_get_system_info(DMI_BIOS_VERSION),
2166                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2167                ret = -EIO;
2168                goto error;
2169        }
2170
2171        if (end >> agaw_to_width(domain->agaw)) {
2172                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2173                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2174                     agaw_to_width(domain->agaw),
2175                     dmi_get_system_info(DMI_BIOS_VENDOR),
2176                     dmi_get_system_info(DMI_BIOS_VERSION),
2177                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2178                ret = -EIO;
2179                goto error;
2180        }
2181
2182        ret = iommu_domain_identity_map(domain, start, end);
2183        if (ret)
2184                goto error;
2185
2186        /* context entry init */
2187        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2188        if (ret)
2189                goto error;
2190
2191        return 0;
2192
2193 error:
2194        domain_exit(domain);
2195        return ret;
2196}
2197
2198static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2199        struct pci_dev *pdev)
2200{
2201        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2202                return 0;
2203        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2204                rmrr->end_address);
2205}
2206
2207#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2208static inline void iommu_prepare_isa(void)
2209{
2210        struct pci_dev *pdev;
2211        int ret;
2212
2213        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2214        if (!pdev)
2215                return;
2216
2217        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2218        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2219
2220        if (ret)
2221                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2222                       "floppy might not work\n");
2223
2224}
2225#else
2226static inline void iommu_prepare_isa(void)
2227{
2228        return;
2229}
2230#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2231
2232static int md_domain_init(struct dmar_domain *domain, int guest_width);
2233
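/*
 * Allocate the static identity (si) domain, attach it to every active
 * IOMMU and, unless hardware pass-through is in use, identity-map all
 * usable physical memory reported by memblock.
 */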
2234static int __init si_domain_init(int hw)
2235{
2236        struct dmar_drhd_unit *drhd;
2237        struct intel_iommu *iommu;
2238        int nid, ret = 0;
2239
2240        si_domain = alloc_domain();
2241        if (!si_domain)
2242                return -EFAULT;
2243
2244        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2245
2246        for_each_active_iommu(iommu, drhd) {
2247                ret = iommu_attach_domain(si_domain, iommu);
2248                if (ret) {
2249                        domain_exit(si_domain);
2250                        return -EFAULT;
2251                }
2252        }
2253
2254        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2255                domain_exit(si_domain);
2256                return -EFAULT;
2257        }
2258
2259        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2260
2261        if (hw)
2262                return 0;
2263
2264        for_each_online_node(nid) {
2265                unsigned long start_pfn, end_pfn;
2266                int i;
2267
2268                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2269                        ret = iommu_domain_identity_map(si_domain,
2270                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2271                        if (ret)
2272                                return ret;
2273                }
2274        }
2275
2276        return 0;
2277}
2278
2279static void domain_remove_one_dev_info(struct dmar_domain *domain,
2280                                          struct pci_dev *pdev);
2281static int identity_mapping(struct pci_dev *pdev)
2282{
2283        struct device_domain_info *info;
2284
2285        if (likely(!iommu_identity_mapping))
2286                return 0;
2287
2288        info = pdev->dev.archdata.iommu;
2289        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2290                return (info->domain == si_domain);
2291
2292        return 0;
2293}
2294
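/*
 * Attach @pdev to @domain: allocate and link its device_domain_info and
 * set up the context entries with the requested translation type, undoing
 * the bookkeeping if context mapping fails.
 */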
2295static int domain_add_dev_info(struct dmar_domain *domain,
2296                               struct pci_dev *pdev,
2297                               int translation)
2298{
2299        struct device_domain_info *info;
2300        unsigned long flags;
2301        int ret;
2302
2303        info = alloc_devinfo_mem();
2304        if (!info)
2305                return -ENOMEM;
2306
2307        info->segment = pci_domain_nr(pdev->bus);
2308        info->bus = pdev->bus->number;
2309        info->devfn = pdev->devfn;
2310        info->dev = pdev;
2311        info->domain = domain;
2312
2313        spin_lock_irqsave(&device_domain_lock, flags);
2314        list_add(&info->link, &domain->devices);
2315        list_add(&info->global, &device_domain_list);
2316        pdev->dev.archdata.iommu = info;
2317        spin_unlock_irqrestore(&device_domain_lock, flags);
2318
2319        ret = domain_context_mapping(domain, pdev, translation);
2320        if (ret) {
2321                spin_lock_irqsave(&device_domain_lock, flags);
2322                unlink_domain_info(info);
2323                spin_unlock_irqrestore(&device_domain_lock, flags);
2324                free_devinfo_mem(info);
2325                return ret;
2326        }
2327
2328        return 0;
2329}
2330
2331static bool device_has_rmrr(struct pci_dev *dev)
2332{
2333        struct dmar_rmrr_unit *rmrr;
2334        int i;
2335
2336        for_each_rmrr_units(rmrr) {
2337                for (i = 0; i < rmrr->devices_cnt; i++) {
2338                        /*
2339                         * Return TRUE if this RMRR contains the device that
2340                         * is passed in.
2341                         */
2342                        if (rmrr->devices[i] == dev)
2343                                return true;
2344                }
2345        }
2346        return false;
2347}
2348
2349static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2350{
2351
2352        /*
2353         * We want to prevent any device associated with an RMRR from
2354         * getting placed into the SI Domain. This is done because
2355         * problems exist when devices are moved in and out of domains
2356         * and their respective RMRR info is lost. We exempt USB devices
2357         * from this process due to their usage of RMRRs that are known
2358         * to not be needed after BIOS hand-off to OS.
2359         */
2360        if (device_has_rmrr(pdev) &&
2361            (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2362                return 0;
2363
2364        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2365                return 1;
2366
2367        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2368                return 1;
2369
2370        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2371                return 0;
2372
2373        /*
2374         * We want to start off with all devices in the 1:1 domain, and
2375         * take them out later if we find they can't access all of memory.
2376         *
2377         * However, we can't do this for PCI devices behind bridges,
2378         * because all PCI devices behind the same bridge will end up
2379         * with the same source-id on their transactions.
2380         *
2381         * Practically speaking, we can't change things around for these
2382         * devices at run-time, because we can't be sure there'll be no
2383         * DMA transactions in flight for any of their siblings.
2384         * 
2385         * So PCI devices (unless they're on the root bus) as well as
2386         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2387         * the 1:1 domain, just in _case_ one of their siblings turns out
2388         * not to be able to map all of memory.
2389         */
2390        if (!pci_is_pcie(pdev)) {
2391                if (!pci_is_root_bus(pdev->bus))
2392                        return 0;
2393                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2394                        return 0;
2395        } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2396                return 0;
2397
2398        /* 
2399         * At boot time, we don't yet know if devices will be 64-bit capable.
2400         * Assume that they will -- if they turn out not to be, then we can 
2401         * take them out of the 1:1 domain later.
2402         */
2403        if (!startup) {
2404                /*
2405                 * If the device's dma_mask is less than the system's memory
2406                 * size then this is not a candidate for identity mapping.
2407                 */
2408                u64 dma_mask = pdev->dma_mask;
2409
2410                if (pdev->dev.coherent_dma_mask &&
2411                    pdev->dev.coherent_dma_mask < dma_mask)
2412                        dma_mask = pdev->dev.coherent_dma_mask;
2413
2414                return dma_mask >= dma_get_required_mask(&pdev->dev);
2415        }
2416
2417        return 1;
2418}
2419
2420static int __init iommu_prepare_static_identity_mapping(int hw)
2421{
2422        struct pci_dev *pdev = NULL;
2423        int ret;
2424
2425        ret = si_domain_init(hw);
2426        if (ret)
2427                return -EFAULT;
2428
2429        for_each_pci_dev(pdev) {
2430                if (iommu_should_identity_map(pdev, 1)) {
2431                        ret = domain_add_dev_info(si_domain, pdev,
2432                                             hw ? CONTEXT_TT_PASS_THROUGH :
2433                                                  CONTEXT_TT_MULTI_LEVEL);
2434                        if (ret) {
2435                                /* device not associated with an iommu */
2436                                if (ret == -ENODEV)
2437                                        continue;
2438                                return ret;
2439                        }
2440                        pr_info("IOMMU: %s identity mapping for device %s\n",
2441                                hw ? "hardware" : "software", pci_name(pdev));
2442                }
2443        }
2444
2445        return 0;
2446}
2447
2448static int __init init_dmars(void)
2449{
2450        struct dmar_drhd_unit *drhd;
2451        struct dmar_rmrr_unit *rmrr;
2452        struct pci_dev *pdev;
2453        struct intel_iommu *iommu;
2454        int i, ret;
2455
2456        /*
2457         * for each drhd
2458         *    allocate root
2459         *    initialize and program root entry to not present
2460         * endfor
2461         */
2462        for_each_drhd_unit(drhd) {
2463                /*
2464                 * lock not needed as this is only incremented in the
2465                 * single-threaded kernel __init code path; all other
2466                 * accesses are read-only
2467                 */
2468                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2469                        g_num_of_iommus++;
2470                        continue;
2471                }
2472                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2473                          IOMMU_UNITS_SUPPORTED);
2474        }
2475
2476        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2477                        GFP_KERNEL);
2478        if (!g_iommus) {
2479                printk(KERN_ERR "Allocating global iommu array failed\n");
2480                ret = -ENOMEM;
2481                goto error;
2482        }
2483
2484        deferred_flush = kzalloc(g_num_of_iommus *
2485                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2486        if (!deferred_flush) {
2487                ret = -ENOMEM;
2488                goto error;
2489        }
2490
2491        for_each_drhd_unit(drhd) {
2492                if (drhd->ignored)
2493                        continue;
2494
2495                iommu = drhd->iommu;
2496                g_iommus[iommu->seq_id] = iommu;
2497
2498                ret = iommu_init_domains(iommu);
2499                if (ret)
2500                        goto error;
2501
2502                /*
2503                 * TBD:
2504                 * we could share the same root & context tables
2505                 * among all IOMMUs; this needs to be split out later.
2506                 */
2507                ret = iommu_alloc_root_entry(iommu);
2508                if (ret) {
2509                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2510                        goto error;
2511                }
2512                if (!ecap_pass_through(iommu->ecap))
2513                        hw_pass_through = 0;
2514        }
2515
2516        /*
2517         * Start from a sane iommu hardware state.
2518         */
2519        for_each_drhd_unit(drhd) {
2520                if (drhd->ignored)
2521                        continue;
2522
2523                iommu = drhd->iommu;
2524
2525                /*
2526                 * If the queued invalidation is already initialized by us
2527                 * (for example, while enabling interrupt-remapping) then
2528                 * things are already rolling from a sane state.
2529                 */
2530                if (iommu->qi)
2531                        continue;
2532
2533                /*
2534                 * Clear any previous faults.
2535                 */
2536                dmar_fault(-1, iommu);
2537                /*
2538                 * Disable queued invalidation if supported and already enabled
2539                 * before OS handover.
2540                 */
2541                dmar_disable_qi(iommu);
2542        }
2543
2544        for_each_drhd_unit(drhd) {
2545                if (drhd->ignored)
2546                        continue;
2547
2548                iommu = drhd->iommu;
2549
2550                if (dmar_enable_qi(iommu)) {
2551                        /*
2552                         * Queued Invalidate not enabled, use Register Based
2553                         * Invalidate
2554                         */
2555                        iommu->flush.flush_context = __iommu_flush_context;
2556                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2557                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2558                               "invalidation\n",
2559                                iommu->seq_id,
2560                               (unsigned long long)drhd->reg_base_addr);
2561                } else {
2562                        iommu->flush.flush_context = qi_flush_context;
2563                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2564                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2565                               "invalidation\n",
2566                                iommu->seq_id,
2567                               (unsigned long long)drhd->reg_base_addr);
2568                }
2569        }
2570
2571        if (iommu_pass_through)
2572                iommu_identity_mapping |= IDENTMAP_ALL;
2573
2574#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2575        iommu_identity_mapping |= IDENTMAP_GFX;
2576#endif
2577
2578        check_tylersburg_isoch();
2579
2580        /*
2581         * If any identity mapping was requested, set up the static identity
2582         * (si) domain now and attach qualifying devices to it, using hardware
2583         * pass-through contexts when available and 1:1 page tables otherwise.
2584         */
2585        if (iommu_identity_mapping) {
2586                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2587                if (ret) {
2588                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2589                        goto error;
2590                }
2591        }
2592        /*
2593         * For each rmrr
2594         *   for each dev attached to rmrr
2595         *   do
2596         *     locate drhd for dev, alloc domain for dev
2597         *     allocate free domain
2598         *     allocate page table entries for rmrr
2599         *     if context not allocated for bus
2600         *           allocate and init context
2601         *           set present in root table for this bus
2602         *     init context with domain, translation etc
2603         *    endfor
2604         * endfor
2605         */
2606        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2607        for_each_rmrr_units(rmrr) {
2608                for (i = 0; i < rmrr->devices_cnt; i++) {
2609                        pdev = rmrr->devices[i];
2610                        /*
2611                         * some BIOSes list non-existent devices in the
2612                         * DMAR table.
2613                         */
2614                        if (!pdev)
2615                                continue;
2616                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2617                        if (ret)
2618                                printk(KERN_ERR
2619                                       "IOMMU: mapping reserved region failed\n");
2620                }
2621        }
2622
2623        iommu_prepare_isa();
2624
2625        /*
2626         * for each drhd
2627         *   enable fault log
2628         *   global invalidate context cache
2629         *   global invalidate iotlb
2630         *   enable translation
2631         */
2632        for_each_drhd_unit(drhd) {
2633                if (drhd->ignored) {
2634                        /*
2635                         * we always have to disable PMRs or DMA may fail on
2636                         * this device
2637                         */
2638                        if (force_on)
2639                                iommu_disable_protect_mem_regions(drhd->iommu);
2640                        continue;
2641                }
2642                iommu = drhd->iommu;
2643
2644                iommu_flush_write_buffer(iommu);
2645
2646                ret = dmar_set_interrupt(iommu);
2647                if (ret)
2648                        goto error;
2649
2650                iommu_set_root_entry(iommu);
2651
2652                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2653                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2654
2655                ret = iommu_enable_translation(iommu);
2656                if (ret)
2657                        goto error;
2658
2659                iommu_disable_protect_mem_regions(iommu);
2660        }
2661
2662        return 0;
2663error:
2664        for_each_drhd_unit(drhd) {
2665                if (drhd->ignored)
2666                        continue;
2667                iommu = drhd->iommu;
2668                free_iommu(iommu);
2669        }
2670        kfree(g_iommus);
2671        return ret;
2672}
2673
2674/* This takes a number of _MM_ pages, not VTD pages */
2675static struct iova *intel_alloc_iova(struct device *dev,
2676                                     struct dmar_domain *domain,
2677                                     unsigned long nrpages, uint64_t dma_mask)
2678{
2679        struct pci_dev *pdev = to_pci_dev(dev);
2680        struct iova *iova = NULL;
2681
2682        /* Restrict dma_mask to the width that the iommu can handle */
2683        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2684
2685        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2686                /*
2687                 * First try to allocate an io virtual address in
2688                 * DMA_BIT_MASK(32) and if that fails then try allocating
2689                 * from higher range
2690                 */
2691                iova = alloc_iova(&domain->iovad, nrpages,
2692                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2693                if (iova)
2694                        return iova;
2695        }
2696        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2697        if (unlikely(!iova)) {
2698                printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2699                       nrpages, pci_name(pdev));
2700                return NULL;
2701        }
2702
2703        return iova;
2704}
2705
2706static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2707{
2708        struct dmar_domain *domain;
2709        int ret;
2710
2711        domain = get_domain_for_dev(pdev,
2712                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2713        if (!domain) {
2714                printk(KERN_ERR
2715                        "Allocating domain for %s failed\n", pci_name(pdev));
2716                return NULL;
2717        }
2718
2719        /* make sure context mapping is ok */
2720        if (unlikely(!domain_context_mapped(pdev))) {
2721                ret = domain_context_mapping(domain, pdev,
2722                                             CONTEXT_TT_MULTI_LEVEL);
2723                if (ret) {
2724                        printk(KERN_ERR
2725                                "Domain context map for %s failed\n",
2726                                pci_name(pdev));
2727                        return NULL;
2728                }
2729        }
2730
2731        return domain;
2732}
2733
2734static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2735{
2736        struct device_domain_info *info;
2737
2738        /* No lock here, assumes no domain exit in normal case */
2739        info = dev->dev.archdata.iommu;
2740        if (likely(info))
2741                return info->domain;
2742
2743        return __get_valid_domain_for_dev(dev);
2744}
2745
2746static int iommu_dummy(struct pci_dev *pdev)
2747{
2748        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2749}
2750
2751/* Check if the pdev needs to go through non-identity map and unmap process.*/
2752static int iommu_no_mapping(struct device *dev)
2753{
2754        struct pci_dev *pdev;
2755        int found;
2756
2757        if (unlikely(dev->bus != &pci_bus_type))
2758                return 1;
2759
2760        pdev = to_pci_dev(dev);
2761        if (iommu_dummy(pdev))
2762                return 1;
2763
2764        if (!iommu_identity_mapping)
2765                return 0;
2766
2767        found = identity_mapping(pdev);
2768        if (found) {
2769                if (iommu_should_identity_map(pdev, 0))
2770                        return 1;
2771                else {
2772                        /*
2773                         * The device is only 32-bit DMA capable: remove it
2774                         * from si_domain and fall back to non-identity mapping.
2775                         */
2776                        domain_remove_one_dev_info(si_domain, pdev);
2777                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2778                               pci_name(pdev));
2779                        return 0;
2780                }
2781        } else {
2782                /*
2783                 * A 64-bit DMA capable device detached from a VM is put
2784                 * back into si_domain for identity mapping.
2785                 */
2786                if (iommu_should_identity_map(pdev, 0)) {
2787                        int ret;
2788                        ret = domain_add_dev_info(si_domain, pdev,
2789                                                  hw_pass_through ?
2790                                                  CONTEXT_TT_PASS_THROUGH :
2791                                                  CONTEXT_TT_MULTI_LEVEL);
2792                        if (!ret) {
2793                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2794                                       pci_name(pdev));
2795                                return 1;
2796                        }
2797                }
2798        }
2799
2800        return 0;
2801}
2802
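/*
 * Map [paddr, paddr + size) for DMA: allocate an iova that fits within
 * @dma_mask, install the page-table entries with the protection implied
 * by @dir, and return the resulting bus address (or 0 on failure).
 * Identity-mapped devices get the physical address back unchanged.
 */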
2803static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2804                                     size_t size, int dir, u64 dma_mask)
2805{
2806        struct pci_dev *pdev = to_pci_dev(hwdev);
2807        struct dmar_domain *domain;
2808        phys_addr_t start_paddr;
2809        struct iova *iova;
2810        int prot = 0;
2811        int ret;
2812        struct intel_iommu *iommu;
2813        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2814
2815        BUG_ON(dir == DMA_NONE);
2816
2817        if (iommu_no_mapping(hwdev))
2818                return paddr;
2819
2820        domain = get_valid_domain_for_dev(pdev);
2821        if (!domain)
2822                return 0;
2823
2824        iommu = domain_get_iommu(domain);
2825        size = aligned_nrpages(paddr, size);
2826
2827        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2828        if (!iova)
2829                goto error;
2830
2831        /*
2832         * Check if DMAR supports zero-length reads on write-only
2833         * mappings.
2834         */
2835        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2836                        !cap_zlr(iommu->cap))
2837                prot |= DMA_PTE_READ;
2838        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2839                prot |= DMA_PTE_WRITE;
2840        /*
2841         * paddr to (paddr + size) might span a partial page, so map the whole
2842         * page.  Note: if two parts of one page are separately mapped, we
2843         * might have two guest addresses mapping to the same host paddr, but
2844         * this is not a big problem.
2845         */
2846        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2847                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2848        if (ret)
2849                goto error;
2850
2851        /* it's a non-present to present mapping. Only flush if caching mode */
2852        if (cap_caching_mode(iommu->cap))
2853                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2854        else
2855                iommu_flush_write_buffer(iommu);
2856
2857        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2858        start_paddr += paddr & ~PAGE_MASK;
2859        return start_paddr;
2860
2861error:
2862        if (iova)
2863                __free_iova(&domain->iovad, iova);
2864        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2865                pci_name(pdev), size, (unsigned long long)paddr, dir);
2866        return 0;
2867}
2868
2869static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2870                                 unsigned long offset, size_t size,
2871                                 enum dma_data_direction dir,
2872                                 struct dma_attrs *attrs)
2873{
2874        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2875                                  dir, to_pci_dev(dev)->dma_mask);
2876}
2877
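/*
 * Drain the deferred-unmap queues: for every IOMMU with pending entries,
 * flush the IOTLB (a global flush plus per-entry device-IOTLB flushes on
 * real hardware, page-selective flushes in caching mode) and free the
 * queued iovas.  Called with async_umap_flush_lock held.
 */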
2878static void flush_unmaps(void)
2879{
2880        int i, j;
2881
2882        timer_on = 0;
2883
2884        /* just flush them all */
2885        for (i = 0; i < g_num_of_iommus; i++) {
2886                struct intel_iommu *iommu = g_iommus[i];
2887                if (!iommu)
2888                        continue;
2889
2890                if (!deferred_flush[i].next)
2891                        continue;
2892
2893                /* In caching mode, global flushes make emulation expensive */
2894                if (!cap_caching_mode(iommu->cap))
2895                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2896                                         DMA_TLB_GLOBAL_FLUSH);
2897                for (j = 0; j < deferred_flush[i].next; j++) {
2898                        unsigned long mask;
2899                        struct iova *iova = deferred_flush[i].iova[j];
2900                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2901
2902                        /* On real hardware multiple invalidations are expensive */
2903                        if (cap_caching_mode(iommu->cap))
2904                                iommu_flush_iotlb_psi(iommu, domain->id,
2905                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2906                        else {
2907                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2908                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2909                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2910                        }
2911                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2912                }
2913                deferred_flush[i].next = 0;
2914        }
2915
2916        list_size = 0;
2917}
2918
2919static void flush_unmaps_timeout(unsigned long data)
2920{
2921        unsigned long flags;
2922
2923        spin_lock_irqsave(&async_umap_flush_lock, flags);
2924        flush_unmaps();
2925        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2926}
2927
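/*
 * Queue an iova for lazy freeing on its domain's IOMMU.  The queue is
 * drained when it reaches HIGH_WATER_MARK entries or when the 10ms
 * unmap_timer fires, whichever comes first.
 */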
2928static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2929{
2930        unsigned long flags;
2931        int next, iommu_id;
2932        struct intel_iommu *iommu;
2933
2934        spin_lock_irqsave(&async_umap_flush_lock, flags);
2935        if (list_size == HIGH_WATER_MARK)
2936                flush_unmaps();
2937
2938        iommu = domain_get_iommu(dom);
2939        iommu_id = iommu->seq_id;
2940
2941        next = deferred_flush[iommu_id].next;
2942        deferred_flush[iommu_id].domain[next] = dom;
2943        deferred_flush[iommu_id].iova[next] = iova;
2944        deferred_flush[iommu_id].next++;
2945
2946        if (!timer_on) {
2947                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2948                timer_on = 1;
2949        }
2950        list_size++;
2951        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2952}
2953
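/*
 * Unmap a single DMA mapping: clear the PTEs and free the page tables for
 * the iova range, then either flush the IOTLB and free the iova right
 * away (intel_iommu_strict) or defer both via add_unmap().
 */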
2954static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2955                             size_t size, enum dma_data_direction dir,
2956                             struct dma_attrs *attrs)
2957{
2958        struct pci_dev *pdev = to_pci_dev(dev);
2959        struct dmar_domain *domain;
2960        unsigned long start_pfn, last_pfn;
2961        struct iova *iova;
2962        struct intel_iommu *iommu;
2963
2964        if (iommu_no_mapping(dev))
2965                return;
2966
2967        domain = find_domain(pdev);
2968        BUG_ON(!domain);
2969
2970        iommu = domain_get_iommu(domain);
2971
2972        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2973        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2974                      (unsigned long long)dev_addr))
2975                return;
2976
2977        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2978        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2979
2980        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2981                 pci_name(pdev), start_pfn, last_pfn);
2982
2983        /*  clear the whole page */
2984        dma_pte_clear_range(domain, start_pfn, last_pfn);
2985
2986        /* free page tables */
2987        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2988
2989        if (intel_iommu_strict) {
2990                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2991                                      last_pfn - start_pfn + 1, 0);
2992                /* free iova */
2993                __free_iova(&domain->iovad, iova);
2994        } else {
2995                add_unmap(domain, iova);
2996                /*
2997                 * queue up the release of the unmap to save roughly 1/6th of
2998                 * the CPU time used up by the iotlb flush operation...
2999                 */
3000        }
3001}
3002
3003static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3004                                  dma_addr_t *dma_handle, gfp_t flags,
3005                                  struct dma_attrs *attrs)
3006{
3007        void *vaddr;
3008        int order;
3009
3010        size = PAGE_ALIGN(size);
3011        order = get_order(size);
3012
3013        if (!iommu_no_mapping(hwdev))
3014                flags &= ~(GFP_DMA | GFP_DMA32);
3015        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3016                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3017                        flags |= GFP_DMA;
3018                else
3019                        flags |= GFP_DMA32;
3020        }
3021
3022        vaddr = (void *)__get_free_pages(flags, order);
3023        if (!vaddr)
3024                return NULL;
3025        memset(vaddr, 0, size);
3026
3027        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3028                                         DMA_BIDIRECTIONAL,
3029                                         hwdev->coherent_dma_mask);
3030        if (*dma_handle)
3031                return vaddr;
3032        free_pages((unsigned long)vaddr, order);
3033        return NULL;
3034}
3035
3036static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3037                                dma_addr_t dma_handle, struct dma_attrs *attrs)
3038{
3039        int order;
3040
3041        size = PAGE_ALIGN(size);
3042        order = get_order(size);
3043
3044        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3045        free_pages((unsigned long)vaddr, order);
3046}
3047
3048static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3049                           int nelems, enum dma_data_direction dir,
3050                           struct dma_attrs *attrs)
3051{
3052        struct pci_dev *pdev = to_pci_dev(hwdev);
3053        struct dmar_domain *domain;
3054        unsigned long start_pfn, last_pfn;
3055        struct iova *iova;
3056        struct intel_iommu *iommu;
3057
3058        if (iommu_no_mapping(hwdev))
3059                return;
3060
3061        domain = find_domain(pdev);
3062        BUG_ON(!domain);
3063
3064        iommu = domain_get_iommu(domain);
3065
3066        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3067        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3068                      (unsigned long long)sglist[0].dma_address))
3069                return;
3070
3071        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3072        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3073
3074        /*  clear the whole page */
3075        dma_pte_clear_range(domain, start_pfn, last_pfn);
3076
3077        /* free page tables */
3078        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3079
3080        if (intel_iommu_strict) {
3081                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3082                                      last_pfn - start_pfn + 1, 0);
3083                /* free iova */
3084                __free_iova(&domain->iovad, iova);
3085        } else {
3086                add_unmap(domain, iova);
3087                /*
3088                 * queue up the release of the unmap to save the 1/6th of the
3089                 * cpu used up by the iotlb flush operation...
3090                 */
3091        }
3092}
3093
3094static int intel_nontranslate_map_sg(struct device *hwdev,
3095        struct scatterlist *sglist, int nelems, int dir)
3096{
3097        int i;
3098        struct scatterlist *sg;
3099
3100        for_each_sg(sglist, sg, nelems, i) {
3101                BUG_ON(!sg_page(sg));
3102                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3103                sg->dma_length = sg->length;
3104        }
3105        return nelems;
3106}
3107
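/*
 * Map a scatterlist: allocate a single iova range covering the
 * page-aligned size of all segments, write the PTEs in one pass with
 * domain_sg_mapping(), and flush the IOTLB only when caching mode is
 * set (a write-buffer flush is enough for non-present -> present).
 */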
3108static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3109                        enum dma_data_direction dir, struct dma_attrs *attrs)
3110{
3111        int i;
3112        struct pci_dev *pdev = to_pci_dev(hwdev);
3113        struct dmar_domain *domain;
3114        size_t size = 0;
3115        int prot = 0;
3116        struct iova *iova = NULL;
3117        int ret;
3118        struct scatterlist *sg;
3119        unsigned long start_vpfn;
3120        struct intel_iommu *iommu;
3121
3122        BUG_ON(dir == DMA_NONE);
3123        if (iommu_no_mapping(hwdev))
3124                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3125
3126        domain = get_valid_domain_for_dev(pdev);
3127        if (!domain)
3128                return 0;
3129
3130        iommu = domain_get_iommu(domain);
3131
3132        for_each_sg(sglist, sg, nelems, i)
3133                size += aligned_nrpages(sg->offset, sg->length);
3134
3135        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3136                                pdev->dma_mask);
3137        if (!iova) {
3138                sglist->dma_length = 0;
3139                return 0;
3140        }
3141
3142        /*
3143         * Check if DMAR supports zero-length reads on write only
3144         * mappings..
3145         */
3146        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3147                        !cap_zlr(iommu->cap))
3148                prot |= DMA_PTE_READ;
3149        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3150                prot |= DMA_PTE_WRITE;
3151
3152        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3153
3154        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3155        if (unlikely(ret)) {
3156                /*  clear the page */
3157                dma_pte_clear_range(domain, start_vpfn,
3158                                    start_vpfn + size - 1);
3159                /* free page tables */
3160                dma_pte_free_pagetable(domain, start_vpfn,
3161                                       start_vpfn + size - 1);
3162                /* free iova */
3163                __free_iova(&domain->iovad, iova);
3164                return 0;
3165        }
3166
3167        /* it's a non-present to present mapping. Only flush if caching mode */
3168        if (cap_caching_mode(iommu->cap))
3169                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3170        else
3171                iommu_flush_write_buffer(iommu);
3172
3173        return nelems;
3174}
3175
3176static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3177{
3178        return !dma_addr;
3179}
3180
3181struct dma_map_ops intel_dma_ops = {
3182        .alloc = intel_alloc_coherent,
3183        .free = intel_free_coherent,
3184        .map_sg = intel_map_sg,
3185        .unmap_sg = intel_unmap_sg,
3186        .map_page = intel_map_page,
3187        .unmap_page = intel_unmap_page,
3188        .mapping_error = intel_mapping_error,
3189};
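/*
 * Illustrative sketch (not part of the driver): once dma_ops points at
 * intel_dma_ops, ordinary streaming DMA calls from a PCI driver are
 * routed through the functions above, e.g.
 *
 *      dma_addr_t handle = dma_map_single(&pdev->dev, buf, len,
 *                                         DMA_TO_DEVICE);
 *      if (dma_mapping_error(&pdev->dev, handle))
 *              return -ENOMEM;
 *      ...
 *      dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 *
 * dma_map_single() ends up in intel_map_page() and dma_unmap_single()
 * in intel_unmap_page(); buf and len are placeholders.
 */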
3190
3191static inline int iommu_domain_cache_init(void)
3192{
3193        int ret = 0;
3194
3195        iommu_domain_cache = kmem_cache_create("iommu_domain",
3196                                         sizeof(struct dmar_domain),
3197                                         0,
3198                                         SLAB_HWCACHE_ALIGN,
3200                                         NULL);
3201        if (!iommu_domain_cache) {
3202                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3203                ret = -ENOMEM;
3204        }
3205
3206        return ret;
3207}
3208
3209static inline int iommu_devinfo_cache_init(void)
3210{
3211        int ret = 0;
3212
3213        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3214                                         sizeof(struct device_domain_info),
3215                                         0,
3216                                         SLAB_HWCACHE_ALIGN,
3217                                         NULL);
3218        if (!iommu_devinfo_cache) {
3219                printk(KERN_ERR "Couldn't create devinfo cache\n");
3220                ret = -ENOMEM;
3221        }
3222
3223        return ret;
3224}
3225
3226static inline int iommu_iova_cache_init(void)
3227{
3228        int ret = 0;
3229
3230        iommu_iova_cache = kmem_cache_create("iommu_iova",
3231                                         sizeof(struct iova),
3232                                         0,
3233                                         SLAB_HWCACHE_ALIGN,
3234                                         NULL);
3235        if (!iommu_iova_cache) {
3236                printk(KERN_ERR "Couldn't create iova cache\n");
3237                ret = -ENOMEM;
3238        }
3239
3240        return ret;
3241}
3242
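/*
 * Create the three slab caches defined above (iova, domain, devinfo);
 * on failure, destroy whatever was already created and return -ENOMEM.
 */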
3243static int __init iommu_init_mempool(void)
3244{
3245        int ret;
3246        ret = iommu_iova_cache_init();
3247        if (ret)
3248                return ret;
3249
3250        ret = iommu_domain_cache_init();
3251        if (ret)
3252                goto domain_error;
3253
3254        ret = iommu_devinfo_cache_init();
3255        if (!ret)
3256                return ret;
3257
3258        kmem_cache_destroy(iommu_domain_cache);
3259domain_error:
3260        kmem_cache_destroy(iommu_iova_cache);
3261
3262        return -ENOMEM;
3263}
3264
3265static void __init iommu_exit_mempool(void)
3266{
3267        kmem_cache_destroy(iommu_devinfo_cache);
3268        kmem_cache_destroy(iommu_domain_cache);
3269        kmem_cache_destroy(iommu_iova_cache);
3270
3271}
3272
3273static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3274{
3275        struct dmar_drhd_unit *drhd;
3276        u32 vtbar;
3277        int rc;
3278
3279        /* We know that this device on this chipset has its own IOMMU.
3280         * If we find it under a different IOMMU, then the BIOS is lying
3281         * to us. Hope that the IOMMU for this device is actually
3282         * disabled, and it needs no translation...
3283         */
3284        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3285        if (rc) {
3286                /* "can't" happen */
3287                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3288                return;
3289        }
3290        vtbar &= 0xffff0000;
3291
3292        /* we know that this iommu should be at offset 0xa000 from vtbar */
3293        drhd = dmar_find_matched_drhd_unit(pdev);
3294        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3295                            TAINT_FIRMWARE_WORKAROUND,
3296                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3297                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3298}
3299DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3300
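/*
 * Mark DRHD units that should be skipped: a unit with no PCI devices
 * in its scope is ignored, and a unit covering only graphics devices
 * either sets intel_iommu_gfx_mapped or, when dmar_map_gfx is off, is
 * ignored with its devices tagged DUMMY_DEVICE_DOMAIN_INFO.
 */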
3301static void __init init_no_remapping_devices(void)
3302{
3303        struct dmar_drhd_unit *drhd;
3304
3305        for_each_drhd_unit(drhd) {
3306                if (!drhd->include_all) {
3307                        int i;
3308                        for (i = 0; i < drhd->devices_cnt; i++)
3309                                if (drhd->devices[i] != NULL)
3310                                        break;
3311                        /* ignore DMAR unit if no pci devices exist */
3312                        if (i == drhd->devices_cnt)
3313                                drhd->ignored = 1;
3314                }
3315        }
3316
3317        for_each_drhd_unit(drhd) {
3318                int i;
3319                if (drhd->ignored || drhd->include_all)
3320                        continue;
3321
3322                for (i = 0; i < drhd->devices_cnt; i++)
3323                        if (drhd->devices[i] &&
3324                            !IS_GFX_DEVICE(drhd->devices[i]))
3325                                break;
3326
3327                if (i < drhd->devices_cnt)
3328                        continue;
3329
3330                /* This IOMMU has *only* gfx devices. Either bypass it or
3331                   set the gfx_mapped flag, as appropriate */
3332                if (dmar_map_gfx) {
3333                        intel_iommu_gfx_mapped = 1;
3334                } else {
3335                        drhd->ignored = 1;
3336                        for (i = 0; i < drhd->devices_cnt; i++) {
3337                                if (!drhd->devices[i])
3338                                        continue;
3339                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3340                        }
3341                }
3342        }
3343}
3344
3345#ifdef CONFIG_SUSPEND
3346static int init_iommu_hw(void)
3347{
3348        struct dmar_drhd_unit *drhd;
3349        struct intel_iommu *iommu = NULL;
3350
3351        for_each_active_iommu(iommu, drhd)
3352                if (iommu->qi)
3353                        dmar_reenable_qi(iommu);
3354
3355        for_each_iommu(iommu, drhd) {
3356                if (drhd->ignored) {
3357                        /*
3358                         * we always have to disable PMRs or DMA may fail on
3359                         * this device
3360                         */
3361                        if (force_on)
3362                                iommu_disable_protect_mem_regions(iommu);
3363                        continue;
3364                }
3365
3366                iommu_flush_write_buffer(iommu);
3367
3368                iommu_set_root_entry(iommu);
3369
3370                iommu->flush.flush_context(iommu, 0, 0, 0,
3371                                           DMA_CCMD_GLOBAL_INVL);
3372                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3373                                         DMA_TLB_GLOBAL_FLUSH);
3374                if (iommu_enable_translation(iommu))
3375                        return 1;
3376                iommu_disable_protect_mem_regions(iommu);
3377        }
3378
3379        return 0;
3380}
3381
3382static void iommu_flush_all(void)
3383{
3384        struct dmar_drhd_unit *drhd;
3385        struct intel_iommu *iommu;
3386
3387        for_each_active_iommu(iommu, drhd) {
3388                iommu->flush.flush_context(iommu, 0, 0, 0,
3389                                           DMA_CCMD_GLOBAL_INVL);
3390                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3391                                         DMA_TLB_GLOBAL_FLUSH);
3392        }
3393}
3394
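/*
 * Suspend: flush all context/IOTLB caches, disable translation and
 * save each IOMMU's fault-event registers (FECTL/FEDATA/FEADDR/
 * FEUADDR) so iommu_resume() can restore them.
 */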
3395static int iommu_suspend(void)
3396{
3397        struct dmar_drhd_unit *drhd;
3398        struct intel_iommu *iommu = NULL;
3399        unsigned long flag;
3400
3401        for_each_active_iommu(iommu, drhd) {
3402                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3403                                                 GFP_ATOMIC);
3404                if (!iommu->iommu_state)
3405                        goto nomem;
3406        }
3407
3408        iommu_flush_all();
3409
3410        for_each_active_iommu(iommu, drhd) {
3411                iommu_disable_translation(iommu);
3412
3413                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3414
3415                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3416                        readl(iommu->reg + DMAR_FECTL_REG);
3417                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3418                        readl(iommu->reg + DMAR_FEDATA_REG);
3419                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3420                        readl(iommu->reg + DMAR_FEADDR_REG);
3421                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3422                        readl(iommu->reg + DMAR_FEUADDR_REG);
3423
3424                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3425        }
3426        return 0;
3427
3428nomem:
3429        for_each_active_iommu(iommu, drhd)
3430                kfree(iommu->iommu_state);
3431
3432        return -ENOMEM;
3433}
3434
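/*
 * Resume: re-initialize the hardware via init_iommu_hw() and restore
 * the fault-event registers saved by iommu_suspend(), then free the
 * saved state.
 */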
3435static void iommu_resume(void)
3436{
3437        struct dmar_drhd_unit *drhd;
3438        struct intel_iommu *iommu = NULL;
3439        unsigned long flag;
3440
3441        if (init_iommu_hw()) {
3442                if (force_on)
3443                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3444                else
3445                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3446                return;
3447        }
3448
3449        for_each_active_iommu(iommu, drhd) {
3450
3451                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3452
3453                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3454                        iommu->reg + DMAR_FECTL_REG);
3455                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3456                        iommu->reg + DMAR_FEDATA_REG);
3457                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3458                        iommu->reg + DMAR_FEADDR_REG);
3459                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3460                        iommu->reg + DMAR_FEUADDR_REG);
3461
3462                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3463        }
3464
3465        for_each_active_iommu(iommu, drhd)
3466                kfree(iommu->iommu_state);
3467}
3468
3469static struct syscore_ops iommu_syscore_ops = {
3470        .resume         = iommu_resume,
3471        .suspend        = iommu_suspend,
3472};
3473
3474static void __init init_iommu_pm_ops(void)
3475{
3476        register_syscore_ops(&iommu_syscore_ops);
3477}
3478
3479#else
3480static inline void init_iommu_pm_ops(void) {}
3481#endif  /* CONFIG_SUSPEND */
3482
3483LIST_HEAD(dmar_rmrr_units);
3484
3485static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3486{
3487        list_add(&rmrr->list, &dmar_rmrr_units);
3488}
3489
3491int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3492{
3493        struct acpi_dmar_reserved_memory *rmrr;
3494        struct dmar_rmrr_unit *rmrru;
3495
3496        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3497        if (!rmrru)
3498                return -ENOMEM;
3499
3500        rmrru->hdr = header;
3501        rmrr = (struct acpi_dmar_reserved_memory *)header;
3502        rmrru->base_address = rmrr->base_address;
3503        rmrru->end_address = rmrr->end_address;
3504
3505        dmar_register_rmrr_unit(rmrru);
3506        return 0;
3507}
3508
3509static int __init
3510rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3511{
3512        struct acpi_dmar_reserved_memory *rmrr;
3513        int ret;
3514
3515        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3516        ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3517                ((void *)rmrr) + rmrr->header.length,
3518                &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3519
3520        if (ret || (rmrru->devices_cnt == 0)) {
3521                list_del(&rmrru->list);
3522                kfree(rmrru);
3523        }
3524        return ret;
3525}
3526
3527static LIST_HEAD(dmar_atsr_units);
3528
3529int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3530{
3531        struct acpi_dmar_atsr *atsr;
3532        struct dmar_atsr_unit *atsru;
3533
3534        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3535        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3536        if (!atsru)
3537                return -ENOMEM;
3538
3539        atsru->hdr = hdr;
3540        atsru->include_all = atsr->flags & 0x1;
3541
3542        list_add(&atsru->list, &dmar_atsr_units);
3543
3544        return 0;
3545}
3546
3547static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3548{
3549        int rc;
3550        struct acpi_dmar_atsr *atsr;
3551
3552        if (atsru->include_all)
3553                return 0;
3554
3555        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3556        rc = dmar_parse_dev_scope((void *)(atsr + 1),
3557                                (void *)atsr + atsr->header.length,
3558                                &atsru->devices_cnt, &atsru->devices,
3559                                atsr->segment);
3560        if (rc || !atsru->devices_cnt) {
3561                list_del(&atsru->list);
3562                kfree(atsru);
3563        }
3564
3565        return rc;
3566}
3567
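/*
 * Walk up from the device to its PCIe root port and report whether
 * that root port is listed in the ATSR unit registered for the
 * device's PCI segment; an include_all ATSR matches unconditionally.
 */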
3568int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3569{
3570        int i;
3571        struct pci_bus *bus;
3572        struct acpi_dmar_atsr *atsr;
3573        struct dmar_atsr_unit *atsru;
3574
3575        dev = pci_physfn(dev);
3576
3577        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3578                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3579                if (atsr->segment == pci_domain_nr(dev->bus))
3580                        goto found;
3581        }
3582
3583        return 0;
3584
3585found:
3586        for (bus = dev->bus; bus; bus = bus->parent) {
3587                struct pci_dev *bridge = bus->self;
3588
3589                if (!bridge || !pci_is_pcie(bridge) ||
3590                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3591                        return 0;
3592
3593                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3594                        for (i = 0; i < atsru->devices_cnt; i++)
3595                                if (atsru->devices[i] == bridge)
3596                                        return 1;
3597                        break;
3598                }
3599        }
3600
3601        if (atsru->include_all)
3602                return 1;
3603
3604        return 0;
3605}
3606
3607int __init dmar_parse_rmrr_atsr_dev(void)
3608{
3609        struct dmar_rmrr_unit *rmrr, *rmrr_n;
3610        struct dmar_atsr_unit *atsr, *atsr_n;
3611        int ret = 0;
3612
3613        list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3614                ret = rmrr_parse_dev(rmrr);
3615                if (ret)
3616                        return ret;
3617        }
3618
3619        list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3620                ret = atsr_parse_dev(atsr);
3621                if (ret)
3622                        return ret;
3623        }
3624
3625        return ret;
3626}
3627
3628/*
3629 * Here we only respond to a device being unbound from its driver.
3630 *
3631 * A newly added device is not attached to its DMAR domain here; that happens
3632 * when the device is first mapped to an iova.
3633 */
3634static int device_notifier(struct notifier_block *nb,
3635                                  unsigned long action, void *data)
3636{
3637        struct device *dev = data;
3638        struct pci_dev *pdev = to_pci_dev(dev);
3639        struct dmar_domain *domain;
3640
3641        if (iommu_no_mapping(dev))
3642                return 0;
3643
3644        domain = find_domain(pdev);
3645        if (!domain)
3646                return 0;
3647
3648        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3649                domain_remove_one_dev_info(domain, pdev);
3650
3651                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3652                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3653                    list_empty(&domain->devices))
3654                        domain_exit(domain);
3655        }
3656
3657        return 0;
3658}
3659
3660static struct notifier_block device_nb = {
3661        .notifier_call = device_notifier,
3662};
3663
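/*
 * Boot-time entry point: parse the DMAR table and device scopes,
 * disable any translation left enabled by firmware, set up the memory
 * pools, reserved ranges and DMA remapping (init_dmars), then install
 * intel_dma_ops, the suspend/resume hooks, the IOMMU API ops and the
 * driver-unbind notifier.
 */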
3664int __init intel_iommu_init(void)
3665{
3666        int ret = 0;
3667        struct dmar_drhd_unit *drhd;
3668
3669        /* VT-d is required for a TXT/tboot launch, so enforce that */
3670        force_on = tboot_force_iommu();
3671
3672        if (dmar_table_init()) {
3673                if (force_on)
3674                        panic("tboot: Failed to initialize DMAR table\n");
3675                return  -ENODEV;
3676        }
3677
3678        /*
3679         * Disable translation if already enabled prior to OS handover.
3680         */
3681        for_each_drhd_unit(drhd) {
3682                struct intel_iommu *iommu;
3683
3684                if (drhd->ignored)
3685                        continue;
3686
3687                iommu = drhd->iommu;
3688                if (iommu->gcmd & DMA_GCMD_TE)
3689                        iommu_disable_translation(iommu);
3690        }
3691
3692        if (dmar_dev_scope_init() < 0) {
3693                if (force_on)
3694                        panic("tboot: Failed to initialize DMAR device scope\n");
3695                return  -ENODEV;
3696        }
3697
3698        if (no_iommu || dmar_disabled)
3699                return -ENODEV;
3700
3701        if (iommu_init_mempool()) {
3702                if (force_on)
3703                        panic("tboot: Failed to initialize iommu memory\n");
3704                return  -ENODEV;
3705        }
3706
3707        if (list_empty(&dmar_rmrr_units))
3708                printk(KERN_INFO "DMAR: No RMRR found\n");
3709
3710        if (list_empty(&dmar_atsr_units))
3711                printk(KERN_INFO "DMAR: No ATSR found\n");
3712
3713        if (dmar_init_reserved_ranges()) {
3714                if (force_on)
3715                        panic("tboot: Failed to reserve iommu ranges\n");
3716                return  -ENODEV;
3717        }
3718
3719        init_no_remapping_devices();
3720
3721        ret = init_dmars();
3722        if (ret) {
3723                if (force_on)
3724                        panic("tboot: Failed to initialize DMARs\n");
3725                printk(KERN_ERR "IOMMU: dmar init failed\n");
3726                put_iova_domain(&reserved_iova_list);
3727                iommu_exit_mempool();
3728                return ret;
3729        }
3730        printk(KERN_INFO
3731        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3732
3733        init_timer(&unmap_timer);
3734#ifdef CONFIG_SWIOTLB
3735        swiotlb = 0;
3736#endif
3737        dma_ops = &intel_dma_ops;
3738
3739        init_iommu_pm_ops();
3740
3741        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3742
3743        bus_register_notifier(&pci_bus_type, &device_nb);
3744
3745        intel_iommu_enabled = 1;
3746
3747        return 0;
3748}
3749
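/*
 * If the device sits behind an upstream PCIe-to-PCI bridge, the
 * context entries set up for the intermediate bridges must go too:
 * detach every bridge between the device and that topmost bridge,
 * then the bridge's secondary bus (PCIe) or the bridge itself (PCI).
 */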
3750static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3751                                           struct pci_dev *pdev)
3752{
3753        struct pci_dev *tmp, *parent;
3754
3755        if (!iommu || !pdev)
3756                return;
3757
3758        /* dependent device detach */
3759        tmp = pci_find_upstream_pcie_bridge(pdev);
3760        /* Secondary interface's bus number and devfn 0 */
3761        if (tmp) {
3762                parent = pdev->bus->self;
3763                while (parent != tmp) {
3764                        iommu_detach_dev(iommu, parent->bus->number,
3765                                         parent->devfn);
3766                        parent = parent->bus->self;
3767                }
3768                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3769                        iommu_detach_dev(iommu,
3770                                tmp->subordinate->number, 0);
3771                else /* this is a legacy PCI bridge */
3772                        iommu_detach_dev(iommu, tmp->bus->number,
3773                                         tmp->devfn);
3774        }
3775}
3776
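/*
 * Remove one device from a domain: unlink and free its
 * device_domain_info, disable its device-IOTLB and detach its context
 * (plus dependent bridges). If no other device behind the same IOMMU
 * remains in the domain, clear that IOMMU from the domain's bitmap
 * and, for non-VM/non-SI domains, release the domain id on it.
 */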
3777static void domain_remove_one_dev_info(struct dmar_domain *domain,
3778                                          struct pci_dev *pdev)
3779{
3780        struct device_domain_info *info;
3781        struct intel_iommu *iommu;
3782        unsigned long flags;
3783        int found = 0;
3784        struct list_head *entry, *tmp;
3785
3786        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3787                                pdev->devfn);
3788        if (!iommu)
3789                return;
3790
3791        spin_lock_irqsave(&device_domain_lock, flags);
3792        list_for_each_safe(entry, tmp, &domain->devices) {
3793                info = list_entry(entry, struct device_domain_info, link);
3794                if (info->segment == pci_domain_nr(pdev->bus) &&
3795                    info->bus == pdev->bus->number &&
3796                    info->devfn == pdev->devfn) {
3797                        unlink_domain_info(info);
3798                        spin_unlock_irqrestore(&device_domain_lock, flags);
3799
3800                        iommu_disable_dev_iotlb(info);
3801                        iommu_detach_dev(iommu, info->bus, info->devfn);
3802                        iommu_detach_dependent_devices(iommu, pdev);
3803                        free_devinfo_mem(info);
3804
3805                        spin_lock_irqsave(&device_domain_lock, flags);
3806
3807                        if (found)
3808                                break;
3809                        else
3810                                continue;
3811                }
3812
3813                /* If there are no other devices under the same iommu
3814                 * owned by this domain, clear this iommu in iommu_bmp,
3815                 * update the iommu count and coherency.
3816                 */
3817                if (iommu == device_to_iommu(info->segment, info->bus,
3818                                            info->devfn))
3819                        found = 1;
3820        }
3821
3822        spin_unlock_irqrestore(&device_domain_lock, flags);
3823
3824        if (found == 0) {
3825                unsigned long tmp_flags;
3826                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3827                clear_bit(iommu->seq_id, domain->iommu_bmp);
3828                domain->iommu_count--;
3829                domain_update_iommu_cap(domain);
3830                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3831
3832                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3833                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3834                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3835                        clear_bit(domain->id, iommu->domain_ids);
3836                        iommu->domains[domain->id] = NULL;
3837                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3838                }
3839        }
3840}
3841
3842static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3843{
3844        struct device_domain_info *info;
3845        struct intel_iommu *iommu;
3846        unsigned long flags1, flags2;
3847
3848        spin_lock_irqsave(&device_domain_lock, flags1);
3849        while (!list_empty(&domain->devices)) {
3850                info = list_entry(domain->devices.next,
3851                        struct device_domain_info, link);
3852                unlink_domain_info(info);
3853                spin_unlock_irqrestore(&device_domain_lock, flags1);
3854
3855                iommu_disable_dev_iotlb(info);
3856                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3857                iommu_detach_dev(iommu, info->bus, info->devfn);
3858                iommu_detach_dependent_devices(iommu, info->dev);
3859
3860                /* clear this iommu in iommu_bmp, update iommu count
3861                 * and capabilities
3862                 */
3863                spin_lock_irqsave(&domain->iommu_lock, flags2);
3864                if (test_and_clear_bit(iommu->seq_id,
3865                                       domain->iommu_bmp)) {
3866                        domain->iommu_count--;
3867                        domain_update_iommu_cap(domain);
3868                }
3869                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3870
3871                free_devinfo_mem(info);
3872                spin_lock_irqsave(&device_domain_lock, flags1);
3873        }
3874        spin_unlock_irqrestore(&device_domain_lock, flags1);
3875}
3876
3877/* domain id for virtual machine; it won't be set in a context entry */
3878static unsigned long vm_domid;
3879
3880static struct dmar_domain *iommu_alloc_vm_domain(void)
3881{
3882        struct dmar_domain *domain;
3883
3884        domain = alloc_domain_mem();
3885        if (!domain)
3886                return NULL;
3887
3888        domain->id = vm_domid++;
3889        domain->nid = -1;
3890        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3891        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3892
3893        return domain;
3894}
3895
3896static int md_domain_init(struct dmar_domain *domain, int guest_width)
3897{
3898        int adjust_width;
3899
3900        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3901        spin_lock_init(&domain->iommu_lock);
3902
3903        domain_reserve_special_ranges(domain);
3904
3905        /* calculate AGAW */
3906        domain->gaw = guest_width;
3907        adjust_width = guestwidth_to_adjustwidth(guest_width);
3908        domain->agaw = width_to_agaw(adjust_width);
3909
3910        INIT_LIST_HEAD(&domain->devices);
3911
3912        domain->iommu_count = 0;
3913        domain->iommu_coherency = 0;
3914        domain->iommu_snooping = 0;
3915        domain->iommu_superpage = 0;
3916        domain->max_addr = 0;
3917        domain->nid = -1;
3918
3919        /* always allocate the top pgd */
3920        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3921        if (!domain->pgd)
3922                return -ENOMEM;
3923        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3924        return 0;
3925}
3926
3927static void iommu_free_vm_domain(struct dmar_domain *domain)
3928{
3929        unsigned long flags;
3930        struct dmar_drhd_unit *drhd;
3931        struct intel_iommu *iommu;
3932        unsigned long i;
3933        unsigned long ndomains;
3934
3935        for_each_drhd_unit(drhd) {
3936                if (drhd->ignored)
3937                        continue;
3938                iommu = drhd->iommu;
3939
3940                ndomains = cap_ndoms(iommu->cap);
3941                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3942                        if (iommu->domains[i] == domain) {
3943                                spin_lock_irqsave(&iommu->lock, flags);
3944                                clear_bit(i, iommu->domain_ids);
3945                                iommu->domains[i] = NULL;
3946                                spin_unlock_irqrestore(&iommu->lock, flags);
3947                                break;
3948                        }
3949                }
3950        }
3951}
3952
3953static void vm_domain_exit(struct dmar_domain *domain)
3954{
3955        /* Domain 0 is reserved, so don't process it */
3956        if (!domain)
3957                return;
3958
3959        vm_domain_remove_all_dev_info(domain);
3960        /* destroy iovas */
3961        put_iova_domain(&domain->iovad);
3962
3963        /* clear ptes */
3964        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3965
3966        /* free page tables */
3967        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3968
3969        iommu_free_vm_domain(domain);
3970        free_domain_mem(domain);
3971}
3972
3973static int intel_iommu_domain_init(struct iommu_domain *domain)
3974{
3975        struct dmar_domain *dmar_domain;
3976
3977        dmar_domain = iommu_alloc_vm_domain();
3978        if (!dmar_domain) {
3979                printk(KERN_ERR
3980                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3981                return -ENOMEM;
3982        }
3983        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3984                printk(KERN_ERR
3985                        "intel_iommu_domain_init() failed\n");
3986                vm_domain_exit(dmar_domain);
3987                return -ENOMEM;
3988        }
3989        domain_update_iommu_cap(dmar_domain);
3990        domain->priv = dmar_domain;
3991
3992        domain->geometry.aperture_start = 0;
3993        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3994        domain->geometry.force_aperture = true;
3995
3996        return 0;
3997}
3998
3999static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4000{
4001        struct dmar_domain *dmar_domain = domain->priv;
4002
4003        domain->priv = NULL;
4004        vm_domain_exit(dmar_domain);
4005}
4006
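/*
 * Attach a device to an IOMMU-API domain: detach any private DMAR
 * domain it already has, check that the IOMMU's address width covers
 * the domain's max mapped address, and strip unused top page-table
 * levels until the domain's agaw fits this IOMMU.
 */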
4007static int intel_iommu_attach_device(struct iommu_domain *domain,
4008                                     struct device *dev)
4009{
4010        struct dmar_domain *dmar_domain = domain->priv;
4011        struct pci_dev *pdev = to_pci_dev(dev);
4012        struct intel_iommu *iommu;
4013        int addr_width;
4014
4015        /* normally pdev is not mapped */
4016        if (unlikely(domain_context_mapped(pdev))) {
4017                struct dmar_domain *old_domain;
4018
4019                old_domain = find_domain(pdev);
4020                if (old_domain) {
4021                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4022                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4023                                domain_remove_one_dev_info(old_domain, pdev);
4024                        else
4025                                domain_remove_dev_info(old_domain);
4026                }
4027        }
4028
4029        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4030                                pdev->devfn);
4031        if (!iommu)
4032                return -ENODEV;
4033
4034        /* check if this iommu agaw is sufficient for max mapped address */
4035        addr_width = agaw_to_width(iommu->agaw);
4036        if (addr_width > cap_mgaw(iommu->cap))
4037                addr_width = cap_mgaw(iommu->cap);
4038
4039        if (dmar_domain->max_addr > (1LL << addr_width)) {
4040                printk(KERN_ERR "%s: iommu width (%d) is not "
4041                       "sufficient for the mapped address (%llx)\n",
4042                       __func__, addr_width, dmar_domain->max_addr);
4043                return -EFAULT;
4044        }
4045        dmar_domain->gaw = addr_width;
4046
4047        /*
4048         * Knock out extra levels of page tables if necessary
4049         */
4050        while (iommu->agaw < dmar_domain->agaw) {
4051                struct dma_pte *pte;
4052
4053                pte = dmar_domain->pgd;
4054                if (dma_pte_present(pte)) {
4055                        dmar_domain->pgd = (struct dma_pte *)
4056                                phys_to_virt(dma_pte_addr(pte));
4057                        free_pgtable_page(pte);
4058                }
4059                dmar_domain->agaw--;
4060        }
4061
4062        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4063}
4064
4065static void intel_iommu_detach_device(struct iommu_domain *domain,
4066                                      struct device *dev)
4067{
4068        struct dmar_domain *dmar_domain = domain->priv;
4069        struct pci_dev *pdev = to_pci_dev(dev);
4070
4071        domain_remove_one_dev_info(dmar_domain, pdev);
4072}
4073
4074static int intel_iommu_map(struct iommu_domain *domain,
4075                           unsigned long iova, phys_addr_t hpa,
4076                           size_t size, int iommu_prot)
4077{
4078        struct dmar_domain *dmar_domain = domain->priv;
4079        u64 max_addr;
4080        int prot = 0;
4081        int ret;
4082
4083        if (iommu_prot & IOMMU_READ)
4084                prot |= DMA_PTE_READ;
4085        if (iommu_prot & IOMMU_WRITE)
4086                prot |= DMA_PTE_WRITE;
4087        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4088                prot |= DMA_PTE_SNP;
4089
4090        max_addr = iova + size;
4091        if (dmar_domain->max_addr < max_addr) {
4092                u64 end;
4093
4094                /* check if minimum agaw is sufficient for mapped address */
4095                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4096                if (end < max_addr) {
4097                        printk(KERN_ERR "%s: iommu width (%d) is not "
4098                               "sufficient for the mapped address (%llx)\n",
4099                               __func__, dmar_domain->gaw, max_addr);
4100                        return -EFAULT;
4101                }
4102                dmar_domain->max_addr = max_addr;
4103        }
4104        /* Round up size to next multiple of PAGE_SIZE, if it and
4105           the low bits of hpa would take us onto the next page */
4106        size = aligned_nrpages(hpa, size);
4107        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4108                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4109        return ret;
4110}
4111
4112static size_t intel_iommu_unmap(struct iommu_domain *domain,
4113                             unsigned long iova, size_t size)
4114{
4115        struct dmar_domain *dmar_domain = domain->priv;
4116        int order;
4117
4118        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4119                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4120
4121        if (dmar_domain->max_addr == iova + size)
4122                dmar_domain->max_addr = iova;
4123
4124        return PAGE_SIZE << order;
4125}
4126
4127static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4128                                            dma_addr_t iova)
4129{
4130        struct dmar_domain *dmar_domain = domain->priv;
4131        struct dma_pte *pte;
4132        u64 phys = 0;
4133
4134        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4135        if (pte)
4136                phys = dma_pte_addr(pte);
4137
4138        return phys;
4139}
4140
4141static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4142                                      unsigned long cap)
4143{
4144        struct dmar_domain *dmar_domain = domain->priv;
4145
4146        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4147                return dmar_domain->iommu_snooping;
4148        if (cap == IOMMU_CAP_INTR_REMAP)
4149                return irq_remapping_enabled;
4150
4151        return 0;
4152}
4153
4154#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4155
4156static int intel_iommu_add_device(struct device *dev)
4157{
4158        struct pci_dev *pdev = to_pci_dev(dev);
4159        struct pci_dev *bridge, *dma_pdev = NULL;
4160        struct iommu_group *group;
4161        int ret;
4162
4163        if (!device_to_iommu(pci_domain_nr(pdev->bus),
4164                             pdev->bus->number, pdev->devfn))
4165                return -ENODEV;
4166
4167        bridge = pci_find_upstream_pcie_bridge(pdev);
4168        if (bridge) {
4169                if (pci_is_pcie(bridge))
4170                        dma_pdev = pci_get_domain_bus_and_slot(
4171                                                pci_domain_nr(pdev->bus),
4172                                                bridge->subordinate->number, 0);
4173                if (!dma_pdev)
4174                        dma_pdev = pci_dev_get(bridge);
4175        } else
4176                dma_pdev = pci_dev_get(pdev);
4177
4178        /* Account for quirked devices */
4179        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4180
4181        /*
4182         * If it's a multifunction device that does not support our
4183         * required ACS flags, add it to the same group as the lowest
4184         * numbered function that also lacks the required ACS flags.
4185         */
4186        if (dma_pdev->multifunction &&
4187            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4188                u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4189
4190                for (i = 0; i < 8; i++) {
4191                        struct pci_dev *tmp;
4192
4193                        tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4194                        if (!tmp)
4195                                continue;
4196
4197                        if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4198                                swap_pci_ref(&dma_pdev, tmp);
4199                                break;
4200                        }
4201                        pci_dev_put(tmp);
4202                }
4203        }
4204
4205        /*
4206         * Devices on the root bus go through the iommu.  If that's not us,
4207         * find the next upstream device and test ACS up to the root bus.
4208         * Finding the next device may require skipping virtual buses.
4209         */
4210        while (!pci_is_root_bus(dma_pdev->bus)) {
4211                struct pci_bus *bus = dma_pdev->bus;
4212
4213                while (!bus->self) {
4214                        if (!pci_is_root_bus(bus))
4215                                bus = bus->parent;
4216                        else
4217                                goto root_bus;
4218                }
4219
4220                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4221                        break;
4222
4223                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4224        }
4225
4226root_bus:
4227        group = iommu_group_get(&dma_pdev->dev);
4228        pci_dev_put(dma_pdev);
4229        if (!group) {
4230                group = iommu_group_alloc();
4231                if (IS_ERR(group))
4232                        return PTR_ERR(group);
4233        }
4234
4235        ret = iommu_group_add_device(group, dev);
4236
4237        iommu_group_put(group);
4238        return ret;
4239}
4240
4241static void intel_iommu_remove_device(struct device *dev)
4242{
4243        iommu_group_remove_device(dev);
4244}
4245
4246static struct iommu_ops intel_iommu_ops = {
4247        .domain_init    = intel_iommu_domain_init,
4248        .domain_destroy = intel_iommu_domain_destroy,
4249        .attach_dev     = intel_iommu_attach_device,
4250        .detach_dev     = intel_iommu_detach_device,
4251        .map            = intel_iommu_map,
4252        .unmap          = intel_iommu_unmap,
4253        .iova_to_phys   = intel_iommu_iova_to_phys,
4254        .domain_has_cap = intel_iommu_domain_has_cap,
4255        .add_device     = intel_iommu_add_device,
4256        .remove_device  = intel_iommu_remove_device,
4257        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4258};
4259
4260static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4261{
4262        /* G4x/GM45 integrated gfx dmar support is totally busted. */
4263        printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4264        dmar_map_gfx = 0;
4265}
4266
4267DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4268DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4269DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4270DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4271DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4272DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4273DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4274
4275static void quirk_iommu_rwbf(struct pci_dev *dev)
4276{
4277        /*
4278         * Mobile 4 Series Chipset neglects to set RWBF capability,
4279         * but needs it. Same seems to hold for the desktop versions.
4280         */
4281        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4282        rwbf_quirk = 1;
4283}
4284
4285DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4286DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4287DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4288DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4289DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4290DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4291DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4292
4293#define GGC 0x52
4294#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4295#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4296#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4297#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4298#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4299#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4300#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4301#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4302
4303static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4304{
4305        unsigned short ggc;
4306
4307        if (pci_read_config_word(dev, GGC, &ggc))
4308                return;
4309
4310        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4311                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4312                dmar_map_gfx = 0;
4313        } else if (dmar_map_gfx) {
4314                /* we have to ensure the gfx device is idle before we flush */
4315                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4316                intel_iommu_strict = 1;
4317        }
4318}
4319DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4320DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4321DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4322DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4323
4324/* On Tylersburg chipsets, some BIOSes have been known to enable the
4325   ISOCH DMAR unit for the Azalia sound device, but not give it any
4326   TLB entries, which causes it to deadlock. Check for that.  We do
4327   this in a function called from init_dmars(), instead of in a PCI
4328   quirk, because we don't want to print the obnoxious "BIOS broken"
4329   message if VT-d is actually disabled.
4330*/
4331static void __init check_tylersburg_isoch(void)
4332{
4333        struct pci_dev *pdev;
4334        uint32_t vtisochctrl;
4335
4336        /* If there's no Azalia in the system anyway, forget it. */
4337        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4338        if (!pdev)
4339                return;
4340        pci_dev_put(pdev);
4341
4342        /* System Management Registers. Might be hidden, in which case
4343           we can't do the sanity check. But that's OK, because the
4344           known-broken BIOSes _don't_ actually hide it, so far. */
4345        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4346        if (!pdev)
4347                return;
4348
4349        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4350                pci_dev_put(pdev);
4351                return;
4352        }
4353
4354        pci_dev_put(pdev);
4355
4356        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4357        if (vtisochctrl & 1)
4358                return;
4359
4360        /* Drop all bits other than the number of TLB entries */
4361        vtisochctrl &= 0x1c;
4362
4363        /* If we have the recommended number of TLB entries (16), fine. */
4364        if (vtisochctrl == 0x10)
4365                return;
4366
4367        /* Zero TLB entries? You get to ride the short bus to school. */
4368        if (!vtisochctrl) {
4369                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4370                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4371                     dmi_get_system_info(DMI_BIOS_VENDOR),
4372                     dmi_get_system_info(DMI_BIOS_VERSION),
4373                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4374                iommu_identity_mapping |= IDENTMAP_AZALIA;
4375                return;
4376        }
4377
4378        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4379               vtisochctrl);
4380}
4381