linux/drivers/iommu/intel-iommu.c
   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
  27#include <linux/export.h>
  28#include <linux/slab.h>
  29#include <linux/irq.h>
  30#include <linux/interrupt.h>
  31#include <linux/spinlock.h>
  32#include <linux/pci.h>
  33#include <linux/dmar.h>
  34#include <linux/dma-mapping.h>
  35#include <linux/mempool.h>
  36#include <linux/timer.h>
  37#include <linux/iova.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/syscore_ops.h>
  41#include <linux/tboot.h>
  42#include <linux/dmi.h>
  43#include <linux/pci-ats.h>
  44#include <linux/memblock.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/cacheflush.h>
  47#include <asm/iommu.h>
  48
  49#include "irq_remapping.h"
  50
  51#define ROOT_SIZE               VTD_PAGE_SIZE
  52#define CONTEXT_SIZE            VTD_PAGE_SIZE
  53
  54#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  55#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  56#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  57
  58#define IOAPIC_RANGE_START      (0xfee00000)
  59#define IOAPIC_RANGE_END        (0xfeefffff)
  60#define IOVA_START_ADDR         (0x1000)
  61
  62#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  63
  64#define MAX_AGAW_WIDTH 64
  65
  66#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  67#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  68
  69/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  70   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  71#define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  72                                __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  73#define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  74
  75#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  76#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  77#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
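/*
 * DMA_32BIT_PFN / DMA_64BIT_PFN are simply the highest page frames
 * addressable with a 32-bit or 64-bit DMA mask respectively.
 */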
  78
  79/* page table handling */
  80#define LEVEL_STRIDE            (9)
  81#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  82
  83/*
   84 * This bitmap is used to advertise the page sizes our hardware supports
  85 * to the IOMMU core, which will then use this information to split
  86 * physically contiguous memory regions it is mapping into page sizes
  87 * that we support.
  88 *
  89 * Traditionally the IOMMU core just handed us the mappings directly,
  90 * after making sure the size is an order of a 4KiB page and that the
  91 * mapping has natural alignment.
  92 *
  93 * To retain this behavior, we currently advertise that we support
  94 * all page sizes that are an order of 4KiB.
  95 *
  96 * If at some point we'd like to utilize the IOMMU core's new behavior,
  97 * we could change this to advertise the real page sizes we support.
  98 */
  99#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
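/*
 * ~0xFFFUL leaves every bit from 12 upward set, i.e. it advertises all
 * power-of-two sizes from 4KiB up (4KiB, 8KiB, 16KiB, ...), matching
 * the "any order of 4KiB" behaviour described above.
 */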
 100
 101static inline int agaw_to_level(int agaw)
 102{
 103        return agaw + 2;
 104}
 105
 106static inline int agaw_to_width(int agaw)
 107{
 108        return 30 + agaw * LEVEL_STRIDE;
 109}
 110
 111static inline int width_to_agaw(int width)
 112{
 113        return (width - 30) / LEVEL_STRIDE;
 114}
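/*
 * agaw, guest address width and page-table depth line up as follows:
 *   agaw 0 -> 30-bit width, 2-level table
 *   agaw 1 -> 39-bit width, 3-level table
 *   agaw 2 -> 48-bit width, 4-level table (DEFAULT_DOMAIN_ADDRESS_WIDTH)
 * i.e. each extra level adds LEVEL_STRIDE (9) bits of address space.
 */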
 115
 116static inline unsigned int level_to_offset_bits(int level)
 117{
 118        return (level - 1) * LEVEL_STRIDE;
 119}
 120
 121static inline int pfn_level_offset(unsigned long pfn, int level)
 122{
 123        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 124}
 125
 126static inline unsigned long level_mask(int level)
 127{
 128        return -1UL << level_to_offset_bits(level);
 129}
 130
 131static inline unsigned long level_size(int level)
 132{
 133        return 1UL << level_to_offset_bits(level);
 134}
 135
 136static inline unsigned long align_to_level(unsigned long pfn, int level)
 137{
 138        return (pfn + level_size(level) - 1) & level_mask(level);
 139}
 140
 141static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 142{
 143        return  1 << ((lvl - 1) * LEVEL_STRIDE);
 144}
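/*
 * lvl_to_nr_pages() is the number of 4KiB VT-d pages covered by one
 * entry at the given level: level 1 -> 1 page (4KiB), level 2 -> 512
 * pages (2MiB), level 3 -> 512^2 pages (1GiB), and so on.
 */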
 145
 146/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 147   are never going to work. */
 148static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 149{
 150        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 151}
 152
 153static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 154{
 155        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 156}
 157static inline unsigned long page_to_dma_pfn(struct page *pg)
 158{
 159        return mm_to_dma_pfn(page_to_pfn(pg));
 160}
 161static inline unsigned long virt_to_dma_pfn(void *p)
 162{
 163        return page_to_dma_pfn(virt_to_page(p));
 164}
 165
 166/* global iommu list, set NULL for ignored DMAR units */
 167static struct intel_iommu **g_iommus;
 168
 169static void __init check_tylersburg_isoch(void);
 170static int rwbf_quirk;
 171
 172/*
  173 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
  174 * (used when the kernel is launched with TXT)
 175 */
 176static int force_on = 0;
 177
 178/*
 179 * 0: Present
 180 * 1-11: Reserved
 181 * 12-63: Context Ptr (12 - (haw-1))
 182 * 64-127: Reserved
 183 */
 184struct root_entry {
 185        u64     val;
 186        u64     rsvd1;
 187};
 188#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
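/* With 4KiB pages and 16-byte entries this is 256 entries, one per PCI bus. */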
 189static inline bool root_present(struct root_entry *root)
 190{
 191        return (root->val & 1);
 192}
 193static inline void set_root_present(struct root_entry *root)
 194{
 195        root->val |= 1;
 196}
 197static inline void set_root_value(struct root_entry *root, unsigned long value)
 198{
 199        root->val |= value & VTD_PAGE_MASK;
 200}
 201
 202static inline struct context_entry *
 203get_context_addr_from_root(struct root_entry *root)
 204{
 205        return (struct context_entry *)
 206                (root_present(root)?phys_to_virt(
 207                root->val & VTD_PAGE_MASK) :
 208                NULL);
 209}
 210
 211/*
 212 * low 64 bits:
 213 * 0: present
 214 * 1: fault processing disable
 215 * 2-3: translation type
 216 * 12-63: address space root
 217 * high 64 bits:
 218 * 0-2: address width
 219 * 3-6: aval
 220 * 8-23: domain id
 221 */
 222struct context_entry {
 223        u64 lo;
 224        u64 hi;
 225};
 226
 227static inline bool context_present(struct context_entry *context)
 228{
 229        return (context->lo & 1);
 230}
 231static inline void context_set_present(struct context_entry *context)
 232{
 233        context->lo |= 1;
 234}
 235
 236static inline void context_set_fault_enable(struct context_entry *context)
 237{
 238        context->lo &= (((u64)-1) << 2) | 1;
 239}
 240
 241static inline void context_set_translation_type(struct context_entry *context,
 242                                                unsigned long value)
 243{
 244        context->lo &= (((u64)-1) << 4) | 3;
 245        context->lo |= (value & 3) << 2;
 246}
 247
 248static inline void context_set_address_root(struct context_entry *context,
 249                                            unsigned long value)
 250{
 251        context->lo |= value & VTD_PAGE_MASK;
 252}
 253
 254static inline void context_set_address_width(struct context_entry *context,
 255                                             unsigned long value)
 256{
 257        context->hi |= value & 7;
 258}
 259
 260static inline void context_set_domain_id(struct context_entry *context,
 261                                         unsigned long value)
 262{
 263        context->hi |= (value & ((1 << 16) - 1)) << 8;
 264}
 265
 266static inline void context_clear_entry(struct context_entry *context)
 267{
 268        context->lo = 0;
 269        context->hi = 0;
 270}
 271
 272/*
 273 * 0: readable
 274 * 1: writable
 275 * 2-6: reserved
 276 * 7: super page
 277 * 8-10: available
 278 * 11: snoop behavior
  279 * 12-63: Host physical address
 280 */
 281struct dma_pte {
 282        u64 val;
 283};
 284
 285static inline void dma_clear_pte(struct dma_pte *pte)
 286{
 287        pte->val = 0;
 288}
 289
 290static inline void dma_set_pte_readable(struct dma_pte *pte)
 291{
 292        pte->val |= DMA_PTE_READ;
 293}
 294
 295static inline void dma_set_pte_writable(struct dma_pte *pte)
 296{
 297        pte->val |= DMA_PTE_WRITE;
 298}
 299
 300static inline void dma_set_pte_snp(struct dma_pte *pte)
 301{
 302        pte->val |= DMA_PTE_SNP;
 303}
 304
 305static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 306{
 307        pte->val = (pte->val & ~3) | (prot & 3);
 308}
 309
 310static inline u64 dma_pte_addr(struct dma_pte *pte)
 311{
 312#ifdef CONFIG_64BIT
 313        return pte->val & VTD_PAGE_MASK;
 314#else
 315        /* Must have a full atomic 64-bit read */
 316        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 317#endif
 318}
 319
 320static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 321{
 322        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 323}
 324
 325static inline bool dma_pte_present(struct dma_pte *pte)
 326{
 327        return (pte->val & 3) != 0;
 328}
 329
 330static inline bool dma_pte_superpage(struct dma_pte *pte)
 331{
 332        return (pte->val & (1 << 7));
 333}
 334
 335static inline int first_pte_in_page(struct dma_pte *pte)
 336{
 337        return !((unsigned long)pte & ~VTD_PAGE_MASK);
 338}
 339
 340/*
  341 * This domain is a static identity mapping domain.
  342 *      1. This domain creates a static 1:1 mapping to all usable memory.
  343 *      2. It maps to each iommu if successful.
  344 *      3. Each iommu maps to this domain if successful.
 345 */
 346static struct dmar_domain *si_domain;
 347static int hw_pass_through = 1;
 348
 349/* devices under the same p2p bridge are owned in one domain */
 350#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 351
  352/* domain represents a virtual machine; more than one device
 353 * across iommus may be owned in one domain, e.g. kvm guest.
 354 */
 355#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
 356
  357/* si_domain contains multiple devices */
 358#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
 359
 360/* define the limit of IOMMUs supported in each domain */
 361#ifdef  CONFIG_X86
 362# define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
 363#else
 364# define        IOMMU_UNITS_SUPPORTED   64
 365#endif
 366
 367struct dmar_domain {
 368        int     id;                     /* domain id */
 369        int     nid;                    /* node id */
 370        DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
 371                                        /* bitmap of iommus this domain uses*/
 372
 373        struct list_head devices;       /* all devices' list */
 374        struct iova_domain iovad;       /* iova's that belong to this domain */
 375
 376        struct dma_pte  *pgd;           /* virtual address */
 377        int             gaw;            /* max guest address width */
 378
 379        /* adjusted guest address width, 0 is level 2 30-bit */
 380        int             agaw;
 381
 382        int             flags;          /* flags to find out type of domain */
 383
 384        int             iommu_coherency;/* indicate coherency of iommu access */
 385        int             iommu_snooping; /* indicate snooping control feature*/
 386        int             iommu_count;    /* reference count of iommu */
 387        int             iommu_superpage;/* Level of superpages supported:
 388                                           0 == 4KiB (no superpages), 1 == 2MiB,
 389                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 390        spinlock_t      iommu_lock;     /* protect iommu set in domain */
 391        u64             max_addr;       /* maximum mapped address */
 392};
 393
 394/* PCI domain-device relationship */
 395struct device_domain_info {
 396        struct list_head link;  /* link to domain siblings */
 397        struct list_head global; /* link to global list */
 398        int segment;            /* PCI domain */
 399        u8 bus;                 /* PCI bus number */
 400        u8 devfn;               /* PCI devfn number */
 401        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 402        struct intel_iommu *iommu; /* IOMMU used by this device */
 403        struct dmar_domain *domain; /* pointer to domain */
 404};
 405
 406static void flush_unmaps_timeout(unsigned long data);
 407
 408DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 409
 410#define HIGH_WATER_MARK 250
 411struct deferred_flush_tables {
 412        int next;
 413        struct iova *iova[HIGH_WATER_MARK];
 414        struct dmar_domain *domain[HIGH_WATER_MARK];
 415};
 416
 417static struct deferred_flush_tables *deferred_flush;
 418
  419/* number of IOMMUs, used to size the g_iommus array */
 420static int g_num_of_iommus;
 421
 422static DEFINE_SPINLOCK(async_umap_flush_lock);
 423static LIST_HEAD(unmaps_to_do);
 424
 425static int timer_on;
 426static long list_size;
 427
 428static void domain_remove_dev_info(struct dmar_domain *domain);
 429
 430#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 431int dmar_disabled = 0;
 432#else
 433int dmar_disabled = 1;
 434#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 435
 436int intel_iommu_enabled = 0;
 437EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 438
 439static int dmar_map_gfx = 1;
 440static int dmar_forcedac;
 441static int intel_iommu_strict;
 442static int intel_iommu_superpage = 1;
 443
 444int intel_iommu_gfx_mapped;
 445EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 446
 447#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 448static DEFINE_SPINLOCK(device_domain_lock);
 449static LIST_HEAD(device_domain_list);
 450
 451static struct iommu_ops intel_iommu_ops;
 452
 453static int __init intel_iommu_setup(char *str)
 454{
 455        if (!str)
 456                return -EINVAL;
 457        while (*str) {
 458                if (!strncmp(str, "on", 2)) {
 459                        dmar_disabled = 0;
 460                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
 461                } else if (!strncmp(str, "off", 3)) {
 462                        dmar_disabled = 1;
 463                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
 464                } else if (!strncmp(str, "igfx_off", 8)) {
 465                        dmar_map_gfx = 0;
 466                        printk(KERN_INFO
 467                                "Intel-IOMMU: disable GFX device mapping\n");
 468                } else if (!strncmp(str, "forcedac", 8)) {
 469                        printk(KERN_INFO
 470                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
 471                        dmar_forcedac = 1;
 472                } else if (!strncmp(str, "strict", 6)) {
 473                        printk(KERN_INFO
 474                                "Intel-IOMMU: disable batched IOTLB flush\n");
 475                        intel_iommu_strict = 1;
 476                } else if (!strncmp(str, "sp_off", 6)) {
 477                        printk(KERN_INFO
 478                                "Intel-IOMMU: disable supported super page\n");
 479                        intel_iommu_superpage = 0;
 480                }
 481
 482                str += strcspn(str, ",");
 483                while (*str == ',')
 484                        str++;
 485        }
 486        return 0;
 487}
 488__setup("intel_iommu=", intel_iommu_setup);
 489
 490static struct kmem_cache *iommu_domain_cache;
 491static struct kmem_cache *iommu_devinfo_cache;
 492static struct kmem_cache *iommu_iova_cache;
 493
 494static inline void *alloc_pgtable_page(int node)
 495{
 496        struct page *page;
 497        void *vaddr = NULL;
 498
 499        page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 500        if (page)
 501                vaddr = page_address(page);
 502        return vaddr;
 503}
 504
 505static inline void free_pgtable_page(void *vaddr)
 506{
 507        free_page((unsigned long)vaddr);
 508}
 509
 510static inline void *alloc_domain_mem(void)
 511{
 512        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 513}
 514
 515static void free_domain_mem(void *vaddr)
 516{
 517        kmem_cache_free(iommu_domain_cache, vaddr);
 518}
 519
  520static inline void *alloc_devinfo_mem(void)
 521{
 522        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 523}
 524
 525static inline void free_devinfo_mem(void *vaddr)
 526{
 527        kmem_cache_free(iommu_devinfo_cache, vaddr);
 528}
 529
 530struct iova *alloc_iova_mem(void)
 531{
 532        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 533}
 534
 535void free_iova_mem(struct iova *iova)
 536{
 537        kmem_cache_free(iommu_iova_cache, iova);
 538}
 539
 540
 541static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 542{
 543        unsigned long sagaw;
 544        int agaw = -1;
 545
 546        sagaw = cap_sagaw(iommu->cap);
 547        for (agaw = width_to_agaw(max_gaw);
 548             agaw >= 0; agaw--) {
 549                if (test_bit(agaw, &sagaw))
 550                        break;
 551        }
 552
 553        return agaw;
 554}
 555
 556/*
 557 * Calculate max SAGAW for each iommu.
 558 */
 559int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 560{
 561        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 562}
 563
 564/*
 565 * calculate agaw for each iommu.
 566 * "SAGAW" may be different across iommus, use a default agaw, and
 567 * get a supported less agaw for iommus that don't support the default agaw.
 568 */
 569int iommu_calculate_agaw(struct intel_iommu *iommu)
 570{
 571        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 572}
 573
  574/* This function only returns a single iommu in a domain */
 575static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 576{
 577        int iommu_id;
 578
 579        /* si_domain and vm domain should not get here. */
 580        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 581        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 582
 583        iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 584        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 585                return NULL;
 586
 587        return g_iommus[iommu_id];
 588}
 589
 590static void domain_update_iommu_coherency(struct dmar_domain *domain)
 591{
 592        int i;
 593
 594        i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
 595
 596        domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
 597
 598        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 599                if (!ecap_coherent(g_iommus[i]->ecap)) {
 600                        domain->iommu_coherency = 0;
 601                        break;
 602                }
 603        }
 604}
 605
 606static void domain_update_iommu_snooping(struct dmar_domain *domain)
 607{
 608        int i;
 609
 610        domain->iommu_snooping = 1;
 611
 612        for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
 613                if (!ecap_sc_support(g_iommus[i]->ecap)) {
 614                        domain->iommu_snooping = 0;
 615                        break;
 616                }
 617        }
 618}
 619
 620static void domain_update_iommu_superpage(struct dmar_domain *domain)
 621{
 622        struct dmar_drhd_unit *drhd;
 623        struct intel_iommu *iommu = NULL;
 624        int mask = 0xf;
 625
 626        if (!intel_iommu_superpage) {
 627                domain->iommu_superpage = 0;
 628                return;
 629        }
 630
 631        /* set iommu_superpage to the smallest common denominator */
 632        for_each_active_iommu(iommu, drhd) {
 633                mask &= cap_super_page_val(iommu->cap);
 634                if (!mask) {
 635                        break;
 636                }
 637        }
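        /*
         * fls() turns the common capability mask into the encoding used
         * in struct dmar_domain: mask 0 -> 0 (4KiB only), 0x1 -> 1
         * (2MiB), 0x3 -> 2 (1GiB), ...
         */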
 638        domain->iommu_superpage = fls(mask);
 639}
 640
 641/* Some capabilities may be different across iommus */
 642static void domain_update_iommu_cap(struct dmar_domain *domain)
 643{
 644        domain_update_iommu_coherency(domain);
 645        domain_update_iommu_snooping(domain);
 646        domain_update_iommu_superpage(domain);
 647}
 648
 649static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 650{
 651        struct dmar_drhd_unit *drhd = NULL;
 652        int i;
 653
 654        for_each_drhd_unit(drhd) {
 655                if (drhd->ignored)
 656                        continue;
 657                if (segment != drhd->segment)
 658                        continue;
 659
 660                for (i = 0; i < drhd->devices_cnt; i++) {
 661                        if (drhd->devices[i] &&
 662                            drhd->devices[i]->bus->number == bus &&
 663                            drhd->devices[i]->devfn == devfn)
 664                                return drhd->iommu;
 665                        if (drhd->devices[i] &&
 666                            drhd->devices[i]->subordinate &&
 667                            drhd->devices[i]->subordinate->number <= bus &&
 668                            drhd->devices[i]->subordinate->busn_res.end >= bus)
 669                                return drhd->iommu;
 670                }
 671
 672                if (drhd->include_all)
 673                        return drhd->iommu;
 674        }
 675
 676        return NULL;
 677}
 678
 679static void domain_flush_cache(struct dmar_domain *domain,
 680                               void *addr, int size)
 681{
 682        if (!domain->iommu_coherency)
 683                clflush_cache_range(addr, size);
 684}
 685
 686/* Gets context entry for a given bus and devfn */
 687static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 688                u8 bus, u8 devfn)
 689{
 690        struct root_entry *root;
 691        struct context_entry *context;
 692        unsigned long phy_addr;
 693        unsigned long flags;
 694
 695        spin_lock_irqsave(&iommu->lock, flags);
 696        root = &iommu->root_entry[bus];
 697        context = get_context_addr_from_root(root);
 698        if (!context) {
 699                context = (struct context_entry *)
 700                                alloc_pgtable_page(iommu->node);
 701                if (!context) {
 702                        spin_unlock_irqrestore(&iommu->lock, flags);
 703                        return NULL;
 704                }
 705                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 706                phy_addr = virt_to_phys((void *)context);
 707                set_root_value(root, phy_addr);
 708                set_root_present(root);
 709                __iommu_flush_cache(iommu, root, sizeof(*root));
 710        }
 711        spin_unlock_irqrestore(&iommu->lock, flags);
 712        return &context[devfn];
 713}
 714
 715static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 716{
 717        struct root_entry *root;
 718        struct context_entry *context;
 719        int ret;
 720        unsigned long flags;
 721
 722        spin_lock_irqsave(&iommu->lock, flags);
 723        root = &iommu->root_entry[bus];
 724        context = get_context_addr_from_root(root);
 725        if (!context) {
 726                ret = 0;
 727                goto out;
 728        }
 729        ret = context_present(&context[devfn]);
 730out:
 731        spin_unlock_irqrestore(&iommu->lock, flags);
 732        return ret;
 733}
 734
 735static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 736{
 737        struct root_entry *root;
 738        struct context_entry *context;
 739        unsigned long flags;
 740
 741        spin_lock_irqsave(&iommu->lock, flags);
 742        root = &iommu->root_entry[bus];
 743        context = get_context_addr_from_root(root);
 744        if (context) {
 745                context_clear_entry(&context[devfn]);
  746                __iommu_flush_cache(iommu, &context[devfn],
 747                        sizeof(*context));
 748        }
 749        spin_unlock_irqrestore(&iommu->lock, flags);
 750}
 751
 752static void free_context_table(struct intel_iommu *iommu)
 753{
 754        struct root_entry *root;
 755        int i;
 756        unsigned long flags;
 757        struct context_entry *context;
 758
 759        spin_lock_irqsave(&iommu->lock, flags);
 760        if (!iommu->root_entry) {
 761                goto out;
 762        }
 763        for (i = 0; i < ROOT_ENTRY_NR; i++) {
 764                root = &iommu->root_entry[i];
 765                context = get_context_addr_from_root(root);
 766                if (context)
 767                        free_pgtable_page(context);
 768        }
 769        free_pgtable_page(iommu->root_entry);
 770        iommu->root_entry = NULL;
 771out:
 772        spin_unlock_irqrestore(&iommu->lock, flags);
 773}
 774
 775static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 776                                      unsigned long pfn, int target_level)
 777{
 778        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 779        struct dma_pte *parent, *pte = NULL;
 780        int level = agaw_to_level(domain->agaw);
 781        int offset;
 782
 783        BUG_ON(!domain->pgd);
 784        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 785        parent = domain->pgd;
 786
 787        while (level > 0) {
 788                void *tmp_page;
 789
 790                offset = pfn_level_offset(pfn, level);
 791                pte = &parent[offset];
 792                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 793                        break;
 794                if (level == target_level)
 795                        break;
 796
 797                if (!dma_pte_present(pte)) {
 798                        uint64_t pteval;
 799
 800                        tmp_page = alloc_pgtable_page(domain->nid);
 801
 802                        if (!tmp_page)
 803                                return NULL;
 804
 805                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 806                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 807                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 808                                /* Someone else set it while we were thinking; use theirs. */
 809                                free_pgtable_page(tmp_page);
 810                        } else {
 811                                dma_pte_addr(pte);
 812                                domain_flush_cache(domain, pte, sizeof(*pte));
 813                        }
 814                }
 815                parent = phys_to_virt(dma_pte_addr(pte));
 816                level--;
 817        }
 818
 819        return pte;
 820}
 821
 822
 823/* return address's pte at specific level */
 824static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 825                                         unsigned long pfn,
 826                                         int level, int *large_page)
 827{
 828        struct dma_pte *parent, *pte = NULL;
 829        int total = agaw_to_level(domain->agaw);
 830        int offset;
 831
 832        parent = domain->pgd;
 833        while (level <= total) {
 834                offset = pfn_level_offset(pfn, total);
 835                pte = &parent[offset];
 836                if (level == total)
 837                        return pte;
 838
 839                if (!dma_pte_present(pte)) {
 840                        *large_page = total;
 841                        break;
 842                }
 843
 844                if (pte->val & DMA_PTE_LARGE_PAGE) {
 845                        *large_page = total;
 846                        return pte;
 847                }
 848
 849                parent = phys_to_virt(dma_pte_addr(pte));
 850                total--;
 851        }
 852        return NULL;
 853}
 854
  855/* clear last level pte; a tlb flush should follow */
 856static int dma_pte_clear_range(struct dmar_domain *domain,
 857                                unsigned long start_pfn,
 858                                unsigned long last_pfn)
 859{
 860        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 861        unsigned int large_page = 1;
 862        struct dma_pte *first_pte, *pte;
 863        int order;
 864
 865        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 866        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 867        BUG_ON(start_pfn > last_pfn);
 868
 869        /* we don't need lock here; nobody else touches the iova range */
 870        do {
 871                large_page = 1;
 872                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 873                if (!pte) {
 874                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 875                        continue;
 876                }
 877                do {
 878                        dma_clear_pte(pte);
 879                        start_pfn += lvl_to_nr_pages(large_page);
 880                        pte++;
 881                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 882
 883                domain_flush_cache(domain, first_pte,
 884                                   (void *)pte - (void *)first_pte);
 885
 886        } while (start_pfn && start_pfn <= last_pfn);
 887
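        /*
         * Report the granule actually cleared: large_page 1 means 4KiB
         * PTEs (order 0), large_page 2 means 2MiB superpages (order 9).
         */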
 888        order = (large_page - 1) * 9;
 889        return order;
 890}
 891
 892/* free page table pages. last level pte should already be cleared */
 893static void dma_pte_free_pagetable(struct dmar_domain *domain,
 894                                   unsigned long start_pfn,
 895                                   unsigned long last_pfn)
 896{
 897        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 898        struct dma_pte *first_pte, *pte;
 899        int total = agaw_to_level(domain->agaw);
 900        int level;
 901        unsigned long tmp;
 902        int large_page = 2;
 903
 904        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 905        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 906        BUG_ON(start_pfn > last_pfn);
 907
 908        /* We don't need lock here; nobody else touches the iova range */
 909        level = 2;
 910        while (level <= total) {
 911                tmp = align_to_level(start_pfn, level);
 912
 913                /* If we can't even clear one PTE at this level, we're done */
 914                if (tmp + level_size(level) - 1 > last_pfn)
 915                        return;
 916
 917                do {
 918                        large_page = level;
 919                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
 920                        if (large_page > level)
 921                                level = large_page + 1;
 922                        if (!pte) {
 923                                tmp = align_to_level(tmp + 1, level + 1);
 924                                continue;
 925                        }
 926                        do {
 927                                if (dma_pte_present(pte)) {
 928                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 929                                        dma_clear_pte(pte);
 930                                }
 931                                pte++;
 932                                tmp += level_size(level);
 933                        } while (!first_pte_in_page(pte) &&
 934                                 tmp + level_size(level) - 1 <= last_pfn);
 935
 936                        domain_flush_cache(domain, first_pte,
 937                                           (void *)pte - (void *)first_pte);
  938
 939                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 940                level++;
 941        }
 942        /* free pgd */
 943        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 944                free_pgtable_page(domain->pgd);
 945                domain->pgd = NULL;
 946        }
 947}
 948
 949/* iommu handling */
 950static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 951{
 952        struct root_entry *root;
 953        unsigned long flags;
 954
 955        root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 956        if (!root)
 957                return -ENOMEM;
 958
 959        __iommu_flush_cache(iommu, root, ROOT_SIZE);
 960
 961        spin_lock_irqsave(&iommu->lock, flags);
 962        iommu->root_entry = root;
 963        spin_unlock_irqrestore(&iommu->lock, flags);
 964
 965        return 0;
 966}
 967
 968static void iommu_set_root_entry(struct intel_iommu *iommu)
 969{
 970        void *addr;
 971        u32 sts;
 972        unsigned long flag;
 973
 974        addr = iommu->root_entry;
 975
 976        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 977        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 978
 979        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 980
  981        /* Make sure hardware completes it */
 982        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 983                      readl, (sts & DMA_GSTS_RTPS), sts);
 984
 985        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 986}
 987
 988static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 989{
 990        u32 val;
 991        unsigned long flag;
 992
 993        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 994                return;
 995
 996        raw_spin_lock_irqsave(&iommu->register_lock, flag);
 997        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 998
  999        /* Make sure hardware completes it */
1000        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1001                      readl, (!(val & DMA_GSTS_WBFS)), val);
1002
1003        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1004}
1005
 1006/* return value determines if we need a write buffer flush */
1007static void __iommu_flush_context(struct intel_iommu *iommu,
1008                                  u16 did, u16 source_id, u8 function_mask,
1009                                  u64 type)
1010{
1011        u64 val = 0;
1012        unsigned long flag;
1013
1014        switch (type) {
1015        case DMA_CCMD_GLOBAL_INVL:
1016                val = DMA_CCMD_GLOBAL_INVL;
1017                break;
1018        case DMA_CCMD_DOMAIN_INVL:
1019                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1020                break;
1021        case DMA_CCMD_DEVICE_INVL:
1022                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1023                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1024                break;
1025        default:
1026                BUG();
1027        }
1028        val |= DMA_CCMD_ICC;
1029
1030        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1031        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1032
 1033        /* Make sure hardware completes it */
1034        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1035                dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1036
1037        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1038}
1039
 1040/* return value determines if we need a write buffer flush */
1041static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1042                                u64 addr, unsigned int size_order, u64 type)
1043{
1044        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1045        u64 val = 0, val_iva = 0;
1046        unsigned long flag;
1047
1048        switch (type) {
1049        case DMA_TLB_GLOBAL_FLUSH:
 1050                /* global flush doesn't need to set IVA_REG */
1051                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1052                break;
1053        case DMA_TLB_DSI_FLUSH:
1054                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1055                break;
1056        case DMA_TLB_PSI_FLUSH:
1057                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1058                /* Note: always flush non-leaf currently */
1059                val_iva = size_order | addr;
1060                break;
1061        default:
1062                BUG();
1063        }
1064        /* Note: set drain read/write */
1065#if 0
1066        /*
 1067         * This is probably just to be extra safe; it looks like we can
1068         * ignore it without any impact.
1069         */
1070        if (cap_read_drain(iommu->cap))
1071                val |= DMA_TLB_READ_DRAIN;
1072#endif
1073        if (cap_write_drain(iommu->cap))
1074                val |= DMA_TLB_WRITE_DRAIN;
1075
1076        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1077        /* Note: Only uses first TLB reg currently */
1078        if (val_iva)
1079                dmar_writeq(iommu->reg + tlb_offset, val_iva);
1080        dmar_writeq(iommu->reg + tlb_offset + 8, val);
1081
 1082        /* Make sure hardware completes it */
1083        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1084                dmar_readq, (!(val & DMA_TLB_IVT)), val);
1085
1086        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1087
1088        /* check IOTLB invalidation granularity */
1089        if (DMA_TLB_IAIG(val) == 0)
 1090                printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1091        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1092                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1093                        (unsigned long long)DMA_TLB_IIRG(type),
1094                        (unsigned long long)DMA_TLB_IAIG(val));
1095}
1096
1097static struct device_domain_info *iommu_support_dev_iotlb(
1098        struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1099{
1100        int found = 0;
1101        unsigned long flags;
1102        struct device_domain_info *info;
1103        struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1104
1105        if (!ecap_dev_iotlb_support(iommu->ecap))
1106                return NULL;
1107
1108        if (!iommu->qi)
1109                return NULL;
1110
1111        spin_lock_irqsave(&device_domain_lock, flags);
1112        list_for_each_entry(info, &domain->devices, link)
1113                if (info->bus == bus && info->devfn == devfn) {
1114                        found = 1;
1115                        break;
1116                }
1117        spin_unlock_irqrestore(&device_domain_lock, flags);
1118
1119        if (!found || !info->dev)
1120                return NULL;
1121
1122        if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1123                return NULL;
1124
1125        if (!dmar_find_matched_atsr_unit(info->dev))
1126                return NULL;
1127
1128        info->iommu = iommu;
1129
1130        return info;
1131}
1132
1133static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1134{
1135        if (!info)
1136                return;
1137
1138        pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1139}
1140
1141static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1142{
1143        if (!info->dev || !pci_ats_enabled(info->dev))
1144                return;
1145
1146        pci_disable_ats(info->dev);
1147}
1148
1149static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1150                                  u64 addr, unsigned mask)
1151{
1152        u16 sid, qdep;
1153        unsigned long flags;
1154        struct device_domain_info *info;
1155
1156        spin_lock_irqsave(&device_domain_lock, flags);
1157        list_for_each_entry(info, &domain->devices, link) {
1158                if (!info->dev || !pci_ats_enabled(info->dev))
1159                        continue;
1160
1161                sid = info->bus << 8 | info->devfn;
1162                qdep = pci_ats_queue_depth(info->dev);
1163                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1164        }
1165        spin_unlock_irqrestore(&device_domain_lock, flags);
1166}
1167
1168static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1169                                  unsigned long pfn, unsigned int pages, int map)
1170{
1171        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1172        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1173
1174        BUG_ON(pages == 0);
1175
1176        /*
1177         * Fallback to domain selective flush if no PSI support or the size is
1178         * too big.
1179         * PSI requires page size to be 2 ^ x, and the base address is naturally
1180         * aligned to the size
1181         */
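        /*
         * mask is the size-order of the request: e.g. pages == 3 was
         * rounded up to 4 above, so mask == 2 and the PSI covers
         * 2^mask = 4 pages starting at addr.
         */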
1182        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1183                iommu->flush.flush_iotlb(iommu, did, 0, 0,
1184                                                DMA_TLB_DSI_FLUSH);
1185        else
1186                iommu->flush.flush_iotlb(iommu, did, addr, mask,
1187                                                DMA_TLB_PSI_FLUSH);
1188
1189        /*
1190         * In caching mode, changes of pages from non-present to present require
1191         * flush. However, device IOTLB doesn't need to be flushed in this case.
1192         */
1193        if (!cap_caching_mode(iommu->cap) || !map)
1194                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1195}
1196
1197static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1198{
1199        u32 pmen;
1200        unsigned long flags;
1201
1202        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1203        pmen = readl(iommu->reg + DMAR_PMEN_REG);
1204        pmen &= ~DMA_PMEN_EPM;
1205        writel(pmen, iommu->reg + DMAR_PMEN_REG);
1206
1207        /* wait for the protected region status bit to clear */
1208        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1209                readl, !(pmen & DMA_PMEN_PRS), pmen);
1210
1211        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1212}
1213
1214static int iommu_enable_translation(struct intel_iommu *iommu)
1215{
1216        u32 sts;
1217        unsigned long flags;
1218
1219        raw_spin_lock_irqsave(&iommu->register_lock, flags);
1220        iommu->gcmd |= DMA_GCMD_TE;
1221        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1222
 1223        /* Make sure hardware completes it */
1224        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1225                      readl, (sts & DMA_GSTS_TES), sts);
1226
1227        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1228        return 0;
1229}
1230
1231static int iommu_disable_translation(struct intel_iommu *iommu)
1232{
1233        u32 sts;
1234        unsigned long flag;
1235
1236        raw_spin_lock_irqsave(&iommu->register_lock, flag);
1237        iommu->gcmd &= ~DMA_GCMD_TE;
1238        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1239
 1240        /* Make sure hardware completes it */
1241        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1242                      readl, (!(sts & DMA_GSTS_TES)), sts);
1243
1244        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1245        return 0;
1246}
1247
1248
1249static int iommu_init_domains(struct intel_iommu *iommu)
1250{
1251        unsigned long ndomains;
1252        unsigned long nlongs;
1253
1254        ndomains = cap_ndoms(iommu->cap);
1255        pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1256                        ndomains);
1257        nlongs = BITS_TO_LONGS(ndomains);
1258
1259        spin_lock_init(&iommu->lock);
1260
1261        /* TBD: there might be 64K domains,
1262         * consider other allocation for future chip
1263         */
1264        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1265        if (!iommu->domain_ids) {
1266                printk(KERN_ERR "Allocating domain id array failed\n");
1267                return -ENOMEM;
1268        }
1269        iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1270                        GFP_KERNEL);
1271        if (!iommu->domains) {
1272                printk(KERN_ERR "Allocating domain array failed\n");
1273                return -ENOMEM;
1274        }
1275
1276        /*
1277         * if Caching mode is set, then invalid translations are tagged
1278         * with domainid 0. Hence we need to pre-allocate it.
1279         */
1280        if (cap_caching_mode(iommu->cap))
1281                set_bit(0, iommu->domain_ids);
1282        return 0;
1283}
1284
1285
1286static void domain_exit(struct dmar_domain *domain);
1287static void vm_domain_exit(struct dmar_domain *domain);
1288
1289void free_dmar_iommu(struct intel_iommu *iommu)
1290{
1291        struct dmar_domain *domain;
1292        int i;
1293        unsigned long flags;
1294
1295        if ((iommu->domains) && (iommu->domain_ids)) {
1296                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1297                        domain = iommu->domains[i];
1298                        clear_bit(i, iommu->domain_ids);
1299
1300                        spin_lock_irqsave(&domain->iommu_lock, flags);
1301                        if (--domain->iommu_count == 0) {
1302                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1303                                        vm_domain_exit(domain);
1304                                else
1305                                        domain_exit(domain);
1306                        }
1307                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1308                }
1309        }
1310
1311        if (iommu->gcmd & DMA_GCMD_TE)
1312                iommu_disable_translation(iommu);
1313
1314        if (iommu->irq) {
1315                irq_set_handler_data(iommu->irq, NULL);
1316                /* This will mask the irq */
1317                free_irq(iommu->irq, iommu);
1318                destroy_irq(iommu->irq);
1319        }
1320
1321        kfree(iommu->domains);
1322        kfree(iommu->domain_ids);
1323
1324        g_iommus[iommu->seq_id] = NULL;
1325
1326        /* if all iommus are freed, free g_iommus */
1327        for (i = 0; i < g_num_of_iommus; i++) {
1328                if (g_iommus[i])
1329                        break;
1330        }
1331
1332        if (i == g_num_of_iommus)
1333                kfree(g_iommus);
1334
1335        /* free context mapping */
1336        free_context_table(iommu);
1337}
1338
1339static struct dmar_domain *alloc_domain(void)
1340{
1341        struct dmar_domain *domain;
1342
1343        domain = alloc_domain_mem();
1344        if (!domain)
1345                return NULL;
1346
1347        domain->nid = -1;
1348        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1349        domain->flags = 0;
1350
1351        return domain;
1352}
1353
1354static int iommu_attach_domain(struct dmar_domain *domain,
1355                               struct intel_iommu *iommu)
1356{
1357        int num;
1358        unsigned long ndomains;
1359        unsigned long flags;
1360
1361        ndomains = cap_ndoms(iommu->cap);
1362
1363        spin_lock_irqsave(&iommu->lock, flags);
1364
1365        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1366        if (num >= ndomains) {
1367                spin_unlock_irqrestore(&iommu->lock, flags);
1368                printk(KERN_ERR "IOMMU: no free domain ids\n");
1369                return -ENOMEM;
1370        }
1371
1372        domain->id = num;
1373        set_bit(num, iommu->domain_ids);
1374        set_bit(iommu->seq_id, domain->iommu_bmp);
1375        iommu->domains[num] = domain;
1376        spin_unlock_irqrestore(&iommu->lock, flags);
1377
1378        return 0;
1379}
1380
1381static void iommu_detach_domain(struct dmar_domain *domain,
1382                                struct intel_iommu *iommu)
1383{
1384        unsigned long flags;
1385        int num, ndomains;
1386        int found = 0;
1387
1388        spin_lock_irqsave(&iommu->lock, flags);
1389        ndomains = cap_ndoms(iommu->cap);
1390        for_each_set_bit(num, iommu->domain_ids, ndomains) {
1391                if (iommu->domains[num] == domain) {
1392                        found = 1;
1393                        break;
1394                }
1395        }
1396
1397        if (found) {
1398                clear_bit(num, iommu->domain_ids);
1399                clear_bit(iommu->seq_id, domain->iommu_bmp);
1400                iommu->domains[num] = NULL;
1401        }
1402        spin_unlock_irqrestore(&iommu->lock, flags);
1403}
1404
1405static struct iova_domain reserved_iova_list;
1406static struct lock_class_key reserved_rbtree_key;
1407
1408static int dmar_init_reserved_ranges(void)
1409{
1410        struct pci_dev *pdev = NULL;
1411        struct iova *iova;
1412        int i;
1413
1414        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1415
1416        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1417                &reserved_rbtree_key);
1418
1419        /* IOAPIC ranges shouldn't be accessed by DMA */
1420        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1421                IOVA_PFN(IOAPIC_RANGE_END));
1422        if (!iova) {
1423                printk(KERN_ERR "Reserve IOAPIC range failed\n");
1424                return -ENODEV;
1425        }
1426
1427        /* Reserve all PCI MMIO to avoid peer-to-peer access */
1428        for_each_pci_dev(pdev) {
1429                struct resource *r;
1430
1431                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1432                        r = &pdev->resource[i];
1433                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
1434                                continue;
1435                        iova = reserve_iova(&reserved_iova_list,
1436                                            IOVA_PFN(r->start),
1437                                            IOVA_PFN(r->end));
1438                        if (!iova) {
1439                                printk(KERN_ERR "Reserve iova failed\n");
1440                                return -ENODEV;
1441                        }
1442                }
1443        }
1444        return 0;
1445}
1446
1447static void domain_reserve_special_ranges(struct dmar_domain *domain)
1448{
1449        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1450}
1451
1452static inline int guestwidth_to_adjustwidth(int gaw)
1453{
1454        int agaw;
1455        int r = (gaw - 12) % 9;
1456
1457        if (r == 0)
1458                agaw = gaw;
1459        else
1460                agaw = gaw + 9 - r;
1461        if (agaw > 64)
1462                agaw = 64;
1463        return agaw;
1464}
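/*
 * Example: a 48-bit guest width already sits on a 9-bit level boundary
 * ((48 - 12) % 9 == 0) and is returned unchanged, while a 36-bit guest
 * width is rounded up to the next usable width, 39 bits.
 */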
1465
1466static int domain_init(struct dmar_domain *domain, int guest_width)
1467{
1468        struct intel_iommu *iommu;
1469        int adjust_width, agaw;
1470        unsigned long sagaw;
1471
1472        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1473        spin_lock_init(&domain->iommu_lock);
1474
1475        domain_reserve_special_ranges(domain);
1476
1477        /* calculate AGAW */
1478        iommu = domain_get_iommu(domain);
1479        if (guest_width > cap_mgaw(iommu->cap))
1480                guest_width = cap_mgaw(iommu->cap);
1481        domain->gaw = guest_width;
1482        adjust_width = guestwidth_to_adjustwidth(guest_width);
1483        agaw = width_to_agaw(adjust_width);
1484        sagaw = cap_sagaw(iommu->cap);
1485        if (!test_bit(agaw, &sagaw)) {
1486                /* hardware doesn't support it, choose a bigger one */
1487                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1488                agaw = find_next_bit(&sagaw, 5, agaw);
1489                if (agaw >= 5)
1490                        return -ENODEV;
1491        }
1492        domain->agaw = agaw;
1493        INIT_LIST_HEAD(&domain->devices);
1494
1495        if (ecap_coherent(iommu->ecap))
1496                domain->iommu_coherency = 1;
1497        else
1498                domain->iommu_coherency = 0;
1499
1500        if (ecap_sc_support(iommu->ecap))
1501                domain->iommu_snooping = 1;
1502        else
1503                domain->iommu_snooping = 0;
1504
1505        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1506        domain->iommu_count = 1;
1507        domain->nid = iommu->node;
1508
1509        /* always allocate the top pgd */
1510        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1511        if (!domain->pgd)
1512                return -ENOMEM;
1513        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1514        return 0;
1515}
1516
1517static void domain_exit(struct dmar_domain *domain)
1518{
1519        struct dmar_drhd_unit *drhd;
1520        struct intel_iommu *iommu;
1521
 1522        /* Domain 0 is reserved, so don't process it */
1523        if (!domain)
1524                return;
1525
1526        /* Flush any lazy unmaps that may reference this domain */
1527        if (!intel_iommu_strict)
1528                flush_unmaps_timeout(0);
1529
1530        domain_remove_dev_info(domain);
1531        /* destroy iovas */
1532        put_iova_domain(&domain->iovad);
1533
1534        /* clear ptes */
1535        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1536
1537        /* free page tables */
1538        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1539
1540        for_each_active_iommu(iommu, drhd)
1541                if (test_bit(iommu->seq_id, domain->iommu_bmp))
1542                        iommu_detach_domain(domain, iommu);
1543
1544        free_domain_mem(domain);
1545}
1546
1547static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1548                                 u8 bus, u8 devfn, int translation)
1549{
1550        struct context_entry *context;
1551        unsigned long flags;
1552        struct intel_iommu *iommu;
1553        struct dma_pte *pgd;
1554        unsigned long num;
1555        unsigned long ndomains;
1556        int id;
1557        int agaw;
1558        struct device_domain_info *info = NULL;
1559
1560        pr_debug("Set context mapping for %02x:%02x.%d\n",
1561                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1562
1563        BUG_ON(!domain->pgd);
1564        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1565               translation != CONTEXT_TT_MULTI_LEVEL);
1566
1567        iommu = device_to_iommu(segment, bus, devfn);
1568        if (!iommu)
1569                return -ENODEV;
1570
1571        context = device_to_context_entry(iommu, bus, devfn);
1572        if (!context)
1573                return -ENOMEM;
1574        spin_lock_irqsave(&iommu->lock, flags);
1575        if (context_present(context)) {
1576                spin_unlock_irqrestore(&iommu->lock, flags);
1577                return 0;
1578        }
1579
1580        id = domain->id;
1581        pgd = domain->pgd;
1582
1583        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1584            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1585                int found = 0;
1586
1587                /* find an available domain id for this device in iommu */
1588                ndomains = cap_ndoms(iommu->cap);
1589                for_each_set_bit(num, iommu->domain_ids, ndomains) {
1590                        if (iommu->domains[num] == domain) {
1591                                id = num;
1592                                found = 1;
1593                                break;
1594                        }
1595                }
1596
1597                if (found == 0) {
1598                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
1599                        if (num >= ndomains) {
1600                                spin_unlock_irqrestore(&iommu->lock, flags);
1601                                printk(KERN_ERR "IOMMU: no free domain ids\n");
1602                                return -EFAULT;
1603                        }
1604
1605                        set_bit(num, iommu->domain_ids);
1606                        iommu->domains[num] = domain;
1607                        id = num;
1608                }
1609
1610                /* Skip top levels of page tables for an
1611                 * iommu which has a smaller agaw than the default.
1612                 * Unnecessary for PT mode.
1613                 */
1614                if (translation != CONTEXT_TT_PASS_THROUGH) {
1615                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1616                                pgd = phys_to_virt(dma_pte_addr(pgd));
1617                                if (!dma_pte_present(pgd)) {
1618                                        spin_unlock_irqrestore(&iommu->lock, flags);
1619                                        return -ENOMEM;
1620                                }
1621                        }
1622                }
1623        }
1624
1625        context_set_domain_id(context, id);
1626
1627        if (translation != CONTEXT_TT_PASS_THROUGH) {
1628                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1629                translation = info ? CONTEXT_TT_DEV_IOTLB :
1630                                     CONTEXT_TT_MULTI_LEVEL;
1631        }
1632        /*
1633         * In pass through mode, AW must be programmed to indicate the largest
1634         * AGAW value supported by hardware. And ASR is ignored by hardware.
1635         */
1636        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1637                context_set_address_width(context, iommu->msagaw);
1638        else {
1639                context_set_address_root(context, virt_to_phys(pgd));
1640                context_set_address_width(context, iommu->agaw);
1641        }
1642
1643        context_set_translation_type(context, translation);
1644        context_set_fault_enable(context);
1645        context_set_present(context);
1646        domain_flush_cache(domain, context, sizeof(*context));
1647
1648        /*
1649         * It's a non-present to present mapping. If hardware doesn't cache
1650         * non-present entries we only need to flush the write-buffer. If it
1651         * _does_ cache non-present entries, then it does so in the special
1652         * domain #0, which we have to flush:
1653         */
1654        if (cap_caching_mode(iommu->cap)) {
1655                iommu->flush.flush_context(iommu, 0,
1656                                           (((u16)bus) << 8) | devfn,
1657                                           DMA_CCMD_MASK_NOBIT,
1658                                           DMA_CCMD_DEVICE_INVL);
1659                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1660        } else {
1661                iommu_flush_write_buffer(iommu);
1662        }
1663        iommu_enable_dev_iotlb(info);
1664        spin_unlock_irqrestore(&iommu->lock, flags);
1665
1666        spin_lock_irqsave(&domain->iommu_lock, flags);
1667        if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1668                domain->iommu_count++;
1669                if (domain->iommu_count == 1)
1670                        domain->nid = iommu->node;
1671                domain_update_iommu_cap(domain);
1672        }
1673        spin_unlock_irqrestore(&domain->iommu_lock, flags);
1674        return 0;
1675}
1676
1677static int
1678domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1679                        int translation)
1680{
1681        int ret;
1682        struct pci_dev *tmp, *parent;
1683
1684        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1685                                         pdev->bus->number, pdev->devfn,
1686                                         translation);
1687        if (ret)
1688                return ret;
1689
1690        /* dependent device mapping */
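            /*
             * DMA requests from a device behind a PCIe-to-PCI(-X) bridge may
             * carry the bridge's source-id (secondary bus number with devfn 0,
             * or the bridge's own bus/devfn), so every bridge on the path up
             * to the PCIe root also needs a context entry for this domain.
             */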
1691        tmp = pci_find_upstream_pcie_bridge(pdev);
1692        if (!tmp)
1693                return 0;
1694        /* Secondary interface's bus number and devfn 0 */
1695        parent = pdev->bus->self;
1696        while (parent != tmp) {
1697                ret = domain_context_mapping_one(domain,
1698                                                 pci_domain_nr(parent->bus),
1699                                                 parent->bus->number,
1700                                                 parent->devfn, translation);
1701                if (ret)
1702                        return ret;
1703                parent = parent->bus->self;
1704        }
1705        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1706                return domain_context_mapping_one(domain,
1707                                        pci_domain_nr(tmp->subordinate),
1708                                        tmp->subordinate->number, 0,
1709                                        translation);
1710        else /* this is a legacy PCI bridge */
1711                return domain_context_mapping_one(domain,
1712                                                  pci_domain_nr(tmp->bus),
1713                                                  tmp->bus->number,
1714                                                  tmp->devfn,
1715                                                  translation);
1716}
1717
1718static int domain_context_mapped(struct pci_dev *pdev)
1719{
1720        int ret;
1721        struct pci_dev *tmp, *parent;
1722        struct intel_iommu *iommu;
1723
1724        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1725                                pdev->devfn);
1726        if (!iommu)
1727                return -ENODEV;
1728
1729        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1730        if (!ret)
1731                return ret;
1732        /* dependent device mapping */
1733        tmp = pci_find_upstream_pcie_bridge(pdev);
1734        if (!tmp)
1735                return ret;
1736        /* Secondary interface's bus number and devfn 0 */
1737        parent = pdev->bus->self;
1738        while (parent != tmp) {
1739                ret = device_context_mapped(iommu, parent->bus->number,
1740                                            parent->devfn);
1741                if (!ret)
1742                        return ret;
1743                parent = parent->bus->self;
1744        }
1745        if (pci_is_pcie(tmp))
1746                return device_context_mapped(iommu, tmp->subordinate->number,
1747                                             0);
1748        else
1749                return device_context_mapped(iommu, tmp->bus->number,
1750                                             tmp->devfn);
1751}
1752
1753/* Returns a number of VTD pages, but aligned to MM page size */
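    /*
     * For example, assuming 4KiB pages on both the MM and VT-d side:
     * host_addr = 0x1fff and size = 0x2 leave an in-page offset of 0xfff,
     * and PAGE_ALIGN(0xfff + 0x2) = 0x2000, i.e. 2 VT-d pages, because the
     * buffer straddles a page boundary.
     */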
1754static inline unsigned long aligned_nrpages(unsigned long host_addr,
1755                                            size_t size)
1756{
1757        host_addr &= ~PAGE_MASK;
1758        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1759}
1760
1761/* Return largest possible superpage level for a given mapping */
1762static inline int hardware_largepage_caps(struct dmar_domain *domain,
1763                                          unsigned long iov_pfn,
1764                                          unsigned long phy_pfn,
1765                                          unsigned long pages)
1766{
1767        int support, level = 1;
1768        unsigned long pfnmerge;
1769
1770        support = domain->iommu_superpage;
1771
1772        /* To use a large page, the virtual *and* physical addresses
1773           must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1774           of them will mean we have to use smaller pages. So just
1775           merge them and check both at once. */
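            /*
             * Example: with iov_pfn and phy_pfn both multiples of 512 and at
             * least 512 pages left to map, the loop below takes one step and
             * returns level 2 (a 2MiB superpage), provided the hardware
             * advertises superpage support (domain->iommu_superpage >= 1).
             */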
1776        pfnmerge = iov_pfn | phy_pfn;
1777
1778        while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1779                pages >>= VTD_STRIDE_SHIFT;
1780                if (!pages)
1781                        break;
1782                pfnmerge >>= VTD_STRIDE_SHIFT;
1783                level++;
1784                support--;
1785        }
1786        return level;
1787}
1788
1789static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1790                            struct scatterlist *sg, unsigned long phys_pfn,
1791                            unsigned long nr_pages, int prot)
1792{
1793        struct dma_pte *first_pte = NULL, *pte = NULL;
1794        phys_addr_t uninitialized_var(pteval);
1795        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1796        unsigned long sg_res;
1797        unsigned int largepage_lvl = 0;
1798        unsigned long lvl_pages = 0;
1799
1800        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1801
1802        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1803                return -EINVAL;
1804
1805        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1806
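            /*
             * sg_res counts the VT-d pages still covered by the current
             * scatterlist entry; 0 means 'fetch the next sg entry'.  In the
             * physically contiguous (non-sg) case it is set to nr_pages + 1
             * so it can never reach zero inside the loop.
             */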
1807        if (sg)
1808                sg_res = 0;
1809        else {
1810                sg_res = nr_pages + 1;
1811                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1812        }
1813
1814        while (nr_pages > 0) {
1815                uint64_t tmp;
1816
1817                if (!sg_res) {
1818                        sg_res = aligned_nrpages(sg->offset, sg->length);
1819                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1820                        sg->dma_length = sg->length;
1821                        pteval = page_to_phys(sg_page(sg)) | prot;
1822                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
1823                }
1824
1825                if (!pte) {
1826                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1827
1828                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1829                        if (!pte)
1830                                return -ENOMEM;
1831                        /* It is a large page */
1832                        if (largepage_lvl > 1) {
1833                                pteval |= DMA_PTE_LARGE_PAGE;
1834                                /* Ensure that old small page tables are removed to make room
1835                                   for superpage, if they exist. */
1836                                dma_pte_clear_range(domain, iov_pfn,
1837                                                    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838                                dma_pte_free_pagetable(domain, iov_pfn,
1839                                                       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1840                        } else {
1841                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1842                        }
1843
1844                }
1845                /* We don't need a lock here; nobody else
1846                 * touches the iova range
1847                 */
1848                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1849                if (tmp) {
1850                        static int dumps = 5;
1851                        printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1852                               iov_pfn, tmp, (unsigned long long)pteval);
1853                        if (dumps) {
1854                                dumps--;
1855                                debug_dma_dump_mappings(NULL);
1856                        }
1857                        WARN_ON(1);
1858                }
1859
1860                lvl_pages = lvl_to_nr_pages(largepage_lvl);
1861
1862                BUG_ON(nr_pages < lvl_pages);
1863                BUG_ON(sg_res < lvl_pages);
1864
1865                nr_pages -= lvl_pages;
1866                iov_pfn += lvl_pages;
1867                phys_pfn += lvl_pages;
1868                pteval += lvl_pages * VTD_PAGE_SIZE;
1869                sg_res -= lvl_pages;
1870
1871                /* If the next PTE would be the first in a new page, then we
1872                   need to flush the cache on the entries we've just written.
1873                   And then we'll need to recalculate 'pte', so clear it and
1874                   let it get set again in the if (!pte) block above.
1875
1876                   If we're done (!nr_pages) we need to flush the cache too.
1877
1878                   Also if we've been setting superpages, we may need to
1879                   recalculate 'pte' and switch back to smaller pages for the
1880                   end of the mapping, if the trailing size is not enough to
1881                   use another superpage (i.e. sg_res < lvl_pages). */
1882                pte++;
1883                if (!nr_pages || first_pte_in_page(pte) ||
1884                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1885                        domain_flush_cache(domain, first_pte,
1886                                           (void *)pte - (void *)first_pte);
1887                        pte = NULL;
1888                }
1889
1890                if (!sg_res && nr_pages)
1891                        sg = sg_next(sg);
1892        }
1893        return 0;
1894}
1895
1896static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1897                                    struct scatterlist *sg, unsigned long nr_pages,
1898                                    int prot)
1899{
1900        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1901}
1902
1903static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1904                                     unsigned long phys_pfn, unsigned long nr_pages,
1905                                     int prot)
1906{
1907        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1908}
1909
1910static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1911{
1912        if (!iommu)
1913                return;
1914
1915        clear_context_table(iommu, bus, devfn);
1916        iommu->flush.flush_context(iommu, 0, 0, 0,
1917                                           DMA_CCMD_GLOBAL_INVL);
1918        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1919}
1920
1921static inline void unlink_domain_info(struct device_domain_info *info)
1922{
1923        assert_spin_locked(&device_domain_lock);
1924        list_del(&info->link);
1925        list_del(&info->global);
1926        if (info->dev)
1927                info->dev->dev.archdata.iommu = NULL;
1928}
1929
1930static void domain_remove_dev_info(struct dmar_domain *domain)
1931{
1932        struct device_domain_info *info;
1933        unsigned long flags;
1934        struct intel_iommu *iommu;
1935
1936        spin_lock_irqsave(&device_domain_lock, flags);
1937        while (!list_empty(&domain->devices)) {
1938                info = list_entry(domain->devices.next,
1939                        struct device_domain_info, link);
1940                unlink_domain_info(info);
1941                spin_unlock_irqrestore(&device_domain_lock, flags);
1942
1943                iommu_disable_dev_iotlb(info);
1944                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1945                iommu_detach_dev(iommu, info->bus, info->devfn);
1946                free_devinfo_mem(info);
1947
1948                spin_lock_irqsave(&device_domain_lock, flags);
1949        }
1950        spin_unlock_irqrestore(&device_domain_lock, flags);
1951}
1952
1953/*
1954 * find_domain
1955 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1956 */
1957static struct dmar_domain *
1958find_domain(struct pci_dev *pdev)
1959{
1960        struct device_domain_info *info;
1961
1962        /* No lock here, assumes no domain exit in normal case */
1963        info = pdev->dev.archdata.iommu;
1964        if (info)
1965                return info->domain;
1966        return NULL;
1967}
1968
1969/* domain is initialized */
1970static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1971{
1972        struct dmar_domain *domain, *found = NULL;
1973        struct intel_iommu *iommu;
1974        struct dmar_drhd_unit *drhd;
1975        struct device_domain_info *info, *tmp;
1976        struct pci_dev *dev_tmp;
1977        unsigned long flags;
1978        int bus = 0, devfn = 0;
1979        int segment;
1980        int ret;
1981
1982        domain = find_domain(pdev);
1983        if (domain)
1984                return domain;
1985
1986        segment = pci_domain_nr(pdev->bus);
1987
1988        dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1989        if (dev_tmp) {
1990                if (pci_is_pcie(dev_tmp)) {
1991                        bus = dev_tmp->subordinate->number;
1992                        devfn = 0;
1993                } else {
1994                        bus = dev_tmp->bus->number;
1995                        devfn = dev_tmp->devfn;
1996                }
1997                spin_lock_irqsave(&device_domain_lock, flags);
1998                list_for_each_entry(info, &device_domain_list, global) {
1999                        if (info->segment == segment &&
2000                            info->bus == bus && info->devfn == devfn) {
2001                                found = info->domain;
2002                                break;
2003                        }
2004                }
2005                spin_unlock_irqrestore(&device_domain_lock, flags);
2006                /* pcie-pci bridge already has a domain, use it */
2007                if (found) {
2008                        domain = found;
2009                        goto found_domain;
2010                }
2011        }
2012
2013        domain = alloc_domain();
2014        if (!domain)
2015                goto error;
2016
2017        /* Allocate new domain for the device */
2018        drhd = dmar_find_matched_drhd_unit(pdev);
2019        if (!drhd) {
2020                printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2021                        pci_name(pdev));
2022                free_domain_mem(domain);
2023                return NULL;
2024        }
2025        iommu = drhd->iommu;
2026
2027        ret = iommu_attach_domain(domain, iommu);
2028        if (ret) {
2029                free_domain_mem(domain);
2030                goto error;
2031        }
2032
2033        if (domain_init(domain, gaw)) {
2034                domain_exit(domain);
2035                goto error;
2036        }
2037
2038        /* register pcie-to-pci device */
2039        if (dev_tmp) {
2040                info = alloc_devinfo_mem();
2041                if (!info) {
2042                        domain_exit(domain);
2043                        goto error;
2044                }
2045                info->segment = segment;
2046                info->bus = bus;
2047                info->devfn = devfn;
2048                info->dev = NULL;
2049                info->domain = domain;
2050                /* This domain is shared by devices under p2p bridge */
2051                domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2052
2053                /* pcie-to-pci bridge already has a domain, use it */
2054                found = NULL;
2055                spin_lock_irqsave(&device_domain_lock, flags);
2056                list_for_each_entry(tmp, &device_domain_list, global) {
2057                        if (tmp->segment == segment &&
2058                            tmp->bus == bus && tmp->devfn == devfn) {
2059                                found = tmp->domain;
2060                                break;
2061                        }
2062                }
2063                if (found) {
2064                        spin_unlock_irqrestore(&device_domain_lock, flags);
2065                        free_devinfo_mem(info);
2066                        domain_exit(domain);
2067                        domain = found;
2068                } else {
2069                        list_add(&info->link, &domain->devices);
2070                        list_add(&info->global, &device_domain_list);
2071                        spin_unlock_irqrestore(&device_domain_lock, flags);
2072                }
2073        }
2074
2075found_domain:
2076        info = alloc_devinfo_mem();
2077        if (!info)
2078                goto error;
2079        info->segment = segment;
2080        info->bus = pdev->bus->number;
2081        info->devfn = pdev->devfn;
2082        info->dev = pdev;
2083        info->domain = domain;
2084        spin_lock_irqsave(&device_domain_lock, flags);
2085        /* somebody else was faster; reuse their domain */
2086        found = find_domain(pdev);
2087        if (found != NULL) {
2088                spin_unlock_irqrestore(&device_domain_lock, flags);
2089                if (found != domain) {
2090                        domain_exit(domain);
2091                        domain = found;
2092                }
2093                free_devinfo_mem(info);
2094                return domain;
2095        }
2096        list_add(&info->link, &domain->devices);
2097        list_add(&info->global, &device_domain_list);
2098        pdev->dev.archdata.iommu = info;
2099        spin_unlock_irqrestore(&device_domain_lock, flags);
2100        return domain;
2101error:
2102        /* recheck it here, maybe others set it */
2103        return find_domain(pdev);
2104}
2105
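    /*
     * iommu_identity_mapping is a bitmask selecting which devices are given
     * a 1:1 (identity) mapping in si_domain: all devices, graphics devices,
     * and/or the integrated Azalia (HD audio) controller.
     */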
2106static int iommu_identity_mapping;
2107#define IDENTMAP_ALL            1
2108#define IDENTMAP_GFX            2
2109#define IDENTMAP_AZALIA         4
2110
2111static int iommu_domain_identity_map(struct dmar_domain *domain,
2112                                     unsigned long long start,
2113                                     unsigned long long end)
2114{
2115        unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2116        unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2117
2118        if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2119                          dma_to_mm_pfn(last_vpfn))) {
2120                printk(KERN_ERR "IOMMU: reserve iova failed\n");
2121                return -ENOMEM;
2122        }
2123
2124        pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2125                 start, end, domain->id);
2126        /*
2127         * RMRR range might have overlap with physical memory range,
2128         * clear it first
2129         */
2130        dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2131
2132        return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2133                                  last_vpfn - first_vpfn + 1,
2134                                  DMA_PTE_READ|DMA_PTE_WRITE);
2135}
2136
2137static int iommu_prepare_identity_map(struct pci_dev *pdev,
2138                                      unsigned long long start,
2139                                      unsigned long long end)
2140{
2141        struct dmar_domain *domain;
2142        int ret;
2143
2144        domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2145        if (!domain)
2146                return -ENOMEM;
2147
2148        /* For _hardware_ passthrough, don't bother. But for software
2149           passthrough, we do it anyway -- it may indicate a memory
2150           range which is reserved in E820, and so didn't get set
2151           up to start with in si_domain */
2152        if (domain == si_domain && hw_pass_through) {
2153                printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2154                       pci_name(pdev), start, end);
2155                return 0;
2156        }
2157
2158        printk(KERN_INFO
2159               "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2160               pci_name(pdev), start, end);
2161
2162        if (end < start) {
2163                WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2164                        "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2165                        dmi_get_system_info(DMI_BIOS_VENDOR),
2166                        dmi_get_system_info(DMI_BIOS_VERSION),
2167                        dmi_get_system_info(DMI_PRODUCT_VERSION));
2168                ret = -EIO;
2169                goto error;
2170        }
2171
2172        if (end >> agaw_to_width(domain->agaw)) {
2173                WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2174                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2175                     agaw_to_width(domain->agaw),
2176                     dmi_get_system_info(DMI_BIOS_VENDOR),
2177                     dmi_get_system_info(DMI_BIOS_VERSION),
2178                     dmi_get_system_info(DMI_PRODUCT_VERSION));
2179                ret = -EIO;
2180                goto error;
2181        }
2182
2183        ret = iommu_domain_identity_map(domain, start, end);
2184        if (ret)
2185                goto error;
2186
2187        /* context entry init */
2188        ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2189        if (ret)
2190                goto error;
2191
2192        return 0;
2193
2194 error:
2195        domain_exit(domain);
2196        return ret;
2197}
2198
2199static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2200        struct pci_dev *pdev)
2201{
2202        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2203                return 0;
2204        return iommu_prepare_identity_map(pdev, rmrr->base_address,
2205                rmrr->end_address);
2206}
2207
2208#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2209static inline void iommu_prepare_isa(void)
2210{
2211        struct pci_dev *pdev;
2212        int ret;
2213
2214        pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2215        if (!pdev)
2216                return;
2217
2218        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2219        ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2220
2221        if (ret)
2222                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2223                       "floppy might not work\n");
2224
2225}
2226#else
2227static inline void iommu_prepare_isa(void)
2228{
2229        return;
2230}
2231#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2232
2233static int md_domain_init(struct dmar_domain *domain, int guest_width);
2234
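    /*
     * si_domain is the single 'static identity' domain shared by all
     * identity-mapped devices.  Unless hardware pass-through is in use, it
     * is populated with 1:1 mappings covering every usable RAM range.
     */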
2235static int __init si_domain_init(int hw)
2236{
2237        struct dmar_drhd_unit *drhd;
2238        struct intel_iommu *iommu;
2239        int nid, ret = 0;
2240
2241        si_domain = alloc_domain();
2242        if (!si_domain)
2243                return -EFAULT;
2244
2245        pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2246
2247        for_each_active_iommu(iommu, drhd) {
2248                ret = iommu_attach_domain(si_domain, iommu);
2249                if (ret) {
2250                        domain_exit(si_domain);
2251                        return -EFAULT;
2252                }
2253        }
2254
2255        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2256                domain_exit(si_domain);
2257                return -EFAULT;
2258        }
2259
2260        si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2261
2262        if (hw)
2263                return 0;
2264
2265        for_each_online_node(nid) {
2266                unsigned long start_pfn, end_pfn;
2267                int i;
2268
2269                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2270                        ret = iommu_domain_identity_map(si_domain,
2271                                        PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2272                        if (ret)
2273                                return ret;
2274                }
2275        }
2276
2277        return 0;
2278}
2279
2280static void domain_remove_one_dev_info(struct dmar_domain *domain,
2281                                          struct pci_dev *pdev);
2282static int identity_mapping(struct pci_dev *pdev)
2283{
2284        struct device_domain_info *info;
2285
2286        if (likely(!iommu_identity_mapping))
2287                return 0;
2288
2289        info = pdev->dev.archdata.iommu;
2290        if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2291                return (info->domain == si_domain);
2292
2293        return 0;
2294}
2295
2296static int domain_add_dev_info(struct dmar_domain *domain,
2297                               struct pci_dev *pdev,
2298                               int translation)
2299{
2300        struct device_domain_info *info;
2301        unsigned long flags;
2302        int ret;
2303
2304        info = alloc_devinfo_mem();
2305        if (!info)
2306                return -ENOMEM;
2307
2308        info->segment = pci_domain_nr(pdev->bus);
2309        info->bus = pdev->bus->number;
2310        info->devfn = pdev->devfn;
2311        info->dev = pdev;
2312        info->domain = domain;
2313
2314        spin_lock_irqsave(&device_domain_lock, flags);
2315        list_add(&info->link, &domain->devices);
2316        list_add(&info->global, &device_domain_list);
2317        pdev->dev.archdata.iommu = info;
2318        spin_unlock_irqrestore(&device_domain_lock, flags);
2319
2320        ret = domain_context_mapping(domain, pdev, translation);
2321        if (ret) {
2322                spin_lock_irqsave(&device_domain_lock, flags);
2323                unlink_domain_info(info);
2324                spin_unlock_irqrestore(&device_domain_lock, flags);
2325                free_devinfo_mem(info);
2326                return ret;
2327        }
2328
2329        return 0;
2330}
2331
2332static bool device_has_rmrr(struct pci_dev *dev)
2333{
2334        struct dmar_rmrr_unit *rmrr;
2335        int i;
2336
2337        for_each_rmrr_units(rmrr) {
2338                for (i = 0; i < rmrr->devices_cnt; i++) {
2339                        /*
2340                         * Return TRUE if this RMRR contains the device that
2341                         * is passed in.
2342                         */
2343                        if (rmrr->devices[i] == dev)
2344                                return true;
2345                }
2346        }
2347        return false;
2348}
2349
2350static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2351{
2352
2353        /*
2354         * We want to prevent any device associated with an RMRR from
2355         * getting placed into the SI Domain. This is done because
2356         * problems exist when devices are moved in and out of domains
2357         * and their respective RMRR info is lost. We exempt USB devices
2358         * from this process due to their usage of RMRRs that are known
2359         * to not be needed after BIOS hand-off to OS.
2360         */
2361        if (device_has_rmrr(pdev) &&
2362            (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2363                return 0;
2364
2365        if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2366                return 1;
2367
2368        if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2369                return 1;
2370
2371        if (!(iommu_identity_mapping & IDENTMAP_ALL))
2372                return 0;
2373
2374        /*
2375         * We want to start off with all devices in the 1:1 domain, and
2376         * take them out later if we find they can't access all of memory.
2377         *
2378         * However, we can't do this for PCI devices behind bridges,
2379         * because all PCI devices behind the same bridge will end up
2380         * with the same source-id on their transactions.
2381         *
2382         * Practically speaking, we can't change things around for these
2383         * devices at run-time, because we can't be sure there'll be no
2384         * DMA transactions in flight for any of their siblings.
2385         * 
2386         * So PCI devices (unless they're on the root bus) as well as
2387         * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2388         * the 1:1 domain, just in _case_ one of their siblings turns out
2389         * not to be able to map all of memory.
2390         */
2391        if (!pci_is_pcie(pdev)) {
2392                if (!pci_is_root_bus(pdev->bus))
2393                        return 0;
2394                if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2395                        return 0;
2396        } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2397                return 0;
2398
2399        /* 
2400         * At boot time, we don't yet know if devices will be 64-bit capable.
2401         * Assume that they will -- if they turn out not to be, then we can 
2402         * take them out of the 1:1 domain later.
2403         */
2404        if (!startup) {
2405                /*
2406                 * If the device's dma_mask is less than the system's memory
2407                 * size then this is not a candidate for identity mapping.
2408                 */
2409                u64 dma_mask = pdev->dma_mask;
2410
2411                if (pdev->dev.coherent_dma_mask &&
2412                    pdev->dev.coherent_dma_mask < dma_mask)
2413                        dma_mask = pdev->dev.coherent_dma_mask;
2414
2415                return dma_mask >= dma_get_required_mask(&pdev->dev);
2416        }
2417
2418        return 1;
2419}
2420
2421static int __init iommu_prepare_static_identity_mapping(int hw)
2422{
2423        struct pci_dev *pdev = NULL;
2424        int ret;
2425
2426        ret = si_domain_init(hw);
2427        if (ret)
2428                return -EFAULT;
2429
2430        for_each_pci_dev(pdev) {
2431                if (iommu_should_identity_map(pdev, 1)) {
2432                        ret = domain_add_dev_info(si_domain, pdev,
2433                                             hw ? CONTEXT_TT_PASS_THROUGH :
2434                                                  CONTEXT_TT_MULTI_LEVEL);
2435                        if (ret) {
2436                                /* device not associated with an iommu */
2437                                if (ret == -ENODEV)
2438                                        continue;
2439                                return ret;
2440                        }
2441                        pr_info("IOMMU: %s identity mapping for device %s\n",
2442                                hw ? "hardware" : "software", pci_name(pdev));
2443                }
2444        }
2445
2446        return 0;
2447}
2448
2449static int __init init_dmars(void)
2450{
2451        struct dmar_drhd_unit *drhd;
2452        struct dmar_rmrr_unit *rmrr;
2453        struct pci_dev *pdev;
2454        struct intel_iommu *iommu;
2455        int i, ret;
2456
2457        /*
2458         * for each drhd
2459         *    allocate root
2460         *    initialize and program root entry to not present
2461         * endfor
2462         */
2463        for_each_drhd_unit(drhd) {
2464                /*
2465                 * lock not needed as this is only incremented in the single-
2466                 * threaded kernel __init code path; all other accesses are
2467                 * read only
2468                 */
2469                if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2470                        g_num_of_iommus++;
2471                        continue;
2472                }
2473                printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2474                          IOMMU_UNITS_SUPPORTED);
2475        }
2476
2477        g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2478                        GFP_KERNEL);
2479        if (!g_iommus) {
2480                printk(KERN_ERR "Allocating global iommu array failed\n");
2481                ret = -ENOMEM;
2482                goto error;
2483        }
2484
2485        deferred_flush = kzalloc(g_num_of_iommus *
2486                sizeof(struct deferred_flush_tables), GFP_KERNEL);
2487        if (!deferred_flush) {
2488                ret = -ENOMEM;
2489                goto error;
2490        }
2491
2492        for_each_drhd_unit(drhd) {
2493                if (drhd->ignored)
2494                        continue;
2495
2496                iommu = drhd->iommu;
2497                g_iommus[iommu->seq_id] = iommu;
2498
2499                ret = iommu_init_domains(iommu);
2500                if (ret)
2501                        goto error;
2502
2503                /*
2504                 * TBD:
2505                 * we could share the same root & context tables
2506                 * among all IOMMUs. Need to split it later.
2507                 */
2508                ret = iommu_alloc_root_entry(iommu);
2509                if (ret) {
2510                        printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2511                        goto error;
2512                }
2513                if (!ecap_pass_through(iommu->ecap))
2514                        hw_pass_through = 0;
2515        }
2516
2517        /*
2518         * Start from a sane iommu hardware state.
2519         */
2520        for_each_drhd_unit(drhd) {
2521                if (drhd->ignored)
2522                        continue;
2523
2524                iommu = drhd->iommu;
2525
2526                /*
2527                 * If the queued invalidation is already initialized by us
2528                 * (for example, while enabling interrupt-remapping) then
2529                 * things are already rolling from a sane state.
2530                 */
2531                if (iommu->qi)
2532                        continue;
2533
2534                /*
2535                 * Clear any previous faults.
2536                 */
2537                dmar_fault(-1, iommu);
2538                /*
2539                 * Disable queued invalidation if supported and already enabled
2540                 * before OS handover.
2541                 */
2542                dmar_disable_qi(iommu);
2543        }
2544
2545        for_each_drhd_unit(drhd) {
2546                if (drhd->ignored)
2547                        continue;
2548
2549                iommu = drhd->iommu;
2550
2551                if (dmar_enable_qi(iommu)) {
2552                        /*
2553                         * Queued Invalidate not enabled, use Register Based
2554                         * Invalidate
2555                         */
2556                        iommu->flush.flush_context = __iommu_flush_context;
2557                        iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2558                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2559                               "invalidation\n",
2560                                iommu->seq_id,
2561                               (unsigned long long)drhd->reg_base_addr);
2562                } else {
2563                        iommu->flush.flush_context = qi_flush_context;
2564                        iommu->flush.flush_iotlb = qi_flush_iotlb;
2565                        printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2566                               "invalidation\n",
2567                                iommu->seq_id,
2568                               (unsigned long long)drhd->reg_base_addr);
2569                }
2570        }
2571
2572        if (iommu_pass_through)
2573                iommu_identity_mapping |= IDENTMAP_ALL;
2574
2575#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2576        iommu_identity_mapping |= IDENTMAP_GFX;
2577#endif
2578
2579        check_tylersburg_isoch();
2580
2581        /*
2582         * If identity mapping is requested (iommu_identity_mapping is
2583         * set), set up the static identity domain and context entries
2584         * for the selected devices (all, gfx and/or Azalia) now.
2585         */
2586        if (iommu_identity_mapping) {
2587                ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2588                if (ret) {
2589                        printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2590                        goto error;
2591                }
2592        }
2593        /*
2594         * For each rmrr
2595         *   for each dev attached to rmrr
2596         *   do
2597         *     locate drhd for dev, alloc domain for dev
2598         *     allocate free domain
2599         *     allocate page table entries for rmrr
2600         *     if context not allocated for bus
2601         *           allocate and init context
2602         *           set present in root table for this bus
2603         *     init context with domain, translation etc
2604         *    endfor
2605         * endfor
2606         */
2607        printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2608        for_each_rmrr_units(rmrr) {
2609                for (i = 0; i < rmrr->devices_cnt; i++) {
2610                        pdev = rmrr->devices[i];
2611                        /*
2612                         * some BIOSes list non-existent devices in the
2613                         * DMAR table.
2614                         */
2615                        if (!pdev)
2616                                continue;
2617                        ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2618                        if (ret)
2619                                printk(KERN_ERR
2620                                       "IOMMU: mapping reserved region failed\n");
2621                }
2622        }
2623
2624        iommu_prepare_isa();
2625
2626        /*
2627         * for each drhd
2628         *   enable fault log
2629         *   global invalidate context cache
2630         *   global invalidate iotlb
2631         *   enable translation
2632         */
2633        for_each_drhd_unit(drhd) {
2634                if (drhd->ignored) {
2635                        /*
2636                         * we always have to disable PMRs or DMA may fail on
2637                         * this device
2638                         */
2639                        if (force_on)
2640                                iommu_disable_protect_mem_regions(drhd->iommu);
2641                        continue;
2642                }
2643                iommu = drhd->iommu;
2644
2645                iommu_flush_write_buffer(iommu);
2646
2647                ret = dmar_set_interrupt(iommu);
2648                if (ret)
2649                        goto error;
2650
2651                iommu_set_root_entry(iommu);
2652
2653                iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2654                iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2655
2656                ret = iommu_enable_translation(iommu);
2657                if (ret)
2658                        goto error;
2659
2660                iommu_disable_protect_mem_regions(iommu);
2661        }
2662
2663        return 0;
2664error:
2665        for_each_drhd_unit(drhd) {
2666                if (drhd->ignored)
2667                        continue;
2668                iommu = drhd->iommu;
2669                free_iommu(iommu);
2670        }
2671        kfree(g_iommus);
2672        return ret;
2673}
2674
2675/* This takes a number of _MM_ pages, not VTD pages */
2676static struct iova *intel_alloc_iova(struct device *dev,
2677                                     struct dmar_domain *domain,
2678                                     unsigned long nrpages, uint64_t dma_mask)
2679{
2680        struct pci_dev *pdev = to_pci_dev(dev);
2681        struct iova *iova = NULL;
2682
2683        /* Restrict dma_mask to the width that the iommu can handle */
2684        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2685
2686        if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2687                /*
2688                 * First try to allocate an io virtual address in
2689                 * DMA_BIT_MASK(32) and if that fails then try allocating
2690                 * from higher range
2691                 */
2692                iova = alloc_iova(&domain->iovad, nrpages,
2693                                  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2694                if (iova)
2695                        return iova;
2696        }
2697        iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2698        if (unlikely(!iova)) {
2699                printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2700                       nrpages, pci_name(pdev));
2701                return NULL;
2702        }
2703
2704        return iova;
2705}
2706
2707static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2708{
2709        struct dmar_domain *domain;
2710        int ret;
2711
2712        domain = get_domain_for_dev(pdev,
2713                        DEFAULT_DOMAIN_ADDRESS_WIDTH);
2714        if (!domain) {
2715                printk(KERN_ERR
2716                        "Allocating domain for %s failed\n", pci_name(pdev));
2717                return NULL;
2718        }
2719
2720        /* make sure context mapping is ok */
2721        if (unlikely(!domain_context_mapped(pdev))) {
2722                ret = domain_context_mapping(domain, pdev,
2723                                             CONTEXT_TT_MULTI_LEVEL);
2724                if (ret) {
2725                        printk(KERN_ERR
2726                                "Domain context map for %s failed\n",
2727                                pci_name(pdev));
2728                        return NULL;
2729                }
2730        }
2731
2732        return domain;
2733}
2734
2735static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2736{
2737        struct device_domain_info *info;
2738
2739        /* No lock here, assumes no domain exit in normal case */
2740        info = dev->dev.archdata.iommu;
2741        if (likely(info))
2742                return info->domain;
2743
2744        return __get_valid_domain_for_dev(dev);
2745}
2746
2747static int iommu_dummy(struct pci_dev *pdev)
2748{
2749        return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2750}
2751
2752/* Check if the pdev needs to go through the non-identity map and unmap process. */
2753static int iommu_no_mapping(struct device *dev)
2754{
2755        struct pci_dev *pdev;
2756        int found;
2757
2758        if (unlikely(dev->bus != &pci_bus_type))
2759                return 1;
2760
2761        pdev = to_pci_dev(dev);
2762        if (iommu_dummy(pdev))
2763                return 1;
2764
2765        if (!iommu_identity_mapping)
2766                return 0;
2767
2768        found = identity_mapping(pdev);
2769        if (found) {
2770                if (iommu_should_identity_map(pdev, 0))
2771                        return 1;
2772                else {
2773                        /*
2774                         * The 32 bit DMA device is removed from si_domain
2775                         * and falls back to non-identity mapping.
2776                         */
2777                        domain_remove_one_dev_info(si_domain, pdev);
2778                        printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2779                               pci_name(pdev));
2780                        return 0;
2781                }
2782        } else {
2783                /*
2784                 * In case a 64 bit DMA device is detached from a vm, the device
2785                 * is put into si_domain for identity mapping.
2786                 */
2787                if (iommu_should_identity_map(pdev, 0)) {
2788                        int ret;
2789                        ret = domain_add_dev_info(si_domain, pdev,
2790                                                  hw_pass_through ?
2791                                                  CONTEXT_TT_PASS_THROUGH :
2792                                                  CONTEXT_TT_MULTI_LEVEL);
2793                        if (!ret) {
2794                                printk(KERN_INFO "64bit %s uses identity mapping\n",
2795                                       pci_name(pdev));
2796                                return 1;
2797                        }
2798                }
2799        }
2800
2801        return 0;
2802}
2803
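    /*
     * Map a physically contiguous buffer for DMA: find (or create) the
     * device's domain, allocate an IOVA range below dma_mask, install the
     * page table entries, then flush the IOTLB (in caching mode) or just
     * the write buffer before returning the bus address to the caller.
     */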
2804static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2805                                     size_t size, int dir, u64 dma_mask)
2806{
2807        struct pci_dev *pdev = to_pci_dev(hwdev);
2808        struct dmar_domain *domain;
2809        phys_addr_t start_paddr;
2810        struct iova *iova;
2811        int prot = 0;
2812        int ret;
2813        struct intel_iommu *iommu;
2814        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2815
2816        BUG_ON(dir == DMA_NONE);
2817
2818        if (iommu_no_mapping(hwdev))
2819                return paddr;
2820
2821        domain = get_valid_domain_for_dev(pdev);
2822        if (!domain)
2823                return 0;
2824
2825        iommu = domain_get_iommu(domain);
2826        size = aligned_nrpages(paddr, size);
2827
2828        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2829        if (!iova)
2830                goto error;
2831
2832        /*
2833         * Check if DMAR supports zero-length reads on write only
2834         * mappings.
2835         */
2836        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2837                        !cap_zlr(iommu->cap))
2838                prot |= DMA_PTE_READ;
2839        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2840                prot |= DMA_PTE_WRITE;
2841        /*
2842         * paddr to (paddr + size) might span a partial page; we should map
2843         * the whole page.  Note: if two parts of one page are mapped
2844         * separately, we might have two guest_addrs mapping to the same
2845         * host paddr, but this is not a big problem
2846         */
2847        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2848                                 mm_to_dma_pfn(paddr_pfn), size, prot);
2849        if (ret)
2850                goto error;
2851
2852        /* it's a non-present to present mapping. Only flush if caching mode */
2853        if (cap_caching_mode(iommu->cap))
2854                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2855        else
2856                iommu_flush_write_buffer(iommu);
2857
2858        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2859        start_paddr += paddr & ~PAGE_MASK;
2860        return start_paddr;
2861
2862error:
2863        if (iova)
2864                __free_iova(&domain->iovad, iova);
2865        printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2866                pci_name(pdev), size, (unsigned long long)paddr, dir);
2867        return 0;
2868}
2869
2870static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2871                                 unsigned long offset, size_t size,
2872                                 enum dma_data_direction dir,
2873                                 struct dma_attrs *attrs)
2874{
2875        return __intel_map_single(dev, page_to_phys(page) + offset, size,
2876                                  dir, to_pci_dev(dev)->dma_mask);
2877}
2878
2879static void flush_unmaps(void)
2880{
2881        int i, j;
2882
2883        timer_on = 0;
2884
2885        /* just flush them all */
2886        for (i = 0; i < g_num_of_iommus; i++) {
2887                struct intel_iommu *iommu = g_iommus[i];
2888                if (!iommu)
2889                        continue;
2890
2891                if (!deferred_flush[i].next)
2892                        continue;
2893
2894                /* In caching mode, global flushes make emulation expensive */
2895                if (!cap_caching_mode(iommu->cap))
2896                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2897                                         DMA_TLB_GLOBAL_FLUSH);
2898                for (j = 0; j < deferred_flush[i].next; j++) {
2899                        unsigned long mask;
2900                        struct iova *iova = deferred_flush[i].iova[j];
2901                        struct dmar_domain *domain = deferred_flush[i].domain[j];
2902
2903                        /* On real hardware multiple invalidations are expensive */
2904                        if (cap_caching_mode(iommu->cap))
2905                                iommu_flush_iotlb_psi(iommu, domain->id,
2906                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2907                        else {
2908                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2909                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2910                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2911                        }
2912                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2913                }
2914                deferred_flush[i].next = 0;
2915        }
2916
2917        list_size = 0;
2918}
2919
2920static void flush_unmaps_timeout(unsigned long data)
2921{
2922        unsigned long flags;
2923
2924        spin_lock_irqsave(&async_umap_flush_lock, flags);
2925        flush_unmaps();
2926        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2927}
2928
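    /*
     * In non-strict mode unmaps are not flushed immediately: add_unmap()
     * queues the IOVA on a per-IOMMU deferred list, which is drained either
     * when it reaches HIGH_WATER_MARK entries or when the 10ms unmap_timer
     * fires, trading a short window of stale IOTLB entries for far fewer
     * invalidations.
     */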
2929static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2930{
2931        unsigned long flags;
2932        int next, iommu_id;
2933        struct intel_iommu *iommu;
2934
2935        spin_lock_irqsave(&async_umap_flush_lock, flags);
2936        if (list_size == HIGH_WATER_MARK)
2937                flush_unmaps();
2938
2939        iommu = domain_get_iommu(dom);
2940        iommu_id = iommu->seq_id;
2941
2942        next = deferred_flush[iommu_id].next;
2943        deferred_flush[iommu_id].domain[next] = dom;
2944        deferred_flush[iommu_id].iova[next] = iova;
2945        deferred_flush[iommu_id].next++;
2946
2947        if (!timer_on) {
2948                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2949                timer_on = 1;
2950        }
2951        list_size++;
2952        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2953}
2954
2955static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2956                             size_t size, enum dma_data_direction dir,
2957                             struct dma_attrs *attrs)
2958{
2959        struct pci_dev *pdev = to_pci_dev(dev);
2960        struct dmar_domain *domain;
2961        unsigned long start_pfn, last_pfn;
2962        struct iova *iova;
2963        struct intel_iommu *iommu;
2964
2965        if (iommu_no_mapping(dev))
2966                return;
2967
2968        domain = find_domain(pdev);
2969        BUG_ON(!domain);
2970
2971        iommu = domain_get_iommu(domain);
2972
2973        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2974        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2975                      (unsigned long long)dev_addr))
2976                return;
2977
2978        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2979        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2980
2981        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2982                 pci_name(pdev), start_pfn, last_pfn);
2983
2984        /*  clear the whole page */
2985        dma_pte_clear_range(domain, start_pfn, last_pfn);
2986
2987        /* free page tables */
2988        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2989
2990        if (intel_iommu_strict) {
2991                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2992                                      last_pfn - start_pfn + 1, 0);
2993                /* free iova */
2994                __free_iova(&domain->iovad, iova);
2995        } else {
2996                add_unmap(domain, iova);
2997                /*
2998                 * queue up the release of the unmap to save roughly 1/6th of
2999                 * the cpu time used up by the iotlb flush operation...
3000                 */
3001        }
3002}
3003
3004static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3005                                  dma_addr_t *dma_handle, gfp_t flags,
3006                                  struct dma_attrs *attrs)
3007{
3008        void *vaddr;
3009        int order;
3010
3011        size = PAGE_ALIGN(size);
3012        order = get_order(size);
3013
3014        if (!iommu_no_mapping(hwdev))
3015                flags &= ~(GFP_DMA | GFP_DMA32);
3016        else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3017                if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3018                        flags |= GFP_DMA;
3019                else
3020                        flags |= GFP_DMA32;
3021        }
3022
3023        vaddr = (void *)__get_free_pages(flags, order);
3024        if (!vaddr)
3025                return NULL;
3026        memset(vaddr, 0, size);
3027
3028        *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3029                                         DMA_BIDIRECTIONAL,
3030                                         hwdev->coherent_dma_mask);
3031        if (*dma_handle)
3032                return vaddr;
3033        free_pages((unsigned long)vaddr, order);
3034        return NULL;
3035}
3036
3037static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3038                                dma_addr_t dma_handle, struct dma_attrs *attrs)
3039{
3040        int order;
3041
3042        size = PAGE_ALIGN(size);
3043        order = get_order(size);
3044
3045        intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3046        free_pages((unsigned long)vaddr, order);
3047}
3048
3049static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3050                           int nelems, enum dma_data_direction dir,
3051                           struct dma_attrs *attrs)
3052{
3053        struct pci_dev *pdev = to_pci_dev(hwdev);
3054        struct dmar_domain *domain;
3055        unsigned long start_pfn, last_pfn;
3056        struct iova *iova;
3057        struct intel_iommu *iommu;
3058
3059        if (iommu_no_mapping(hwdev))
3060                return;
3061
3062        domain = find_domain(pdev);
3063        BUG_ON(!domain);
3064
3065        iommu = domain_get_iommu(domain);
3066
3067        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3068        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3069                      (unsigned long long)sglist[0].dma_address))
3070                return;
3071
3072        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3073        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3074
3075        /*  clear the whole page */
3076        dma_pte_clear_range(domain, start_pfn, last_pfn);
3077
3078        /* free page tables */
3079        dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3080
3081        if (intel_iommu_strict) {
3082                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3083                                      last_pfn - start_pfn + 1, 0);
3084                /* free iova */
3085                __free_iova(&domain->iovad, iova);
3086        } else {
3087                add_unmap(domain, iova);
3088                /*
3089                 * Queue up the release of the unmap to save the roughly 1/6th
3090                 * of the CPU time consumed by the iotlb flush operation.
3091                 */
3092        }
3093}
3094
3095static int intel_nontranslate_map_sg(struct device *hwdev,
3096        struct scatterlist *sglist, int nelems, int dir)
3097{
3098        int i;
3099        struct scatterlist *sg;
3100
3101        for_each_sg(sglist, sg, nelems, i) {
3102                BUG_ON(!sg_page(sg));
3103                sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3104                sg->dma_length = sg->length;
3105        }
3106        return nelems;
3107}
3108
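/*
 * Scatterlist mapping: the entries are counted in VT-d pages and backed by a
 * single contiguous iova allocation, so one domain_sg_mapping() call covers
 * the whole list.  As with the single-page path, a flush of the new
 * (non-present -> present) mapping is only needed when the IOMMU reports
 * caching mode; otherwise flushing the write buffer is enough.
 */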
3109static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3110                        enum dma_data_direction dir, struct dma_attrs *attrs)
3111{
3112        int i;
3113        struct pci_dev *pdev = to_pci_dev(hwdev);
3114        struct dmar_domain *domain;
3115        size_t size = 0;
3116        int prot = 0;
3117        struct iova *iova = NULL;
3118        int ret;
3119        struct scatterlist *sg;
3120        unsigned long start_vpfn;
3121        struct intel_iommu *iommu;
3122
3123        BUG_ON(dir == DMA_NONE);
3124        if (iommu_no_mapping(hwdev))
3125                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3126
3127        domain = get_valid_domain_for_dev(pdev);
3128        if (!domain)
3129                return 0;
3130
3131        iommu = domain_get_iommu(domain);
3132
3133        for_each_sg(sglist, sg, nelems, i)
3134                size += aligned_nrpages(sg->offset, sg->length);
3135
3136        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3137                                pdev->dma_mask);
3138        if (!iova) {
3139                sglist->dma_length = 0;
3140                return 0;
3141        }
3142
3143        /*
3144         * Check if DMAR supports zero-length reads on write only
3145         * mappings..
3146         */
3147        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3148                        !cap_zlr(iommu->cap))
3149                prot |= DMA_PTE_READ;
3150        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3151                prot |= DMA_PTE_WRITE;
3152
3153        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3154
3155        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3156        if (unlikely(ret)) {
3157                /*  clear the page */
3158                dma_pte_clear_range(domain, start_vpfn,
3159                                    start_vpfn + size - 1);
3160                /* free page tables */
3161                dma_pte_free_pagetable(domain, start_vpfn,
3162                                       start_vpfn + size - 1);
3163                /* free iova */
3164                __free_iova(&domain->iovad, iova);
3165                return 0;
3166        }
3167
3168        /* it's a non-present to present mapping. Only flush if caching mode */
3169        if (cap_caching_mode(iommu->cap))
3170                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3171        else
3172                iommu_flush_write_buffer(iommu);
3173
3174        return nelems;
3175}
3176
3177static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3178{
3179        return !dma_addr;
3180}
3181
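/*
 * Installed as the global dma_ops at the end of intel_iommu_init() once DMA
 * remapping is up (which also turns off swiotlb there).
 */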
3182struct dma_map_ops intel_dma_ops = {
3183        .alloc = intel_alloc_coherent,
3184        .free = intel_free_coherent,
3185        .map_sg = intel_map_sg,
3186        .unmap_sg = intel_unmap_sg,
3187        .map_page = intel_map_page,
3188        .unmap_page = intel_unmap_page,
3189        .mapping_error = intel_mapping_error,
3190};
3191
3192static inline int iommu_domain_cache_init(void)
3193{
3194        int ret = 0;
3195
3196        iommu_domain_cache = kmem_cache_create("iommu_domain",
3197                                         sizeof(struct dmar_domain),
3198                                         0,
3199                                         SLAB_HWCACHE_ALIGN,
3200
3201                                         NULL);
3202        if (!iommu_domain_cache) {
3203                printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3204                ret = -ENOMEM;
3205        }
3206
3207        return ret;
3208}
3209
3210static inline int iommu_devinfo_cache_init(void)
3211{
3212        int ret = 0;
3213
3214        iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3215                                         sizeof(struct device_domain_info),
3216                                         0,
3217                                         SLAB_HWCACHE_ALIGN,
3218                                         NULL);
3219        if (!iommu_devinfo_cache) {
3220                printk(KERN_ERR "Couldn't create devinfo cache\n");
3221                ret = -ENOMEM;
3222        }
3223
3224        return ret;
3225}
3226
3227static inline int iommu_iova_cache_init(void)
3228{
3229        int ret = 0;
3230
3231        iommu_iova_cache = kmem_cache_create("iommu_iova",
3232                                         sizeof(struct iova),
3233                                         0,
3234                                         SLAB_HWCACHE_ALIGN,
3235                                         NULL);
3236        if (!iommu_iova_cache) {
3237                printk(KERN_ERR "Couldn't create iova cache\n");
3238                ret = -ENOMEM;
3239        }
3240
3241        return ret;
3242}
3243
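/*
 * Create the slab caches for iova nodes, dmar_domain and device_domain_info
 * objects; on failure, the caches created so far are destroyed again.
 */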
3244static int __init iommu_init_mempool(void)
3245{
3246        int ret;
3247        ret = iommu_iova_cache_init();
3248        if (ret)
3249                return ret;
3250
3251        ret = iommu_domain_cache_init();
3252        if (ret)
3253                goto domain_error;
3254
3255        ret = iommu_devinfo_cache_init();
3256        if (!ret)
3257                return ret;
3258
3259        kmem_cache_destroy(iommu_domain_cache);
3260domain_error:
3261        kmem_cache_destroy(iommu_iova_cache);
3262
3263        return -ENOMEM;
3264}
3265
3266static void __init iommu_exit_mempool(void)
3267{
3268        kmem_cache_destroy(iommu_devinfo_cache);
3269        kmem_cache_destroy(iommu_domain_cache);
3270        kmem_cache_destroy(iommu_iova_cache);
3271
3272}
3273
3274static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3275{
3276        struct dmar_drhd_unit *drhd;
3277        u32 vtbar;
3278        int rc;
3279
3280        /* We know that this device on this chipset has its own IOMMU.
3281         * If we find it under a different IOMMU, then the BIOS is lying
3282         * to us. Hope that the IOMMU for this device is actually
3283         * disabled, and it needs no translation...
3284         */
3285        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3286        if (rc) {
3287                /* "can't" happen */
3288                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3289                return;
3290        }
3291        vtbar &= 0xffff0000;
3292
3293        /* we know that this iommu should be at offset 0xa000 from vtbar */
3294        drhd = dmar_find_matched_drhd_unit(pdev);
3295        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3296                            TAINT_FIRMWARE_WORKAROUND,
3297                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3298                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3299}
3300DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3301
3302static void __init init_no_remapping_devices(void)
3303{
3304        struct dmar_drhd_unit *drhd;
3305
3306        for_each_drhd_unit(drhd) {
3307                if (!drhd->include_all) {
3308                        int i;
3309                        for (i = 0; i < drhd->devices_cnt; i++)
3310                                if (drhd->devices[i] != NULL)
3311                                        break;
3312                        /* ignore DMAR unit if no pci devices exist */
3313                        if (i == drhd->devices_cnt)
3314                                drhd->ignored = 1;
3315                }
3316        }
3317
3318        for_each_drhd_unit(drhd) {
3319                int i;
3320                if (drhd->ignored || drhd->include_all)
3321                        continue;
3322
3323                for (i = 0; i < drhd->devices_cnt; i++)
3324                        if (drhd->devices[i] &&
3325                            !IS_GFX_DEVICE(drhd->devices[i]))
3326                                break;
3327
3328                if (i < drhd->devices_cnt)
3329                        continue;
3330
3331                /* This IOMMU has *only* gfx devices. Either bypass it or
3332                   set the gfx_mapped flag, as appropriate */
3333                if (dmar_map_gfx) {
3334                        intel_iommu_gfx_mapped = 1;
3335                } else {
3336                        drhd->ignored = 1;
3337                        for (i = 0; i < drhd->devices_cnt; i++) {
3338                                if (!drhd->devices[i])
3339                                        continue;
3340                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3341                        }
3342                }
3343        }
3344}
3345
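/*
 * Suspend/resume support for the remapping hardware: iommu_suspend() saves
 * each active IOMMU's fault-event registers and disables translation after a
 * global flush; iommu_resume() re-initializes the hardware through
 * init_iommu_hw() and writes the saved registers back.  Both are registered
 * as syscore ops.
 */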
3346#ifdef CONFIG_SUSPEND
3347static int init_iommu_hw(void)
3348{
3349        struct dmar_drhd_unit *drhd;
3350        struct intel_iommu *iommu = NULL;
3351
3352        for_each_active_iommu(iommu, drhd)
3353                if (iommu->qi)
3354                        dmar_reenable_qi(iommu);
3355
3356        for_each_iommu(iommu, drhd) {
3357                if (drhd->ignored) {
3358                        /*
3359                         * we always have to disable PMRs or DMA may fail on
3360                         * this device
3361                         */
3362                        if (force_on)
3363                                iommu_disable_protect_mem_regions(iommu);
3364                        continue;
3365                }
3366
3367                iommu_flush_write_buffer(iommu);
3368
3369                iommu_set_root_entry(iommu);
3370
3371                iommu->flush.flush_context(iommu, 0, 0, 0,
3372                                           DMA_CCMD_GLOBAL_INVL);
3373                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3374                                         DMA_TLB_GLOBAL_FLUSH);
3375                if (iommu_enable_translation(iommu))
3376                        return 1;
3377                iommu_disable_protect_mem_regions(iommu);
3378        }
3379
3380        return 0;
3381}
3382
3383static void iommu_flush_all(void)
3384{
3385        struct dmar_drhd_unit *drhd;
3386        struct intel_iommu *iommu;
3387
3388        for_each_active_iommu(iommu, drhd) {
3389                iommu->flush.flush_context(iommu, 0, 0, 0,
3390                                           DMA_CCMD_GLOBAL_INVL);
3391                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3392                                         DMA_TLB_GLOBAL_FLUSH);
3393        }
3394}
3395
3396static int iommu_suspend(void)
3397{
3398        struct dmar_drhd_unit *drhd;
3399        struct intel_iommu *iommu = NULL;
3400        unsigned long flag;
3401
3402        for_each_active_iommu(iommu, drhd) {
3403                iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3404                                                 GFP_ATOMIC);
3405                if (!iommu->iommu_state)
3406                        goto nomem;
3407        }
3408
3409        iommu_flush_all();
3410
3411        for_each_active_iommu(iommu, drhd) {
3412                iommu_disable_translation(iommu);
3413
3414                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3415
3416                iommu->iommu_state[SR_DMAR_FECTL_REG] =
3417                        readl(iommu->reg + DMAR_FECTL_REG);
3418                iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3419                        readl(iommu->reg + DMAR_FEDATA_REG);
3420                iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3421                        readl(iommu->reg + DMAR_FEADDR_REG);
3422                iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3423                        readl(iommu->reg + DMAR_FEUADDR_REG);
3424
3425                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3426        }
3427        return 0;
3428
3429nomem:
3430        for_each_active_iommu(iommu, drhd)
3431                kfree(iommu->iommu_state);
3432
3433        return -ENOMEM;
3434}
3435
3436static void iommu_resume(void)
3437{
3438        struct dmar_drhd_unit *drhd;
3439        struct intel_iommu *iommu = NULL;
3440        unsigned long flag;
3441
3442        if (init_iommu_hw()) {
3443                if (force_on)
3444                        panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3445                else
3446                        WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3447                return;
3448        }
3449
3450        for_each_active_iommu(iommu, drhd) {
3451
3452                raw_spin_lock_irqsave(&iommu->register_lock, flag);
3453
3454                writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3455                        iommu->reg + DMAR_FECTL_REG);
3456                writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3457                        iommu->reg + DMAR_FEDATA_REG);
3458                writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3459                        iommu->reg + DMAR_FEADDR_REG);
3460                writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3461                        iommu->reg + DMAR_FEUADDR_REG);
3462
3463                raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3464        }
3465
3466        for_each_active_iommu(iommu, drhd)
3467                kfree(iommu->iommu_state);
3468}
3469
3470static struct syscore_ops iommu_syscore_ops = {
3471        .resume         = iommu_resume,
3472        .suspend        = iommu_suspend,
3473};
3474
3475static void __init init_iommu_pm_ops(void)
3476{
3477        register_syscore_ops(&iommu_syscore_ops);
3478}
3479
3480#else
3481static inline void init_iommu_pm_ops(void) {}
3482#endif  /* CONFIG_SUSPEND */
3483
3484LIST_HEAD(dmar_rmrr_units);
3485
3486static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3487{
3488        list_add(&rmrr->list, &dmar_rmrr_units);
3489}
3490
3491
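/*
 * RMRRs (Reserved Memory Region Reporting structures) in the ACPI DMAR table
 * describe memory that a device may already be using for DMA when the OS
 * takes over (e.g. USB legacy keyboard buffers).  Each one is recorded here;
 * the device scopes are resolved later from dmar_parse_rmrr_atsr_dev().
 */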
3492int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3493{
3494        struct acpi_dmar_reserved_memory *rmrr;
3495        struct dmar_rmrr_unit *rmrru;
3496
3497        rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3498        if (!rmrru)
3499                return -ENOMEM;
3500
3501        rmrru->hdr = header;
3502        rmrr = (struct acpi_dmar_reserved_memory *)header;
3503        rmrru->base_address = rmrr->base_address;
3504        rmrru->end_address = rmrr->end_address;
3505
3506        dmar_register_rmrr_unit(rmrru);
3507        return 0;
3508}
3509
3510static int __init
3511rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3512{
3513        struct acpi_dmar_reserved_memory *rmrr;
3514        int ret;
3515
3516        rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3517        ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3518                ((void *)rmrr) + rmrr->header.length,
3519                &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3520
3521        if (ret || (rmrru->devices_cnt == 0)) {
3522                list_del(&rmrru->list);
3523                kfree(rmrru);
3524        }
3525        return ret;
3526}
3527
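/*
 * ATSRs (ATS Reporting structures) list the PCIe root ports below which
 * devices are allowed to use Address Translation Services (device IOTLBs).
 * dmar_find_matched_atsr_unit() walks a device's bus hierarchy to decide
 * whether ATS may be enabled for it.
 */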
3528static LIST_HEAD(dmar_atsr_units);
3529
3530int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3531{
3532        struct acpi_dmar_atsr *atsr;
3533        struct dmar_atsr_unit *atsru;
3534
3535        atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3536        atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3537        if (!atsru)
3538                return -ENOMEM;
3539
3540        atsru->hdr = hdr;
3541        atsru->include_all = atsr->flags & 0x1;
3542
3543        list_add(&atsru->list, &dmar_atsr_units);
3544
3545        return 0;
3546}
3547
3548static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3549{
3550        int rc;
3551        struct acpi_dmar_atsr *atsr;
3552
3553        if (atsru->include_all)
3554                return 0;
3555
3556        atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3557        rc = dmar_parse_dev_scope((void *)(atsr + 1),
3558                                (void *)atsr + atsr->header.length,
3559                                &atsru->devices_cnt, &atsru->devices,
3560                                atsr->segment);
3561        if (rc || !atsru->devices_cnt) {
3562                list_del(&atsru->list);
3563                kfree(atsru);
3564        }
3565
3566        return rc;
3567}
3568
3569int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3570{
3571        int i;
3572        struct pci_bus *bus;
3573        struct acpi_dmar_atsr *atsr;
3574        struct dmar_atsr_unit *atsru;
3575
3576        dev = pci_physfn(dev);
3577
3578        list_for_each_entry(atsru, &dmar_atsr_units, list) {
3579                atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3580                if (atsr->segment == pci_domain_nr(dev->bus))
3581                        goto found;
3582        }
3583
3584        return 0;
3585
3586found:
3587        for (bus = dev->bus; bus; bus = bus->parent) {
3588                struct pci_dev *bridge = bus->self;
3589
3590                if (!bridge || !pci_is_pcie(bridge) ||
3591                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3592                        return 0;
3593
3594                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3595                        for (i = 0; i < atsru->devices_cnt; i++)
3596                                if (atsru->devices[i] == bridge)
3597                                        return 1;
3598                        break;
3599                }
3600        }
3601
3602        if (atsru->include_all)
3603                return 1;
3604
3605        return 0;
3606}
3607
3608int __init dmar_parse_rmrr_atsr_dev(void)
3609{
3610        struct dmar_rmrr_unit *rmrr, *rmrr_n;
3611        struct dmar_atsr_unit *atsr, *atsr_n;
3612        int ret = 0;
3613
3614        list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3615                ret = rmrr_parse_dev(rmrr);
3616                if (ret)
3617                        return ret;
3618        }
3619
3620        list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3621                ret = atsr_parse_dev(atsr);
3622                if (ret)
3623                        return ret;
3624        }
3625
3626        return ret;
3627}
3628
3629/*
3630 * Here we only respond to a device being unbound from its driver.
3631 *
3632 * A newly added device is not attached to its DMAR domain here yet; that
3633 * happens when the device is first mapped to an iova.
3634 */
3635static int device_notifier(struct notifier_block *nb,
3636                                  unsigned long action, void *data)
3637{
3638        struct device *dev = data;
3639        struct pci_dev *pdev = to_pci_dev(dev);
3640        struct dmar_domain *domain;
3641
3642        if (iommu_no_mapping(dev))
3643                return 0;
3644
3645        domain = find_domain(pdev);
3646        if (!domain)
3647                return 0;
3648
3649        if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3650                domain_remove_one_dev_info(domain, pdev);
3651
3652                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3653                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3654                    list_empty(&domain->devices))
3655                        domain_exit(domain);
3656        }
3657
3658        return 0;
3659}
3660
3661static struct notifier_block device_nb = {
3662        .notifier_call = device_notifier,
3663};
3664
3665int __init intel_iommu_init(void)
3666{
3667        int ret = 0;
3668
3669        /* VT-d is required for a TXT/tboot launch, so enforce that */
3670        force_on = tboot_force_iommu();
3671
3672        if (dmar_table_init()) {
3673                if (force_on)
3674                        panic("tboot: Failed to initialize DMAR table\n");
3675                return  -ENODEV;
3676        }
3677
3678        if (dmar_dev_scope_init() < 0) {
3679                if (force_on)
3680                        panic("tboot: Failed to initialize DMAR device scope\n");
3681                return  -ENODEV;
3682        }
3683
3684        if (no_iommu || dmar_disabled)
3685                return -ENODEV;
3686
3687        if (iommu_init_mempool()) {
3688                if (force_on)
3689                        panic("tboot: Failed to initialize iommu memory\n");
3690                return  -ENODEV;
3691        }
3692
3693        if (list_empty(&dmar_rmrr_units))
3694                printk(KERN_INFO "DMAR: No RMRR found\n");
3695
3696        if (list_empty(&dmar_atsr_units))
3697                printk(KERN_INFO "DMAR: No ATSR found\n");
3698
3699        if (dmar_init_reserved_ranges()) {
3700                if (force_on)
3701                        panic("tboot: Failed to reserve iommu ranges\n");
3702                return  -ENODEV;
3703        }
3704
3705        init_no_remapping_devices();
3706
3707        ret = init_dmars();
3708        if (ret) {
3709                if (force_on)
3710                        panic("tboot: Failed to initialize DMARs\n");
3711                printk(KERN_ERR "IOMMU: dmar init failed\n");
3712                put_iova_domain(&reserved_iova_list);
3713                iommu_exit_mempool();
3714                return ret;
3715        }
3716        printk(KERN_INFO
3717        "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3718
3719        init_timer(&unmap_timer);
3720#ifdef CONFIG_SWIOTLB
3721        swiotlb = 0;
3722#endif
3723        dma_ops = &intel_dma_ops;
3724
3725        init_iommu_pm_ops();
3726
3727        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3728
3729        bus_register_notifier(&pci_bus_type, &device_nb);
3730
3731        intel_iommu_enabled = 1;
3732
3733        return 0;
3734}
3735
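/*
 * When a device sits behind a PCIe-to-PCI bridge, its DMA requests reach the
 * IOMMU tagged with the bridge's source-id, so context entries were also set
 * up for the bridges on the upstream path.  Tear those down as well when the
 * device leaves the domain.
 */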
3736static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3737                                           struct pci_dev *pdev)
3738{
3739        struct pci_dev *tmp, *parent;
3740
3741        if (!iommu || !pdev)
3742                return;
3743
3744        /* dependent device detach */
3745        tmp = pci_find_upstream_pcie_bridge(pdev);
3746        /* Secondary interface's bus number and devfn 0 */
3747        if (tmp) {
3748                parent = pdev->bus->self;
3749                while (parent != tmp) {
3750                        iommu_detach_dev(iommu, parent->bus->number,
3751                                         parent->devfn);
3752                        parent = parent->bus->self;
3753                }
3754                if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3755                        iommu_detach_dev(iommu,
3756                                tmp->subordinate->number, 0);
3757                else /* this is a legacy PCI bridge */
3758                        iommu_detach_dev(iommu, tmp->bus->number,
3759                                         tmp->devfn);
3760        }
3761}
3762
3763static void domain_remove_one_dev_info(struct dmar_domain *domain,
3764                                          struct pci_dev *pdev)
3765{
3766        struct device_domain_info *info;
3767        struct intel_iommu *iommu;
3768        unsigned long flags;
3769        int found = 0;
3770        struct list_head *entry, *tmp;
3771
3772        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3773                                pdev->devfn);
3774        if (!iommu)
3775                return;
3776
3777        spin_lock_irqsave(&device_domain_lock, flags);
3778        list_for_each_safe(entry, tmp, &domain->devices) {
3779                info = list_entry(entry, struct device_domain_info, link);
3780                if (info->segment == pci_domain_nr(pdev->bus) &&
3781                    info->bus == pdev->bus->number &&
3782                    info->devfn == pdev->devfn) {
3783                        unlink_domain_info(info);
3784                        spin_unlock_irqrestore(&device_domain_lock, flags);
3785
3786                        iommu_disable_dev_iotlb(info);
3787                        iommu_detach_dev(iommu, info->bus, info->devfn);
3788                        iommu_detach_dependent_devices(iommu, pdev);
3789                        free_devinfo_mem(info);
3790
3791                        spin_lock_irqsave(&device_domain_lock, flags);
3792
3793                        if (found)
3794                                break;
3795                        else
3796                                continue;
3797                }
3798
3799                /* if there are no other devices under the same iommu
3800                 * owned by this domain, clear this iommu in iommu_bmp and
3801                 * update the iommu count and coherency
3802                 */
3803                if (iommu == device_to_iommu(info->segment, info->bus,
3804                                            info->devfn))
3805                        found = 1;
3806        }
3807
3808        spin_unlock_irqrestore(&device_domain_lock, flags);
3809
3810        if (found == 0) {
3811                unsigned long tmp_flags;
3812                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3813                clear_bit(iommu->seq_id, domain->iommu_bmp);
3814                domain->iommu_count--;
3815                domain_update_iommu_cap(domain);
3816                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3817
3818                if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3819                    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3820                        spin_lock_irqsave(&iommu->lock, tmp_flags);
3821                        clear_bit(domain->id, iommu->domain_ids);
3822                        iommu->domains[domain->id] = NULL;
3823                        spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3824                }
3825        }
3826}
3827
3828static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3829{
3830        struct device_domain_info *info;
3831        struct intel_iommu *iommu;
3832        unsigned long flags1, flags2;
3833
3834        spin_lock_irqsave(&device_domain_lock, flags1);
3835        while (!list_empty(&domain->devices)) {
3836                info = list_entry(domain->devices.next,
3837                        struct device_domain_info, link);
3838                unlink_domain_info(info);
3839                spin_unlock_irqrestore(&device_domain_lock, flags1);
3840
3841                iommu_disable_dev_iotlb(info);
3842                iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3843                iommu_detach_dev(iommu, info->bus, info->devfn);
3844                iommu_detach_dependent_devices(iommu, info->dev);
3845
3846                /* clear this iommu in iommu_bmp, update iommu count
3847                 * and capabilities
3848                 */
3849                spin_lock_irqsave(&domain->iommu_lock, flags2);
3850                if (test_and_clear_bit(iommu->seq_id,
3851                                       domain->iommu_bmp)) {
3852                        domain->iommu_count--;
3853                        domain_update_iommu_cap(domain);
3854                }
3855                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3856
3857                free_devinfo_mem(info);
3858                spin_lock_irqsave(&device_domain_lock, flags1);
3859        }
3860        spin_unlock_irqrestore(&device_domain_lock, flags1);
3861}
3862
3863/* domain id for virtual machine; it won't be set in a context entry */
3864static unsigned long vm_domid;
3865
3866static struct dmar_domain *iommu_alloc_vm_domain(void)
3867{
3868        struct dmar_domain *domain;
3869
3870        domain = alloc_domain_mem();
3871        if (!domain)
3872                return NULL;
3873
3874        domain->id = vm_domid++;
3875        domain->nid = -1;
3876        memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3877        domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3878
3879        return domain;
3880}
3881
3882static int md_domain_init(struct dmar_domain *domain, int guest_width)
3883{
3884        int adjust_width;
3885
3886        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3887        spin_lock_init(&domain->iommu_lock);
3888
3889        domain_reserve_special_ranges(domain);
3890
3891        /* calculate AGAW */
3892        domain->gaw = guest_width;
3893        adjust_width = guestwidth_to_adjustwidth(guest_width);
3894        domain->agaw = width_to_agaw(adjust_width);
3895
3896        INIT_LIST_HEAD(&domain->devices);
3897
3898        domain->iommu_count = 0;
3899        domain->iommu_coherency = 0;
3900        domain->iommu_snooping = 0;
3901        domain->iommu_superpage = 0;
3902        domain->max_addr = 0;
3903        domain->nid = -1;
3904
3905        /* always allocate the top pgd */
3906        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3907        if (!domain->pgd)
3908                return -ENOMEM;
3909        domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3910        return 0;
3911}
3912
3913static void iommu_free_vm_domain(struct dmar_domain *domain)
3914{
3915        unsigned long flags;
3916        struct dmar_drhd_unit *drhd;
3917        struct intel_iommu *iommu;
3918        unsigned long i;
3919        unsigned long ndomains;
3920
3921        for_each_drhd_unit(drhd) {
3922                if (drhd->ignored)
3923                        continue;
3924                iommu = drhd->iommu;
3925
3926                ndomains = cap_ndoms(iommu->cap);
3927                for_each_set_bit(i, iommu->domain_ids, ndomains) {
3928                        if (iommu->domains[i] == domain) {
3929                                spin_lock_irqsave(&iommu->lock, flags);
3930                                clear_bit(i, iommu->domain_ids);
3931                                iommu->domains[i] = NULL;
3932                                spin_unlock_irqrestore(&iommu->lock, flags);
3933                                break;
3934                        }
3935                }
3936        }
3937}
3938
3939static void vm_domain_exit(struct dmar_domain *domain)
3940{
3941        /* Domain 0 is reserved, so don't process it */
3942        if (!domain)
3943                return;
3944
3945        vm_domain_remove_all_dev_info(domain);
3946        /* destroy iovas */
3947        put_iova_domain(&domain->iovad);
3948
3949        /* clear ptes */
3950        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3951
3952        /* free page tables */
3953        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3954
3955        iommu_free_vm_domain(domain);
3956        free_domain_mem(domain);
3957}
3958
3959static int intel_iommu_domain_init(struct iommu_domain *domain)
3960{
3961        struct dmar_domain *dmar_domain;
3962
3963        dmar_domain = iommu_alloc_vm_domain();
3964        if (!dmar_domain) {
3965                printk(KERN_ERR
3966                        "intel_iommu_domain_init: dmar_domain == NULL\n");
3967                return -ENOMEM;
3968        }
3969        if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3970                printk(KERN_ERR
3971                        "intel_iommu_domain_init() failed\n");
3972                vm_domain_exit(dmar_domain);
3973                return -ENOMEM;
3974        }
3975        domain_update_iommu_cap(dmar_domain);
3976        domain->priv = dmar_domain;
3977
3978        domain->geometry.aperture_start = 0;
3979        domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3980        domain->geometry.force_aperture = true;
3981
3982        return 0;
3983}
3984
3985static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3986{
3987        struct dmar_domain *dmar_domain = domain->priv;
3988
3989        domain->priv = NULL;
3990        vm_domain_exit(dmar_domain);
3991}
3992
3993static int intel_iommu_attach_device(struct iommu_domain *domain,
3994                                     struct device *dev)
3995{
3996        struct dmar_domain *dmar_domain = domain->priv;
3997        struct pci_dev *pdev = to_pci_dev(dev);
3998        struct intel_iommu *iommu;
3999        int addr_width;
4000
4001        /* normally pdev is not mapped */
4002        if (unlikely(domain_context_mapped(pdev))) {
4003                struct dmar_domain *old_domain;
4004
4005                old_domain = find_domain(pdev);
4006                if (old_domain) {
4007                        if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4008                            dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4009                                domain_remove_one_dev_info(old_domain, pdev);
4010                        else
4011                                domain_remove_dev_info(old_domain);
4012                }
4013        }
4014
4015        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4016                                pdev->devfn);
4017        if (!iommu)
4018                return -ENODEV;
4019
4020        /* check if this iommu agaw is sufficient for max mapped address */
4021        addr_width = agaw_to_width(iommu->agaw);
4022        if (addr_width > cap_mgaw(iommu->cap))
4023                addr_width = cap_mgaw(iommu->cap);
4024
4025        if (dmar_domain->max_addr > (1LL << addr_width)) {
4026                printk(KERN_ERR "%s: iommu width (%d) is not "
4027                       "sufficient for the mapped address (%llx)\n",
4028                       __func__, addr_width, dmar_domain->max_addr);
4029                return -EFAULT;
4030        }
4031        dmar_domain->gaw = addr_width;
4032
4033        /*
4034         * Knock out extra levels of page tables if necessary
4035         */
4036        while (iommu->agaw < dmar_domain->agaw) {
4037                struct dma_pte *pte;
4038
4039                pte = dmar_domain->pgd;
4040                if (dma_pte_present(pte)) {
4041                        dmar_domain->pgd = (struct dma_pte *)
4042                                phys_to_virt(dma_pte_addr(pte));
4043                        free_pgtable_page(pte);
4044                }
4045                dmar_domain->agaw--;
4046        }
4047
4048        return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4049}
4050
4051static void intel_iommu_detach_device(struct iommu_domain *domain,
4052                                      struct device *dev)
4053{
4054        struct dmar_domain *dmar_domain = domain->priv;
4055        struct pci_dev *pdev = to_pci_dev(dev);
4056
4057        domain_remove_one_dev_info(dmar_domain, pdev);
4058}
4059
4060static int intel_iommu_map(struct iommu_domain *domain,
4061                           unsigned long iova, phys_addr_t hpa,
4062                           size_t size, int iommu_prot)
4063{
4064        struct dmar_domain *dmar_domain = domain->priv;
4065        u64 max_addr;
4066        int prot = 0;
4067        int ret;
4068
4069        if (iommu_prot & IOMMU_READ)
4070                prot |= DMA_PTE_READ;
4071        if (iommu_prot & IOMMU_WRITE)
4072                prot |= DMA_PTE_WRITE;
4073        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4074                prot |= DMA_PTE_SNP;
4075
4076        max_addr = iova + size;
4077        if (dmar_domain->max_addr < max_addr) {
4078                u64 end;
4079
4080                /* check if minimum agaw is sufficient for mapped address */
4081                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4082                if (end < max_addr) {
4083                        printk(KERN_ERR "%s: iommu width (%d) is not "
4084                               "sufficient for the mapped address (%llx)\n",
4085                               __func__, dmar_domain->gaw, max_addr);
4086                        return -EFAULT;
4087                }
4088                dmar_domain->max_addr = max_addr;
4089        }
4090        /* Round up size to next multiple of PAGE_SIZE, if it and
4091           the low bits of hpa would take us onto the next page */
4092        size = aligned_nrpages(hpa, size);
4093        ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4094                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4095        return ret;
4096}
4097
4098static size_t intel_iommu_unmap(struct iommu_domain *domain,
4099                             unsigned long iova, size_t size)
4100{
4101        struct dmar_domain *dmar_domain = domain->priv;
4102        int order;
4103
4104        order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4105                            (iova + size - 1) >> VTD_PAGE_SHIFT);
4106
4107        if (dmar_domain->max_addr == iova + size)
4108                dmar_domain->max_addr = iova;
4109
4110        return PAGE_SIZE << order;
4111}
4112
4113static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4114                                            unsigned long iova)
4115{
4116        struct dmar_domain *dmar_domain = domain->priv;
4117        struct dma_pte *pte;
4118        u64 phys = 0;
4119
4120        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4121        if (pte)
4122                phys = dma_pte_addr(pte);
4123
4124        return phys;
4125}
4126
4127static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4128                                      unsigned long cap)
4129{
4130        struct dmar_domain *dmar_domain = domain->priv;
4131
4132        if (cap == IOMMU_CAP_CACHE_COHERENCY)
4133                return dmar_domain->iommu_snooping;
4134        if (cap == IOMMU_CAP_INTR_REMAP)
4135                return irq_remapping_enabled;
4136
4137        return 0;
4138}
4139
4140static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
4141{
4142        pci_dev_put(*from);
4143        *from = to;
4144}
4145
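/*
 * For safe isolation, every device on the path must support PCI ACS with
 * Source Validation, Request Redirect, Completion Redirect and Upstream
 * Forwarding, so peer-to-peer DMA cannot bypass the IOMMU.
 * intel_iommu_add_device() walks toward the root bus and puts devices that
 * are not isolated by ACS into the same iommu_group.
 */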
4146#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4147
4148static int intel_iommu_add_device(struct device *dev)
4149{
4150        struct pci_dev *pdev = to_pci_dev(dev);
4151        struct pci_dev *bridge, *dma_pdev = NULL;
4152        struct iommu_group *group;
4153        int ret;
4154
4155        if (!device_to_iommu(pci_domain_nr(pdev->bus),
4156                             pdev->bus->number, pdev->devfn))
4157                return -ENODEV;
4158
4159        bridge = pci_find_upstream_pcie_bridge(pdev);
4160        if (bridge) {
4161                if (pci_is_pcie(bridge))
4162                        dma_pdev = pci_get_domain_bus_and_slot(
4163                                                pci_domain_nr(pdev->bus),
4164                                                bridge->subordinate->number, 0);
4165                if (!dma_pdev)
4166                        dma_pdev = pci_dev_get(bridge);
4167        } else
4168                dma_pdev = pci_dev_get(pdev);
4169
4170        /* Account for quirked devices */
4171        swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4172
4173        /*
4174         * If it's a multifunction device that does not support our
4175         * required ACS flags, add to the same group as function 0.
4176         */
4177        if (dma_pdev->multifunction &&
4178            !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
4179                swap_pci_ref(&dma_pdev,
4180                             pci_get_slot(dma_pdev->bus,
4181                                          PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
4182                                          0)));
4183
4184        /*
4185         * Devices on the root bus go through the iommu.  If that's not us,
4186         * find the next upstream device and test ACS up to the root bus.
4187         * Finding the next device may require skipping virtual buses.
4188         */
4189        while (!pci_is_root_bus(dma_pdev->bus)) {
4190                struct pci_bus *bus = dma_pdev->bus;
4191
4192                while (!bus->self) {
4193                        if (!pci_is_root_bus(bus))
4194                                bus = bus->parent;
4195                        else
4196                                goto root_bus;
4197                }
4198
4199                if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4200                        break;
4201
4202                swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4203        }
4204
4205root_bus:
4206        group = iommu_group_get(&dma_pdev->dev);
4207        pci_dev_put(dma_pdev);
4208        if (!group) {
4209                group = iommu_group_alloc();
4210                if (IS_ERR(group))
4211                        return PTR_ERR(group);
4212        }
4213
4214        ret = iommu_group_add_device(group, dev);
4215
4216        iommu_group_put(group);
4217        return ret;
4218}
4219
4220static void intel_iommu_remove_device(struct device *dev)
4221{
4222        iommu_group_remove_device(dev);
4223}
4224
4225static struct iommu_ops intel_iommu_ops = {
4226        .domain_init    = intel_iommu_domain_init,
4227        .domain_destroy = intel_iommu_domain_destroy,
4228        .attach_dev     = intel_iommu_attach_device,
4229        .detach_dev     = intel_iommu_detach_device,
4230        .map            = intel_iommu_map,
4231        .unmap          = intel_iommu_unmap,
4232        .iova_to_phys   = intel_iommu_iova_to_phys,
4233        .domain_has_cap = intel_iommu_domain_has_cap,
4234        .add_device     = intel_iommu_add_device,
4235        .remove_device  = intel_iommu_remove_device,
4236        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4237};
4238
4239static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4240{
4241        /* G4x/GM45 integrated gfx dmar support is totally busted. */
4242        printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4243        dmar_map_gfx = 0;
4244}
4245
4246DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4247DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4248DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4249DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4250DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4251DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4252DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4253
4254static void quirk_iommu_rwbf(struct pci_dev *dev)
4255{
4256        /*
4257         * Mobile 4 Series Chipset neglects to set RWBF capability,
4258         * but needs it. Same seems to hold for the desktop versions.
4259         */
4260        printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4261        rwbf_quirk = 1;
4262}
4263
4264DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4265DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4266DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4267DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4268DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4269DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4270DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4271
4272#define GGC 0x52
4273#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4274#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4275#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4276#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4277#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4278#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4279#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4280#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4281
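/*
 * GGC (offset 0x52 above) is the graphics control register in the host
 * bridge's config space on these chipsets.  The GGC_MEMORY_* values encode
 * the graphics stolen-memory/GTT sizes; GGC_MEMORY_VT_ENABLED (and the *_VT
 * sizes) indicate that the BIOS reserved space for a VT-d shadow GTT, which
 * the quirk below checks for.
 */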
4282static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4283{
4284        unsigned short ggc;
4285
4286        if (pci_read_config_word(dev, GGC, &ggc))
4287                return;
4288
4289        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4290                printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4291                dmar_map_gfx = 0;
4292        } else if (dmar_map_gfx) {
4293                /* we have to ensure the gfx device is idle before we flush */
4294                printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4295                intel_iommu_strict = 1;
4296        }
4297}
4298DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4299DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4300DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4301DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4302
4303/* On Tylersburg chipsets, some BIOSes have been known to enable the
4304   ISOCH DMAR unit for the Azalia sound device, but not give it any
4305   TLB entries, which causes it to deadlock. Check for that.  We do
4306   this in a function called from init_dmars(), instead of in a PCI
4307   quirk, because we don't want to print the obnoxious "BIOS broken"
4308   message if VT-d is actually disabled.
4309*/
4310static void __init check_tylersburg_isoch(void)
4311{
4312        struct pci_dev *pdev;
4313        uint32_t vtisochctrl;
4314
4315        /* If there's no Azalia in the system anyway, forget it. */
4316        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4317        if (!pdev)
4318                return;
4319        pci_dev_put(pdev);
4320
4321        /* System Management Registers. Might be hidden, in which case
4322           we can't do the sanity check. But that's OK, because the
4323           known-broken BIOSes _don't_ actually hide it, so far. */
4324        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4325        if (!pdev)
4326                return;
4327
4328        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4329                pci_dev_put(pdev);
4330                return;
4331        }
4332
4333        pci_dev_put(pdev);
4334
4335        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4336        if (vtisochctrl & 1)
4337                return;
4338
4339        /* Drop all bits other than the number of TLB entries */
4340        vtisochctrl &= 0x1c;
4341
4342        /* If we have the recommended number of TLB entries (16), fine. */
4343        if (vtisochctrl == 0x10)
4344                return;
4345
4346        /* Zero TLB entries? You get to ride the short bus to school. */
4347        if (!vtisochctrl) {
4348                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4349                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4350                     dmi_get_system_info(DMI_BIOS_VENDOR),
4351                     dmi_get_system_info(DMI_BIOS_VERSION),
4352                     dmi_get_system_info(DMI_PRODUCT_VERSION));
4353                iommu_identity_mapping |= IDENTMAP_AZALIA;
4354                return;
4355        }
4356
4357        printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4358               vtisochctrl);
4359}
4360