linux/arch/x86/mm/pat.c
/*
 * Handle caching attributes in page tables (PAT)
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 */

#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/fs.h>

#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include <asm/io.h>

#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;

void __cpuinit pat_disable(char *reason)
{
        pat_enabled = 0;
        printk(KERN_INFO "%s\n", reason);
}

static int __init nopat(char *str)
{
        pat_disable("PAT support disabled.");
        return 0;
}
early_param("nopat", nopat);
#endif


static int debug_enable;

static int __init pat_debug_setup(char *str)
{
        debug_enable = 1;
        return 0;
}
__setup("debugpat", pat_debug_setup);

#define dprintk(fmt, arg...) \
        do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)


static u64 __read_mostly boot_pat_state;

enum {
        PAT_UC = 0,             /* uncached */
        PAT_WC = 1,             /* Write combining */
        PAT_WT = 4,             /* Write Through */
        PAT_WP = 5,             /* Write Protected */
        PAT_WB = 6,             /* Write Back (default) */
        PAT_UC_MINUS = 7,       /* UC, but can be overridden by MTRR */
};

#define PAT(x, y)       ((u64)PAT_ ## y << ((x)*8))

void pat_init(void)
{
        u64 pat;

        if (!pat_enabled)
                return;

        /* Paranoia check. */
        if (!cpu_has_pat && boot_pat_state) {
                /*
                 * If this happens we are on a secondary CPU, but
                 * switched to PAT on the boot CPU. We have no way to
                 * undo PAT.
                 */
                printk(KERN_ERR "PAT enabled, "
                       "but not supported by secondary CPU\n");
                BUG();
        }

        /* Set PWT to Write-Combining. All other bits stay the same. */
        /*
         * PTE encoding used in Linux:
         *      PAT
         *      |PCD
         *      ||PWT
         *      |||
         *      000 WB          _PAGE_CACHE_WB
         *      001 WC          _PAGE_CACHE_WC
         *      010 UC-         _PAGE_CACHE_UC_MINUS
         *      011 UC          _PAGE_CACHE_UC
         * PAT bit unused
         */
        pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
              PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);

        /* Boot CPU check */
        if (!boot_pat_state)
                rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);

        wrmsrl(MSR_IA32_CR_PAT, pat);
        printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
               smp_processor_id(), boot_pat_state, pat);
}

#undef PAT
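
/*
 * Worked example (for illustration): with the PAT() macro above, the value
 * written to MSR_IA32_CR_PAT works out to
 *
 *      pat = 0x0007010600070106
 *
 * i.e. entries 0..7 are WB(06), WC(01), UC-(07), UC(00), repeated once for
 * the upper four slots. A 4K PTE selects an entry via
 * index = PAT*4 + PCD*2 + PWT, so e.g. PCD=1/PWT=0 selects entry 2 = UC_MINUS.
 */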

static char *cattr_name(unsigned long flags)
{
        switch (flags & _PAGE_CACHE_MASK) {
        case _PAGE_CACHE_UC:            return "uncached";
        case _PAGE_CACHE_UC_MINUS:      return "uncached-minus";
        case _PAGE_CACHE_WB:            return "write-back";
        case _PAGE_CACHE_WC:            return "write-combining";
        default:                        return "broken";
        }
}

/*
 * The global memtype list keeps track of the memory type for specific
 * physical memory areas. Conflicting memory types in different
 * mappings can cause CPU cache corruption. To avoid this we keep track.
 *
 * The list is sorted based on starting address and can contain multiple
 * entries for each address (this allows reference counting for overlapping
 * areas). All the aliases have the same cache attributes of course.
 * Zero attributes are represented as holes.
 *
 * Currently the data structure is a list because the number of mappings
 * is expected to be relatively small. If this should become a problem
 * it could be changed to an rbtree or similar.
 *
 * memtype_lock protects the whole list.
 */

struct memtype {
        u64                     start;
        u64                     end;
        unsigned long           type;
        struct list_head        nd;
};

static LIST_HEAD(memtype_list);
static DEFINE_SPINLOCK(memtype_lock);   /* protects memtype list */

/*
 * Intersects the PAT memory type with the MTRR memory type and returns
 * the resulting memory type as PAT understands it.
 * (The type values used by PAT and MTRR are not the same.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
{
        /*
         * Look for an MTRR hint to get the effective type in the case where
         * the PAT request is for WB.
         */
        if (req_type == _PAGE_CACHE_WB) {
                u8 mtrr_type;

                mtrr_type = mtrr_type_lookup(start, end);
                if (mtrr_type == MTRR_TYPE_UNCACHABLE)
                        return _PAGE_CACHE_UC;
                if (mtrr_type == MTRR_TYPE_WRCOMB)
                        return _PAGE_CACHE_WC;
        }

        return req_type;
}
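
/*
 * Illustrative effect of the intersection: a _PAGE_CACHE_WB request over a
 * range that an MTRR marks MTRR_TYPE_UNCACHABLE is downgraded to
 * _PAGE_CACHE_UC, and over an MTRR_TYPE_WRCOMB range it becomes
 * _PAGE_CACHE_WC. Any other request, or WB over a WB MTRR range, is
 * returned unchanged.
 */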

static int
chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
{
        if (new->type != entry->type) {
                if (type) {
                        new->type = entry->type;
                        *type = entry->type;
                } else
                        goto conflict;
        }

         /* check overlaps with more than one entry in the list */
        list_for_each_entry_continue(entry, &memtype_list, nd) {
                if (new->end <= entry->start)
                        break;
                else if (new->type != entry->type)
                        goto conflict;
        }
        return 0;

 conflict:
        printk(KERN_INFO "%s:%d conflicting memory types "
               "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
               new->end, cattr_name(new->type), cattr_name(entry->type));
        return -EBUSY;
}

static struct memtype *cached_entry;
static u64 cached_start;

/*
 * For RAM pages, mark the pages as non-WB memory type using
 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
 * set_memory_wc() on a RAM page at a time before marking it as WB again.
 * This is ok, because only one driver will own the page and do the
 * set_memory_*() calls.
 *
 * For now, we use PageNonWB to track that the RAM page is being mapped
 * as non-WB. In the future, we will have to use one more flag
 * (or some other mechanism in struct page) to distinguish between
 * UC and WC mapping.
 */
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
                                  unsigned long *new_type)
{
        struct page *page;
        u64 pfn, end_pfn;

        for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
                page = pfn_to_page(pfn);
                if (page_mapped(page) || PageNonWB(page))
                        goto out;

                SetPageNonWB(page);
        }
        return 0;

out:
        end_pfn = pfn;
        for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
                page = pfn_to_page(pfn);
                ClearPageNonWB(page);
        }

        return -EINVAL;
}

static int free_ram_pages_type(u64 start, u64 end)
{
        struct page *page;
        u64 pfn, end_pfn;

        for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
                page = pfn_to_page(pfn);
                if (page_mapped(page) || !PageNonWB(page))
                        goto out;

                ClearPageNonWB(page);
        }
        return 0;

out:
        end_pfn = pfn;
        for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
                page = pfn_to_page(pfn);
                SetPageNonWB(page);
        }
        return -EINVAL;
}
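
/*
 * Illustrative flow: set_memory_uc() on a RAM range ends up in
 * reserve_ram_pages_type() via reserve_memtype(), setting PageNonWB on every
 * page. A second reservation of any of those pages (or of a page that is
 * still mapped) fails with -EINVAL until the matching free_memtype() call
 * reaches free_ram_pages_type() and clears the flag again.
 */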

/*
 * req_type typically has one of the following:
 * - _PAGE_CACHE_WB
 * - _PAGE_CACHE_WC
 * - _PAGE_CACHE_UC_MINUS
 * - _PAGE_CACHE_UC
 *
 * req_type will have the special case value '-1' when the requester wants to
 * inherit the memory type from the MTRR (if WB) or existing PAT, defaulting
 * to UC_MINUS.
 *
 * If new_type is NULL, the function will return an error if it cannot reserve
 * the region with req_type. If new_type is non-NULL, the function will return
 * the available type in new_type on success. In case of any error it will
 * return a negative value.
 */
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
                    unsigned long *new_type)
{
        struct memtype *new, *entry;
        unsigned long actual_type;
        struct list_head *where;
        int is_range_ram;
        int err = 0;

        BUG_ON(start >= end); /* end is exclusive */

        if (!pat_enabled) {
                /* This is identical to page table setting without PAT */
                if (new_type) {
                        if (req_type == -1)
                                *new_type = _PAGE_CACHE_WB;
                        else
                                *new_type = req_type & _PAGE_CACHE_MASK;
                }
                return 0;
        }

        /* Low ISA region is always mapped WB in page table. No need to track */
        if (is_ISA_range(start, end - 1)) {
                if (new_type)
                        *new_type = _PAGE_CACHE_WB;
                return 0;
        }

        if (req_type == -1) {
                /*
                 * Call mtrr_lookup to get the type hint. This is an
                 * optimization for /dev/mem mmap'ers into WB memory (BIOS
                 * tools and ACPI tools). Use WB request for WB memory and use
                 * UC_MINUS otherwise.
                 */
                u8 mtrr_type = mtrr_type_lookup(start, end);

                if (mtrr_type == MTRR_TYPE_WRBACK)
                        actual_type = _PAGE_CACHE_WB;
                else
                        actual_type = _PAGE_CACHE_UC_MINUS;
        } else {
                actual_type = pat_x_mtrr_type(start, end,
                                              req_type & _PAGE_CACHE_MASK);
        }

        is_range_ram = pagerange_is_ram(start, end);
        if (is_range_ram == 1)
                return reserve_ram_pages_type(start, end, req_type, new_type);
        else if (is_range_ram < 0)
                return -EINVAL;

        new  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        new->start      = start;
        new->end        = end;
        new->type       = actual_type;

        if (new_type)
                *new_type = actual_type;

        spin_lock(&memtype_lock);

        if (cached_entry && start >= cached_start)
                entry = cached_entry;
        else
                entry = list_entry(&memtype_list, struct memtype, nd);

        /* Search for existing mapping that overlaps the current range */
        where = NULL;
        list_for_each_entry_continue(entry, &memtype_list, nd) {
                if (end <= entry->start) {
                        where = entry->nd.prev;
                        cached_entry = list_entry(where, struct memtype, nd);
                        break;
                } else if (start <= entry->start) { /* end > entry->start */
                        err = chk_conflict(new, entry, new_type);
                        if (!err) {
                                dprintk("Overlap at 0x%Lx-0x%Lx\n",
                                        entry->start, entry->end);
                                where = entry->nd.prev;
                                cached_entry = list_entry(where,
                                                        struct memtype, nd);
                        }
                        break;
                } else if (start < entry->end) { /* start > entry->start */
                        err = chk_conflict(new, entry, new_type);
                        if (!err) {
                                dprintk("Overlap at 0x%Lx-0x%Lx\n",
                                        entry->start, entry->end);
                                cached_entry = list_entry(entry->nd.prev,
                                                        struct memtype, nd);

                                /*
                                 * Move to the right position in the linked
                                 * list to add this new entry
                                 */
                                list_for_each_entry_continue(entry,
                                                        &memtype_list, nd) {
                                        if (start <= entry->start) {
                                                where = entry->nd.prev;
                                                break;
                                        }
                                }
                        }
                        break;
                }
        }

        if (err) {
                printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
                       "track %s, req %s\n",
                       start, end, cattr_name(new->type), cattr_name(req_type));
                kfree(new);
                spin_unlock(&memtype_lock);

                return err;
        }

        cached_start = start;

        if (where)
                list_add(&new->nd, where);
        else
                list_add_tail(&new->nd, &memtype_list);

        spin_unlock(&memtype_lock);

        dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
                start, end, cattr_name(new->type), cattr_name(req_type),
                new_type ? cattr_name(*new_type) : "-");

        return err;
}
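
/*
 * Usage sketch (illustrative): a caller that wants a write-combining view of
 * a physical range would pair reserve_memtype() with free_memtype():
 *
 *      unsigned long got_type;
 *      int ret;
 *
 *      ret = reserve_memtype(base, base + len, _PAGE_CACHE_WC, &got_type);
 *      if (ret)
 *              return ret;
 *      if (got_type != _PAGE_CACHE_WC)
 *              ...fall back to got_type, or free_memtype() and bail out...
 *      ...
 *      free_memtype(base, base + len);
 *
 * 'base' and 'len' are placeholders; in-tree users include the ioremap
 * variants and the /dev/mem paths below.
 */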

int free_memtype(u64 start, u64 end)
{
        struct memtype *entry;
        int err = -EINVAL;
        int is_range_ram;

        if (!pat_enabled)
                return 0;

        /* Low ISA region is always mapped WB. No need to track */
        if (is_ISA_range(start, end - 1))
                return 0;

        is_range_ram = pagerange_is_ram(start, end);
        if (is_range_ram == 1)
                return free_ram_pages_type(start, end);
        else if (is_range_ram < 0)
                return -EINVAL;

        spin_lock(&memtype_lock);
        list_for_each_entry(entry, &memtype_list, nd) {
                if (entry->start == start && entry->end == end) {
                        if (cached_entry == entry || cached_start == start)
                                cached_entry = NULL;

                        list_del(&entry->nd);
                        kfree(entry);
                        err = 0;
                        break;
                }
        }
        spin_unlock(&memtype_lock);

        if (err) {
                printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
                        current->comm, current->pid, start, end);
        }

        dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);

        return err;
}


pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t vma_prot)
{
        return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in the case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
        return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
        u64 from = ((u64)pfn) << PAGE_SHIFT;
        u64 to = from + size;
        u64 cursor = from;

        if (!pat_enabled)
                return 1;

        while (cursor < to) {
                if (!devmem_is_allowed(pfn)) {
                        printk(KERN_INFO
                "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
                                current->comm, from, to);
                        return 0;
                }
                cursor += PAGE_SIZE;
                pfn++;
        }
        return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                                unsigned long size, pgprot_t *vma_prot)
{
        u64 offset = ((u64) pfn) << PAGE_SHIFT;
        unsigned long flags = -1;
        int retval;

        if (!range_is_allowed(pfn, size))
                return 0;

        if (file->f_flags & O_SYNC) {
                flags = _PAGE_CACHE_UC_MINUS;
        }

#ifdef CONFIG_X86_32
        /*
         * On the PPro and successors, the MTRRs are used to set
         * memory types for physical addresses outside main memory,
         * so blindly setting UC or PWT on those pages is wrong.
         * For Pentiums and earlier, the surround logic should disable
         * caching for the high addresses through the KEN pin, but
         * we maintain the tradition of paranoia in this code.
         */
        if (!pat_enabled &&
            !(boot_cpu_has(X86_FEATURE_MTRR) ||
              boot_cpu_has(X86_FEATURE_K6_MTRR) ||
              boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
              boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
            (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
                flags = _PAGE_CACHE_UC;
        }
#endif

        /*
         * With O_SYNC, we can only take a UC_MINUS mapping. Fail if we cannot.
         *
         * Without O_SYNC, we want to get
         * - WB for WB-able memory and no other conflicting mappings
         * - UC_MINUS for non-WB-able memory with no other conflicting mappings
         * - Inherit from conflicting mappings otherwise
         */
        if (flags != -1) {
                retval = reserve_memtype(offset, offset + size, flags, NULL);
        } else {
                retval = reserve_memtype(offset, offset + size, -1, &flags);
        }

        if (retval < 0)
                return 0;

        if (((pfn < max_low_pfn_mapped) ||
             (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) &&
            ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
                free_memtype(offset, offset + size);
                printk(KERN_INFO
                "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
                        current->comm, current->pid,
                        cattr_name(flags),
                        offset, (unsigned long long)(offset + size));
                return 0;
        }

        *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
                             flags);
        return 1;
}

void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
{
        unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
        u64 addr = (u64)pfn << PAGE_SHIFT;
        unsigned long flags;

        reserve_memtype(addr, addr + size, want_flags, &flags);
        if (flags != want_flags) {
                printk(KERN_INFO
                "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
                        current->comm, current->pid,
                        cattr_name(want_flags),
                        addr, (unsigned long long)(addr + size),
                        cattr_name(flags));
        }
}

void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
{
        u64 addr = (u64)pfn << PAGE_SHIFT;

        free_memtype(addr, addr + size);
}

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/* get Nth element of the linked list */
static struct memtype *memtype_get_idx(loff_t pos)
{
        struct memtype *list_node, *print_entry;
        int i = 1;

        print_entry  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
        if (!print_entry)
                return NULL;

        spin_lock(&memtype_lock);
        list_for_each_entry(list_node, &memtype_list, nd) {
                if (pos == i) {
                        *print_entry = *list_node;
                        spin_unlock(&memtype_lock);
                        return print_entry;
                }
                ++i;
        }
        spin_unlock(&memtype_lock);
        kfree(print_entry);

        return NULL;
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos == 0) {
                ++*pos;
                seq_printf(seq, "PAT memtype list:\n");
        }

        return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
        struct memtype *print_entry = (struct memtype *)v;

        seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
                        print_entry->start, print_entry->end);
        kfree(print_entry);

        return 0;
}

static struct seq_operations memtype_seq_ops = {
        .start = memtype_seq_start,
        .next  = memtype_seq_next,
        .stop  = memtype_seq_stop,
        .show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
        .open    = memtype_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
        debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
                                NULL, &memtype_fops);
        return 0;
}

late_initcall(pat_memtype_list_init);
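
/*
 * Reading the resulting file (typically /sys/kernel/debug/x86/pat_memtype_list,
 * assuming debugfs is mounted at /sys/kernel/debug) yields output of the form:
 *
 *      PAT memtype list:
 *      uncached-minus @ 0xfed00000-0xfed01000
 *      write-combining @ 0xd0000000-0xd8000000
 *
 * one line per memtype_list entry, produced by memtype_seq_show(); the
 * addresses above are made-up examples.
 */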

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */