linux/arch/x86/mm/pageattr.c
<<
>>
Prefs
   1/*
   2 * Copyright 2002 Andi Kleen, SuSE Labs.
   3 * Thanks to Ben LaHaise for precious feedback.
   4 */
   5#include <linux/highmem.h>
   6#include <linux/bootmem.h>
   7#include <linux/sched.h>
   8#include <linux/mm.h>
   9#include <linux/interrupt.h>
  10#include <linux/seq_file.h>
  11#include <linux/debugfs.h>
  12#include <linux/pfn.h>
  13#include <linux/percpu.h>
  14#include <linux/gfp.h>
  15#include <linux/pci.h>
  16#include <linux/vmalloc.h>
  17
  18#include <asm/e820.h>
  19#include <asm/processor.h>
  20#include <asm/tlbflush.h>
  21#include <asm/sections.h>
  22#include <asm/setup.h>
  23#include <asm/uaccess.h>
  24#include <asm/pgalloc.h>
  25#include <asm/proto.h>
  26#include <asm/pat.h>
  27
  28/*
  29 * The current flushing context - we pass it instead of 5 arguments:
  30 */
  31struct cpa_data {
  32        unsigned long   *vaddr;
  33        pgd_t           *pgd;
  34        pgprot_t        mask_set;
  35        pgprot_t        mask_clr;
  36        unsigned long   numpages;
  37        int             flags;
  38        unsigned long   pfn;
  39        unsigned        force_split : 1;
  40        int             curpage;
  41        struct page     **pages;
  42};
  43
  44/*
  45 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
  46 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
  47 * entries change the page attribute in parallel to some other cpu
  48 * splitting a large page entry along with changing the attribute.
  49 */
  50static DEFINE_SPINLOCK(cpa_lock);
  51
  52#define CPA_FLUSHTLB 1
  53#define CPA_ARRAY 2
  54#define CPA_PAGES_ARRAY 4
  55
  56#ifdef CONFIG_PROC_FS
  57static unsigned long direct_pages_count[PG_LEVEL_NUM];
  58
  59void update_page_count(int level, unsigned long pages)
  60{
  61        /* Protect against CPA */
  62        spin_lock(&pgd_lock);
  63        direct_pages_count[level] += pages;
  64        spin_unlock(&pgd_lock);
  65}
  66
  67static void split_page_count(int level)
  68{
  69        if (direct_pages_count[level] == 0)
  70                return;
  71
  72        direct_pages_count[level]--;
  73        direct_pages_count[level - 1] += PTRS_PER_PTE;
  74}
  75
  76void arch_report_meminfo(struct seq_file *m)
  77{
  78        seq_printf(m, "DirectMap4k:    %8lu kB\n",
  79                        direct_pages_count[PG_LEVEL_4K] << 2);
  80#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
  81        seq_printf(m, "DirectMap2M:    %8lu kB\n",
  82                        direct_pages_count[PG_LEVEL_2M] << 11);
  83#else
  84        seq_printf(m, "DirectMap4M:    %8lu kB\n",
  85                        direct_pages_count[PG_LEVEL_2M] << 12);
  86#endif
  87        if (direct_gbpages)
  88                seq_printf(m, "DirectMap1G:    %8lu kB\n",
  89                        direct_pages_count[PG_LEVEL_1G] << 20);
  90}
  91#else
  92static inline void split_page_count(int level) { }
  93#endif
  94
  95#ifdef CONFIG_X86_64
  96
  97static inline unsigned long highmap_start_pfn(void)
  98{
  99        return __pa_symbol(_text) >> PAGE_SHIFT;
 100}
 101
 102static inline unsigned long highmap_end_pfn(void)
 103{
 104        /* Do not reference physical address outside the kernel. */
 105        return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
 106}
 107
 108#endif
 109
 110static inline int
 111within(unsigned long addr, unsigned long start, unsigned long end)
 112{
 113        return addr >= start && addr < end;
 114}
 115
 116static inline int
 117within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
 118{
 119        return addr >= start && addr <= end;
 120}
 121
 122/*
 123 * Flushing functions
 124 */
 125
 126/**
 127 * clflush_cache_range - flush a cache range with clflush
 128 * @vaddr:      virtual start address
 129 * @size:       number of bytes to flush
 130 *
 131 * clflushopt is an unordered instruction which needs fencing with mfence or
 132 * sfence to avoid ordering issues.
 133 */
 134void clflush_cache_range(void *vaddr, unsigned int size)
 135{
 136        const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
 137        void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
 138        void *vend = vaddr + size;
 139
 140        if (p >= vend)
 141                return;
 142
 143        mb();
 144
 145        for (; p < vend; p += clflush_size)
 146                clflushopt(p);
 147
 148        mb();
 149}
 150EXPORT_SYMBOL_GPL(clflush_cache_range);
 151
 152static void __cpa_flush_all(void *arg)
 153{
 154        unsigned long cache = (unsigned long)arg;
 155
 156        /*
 157         * Flush all to work around Errata in early athlons regarding
 158         * large page flushing.
 159         */
 160        __flush_tlb_all();
 161
 162        if (cache && boot_cpu_data.x86 >= 4)
 163                wbinvd();
 164}
 165
 166static void cpa_flush_all(unsigned long cache)
 167{
 168        BUG_ON(irqs_disabled());
 169
 170        on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 171}
 172
 173static void __cpa_flush_range(void *arg)
 174{
 175        /*
 176         * We could optimize that further and do individual per page
 177         * tlb invalidates for a low number of pages. Caveat: we must
 178         * flush the high aliases on 64bit as well.
 179         */
 180        __flush_tlb_all();
 181}
 182
 183static void cpa_flush_range(unsigned long start, int numpages, int cache)
 184{
 185        unsigned int i, level;
 186        unsigned long addr;
 187
 188        BUG_ON(irqs_disabled());
 189        WARN_ON(PAGE_ALIGN(start) != start);
 190
 191        on_each_cpu(__cpa_flush_range, NULL, 1);
 192
 193        if (!cache)
 194                return;
 195
 196        /*
 197         * We only need to flush on one CPU,
 198         * clflush is a MESI-coherent instruction that
 199         * will cause all other CPUs to flush the same
 200         * cachelines:
 201         */
 202        for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
 203                pte_t *pte = lookup_address(addr, &level);
 204
 205                /*
 206                 * Only flush present addresses:
 207                 */
 208                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 209                        clflush_cache_range((void *) addr, PAGE_SIZE);
 210        }
 211}
 212
 213static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 214                            int in_flags, struct page **pages)
 215{
 216        unsigned int i, level;
 217        unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
 218
 219        BUG_ON(irqs_disabled());
 220
 221        on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
 222
 223        if (!cache || do_wbinvd)
 224                return;
 225
 226        /*
 227         * We only need to flush on one CPU,
 228         * clflush is a MESI-coherent instruction that
 229         * will cause all other CPUs to flush the same
 230         * cachelines:
 231         */
 232        for (i = 0; i < numpages; i++) {
 233                unsigned long addr;
 234                pte_t *pte;
 235
 236                if (in_flags & CPA_PAGES_ARRAY)
 237                        addr = (unsigned long)page_address(pages[i]);
 238                else
 239                        addr = start[i];
 240
 241                pte = lookup_address(addr, &level);
 242
 243                /*
 244                 * Only flush present addresses:
 245                 */
 246                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 247                        clflush_cache_range((void *)addr, PAGE_SIZE);
 248        }
 249}
 250
 251/*
 252 * Certain areas of memory on x86 require very specific protection flags,
 253 * for example the BIOS area or kernel text. Callers don't always get this
 254 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 255 * checks and fixes these known static required protection bits.
 256 */
 257static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 258                                   unsigned long pfn)
 259{
 260        pgprot_t forbidden = __pgprot(0);
 261
 262        /*
 263         * The BIOS area between 640k and 1Mb needs to be executable for
 264         * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 265         */
 266#ifdef CONFIG_PCI_BIOS
 267        if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 268                pgprot_val(forbidden) |= _PAGE_NX;
 269#endif
 270
 271        /*
 272         * The kernel text needs to be executable for obvious reasons
 273         * Does not cover __inittext since that is gone later on. On
 274         * 64bit we do not enforce !NX on the low mapping
 275         */
 276        if (within(address, (unsigned long)_text, (unsigned long)_etext))
 277                pgprot_val(forbidden) |= _PAGE_NX;
 278
 279        /*
 280         * The .rodata section needs to be read-only. Using the pfn
 281         * catches all aliases.
 282         */
 283        if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
 284                   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
 285                pgprot_val(forbidden) |= _PAGE_RW;
 286
 287#if defined(CONFIG_X86_64)
 288        /*
 289         * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
 290         * kernel text mappings for the large page aligned text, rodata sections
 291         * will be always read-only. For the kernel identity mappings covering
 292         * the holes caused by this alignment can be anything that user asks.
 293         *
 294         * This will preserve the large page mappings for kernel text/data
 295         * at no extra cost.
 296         */
 297        if (kernel_set_to_readonly &&
 298            within(address, (unsigned long)_text,
 299                   (unsigned long)__end_rodata_hpage_align)) {
 300                unsigned int level;
 301
 302                /*
 303                 * Don't enforce the !RW mapping for the kernel text mapping,
 304                 * if the current mapping is already using small page mapping.
 305                 * No need to work hard to preserve large page mappings in this
 306                 * case.
 307                 *
 308                 * This also fixes the Linux Xen paravirt guest boot failure
 309                 * (because of unexpected read-only mappings for kernel identity
 310                 * mappings). In this paravirt guest case, the kernel text
 311                 * mapping and the kernel identity mapping share the same
 312                 * page-table pages. Thus we can't really use different
 313                 * protections for the kernel text and identity mappings. Also,
 314                 * these shared mappings are made of small page mappings.
 315                 * Thus this don't enforce !RW mapping for small page kernel
 316                 * text mapping logic will help Linux Xen parvirt guest boot
 317                 * as well.
 318                 */
 319                if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
 320                        pgprot_val(forbidden) |= _PAGE_RW;
 321        }
 322#endif
 323
 324        prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
 325
 326        return prot;
 327}
 328
 329/*
 330 * Lookup the page table entry for a virtual address in a specific pgd.
 331 * Return a pointer to the entry and the level of the mapping.
 332 */
 333pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
 334                             unsigned int *level)
 335{
 336        pud_t *pud;
 337        pmd_t *pmd;
 338
 339        *level = PG_LEVEL_NONE;
 340
 341        if (pgd_none(*pgd))
 342                return NULL;
 343
 344        pud = pud_offset(pgd, address);
 345        if (pud_none(*pud))
 346                return NULL;
 347
 348        *level = PG_LEVEL_1G;
 349        if (pud_large(*pud) || !pud_present(*pud))
 350                return (pte_t *)pud;
 351
 352        pmd = pmd_offset(pud, address);
 353        if (pmd_none(*pmd))
 354                return NULL;
 355
 356        *level = PG_LEVEL_2M;
 357        if (pmd_large(*pmd) || !pmd_present(*pmd))
 358                return (pte_t *)pmd;
 359
 360        *level = PG_LEVEL_4K;
 361
 362        return pte_offset_kernel(pmd, address);
 363}
 364
 365/*
 366 * Lookup the page table entry for a virtual address. Return a pointer
 367 * to the entry and the level of the mapping.
 368 *
 369 * Note: We return pud and pmd either when the entry is marked large
 370 * or when the present bit is not set. Otherwise we would return a
 371 * pointer to a nonexisting mapping.
 372 */
 373pte_t *lookup_address(unsigned long address, unsigned int *level)
 374{
 375        return lookup_address_in_pgd(pgd_offset_k(address), address, level);
 376}
 377EXPORT_SYMBOL_GPL(lookup_address);
 378
 379static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
 380                                  unsigned int *level)
 381{
 382        if (cpa->pgd)
 383                return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
 384                                               address, level);
 385
 386        return lookup_address(address, level);
 387}
 388
 389/*
 390 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
 391 * or NULL if not present.
 392 */
 393pmd_t *lookup_pmd_address(unsigned long address)
 394{
 395        pgd_t *pgd;
 396        pud_t *pud;
 397
 398        pgd = pgd_offset_k(address);
 399        if (pgd_none(*pgd))
 400                return NULL;
 401
 402        pud = pud_offset(pgd, address);
 403        if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
 404                return NULL;
 405
 406        return pmd_offset(pud, address);
 407}
 408
 409/*
 410 * This is necessary because __pa() does not work on some
 411 * kinds of memory, like vmalloc() or the alloc_remap()
 412 * areas on 32-bit NUMA systems.  The percpu areas can
 413 * end up in this kind of memory, for instance.
 414 *
 415 * This could be optimized, but it is only intended to be
 416 * used at inititalization time, and keeping it
 417 * unoptimized should increase the testing coverage for
 418 * the more obscure platforms.
 419 */
 420phys_addr_t slow_virt_to_phys(void *__virt_addr)
 421{
 422        unsigned long virt_addr = (unsigned long)__virt_addr;
 423        phys_addr_t phys_addr;
 424        unsigned long offset;
 425        enum pg_level level;
 426        pte_t *pte;
 427
 428        pte = lookup_address(virt_addr, &level);
 429        BUG_ON(!pte);
 430
 431        /*
 432         * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
 433         * before being left-shifted PAGE_SHIFT bits -- this trick is to
 434         * make 32-PAE kernel work correctly.
 435         */
 436        switch (level) {
 437        case PG_LEVEL_1G:
 438                phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
 439                offset = virt_addr & ~PUD_PAGE_MASK;
 440                break;
 441        case PG_LEVEL_2M:
 442                phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
 443                offset = virt_addr & ~PMD_PAGE_MASK;
 444                break;
 445        default:
 446                phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
 447                offset = virt_addr & ~PAGE_MASK;
 448        }
 449
 450        return (phys_addr_t)(phys_addr | offset);
 451}
 452EXPORT_SYMBOL_GPL(slow_virt_to_phys);
 453
 454/*
 455 * Set the new pmd in all the pgds we know about:
 456 */
 457static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 458{
 459        /* change init_mm */
 460        set_pte_atomic(kpte, pte);
 461#ifdef CONFIG_X86_32
 462        if (!SHARED_KERNEL_PMD) {
 463                struct page *page;
 464
 465                list_for_each_entry(page, &pgd_list, lru) {
 466                        pgd_t *pgd;
 467                        pud_t *pud;
 468                        pmd_t *pmd;
 469
 470                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
 471                        pud = pud_offset(pgd, address);
 472                        pmd = pmd_offset(pud, address);
 473                        set_pte_atomic((pte_t *)pmd, pte);
 474                }
 475        }
 476#endif
 477}
 478
 479static int
 480try_preserve_large_page(pte_t *kpte, unsigned long address,
 481                        struct cpa_data *cpa)
 482{
 483        unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
 484        pte_t new_pte, old_pte, *tmp;
 485        pgprot_t old_prot, new_prot, req_prot;
 486        int i, do_split = 1;
 487        enum pg_level level;
 488
 489        if (cpa->force_split)
 490                return 1;
 491
 492        spin_lock(&pgd_lock);
 493        /*
 494         * Check for races, another CPU might have split this page
 495         * up already:
 496         */
 497        tmp = _lookup_address_cpa(cpa, address, &level);
 498        if (tmp != kpte)
 499                goto out_unlock;
 500
 501        switch (level) {
 502        case PG_LEVEL_2M:
 503                old_prot = pmd_pgprot(*(pmd_t *)kpte);
 504                old_pfn = pmd_pfn(*(pmd_t *)kpte);
 505                break;
 506        case PG_LEVEL_1G:
 507                old_prot = pud_pgprot(*(pud_t *)kpte);
 508                old_pfn = pud_pfn(*(pud_t *)kpte);
 509                break;
 510        default:
 511                do_split = -EINVAL;
 512                goto out_unlock;
 513        }
 514
 515        psize = page_level_size(level);
 516        pmask = page_level_mask(level);
 517
 518        /*
 519         * Calculate the number of pages, which fit into this large
 520         * page starting at address:
 521         */
 522        nextpage_addr = (address + psize) & pmask;
 523        numpages = (nextpage_addr - address) >> PAGE_SHIFT;
 524        if (numpages < cpa->numpages)
 525                cpa->numpages = numpages;
 526
 527        /*
 528         * We are safe now. Check whether the new pgprot is the same:
 529         * Convert protection attributes to 4k-format, as cpa->mask* are set
 530         * up accordingly.
 531         */
 532        old_pte = *kpte;
 533        req_prot = pgprot_large_2_4k(old_prot);
 534
 535        pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
 536        pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
 537
 538        /*
 539         * req_prot is in format of 4k pages. It must be converted to large
 540         * page format: the caching mode includes the PAT bit located at
 541         * different bit positions in the two formats.
 542         */
 543        req_prot = pgprot_4k_2_large(req_prot);
 544
 545        /*
 546         * Set the PSE and GLOBAL flags only if the PRESENT flag is
 547         * set otherwise pmd_present/pmd_huge will return true even on
 548         * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
 549         * for the ancient hardware that doesn't support it.
 550         */
 551        if (pgprot_val(req_prot) & _PAGE_PRESENT)
 552                pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
 553        else
 554                pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
 555
 556        req_prot = canon_pgprot(req_prot);
 557
 558        /*
 559         * old_pfn points to the large page base pfn. So we need
 560         * to add the offset of the virtual address:
 561         */
 562        pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
 563        cpa->pfn = pfn;
 564
 565        new_prot = static_protections(req_prot, address, pfn);
 566
 567        /*
 568         * We need to check the full range, whether
 569         * static_protection() requires a different pgprot for one of
 570         * the pages in the range we try to preserve:
 571         */
 572        addr = address & pmask;
 573        pfn = old_pfn;
 574        for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
 575                pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
 576
 577                if (pgprot_val(chk_prot) != pgprot_val(new_prot))
 578                        goto out_unlock;
 579        }
 580
 581        /*
 582         * If there are no changes, return. maxpages has been updated
 583         * above:
 584         */
 585        if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
 586                do_split = 0;
 587                goto out_unlock;
 588        }
 589
 590        /*
 591         * We need to change the attributes. Check, whether we can
 592         * change the large page in one go. We request a split, when
 593         * the address is not aligned and the number of pages is
 594         * smaller than the number of pages in the large page. Note
 595         * that we limited the number of possible pages already to
 596         * the number of pages in the large page.
 597         */
 598        if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
 599                /*
 600                 * The address is aligned and the number of pages
 601                 * covers the full page.
 602                 */
 603                new_pte = pfn_pte(old_pfn, new_prot);
 604                __set_pmd_pte(kpte, address, new_pte);
 605                cpa->flags |= CPA_FLUSHTLB;
 606                do_split = 0;
 607        }
 608
 609out_unlock:
 610        spin_unlock(&pgd_lock);
 611
 612        return do_split;
 613}
 614
 615static int
 616__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 617                   struct page *base)
 618{
 619        pte_t *pbase = (pte_t *)page_address(base);
 620        unsigned long ref_pfn, pfn, pfninc = 1;
 621        unsigned int i, level;
 622        pte_t *tmp;
 623        pgprot_t ref_prot;
 624
 625        spin_lock(&pgd_lock);
 626        /*
 627         * Check for races, another CPU might have split this page
 628         * up for us already:
 629         */
 630        tmp = _lookup_address_cpa(cpa, address, &level);
 631        if (tmp != kpte) {
 632                spin_unlock(&pgd_lock);
 633                return 1;
 634        }
 635
 636        paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 637
 638        switch (level) {
 639        case PG_LEVEL_2M:
 640                ref_prot = pmd_pgprot(*(pmd_t *)kpte);
 641                /* clear PSE and promote PAT bit to correct position */
 642                ref_prot = pgprot_large_2_4k(ref_prot);
 643                ref_pfn = pmd_pfn(*(pmd_t *)kpte);
 644                break;
 645
 646        case PG_LEVEL_1G:
 647                ref_prot = pud_pgprot(*(pud_t *)kpte);
 648                ref_pfn = pud_pfn(*(pud_t *)kpte);
 649                pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
 650
 651                /*
 652                 * Clear the PSE flags if the PRESENT flag is not set
 653                 * otherwise pmd_present/pmd_huge will return true
 654                 * even on a non present pmd.
 655                 */
 656                if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
 657                        pgprot_val(ref_prot) &= ~_PAGE_PSE;
 658                break;
 659
 660        default:
 661                spin_unlock(&pgd_lock);
 662                return 1;
 663        }
 664
 665        /*
 666         * Set the GLOBAL flags only if the PRESENT flag is set
 667         * otherwise pmd/pte_present will return true even on a non
 668         * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
 669         * for the ancient hardware that doesn't support it.
 670         */
 671        if (pgprot_val(ref_prot) & _PAGE_PRESENT)
 672                pgprot_val(ref_prot) |= _PAGE_GLOBAL;
 673        else
 674                pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
 675
 676        /*
 677         * Get the target pfn from the original entry:
 678         */
 679        pfn = ref_pfn;
 680        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
 681                set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
 682
 683        if (virt_addr_valid(address)) {
 684                unsigned long pfn = PFN_DOWN(__pa(address));
 685
 686                if (pfn_range_is_mapped(pfn, pfn + 1))
 687                        split_page_count(level);
 688        }
 689
 690        /*
 691         * Install the new, split up pagetable.
 692         *
 693         * We use the standard kernel pagetable protections for the new
 694         * pagetable protections, the actual ptes set above control the
 695         * primary protection behavior:
 696         */
 697        __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
 698
 699        /*
 700         * Intel Atom errata AAH41 workaround.
 701         *
 702         * The real fix should be in hw or in a microcode update, but
 703         * we also probabilistically try to reduce the window of having
 704         * a large TLB mixed with 4K TLBs while instruction fetches are
 705         * going on.
 706         */
 707        __flush_tlb_all();
 708        spin_unlock(&pgd_lock);
 709
 710        return 0;
 711}
 712
 713static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 714                            unsigned long address)
 715{
 716        struct page *base;
 717
 718        if (!debug_pagealloc_enabled())
 719                spin_unlock(&cpa_lock);
 720        base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
 721        if (!debug_pagealloc_enabled())
 722                spin_lock(&cpa_lock);
 723        if (!base)
 724                return -ENOMEM;
 725
 726        if (__split_large_page(cpa, kpte, address, base))
 727                __free_page(base);
 728
 729        return 0;
 730}
 731
 732static bool try_to_free_pte_page(pte_t *pte)
 733{
 734        int i;
 735
 736        for (i = 0; i < PTRS_PER_PTE; i++)
 737                if (!pte_none(pte[i]))
 738                        return false;
 739
 740        free_page((unsigned long)pte);
 741        return true;
 742}
 743
 744static bool try_to_free_pmd_page(pmd_t *pmd)
 745{
 746        int i;
 747
 748        for (i = 0; i < PTRS_PER_PMD; i++)
 749                if (!pmd_none(pmd[i]))
 750                        return false;
 751
 752        free_page((unsigned long)pmd);
 753        return true;
 754}
 755
 756static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
 757{
 758        pte_t *pte = pte_offset_kernel(pmd, start);
 759
 760        while (start < end) {
 761                set_pte(pte, __pte(0));
 762
 763                start += PAGE_SIZE;
 764                pte++;
 765        }
 766
 767        if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
 768                pmd_clear(pmd);
 769                return true;
 770        }
 771        return false;
 772}
 773
 774static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
 775                              unsigned long start, unsigned long end)
 776{
 777        if (unmap_pte_range(pmd, start, end))
 778                if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
 779                        pud_clear(pud);
 780}
 781
 782static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 783{
 784        pmd_t *pmd = pmd_offset(pud, start);
 785
 786        /*
 787         * Not on a 2MB page boundary?
 788         */
 789        if (start & (PMD_SIZE - 1)) {
 790                unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 791                unsigned long pre_end = min_t(unsigned long, end, next_page);
 792
 793                __unmap_pmd_range(pud, pmd, start, pre_end);
 794
 795                start = pre_end;
 796                pmd++;
 797        }
 798
 799        /*
 800         * Try to unmap in 2M chunks.
 801         */
 802        while (end - start >= PMD_SIZE) {
 803                if (pmd_large(*pmd))
 804                        pmd_clear(pmd);
 805                else
 806                        __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
 807
 808                start += PMD_SIZE;
 809                pmd++;
 810        }
 811
 812        /*
 813         * 4K leftovers?
 814         */
 815        if (start < end)
 816                return __unmap_pmd_range(pud, pmd, start, end);
 817
 818        /*
 819         * Try again to free the PMD page if haven't succeeded above.
 820         */
 821        if (!pud_none(*pud))
 822                if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
 823                        pud_clear(pud);
 824}
 825
 826static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 827{
 828        pud_t *pud = pud_offset(pgd, start);
 829
 830        /*
 831         * Not on a GB page boundary?
 832         */
 833        if (start & (PUD_SIZE - 1)) {
 834                unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 835                unsigned long pre_end   = min_t(unsigned long, end, next_page);
 836
 837                unmap_pmd_range(pud, start, pre_end);
 838
 839                start = pre_end;
 840                pud++;
 841        }
 842
 843        /*
 844         * Try to unmap in 1G chunks?
 845         */
 846        while (end - start >= PUD_SIZE) {
 847
 848                if (pud_large(*pud))
 849                        pud_clear(pud);
 850                else
 851                        unmap_pmd_range(pud, start, start + PUD_SIZE);
 852
 853                start += PUD_SIZE;
 854                pud++;
 855        }
 856
 857        /*
 858         * 2M leftovers?
 859         */
 860        if (start < end)
 861                unmap_pmd_range(pud, start, end);
 862
 863        /*
 864         * No need to try to free the PUD page because we'll free it in
 865         * populate_pgd's error path
 866         */
 867}
 868
 869static int alloc_pte_page(pmd_t *pmd)
 870{
 871        pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 872        if (!pte)
 873                return -1;
 874
 875        set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 876        return 0;
 877}
 878
 879static int alloc_pmd_page(pud_t *pud)
 880{
 881        pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 882        if (!pmd)
 883                return -1;
 884
 885        set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 886        return 0;
 887}
 888
 889static void populate_pte(struct cpa_data *cpa,
 890                         unsigned long start, unsigned long end,
 891                         unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
 892{
 893        pte_t *pte;
 894
 895        pte = pte_offset_kernel(pmd, start);
 896
 897        /*
 898         * Set the GLOBAL flags only if the PRESENT flag is
 899         * set otherwise pte_present will return true even on
 900         * a non present pte. The canon_pgprot will clear
 901         * _PAGE_GLOBAL for the ancient hardware that doesn't
 902         * support it.
 903         */
 904        if (pgprot_val(pgprot) & _PAGE_PRESENT)
 905                pgprot_val(pgprot) |= _PAGE_GLOBAL;
 906        else
 907                pgprot_val(pgprot) &= ~_PAGE_GLOBAL;
 908
 909        pgprot = canon_pgprot(pgprot);
 910
 911        while (num_pages-- && start < end) {
 912                set_pte(pte, pfn_pte(cpa->pfn, pgprot));
 913
 914                start    += PAGE_SIZE;
 915                cpa->pfn++;
 916                pte++;
 917        }
 918}
 919
 920static long populate_pmd(struct cpa_data *cpa,
 921                         unsigned long start, unsigned long end,
 922                         unsigned num_pages, pud_t *pud, pgprot_t pgprot)
 923{
 924        long cur_pages = 0;
 925        pmd_t *pmd;
 926        pgprot_t pmd_pgprot;
 927
 928        /*
 929         * Not on a 2M boundary?
 930         */
 931        if (start & (PMD_SIZE - 1)) {
 932                unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
 933                unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 934
 935                pre_end   = min_t(unsigned long, pre_end, next_page);
 936                cur_pages = (pre_end - start) >> PAGE_SHIFT;
 937                cur_pages = min_t(unsigned int, num_pages, cur_pages);
 938
 939                /*
 940                 * Need a PTE page?
 941                 */
 942                pmd = pmd_offset(pud, start);
 943                if (pmd_none(*pmd))
 944                        if (alloc_pte_page(pmd))
 945                                return -1;
 946
 947                populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
 948
 949                start = pre_end;
 950        }
 951
 952        /*
 953         * We mapped them all?
 954         */
 955        if (num_pages == cur_pages)
 956                return cur_pages;
 957
 958        pmd_pgprot = pgprot_4k_2_large(pgprot);
 959
 960        while (end - start >= PMD_SIZE) {
 961
 962                /*
 963                 * We cannot use a 1G page so allocate a PMD page if needed.
 964                 */
 965                if (pud_none(*pud))
 966                        if (alloc_pmd_page(pud))
 967                                return -1;
 968
 969                pmd = pmd_offset(pud, start);
 970
 971                set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
 972                                   massage_pgprot(pmd_pgprot)));
 973
 974                start     += PMD_SIZE;
 975                cpa->pfn  += PMD_SIZE >> PAGE_SHIFT;
 976                cur_pages += PMD_SIZE >> PAGE_SHIFT;
 977        }
 978
 979        /*
 980         * Map trailing 4K pages.
 981         */
 982        if (start < end) {
 983                pmd = pmd_offset(pud, start);
 984                if (pmd_none(*pmd))
 985                        if (alloc_pte_page(pmd))
 986                                return -1;
 987
 988                populate_pte(cpa, start, end, num_pages - cur_pages,
 989                             pmd, pgprot);
 990        }
 991        return num_pages;
 992}
 993
 994static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 995                         pgprot_t pgprot)
 996{
 997        pud_t *pud;
 998        unsigned long end;
 999        long cur_pages = 0;
1000        pgprot_t pud_pgprot;
1001
1002        end = start + (cpa->numpages << PAGE_SHIFT);
1003
1004        /*
1005         * Not on a Gb page boundary? => map everything up to it with
1006         * smaller pages.
1007         */
1008        if (start & (PUD_SIZE - 1)) {
1009                unsigned long pre_end;
1010                unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
1011
1012                pre_end   = min_t(unsigned long, end, next_page);
1013                cur_pages = (pre_end - start) >> PAGE_SHIFT;
1014                cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
1015
1016                pud = pud_offset(pgd, start);
1017
1018                /*
1019                 * Need a PMD page?
1020                 */
1021                if (pud_none(*pud))
1022                        if (alloc_pmd_page(pud))
1023                                return -1;
1024
1025                cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
1026                                         pud, pgprot);
1027                if (cur_pages < 0)
1028                        return cur_pages;
1029
1030                start = pre_end;
1031        }
1032
1033        /* We mapped them all? */
1034        if (cpa->numpages == cur_pages)
1035                return cur_pages;
1036
1037        pud = pud_offset(pgd, start);
1038        pud_pgprot = pgprot_4k_2_large(pgprot);
1039
1040        /*
1041         * Map everything starting from the Gb boundary, possibly with 1G pages
1042         */
1043        while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
1044                set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
1045                                   massage_pgprot(pud_pgprot)));
1046
1047                start     += PUD_SIZE;
1048                cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
1049                cur_pages += PUD_SIZE >> PAGE_SHIFT;
1050                pud++;
1051        }
1052
1053        /* Map trailing leftover */
1054        if (start < end) {
1055                long tmp;
1056
1057                pud = pud_offset(pgd, start);
1058                if (pud_none(*pud))
1059                        if (alloc_pmd_page(pud))
1060                                return -1;
1061
1062                tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1063                                   pud, pgprot);
1064                if (tmp < 0)
1065                        return cur_pages;
1066
1067                cur_pages += tmp;
1068        }
1069        return cur_pages;
1070}
1071
1072/*
1073 * Restrictions for kernel page table do not necessarily apply when mapping in
1074 * an alternate PGD.
1075 */
1076static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1077{
1078        pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1079        pud_t *pud = NULL;      /* shut up gcc */
1080        pgd_t *pgd_entry;
1081        long ret;
1082
1083        pgd_entry = cpa->pgd + pgd_index(addr);
1084
1085        /*
1086         * Allocate a PUD page and hand it down for mapping.
1087         */
1088        if (pgd_none(*pgd_entry)) {
1089                pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1090                if (!pud)
1091                        return -1;
1092
1093                set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1094        }
1095
1096        pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1097        pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
1098
1099        ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1100        if (ret < 0) {
1101                /*
1102                 * Leave the PUD page in place in case some other CPU or thread
1103                 * already found it, but remove any useless entries we just
1104                 * added to it.
1105                 */
1106                unmap_pud_range(pgd_entry, addr,
1107                                addr + (cpa->numpages << PAGE_SHIFT));
1108                return ret;
1109        }
1110
1111        cpa->numpages = ret;
1112        return 0;
1113}
1114
1115static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1116                               int primary)
1117{
1118        if (cpa->pgd) {
1119                /*
1120                 * Right now, we only execute this code path when mapping
1121                 * the EFI virtual memory map regions, no other users
1122                 * provide a ->pgd value. This may change in the future.
1123                 */
1124                return populate_pgd(cpa, vaddr);
1125        }
1126
1127        /*
1128         * Ignore all non primary paths.
1129         */
1130        if (!primary) {
1131                cpa->numpages = 1;
1132                return 0;
1133        }
1134
1135        /*
1136         * Ignore the NULL PTE for kernel identity mapping, as it is expected
1137         * to have holes.
1138         * Also set numpages to '1' indicating that we processed cpa req for
1139         * one virtual address page and its pfn. TBD: numpages can be set based
1140         * on the initial value and the level returned by lookup_address().
1141         */
1142        if (within(vaddr, PAGE_OFFSET,
1143                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1144                cpa->numpages = 1;
1145                cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1146                return 0;
1147        } else {
1148                WARN(1, KERN_WARNING "CPA: called for zero pte. "
1149                        "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1150                        *cpa->vaddr);
1151
1152                return -EFAULT;
1153        }
1154}
1155
1156static int __change_page_attr(struct cpa_data *cpa, int primary)
1157{
1158        unsigned long address;
1159        int do_split, err;
1160        unsigned int level;
1161        pte_t *kpte, old_pte;
1162
1163        if (cpa->flags & CPA_PAGES_ARRAY) {
1164                struct page *page = cpa->pages[cpa->curpage];
1165                if (unlikely(PageHighMem(page)))
1166                        return 0;
1167                address = (unsigned long)page_address(page);
1168        } else if (cpa->flags & CPA_ARRAY)
1169                address = cpa->vaddr[cpa->curpage];
1170        else
1171                address = *cpa->vaddr;
1172repeat:
1173        kpte = _lookup_address_cpa(cpa, address, &level);
1174        if (!kpte)
1175                return __cpa_process_fault(cpa, address, primary);
1176
1177        old_pte = *kpte;
1178        if (pte_none(old_pte))
1179                return __cpa_process_fault(cpa, address, primary);
1180
1181        if (level == PG_LEVEL_4K) {
1182                pte_t new_pte;
1183                pgprot_t new_prot = pte_pgprot(old_pte);
1184                unsigned long pfn = pte_pfn(old_pte);
1185
1186                pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1187                pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1188
1189                new_prot = static_protections(new_prot, address, pfn);
1190
1191                /*
1192                 * Set the GLOBAL flags only if the PRESENT flag is
1193                 * set otherwise pte_present will return true even on
1194                 * a non present pte. The canon_pgprot will clear
1195                 * _PAGE_GLOBAL for the ancient hardware that doesn't
1196                 * support it.
1197                 */
1198                if (pgprot_val(new_prot) & _PAGE_PRESENT)
1199                        pgprot_val(new_prot) |= _PAGE_GLOBAL;
1200                else
1201                        pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
1202
1203                /*
1204                 * We need to keep the pfn from the existing PTE,
1205                 * after all we're only going to change it's attributes
1206                 * not the memory it points to
1207                 */
1208                new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
1209                cpa->pfn = pfn;
1210                /*
1211                 * Do we really change anything ?
1212                 */
1213                if (pte_val(old_pte) != pte_val(new_pte)) {
1214                        set_pte_atomic(kpte, new_pte);
1215                        cpa->flags |= CPA_FLUSHTLB;
1216                }
1217                cpa->numpages = 1;
1218                return 0;
1219        }
1220
1221        /*
1222         * Check, whether we can keep the large page intact
1223         * and just change the pte:
1224         */
1225        do_split = try_preserve_large_page(kpte, address, cpa);
1226        /*
1227         * When the range fits into the existing large page,
1228         * return. cp->numpages and cpa->tlbflush have been updated in
1229         * try_large_page:
1230         */
1231        if (do_split <= 0)
1232                return do_split;
1233
1234        /*
1235         * We have to split the large page:
1236         */
1237        err = split_large_page(cpa, kpte, address);
1238        if (!err) {
1239                /*
1240                 * Do a global flush tlb after splitting the large page
1241                 * and before we do the actual change page attribute in the PTE.
1242                 *
1243                 * With out this, we violate the TLB application note, that says
1244                 * "The TLBs may contain both ordinary and large-page
1245                 *  translations for a 4-KByte range of linear addresses. This
1246                 *  may occur if software modifies the paging structures so that
1247                 *  the page size used for the address range changes. If the two
1248                 *  translations differ with respect to page frame or attributes
1249                 *  (e.g., permissions), processor behavior is undefined and may
1250                 *  be implementation-specific."
1251                 *
1252                 * We do this global tlb flush inside the cpa_lock, so that we
1253                 * don't allow any other cpu, with stale tlb entries change the
1254                 * page attribute in parallel, that also falls into the
1255                 * just split large page entry.
1256                 */
1257                flush_tlb_all();
1258                goto repeat;
1259        }
1260
1261        return err;
1262}
1263
1264static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
1265
1266static int cpa_process_alias(struct cpa_data *cpa)
1267{
1268        struct cpa_data alias_cpa;
1269        unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1270        unsigned long vaddr;
1271        int ret;
1272
1273        if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1274                return 0;
1275
1276        /*
1277         * No need to redo, when the primary call touched the direct
1278         * mapping already:
1279         */
1280        if (cpa->flags & CPA_PAGES_ARRAY) {
1281                struct page *page = cpa->pages[cpa->curpage];
1282                if (unlikely(PageHighMem(page)))
1283                        return 0;
1284                vaddr = (unsigned long)page_address(page);
1285        } else if (cpa->flags & CPA_ARRAY)
1286                vaddr = cpa->vaddr[cpa->curpage];
1287        else
1288                vaddr = *cpa->vaddr;
1289
1290        if (!(within(vaddr, PAGE_OFFSET,
1291                    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1292
1293                alias_cpa = *cpa;
1294                alias_cpa.vaddr = &laddr;
1295                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1296
1297                ret = __change_page_attr_set_clr(&alias_cpa, 0);
1298                if (ret)
1299                        return ret;
1300        }
1301
1302#ifdef CONFIG_X86_64
1303        /*
1304         * If the primary call didn't touch the high mapping already
1305         * and the physical address is inside the kernel map, we need
1306         * to touch the high mapped kernel as well:
1307         */
1308        if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1309            within_inclusive(cpa->pfn, highmap_start_pfn(),
1310                             highmap_end_pfn())) {
1311                unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1312                                               __START_KERNEL_map - phys_base;
1313                alias_cpa = *cpa;
1314                alias_cpa.vaddr = &temp_cpa_vaddr;
1315                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1316
1317                /*
1318                 * The high mapping range is imprecise, so ignore the
1319                 * return value.
1320                 */
1321                __change_page_attr_set_clr(&alias_cpa, 0);
1322        }
1323#endif
1324
1325        return 0;
1326}
1327
1328static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1329{
1330        unsigned long numpages = cpa->numpages;
1331        int ret;
1332
1333        while (numpages) {
1334                /*
1335                 * Store the remaining nr of pages for the large page
1336                 * preservation check.
1337                 */
1338                cpa->numpages = numpages;
1339                /* for array changes, we can't use large page */
1340                if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1341                        cpa->numpages = 1;
1342
1343                if (!debug_pagealloc_enabled())
1344                        spin_lock(&cpa_lock);
1345                ret = __change_page_attr(cpa, checkalias);
1346                if (!debug_pagealloc_enabled())
1347                        spin_unlock(&cpa_lock);
1348                if (ret)
1349                        return ret;
1350
1351                if (checkalias) {
1352                        ret = cpa_process_alias(cpa);
1353                        if (ret)
1354                                return ret;
1355                }
1356
1357                /*
1358                 * Adjust the number of pages with the result of the
1359                 * CPA operation. Either a large page has been
1360                 * preserved or a single page update happened.
1361                 */
1362                BUG_ON(cpa->numpages > numpages || !cpa->numpages);
1363                numpages -= cpa->numpages;
1364                if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
1365                        cpa->curpage++;
1366                else
1367                        *cpa->vaddr += cpa->numpages * PAGE_SIZE;
1368
1369        }
1370        return 0;
1371}
1372
1373static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1374                                    pgprot_t mask_set, pgprot_t mask_clr,
1375                                    int force_split, int in_flag,
1376                                    struct page **pages)
1377{
1378        struct cpa_data cpa;
1379        int ret, cache, checkalias;
1380        unsigned long baddr = 0;
1381
1382        memset(&cpa, 0, sizeof(cpa));
1383
1384        /*
1385         * Check, if we are requested to change a not supported
1386         * feature:
1387         */
1388        mask_set = canon_pgprot(mask_set);
1389        mask_clr = canon_pgprot(mask_clr);
1390        if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
1391                return 0;
1392
1393        /* Ensure we are PAGE_SIZE aligned */
1394        if (in_flag & CPA_ARRAY) {
1395                int i;
1396                for (i = 0; i < numpages; i++) {
1397                        if (addr[i] & ~PAGE_MASK) {
1398                                addr[i] &= PAGE_MASK;
1399                                WARN_ON_ONCE(1);
1400                        }
1401                }
1402        } else if (!(in_flag & CPA_PAGES_ARRAY)) {
1403                /*
1404                 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
1405                 * No need to cehck in that case
1406                 */
1407                if (*addr & ~PAGE_MASK) {
1408                        *addr &= PAGE_MASK;
1409                        /*
1410                         * People should not be passing in unaligned addresses:
1411                         */
1412                        WARN_ON_ONCE(1);
1413                }
1414                /*
1415                 * Save address for cache flush. *addr is modified in the call
1416                 * to __change_page_attr_set_clr() below.
1417                 */
1418                baddr = *addr;
1419        }
1420
1421        /* Must avoid aliasing mappings in the highmem code */
1422        kmap_flush_unused();
1423
1424        vm_unmap_aliases();
1425
1426        cpa.vaddr = addr;
1427        cpa.pages = pages;
1428        cpa.numpages = numpages;
1429        cpa.mask_set = mask_set;
1430        cpa.mask_clr = mask_clr;
1431        cpa.flags = 0;
1432        cpa.curpage = 0;
1433        cpa.force_split = force_split;
1434
1435        if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
1436                cpa.flags |= in_flag;
1437
1438        /* No alias checking for _NX bit modifications */
1439        checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1440
1441        ret = __change_page_attr_set_clr(&cpa, checkalias);
1442
1443        /*
1444         * Check whether we really changed something:
1445         */
1446        if (!(cpa.flags & CPA_FLUSHTLB))
1447                goto out;
1448
1449        /*
1450         * No need to flush, when we did not set any of the caching
1451         * attributes:
1452         */
1453        cache = !!pgprot2cachemode(mask_set);
1454
1455        /*
1456         * On success we use CLFLUSH, when the CPU supports it to
1457         * avoid the WBINVD. If the CPU does not support it and in the
1458         * error case we fall back to cpa_flush_all (which uses
1459         * WBINVD):
1460         */
1461        if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
1462                if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1463                        cpa_flush_array(addr, numpages, cache,
1464                                        cpa.flags, pages);
1465                } else
1466                        cpa_flush_range(baddr, numpages, cache);
1467        } else
1468                cpa_flush_all(cache);
1469
1470out:
1471        return ret;
1472}
1473
1474static inline int change_page_attr_set(unsigned long *addr, int numpages,
1475                                       pgprot_t mask, int array)
1476{
1477        return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
1478                (array ? CPA_ARRAY : 0), NULL);
1479}
1480
1481static inline int change_page_attr_clear(unsigned long *addr, int numpages,
1482                                         pgprot_t mask, int array)
1483{
1484        return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
1485                (array ? CPA_ARRAY : 0), NULL);
1486}
1487
1488static inline int cpa_set_pages_array(struct page **pages, int numpages,
1489                                       pgprot_t mask)
1490{
1491        return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
1492                CPA_PAGES_ARRAY, pages);
1493}
1494
1495static inline int cpa_clear_pages_array(struct page **pages, int numpages,
1496                                         pgprot_t mask)
1497{
1498        return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
1499                CPA_PAGES_ARRAY, pages);
1500}
1501
1502int _set_memory_uc(unsigned long addr, int numpages)
1503{
1504        /*
1505         * for now UC MINUS. see comments in ioremap_nocache()
1506         * If you really need strong UC use ioremap_uc(), but note
1507         * that you cannot override IO areas with set_memory_*() as
1508         * these helpers cannot work with IO memory.
1509         */
1510        return change_page_attr_set(&addr, numpages,
1511                                    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1512                                    0);
1513}
1514
1515int set_memory_uc(unsigned long addr, int numpages)
1516{
1517        int ret;
1518
1519        /*
1520         * for now UC MINUS. see comments in ioremap_nocache()
1521         */
1522        ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1523                              _PAGE_CACHE_MODE_UC_MINUS, NULL);
1524        if (ret)
1525                goto out_err;
1526
1527        ret = _set_memory_uc(addr, numpages);
1528        if (ret)
1529                goto out_free;
1530
1531        return 0;
1532
1533out_free:
1534        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1535out_err:
1536        return ret;
1537}
1538EXPORT_SYMBOL(set_memory_uc);
1539
1540static int _set_memory_array(unsigned long *addr, int addrinarray,
1541                enum page_cache_mode new_type)
1542{
1543        enum page_cache_mode set_type;
1544        int i, j;
1545        int ret;
1546
1547        for (i = 0; i < addrinarray; i++) {
1548                ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1549                                        new_type, NULL);
1550                if (ret)
1551                        goto out_free;
1552        }
1553
1554        /* If WC, set to UC- first and then WC */
1555        set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
1556                                _PAGE_CACHE_MODE_UC_MINUS : new_type;
1557
1558        ret = change_page_attr_set(addr, addrinarray,
1559                                   cachemode2pgprot(set_type), 1);
1560
1561        if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1562                ret = change_page_attr_set_clr(addr, addrinarray,
1563                                               cachemode2pgprot(
1564                                                _PAGE_CACHE_MODE_WC),
1565                                               __pgprot(_PAGE_CACHE_MASK),
1566                                               0, CPA_ARRAY, NULL);
1567        if (ret)
1568                goto out_free;
1569
1570        return 0;
1571
1572out_free:
1573        for (j = 0; j < i; j++)
1574                free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
1575
1576        return ret;
1577}
1578
1579int set_memory_array_uc(unsigned long *addr, int addrinarray)
1580{
1581        return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1582}
1583EXPORT_SYMBOL(set_memory_array_uc);
1584
1585int set_memory_array_wc(unsigned long *addr, int addrinarray)
1586{
1587        return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
1588}
1589EXPORT_SYMBOL(set_memory_array_wc);
1590
1591int set_memory_array_wt(unsigned long *addr, int addrinarray)
1592{
1593        return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
1594}
1595EXPORT_SYMBOL_GPL(set_memory_array_wt);
1596
1597int _set_memory_wc(unsigned long addr, int numpages)
1598{
1599        int ret;
1600        unsigned long addr_copy = addr;
1601
1602        ret = change_page_attr_set(&addr, numpages,
1603                                   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1604                                   0);
1605        if (!ret) {
1606                ret = change_page_attr_set_clr(&addr_copy, numpages,
1607                                               cachemode2pgprot(
1608                                                _PAGE_CACHE_MODE_WC),
1609                                               __pgprot(_PAGE_CACHE_MASK),
1610                                               0, 0, NULL);
1611        }
1612        return ret;
1613}
1614
1615int set_memory_wc(unsigned long addr, int numpages)
1616{
1617        int ret;
1618
1619        ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1620                _PAGE_CACHE_MODE_WC, NULL);
1621        if (ret)
1622                return ret;
1623
1624        ret = _set_memory_wc(addr, numpages);
1625        if (ret)
1626                free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1627
1628        return ret;
1629}
1630EXPORT_SYMBOL(set_memory_wc);
1631
1632int _set_memory_wt(unsigned long addr, int numpages)
1633{
1634        return change_page_attr_set(&addr, numpages,
1635                                    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
1636}
1637
1638int set_memory_wt(unsigned long addr, int numpages)
1639{
1640        int ret;
1641
1642        ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1643                              _PAGE_CACHE_MODE_WT, NULL);
1644        if (ret)
1645                return ret;
1646
1647        ret = _set_memory_wt(addr, numpages);
1648        if (ret)
1649                free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1650
1651        return ret;
1652}
1653EXPORT_SYMBOL_GPL(set_memory_wt);
1654
1655int _set_memory_wb(unsigned long addr, int numpages)
1656{
1657        /* WB cache mode is hard wired to all cache attribute bits being 0 */
1658        return change_page_attr_clear(&addr, numpages,
1659                                      __pgprot(_PAGE_CACHE_MASK), 0);
1660}
1661
1662int set_memory_wb(unsigned long addr, int numpages)
1663{
1664        int ret;
1665
1666        ret = _set_memory_wb(addr, numpages);
1667        if (ret)
1668                return ret;
1669
1670        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1671        return 0;
1672}
1673EXPORT_SYMBOL(set_memory_wb);
1674
1675int set_memory_array_wb(unsigned long *addr, int addrinarray)
1676{
1677        int i;
1678        int ret;
1679
1680        /* WB cache mode is hard wired to all cache attribute bits being 0 */
1681        ret = change_page_attr_clear(addr, addrinarray,
1682                                      __pgprot(_PAGE_CACHE_MASK), 1);
1683        if (ret)
1684                return ret;
1685
1686        for (i = 0; i < addrinarray; i++)
1687                free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
1688
1689        return 0;
1690}
1691EXPORT_SYMBOL(set_memory_array_wb);
1692
1693int set_memory_x(unsigned long addr, int numpages)
1694{
1695        if (!(__supported_pte_mask & _PAGE_NX))
1696                return 0;
1697
1698        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1699}
1700EXPORT_SYMBOL(set_memory_x);
1701
1702int set_memory_nx(unsigned long addr, int numpages)
1703{
1704        if (!(__supported_pte_mask & _PAGE_NX))
1705                return 0;
1706
1707        return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1708}
1709EXPORT_SYMBOL(set_memory_nx);
1710
1711int set_memory_ro(unsigned long addr, int numpages)
1712{
1713        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1714}
1715
1716int set_memory_rw(unsigned long addr, int numpages)
1717{
1718        return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1719}
1720
1721int set_memory_np(unsigned long addr, int numpages)
1722{
1723        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1724}
1725
1726int set_memory_4k(unsigned long addr, int numpages)
1727{
1728        return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1729                                        __pgprot(0), 1, 0, NULL);
1730}
1731
/* struct page front end for set_memory_uc(). */
int set_pages_uc(struct page *page, int numpages)
{
	return set_memory_uc((unsigned long)page_address(page), numpages);
}
1738EXPORT_SYMBOL(set_pages_uc);
1739
1740static int _set_pages_array(struct page **pages, int addrinarray,
1741                enum page_cache_mode new_type)
1742{
1743        unsigned long start;
1744        unsigned long end;
1745        enum page_cache_mode set_type;
1746        int i;
1747        int free_idx;
1748        int ret;
1749
1750        for (i = 0; i < addrinarray; i++) {
1751                if (PageHighMem(pages[i]))
1752                        continue;
1753                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1754                end = start + PAGE_SIZE;
1755                if (reserve_memtype(start, end, new_type, NULL))
1756                        goto err_out;
1757        }
1758
1759        /* If WC, set to UC- first and then WC */
1760        set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
1761                                _PAGE_CACHE_MODE_UC_MINUS : new_type;
1762
1763        ret = cpa_set_pages_array(pages, addrinarray,
1764                                  cachemode2pgprot(set_type));
1765        if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1766                ret = change_page_attr_set_clr(NULL, addrinarray,
1767                                               cachemode2pgprot(
1768                                                _PAGE_CACHE_MODE_WC),
1769                                               __pgprot(_PAGE_CACHE_MASK),
1770                                               0, CPA_PAGES_ARRAY, pages);
1771        if (ret)
1772                goto err_out;
1773        return 0; /* Success */
1774err_out:
1775        free_idx = i;
1776        for (i = 0; i < free_idx; i++) {
1777                if (PageHighMem(pages[i]))
1778                        continue;
1779                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1780                end = start + PAGE_SIZE;
1781                free_memtype(start, end);
1782        }
1783        return -EINVAL;
1784}
1785
1786int set_pages_array_uc(struct page **pages, int addrinarray)
1787{
1788        return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1789}
1790EXPORT_SYMBOL(set_pages_array_uc);
1791
1792int set_pages_array_wc(struct page **pages, int addrinarray)
1793{
1794        return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
1795}
1796EXPORT_SYMBOL(set_pages_array_wc);
1797
1798int set_pages_array_wt(struct page **pages, int addrinarray)
1799{
1800        return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
1801}
1802EXPORT_SYMBOL_GPL(set_pages_array_wt);
1803
/*
 * Page-based front end for set_memory_wb().
 *
 * @page:     first page of the range
 * @numpages: number of pages, forwarded unchanged
 *
 * The page is translated with page_address() and the result of
 * set_memory_wb() is returned as is.
 */
int set_pages_wb(struct page *page, int numpages)
{
        return set_memory_wb((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_wb);
1811
1812int set_pages_array_wb(struct page **pages, int addrinarray)
1813{
1814        int retval;
1815        unsigned long start;
1816        unsigned long end;
1817        int i;
1818
1819        /* WB cache mode is hard wired to all cache attribute bits being 0 */
1820        retval = cpa_clear_pages_array(pages, addrinarray,
1821                        __pgprot(_PAGE_CACHE_MASK));
1822        if (retval)
1823                return retval;
1824
1825        for (i = 0; i < addrinarray; i++) {
1826                if (PageHighMem(pages[i]))
1827                        continue;
1828                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1829                end = start + PAGE_SIZE;
1830                free_memtype(start, end);
1831        }
1832
1833        return 0;
1834}
1835EXPORT_SYMBOL(set_pages_array_wb);
1836
/*
 * Page-based front end for set_memory_x().
 *
 * @page:     first page of the range
 * @numpages: number of pages, forwarded unchanged
 *
 * The page is translated with page_address() and the result of
 * set_memory_x() is returned as is.
 */
int set_pages_x(struct page *page, int numpages)
{
        return set_memory_x((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_x);
1844
/*
 * Page-based front end for set_memory_nx().
 *
 * @page:     first page of the range
 * @numpages: number of pages, forwarded unchanged
 *
 * The page is translated with page_address() and the result of
 * set_memory_nx() is returned as is.
 */
int set_pages_nx(struct page *page, int numpages)
{
        return set_memory_nx((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_nx);
1852
/*
 * Page-based front end for set_memory_ro().
 *
 * @page:     first page of the range
 * @numpages: number of pages, forwarded unchanged
 *
 * The page is translated with page_address() and the result of
 * set_memory_ro() is returned as is.
 */
int set_pages_ro(struct page *page, int numpages)
{
        return set_memory_ro((unsigned long)page_address(page), numpages);
}
1859
/*
 * Page-based front end for set_memory_rw().
 *
 * @page:     first page of the range
 * @numpages: number of pages, forwarded unchanged
 *
 * The page is translated with page_address() and the result of
 * set_memory_rw() is returned as is.
 */
int set_pages_rw(struct page *page, int numpages)
{
        return set_memory_rw((unsigned long)page_address(page), numpages);
}
1866
1867#ifdef CONFIG_DEBUG_PAGEALLOC
1868
1869static int __set_pages_p(struct page *page, int numpages)
1870{
1871        unsigned long tempaddr = (unsigned long) page_address(page);
1872        struct cpa_data cpa = { .vaddr = &tempaddr,
1873                                .pgd = NULL,
1874                                .numpages = numpages,
1875                                .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1876                                .mask_clr = __pgprot(0),
1877                                .flags = 0};
1878
1879        /*
1880         * No alias checking needed for setting present flag. otherwise,
1881         * we may need to break large pages for 64-bit kernel text
1882         * mappings (this adds to complexity if we want to do this from
1883         * atomic context especially). Let's keep it simple!
1884         */
1885        return __change_page_attr_set_clr(&cpa, 0);
1886}
1887
1888static int __set_pages_np(struct page *page, int numpages)
1889{
1890        unsigned long tempaddr = (unsigned long) page_address(page);
1891        struct cpa_data cpa = { .vaddr = &tempaddr,
1892                                .pgd = NULL,
1893                                .numpages = numpages,
1894                                .mask_set = __pgprot(0),
1895                                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1896                                .flags = 0};
1897
1898        /*
1899         * No alias checking needed for setting not present flag. otherwise,
1900         * we may need to break large pages for 64-bit kernel text
1901         * mappings (this adds to complexity if we want to do this from
1902         * atomic context especially). Let's keep it simple!
1903         */
1904        return __change_page_attr_set_clr(&cpa, 0);
1905}
1906
1907void __kernel_map_pages(struct page *page, int numpages, int enable)
1908{
1909        if (PageHighMem(page))
1910                return;
1911        if (!enable) {
1912                debug_check_no_locks_freed(page_address(page),
1913                                           numpages * PAGE_SIZE);
1914        }
1915
1916        /*
1917         * The return value is ignored as the calls cannot fail.
1918         * Large pages for identity mappings are not used at boot time
1919         * and hence no memory allocations during large page split.
1920         */
1921        if (enable)
1922                __set_pages_p(page, numpages);
1923        else
1924                __set_pages_np(page, numpages);
1925
1926        /*
1927         * We should perform an IPI and flush all tlbs,
1928         * but that can deadlock->flush only current cpu:
1929         */
1930        __flush_tlb_all();
1931
1932        arch_flush_lazy_mmu_mode();
1933}
1934
1935#ifdef CONFIG_HIBERNATION
1936
1937bool kernel_page_present(struct page *page)
1938{
1939        unsigned int level;
1940        pte_t *pte;
1941
1942        if (PageHighMem(page))
1943                return false;
1944
1945        pte = lookup_address((unsigned long)page_address(page), &level);
1946        return (pte_val(*pte) & _PAGE_PRESENT);
1947}
1948
1949#endif /* CONFIG_HIBERNATION */
1950
1951#endif /* CONFIG_DEBUG_PAGEALLOC */
1952
1953int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1954                            unsigned numpages, unsigned long page_flags)
1955{
1956        int retval = -EINVAL;
1957
1958        struct cpa_data cpa = {
1959                .vaddr = &address,
1960                .pfn = pfn,
1961                .pgd = pgd,
1962                .numpages = numpages,
1963                .mask_set = __pgprot(0),
1964                .mask_clr = __pgprot(0),
1965                .flags = 0,
1966        };
1967
1968        if (!(__supported_pte_mask & _PAGE_NX))
1969                goto out;
1970
1971        if (!(page_flags & _PAGE_NX))
1972                cpa.mask_clr = __pgprot(_PAGE_NX);
1973
1974        if (!(page_flags & _PAGE_RW))
1975                cpa.mask_clr = __pgprot(_PAGE_RW);
1976
1977        cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1978
1979        retval = __change_page_attr_set_clr(&cpa, 0);
1980        __flush_tlb_all();
1981
1982out:
1983        return retval;
1984}
1985
1986/*
1987 * The testcases use internal knowledge of the implementation that shouldn't
1988 * be exposed to the rest of the kernel. Include these directly here.
1989 */
1990#ifdef CONFIG_CPA_DEBUG
1991#include "pageattr-test.c"
1992#endif
1993
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.