linux/arch/x86/mm/pageattr.c
   1/*
   2 * Copyright 2002 Andi Kleen, SuSE Labs.
   3 * Thanks to Ben LaHaise for precious feedback.
   4 */
   5#include <linux/highmem.h>
   6#include <linux/bootmem.h>
   7#include <linux/module.h>
   8#include <linux/sched.h>
   9#include <linux/mm.h>
  10#include <linux/interrupt.h>
  11#include <linux/seq_file.h>
  12#include <linux/debugfs.h>
  13#include <linux/pfn.h>
  14#include <linux/percpu.h>
  15#include <linux/gfp.h>
  16#include <linux/pci.h>
  17
  18#include <asm/e820.h>
  19#include <asm/processor.h>
  20#include <asm/tlbflush.h>
  21#include <asm/sections.h>
  22#include <asm/setup.h>
  23#include <asm/uaccess.h>
  24#include <asm/pgalloc.h>
  25#include <asm/proto.h>
  26#include <asm/pat.h>
  27
  28/*
  29 * The current flushing context - we pass it instead of 5 arguments:
  30 */
  31struct cpa_data {
  32        unsigned long   *vaddr;
  33        pgd_t           *pgd;
  34        pgprot_t        mask_set;
  35        pgprot_t        mask_clr;
  36        int             numpages;
  37        int             flags;
  38        unsigned long   pfn;
  39        unsigned        force_split : 1;
  40        int             curpage;
  41        struct page     **pages;
  42};
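/*
 * Field overview (roughly): vaddr, pages and curpage describe the virtual
 * addresses being worked on (a single address, an address array or a page
 * array, selected via flags); mask_set/mask_clr are the pgprot bits to set
 * and clear; numpages is the number of pages still to process; pfn tracks
 * the physical side for alias processing; pgd selects an alternate page
 * table root (NULL means the regular kernel page table); force_split
 * disables large page preservation.
 */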
  43
  44/*
   45 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
   46 * using cpa_lock, so that no other CPU with stale large TLB entries can
   47 * change the page attributes in parallel while another CPU is splitting
   48 * a large page entry and changing the attributes.
  49 */
  50static DEFINE_SPINLOCK(cpa_lock);
  51
  52#define CPA_FLUSHTLB 1
  53#define CPA_ARRAY 2
  54#define CPA_PAGES_ARRAY 4
  55
  56#ifdef CONFIG_PROC_FS
  57static unsigned long direct_pages_count[PG_LEVEL_NUM];
  58
  59void update_page_count(int level, unsigned long pages)
  60{
  61        /* Protect against CPA */
  62        spin_lock(&pgd_lock);
  63        direct_pages_count[level] += pages;
  64        spin_unlock(&pgd_lock);
  65}
  66
  67static void split_page_count(int level)
  68{
  69        direct_pages_count[level]--;
  70        direct_pages_count[level - 1] += PTRS_PER_PTE;
  71}
  72
  73void arch_report_meminfo(struct seq_file *m)
  74{
  75        seq_printf(m, "DirectMap4k:    %8lu kB\n",
  76                        direct_pages_count[PG_LEVEL_4K] << 2);
  77#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
  78        seq_printf(m, "DirectMap2M:    %8lu kB\n",
  79                        direct_pages_count[PG_LEVEL_2M] << 11);
  80#else
  81        seq_printf(m, "DirectMap4M:    %8lu kB\n",
  82                        direct_pages_count[PG_LEVEL_2M] << 12);
  83#endif
  84#ifdef CONFIG_X86_64
  85        if (direct_gbpages)
  86                seq_printf(m, "DirectMap1G:    %8lu kB\n",
  87                        direct_pages_count[PG_LEVEL_1G] << 20);
  88#endif
  89}
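/*
 * The shifts above convert the per-level mapping counts into kB: a 4K page
 * is 4 kB (<< 2), a 2M page is 2048 kB (<< 11), a 4M page is 4096 kB
 * (<< 12) and a 1G page is 1048576 kB (<< 20). The counters show up in
 * /proc/meminfo roughly like this (values purely illustrative):
 *
 *	DirectMap4k:      343932 kB
 *	DirectMap2M:     8036352 kB
 *	DirectMap1G:     9437184 kB
 */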
  90#else
  91static inline void split_page_count(int level) { }
  92#endif
  93
  94#ifdef CONFIG_X86_64
  95
  96static inline unsigned long highmap_start_pfn(void)
  97{
  98        return __pa_symbol(_text) >> PAGE_SHIFT;
  99}
 100
 101static inline unsigned long highmap_end_pfn(void)
 102{
 103        return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
 104}
 105
 106#endif
 107
 108#ifdef CONFIG_DEBUG_PAGEALLOC
 109# define debug_pagealloc 1
 110#else
 111# define debug_pagealloc 0
 112#endif
 113
 114static inline int
 115within(unsigned long addr, unsigned long start, unsigned long end)
 116{
 117        return addr >= start && addr < end;
 118}
 119
 120/*
 121 * Flushing functions
 122 */
 123
 124/**
 125 * clflush_cache_range - flush a cache range with clflush
 126 * @vaddr:      virtual start address
 127 * @size:       number of bytes to flush
 128 *
 129 * clflushopt is an unordered instruction which needs fencing with mfence or
 130 * sfence to avoid ordering issues.
 131 */
 132void clflush_cache_range(void *vaddr, unsigned int size)
 133{
 134        void *vend = vaddr + size - 1;
 135
 136        mb();
 137
 138        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
 139                clflushopt(vaddr);
 140        /*
 141         * Flush any possible final partial cacheline:
 142         */
 143        clflushopt(vend);
 144
 145        mb();
 146}
 147EXPORT_SYMBOL_GPL(clflush_cache_range);
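/*
 * Illustrative sketch (hypothetical helper, not taken from any real caller):
 * how a driver might use clflush_cache_range() to push a freshly written
 * buffer out of the CPU caches before handing it to a device.
 */
static void __maybe_unused example_flush_buffer(void *buf, unsigned int len)
{
        memset(buf, 0, len);            /* write the data first ...          */
        clflush_cache_range(buf, len);  /* ... then flush the cachelines out */
}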
 148
 149static void __cpa_flush_all(void *arg)
 150{
 151        unsigned long cache = (unsigned long)arg;
 152
 153        /*
  154         * Flush everything to work around an erratum in early Athlons
  155         * regarding large page flushing.
 156         */
 157        __flush_tlb_all();
 158
 159        if (cache && boot_cpu_data.x86 >= 4)
 160                wbinvd();
 161}
 162
 163static void cpa_flush_all(unsigned long cache)
 164{
 165        BUG_ON(irqs_disabled());
 166
 167        on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 168}
 169
 170static void __cpa_flush_range(void *arg)
 171{
 172        /*
  173         * We could optimize this further and do individual per-page
  174         * TLB invalidates for a low number of pages. Caveat: we must
  175         * flush the high aliases on 64-bit as well.
 176         */
 177        __flush_tlb_all();
 178}
 179
 180static void cpa_flush_range(unsigned long start, int numpages, int cache)
 181{
 182        unsigned int i, level;
 183        unsigned long addr;
 184
 185        BUG_ON(irqs_disabled());
 186        WARN_ON(PAGE_ALIGN(start) != start);
 187
 188        on_each_cpu(__cpa_flush_range, NULL, 1);
 189
 190        if (!cache)
 191                return;
 192
 193        /*
 194         * We only need to flush on one CPU,
 195         * clflush is a MESI-coherent instruction that
 196         * will cause all other CPUs to flush the same
 197         * cachelines:
 198         */
 199        for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
 200                pte_t *pte = lookup_address(addr, &level);
 201
 202                /*
 203                 * Only flush present addresses:
 204                 */
 205                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 206                        clflush_cache_range((void *) addr, PAGE_SIZE);
 207        }
 208}
 209
 210static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 211                            int in_flags, struct page **pages)
 212{
 213        unsigned int i, level;
 214        unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
 215
 216        BUG_ON(irqs_disabled());
 217
 218        on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
 219
 220        if (!cache || do_wbinvd)
 221                return;
 222
 223        /*
 224         * We only need to flush on one CPU,
 225         * clflush is a MESI-coherent instruction that
 226         * will cause all other CPUs to flush the same
 227         * cachelines:
 228         */
 229        for (i = 0; i < numpages; i++) {
 230                unsigned long addr;
 231                pte_t *pte;
 232
 233                if (in_flags & CPA_PAGES_ARRAY)
 234                        addr = (unsigned long)page_address(pages[i]);
 235                else
 236                        addr = start[i];
 237
 238                pte = lookup_address(addr, &level);
 239
 240                /*
 241                 * Only flush present addresses:
 242                 */
 243                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 244                        clflush_cache_range((void *)addr, PAGE_SIZE);
 245        }
 246}
 247
 248/*
 249 * Certain areas of memory on x86 require very specific protection flags,
 250 * for example the BIOS area or kernel text. Callers don't always get this
 251 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 252 * checks and fixes these known static required protection bits.
 253 */
 254static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 255                                   unsigned long pfn)
 256{
 257        pgprot_t forbidden = __pgprot(0);
 258
 259        /*
  260         * The BIOS area between 640K and 1MB needs to be executable for
 261         * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 262         */
 263#ifdef CONFIG_PCI_BIOS
 264        if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 265                pgprot_val(forbidden) |= _PAGE_NX;
 266#endif
 267
 268        /*
  269         * The kernel text needs to be executable for obvious reasons.
  270         * This does not cover __inittext since that is gone later on. On
  271         * 64-bit we do not enforce !NX on the low mapping.
 272         */
 273        if (within(address, (unsigned long)_text, (unsigned long)_etext))
 274                pgprot_val(forbidden) |= _PAGE_NX;
 275
 276        /*
 277         * The .rodata section needs to be read-only. Using the pfn
 278         * catches all aliases.
 279         */
 280        if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
 281                   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
 282                pgprot_val(forbidden) |= _PAGE_RW;
 283
 284#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 285        /*
  286         * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
  287         * the kernel text mappings for the large-page-aligned text and rodata
  288         * sections will always be read-only. The kernel identity mappings that
  289         * cover the holes caused by this alignment can be anything the user asks for.
 290         *
 291         * This will preserve the large page mappings for kernel text/data
 292         * at no extra cost.
 293         */
 294        if (kernel_set_to_readonly &&
 295            within(address, (unsigned long)_text,
 296                   (unsigned long)__end_rodata_hpage_align)) {
 297                unsigned int level;
 298
 299                /*
  300                 * Don't enforce the !RW mapping for the kernel text mapping
  301                 * if the current mapping is already using a small page
  302                 * mapping. There is no need to work hard to preserve large
  303                 * page mappings in this case.
  304                 *
  305                 * This also fixes the Linux Xen paravirt guest boot failure
  306                 * caused by unexpected read-only mappings for the kernel
  307                 * identity mapping. In the paravirt guest case, the kernel
  308                 * text mapping and the kernel identity mapping share the
  309                 * same page-table pages, so we can't really use different
  310                 * protections for the kernel text and identity mappings.
  311                 * Also, these shared mappings are made of small page
  312                 * mappings, so not enforcing the !RW mapping for small-page
  313                 * kernel text mappings helps the Linux Xen paravirt guest
  314                 * boot as well.
 315                 */
 316                if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
 317                        pgprot_val(forbidden) |= _PAGE_RW;
 318        }
 319#endif
 320
 321        prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
 322
 323        return prot;
 324}
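/*
 * Illustrative sketch (hypothetical helper): a request to map a .rodata page
 * with PAGE_KERNEL (which includes _PAGE_RW) comes back from
 * static_protections() with the RW bit filtered out again.
 */
static pgprot_t __maybe_unused example_rodata_prot(void)
{
        unsigned long addr = (unsigned long)__start_rodata;
        unsigned long pfn = __pa_symbol(__start_rodata) >> PAGE_SHIFT;

        /* _PAGE_RW is part of the forbidden mask for rodata pfns. */
        return static_protections(PAGE_KERNEL, addr, pfn);
}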
 325
 326/*
 327 * Lookup the page table entry for a virtual address in a specific pgd.
 328 * Return a pointer to the entry and the level of the mapping.
 329 */
 330pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
 331                             unsigned int *level)
 332{
 333        pud_t *pud;
 334        pmd_t *pmd;
 335
 336        *level = PG_LEVEL_NONE;
 337
 338        if (pgd_none(*pgd))
 339                return NULL;
 340
 341        pud = pud_offset(pgd, address);
 342        if (pud_none(*pud))
 343                return NULL;
 344
 345        *level = PG_LEVEL_1G;
 346        if (pud_large(*pud) || !pud_present(*pud))
 347                return (pte_t *)pud;
 348
 349        pmd = pmd_offset(pud, address);
 350        if (pmd_none(*pmd))
 351                return NULL;
 352
 353        *level = PG_LEVEL_2M;
 354        if (pmd_large(*pmd) || !pmd_present(*pmd))
 355                return (pte_t *)pmd;
 356
 357        *level = PG_LEVEL_4K;
 358
 359        return pte_offset_kernel(pmd, address);
 360}
 361
 362/*
 363 * Lookup the page table entry for a virtual address. Return a pointer
 364 * to the entry and the level of the mapping.
 365 *
 366 * Note: We return pud and pmd either when the entry is marked large
 367 * or when the present bit is not set. Otherwise we would return a
  368 * pointer to a nonexistent mapping.
 369 */
 370pte_t *lookup_address(unsigned long address, unsigned int *level)
 371{
 372        return lookup_address_in_pgd(pgd_offset_k(address), address, level);
 373}
 374EXPORT_SYMBOL_GPL(lookup_address);
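/*
 * Illustrative sketch (hypothetical helper): lookup_address() reports the
 * mapping level, which can be used to ask whether a kernel virtual address
 * is currently backed by a large (2M/1G) mapping.
 */
static bool __maybe_unused example_is_large_mapping(unsigned long addr)
{
        unsigned int level;
        pte_t *pte = lookup_address(addr, &level);

        /* A missing or non-present entry is not a large mapping. */
        if (!pte || !(pte_val(*pte) & _PAGE_PRESENT))
                return false;

        return level == PG_LEVEL_2M || level == PG_LEVEL_1G;
}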
 375
 376static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
 377                                  unsigned int *level)
 378{
 379        if (cpa->pgd)
 380                return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
 381                                               address, level);
 382
 383        return lookup_address(address, level);
 384}
 385
 386/*
 387 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
 388 * or NULL if not present.
 389 */
 390pmd_t *lookup_pmd_address(unsigned long address)
 391{
 392        pgd_t *pgd;
 393        pud_t *pud;
 394
 395        pgd = pgd_offset_k(address);
 396        if (pgd_none(*pgd))
 397                return NULL;
 398
 399        pud = pud_offset(pgd, address);
 400        if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
 401                return NULL;
 402
 403        return pmd_offset(pud, address);
 404}
 405
 406/*
 407 * This is necessary because __pa() does not work on some
 408 * kinds of memory, like vmalloc() or the alloc_remap()
 409 * areas on 32-bit NUMA systems.  The percpu areas can
 410 * end up in this kind of memory, for instance.
 411 *
 412 * This could be optimized, but it is only intended to be
  413 * used at initialization time, and keeping it
 414 * unoptimized should increase the testing coverage for
 415 * the more obscure platforms.
 416 */
 417phys_addr_t slow_virt_to_phys(void *__virt_addr)
 418{
 419        unsigned long virt_addr = (unsigned long)__virt_addr;
 420        phys_addr_t phys_addr;
 421        unsigned long offset;
 422        enum pg_level level;
 423        unsigned long psize;
 424        unsigned long pmask;
 425        pte_t *pte;
 426
 427        pte = lookup_address(virt_addr, &level);
 428        BUG_ON(!pte);
 429        psize = page_level_size(level);
 430        pmask = page_level_mask(level);
 431        offset = virt_addr & ~pmask;
 432        phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
 433        return (phys_addr | offset);
 434}
 435EXPORT_SYMBOL_GPL(slow_virt_to_phys);
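/*
 * Illustrative sketch (hypothetical helper): unlike __pa(), which is only
 * valid for the direct mapping, slow_virt_to_phys() walks the page tables
 * and can therefore translate e.g. a vmalloc() address.
 */
static phys_addr_t __maybe_unused example_vmalloc_to_phys(void *vaddr)
{
        /* __pa(vaddr) would be wrong here; walk the tables instead. */
        return slow_virt_to_phys(vaddr);
}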
 436
 437/*
 438 * Set the new pmd in all the pgds we know about:
 439 */
 440static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 441{
 442        /* change init_mm */
 443        set_pte_atomic(kpte, pte);
 444#ifdef CONFIG_X86_32
 445        if (!SHARED_KERNEL_PMD) {
 446                struct page *page;
 447
 448                list_for_each_entry(page, &pgd_list, lru) {
 449                        pgd_t *pgd;
 450                        pud_t *pud;
 451                        pmd_t *pmd;
 452
 453                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
 454                        pud = pud_offset(pgd, address);
 455                        pmd = pmd_offset(pud, address);
 456                        set_pte_atomic((pte_t *)pmd, pte);
 457                }
 458        }
 459#endif
 460}
 461
 462static int
 463try_preserve_large_page(pte_t *kpte, unsigned long address,
 464                        struct cpa_data *cpa)
 465{
 466        unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
 467        pte_t new_pte, old_pte, *tmp;
 468        pgprot_t old_prot, new_prot, req_prot;
 469        int i, do_split = 1;
 470        enum pg_level level;
 471
 472        if (cpa->force_split)
 473                return 1;
 474
 475        spin_lock(&pgd_lock);
 476        /*
 477         * Check for races, another CPU might have split this page
 478         * up already:
 479         */
 480        tmp = _lookup_address_cpa(cpa, address, &level);
 481        if (tmp != kpte)
 482                goto out_unlock;
 483
 484        switch (level) {
 485        case PG_LEVEL_2M:
 486#ifdef CONFIG_X86_64
 487        case PG_LEVEL_1G:
 488#endif
 489                psize = page_level_size(level);
 490                pmask = page_level_mask(level);
 491                break;
 492        default:
 493                do_split = -EINVAL;
 494                goto out_unlock;
 495        }
 496
 497        /*
 498         * Calculate the number of pages, which fit into this large
 499         * page starting at address:
 500         */
 501        nextpage_addr = (address + psize) & pmask;
 502        numpages = (nextpage_addr - address) >> PAGE_SHIFT;
 503        if (numpages < cpa->numpages)
 504                cpa->numpages = numpages;
 505
 506        /*
 507         * We are safe now. Check whether the new pgprot is the same:
 508         * Convert protection attributes to 4k-format, as cpa->mask* are set
 509         * up accordingly.
 510         */
 511        old_pte = *kpte;
 512        old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte));
 513
 514        pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
 515        pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
 516
 517        /*
 518         * req_prot is in format of 4k pages. It must be converted to large
 519         * page format: the caching mode includes the PAT bit located at
 520         * different bit positions in the two formats.
 521         */
 522        req_prot = pgprot_4k_2_large(req_prot);
 523
 524        /*
 525         * Set the PSE and GLOBAL flags only if the PRESENT flag is
 526         * set otherwise pmd_present/pmd_huge will return true even on
 527         * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
 528         * for the ancient hardware that doesn't support it.
 529         */
 530        if (pgprot_val(req_prot) & _PAGE_PRESENT)
 531                pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
 532        else
 533                pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
 534
 535        req_prot = canon_pgprot(req_prot);
 536
 537        /*
 538         * old_pte points to the large page base address. So we need
 539         * to add the offset of the virtual address:
 540         */
 541        pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
 542        cpa->pfn = pfn;
 543
 544        new_prot = static_protections(req_prot, address, pfn);
 545
 546        /*
  547         * We need to check the full range, in case
  548         * static_protections() requires a different pgprot for one of
  549         * the pages in the range we try to preserve:
 550         */
 551        addr = address & pmask;
 552        pfn = pte_pfn(old_pte);
 553        for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
 554                pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
 555
 556                if (pgprot_val(chk_prot) != pgprot_val(new_prot))
 557                        goto out_unlock;
 558        }
 559
 560        /*
  561         * If there are no changes, return. cpa->numpages has been updated
 562         * above:
 563         */
 564        if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
 565                do_split = 0;
 566                goto out_unlock;
 567        }
 568
 569        /*
  570         * We need to change the attributes. Check whether we can
  571         * change the large page in one go. We request a split when
  572         * the address is not aligned or the number of pages is
  573         * smaller than the number of pages in the large page. Note
 574         * that we limited the number of possible pages already to
 575         * the number of pages in the large page.
 576         */
 577        if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
 578                /*
 579                 * The address is aligned and the number of pages
 580                 * covers the full page.
 581                 */
 582                new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
 583                __set_pmd_pte(kpte, address, new_pte);
 584                cpa->flags |= CPA_FLUSHTLB;
 585                do_split = 0;
 586        }
 587
 588out_unlock:
 589        spin_unlock(&pgd_lock);
 590
 591        return do_split;
 592}
 593
 594static int
 595__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 596                   struct page *base)
 597{
 598        pte_t *pbase = (pte_t *)page_address(base);
 599        unsigned long pfn, pfninc = 1;
 600        unsigned int i, level;
 601        pte_t *tmp;
 602        pgprot_t ref_prot;
 603
 604        spin_lock(&pgd_lock);
 605        /*
 606         * Check for races, another CPU might have split this page
 607         * up for us already:
 608         */
 609        tmp = _lookup_address_cpa(cpa, address, &level);
 610        if (tmp != kpte) {
 611                spin_unlock(&pgd_lock);
 612                return 1;
 613        }
 614
 615        paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 616        ref_prot = pte_pgprot(pte_clrhuge(*kpte));
 617
 618        /* promote PAT bit to correct position */
 619        if (level == PG_LEVEL_2M)
 620                ref_prot = pgprot_large_2_4k(ref_prot);
 621
 622#ifdef CONFIG_X86_64
 623        if (level == PG_LEVEL_1G) {
 624                pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
 625                /*
 626                 * Set the PSE flags only if the PRESENT flag is set
 627                 * otherwise pmd_present/pmd_huge will return true
 628                 * even on a non present pmd.
 629                 */
 630                if (pgprot_val(ref_prot) & _PAGE_PRESENT)
 631                        pgprot_val(ref_prot) |= _PAGE_PSE;
 632                else
 633                        pgprot_val(ref_prot) &= ~_PAGE_PSE;
 634        }
 635#endif
 636
 637        /*
 638         * Set the GLOBAL flags only if the PRESENT flag is set
 639         * otherwise pmd/pte_present will return true even on a non
 640         * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
 641         * for the ancient hardware that doesn't support it.
 642         */
 643        if (pgprot_val(ref_prot) & _PAGE_PRESENT)
 644                pgprot_val(ref_prot) |= _PAGE_GLOBAL;
 645        else
 646                pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
 647
 648        /*
 649         * Get the target pfn from the original entry:
 650         */
 651        pfn = pte_pfn(*kpte);
 652        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
 653                set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
 654
 655        if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
 656                                PFN_DOWN(__pa(address)) + 1))
 657                split_page_count(level);
 658
 659        /*
 660         * Install the new, split up pagetable.
 661         *
 662         * We use the standard kernel pagetable protections for the new
 663         * pagetable protections, the actual ptes set above control the
 664         * primary protection behavior:
 665         */
 666        __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
 667
 668        /*
  669         * Intel Atom erratum AAH41 workaround.
 670         *
 671         * The real fix should be in hw or in a microcode update, but
 672         * we also probabilistically try to reduce the window of having
 673         * a large TLB mixed with 4K TLBs while instruction fetches are
 674         * going on.
 675         */
 676        __flush_tlb_all();
 677        spin_unlock(&pgd_lock);
 678
 679        return 0;
 680}
 681
 682static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 683                            unsigned long address)
 684{
 685        struct page *base;
 686
 687        if (!debug_pagealloc)
 688                spin_unlock(&cpa_lock);
 689        base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
 690        if (!debug_pagealloc)
 691                spin_lock(&cpa_lock);
 692        if (!base)
 693                return -ENOMEM;
 694
 695        if (__split_large_page(cpa, kpte, address, base))
 696                __free_page(base);
 697
 698        return 0;
 699}
 700
 701static bool try_to_free_pte_page(pte_t *pte)
 702{
 703        int i;
 704
 705        for (i = 0; i < PTRS_PER_PTE; i++)
 706                if (!pte_none(pte[i]))
 707                        return false;
 708
 709        free_page((unsigned long)pte);
 710        return true;
 711}
 712
 713static bool try_to_free_pmd_page(pmd_t *pmd)
 714{
 715        int i;
 716
 717        for (i = 0; i < PTRS_PER_PMD; i++)
 718                if (!pmd_none(pmd[i]))
 719                        return false;
 720
 721        free_page((unsigned long)pmd);
 722        return true;
 723}
 724
 725static bool try_to_free_pud_page(pud_t *pud)
 726{
 727        int i;
 728
 729        for (i = 0; i < PTRS_PER_PUD; i++)
 730                if (!pud_none(pud[i]))
 731                        return false;
 732
 733        free_page((unsigned long)pud);
 734        return true;
 735}
 736
 737static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
 738{
 739        pte_t *pte = pte_offset_kernel(pmd, start);
 740
 741        while (start < end) {
 742                set_pte(pte, __pte(0));
 743
 744                start += PAGE_SIZE;
 745                pte++;
 746        }
 747
 748        if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
 749                pmd_clear(pmd);
 750                return true;
 751        }
 752        return false;
 753}
 754
 755static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
 756                              unsigned long start, unsigned long end)
 757{
 758        if (unmap_pte_range(pmd, start, end))
 759                if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
 760                        pud_clear(pud);
 761}
 762
 763static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 764{
 765        pmd_t *pmd = pmd_offset(pud, start);
 766
 767        /*
 768         * Not on a 2MB page boundary?
 769         */
 770        if (start & (PMD_SIZE - 1)) {
 771                unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 772                unsigned long pre_end = min_t(unsigned long, end, next_page);
 773
 774                __unmap_pmd_range(pud, pmd, start, pre_end);
 775
 776                start = pre_end;
 777                pmd++;
 778        }
 779
 780        /*
 781         * Try to unmap in 2M chunks.
 782         */
 783        while (end - start >= PMD_SIZE) {
 784                if (pmd_large(*pmd))
 785                        pmd_clear(pmd);
 786                else
 787                        __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
 788
 789                start += PMD_SIZE;
 790                pmd++;
 791        }
 792
 793        /*
 794         * 4K leftovers?
 795         */
 796        if (start < end)
 797                return __unmap_pmd_range(pud, pmd, start, end);
 798
 799        /*
  800         * Try again to free the PMD page if we haven't succeeded above.
 801         */
 802        if (!pud_none(*pud))
 803                if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
 804                        pud_clear(pud);
 805}
 806
 807static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 808{
 809        pud_t *pud = pud_offset(pgd, start);
 810
 811        /*
 812         * Not on a GB page boundary?
 813         */
 814        if (start & (PUD_SIZE - 1)) {
 815                unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 816                unsigned long pre_end   = min_t(unsigned long, end, next_page);
 817
 818                unmap_pmd_range(pud, start, pre_end);
 819
 820                start = pre_end;
 821                pud++;
 822        }
 823
 824        /*
 825         * Try to unmap in 1G chunks?
 826         */
 827        while (end - start >= PUD_SIZE) {
 828
 829                if (pud_large(*pud))
 830                        pud_clear(pud);
 831                else
 832                        unmap_pmd_range(pud, start, start + PUD_SIZE);
 833
 834                start += PUD_SIZE;
 835                pud++;
 836        }
 837
 838        /*
 839         * 2M leftovers?
 840         */
 841        if (start < end)
 842                unmap_pmd_range(pud, start, end);
 843
 844        /*
 845         * No need to try to free the PUD page because we'll free it in
 846         * populate_pgd's error path
 847         */
 848}
 849
 850static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
 851{
 852        pgd_t *pgd_entry = root + pgd_index(addr);
 853
 854        unmap_pud_range(pgd_entry, addr, end);
 855
 856        if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
 857                pgd_clear(pgd_entry);
 858}
 859
 860static int alloc_pte_page(pmd_t *pmd)
 861{
 862        pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 863        if (!pte)
 864                return -1;
 865
 866        set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 867        return 0;
 868}
 869
 870static int alloc_pmd_page(pud_t *pud)
 871{
 872        pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 873        if (!pmd)
 874                return -1;
 875
 876        set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 877        return 0;
 878}
 879
 880static void populate_pte(struct cpa_data *cpa,
 881                         unsigned long start, unsigned long end,
 882                         unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
 883{
 884        pte_t *pte;
 885
 886        pte = pte_offset_kernel(pmd, start);
 887
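        /*
         * Note: on this path (cpa->pgd is set), cpa->pfn appears to be
         * treated as a physical address rather than a page frame number:
         * it is shifted down by PAGE_SHIFT when building the PTE and
         * advanced by PAGE_SIZE per iteration below, which is also why the
         * _PAGE_NX masking is applied to it directly.
         */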
 888        while (num_pages-- && start < end) {
 889
 890                /* deal with the NX bit */
 891                if (!(pgprot_val(pgprot) & _PAGE_NX))
 892                        cpa->pfn &= ~_PAGE_NX;
 893
 894                set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
 895
 896                start    += PAGE_SIZE;
 897                cpa->pfn += PAGE_SIZE;
 898                pte++;
 899        }
 900}
 901
 902static int populate_pmd(struct cpa_data *cpa,
 903                        unsigned long start, unsigned long end,
 904                        unsigned num_pages, pud_t *pud, pgprot_t pgprot)
 905{
 906        unsigned int cur_pages = 0;
 907        pmd_t *pmd;
 908        pgprot_t pmd_pgprot;
 909
 910        /*
 911         * Not on a 2M boundary?
 912         */
 913        if (start & (PMD_SIZE - 1)) {
 914                unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
 915                unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 916
 917                pre_end   = min_t(unsigned long, pre_end, next_page);
 918                cur_pages = (pre_end - start) >> PAGE_SHIFT;
 919                cur_pages = min_t(unsigned int, num_pages, cur_pages);
 920
 921                /*
 922                 * Need a PTE page?
 923                 */
 924                pmd = pmd_offset(pud, start);
 925                if (pmd_none(*pmd))
 926                        if (alloc_pte_page(pmd))
 927                                return -1;
 928
 929                populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
 930
 931                start = pre_end;
 932        }
 933
 934        /*
 935         * We mapped them all?
 936         */
 937        if (num_pages == cur_pages)
 938                return cur_pages;
 939
 940        pmd_pgprot = pgprot_4k_2_large(pgprot);
 941
 942        while (end - start >= PMD_SIZE) {
 943
 944                /*
 945                 * We cannot use a 1G page so allocate a PMD page if needed.
 946                 */
 947                if (pud_none(*pud))
 948                        if (alloc_pmd_page(pud))
 949                                return -1;
 950
 951                pmd = pmd_offset(pud, start);
 952
 953                set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
 954                                   massage_pgprot(pmd_pgprot)));
 955
 956                start     += PMD_SIZE;
 957                cpa->pfn  += PMD_SIZE;
 958                cur_pages += PMD_SIZE >> PAGE_SHIFT;
 959        }
 960
 961        /*
 962         * Map trailing 4K pages.
 963         */
 964        if (start < end) {
 965                pmd = pmd_offset(pud, start);
 966                if (pmd_none(*pmd))
 967                        if (alloc_pte_page(pmd))
 968                                return -1;
 969
 970                populate_pte(cpa, start, end, num_pages - cur_pages,
 971                             pmd, pgprot);
 972        }
 973        return num_pages;
 974}
 975
 976static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 977                        pgprot_t pgprot)
 978{
 979        pud_t *pud;
 980        unsigned long end;
 981        int cur_pages = 0;
 982        pgprot_t pud_pgprot;
 983
 984        end = start + (cpa->numpages << PAGE_SHIFT);
 985
 986        /*
 987         * Not on a Gb page boundary? => map everything up to it with
 988         * smaller pages.
 989         */
 990        if (start & (PUD_SIZE - 1)) {
 991                unsigned long pre_end;
 992                unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 993
 994                pre_end   = min_t(unsigned long, end, next_page);
 995                cur_pages = (pre_end - start) >> PAGE_SHIFT;
 996                cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
 997
 998                pud = pud_offset(pgd, start);
 999
1000                /*
1001                 * Need a PMD page?
1002                 */
1003                if (pud_none(*pud))
1004                        if (alloc_pmd_page(pud))
1005                                return -1;
1006
1007                cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
1008                                         pud, pgprot);
1009                if (cur_pages < 0)
1010                        return cur_pages;
1011
1012                start = pre_end;
1013        }
1014
1015        /* We mapped them all? */
1016        if (cpa->numpages == cur_pages)
1017                return cur_pages;
1018
1019        pud = pud_offset(pgd, start);
1020        pud_pgprot = pgprot_4k_2_large(pgprot);
1021
1022        /*
1023         * Map everything starting from the Gb boundary, possibly with 1G pages
1024         */
1025        while (end - start >= PUD_SIZE) {
1026                set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
1027                                   massage_pgprot(pud_pgprot)));
1028
1029                start     += PUD_SIZE;
1030                cpa->pfn  += PUD_SIZE;
1031                cur_pages += PUD_SIZE >> PAGE_SHIFT;
1032                pud++;
1033        }
1034
1035        /* Map trailing leftover */
1036        if (start < end) {
1037                int tmp;
1038
1039                pud = pud_offset(pgd, start);
1040                if (pud_none(*pud))
1041                        if (alloc_pmd_page(pud))
1042                                return -1;
1043
1044                tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1045                                   pud, pgprot);
1046                if (tmp < 0)
1047                        return cur_pages;
1048
1049                cur_pages += tmp;
1050        }
1051        return cur_pages;
1052}
1053
1054/*
1055 * Restrictions for kernel page table do not necessarily apply when mapping in
1056 * an alternate PGD.
1057 */
1058static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1059{
1060        pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1061        pud_t *pud = NULL;      /* shut up gcc */
1062        pgd_t *pgd_entry;
1063        int ret;
1064
1065        pgd_entry = cpa->pgd + pgd_index(addr);
1066
1067        /*
1068         * Allocate a PUD page and hand it down for mapping.
1069         */
1070        if (pgd_none(*pgd_entry)) {
1071                pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1072                if (!pud)
1073                        return -1;
1074
1075                set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1076        }
1077
1078        pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1079        pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
1080
1081        ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1082        if (ret < 0) {
1083                unmap_pgd_range(cpa->pgd, addr,
1084                                addr + (cpa->numpages << PAGE_SHIFT));
1085                return ret;
1086        }
1087
1088        cpa->numpages = ret;
1089        return 0;
1090}
1091
1092static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1093                               int primary)
1094{
1095        if (cpa->pgd)
1096                return populate_pgd(cpa, vaddr);
1097
1098        /*
1099         * Ignore all non primary paths.
1100         */
1101        if (!primary)
1102                return 0;
1103
1104        /*
1105         * Ignore the NULL PTE for kernel identity mapping, as it is expected
1106         * to have holes.
1107         * Also set numpages to '1' indicating that we processed cpa req for
1108         * one virtual address page and its pfn. TBD: numpages can be set based
1109         * on the initial value and the level returned by lookup_address().
1110         */
1111        if (within(vaddr, PAGE_OFFSET,
1112                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1113                cpa->numpages = 1;
1114                cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1115                return 0;
1116        } else {
1117                WARN(1, KERN_WARNING "CPA: called for zero pte. "
1118                        "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1119                        *cpa->vaddr);
1120
1121                return -EFAULT;
1122        }
1123}
1124
1125static int __change_page_attr(struct cpa_data *cpa, int primary)
1126{
1127        unsigned long address;
1128        int do_split, err;
1129        unsigned int level;
1130        pte_t *kpte, old_pte;
1131
1132        if (cpa->flags & CPA_PAGES_ARRAY) {
1133                struct page *page = cpa->pages[cpa->curpage];
1134                if (unlikely(PageHighMem(page)))
1135                        return 0;
1136                address = (unsigned long)page_address(page);
1137        } else if (cpa->flags & CPA_ARRAY)
1138                address = cpa->vaddr[cpa->curpage];
1139        else
1140                address = *cpa->vaddr;
1141repeat:
1142        kpte = _lookup_address_cpa(cpa, address, &level);
1143        if (!kpte)
1144                return __cpa_process_fault(cpa, address, primary);
1145
1146        old_pte = *kpte;
1147        if (!pte_val(old_pte))
1148                return __cpa_process_fault(cpa, address, primary);
1149
1150        if (level == PG_LEVEL_4K) {
1151                pte_t new_pte;
1152                pgprot_t new_prot = pte_pgprot(old_pte);
1153                unsigned long pfn = pte_pfn(old_pte);
1154
1155                pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1156                pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1157
1158                new_prot = static_protections(new_prot, address, pfn);
1159
1160                /*
1161                 * Set the GLOBAL flags only if the PRESENT flag is
1162                 * set otherwise pte_present will return true even on
1163                 * a non present pte. The canon_pgprot will clear
1164                 * _PAGE_GLOBAL for the ancient hardware that doesn't
1165                 * support it.
1166                 */
1167                if (pgprot_val(new_prot) & _PAGE_PRESENT)
1168                        pgprot_val(new_prot) |= _PAGE_GLOBAL;
1169                else
1170                        pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
1171
1172                /*
1173                 * We need to keep the pfn from the existing PTE,
1174                 * after all we're only going to change its attributes,
1175                 * not the memory it points to.
1176                 */
1177                new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
1178                cpa->pfn = pfn;
1179                /*
1180                 * Do we really change anything ?
1181                 */
1182                if (pte_val(old_pte) != pte_val(new_pte)) {
1183                        set_pte_atomic(kpte, new_pte);
1184                        cpa->flags |= CPA_FLUSHTLB;
1185                }
1186                cpa->numpages = 1;
1187                return 0;
1188        }
1189
1190        /*
1191         * Check, whether we can keep the large page intact
1192         * and just change the pte:
1193         */
1194        do_split = try_preserve_large_page(kpte, address, cpa);
1195        /*
1196         * When the range fits into the existing large page,
1197         * return. cpa->numpages and cpa->flags have been updated in
1198         * try_preserve_large_page():
1199         */
1200        if (do_split <= 0)
1201                return do_split;
1202
1203        /*
1204         * We have to split the large page:
1205         */
1206        err = split_large_page(cpa, kpte, address);
1207        if (!err) {
1208                /*
1209                 * Do a global TLB flush after splitting the large page
1210                 * and before we do the actual page attribute change in the PTE.
1211                 *
1212                 * Without this, we violate the TLB application note, which says
1213                 * "The TLBs may contain both ordinary and large-page
1214                 *  translations for a 4-KByte range of linear addresses. This
1215                 *  may occur if software modifies the paging structures so that
1216                 *  the page size used for the address range changes. If the two
1217                 *  translations differ with respect to page frame or attributes
1218                 *  (e.g., permissions), processor behavior is undefined and may
1219                 *  be implementation-specific."
1220                 *
1221                 * We do this global tlb flush inside the cpa_lock, so that we
1222                 * don't allow any other CPU with stale TLB entries to change
1223                 * page attributes in parallel for an address that falls into
1224                 * the just-split large page entry.
1225                 */
1226                flush_tlb_all();
1227                goto repeat;
1228        }
1229
1230        return err;
1231}
1232
1233static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
1234
1235static int cpa_process_alias(struct cpa_data *cpa)
1236{
1237        struct cpa_data alias_cpa;
1238        unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1239        unsigned long vaddr;
1240        int ret;
1241
1242        if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1243                return 0;
1244
1245        /*
1246         * No need to redo, when the primary call touched the direct
1247         * mapping already:
1248         */
1249        if (cpa->flags & CPA_PAGES_ARRAY) {
1250                struct page *page = cpa->pages[cpa->curpage];
1251                if (unlikely(PageHighMem(page)))
1252                        return 0;
1253                vaddr = (unsigned long)page_address(page);
1254        } else if (cpa->flags & CPA_ARRAY)
1255                vaddr = cpa->vaddr[cpa->curpage];
1256        else
1257                vaddr = *cpa->vaddr;
1258
1259        if (!(within(vaddr, PAGE_OFFSET,
1260                    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1261
1262                alias_cpa = *cpa;
1263                alias_cpa.vaddr = &laddr;
1264                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1265
1266                ret = __change_page_attr_set_clr(&alias_cpa, 0);
1267                if (ret)
1268                        return ret;
1269        }
1270
1271#ifdef CONFIG_X86_64
1272        /*
1273         * If the primary call didn't touch the high mapping already
1274         * and the physical address is inside the kernel map, we need
1275         * to touch the high mapped kernel as well:
1276         */
1277        if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1278            within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
1279                unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1280                                               __START_KERNEL_map - phys_base;
1281                alias_cpa = *cpa;
1282                alias_cpa.vaddr = &temp_cpa_vaddr;
1283                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1284
1285                /*
1286                 * The high mapping range is imprecise, so ignore the
1287                 * return value.
1288                 */
1289                __change_page_attr_set_clr(&alias_cpa, 0);
1290        }
1291#endif
1292
1293        return 0;
1294}
1295
1296static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1297{
1298        int ret, numpages = cpa->numpages;
1299
1300        while (numpages) {
1301                /*
1302                 * Store the remaining nr of pages for the large page
1303                 * preservation check.
1304                 */
1305                cpa->numpages = numpages;
1306                /* for array changes, we can't use large page */
1307                if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1308                        cpa->numpages = 1;
1309
1310                if (!debug_pagealloc)
1311                        spin_lock(&cpa_lock);
1312                ret = __change_page_attr(cpa, checkalias);
1313                if (!debug_pagealloc)
1314                        spin_unlock(&cpa_lock);
1315                if (ret)
1316                        return ret;
1317
1318                if (checkalias) {
1319                        ret = cpa_process_alias(cpa);
1320                        if (ret)
1321                                return ret;
1322                }
1323
1324                /*
1325                 * Adjust the number of pages with the result of the
1326                 * CPA operation. Either a large page has been
1327                 * preserved or a single page update happened.
1328                 */
1329                BUG_ON(cpa->numpages > numpages);
1330                numpages -= cpa->numpages;
1331                if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
1332                        cpa->curpage++;
1333                else
1334                        *cpa->vaddr += cpa->numpages * PAGE_SIZE;
1335
1336        }
1337        return 0;
1338}
1339
1340static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1341                                    pgprot_t mask_set, pgprot_t mask_clr,
1342                                    int force_split, int in_flag,
1343                                    struct page **pages)
1344{
1345        struct cpa_data cpa;
1346        int ret, cache, checkalias;
1347        unsigned long baddr = 0;
1348
1349        memset(&cpa, 0, sizeof(cpa));
1350
1351        /*
1352         * Check if we are requested to change a feature that is not
1353         * supported:
1354         */
1355        mask_set = canon_pgprot(mask_set);
1356        mask_clr = canon_pgprot(mask_clr);
1357        if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
1358                return 0;
1359
1360        /* Ensure we are PAGE_SIZE aligned */
1361        if (in_flag & CPA_ARRAY) {
1362                int i;
1363                for (i = 0; i < numpages; i++) {
1364                        if (addr[i] & ~PAGE_MASK) {
1365                                addr[i] &= PAGE_MASK;
1366                                WARN_ON_ONCE(1);
1367                        }
1368                }
1369        } else if (!(in_flag & CPA_PAGES_ARRAY)) {
1370                /*
1371                 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
1372                 * No need to check in that case.
1373                 */
1374                if (*addr & ~PAGE_MASK) {
1375                        *addr &= PAGE_MASK;
1376                        /*
1377                         * People should not be passing in unaligned addresses:
1378                         */
1379                        WARN_ON_ONCE(1);
1380                }
1381                /*
1382                 * Save address for cache flush. *addr is modified in the call
1383                 * to __change_page_attr_set_clr() below.
1384                 */
1385                baddr = *addr;
1386        }
1387
1388        /* Must avoid aliasing mappings in the highmem code */
1389        kmap_flush_unused();
1390
1391        vm_unmap_aliases();
1392
1393        cpa.vaddr = addr;
1394        cpa.pages = pages;
1395        cpa.numpages = numpages;
1396        cpa.mask_set = mask_set;
1397        cpa.mask_clr = mask_clr;
1398        cpa.flags = 0;
1399        cpa.curpage = 0;
1400        cpa.force_split = force_split;
1401
1402        if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
1403                cpa.flags |= in_flag;
1404
1405        /* No alias checking for _NX bit modifications */
1406        checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1407
1408        ret = __change_page_attr_set_clr(&cpa, checkalias);
1409
1410        /*
1411         * Check whether we really changed something:
1412         */
1413        if (!(cpa.flags & CPA_FLUSHTLB))
1414                goto out;
1415
1416        /*
1417         * No need to flush, when we did not set any of the caching
1418         * attributes:
1419         */
1420        cache = !!pgprot2cachemode(mask_set);
1421
1422        /*
1423         * On success we use CLFLUSH, when the CPU supports it, to
1424         * avoid the WBINVD. If the CPU does not support CLFLUSH, or in
1425         * the error case, we fall back to cpa_flush_all() (which uses
1426         * WBINVD):
1427         */
1428        if (!ret && cpu_has_clflush) {
1429                if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1430                        cpa_flush_array(addr, numpages, cache,
1431                                        cpa.flags, pages);
1432                } else
1433                        cpa_flush_range(baddr, numpages, cache);
1434        } else
1435                cpa_flush_all(cache);
1436
1437out:
1438        return ret;
1439}
1440
1441static inline int change_page_attr_set(unsigned long *addr, int numpages,
1442                                       pgprot_t mask, int array)
1443{
1444        return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
1445                (array ? CPA_ARRAY : 0), NULL);
1446}
1447
1448static inline int change_page_attr_clear(unsigned long *addr, int numpages,
1449                                         pgprot_t mask, int array)
1450{
1451        return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
1452                (array ? CPA_ARRAY : 0), NULL);
1453}
1454
1455static inline int cpa_set_pages_array(struct page **pages, int numpages,
1456                                       pgprot_t mask)
1457{
1458        return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
1459                CPA_PAGES_ARRAY, pages);
1460}
1461
1462static inline int cpa_clear_pages_array(struct page **pages, int numpages,
1463                                         pgprot_t mask)
1464{
1465        return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
1466                CPA_PAGES_ARRAY, pages);
1467}
1468
1469int _set_memory_uc(unsigned long addr, int numpages)
1470{
1471        /*
1472         * for now UC MINUS. see comments in ioremap_nocache()
1473         */
1474        return change_page_attr_set(&addr, numpages,
1475                                    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1476                                    0);
1477}
1478
1479int set_memory_uc(unsigned long addr, int numpages)
1480{
1481        int ret;
1482
1483        /*
1484         * for now UC MINUS. see comments in ioremap_nocache()
1485         */
1486        ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1487                              _PAGE_CACHE_MODE_UC_MINUS, NULL);
1488        if (ret)
1489                goto out_err;
1490
1491        ret = _set_memory_uc(addr, numpages);
1492        if (ret)
1493                goto out_free;
1494
1495        return 0;
1496
1497out_free:
1498        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1499out_err:
1500        return ret;
1501}
1502EXPORT_SYMBOL(set_memory_uc);
1503
1504static int _set_memory_array(unsigned long *addr, int addrinarray,
1505                enum page_cache_mode new_type)
1506{
1507        int i, j;
1508        int ret;
1509
1510        /*
1511         * for now UC MINUS. see comments in ioremap_nocache()
1512         */
1513        for (i = 0; i < addrinarray; i++) {
1514                ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1515                                        new_type, NULL);
1516                if (ret)
1517                        goto out_free;
1518        }
1519
1520        ret = change_page_attr_set(addr, addrinarray,
1521                                   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1522                                   1);
1523
1524        if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1525                ret = change_page_attr_set_clr(addr, addrinarray,
1526                                               cachemode2pgprot(
1527                                                _PAGE_CACHE_MODE_WC),
1528                                               __pgprot(_PAGE_CACHE_MASK),
1529                                               0, CPA_ARRAY, NULL);
1530        if (ret)
1531                goto out_free;
1532
1533        return 0;
1534
1535out_free:
1536        for (j = 0; j < i; j++)
1537                free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
1538
1539        return ret;
1540}
1541
1542int set_memory_array_uc(unsigned long *addr, int addrinarray)
1543{
1544        return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1545}
1546EXPORT_SYMBOL(set_memory_array_uc);
1547
1548int set_memory_array_wc(unsigned long *addr, int addrinarray)
1549{
1550        return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
1551}
1552EXPORT_SYMBOL(set_memory_array_wc);
1553
1554int _set_memory_wc(unsigned long addr, int numpages)
1555{
1556        int ret;
1557        unsigned long addr_copy = addr;
1558
1559        ret = change_page_attr_set(&addr, numpages,
1560                                   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1561                                   0);
1562        if (!ret) {
1563                ret = change_page_attr_set_clr(&addr_copy, numpages,
1564                                               cachemode2pgprot(
1565                                                _PAGE_CACHE_MODE_WC),
1566                                               __pgprot(_PAGE_CACHE_MASK),
1567                                               0, 0, NULL);
1568        }
1569        return ret;
1570}
1571
1572int set_memory_wc(unsigned long addr, int numpages)
1573{
1574        int ret;
1575
1576        if (!pat_enabled)
1577                return set_memory_uc(addr, numpages);
1578
1579        ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1580                _PAGE_CACHE_MODE_WC, NULL);
1581        if (ret)
1582                goto out_err;
1583
1584        ret = _set_memory_wc(addr, numpages);
1585        if (ret)
1586                goto out_free;
1587
1588        return 0;
1589
1590out_free:
1591        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1592out_err:
1593        return ret;
1594}
1595EXPORT_SYMBOL(set_memory_wc);
1596
1597int _set_memory_wb(unsigned long addr, int numpages)
1598{
1599        /* WB cache mode is hard wired to all cache attribute bits being 0 */
1600        return change_page_attr_clear(&addr, numpages,
1601                                      __pgprot(_PAGE_CACHE_MASK), 0);
1602}
1603
1604int set_memory_wb(unsigned long addr, int numpages)
1605{
1606        int ret;
1607
1608        ret = _set_memory_wb(addr, numpages);
1609        if (ret)
1610                return ret;
1611
1612        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1613        return 0;
1614}
1615EXPORT_SYMBOL(set_memory_wb);
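
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * switches pages from the kernel direct mapping to write-combining with
 * set_memory_wc() and restores write-back with set_memory_wb() before
 * freeing them.  The helper name example_wc_roundtrip() is hypothetical.
 */
static int __maybe_unused example_wc_roundtrip(void)
{
        unsigned long buf;
        int ret;

        /* Four pages (order 2) from the kernel direct mapping. */
        buf = __get_free_pages(GFP_KERNEL, 2);
        if (!buf)
                return -ENOMEM;

        /* Reserve the memtype and switch the mapping to WC. */
        ret = set_memory_wc(buf, 4);
        if (ret) {
                free_pages(buf, 2);
                return ret;
        }

        /* ... stream data to a device through the WC mapping ... */

        /* Restore WB and release the memtype reservation before freeing. */
        set_memory_wb(buf, 4);
        free_pages(buf, 2);
        return 0;
}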
1616
1617int set_memory_array_wb(unsigned long *addr, int addrinarray)
1618{
1619        int i;
1620        int ret;
1621
1622        /* WB cache mode is hard wired to all cache attribute bits being 0 */
1623        ret = change_page_attr_clear(addr, addrinarray,
1624                                      __pgprot(_PAGE_CACHE_MASK), 1);
1625        if (ret)
1626                return ret;
1627
1628        for (i = 0; i < addrinarray; i++)
1629                free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
1630
1631        return 0;
1632}
1633EXPORT_SYMBOL(set_memory_array_wb);
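
/*
 * Illustrative sketch (not part of the original file): the *_array variants
 * take an array of page-sized linear addresses and change them all in a
 * single CPA call.  The helper name example_array_wc() is hypothetical.
 */
static int __maybe_unused example_array_wc(unsigned long *addrs, int count)
{
        int ret;

        /* Reserve memtypes and switch every address to write-combining. */
        ret = set_memory_array_wc(addrs, count);
        if (ret)
                return ret;

        /* ... use the write-combined pages ... */

        /* Switch everything back to write-back in one call. */
        return set_memory_array_wb(addrs, count);
}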
1634
1635int set_memory_x(unsigned long addr, int numpages)
1636{
1637        if (!(__supported_pte_mask & _PAGE_NX))
1638                return 0;
1639
1640        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1641}
1642EXPORT_SYMBOL(set_memory_x);
1643
1644int set_memory_nx(unsigned long addr, int numpages)
1645{
1646        if (!(__supported_pte_mask & _PAGE_NX))
1647                return 0;
1648
1649        return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1650}
1651EXPORT_SYMBOL(set_memory_nx);
1652
1653int set_memory_ro(unsigned long addr, int numpages)
1654{
1655        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1656}
1657EXPORT_SYMBOL_GPL(set_memory_ro);
1658
1659int set_memory_rw(unsigned long addr, int numpages)
1660{
1661        return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1662}
1663EXPORT_SYMBOL_GPL(set_memory_rw);
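
/*
 * Illustrative sketch (not part of the original file): code that generates
 * executable text at runtime typically makes the region read-only and
 * executable once it is final, and flips it back before tearing it down.
 * The helper names are hypothetical.
 */
static int __maybe_unused example_protect_text(unsigned long addr, int numpages)
{
        int ret;

        ret = set_memory_ro(addr, numpages);
        if (ret)
                return ret;
        return set_memory_x(addr, numpages);
}

static int __maybe_unused example_unprotect_text(unsigned long addr, int numpages)
{
        int ret;

        ret = set_memory_nx(addr, numpages);
        if (ret)
                return ret;
        return set_memory_rw(addr, numpages);
}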
1664
1665int set_memory_np(unsigned long addr, int numpages)
1666{
1667        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1668}
1669
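/*
 * Force the range to be mapped with 4k PTEs by splitting any large pages
 * that cover it; the protection bits are left unchanged.
 */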
1670int set_memory_4k(unsigned long addr, int numpages)
1671{
1672        return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1673                                        __pgprot(0), 1, 0, NULL);
1674}
1675
1676int set_pages_uc(struct page *page, int numpages)
1677{
1678        unsigned long addr = (unsigned long)page_address(page);
1679
1680        return set_memory_uc(addr, numpages);
1681}
1682EXPORT_SYMBOL(set_pages_uc);
1683
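/*
 * Array variant working on struct page pointers.  Highmem pages are skipped
 * because they have no permanent kernel mapping to modify; for the rest the
 * memtype is reserved per page before the attributes are changed, mirroring
 * _set_memory_array().
 */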
1684static int _set_pages_array(struct page **pages, int addrinarray,
1685                enum page_cache_mode new_type)
1686{
1687        unsigned long start;
1688        unsigned long end;
1689        int i;
1690        int free_idx;
1691        int ret;
1692
1693        for (i = 0; i < addrinarray; i++) {
1694                if (PageHighMem(pages[i]))
1695                        continue;
1696                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1697                end = start + PAGE_SIZE;
1698                if (reserve_memtype(start, end, new_type, NULL))
1699                        goto err_out;
1700        }
1701
1702        ret = cpa_set_pages_array(pages, addrinarray,
1703                        cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
1704        if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1705                ret = change_page_attr_set_clr(NULL, addrinarray,
1706                                               cachemode2pgprot(
1707                                                _PAGE_CACHE_MODE_WC),
1708                                               __pgprot(_PAGE_CACHE_MASK),
1709                                               0, CPA_PAGES_ARRAY, pages);
1710        if (ret)
1711                goto err_out;
1712        return 0; /* Success */
1713err_out:
1714        free_idx = i;
1715        for (i = 0; i < free_idx; i++) {
1716                if (PageHighMem(pages[i]))
1717                        continue;
1718                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1719                end = start + PAGE_SIZE;
1720                free_memtype(start, end);
1721        }
1722        return -EINVAL;
1723}
1724
1725int set_pages_array_uc(struct page **pages, int addrinarray)
1726{
1727        return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1728}
1729EXPORT_SYMBOL(set_pages_array_uc);
1730
1731int set_pages_array_wc(struct page **pages, int addrinarray)
1732{
1733        return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
1734}
1735EXPORT_SYMBOL(set_pages_array_wc);
1736
1737int set_pages_wb(struct page *page, int numpages)
1738{
1739        unsigned long addr = (unsigned long)page_address(page);
1740
1741        return set_memory_wb(addr, numpages);
1742}
1743EXPORT_SYMBOL(set_pages_wb);
1744
1745int set_pages_array_wb(struct page **pages, int addrinarray)
1746{
1747        int retval;
1748        unsigned long start;
1749        unsigned long end;
1750        int i;
1751
1752        /* WB cache mode is hard wired to all cache attribute bits being 0 */
1753        retval = cpa_clear_pages_array(pages, addrinarray,
1754                        __pgprot(_PAGE_CACHE_MASK));
1755        if (retval)
1756                return retval;
1757
1758        for (i = 0; i < addrinarray; i++) {
1759                if (PageHighMem(pages[i]))
1760                        continue;
1761                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1762                end = start + PAGE_SIZE;
1763                free_memtype(start, end);
1764        }
1765
1766        return 0;
1767}
1768EXPORT_SYMBOL(set_pages_array_wb);
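
/*
 * Illustrative sketch (not part of the original file): callers holding a
 * struct page array (e.g. a scattered buffer handed to a device) can use
 * the set_pages_array_*() helpers; highmem pages are skipped internally
 * since they have no fixed kernel mapping.  The helper name is hypothetical.
 */
static int __maybe_unused example_pages_wc(struct page **pages, int count)
{
        int ret;

        /* One CPA pass marks every lowmem page in the array WC. */
        ret = set_pages_array_wc(pages, count);
        if (ret)
                return ret;

        /* ... let the device write into the pages ... */

        /* Restore the default write-back attribute. */
        return set_pages_array_wb(pages, count);
}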
1769
1770int set_pages_x(struct page *page, int numpages)
1771{
1772        unsigned long addr = (unsigned long)page_address(page);
1773
1774        return set_memory_x(addr, numpages);
1775}
1776EXPORT_SYMBOL(set_pages_x);
1777
1778int set_pages_nx(struct page *page, int numpages)
1779{
1780        unsigned long addr = (unsigned long)page_address(page);
1781
1782        return set_memory_nx(addr, numpages);
1783}
1784EXPORT_SYMBOL(set_pages_nx);
1785
1786int set_pages_ro(struct page *page, int numpages)
1787{
1788        unsigned long addr = (unsigned long)page_address(page);
1789
1790        return set_memory_ro(addr, numpages);
1791}
1792
1793int set_pages_rw(struct page *page, int numpages)
1794{
1795        unsigned long addr = (unsigned long)page_address(page);
1796
1797        return set_memory_rw(addr, numpages);
1798}
1799
1800#ifdef CONFIG_DEBUG_PAGEALLOC
1801
1802static int __set_pages_p(struct page *page, int numpages)
1803{
1804        unsigned long tempaddr = (unsigned long) page_address(page);
1805        struct cpa_data cpa = { .vaddr = &tempaddr,
1806                                .pgd = NULL,
1807                                .numpages = numpages,
1808                                .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1809                                .mask_clr = __pgprot(0),
1810                                .flags = 0};
1811
1812        /*
1813         * No alias checking is needed for setting the present flag; otherwise
1814         * we might have to break up large pages for the 64-bit kernel text
1815         * mappings (which adds complexity, especially if we want to do this
1816         * from atomic context). Let's keep it simple!
1817         */
1818        return __change_page_attr_set_clr(&cpa, 0);
1819}
1820
1821static int __set_pages_np(struct page *page, int numpages)
1822{
1823        unsigned long tempaddr = (unsigned long) page_address(page);
1824        struct cpa_data cpa = { .vaddr = &tempaddr,
1825                                .pgd = NULL,
1826                                .numpages = numpages,
1827                                .mask_set = __pgprot(0),
1828                                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1829                                .flags = 0};
1830
1831        /*
1832         * No alias checking is needed for clearing the present flag; otherwise
1833         * we might have to break up large pages for the 64-bit kernel text
1834         * mappings (which adds complexity, especially if we want to do this
1835         * from atomic context). Let's keep it simple!
1836         */
1837        return __change_page_attr_set_clr(&cpa, 0);
1838}
1839
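/*
 * DEBUG_PAGEALLOC: map pages when they are allocated and unmap them when
 * they are freed, so that stray accesses to freed memory fault immediately.
 */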
1840void __kernel_map_pages(struct page *page, int numpages, int enable)
1841{
1842        if (PageHighMem(page))
1843                return;
1844        if (!enable) {
1845                debug_check_no_locks_freed(page_address(page),
1846                                           numpages * PAGE_SIZE);
1847        }
1848
1849        /*
1850         * The return value is ignored as the calls cannot fail here:
1851         * large pages are not used for the identity mappings at boot time,
1852         * so no memory allocation is needed to split a large page.
1853         */
1854        if (enable)
1855                __set_pages_p(page, numpages);
1856        else
1857                __set_pages_np(page, numpages);
1858
1859        /*
1860         * We should send an IPI and flush all TLBs,
1861         * but that can deadlock, so flush only the current CPU's TLB:
1862         */
1863        __flush_tlb_all();
1864
1865        arch_flush_lazy_mmu_mode();
1866}
1867
1868#ifdef CONFIG_HIBERNATION
1869
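/*
 * Used by the hibernation code to check whether a page is currently mapped
 * in the kernel page tables (DEBUG_PAGEALLOC may have unmapped it).
 */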
1870bool kernel_page_present(struct page *page)
1871{
1872        unsigned int level;
1873        pte_t *pte;
1874
1875        if (PageHighMem(page))
1876                return false;
1877
1878        pte = lookup_address((unsigned long)page_address(page), &level);
1879        return (pte_val(*pte) & _PAGE_PRESENT);
1880}
1881
1882#endif /* CONFIG_HIBERNATION */
1883
1884#endif /* CONFIG_DEBUG_PAGEALLOC */
1885
1886int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1887                            unsigned numpages, unsigned long page_flags)
1888{
1889        int retval = -EINVAL;
1890
1891        struct cpa_data cpa = {
1892                .vaddr = &address,
1893                .pfn = pfn,
1894                .pgd = pgd,
1895                .numpages = numpages,
1896                .mask_set = __pgprot(0),
1897                .mask_clr = __pgprot(0),
1898                .flags = 0,
1899        };
1900
1901        if (!(__supported_pte_mask & _PAGE_NX))
1902                goto out;
1903
1904        if (!(page_flags & _PAGE_NX))
1905                cpa.mask_clr = __pgprot(_PAGE_NX);
1906
1907        cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1908
1909        retval = __change_page_attr_set_clr(&cpa, 0);
1910        __flush_tlb_all();
1911
1912out:
1913        return retval;
1914}
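
/*
 * Illustrative sketch (not part of the original file): a caller with its own
 * page-global directory (for instance a firmware runtime mapping) can map a
 * physical range into it.  Passing _PAGE_NX in page_flags keeps the mapping
 * non-executable; without it the NX bit is cleared.  The helper name and
 * parameters are hypothetical.
 */
static int __maybe_unused example_map_region(pgd_t *pgd, phys_addr_t phys,
                                             unsigned long virt, unsigned numpages)
{
        /* Present, read-write, non-executable data mapping. */
        return kernel_map_pages_in_pgd(pgd, phys >> PAGE_SHIFT, virt,
                                       numpages, _PAGE_RW | _PAGE_NX);
}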
1915
1916void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
1917                               unsigned numpages)
1918{
1919        unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
1920}
1921
1922/*
1923 * The testcases use internal knowledge of the implementation that shouldn't
1924 * be exposed to the rest of the kernel. Include these directly here.
1925 */
1926#ifdef CONFIG_CPA_DEBUG
1927#include "pageattr-test.c"
1928#endif
1929