linux/arch/powerpc/mm/hugetlbpage.c
<<
>>
Prefs
   1/*
   2 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
   3 *
   4 * Copyright (C) 2003 David Gibson, IBM Corporation.
   5 *
   6 * Based on the IA-32 version:
   7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   8 */
   9
  10#include <linux/init.h>
  11#include <linux/fs.h>
  12#include <linux/mm.h>
  13#include <linux/hugetlb.h>
  14#include <linux/pagemap.h>
  15#include <linux/slab.h>
  16#include <linux/err.h>
  17#include <linux/sysctl.h>
  18#include <asm/mman.h>
  19#include <asm/pgalloc.h>
  20#include <asm/tlb.h>
  21#include <asm/tlbflush.h>
  22#include <asm/mmu_context.h>
  23#include <asm/machdep.h>
  24#include <asm/cputable.h>
  25#include <asm/spu.h>
  26
  /* log2 of the huge page sizes referred to in this file. */
  #define PAGE_SHIFT_64K          16
  #define PAGE_SHIFT_16M          24
  #define PAGE_SHIFT_16G          34

  #define NUM_LOW_AREAS           ((1UL << 32) >> SID_SHIFT)
  #define NUM_HIGH_AREAS          (PGTABLE_RANGE >> HTLB_AREA_SHIFT)

  /* Number of slots in gpage_freearray[]. */
  #define MAX_NUMBER_GPAGES       1024

  /*
   * Addresses of the 16G pages, kept here after the device tree is scanned
   * and before the huge_boot_pages list is ready.  nr_gpages counts the
   * slots currently in use.
   */
  static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
  static unsigned int nr_gpages;
  39
  40/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
  41 * stored for the huge page sizes that are valid.
  42 */
  43unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
  44
  45#define hugepte_shift                   mmu_huge_psizes
  46#define PTRS_PER_HUGEPTE(psize)         (1 << hugepte_shift[psize])
  47#define HUGEPTE_TABLE_SIZE(psize)       (sizeof(pte_t) << hugepte_shift[psize])
  48
  49#define HUGEPD_SHIFT(psize)             (mmu_psize_to_shift(psize) \
  50                                                + hugepte_shift[psize])
  51#define HUGEPD_SIZE(psize)              (1UL << HUGEPD_SHIFT(psize))
  52#define HUGEPD_MASK(psize)              (~(HUGEPD_SIZE(psize)-1))
  53
  54/* Subtract one from array size because we don't need a cache for 4K since
  55 * is not a huge page size */
  56#define HUGE_PGTABLE_INDEX(psize)       (HUGEPTE_CACHE_NUM + psize - 1)
  57#define HUGEPTE_CACHE_NAME(psize)       (huge_pgtable_cache_name[psize])
  58
  59static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
  60        "unused_4K", "hugepte_cache_64K", "unused_64K_AP",
  61        "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
  62};
  63
  /*
   * Tag bit set in every huge PD pointer.  As a consequence pmd_bad() and
   * pud_bad() will choke on pointers to hugepte tables, which is handy for
   * catching screwups early.
   */
  #define HUGEPD_OK       0x1

  /* One huge page directory entry: a tagged pointer to a hugepte table. */
  typedef struct {
          unsigned long pd;
  } hugepd_t;

  /* True when the entry is empty (no hugepte table attached). */
  #define hugepd_none(hpd)        (!(hpd).pd)
  72
  73static inline int shift_to_mmu_psize(unsigned int shift)
  74{
  75        switch (shift) {
  76#ifndef CONFIG_PPC_64K_PAGES
  77        case PAGE_SHIFT_64K:
  78            return MMU_PAGE_64K;
  79#endif
  80        case PAGE_SHIFT_16M:
  81            return MMU_PAGE_16M;
  82        case PAGE_SHIFT_16G:
  83            return MMU_PAGE_16G;
  84        }
  85        return -1;
  86}
  87
  88static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
  89{
  90        if (mmu_psize_defs[mmu_psize].shift)
  91                return mmu_psize_defs[mmu_psize].shift;
  92        BUG();
  93}
  94
  95static inline pte_t *hugepd_page(hugepd_t hpd)
  96{
  97        BUG_ON(!(hpd.pd & HUGEPD_OK));
  98        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
  99}
 100
 101static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 102                                    struct hstate *hstate)
 103{
 104        unsigned int shift = huge_page_shift(hstate);
 105        int psize = shift_to_mmu_psize(shift);
 106        unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
 107        pte_t *dir = hugepd_page(*hpdp);
 108
 109        return dir + idx;
 110}
 111
 112static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 113                           unsigned long address, unsigned int psize)
 114{
 115        pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
 116                                      GFP_KERNEL|__GFP_REPEAT);
 117
 118        if (! new)
 119                return -ENOMEM;
 120
 121        spin_lock(&mm->page_table_lock);
 122        if (!hugepd_none(*hpdp))
 123                kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
 124        else
 125                hpdp->pd = (unsigned long)new | HUGEPD_OK;
 126        spin_unlock(&mm->page_table_lock);
 127        return 0;
 128}
 129
 130
 131static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
 132{
 133        if (huge_page_shift(hstate) < PUD_SHIFT)
 134                return pud_offset(pgd, addr);
 135        else
 136                return (pud_t *) pgd;
 137}
 138static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
 139                         struct hstate *hstate)
 140{
 141        if (huge_page_shift(hstate) < PUD_SHIFT)
 142                return pud_alloc(mm, pgd, addr);
 143        else
 144                return (pud_t *) pgd;
 145}
 146static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
 147{
 148        if (huge_page_shift(hstate) < PMD_SHIFT)
 149                return pmd_offset(pud, addr);
 150        else
 151                return (pmd_t *) pud;
 152}
 153static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
 154                         struct hstate *hstate)
 155{
 156        if (huge_page_shift(hstate) < PMD_SHIFT)
 157                return pmd_alloc(mm, pud, addr);
 158        else
 159                return (pmd_t *) pud;
 160}
 161
 162/* Build list of addresses of gigantic pages.  This function is used in early
 163 * boot before the buddy or bootmem allocator is setup.
 164 */
 165void add_gpage(unsigned long addr, unsigned long page_size,
 166        unsigned long number_of_pages)
 167{
 168        if (!addr)
 169                return;
 170        while (number_of_pages > 0) {
 171                gpage_freearray[nr_gpages] = addr;
 172                nr_gpages++;
 173                number_of_pages--;
 174                addr += page_size;
 175        }
 176}
 177
 178/* Moves the gigantic page addresses from the temporary list to the
 179 * huge_boot_pages list.
 180 */
 181int alloc_bootmem_huge_page(struct hstate *hstate)
 182{
 183        struct huge_bootmem_page *m;
 184        if (nr_gpages == 0)
 185                return 0;
 186        m = phys_to_virt(gpage_freearray[--nr_gpages]);
 187        gpage_freearray[nr_gpages] = 0;
 188        list_add(&m->list, &huge_boot_pages);
 189        m->hstate = hstate;
 190        return 1;
 191}
 192
 193
 194/* Modelled after find_linux_pte() */
 195pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 196{
 197        pgd_t *pg;
 198        pud_t *pu;
 199        pmd_t *pm;
 200
 201        unsigned int psize;
 202        unsigned int shift;
 203        unsigned long sz;
 204        struct hstate *hstate;
 205        psize = get_slice_psize(mm, addr);
 206        shift = mmu_psize_to_shift(psize);
 207        sz = ((1UL) << shift);
 208        hstate = size_to_hstate(sz);
 209
 210        addr &= hstate->mask;
 211
 212        pg = pgd_offset(mm, addr);
 213        if (!pgd_none(*pg)) {
 214                pu = hpud_offset(pg, addr, hstate);
 215                if (!pud_none(*pu)) {
 216                        pm = hpmd_offset(pu, addr, hstate);
 217                        if (!pmd_none(*pm))
 218                                return hugepte_offset((hugepd_t *)pm, addr,
 219                                                      hstate);
 220                }
 221        }
 222
 223        return NULL;
 224}
 225
 226pte_t *huge_pte_alloc(struct mm_struct *mm,
 227                        unsigned long addr, unsigned long sz)
 228{
 229        pgd_t *pg;
 230        pud_t *pu;
 231        pmd_t *pm;
 232        hugepd_t *hpdp = NULL;
 233        struct hstate *hstate;
 234        unsigned int psize;
 235        hstate = size_to_hstate(sz);
 236
 237        psize = get_slice_psize(mm, addr);
 238        BUG_ON(!mmu_huge_psizes[psize]);
 239
 240        addr &= hstate->mask;
 241
 242        pg = pgd_offset(mm, addr);
 243        pu = hpud_alloc(mm, pg, addr, hstate);
 244
 245        if (pu) {
 246                pm = hpmd_alloc(mm, pu, addr, hstate);
 247                if (pm)
 248                        hpdp = (hugepd_t *)pm;
 249        }
 250
 251        if (! hpdp)
 252                return NULL;
 253
 254        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
 255                return NULL;
 256
 257        return hugepte_offset(hpdp, addr, hstate);
 258}
 259
 260int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 261{
 262        return 0;
 263}
 264
 265static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
 266                               unsigned int psize)
 267{
 268        pte_t *hugepte = hugepd_page(*hpdp);
 269
 270        hpdp->pd = 0;
 271        tlb->need_flush = 1;
 272        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
 273                                                 HUGEPTE_CACHE_NUM+psize-1,
 274                                                 PGF_CACHENUM_MASK));
 275}
 276
 277static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 278                                   unsigned long addr, unsigned long end,
 279                                   unsigned long floor, unsigned long ceiling,
 280                                   unsigned int psize)
 281{
 282        pmd_t *pmd;
 283        unsigned long next;
 284        unsigned long start;
 285
 286        start = addr;
 287        pmd = pmd_offset(pud, addr);
 288        do {
 289                next = pmd_addr_end(addr, end);
 290                if (pmd_none(*pmd))
 291                        continue;
 292                free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
 293        } while (pmd++, addr = next, addr != end);
 294
 295        start &= PUD_MASK;
 296        if (start < floor)
 297                return;
 298        if (ceiling) {
 299                ceiling &= PUD_MASK;
 300                if (!ceiling)
 301                        return;
 302        }
 303        if (end - 1 > ceiling - 1)
 304                return;
 305
 306        pmd = pmd_offset(pud, start);
 307        pud_clear(pud);
 308        pmd_free_tlb(tlb, pmd);
 309}
 310
 311static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 312                                   unsigned long addr, unsigned long end,
 313                                   unsigned long floor, unsigned long ceiling)
 314{
 315        pud_t *pud;
 316        unsigned long next;
 317        unsigned long start;
 318        unsigned int shift;
 319        unsigned int psize = get_slice_psize(tlb->mm, addr);
 320        shift = mmu_psize_to_shift(psize);
 321
 322        start = addr;
 323        pud = pud_offset(pgd, addr);
 324        do {
 325                next = pud_addr_end(addr, end);
 326                if (shift < PMD_SHIFT) {
 327                        if (pud_none_or_clear_bad(pud))
 328                                continue;
 329                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
 330                                               ceiling, psize);
 331                } else {
 332                        if (pud_none(*pud))
 333                                continue;
 334                        free_hugepte_range(tlb, (hugepd_t *)pud, psize);
 335                }
 336        } while (pud++, addr = next, addr != end);
 337
 338        start &= PGDIR_MASK;
 339        if (start < floor)
 340                return;
 341        if (ceiling) {
 342                ceiling &= PGDIR_MASK;
 343                if (!ceiling)
 344                        return;
 345        }
 346        if (end - 1 > ceiling - 1)
 347                return;
 348
 349        pud = pud_offset(pgd, start);
 350        pgd_clear(pgd);
 351        pud_free_tlb(tlb, pud);
 352}
 353
 354/*
 355 * This function frees user-level page tables of a process.
 356 *
 357 * Must be called with pagetable lock held.
 358 */
 359void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 360                            unsigned long addr, unsigned long end,
 361                            unsigned long floor, unsigned long ceiling)
 362{
 363        pgd_t *pgd;
 364        unsigned long next;
 365        unsigned long start;
 366
 367        /*
 368         * Comments below take from the normal free_pgd_range().  They
 369         * apply here too.  The tests against HUGEPD_MASK below are
 370         * essential, because we *don't* test for this at the bottom
 371         * level.  Without them we'll attempt to free a hugepte table
 372         * when we unmap just part of it, even if there are other
 373         * active mappings using it.
 374         *
 375         * The next few lines have given us lots of grief...
 376         *
 377         * Why are we testing HUGEPD* at this top level?  Because
 378         * often there will be no work to do at all, and we'd prefer
 379         * not to go all the way down to the bottom just to discover
 380         * that.
 381         *
 382         * Why all these "- 1"s?  Because 0 represents both the bottom
 383         * of the address space and the top of it (using -1 for the
 384         * top wouldn't help much: the masks would do the wrong thing).
 385         * The rule is that addr 0 and floor 0 refer to the bottom of
 386         * the address space, but end 0 and ceiling 0 refer to the top
 387         * Comparisons need to use "end - 1" and "ceiling - 1" (though
 388         * that end 0 case should be mythical).
 389         *
 390         * Wherever addr is brought up or ceiling brought down, we
 391         * must be careful to reject "the opposite 0" before it
 392         * confuses the subsequent tests.  But what about where end is
 393         * brought down by HUGEPD_SIZE below? no, end can't go down to
 394         * 0 there.
 395         *
 396         * Whereas we round start (addr) and ceiling down, by different
 397         * masks at different levels, in order to test whether a table
 398         * now has no other vmas using it, so can be freed, we don't
 399         * bother to round floor or end up - the tests don't need that.
 400         */
 401        unsigned int psize = get_slice_psize(tlb->mm, addr);
 402
 403        addr &= HUGEPD_MASK(psize);
 404        if (addr < floor) {
 405                addr += HUGEPD_SIZE(psize);
 406                if (!addr)
 407                        return;
 408        }
 409        if (ceiling) {
 410                ceiling &= HUGEPD_MASK(psize);
 411                if (!ceiling)
 412                        return;
 413        }
 414        if (end - 1 > ceiling - 1)
 415                end -= HUGEPD_SIZE(psize);
 416        if (addr > end - 1)
 417                return;
 418
 419        start = addr;
 420        pgd = pgd_offset(tlb->mm, addr);
 421        do {
 422                psize = get_slice_psize(tlb->mm, addr);
 423                BUG_ON(!mmu_huge_psizes[psize]);
 424                next = pgd_addr_end(addr, end);
 425                if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
 426                        if (pgd_none_or_clear_bad(pgd))
 427                                continue;
 428                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 429                } else {
 430                        if (pgd_none(*pgd))
 431                                continue;
 432                        free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
 433                }
 434        } while (pgd++, addr = next, addr != end);
 435}
 436
 437void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 438                     pte_t *ptep, pte_t pte)
 439{
 440        if (pte_present(*ptep)) {
 441                /* We open-code pte_clear because we need to pass the right
 442                 * argument to hpte_need_flush (huge / !huge). Might not be
 443                 * necessary anymore if we make hpte_need_flush() get the
 444                 * page size from the slices
 445                 */
 446                unsigned int psize = get_slice_psize(mm, addr);
 447                unsigned int shift = mmu_psize_to_shift(psize);
 448                unsigned long sz = ((1UL) << shift);
 449                struct hstate *hstate = size_to_hstate(sz);
 450                pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
 451        }
 452        *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 453}
 454
 455pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 456                              pte_t *ptep)
 457{
 458        unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 459        return __pte(old);
 460}
 461
 462struct page *
 463follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 464{
 465        pte_t *ptep;
 466        struct page *page;
 467        unsigned int mmu_psize = get_slice_psize(mm, address);
 468
 469        /* Verify it is a huge page else bail. */
 470        if (!mmu_huge_psizes[mmu_psize])
 471                return ERR_PTR(-EINVAL);
 472
 473        ptep = huge_pte_offset(mm, address);
 474        page = pte_page(*ptep);
 475        if (page) {
 476                unsigned int shift = mmu_psize_to_shift(mmu_psize);
 477                unsigned long sz = ((1UL) << shift);
 478                page += (address % sz) / PAGE_SIZE;
 479        }
 480
 481        return page;
 482}
 483
 484int pmd_huge(pmd_t pmd)
 485{
 486        return 0;
 487}
 488
 489int pud_huge(pud_t pud)
 490{
 491        return 0;
 492}
 493
 494struct page *
 495follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 496                pmd_t *pmd, int write)
 497{
 498        BUG();
 499        return NULL;
 500}
 501
 502
 503unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 504                                        unsigned long len, unsigned long pgoff,
 505                                        unsigned long flags)
 506{
 507        struct hstate *hstate = hstate_file(file);
 508        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 509
 510        if (!mmu_huge_psizes[mmu_psize])
 511                return -EINVAL;
 512        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 513}
 514
 515unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 516{
 517        unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 518
 519        return 1UL << mmu_psize_to_shift(psize);
 520}
 521
 522/*
 523 * Called by asm hashtable.S for doing lazy icache flush
 524 */
 525static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 526                                        pte_t pte, int trap, unsigned long sz)
 527{
 528        struct page *page;
 529        int i;
 530
 531        if (!pfn_valid(pte_pfn(pte)))
 532                return rflags;
 533
 534        page = pte_page(pte);
 535
 536        /* page is dirty */
 537        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 538                if (trap == 0x400) {
 539                        for (i = 0; i < (sz / PAGE_SIZE); i++)
 540                                __flush_dcache_icache(page_address(page+i));
 541                        set_bit(PG_arch_1, &page->flags);
 542                } else {
 543                        rflags |= HPTE_R_N;
 544                }
 545        }
 546        return rflags;
 547}
 548
 549int hash_huge_page(struct mm_struct *mm, unsigned long access,
 550                   unsigned long ea, unsigned long vsid, int local,
 551                   unsigned long trap)
 552{
 553        pte_t *ptep;
 554        unsigned long old_pte, new_pte;
 555        unsigned long va, rflags, pa, sz;
 556        long slot;
 557        int err = 1;
 558        int ssize = user_segment_size(ea);
 559        unsigned int mmu_psize;
 560        int shift;
 561        mmu_psize = get_slice_psize(mm, ea);
 562
 563        if (!mmu_huge_psizes[mmu_psize])
 564                goto out;
 565        ptep = huge_pte_offset(mm, ea);
 566
 567        /* Search the Linux page table for a match with va */
 568        va = hpt_va(ea, vsid, ssize);
 569
 570        /*
 571         * If no pte found or not present, send the problem up to
 572         * do_page_fault
 573         */
 574        if (unlikely(!ptep || pte_none(*ptep)))
 575                goto out;
 576
 577        /* 
 578         * Check the user's access rights to the page.  If access should be
 579         * prevented then send the problem up to do_page_fault.
 580         */
 581        if (unlikely(access & ~pte_val(*ptep)))
 582                goto out;
 583        /*
 584         * At this point, we have a pte (old_pte) which can be used to build
 585         * or update an HPTE. There are 2 cases:
 586         *
 587         * 1. There is a valid (present) pte with no associated HPTE (this is 
 588         *      the most common case)
 589         * 2. There is a valid (present) pte with an associated HPTE. The
 590         *      current values of the pp bits in the HPTE prevent access
 591         *      because we are doing software DIRTY bit management and the
 592         *      page is currently not DIRTY. 
 593         */
 594
 595
 596        do {
 597                old_pte = pte_val(*ptep);
 598                if (old_pte & _PAGE_BUSY)
 599                        goto out;
 600                new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
 601        } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
 602                                         old_pte, new_pte));
 603
 604        rflags = 0x2 | (!(new_pte & _PAGE_RW));
 605        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 606        rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 607        shift = mmu_psize_to_shift(mmu_psize);
 608        sz = ((1UL) << shift);
 609        if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 610                /* No CPU has hugepages but lacks no execute, so we
 611                 * don't need to worry about that case */
 612                rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
 613                                                       trap, sz);
 614
 615        /* Check if pte already has an hpte (case 2) */
 616        if (unlikely(old_pte & _PAGE_HASHPTE)) {
 617                /* There MIGHT be an HPTE for this pte */
 618                unsigned long hash, slot;
 619
 620                hash = hpt_hash(va, shift, ssize);
 621                if (old_pte & _PAGE_F_SECOND)
 622                        hash = ~hash;
 623                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 624                slot += (old_pte & _PAGE_F_GIX) >> 12;
 625
 626                if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
 627                                         ssize, local) == -1)
 628                        old_pte &= ~_PAGE_HPTEFLAGS;
 629        }
 630
 631        if (likely(!(old_pte & _PAGE_HASHPTE))) {
 632                unsigned long hash = hpt_hash(va, shift, ssize);
 633                unsigned long hpte_group;
 634
 635                pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 636
 637repeat:
 638                hpte_group = ((hash & htab_hash_mask) *
 639                              HPTES_PER_GROUP) & ~0x7UL;
 640
 641                /* clear HPTE slot informations in new PTE */
 642#ifdef CONFIG_PPC_64K_PAGES
 643                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
 644#else
 645                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 646#endif
 647                /* Add in WIMG bits */
 648                rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 649                                      _PAGE_COHERENT | _PAGE_GUARDED));
 650
 651                /* Insert into the hash table, primary slot */
 652                slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
 653                                          mmu_psize, ssize);
 654
 655                /* Primary is full, try the secondary */
 656                if (unlikely(slot == -1)) {
 657                        hpte_group = ((~hash & htab_hash_mask) *
 658                                      HPTES_PER_GROUP) & ~0x7UL; 
 659                        slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 660                                                  HPTE_V_SECONDARY,
 661                                                  mmu_psize, ssize);
 662                        if (slot == -1) {
 663                                if (mftb() & 0x1)
 664                                        hpte_group = ((hash & htab_hash_mask) *
 665                                                      HPTES_PER_GROUP)&~0x7UL;
 666
 667                                ppc_md.hpte_remove(hpte_group);
 668                                goto repeat;
 669                        }
 670                }
 671
 672                if (unlikely(slot == -2))
 673                        panic("hash_huge_page: pte_insert failed\n");
 674
 675                new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
 676        }
 677
 678        /*
 679         * No need to use ldarx/stdcx here
 680         */
 681        *ptep = __pte(new_pte & ~_PAGE_BUSY);
 682
 683        err = 0;
 684
 685 out:
 686        return err;
 687}
 688
 689static void __init set_huge_psize(int psize)
 690{
 691        /* Check that it is a page size supported by the hardware and
 692         * that it fits within pagetable limits. */
 693        if (mmu_psize_defs[psize].shift &&
 694                mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
 695                (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
 696                 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
 697                 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
 698                /* Return if huge page size has already been setup or is the
 699                 * same as the base page size. */
 700                if (mmu_huge_psizes[psize] ||
 701                   mmu_psize_defs[psize].shift == PAGE_SHIFT)
 702                        return;
 703                hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
 704
 705                switch (mmu_psize_defs[psize].shift) {
 706                case PAGE_SHIFT_64K:
 707                    /* We only allow 64k hpages with 4k base page,
 708                     * which was checked above, and always put them
 709                     * at the PMD */
 710                    hugepte_shift[psize] = PMD_SHIFT;
 711                    break;
 712                case PAGE_SHIFT_16M:
 713                    /* 16M pages can be at two different levels
 714                     * of pagestables based on base page size */
 715                    if (PAGE_SHIFT == PAGE_SHIFT_64K)
 716                            hugepte_shift[psize] = PMD_SHIFT;
 717                    else /* 4k base page */
 718                            hugepte_shift[psize] = PUD_SHIFT;
 719                    break;
 720                case PAGE_SHIFT_16G:
 721                    /* 16G pages are always at PGD level */
 722                    hugepte_shift[psize] = PGDIR_SHIFT;
 723                    break;
 724                }
 725                hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
 726        } else
 727                hugepte_shift[psize] = 0;
 728}
 729
 730static int __init hugepage_setup_sz(char *str)
 731{
 732        unsigned long long size;
 733        int mmu_psize;
 734        int shift;
 735
 736        size = memparse(str, &str);
 737
 738        shift = __ffs(size);
 739        mmu_psize = shift_to_mmu_psize(shift);
 740        if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
 741                set_huge_psize(mmu_psize);
 742        else
 743                printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 744
 745        return 1;
 746}
 747__setup("hugepagesz=", hugepage_setup_sz);
 748
 749static int __init hugetlbpage_init(void)
 750{
 751        unsigned int psize;
 752
 753        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 754                return -ENODEV;
 755
 756        /* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
 757         * and adjust PTE_NONCACHE_NUM if the number of supported huge page
 758         * sizes changes.
 759         */
 760        set_huge_psize(MMU_PAGE_16M);
 761        set_huge_psize(MMU_PAGE_16G);
 762
 763        /* Temporarily disable support for 64K huge pages when 64K SPU local
 764         * store support is enabled as the current implementation conflicts.
 765         */
 766#ifndef CONFIG_SPU_FS_64K_LS
 767        set_huge_psize(MMU_PAGE_64K);
 768#endif
 769
 770        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 771                if (mmu_huge_psizes[psize]) {
 772                        pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
 773                                kmem_cache_create(
 774                                        HUGEPTE_CACHE_NAME(psize),
 775                                        HUGEPTE_TABLE_SIZE(psize),
 776                                        HUGEPTE_TABLE_SIZE(psize),
 777                                        0,
 778                                        NULL);
 779                        if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
 780                                panic("hugetlbpage_init(): could not create %s"\
 781                                      "\n", HUGEPTE_CACHE_NAME(psize));
 782                }
 783        }
 784
 785        return 0;
 786}
 787
 788module_init(hugetlbpage_init);
 789
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.