linux/mm/hugetlb.c
<<
>>
Prefs
   1/*
   2 * Generic hugetlb support.
   3 * (C) William Irwin, April 2004
   4 */
   5#include <linux/gfp.h>
   6#include <linux/list.h>
   7#include <linux/init.h>
   8#include <linux/module.h>
   9#include <linux/mm.h>
  10#include <linux/sysctl.h>
  11#include <linux/highmem.h>
  12#include <linux/mmu_notifier.h>
  13#include <linux/nodemask.h>
  14#include <linux/pagemap.h>
  15#include <linux/mempolicy.h>
  16#include <linux/cpuset.h>
  17#include <linux/mutex.h>
  18#include <linux/bootmem.h>
  19#include <linux/sysfs.h>
  20
  21#include <asm/page.h>
  22#include <asm/pgtable.h>
  23#include <asm/io.h>
  24
  25#include <linux/hugetlb.h>
  26#include "internal.h"
  27
  28const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
  29static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
  30unsigned long hugepages_treat_as_movable;
  31
  32static int max_hstate;
  33unsigned int default_hstate_idx;
  34struct hstate hstates[HUGE_MAX_HSTATE];
  35
  36__initdata LIST_HEAD(huge_boot_pages);
  37
  38/* for command line parsing */
  39static struct hstate * __initdata parsed_hstate;
  40static unsigned long __initdata default_hstate_max_huge_pages;
  41static unsigned long __initdata default_hstate_size;
  42
  43#define for_each_hstate(h) \
  44        for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
  45
  46/*
  47 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  48 */
  49static DEFINE_SPINLOCK(hugetlb_lock);
  50
  51/*
  52 * Region tracking -- allows tracking of reservations and instantiated pages
  53 *                    across the pages in a mapping.
  54 *
  55 * The region data structures are protected by a combination of the mmap_sem
  56 * and the hugetlb_instantion_mutex.  To access or modify a region the caller
  57 * must either hold the mmap_sem for write, or the mmap_sem for read and
  58 * the hugetlb_instantiation mutex:
  59 *
  60 *      down_write(&mm->mmap_sem);
  61 * or
  62 *      down_read(&mm->mmap_sem);
  63 *      mutex_lock(&hugetlb_instantiation_mutex);
  64 */
  65struct file_region {
  66        struct list_head link;
  67        long from;
  68        long to;
  69};
  70
  71static long region_add(struct list_head *head, long f, long t)
  72{
  73        struct file_region *rg, *nrg, *trg;
  74
  75        /* Locate the region we are either in or before. */
  76        list_for_each_entry(rg, head, link)
  77                if (f <= rg->to)
  78                        break;
  79
  80        /* Round our left edge to the current segment if it encloses us. */
  81        if (f > rg->from)
  82                f = rg->from;
  83
  84        /* Check for and consume any regions we now overlap with. */
  85        nrg = rg;
  86        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
  87                if (&rg->link == head)
  88                        break;
  89                if (rg->from > t)
  90                        break;
  91
  92                /* If this area reaches higher then extend our area to
  93                 * include it completely.  If this is not the first area
  94                 * which we intend to reuse, free it. */
  95                if (rg->to > t)
  96                        t = rg->to;
  97                if (rg != nrg) {
  98                        list_del(&rg->link);
  99                        kfree(rg);
 100                }
 101        }
 102        nrg->from = f;
 103        nrg->to = t;
 104        return 0;
 105}
 106
 107static long region_chg(struct list_head *head, long f, long t)
 108{
 109        struct file_region *rg, *nrg;
 110        long chg = 0;
 111
 112        /* Locate the region we are before or in. */
 113        list_for_each_entry(rg, head, link)
 114                if (f <= rg->to)
 115                        break;
 116
 117        /* If we are below the current region then a new region is required.
 118         * Subtle, allocate a new region at the position but make it zero
 119         * size such that we can guarantee to record the reservation. */
 120        if (&rg->link == head || t < rg->from) {
 121                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
 122                if (!nrg)
 123                        return -ENOMEM;
 124                nrg->from = f;
 125                nrg->to   = f;
 126                INIT_LIST_HEAD(&nrg->link);
 127                list_add(&nrg->link, rg->link.prev);
 128
 129                return t - f;
 130        }
 131
 132        /* Round our left edge to the current segment if it encloses us. */
 133        if (f > rg->from)
 134                f = rg->from;
 135        chg = t - f;
 136
 137        /* Check for and consume any regions we now overlap with. */
 138        list_for_each_entry(rg, rg->link.prev, link) {
 139                if (&rg->link == head)
 140                        break;
 141                if (rg->from > t)
 142                        return chg;
 143
 144                /* We overlap with this area, if it extends futher than
 145                 * us then we must extend ourselves.  Account for its
 146                 * existing reservation. */
 147                if (rg->to > t) {
 148                        chg += rg->to - t;
 149                        t = rg->to;
 150                }
 151                chg -= rg->to - rg->from;
 152        }
 153        return chg;
 154}
 155
 156static long region_truncate(struct list_head *head, long end)
 157{
 158        struct file_region *rg, *trg;
 159        long chg = 0;
 160
 161        /* Locate the region we are either in or before. */
 162        list_for_each_entry(rg, head, link)
 163                if (end <= rg->to)
 164                        break;
 165        if (&rg->link == head)
 166                return 0;
 167
 168        /* If we are in the middle of a region then adjust it. */
 169        if (end > rg->from) {
 170                chg = rg->to - end;
 171                rg->to = end;
 172                rg = list_entry(rg->link.next, typeof(*rg), link);
 173        }
 174
 175        /* Drop any remaining regions. */
 176        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
 177                if (&rg->link == head)
 178                        break;
 179                chg += rg->to - rg->from;
 180                list_del(&rg->link);
 181                kfree(rg);
 182        }
 183        return chg;
 184}
 185
 186static long region_count(struct list_head *head, long f, long t)
 187{
 188        struct file_region *rg;
 189        long chg = 0;
 190
 191        /* Locate each segment we overlap with, and count that overlap. */
 192        list_for_each_entry(rg, head, link) {
 193                int seg_from;
 194                int seg_to;
 195
 196                if (rg->to <= f)
 197                        continue;
 198                if (rg->from >= t)
 199                        break;
 200
 201                seg_from = max(rg->from, f);
 202                seg_to = min(rg->to, t);
 203
 204                chg += seg_to - seg_from;
 205        }
 206
 207        return chg;
 208}
 209
 210/*
 211 * Convert the address within this vma to the page offset within
 212 * the mapping, in pagecache page units; huge pages here.
 213 */
 214static pgoff_t vma_hugecache_offset(struct hstate *h,
 215                        struct vm_area_struct *vma, unsigned long address)
 216{
 217        return ((address - vma->vm_start) >> huge_page_shift(h)) +
 218                        (vma->vm_pgoff >> huge_page_order(h));
 219}
 220
 221/*
 222 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 223 * bits of the reservation map pointer, which are always clear due to
 224 * alignment.
 225 */
 226#define HPAGE_RESV_OWNER    (1UL << 0)
 227#define HPAGE_RESV_UNMAPPED (1UL << 1)
 228#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
 229
 230/*
 231 * These helpers are used to track how many pages are reserved for
 232 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 233 * is guaranteed to have their future faults succeed.
 234 *
 235 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 236 * the reserve counters are updated with the hugetlb_lock held. It is safe
 237 * to reset the VMA at fork() time as it is not in use yet and there is no
 238 * chance of the global counters getting corrupted as a result of the values.
 239 *
 240 * The private mapping reservation is represented in a subtly different
 241 * manner to a shared mapping.  A shared mapping has a region map associated
 242 * with the underlying file, this region map represents the backing file
 243 * pages which have ever had a reservation assigned which this persists even
 244 * after the page is instantiated.  A private mapping has a region map
 245 * associated with the original mmap which is attached to all VMAs which
 246 * reference it, this region map represents those offsets which have consumed
 247 * reservation ie. where pages have been instantiated.
 248 */
 249static unsigned long get_vma_private_data(struct vm_area_struct *vma)
 250{
 251        return (unsigned long)vma->vm_private_data;
 252}
 253
 254static void set_vma_private_data(struct vm_area_struct *vma,
 255                                                        unsigned long value)
 256{
 257        vma->vm_private_data = (void *)value;
 258}
 259
 260struct resv_map {
 261        struct kref refs;
 262        struct list_head regions;
 263};
 264
 265struct resv_map *resv_map_alloc(void)
 266{
 267        struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
 268        if (!resv_map)
 269                return NULL;
 270
 271        kref_init(&resv_map->refs);
 272        INIT_LIST_HEAD(&resv_map->regions);
 273
 274        return resv_map;
 275}
 276
 277void resv_map_release(struct kref *ref)
 278{
 279        struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
 280
 281        /* Clear out any active regions before we release the map. */
 282        region_truncate(&resv_map->regions, 0);
 283        kfree(resv_map);
 284}
 285
 286static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 287{
 288        VM_BUG_ON(!is_vm_hugetlb_page(vma));
 289        if (!(vma->vm_flags & VM_MAYSHARE))
 290                return (struct resv_map *)(get_vma_private_data(vma) &
 291                                                        ~HPAGE_RESV_MASK);
 292        return 0;
 293}
 294
 295static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
 296{
 297        VM_BUG_ON(!is_vm_hugetlb_page(vma));
 298        VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
 299
 300        set_vma_private_data(vma, (get_vma_private_data(vma) &
 301                                HPAGE_RESV_MASK) | (unsigned long)map);
 302}
 303
 304static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
 305{
 306        VM_BUG_ON(!is_vm_hugetlb_page(vma));
 307        VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
 308
 309        set_vma_private_data(vma, get_vma_private_data(vma) | flags);
 310}
 311
 312static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 313{
 314        VM_BUG_ON(!is_vm_hugetlb_page(vma));
 315
 316        return (get_vma_private_data(vma) & flag) != 0;
 317}
 318
 319/* Decrement the reserved pages in the hugepage pool by one */
 320static void decrement_hugepage_resv_vma(struct hstate *h,
 321                        struct vm_area_struct *vma)
 322{
 323        if (vma->vm_flags & VM_NORESERVE)
 324                return;
 325
 326        if (vma->vm_flags & VM_MAYSHARE) {
 327                /* Shared mappings always use reserves */
 328                h->resv_huge_pages--;
 329        } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 330                /*
 331                 * Only the process that called mmap() has reserves for
 332                 * private mappings.
 333                 */
 334                h->resv_huge_pages--;
 335        }
 336}
 337
 338/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
 339void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 340{
 341        VM_BUG_ON(!is_vm_hugetlb_page(vma));
 342        if (!(vma->vm_flags & VM_MAYSHARE))
 343                vma->vm_private_data = (void *)0;
 344}
 345
 346/* Returns true if the VMA has associated reserve pages */
 347static int vma_has_reserves(struct vm_area_struct *vma)
 348{
 349        if (vma->vm_flags & VM_MAYSHARE)
 350                return 1;
 351        if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 352                return 1;
 353        return 0;
 354}
 355
 356static void clear_gigantic_page(struct page *page,
 357                        unsigned long addr, unsigned long sz)
 358{
 359        int i;
 360        struct page *p = page;
 361
 362        might_sleep();
 363        for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
 364                cond_resched();
 365                clear_user_highpage(p, addr + i * PAGE_SIZE);
 366        }
 367}
 368static void clear_huge_page(struct page *page,
 369                        unsigned long addr, unsigned long sz)
 370{
 371        int i;
 372
 373        if (unlikely(sz > MAX_ORDER_NR_PAGES))
 374                return clear_gigantic_page(page, addr, sz);
 375
 376        might_sleep();
 377        for (i = 0; i < sz/PAGE_SIZE; i++) {
 378                cond_resched();
 379                clear_user_highpage(page + i, addr + i * PAGE_SIZE);
 380        }
 381}
 382
 383static void copy_gigantic_page(struct page *dst, struct page *src,
 384                           unsigned long addr, struct vm_area_struct *vma)
 385{
 386        int i;
 387        struct hstate *h = hstate_vma(vma);
 388        struct page *dst_base = dst;
 389        struct page *src_base = src;
 390        might_sleep();
 391        for (i = 0; i < pages_per_huge_page(h); ) {
 392                cond_resched();
 393                copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
 394
 395                i++;
 396                dst = mem_map_next(dst, dst_base, i);
 397                src = mem_map_next(src, src_base, i);
 398        }
 399}
 400static void copy_huge_page(struct page *dst, struct page *src,
 401                           unsigned long addr, struct vm_area_struct *vma)
 402{
 403        int i;
 404        struct hstate *h = hstate_vma(vma);
 405
 406        if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
 407                return copy_gigantic_page(dst, src, addr, vma);
 408
 409        might_sleep();
 410        for (i = 0; i < pages_per_huge_page(h); i++) {
 411                cond_resched();
 412                copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 413        }
 414}
 415
 416static void enqueue_huge_page(struct hstate *h, struct page *page)
 417{
 418        int nid = page_to_nid(page);
 419        list_add(&page->lru, &h->hugepage_freelists[nid]);
 420        h->free_huge_pages++;
 421        h->free_huge_pages_node[nid]++;
 422}
 423
 424static struct page *dequeue_huge_page(struct hstate *h)
 425{
 426        int nid;
 427        struct page *page = NULL;
 428
 429        for (nid = 0; nid < MAX_NUMNODES; ++nid) {
 430                if (!list_empty(&h->hugepage_freelists[nid])) {
 431                        page = list_entry(h->hugepage_freelists[nid].next,
 432                                          struct page, lru);
 433                        list_del(&page->lru);
 434                        h->free_huge_pages--;
 435                        h->free_huge_pages_node[nid]--;
 436                        break;
 437                }
 438        }
 439        return page;
 440}
 441
 442static struct page *dequeue_huge_page_vma(struct hstate *h,
 443                                struct vm_area_struct *vma,
 444                                unsigned long address, int avoid_reserve)
 445{
 446        int nid;
 447        struct page *page = NULL;
 448        struct mempolicy *mpol;
 449        nodemask_t *nodemask;
 450        struct zonelist *zonelist = huge_zonelist(vma, address,
 451                                        htlb_alloc_mask, &mpol, &nodemask);
 452        struct zone *zone;
 453        struct zoneref *z;
 454
 455        /*
 456         * A child process with MAP_PRIVATE mappings created by their parent
 457         * have no page reserves. This check ensures that reservations are
 458         * not "stolen". The child may still get SIGKILLed
 459         */
 460        if (!vma_has_reserves(vma) &&
 461                        h->free_huge_pages - h->resv_huge_pages == 0)
 462                return NULL;
 463
 464        /* If reserves cannot be used, ensure enough pages are in the pool */
 465        if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
 466                return NULL;
 467
 468        for_each_zone_zonelist_nodemask(zone, z, zonelist,
 469                                                MAX_NR_ZONES - 1, nodemask) {
 470                nid = zone_to_nid(zone);
 471                if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
 472                    !list_empty(&h->hugepage_freelists[nid])) {
 473                        page = list_entry(h->hugepage_freelists[nid].next,
 474                                          struct page, lru);
 475                        list_del(&page->lru);
 476                        h->free_huge_pages--;
 477                        h->free_huge_pages_node[nid]--;
 478
 479                        if (!avoid_reserve)
 480                                decrement_hugepage_resv_vma(h, vma);
 481
 482                        break;
 483                }
 484        }
 485        mpol_cond_put(mpol);
 486        return page;
 487}
 488
 489static void update_and_free_page(struct hstate *h, struct page *page)
 490{
 491        int i;
 492
 493        VM_BUG_ON(h->order >= MAX_ORDER);
 494
 495        h->nr_huge_pages--;
 496        h->nr_huge_pages_node[page_to_nid(page)]--;
 497        for (i = 0; i < pages_per_huge_page(h); i++) {
 498                page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 499                                1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 500                                1 << PG_private | 1<< PG_writeback);
 501        }
 502        set_compound_page_dtor(page, NULL);
 503        set_page_refcounted(page);
 504        arch_release_hugepage(page);
 505        __free_pages(page, huge_page_order(h));
 506}
 507
 508struct hstate *size_to_hstate(unsigned long size)
 509{
 510        struct hstate *h;
 511
 512        for_each_hstate(h) {
 513                if (huge_page_size(h) == size)
 514                        return h;
 515        }
 516        return NULL;
 517}
 518
 519static void free_huge_page(struct page *page)
 520{
 521        /*
 522         * Can't pass hstate in here because it is called from the
 523         * compound page destructor.
 524         */
 525        struct hstate *h = page_hstate(page);
 526        int nid = page_to_nid(page);
 527        struct address_space *mapping;
 528
 529        mapping = (struct address_space *) page_private(page);
 530        set_page_private(page, 0);
 531        BUG_ON(page_count(page));
 532        INIT_LIST_HEAD(&page->lru);
 533
 534        spin_lock(&hugetlb_lock);
 535        if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
 536                update_and_free_page(h, page);
 537                h->surplus_huge_pages--;
 538                h->surplus_huge_pages_node[nid]--;
 539        } else {
 540                enqueue_huge_page(h, page);
 541        }
 542        spin_unlock(&hugetlb_lock);
 543        if (mapping)
 544                hugetlb_put_quota(mapping, 1);
 545}
 546
 547/*
 548 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 549 * balanced by operating on them in a round-robin fashion.
 550 * Returns 1 if an adjustment was made.
 551 */
 552static int adjust_pool_surplus(struct hstate *h, int delta)
 553{
 554        static int prev_nid;
 555        int nid = prev_nid;
 556        int ret = 0;
 557
 558        VM_BUG_ON(delta != -1 && delta != 1);
 559        do {
 560                nid = next_node(nid, node_online_map);
 561                if (nid == MAX_NUMNODES)
 562                        nid = first_node(node_online_map);
 563
 564                /* To shrink on this node, there must be a surplus page */
 565                if (delta < 0 && !h->surplus_huge_pages_node[nid])
 566                        continue;
 567                /* Surplus cannot exceed the total number of pages */
 568                if (delta > 0 && h->surplus_huge_pages_node[nid] >=
 569                                                h->nr_huge_pages_node[nid])
 570                        continue;
 571
 572                h->surplus_huge_pages += delta;
 573                h->surplus_huge_pages_node[nid] += delta;
 574                ret = 1;
 575                break;
 576        } while (nid != prev_nid);
 577
 578        prev_nid = nid;
 579        return ret;
 580}
 581
 582static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 583{
 584        set_compound_page_dtor(page, free_huge_page);
 585        spin_lock(&hugetlb_lock);
 586        h->nr_huge_pages++;
 587        h->nr_huge_pages_node[nid]++;
 588        spin_unlock(&hugetlb_lock);
 589        put_page(page); /* free it into the hugepage allocator */
 590}
 591
 592static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 593{
 594        struct page *page;
 595
 596        if (h->order >= MAX_ORDER)
 597                return NULL;
 598
 599        page = alloc_pages_node(nid,
 600                htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
 601                                                __GFP_REPEAT|__GFP_NOWARN,
 602                huge_page_order(h));
 603        if (page) {
 604                if (arch_prepare_hugepage(page)) {
 605                        __free_pages(page, huge_page_order(h));
 606                        return NULL;
 607                }
 608                prep_new_huge_page(h, page, nid);
 609        }
 610
 611        return page;
 612}
 613
 614/*
 615 * Use a helper variable to find the next node and then
 616 * copy it back to hugetlb_next_nid afterwards:
 617 * otherwise there's a window in which a racer might
 618 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
 619 * But we don't need to use a spin_lock here: it really
 620 * doesn't matter if occasionally a racer chooses the
 621 * same nid as we do.  Move nid forward in the mask even
 622 * if we just successfully allocated a hugepage so that
 623 * the next caller gets hugepages on the next node.
 624 */
 625static int hstate_next_node(struct hstate *h)
 626{
 627        int next_nid;
 628        next_nid = next_node(h->hugetlb_next_nid, node_online_map);
 629        if (next_nid == MAX_NUMNODES)
 630                next_nid = first_node(node_online_map);
 631        h->hugetlb_next_nid = next_nid;
 632        return next_nid;
 633}
 634
 635static int alloc_fresh_huge_page(struct hstate *h)
 636{
 637        struct page *page;
 638        int start_nid;
 639        int next_nid;
 640        int ret = 0;
 641
 642        start_nid = h->hugetlb_next_nid;
 643
 644        do {
 645                page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
 646                if (page)
 647                        ret = 1;
 648                next_nid = hstate_next_node(h);
 649        } while (!page && h->hugetlb_next_nid != start_nid);
 650
 651        if (ret)
 652                count_vm_event(HTLB_BUDDY_PGALLOC);
 653        else
 654                count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 655
 656        return ret;
 657}
 658
 659static struct page *alloc_buddy_huge_page(struct hstate *h,
 660                        struct vm_area_struct *vma, unsigned long address)
 661{
 662        struct page *page;
 663        unsigned int nid;
 664
 665        if (h->order >= MAX_ORDER)
 666                return NULL;
 667
 668        /*
 669         * Assume we will successfully allocate the surplus page to
 670         * prevent racing processes from causing the surplus to exceed
 671         * overcommit
 672         *
 673         * This however introduces a different race, where a process B
 674         * tries to grow the static hugepage pool while alloc_pages() is
 675         * called by process A. B will only examine the per-node
 676         * counters in determining if surplus huge pages can be
 677         * converted to normal huge pages in adjust_pool_surplus(). A
 678         * won't be able to increment the per-node counter, until the
 679         * lock is dropped by B, but B doesn't drop hugetlb_lock until
 680         * no more huge pages can be converted from surplus to normal
 681         * state (and doesn't try to convert again). Thus, we have a
 682         * case where a surplus huge page exists, the pool is grown, and
 683         * the surplus huge page still exists after, even though it
 684         * should just have been converted to a normal huge page. This
 685         * does not leak memory, though, as the hugepage will be freed
 686         * once it is out of use. It also does not allow the counters to
 687         * go out of whack in adjust_pool_surplus() as we don't modify
 688         * the node values until we've gotten the hugepage and only the
 689         * per-node value is checked there.
 690         */
 691        spin_lock(&hugetlb_lock);
 692        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
 693                spin_unlock(&hugetlb_lock);
 694                return NULL;
 695        } else {
 696                h->nr_huge_pages++;
 697                h->surplus_huge_pages++;
 698        }
 699        spin_unlock(&hugetlb_lock);
 700
 701        page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
 702                                        __GFP_REPEAT|__GFP_NOWARN,
 703                                        huge_page_order(h));
 704
 705        if (page && arch_prepare_hugepage(page)) {
 706                __free_pages(page, huge_page_order(h));
 707                return NULL;
 708        }
 709
 710        spin_lock(&hugetlb_lock);
 711        if (page) {
 712                /*
 713                 * This page is now managed by the hugetlb allocator and has
 714                 * no users -- drop the buddy allocator's reference.
 715                 */
 716                put_page_testzero(page);
 717                VM_BUG_ON(page_count(page));
 718                nid = page_to_nid(page);
 719                set_compound_page_dtor(page, free_huge_page);
 720                /*
 721                 * We incremented the global counters already
 722                 */
 723                h->nr_huge_pages_node[nid]++;
 724                h->surplus_huge_pages_node[nid]++;
 725                __count_vm_event(HTLB_BUDDY_PGALLOC);
 726        } else {
 727                h->nr_huge_pages--;
 728                h->surplus_huge_pages--;
 729                __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 730        }
 731        spin_unlock(&hugetlb_lock);
 732
 733        return page;
 734}
 735
 736/*
 737 * Increase the hugetlb pool such that it can accomodate a reservation
 738 * of size 'delta'.
 739 */
 740static int gather_surplus_pages(struct hstate *h, int delta)
 741{
 742        struct list_head surplus_list;
 743        struct page *page, *tmp;
 744        int ret, i;
 745        int needed, allocated;
 746
 747        needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
 748        if (needed <= 0) {
 749                h->resv_huge_pages += delta;
 750                return 0;
 751        }
 752
 753        allocated = 0;
 754        INIT_LIST_HEAD(&surplus_list);
 755
 756        ret = -ENOMEM;
 757retry:
 758        spin_unlock(&hugetlb_lock);
 759        for (i = 0; i < needed; i++) {
 760                page = alloc_buddy_huge_page(h, NULL, 0);
 761                if (!page) {
 762                        /*
 763                         * We were not able to allocate enough pages to
 764                         * satisfy the entire reservation so we free what
 765                         * we've allocated so far.
 766                         */
 767                        spin_lock(&hugetlb_lock);
 768                        needed = 0;
 769                        goto free;
 770                }
 771
 772                list_add(&page->lru, &surplus_list);
 773        }
 774        allocated += needed;
 775
 776        /*
 777         * After retaking hugetlb_lock, we need to recalculate 'needed'
 778         * because either resv_huge_pages or free_huge_pages may have changed.
 779         */
 780        spin_lock(&hugetlb_lock);
 781        needed = (h->resv_huge_pages + delta) -
 782                        (h->free_huge_pages + allocated);
 783        if (needed > 0)
 784                goto retry;
 785
 786        /*
 787         * The surplus_list now contains _at_least_ the number of extra pages
 788         * needed to accomodate the reservation.  Add the appropriate number
 789         * of pages to the hugetlb pool and free the extras back to the buddy
 790         * allocator.  Commit the entire reservation here to prevent another
 791         * process from stealing the pages as they are added to the pool but
 792         * before they are reserved.
 793         */
 794        needed += allocated;
 795        h->resv_huge_pages += delta;
 796        ret = 0;
 797free:
 798        /* Free the needed pages to the hugetlb pool */
 799        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 800                if ((--needed) < 0)
 801                        break;
 802                list_del(&page->lru);
 803                enqueue_huge_page(h, page);
 804        }
 805
 806        /* Free unnecessary surplus pages to the buddy allocator */
 807        if (!list_empty(&surplus_list)) {
 808                spin_unlock(&hugetlb_lock);
 809                list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 810                        list_del(&page->lru);
 811                        /*
 812                         * The page has a reference count of zero already, so
 813                         * call free_huge_page directly instead of using
 814                         * put_page.  This must be done with hugetlb_lock
 815                         * unlocked which is safe because free_huge_page takes
 816                         * hugetlb_lock before deciding how to free the page.
 817                         */
 818                        free_huge_page(page);
 819                }
 820                spin_lock(&hugetlb_lock);
 821        }
 822
 823        return ret;
 824}
 825
 826/*
 827 * When releasing a hugetlb pool reservation, any surplus pages that were
 828 * allocated to satisfy the reservation must be explicitly freed if they were
 829 * never used.
 830 */
 831static void return_unused_surplus_pages(struct hstate *h,
 832                                        unsigned long unused_resv_pages)
 833{
 834        static int nid = -1;
 835        struct page *page;
 836        unsigned long nr_pages;
 837
 838        /*
 839         * We want to release as many surplus pages as possible, spread
 840         * evenly across all nodes. Iterate across all nodes until we
 841         * can no longer free unreserved surplus pages. This occurs when
 842         * the nodes with surplus pages have no free pages.
 843         */
 844        unsigned long remaining_iterations = num_online_nodes();
 845
 846        /* Uncommit the reservation */
 847        h->resv_huge_pages -= unused_resv_pages;
 848
 849        /* Cannot return gigantic pages currently */
 850        if (h->order >= MAX_ORDER)
 851                return;
 852
 853        nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 854
 855        while (remaining_iterations-- && nr_pages) {
 856                nid = next_node(nid, node_online_map);
 857                if (nid == MAX_NUMNODES)
 858                        nid = first_node(node_online_map);
 859
 860                if (!h->surplus_huge_pages_node[nid])
 861                        continue;
 862
 863                if (!list_empty(&h->hugepage_freelists[nid])) {
 864                        page = list_entry(h->hugepage_freelists[nid].next,
 865                                          struct page, lru);
 866                        list_del(&page->lru);
 867                        update_and_free_page(h, page);
 868                        h->free_huge_pages--;
 869                        h->free_huge_pages_node[nid]--;
 870                        h->surplus_huge_pages--;
 871                        h->surplus_huge_pages_node[nid]--;
 872                        nr_pages--;
 873                        remaining_iterations = num_online_nodes();
 874                }
 875        }
 876}
 877
 878/*
 879 * Determine if the huge page at addr within the vma has an associated
 880 * reservation.  Where it does not we will need to logically increase
 881 * reservation and actually increase quota before an allocation can occur.
 882 * Where any new reservation would be required the reservation change is
 883 * prepared, but not committed.  Once the page has been quota'd allocated
 884 * an instantiated the change should be committed via vma_commit_reservation.
 885 * No action is required on failure.
 886 */
 887static int vma_needs_reservation(struct hstate *h,
 888                        struct vm_area_struct *vma, unsigned long addr)
 889{
 890        struct address_space *mapping = vma->vm_file->f_mapping;
 891        struct inode *inode = mapping->host;
 892
 893        if (vma->vm_flags & VM_MAYSHARE) {
 894                pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 895                return region_chg(&inode->i_mapping->private_list,
 896                                                        idx, idx + 1);
 897
 898        } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 899                return 1;
 900
 901        } else  {
 902                int err;
 903                pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 904                struct resv_map *reservations = vma_resv_map(vma);
 905
 906                err = region_chg(&reservations->regions, idx, idx + 1);
 907                if (err < 0)
 908                        return err;
 909                return 0;
 910        }
 911}
 912static void vma_commit_reservation(struct hstate *h,
 913                        struct vm_area_struct *vma, unsigned long addr)
 914{
 915        struct address_space *mapping = vma->vm_file->f_mapping;
 916        struct inode *inode = mapping->host;
 917
 918        if (vma->vm_flags & VM_MAYSHARE) {
 919                pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 920                region_add(&inode->i_mapping->private_list, idx, idx + 1);
 921
 922        } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 923                pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 924                struct resv_map *reservations = vma_resv_map(vma);
 925
 926                /* Mark this page used in the map. */
 927                region_add(&reservations->regions, idx, idx + 1);
 928        }
 929}
 930
 931static struct page *alloc_huge_page(struct vm_area_struct *vma,
 932                                    unsigned long addr, int avoid_reserve)
 933{
 934        struct hstate *h = hstate_vma(vma);
 935        struct page *page;
 936        struct address_space *mapping = vma->vm_file->f_mapping;
 937        struct inode *inode = mapping->host;
 938        unsigned int chg;
 939
 940        /*
 941         * Processes that did not create the mapping will have no reserves and
 942         * will not have accounted against quota. Check that the quota can be
 943         * made before satisfying the allocation
 944         * MAP_NORESERVE mappings may also need pages and quota allocated
 945         * if no reserve mapping overlaps.
 946         */
 947        chg = vma_needs_reservation(h, vma, addr);
 948        if (chg < 0)
 949                return ERR_PTR(chg);
 950        if (chg)
 951                if (hugetlb_get_quota(inode->i_mapping, chg))
 952                        return ERR_PTR(-ENOSPC);
 953
 954        spin_lock(&hugetlb_lock);
 955        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
 956        spin_unlock(&hugetlb_lock);
 957
 958        if (!page) {
 959                page = alloc_buddy_huge_page(h, vma, addr);
 960                if (!page) {
 961                        hugetlb_put_quota(inode->i_mapping, chg);
 962                        return ERR_PTR(-VM_FAULT_OOM);
 963                }
 964        }
 965
 966        set_page_refcounted(page);
 967        set_page_private(page, (unsigned long) mapping);
 968
 969        vma_commit_reservation(h, vma, addr);
 970
 971        return page;
 972}
 973
 974__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 975{
 976        struct huge_bootmem_page *m;
 977        int nr_nodes = nodes_weight(node_online_map);
 978
 979        while (nr_nodes) {
 980                void *addr;
 981
 982                addr = __alloc_bootmem_node_nopanic(
 983                                NODE_DATA(h->hugetlb_next_nid),
 984                                huge_page_size(h), huge_page_size(h), 0);
 985
 986                hstate_next_node(h);
 987                if (addr) {
 988                        /*
 989                         * Use the beginning of the huge page to store the
 990                         * huge_bootmem_page struct (until gather_bootmem
 991                         * puts them into the mem_map).
 992                         */
 993                        m = addr;
 994                        if (m)
 995                                goto found;
 996                }
 997                nr_nodes--;
 998        }
 999        return 0;
1000
1001found:
1002        BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
1003        /* Put them into a private list first because mem_map is not up yet */
1004        list_add(&m->list, &huge_boot_pages);
1005        m->hstate = h;
1006        return 1;
1007}
1008
1009static void prep_compound_huge_page(struct page *page, int order)
1010{
1011        if (unlikely(order > (MAX_ORDER - 1)))
1012                prep_compound_gigantic_page(page, order);
1013        else
1014                prep_compound_page(page, order);
1015}
1016
1017/* Put bootmem huge pages into the standard lists after mem_map is up */
1018static void __init gather_bootmem_prealloc(void)
1019{
1020        struct huge_bootmem_page *m;
1021
1022        list_for_each_entry(m, &huge_boot_pages, list) {
1023                struct page *page = virt_to_page(m);
1024                struct hstate *h = m->hstate;
1025                __ClearPageReserved(page);
1026                WARN_ON(page_count(page) != 1);
1027                prep_compound_huge_page(page, h->order);
1028                prep_new_huge_page(h, page, page_to_nid(page));
1029        }
1030}
1031
1032static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1033{
1034        unsigned long i;
1035
1036        for (i = 0; i < h->max_huge_pages; ++i) {
1037                if (h->order >= MAX_ORDER) {
1038                        if (!alloc_bootmem_huge_page(h))
1039                                break;
1040                } else if (!alloc_fresh_huge_page(h))
1041                        break;
1042        }
1043        h->max_huge_pages = i;
1044}
1045
1046static void __init hugetlb_init_hstates(void)
1047{
1048        struct hstate *h;
1049
1050        for_each_hstate(h) {
1051                /* oversize hugepages were init'ed in early boot */
1052                if (h->order < MAX_ORDER)
1053                        hugetlb_hstate_alloc_pages(h);
1054        }
1055}
1056
/*
 * Format the byte count @n into @buf as a whole number of GB, MB or KB
 * (largest unit that @n reaches, remainder truncated) and return @buf.
 * The caller must supply a buffer large enough for the result.
 */
static char * __init memfmt(char *buf, unsigned long n)
{
	if (n >= (1UL << 30)) {
		sprintf(buf, "%lu GB", n >> 30);
		return buf;
	}

	if (n >= (1UL << 20)) {
		sprintf(buf, "%lu MB", n >> 20);
		return buf;
	}

	sprintf(buf, "%lu KB", n >> 10);
	return buf;
}
1067
1068static void __init report_hugepages(void)
1069{
1070        struct hstate *h;
1071
1072        for_each_hstate(h) {
1073                char buf[32];
1074                printk(KERN_INFO "HugeTLB registered %s page size, "
1075                                 "pre-allocated %ld pages\n",
1076                        memfmt(buf, huge_page_size(h)),
1077                        h->free_huge_pages);
1078        }
1079}
1080
#ifdef CONFIG_HIGHMEM
/*
 * Free non-highmem huge pages from the per-node free lists of @h, stopping
 * as soon as the pool holds no more than @count pages.  Does nothing for
 * orders of MAX_ORDER and above.
 */
static void try_to_free_low(struct hstate *h, unsigned long count)
{
	int nid;

	if (h->order >= MAX_ORDER)
		return;

	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
		struct list_head *free_list = &h->hugepage_freelists[nid];
		struct page *page, *next;

		list_for_each_entry_safe(page, next, free_list, lru) {
			if (count >= h->nr_huge_pages)
				return;

			/* Only lowmem pages are of interest here. */
			if (!PageHighMem(page)) {
				list_del(&page->lru);
				update_and_free_page(h, page);
				h->free_huge_pages--;
				h->free_huge_pages_node[page_to_nid(page)]--;
			}
		}
	}
}
#else
/* Without CONFIG_HIGHMEM this is a no-op. */
static inline void try_to_free_low(struct hstate *h, unsigned long count)
{
}
#endif
1109
1110#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1111static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1112{
1113        unsigned long min_count, ret;
1114
1115        if (h->order >= MAX_ORDER)
1116                return h->max_huge_pages;
1117
1118        /*
1119         * Increase the pool size
1120         * First take pages out of surplus state.  Then make up the
1121         * remaining difference by allocating fresh huge pages.
1122         *
1123         * We might race with alloc_buddy_huge_page() here and be unable
1124         * to convert a surplus huge page to a normal huge page. That is
1125         * not critical, though, it just means the overall size of the
1126         * pool might be one hugepage larger than it needs to be, but
1127         * within all the constraints specified by the sysctls.
1128         */
1129        spin_lock(&hugetlb_lock);
1130        while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1131                if (!adjust_pool_surplus(h, -1))
1132                        break;
1133        }
1134
1135        while (count > persistent_huge_pages(h)) {
1136                /*
1137                 * If this allocation races such that we no longer need the
1138                 * page, free_huge_page will handle it by freeing the page
1139                 * and reducing the surplus.
1140                 */
1141                spin_unlock(&hugetlb_lock);
1142                ret = alloc_fresh_huge_page(h);
1143                spin_lock(&hugetlb_lock);
1144                if (!ret)
1145                        goto out;
1146
1147        }
1148
1149        /*
1150         * Decrease the pool size
1151         * First return free pages to the buddy allocator (being careful
1152         * to keep enough around to satisfy reservations).  Then place
1153         * pages into surplus state as needed so the pool will shrink
1154         * to the desired size as pages become free.
1155         *
1156         * By placing pages into the surplus state independent of the
1157         * overcommit value, we are allowing the surplus pool size to
1158         * exceed overcommit. There are few sane options here. Since
1159         * alloc_buddy_huge_page() is checking the global counter,
1160         * though, we'll note that we're not allowed to exceed surplus
1161         * and won't grow the pool anywhere else. Not until one of the
1162         * sysctls are changed, or the surplus pages go out of use.
1163         */
1164        min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1165        min_count = max(count, min_count);
1166        try_to_free_low(h, min_count);
1167        while (min_count < persistent_huge_pages(h)) {
1168                struct page *page = dequeue_huge_page(h);
1169                if (!page)
1170                        break;
1171                update_and_free_page(h, page);
1172        }
1173        while (count < persistent_huge_pages(h)) {
1174                if (!adjust_pool_surplus(h, 1))
1175                        break;
1176        }
1177out:
1178        ret = persistent_huge_pages(h);
1179        spin_unlock(&hugetlb_lock);
1180        return ret;
1181}
1182
/*
 * Define a read-only kobj_attribute named <_name>_attr.  The initialiser
 * comes from __ATTR_RO(_name).
 */
#define HSTATE_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/*
 * Define a mode-0644 kobj_attribute named <_name>_attr, wired to the
 * functions <_name>_show and <_name>_store.
 */
#define HSTATE_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

/* Parent "hugepages" kobject, created in hugetlb_sysfs_init(). */
static struct kobject *hugepages_kobj;
/* One kobject per hstate, indexed the same way as hstates[]. */
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1192
1193static struct hstate *kobj_to_hstate(struct kobject *kobj)
1194{
1195        int i;
1196        for (i = 0; i < HUGE_MAX_HSTATE; i++)
1197                if (hstate_kobjs[i] == kobj)
1198                        return &hstates[i];
1199        BUG();
1200        return NULL;
1201}
1202
1203static ssize_t nr_hugepages_show(struct kobject *kobj,
1204                                        struct kobj_attribute *attr, char *buf)
1205{
1206        struct hstate *h = kobj_to_hstate(kobj);
1207        return sprintf(buf, "%lu\n", h->nr_huge_pages);
1208}
1209static ssize_t nr_hugepages_store(struct kobject *kobj,
1210                struct kobj_attribute *attr, const char *buf, size_t count)
1211{
1212        int err;
1213        unsigned long input;
1214        struct hstate *h = kobj_to_hstate(kobj);
1215
1216        err = strict_strtoul(buf, 10, &input);
1217        if (err)
1218                return 0;
1219
1220        h->max_huge_pages = set_max_huge_pages(h, input);
1221
1222        return count;
1223}
1224HSTATE_ATTR(nr_hugepages);
1225
1226static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1227                                        struct kobj_attribute *attr, char *buf)
1228{
1229        struct hstate *h = kobj_to_hstate(kobj);
1230        return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1231}
1232static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1233                struct kobj_attribute *attr, const char *buf, size_t count)
1234{
1235        int err;
1236        unsigned long input;
1237        struct hstate *h = kobj_to_hstate(kobj);
1238
1239        err = strict_strtoul(buf, 10, &input);
1240        if (err)
1241                return 0;
1242
1243        spin_lock(&hugetlb_lock);
1244        h->nr_overcommit_huge_pages = input;
1245        spin_unlock(&hugetlb_lock);
1246
1247        return count;
1248}
1249HSTATE_ATTR(nr_overcommit_hugepages);
1250
1251static ssize_t free_hugepages_show(struct kobject *kobj,
1252                                        struct kobj_attribute *attr, char *buf)
1253{
1254        struct hstate *h = kobj_to_hstate(kobj);
1255        return sprintf(buf, "%lu\n", h->free_huge_pages);
1256}
1257HSTATE_ATTR_RO(free_hugepages);
1258
1259static ssize_t resv_hugepages_show(struct kobject *kobj,
1260                                        struct kobj_attribute *attr, char *buf)
1261{
1262        struct hstate *h = kobj_to_hstate(kobj);
1263        return sprintf(buf, "%lu\n", h->resv_huge_pages);
1264}
1265HSTATE_ATTR_RO(resv_hugepages);
1266
1267static ssize_t surplus_hugepages_show(struct kobject *kobj,
1268                                        struct kobj_attribute *attr, char *buf)
1269{
1270        struct hstate *h = kobj_to_hstate(kobj);
1271        return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1272}
1273HSTATE_ATTR_RO(surplus_hugepages);
1274
/*
 * NULL-terminated list of the per-hstate sysfs attributes defined above
 * with HSTATE_ATTR()/HSTATE_ATTR_RO().
 */
static struct attribute *hstate_attrs[] = {
	&nr_hugepages_attr.attr,
	&nr_overcommit_hugepages_attr.attr,
	&free_hugepages_attr.attr,
	&resv_hugepages_attr.attr,
	&surplus_hugepages_attr.attr,
	NULL,
};

/* Attribute group installed on each hstate kobject by hugetlb_sysfs_add_hstate(). */
static struct attribute_group hstate_attr_group = {
	.attrs = hstate_attrs,
};
1287
1288static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1289{
1290        int retval;
1291
1292        hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1293                                                        hugepages_kobj);
1294        if (!hstate_kobjs[h - hstates])
1295                return -ENOMEM;
1296
1297        retval = sysfs_create_group(hstate_kobjs[h - hstates],
1298                                                        &hstate_attr_group);
1299        if (retval)
1300                kobject_put(hstate_kobjs[h - hstates]);
1301
1302        return retval;
1303}
1304
1305static void __init hugetlb_sysfs_init(void)
1306{
1307        struct hstate *h;
1308        int err;
1309
1310        hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1311        if (!hugepages_kobj)
1312                return;
1313
1314        for_each_hstate(h) {
1315                err = hugetlb_sysfs_add_hstate(h);
1316                if (err)
1317                        printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1318                                                                h->name);
1319        }
1320}
1321
1322static void __exit hugetlb_exit(void)
1323{
1324        struct hstate *h;
1325
1326        for_each_hstate(h) {
1327                kobject_put(hstate_kobjs[h - hstates]);
1328        }
1329
1330        kobject_put(hugepages_kobj);
1331}
1332module_exit(hugetlb_exit);
1333
1334static int __init hugetlb_init(void)
1335{
1336        /* Some platform decide whether they support huge pages at boot
1337         * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
1338         * there is no such support
1339         */
1340        if (HPAGE_SHIFT == 0)
1341                return 0;
1342
1343        if (!size_to_hstate(default_hstate_size)) {
1344                default_hstate_size = HPAGE_SIZE;
1345                if (!size_to_hstate(default_hstate_size))
1346                        hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1347        }
1348        default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1349        if (default_hstate_max_huge_pages)
1350                default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1351
1352        hugetlb_init_hstates();
1353
1354        gather_bootmem_prealloc();
1355
1356        report_hugepages();
1357
1358        hugetlb_sysfs_init();
1359
1360        return 0;
1361}
1362module_init(hugetlb_init);
1363
1364/* Should be called on processing a hugepagesz=... option */
1365void __init hugetlb_add_hstate(unsigned order)
1366{
1367        struct hstate *h;
1368        unsigned long i;
1369
1370        if (size_to_hstate(PAGE_SIZE << order)) {
1371                printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1372                return;
1373        }
1374        BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1375        BUG_ON(order == 0);
1376        h = &hstates[max_hstate++];
1377        h->order = order;
1378        h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1379        h->nr_huge_pages = 0;
1380        h->free_huge_pages = 0;
1381        for (i = 0; i < MAX_NUMNODES; ++i)
1382                INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1383        h->hugetlb_next_nid = first_node(node_online_map);
1384        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1385                                        huge_page_size(h)/1024);
1386
1387        parsed_hstate = h;
1388}
1389
1390static int __init hugetlb_nrpages_setup(char *s)
1391{
1392        unsigned long *mhp;
1393        static unsigned long *last_mhp;
1394
1395        /*
1396         * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1397         * so this hugepages= parameter goes to the "default hstate".
1398         */
1399        if (!max_hstate)
1400                mhp = &default_hstate_max_huge_pages;
1401        else
1402                mhp = &parsed_hstate->max_huge_pages;
1403
1404        if (mhp == last_mhp) {
1405                printk(KERN_WARNING "hugepages= specified twice without "
1406                        "interleaving hugepagesz=, ignoring\n");
1407                return 1;
1408        }
1409
1410        if (sscanf(s, "%lu", mhp) <= 0)
1411                *mhp = 0;
1412
1413        /*
1414         * Global state is always initialized later in hugetlb_init.
1415         * But we need to allocate >= MAX_ORDER hstates here early to still
1416         * use the bootmem allocator.
1417         */
1418        if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1419                hugetlb_hstate_alloc_pages(parsed_hstate);
1420
1421        last_mhp = mhp;
1422
1423        return 1;
1424}
1425__setup("hugepages=", hugetlb_nrpages_setup);
1426
1427static int __init hugetlb_default_setup(char *s)
1428{
1429        default_hstate_size = memparse(s, &s);
1430        return 1;
1431}
1432__setup("default_hugepagesz=", hugetlb_default_setup);
1433
1434static unsigned int cpuset_mems_nr(unsigned int *array)
1435{
1436        int node;
1437        unsigned int nr = 0;
1438
1439        for_each_node_mask(node, cpuset_current_mems_allowed)
1440                nr += array[node];
1441
1442        return nr;
1443}
1444
1445#ifdef CONFIG_SYSCTL
1446int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1447                           struct file *file, void __user *buffer,
1448                           size_t *length, loff_t *ppos)
1449{
1450        struct hstate *h = &default_hstate;
1451        unsigned long tmp;
1452
1453        if (!write)
1454                tmp = h->max_huge_pages;
1455
1456        table->data = &tmp;
1457        table->maxlen = sizeof(unsigned long);
1458        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
1459
1460        if (write)
1461                h->max_huge_pages = set_max_huge_pages(h, tmp);
1462
1463        return 0;
1464}
1465
1466int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1467                        struct file *file, void __user *buffer,
1468                        size_t *length, loff_t *ppos)
1469{
1470        proc_dointvec(table, write, file, buffer, length, ppos);
1471        if (hugepages_treat_as_movable)
1472                htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1473        else
1474                htlb_alloc_mask = GFP_HIGHUSER;
1475        return 0;
1476}
1477
1478int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1479                        struct file *file, void __user *buffer,
1480                        size_t *length, loff_t *ppos)
1481{
1482        struct hstate *h = &default_hstate;
1483        unsigned long tmp;
1484
1485        if (!write)
1486                tmp = h->nr_overcommit_huge_pages;
1487
1488        table->data = &tmp;
1489        table->maxlen = sizeof(unsigned long);
1490        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
1491
1492        if (write) {
1493                spin_lock(&hugetlb_lock);
1494                h->nr_overcommit_huge_pages = tmp;
1495                spin_unlock(&hugetlb_lock);
1496        }
1497
1498        return 0;
1499}
1500
1501#endif /* CONFIG_SYSCTL */
1502
1503int hugetlb_report_meminfo(char *buf)
1504{
1505        struct hstate *h = &default_hstate;
1506        return sprintf(buf,
1507                        "HugePages_Total: %5lu\n"
1508                        "HugePages_Free:  %5lu\n"
1509                        "HugePages_Rsvd:  %5lu\n"
1510                        "HugePages_Surp:  %5lu\n"
1511                        "Hugepagesize:    %5lu kB\n",
1512                        h->nr_huge_pages,
1513                        h->free_huge_pages,
1514                        h->resv_huge_pages,
1515                        h->surplus_huge_pages,
1516                        1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
1517}
1518
1519int hugetlb_report_node_meminfo(int nid, char *buf)
1520{
1521        struct hstate *h = &default_hstate;
1522        return sprintf(buf,
1523                "Node %d HugePages_Total: %5u\n"
1524                "Node %d HugePages_Free:  %5u\n"
1525                "Node %d HugePages_Surp:  %5u\n",
1526                nid, h->nr_huge_pages_node[nid],
1527                nid, h->free_huge_pages_node[nid],
1528                nid, h->surplus_huge_pages_node[nid]);
1529}
1530
1531/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
1532unsigned long hugetlb_total_pages(void)
1533{
1534        struct hstate *h = &default_hstate;
1535        return h->nr_huge_pages * pages_per_huge_page(h);
1536}
1537
1538static int hugetlb_acct_memory(struct hstate *h, long delta)
1539{
1540        int ret = -ENOMEM;
1541
1542        spin_lock(&hugetlb_lock);
1543        /*
1544         * When cpuset is configured, it breaks the strict hugetlb page
1545         * reservation as the accounting is done on a global variable. Such
1546         * reservation is completely rubbish in the presence of cpuset because
1547         * the reservation is not checked against page availability for the
1548         * current cpuset. Application can still potentially OOM'ed by kernel
1549         * with lack of free htlb page in cpuset that the task is in.
1550         * Attempt to enforce strict accounting with cpuset is almost
1551         * impossible (or too ugly) because cpuset is too fluid that
1552         * task or memory node can be dynamically moved between cpusets.
1553         *
1554         * The change of semantics for shared hugetlb mapping with cpuset is
1555         * undesirable. However, in order to preserve some of the semantics,
1556         * we fall back to check against current free page availability as
1557         * a best attempt and hopefully to minimize the impact of changing
1558         * semantics that cpuset has.
1559         */
1560        if (delta > 0) {
1561                if (gather_surplus_pages(h, delta) < 0)
1562                        goto out;
1563
1564                if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1565                        return_unused_surplus_pages(h, delta);
1566                        goto out;
1567                }
1568        }
1569
1570        ret = 0;
1571        if (delta < 0)
1572                return_unused_surplus_pages(h, (unsigned long) -delta);
1573
1574out:
1575        spin_unlock(&hugetlb_lock);
1576        return ret;
1577}
1578
1579static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1580{
1581        struct resv_map *reservations = vma_resv_map(vma);
1582
1583        /*
1584         * This new VMA should share its siblings reservation map if present.
1585         * The VMA will only ever have a valid reservation map pointer where
1586         * it is being copied for another still existing VMA.  As that VMA
1587         * has a reference to the reservation map it cannot dissappear until
1588         * after this open call completes.  It is therefore safe to take a
1589         * new reference here without additional locking.
1590         */
1591        if (reservations)
1592                kref_get(&reservations->refs);
1593}
1594
1595static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1596{
1597        struct hstate *h = hstate_vma(vma);
1598        struct resv_map *reservations = vma_resv_map(vma);
1599        unsigned long reserve;
1600        unsigned long start;
1601        unsigned long end;
1602
1603        if (reservations) {
1604                start = vma_hugecache_offset(h, vma, vma->vm_start);
1605                end = vma_hugecache_offset(h, vma, vma->vm_end);
1606
1607                reserve = (end - start) -
1608                        region_count(&reservations->regions, start, end);
1609
1610                kref_put(&reservations->refs, resv_map_release);
1611
1612                if (reserve) {
1613                        hugetlb_acct_memory(h, -reserve);
1614                        hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1615                }
1616        }
1617}
1618
/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	/* Not reached in practice; present to satisfy the int return type. */
	return 0;
}
1630
/*
 * VMA operations for hugetlb mappings: .fault is a BUG() trap, while
 * .open and .close maintain the reservation map reference count.
 */
struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
	.open = hugetlb_vm_op_open,
	.close = hugetlb_vm_op_close,
};
1636
1637static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
1638                                int writable)
1639{
1640        pte_t entry;
1641
1642        if (writable) {
1643                entry =
1644                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
1645        } else {
1646                entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
1647        }
1648        entry = pte_mkyoung(entry);
1649        entry = pte_mkhuge(entry);
1650
1651        return entry;
1652}
1653
1654static void set_huge_ptep_writable(struct vm_area_struct *vma,
1655                                   unsigned long address, pte_t *ptep)
1656{
1657        pte_t entry;
1658
1659        entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
1660        if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
1661                update_mmu_cache(vma, address, entry);
1662        }
1663}
1664
1665
1666int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
1667                            struct vm_area_struct *vma)
1668{
1669        pte_t *src_pte, *dst_pte, entry;
1670        struct page *ptepage;
1671        unsigned long addr;
1672        int cow;
1673        struct hstate *h = hstate_vma(vma);
1674        unsigned long sz = huge_page_size(h);
1675
1676        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
1677
1678        for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
1679                src_pte = huge_pte_offset(src, addr);
1680                if (!src_pte)
1681                        continue;
1682                dst_pte = huge_pte_alloc(dst, addr, sz);
1683                if (!dst_pte)
1684                        goto nomem;
1685
1686                /* If the pagetables are shared don't copy or take references */
1687                if (dst_pte == src_pte)
1688                        continue;
1689
1690                spin_lock(&dst->page_table_lock);
1691                spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
1692                if (!huge_pte_none(huge_ptep_get(src_pte))) {
1693                        if (cow)
1694                                huge_ptep_set_wrprotect(src, addr, src_pte);
1695                        entry = huge_ptep_get(src_pte);
1696                        ptepage = pte_page(entry);
1697                        get_page(ptepage);
1698                        set_huge_pte_at(dst, addr, dst_pte, entry);
1699                }
1700                spin_unlock(&src->page_table_lock);
1701                spin_unlock(&dst->page_table_lock);
1702        }
1703        return 0;
1704
1705nomem:
1706        return -ENOMEM;
1707}
1708
1709void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1710                            unsigned long end, struct page *ref_page)
1711{
1712        struct mm_struct *mm = vma->vm_mm;
1713        unsigned long address;
1714        pte_t *ptep;
1715        pte_t pte;
1716        struct page *page;
1717        struct page *tmp;
1718        struct hstate *h = hstate_vma(vma);
1719        unsigned long sz = huge_page_size(h);
1720
1721        /*
1722         * A page gathering list, protected by per file i_mmap_lock. The
1723         * lock is used to avoid list corruption from multiple unmapping
1724         * of the same page since we are using page->lru.
1725         */
1726        LIST_HEAD(page_list);
1727
1728        WARN_ON(!is_vm_hugetlb_page(vma));
1729        BUG_ON(start & ~huge_page_mask(h));
1730        BUG_ON(end & ~huge_page_mask(h));
1731
1732        mmu_notifier_invalidate_range_start(mm, start, end);
1733        spin_lock(&mm->page_table_lock);
1734        for (address = start; address < end; address += sz) {
1735                ptep = huge_pte_offset(mm, address);
1736                if (!ptep)
1737                        continue;
1738
1739                if (huge_pmd_unshare(mm, &address, ptep))
1740                        continue;
1741
1742                /*
1743                 * If a reference page is supplied, it is because a specific
1744                 * page is being unmapped, not a range. Ensure the page we
1745                 * are about to unmap is the actual page of interest.
1746                 */
1747                if (ref_page) {
1748                        pte = huge_ptep_get(ptep);
1749                        if (huge_pte_none(pte))
1750                                continue;
1751                        page = pte_page(pte);
1752                        if (page != ref_page)
1753                                continue;
1754
1755                        /*
1756                         * Mark the VMA as having unmapped its page so that
1757                         * future faults in this VMA will fail rather than
1758                         * looking like data was lost
1759                         */
1760                        set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1761                }
1762
1763                pte = huge_ptep_get_and_clear(mm, address, ptep);
1764                if (huge_pte_none(pte))
1765                        continue;
1766
1767                page = pte_page(pte);
1768                if (pte_dirty(pte))
1769                        set_page_dirty(page);
1770                list_add(&page->lru, &page_list);
1771        }
1772        spin_unlock(&mm->page_table_lock);
1773        flush_tlb_range(vma, start, end);
1774        mmu_notifier_invalidate_range_end(mm, start, end);
1775        list_for_each_entry_safe(page, tmp, &page_list, lru) {
1776                list_del(&page->lru);
1777                put_page(page);
1778        }
1779}
1780
1781void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1782                          unsigned long end, struct page *ref_page)
1783{
1784        spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1785        __unmap_hugepage_range(vma, start, end, ref_page);
1786        spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1787}
1788
1789/*
1790 * This is called when the original mapper is failing to COW a MAP_PRIVATE
1791 * mappping it owns the reserve page for. The intention is to unmap the page
1792 * from other VMAs and let the children be SIGKILLed if they are faulting the
1793 * same region.
1794 */
1795int unmap_ref_private(struct mm_struct *mm,
1796                                        struct vm_area_struct *vma,
1797                                        struct page *page,
1798                                        unsigned long address)
1799{
1800        struct hstate *h = hstate_vma(vma);
1801        struct vm_area_struct *iter_vma;
1802        struct address_space *mapping;
1803        struct prio_tree_iter iter;
1804        pgoff_t pgoff;
1805
1806        /*
1807         * vm_pgoff is in PAGE_SIZE units, hence the different calculation
1808         * from page cache lookup which is in HPAGE_SIZE units.
1809         */
1810        address = address & huge_page_mask(h);
1811        pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
1812                + (vma->vm_pgoff >> PAGE_SHIFT);
1813        mapping = (struct address_space *)page_private(page);
1814
1815        vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1816                /* Do not unmap the current VMA */
1817                if (iter_vma == vma)
1818                        continue;
1819
1820                /*
1821                 * Unmap the page from other VMAs without their own reserves.
1822                 * They get marked to be SIGKILLed if they fault in these
1823                 * areas. This is because a future no-page fault on this VMA
1824                 * could insert a zeroed page instead of the data existing
1825                 * from the time of fork. This would look like data corruption
1826                 */
1827                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1828                        unmap_hugepage_range(iter_vma,
1829                                address, address + huge_page_size(h),
1830                                page);
1831        }
1832
1833        return 1;
1834}
1835
1836static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
1837                        unsigned long address, pte_t *ptep, pte_t pte,
1838                        struct page *pagecache_page)
1839{
1840        struct hstate *h = hstate_vma(vma);
1841        struct page *old_page, *new_page;
1842        int avoidcopy;
1843        int outside_reserve = 0;
1844
1845        old_page = pte_page(pte);
1846
1847retry_avoidcopy:
1848        /* If no-one else is actually using this page, avoid the copy
1849         * and just make the page writable */
1850        avoidcopy = (page_count(old_page) == 1);
1851        if (avoidcopy) {
1852                set_huge_ptep_writable(vma, address, ptep);
1853                return 0;
1854        }
1855
1856        /*
1857         * If the process that created a MAP_PRIVATE mapping is about to
1858         * perform a COW due to a shared page count, attempt to satisfy
1859         * the allocation without using the existing reserves. The pagecache
1860         * page is used to determine if the reserve at this address was
1861         * consumed or not. If reserves were used, a partial faulted mapping
1862         * at the time of fork() could consume its reserves on COW instead
1863         * of the full address range.
1864         */
1865        if (!(vma->vm_flags & VM_MAYSHARE) &&
1866                        is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1867                        old_page != pagecache_page)
1868                outside_reserve = 1;
1869
1870        page_cache_get(old_page);
1871        new_page = alloc_huge_page(vma, address, outside_reserve);
1872
1873        if (IS_ERR(new_page)) {
1874                page_cache_release(old_page);
1875
1876                /*
1877                 * If a process owning a MAP_PRIVATE mapping fails to COW,
1878                 * it is due to references held by a child and an insufficient
1879                 * huge page pool. To guarantee the original mappers
1880                 * reliability, unmap the page from child processes. The child
1881                 * may get SIGKILLed if it later faults.
1882                 */
1883                if (outside_reserve) {
1884                        BUG_ON(huge_pte_none(pte));
1885                        if (unmap_ref_private(mm, vma, old_page, address)) {
1886                                BUG_ON(page_count(old_page) != 1);
1887                                BUG_ON(huge_pte_none(pte));
1888                                goto retry_avoidcopy;
1889                        }
1890                        WARN_ON_ONCE(1);
1891                }
1892
1893                return -PTR_ERR(new_page);
1894        }
1895
1896        spin_unlock(&mm->page_table_lock);
1897        copy_huge_page(new_page, old_page, address, vma);
1898        __SetPageUptodate(new_page);
1899        spin_lock(&mm->page_table_lock);
1900
1901        ptep = huge_pte_offset(mm, address & huge_page_mask(h));
1902        if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1903                /* Break COW */
1904                huge_ptep_clear_flush(vma, address, ptep);
1905                set_huge_pte_at(mm, address, ptep,
1906                                make_huge_pte(vma, new_page, 1));
1907                /* Make the old page be freed below */
1908                new_page = old_page;
1909        }
1910        page_cache_release(new_page);
1911        page_cache_release(old_page);
1912        return 0;
1913}
1914
1915/* Return the pagecache page at a given address within a VMA */
1916static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1917                        struct vm_area_struct *vma, unsigned long address)
1918{
1919        struct address_space *mapping;
1920        pgoff_t idx;
1921
1922        mapping = vma->vm_file->f_mapping;
1923        idx = vma_hugecache_offset(h, vma, address);
1924
1925        return find_lock_page(mapping, idx);
1926}
1927
1928static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1929                        unsigned long address, pte_t *ptep, int write_access)
1930{
1931        struct hstate *h = hstate_vma(vma);
1932        int ret = VM_FAULT_SIGBUS;
1933        pgoff_t idx;
1934        unsigned long size;
1935        struct page *page;
1936        struct address_space *mapping;
1937        pte_t new_pte;
1938
1939        /*
1940         * Currently, we are forced to kill the process in the event the
1941         * original mapper has unmapped pages from the child due to a failed
1942         * COW. Warn that such a situation has occured as it may not be obvious
1943         */
1944        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1945                printk(KERN_WARNING
1946                        "PID %d killed due to inadequate hugepage pool\n",
1947                        current->pid);
1948                return ret;
1949        }
1950
1951        mapping = vma->vm_file->f_mapping;
1952        idx = vma_hugecache_offset(h, vma, address);
1953
1954        /*
1955         * Use page lock to guard against racing truncation
1956         * before we get page_table_lock.
1957         */
1958retry:
1959        page = find_lock_page(mapping, idx);
1960        if (!page) {
1961                size = i_size_read(mapping->host) >> huge_page_shift(h);
1962                if (idx >= size)
1963                        goto out;
1964                page = alloc_huge_page(vma, address, 0);
1965                if (IS_ERR(page)) {
1966                        ret = -PTR_ERR(page);
1967                        goto out;
1968                }
1969                clear_huge_page(page, address, huge_page_size(h));
1970                __SetPageUptodate(page);
1971
1972                if (vma->vm_flags & VM_MAYSHARE) {
1973                        int err;
1974                        struct inode *inode = mapping->host;
1975
1976                        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
1977                        if (err) {
1978                                put_page(page);
1979                                if (err == -EEXIST)
1980                                        goto retry;
1981                                goto out;
1982                        }
1983
1984                        spin_lock(&inode->i_lock);
1985                        inode->i_blocks += blocks_per_huge_page(h);
1986                        spin_unlock(&inode->i_lock);
1987                } else
1988                        lock_page(page);
1989        }
1990
1991        /*
1992         * If we are going to COW a private mapping later, we examine the
1993         * pending reservations for this page now. This will ensure that
1994         * any allocations necessary to record that reservation occur outside
1995         * the spinlock.
1996         */
1997        if (write_access && !(vma->vm_flags & VM_SHARED))
1998                if (vma_needs_reservation(h, vma, address) < 0) {
1999                        ret = VM_FAULT_OOM;
2000                        goto backout_unlocked;
2001                }
2002
2003        spin_lock(&mm->page_table_lock);
2004        size = i_size_read(mapping->host) >> huge_page_shift(h);
2005        if (idx >= size)
2006                goto backout;
2007
2008        ret = 0;
2009        if (!huge_pte_none(huge_ptep_get(ptep)))
2010                goto backout;
2011
2012        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
2013                                && (vma->vm_flags & VM_SHARED)));
2014        set_huge_pte_at(mm, address, ptep, new_pte);
2015
2016        if (write_access && !(vma->vm_flags & VM_SHARED)) {
2017                /* Optimization, do the COW without a second fault */
2018                ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
2019        }
2020
2021        spin_unlock(&mm->page_table_lock);
2022        unlock_page(page);
2023out:
2024        return ret;
2025
2026backout:
2027        spin_unlock(&mm->page_table_lock);
2028backout_unlocked:
2029        unlock_page(page);
2030        put_page(page);
2031        goto out;
2032}
2033
2034int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2035                        unsigned long address, int write_access)
2036{
2037        pte_t *ptep;
2038        pte_t entry;
2039        int ret;
2040        struct page *pagecache_page = NULL;
2041        static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2042        struct hstate *h = hstate_vma(vma);
2043
2044        ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2045        if (!ptep)
2046                return VM_FAULT_OOM;
2047
2048        /*
2049         * Serialize hugepage allocation and instantiation, so that we don't
2050         * get spurious allocation failures if two CPUs race to instantiate
2051         * the same page in the page cache.
2052         */
2053        mutex_lock(&hugetlb_instantiation_mutex);
2054        entry = huge_ptep_get(ptep);
2055        if (huge_pte_none(entry)) {
2056                ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
2057                goto out_unlock;
2058        }
2059
2060        ret = 0;
2061
2062        /*
2063         * If we are going to COW the mapping later, we examine the pending
2064         * reservations for this page now. This will ensure that any
2065         * allocations necessary to record that reservation occur outside the
2066         * spinlock. For private mappings, we also lookup the pagecache
2067         * page now as it is used to determine if a reservation has been
2068         * consumed.
2069         */
2070        if (write_access && !pte_write(entry)) {
2071                if (vma_needs_reservation(h, vma, address) < 0) {
2072                        ret = VM_FAULT_OOM;
2073                        goto out_unlock;
2074                }
2075
2076                if (!(vma->vm_flags & VM_MAYSHARE))
2077                        pagecache_page = hugetlbfs_pagecache_page(h,
2078                                                                vma, address);
2079        }
2080
2081        spin_lock(&mm->page_table_lock);
2082        /* Check for a racing update before calling hugetlb_cow */
2083        if (likely(pte_same(entry, huge_ptep_get(ptep))))
2084                if (write_access && !pte_write(entry))
2085                        ret = hugetlb_cow(mm, vma, address, ptep, entry,
2086                                                        pagecache_page);
2087        spin_unlock(&mm->page_table_lock);
2088
2089        if (pagecache_page) {
2090                unlock_page(pagecache_page);
2091                put_page(pagecache_page);
2092        }
2093
2094out_unlock:
2095        mutex_unlock(&hugetlb_instantiation_mutex);
2096
2097        return ret;
2098}
2099
2100/* Can be overriden by architectures */
2101__attribute__((weak)) struct page *
2102follow_huge_pud(struct mm_struct *mm, unsigned long address,
2103               pud_t *pud, int write)
2104{
2105        BUG();
2106        return NULL;
2107}
2108
2109int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2110                        struct page **pages, struct vm_area_struct **vmas,
2111                        unsigned long *position, int *length, int i,
2112                        int write)
2113{
2114        unsigned long pfn_offset;
2115        unsigned long vaddr = *position;
2116        int remainder = *length;
2117        struct hstate *h = hstate_vma(vma);
2118
2119        spin_lock(&mm->page_table_lock);
2120        while (vaddr < vma->vm_end && remainder) {
2121                pte_t *pte;
2122                struct page *page;
2123
2124                /*
2125                 * Some archs (sparc64, sh*) have multiple pte_ts to
2126                 * each hugepage.  We have to make * sure we get the
2127                 * first, for the page indexing below to work.
2128                 */
2129                pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2130
2131                if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
2132                    (write && !pte_write(huge_ptep_get(pte)))) {
2133                        int ret;
2134
2135                        spin_unlock(&mm->page_table_lock);
2136                        ret = hugetlb_fault(mm, vma, vaddr, write);
2137                        spin_lock(&mm->page_table_lock);
2138                        if (!(ret & VM_FAULT_ERROR))
2139                                continue;
2140
2141                        remainder = 0;
2142                        if (!i)
2143                                i = -EFAULT;
2144                        break;
2145                }
2146
2147                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
2148                page = pte_page(huge_ptep_get(pte));
2149same_page:
2150                if (pages) {
2151                        get_page(page);
2152                        pages[i] = mem_map_offset(page, pfn_offset);
2153                }
2154
2155                if (vmas)
2156                        vmas[i] = vma;
2157
2158                vaddr += PAGE_SIZE;
2159                ++pfn_offset;
2160                --remainder;
2161                ++i;
2162                if (vaddr < vma->vm_end && remainder &&
2163                                pfn_offset < pages_per_huge_page(h)) {
2164                        /*
2165                         * We use pfn_offset to avoid touching the pageframes
2166                         * of this compound page.
2167                         */
2168                        goto same_page;
2169                }
2170        }
2171        spin_unlock(&mm->page_table_lock);
2172        *length = remainder;
2173        *position = vaddr;
2174
2175        return i;
2176}
2177
2178void hugetlb_change_protection(struct vm_area_struct *vma,
2179                unsigned long address, unsigned long end, pgprot_t newprot)
2180{
2181        struct mm_struct *mm = vma->vm_mm;
2182        unsigned long start = address;
2183        pte_t *ptep;
2184        pte_t pte;
2185        struct hstate *h = hstate_vma(vma);
2186
2187        BUG_ON(address >= end);
2188        flush_cache_range(vma, address, end);
2189
2190        spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
2191        spin_lock(&mm->page_table_lock);
2192        for (; address < end; address += huge_page_size(h)) {
2193                ptep = huge_pte_offset(mm, address);
2194                if (!ptep)
2195                        continue;
2196                if (huge_pmd_unshare(mm, &address, ptep))
2197                        continue;
2198                if (!huge_pte_none(huge_ptep_get(ptep))) {
2199                        pte = huge_ptep_get_and_clear(mm, address, ptep);
2200                        pte = pte_mkhuge(pte_modify(pte, newprot));
2201                        set_huge_pte_at(mm, address, ptep, pte);
2202                }
2203        }
2204        spin_unlock(&mm->page_table_lock);
2205        spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
2206
2207        flush_tlb_range(vma, start, end);
2208}
2209
2210int hugetlb_reserve_pages(struct inode *inode,
2211                                        long from, long to,
2212                                        struct vm_area_struct *vma)
2213{
2214        long ret, chg;
2215        struct hstate *h = hstate_inode(inode);
2216
2217        if (vma && vma->vm_flags & VM_NORESERVE)
2218                return 0;
2219
2220        /*
2221         * Shared mappings base their reservation on the number of pages that
2222         * are already allocated on behalf of the file. Private mappings need
2223         * to reserve the full area even if read-only as mprotect() may be
2224         * called to make the mapping read-write. Assume !vma is a shm mapping
2225         */
2226        if (!vma || vma->vm_flags & VM_MAYSHARE)
2227                chg = region_chg(&inode->i_mapping->private_list, from, to);
2228        else {
2229                struct resv_map *resv_map = resv_map_alloc();
2230                if (!resv_map)
2231                        return -ENOMEM;
2232
2233                chg = to - from;
2234
2235                set_vma_resv_map(vma, resv_map);
2236                set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2237        }
2238
2239        if (chg < 0)
2240                return chg;
2241
2242        if (hugetlb_get_quota(inode->i_mapping, chg))
2243                return -ENOSPC;
2244        ret = hugetlb_acct_memory(h, chg);
2245        if (ret < 0) {
2246                hugetlb_put_quota(inode->i_mapping, chg);
2247                return ret;
2248        }
2249        if (!vma || vma->vm_flags & VM_MAYSHARE)
2250                region_add(&inode->i_mapping->private_list, from, to);
2251        return 0;
2252}
2253
2254void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2255{
2256        struct hstate *h = hstate_inode(inode);
2257        long chg = region_truncate(&inode->i_mapping->private_list, offset);
2258
2259        spin_lock(&inode->i_lock);
2260        inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2261        spin_unlock(&inode->i_lock);
2262
2263        hugetlb_put_quota(inode->i_mapping, (chg - freed));
2264        hugetlb_acct_memory(h, -(chg - freed));
2265}
2266
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.