linux/mm/huge_memory.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 2009  Red Hat, Inc.
   3 *
   4 *  This work is licensed under the terms of the GNU GPL, version 2. See
   5 *  the COPYING file in the top-level directory.
   6 */
   7
   8#include <linux/mm.h>
   9#include <linux/sched.h>
  10#include <linux/highmem.h>
  11#include <linux/hugetlb.h>
  12#include <linux/mmu_notifier.h>
  13#include <linux/rmap.h>
  14#include <linux/swap.h>
  15#include <linux/mm_inline.h>
  16#include <linux/kthread.h>
  17#include <linux/khugepaged.h>
  18#include <linux/freezer.h>
  19#include <linux/mman.h>
  20#include <asm/tlb.h>
  21#include <asm/pgalloc.h>
  22#include "internal.h"
  23
  24/*
  25 * By default transparent hugepage support is enabled for all mappings
  26 * and khugepaged scans all mappings. Defrag is only invoked by
  27 * khugepaged hugepage allocations and by page faults inside
  28 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
  29 * allocations.
  30 */
  31unsigned long transparent_hugepage_flags __read_mostly =
  32#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
  33        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
  34#endif
  35#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
  36        (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
  37#endif
  38        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
  39        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
  40
  41/* default scan 8*512 pte (or vmas) every 30 second */
  42static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
  43static unsigned int khugepaged_pages_collapsed;
  44static unsigned int khugepaged_full_scans;
  45static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
  46/* during fragmentation poll the hugepage allocator once every minute */
  47static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
  48static struct task_struct *khugepaged_thread __read_mostly;
  49static DEFINE_MUTEX(khugepaged_mutex);
  50static DEFINE_SPINLOCK(khugepaged_mm_lock);
  51static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  52/*
  53 * default collapse hugepages if there is at least one pte mapped like
  54 * it would have happened if the vma was large enough during page
  55 * fault.
  56 */
  57static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
  58
  59static int khugepaged(void *none);
  60static int mm_slots_hash_init(void);
  61static int khugepaged_slab_init(void);
  62static void khugepaged_slab_free(void);
  63
  64#define MM_SLOTS_HASH_HEADS 1024
  65static struct hlist_head *mm_slots_hash __read_mostly;
  66static struct kmem_cache *mm_slot_cache __read_mostly;
  67
  68/**
  69 * struct mm_slot - hash lookup from mm to mm_slot
  70 * @hash: hash collision list
  71 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
  72 * @mm: the mm that this information is valid for
  73 */
  74struct mm_slot {
  75        struct hlist_node hash;
  76        struct list_head mm_node;
  77        struct mm_struct *mm;
  78};
  79
  80/**
  81 * struct khugepaged_scan - cursor for scanning
  82 * @mm_head: the head of the mm list to scan
  83 * @mm_slot: the current mm_slot we are scanning
  84 * @address: the next address inside that to be scanned
  85 *
  86 * There is only the one khugepaged_scan instance of this cursor structure.
  87 */
  88struct khugepaged_scan {
  89        struct list_head mm_head;
  90        struct mm_slot *mm_slot;
  91        unsigned long address;
  92};
  93static struct khugepaged_scan khugepaged_scan = {
  94        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  95};
  96
  97
  98static int set_recommended_min_free_kbytes(void)
  99{
 100        struct zone *zone;
 101        int nr_zones = 0;
 102        unsigned long recommended_min;
 103        extern int min_free_kbytes;
 104
 105        if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
 106                      &transparent_hugepage_flags) &&
 107            !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 108                      &transparent_hugepage_flags))
 109                return 0;
 110
 111        for_each_populated_zone(zone)
 112                nr_zones++;
 113
 114        /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
 115        recommended_min = pageblock_nr_pages * nr_zones * 2;
 116
 117        /*
 118         * Make sure that on average at least two pageblocks are almost free
 119         * of another type, one for a migratetype to fall back to and a
 120         * second to avoid subsequent fallbacks of other types There are 3
 121         * MIGRATE_TYPES we care about.
 122         */
 123        recommended_min += pageblock_nr_pages * nr_zones *
 124                           MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
 125
 126        /* don't ever allow to reserve more than 5% of the lowmem */
 127        recommended_min = min(recommended_min,
 128                              (unsigned long) nr_free_buffer_pages() / 20);
 129        recommended_min <<= (PAGE_SHIFT-10);
 130
 131        if (recommended_min > min_free_kbytes)
 132                min_free_kbytes = recommended_min;
 133        setup_per_zone_wmarks();
 134        return 0;
 135}
 136late_initcall(set_recommended_min_free_kbytes);
 137
 138static int start_khugepaged(void)
 139{
 140        int err = 0;
 141        if (khugepaged_enabled()) {
 142                int wakeup;
 143                if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
 144                        err = -ENOMEM;
 145                        goto out;
 146                }
 147                mutex_lock(&khugepaged_mutex);
 148                if (!khugepaged_thread)
 149                        khugepaged_thread = kthread_run(khugepaged, NULL,
 150                                                        "khugepaged");
 151                if (unlikely(IS_ERR(khugepaged_thread))) {
 152                        printk(KERN_ERR
 153                               "khugepaged: kthread_run(khugepaged) failed\n");
 154                        err = PTR_ERR(khugepaged_thread);
 155                        khugepaged_thread = NULL;
 156                }
 157                wakeup = !list_empty(&khugepaged_scan.mm_head);
 158                mutex_unlock(&khugepaged_mutex);
 159                if (wakeup)
 160                        wake_up_interruptible(&khugepaged_wait);
 161
 162                set_recommended_min_free_kbytes();
 163        } else
 164                /* wakeup to exit */
 165                wake_up_interruptible(&khugepaged_wait);
 166out:
 167        return err;
 168}
 169
 170#ifdef CONFIG_SYSFS
 171
 172static ssize_t double_flag_show(struct kobject *kobj,
 173                                struct kobj_attribute *attr, char *buf,
 174                                enum transparent_hugepage_flag enabled,
 175                                enum transparent_hugepage_flag req_madv)
 176{
 177        if (test_bit(enabled, &transparent_hugepage_flags)) {
 178                VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
 179                return sprintf(buf, "[always] madvise never\n");
 180        } else if (test_bit(req_madv, &transparent_hugepage_flags))
 181                return sprintf(buf, "always [madvise] never\n");
 182        else
 183                return sprintf(buf, "always madvise [never]\n");
 184}
 185static ssize_t double_flag_store(struct kobject *kobj,
 186                                 struct kobj_attribute *attr,
 187                                 const char *buf, size_t count,
 188                                 enum transparent_hugepage_flag enabled,
 189                                 enum transparent_hugepage_flag req_madv)
 190{
 191        if (!memcmp("always", buf,
 192                    min(sizeof("always")-1, count))) {
 193                set_bit(enabled, &transparent_hugepage_flags);
 194                clear_bit(req_madv, &transparent_hugepage_flags);
 195        } else if (!memcmp("madvise", buf,
 196                           min(sizeof("madvise")-1, count))) {
 197                clear_bit(enabled, &transparent_hugepage_flags);
 198                set_bit(req_madv, &transparent_hugepage_flags);
 199        } else if (!memcmp("never", buf,
 200                           min(sizeof("never")-1, count))) {
 201                clear_bit(enabled, &transparent_hugepage_flags);
 202                clear_bit(req_madv, &transparent_hugepage_flags);
 203        } else
 204                return -EINVAL;
 205
 206        return count;
 207}
 208
 209static ssize_t enabled_show(struct kobject *kobj,
 210                            struct kobj_attribute *attr, char *buf)
 211{
 212        return double_flag_show(kobj, attr, buf,
 213                                TRANSPARENT_HUGEPAGE_FLAG,
 214                                TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
 215}
 216static ssize_t enabled_store(struct kobject *kobj,
 217                             struct kobj_attribute *attr,
 218                             const char *buf, size_t count)
 219{
 220        ssize_t ret;
 221
 222        ret = double_flag_store(kobj, attr, buf, count,
 223                                TRANSPARENT_HUGEPAGE_FLAG,
 224                                TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
 225
 226        if (ret > 0) {
 227                int err = start_khugepaged();
 228                if (err)
 229                        ret = err;
 230        }
 231
 232        if (ret > 0 &&
 233            (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
 234                      &transparent_hugepage_flags) ||
 235             test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 236                      &transparent_hugepage_flags)))
 237                set_recommended_min_free_kbytes();
 238
 239        return ret;
 240}
 241static struct kobj_attribute enabled_attr =
 242        __ATTR(enabled, 0644, enabled_show, enabled_store);
 243
 244static ssize_t single_flag_show(struct kobject *kobj,
 245                                struct kobj_attribute *attr, char *buf,
 246                                enum transparent_hugepage_flag flag)
 247{
 248        return sprintf(buf, "%d\n",
 249                       !!test_bit(flag, &transparent_hugepage_flags));
 250}
 251
 252static ssize_t single_flag_store(struct kobject *kobj,
 253                                 struct kobj_attribute *attr,
 254                                 const char *buf, size_t count,
 255                                 enum transparent_hugepage_flag flag)
 256{
 257        unsigned long value;
 258        int ret;
 259
 260        ret = kstrtoul(buf, 10, &value);
 261        if (ret < 0)
 262                return ret;
 263        if (value > 1)
 264                return -EINVAL;
 265
 266        if (value)
 267                set_bit(flag, &transparent_hugepage_flags);
 268        else
 269                clear_bit(flag, &transparent_hugepage_flags);
 270
 271        return count;
 272}
 273
 274/*
 275 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
 276 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
 277 * memory just to allocate one more hugepage.
 278 */
 279static ssize_t defrag_show(struct kobject *kobj,
 280                           struct kobj_attribute *attr, char *buf)
 281{
 282        return double_flag_show(kobj, attr, buf,
 283                                TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
 284                                TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
 285}
 286static ssize_t defrag_store(struct kobject *kobj,
 287                            struct kobj_attribute *attr,
 288                            const char *buf, size_t count)
 289{
 290        return double_flag_store(kobj, attr, buf, count,
 291                                 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
 292                                 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
 293}
 294static struct kobj_attribute defrag_attr =
 295        __ATTR(defrag, 0644, defrag_show, defrag_store);
 296
 297#ifdef CONFIG_DEBUG_VM
 298static ssize_t debug_cow_show(struct kobject *kobj,
 299                                struct kobj_attribute *attr, char *buf)
 300{
 301        return single_flag_show(kobj, attr, buf,
 302                                TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
 303}
 304static ssize_t debug_cow_store(struct kobject *kobj,
 305                               struct kobj_attribute *attr,
 306                               const char *buf, size_t count)
 307{
 308        return single_flag_store(kobj, attr, buf, count,
 309                                 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
 310}
 311static struct kobj_attribute debug_cow_attr =
 312        __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
 313#endif /* CONFIG_DEBUG_VM */
 314
 315static struct attribute *hugepage_attr[] = {
 316        &enabled_attr.attr,
 317        &defrag_attr.attr,
 318#ifdef CONFIG_DEBUG_VM
 319        &debug_cow_attr.attr,
 320#endif
 321        NULL,
 322};
 323
 324static struct attribute_group hugepage_attr_group = {
 325        .attrs = hugepage_attr,
 326};
 327
 328static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
 329                                         struct kobj_attribute *attr,
 330                                         char *buf)
 331{
 332        return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
 333}
 334
 335static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
 336                                          struct kobj_attribute *attr,
 337                                          const char *buf, size_t count)
 338{
 339        unsigned long msecs;
 340        int err;
 341
 342        err = strict_strtoul(buf, 10, &msecs);
 343        if (err || msecs > UINT_MAX)
 344                return -EINVAL;
 345
 346        khugepaged_scan_sleep_millisecs = msecs;
 347        wake_up_interruptible(&khugepaged_wait);
 348
 349        return count;
 350}
 351static struct kobj_attribute scan_sleep_millisecs_attr =
 352        __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
 353               scan_sleep_millisecs_store);
 354
 355static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
 356                                          struct kobj_attribute *attr,
 357                                          char *buf)
 358{
 359        return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
 360}
 361
 362static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
 363                                           struct kobj_attribute *attr,
 364                                           const char *buf, size_t count)
 365{
 366        unsigned long msecs;
 367        int err;
 368
 369        err = strict_strtoul(buf, 10, &msecs);
 370        if (err || msecs > UINT_MAX)
 371                return -EINVAL;
 372
 373        khugepaged_alloc_sleep_millisecs = msecs;
 374        wake_up_interruptible(&khugepaged_wait);
 375
 376        return count;
 377}
 378static struct kobj_attribute alloc_sleep_millisecs_attr =
 379        __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
 380               alloc_sleep_millisecs_store);
 381
 382static ssize_t pages_to_scan_show(struct kobject *kobj,
 383                                  struct kobj_attribute *attr,
 384                                  char *buf)
 385{
 386        return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
 387}
 388static ssize_t pages_to_scan_store(struct kobject *kobj,
 389                                   struct kobj_attribute *attr,
 390                                   const char *buf, size_t count)
 391{
 392        int err;
 393        unsigned long pages;
 394
 395        err = strict_strtoul(buf, 10, &pages);
 396        if (err || !pages || pages > UINT_MAX)
 397                return -EINVAL;
 398
 399        khugepaged_pages_to_scan = pages;
 400
 401        return count;
 402}
 403static struct kobj_attribute pages_to_scan_attr =
 404        __ATTR(pages_to_scan, 0644, pages_to_scan_show,
 405               pages_to_scan_store);
 406
 407static ssize_t pages_collapsed_show(struct kobject *kobj,
 408                                    struct kobj_attribute *attr,
 409                                    char *buf)
 410{
 411        return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
 412}
 413static struct kobj_attribute pages_collapsed_attr =
 414        __ATTR_RO(pages_collapsed);
 415
 416static ssize_t full_scans_show(struct kobject *kobj,
 417                               struct kobj_attribute *attr,
 418                               char *buf)
 419{
 420        return sprintf(buf, "%u\n", khugepaged_full_scans);
 421}
 422static struct kobj_attribute full_scans_attr =
 423        __ATTR_RO(full_scans);
 424
 425static ssize_t khugepaged_defrag_show(struct kobject *kobj,
 426                                      struct kobj_attribute *attr, char *buf)
 427{
 428        return single_flag_show(kobj, attr, buf,
 429                                TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
 430}
 431static ssize_t khugepaged_defrag_store(struct kobject *kobj,
 432                                       struct kobj_attribute *attr,
 433                                       const char *buf, size_t count)
 434{
 435        return single_flag_store(kobj, attr, buf, count,
 436                                 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
 437}
 438static struct kobj_attribute khugepaged_defrag_attr =
 439        __ATTR(defrag, 0644, khugepaged_defrag_show,
 440               khugepaged_defrag_store);
 441
 442/*
 443 * max_ptes_none controls if khugepaged should collapse hugepages over
 444 * any unmapped ptes in turn potentially increasing the memory
 445 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
 446 * reduce the available free memory in the system as it
 447 * runs. Increasing max_ptes_none will instead potentially reduce the
 448 * free memory in the system during the khugepaged scan.
 449 */
 450static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
 451                                             struct kobj_attribute *attr,
 452                                             char *buf)
 453{
 454        return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
 455}
 456static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
 457                                              struct kobj_attribute *attr,
 458                                              const char *buf, size_t count)
 459{
 460        int err;
 461        unsigned long max_ptes_none;
 462
 463        err = strict_strtoul(buf, 10, &max_ptes_none);
 464        if (err || max_ptes_none > HPAGE_PMD_NR-1)
 465                return -EINVAL;
 466
 467        khugepaged_max_ptes_none = max_ptes_none;
 468
 469        return count;
 470}
 471static struct kobj_attribute khugepaged_max_ptes_none_attr =
 472        __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
 473               khugepaged_max_ptes_none_store);
 474
 475static struct attribute *khugepaged_attr[] = {
 476        &khugepaged_defrag_attr.attr,
 477        &khugepaged_max_ptes_none_attr.attr,
 478        &pages_to_scan_attr.attr,
 479        &pages_collapsed_attr.attr,
 480        &full_scans_attr.attr,
 481        &scan_sleep_millisecs_attr.attr,
 482        &alloc_sleep_millisecs_attr.attr,
 483        NULL,
 484};
 485
 486static struct attribute_group khugepaged_attr_group = {
 487        .attrs = khugepaged_attr,
 488        .name = "khugepaged",
 489};
 490
 491static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 492{
 493        int err;
 494
 495        *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 496        if (unlikely(!*hugepage_kobj)) {
 497                printk(KERN_ERR "hugepage: failed kobject create\n");
 498                return -ENOMEM;
 499        }
 500
 501        err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
 502        if (err) {
 503                printk(KERN_ERR "hugepage: failed register hugeage group\n");
 504                goto delete_obj;
 505        }
 506
 507        err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
 508        if (err) {
 509                printk(KERN_ERR "hugepage: failed register hugeage group\n");
 510                goto remove_hp_group;
 511        }
 512
 513        return 0;
 514
 515remove_hp_group:
 516        sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
 517delete_obj:
 518        kobject_put(*hugepage_kobj);
 519        return err;
 520}
 521
 522static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 523{
 524        sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
 525        sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
 526        kobject_put(hugepage_kobj);
 527}
 528#else
 529static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
 530{
 531        return 0;
 532}
 533
 534static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 535{
 536}
 537#endif /* CONFIG_SYSFS */
 538
 539static int __init hugepage_init(void)
 540{
 541        int err;
 542        struct kobject *hugepage_kobj;
 543
 544        if (!has_transparent_hugepage()) {
 545                transparent_hugepage_flags = 0;
 546                return -EINVAL;
 547        }
 548
 549        err = hugepage_init_sysfs(&hugepage_kobj);
 550        if (err)
 551                return err;
 552
 553        err = khugepaged_slab_init();
 554        if (err)
 555                goto out;
 556
 557        err = mm_slots_hash_init();
 558        if (err) {
 559                khugepaged_slab_free();
 560                goto out;
 561        }
 562
 563        /*
 564         * By default disable transparent hugepages on smaller systems,
 565         * where the extra memory used could hurt more than TLB overhead
 566         * is likely to save.  The admin can still enable it through /sys.
 567         */
 568        if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
 569                transparent_hugepage_flags = 0;
 570
 571        start_khugepaged();
 572
 573        set_recommended_min_free_kbytes();
 574
 575        return 0;
 576out:
 577        hugepage_exit_sysfs(hugepage_kobj);
 578        return err;
 579}
 580module_init(hugepage_init)
 581
 582static int __init setup_transparent_hugepage(char *str)
 583{
 584        int ret = 0;
 585        if (!str)
 586                goto out;
 587        if (!strcmp(str, "always")) {
 588                set_bit(TRANSPARENT_HUGEPAGE_FLAG,
 589                        &transparent_hugepage_flags);
 590                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 591                          &transparent_hugepage_flags);
 592                ret = 1;
 593        } else if (!strcmp(str, "madvise")) {
 594                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 595                          &transparent_hugepage_flags);
 596                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 597                        &transparent_hugepage_flags);
 598                ret = 1;
 599        } else if (!strcmp(str, "never")) {
 600                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 601                          &transparent_hugepage_flags);
 602                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 603                          &transparent_hugepage_flags);
 604                ret = 1;
 605        }
 606out:
 607        if (!ret)
 608                printk(KERN_WARNING
 609                       "transparent_hugepage= cannot parse, ignored\n");
 610        return ret;
 611}
 612__setup("transparent_hugepage=", setup_transparent_hugepage);
 613
 614static void prepare_pmd_huge_pte(pgtable_t pgtable,
 615                                 struct mm_struct *mm)
 616{
 617        assert_spin_locked(&mm->page_table_lock);
 618
 619        /* FIFO */
 620        if (!mm->pmd_huge_pte)
 621                INIT_LIST_HEAD(&pgtable->lru);
 622        else
 623                list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
 624        mm->pmd_huge_pte = pgtable;
 625}
 626
 627static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 628{
 629        if (likely(vma->vm_flags & VM_WRITE))
 630                pmd = pmd_mkwrite(pmd);
 631        return pmd;
 632}
 633
 634static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 635                                        struct vm_area_struct *vma,
 636                                        unsigned long haddr, pmd_t *pmd,
 637                                        struct page *page)
 638{
 639        pgtable_t pgtable;
 640
 641        VM_BUG_ON(!PageCompound(page));
 642        pgtable = pte_alloc_one(mm, haddr);
 643        if (unlikely(!pgtable))
 644                return VM_FAULT_OOM;
 645
 646        clear_huge_page(page, haddr, HPAGE_PMD_NR);
 647        __SetPageUptodate(page);
 648
 649        spin_lock(&mm->page_table_lock);
 650        if (unlikely(!pmd_none(*pmd))) {
 651                spin_unlock(&mm->page_table_lock);
 652                mem_cgroup_uncharge_page(page);
 653                put_page(page);
 654                pte_free(mm, pgtable);
 655        } else {
 656                pmd_t entry;
 657                entry = mk_pmd(page, vma->vm_page_prot);
 658                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 659                entry = pmd_mkhuge(entry);
 660                /*
 661                 * The spinlocking to take the lru_lock inside
 662                 * page_add_new_anon_rmap() acts as a full memory
 663                 * barrier to be sure clear_huge_page writes become
 664                 * visible after the set_pmd_at() write.
 665                 */
 666                page_add_new_anon_rmap(page, vma, haddr);
 667                set_pmd_at(mm, haddr, pmd, entry);
 668                prepare_pmd_huge_pte(pgtable, mm);
 669                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 670                mm->nr_ptes++;
 671                spin_unlock(&mm->page_table_lock);
 672        }
 673
 674        return 0;
 675}
 676
 677static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 678{
 679        return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
 680}
 681
 682static inline struct page *alloc_hugepage_vma(int defrag,
 683                                              struct vm_area_struct *vma,
 684                                              unsigned long haddr, int nd,
 685                                              gfp_t extra_gfp)
 686{
 687        return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
 688                               HPAGE_PMD_ORDER, vma, haddr, nd);
 689}
 690
 691#ifndef CONFIG_NUMA
 692static inline struct page *alloc_hugepage(int defrag)
 693{
 694        return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
 695                           HPAGE_PMD_ORDER);
 696}
 697#endif
 698
 699int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 700                               unsigned long address, pmd_t *pmd,
 701                               unsigned int flags)
 702{
 703        struct page *page;
 704        unsigned long haddr = address & HPAGE_PMD_MASK;
 705        pte_t *pte;
 706
 707        if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
 708                if (unlikely(anon_vma_prepare(vma)))
 709                        return VM_FAULT_OOM;
 710                if (unlikely(khugepaged_enter(vma)))
 711                        return VM_FAULT_OOM;
 712                page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 713                                          vma, haddr, numa_node_id(), 0);
 714                if (unlikely(!page)) {
 715                        count_vm_event(THP_FAULT_FALLBACK);
 716                        goto out;
 717                }
 718                count_vm_event(THP_FAULT_ALLOC);
 719                if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
 720                        put_page(page);
 721                        goto out;
 722                }
 723                if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
 724                                                          page))) {
 725                        mem_cgroup_uncharge_page(page);
 726                        put_page(page);
 727                        goto out;
 728                }
 729
 730                return 0;
 731        }
 732out:
 733        /*
 734         * Use __pte_alloc instead of pte_alloc_map, because we can't
 735         * run pte_offset_map on the pmd, if an huge pmd could
 736         * materialize from under us from a different thread.
 737         */
 738        if (unlikely(__pte_alloc(mm, vma, pmd, address)))
 739                return VM_FAULT_OOM;
 740        /* if an huge pmd materialized from under us just retry later */
 741        if (unlikely(pmd_trans_huge(*pmd)))
 742                return 0;
 743        /*
 744         * A regular pmd is established and it can't morph into a huge pmd
 745         * from under us anymore at this point because we hold the mmap_sem
 746         * read mode and khugepaged takes it in write mode. So now it's
 747         * safe to run pte_offset_map().
 748         */
 749        pte = pte_offset_map(pmd, address);
 750        return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 751}
 752
 753int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 754                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 755                  struct vm_area_struct *vma)
 756{
 757        struct page *src_page;
 758        pmd_t pmd;
 759        pgtable_t pgtable;
 760        int ret;
 761
 762        ret = -ENOMEM;
 763        pgtable = pte_alloc_one(dst_mm, addr);
 764        if (unlikely(!pgtable))
 765                goto out;
 766
 767        spin_lock(&dst_mm->page_table_lock);
 768        spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
 769
 770        ret = -EAGAIN;
 771        pmd = *src_pmd;
 772        if (unlikely(!pmd_trans_huge(pmd))) {
 773                pte_free(dst_mm, pgtable);
 774                goto out_unlock;
 775        }
 776        if (unlikely(pmd_trans_splitting(pmd))) {
 777                /* split huge page running from under us */
 778                spin_unlock(&src_mm->page_table_lock);
 779                spin_unlock(&dst_mm->page_table_lock);
 780                pte_free(dst_mm, pgtable);
 781
 782                wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
 783                goto out;
 784        }
 785        src_page = pmd_page(pmd);
 786        VM_BUG_ON(!PageHead(src_page));
 787        get_page(src_page);
 788        page_dup_rmap(src_page);
 789        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 790
 791        pmdp_set_wrprotect(src_mm, addr, src_pmd);
 792        pmd = pmd_mkold(pmd_wrprotect(pmd));
 793        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 794        prepare_pmd_huge_pte(pgtable, dst_mm);
 795        dst_mm->nr_ptes++;
 796
 797        ret = 0;
 798out_unlock:
 799        spin_unlock(&src_mm->page_table_lock);
 800        spin_unlock(&dst_mm->page_table_lock);
 801out:
 802        return ret;
 803}
 804
 805/* no "address" argument so destroys page coloring of some arch */
 806pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
 807{
 808        pgtable_t pgtable;
 809
 810        assert_spin_locked(&mm->page_table_lock);
 811
 812        /* FIFO */
 813        pgtable = mm->pmd_huge_pte;
 814        if (list_empty(&pgtable->lru))
 815                mm->pmd_huge_pte = NULL;
 816        else {
 817                mm->pmd_huge_pte = list_entry(pgtable->lru.next,
 818                                              struct page, lru);
 819                list_del(&pgtable->lru);
 820        }
 821        return pgtable;
 822}
 823
 824static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 825                                        struct vm_area_struct *vma,
 826                                        unsigned long address,
 827                                        pmd_t *pmd, pmd_t orig_pmd,
 828                                        struct page *page,
 829                                        unsigned long haddr)
 830{
            /*
             * Write-protect fault fallback: replace the huge page mapped by
             * @pmd with HPAGE_PMD_NR freshly allocated base pages holding a
             * copy of its contents, mapped through the page table returned
             * by get_pmd_huge_pte().
             *
             * Returns VM_FAULT_WRITE when the new ptes were installed,
             * VM_FAULT_OOM when an allocation or memcg charge failed, and 0
             * when *pmd no longer equals @orig_pmd (the copies are then
             * discarded).
             */
 831        pgtable_t pgtable;
 832        pmd_t _pmd;
 833        int ret = 0, i;
 834        struct page **pages;
 835
 836        pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
 837                        GFP_KERNEL);
 838        if (unlikely(!pages)) {
 839                ret |= VM_FAULT_OOM;
 840                goto out;
 841        }
 842
            /* allocate and memcg-charge one base page per subpage */
 843        for (i = 0; i < HPAGE_PMD_NR; i++) {
 844                pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
 845                                               __GFP_OTHER_NODE,
 846                                               vma, address, page_to_nid(page));
 847                if (unlikely(!pages[i] ||
 848                             mem_cgroup_newpage_charge(pages[i], mm,
 849                                                       GFP_KERNEL))) {
                            /*
                             * pages[i], if it was allocated, failed its
                             * charge and is only released; pages[0..i-1]
                             * are uncharged and released.
                             */
 850                        if (pages[i])
 851                                put_page(pages[i]);
 852                        mem_cgroup_uncharge_start();
 853                        while (--i >= 0) {
 854                                mem_cgroup_uncharge_page(pages[i]);
 855                                put_page(pages[i]);
 856                        }
 857                        mem_cgroup_uncharge_end();
 858                        kfree(pages);
 859                        ret |= VM_FAULT_OOM;
 860                        goto out;
 861                }
 862        }
 863
            /* copy the contents before taking page_table_lock */
 864        for (i = 0; i < HPAGE_PMD_NR; i++) {
 865                copy_user_highpage(pages[i], page + i,
 866                                   haddr + PAGE_SIZE * i, vma);
 867                __SetPageUptodate(pages[i]);
 868                cond_resched();
 869        }
 870
 871        spin_lock(&mm->page_table_lock);
            /* the pmd may have changed while the lock was not held */
 872        if (unlikely(!pmd_same(*pmd, orig_pmd)))
 873                goto out_free_pages;
 874        VM_BUG_ON(!PageHead(page));
 875
 876        pmdp_clear_flush_notify(vma, haddr, pmd);
 877        /* leave pmd empty until pte is filled */
 878
            /* fill the ptes through a local pmd value, not the live entry */
 879        pgtable = get_pmd_huge_pte(mm);
 880        pmd_populate(mm, &_pmd, pgtable);
 881
 882        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
 883                pte_t *pte, entry;
 884                entry = mk_pte(pages[i], vma->vm_page_prot);
 885                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 886                page_add_new_anon_rmap(pages[i], vma, haddr);
 887                pte = pte_offset_map(&_pmd, haddr);
 888                VM_BUG_ON(!pte_none(*pte));
 889                set_pte_at(mm, haddr, pte, entry);
 890                pte_unmap(pte);
 891        }
 892        kfree(pages);
 893
 894        smp_wmb(); /* make pte visible before pmd */
 895        pmd_populate(mm, pmd, pgtable);
 896        page_remove_rmap(page);
 897        spin_unlock(&mm->page_table_lock);
 898
 899        ret |= VM_FAULT_WRITE;
            /* NOTE(review): presumably the reference held by the replaced mapping - confirm */
 900        put_page(page);
 901
 902out:
 903        return ret;
 904
 905out_free_pages:
            /* pmd changed under us: uncharge and release every copy, ret stays 0 */
 906        spin_unlock(&mm->page_table_lock);
 907        mem_cgroup_uncharge_start();
 908        for (i = 0; i < HPAGE_PMD_NR; i++) {
 909                mem_cgroup_uncharge_page(pages[i]);
 910                put_page(pages[i]);
 911        }
 912        mem_cgroup_uncharge_end();
 913        kfree(pages);
 914        goto out;
 915}
 916
 917int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 918                        unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
 919{
            /*
             * Handle a write fault on the huge pmd @pmd, whose value was
             * @orig_pmd when the fault was taken.
             *
             * If the page has a single mapping the pmd is made writable in
             * place.  Otherwise a new huge page is allocated, filled with a
             * copy and installed; when no huge page can be allocated the
             * work is handed to do_huge_pmd_wp_page_fallback().
             *
             * Returns a mask of VM_FAULT_* bits: VM_FAULT_WRITE when a
             * writable mapping was established, VM_FAULT_OOM on allocation
             * or memcg charge failure, 0 when *pmd changed under us.
             */
 920        int ret = 0;
 921        struct page *page, *new_page;
 922        unsigned long haddr;
 923
 924        VM_BUG_ON(!vma->anon_vma);
 925        spin_lock(&mm->page_table_lock);
 926        if (unlikely(!pmd_same(*pmd, orig_pmd)))
 927                goto out_unlock;
 928
 929        page = pmd_page(orig_pmd);
 930        VM_BUG_ON(!PageCompound(page) || !PageHead(page));
 931        haddr = address & HPAGE_PMD_MASK;
            /* only one mapping: no copy needed, just upgrade the pmd */
 932        if (page_mapcount(page) == 1) {
 933                pmd_t entry;
 934                entry = pmd_mkyoung(orig_pmd);
 935                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 936                if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
 937                        update_mmu_cache(vma, address, entry);
 938                ret |= VM_FAULT_WRITE;
 939                goto out_unlock;
 940        }
            /* hold a reference on the old page while the lock is dropped */
 941        get_page(page);
 942        spin_unlock(&mm->page_table_lock);
 943
 944        if (transparent_hugepage_enabled(vma) &&
 945            !transparent_hugepage_debug_cow())
 946                new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 947                                              vma, haddr, numa_node_id(), 0);
 948        else
 949                new_page = NULL;
 950
 951        if (unlikely(!new_page)) {
 952                count_vm_event(THP_FAULT_FALLBACK);
 953                ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 954                                                   pmd, orig_pmd, page, haddr);
                    /* the fallback could not allocate either: split the huge page */
 955                if (ret & VM_FAULT_OOM)
 956                        split_huge_page(page);
 957                put_page(page);
 958                goto out;
 959        }
 960        count_vm_event(THP_FAULT_ALLOC);
 961
 962        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 963                put_page(new_page);
 964                split_huge_page(page);
 965                put_page(page);
 966                ret |= VM_FAULT_OOM;
 967                goto out;
 968        }
 969
 970        copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 971        __SetPageUptodate(new_page);
 972
 973        spin_lock(&mm->page_table_lock);
            /* pairs with the get_page() done before the lock was released */
 974        put_page(page);
 975        if (unlikely(!pmd_same(*pmd, orig_pmd))) {
                    /* pmd changed while unlocked: discard the copy, ret stays 0 */
 976                spin_unlock(&mm->page_table_lock);
 977                mem_cgroup_uncharge_page(new_page);
 978                put_page(new_page);
 979                goto out;
 980        } else {
 981                pmd_t entry;
 982                VM_BUG_ON(!PageHead(page));
 983                entry = mk_pmd(new_page, vma->vm_page_prot);
 984                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 985                entry = pmd_mkhuge(entry);
 986                pmdp_clear_flush_notify(vma, haddr, pmd);
 987                page_add_new_anon_rmap(new_page, vma, haddr);
 988                set_pmd_at(mm, haddr, pmd, entry);
 989                update_mmu_cache(vma, address, entry);
 990                page_remove_rmap(page);
                    /* NOTE(review): presumably the reference held by the replaced mapping - confirm */
 991                put_page(page);
 992                ret |= VM_FAULT_WRITE;
 993        }
 994out_unlock:
 995        spin_unlock(&mm->page_table_lock);
 996out:
 997        return ret;
 998}
 999
1000struct page *follow_trans_huge_pmd(struct mm_struct *mm,
1001                                   unsigned long addr,
1002                                   pmd_t *pmd,
1003                                   unsigned int flags)
1004{
1005        struct page *page = NULL;
1006
1007        assert_spin_locked(&mm->page_table_lock);
1008
1009        if (flags & FOLL_WRITE && !pmd_write(*pmd))
1010                goto out;
1011
1012        page = pmd_page(*pmd);
1013        VM_BUG_ON(!PageHead(page));
1014        if (flags & FOLL_TOUCH) {
1015                pmd_t _pmd;
1016                /*
1017                 * We should set the dirty bit only for FOLL_WRITE but
1018                 * for now the dirty bit in the pmd is meaningless.
1019                 * And if the dirty bit will become meaningful and
1020                 * we'll only set it with FOLL_WRITE, an atomic
1021                 * set_bit will be required on the pmd to set the
1022                 * young bit, instead of the current set_pmd_at.
1023                 */
1024                _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1025                set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1026        }
1027        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1028        VM_BUG_ON(!PageCompound(page));
1029        if (flags & FOLL_GET)
1030                get_page_foll(page);
1031
1032out:
1033        return page;
1034}
1035
1036int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1037                 pmd_t *pmd, unsigned long addr)
1038{
1039        int ret = 0;
1040
1041        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1042                struct page *page;
1043                pgtable_t pgtable;
1044                pgtable = get_pmd_huge_pte(tlb->mm);
1045                page = pmd_page(*pmd);
1046                pmd_clear(pmd);
1047                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1048                page_remove_rmap(page);
1049                VM_BUG_ON(page_mapcount(page) < 0);
1050                add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1051                VM_BUG_ON(!PageHead(page));
1052                tlb->mm->nr_ptes--;
1053                spin_unlock(&tlb->mm->page_table_lock);
1054                tlb_remove_page(tlb, page);
1055                pte_free(tlb->mm, pgtable);
1056                ret = 1;
1057        }
1058        return ret;
1059}
1060
1061int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1062                unsigned long addr, unsigned long end,
1063                unsigned char *vec)
1064{
1065        int ret = 0;
1066
1067        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1068                /*
1069                 * All logical pages in the range are present
1070                 * if backed by a huge page.
1071                 */
1072                spin_unlock(&vma->vm_mm->page_table_lock);
1073                memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1074                ret = 1;
1075        }
1076
1077        return ret;
1078}
1079
1080int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1081                  unsigned long old_addr,
1082                  unsigned long new_addr, unsigned long old_end,
1083                  pmd_t *old_pmd, pmd_t *new_pmd)
1084{
1085        int ret = 0;
1086        pmd_t pmd;
1087
1088        struct mm_struct *mm = vma->vm_mm;
1089
1090        if ((old_addr & ~HPAGE_PMD_MASK) ||
1091            (new_addr & ~HPAGE_PMD_MASK) ||
1092            old_end - old_addr < HPAGE_PMD_SIZE ||
1093            (new_vma->vm_flags & VM_NOHUGEPAGE))
1094                goto out;
1095
1096        /*
1097         * The destination pmd shouldn't be established, free_pgtables()
1098         * should have release it.
1099         */
1100        if (WARN_ON(!pmd_none(*new_pmd))) {
1101                VM_BUG_ON(pmd_trans_huge(*new_pmd));
1102                goto out;
1103        }
1104
1105        ret = __pmd_trans_huge_lock(old_pmd, vma);
1106        if (ret == 1) {
1107                pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1108                VM_BUG_ON(!pmd_none(*new_pmd));
1109                set_pmd_at(mm, new_addr, new_pmd, pmd);
1110                spin_unlock(&mm->page_table_lock);
1111        }
1112out:
1113        return ret;
1114}
1115
1116int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1117                unsigned long addr, pgprot_t newprot)
1118{
1119        struct mm_struct *mm = vma->vm_mm;
1120        int ret = 0;
1121
1122        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1123                pmd_t entry;
1124                entry = pmdp_get_and_clear(mm, addr, pmd);
1125                entry = pmd_modify(entry, newprot);
1126                set_pmd_at(mm, addr, pmd, entry);
1127                spin_unlock(&vma->vm_mm->page_table_lock);
1128                ret = 1;
1129        }
1130
1131        return ret;
1132}
1133
1134/*
1135 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1136 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1137 *
1138 * Note that if it returns 1, this routine returns without unlocking page
1139 * table locks. So callers must unlock them.
1140 */
1141int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1142{
1143        spin_lock(&vma->vm_mm->page_table_lock);
1144        if (likely(pmd_trans_huge(*pmd))) {
1145                if (unlikely(pmd_trans_splitting(*pmd))) {
1146                        spin_unlock(&vma->vm_mm->page_table_lock);
1147                        wait_split_huge_page(vma->anon_vma, pmd);
1148                        return -1;
1149                } else {
1150                        /* Thp mapped by 'pmd' is stable, so we can
1151                         * handle it as it is. */
1152                        return 1;
1153                }
1154        }
1155        spin_unlock(&vma->vm_mm->page_table_lock);
1156        return 0;
1157}
1158
1159pmd_t *page_check_address_pmd(struct page *page,
1160                              struct mm_struct *mm,
1161                              unsigned long address,
1162                              enum page_check_address_pmd_flag flag)
1163{
1164        pgd_t *pgd;
1165        pud_t *pud;
1166        pmd_t *pmd, *ret = NULL;
1167
1168        if (address & ~HPAGE_PMD_MASK)
1169                goto out;
1170
1171        pgd = pgd_offset(mm, address);
1172        if (!pgd_present(*pgd))
1173                goto out;
1174
1175        pud = pud_offset(pgd, address);
1176        if (!pud_present(*pud))
1177                goto out;
1178
1179        pmd = pmd_offset(pud, address);
1180        if (pmd_none(*pmd))
1181                goto out;
1182        if (pmd_page(*pmd) != page)
1183                goto out;
1184        /*
1185         * split_vma() may create temporary aliased mappings. There is
1186         * no risk as long as all huge pmd are found and have their
1187         * splitting bit set before __split_huge_page_refcount
1188         * runs. Finding the same huge pmd more than once during the
1189         * same rmap walk is not a problem.
1190         */
1191        if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1192            pmd_trans_splitting(*pmd))
1193                goto out;
1194        if (pmd_trans_huge(*pmd)) {
1195                VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1196                          !pmd_trans_splitting(*pmd));
1197                ret = pmd;
1198        }
1199out:
1200        return ret;
1201}
1202
1203static int __split_huge_page_splitting(struct page *page,
1204                                       struct vm_area_struct *vma,
1205                                       unsigned long address)
1206{
1207        struct mm_struct *mm = vma->vm_mm;
1208        pmd_t *pmd;
1209        int ret = 0;
1210
1211        spin_lock(&mm->page_table_lock);
1212        pmd = page_check_address_pmd(page, mm, address,
1213                                     PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1214        if (pmd) {
1215                /*
1216                 * We can't temporarily set the pmd to null in order
1217                 * to split it, the pmd must remain marked huge at all
1218                 * times or the VM won't take the pmd_trans_huge paths
1219                 * and it won't wait on the anon_vma->root->mutex to
1220                 * serialize against split_huge_page*.
1221                 */
1222                pmdp_splitting_flush_notify(vma, address, pmd);
1223                ret = 1;
1224        }
1225        spin_unlock(&mm->page_table_lock);
1226
1227        return ret;
1228}
1229
1230static void __split_huge_page_refcount(struct page *page)
1231{
            /*
             * Turn the compound page headed by @page into HPAGE_PMD_NR
             * independent pages.  Under zone->lru_lock and the compound
             * lock, every tail page gets its _count, flags, _mapcount,
             * mapping and index set up from the head and is passed to
             * lru_add_page_tail(); the head then has its compound state
             * cleared.
             */
1232        int i;
1233        struct zone *zone = page_zone(page);
1234        struct lruvec *lruvec;
1235        int tail_count = 0;
1236
1237        /* prevent PageLRU to go away from under us, and freeze lru stats */
1238        spin_lock_irq(&zone->lru_lock);
1239        lruvec = mem_cgroup_page_lruvec(page, zone);
1240
1241        compound_lock(page);
1242        /* complete memcg works before add pages to LRU */
1243        mem_cgroup_split_huge_fixup(page);
1244
            /* tails are processed from the last one down to the first */
1245        for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1246                struct page *page_tail = page + i;
1247
1248                /* tail_page->_mapcount cannot change */
1249                BUG_ON(page_mapcount(page_tail) < 0);
1250                tail_count += page_mapcount(page_tail);
1251                /* check for overflow */
1252                BUG_ON(tail_count < 0);
1253                BUG_ON(atomic_read(&page_tail->_count) != 0);
1254                /*
1255                 * tail_page->_count is zero and not changing from
1256                 * under us. But get_page_unless_zero() may be running
1257                 * from under us on the tail_page. If we used
1258                 * atomic_set() below instead of atomic_add(), we
1259                 * would then run atomic_set() concurrently with
1260                 * get_page_unless_zero(), and atomic_set() is
1261                 * implemented in C not using locked ops. spin_unlock
1262                 * on x86 sometime uses locked ops because of PPro
1263                 * errata 66, 92, so unless somebody can guarantee
1264                 * atomic_set() here would be safe on all archs (and
1265                 * not only on x86), it's safer to use atomic_add().
1266                 */
1267                atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
1268                           &page_tail->_count);
1269
1270                /* after clearing PageTail the gup refcount can be released */
1271                smp_mb();
1272
1273                /*
1274                 * retain hwpoison flag of the poisoned tail page:
1275                 *   fix for the unsuitable process killed on Guest Machine(KVM)
1276                 *   by the memory-failure.
1277                 */
1278                page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
                    /* inherit these four flags from the head page */
1279                page_tail->flags |= (page->flags &
1280                                     ((1L << PG_referenced) |
1281                                      (1L << PG_swapbacked) |
1282                                      (1L << PG_mlocked) |
1283                                      (1L << PG_uptodate)));
                    /* every tail is marked dirty unconditionally */
1284                page_tail->flags |= (1L << PG_dirty);
1285
1286                /* clear PageTail before overwriting first_page */
1287                smp_wmb();
1288
1289                /*
1290                 * __split_huge_page_splitting() already set the
1291                 * splitting bit in all pmd that could map this
1292                 * hugepage, that will ensure no CPU can alter the
1293                 * mapcount on the head page. The mapcount is only
1294                 * accounted in the head page and it has to be
1295                 * transferred to all tail pages in the below code. So
1296                 * for this code to be safe, the split the mapcount
1297                 * can't change. But that doesn't mean userland can't
1298                 * keep changing and reading the page contents while
1299                 * we transfer the mapcount, so the pmd splitting
1300                 * status is achieved setting a reserved bit in the
1301                 * pmd, not by clearing the present bit.
1302                */
1303                page_tail->_mapcount = page->_mapcount;
1304
1305                BUG_ON(page_tail->mapping);
1306                page_tail->mapping = page->mapping;
1307
1308                page_tail->index = page->index + i;
1309
1310                BUG_ON(!PageAnon(page_tail));
1311                BUG_ON(!PageUptodate(page_tail));
1312                BUG_ON(!PageDirty(page_tail));
1313                BUG_ON(!PageSwapBacked(page_tail));
1314
1315                lru_add_page_tail(page, page_tail, lruvec);
1316        }
            /* take the summed tail mapcounts gathered above off the head's _count */
1317        atomic_sub(tail_count, &page->_count);
1318        BUG_ON(atomic_read(&page->_count) <= 0);
1319
            /* zone counters: one huge anon page less, HPAGE_PMD_NR anon pages more */
1320        __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1321        __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1322
1323        ClearPageCompound(page);
1324        compound_unlock(page);
1325        spin_unlock_irq(&zone->lru_lock);
1326
            /*
             * NOTE(review): this put_page() presumably drops the "+ 1" that
             * the atomic_add() above gave each tail - confirm.
             */
1327        for (i = 1; i < HPAGE_PMD_NR; i++) {
1328                struct page *page_tail = page + i;
1329                BUG_ON(page_count(page_tail) <= 0);
1330                /*
1331                 * Tail pages may be freed if there wasn't any mapping
1332                 * like if add_to_swap() is running on a lru page that
1333                 * had its mapping zapped. And freeing these pages
1334                 * requires taking the lru_lock so we do the put_page
1335                 * of the tail pages after the split is complete.
1336                 */
1337                put_page(page_tail);
1338        }
1339
1340        /*
1341         * Only the head page (now become a regular page) is required
1342         * to be pinned by the caller.
1343         */
1344        BUG_ON(page_count(page) <= 0);
1345}
1346
1347static int __split_huge_page_map(struct page *page,
1348                                 struct vm_area_struct *vma,
1349                                 unsigned long address)
1350{
            /*
             * If @vma maps @page at @address through a huge pmd marked
             * splitting, build a pte page table that maps the HPAGE_PMD_NR
             * subpages and install it in place of the huge pmd.
             *
             * Returns 1 if a pmd was converted, 0 if
             * page_check_address_pmd() found none.
             */
1351        struct mm_struct *mm = vma->vm_mm;
1352        pmd_t *pmd, _pmd;
1353        int ret = 0, i;
1354        pgtable_t pgtable;
1355        unsigned long haddr;
1356
1357        spin_lock(&mm->page_table_lock);
1358        pmd = page_check_address_pmd(page, mm, address,
1359                                     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1360        if (pmd) {
                    /* the ptes are filled in through the local _pmd, not the live pmd */
1361                pgtable = get_pmd_huge_pte(mm);
1362                pmd_populate(mm, &_pmd, pgtable);
1363
1364                for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1365                     i++, haddr += PAGE_SIZE) {
1366                        pte_t *pte, entry;
1367                        BUG_ON(PageCompound(page+i));
1368                        entry = mk_pte(page + i, vma->vm_page_prot);
1369                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                            /*
                             * Carry over the write and young state of the
                             * huge pmd; a writable huge pmd is asserted to
                             * be the only mapping of the page.
                             */
1370                        if (!pmd_write(*pmd))
1371                                entry = pte_wrprotect(entry);
1372                        else
1373                                BUG_ON(page_mapcount(page) != 1);
1374                        if (!pmd_young(*pmd))
1375                                entry = pte_mkold(entry);
1376                        pte = pte_offset_map(&_pmd, haddr);
1377                        BUG_ON(!pte_none(*pte));
1378                        set_pte_at(mm, haddr, pte, entry);
1379                        pte_unmap(pte);
1380                }
1381
1382                smp_wmb(); /* make pte visible before pmd */
1383                /*
1384                 * Up to this point the pmd is present and huge and
1385                 * userland has the whole access to the hugepage
1386                 * during the split (which happens in place). If we
1387                 * overwrite the pmd with the not-huge version
1388                 * pointing to the pte here (which of course we could
1389                 * if all CPUs were bug free), userland could trigger
1390                 * a small page size TLB miss on the small sized TLB
1391                 * while the hugepage TLB entry is still established
1392                 * in the huge TLB. Some CPU doesn't like that. See
1393                 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1394                 * Erratum 383 on page 93. Intel should be safe but it
1395                 * also warns that it's only safe if the permission
1396                 * and cache attributes of the two entries loaded in
1397                 * the two TLB is identical (which should be the case
1398                 * here). But it is generally safer to never allow
1399                 * small and huge TLB entries for the same virtual
1400                 * address to be loaded simultaneously. So instead of
1401                 * doing "pmd_populate(); flush_tlb_range();" we first
1402                 * mark the current pmd notpresent (atomically because
1403                 * here the pmd_trans_huge and pmd_trans_splitting
1404                 * must remain set at all times on the pmd until the
1405                 * split is complete for this pmd), then we flush the
1406                 * SMP TLB and finally we write the non-huge version
1407                 * of the pmd entry with pmd_populate.
1408                 */
1409                set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1410                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1411                pmd_populate(mm, pmd, pgtable);
1412                ret = 1;
1413        }
1414        spin_unlock(&mm->page_table_lock);
1415
1416        return ret;
1417}
1418
1419/* must be called with anon_vma->root->mutex hold */
1420static void __split_huge_page(struct page *page,
1421                              struct anon_vma *anon_vma)
1422{
1423        int mapcount, mapcount2;
1424        struct anon_vma_chain *avc;
1425
1426        BUG_ON(!PageHead(page));
1427        BUG_ON(PageTail(page));
1428
1429        mapcount = 0;
1430        list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1431                struct vm_area_struct *vma = avc->vma;
1432                unsigned long addr = vma_address(page, vma);
1433                BUG_ON(is_vma_temporary_stack(vma));
1434                if (addr == -EFAULT)
1435                        continue;
1436                mapcount += __split_huge_page_splitting(page, vma, addr);
1437        }
1438        /*
1439         * It is critical that new vmas are added to the tail of the
1440         * anon_vma list. This guarantes that if copy_huge_pmd() runs
1441         * and establishes a child pmd before
1442         * __split_huge_page_splitting() freezes the parent pmd (so if
1443         * we fail to prevent copy_huge_pmd() from running until the
1444         * whole __split_huge_page() is complete), we will still see
1445         * the newly established pmd of the child later during the
1446         * walk, to be able to set it as pmd_trans_splitting too.
1447         */
1448        if (mapcount != page_mapcount(page))
1449                printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1450                       mapcount, page_mapcount(page));
1451        BUG_ON(mapcount != page_mapcount(page));
1452
1453        __split_huge_page_refcount(page);
1454
1455        mapcount2 = 0;
1456        list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1457                struct vm_area_struct *vma = avc->vma;
1458                unsigned long addr = vma_address(page, vma);
1459                BUG_ON(is_vma_temporary_stack(vma));
1460                if (addr == -EFAULT)
1461                        continue;
1462                mapcount2 += __split_huge_page_map(page, vma, addr);
1463        }
1464        if (mapcount != mapcount2)
1465                printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1466                       mapcount, mapcount2, page_mapcount(page));
1467        BUG_ON(mapcount != mapcount2);
1468}
1469
1470int split_huge_page(struct page *page)
1471{
1472        struct anon_vma *anon_vma;
1473        int ret = 1;
1474
1475        BUG_ON(!PageAnon(page));
1476        anon_vma = page_lock_anon_vma(page);
1477        if (!anon_vma)
1478                goto out;
1479        ret = 0;
1480        if (!PageCompound(page))
1481                goto out_unlock;
1482
1483        BUG_ON(!PageSwapBacked(page));
1484        __split_huge_page(page, anon_vma);
1485        count_vm_event(THP_SPLIT);
1486
1487        BUG_ON(PageCompound(page));
1488out_unlock:
1489        page_unlock_anon_vma(anon_vma);
1490out:
1491        return ret;
1492}
1493
/*
 * vm_flags for which hugepage_madvise() refuses both MADV_HUGEPAGE and
 * MADV_NOHUGEPAGE with -EINVAL.
 */
#define VM_NO_THP (VM_HUGETLB | VM_SHARED | VM_MAYSHARE | \
                   VM_SPECIAL | VM_INSERTPAGE | VM_MIXEDMAP | VM_SAO)
1496
1497int hugepage_madvise(struct vm_area_struct *vma,
1498                     unsigned long *vm_flags, int advice)
1499{
1500        switch (advice) {
1501        case MADV_HUGEPAGE:
1502                /*
1503                 * Be somewhat over-protective like KSM for now!
1504                 */
1505                if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1506                        return -EINVAL;
1507                *vm_flags &= ~VM_NOHUGEPAGE;
1508                *vm_flags |= VM_HUGEPAGE;
1509                /*
1510                 * If the vma become good for khugepaged to scan,
1511                 * register it here without waiting a page fault that
1512                 * may not happen any time soon.
1513                 */
1514                if (unlikely(khugepaged_enter_vma_merge(vma)))
1515                        return -ENOMEM;
1516                break;
1517        case MADV_NOHUGEPAGE:
1518                /*
1519                 * Be somewhat over-protective like KSM for now!
1520                 */
1521                if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1522                        return -EINVAL;
1523                *vm_flags &= ~VM_HUGEPAGE;
1524                *vm_flags |= VM_NOHUGEPAGE;
1525                /*
1526                 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1527                 * this vma even if we leave the mm registered in khugepaged if
1528                 * it got registered before VM_NOHUGEPAGE was set.
1529                 */
1530                break;
1531        }
1532
1533        return 0;
1534}
1535
1536static int __init khugepaged_slab_init(void)
1537{
1538        mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1539                                          sizeof(struct mm_slot),
1540                                          __alignof__(struct mm_slot), 0, NULL);
1541        if (!mm_slot_cache)
1542                return -ENOMEM;
1543
1544        return 0;
1545}
1546
1547static void __init khugepaged_slab_free(void)
1548{
            /*
             * Destroy the mm_slot cache and clear the pointer, which
             * alloc_mm_slot() treats as "initialization failed".
             */
1549        kmem_cache_destroy(mm_slot_cache);
1550        mm_slot_cache = NULL;
1551}
1552
1553static inline struct mm_slot *alloc_mm_slot(void)
1554{
1555        if (!mm_slot_cache)     /* initialization failed */
1556                return NULL;
1557        return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1558}
1559
1560static inline void free_mm_slot(struct mm_slot *mm_slot)
1561{
            /* give @mm_slot back to mm_slot_cache */
1562        kmem_cache_free(mm_slot_cache, mm_slot);
1563}
1564
1565static int __init mm_slots_hash_init(void)
1566{
1567        mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1568                                GFP_KERNEL);
1569        if (!mm_slots_hash)
1570                return -ENOMEM;
1571        return 0;
1572}
1573
1574#if 0
    /*
     * Compiled out.  Counterpart of mm_slots_hash_init(): frees the bucket
     * array and clears the pointer.
     */
1575static void __init mm_slots_hash_free(void)
1576{
1577        kfree(mm_slots_hash);
1578        mm_slots_hash = NULL;
1579}
1580#endif
1581
1582static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1583{
1584        struct mm_slot *mm_slot;
1585        struct hlist_head *bucket;
1586        struct hlist_node *node;
1587
1588        bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1589                                % MM_SLOTS_HASH_HEADS];
1590        hlist_for_each_entry(mm_slot, node, bucket, hash) {
1591                if (mm == mm_slot->mm)
1592                        return mm_slot;
1593        }
1594        return NULL;
1595}
1596
1597static void insert_to_mm_slots_hash(struct mm_struct *mm,
1598                                    struct mm_slot *mm_slot)
1599{
1600        struct hlist_head *bucket;
1601
1602        bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1603                                % MM_SLOTS_HASH_HEADS];
1604        mm_slot->mm = mm;
1605        hlist_add_head(&mm_slot->hash, bucket);
1606}
1607
1608static inline int khugepaged_test_exit(struct mm_struct *mm)
1609{
1610        return atomic_read(&mm->mm_users) == 0;
1611}
1612
1613int __khugepaged_enter(struct mm_struct *mm)
1614{
1615        struct mm_slot *mm_slot;
1616        int wakeup;
1617
1618        mm_slot = alloc_mm_slot();
1619        if (!mm_slot)
1620                return -ENOMEM;
1621
1622        /* __khugepaged_exit() must not run from under us */
1623        VM_BUG_ON(khugepaged_test_exit(mm));
1624        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1625                free_mm_slot(mm_slot);
1626                return 0;
1627        }
1628
1629        spin_lock(&khugepaged_mm_lock);
1630        insert_to_mm_slots_hash(mm, mm_slot);
1631        /*
1632         * Insert just behind the scanning cursor, to let the area settle
1633         * down a little.
1634         */
1635        wakeup = list_empty(&khugepaged_scan.mm_head);
1636        list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1637        spin_unlock(&khugepaged_mm_lock);
1638
1639        atomic_inc(&mm->mm_count);
1640        if (wakeup)
1641                wake_up_interruptible(&khugepaged_wait);
1642
1643        return 0;
1644}
1645
1646int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1647{
1648        unsigned long hstart, hend;
1649        if (!vma->anon_vma)
1650                /*
1651                 * Not yet faulted in so we will register later in the
1652                 * page fault if needed.
1653                 */
1654                return 0;
1655        if (vma->vm_ops)
1656                /* khugepaged not yet working on file or special mappings */
1657                return 0;
1658        /*
1659         * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1660         * true too, verify it here.
1661         */
1662        VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1663        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1664        hend = vma->vm_end & HPAGE_PMD_MASK;
1665        if (hstart < hend)
1666                return khugepaged_enter(vma);
1667        return 0;
1668}
1669
1670void __khugepaged_exit(struct mm_struct *mm)
1671{
1672        struct mm_slot *mm_slot;
1673        int free = 0;
1674
1675        spin_lock(&khugepaged_mm_lock);
1676        mm_slot = get_mm_slot(mm);
1677        if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1678                hlist_del(&mm_slot->hash);
1679                list_del(&mm_slot->mm_node);
1680                free = 1;
1681        }
1682        spin_unlock(&khugepaged_mm_lock);
1683
1684        if (free) {
1685                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1686                free_mm_slot(mm_slot);
1687                mmdrop(mm);
1688        } else if (mm_slot) {
1689                /*
1690                 * This is required to serialize against
1691                 * khugepaged_test_exit() (which is guaranteed to run
1692                 * under mmap sem read mode). Stop here (after we
1693                 * return all pagetables will be destroyed) until
1694                 * khugepaged has finished working on the pagetables
1695                 * under the mmap_sem.
1696                 */
1697                down_write(&mm->mmap_sem);
1698                up_write(&mm->mmap_sem);
1699        }
1700}
1701
1702static void release_pte_page(struct page *page)
1703{
1704        /* 0 stands for page_is_file_cache(page) == false */
1705        dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1706        unlock_page(page);
1707        putback_lru_page(page);
1708}
1709
1710static void release_pte_pages(pte_t *pte, pte_t *_pte)
1711{
1712        while (--_pte >= pte) {
1713                pte_t pteval = *_pte;
1714                if (!pte_none(pteval))
1715                        release_pte_page(pte_page(pteval));
1716        }
1717}
1718
1719static void release_all_pte_pages(pte_t *pte)
1720{
1721        release_pte_pages(pte, pte + HPAGE_PMD_NR);
1722}
1723
1724static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1725                                        unsigned long address,
1726                                        pte_t *pte)
1727{
1728        struct page *page;
1729        pte_t *_pte;
1730        int referenced = 0, isolated = 0, none = 0;
1731        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1732             _pte++, address += PAGE_SIZE) {
1733                pte_t pteval = *_pte;
1734                if (pte_none(pteval)) {
1735                        if (++none <= khugepaged_max_ptes_none)
1736                                continue;
1737                        else {
1738                                release_pte_pages(pte, _pte);
1739                                goto out;
1740                        }
1741                }
1742                if (!pte_present(pteval) || !pte_write(pteval)) {
1743                        release_pte_pages(pte, _pte);
1744                        goto out;
1745                }
1746                page = vm_normal_page(vma, address, pteval);
1747                if (unlikely(!page)) {
1748                        release_pte_pages(pte, _pte);
1749                        goto out;
1750                }
1751                VM_BUG_ON(PageCompound(page));
1752                BUG_ON(!PageAnon(page));
1753                VM_BUG_ON(!PageSwapBacked(page));
1754
1755                /* cannot use mapcount: can't collapse if there's a gup pin */
1756                if (page_count(page) != 1) {
1757                        release_pte_pages(pte, _pte);
1758                        goto out;
1759                }
1760                /*
1761                 * We can do it before isolate_lru_page because the
1762                 * page can't be freed from under us. NOTE: PG_lock
1763                 * is needed to serialize against split_huge_page
1764                 * when invoked from the VM.
1765                 */
1766                if (!trylock_page(page)) {
1767                        release_pte_pages(pte, _pte);
1768                        goto out;
1769                }
1770                /*
1771                 * Isolate the page to avoid collapsing an hugepage
1772                 * currently in use by the VM.
1773                 */
1774                if (isolate_lru_page(page)) {
1775                        unlock_page(page);
1776                        release_pte_pages(pte, _pte);
1777                        goto out;
1778                }
1779                /* 0 stands for page_is_file_cache(page) == false */
1780                inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1781                VM_BUG_ON(!PageLocked(page));
1782                VM_BUG_ON(PageLRU(page));
1783
1784                /* If there is no mapped pte young don't collapse the page */
1785                if (pte_young(pteval) || PageReferenced(page) ||
1786                    mmu_notifier_test_young(vma->vm_mm, address))
1787                        referenced = 1;
1788        }
1789        if (unlikely(!referenced))
1790                release_all_pte_pages(pte);
1791        else
1792                isolated = 1;
1793out:
1794        return isolated;
1795}
1796
1797static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1798                                      struct vm_area_struct *vma,
1799                                      unsigned long address,
1800                                      spinlock_t *ptl)
1801{
1802        pte_t *_pte;
1803        for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1804                pte_t pteval = *_pte;
1805                struct page *src_page;
1806
1807                if (pte_none(pteval)) {
1808                        clear_user_highpage(page, address);
1809                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1810                } else {
1811                        src_page = pte_page(pteval);
1812                        copy_user_highpage(page, src_page, address, vma);
1813                        VM_BUG_ON(page_mapcount(src_page) != 1);
1814                        release_pte_page(src_page);
1815                        /*
1816                         * ptl mostly unnecessary, but preempt has to
1817                         * be disabled to update the per-cpu stats
1818                         * inside page_remove_rmap().
1819                         */
1820                        spin_lock(ptl);
1821                        /*
1822                         * paravirt calls inside pte_clear here are
1823                         * superfluous.
1824                         */
1825                        pte_clear(vma->vm_mm, address, _pte);
1826                        page_remove_rmap(src_page);
1827                        spin_unlock(ptl);
1828                        free_page_and_swap_cache(src_page);
1829                }
1830
1831                address += PAGE_SIZE;
1832                page++;
1833        }
1834}
1835
1836static void collapse_huge_page(struct mm_struct *mm,
1837                               unsigned long address,
1838                               struct page **hpage,
1839                               struct vm_area_struct *vma,
1840                               int node)
1841{
1842        pgd_t *pgd;
1843        pud_t *pud;
1844        pmd_t *pmd, _pmd;
1845        pte_t *pte;
1846        pgtable_t pgtable;
1847        struct page *new_page;
1848        spinlock_t *ptl;
1849        int isolated;
1850        unsigned long hstart, hend;
1851
1852        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1853#ifndef CONFIG_NUMA
1854        up_read(&mm->mmap_sem);
1855        VM_BUG_ON(!*hpage);
1856        new_page = *hpage;
1857#else
1858        VM_BUG_ON(*hpage);
1859        /*
1860         * Allocate the page while the vma is still valid and under
1861         * the mmap_sem read mode so there is no memory allocation
1862         * later when we take the mmap_sem in write mode. This is more
1863         * friendly behavior (OTOH it may actually hide bugs) to
1864         * filesystems in userland with daemons allocating memory in
1865         * the userland I/O paths.  Allocating memory with the
1866         * mmap_sem in read mode is good idea also to allow greater
1867         * scalability.
1868         */
1869        new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1870                                      node, __GFP_OTHER_NODE);
1871
1872        /*
1873         * After allocating the hugepage, release the mmap_sem read lock in
1874         * preparation for taking it in write mode.
1875         */
1876        up_read(&mm->mmap_sem);
1877        if (unlikely(!new_page)) {
1878                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1879                *hpage = ERR_PTR(-ENOMEM);
1880                return;
1881        }
1882#endif
1883
1884        count_vm_event(THP_COLLAPSE_ALLOC);
1885        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1886#ifdef CONFIG_NUMA
1887                put_page(new_page);
1888#endif
1889                return;
1890        }
1891
1892        /*
1893         * Prevent all access to pagetables with the exception of
1894         * gup_fast later hanlded by the ptep_clear_flush and the VM
1895         * handled by the anon_vma lock + PG_lock.
1896         */
1897        down_write(&mm->mmap_sem);
1898        if (unlikely(khugepaged_test_exit(mm)))
1899                goto out;
1900
1901        vma = find_vma(mm, address);
1902        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1903        hend = vma->vm_end & HPAGE_PMD_MASK;
1904        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1905                goto out;
1906
1907        if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1908            (vma->vm_flags & VM_NOHUGEPAGE))
1909                goto out;
1910
1911        if (!vma->anon_vma || vma->vm_ops)
1912                goto out;
1913        if (is_vma_temporary_stack(vma))
1914                goto out;
1915        /*
1916         * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1917         * true too, verify it here.
1918         */
1919        VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1920
1921        pgd = pgd_offset(mm, address);
1922        if (!pgd_present(*pgd))
1923                goto out;
1924
1925        pud = pud_offset(pgd, address);
1926        if (!pud_present(*pud))
1927                goto out;
1928
1929        pmd = pmd_offset(pud, address);
1930        /* pmd can't go away or become huge under us */
1931        if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1932                goto out;
1933
1934        anon_vma_lock(vma->anon_vma);
1935
1936        pte = pte_offset_map(pmd, address);
1937        ptl = pte_lockptr(mm, pmd);
1938
1939        spin_lock(&mm->page_table_lock); /* probably unnecessary */
1940        /*
1941         * After this gup_fast can't run anymore. This also removes
1942         * any huge TLB entry from the CPU so we won't allow
1943         * huge and small TLB entries for the same virtual address
1944         * to avoid the risk of CPU bugs in that area.
1945         */
1946        _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1947        spin_unlock(&mm->page_table_lock);
1948
1949        spin_lock(ptl);
1950        isolated = __collapse_huge_page_isolate(vma, address, pte);
1951        spin_unlock(ptl);
1952
1953        if (unlikely(!isolated)) {
1954                pte_unmap(pte);
1955                spin_lock(&mm->page_table_lock);
1956                BUG_ON(!pmd_none(*pmd));
1957                set_pmd_at(mm, address, pmd, _pmd);
1958                spin_unlock(&mm->page_table_lock);
1959                anon_vma_unlock(vma->anon_vma);
1960                goto out;
1961        }
1962
1963        /*
1964         * All pages are isolated and locked so anon_vma rmap
1965         * can't run anymore.
1966         */
1967        anon_vma_unlock(vma->anon_vma);
1968
1969        __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1970        pte_unmap(pte);
1971        __SetPageUptodate(new_page);
1972        pgtable = pmd_pgtable(_pmd);
1973        VM_BUG_ON(page_count(pgtable) != 1);
1974        VM_BUG_ON(page_mapcount(pgtable) != 0);
1975
1976        _pmd = mk_pmd(new_page, vma->vm_page_prot);
1977        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1978        _pmd = pmd_mkhuge(_pmd);
1979
1980        /*
1981         * spin_lock() below is not the equivalent of smp_wmb(), so
1982         * this is needed to avoid the copy_huge_page writes to become
1983         * visible after the set_pmd_at() write.
1984         */
1985        smp_wmb();
1986
1987        spin_lock(&mm->page_table_lock);
1988        BUG_ON(!pmd_none(*pmd));
1989        page_add_new_anon_rmap(new_page, vma, address);
1990        set_pmd_at(mm, address, pmd, _pmd);
1991        update_mmu_cache(vma, address, _pmd);
1992        prepare_pmd_huge_pte(pgtable, mm);
1993        spin_unlock(&mm->page_table_lock);
1994
1995#ifndef CONFIG_NUMA
1996        *hpage = NULL;
1997#endif
1998        khugepaged_pages_collapsed++;
1999out_up_write:
2000        up_write(&mm->mmap_sem);
2001        return;
2002
2003out:
2004        mem_cgroup_uncharge_page(new_page);
2005#ifdef CONFIG_NUMA
2006        put_page(new_page);
2007#endif
2008        goto out_up_write;
2009}
2010
2011static int khugepaged_scan_pmd(struct mm_struct *mm,
2012                               struct vm_area_struct *vma,
2013                               unsigned long address,
2014                               struct page **hpage)
2015{
2016        pgd_t *pgd;
2017        pud_t *pud;
2018        pmd_t *pmd;
2019        pte_t *pte, *_pte;
2020        int ret = 0, referenced = 0, none = 0;
2021        struct page *page;
2022        unsigned long _address;
2023        spinlock_t *ptl;
2024        int node = -1;
2025
2026        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2027
2028        pgd = pgd_offset(mm, address);
2029        if (!pgd_present(*pgd))
2030                goto out;
2031
2032        pud = pud_offset(pgd, address);
2033        if (!pud_present(*pud))
2034                goto out;
2035
2036        pmd = pmd_offset(pud, address);
2037        if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
2038                goto out;
2039
2040        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2041        for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2042             _pte++, _address += PAGE_SIZE) {
2043                pte_t pteval = *_pte;
2044                if (pte_none(pteval)) {
2045                        if (++none <= khugepaged_max_ptes_none)
2046                                continue;
2047                        else
2048                                goto out_unmap;
2049                }
2050                if (!pte_present(pteval) || !pte_write(pteval))
2051                        goto out_unmap;
2052                page = vm_normal_page(vma, _address, pteval);
2053                if (unlikely(!page))
2054                        goto out_unmap;
2055                /*
2056                 * Chose the node of the first page. This could
2057                 * be more sophisticated and look at more pages,
2058                 * but isn't for now.
2059                 */
2060                if (node == -1)
2061                        node = page_to_nid(page);
2062                VM_BUG_ON(PageCompound(page));
2063                if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2064                        goto out_unmap;
2065                /* cannot use mapcount: can't collapse if there's a gup pin */
2066                if (page_count(page) != 1)
2067                        goto out_unmap;
2068                if (pte_young(pteval) || PageReferenced(page) ||
2069                    mmu_notifier_test_young(vma->vm_mm, address))
2070                        referenced = 1;
2071        }
2072        if (referenced)
2073                ret = 1;
2074out_unmap:
2075        pte_unmap_unlock(pte, ptl);
2076        if (ret)
2077                /* collapse_huge_page will return with the mmap_sem released */
2078                collapse_huge_page(mm, address, hpage, vma, node);
2079out:
2080        return ret;
2081}
2082
2083static void collect_mm_slot(struct mm_slot *mm_slot)
2084{
2085        struct mm_struct *mm = mm_slot->mm;
2086
2087        VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2088
2089        if (khugepaged_test_exit(mm)) {
2090                /* free mm_slot */
2091                hlist_del(&mm_slot->hash);
2092                list_del(&mm_slot->mm_node);
2093
2094                /*
2095                 * Not strictly needed because the mm exited already.
2096                 *
2097                 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2098                 */
2099
2100                /* khugepaged_mm_lock actually not necessary for the below */
2101                free_mm_slot(mm_slot);
2102                mmdrop(mm);
2103        }
2104}
2105
2106static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2107                                            struct page **hpage)
2108        __releases(&khugepaged_mm_lock)
2109        __acquires(&khugepaged_mm_lock)
2110{
2111        struct mm_slot *mm_slot;
2112        struct mm_struct *mm;
2113        struct vm_area_struct *vma;
2114        int progress = 0;
2115
2116        VM_BUG_ON(!pages);
2117        VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2118
2119        if (khugepaged_scan.mm_slot)
2120                mm_slot = khugepaged_scan.mm_slot;
2121        else {
2122                mm_slot = list_entry(khugepaged_scan.mm_head.next,
2123                                     struct mm_slot, mm_node);
2124                khugepaged_scan.address = 0;
2125                khugepaged_scan.mm_slot = mm_slot;
2126        }
2127        spin_unlock(&khugepaged_mm_lock);
2128
2129        mm = mm_slot->mm;
2130        down_read(&mm->mmap_sem);
2131        if (unlikely(khugepaged_test_exit(mm)))
2132                vma = NULL;
2133        else
2134                vma = find_vma(mm, khugepaged_scan.address);
2135
2136        progress++;
2137        for (; vma; vma = vma->vm_next) {
2138                unsigned long hstart, hend;
2139
2140                cond_resched();
2141                if (unlikely(khugepaged_test_exit(mm))) {
2142                        progress++;
2143                        break;
2144                }
2145
2146                if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2147                     !khugepaged_always()) ||
2148                    (vma->vm_flags & VM_NOHUGEPAGE)) {
2149                skip:
2150                        progress++;
2151                        continue;
2152                }
2153                if (!vma->anon_vma || vma->vm_ops)
2154                        goto skip;
2155                if (is_vma_temporary_stack(vma))
2156                        goto skip;
2157                /*
2158                 * If is_pfn_mapping() is true is_learn_pfn_mapping()
2159                 * must be true too, verify it here.
2160                 */
2161                VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2162                          vma->vm_flags & VM_NO_THP);
2163
2164                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2165                hend = vma->vm_end & HPAGE_PMD_MASK;
2166                if (hstart >= hend)
2167                        goto skip;
2168                if (khugepaged_scan.address > hend)
2169                        goto skip;
2170                if (khugepaged_scan.address < hstart)
2171                        khugepaged_scan.address = hstart;
2172                VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2173
2174                while (khugepaged_scan.address < hend) {
2175                        int ret;
2176                        cond_resched();
2177                        if (unlikely(khugepaged_test_exit(mm)))
2178                                goto breakouterloop;
2179
2180                        VM_BUG_ON(khugepaged_scan.address < hstart ||
2181                                  khugepaged_scan.address + HPAGE_PMD_SIZE >
2182                                  hend);
2183                        ret = khugepaged_scan_pmd(mm, vma,
2184                                                  khugepaged_scan.address,
2185                                                  hpage);
2186                        /* move to next address */
2187                        khugepaged_scan.address += HPAGE_PMD_SIZE;
2188                        progress += HPAGE_PMD_NR;
2189                        if (ret)
2190                                /* we released mmap_sem so break loop */
2191                                goto breakouterloop_mmap_sem;
2192                        if (progress >= pages)
2193                                goto breakouterloop;
2194                }
2195        }
2196breakouterloop:
2197        up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2198breakouterloop_mmap_sem:
2199
2200        spin_lock(&khugepaged_mm_lock);
2201        VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2202        /*
2203         * Release the current mm_slot if this mm is about to die, or
2204         * if we scanned all vmas of this mm.
2205         */
2206        if (khugepaged_test_exit(mm) || !vma) {
2207                /*
2208                 * Make sure that if mm_users is reaching zero while
2209                 * khugepaged runs here, khugepaged_exit will find
2210                 * mm_slot not pointing to the exiting mm.
2211                 */
2212                if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2213                        khugepaged_scan.mm_slot = list_entry(
2214                                mm_slot->mm_node.next,
2215                                struct mm_slot, mm_node);
2216                        khugepaged_scan.address = 0;
2217                } else {
2218                        khugepaged_scan.mm_slot = NULL;
2219                        khugepaged_full_scans++;
2220                }
2221
2222                collect_mm_slot(mm_slot);
2223        }
2224
2225        return progress;
2226}
2227
2228static int khugepaged_has_work(void)
2229{
2230        return !list_empty(&khugepaged_scan.mm_head) &&
2231                khugepaged_enabled();
2232}
2233
2234static int khugepaged_wait_event(void)
2235{
2236        return !list_empty(&khugepaged_scan.mm_head) ||
2237                !khugepaged_enabled();
2238}
2239
2240static void khugepaged_do_scan(struct page **hpage)
2241{
2242        unsigned int progress = 0, pass_through_head = 0;
2243        unsigned int pages = khugepaged_pages_to_scan;
2244
2245        barrier(); /* write khugepaged_pages_to_scan to local stack */
2246
2247        while (progress < pages) {
2248                cond_resched();
2249
2250#ifndef CONFIG_NUMA
2251                if (!*hpage) {
2252                        *hpage = alloc_hugepage(khugepaged_defrag());
2253                        if (unlikely(!*hpage)) {
2254                                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2255                                break;
2256                        }
2257                        count_vm_event(THP_COLLAPSE_ALLOC);
2258                }
2259#else
2260                if (IS_ERR(*hpage))
2261                        break;
2262#endif
2263
2264                if (unlikely(kthread_should_stop() || freezing(current)))
2265                        break;
2266
2267                spin_lock(&khugepaged_mm_lock);
2268                if (!khugepaged_scan.mm_slot)
2269                        pass_through_head++;
2270                if (khugepaged_has_work() &&
2271                    pass_through_head < 2)
2272                        progress += khugepaged_scan_mm_slot(pages - progress,
2273                                                            hpage);
2274                else
2275                        progress = pages;
2276                spin_unlock(&khugepaged_mm_lock);
2277        }
2278}
2279
2280static void khugepaged_alloc_sleep(void)
2281{
2282        wait_event_freezable_timeout(khugepaged_wait, false,
2283                        msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2284}
2285
2286#ifndef CONFIG_NUMA
2287static struct page *khugepaged_alloc_hugepage(void)
2288{
2289        struct page *hpage;
2290
2291        do {
2292                hpage = alloc_hugepage(khugepaged_defrag());
2293                if (!hpage) {
2294                        count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2295                        khugepaged_alloc_sleep();
2296                } else
2297                        count_vm_event(THP_COLLAPSE_ALLOC);
2298        } while (unlikely(!hpage) &&
2299                 likely(khugepaged_enabled()));
2300        return hpage;
2301}
2302#endif
2303
2304static void khugepaged_loop(void)
2305{
2306        struct page *hpage;
2307
2308#ifdef CONFIG_NUMA
2309        hpage = NULL;
2310#endif
2311        while (likely(khugepaged_enabled())) {
2312#ifndef CONFIG_NUMA
2313                hpage = khugepaged_alloc_hugepage();
2314                if (unlikely(!hpage))
2315                        break;
2316#else
2317                if (IS_ERR(hpage)) {
2318                        khugepaged_alloc_sleep();
2319                        hpage = NULL;
2320                }
2321#endif
2322
2323                khugepaged_do_scan(&hpage);
2324#ifndef CONFIG_NUMA
2325                if (hpage)
2326                        put_page(hpage);
2327#endif
2328                try_to_freeze();
2329                if (unlikely(kthread_should_stop()))
2330                        break;
2331                if (khugepaged_has_work()) {
2332                        if (!khugepaged_scan_sleep_millisecs)
2333                                continue;
2334                        wait_event_freezable_timeout(khugepaged_wait, false,
2335                            msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2336                } else if (khugepaged_enabled())
2337                        wait_event_freezable(khugepaged_wait,
2338                                             khugepaged_wait_event());
2339        }
2340}
2341
2342static int khugepaged(void *none)
2343{
2344        struct mm_slot *mm_slot;
2345
2346        set_freezable();
2347        set_user_nice(current, 19);
2348
2349        /* serialize with start_khugepaged() */
2350        mutex_lock(&khugepaged_mutex);
2351
2352        for (;;) {
2353                mutex_unlock(&khugepaged_mutex);
2354                VM_BUG_ON(khugepaged_thread != current);
2355                khugepaged_loop();
2356                VM_BUG_ON(khugepaged_thread != current);
2357
2358                mutex_lock(&khugepaged_mutex);
2359                if (!khugepaged_enabled())
2360                        break;
2361                if (unlikely(kthread_should_stop()))
2362                        break;
2363        }
2364
2365        spin_lock(&khugepaged_mm_lock);
2366        mm_slot = khugepaged_scan.mm_slot;
2367        khugepaged_scan.mm_slot = NULL;
2368        if (mm_slot)
2369                collect_mm_slot(mm_slot);
2370        spin_unlock(&khugepaged_mm_lock);
2371
2372        khugepaged_thread = NULL;
2373        mutex_unlock(&khugepaged_mutex);
2374
2375        return 0;
2376}
2377
2378void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2379{
2380        struct page *page;
2381
2382        spin_lock(&mm->page_table_lock);
2383        if (unlikely(!pmd_trans_huge(*pmd))) {
2384                spin_unlock(&mm->page_table_lock);
2385                return;
2386        }
2387        page = pmd_page(*pmd);
2388        VM_BUG_ON(!page_count(page));
2389        get_page(page);
2390        spin_unlock(&mm->page_table_lock);
2391
2392        split_huge_page(page);
2393
2394        put_page(page);
2395        BUG_ON(pmd_trans_huge(*pmd));
2396}
2397
2398static void split_huge_page_address(struct mm_struct *mm,
2399                                    unsigned long address)
2400{
2401        pgd_t *pgd;
2402        pud_t *pud;
2403        pmd_t *pmd;
2404
2405        VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2406
2407        pgd = pgd_offset(mm, address);
2408        if (!pgd_present(*pgd))
2409                return;
2410
2411        pud = pud_offset(pgd, address);
2412        if (!pud_present(*pud))
2413                return;
2414
2415        pmd = pmd_offset(pud, address);
2416        if (!pmd_present(*pmd))
2417                return;
2418        /*
2419         * Caller holds the mmap_sem write mode, so a huge pmd cannot
2420         * materialize from under us.
2421         */
2422        split_huge_page_pmd(mm, pmd);
2423}
2424
2425void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2426                             unsigned long start,
2427                             unsigned long end,
2428                             long adjust_next)
2429{
2430        /*
2431         * If the new start address isn't hpage aligned and it could
2432         * previously contain an hugepage: check if we need to split
2433         * an huge pmd.
2434         */
2435        if (start & ~HPAGE_PMD_MASK &&
2436            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2437            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2438                split_huge_page_address(vma->vm_mm, start);
2439
2440        /*
2441         * If the new end address isn't hpage aligned and it could
2442         * previously contain an hugepage: check if we need to split
2443         * an huge pmd.
2444         */
2445        if (end & ~HPAGE_PMD_MASK &&
2446            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2447            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2448                split_huge_page_address(vma->vm_mm, end);
2449
2450        /*
2451         * If we're also updating the vma->vm_next->vm_start, if the new
2452         * vm_next->vm_start isn't page aligned and it could previously
2453         * contain an hugepage: check if we need to split an huge pmd.
2454         */
2455        if (adjust_next > 0) {
2456                struct vm_area_struct *next = vma->vm_next;
2457                unsigned long nstart = next->vm_start;
2458                nstart += adjust_next << PAGE_SHIFT;
2459                if (nstart & ~HPAGE_PMD_MASK &&
2460                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2461                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2462                        split_huge_page_address(next->vm_mm, nstart);
2463        }
2464}
2465
/* lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995. */