linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a per-process
   20 *                counter is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case node -1 here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
   31 *                but useful to set in a VMA when you have a non-default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
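     /*
      * Illustrative userspace sketch (not part of this file): the policies
      * described above are selected through the mbind() and set_mempolicy()
      * system calls implemented further down (sys_mbind, sys_set_mempolicy).
      * Assuming the MPOL_* constants and syscall numbers are visible to
      * userspace (e.g. via libnuma's <numaif.h>), and with addr/len standing
      * for an existing mapping, a task could do roughly:
      *
      *	unsigned long mask = (1UL << 0) | (1UL << 1);	bits = nodes 0 and 1
      *
      *	interleave this task's future allocations over nodes 0 and 1:
      *	syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &mask, 8 * sizeof(mask));
      *
      *	restrict the existing mapping [addr, addr + len) to node 0 only:
      *	unsigned long node0 = 1UL << 0;
      *	syscall(__NR_mbind, addr, len, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
      *
      * Note the maxnode convention: get_nodes() below examines maxnode - 1
      * bits, so on a 64-bit system the call above covers bits 0..62.
      */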
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always graceful about that.
  66*/
  67
  68#include <linux/mempolicy.h>
  69#include <linux/mm.h>
  70#include <linux/highmem.h>
  71#include <linux/hugetlb.h>
  72#include <linux/kernel.h>
  73#include <linux/sched.h>
  74#include <linux/nodemask.h>
  75#include <linux/cpuset.h>
  76#include <linux/gfp.h>
  77#include <linux/slab.h>
  78#include <linux/string.h>
  79#include <linux/module.h>
  80#include <linux/nsproxy.h>
  81#include <linux/interrupt.h>
  82#include <linux/init.h>
  83#include <linux/compat.h>
  84#include <linux/swap.h>
  85#include <linux/seq_file.h>
  86#include <linux/proc_fs.h>
  87#include <linux/migrate.h>
  88#include <linux/rmap.h>
  89#include <linux/security.h>
  90#include <linux/syscalls.h>
  91#include <linux/ctype.h>
  92
  93#include <asm/tlbflush.h>
  94#include <asm/uaccess.h>
  95
  96/* Internal flags */
   97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
  98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101static struct kmem_cache *policy_cache;
 102static struct kmem_cache *sn_cache;
 103
  104/* Highest zone. A specific allocation for a zone below that is not
 105   policied. */
 106enum zone_type policy_zone = 0;
 107
 108/*
 109 * run-time system-wide default policy => local allocation
 110 */
 111struct mempolicy default_policy = {
 112        .refcnt = ATOMIC_INIT(1), /* never free it */
 113        .mode = MPOL_PREFERRED,
 114        .flags = MPOL_F_LOCAL,
 115};
 116
 117static const struct mempolicy_operations {
 118        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 119        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 120} mpol_ops[MPOL_MAX];
 121
 122/* Check that the nodemask contains at least one populated zone */
 123static int is_valid_nodemask(const nodemask_t *nodemask)
 124{
 125        int nd, k;
 126
 127        /* Check that there is something useful in this mask */
 128        k = policy_zone;
 129
 130        for_each_node_mask(nd, *nodemask) {
 131                struct zone *z;
 132
 133                for (k = 0; k <= policy_zone; k++) {
 134                        z = &NODE_DATA(nd)->node_zones[k];
 135                        if (z->present_pages > 0)
 136                                return 1;
 137                }
 138        }
 139
 140        return 0;
 141}
 142
 143static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 144{
 145        return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 146}
 147
 148static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 149                                   const nodemask_t *rel)
 150{
 151        nodemask_t tmp;
 152        nodes_fold(tmp, *orig, nodes_weight(*rel));
 153        nodes_onto(*ret, tmp, *rel);
 154}
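     /*
      * Worked example of the relative-nodes transform above (assuming the
      * bitmap_onto()/bitmap_fold() semantics wrapped by nodes_onto() and
      * nodes_fold()): a user mask of {0,2} taken relative to a cpuset
      * mems_allowed of {4,5,6} is first folded modulo its weight (3), which
      * leaves {0,2}, and is then mapped onto the 1st and 3rd allowed nodes,
      * yielding {4,6}.
      */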
 155
 156static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 157{
 158        if (nodes_empty(*nodes))
 159                return -EINVAL;
 160        pol->v.nodes = *nodes;
 161        return 0;
 162}
 163
 164static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 165{
 166        if (!nodes)
 167                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 168        else if (nodes_empty(*nodes))
 169                return -EINVAL;                 /*  no allowed nodes */
 170        else
 171                pol->v.preferred_node = first_node(*nodes);
 172        return 0;
 173}
 174
 175static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177        if (!is_valid_nodemask(nodes))
 178                return -EINVAL;
 179        pol->v.nodes = *nodes;
 180        return 0;
 181}
 182
 183/* Create a new policy */
 184static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 185                                  nodemask_t *nodes)
 186{
 187        struct mempolicy *policy;
 188        nodemask_t cpuset_context_nmask;
 189        int ret;
 190
 191        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 192                 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 193
 194        if (mode == MPOL_DEFAULT) {
 195                if (nodes && !nodes_empty(*nodes))
 196                        return ERR_PTR(-EINVAL);
 197                return NULL;    /* simply delete any existing policy */
 198        }
 199        VM_BUG_ON(!nodes);
 200
 201        /*
 202         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 203         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 204         * All other modes require a valid pointer to a non-empty nodemask.
 205         */
 206        if (mode == MPOL_PREFERRED) {
 207                if (nodes_empty(*nodes)) {
 208                        if (((flags & MPOL_F_STATIC_NODES) ||
 209                             (flags & MPOL_F_RELATIVE_NODES)))
 210                                return ERR_PTR(-EINVAL);
 211                        nodes = NULL;   /* flag local alloc */
 212                }
 213        } else if (nodes_empty(*nodes))
 214                return ERR_PTR(-EINVAL);
 215        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 216        if (!policy)
 217                return ERR_PTR(-ENOMEM);
 218        atomic_set(&policy->refcnt, 1);
 219        policy->mode = mode;
 220        policy->flags = flags;
 221
 222        if (nodes) {
 223                /*
 224                 * cpuset related setup doesn't apply to local allocation
 225                 */
 226                cpuset_update_task_memory_state();
 227                if (flags & MPOL_F_RELATIVE_NODES)
 228                        mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 229                                               &cpuset_current_mems_allowed);
 230                else
 231                        nodes_and(cpuset_context_nmask, *nodes,
 232                                  cpuset_current_mems_allowed);
 233                if (mpol_store_user_nodemask(policy))
 234                        policy->w.user_nodemask = *nodes;
 235                else
 236                        policy->w.cpuset_mems_allowed =
 237                                                cpuset_mems_allowed(current);
 238        }
 239
 240        ret = mpol_ops[mode].create(policy,
 241                                nodes ? &cpuset_context_nmask : NULL);
 242        if (ret < 0) {
 243                kmem_cache_free(policy_cache, policy);
 244                return ERR_PTR(ret);
 245        }
 246        return policy;
 247}
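     /*
      * For example, mpol_new(MPOL_INTERLEAVE, 0, &nodes) with nodes = {0,1,2}
      * while the cpuset allows {0,1} ends up with v.nodes = {0,1} (the
      * cpuset-contextualized mask) and records cpuset_mems_allowed for later
      * rebinds; with MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES the original
      * user mask is remembered in w.user_nodemask instead.
      */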
 248
 249/* Slow path of a mpol destructor. */
 250void __mpol_put(struct mempolicy *p)
 251{
 252        if (!atomic_dec_and_test(&p->refcnt))
 253                return;
 254        kmem_cache_free(policy_cache, p);
 255}
 256
 257static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 258{
 259}
 260
 261static void mpol_rebind_nodemask(struct mempolicy *pol,
 262                                 const nodemask_t *nodes)
 263{
 264        nodemask_t tmp;
 265
 266        if (pol->flags & MPOL_F_STATIC_NODES)
 267                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 268        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 269                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 270        else {
 271                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 272                            *nodes);
 273                pol->w.cpuset_mems_allowed = *nodes;
 274        }
 275
 276        pol->v.nodes = tmp;
 277        if (!node_isset(current->il_next, tmp)) {
 278                current->il_next = next_node(current->il_next, tmp);
 279                if (current->il_next >= MAX_NUMNODES)
 280                        current->il_next = first_node(tmp);
 281                if (current->il_next >= MAX_NUMNODES)
 282                        current->il_next = numa_node_id();
 283        }
 284}
 285
 286static void mpol_rebind_preferred(struct mempolicy *pol,
 287                                  const nodemask_t *nodes)
 288{
 289        nodemask_t tmp;
 290
 291        if (pol->flags & MPOL_F_STATIC_NODES) {
 292                int node = first_node(pol->w.user_nodemask);
 293
 294                if (node_isset(node, *nodes)) {
 295                        pol->v.preferred_node = node;
 296                        pol->flags &= ~MPOL_F_LOCAL;
 297                } else
 298                        pol->flags |= MPOL_F_LOCAL;
 299        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 300                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 301                pol->v.preferred_node = first_node(tmp);
 302        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 303                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 304                                                   pol->w.cpuset_mems_allowed,
 305                                                   *nodes);
 306                pol->w.cpuset_mems_allowed = *nodes;
 307        }
 308}
 309
 310/* Migrate a policy to a different set of nodes */
 311static void mpol_rebind_policy(struct mempolicy *pol,
 312                               const nodemask_t *newmask)
 313{
 314        if (!pol)
 315                return;
 316        if (!mpol_store_user_nodemask(pol) &&
 317            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 318                return;
 319        mpol_ops[pol->mode].rebind(pol, newmask);
 320}
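     /*
      * Rebind example: a task holds MPOL_BIND over {0,1} and its cpuset's
      * mems_allowed changes from {0,1} to {2,3}.  With neither flag set the
      * mask is remapped and becomes {2,3}; with MPOL_F_STATIC_NODES the user
      * mask is only intersected with the new mems, leaving it empty here;
      * with MPOL_F_RELATIVE_NODES the user mask is reapplied relative to the
      * new mems and becomes {2,3} again.
      */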
 321
 322/*
 323 * Wrapper for mpol_rebind_policy() that just requires task
 324 * pointer, and updates task mempolicy.
 325 */
 326
 327void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 328{
 329        mpol_rebind_policy(tsk->mempolicy, new);
 330}
 331
 332/*
 333 * Rebind each vma in mm to new nodemask.
 334 *
 335 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 336 */
 337
 338void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 339{
 340        struct vm_area_struct *vma;
 341
 342        down_write(&mm->mmap_sem);
 343        for (vma = mm->mmap; vma; vma = vma->vm_next)
 344                mpol_rebind_policy(vma->vm_policy, new);
 345        up_write(&mm->mmap_sem);
 346}
 347
 348static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 349        [MPOL_DEFAULT] = {
 350                .rebind = mpol_rebind_default,
 351        },
 352        [MPOL_INTERLEAVE] = {
 353                .create = mpol_new_interleave,
 354                .rebind = mpol_rebind_nodemask,
 355        },
 356        [MPOL_PREFERRED] = {
 357                .create = mpol_new_preferred,
 358                .rebind = mpol_rebind_preferred,
 359        },
 360        [MPOL_BIND] = {
 361                .create = mpol_new_bind,
 362                .rebind = mpol_rebind_nodemask,
 363        },
 364};
 365
 366static void gather_stats(struct page *, void *, int pte_dirty);
 367static void migrate_page_add(struct page *page, struct list_head *pagelist,
 368                                unsigned long flags);
 369
 370/* Scan through pages checking if pages follow certain conditions. */
 371static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 372                unsigned long addr, unsigned long end,
 373                const nodemask_t *nodes, unsigned long flags,
 374                void *private)
 375{
 376        pte_t *orig_pte;
 377        pte_t *pte;
 378        spinlock_t *ptl;
 379
 380        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 381        do {
 382                struct page *page;
 383                int nid;
 384
 385                if (!pte_present(*pte))
 386                        continue;
 387                page = vm_normal_page(vma, addr, *pte);
 388                if (!page)
 389                        continue;
 390                /*
 391                 * The check for PageReserved here is important to avoid
 392                 * handling zero pages and other pages that may have been
 393                 * marked special by the system.
 394                 *
 395                 * If the PageReserved would not be checked here then f.e.
 396                 * the location of the zero page could have an influence
 397                 * on MPOL_MF_STRICT, zero pages would be counted for
 398                 * the per node stats, and there would be useless attempts
 399                 * to put zero pages on the migration list.
 400                 */
 401                if (PageReserved(page))
 402                        continue;
 403                nid = page_to_nid(page);
 404                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 405                        continue;
 406
 407                if (flags & MPOL_MF_STATS)
 408                        gather_stats(page, private, pte_dirty(*pte));
 409                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 410                        migrate_page_add(page, private, flags);
 411                else
 412                        break;
 413        } while (pte++, addr += PAGE_SIZE, addr != end);
 414        pte_unmap_unlock(orig_pte, ptl);
 415        return addr != end;
 416}
 417
 418static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 419                unsigned long addr, unsigned long end,
 420                const nodemask_t *nodes, unsigned long flags,
 421                void *private)
 422{
 423        pmd_t *pmd;
 424        unsigned long next;
 425
 426        pmd = pmd_offset(pud, addr);
 427        do {
 428                next = pmd_addr_end(addr, end);
 429                if (pmd_none_or_clear_bad(pmd))
 430                        continue;
 431                if (check_pte_range(vma, pmd, addr, next, nodes,
 432                                    flags, private))
 433                        return -EIO;
 434        } while (pmd++, addr = next, addr != end);
 435        return 0;
 436}
 437
 438static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 439                unsigned long addr, unsigned long end,
 440                const nodemask_t *nodes, unsigned long flags,
 441                void *private)
 442{
 443        pud_t *pud;
 444        unsigned long next;
 445
 446        pud = pud_offset(pgd, addr);
 447        do {
 448                next = pud_addr_end(addr, end);
 449                if (pud_none_or_clear_bad(pud))
 450                        continue;
 451                if (check_pmd_range(vma, pud, addr, next, nodes,
 452                                    flags, private))
 453                        return -EIO;
 454        } while (pud++, addr = next, addr != end);
 455        return 0;
 456}
 457
 458static inline int check_pgd_range(struct vm_area_struct *vma,
 459                unsigned long addr, unsigned long end,
 460                const nodemask_t *nodes, unsigned long flags,
 461                void *private)
 462{
 463        pgd_t *pgd;
 464        unsigned long next;
 465
 466        pgd = pgd_offset(vma->vm_mm, addr);
 467        do {
 468                next = pgd_addr_end(addr, end);
 469                if (pgd_none_or_clear_bad(pgd))
 470                        continue;
 471                if (check_pud_range(vma, pgd, addr, next, nodes,
 472                                    flags, private))
 473                        return -EIO;
 474        } while (pgd++, addr = next, addr != end);
 475        return 0;
 476}
 477
 478/*
 479 * Check if all pages in a range are on a set of nodes.
 480 * If pagelist != NULL then isolate pages from the LRU and
 481 * put them on the pagelist.
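      * Returns the first vma covering (or following) start on success, or an
      * ERR_PTR() on failure.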
 482 */
 483static struct vm_area_struct *
 484check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 485                const nodemask_t *nodes, unsigned long flags, void *private)
 486{
 487        int err;
 488        struct vm_area_struct *first, *vma, *prev;
 489
 490        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 491
 492                err = migrate_prep();
 493                if (err)
 494                        return ERR_PTR(err);
 495        }
 496
 497        first = find_vma(mm, start);
 498        if (!first)
 499                return ERR_PTR(-EFAULT);
 500        prev = NULL;
 501        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 502                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 503                        if (!vma->vm_next && vma->vm_end < end)
 504                                return ERR_PTR(-EFAULT);
 505                        if (prev && prev->vm_end < vma->vm_start)
 506                                return ERR_PTR(-EFAULT);
 507                }
 508                if (!is_vm_hugetlb_page(vma) &&
 509                    ((flags & MPOL_MF_STRICT) ||
 510                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 511                                vma_migratable(vma)))) {
 512                        unsigned long endvma = vma->vm_end;
 513
 514                        if (endvma > end)
 515                                endvma = end;
 516                        if (vma->vm_start > start)
 517                                start = vma->vm_start;
 518                        err = check_pgd_range(vma, start, endvma, nodes,
 519                                                flags, private);
 520                        if (err) {
 521                                first = ERR_PTR(err);
 522                                break;
 523                        }
 524                }
 525                prev = vma;
 526        }
 527        return first;
 528}
 529
 530/* Apply policy to a single VMA */
 531static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 532{
 533        int err = 0;
 534        struct mempolicy *old = vma->vm_policy;
 535
 536        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 537                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 538                 vma->vm_ops, vma->vm_file,
 539                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 540
 541        if (vma->vm_ops && vma->vm_ops->set_policy)
 542                err = vma->vm_ops->set_policy(vma, new);
 543        if (!err) {
 544                mpol_get(new);
 545                vma->vm_policy = new;
 546                mpol_put(old);
 547        }
 548        return err;
 549}
 550
 551/* Step 2: apply policy to a range and do splits. */
 552static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 553                       unsigned long end, struct mempolicy *new)
 554{
 555        struct vm_area_struct *next;
 556        int err;
 557
 558        err = 0;
 559        for (; vma && vma->vm_start < end; vma = next) {
 560                next = vma->vm_next;
 561                if (vma->vm_start < start)
 562                        err = split_vma(vma->vm_mm, vma, start, 1);
 563                if (!err && vma->vm_end > end)
 564                        err = split_vma(vma->vm_mm, vma, end, 0);
 565                if (!err)
 566                        err = policy_vma(vma, new);
 567                if (err)
 568                        break;
 569        }
 570        return err;
 571}
 572
 573/*
 574 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 575 * mempolicy.  Allows more rapid checking of this (combined perhaps
 576 * with other PF_* flag bits) on memory allocation hot code paths.
 577 *
 578 * If called from outside this file, the task 'p' should -only- be
 579 * a newly forked child not yet visible on the task list, because
 580 * manipulating the task flags of a visible task is not safe.
 581 *
 582 * The above limitation is why this routine has the funny name
 583 * mpol_fix_fork_child_flag().
 584 *
 585 * It is also safe to call this with a task pointer of current,
 586 * which the static wrapper mpol_set_task_struct_flag() does,
 587 * for use within this file.
 588 */
 589
 590void mpol_fix_fork_child_flag(struct task_struct *p)
 591{
 592        if (p->mempolicy)
 593                p->flags |= PF_MEMPOLICY;
 594        else
 595                p->flags &= ~PF_MEMPOLICY;
 596}
 597
 598static void mpol_set_task_struct_flag(void)
 599{
 600        mpol_fix_fork_child_flag(current);
 601}
 602
 603/* Set the process memory policy */
 604static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 605                             nodemask_t *nodes)
 606{
 607        struct mempolicy *new;
 608        struct mm_struct *mm = current->mm;
 609
 610        new = mpol_new(mode, flags, nodes);
 611        if (IS_ERR(new))
 612                return PTR_ERR(new);
 613
 614        /*
 615         * prevent changing our mempolicy while show_numa_maps()
 616         * is using it.
 617         * Note:  do_set_mempolicy() can be called at init time
 618         * with no 'mm'.
 619         */
 620        if (mm)
 621                down_write(&mm->mmap_sem);
 622        mpol_put(current->mempolicy);
 623        current->mempolicy = new;
 624        mpol_set_task_struct_flag();
 625        if (new && new->mode == MPOL_INTERLEAVE &&
 626            nodes_weight(new->v.nodes))
 627                current->il_next = first_node(new->v.nodes);
 628        if (mm)
 629                up_write(&mm->mmap_sem);
 630
 631        return 0;
 632}
 633
 634/*
 635 * Return nodemask for policy for get_mempolicy() query
 636 */
 637static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 638{
 639        nodes_clear(*nodes);
 640        if (p == &default_policy)
 641                return;
 642
 643        switch (p->mode) {
 644        case MPOL_BIND:
 645                /* Fall through */
 646        case MPOL_INTERLEAVE:
 647                *nodes = p->v.nodes;
 648                break;
 649        case MPOL_PREFERRED:
 650                if (!(p->flags & MPOL_F_LOCAL))
 651                        node_set(p->v.preferred_node, *nodes);
 652                /* else return empty node mask for local allocation */
 653                break;
 654        default:
 655                BUG();
 656        }
 657}
 658
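     /*
      * Return the node that currently backs the page at @addr, faulting the
      * page in via get_user_pages() if necessary.  Used by do_get_mempolicy()
      * below for the MPOL_F_NODE | MPOL_F_ADDR query.
      */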
 659static int lookup_node(struct mm_struct *mm, unsigned long addr)
 660{
 661        struct page *p;
 662        int err;
 663
 664        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 665        if (err >= 0) {
 666                err = page_to_nid(p);
 667                put_page(p);
 668        }
 669        return err;
 670}
 671
 672/* Retrieve NUMA policy */
 673static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 674                             unsigned long addr, unsigned long flags)
 675{
 676        int err;
 677        struct mm_struct *mm = current->mm;
 678        struct vm_area_struct *vma = NULL;
 679        struct mempolicy *pol = current->mempolicy;
 680
 681        cpuset_update_task_memory_state();
 682        if (flags &
 683                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 684                return -EINVAL;
 685
 686        if (flags & MPOL_F_MEMS_ALLOWED) {
 687                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 688                        return -EINVAL;
 689                *policy = 0;    /* just so it's initialized */
 690                *nmask  = cpuset_current_mems_allowed;
 691                return 0;
 692        }
 693
 694        if (flags & MPOL_F_ADDR) {
 695                /*
 696                 * Do NOT fall back to task policy if the
 697                 * vma/shared policy at addr is NULL.  We
 698                 * want to return MPOL_DEFAULT in this case.
 699                 */
 700                down_read(&mm->mmap_sem);
 701                vma = find_vma_intersection(mm, addr, addr+1);
 702                if (!vma) {
 703                        up_read(&mm->mmap_sem);
 704                        return -EFAULT;
 705                }
 706                if (vma->vm_ops && vma->vm_ops->get_policy)
 707                        pol = vma->vm_ops->get_policy(vma, addr);
 708                else
 709                        pol = vma->vm_policy;
 710        } else if (addr)
 711                return -EINVAL;
 712
 713        if (!pol)
 714                pol = &default_policy;  /* indicates default behavior */
 715
 716        if (flags & MPOL_F_NODE) {
 717                if (flags & MPOL_F_ADDR) {
 718                        err = lookup_node(mm, addr);
 719                        if (err < 0)
 720                                goto out;
 721                        *policy = err;
 722                } else if (pol == current->mempolicy &&
 723                                pol->mode == MPOL_INTERLEAVE) {
 724                        *policy = current->il_next;
 725                } else {
 726                        err = -EINVAL;
 727                        goto out;
 728                }
 729        } else {
 730                *policy = pol == &default_policy ? MPOL_DEFAULT :
 731                                                pol->mode;
 732                /*
 733                 * Internal mempolicy flags must be masked off before exposing
 734                 * the policy to userspace.
 735                 */
 736                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 737        }
 738
 739        if (vma) {
 740                up_read(&current->mm->mmap_sem);
 741                vma = NULL;
 742        }
 743
 744        err = 0;
 745        if (nmask)
 746                get_policy_nodemask(pol, nmask);
 747
 748 out:
 749        mpol_cond_put(pol);
 750        if (vma)
 751                up_read(&current->mm->mmap_sem);
 752        return err;
 753}
 754
 755#ifdef CONFIG_MIGRATION
 756/*
 757 * page migration
 758 */
 759static void migrate_page_add(struct page *page, struct list_head *pagelist,
 760                                unsigned long flags)
 761{
 762        /*
 763         * Avoid migrating a page that is shared with others.
 764         */
 765        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 766                isolate_lru_page(page, pagelist);
 767}
 768
 769static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 770{
 771        return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 772}
 773
 774/*
 775 * Migrate pages from one node to a target node.
 776 * Returns error or the number of pages not migrated.
 777 */
 778static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 779                           int flags)
 780{
 781        nodemask_t nmask;
 782        LIST_HEAD(pagelist);
 783        int err = 0;
 784
 785        nodes_clear(nmask);
 786        node_set(source, nmask);
 787
 788        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 789                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 790
 791        if (!list_empty(&pagelist))
 792                err = migrate_pages(&pagelist, new_node_page, dest);
 793
 794        return err;
 795}
 796
 797/*
 798 * Move pages between the two nodesets so as to preserve the physical
 799 * layout as much as possible.
 800 *
  801 * Returns the number of pages that could not be moved.
 802 */
 803int do_migrate_pages(struct mm_struct *mm,
 804        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 805{
 806        LIST_HEAD(pagelist);
 807        int busy = 0;
 808        int err = 0;
 809        nodemask_t tmp;
 810
 811        down_read(&mm->mmap_sem);
 812
 813        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 814        if (err)
 815                goto out;
 816
 817/*
 818 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 819 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 820 * bit in 'tmp', and return that <source, dest> pair for migration.
 821 * The pair of nodemasks 'to' and 'from' define the map.
 822 *
 823 * If no pair of bits is found that way, fallback to picking some
 824 * pair of 'source' and 'dest' bits that are not the same.  If the
 825 * 'source' and 'dest' bits are the same, this represents a node
 826 * that will be migrating to itself, so no pages need move.
 827 *
 828 * If no bits are left in 'tmp', or if all remaining bits left
 829 * in 'tmp' correspond to the same bit in 'to', return false
 830 * (nothing left to migrate).
 831 *
 832 * This lets us pick a pair of nodes to migrate between, such that
 833 * if possible the dest node is not already occupied by some other
 834 * source node, minimizing the risk of overloading the memory on a
 835 * node that would happen if we migrated incoming memory to a node
  836 * before migrating outgoing memory off that same node.
 837 *
 838 * A single scan of tmp is sufficient.  As we go, we remember the
 839 * most recent <s, d> pair that moved (s != d).  If we find a pair
 840 * that not only moved, but what's better, moved to an empty slot
 841 * (d is not set in tmp), then we break out then, with that pair.
  842 * Otherwise when we finish scanning tmp, we at least have the
 843 * most recent <s, d> pair that moved.  If we get all the way through
 844 * the scan of tmp without finding any node that moved, much less
 845 * moved to an empty node, then there is nothing left worth migrating.
 846 */
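     /*
      * Example: from_nodes = {0,1}, to_nodes = {1,2}.  node_remap() gives
      * 0 -> 1 and 1 -> 2.  The first scan remembers <0,1> but keeps looking
      * because node 1 is still a pending source, then finds <1,2> with an
      * empty destination and migrates node 1 to node 2 first.  The second
      * pass then moves node 0 onto the now-vacated node 1.
      */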
 847
 848        tmp = *from_nodes;
 849        while (!nodes_empty(tmp)) {
 850                int s,d;
 851                int source = -1;
 852                int dest = 0;
 853
 854                for_each_node_mask(s, tmp) {
 855                        d = node_remap(s, *from_nodes, *to_nodes);
 856                        if (s == d)
 857                                continue;
 858
 859                        source = s;     /* Node moved. Memorize */
 860                        dest = d;
 861
 862                        /* dest not in remaining from nodes? */
 863                        if (!node_isset(dest, tmp))
 864                                break;
 865                }
 866                if (source == -1)
 867                        break;
 868
 869                node_clear(source, tmp);
 870                err = migrate_to_node(mm, source, dest, flags);
 871                if (err > 0)
 872                        busy += err;
 873                if (err < 0)
 874                        break;
 875        }
 876out:
 877        up_read(&mm->mmap_sem);
 878        if (err < 0)
 879                return err;
 880        return busy;
 881
 882}
 883
 884/*
 885 * Allocate a new page for page migration based on vma policy.
 886 * Start assuming that page is mapped by vma pointed to by @private.
 887 * Search forward from there, if not.  N.B., this assumes that the
 888 * list of pages handed to migrate_pages()--which is how we get here--
 889 * is in virtual address order.
 890 */
 891static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 892{
 893        struct vm_area_struct *vma = (struct vm_area_struct *)private;
 894        unsigned long uninitialized_var(address);
 895
 896        while (vma) {
 897                address = page_address_in_vma(page, vma);
 898                if (address != -EFAULT)
 899                        break;
 900                vma = vma->vm_next;
 901        }
 902
 903        /*
 904         * if !vma, alloc_page_vma() will use task or system default policy
 905         */
 906        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 907}
 908#else
 909
 910static void migrate_page_add(struct page *page, struct list_head *pagelist,
 911                                unsigned long flags)
 912{
 913}
 914
 915int do_migrate_pages(struct mm_struct *mm,
 916        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 917{
 918        return -ENOSYS;
 919}
 920
 921static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 922{
 923        return NULL;
 924}
 925#endif
 926
 927static long do_mbind(unsigned long start, unsigned long len,
 928                     unsigned short mode, unsigned short mode_flags,
 929                     nodemask_t *nmask, unsigned long flags)
 930{
 931        struct vm_area_struct *vma;
 932        struct mm_struct *mm = current->mm;
 933        struct mempolicy *new;
 934        unsigned long end;
 935        int err;
 936        LIST_HEAD(pagelist);
 937
 938        if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 939                                     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 940                return -EINVAL;
 941        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 942                return -EPERM;
 943
 944        if (start & ~PAGE_MASK)
 945                return -EINVAL;
 946
 947        if (mode == MPOL_DEFAULT)
 948                flags &= ~MPOL_MF_STRICT;
 949
 950        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 951        end = start + len;
 952
 953        if (end < start)
 954                return -EINVAL;
 955        if (end == start)
 956                return 0;
 957
 958        new = mpol_new(mode, mode_flags, nmask);
 959        if (IS_ERR(new))
 960                return PTR_ERR(new);
 961
 962        /*
 963         * If we are using the default policy then operation
 964         * on discontinuous address spaces is okay after all
 965         */
 966        if (!new)
 967                flags |= MPOL_MF_DISCONTIG_OK;
 968
 969        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 970                 start, start + len, mode, mode_flags,
 971                 nmask ? nodes_addr(*nmask)[0] : -1);
 972
 973        down_write(&mm->mmap_sem);
 974        vma = check_range(mm, start, end, nmask,
 975                          flags | MPOL_MF_INVERT, &pagelist);
 976
 977        err = PTR_ERR(vma);
 978        if (!IS_ERR(vma)) {
 979                int nr_failed = 0;
 980
 981                err = mbind_range(vma, start, end, new);
 982
 983                if (!list_empty(&pagelist))
 984                        nr_failed = migrate_pages(&pagelist, new_vma_page,
 985                                                (unsigned long)vma);
 986
 987                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 988                        err = -EIO;
 989        }
 990
 991        up_write(&mm->mmap_sem);
 992        mpol_put(new);
 993        return err;
 994}
 995
 996/*
 997 * User space interface with variable sized bitmaps for nodelists.
 998 */
 999
1000/* Copy a node mask from user space. */
1001static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1002                     unsigned long maxnode)
1003{
1004        unsigned long k;
1005        unsigned long nlongs;
1006        unsigned long endmask;
1007
1008        --maxnode;
1009        nodes_clear(*nodes);
1010        if (maxnode == 0 || !nmask)
1011                return 0;
1012        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1013                return -EINVAL;
1014
1015        nlongs = BITS_TO_LONGS(maxnode);
1016        if ((maxnode % BITS_PER_LONG) == 0)
1017                endmask = ~0UL;
1018        else
1019                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1020
 1021        /* When the user specifies more nodes than supported just check
 1022           that the unsupported part is all zero. */
1023        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1024                if (nlongs > PAGE_SIZE/sizeof(long))
1025                        return -EINVAL;
1026                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1027                        unsigned long t;
1028                        if (get_user(t, nmask + k))
1029                                return -EFAULT;
1030                        if (k == nlongs - 1) {
1031                                if (t & endmask)
1032                                        return -EINVAL;
1033                        } else if (t)
1034                                return -EINVAL;
1035                }
1036                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1037                endmask = ~0UL;
1038        }
1039
1040        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1041                return -EFAULT;
1042        nodes_addr(*nodes)[nlongs-1] &= endmask;
1043        return 0;
1044}
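     /*
      * Example: on a 64-bit kernel with MAX_NUMNODES = 64, maxnode = 33
      * describes bits 0..31: after --maxnode, nlongs = 1 and endmask =
      * (1UL << 32) - 1, so only the low 32 bits of the copied word survive.
      * A mask longer than MAX_NUMNODES is accepted only if the unsupported
      * part is all zero.
      */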
1045
1046/* Copy a kernel node mask to user space */
1047static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1048                              nodemask_t *nodes)
1049{
1050        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1051        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1052
1053        if (copy > nbytes) {
1054                if (copy > PAGE_SIZE)
1055                        return -EINVAL;
1056                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1057                        return -EFAULT;
1058                copy = nbytes;
1059        }
1060        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1061}
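     /*
      * Example: with MAX_NUMNODES = 64 (nbytes = 8) and a user mask sized for
      * maxnode = 129 bits, copy = ALIGN(128, 64) / 8 = 16 bytes; the 8 bytes
      * beyond the kernel's nodemask are cleared in the user buffer and only
      * the first 8 bytes are copied from *nodes.
      */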
1062
1063asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1064                        unsigned long mode,
1065                        unsigned long __user *nmask, unsigned long maxnode,
1066                        unsigned flags)
1067{
1068        nodemask_t nodes;
1069        int err;
1070        unsigned short mode_flags;
1071
1072        mode_flags = mode & MPOL_MODE_FLAGS;
1073        mode &= ~MPOL_MODE_FLAGS;
1074        if (mode >= MPOL_MAX)
1075                return -EINVAL;
1076        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1077            (mode_flags & MPOL_F_RELATIVE_NODES))
1078                return -EINVAL;
1079        err = get_nodes(&nodes, nmask, maxnode);
1080        if (err)
1081                return err;
1082        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1083}
1084
1085/* Set the process memory policy */
1086asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1087                unsigned long maxnode)
1088{
1089        int err;
1090        nodemask_t nodes;
1091        unsigned short flags;
1092
1093        flags = mode & MPOL_MODE_FLAGS;
1094        mode &= ~MPOL_MODE_FLAGS;
1095        if ((unsigned int)mode >= MPOL_MAX)
1096                return -EINVAL;
1097        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1098                return -EINVAL;
1099        err = get_nodes(&nodes, nmask, maxnode);
1100        if (err)
1101                return err;
1102        return do_set_mempolicy(mode, flags, &nodes);
1103}
1104
1105asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1106                const unsigned long __user *old_nodes,
1107                const unsigned long __user *new_nodes)
1108{
1109        struct mm_struct *mm;
1110        struct task_struct *task;
1111        nodemask_t old;
1112        nodemask_t new;
1113        nodemask_t task_nodes;
1114        int err;
1115
1116        err = get_nodes(&old, old_nodes, maxnode);
1117        if (err)
1118                return err;
1119
1120        err = get_nodes(&new, new_nodes, maxnode);
1121        if (err)
1122                return err;
1123
1124        /* Find the mm_struct */
1125        read_lock(&tasklist_lock);
1126        task = pid ? find_task_by_vpid(pid) : current;
1127        if (!task) {
1128                read_unlock(&tasklist_lock);
1129                return -ESRCH;
1130        }
1131        mm = get_task_mm(task);
1132        read_unlock(&tasklist_lock);
1133
1134        if (!mm)
1135                return -EINVAL;
1136
1137        /*
1138         * Check if this process has the right to modify the specified
1139         * process. The right exists if the process has administrative
1140         * capabilities, superuser privileges or the same
1141         * userid as the target process.
1142         */
1143        if ((current->euid != task->suid) && (current->euid != task->uid) &&
1144            (current->uid != task->suid) && (current->uid != task->uid) &&
1145            !capable(CAP_SYS_NICE)) {
1146                err = -EPERM;
1147                goto out;
1148        }
1149
1150        task_nodes = cpuset_mems_allowed(task);
1151        /* Is the user allowed to access the target nodes? */
1152        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1153                err = -EPERM;
1154                goto out;
1155        }
1156
1157        if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1158                err = -EINVAL;
1159                goto out;
1160        }
1161
1162        err = security_task_movememory(task);
1163        if (err)
1164                goto out;
1165
1166        err = do_migrate_pages(mm, &old, &new,
1167                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1168out:
1169        mmput(mm);
1170        return err;
1171}
1172
1173
1174/* Retrieve NUMA policy */
1175asmlinkage long sys_get_mempolicy(int __user *policy,
1176                                unsigned long __user *nmask,
1177                                unsigned long maxnode,
1178                                unsigned long addr, unsigned long flags)
1179{
1180        int err;
1181        int uninitialized_var(pval);
1182        nodemask_t nodes;
1183
1184        if (nmask != NULL && maxnode < MAX_NUMNODES)
1185                return -EINVAL;
1186
1187        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1188
1189        if (err)
1190                return err;
1191
1192        if (policy && put_user(pval, policy))
1193                return -EFAULT;
1194
1195        if (nmask)
1196                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1197
1198        return err;
1199}
1200
1201#ifdef CONFIG_COMPAT
1202
1203asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1204                                     compat_ulong_t __user *nmask,
1205                                     compat_ulong_t maxnode,
1206                                     compat_ulong_t addr, compat_ulong_t flags)
1207{
1208        long err;
1209        unsigned long __user *nm = NULL;
1210        unsigned long nr_bits, alloc_size;
1211        DECLARE_BITMAP(bm, MAX_NUMNODES);
1212
1213        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1214        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1215
1216        if (nmask)
1217                nm = compat_alloc_user_space(alloc_size);
1218
1219        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1220
1221        if (!err && nmask) {
1222                err = copy_from_user(bm, nm, alloc_size);
1223                /* ensure entire bitmap is zeroed */
1224                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1225                err |= compat_put_bitmap(nmask, bm, nr_bits);
1226        }
1227
1228        return err;
1229}
1230
1231asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1232                                     compat_ulong_t maxnode)
1233{
1234        long err = 0;
1235        unsigned long __user *nm = NULL;
1236        unsigned long nr_bits, alloc_size;
1237        DECLARE_BITMAP(bm, MAX_NUMNODES);
1238
1239        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1240        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1241
1242        if (nmask) {
1243                err = compat_get_bitmap(bm, nmask, nr_bits);
1244                nm = compat_alloc_user_space(alloc_size);
1245                err |= copy_to_user(nm, bm, alloc_size);
1246        }
1247
1248        if (err)
1249                return -EFAULT;
1250
1251        return sys_set_mempolicy(mode, nm, nr_bits+1);
1252}
1253
1254asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1255                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1256                             compat_ulong_t maxnode, compat_ulong_t flags)
1257{
1258        long err = 0;
1259        unsigned long __user *nm = NULL;
1260        unsigned long nr_bits, alloc_size;
1261        nodemask_t bm;
1262
1263        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1264        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1265
1266        if (nmask) {
1267                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1268                nm = compat_alloc_user_space(alloc_size);
1269                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1270        }
1271
1272        if (err)
1273                return -EFAULT;
1274
1275        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1276}
1277
1278#endif
1279
1280/*
1281 * get_vma_policy(@task, @vma, @addr)
1282 * @task - task for fallback if vma policy == default
1283 * @vma   - virtual memory area whose policy is sought
1284 * @addr  - address in @vma for shared policy lookup
1285 *
1286 * Returns effective policy for a VMA at specified address.
1287 * Falls back to @task or system default policy, as necessary.
1288 * Current or other task's task mempolicy and non-shared vma policies
1289 * are protected by the task's mmap_sem, which must be held for read by
1290 * the caller.
1291 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1292 * count--added by the get_policy() vm_op, as appropriate--to protect against
1293 * freeing by another task.  It is the caller's responsibility to free the
1294 * extra reference for shared policies.
1295 */
1296static struct mempolicy *get_vma_policy(struct task_struct *task,
1297                struct vm_area_struct *vma, unsigned long addr)
1298{
1299        struct mempolicy *pol = task->mempolicy;
1300
1301        if (vma) {
1302                if (vma->vm_ops && vma->vm_ops->get_policy) {
1303                        struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1304                                                                        addr);
1305                        if (vpol)
1306                                pol = vpol;
1307                } else if (vma->vm_policy)
1308                        pol = vma->vm_policy;
1309        }
1310        if (!pol)
1311                pol = &default_policy;
1312        return pol;
1313}
1314
1315/*
1316 * Return a nodemask representing a mempolicy for filtering nodes for
1317 * page allocation
1318 */
1319static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1320{
1321        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1322        if (unlikely(policy->mode == MPOL_BIND) &&
1323                        gfp_zone(gfp) >= policy_zone &&
1324                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1325                return &policy->v.nodes;
1326
1327        return NULL;
1328}
1329
1330/* Return a zonelist indicated by gfp for node representing a mempolicy */
1331static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1332{
1333        int nd = numa_node_id();
1334
1335        switch (policy->mode) {
1336        case MPOL_PREFERRED:
1337                if (!(policy->flags & MPOL_F_LOCAL))
1338                        nd = policy->v.preferred_node;
1339                break;
1340        case MPOL_BIND:
1341                /*
1342                 * Normally, MPOL_BIND allocations are node-local within the
1343                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
 1344                 * current node isn't part of the mask, we use the zonelist for
1345                 * the first node in the mask instead.
1346                 */
1347                if (unlikely(gfp & __GFP_THISNODE) &&
1348                                unlikely(!node_isset(nd, policy->v.nodes)))
1349                        nd = first_node(policy->v.nodes);
1350                break;
1351        case MPOL_INTERLEAVE: /* should not happen */
1352                break;
1353        default:
1354                BUG();
1355        }
1356        return node_zonelist(nd, gfp);
1357}
1358
1359/* Do dynamic interleaving for a process */
1360static unsigned interleave_nodes(struct mempolicy *policy)
1361{
1362        unsigned nid, next;
1363        struct task_struct *me = current;
1364
1365        nid = me->il_next;
1366        next = next_node(nid, policy->v.nodes);
1367        if (next >= MAX_NUMNODES)
1368                next = first_node(policy->v.nodes);
1369        if (next < MAX_NUMNODES)
1370                me->il_next = next;
1371        return nid;
1372}
1373
1374/*
1375 * Depending on the memory policy provide a node from which to allocate the
1376 * next slab entry.
 1377 * @policy must be protected from freeing by the caller.  If @policy is
 1378 * the current task's mempolicy, this protection is implicit, as only the
 1379 * task can change its policy.  The system default policy requires no
1380 * such protection.
1381 */
1382unsigned slab_node(struct mempolicy *policy)
1383{
1384        if (!policy || policy->flags & MPOL_F_LOCAL)
1385                return numa_node_id();
1386
1387        switch (policy->mode) {
1388        case MPOL_PREFERRED:
1389                /*
1390                 * handled MPOL_F_LOCAL above
1391                 */
1392                return policy->v.preferred_node;
1393
1394        case MPOL_INTERLEAVE:
1395                return interleave_nodes(policy);
1396
1397        case MPOL_BIND: {
1398                /*
1399                 * Follow bind policy behavior and start allocation at the
1400                 * first node.
1401                 */
1402                struct zonelist *zonelist;
1403                struct zone *zone;
1404                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1405                zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1406                (void)first_zones_zonelist(zonelist, highest_zoneidx,
1407                                                        &policy->v.nodes,
1408                                                        &zone);
1409                return zone->node;
1410        }
1411
1412        default:
1413                BUG();
1414        }
1415}
1416
1417/* Do static interleaving for a VMA with known offset. */
1418static unsigned offset_il_node(struct mempolicy *pol,
1419                struct vm_area_struct *vma, unsigned long off)
1420{
1421        unsigned nnodes = nodes_weight(pol->v.nodes);
1422        unsigned target;
1423        int c;
1424        int nid = -1;
1425
1426        if (!nnodes)
1427                return numa_node_id();
1428        target = (unsigned int)off % nnodes;
1429        c = 0;
1430        do {
1431                nid = next_node(nid, pol->v.nodes);
1432                c++;
1433        } while (c <= target);
1434        return nid;
1435}
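     /*
      * Example: with an interleave mask of {0,2,5} (nnodes = 3) and off = 7,
      * target = 7 % 3 = 1 and the walk above returns the second node of the
      * mask, node 2.
      */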
1436
1437/* Determine a node number for interleave */
1438static inline unsigned interleave_nid(struct mempolicy *pol,
1439                 struct vm_area_struct *vma, unsigned long addr, int shift)
1440{
1441        if (vma) {
1442                unsigned long off;
1443
1444                /*
1445                 * for small pages, there is no difference between
1446                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1447                 * for huge pages, since vm_pgoff is in units of small
1448                 * pages, we need to shift off the always 0 bits to get
1449                 * a useful offset.
1450                 */
1451                BUG_ON(shift < PAGE_SHIFT);
1452                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1453                off += (addr - vma->vm_start) >> shift;
1454                return offset_il_node(pol, vma, off);
1455        } else
1456                return interleave_nodes(pol);
1457}
1458
1459#ifdef CONFIG_HUGETLBFS
1460/*
1461 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1462 * @vma = virtual memory area whose policy is sought
1463 * @addr = address in @vma for shared policy lookup and interleave policy
1464 * @gfp_flags = for requested zone
1465 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1466 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1467 *
1468 * Returns a zonelist suitable for a huge page allocation and a pointer
1469 * to the struct mempolicy for conditional unref after allocation.
 1470 * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
1471 * @nodemask for filtering the zonelist.
1472 */
1473struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1474                                gfp_t gfp_flags, struct mempolicy **mpol,
1475                                nodemask_t **nodemask)
1476{
1477        struct zonelist *zl;
1478
1479        *mpol = get_vma_policy(current, vma, addr);
1480        *nodemask = NULL;       /* assume !MPOL_BIND */
1481
1482        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484                                                HPAGE_SHIFT), gfp_flags);
1485        } else {
1486                zl = policy_zonelist(gfp_flags, *mpol);
1487                if ((*mpol)->mode == MPOL_BIND)
1488                        *nodemask = &(*mpol)->v.nodes;
1489        }
1490        return zl;
1491}
1492#endif
1493
1494/* Allocate a page in interleaved policy.
1495   Own path because it needs to do special accounting. */
1496static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1497                                        unsigned nid)
1498{
1499        struct zonelist *zl;
1500        struct page *page;
1501
1502        zl = node_zonelist(nid, gfp);
1503        page = __alloc_pages(gfp, order, zl);
1504        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1505                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1506        return page;
1507}
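
/*
 * The NUMA_INTERLEAVE_HIT counter bumped above is exported per node as
 * the "interleave_hit" field of /sys/devices/system/node/nodeN/numastat,
 * so the effect of an interleave policy is observable from userspace,
 * e.g. (illustrative shell command):
 *
 *	grep interleave_hit /sys/devices/system/node/node*/numastat
 */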
1508
1509/**
1510 *      alloc_page_vma  - Allocate a page for a VMA.
1511 *
1512 *      @gfp:
1513 *      %GFP_USER    user allocation.
1514 *      %GFP_KERNEL  kernel allocations,
1515 *      %GFP_HIGHMEM highmem/user allocations,
1516 *      %GFP_FS      allocation should not call back into a file system.
1517 *      %GFP_ATOMIC  don't sleep.
1518 *
1519 *      @vma:  Pointer to VMA or NULL if not available.
1520 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1521 *
1522 *      This function allocates a page from the kernel page pool and applies
1523 *      a NUMA policy associated with the VMA or the current process.
1524 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1525 *      mm_struct of the VMA to prevent it from going away. Should be used for
1526 *      all allocations for pages that will be mapped into
1527 *      user space. Returns NULL when no page can be allocated.
1528 *
1529 *      Should be called with the mmap_sem of the vma held.
1530 */
1531struct page *
1532alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1533{
1534        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1535        struct zonelist *zl;
1536
1537        cpuset_update_task_memory_state();
1538
1539        if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1540                unsigned nid;
1541
1542                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1543                mpol_cond_put(pol);
1544                return alloc_page_interleave(gfp, 0, nid);
1545        }
1546        zl = policy_zonelist(gfp, pol);
1547        if (unlikely(mpol_needs_cond_ref(pol))) {
1548                /*
1549                 * slow path: ref counted shared policy
1550                 */
1551                struct page *page =  __alloc_pages_nodemask(gfp, 0,
1552                                                zl, policy_nodemask(gfp, pol));
1553                __mpol_put(pol);
1554                return page;
1555        }
1556        /*
1557         * fast path:  default or task policy
1558         */
1559        return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1560}
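
/*
 * Minimal illustrative call site, assuming the caller has looked up the
 * faulting VMA and, per the comment above, holds the mmap_sem for read
 * (the gfp mask is just an example):
 *
 *	down_read(&mm->mmap_sem);
 *	vma = find_vma(mm, addr);
 *	if (vma)
 *		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
 *	up_read(&mm->mmap_sem);
 */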
1561
1562/**
1563 *      alloc_pages_current - Allocate pages.
1564 *
1565 *      @gfp:
1566 *              %GFP_USER   user allocation,
1567 *              %GFP_KERNEL kernel allocation,
1568 *              %GFP_HIGHMEM highmem allocation,
1569 *              %GFP_FS     don't call back into a file system.
1570 *              %GFP_ATOMIC don't sleep.
1571 *      @order: Power of two of allocation size in pages. 0 is a single page.
1572 *
1573 *      Allocate a page from the kernel page pool.  When not in
1574 *      interrupt context, apply the current process' NUMA policy.
1575 *      Returns NULL when no page can be allocated.
1576 *
1577 *      Don't call cpuset_update_task_memory_state() unless
1578 *      1) it's ok to take cpuset_sem (can WAIT), and
1579 *      2) allocating for current task (not interrupt).
1580 */
1581struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1582{
1583        struct mempolicy *pol = current->mempolicy;
1584
1585        if ((gfp & __GFP_WAIT) && !in_interrupt())
1586                cpuset_update_task_memory_state();
1587        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1588                pol = &default_policy;
1589
1590        /*
1591         * No reference counting needed for current->mempolicy
1592         * nor system default_policy
1593         */
1594        if (pol->mode == MPOL_INTERLEAVE)
1595                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1596        return __alloc_pages_nodemask(gfp, order,
1597                        policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1598}
1599EXPORT_SYMBOL(alloc_pages_current);
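
/*
 * On CONFIG_NUMA kernels the generic alloc_pages(gfp, order) helper in
 * linux/gfp.h resolves to alloc_pages_current(), so an ordinary
 * process-context allocation such as
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * picks up the task mempolicy automatically.
 */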
1600
1601/*
1602 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1603 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1604 * with the mems_allowed returned by cpuset_mems_allowed().  This
1605 * keeps mempolicies cpuset relative after the task's cpuset moves.  See
1606 * further kernel/cpuset.c update_nodemask().
1607 */
1608
1609/* Slow path of a mempolicy duplicate */
1610struct mempolicy *__mpol_dup(struct mempolicy *old)
1611{
1612        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1613
1614        if (!new)
1615                return ERR_PTR(-ENOMEM);
1616        if (current_cpuset_is_being_rebound()) {
1617                nodemask_t mems = cpuset_mems_allowed(current);
1618                mpol_rebind_policy(old, &mems);
1619        }
1620        *new = *old;
1621        atomic_set(&new->refcnt, 1);
1622        return new;
1623}
1624
1625/*
1626 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1627 * eliminate the MPOL_F_* flags that require conditional ref and
1628 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1629 * after return.  Use the returned value.
1630 *
1631 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1632 * policy lookup, even if the policy needs/has extra ref on lookup.
1633 * shmem_readahead needs this.
1634 */
1635struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1636                                                struct mempolicy *frompol)
1637{
1638        if (!mpol_needs_cond_ref(frompol))
1639                return frompol;
1640
1641        *tompol = *frompol;
1642        tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1643        __mpol_put(frompol);
1644        return tompol;
1645}
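
/*
 * Hedged sketch of the shmem readahead usage mentioned above (see
 * mm/shmem.c for the real caller; "info" and "idx" stand for the caller's
 * shmem inode info and page index): the shared policy lookup may return a
 * reference-counted policy, which mpol_cond_copy() flattens onto the
 * stack so later allocation code need not remember to unref it:
 *
 *	struct mempolicy mpol, *spol;
 *
 *	spol = mpol_cond_copy(&mpol,
 *			mpol_shared_policy_lookup(&info->policy, idx));
 *	... pass spol to the allocation helpers ...
 */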
1646
1647static int mpol_match_intent(const struct mempolicy *a,
1648                             const struct mempolicy *b)
1649{
1650        if (a->flags != b->flags)
1651                return 0;
1652        if (!mpol_store_user_nodemask(a))
1653                return 1;
1654        return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1655}
1656
1657/* Slow path of a mempolicy comparison */
1658int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1659{
1660        if (!a || !b)
1661                return 0;
1662        if (a->mode != b->mode)
1663                return 0;
1664        if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1665                return 0;
1666        switch (a->mode) {
1667        case MPOL_BIND:
1668                /* Fall through */
1669        case MPOL_INTERLEAVE:
1670                return nodes_equal(a->v.nodes, b->v.nodes);
1671        case MPOL_PREFERRED:
1672                return a->v.preferred_node == b->v.preferred_node &&
1673                        a->flags == b->flags;
1674        default:
1675                BUG();
1676                return 0;
1677        }
1678}
1679
1680/*
1681 * Shared memory backing store policy support.
1682 *
1683 * Remember policies even when nobody has shared memory mapped.
1684 * The policies are kept in Red-Black tree linked from the inode.
1685 * They are protected by the sp->lock spinlock, which should be held
1686 * for any accesses to the tree.
1687 */
1688
1689/* lookup first element intersecting start-end */
1690/* Caller holds sp->lock */
1691static struct sp_node *
1692sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1693{
1694        struct rb_node *n = sp->root.rb_node;
1695
1696        while (n) {
1697                struct sp_node *p = rb_entry(n, struct sp_node, nd);
1698
1699                if (start >= p->end)
1700                        n = n->rb_right;
1701                else if (end <= p->start)
1702                        n = n->rb_left;
1703                else
1704                        break;
1705        }
1706        if (!n)
1707                return NULL;
1708        for (;;) {
1709                struct sp_node *w = NULL;
1710                struct rb_node *prev = rb_prev(n);
1711                if (!prev)
1712                        break;
1713                w = rb_entry(prev, struct sp_node, nd);
1714                if (w->end <= start)
1715                        break;
1716                n = prev;
1717        }
1718        return rb_entry(n, struct sp_node, nd);
1719}
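
/*
 * Illustrative example: with stored ranges [0,4), [4,8) and [10,16)
 * (units are page offsets into the object), sp_lookup(sp, 5, 12) first
 * finds some intersecting node, then walks rb_prev() while the
 * predecessor still overlaps the request, and returns the [4,8) node,
 * i.e. the lowest-offset range intersecting [5,12).
 */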
1720
1721/* Insert a new shared policy into the list. */
1722/* Caller holds sp->lock */
1723static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1724{
1725        struct rb_node **p = &sp->root.rb_node;
1726        struct rb_node *parent = NULL;
1727        struct sp_node *nd;
1728
1729        while (*p) {
1730                parent = *p;
1731                nd = rb_entry(parent, struct sp_node, nd);
1732                if (new->start < nd->start)
1733                        p = &(*p)->rb_left;
1734                else if (new->end > nd->end)
1735                        p = &(*p)->rb_right;
1736                else
1737                        BUG();
1738        }
1739        rb_link_node(&new->nd, parent, p);
1740        rb_insert_color(&new->nd, &sp->root);
1741        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1742                 new->policy ? new->policy->mode : 0);
1743}
1744
1745/* Find shared policy intersecting idx */
1746struct mempolicy *
1747mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1748{
1749        struct mempolicy *pol = NULL;
1750        struct sp_node *sn;
1751
1752        if (!sp->root.rb_node)
1753                return NULL;
1754        spin_lock(&sp->lock);
1755        sn = sp_lookup(sp, idx, idx+1);
1756        if (sn) {
1757                mpol_get(sn->policy);
1758                pol = sn->policy;
1759        }
1760        spin_unlock(&sp->lock);
1761        return pol;
1762}
1763
1764static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1765{
1766        pr_debug("deleting %lx-%lx\n", n->start, n->end);
1767        rb_erase(&n->nd, &sp->root);
1768        mpol_put(n->policy);
1769        kmem_cache_free(sn_cache, n);
1770}
1771
1772static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1773                                struct mempolicy *pol)
1774{
1775        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1776
1777        if (!n)
1778                return NULL;
1779        n->start = start;
1780        n->end = end;
1781        mpol_get(pol);
1782        pol->flags |= MPOL_F_SHARED;    /* for unref */
1783        n->policy = pol;
1784        return n;
1785}
1786
1787/* Replace a policy range. */
1788static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1789                                 unsigned long end, struct sp_node *new)
1790{
1791        struct sp_node *n, *new2 = NULL;
1792
1793restart:
1794        spin_lock(&sp->lock);
1795        n = sp_lookup(sp, start, end);
1796        /* Take care of old policies in the same range. */
1797        while (n && n->start < end) {
1798                struct rb_node *next = rb_next(&n->nd);
1799                if (n->start >= start) {
1800                        if (n->end <= end)
1801                                sp_delete(sp, n);
1802                        else
1803                                n->start = end;
1804                } else {
1805                        /* Old policy spanning whole new range. */
1806                        if (n->end > end) {
1807                                if (!new2) {
1808                                        spin_unlock(&sp->lock);
1809                                        new2 = sp_alloc(end, n->end, n->policy);
1810                                        if (!new2)
1811                                                return -ENOMEM;
1812                                        goto restart;
1813                                }
1814                                n->end = start;
1815                                sp_insert(sp, new2);
1816                                new2 = NULL;
1817                                break;
1818                        } else
1819                                n->end = start;
1820                }
1821                if (!next)
1822                        break;
1823                n = rb_entry(next, struct sp_node, nd);
1824        }
1825        if (new)
1826                sp_insert(sp, new);
1827        spin_unlock(&sp->lock);
1828        if (new2) {
1829                mpol_put(new2->policy);
1830                kmem_cache_free(sn_cache, new2);
1831        }
1832        return 0;
1833}
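
/*
 * Illustrative example of the splitting done above: if the tree holds a
 * single node [0,16) carrying policy P and the caller installs policy Q
 * on [4,8), the old node is truncated to [0,4), a second node [8,16)
 * still carrying P is created via sp_alloc() (the "new2" case, which
 * restarts after dropping sp->lock), and the new [4,8) node with Q is
 * inserted, leaving:
 *
 *	[0,4)=P  [4,8)=Q  [8,16)=P
 */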
1834
1835/**
1836 * mpol_shared_policy_init - initialize shared policy for inode
1837 * @sp: pointer to inode shared policy
1838 * @mpol:  struct mempolicy to install
1839 *
1840 * Install non-NULL @mpol in inode's shared policy rb-tree.
1841 * On entry, the current task has a reference on a non-NULL @mpol.
1842 * This must be released on exit.
1843 */
1844void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1845{
1846        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1847        spin_lock_init(&sp->lock);
1848
1849        if (mpol) {
1850                struct vm_area_struct pvma;
1851                struct mempolicy *new;
1852
1853                /* contextualize the tmpfs mount point mempolicy */
1854                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1855                mpol_put(mpol); /* drop our ref on sb mpol */
1856                if (IS_ERR(new))
1857                        return;         /* no valid nodemask intersection */
1858
1859                /* Create pseudo-vma that contains just the policy */
1860                memset(&pvma, 0, sizeof(struct vm_area_struct));
1861                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1862                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1863                mpol_put(new);                  /* drop initial ref */
1864        }
1865}
1866
1867int mpol_set_shared_policy(struct shared_policy *info,
1868                        struct vm_area_struct *vma, struct mempolicy *npol)
1869{
1870        int err;
1871        struct sp_node *new = NULL;
1872        unsigned long sz = vma_pages(vma);
1873
1874        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1875                 vma->vm_pgoff,
1876                 sz, npol ? npol->mode : -1,
1877                 npol ? npol->flags : -1,
1878                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1879
1880        if (npol) {
1881                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1882                if (!new)
1883                        return -ENOMEM;
1884        }
1885        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1886        if (err && new)
1887                kmem_cache_free(sn_cache, new);
1888        return err;
1889}
1890
1891/* Free a backing policy store on inode delete. */
1892void mpol_free_shared_policy(struct shared_policy *p)
1893{
1894        struct sp_node *n;
1895        struct rb_node *next;
1896
1897        if (!p->root.rb_node)
1898                return;
1899        spin_lock(&p->lock);
1900        next = rb_first(&p->root);
1901        while (next) {
1902                n = rb_entry(next, struct sp_node, nd);
1903                next = rb_next(&n->nd);
1904                rb_erase(&n->nd, &p->root);
1905                mpol_put(n->policy);
1906                kmem_cache_free(sn_cache, n);
1907        }
1908        spin_unlock(&p->lock);
1909}
1910
1911/* assumes fs == KERNEL_DS */
1912void __init numa_policy_init(void)
1913{
1914        nodemask_t interleave_nodes;
1915        unsigned long largest = 0;
1916        int nid, prefer = 0;
1917
1918        policy_cache = kmem_cache_create("numa_policy",
1919                                         sizeof(struct mempolicy),
1920                                         0, SLAB_PANIC, NULL);
1921
1922        sn_cache = kmem_cache_create("shared_policy_node",
1923                                     sizeof(struct sp_node),
1924                                     0, SLAB_PANIC, NULL);
1925
1926        /*
1927         * Set interleaving policy for system init. Interleaving is only
1928         * enabled across suitably sized nodes (default is >= 16MB); if all
1929         * nodes are smaller, fall back to the largest node.
1930         */
1931        nodes_clear(interleave_nodes);
1932        for_each_node_state(nid, N_HIGH_MEMORY) {
1933                unsigned long total_pages = node_present_pages(nid);
1934
1935                /* Preserve the largest node */
1936                if (largest < total_pages) {
1937                        largest = total_pages;
1938                        prefer = nid;
1939                }
1940
1941                /* Interleave this node? */
1942                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1943                        node_set(nid, interleave_nodes);
1944        }
1945
1946        /* All too small, use the largest */
1947        if (unlikely(nodes_empty(interleave_nodes)))
1948                node_set(prefer, interleave_nodes);
1949
1950        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1951                printk(KERN_ERR "numa_policy_init: interleaving failed\n");
1952}
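
/*
 * Worked example of the interleave threshold in numa_policy_init():
 * (total_pages << PAGE_SHIFT) >= (16 << 20) means that with 4kB pages a
 * node needs at least (16 << 20) >> 12 == 4096 present pages to join the
 * boot-time interleave set.  Smaller nodes are skipped, and only if every
 * node is too small does the code fall back to the single largest node.
 */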
1953
1954/* Reset policy of current process to default */
1955void numa_default_policy(void)
1956{
1957        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1958}
1959
1960/*
1961 * Parse and format mempolicy from/to strings
1962 */
1963
1964/*
1965 * "local" is a pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
1966 * Used only for mpol_parse_str() and mpol_to_str()
1967 */
1968#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1969static const char * const policy_types[] =
1970        { "default", "prefer", "bind", "interleave", "local" };
1971
1972
1973#ifdef CONFIG_TMPFS
1974/**
1975 * mpol_parse_str - parse string to mempolicy
1976 * @str:  string containing mempolicy to parse
1977 * @mpol:  pointer to struct mempolicy pointer, returned on success.
1978 * @no_context:  flag whether to "contextualize" the mempolicy
1979 *
1980 * Format of input:
1981 *      <mode>[=<flags>][:<nodelist>]
1982 *
1983 * if @no_context is true, save the input nodemask in w.user_nodemask in
1984 * the returned mempolicy.  This will be used to "clone" the mempolicy in
1985 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
1986 * mount option.  Note that if 'static' or 'relative' mode flags were
1987 * specified, the input nodemask will already have been saved.  Saving
1988 * it again is redundant, but safe.
1989 *
1990 * On success, returns 0, else 1
1991 */
1992int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1993{
1994        struct mempolicy *new = NULL;
1995        unsigned short uninitialized_var(mode);
1996        unsigned short uninitialized_var(mode_flags);
1997        nodemask_t nodes;
1998        char *nodelist = strchr(str, ':');
1999        char *flags = strchr(str, '=');
2000        int i;
2001        int err = 1;
2002
2003        if (nodelist) {
2004                /* NUL-terminate mode or flags string */
2005                *nodelist++ = '\0';
2006                if (nodelist_parse(nodelist, nodes))
2007                        goto out;
2008                if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2009                        goto out;
2010        } else
2011                nodes_clear(nodes);
2012
2013        if (flags)
2014                *flags++ = '\0';        /* terminate mode string */
2015
2016        for (i = 0; i <= MPOL_LOCAL; i++) {
2017                if (!strcmp(str, policy_types[i])) {
2018                        mode = i;
2019                        break;
2020                }
2021        }
2022        if (i > MPOL_LOCAL)
2023                goto out;
2024
2025        switch (mode) {
2026        case MPOL_PREFERRED:
2027                /*
2028                 * Insist on a nodelist of one node only
2029                 */
2030                if (nodelist) {
2031                        char *rest = nodelist;
2032                        while (isdigit(*rest))
2033                                rest++;
2034                        if (!*rest)
2035                                err = 0;
2036                }
2037                break;
2038        case MPOL_INTERLEAVE:
2039                /*
2040                 * Default to online nodes with memory if no nodelist
2041                 */
2042                if (!nodelist)
2043                        nodes = node_states[N_HIGH_MEMORY];
2044                err = 0;
2045                break;
2046        case MPOL_LOCAL:
2047                /*
2048                 * Don't allow a nodelist;  mpol_new() checks flags
2049                 */
2050                if (nodelist)
2051                        goto out;
2052                mode = MPOL_PREFERRED;
2053                break;
2054
2055        /*
2056         * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
2057         * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
2058         */
2059        }
2060
2061        mode_flags = 0;
2062        if (flags) {
2063                /*
2064                 * Currently, we only support two mutually exclusive
2065                 * mode flags.
2066                 */
2067                if (!strcmp(flags, "static"))
2068                        mode_flags |= MPOL_F_STATIC_NODES;
2069                else if (!strcmp(flags, "relative"))
2070                        mode_flags |= MPOL_F_RELATIVE_NODES;
2071                else
2072                        err = 1;
2073        }
2074
2075        new = mpol_new(mode, mode_flags, &nodes);
2076        if (IS_ERR(new))
2077                err = 1;
2078        else if (no_context)
2079                new->w.user_nodemask = nodes;   /* save for contextualization */
2080
2081out:
2082        /* Restore string for error message */
2083        if (nodelist)
2084                *--nodelist = ':';
2085        if (flags)
2086                *--flags = '=';
2087        if (!err)
2088                *mpol = new;
2089        return err;
2090}
2091#endif /* CONFIG_TMPFS */
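
/*
 * Example strings accepted by mpol_parse_str() via the tmpfs "mpol="
 * mount option, following the <mode>[=<flags>][:<nodelist>] format
 * documented above (mount points and node numbers are illustrative):
 *
 *	mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 *	mount -t tmpfs -o mpol=bind=static:0,2 tmpfs /mnt
 *	mount -t tmpfs -o mpol=prefer:1 tmpfs /mnt
 *	mount -t tmpfs -o mpol=local tmpfs /mnt
 */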
2092
2093/**
2094 * mpol_to_str - format a mempolicy structure for printing
2095 * @buffer:  to contain formatted mempolicy string
2096 * @maxlen:  length of @buffer
2097 * @pol:  pointer to mempolicy to be formatted
2098 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2099 *
2100 * Convert a mempolicy into a string.
2101 * Returns the number of characters in buffer (if positive)
2102 * or an error (negative)
2103 */
2104int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2105{
2106        char *p = buffer;
2107        int l;
2108        nodemask_t nodes;
2109        unsigned short mode;
2110        unsigned short flags = pol ? pol->flags : 0;
2111
2112        /*
2113         * Sanity check:  room for longest mode, flag and some nodes
2114         */
2115        VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2116
2117        if (!pol || pol == &default_policy)
2118                mode = MPOL_DEFAULT;
2119        else
2120                mode = pol->mode;
2121
2122        switch (mode) {
2123        case MPOL_DEFAULT:
2124                nodes_clear(nodes);
2125                break;
2126
2127        case MPOL_PREFERRED:
2128                nodes_clear(nodes);
2129                if (flags & MPOL_F_LOCAL)
2130                        mode = MPOL_LOCAL;      /* pseudo-policy */
2131                else
2132                        node_set(pol->v.preferred_node, nodes);
2133                break;
2134
2135        case MPOL_BIND:
2136                /* Fall through */
2137        case MPOL_INTERLEAVE:
2138                if (no_context)
2139                        nodes = pol->w.user_nodemask;
2140                else
2141                        nodes = pol->v.nodes;
2142                break;
2143
2144        default:
2145                BUG();
2146        }
2147
2148        l = strlen(policy_types[mode]);
2149        if (buffer + maxlen < p + l + 1)
2150                return -ENOSPC;
2151
2152        strcpy(p, policy_types[mode]);
2153        p += l;
2154
2155        if (flags & MPOL_MODE_FLAGS) {
2156                if (buffer + maxlen < p + 2)
2157                        return -ENOSPC;
2158                *p++ = '=';
2159
2160                /*
2161                 * Currently, the only defined flags are mutually exclusive
2162                 */
2163                if (flags & MPOL_F_STATIC_NODES)
2164                        p += snprintf(p, buffer + maxlen - p, "static");
2165                else if (flags & MPOL_F_RELATIVE_NODES)
2166                        p += snprintf(p, buffer + maxlen - p, "relative");
2167        }
2168
2169        if (!nodes_empty(nodes)) {
2170                if (buffer + maxlen < p + 2)
2171                        return -ENOSPC;
2172                *p++ = ':';
2173                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2174        }
2175        return p - buffer;
2176}
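
/*
 * Example strings produced by mpol_to_str() (node numbers illustrative):
 *
 *	"default"
 *	"prefer:3"
 *	"local"
 *	"bind=static:0,2"
 *	"interleave=relative:0-7"
 *
 * The output uses the same <mode>[=<flags>][:<nodelist>] syntax that
 * mpol_parse_str() accepts.
 */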
2177
2178struct numa_maps {
2179        unsigned long pages;
2180        unsigned long anon;
2181        unsigned long active;
2182        unsigned long writeback;
2183        unsigned long mapcount_max;
2184        unsigned long dirty;
2185        unsigned long swapcache;
2186        unsigned long node[MAX_NUMNODES];
2187};
2188
2189static void gather_stats(struct page *page, void *private, int pte_dirty)
2190{
2191        struct numa_maps *md = private;
2192        int count = page_mapcount(page);
2193
2194        md->pages++;
2195        if (pte_dirty || PageDirty(page))
2196                md->dirty++;
2197
2198        if (PageSwapCache(page))
2199                md->swapcache++;
2200
2201        if (PageActive(page))
2202                md->active++;
2203
2204        if (PageWriteback(page))
2205                md->writeback++;
2206
2207        if (PageAnon(page))
2208                md->anon++;
2209
2210        if (count > md->mapcount_max)
2211                md->mapcount_max = count;
2212
2213        md->node[page_to_nid(page)]++;
2214}
2215
2216#ifdef CONFIG_HUGETLB_PAGE
2217static void check_huge_range(struct vm_area_struct *vma,
2218                unsigned long start, unsigned long end,
2219                struct numa_maps *md)
2220{
2221        unsigned long addr;
2222        struct page *page;
2223
2224        for (addr = start; addr < end; addr += HPAGE_SIZE) {
2225                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2226                pte_t pte;
2227
2228                if (!ptep)
2229                        continue;
2230
2231                pte = *ptep;
2232                if (pte_none(pte))
2233                        continue;
2234
2235                page = pte_page(pte);
2236                if (!page)
2237                        continue;
2238
2239                gather_stats(page, md, pte_dirty(*ptep));
2240        }
2241}
2242#else
2243static inline void check_huge_range(struct vm_area_struct *vma,
2244                unsigned long start, unsigned long end,
2245                struct numa_maps *md)
2246{
2247}
2248#endif
2249
2250/*
2251 * Display pages allocated per node and memory policy via /proc.
2252 */
2253int show_numa_map(struct seq_file *m, void *v)
2254{
2255        struct proc_maps_private *priv = m->private;
2256        struct vm_area_struct *vma = v;
2257        struct numa_maps *md;
2258        struct file *file = vma->vm_file;
2259        struct mm_struct *mm = vma->vm_mm;
2260        struct mempolicy *pol;
2261        int n;
2262        char buffer[50];
2263
2264        if (!mm)
2265                return 0;
2266
2267        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2268        if (!md)
2269                return 0;
2270
2271        pol = get_vma_policy(priv->task, vma, vma->vm_start);
2272        mpol_to_str(buffer, sizeof(buffer), pol, 0);
2273        mpol_cond_put(pol);
2274
2275        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2276
2277        if (file) {
2278                seq_printf(m, " file=");
2279                seq_path(m, &file->f_path, "\n\t= ");
2280        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2281                seq_printf(m, " heap");
2282        } else if (vma->vm_start <= mm->start_stack &&
2283                        vma->vm_end >= mm->start_stack) {
2284                seq_printf(m, " stack");
2285        }
2286
2287        if (is_vm_hugetlb_page(vma)) {
2288                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2289                seq_printf(m, " huge");
2290        } else {
2291                check_pgd_range(vma, vma->vm_start, vma->vm_end,
2292                        &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2293        }
2294
2295        if (!md->pages)
2296                goto out;
2297
2298        if (md->anon)
2299                seq_printf(m, " anon=%lu", md->anon);
2300
2301        if (md->dirty)
2302                seq_printf(m, " dirty=%lu", md->dirty);
2303
2304        if (md->pages != md->anon && md->pages != md->dirty)
2305                seq_printf(m, " mapped=%lu", md->pages);
2306
2307        if (md->mapcount_max > 1)
2308                seq_printf(m, " mapmax=%lu", md->mapcount_max);
2309
2310        if (md->swapcache)
2311                seq_printf(m, " swapcache=%lu", md->swapcache);
2312
2313        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2314                seq_printf(m, " active=%lu", md->active);
2315
2316        if (md->writeback)
2317                seq_printf(m, " writeback=%lu", md->writeback);
2318
2319        for_each_node_state(n, N_HIGH_MEMORY)
2320                if (md->node[n])
2321                        seq_printf(m, " N%d=%lu", n, md->node[n]);
2322out:
2323        seq_putc(m, '\n');
2324        kfree(md);
2325
2326        if (m->count < m->size)
2327                m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2328        return 0;
2329}
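
/*
 * Example of a line emitted above in /proc/PID/numa_maps (values are
 * illustrative; the optional per-VMA counters appear only when non-zero):
 *
 *	00400000 default file=/bin/cat mapped=9 mapmax=2 N0=5 N3=4
 */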
2330