linux/mm/mempolicy.c
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
   28 * preferred      Try a specific node first before normal fallback.
   29 *                As a special case node -1 here means do the allocation
   30 *                on the local CPU. This is normally identical to default,
   31 *                but useful to set in a VMA when you have a non-default
   32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
   38 * The process policy is applied for most non-interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
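/*
 * Illustrative userspace sketch (not part of this file): the policies above
 * are normally selected with the set_mempolicy(2) and mbind(2) syscalls.
 * For example, on a machine with nodes 0 and 1:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask) + 1))
 *		perror("set_mempolicy");
 *
 * would interleave this task's future allocations across both nodes.
 */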
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
   65   kernel is not always graceful about that.
  66*/
  67
  68#include <linux/mempolicy.h>
  69#include <linux/mm.h>
  70#include <linux/highmem.h>
  71#include <linux/hugetlb.h>
  72#include <linux/kernel.h>
  73#include <linux/sched.h>
  74#include <linux/nodemask.h>
  75#include <linux/cpuset.h>
  76#include <linux/gfp.h>
  77#include <linux/slab.h>
  78#include <linux/string.h>
  79#include <linux/module.h>
  80#include <linux/nsproxy.h>
  81#include <linux/interrupt.h>
  82#include <linux/init.h>
  83#include <linux/compat.h>
  84#include <linux/swap.h>
  85#include <linux/seq_file.h>
  86#include <linux/proc_fs.h>
  87#include <linux/migrate.h>
  88#include <linux/rmap.h>
  89#include <linux/security.h>
  90#include <linux/syscalls.h>
  91#include <linux/ctype.h>
  92
  93#include <asm/tlbflush.h>
  94#include <asm/uaccess.h>
  95
  96/* Internal flags */
  97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101static struct kmem_cache *policy_cache;
 102static struct kmem_cache *sn_cache;
 103
  104/* Highest zone. A specific allocation for a zone below that is not
 105   policied. */
 106enum zone_type policy_zone = 0;
 107
 108/*
 109 * run-time system-wide default policy => local allocation
 110 */
 111struct mempolicy default_policy = {
 112        .refcnt = ATOMIC_INIT(1), /* never free it */
 113        .mode = MPOL_PREFERRED,
 114        .flags = MPOL_F_LOCAL,
 115};
 116
 117static const struct mempolicy_operations {
 118        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 119        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 120} mpol_ops[MPOL_MAX];
 121
 122/* Check that the nodemask contains at least one populated zone */
 123static int is_valid_nodemask(const nodemask_t *nodemask)
 124{
 125        int nd, k;
 126
 127        /* Check that there is something useful in this mask */
 128        k = policy_zone;
 129
 130        for_each_node_mask(nd, *nodemask) {
 131                struct zone *z;
 132
 133                for (k = 0; k <= policy_zone; k++) {
 134                        z = &NODE_DATA(nd)->node_zones[k];
 135                        if (z->present_pages > 0)
 136                                return 1;
 137                }
 138        }
 139
 140        return 0;
 141}
 142
 143static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 144{
 145        return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 146}
 147
 148static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 149                                   const nodemask_t *rel)
 150{
 151        nodemask_t tmp;
 152        nodes_fold(tmp, *orig, nodes_weight(*rel));
 153        nodes_onto(*ret, tmp, *rel);
 154}
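/*
 * Worked example for mpol_relative_nodemask() above (illustrative values):
 * with a user mask of {0,2} and a relative map of {4,5}, nodes_fold() first
 * folds {0,2} modulo nodes_weight({4,5}) == 2, giving {0}; nodes_onto()
 * then maps bit 0 onto the 0th set bit of {4,5}, so the result is {4}.
 */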
 155
 156static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 157{
 158        if (nodes_empty(*nodes))
 159                return -EINVAL;
 160        pol->v.nodes = *nodes;
 161        return 0;
 162}
 163
 164static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 165{
 166        if (!nodes)
 167                pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 168        else if (nodes_empty(*nodes))
 169                return -EINVAL;                 /*  no allowed nodes */
 170        else
 171                pol->v.preferred_node = first_node(*nodes);
 172        return 0;
 173}
 174
 175static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 176{
 177        if (!is_valid_nodemask(nodes))
 178                return -EINVAL;
 179        pol->v.nodes = *nodes;
 180        return 0;
 181}
 182
 183/* Create a new policy */
 184static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 185                                  nodemask_t *nodes)
 186{
 187        struct mempolicy *policy;
 188        nodemask_t cpuset_context_nmask;
 189        int ret;
 190
 191        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 192                 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 193
 194        if (mode == MPOL_DEFAULT) {
 195                if (nodes && !nodes_empty(*nodes))
 196                        return ERR_PTR(-EINVAL);
 197                return NULL;    /* simply delete any existing policy */
 198        }
 199        VM_BUG_ON(!nodes);
 200
 201        /*
 202         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 203         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 204         * All other modes require a valid pointer to a non-empty nodemask.
 205         */
 206        if (mode == MPOL_PREFERRED) {
 207                if (nodes_empty(*nodes)) {
 208                        if (((flags & MPOL_F_STATIC_NODES) ||
 209                             (flags & MPOL_F_RELATIVE_NODES)))
 210                                return ERR_PTR(-EINVAL);
 211                        nodes = NULL;   /* flag local alloc */
 212                }
 213        } else if (nodes_empty(*nodes))
 214                return ERR_PTR(-EINVAL);
 215        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 216        if (!policy)
 217                return ERR_PTR(-ENOMEM);
 218        atomic_set(&policy->refcnt, 1);
 219        policy->mode = mode;
 220        policy->flags = flags;
 221
 222        if (nodes) {
 223                /*
 224                 * cpuset related setup doesn't apply to local allocation
 225                 */
 226                cpuset_update_task_memory_state();
 227                if (flags & MPOL_F_RELATIVE_NODES)
 228                        mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 229                                               &cpuset_current_mems_allowed);
 230                else
 231                        nodes_and(cpuset_context_nmask, *nodes,
 232                                  cpuset_current_mems_allowed);
 233                if (mpol_store_user_nodemask(policy))
 234                        policy->w.user_nodemask = *nodes;
 235                else
 236                        policy->w.cpuset_mems_allowed =
 237                                                cpuset_mems_allowed(current);
 238        }
 239
 240        ret = mpol_ops[mode].create(policy,
 241                                nodes ? &cpuset_context_nmask : NULL);
 242        if (ret < 0) {
 243                kmem_cache_free(policy_cache, policy);
 244                return ERR_PTR(ret);
 245        }
 246        return policy;
 247}
 248
  249/* Slow path of an mpol destructor. */
 250void __mpol_put(struct mempolicy *p)
 251{
 252        if (!atomic_dec_and_test(&p->refcnt))
 253                return;
 254        kmem_cache_free(policy_cache, p);
 255}
 256
 257static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 258{
 259}
 260
 261static void mpol_rebind_nodemask(struct mempolicy *pol,
 262                                 const nodemask_t *nodes)
 263{
 264        nodemask_t tmp;
 265
 266        if (pol->flags & MPOL_F_STATIC_NODES)
 267                nodes_and(tmp, pol->w.user_nodemask, *nodes);
 268        else if (pol->flags & MPOL_F_RELATIVE_NODES)
 269                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 270        else {
 271                nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 272                            *nodes);
 273                pol->w.cpuset_mems_allowed = *nodes;
 274        }
 275
 276        pol->v.nodes = tmp;
 277        if (!node_isset(current->il_next, tmp)) {
 278                current->il_next = next_node(current->il_next, tmp);
 279                if (current->il_next >= MAX_NUMNODES)
 280                        current->il_next = first_node(tmp);
 281                if (current->il_next >= MAX_NUMNODES)
 282                        current->il_next = numa_node_id();
 283        }
 284}
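/*
 * Worked example for mpol_rebind_nodemask() above (illustrative values):
 * suppose the user passed {0,1} and the cpuset's mems_allowed changes to
 * {2,3}.  With MPOL_F_STATIC_NODES the new mask is {0,1} & {2,3}, i.e.
 * empty (the requested nodes are simply gone); with MPOL_F_RELATIVE_NODES
 * the mask is remapped onto the new cpuset, yielding {2,3}; the default
 * case likewise remaps the old effective nodes ordinally onto {2,3}.
 */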
 285
 286static void mpol_rebind_preferred(struct mempolicy *pol,
 287                                  const nodemask_t *nodes)
 288{
 289        nodemask_t tmp;
 290
 291        if (pol->flags & MPOL_F_STATIC_NODES) {
 292                int node = first_node(pol->w.user_nodemask);
 293
 294                if (node_isset(node, *nodes)) {
 295                        pol->v.preferred_node = node;
 296                        pol->flags &= ~MPOL_F_LOCAL;
 297                } else
 298                        pol->flags |= MPOL_F_LOCAL;
 299        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 300                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 301                pol->v.preferred_node = first_node(tmp);
 302        } else if (!(pol->flags & MPOL_F_LOCAL)) {
 303                pol->v.preferred_node = node_remap(pol->v.preferred_node,
 304                                                   pol->w.cpuset_mems_allowed,
 305                                                   *nodes);
 306                pol->w.cpuset_mems_allowed = *nodes;
 307        }
 308}
 309
 310/* Migrate a policy to a different set of nodes */
 311static void mpol_rebind_policy(struct mempolicy *pol,
 312                               const nodemask_t *newmask)
 313{
 314        if (!pol)
 315                return;
 316        if (!mpol_store_user_nodemask(pol) &&
 317            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 318                return;
 319        mpol_ops[pol->mode].rebind(pol, newmask);
 320}
 321
 322/*
 323 * Wrapper for mpol_rebind_policy() that just requires task
 324 * pointer, and updates task mempolicy.
 325 */
 326
 327void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 328{
 329        mpol_rebind_policy(tsk->mempolicy, new);
 330}
 331
 332/*
 333 * Rebind each vma in mm to new nodemask.
 334 *
 335 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 336 */
 337
 338void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 339{
 340        struct vm_area_struct *vma;
 341
 342        down_write(&mm->mmap_sem);
 343        for (vma = mm->mmap; vma; vma = vma->vm_next)
 344                mpol_rebind_policy(vma->vm_policy, new);
 345        up_write(&mm->mmap_sem);
 346}
 347
 348static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 349        [MPOL_DEFAULT] = {
 350                .rebind = mpol_rebind_default,
 351        },
 352        [MPOL_INTERLEAVE] = {
 353                .create = mpol_new_interleave,
 354                .rebind = mpol_rebind_nodemask,
 355        },
 356        [MPOL_PREFERRED] = {
 357                .create = mpol_new_preferred,
 358                .rebind = mpol_rebind_preferred,
 359        },
 360        [MPOL_BIND] = {
 361                .create = mpol_new_bind,
 362                .rebind = mpol_rebind_nodemask,
 363        },
 364};
 365
 366static void gather_stats(struct page *, void *, int pte_dirty);
 367static void migrate_page_add(struct page *page, struct list_head *pagelist,
 368                                unsigned long flags);
 369
 370/* Scan through pages checking if pages follow certain conditions. */
 371static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 372                unsigned long addr, unsigned long end,
 373                const nodemask_t *nodes, unsigned long flags,
 374                void *private)
 375{
 376        pte_t *orig_pte;
 377        pte_t *pte;
 378        spinlock_t *ptl;
 379
 380        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 381        do {
 382                struct page *page;
 383                int nid;
 384
 385                if (!pte_present(*pte))
 386                        continue;
 387                page = vm_normal_page(vma, addr, *pte);
 388                if (!page)
 389                        continue;
 390                /*
 391                 * The check for PageReserved here is important to avoid
 392                 * handling zero pages and other pages that may have been
 393                 * marked special by the system.
 394                 *
  395                 * If PageReserved were not checked here then e.g.
 396                 * the location of the zero page could have an influence
 397                 * on MPOL_MF_STRICT, zero pages would be counted for
 398                 * the per node stats, and there would be useless attempts
 399                 * to put zero pages on the migration list.
 400                 */
 401                if (PageReserved(page))
 402                        continue;
 403                nid = page_to_nid(page);
 404                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 405                        continue;
 406
 407                if (flags & MPOL_MF_STATS)
 408                        gather_stats(page, private, pte_dirty(*pte));
 409                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 410                        migrate_page_add(page, private, flags);
 411                else
 412                        break;
 413        } while (pte++, addr += PAGE_SIZE, addr != end);
 414        pte_unmap_unlock(orig_pte, ptl);
 415        return addr != end;
 416}
 417
 418static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 419                unsigned long addr, unsigned long end,
 420                const nodemask_t *nodes, unsigned long flags,
 421                void *private)
 422{
 423        pmd_t *pmd;
 424        unsigned long next;
 425
 426        pmd = pmd_offset(pud, addr);
 427        do {
 428                next = pmd_addr_end(addr, end);
 429                if (pmd_none_or_clear_bad(pmd))
 430                        continue;
 431                if (check_pte_range(vma, pmd, addr, next, nodes,
 432                                    flags, private))
 433                        return -EIO;
 434        } while (pmd++, addr = next, addr != end);
 435        return 0;
 436}
 437
 438static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 439                unsigned long addr, unsigned long end,
 440                const nodemask_t *nodes, unsigned long flags,
 441                void *private)
 442{
 443        pud_t *pud;
 444        unsigned long next;
 445
 446        pud = pud_offset(pgd, addr);
 447        do {
 448                next = pud_addr_end(addr, end);
 449                if (pud_none_or_clear_bad(pud))
 450                        continue;
 451                if (check_pmd_range(vma, pud, addr, next, nodes,
 452                                    flags, private))
 453                        return -EIO;
 454        } while (pud++, addr = next, addr != end);
 455        return 0;
 456}
 457
 458static inline int check_pgd_range(struct vm_area_struct *vma,
 459                unsigned long addr, unsigned long end,
 460                const nodemask_t *nodes, unsigned long flags,
 461                void *private)
 462{
 463        pgd_t *pgd;
 464        unsigned long next;
 465
 466        pgd = pgd_offset(vma->vm_mm, addr);
 467        do {
 468                next = pgd_addr_end(addr, end);
 469                if (pgd_none_or_clear_bad(pgd))
 470                        continue;
 471                if (check_pud_range(vma, pgd, addr, next, nodes,
 472                                    flags, private))
 473                        return -EIO;
 474        } while (pgd++, addr = next, addr != end);
 475        return 0;
 476}
 477
 478/*
 479 * Check if all pages in a range are on a set of nodes.
 480 * If pagelist != NULL then isolate pages from the LRU and
 481 * put them on the pagelist.
 482 */
 483static struct vm_area_struct *
 484check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 485                const nodemask_t *nodes, unsigned long flags, void *private)
 486{
 487        int err;
 488        struct vm_area_struct *first, *vma, *prev;
 489
 490        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 491
 492                err = migrate_prep();
 493                if (err)
 494                        return ERR_PTR(err);
 495        }
 496
 497        first = find_vma(mm, start);
 498        if (!first)
 499                return ERR_PTR(-EFAULT);
 500        prev = NULL;
 501        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 502                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 503                        if (!vma->vm_next && vma->vm_end < end)
 504                                return ERR_PTR(-EFAULT);
 505                        if (prev && prev->vm_end < vma->vm_start)
 506                                return ERR_PTR(-EFAULT);
 507                }
 508                if (!is_vm_hugetlb_page(vma) &&
 509                    ((flags & MPOL_MF_STRICT) ||
 510                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 511                                vma_migratable(vma)))) {
 512                        unsigned long endvma = vma->vm_end;
 513
 514                        if (endvma > end)
 515                                endvma = end;
 516                        if (vma->vm_start > start)
 517                                start = vma->vm_start;
 518                        err = check_pgd_range(vma, start, endvma, nodes,
 519                                                flags, private);
 520                        if (err) {
 521                                first = ERR_PTR(err);
 522                                break;
 523                        }
 524                }
 525                prev = vma;
 526        }
 527        return first;
 528}
 529
 530/* Apply policy to a single VMA */
 531static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 532{
 533        int err = 0;
 534        struct mempolicy *old = vma->vm_policy;
 535
 536        pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 537                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 538                 vma->vm_ops, vma->vm_file,
 539                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 540
 541        if (vma->vm_ops && vma->vm_ops->set_policy)
 542                err = vma->vm_ops->set_policy(vma, new);
 543        if (!err) {
 544                mpol_get(new);
 545                vma->vm_policy = new;
 546                mpol_put(old);
 547        }
 548        return err;
 549}
 550
 551/* Step 2: apply policy to a range and do splits. */
 552static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 553                       unsigned long end, struct mempolicy *new)
 554{
 555        struct vm_area_struct *next;
 556        int err;
 557
 558        err = 0;
 559        for (; vma && vma->vm_start < end; vma = next) {
 560                next = vma->vm_next;
 561                if (vma->vm_start < start)
 562                        err = split_vma(vma->vm_mm, vma, start, 1);
 563                if (!err && vma->vm_end > end)
 564                        err = split_vma(vma->vm_mm, vma, end, 0);
 565                if (!err)
 566                        err = policy_vma(vma, new);
 567                if (err)
 568                        break;
 569        }
 570        return err;
 571}
 572
 573/*
 574 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 575 * mempolicy.  Allows more rapid checking of this (combined perhaps
 576 * with other PF_* flag bits) on memory allocation hot code paths.
 577 *
 578 * If called from outside this file, the task 'p' should -only- be
 579 * a newly forked child not yet visible on the task list, because
 580 * manipulating the task flags of a visible task is not safe.
 581 *
 582 * The above limitation is why this routine has the funny name
 583 * mpol_fix_fork_child_flag().
 584 *
 585 * It is also safe to call this with a task pointer of current,
 586 * which the static wrapper mpol_set_task_struct_flag() does,
 587 * for use within this file.
 588 */
 589
 590void mpol_fix_fork_child_flag(struct task_struct *p)
 591{
 592        if (p->mempolicy)
 593                p->flags |= PF_MEMPOLICY;
 594        else
 595                p->flags &= ~PF_MEMPOLICY;
 596}
 597
 598static void mpol_set_task_struct_flag(void)
 599{
 600        mpol_fix_fork_child_flag(current);
 601}
 602
 603/* Set the process memory policy */
 604static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 605                             nodemask_t *nodes)
 606{
 607        struct mempolicy *new;
 608        struct mm_struct *mm = current->mm;
 609
 610        new = mpol_new(mode, flags, nodes);
 611        if (IS_ERR(new))
 612                return PTR_ERR(new);
 613
 614        /*
 615         * prevent changing our mempolicy while show_numa_maps()
 616         * is using it.
 617         * Note:  do_set_mempolicy() can be called at init time
 618         * with no 'mm'.
 619         */
 620        if (mm)
 621                down_write(&mm->mmap_sem);
 622        mpol_put(current->mempolicy);
 623        current->mempolicy = new;
 624        mpol_set_task_struct_flag();
 625        if (new && new->mode == MPOL_INTERLEAVE &&
 626            nodes_weight(new->v.nodes))
 627                current->il_next = first_node(new->v.nodes);
 628        if (mm)
 629                up_write(&mm->mmap_sem);
 630
 631        return 0;
 632}
 633
 634/*
 635 * Return nodemask for policy for get_mempolicy() query
 636 */
 637static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 638{
 639        nodes_clear(*nodes);
 640        if (p == &default_policy)
 641                return;
 642
 643        switch (p->mode) {
 644        case MPOL_BIND:
 645                /* Fall through */
 646        case MPOL_INTERLEAVE:
 647                *nodes = p->v.nodes;
 648                break;
 649        case MPOL_PREFERRED:
 650                if (!(p->flags & MPOL_F_LOCAL))
 651                        node_set(p->v.preferred_node, *nodes);
 652                /* else return empty node mask for local allocation */
 653                break;
 654        default:
 655                BUG();
 656        }
 657}
 658
 659static int lookup_node(struct mm_struct *mm, unsigned long addr)
 660{
 661        struct page *p;
 662        int err;
 663
 664        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 665        if (err >= 0) {
 666                err = page_to_nid(p);
 667                put_page(p);
 668        }
 669        return err;
 670}
 671
 672/* Retrieve NUMA policy */
 673static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 674                             unsigned long addr, unsigned long flags)
 675{
 676        int err;
 677        struct mm_struct *mm = current->mm;
 678        struct vm_area_struct *vma = NULL;
 679        struct mempolicy *pol = current->mempolicy;
 680
 681        cpuset_update_task_memory_state();
 682        if (flags &
 683                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 684                return -EINVAL;
 685
 686        if (flags & MPOL_F_MEMS_ALLOWED) {
 687                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 688                        return -EINVAL;
 689                *policy = 0;    /* just so it's initialized */
 690                *nmask  = cpuset_current_mems_allowed;
 691                return 0;
 692        }
 693
 694        if (flags & MPOL_F_ADDR) {
 695                /*
 696                 * Do NOT fall back to task policy if the
 697                 * vma/shared policy at addr is NULL.  We
 698                 * want to return MPOL_DEFAULT in this case.
 699                 */
 700                down_read(&mm->mmap_sem);
 701                vma = find_vma_intersection(mm, addr, addr+1);
 702                if (!vma) {
 703                        up_read(&mm->mmap_sem);
 704                        return -EFAULT;
 705                }
 706                if (vma->vm_ops && vma->vm_ops->get_policy)
 707                        pol = vma->vm_ops->get_policy(vma, addr);
 708                else
 709                        pol = vma->vm_policy;
 710        } else if (addr)
 711                return -EINVAL;
 712
 713        if (!pol)
 714                pol = &default_policy;  /* indicates default behavior */
 715
 716        if (flags & MPOL_F_NODE) {
 717                if (flags & MPOL_F_ADDR) {
 718                        err = lookup_node(mm, addr);
 719                        if (err < 0)
 720                                goto out;
 721                        *policy = err;
 722                } else if (pol == current->mempolicy &&
 723                                pol->mode == MPOL_INTERLEAVE) {
 724                        *policy = current->il_next;
 725                } else {
 726                        err = -EINVAL;
 727                        goto out;
 728                }
 729        } else {
 730                *policy = pol == &default_policy ? MPOL_DEFAULT :
 731                                                pol->mode;
 732                /*
 733                 * Internal mempolicy flags must be masked off before exposing
 734                 * the policy to userspace.
 735                 */
 736                *policy |= (pol->flags & MPOL_MODE_FLAGS);
 737        }
 738
 739        if (vma) {
 740                up_read(&current->mm->mmap_sem);
 741                vma = NULL;
 742        }
 743
 744        err = 0;
 745        if (nmask)
 746                get_policy_nodemask(pol, nmask);
 747
 748 out:
 749        mpol_cond_put(pol);
 750        if (vma)
 751                up_read(&current->mm->mmap_sem);
 752        return err;
 753}
 754
 755#ifdef CONFIG_MIGRATION
 756/*
 757 * page migration
 758 */
 759static void migrate_page_add(struct page *page, struct list_head *pagelist,
 760                                unsigned long flags)
 761{
 762        /*
 763         * Avoid migrating a page that is shared with others.
 764         */
 765        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 766                isolate_lru_page(page, pagelist);
 767}
 768
 769static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 770{
 771        return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 772}
 773
 774/*
 775 * Migrate pages from one node to a target node.
 776 * Returns error or the number of pages not migrated.
 777 */
 778static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 779                           int flags)
 780{
 781        nodemask_t nmask;
 782        LIST_HEAD(pagelist);
 783        int err = 0;
 784
 785        nodes_clear(nmask);
 786        node_set(source, nmask);
 787
 788        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 789                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 790
 791        if (!list_empty(&pagelist))
 792                err = migrate_pages(&pagelist, new_node_page, dest);
 793
 794        return err;
 795}
 796
 797/*
 798 * Move pages between the two nodesets so as to preserve the physical
 799 * layout as much as possible.
 800 *
  801 * Returns the number of pages that could not be moved.
 802 */
 803int do_migrate_pages(struct mm_struct *mm,
 804        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 805{
 806        int busy = 0;
 807        int err = 0;
 808        nodemask_t tmp;
 809
 810        down_read(&mm->mmap_sem);
 811
 812        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 813        if (err)
 814                goto out;
 815
 816/*
 817 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 818 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 819 * bit in 'tmp', and return that <source, dest> pair for migration.
 820 * The pair of nodemasks 'to' and 'from' define the map.
 821 *
 822 * If no pair of bits is found that way, fallback to picking some
 823 * pair of 'source' and 'dest' bits that are not the same.  If the
 824 * 'source' and 'dest' bits are the same, this represents a node
 825 * that will be migrating to itself, so no pages need move.
 826 *
 827 * If no bits are left in 'tmp', or if all remaining bits left
 828 * in 'tmp' correspond to the same bit in 'to', return false
 829 * (nothing left to migrate).
 830 *
 831 * This lets us pick a pair of nodes to migrate between, such that
 832 * if possible the dest node is not already occupied by some other
 833 * source node, minimizing the risk of overloading the memory on a
 834 * node that would happen if we migrated incoming memory to a node
  835 * before migrating outgoing memory sourced from that same node.
 836 *
 837 * A single scan of tmp is sufficient.  As we go, we remember the
 838 * most recent <s, d> pair that moved (s != d).  If we find a pair
 839 * that not only moved, but what's better, moved to an empty slot
 840 * (d is not set in tmp), then we break out then, with that pair.
  841 * Otherwise when we finish scanning tmp, we at least have the
 842 * most recent <s, d> pair that moved.  If we get all the way through
 843 * the scan of tmp without finding any node that moved, much less
 844 * moved to an empty node, then there is nothing left worth migrating.
 845 */
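/*
 * Illustrative run of the scan below (invented values): with from_nodes ==
 * {0,1} and to_nodes == {1,2}, the first pass finds <0,1> but node 1 is
 * still a source, so it keeps scanning and settles on <1,2>; node 1 is
 * drained into node 2 first, and only then is node 0 migrated to node 1.
 */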
 846
 847        tmp = *from_nodes;
 848        while (!nodes_empty(tmp)) {
  849                int s, d;
 850                int source = -1;
 851                int dest = 0;
 852
 853                for_each_node_mask(s, tmp) {
 854                        d = node_remap(s, *from_nodes, *to_nodes);
 855                        if (s == d)
 856                                continue;
 857
 858                        source = s;     /* Node moved. Memorize */
 859                        dest = d;
 860
 861                        /* dest not in remaining from nodes? */
 862                        if (!node_isset(dest, tmp))
 863                                break;
 864                }
 865                if (source == -1)
 866                        break;
 867
 868                node_clear(source, tmp);
 869                err = migrate_to_node(mm, source, dest, flags);
 870                if (err > 0)
 871                        busy += err;
 872                if (err < 0)
 873                        break;
 874        }
 875out:
 876        up_read(&mm->mmap_sem);
 877        if (err < 0)
 878                return err;
 879        return busy;
 880
 881}
 882
 883/*
 884 * Allocate a new page for page migration based on vma policy.
 885 * Start assuming that page is mapped by vma pointed to by @private.
 886 * Search forward from there, if not.  N.B., this assumes that the
 887 * list of pages handed to migrate_pages()--which is how we get here--
 888 * is in virtual address order.
 889 */
 890static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 891{
 892        struct vm_area_struct *vma = (struct vm_area_struct *)private;
 893        unsigned long uninitialized_var(address);
 894
 895        while (vma) {
 896                address = page_address_in_vma(page, vma);
 897                if (address != -EFAULT)
 898                        break;
 899                vma = vma->vm_next;
 900        }
 901
 902        /*
 903         * if !vma, alloc_page_vma() will use task or system default policy
 904         */
 905        return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 906}
 907#else
 908
 909static void migrate_page_add(struct page *page, struct list_head *pagelist,
 910                                unsigned long flags)
 911{
 912}
 913
 914int do_migrate_pages(struct mm_struct *mm,
 915        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 916{
 917        return -ENOSYS;
 918}
 919
 920static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 921{
 922        return NULL;
 923}
 924#endif
 925
 926static long do_mbind(unsigned long start, unsigned long len,
 927                     unsigned short mode, unsigned short mode_flags,
 928                     nodemask_t *nmask, unsigned long flags)
 929{
 930        struct vm_area_struct *vma;
 931        struct mm_struct *mm = current->mm;
 932        struct mempolicy *new;
 933        unsigned long end;
 934        int err;
 935        LIST_HEAD(pagelist);
 936
 937        if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 938                                     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 939                return -EINVAL;
 940        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 941                return -EPERM;
 942
 943        if (start & ~PAGE_MASK)
 944                return -EINVAL;
 945
 946        if (mode == MPOL_DEFAULT)
 947                flags &= ~MPOL_MF_STRICT;
 948
 949        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 950        end = start + len;
 951
 952        if (end < start)
 953                return -EINVAL;
 954        if (end == start)
 955                return 0;
 956
 957        new = mpol_new(mode, mode_flags, nmask);
 958        if (IS_ERR(new))
 959                return PTR_ERR(new);
 960
 961        /*
 962         * If we are using the default policy then operation
 963         * on discontinuous address spaces is okay after all
 964         */
 965        if (!new)
 966                flags |= MPOL_MF_DISCONTIG_OK;
 967
 968        pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 969                 start, start + len, mode, mode_flags,
 970                 nmask ? nodes_addr(*nmask)[0] : -1);
 971
 972        down_write(&mm->mmap_sem);
 973        vma = check_range(mm, start, end, nmask,
 974                          flags | MPOL_MF_INVERT, &pagelist);
 975
 976        err = PTR_ERR(vma);
 977        if (!IS_ERR(vma)) {
 978                int nr_failed = 0;
 979
 980                err = mbind_range(vma, start, end, new);
 981
 982                if (!list_empty(&pagelist))
 983                        nr_failed = migrate_pages(&pagelist, new_vma_page,
 984                                                (unsigned long)vma);
 985
 986                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 987                        err = -EIO;
 988        } else
 989                putback_lru_pages(&pagelist);
 990
 991        up_write(&mm->mmap_sem);
 992        mpol_put(new);
 993        return err;
 994}
 995
 996/*
 997 * User space interface with variable sized bitmaps for nodelists.
 998 */
 999
1000/* Copy a node mask from user space. */
1001static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1002                     unsigned long maxnode)
1003{
1004        unsigned long k;
1005        unsigned long nlongs;
1006        unsigned long endmask;
1007
1008        --maxnode;
1009        nodes_clear(*nodes);
1010        if (maxnode == 0 || !nmask)
1011                return 0;
1012        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1013                return -EINVAL;
1014
1015        nlongs = BITS_TO_LONGS(maxnode);
1016        if ((maxnode % BITS_PER_LONG) == 0)
1017                endmask = ~0UL;
1018        else
1019                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1020
 1021        /* When the user specifies more nodes than supported just check
 1022           that the unsupported part is all zero. */
1023        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1024                if (nlongs > PAGE_SIZE/sizeof(long))
1025                        return -EINVAL;
1026                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1027                        unsigned long t;
1028                        if (get_user(t, nmask + k))
1029                                return -EFAULT;
1030                        if (k == nlongs - 1) {
1031                                if (t & endmask)
1032                                        return -EINVAL;
1033                        } else if (t)
1034                                return -EINVAL;
1035                }
1036                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1037                endmask = ~0UL;
1038        }
1039
1040        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1041                return -EFAULT;
1042        nodes_addr(*nodes)[nlongs-1] &= endmask;
1043        return 0;
1044}
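/*
 * Example (illustrative): a caller passing maxnode == 17 describes nodes
 * 0..15, so after the decrement maxnode == 16, nlongs == 1 and endmask ==
 * (1UL << 16) - 1; any higher bits in the word copied from userspace are
 * stripped by the final "&= endmask".
 */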
1045
1046/* Copy a kernel node mask to user space */
1047static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1048                              nodemask_t *nodes)
1049{
1050        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1051        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1052
1053        if (copy > nbytes) {
1054                if (copy > PAGE_SIZE)
1055                        return -EINVAL;
1056                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1057                        return -EFAULT;
1058                copy = nbytes;
1059        }
1060        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1061}
1062
1063SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1064                unsigned long, mode, unsigned long __user *, nmask,
1065                unsigned long, maxnode, unsigned, flags)
1066{
1067        nodemask_t nodes;
1068        int err;
1069        unsigned short mode_flags;
1070
1071        mode_flags = mode & MPOL_MODE_FLAGS;
1072        mode &= ~MPOL_MODE_FLAGS;
1073        if (mode >= MPOL_MAX)
1074                return -EINVAL;
1075        if ((mode_flags & MPOL_F_STATIC_NODES) &&
1076            (mode_flags & MPOL_F_RELATIVE_NODES))
1077                return -EINVAL;
1078        err = get_nodes(&nodes, nmask, maxnode);
1079        if (err)
1080                return err;
1081        return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1082}
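/*
 * Userspace sketch (illustrative only): optional mode flags are OR'ed into
 * the mode argument, e.g.
 *
 *	mbind(addr, len, MPOL_BIND | MPOL_F_STATIC_NODES,
 *	      &mask, maxnode, MPOL_MF_MOVE);
 *
 * which the wrapper above splits back into mode == MPOL_BIND and
 * mode_flags == MPOL_F_STATIC_NODES before calling do_mbind().
 */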
1083
1084/* Set the process memory policy */
1085SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1086                unsigned long, maxnode)
1087{
1088        int err;
1089        nodemask_t nodes;
1090        unsigned short flags;
1091
1092        flags = mode & MPOL_MODE_FLAGS;
1093        mode &= ~MPOL_MODE_FLAGS;
1094        if ((unsigned int)mode >= MPOL_MAX)
1095                return -EINVAL;
1096        if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1097                return -EINVAL;
1098        err = get_nodes(&nodes, nmask, maxnode);
1099        if (err)
1100                return err;
1101        return do_set_mempolicy(mode, flags, &nodes);
1102}
1103
1104SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1105                const unsigned long __user *, old_nodes,
1106                const unsigned long __user *, new_nodes)
1107{
1108        struct mm_struct *mm;
1109        struct task_struct *task;
1110        nodemask_t old;
1111        nodemask_t new;
1112        nodemask_t task_nodes;
1113        int err;
1114
1115        err = get_nodes(&old, old_nodes, maxnode);
1116        if (err)
1117                return err;
1118
1119        err = get_nodes(&new, new_nodes, maxnode);
1120        if (err)
1121                return err;
1122
1123        /* Find the mm_struct */
1124        read_lock(&tasklist_lock);
1125        task = pid ? find_task_by_vpid(pid) : current;
1126        if (!task) {
1127                read_unlock(&tasklist_lock);
1128                return -ESRCH;
1129        }
1130        mm = get_task_mm(task);
1131        read_unlock(&tasklist_lock);
1132
1133        if (!mm)
1134                return -EINVAL;
1135
1136        /*
1137         * Check if this process has the right to modify the specified
1138         * process. The right exists if the process has administrative
1139         * capabilities, superuser privileges or the same
1140         * userid as the target process.
1141         */
1142        if ((current->euid != task->suid) && (current->euid != task->uid) &&
1143            (current->uid != task->suid) && (current->uid != task->uid) &&
1144            !capable(CAP_SYS_NICE)) {
1145                err = -EPERM;
1146                goto out;
1147        }
1148
1149        task_nodes = cpuset_mems_allowed(task);
1150        /* Is the user allowed to access the target nodes? */
1151        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1152                err = -EPERM;
1153                goto out;
1154        }
1155
1156        if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1157                err = -EINVAL;
1158                goto out;
1159        }
1160
1161        err = security_task_movememory(task);
1162        if (err)
1163                goto out;
1164
1165        err = do_migrate_pages(mm, &old, &new,
1166                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1167out:
1168        mmput(mm);
1169        return err;
1170}
1171
1172
1173/* Retrieve NUMA policy */
1174SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1175                unsigned long __user *, nmask, unsigned long, maxnode,
1176                unsigned long, addr, unsigned long, flags)
1177{
1178        int err;
1179        int uninitialized_var(pval);
1180        nodemask_t nodes;
1181
1182        if (nmask != NULL && maxnode < MAX_NUMNODES)
1183                return -EINVAL;
1184
1185        err = do_get_mempolicy(&pval, &nodes, addr, flags);
1186
1187        if (err)
1188                return err;
1189
1190        if (policy && put_user(pval, policy))
1191                return -EFAULT;
1192
1193        if (nmask)
1194                err = copy_nodes_to_user(nmask, maxnode, &nodes);
1195
1196        return err;
1197}
1198
1199#ifdef CONFIG_COMPAT
1200
1201asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1202                                     compat_ulong_t __user *nmask,
1203                                     compat_ulong_t maxnode,
1204                                     compat_ulong_t addr, compat_ulong_t flags)
1205{
1206        long err;
1207        unsigned long __user *nm = NULL;
1208        unsigned long nr_bits, alloc_size;
1209        DECLARE_BITMAP(bm, MAX_NUMNODES);
1210
1211        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1212        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1213
1214        if (nmask)
1215                nm = compat_alloc_user_space(alloc_size);
1216
1217        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1218
1219        if (!err && nmask) {
1220                err = copy_from_user(bm, nm, alloc_size);
1221                /* ensure entire bitmap is zeroed */
1222                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1223                err |= compat_put_bitmap(nmask, bm, nr_bits);
1224        }
1225
1226        return err;
1227}
1228
1229asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1230                                     compat_ulong_t maxnode)
1231{
1232        long err = 0;
1233        unsigned long __user *nm = NULL;
1234        unsigned long nr_bits, alloc_size;
1235        DECLARE_BITMAP(bm, MAX_NUMNODES);
1236
1237        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1238        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1239
1240        if (nmask) {
1241                err = compat_get_bitmap(bm, nmask, nr_bits);
1242                nm = compat_alloc_user_space(alloc_size);
1243                err |= copy_to_user(nm, bm, alloc_size);
1244        }
1245
1246        if (err)
1247                return -EFAULT;
1248
1249        return sys_set_mempolicy(mode, nm, nr_bits+1);
1250}
1251
1252asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1253                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1254                             compat_ulong_t maxnode, compat_ulong_t flags)
1255{
1256        long err = 0;
1257        unsigned long __user *nm = NULL;
1258        unsigned long nr_bits, alloc_size;
1259        nodemask_t bm;
1260
1261        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1262        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1263
1264        if (nmask) {
1265                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1266                nm = compat_alloc_user_space(alloc_size);
1267                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1268        }
1269
1270        if (err)
1271                return -EFAULT;
1272
1273        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1274}
1275
1276#endif
1277
1278/*
1279 * get_vma_policy(@task, @vma, @addr)
1280 * @task - task for fallback if vma policy == default
1281 * @vma   - virtual memory area whose policy is sought
1282 * @addr  - address in @vma for shared policy lookup
1283 *
1284 * Returns effective policy for a VMA at specified address.
1285 * Falls back to @task or system default policy, as necessary.
1286 * Current or other task's task mempolicy and non-shared vma policies
1287 * are protected by the task's mmap_sem, which must be held for read by
1288 * the caller.
1289 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1290 * count--added by the get_policy() vm_op, as appropriate--to protect against
1291 * freeing by another task.  It is the caller's responsibility to free the
1292 * extra reference for shared policies.
1293 */
1294static struct mempolicy *get_vma_policy(struct task_struct *task,
1295                struct vm_area_struct *vma, unsigned long addr)
1296{
1297        struct mempolicy *pol = task->mempolicy;
1298
1299        if (vma) {
1300                if (vma->vm_ops && vma->vm_ops->get_policy) {
1301                        struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1302                                                                        addr);
1303                        if (vpol)
1304                                pol = vpol;
1305                } else if (vma->vm_policy)
1306                        pol = vma->vm_policy;
1307        }
1308        if (!pol)
1309                pol = &default_policy;
1310        return pol;
1311}
1312
1313/*
1314 * Return a nodemask representing a mempolicy for filtering nodes for
1315 * page allocation
1316 */
1317static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1318{
1319        /* Lower zones don't get a nodemask applied for MPOL_BIND */
1320        if (unlikely(policy->mode == MPOL_BIND) &&
1321                        gfp_zone(gfp) >= policy_zone &&
1322                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1323                return &policy->v.nodes;
1324
1325        return NULL;
1326}
1327
1328/* Return a zonelist indicated by gfp for node representing a mempolicy */
1329static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1330{
1331        int nd = numa_node_id();
1332
1333        switch (policy->mode) {
1334        case MPOL_PREFERRED:
1335                if (!(policy->flags & MPOL_F_LOCAL))
1336                        nd = policy->v.preferred_node;
1337                break;
1338        case MPOL_BIND:
1339                /*
1340                 * Normally, MPOL_BIND allocations are node-local within the
1341                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
 1342                 * current node isn't part of the mask, we use the zonelist for
1343                 * the first node in the mask instead.
1344                 */
1345                if (unlikely(gfp & __GFP_THISNODE) &&
1346                                unlikely(!node_isset(nd, policy->v.nodes)))
1347                        nd = first_node(policy->v.nodes);
1348                break;
1349        case MPOL_INTERLEAVE: /* should not happen */
1350                break;
1351        default:
1352                BUG();
1353        }
1354        return node_zonelist(nd, gfp);
1355}
1356
1357/* Do dynamic interleaving for a process */
1358static unsigned interleave_nodes(struct mempolicy *policy)
1359{
1360        unsigned nid, next;
1361        struct task_struct *me = current;
1362
1363        nid = me->il_next;
1364        next = next_node(nid, policy->v.nodes);
1365        if (next >= MAX_NUMNODES)
1366                next = first_node(policy->v.nodes);
1367        if (next < MAX_NUMNODES)
1368                me->il_next = next;
1369        return nid;
1370}
1371
1372/*
1373 * Depending on the memory policy provide a node from which to allocate the
1374 * next slab entry.
 1375 * @policy must be protected from freeing by the caller.  If @policy is
 1376 * the current task's mempolicy, this protection is implicit, as only the
 1377 * task can change its policy.  The system default policy requires no
1378 * such protection.
1379 */
1380unsigned slab_node(struct mempolicy *policy)
1381{
1382        if (!policy || policy->flags & MPOL_F_LOCAL)
1383                return numa_node_id();
1384
1385        switch (policy->mode) {
1386        case MPOL_PREFERRED:
1387                /*
1388                 * handled MPOL_F_LOCAL above
1389                 */
1390                return policy->v.preferred_node;
1391
1392        case MPOL_INTERLEAVE:
1393                return interleave_nodes(policy);
1394
1395        case MPOL_BIND: {
1396                /*
1397                 * Follow bind policy behavior and start allocation at the
1398                 * first node.
1399                 */
1400                struct zonelist *zonelist;
1401                struct zone *zone;
1402                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1403                zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1404                (void)first_zones_zonelist(zonelist, highest_zoneidx,
1405                                                        &policy->v.nodes,
1406                                                        &zone);
1407                return zone ? zone->node : numa_node_id();
1408        }
1409
1410        default:
1411                BUG();
1412        }
1413}
1414
1415/* Do static interleaving for a VMA with known offset. */
1416static unsigned offset_il_node(struct mempolicy *pol,
1417                struct vm_area_struct *vma, unsigned long off)
1418{
1419        unsigned nnodes = nodes_weight(pol->v.nodes);
1420        unsigned target;
1421        int c;
1422        int nid = -1;
1423
1424        if (!nnodes)
1425                return numa_node_id();
1426        target = (unsigned int)off % nnodes;
1427        c = 0;
1428        do {
1429                nid = next_node(nid, pol->v.nodes);
1430                c++;
1431        } while (c <= target);
1432        return nid;
1433}
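/*
 * Worked example for offset_il_node() above (illustrative values): with
 * pol->v.nodes == {0,2,3} and off == 7, nnodes == 3 and target == 7 % 3
 * == 1, so the loop stops at the second allowed node and returns node 2.
 */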
1434
1435/* Determine a node number for interleave */
1436static inline unsigned interleave_nid(struct mempolicy *pol,
1437                 struct vm_area_struct *vma, unsigned long addr, int shift)
1438{
1439        if (vma) {
1440                unsigned long off;
1441
1442                /*
1443                 * for small pages, there is no difference between
1444                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1445                 * for huge pages, since vm_pgoff is in units of small
1446                 * pages, we need to shift off the always 0 bits to get
1447                 * a useful offset.
1448                 */
1449                BUG_ON(shift < PAGE_SHIFT);
1450                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1451                off += (addr - vma->vm_start) >> shift;
1452                return offset_il_node(pol, vma, off);
1453        } else
1454                return interleave_nodes(pol);
1455}
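/*
 * For example (illustrative, x86 2MB huge pages): shift == 21 and
 * PAGE_SHIFT == 12, so vm_pgoff (kept in 4KB units) is shifted down by 9
 * to count whole huge pages before adding the huge-page index of addr
 * within the VMA.
 */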
1456
1457#ifdef CONFIG_HUGETLBFS
1458/*
1459 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1460 * @vma = virtual memory area whose policy is sought
1461 * @addr = address in @vma for shared policy lookup and interleave policy
1462 * @gfp_flags = for requested zone
1463 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1464 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1465 *
1466 * Returns a zonelist suitable for a huge page allocation and a pointer
1467 * to the struct mempolicy for conditional unref after allocation.
 1468 * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
1469 * @nodemask for filtering the zonelist.
1470 */
1471struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1472                                gfp_t gfp_flags, struct mempolicy **mpol,
1473                                nodemask_t **nodemask)
1474{
1475        struct zonelist *zl;
1476
1477        *mpol = get_vma_policy(current, vma, addr);
1478        *nodemask = NULL;       /* assume !MPOL_BIND */
1479
1480        if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1481                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1482                                huge_page_shift(hstate_vma(vma))), gfp_flags);
1483        } else {
1484                zl = policy_zonelist(gfp_flags, *mpol);
1485                if ((*mpol)->mode == MPOL_BIND)
1486                        *nodemask = &(*mpol)->v.nodes;
1487        }
1488        return zl;
1489}
1490#endif
1491
1492/* Allocate a page in interleaved policy.
1493   Own path because it needs to do special accounting. */
1494static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1495                                        unsigned nid)
1496{
1497        struct zonelist *zl;
1498        struct page *page;
1499
1500        zl = node_zonelist(nid, gfp);
1501        page = __alloc_pages(gfp, order, zl);
1502        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1503                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1504        return page;
1505}
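
/*
 * The NUMA_INTERLEAVE_HIT event counted above is exported per node via
 * sysfs, so interleaving can be observed from userspace, e.g.
 * (illustrative output):
 *
 *	$ grep interleave_hit /sys/devices/system/node/node0/numastat
 *	interleave_hit 12345
 */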
1506
1507/**
1508 *      alloc_page_vma  - Allocate a page for a VMA.
1509 *
1510 *      @gfp:
1511 *      %GFP_USER    user allocation.
1512 *      %GFP_KERNEL  kernel allocations,
1513 *      %GFP_HIGHMEM highmem/user allocations,
1514 *      %GFP_FS      allocation should not call back into a file system.
1515 *      %GFP_ATOMIC  don't sleep.
1516 *
1517 *      @vma:  Pointer to VMA or NULL if not available.
1518 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1519 *
1520 *      This function allocates a page from the kernel page pool and applies
1521 *      a NUMA policy associated with the VMA or the current process.
1522 *      When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
1523 *      mm_struct of the VMA to prevent it from going away. Should be used for
1524 *      all allocations for pages that will be mapped into
1525 *      user space. Returns NULL when no page can be allocated.
1526 *
1527 *      Must be called with the mmap_sem of the @vma held.
1528 */
1529struct page *
1530alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1531{
1532        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1533        struct zonelist *zl;
1534
1535        cpuset_update_task_memory_state();
1536
1537        if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1538                unsigned nid;
1539
1540                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1541                mpol_cond_put(pol);
1542                return alloc_page_interleave(gfp, 0, nid);
1543        }
1544        zl = policy_zonelist(gfp, pol);
1545        if (unlikely(mpol_needs_cond_ref(pol))) {
1546                /*
1547                 * slow path: ref counted shared policy
1548                 */
1549                struct page *page =  __alloc_pages_nodemask(gfp, 0,
1550                                                zl, policy_nodemask(gfp, pol));
1551                __mpol_put(pol);
1552                return page;
1553        }
1554        /*
1555         * fast path:  default or task policy
1556         */
1557        return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1558}
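
/*
 * Illustrative use from a fault path, with mmap_sem held for read as
 * documented above (error handling trimmed; gfp choice is an example):
 *
 *	struct page *page;
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 */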
1559
1560/**
1561 *      alloc_pages_current - Allocate pages.
1562 *
1563 *      @gfp:
1564 *              %GFP_USER   user allocation,
1565 *              %GFP_KERNEL kernel allocation,
1566 *              %GFP_HIGHMEM highmem allocation,
1567 *              %GFP_FS     don't call back into a file system.
1568 *              %GFP_ATOMIC don't sleep.
1569 *      @order: Power of two of allocation size in pages. 0 is a single page.
1570 *
1571 *      Allocate a page from the kernel page pool.  When not in
1572 *      interrupt context, apply the current process' NUMA policy.
1573 *      Returns NULL when no page can be allocated.
1574 *
1575 *      Don't call cpuset_update_task_memory_state() unless
1576 *      1) it's ok to take cpuset_sem (can WAIT), and
1577 *      2) allocating for current task (not interrupt).
1578 */
1579struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1580{
1581        struct mempolicy *pol = current->mempolicy;
1582
1583        if ((gfp & __GFP_WAIT) && !in_interrupt())
1584                cpuset_update_task_memory_state();
1585        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1586                pol = &default_policy;
1587
1588        /*
1589         * No reference counting needed for current->mempolicy
1590         * nor system default_policy
1591         */
1592        if (pol->mode == MPOL_INTERLEAVE)
1593                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1594        return __alloc_pages_nodemask(gfp, order,
1595                        policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1596}
1597EXPORT_SYMBOL(alloc_pages_current);
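
/*
 * Note: on CONFIG_NUMA kernels the generic alloc_pages(gfp, order)
 * helper in <linux/gfp.h> resolves to alloc_pages_current(), so e.g.
 *
 *	struct page *p = alloc_pages(GFP_KERNEL, 2);	4 contiguous pages
 *
 * already honours the calling task's mempolicy.
 */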
1598
1599/*
1600 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1601 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1602 * with the mems_allowed returned by cpuset_mems_allowed().  This
1603 * keeps mempolicies cpuset-relative after their cpuset moves.  See
1604 * also kernel/cpuset.c update_nodemask().
1605 */
1606
1607/* Slow path of a mempolicy duplicate */
1608struct mempolicy *__mpol_dup(struct mempolicy *old)
1609{
1610        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1611
1612        if (!new)
1613                return ERR_PTR(-ENOMEM);
1614        if (current_cpuset_is_being_rebound()) {
1615                nodemask_t mems = cpuset_mems_allowed(current);
1616                mpol_rebind_policy(old, &mems);
1617        }
1618        *new = *old;
1619        atomic_set(&new->refcnt, 1);
1620        return new;
1621}
1622
1623/*
1624 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1625 * clear the MPOL_F_* flags that require conditional ref and
1626 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1627 * after return.  Use the returned value.
1628 *
1629 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1630 * policy lookup, even if the policy needs/has extra ref on lookup.
1631 * shmem_readahead needs this.
1632 */
1633struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1634                                                struct mempolicy *frompol)
1635{
1636        if (!mpol_needs_cond_ref(frompol))
1637                return frompol;
1638
1639        *tompol = *frompol;
1640        tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1641        __mpol_put(frompol);
1642        return tompol;
1643}
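
/*
 * Illustrative use, with an on-stack mempolicy as scratch space so a
 * shared policy can serve several allocations after a single lookup
 * (this is roughly what the tmpfs swapin readahead path does, via the
 * mpol_cond_copy() wrapper in <linux/mempolicy.h>):
 *
 *	struct mempolicy mpol, *spol;
 *
 *	spol = mpol_cond_copy(&mpol,
 *			mpol_shared_policy_lookup(&info->policy, idx));
 *	... pass spol to the allocator(s); no further ref/unref needed ...
 */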
1644
1645static int mpol_match_intent(const struct mempolicy *a,
1646                             const struct mempolicy *b)
1647{
1648        if (a->flags != b->flags)
1649                return 0;
1650        if (!mpol_store_user_nodemask(a))
1651                return 1;
1652        return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1653}
1654
1655/* Slow path of a mempolicy comparison */
1656int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1657{
1658        if (!a || !b)
1659                return 0;
1660        if (a->mode != b->mode)
1661                return 0;
1662        if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1663                return 0;
1664        switch (a->mode) {
1665        case MPOL_BIND:
1666                /* Fall through */
1667        case MPOL_INTERLEAVE:
1668                return nodes_equal(a->v.nodes, b->v.nodes);
1669        case MPOL_PREFERRED:
1670                return a->v.preferred_node == b->v.preferred_node &&
1671                        a->flags == b->flags;
1672        default:
1673                BUG();
1674                return 0;
1675        }
1676}
1677
1678/*
1679 * Shared memory backing store policy support.
1680 *
1681 * Remember policies even when nobody has shared memory mapped.
1682 * The policies are kept in Red-Black tree linked from the inode.
1683 * They are protected by the sp->lock spinlock, which should be held
1684 * for any accesses to the tree.
1685 */
1686
1687/* lookup first element intersecting start-end */
1688/* Caller holds sp->lock */
1689static struct sp_node *
1690sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1691{
1692        struct rb_node *n = sp->root.rb_node;
1693
1694        while (n) {
1695                struct sp_node *p = rb_entry(n, struct sp_node, nd);
1696
1697                if (start >= p->end)
1698                        n = n->rb_right;
1699                else if (end <= p->start)
1700                        n = n->rb_left;
1701                else
1702                        break;
1703        }
1704        if (!n)
1705                return NULL;
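        /*
         * An intersecting node was found; walk back so we return the
         * first (lowest) node that still overlaps [start, end), not an
         * arbitrary intersecting one.
         */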
1706        for (;;) {
1707                struct sp_node *w = NULL;
1708                struct rb_node *prev = rb_prev(n);
1709                if (!prev)
1710                        break;
1711                w = rb_entry(prev, struct sp_node, nd);
1712                if (w->end <= start)
1713                        break;
1714                n = prev;
1715        }
1716        return rb_entry(n, struct sp_node, nd);
1717}
1718
1719/* Insert a new shared policy into the list. */
1720/* Caller holds sp->lock */
1721static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1722{
1723        struct rb_node **p = &sp->root.rb_node;
1724        struct rb_node *parent = NULL;
1725        struct sp_node *nd;
1726
1727        while (*p) {
1728                parent = *p;
1729                nd = rb_entry(parent, struct sp_node, nd);
1730                if (new->start < nd->start)
1731                        p = &(*p)->rb_left;
1732                else if (new->end > nd->end)
1733                        p = &(*p)->rb_right;
1734                else
1735                        BUG();
1736        }
1737        rb_link_node(&new->nd, parent, p);
1738        rb_insert_color(&new->nd, &sp->root);
1739        pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1740                 new->policy ? new->policy->mode : 0);
1741}
1742
1743/* Find shared policy intersecting idx */
1744struct mempolicy *
1745mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1746{
1747        struct mempolicy *pol = NULL;
1748        struct sp_node *sn;
1749
1750        if (!sp->root.rb_node)
1751                return NULL;
1752        spin_lock(&sp->lock);
1753        sn = sp_lookup(sp, idx, idx+1);
1754        if (sn) {
1755                mpol_get(sn->policy);
1756                pol = sn->policy;
1757        }
1758        spin_unlock(&sp->lock);
1759        return pol;
1760}
1761
1762static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1763{
1764        pr_debug("deleting %lx-%lx\n", n->start, n->end);
1765        rb_erase(&n->nd, &sp->root);
1766        mpol_put(n->policy);
1767        kmem_cache_free(sn_cache, n);
1768}
1769
1770static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1771                                struct mempolicy *pol)
1772{
1773        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1774
1775        if (!n)
1776                return NULL;
1777        n->start = start;
1778        n->end = end;
1779        mpol_get(pol);
1780        pol->flags |= MPOL_F_SHARED;    /* for unref */
1781        n->policy = pol;
1782        return n;
1783}
1784
1785/* Replace a policy range. */
1786static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1787                                 unsigned long end, struct sp_node *new)
1788{
1789        struct sp_node *n, *new2 = NULL;
1790
1791restart:
1792        spin_lock(&sp->lock);
1793        n = sp_lookup(sp, start, end);
1794        /* Take care of old policies in the same range. */
1795        while (n && n->start < end) {
1796                struct rb_node *next = rb_next(&n->nd);
1797                if (n->start >= start) {
1798                        if (n->end <= end)
1799                                sp_delete(sp, n);
1800                        else
1801                                n->start = end;
1802                } else {
1803                        /* Old policy spanning whole new range. */
1804                        if (n->end > end) {
1805                                if (!new2) {
1806                                        spin_unlock(&sp->lock);
1807                                        new2 = sp_alloc(end, n->end, n->policy);
1808                                        if (!new2)
1809                                                return -ENOMEM;
1810                                        goto restart;
1811                                }
1812                                n->end = start;
1813                                sp_insert(sp, new2);
1814                                new2 = NULL;
1815                                break;
1816                        } else
1817                                n->end = start;
1818                }
1819                if (!next)
1820                        break;
1821                n = rb_entry(next, struct sp_node, nd);
1822        }
1823        if (new)
1824                sp_insert(sp, new);
1825        spin_unlock(&sp->lock);
1826        if (new2) {
1827                mpol_put(new2->policy);
1828                kmem_cache_free(sn_cache, new2);
1829        }
1830        return 0;
1831}
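
/*
 * Worked example of the splitting above: if the tree holds one node
 * covering [0, 100) and a new policy is installed for [40, 60), the
 * old node is truncated to [0, 40), a copy of it (new2) is inserted
 * for [60, 100), and the new node fills [40, 60).
 */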
1832
1833/**
1834 * mpol_shared_policy_init - initialize shared policy for inode
1835 * @sp: pointer to inode shared policy
1836 * @mpol:  struct mempolicy to install
1837 *
1838 * Install non-NULL @mpol in inode's shared policy rb-tree.
1839 * On entry, the current task has a reference on a non-NULL @mpol.
1840 * This must be released on exit.
1841 */
1842void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1843{
1844        sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1845        spin_lock_init(&sp->lock);
1846
1847        if (mpol) {
1848                struct vm_area_struct pvma;
1849                struct mempolicy *new;
1850
1851                /* contextualize the tmpfs mount point mempolicy */
1852                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1853                mpol_put(mpol); /* drop our ref on sb mpol */
1854                if (IS_ERR(new))
1855                        return;         /* no valid nodemask intersection */
1856
1857                /* Create pseudo-vma that contains just the policy */
1858                memset(&pvma, 0, sizeof(struct vm_area_struct));
1859                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1860                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1861                mpol_put(new);                  /* drop initial ref */
1862        }
1863}
1864
1865int mpol_set_shared_policy(struct shared_policy *info,
1866                        struct vm_area_struct *vma, struct mempolicy *npol)
1867{
1868        int err;
1869        struct sp_node *new = NULL;
1870        unsigned long sz = vma_pages(vma);
1871
1872        pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1873                 vma->vm_pgoff,
1874                 sz, npol ? npol->mode : -1,
1875                 npol ? npol->flags : -1,
1876                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1877
1878        if (npol) {
1879                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1880                if (!new)
1881                        return -ENOMEM;
1882        }
1883        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1884        if (err && new)
1885                kmem_cache_free(sn_cache, new);
1886        return err;
1887}
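
/*
 * Typical caller (sketch): a shared-memory filesystem wires this up
 * behind vm_operations_struct->set_policy; tmpfs does roughly
 *
 *	static int shmem_set_policy(struct vm_area_struct *vma,
 *				    struct mempolicy *new)
 *	{
 *		struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 *
 *		return mpol_set_shared_policy(&SHMEM_I(i)->policy,
 *					      vma, new);
 *	}
 */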
1888
1889/* Free a backing policy store on inode delete. */
1890void mpol_free_shared_policy(struct shared_policy *p)
1891{
1892        struct sp_node *n;
1893        struct rb_node *next;
1894
1895        if (!p->root.rb_node)
1896                return;
1897        spin_lock(&p->lock);
1898        next = rb_first(&p->root);
1899        while (next) {
1900                n = rb_entry(next, struct sp_node, nd);
1901                next = rb_next(&n->nd);
1902                rb_erase(&n->nd, &p->root);
1903                mpol_put(n->policy);
1904                kmem_cache_free(sn_cache, n);
1905        }
1906        spin_unlock(&p->lock);
1907}
1908
1909/* assumes fs == KERNEL_DS */
1910void __init numa_policy_init(void)
1911{
1912        nodemask_t interleave_nodes;
1913        unsigned long largest = 0;
1914        int nid, prefer = 0;
1915
1916        policy_cache = kmem_cache_create("numa_policy",
1917                                         sizeof(struct mempolicy),
1918                                         0, SLAB_PANIC, NULL);
1919
1920        sn_cache = kmem_cache_create("shared_policy_node",
1921                                     sizeof(struct sp_node),
1922                                     0, SLAB_PANIC, NULL);
1923
1924        /*
1925         * Set interleaving policy for system init. Interleaving is only
1926         * enabled across suitably sized nodes (default is >= 16MB), or
1927         * fall back to the largest node if they're all smaller.
1928         */
1929        nodes_clear(interleave_nodes);
1930        for_each_node_state(nid, N_HIGH_MEMORY) {
1931                unsigned long total_pages = node_present_pages(nid);
1932
1933                /* Preserve the largest node */
1934                if (largest < total_pages) {
1935                        largest = total_pages;
1936                        prefer = nid;
1937                }
1938
1939                /* Interleave this node? */
1940                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1941                        node_set(nid, interleave_nodes);
1942        }
1943
1944        /* All too small, use the largest */
1945        if (unlikely(nodes_empty(interleave_nodes)))
1946                node_set(prefer, interleave_nodes);
1947
1948        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1949                printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
1950}
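
/*
 * Example: with 4kB pages the "suitably sized" check above requires
 * (16 << 20) >> PAGE_SHIFT = 4096 present pages, i.e. nodes smaller
 * than 16MB do not take part in boot-time interleaving.
 */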
1951
1952/* Reset policy of current process to default */
1953void numa_default_policy(void)
1954{
1955        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1956}
1957
1958/*
1959 * Parse and format mempolicy from/to strings
1960 */
1961
1962/*
1963 * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
1964 * Used only for mpol_parse_str() and mpol_to_str()
1965 */
1966#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1967static const char * const policy_types[] =
1968        { "default", "prefer", "bind", "interleave", "local" };
1969
1970
1971#ifdef CONFIG_TMPFS
1972/**
1973 * mpol_parse_str - parse string to mempolicy
1974 * @str:  string containing mempolicy to parse
1975 * @mpol:  pointer to struct mempolicy pointer, returned on success.
1976 * @no_context:  flag whether to "contextualize" the mempolicy
1977 *
1978 * Format of input:
1979 *      <mode>[=<flags>][:<nodelist>]
1980 *
1981 * if @no_context is true, save the input nodemask in w.user_nodemask in
1982 * the returned mempolicy.  This will be used to "clone" the mempolicy in
1983 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
1984 * mount option.  Note that if 'static' or 'relative' mode flags were
1985 * specified, the input nodemask will already have been saved.  Saving
1986 * it again is redundant, but safe.
1987 *
1988 * On success, returns 0, else 1
1989 */
1990int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1991{
1992        struct mempolicy *new = NULL;
1993        unsigned short uninitialized_var(mode);
1994        unsigned short uninitialized_var(mode_flags);
1995        nodemask_t nodes;
1996        char *nodelist = strchr(str, ':');
1997        char *flags = strchr(str, '=');
1998        int i;
1999        int err = 1;
2000
2001        if (nodelist) {
2002                /* NUL-terminate mode or flags string */
2003                *nodelist++ = '\0';
2004                if (nodelist_parse(nodelist, nodes))
2005                        goto out;
2006                if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2007                        goto out;
2008        } else
2009                nodes_clear(nodes);
2010
2011        if (flags)
2012                *flags++ = '\0';        /* terminate mode string */
2013
2014        for (i = 0; i <= MPOL_LOCAL; i++) {
2015                if (!strcmp(str, policy_types[i])) {
2016                        mode = i;
2017                        break;
2018                }
2019        }
2020        if (i > MPOL_LOCAL)
2021                goto out;
2022
2023        switch (mode) {
2024        case MPOL_PREFERRED:
2025                /*
2026                 * Insist on a nodelist of one node only
2027                 */
2028                if (nodelist) {
2029                        char *rest = nodelist;
2030                        while (isdigit(*rest))
2031                                rest++;
2032                        if (*rest)
2033                                goto out;
2034                }
2035                break;
2036        case MPOL_INTERLEAVE:
2037                /*
2038                 * Default to online nodes with memory if no nodelist
2039                 */
2040                if (!nodelist)
2041                        nodes = node_states[N_HIGH_MEMORY];
2042                break;
2043        case MPOL_LOCAL:
2044                /*
2045                 * Don't allow a nodelist;  mpol_new() checks flags
2046                 */
2047                if (nodelist)
2048                        goto out;
2049                mode = MPOL_PREFERRED;
2050                break;
2051        case MPOL_DEFAULT:
2052                /*
2053                 * Insist on an empty nodelist
2054                 */
2055                if (!nodelist)
2056                        err = 0;
2057                goto out;
2058        case MPOL_BIND:
2059                /*
2060                 * Insist on a nodelist
2061                 */
2062                if (!nodelist)
2063                        goto out;
2064        }
2065
2066        mode_flags = 0;
2067        if (flags) {
2068                /*
2069                 * Currently, we only support two mutually exclusive
2070                 * mode flags.
2071                 */
2072                if (!strcmp(flags, "static"))
2073                        mode_flags |= MPOL_F_STATIC_NODES;
2074                else if (!strcmp(flags, "relative"))
2075                        mode_flags |= MPOL_F_RELATIVE_NODES;
2076                else
2077                        goto out;
2078        }
2079
2080        new = mpol_new(mode, mode_flags, &nodes);
2081        if (IS_ERR(new))
2082                goto out;
2083        err = 0;
2084        if (no_context) {
2085                /* save for contextualization */
2086                new->w.user_nodemask = nodes;
2087        }
2088
2089out:
2090        /* Restore string for error message */
2091        if (nodelist)
2092                *--nodelist = ':';
2093        if (flags)
2094                *--flags = '=';
2095        if (!err)
2096                *mpol = new;
2097        return err;
2098}
2099#endif /* CONFIG_TMPFS */
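
/*
 * Example strings accepted by mpol_parse_str() (illustrative):
 *
 *	"default"			MPOL_DEFAULT
 *	"prefer:2"			MPOL_PREFERRED on node 2
 *	"bind:0-3"			MPOL_BIND over nodes 0-3
 *	"interleave=static:0,2"		MPOL_INTERLEAVE, MPOL_F_STATIC_NODES
 *	"local"				"local" pseudo-policy (MPOL_PREFERRED
 *					with MPOL_F_LOCAL)
 *
 * as used by the tmpfs mount option, e.g.
 *
 *	mount -t tmpfs -o mpol=bind:0-3 tmpfs /mnt
 */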
2100
2101/**
2102 * mpol_to_str - format a mempolicy structure for printing
2103 * @buffer:  to contain formatted mempolicy string
2104 * @maxlen:  length of @buffer
2105 * @pol:  pointer to mempolicy to be formatted
2106 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2107 *
2108 * Convert a mempolicy into a string.
2109 * Returns the number of characters in buffer (if positive)
2110 * or an error (negative)
2111 */
2112int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2113{
2114        char *p = buffer;
2115        int l;
2116        nodemask_t nodes;
2117        unsigned short mode;
2118        unsigned short flags = pol ? pol->flags : 0;
2119
2120        /*
2121         * Sanity check:  room for longest mode, flag and some nodes
2122         */
2123        VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2124
2125        if (!pol || pol == &default_policy)
2126                mode = MPOL_DEFAULT;
2127        else
2128                mode = pol->mode;
2129
2130        switch (mode) {
2131        case MPOL_DEFAULT:
2132                nodes_clear(nodes);
2133                break;
2134
2135        case MPOL_PREFERRED:
2136                nodes_clear(nodes);
2137                if (flags & MPOL_F_LOCAL)
2138                        mode = MPOL_LOCAL;      /* pseudo-policy */
2139                else
2140                        node_set(pol->v.preferred_node, nodes);
2141                break;
2142
2143        case MPOL_BIND:
2144                /* Fall through */
2145        case MPOL_INTERLEAVE:
2146                if (no_context)
2147                        nodes = pol->w.user_nodemask;
2148                else
2149                        nodes = pol->v.nodes;
2150                break;
2151
2152        default:
2153                BUG();
2154        }
2155
2156        l = strlen(policy_types[mode]);
2157        if (buffer + maxlen < p + l + 1)
2158                return -ENOSPC;
2159
2160        strcpy(p, policy_types[mode]);
2161        p += l;
2162
2163        if (flags & MPOL_MODE_FLAGS) {
2164                if (buffer + maxlen < p + 2)
2165                        return -ENOSPC;
2166                *p++ = '=';
2167
2168                /*
2169                 * Currently, the only defined flags are mutually exclusive
2170                 */
2171                if (flags & MPOL_F_STATIC_NODES)
2172                        p += snprintf(p, buffer + maxlen - p, "static");
2173                else if (flags & MPOL_F_RELATIVE_NODES)
2174                        p += snprintf(p, buffer + maxlen - p, "relative");
2175        }
2176
2177        if (!nodes_empty(nodes)) {
2178                if (buffer + maxlen < p + 2)
2179                        return -ENOSPC;
2180                *p++ = ':';
2181                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2182        }
2183        return p - buffer;
2184}
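
/*
 * Illustrative counterpart to the parser: formatting an MPOL_INTERLEAVE
 * policy created with MPOL_F_RELATIVE_NODES over the user nodemask 0-3
 *
 *	char buf[64];
 *
 *	mpol_to_str(buf, sizeof(buf), pol, 1);
 *	buf now holds "interleave=relative:0-3"
 */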
2185
2186struct numa_maps {
2187        unsigned long pages;
2188        unsigned long anon;
2189        unsigned long active;
2190        unsigned long writeback;
2191        unsigned long mapcount_max;
2192        unsigned long dirty;
2193        unsigned long swapcache;
2194        unsigned long node[MAX_NUMNODES];
2195};
2196
2197static void gather_stats(struct page *page, void *private, int pte_dirty)
2198{
2199        struct numa_maps *md = private;
2200        int count = page_mapcount(page);
2201
2202        md->pages++;
2203        if (pte_dirty || PageDirty(page))
2204                md->dirty++;
2205
2206        if (PageSwapCache(page))
2207                md->swapcache++;
2208
2209        if (PageActive(page))
2210                md->active++;
2211
2212        if (PageWriteback(page))
2213                md->writeback++;
2214
2215        if (PageAnon(page))
2216                md->anon++;
2217
2218        if (count > md->mapcount_max)
2219                md->mapcount_max = count;
2220
2221        md->node[page_to_nid(page)]++;
2222}
2223
2224#ifdef CONFIG_HUGETLB_PAGE
2225static void check_huge_range(struct vm_area_struct *vma,
2226                unsigned long start, unsigned long end,
2227                struct numa_maps *md)
2228{
2229        unsigned long addr;
2230        struct page *page;
2231        struct hstate *h = hstate_vma(vma);
2232        unsigned long sz = huge_page_size(h);
2233
2234        for (addr = start; addr < end; addr += sz) {
2235                pte_t *ptep = huge_pte_offset(vma->vm_mm,
2236                                                addr & huge_page_mask(h));
2237                pte_t pte;
2238
2239                if (!ptep)
2240                        continue;
2241
2242                pte = *ptep;
2243                if (pte_none(pte))
2244                        continue;
2245
2246                page = pte_page(pte);
2247                if (!page)
2248                        continue;
2249
2250                gather_stats(page, md, pte_dirty(*ptep));
2251        }
2252}
2253#else
2254static inline void check_huge_range(struct vm_area_struct *vma,
2255                unsigned long start, unsigned long end,
2256                struct numa_maps *md)
2257{
2258}
2259#endif
2260
2261/*
2262 * Display pages allocated per node and memory policy via /proc.
2263 */
2264int show_numa_map(struct seq_file *m, void *v)
2265{
2266        struct proc_maps_private *priv = m->private;
2267        struct vm_area_struct *vma = v;
2268        struct numa_maps *md;
2269        struct file *file = vma->vm_file;
2270        struct mm_struct *mm = vma->vm_mm;
2271        struct mempolicy *pol;
2272        int n;
2273        char buffer[50];
2274
2275        if (!mm)
2276                return 0;
2277
2278        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2279        if (!md)
2280                return 0;
2281
2282        pol = get_vma_policy(priv->task, vma, vma->vm_start);
2283        mpol_to_str(buffer, sizeof(buffer), pol, 0);
2284        mpol_cond_put(pol);
2285
2286        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2287
2288        if (file) {
2289                seq_printf(m, " file=");
2290                seq_path(m, &file->f_path, "\n\t= ");
2291        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2292                seq_printf(m, " heap");
2293        } else if (vma->vm_start <= mm->start_stack &&
2294                        vma->vm_end >= mm->start_stack) {
2295                seq_printf(m, " stack");
2296        }
2297
2298        if (is_vm_hugetlb_page(vma)) {
2299                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2300                seq_printf(m, " huge");
2301        } else {
2302                check_pgd_range(vma, vma->vm_start, vma->vm_end,
2303                        &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2304        }
2305
2306        if (!md->pages)
2307                goto out;
2308
2309        if (md->anon)
2310                seq_printf(m, " anon=%lu", md->anon);
2311
2312        if (md->dirty)
2313                seq_printf(m, " dirty=%lu", md->dirty);
2314
2315        if (md->pages != md->anon && md->pages != md->dirty)
2316                seq_printf(m, " mapped=%lu", md->pages);
2317
2318        if (md->mapcount_max > 1)
2319                seq_printf(m, " mapmax=%lu", md->mapcount_max);
2320
2321        if (md->swapcache)
2322                seq_printf(m, " swapcache=%lu", md->swapcache);
2323
2324        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2325                seq_printf(m, " active=%lu", md->active);
2326
2327        if (md->writeback)
2328                seq_printf(m, " writeback=%lu", md->writeback);
2329
2330        for_each_node_state(n, N_HIGH_MEMORY)
2331                if (md->node[n])
2332                        seq_printf(m, " N%d=%lu", n, md->node[n]);
2333out:
2334        seq_putc(m, '\n');
2335        kfree(md);
2336
2337        if (m->count < m->size)
2338                m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2339        return 0;
2340}
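
/*
 * Example of a line produced above in /proc/<pid>/numa_maps (values
 * are made up; wrapped here for the comment, the kernel emits one line):
 *
 *	2aaaaac00000 interleave=static:0-3 file=/mnt/shm/seg mapped=1024
 *		mapmax=4 N0=256 N1=256 N2=256 N3=256
 */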
2341