linux/mm/mempolicy.c
<<
>>
Prefs
   1/*
   2 * Simple NUMA memory policy for the Linux kernel.
   3 *
   4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6 * Subject to the GNU Public License, version 2.
   7 *
   8 * NUMA policy allows the user to give hints in which node(s) memory should
   9 * be allocated.
  10 *
  11 * Support four policies per VMA and per process:
  12 *
  13 * The VMA policy has priority over the process policy for a page fault.
  14 *
  15 * interleave     Allocate memory interleaved over a set of nodes,
  16 *                with normal fallback if it fails.
  17 *                For VMA based allocations this interleaves based on the
  18 *                offset into the backing object or offset into the mapping
   19 *                for anonymous memory. For process policy a process counter
  20 *                is used.
  21 *
  22 * bind           Only allocate memory on a specific set of nodes,
  23 *                no fallback.
  24 *                FIXME: memory is allocated starting with the first node
  25 *                to the last. It would be better if bind would truly restrict
  26 *                the allocation to memory nodes instead
  27 *
  28 * preferred       Try a specific node first before normal fallback.
  29 *                As a special case node -1 here means do the allocation
  30 *                on the local CPU. This is normally identical to default,
  31 *                but useful to set in a VMA when you have a non default
  32 *                process policy.
  33 *
  34 * default        Allocate on the local node first, or when on a VMA
  35 *                use the process policy. This is what Linux always did
  36 *                in a NUMA aware kernel and still does by, ahem, default.
  37 *
  38 * The process policy is applied for most non interrupt memory allocations
  39 * in that process' context. Interrupts ignore the policies and always
  40 * try to allocate on the local CPU. The VMA policy is only applied for memory
  41 * allocations for a VMA in the VM.
  42 *
  43 * Currently there are a few corner cases in swapping where the policy
  44 * is not applied, but the majority should be handled. When process policy
  45 * is used it is not remembered over swap outs/swap ins.
  46 *
  47 * Only the highest zone in the zone hierarchy gets policied. Allocations
  48 * requesting a lower zone just use default policy. This implies that
   49 * on systems with highmem, kernel lowmem allocations don't get policied.
  50 * Same with GFP_DMA allocations.
  51 *
  52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53 * all users and remembered even when nobody has memory mapped.
  54 */
  55
  56/* Notebook:
  57   fix mmap readahead to honour policy and enable policy for any page cache
  58   object
  59   statistics for bigpages
  60   global policy for page cache? currently it uses process policy. Requires
  61   first item above.
  62   handle mremap for shared memory (currently ignored for the policy)
  63   grows down?
  64   make bind policy root only? It can trigger oom much faster and the
  65   kernel is not always grateful with that.
  66   could replace all the switch()es with a mempolicy_ops structure.
  67*/
  68
  69#include <linux/mempolicy.h>
  70#include <linux/mm.h>
  71#include <linux/highmem.h>
  72#include <linux/hugetlb.h>
  73#include <linux/kernel.h>
  74#include <linux/sched.h>
  75#include <linux/mm.h>
  76#include <linux/nodemask.h>
  77#include <linux/cpuset.h>
  78#include <linux/gfp.h>
  79#include <linux/slab.h>
  80#include <linux/string.h>
  81#include <linux/module.h>
  82#include <linux/interrupt.h>
  83#include <linux/init.h>
  84#include <linux/compat.h>
  85#include <linux/mempolicy.h>
  86#include <linux/swap.h>
  87#include <linux/seq_file.h>
  88#include <linux/proc_fs.h>
  89#include <linux/migrate.h>
  90#include <linux/rmap.h>
  91#include <linux/security.h>
  92
  93#include <asm/tlbflush.h>
  94#include <asm/uaccess.h>
  95
/* Internal flags: extra modifiers for check_range(), built on MPOL_MF_INTERNAL */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */

/* Slab cache that struct mempolicy objects are allocated from (see mpol_new()) */
static struct kmem_cache *policy_cache;
/* NOTE(review): presumably the cache for shared-policy nodes -- confirm
   against its users, which are not in this part of the file. */
static struct kmem_cache *sn_cache;

/* Debug print hook.  It expands to nothing, so every call compiles away. */
#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/* Stand-in used where no policy is set (see do_get_mempolicy()) */
struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};
 114
 115/* Do sanity checking on a policy */
 116static int mpol_check_policy(int mode, nodemask_t *nodes)
 117{
 118        int empty = nodes_empty(*nodes);
 119
 120        switch (mode) {
 121        case MPOL_DEFAULT:
 122                if (!empty)
 123                        return -EINVAL;
 124                break;
 125        case MPOL_BIND:
 126        case MPOL_INTERLEAVE:
 127                /* Preferred will only use the first bit, but allow
 128                   more for now. */
 129                if (empty)
 130                        return -EINVAL;
 131                break;
 132        }
 133        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134}
 135
 136/* Generate a custom zonelist for the BIND policy. */
 137static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138{
 139        struct zonelist *zl;
 140        int num, max, nd;
 141        enum zone_type k;
 142
 143        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 144        max++;                  /* space for zlcache_ptr (see mmzone.h) */
 145        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 146        if (!zl)
 147                return ERR_PTR(-ENOMEM);
 148        zl->zlcache_ptr = NULL;
 149        num = 0;
 150        /* First put in the highest zones from all nodes, then all the next 
 151           lower zones etc. Avoid empty zones because the memory allocator
 152           doesn't like them. If you implement node hot removal you
 153           have to fix that. */
 154        k = policy_zone;
 155        while (1) {
 156                for_each_node_mask(nd, *nodes) { 
 157                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
 158                        if (z->present_pages > 0) 
 159                                zl->zones[num++] = z;
 160                }
 161                if (k == 0)
 162                        break;
 163                k--;
 164        }
 165        if (num == 0) {
 166                kfree(zl);
 167                return ERR_PTR(-EINVAL);
 168        }
 169        zl->zones[num] = NULL;
 170        return zl;
 171}
 172
 173/* Create a new policy */
 174static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 175{
 176        struct mempolicy *policy;
 177
 178        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 179        if (mode == MPOL_DEFAULT)
 180                return NULL;
 181        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 182        if (!policy)
 183                return ERR_PTR(-ENOMEM);
 184        atomic_set(&policy->refcnt, 1);
 185        switch (mode) {
 186        case MPOL_INTERLEAVE:
 187                policy->v.nodes = *nodes;
 188                if (nodes_weight(*nodes) == 0) {
 189                        kmem_cache_free(policy_cache, policy);
 190                        return ERR_PTR(-EINVAL);
 191                }
 192                break;
 193        case MPOL_PREFERRED:
 194                policy->v.preferred_node = first_node(*nodes);
 195                if (policy->v.preferred_node >= MAX_NUMNODES)
 196                        policy->v.preferred_node = -1;
 197                break;
 198        case MPOL_BIND:
 199                policy->v.zonelist = bind_zonelist(nodes);
 200                if (IS_ERR(policy->v.zonelist)) {
 201                        void *error_code = policy->v.zonelist;
 202                        kmem_cache_free(policy_cache, policy);
 203                        return error_code;
 204                }
 205                break;
 206        }
 207        policy->policy = mode;
 208        policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 209        return policy;
 210}
 211
/* Forward declarations: check_pte_range() below calls both of these. */
static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags);
 215
/*
 * Scan the ptes mapping [addr, end) under @pmd and check every present,
 * normal, non-reserved page against @nodes.
 *
 * A page is acted on when its node's membership in @nodes differs from the
 * MPOL_MF_INVERT setting in @flags.  For such a page:
 *   - with MPOL_MF_STATS it is passed to gather_stats() along with @private;
 *   - else with MPOL_MF_MOVE / MPOL_MF_MOVE_ALL it is passed to
 *     migrate_page_add(), with @private as the page list;
 *   - otherwise the scan stops.
 *
 * Returns nonzero if the scan stopped before reaching @end, 0 otherwise.
 */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pte_t *orig_pte;
        pte_t *pte;
        spinlock_t *ptl;

        /* orig_pte is remembered because pte is advanced by the loop. */
        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                struct page *page;
                int nid;

                /*
                 * Each "continue" below jumps to the while condition, so
                 * pte and addr are still advanced for skipped entries.
                 */
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                /*
                 * The check for PageReserved here is important to avoid
                 * handling zero pages and other pages that may have been
                 * marked special by the system.
                 *
                 * If the PageReserved would not be checked here then f.e.
                 * the location of the zero page could have an influence
                 * on MPOL_MF_STRICT, zero pages would be counted for
                 * the per node stats, and there would be useless attempts
                 * to put zero pages on the migration list.
                 */
                if (PageReserved(page))
                        continue;
                nid = page_to_nid(page);
                /* Skip pages whose membership in *nodes equals the INVERT bit. */
                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
                        continue;

                if (flags & MPOL_MF_STATS)
                        gather_stats(page, private, pte_dirty(*pte));
                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                        migrate_page_add(page, private, flags);
                else
                        break;  /* pure check: stop at the first such page */
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(orig_pte, ptl);
        /* addr reaches end only when the loop was not left via break. */
        return addr != end;
}
 263
/*
 * Walk the pmd entries covering [addr, end) below @pud and run
 * check_pte_range() on every entry that pmd_none_or_clear_bad() does not
 * reject.  @vma, @nodes, @flags and @private are passed through unchanged.
 *
 * Returns -EIO as soon as check_pte_range() reports a nonzero result,
 * otherwise 0.
 */
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                /* next bounds the sub-range checked in this iteration. */
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}
 283
/*
 * Walk the pud entries covering [addr, end) below @pgd and run
 * check_pmd_range() on every entry that pud_none_or_clear_bad() does not
 * reject.  @vma, @nodes, @flags and @private are passed through unchanged.
 *
 * Returns -EIO as soon as check_pmd_range() reports a nonzero result,
 * otherwise 0.
 */
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(pgd, addr);
        do {
                /* next bounds the sub-range checked in this iteration. */
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                if (check_pmd_range(vma, pud, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
}
 303
/*
 * Walk the pgd entries of vma->vm_mm covering [addr, end) and run
 * check_pud_range() on every entry that pgd_none_or_clear_bad() does not
 * reject.  @vma, @nodes, @flags and @private are passed through unchanged.
 *
 * Returns -EIO as soon as check_pud_range() reports a nonzero result,
 * otherwise 0.
 */
static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pgd_t *pgd;
        unsigned long next;

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                /* next bounds the sub-range checked in this iteration. */
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                if (check_pud_range(vma, pgd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
}
 323
/*
 * Check if all pages in a range are on a set of nodes.
 *
 * Walks the vmas overlapping [start, end) and runs check_pgd_range() on
 * each one that is not a hugetlb vma and for which either MPOL_MF_STRICT is
 * set, or MPOL_MF_MOVE / MPOL_MF_MOVE_ALL is set and the vma is migratable.
 *
 * @private is passed through to the page-table walkers: with the MOVE
 * flags it is the list handed to migrate_page_add(), with MPOL_MF_STATS it
 * is handed to gather_stats().
 *
 * Unless MPOL_MF_DISCONTIG_OK is set, the vmas must cover the range
 * without holes.
 *
 * Returns the vma found by find_vma(mm, start), or an ERR_PTR(): the
 * migrate_prep() error, -EFAULT when no vma is found or a hole is
 * detected, or the error reported by check_pgd_range().
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                const nodemask_t *nodes, unsigned long flags, void *private)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        /* Migration needs preparation before any page is queued. */
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

                err = migrate_prep();
                if (err)
                        return ERR_PTR(err);
        }

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
                        /* Last vma ends before the range does: hole at the end. */
                        if (!vma->vm_next && vma->vm_end < end)
                                return ERR_PTR(-EFAULT);
                        /* Gap between the previous vma and this one. */
                        if (prev && prev->vm_end < vma->vm_start)
                                return ERR_PTR(-EFAULT);
                }
                if (!is_vm_hugetlb_page(vma) &&
                    ((flags & MPOL_MF_STRICT) ||
                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
                                vma_migratable(vma)))) {
                        unsigned long endvma = vma->vm_end;

                        /* Clamp the walk to the part of the vma inside the range. */
                        if (endvma > end)
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes,
                                                flags, private);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}
 375
 376/* Apply policy to a single VMA */
 377static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 378{
 379        int err = 0;
 380        struct mempolicy *old = vma->vm_policy;
 381
 382        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 383                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 384                 vma->vm_ops, vma->vm_file,
 385                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 386
 387        if (vma->vm_ops && vma->vm_ops->set_policy)
 388                err = vma->vm_ops->set_policy(vma, new);
 389        if (!err) {
 390                mpol_get(new);
 391                vma->vm_policy = new;
 392                mpol_free(old);
 393        }
 394        return err;
 395}
 396
 397/* Step 2: apply policy to a range and do splits. */
 398static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 399                       unsigned long end, struct mempolicy *new)
 400{
 401        struct vm_area_struct *next;
 402        int err;
 403
 404        err = 0;
 405        for (; vma && vma->vm_start < end; vma = next) {
 406                next = vma->vm_next;
 407                if (vma->vm_start < start)
 408                        err = split_vma(vma->vm_mm, vma, start, 1);
 409                if (!err && vma->vm_end > end)
 410                        err = split_vma(vma->vm_mm, vma, end, 0);
 411                if (!err)
 412                        err = policy_vma(vma, new);
 413                if (err)
 414                        break;
 415        }
 416        return err;
 417}
 418
 419static int contextualize_policy(int mode, nodemask_t *nodes)
 420{
 421        if (!nodes)
 422                return 0;
 423
 424        cpuset_update_task_memory_state();
 425        if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 426                return -EINVAL;
 427        return mpol_check_policy(mode, nodes);
 428}
 429
 430
 431/*
 432 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 433 * mempolicy.  Allows more rapid checking of this (combined perhaps
 434 * with other PF_* flag bits) on memory allocation hot code paths.
 435 *
 436 * If called from outside this file, the task 'p' should -only- be
 437 * a newly forked child not yet visible on the task list, because
 438 * manipulating the task flags of a visible task is not safe.
 439 *
 440 * The above limitation is why this routine has the funny name
 441 * mpol_fix_fork_child_flag().
 442 *
 443 * It is also safe to call this with a task pointer of current,
 444 * which the static wrapper mpol_set_task_struct_flag() does,
 445 * for use within this file.
 446 */
 447
 448void mpol_fix_fork_child_flag(struct task_struct *p)
 449{
 450        if (p->mempolicy)
 451                p->flags |= PF_MEMPOLICY;
 452        else
 453                p->flags &= ~PF_MEMPOLICY;
 454}
 455
/* Bring the PF_MEMPOLICY flag of the current task in line with its mempolicy. */
static void mpol_set_task_struct_flag(void)
{
        mpol_fix_fork_child_flag(current);
}
 460
 461/* Set the process memory policy */
 462long do_set_mempolicy(int mode, nodemask_t *nodes)
 463{
 464        struct mempolicy *new;
 465
 466        if (contextualize_policy(mode, nodes))
 467                return -EINVAL;
 468        new = mpol_new(mode, nodes);
 469        if (IS_ERR(new))
 470                return PTR_ERR(new);
 471        mpol_free(current->mempolicy);
 472        current->mempolicy = new;
 473        mpol_set_task_struct_flag();
 474        if (new && new->policy == MPOL_INTERLEAVE)
 475                current->il_next = first_node(new->v.nodes);
 476        return 0;
 477}
 478
 479/* Fill a zone bitmap for a policy */
 480static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 481{
 482        int i;
 483
 484        nodes_clear(*nodes);
 485        switch (p->policy) {
 486        case MPOL_BIND:
 487                for (i = 0; p->v.zonelist->zones[i]; i++)
 488                        node_set(zone_to_nid(p->v.zonelist->zones[i]),
 489                                *nodes);
 490                break;
 491        case MPOL_DEFAULT:
 492                break;
 493        case MPOL_INTERLEAVE:
 494                *nodes = p->v.nodes;
 495                break;
 496        case MPOL_PREFERRED:
 497                /* or use current node instead of online map? */
 498                if (p->v.preferred_node < 0)
 499                        *nodes = node_online_map;
 500                else
 501                        node_set(p->v.preferred_node, *nodes);
 502                break;
 503        default:
 504                BUG();
 505        }
 506}
 507
 508static int lookup_node(struct mm_struct *mm, unsigned long addr)
 509{
 510        struct page *p;
 511        int err;
 512
 513        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 514        if (err >= 0) {
 515                err = page_to_nid(p);
 516                put_page(p);
 517        }
 518        return err;
 519}
 520
/*
 * Retrieve NUMA policy.
 *
 * @policy: out; receives the policy mode, or a node id when MPOL_F_NODE
 *          is set.
 * @nmask:  out, may be NULL; filled from the policy via get_zonemask().
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set.
 * @flags:  any combination of MPOL_F_NODE and MPOL_F_ADDR.
 *
 * Without MPOL_F_ADDR the current task's policy is used.  With it, the
 * policy of the vma containing @addr is used, obtained through the vma's
 * get_policy() operation when it has one.  A missing policy is reported
 * as default_policy.
 *
 * MPOL_F_NODE together with MPOL_F_ADDR reports the node of the page at
 * @addr.  MPOL_F_NODE alone is only valid when the task policy is
 * MPOL_INTERLEAVE, and reports current->il_next.
 *
 * Returns 0 on success, -EINVAL for bad flag/argument combinations,
 * -EFAULT if no vma contains @addr, or the error from lookup_node().
 */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
                        unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        cpuset_update_task_memory_state();
        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                /* From here on a non-NULL vma means mmap_sem is held. */
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        /* Report the node of the page at addr. */
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
                        *policy = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                *policy = pol->policy;

        /* Success path: drop mmap_sem here and clear vma so "out" skips it. */
        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }

        err = 0;
        if (nmask)
                get_zonemask(pol, nmask);

 out:
        /* Error path: vma is still set, so mmap_sem is still held. */
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}
 580
 581#ifdef CONFIG_MIGRATION
 582/*
 583 * page migration
 584 */
 585static void migrate_page_add(struct page *page, struct list_head *pagelist,
 586                                unsigned long flags)
 587{
 588        /*
 589         * Avoid migrating a page that is shared with others.
 590         */
 591        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 592                isolate_lru_page(page, pagelist);
 593}
 594
/*
 * Page allocation callback handed to migrate_pages() by migrate_to_node():
 * allocate one GFP_HIGHUSER page (order 0) on node @node.  @page and @x are
 * not used.
 */
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
        return alloc_pages_node(node, GFP_HIGHUSER, 0);
}
 599
 600/*
 601 * Migrate pages from one node to a target node.
 602 * Returns error or the number of pages not migrated.
 603 */
 604int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 605{
 606        nodemask_t nmask;
 607        LIST_HEAD(pagelist);
 608        int err = 0;
 609
 610        nodes_clear(nmask);
 611        node_set(source, nmask);
 612
 613        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 614                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 615
 616        if (!list_empty(&pagelist))
 617                err = migrate_pages(&pagelist, new_node_page, dest);
 618
 619        return err;
 620}
 621
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * mm->mmap_sem is held for reading for the whole operation.
 *
 * Returns the number of pages that could not be moved, or a negative
 * error from migrate_vmas() or migrate_to_node().
 */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        LIST_HEAD(pagelist);
        int busy = 0;
        int err = 0;
        nodemask_t tmp;

        down_read(&mm->mmap_sem);

        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
        if (err)
                goto out;

/*
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 * bit in 'tmp', and return that <source, dest> pair for migration.
 * The pair of nodemasks 'to' and 'from' define the map.
 *
 * If no pair of bits is found that way, fall back to picking some
 * pair of 'source' and 'dest' bits that are not the same.  If the
 * 'source' and 'dest' bits are the same, this represents a node
 * that will be migrating to itself, so no pages need move.
 *
 * If no bits are left in 'tmp', or if all remaining bits left
 * in 'tmp' correspond to the same bit in 'to', return false
 * (nothing left to migrate).
 *
 * This lets us pick a pair of nodes to migrate between, such that
 * if possible the dest node is not already occupied by some other
 * source node, minimizing the risk of overloading the memory on a
 * node that would happen if we migrated incoming memory to a node
 * before migrating outgoing memory from that same node.
 *
 * A single scan of tmp is sufficient.  As we go, we remember the
 * most recent <s, d> pair that moved (s != d).  If we find a pair
 * that not only moved, but what's better, moved to an empty slot
 * (d is not set in tmp), then we break out then, with that pair.
 * Otherwise when we finish scanning tmp, we at least have the
 * most recent <s, d> pair that moved.  If we get all the way through
 * the scan of tmp without finding any node that moved, much less
 * moved to an empty node, then there is nothing left worth migrating.
 */

        tmp = *from_nodes;
        while (!nodes_empty(tmp)) {
                int s,d;
                int source = -1;        /* -1: no movable pair found yet */
                int dest = 0;

                for_each_node_mask(s, tmp) {
                        d = node_remap(s, *from_nodes, *to_nodes);
                        if (s == d)
                                continue;

                        source = s;     /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                if (source == -1)
                        break;

                /* Each source node is migrated at most once. */
                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                /* A positive result counts pages that were not migrated. */
                if (err > 0)
                        busy += err;
                if (err < 0)
                        break;
        }
out:
        up_read(&mm->mmap_sem);
        if (err < 0)
                return err;
        return busy;

}
 708
 709static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 710{
 711        struct vm_area_struct *vma = (struct vm_area_struct *)private;
 712
 713        return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 714}
 715#else
 716
/* Without CONFIG_MIGRATION no page is ever queued for migration. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}
 721
/* Without CONFIG_MIGRATION page migration is not available: always -ENOSYS. */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        return -ENOSYS;
}
 727
/* Without CONFIG_MIGRATION this allocation callback never supplies a page. */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
        return NULL;
}
 732#endif
 733
 734long do_mbind(unsigned long start, unsigned long len,
 735                unsigned long mode, nodemask_t *nmask, unsigned long flags)
 736{
 737        struct vm_area_struct *vma;
 738        struct mm_struct *mm = current->mm;
 739        struct mempolicy *new;
 740        unsigned long end;
 741        int err;
 742        LIST_HEAD(pagelist);
 743
 744        if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 745                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 746            || mode > MPOL_MAX)
 747                return -EINVAL;
 748        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 749                return -EPERM;
 750
 751        if (start & ~PAGE_MASK)
 752                return -EINVAL;
 753
 754        if (mode == MPOL_DEFAULT)
 755                flags &= ~MPOL_MF_STRICT;
 756
 757        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 758        end = start + len;
 759
 760        if (end < start)
 761                return -EINVAL;
 762        if (end == start)
 763                return 0;
 764
 765        if (mpol_check_policy(mode, nmask))
 766                return -EINVAL;
 767
 768        new = mpol_new(mode, nmask);
 769        if (IS_ERR(new))
 770                return PTR_ERR(new);
 771
 772        /*
 773         * If we are using the default policy then operation
 774         * on discontinuous address spaces is okay after all
 775         */
 776        if (!new)
 777                flags |= MPOL_MF_DISCONTIG_OK;
 778
 779        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 780                        mode,nodes_addr(nodes)[0]);
 781
 782        down_write(&mm->mmap_sem);
 783        vma = check_range(mm, start, end, nmask,
 784                          flags | MPOL_MF_INVERT, &pagelist);
 785
 786        err = PTR_ERR(vma);
 787        if (!IS_ERR(vma)) {
 788                int nr_failed = 0;
 789
 790                err = mbind_range(vma, start, end, new);
 791
 792                if (!list_empty(&pagelist))
 793                        nr_failed = migrate_pages(&pagelist, new_vma_page,
 794                                                (unsigned long)vma);
 795
 796                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 797                        err = -EIO;
 798        }
 799
 800        up_write(&mm->mmap_sem);
 801        mpol_free(new);
 802        return err;
 803}
 804
 805/*
 806 * User space interface with variable sized bitmaps for nodelists.
 807 */
 808
/*
 * Copy a node mask from user space.
 *
 * @nodes:   kernel nodemask to fill; it is cleared first.
 * @nmask:   user pointer to the bitmap; may be NULL.
 * @maxnode: size given by the caller; it is decremented once before use
 *           and the result is the number of bits taken from @nmask.
 *
 * Words beyond the BITS_TO_LONGS(MAX_NUMNODES) longs the kernel stores
 * are accepted only if every bit read from them is zero.
 *
 * Returns 0 on success (also when there is nothing to copy), -EINVAL if
 * the mask is too large or has bits set in the unsupported part, or
 * -EFAULT if the user memory cannot be read.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        /*
         * maxnode is unsigned: a value of 0 wraps around here and is then
         * rejected by the size check below (unless nmask is NULL).
         */
        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return 0;
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        /* endmask selects the valid bits of the last user-supplied long. */
        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                /* Only the bits within maxnode count. */
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                /* Copy only the part the kernel stores, all bits valid. */
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        /* Discard bits beyond maxnode in the last copied long. */
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        return 0;
}
 854
 855/* Copy a kernel node mask to user space */
 856static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 857                              nodemask_t *nodes)
 858{
 859        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 860        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 861
 862        if (copy > nbytes) {
 863                if (copy > PAGE_SIZE)
 864                        return -EINVAL;
 865                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 866                        return -EFAULT;
 867                copy = nbytes;
 868        }
 869        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 870}
 871
 872asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 873                        unsigned long mode,
 874                        unsigned long __user *nmask, unsigned long maxnode,
 875                        unsigned flags)
 876{
 877        nodemask_t nodes;
 878        int err;
 879
 880        err = get_nodes(&nodes, nmask, maxnode);
 881        if (err)
 882                return err;
 883#ifdef CONFIG_CPUSETS
 884        /* Restrict the nodes to the allowed nodes in the cpuset */
 885        nodes_and(nodes, nodes, current->mems_allowed);
 886#endif
 887        return do_mbind(start, len, mode, &nodes, flags);
 888}
 889
 890/* Set the process memory policy */
 891asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 892                unsigned long maxnode)
 893{
 894        int err;
 895        nodemask_t nodes;
 896
 897        if (mode < 0 || mode > MPOL_MAX)
 898                return -EINVAL;
 899        err = get_nodes(&nodes, nmask, maxnode);
 900        if (err)
 901                return err;
 902        return do_set_mempolicy(mode, &nodes);
 903}
 904
 905asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 906                const unsigned long __user *old_nodes,
 907                const unsigned long __user *new_nodes)
 908{
 909        struct mm_struct *mm;
 910        struct task_struct *task;
 911        nodemask_t old;
 912        nodemask_t new;
 913        nodemask_t task_nodes;
 914        int err;
 915
 916        err = get_nodes(&old, old_nodes, maxnode);
 917        if (err)
 918                return err;
 919
 920        err = get_nodes(&new, new_nodes, maxnode);
 921        if (err)
 922                return err;
 923
 924        /* Find the mm_struct */
 925        read_lock(&tasklist_lock);
 926        task = pid ? find_task_by_pid(pid) : current;
 927        if (!task) {
 928                read_unlock(&tasklist_lock);
 929                return -ESRCH;
 930        }
 931        mm = get_task_mm(task);
 932        read_unlock(&tasklist_lock);
 933
 934        if (!mm)
 935                return -EINVAL;
 936
 937        /*
 938         * Check if this process has the right to modify the specified
 939         * process. The right exists if the process has administrative
 940         * capabilities, superuser privileges or the same
 941         * userid as the target process.
 942         */
 943        if ((current->euid != task->suid) && (current->euid != task->uid) &&
 944            (current->uid != task->suid) && (current->uid != task->uid) &&
 945            !capable(CAP_SYS_NICE)) {
 946                err = -EPERM;
 947                goto out;
 948        }
 949
 950        task_nodes = cpuset_mems_allowed(task);
 951        /* Is the user allowed to access the target nodes? */
 952        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 953                err = -EPERM;
 954                goto out;
 955        }
 956
 957        err = security_task_movememory(task);
 958        if (err)
 959                goto out;
 960
 961        err = do_migrate_pages(mm, &old, &new,
 962                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 963out:
 964        mmput(mm);
 965        return err;
 966}
 967
 968
 969/* Retrieve NUMA policy */
 970asmlinkage long sys_get_mempolicy(int __user *policy,
 971                                unsigned long __user *nmask,
 972                                unsigned long maxnode,
 973                                unsigned long addr, unsigned long flags)
 974{
 975        int err, pval;
 976        nodemask_t nodes;
 977
 978        if (nmask != NULL && maxnode < MAX_NUMNODES)
 979                return -EINVAL;
 980
 981        err = do_get_mempolicy(&pval, &nodes, addr, flags);
 982
 983        if (err)
 984                return err;
 985
 986        if (policy && put_user(pval, policy))
 987                return -EFAULT;
 988
 989        if (nmask)
 990                err = copy_nodes_to_user(nmask, maxnode, &nodes);
 991
 992        return err;
 993}
 994
 995#ifdef CONFIG_COMPAT
 996
 997asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 998                                     compat_ulong_t __user *nmask,
 999                                     compat_ulong_t maxnode,
1000                                     compat_ulong_t addr, compat_ulong_t flags)
1001{
1002        long err;
1003        unsigned long __user *nm = NULL;
1004        unsigned long nr_bits, alloc_size;
1005        DECLARE_BITMAP(bm, MAX_NUMNODES);
1006
1007        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1008        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1009
1010        if (nmask)
1011                nm = compat_alloc_user_space(alloc_size);
1012
1013        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1014
1015        if (!err && nmask) {
1016                err = copy_from_user(bm, nm, alloc_size);
1017                /* ensure entire bitmap is zeroed */
1018                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1019                err |= compat_put_bitmap(nmask, bm, nr_bits);
1020        }
1021
1022        return err;
1023}
1024
1025asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1026                                     compat_ulong_t maxnode)
1027{
1028        long err = 0;
1029        unsigned long __user *nm = NULL;
1030        unsigned long nr_bits, alloc_size;
1031        DECLARE_BITMAP(bm, MAX_NUMNODES);
1032
1033        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1034        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1035
1036        if (nmask) {
1037                err = compat_get_bitmap(bm, nmask, nr_bits);
1038                nm = compat_alloc_user_space(alloc_size);
1039                err |= copy_to_user(nm, bm, alloc_size);
1040        }
1041
1042        if (err)
1043                return -EFAULT;
1044
1045        return sys_set_mempolicy(mode, nm, nr_bits+1);
1046}
1047
1048asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1049                             compat_ulong_t mode, compat_ulong_t __user *nmask,
1050                             compat_ulong_t maxnode, compat_ulong_t flags)
1051{
1052        long err = 0;
1053        unsigned long __user *nm = NULL;
1054        unsigned long nr_bits, alloc_size;
1055        nodemask_t bm;
1056
1057        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1058        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1059
1060        if (nmask) {
1061                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1062                nm = compat_alloc_user_space(alloc_size);
1063                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1064        }
1065
1066        if (err)
1067                return -EFAULT;
1068
1069        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1070}
1071
1072#endif
1073
1074/* Return effective policy for a VMA */
1075static struct mempolicy * get_vma_policy(struct task_struct *task,
1076                struct vm_area_struct *vma, unsigned long addr)
1077{
1078        struct mempolicy *pol = task->mempolicy;
1079
1080        if (vma) {
1081                if (vma->vm_ops && vma->vm_ops->get_policy)
1082                        pol = vma->vm_ops->get_policy(vma, addr);
1083                else if (vma->vm_policy &&
1084                                vma->vm_policy->policy != MPOL_DEFAULT)
1085                        pol = vma->vm_policy;
1086        }
1087        if (!pol)
1088                pol = &default_policy;
1089        return pol;
1090}
1091
1092/* Return a zonelist representing a mempolicy */
1093static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1094{
1095        int nd;
1096
1097        switch (policy->policy) {
1098        case MPOL_PREFERRED:
1099                nd = policy->v.preferred_node;
1100                if (nd < 0)
1101                        nd = numa_node_id();
1102                break;
1103        case MPOL_BIND:
1104                /* Lower zones don't get a policy applied */
1105                /* Careful: current->mems_allowed might have moved */
1106                if (gfp_zone(gfp) >= policy_zone)
1107                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1108                                return policy->v.zonelist;
1109                /*FALL THROUGH*/
1110        case MPOL_INTERLEAVE: /* should not happen */
1111        case MPOL_DEFAULT:
1112                nd = numa_node_id();
1113                break;
1114        default:
1115                nd = 0;
1116                BUG();
1117        }
1118        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1119}
1120
1121/* Do dynamic interleaving for a process */
1122static unsigned interleave_nodes(struct mempolicy *policy)
1123{
1124        unsigned nid, next;
1125        struct task_struct *me = current;
1126
1127        nid = me->il_next;
1128        next = next_node(nid, policy->v.nodes);
1129        if (next >= MAX_NUMNODES)
1130                next = first_node(policy->v.nodes);
1131        me->il_next = next;
1132        return nid;
1133}
1134
1135/*
1136 * Depending on the memory policy provide a node from which to allocate the
1137 * next slab entry.
1138 */
1139unsigned slab_node(struct mempolicy *policy)
1140{
1141        int pol = policy ? policy->policy : MPOL_DEFAULT;
1142
1143        switch (pol) {
1144        case MPOL_INTERLEAVE:
1145                return interleave_nodes(policy);
1146
1147        case MPOL_BIND:
1148                /*
1149                 * Follow bind policy behavior and start allocation at the
1150                 * first node.
1151                 */
1152                return zone_to_nid(policy->v.zonelist->zones[0]);
1153
1154        case MPOL_PREFERRED:
1155                if (policy->v.preferred_node >= 0)
1156                        return policy->v.preferred_node;
1157                /* Fall through */
1158
1159        default:
1160                return numa_node_id();
1161        }
1162}
1163
1164/* Do static interleaving for a VMA with known offset. */
1165static unsigned offset_il_node(struct mempolicy *pol,
1166                struct vm_area_struct *vma, unsigned long off)
1167{
1168        unsigned nnodes = nodes_weight(pol->v.nodes);
1169        unsigned target = (unsigned)off % nnodes;
1170        int c;
1171        int nid = -1;
1172
1173        c = 0;
1174        do {
1175                nid = next_node(nid, pol->v.nodes);
1176                c++;
1177        } while (c <= target);
1178        return nid;
1179}
1180
1181/* Determine a node number for interleave */
1182static inline unsigned interleave_nid(struct mempolicy *pol,
1183                 struct vm_area_struct *vma, unsigned long addr, int shift)
1184{
1185        if (vma) {
1186                unsigned long off;
1187
1188                /*
1189                 * for small pages, there is no difference between
1190                 * shift and PAGE_SHIFT, so the bit-shift is safe.
1191                 * for huge pages, since vm_pgoff is in units of small
1192                 * pages, we need to shift off the always 0 bits to get
1193                 * a useful offset.
1194                 */
1195                BUG_ON(shift < PAGE_SHIFT);
1196                off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1197                off += (addr - vma->vm_start) >> shift;
1198                return offset_il_node(pol, vma, off);
1199        } else
1200                return interleave_nodes(pol);
1201}
1202
1203#ifdef CONFIG_HUGETLBFS
1204/* Return a zonelist suitable for a huge page allocation. */
1205struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1206{
1207        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1208
1209        if (pol->policy == MPOL_INTERLEAVE) {
1210                unsigned nid;
1211
1212                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1213                return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1214        }
1215        return zonelist_policy(GFP_HIGHUSER, pol);
1216}
1217#endif
1218
1219/* Allocate a page in interleaved policy.
1220   Own path because it needs to do special accounting. */
1221static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1222                                        unsigned nid)
1223{
1224        struct zonelist *zl;
1225        struct page *page;
1226
1227        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1228        page = __alloc_pages(gfp, order, zl);
1229        if (page && page_zone(page) == zl->zones[0])
1230                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1231        return page;
1232}
1233
1234/**
1235 *      alloc_page_vma  - Allocate a page for a VMA.
1236 *
1237 *      @gfp:
1238 *      %GFP_USER    user allocation.
1239 *      %GFP_KERNEL  kernel allocations,
1240 *      %GFP_HIGHMEM highmem/user allocations,
1241 *      %GFP_FS      allocation should not call back into a file system.
1242 *      %GFP_ATOMIC  don't sleep.
1243 *
1244 *      @vma:  Pointer to VMA or NULL if not available.
1245 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1246 *
1247 *      This function allocates a page from the kernel page pool and applies
1248 *      a NUMA policy associated with the VMA or the current process.
1249 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1250 *      mm_struct of the VMA to prevent it from going away. Should be used for
1251 *      all allocations for pages that will be mapped into
1252 *      user space. Returns NULL when no page can be allocated.
1253 *
1254 *      Should be called with the mm_sem of the vma hold.
1255 */
1256struct page *
1257alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1258{
1259        struct mempolicy *pol = get_vma_policy(current, vma, addr);
1260
1261        cpuset_update_task_memory_state();
1262
1263        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1264                unsigned nid;
1265
1266                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1267                return alloc_page_interleave(gfp, 0, nid);
1268        }
1269        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1270}
1271
1272/**
1273 *      alloc_pages_current - Allocate pages.
1274 *
1275 *      @gfp:
1276 *              %GFP_USER   user allocation,
1277 *              %GFP_KERNEL kernel allocation,
1278 *              %GFP_HIGHMEM highmem allocation,
1279 *              %GFP_FS     don't call back into a file system.
1280 *              %GFP_ATOMIC don't sleep.
1281 *      @order: Power of two of allocation size in pages. 0 is a single page.
1282 *
1283 *      Allocate a page from the kernel page pool.  When not in
1284 *      interrupt context and apply the current process NUMA policy.
1285 *      Returns NULL when no page can be allocated.
1286 *
1287 *      Don't call cpuset_update_task_memory_state() unless
1288 *      1) it's ok to take cpuset_sem (can WAIT), and
1289 *      2) allocating for current task (not interrupt).
1290 */
1291struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1292{
1293        struct mempolicy *pol = current->mempolicy;
1294
1295        if ((gfp & __GFP_WAIT) && !in_interrupt())
1296                cpuset_update_task_memory_state();
1297        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1298                pol = &default_policy;
1299        if (pol->policy == MPOL_INTERLEAVE)
1300                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1301        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1302}
1303EXPORT_SYMBOL(alloc_pages_current);
1304
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * also update_nodemask() in kernel/cpuset.c.
 */
void *cpuset_being_rebound;
1313
1314/* Slow path of a mempolicy copy */
1315struct mempolicy *__mpol_copy(struct mempolicy *old)
1316{
1317        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1318
1319        if (!new)
1320                return ERR_PTR(-ENOMEM);
1321        if (current_cpuset_is_being_rebound()) {
1322                nodemask_t mems = cpuset_mems_allowed(current);
1323                mpol_rebind_policy(old, &mems);
1324        }
1325        *new = *old;
1326        atomic_set(&new->refcnt, 1);
1327        if (new->policy == MPOL_BIND) {
1328                int sz = ksize(old->v.zonelist);
1329                new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1330                if (!new->v.zonelist) {
1331                        kmem_cache_free(policy_cache, new);
1332                        return ERR_PTR(-ENOMEM);
1333                }
1334        }
1335        return new;
1336}
1337
1338/* Slow path of a mempolicy comparison */
1339int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1340{
1341        if (!a || !b)
1342                return 0;
1343        if (a->policy != b->policy)
1344                return 0;
1345        switch (a->policy) {
1346        case MPOL_DEFAULT:
1347                return 1;
1348        case MPOL_INTERLEAVE:
1349                return nodes_equal(a->v.nodes, b->v.nodes);
1350        case MPOL_PREFERRED:
1351                return a->v.preferred_node == b->v.preferred_node;
1352        case MPOL_BIND: {
1353                int i;
1354                for (i = 0; a->v.zonelist->zones[i]; i++)
1355                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1356                                return 0;
1357                return b->v.zonelist->zones[i] == NULL;
1358        }
1359        default:
1360                BUG();
1361                return 0;
1362        }
1363}
1364
1365/* Slow path of a mpol destructor. */
1366void __mpol_free(struct mempolicy *p)
1367{
1368        if (!atomic_dec_and_test(&p->refcnt))
1369                return;
1370        if (p->policy == MPOL_BIND)
1371                kfree(p->v.zonelist);
1372        p->policy = MPOL_DEFAULT;
1373        kmem_cache_free(policy_cache, p);
1374}
1375
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has the shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any access to the tree.
 */
1384
1385/* lookup first element intersecting start-end */
1386/* Caller holds sp->lock */
1387static struct sp_node *
1388sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1389{
1390        struct rb_node *n = sp->root.rb_node;
1391
1392        while (n) {
1393                struct sp_node *p = rb_entry(n, struct sp_node, nd);
1394
1395                if (start >= p->end)
1396                        n = n->rb_right;
1397                else if (end <= p->start)
1398                        n = n->rb_left;
1399                else
1400                        break;
1401        }
1402        if (!n)
1403                return NULL;
1404        for (;;) {
1405                struct sp_node *w = NULL;
1406                struct rb_node *prev = rb_prev(n);
1407                if (!prev)
1408                        break;
1409                w = rb_entry(prev, struct sp_node, nd);
1410                if (w->end <= start)
1411                        break;
1412                n = prev;
1413        }
1414        return rb_entry(n, struct sp_node, nd);
1415}
1416
1417/* Insert a new shared policy into the list. */
1418/* Caller holds sp->lock */
1419static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1420{
1421        struct rb_node **p = &sp->root.rb_node;
1422        struct rb_node *parent = NULL;
1423        struct sp_node *nd;
1424
1425        while (*p) {
1426                parent = *p;
1427                nd = rb_entry(parent, struct sp_node, nd);
1428                if (new->start < nd->start)
1429                        p = &(*p)->rb_left;
1430                else if (new->end > nd->end)
1431                        p = &(*p)->rb_right;
1432                else
1433                        BUG();
1434        }
1435        rb_link_node(&new->nd, parent, p);
1436        rb_insert_color(&new->nd, &sp->root);
1437        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1438                 new->policy ? new->policy->policy : 0);
1439}
1440
1441/* Find shared policy intersecting idx */
1442struct mempolicy *
1443mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1444{
1445        struct mempolicy *pol = NULL;
1446        struct sp_node *sn;
1447
1448        if (!sp->root.rb_node)
1449                return NULL;
1450        spin_lock(&sp->lock);
1451        sn = sp_lookup(sp, idx, idx+1);
1452        if (sn) {
1453                mpol_get(sn->policy);
1454                pol = sn->policy;
1455        }
1456        spin_unlock(&sp->lock);
1457        return pol;
1458}
1459
1460static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1461{
1462        PDprintk("deleting %lx-l%x\n", n->start, n->end);
1463        rb_erase(&n->nd, &sp->root);
1464        mpol_free(n->policy);
1465        kmem_cache_free(sn_cache, n);
1466}
1467
1468struct sp_node *
1469sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1470{
1471        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1472
1473        if (!n)
1474                return NULL;
1475        n->start = start;
1476        n->end = end;
1477        mpol_get(pol);
1478        n->policy = pol;
1479        return n;
1480}
1481
/*
 * Replace a policy range.
 *
 * Removes or trims every existing node overlapping [start, end) and
 * then, if @new is not NULL, inserts @new.  An old node that spans the
 * whole new range is split in two; the extra node for its upper part
 * is allocated with the lock dropped, after which the scan restarts.
 *
 * Returns 0 on success or -ENOMEM if that extra node cannot be
 * allocated (in which case @new has not been inserted).
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: n may be deleted below. */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* Old node starts inside the new range. */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        /*
                                         * NOTE(review): n->end and n->policy
                                         * are read after sp->lock has been
                                         * released -- confirm n cannot be
                                         * changed or freed concurrently.
                                         */
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        /* A preallocated split node that turned out to be unneeded. */
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}
1529
1530void mpol_shared_policy_init(struct shared_policy *info, int policy,
1531                                nodemask_t *policy_nodes)
1532{
1533        info->root = RB_ROOT;
1534        spin_lock_init(&info->lock);
1535
1536        if (policy != MPOL_DEFAULT) {
1537                struct mempolicy *newpol;
1538
1539                /* Falls back to MPOL_DEFAULT on any error */
1540                newpol = mpol_new(policy, policy_nodes);
1541                if (!IS_ERR(newpol)) {
1542                        /* Create pseudo-vma that contains just the policy */
1543                        struct vm_area_struct pvma;
1544
1545                        memset(&pvma, 0, sizeof(struct vm_area_struct));
1546                        /* Policy covers entire file */
1547                        pvma.vm_end = TASK_SIZE;
1548                        mpol_set_shared_policy(info, &pvma, newpol);
1549                        mpol_free(newpol);
1550                }
1551        }
1552}
1553
1554int mpol_set_shared_policy(struct shared_policy *info,
1555                        struct vm_area_struct *vma, struct mempolicy *npol)
1556{
1557        int err;
1558        struct sp_node *new = NULL;
1559        unsigned long sz = vma_pages(vma);
1560
1561        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1562                 vma->vm_pgoff,
1563                 sz, npol? npol->policy : -1,
1564                npol ? nodes_addr(npol->v.nodes)[0] : -1);
1565
1566        if (npol) {
1567                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1568                if (!new)
1569                        return -ENOMEM;
1570        }
1571        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1572        if (err && new)
1573                kmem_cache_free(sn_cache, new);
1574        return err;
1575}
1576
1577/* Free a backing policy store on inode delete. */
1578void mpol_free_shared_policy(struct shared_policy *p)
1579{
1580        struct sp_node *n;
1581        struct rb_node *next;
1582
1583        if (!p->root.rb_node)
1584                return;
1585        spin_lock(&p->lock);
1586        next = rb_first(&p->root);
1587        while (next) {
1588                n = rb_entry(next, struct sp_node, nd);
1589                next = rb_next(&n->nd);
1590                rb_erase(&n->nd, &p->root);
1591                mpol_free(n->policy);
1592                kmem_cache_free(sn_cache, n);
1593        }
1594        spin_unlock(&p->lock);
1595}
1596
1597/* assumes fs == KERNEL_DS */
1598void __init numa_policy_init(void)
1599{
1600        policy_cache = kmem_cache_create("numa_policy",
1601                                         sizeof(struct mempolicy),
1602                                         0, SLAB_PANIC, NULL, NULL);
1603
1604        sn_cache = kmem_cache_create("shared_policy_node",
1605                                     sizeof(struct sp_node),
1606                                     0, SLAB_PANIC, NULL, NULL);
1607
1608        /* Set interleaving policy for system init. This way not all
1609           the data structures allocated at system boot end up in node zero. */
1610
1611        if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1612                printk("numa_policy_init: interleaving failed\n");
1613}
1614
/*
 * Reset policy of current process to default.
 * The return value of do_set_mempolicy() is ignored.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, NULL);
}
1620
1621/* Migrate a policy to a different set of nodes */
1622void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1623{
1624        nodemask_t *mpolmask;
1625        nodemask_t tmp;
1626
1627        if (!pol)
1628                return;
1629        mpolmask = &pol->cpuset_mems_allowed;
1630        if (nodes_equal(*mpolmask, *newmask))
1631                return;
1632
1633        switch (pol->policy) {
1634        case MPOL_DEFAULT:
1635                break;
1636        case MPOL_INTERLEAVE:
1637                nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1638                pol->v.nodes = tmp;
1639                *mpolmask = *newmask;
1640                current->il_next = node_remap(current->il_next,
1641                                                *mpolmask, *newmask);
1642                break;
1643        case MPOL_PREFERRED:
1644                pol->v.preferred_node = node_remap(pol->v.preferred_node,
1645                                                *mpolmask, *newmask);
1646                *mpolmask = *newmask;
1647                break;
1648        case MPOL_BIND: {
1649                nodemask_t nodes;
1650                struct zone **z;
1651                struct zonelist *zonelist;
1652
1653                nodes_clear(nodes);
1654                for (z = pol->v.zonelist->zones; *z; z++)
1655                        node_set(zone_to_nid(*z), nodes);
1656                nodes_remap(tmp, nodes, *mpolmask, *newmask);
1657                nodes = tmp;
1658
1659                zonelist = bind_zonelist(&nodes);
1660
1661                /* If no mem, then zonelist is NULL and we keep old zonelist.
1662                 * If that old zonelist has no remaining mems_allowed nodes,
1663                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1664                 */
1665
1666                if (!IS_ERR(zonelist)) {
1667                        /* Good - got mem - substitute new zonelist */
1668                        kfree(pol->v.zonelist);
1669                        pol->v.zonelist = zonelist;
1670                }
1671                *mpolmask = *newmask;
1672                break;
1673        }
1674        default:
1675                BUG();
1676                break;
1677        }
1678}
1679
/*
 * Wrapper for mpol_rebind_policy() that just requires a task
 * pointer, and rebinds that task's mempolicy to the nodes in @new.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        mpol_rebind_policy(tsk->mempolicy, new);
}
1689
1690/*
1691 * Rebind each vma in mm to new nodemask.
1692 *
1693 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1694 */
1695
1696void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1697{
1698        struct vm_area_struct *vma;
1699
1700        down_write(&mm->mmap_sem);
1701        for (vma = mm->mmap; vma; vma = vma->vm_next)
1702                mpol_rebind_policy(vma->vm_policy, new);
1703        up_write(&mm->mmap_sem);
1704}
1705
1706/*
1707 * Display pages allocated per node and memory policy via /proc.
1708 */
1709
/*
 * Printable policy names, indexed by policy mode in mpol_to_str().
 * NOTE(review): the order presumably matches the numeric values of
 * MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE -- confirm
 * against the header that defines them.
 */
static const char * const policy_types[] =
        { "default", "prefer", "bind", "interleave" };
1712
1713/*
1714 * Convert a mempolicy into a string.
1715 * Returns the number of characters in buffer (if positive)
1716 * or an error (negative)
1717 */
1718static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1719{
1720        char *p = buffer;
1721        int l;
1722        nodemask_t nodes;
1723        int mode = pol ? pol->policy : MPOL_DEFAULT;
1724
1725        switch (mode) {
1726        case MPOL_DEFAULT:
1727                nodes_clear(nodes);
1728                break;
1729
1730        case MPOL_PREFERRED:
1731                nodes_clear(nodes);
1732                node_set(pol->v.preferred_node, nodes);
1733                break;
1734
1735        case MPOL_BIND:
1736                get_zonemask(pol, &nodes);
1737                break;
1738
1739        case MPOL_INTERLEAVE:
1740                nodes = pol->v.nodes;
1741                break;
1742
1743        default:
1744                BUG();
1745                return -EFAULT;
1746        }
1747
1748        l = strlen(policy_types[mode]);
1749        if (buffer + maxlen < p + l + 1)
1750                return -ENOSPC;
1751
1752        strcpy(p, policy_types[mode]);
1753        p += l;
1754
1755        if (!nodes_empty(nodes)) {
1756                if (buffer + maxlen < p + 2)
1757                        return -ENOSPC;
1758                *p++ = '=';
1759                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1760        }
1761        return p - buffer;
1762}
1763
1764struct numa_maps {
1765        unsigned long pages;
1766        unsigned long anon;
1767        unsigned long active;
1768        unsigned long writeback;
1769        unsigned long mapcount_max;
1770        unsigned long dirty;
1771        unsigned long swapcache;
1772        unsigned long node[MAX_NUMNODES];
1773};
1774
1775static void gather_stats(struct page *page, void *private, int pte_dirty)
1776{
1777        struct numa_maps *md = private;
1778        int count = page_mapcount(page);
1779
1780        md->pages++;
1781        if (pte_dirty || PageDirty(page))
1782                md->dirty++;
1783
1784        if (PageSwapCache(page))
1785                md->swapcache++;
1786
1787        if (PageActive(page))
1788                md->active++;
1789
1790        if (PageWriteback(page))
1791                md->writeback++;
1792
1793        if (PageAnon(page))
1794                md->anon++;
1795
1796        if (count > md->mapcount_max)
1797                md->mapcount_max = count;
1798
1799        md->node[page_to_nid(page)]++;
1800}
1801
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Walk [@start, @end) of @vma in HPAGE_SIZE steps and account every
 * populated huge page in @md via gather_stats().  Addresses without a
 * huge pte, with an empty pte, or without a page are skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
		struct page *page;
		pte_t pte;

		if (!ptep)
			continue;

		/*
		 * Read the entry once and use that snapshot for every
		 * check below, so the page and its dirty state are
		 * derived from the same pte value.
		 */
		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (!page)
			continue;

		gather_stats(page, md, pte_dirty(pte));
	}
}
#else
/* No huge page support configured: nothing to account. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
1835
1836int show_numa_map(struct seq_file *m, void *v)
1837{
1838        struct proc_maps_private *priv = m->private;
1839        struct vm_area_struct *vma = v;
1840        struct numa_maps *md;
1841        struct file *file = vma->vm_file;
1842        struct mm_struct *mm = vma->vm_mm;
1843        int n;
1844        char buffer[50];
1845
1846        if (!mm)
1847                return 0;
1848
1849        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1850        if (!md)
1851                return 0;
1852
1853        mpol_to_str(buffer, sizeof(buffer),
1854                            get_vma_policy(priv->task, vma, vma->vm_start));
1855
1856        seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1857
1858        if (file) {
1859                seq_printf(m, " file=");
1860                seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
1861        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1862                seq_printf(m, " heap");
1863        } else if (vma->vm_start <= mm->start_stack &&
1864                        vma->vm_end >= mm->start_stack) {
1865                seq_printf(m, " stack");
1866        }
1867
1868        if (is_vm_hugetlb_page(vma)) {
1869                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1870                seq_printf(m, " huge");
1871        } else {
1872                check_pgd_range(vma, vma->vm_start, vma->vm_end,
1873                                &node_online_map, MPOL_MF_STATS, md);
1874        }
1875
1876        if (!md->pages)
1877                goto out;
1878
1879        if (md->anon)
1880                seq_printf(m," anon=%lu",md->anon);
1881
1882        if (md->dirty)
1883                seq_printf(m," dirty=%lu",md->dirty);
1884
1885        if (md->pages != md->anon && md->pages != md->dirty)
1886                seq_printf(m, " mapped=%lu", md->pages);
1887
1888        if (md->mapcount_max > 1)
1889                seq_printf(m, " mapmax=%lu", md->mapcount_max);
1890
1891        if (md->swapcache)
1892                seq_printf(m," swapcache=%lu", md->swapcache);
1893
1894        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1895                seq_printf(m," active=%lu", md->active);
1896
1897        if (md->writeback)
1898                seq_printf(m," writeback=%lu", md->writeback);
1899
1900        for_each_online_node(n)
1901                if (md->node[n])
1902                        seq_printf(m, " N%d=%lu", n, md->node[n]);
1903out:
1904        seq_putc(m, '\n');
1905        kfree(md);
1906
1907        if (m->count < m->size)
1908                m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1909        return 0;
1910}
1911
1912
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.