linux/mm/mmap.c
<<
>>
Prefs
   1/*
   2 * mm/mmap.c
   3 *
   4 * Written by obz.
   5 *
   6 * Address space accounting code        <alan@lxorguk.ukuu.org.uk>
   7 */
   8
   9#include <linux/slab.h>
  10#include <linux/backing-dev.h>
  11#include <linux/mm.h>
  12#include <linux/shm.h>
  13#include <linux/mman.h>
  14#include <linux/pagemap.h>
  15#include <linux/swap.h>
  16#include <linux/syscalls.h>
  17#include <linux/capability.h>
  18#include <linux/init.h>
  19#include <linux/file.h>
  20#include <linux/fs.h>
  21#include <linux/personality.h>
  22#include <linux/security.h>
  23#include <linux/hugetlb.h>
  24#include <linux/profile.h>
  25#include <linux/export.h>
  26#include <linux/mount.h>
  27#include <linux/mempolicy.h>
  28#include <linux/rmap.h>
  29#include <linux/mmu_notifier.h>
  30#include <linux/perf_event.h>
  31#include <linux/audit.h>
  32#include <linux/khugepaged.h>
  33#include <linux/uprobes.h>
  34#include <linux/rbtree_augmented.h>
  35
  36#include <asm/uaccess.h>
  37#include <asm/cacheflush.h>
  38#include <asm/tlb.h>
  39#include <asm/mmu_context.h>
  40
  41#include "internal.h"
  42
  43#ifndef arch_mmap_check
  44#define arch_mmap_check(addr, len, flags)       (0)
  45#endif
  46
  47#ifndef arch_rebalance_pgtables
  48#define arch_rebalance_pgtables(addr, len)              (addr)
  49#endif
  50
  51static void unmap_region(struct mm_struct *mm,
  52                struct vm_area_struct *vma, struct vm_area_struct *prev,
  53                unsigned long start, unsigned long end);
  54
  55/* description of effects of mapping type and prot in current implementation.
  56 * this is due to the limited x86 page protection hardware.  The expected
  57 * behavior is in parens:
  58 *
  59 * map_type     prot
  60 *              PROT_NONE       PROT_READ       PROT_WRITE      PROT_EXEC
  61 * MAP_SHARED   r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
  62 *              w: (no) no      w: (no) no      w: (yes) yes    w: (no) no
  63 *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
  64 *              
  65 * MAP_PRIVATE  r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
  66 *              w: (no) no      w: (no) no      w: (copy) copy  w: (no) no
  67 *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
  68 *
  69 */
  70pgprot_t protection_map[16] = {
  71        __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
  72        __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
  73};
  74
  75pgprot_t vm_get_page_prot(unsigned long vm_flags)
  76{
  77        return __pgprot(pgprot_val(protection_map[vm_flags &
  78                                (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
  79                        pgprot_val(arch_vm_get_page_prot(vm_flags)));
  80}
  81EXPORT_SYMBOL(vm_get_page_prot);
  82
  83int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */
  84int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
  85int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
  86/*
  87 * Make sure vm_committed_as in one cacheline and not cacheline shared with
  88 * other variables. It can be updated by several CPUs frequently.
  89 */
  90struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
  91
  92/*
  93 * The global memory commitment made in the system can be a metric
  94 * that can be used to drive ballooning decisions when Linux is hosted
  95 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
  96 * balancing memory across competing virtual machines that are hosted.
  97 * Several metrics drive this policy engine including the guest reported
  98 * memory commitment.
  99 */
 100unsigned long vm_memory_committed(void)
 101{
 102        return percpu_counter_read_positive(&vm_committed_as);
 103}
 104EXPORT_SYMBOL_GPL(vm_memory_committed);
 105
 106/*
 107 * Check that a process has enough memory to allocate a new virtual
 108 * mapping. 0 means there is enough memory for the allocation to
 109 * succeed and -ENOMEM implies there is not.
 110 *
 111 * We currently support three overcommit policies, which are set via the
 112 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 113 *
 114 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 115 * Additional code 2002 Jul 20 by Robert Love.
 116 *
 117 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 118 *
 119 * Note this is a helper function intended to be used by LSMs which
 120 * wish to use this logic.
 121 */
 122int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 123{
 124        unsigned long free, allowed;
 125
 126        vm_acct_memory(pages);
 127
 128        /*
 129         * Sometimes we want to use more memory than we have
 130         */
 131        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
 132                return 0;
 133
 134        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
 135                free = global_page_state(NR_FREE_PAGES);
 136                free += global_page_state(NR_FILE_PAGES);
 137
 138                /*
 139                 * shmem pages shouldn't be counted as free in this
 140                 * case, they can't be purged, only swapped out, and
 141                 * that won't affect the overall amount of available
 142                 * memory in the system.
 143                 */
 144                free -= global_page_state(NR_SHMEM);
 145
 146                free += nr_swap_pages;
 147
 148                /*
 149                 * Any slabs which are created with the
 150                 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
 151                 * which are reclaimable, under pressure.  The dentry
 152                 * cache and most inode caches should fall into this
 153                 */
 154                free += global_page_state(NR_SLAB_RECLAIMABLE);
 155
 156                /*
 157                 * Leave reserved pages. The pages are not for anonymous pages.
 158                 */
 159                if (free <= totalreserve_pages)
 160                        goto error;
 161                else
 162                        free -= totalreserve_pages;
 163
 164                /*
 165                 * Leave the last 3% for root
 166                 */
 167                if (!cap_sys_admin)
 168                        free -= free / 32;
 169
 170                if (free > pages)
 171                        return 0;
 172
 173                goto error;
 174        }
 175
 176        allowed = (totalram_pages - hugetlb_total_pages())
 177                * sysctl_overcommit_ratio / 100;
 178        /*
 179         * Leave the last 3% for root
 180         */
 181        if (!cap_sys_admin)
 182                allowed -= allowed / 32;
 183        allowed += total_swap_pages;
 184
 185        /* Don't let a single process grow too big:
 186           leave 3% of the size of this process for other processes */
 187        if (mm)
 188                allowed -= mm->total_vm / 32;
 189
 190        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
 191                return 0;
 192error:
 193        vm_unacct_memory(pages);
 194
 195        return -ENOMEM;
 196}
 197
 198/*
 199 * Requires inode->i_mapping->i_mmap_mutex
 200 */
 201static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 202                struct file *file, struct address_space *mapping)
 203{
 204        if (vma->vm_flags & VM_DENYWRITE)
 205                atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
 206        if (vma->vm_flags & VM_SHARED)
 207                mapping->i_mmap_writable--;
 208
 209        flush_dcache_mmap_lock(mapping);
 210        if (unlikely(vma->vm_flags & VM_NONLINEAR))
 211                list_del_init(&vma->shared.nonlinear);
 212        else
 213                vma_interval_tree_remove(vma, &mapping->i_mmap);
 214        flush_dcache_mmap_unlock(mapping);
 215}
 216
 217/*
 218 * Unlink a file-based vm structure from its interval tree, to hide
 219 * vma from rmap and vmtruncate before freeing its page tables.
 220 */
 221void unlink_file_vma(struct vm_area_struct *vma)
 222{
 223        struct file *file = vma->vm_file;
 224
 225        if (file) {
 226                struct address_space *mapping = file->f_mapping;
 227                mutex_lock(&mapping->i_mmap_mutex);
 228                __remove_shared_vm_struct(vma, file, mapping);
 229                mutex_unlock(&mapping->i_mmap_mutex);
 230        }
 231}
 232
 233/*
 234 * Close a vm structure and free it, returning the next.
 235 */
 236static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 237{
 238        struct vm_area_struct *next = vma->vm_next;
 239
 240        might_sleep();
 241        if (vma->vm_ops && vma->vm_ops->close)
 242                vma->vm_ops->close(vma);
 243        if (vma->vm_file)
 244                fput(vma->vm_file);
 245        mpol_put(vma_policy(vma));
 246        kmem_cache_free(vm_area_cachep, vma);
 247        return next;
 248}
 249
 250static unsigned long do_brk(unsigned long addr, unsigned long len);
 251
 252SYSCALL_DEFINE1(brk, unsigned long, brk)
 253{
 254        unsigned long rlim, retval;
 255        unsigned long newbrk, oldbrk;
 256        struct mm_struct *mm = current->mm;
 257        unsigned long min_brk;
 258
 259        down_write(&mm->mmap_sem);
 260
 261#ifdef CONFIG_COMPAT_BRK
 262        /*
 263         * CONFIG_COMPAT_BRK can still be overridden by setting
 264         * randomize_va_space to 2, which will still cause mm->start_brk
 265         * to be arbitrarily shifted
 266         */
 267        if (current->brk_randomized)
 268                min_brk = mm->start_brk;
 269        else
 270                min_brk = mm->end_data;
 271#else
 272        min_brk = mm->start_brk;
 273#endif
 274        if (brk < min_brk)
 275                goto out;
 276
 277        /*
 278         * Check against rlimit here. If this check is done later after the test
 279         * of oldbrk with newbrk then it can escape the test and let the data
 280         * segment grow beyond its set limit the in case where the limit is
 281         * not page aligned -Ram Gupta
 282         */
 283        rlim = rlimit(RLIMIT_DATA);
 284        if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
 285                        (mm->end_data - mm->start_data) > rlim)
 286                goto out;
 287
 288        newbrk = PAGE_ALIGN(brk);
 289        oldbrk = PAGE_ALIGN(mm->brk);
 290        if (oldbrk == newbrk)
 291                goto set_brk;
 292
 293        /* Always allow shrinking brk. */
 294        if (brk <= mm->brk) {
 295                if (!do_munmap(mm, newbrk, oldbrk-newbrk))
 296                        goto set_brk;
 297                goto out;
 298        }
 299
 300        /* Check against existing mmap mappings. */
 301        if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
 302                goto out;
 303
 304        /* Ok, looks good - let it rip. */
 305        if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
 306                goto out;
 307set_brk:
 308        mm->brk = brk;
 309out:
 310        retval = mm->brk;
 311        up_write(&mm->mmap_sem);
 312        return retval;
 313}
 314
 315static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 316{
 317        unsigned long max, subtree_gap;
 318        max = vma->vm_start;
 319        if (vma->vm_prev)
 320                max -= vma->vm_prev->vm_end;
 321        if (vma->vm_rb.rb_left) {
 322                subtree_gap = rb_entry(vma->vm_rb.rb_left,
 323                                struct vm_area_struct, vm_rb)->rb_subtree_gap;
 324                if (subtree_gap > max)
 325                        max = subtree_gap;
 326        }
 327        if (vma->vm_rb.rb_right) {
 328                subtree_gap = rb_entry(vma->vm_rb.rb_right,
 329                                struct vm_area_struct, vm_rb)->rb_subtree_gap;
 330                if (subtree_gap > max)
 331                        max = subtree_gap;
 332        }
 333        return max;
 334}
 335
 336#ifdef CONFIG_DEBUG_VM_RB
 337static int browse_rb(struct rb_root *root)
 338{
 339        int i = 0, j, bug = 0;
 340        struct rb_node *nd, *pn = NULL;
 341        unsigned long prev = 0, pend = 0;
 342
 343        for (nd = rb_first(root); nd; nd = rb_next(nd)) {
 344                struct vm_area_struct *vma;
 345                vma = rb_entry(nd, struct vm_area_struct, vm_rb);
 346                if (vma->vm_start < prev) {
 347                        printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
 348                        bug = 1;
 349                }
 350                if (vma->vm_start < pend) {
 351                        printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
 352                        bug = 1;
 353                }
 354                if (vma->vm_start > vma->vm_end) {
 355                        printk("vm_end %lx < vm_start %lx\n",
 356                                vma->vm_end, vma->vm_start);
 357                        bug = 1;
 358                }
 359                if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
 360                        printk("free gap %lx, correct %lx\n",
 361                               vma->rb_subtree_gap,
 362                               vma_compute_subtree_gap(vma));
 363                        bug = 1;
 364                }
 365                i++;
 366                pn = nd;
 367                prev = vma->vm_start;
 368                pend = vma->vm_end;
 369        }
 370        j = 0;
 371        for (nd = pn; nd; nd = rb_prev(nd))
 372                j++;
 373        if (i != j) {
 374                printk("backwards %d, forwards %d\n", j, i);
 375                bug = 1;
 376        }
 377        return bug ? -1 : i;
 378}
 379
 380static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
 381{
 382        struct rb_node *nd;
 383
 384        for (nd = rb_first(root); nd; nd = rb_next(nd)) {
 385                struct vm_area_struct *vma;
 386                vma = rb_entry(nd, struct vm_area_struct, vm_rb);
 387                BUG_ON(vma != ignore &&
 388                       vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
 389        }
 390}
 391
 392void validate_mm(struct mm_struct *mm)
 393{
 394        int bug = 0;
 395        int i = 0;
 396        unsigned long highest_address = 0;
 397        struct vm_area_struct *vma = mm->mmap;
 398        while (vma) {
 399                struct anon_vma_chain *avc;
 400                vma_lock_anon_vma(vma);
 401                list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 402                        anon_vma_interval_tree_verify(avc);
 403                vma_unlock_anon_vma(vma);
 404                highest_address = vma->vm_end;
 405                vma = vma->vm_next;
 406                i++;
 407        }
 408        if (i != mm->map_count) {
 409                printk("map_count %d vm_next %d\n", mm->map_count, i);
 410                bug = 1;
 411        }
 412        if (highest_address != mm->highest_vm_end) {
 413                printk("mm->highest_vm_end %lx, found %lx\n",
 414                       mm->highest_vm_end, highest_address);
 415                bug = 1;
 416        }
 417        i = browse_rb(&mm->mm_rb);
 418        if (i != mm->map_count) {
 419                printk("map_count %d rb %d\n", mm->map_count, i);
 420                bug = 1;
 421        }
 422        BUG_ON(bug);
 423}
 424#else
 425#define validate_mm_rb(root, ignore) do { } while (0)
 426#define validate_mm(mm) do { } while (0)
 427#endif
 428
 429RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
 430                     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
 431
 432/*
 433 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
 434 * vma->vm_prev->vm_end values changed, without modifying the vma's position
 435 * in the rbtree.
 436 */
 437static void vma_gap_update(struct vm_area_struct *vma)
 438{
 439        /*
 440         * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
 441         * function that does exacltly what we want.
 442         */
 443        vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
 444}
 445
 446static inline void vma_rb_insert(struct vm_area_struct *vma,
 447                                 struct rb_root *root)
 448{
 449        /* All rb_subtree_gap values must be consistent prior to insertion */
 450        validate_mm_rb(root, NULL);
 451
 452        rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 453}
 454
 455static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
 456{
 457        /*
 458         * All rb_subtree_gap values must be consistent prior to erase,
 459         * with the possible exception of the vma being erased.
 460         */
 461        validate_mm_rb(root, vma);
 462
 463        /*
 464         * Note rb_erase_augmented is a fairly large inline function,
 465         * so make sure we instantiate it only once with our desired
 466         * augmented rbtree callbacks.
 467         */
 468        rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 469}
 470
 471/*
 472 * vma has some anon_vma assigned, and is already inserted on that
 473 * anon_vma's interval trees.
 474 *
 475 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 476 * vma must be removed from the anon_vma's interval trees using
 477 * anon_vma_interval_tree_pre_update_vma().
 478 *
 479 * After the update, the vma will be reinserted using
 480 * anon_vma_interval_tree_post_update_vma().
 481 *
 482 * The entire update must be protected by exclusive mmap_sem and by
 483 * the root anon_vma's mutex.
 484 */
 485static inline void
 486anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
 487{
 488        struct anon_vma_chain *avc;
 489
 490        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 491                anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
 492}
 493
 494static inline void
 495anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
 496{
 497        struct anon_vma_chain *avc;
 498
 499        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 500                anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
 501}
 502
 503static int find_vma_links(struct mm_struct *mm, unsigned long addr,
 504                unsigned long end, struct vm_area_struct **pprev,
 505                struct rb_node ***rb_link, struct rb_node **rb_parent)
 506{
 507        struct rb_node **__rb_link, *__rb_parent, *rb_prev;
 508
 509        __rb_link = &mm->mm_rb.rb_node;
 510        rb_prev = __rb_parent = NULL;
 511
 512        while (*__rb_link) {
 513                struct vm_area_struct *vma_tmp;
 514
 515                __rb_parent = *__rb_link;
 516                vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
 517
 518                if (vma_tmp->vm_end > addr) {
 519                        /* Fail if an existing vma overlaps the area */
 520                        if (vma_tmp->vm_start < end)
 521                                return -ENOMEM;
 522                        __rb_link = &__rb_parent->rb_left;
 523                } else {
 524                        rb_prev = __rb_parent;
 525                        __rb_link = &__rb_parent->rb_right;
 526                }
 527        }
 528
 529        *pprev = NULL;
 530        if (rb_prev)
 531                *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
 532        *rb_link = __rb_link;
 533        *rb_parent = __rb_parent;
 534        return 0;
 535}
 536
 537void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 538                struct rb_node **rb_link, struct rb_node *rb_parent)
 539{
 540        /* Update tracking information for the gap following the new vma. */
 541        if (vma->vm_next)
 542                vma_gap_update(vma->vm_next);
 543        else
 544                mm->highest_vm_end = vma->vm_end;
 545
 546        /*
 547         * vma->vm_prev wasn't known when we followed the rbtree to find the
 548         * correct insertion point for that vma. As a result, we could not
 549         * update the vma vm_rb parents rb_subtree_gap values on the way down.
 550         * So, we first insert the vma with a zero rb_subtree_gap value
 551         * (to be consistent with what we did on the way down), and then
 552         * immediately update the gap to the correct value. Finally we
 553         * rebalance the rbtree after all augmented values have been set.
 554         */
 555        rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 556        vma->rb_subtree_gap = 0;
 557        vma_gap_update(vma);
 558        vma_rb_insert(vma, &mm->mm_rb);
 559}
 560
 561static void __vma_link_file(struct vm_area_struct *vma)
 562{
 563        struct file *file;
 564
 565        file = vma->vm_file;
 566        if (file) {
 567                struct address_space *mapping = file->f_mapping;
 568
 569                if (vma->vm_flags & VM_DENYWRITE)
 570                        atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
 571                if (vma->vm_flags & VM_SHARED)
 572                        mapping->i_mmap_writable++;
 573
 574                flush_dcache_mmap_lock(mapping);
 575                if (unlikely(vma->vm_flags & VM_NONLINEAR))
 576                        vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 577                else
 578                        vma_interval_tree_insert(vma, &mapping->i_mmap);
 579                flush_dcache_mmap_unlock(mapping);
 580        }
 581}
 582
 583static void
 584__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 585        struct vm_area_struct *prev, struct rb_node **rb_link,
 586        struct rb_node *rb_parent)
 587{
 588        __vma_link_list(mm, vma, prev, rb_parent);
 589        __vma_link_rb(mm, vma, rb_link, rb_parent);
 590}
 591
 592static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 593                        struct vm_area_struct *prev, struct rb_node **rb_link,
 594                        struct rb_node *rb_parent)
 595{
 596        struct address_space *mapping = NULL;
 597
 598        if (vma->vm_file)
 599                mapping = vma->vm_file->f_mapping;
 600
 601        if (mapping)
 602                mutex_lock(&mapping->i_mmap_mutex);
 603
 604        __vma_link(mm, vma, prev, rb_link, rb_parent);
 605        __vma_link_file(vma);
 606
 607        if (mapping)
 608                mutex_unlock(&mapping->i_mmap_mutex);
 609
 610        mm->map_count++;
 611        validate_mm(mm);
 612}
 613
 614/*
 615 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
 616 * mm's list and rbtree.  It has already been inserted into the interval tree.
 617 */
 618static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 619{
 620        struct vm_area_struct *prev;
 621        struct rb_node **rb_link, *rb_parent;
 622
 623        if (find_vma_links(mm, vma->vm_start, vma->vm_end,
 624                           &prev, &rb_link, &rb_parent))
 625                BUG();
 626        __vma_link(mm, vma, prev, rb_link, rb_parent);
 627        mm->map_count++;
 628}
 629
 630static inline void
 631__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 632                struct vm_area_struct *prev)
 633{
 634        struct vm_area_struct *next;
 635
 636        vma_rb_erase(vma, &mm->mm_rb);
 637        prev->vm_next = next = vma->vm_next;
 638        if (next)
 639                next->vm_prev = prev;
 640        if (mm->mmap_cache == vma)
 641                mm->mmap_cache = prev;
 642}
 643
 644/*
 645 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 646 * is already present in an i_mmap tree without adjusting the tree.
 647 * The following helper function should be used when such adjustments
 648 * are necessary.  The "insert" vma (if any) is to be inserted
 649 * before we drop the necessary locks.
 650 */
 651int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 652        unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
 653{
 654        struct mm_struct *mm = vma->vm_mm;
 655        struct vm_area_struct *next = vma->vm_next;
 656        struct vm_area_struct *importer = NULL;
 657        struct address_space *mapping = NULL;
 658        struct rb_root *root = NULL;
 659        struct anon_vma *anon_vma = NULL;
 660        struct file *file = vma->vm_file;
 661        bool start_changed = false, end_changed = false;
 662        long adjust_next = 0;
 663        int remove_next = 0;
 664
 665        if (next && !insert) {
 666                struct vm_area_struct *exporter = NULL;
 667
 668                if (end >= next->vm_end) {
 669                        /*
 670                         * vma expands, overlapping all the next, and
 671                         * perhaps the one after too (mprotect case 6).
 672                         */
 673again:                  remove_next = 1 + (end > next->vm_end);
 674                        end = next->vm_end;
 675                        exporter = next;
 676                        importer = vma;
 677                } else if (end > next->vm_start) {
 678                        /*
 679                         * vma expands, overlapping part of the next:
 680                         * mprotect case 5 shifting the boundary up.
 681                         */
 682                        adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
 683                        exporter = next;
 684                        importer = vma;
 685                } else if (end < vma->vm_end) {
 686                        /*
 687                         * vma shrinks, and !insert tells it's not
 688                         * split_vma inserting another: so it must be
 689                         * mprotect case 4 shifting the boundary down.
 690                         */
 691                        adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
 692                        exporter = vma;
 693                        importer = next;
 694                }
 695
 696                /*
 697                 * Easily overlooked: when mprotect shifts the boundary,
 698                 * make sure the expanding vma has anon_vma set if the
 699                 * shrinking vma had, to cover any anon pages imported.
 700                 */
 701                if (exporter && exporter->anon_vma && !importer->anon_vma) {
 702                        if (anon_vma_clone(importer, exporter))
 703                                return -ENOMEM;
 704                        importer->anon_vma = exporter->anon_vma;
 705                }
 706        }
 707
 708        if (file) {
 709                mapping = file->f_mapping;
 710                if (!(vma->vm_flags & VM_NONLINEAR)) {
 711                        root = &mapping->i_mmap;
 712                        uprobe_munmap(vma, vma->vm_start, vma->vm_end);
 713
 714                        if (adjust_next)
 715                                uprobe_munmap(next, next->vm_start,
 716                                                        next->vm_end);
 717                }
 718
 719                mutex_lock(&mapping->i_mmap_mutex);
 720                if (insert) {
 721                        /*
 722                         * Put into interval tree now, so instantiated pages
 723                         * are visible to arm/parisc __flush_dcache_page
 724                         * throughout; but we cannot insert into address
 725                         * space until vma start or end is updated.
 726                         */
 727                        __vma_link_file(insert);
 728                }
 729        }
 730
 731        vma_adjust_trans_huge(vma, start, end, adjust_next);
 732
 733        anon_vma = vma->anon_vma;
 734        if (!anon_vma && adjust_next)
 735                anon_vma = next->anon_vma;
 736        if (anon_vma) {
 737                VM_BUG_ON(adjust_next && next->anon_vma &&
 738                          anon_vma != next->anon_vma);
 739                anon_vma_lock_write(anon_vma);
 740                anon_vma_interval_tree_pre_update_vma(vma);
 741                if (adjust_next)
 742                        anon_vma_interval_tree_pre_update_vma(next);
 743        }
 744
 745        if (root) {
 746                flush_dcache_mmap_lock(mapping);
 747                vma_interval_tree_remove(vma, root);
 748                if (adjust_next)
 749                        vma_interval_tree_remove(next, root);
 750        }
 751
 752        if (start != vma->vm_start) {
 753                vma->vm_start = start;
 754                start_changed = true;
 755        }
 756        if (end != vma->vm_end) {
 757                vma->vm_end = end;
 758                end_changed = true;
 759        }
 760        vma->vm_pgoff = pgoff;
 761        if (adjust_next) {
 762                next->vm_start += adjust_next << PAGE_SHIFT;
 763                next->vm_pgoff += adjust_next;
 764        }
 765
 766        if (root) {
 767                if (adjust_next)
 768                        vma_interval_tree_insert(next, root);
 769                vma_interval_tree_insert(vma, root);
 770                flush_dcache_mmap_unlock(mapping);
 771        }
 772
 773        if (remove_next) {
 774                /*
 775                 * vma_merge has merged next into vma, and needs
 776                 * us to remove next before dropping the locks.
 777                 */
 778                __vma_unlink(mm, next, vma);
 779                if (file)
 780                        __remove_shared_vm_struct(next, file, mapping);
 781        } else if (insert) {
 782                /*
 783                 * split_vma has split insert from vma, and needs
 784                 * us to insert it before dropping the locks
 785                 * (it may either follow vma or precede it).
 786                 */
 787                __insert_vm_struct(mm, insert);
 788        } else {
 789                if (start_changed)
 790                        vma_gap_update(vma);
 791                if (end_changed) {
 792                        if (!next)
 793                                mm->highest_vm_end = end;
 794                        else if (!adjust_next)
 795                                vma_gap_update(next);
 796                }
 797        }
 798
 799        if (anon_vma) {
 800                anon_vma_interval_tree_post_update_vma(vma);
 801                if (adjust_next)
 802                        anon_vma_interval_tree_post_update_vma(next);
 803                anon_vma_unlock(anon_vma);
 804        }
 805        if (mapping)
 806                mutex_unlock(&mapping->i_mmap_mutex);
 807
 808        if (root) {
 809                uprobe_mmap(vma);
 810
 811                if (adjust_next)
 812                        uprobe_mmap(next);
 813        }
 814
 815        if (remove_next) {
 816                if (file) {
 817                        uprobe_munmap(next, next->vm_start, next->vm_end);
 818                        fput(file);
 819                }
 820                if (next->anon_vma)
 821                        anon_vma_merge(vma, next);
 822                mm->map_count--;
 823                mpol_put(vma_policy(next));
 824                kmem_cache_free(vm_area_cachep, next);
 825                /*
 826                 * In mprotect's case 6 (see comments on vma_merge),
 827                 * we must remove another next too. It would clutter
 828                 * up the code too much to do both in one go.
 829                 */
 830                next = vma->vm_next;
 831                if (remove_next == 2)
 832                        goto again;
 833                else if (next)
 834                        vma_gap_update(next);
 835                else
 836                        mm->highest_vm_end = end;
 837        }
 838        if (insert && file)
 839                uprobe_mmap(insert);
 840
 841        validate_mm(mm);
 842
 843        return 0;
 844}
 845
 846/*
 847 * If the vma has a ->close operation then the driver probably needs to release
 848 * per-vma resources, so we don't attempt to merge those.
 849 */
 850static inline int is_mergeable_vma(struct vm_area_struct *vma,
 851                        struct file *file, unsigned long vm_flags)
 852{
 853        if (vma->vm_flags ^ vm_flags)
 854                return 0;
 855        if (vma->vm_file != file)
 856                return 0;
 857        if (vma->vm_ops && vma->vm_ops->close)
 858                return 0;
 859        return 1;
 860}
 861
 862static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
 863                                        struct anon_vma *anon_vma2,
 864                                        struct vm_area_struct *vma)
 865{
 866        /*
 867         * The list_is_singular() test is to avoid merging VMA cloned from
 868         * parents. This can improve scalability caused by anon_vma lock.
 869         */
 870        if ((!anon_vma1 || !anon_vma2) && (!vma ||
 871                list_is_singular(&vma->anon_vma_chain)))
 872                return 1;
 873        return anon_vma1 == anon_vma2;
 874}
 875
 876/*
 877 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 878 * in front of (at a lower virtual address and file offset than) the vma.
 879 *
 880 * We cannot merge two vmas if they have differently assigned (non-NULL)
 881 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 882 *
 883 * We don't check here for the merged mmap wrapping around the end of pagecache
 884 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
 885 * wrap, nor mmaps which cover the final page at index -1UL.
 886 */
 887static int
 888can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
 889        struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 890{
 891        if (is_mergeable_vma(vma, file, vm_flags) &&
 892            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 893                if (vma->vm_pgoff == vm_pgoff)
 894                        return 1;
 895        }
 896        return 0;
 897}
 898
 899/*
 900 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 901 * beyond (at a higher virtual address and file offset than) the vma.
 902 *
 903 * We cannot merge two vmas if they have differently assigned (non-NULL)
 904 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 905 */
 906static int
 907can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 908        struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
 909{
 910        if (is_mergeable_vma(vma, file, vm_flags) &&
 911            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 912                pgoff_t vm_pglen;
 913                vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 914                if (vma->vm_pgoff + vm_pglen == vm_pgoff)
 915                        return 1;
 916        }
 917        return 0;
 918}
 919
 920/*
 921 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
 922 * whether that can be merged with its predecessor or its successor.
 923 * Or both (it neatly fills a hole).
 924 *
 925 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 926 * certain not to be mapped by the time vma_merge is called; but when
 927 * called for mprotect, it is certain to be already mapped (either at
 928 * an offset within prev, or at the start of next), and the flags of
 929 * this area are about to be changed to vm_flags - and the no-change
 930 * case has already been eliminated.
 931 *
 932 * The following mprotect cases have to be considered, where AAAA is
 933 * the area passed down from mprotect_fixup, never extending beyond one
 934 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
 935 *
 936 *     AAAA             AAAA                AAAA          AAAA
 937 *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
 938 *    cannot merge    might become    might become    might become
 939 *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
 940 *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
 941 *    mremap move:                                    PPPPNNNNNNNN 8
 942 *        AAAA
 943 *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
 944 *    might become    case 1 below    case 2 below    case 3 below
 945 *
 946 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
 947 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
 948 */
 949struct vm_area_struct *vma_merge(struct mm_struct *mm,
 950                        struct vm_area_struct *prev, unsigned long addr,
 951                        unsigned long end, unsigned long vm_flags,
 952                        struct anon_vma *anon_vma, struct file *file,
 953                        pgoff_t pgoff, struct mempolicy *policy)
 954{
 955        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 956        struct vm_area_struct *area, *next;
 957        int err;
 958
 959        /*
 960         * We later require that vma->vm_flags == vm_flags,
 961         * so this tests vma->vm_flags & VM_SPECIAL, too.
 962         */
 963        if (vm_flags & VM_SPECIAL)
 964                return NULL;
 965
 966        if (prev)
 967                next = prev->vm_next;
 968        else
 969                next = mm->mmap;
 970        area = next;
 971        if (next && next->vm_end == end)                /* cases 6, 7, 8 */
 972                next = next->vm_next;
 973
 974        /*
 975         * Can it merge with the predecessor?
 976         */
 977        if (prev && prev->vm_end == addr &&
 978                        mpol_equal(vma_policy(prev), policy) &&
 979                        can_vma_merge_after(prev, vm_flags,
 980                                                anon_vma, file, pgoff)) {
 981                /*
 982                 * OK, it can.  Can we now merge in the successor as well?
 983                 */
 984                if (next && end == next->vm_start &&
 985                                mpol_equal(policy, vma_policy(next)) &&
 986                                can_vma_merge_before(next, vm_flags,
 987                                        anon_vma, file, pgoff+pglen) &&
 988                                is_mergeable_anon_vma(prev->anon_vma,
 989                                                      next->anon_vma, NULL)) {
 990                                                        /* cases 1, 6 */
 991                        err = vma_adjust(prev, prev->vm_start,
 992                                next->vm_end, prev->vm_pgoff, NULL);
 993                } else                                  /* cases 2, 5, 7 */
 994                        err = vma_adjust(prev, prev->vm_start,
 995                                end, prev->vm_pgoff, NULL);
 996                if (err)
 997                        return NULL;
 998                khugepaged_enter_vma_merge(prev);
 999                return prev;
1000        }
1001
1002        /*
1003         * Can this new request be merged in front of next?
1004         */
1005        if (next && end == next->vm_start &&
1006                        mpol_equal(policy, vma_policy(next)) &&
1007                        can_vma_merge_before(next, vm_flags,
1008                                        anon_vma, file, pgoff+pglen)) {
1009                if (prev && addr < prev->vm_end)        /* case 4 */
1010                        err = vma_adjust(prev, prev->vm_start,
1011                                addr, prev->vm_pgoff, NULL);
1012                else                                    /* cases 3, 8 */
1013                        err = vma_adjust(area, addr, next->vm_end,
1014                                next->vm_pgoff - pglen, NULL);
1015                if (err)
1016                        return NULL;
1017                khugepaged_enter_vma_merge(area);
1018                return area;
1019        }
1020
1021        return NULL;
1022}
1023
1024/*
1025 * Rough compatbility check to quickly see if it's even worth looking
1026 * at sharing an anon_vma.
1027 *
1028 * They need to have the same vm_file, and the flags can only differ
1029 * in things that mprotect may change.
1030 *
1031 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1032 * we can merge the two vma's. For example, we refuse to merge a vma if
1033 * there is a vm_ops->close() function, because that indicates that the
1034 * driver is doing some kind of reference counting. But that doesn't
1035 * really matter for the anon_vma sharing case.
1036 */
1037static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1038{
1039        return a->vm_end == b->vm_start &&
1040                mpol_equal(vma_policy(a), vma_policy(b)) &&
1041                a->vm_file == b->vm_file &&
1042                !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
1043                b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1044}
1045
1046/*
1047 * Do some basic sanity checking to see if we can re-use the anon_vma
1048 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1049 * the same as 'old', the other will be the new one that is trying
1050 * to share the anon_vma.
1051 *
1052 * NOTE! This runs with mm_sem held for reading, so it is possible that
1053 * the anon_vma of 'old' is concurrently in the process of being set up
1054 * by another page fault trying to merge _that_. But that's ok: if it
1055 * is being set up, that automatically means that it will be a singleton
1056 * acceptable for merging, so we can do all of this optimistically. But
1057 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
1058 *
1059 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1060 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1061 * is to return an anon_vma that is "complex" due to having gone through
1062 * a fork).
1063 *
1064 * We also make sure that the two vma's are compatible (adjacent,
1065 * and with the same memory policies). That's all stable, even with just
1066 * a read lock on the mm_sem.
1067 */
1068static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1069{
1070        if (anon_vma_compatible(a, b)) {
1071                struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
1072
1073                if (anon_vma && list_is_singular(&old->anon_vma_chain))
1074                        return anon_vma;
1075        }
1076        return NULL;
1077}
1078
1079/*
1080 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1081 * neighbouring vmas for a suitable anon_vma, before it goes off
1082 * to allocate a new anon_vma.  It checks because a repetitive
1083 * sequence of mprotects and faults may otherwise lead to distinct
1084 * anon_vmas being allocated, preventing vma merge in subsequent
1085 * mprotect.
1086 */
1087struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1088{
1089        struct anon_vma *anon_vma;
1090        struct vm_area_struct *near;
1091
1092        near = vma->vm_next;
1093        if (!near)
1094                goto try_prev;
1095
1096        anon_vma = reusable_anon_vma(near, vma, near);
1097        if (anon_vma)
1098                return anon_vma;
1099try_prev:
1100        near = vma->vm_prev;
1101        if (!near)
1102                goto none;
1103
1104        anon_vma = reusable_anon_vma(near, near, vma);
1105        if (anon_vma)
1106                return anon_vma;
1107none:
1108        /*
1109         * There's no absolute need to look only at touching neighbours:
1110         * we could search further afield for "compatible" anon_vmas.
1111         * But it would probably just be a waste of time searching,
1112         * or lead to too many vmas hanging off the same anon_vma.
1113         * We're trying to allow mprotect remerging later on,
1114         * not trying to minimize memory used for anon_vmas.
1115         */
1116        return NULL;
1117}
1118
1119#ifdef CONFIG_PROC_FS
1120void vm_stat_account(struct mm_struct *mm, unsigned long flags,
1121                                                struct file *file, long pages)
1122{
1123        const unsigned long stack_flags
1124                = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
1125
1126        mm->total_vm += pages;
1127
1128        if (file) {
1129                mm->shared_vm += pages;
1130                if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
1131                        mm->exec_vm += pages;
1132        } else if (flags & stack_flags)
1133                mm->stack_vm += pages;
1134}
1135#endif /* CONFIG_PROC_FS */
1136
1137/*
1138 * If a hint addr is less than mmap_min_addr change hint to be as
1139 * low as possible but still greater than mmap_min_addr
1140 */
1141static inline unsigned long round_hint_to_min(unsigned long hint)
1142{
1143        hint &= PAGE_MASK;
1144        if (((void *)hint != NULL) &&
1145            (hint < mmap_min_addr))
1146                return PAGE_ALIGN(mmap_min_addr);
1147        return hint;
1148}
1149
1150/*
1151 * The caller must hold down_write(&current->mm->mmap_sem).
1152 */
1153
1154unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1155                        unsigned long len, unsigned long prot,
1156                        unsigned long flags, unsigned long pgoff)
1157{
1158        struct mm_struct * mm = current->mm;
1159        struct inode *inode;
1160        vm_flags_t vm_flags;
1161
1162        /*
1163         * Does the application expect PROT_READ to imply PROT_EXEC?
1164         *
1165         * (the exception is when the underlying filesystem is noexec
1166         *  mounted, in which case we dont add PROT_EXEC.)
1167         */
1168        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1169                if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1170                        prot |= PROT_EXEC;
1171
1172        if (!len)
1173                return -EINVAL;
1174
1175        if (!(flags & MAP_FIXED))
1176                addr = round_hint_to_min(addr);
1177
1178        /* Careful about overflows.. */
1179        len = PAGE_ALIGN(len);
1180        if (!len)
1181                return -ENOMEM;
1182
1183        /* offset overflow? */
1184        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1185               return -EOVERFLOW;
1186
1187        /* Too many mappings? */
1188        if (mm->map_count > sysctl_max_map_count)
1189                return -ENOMEM;
1190
1191        /* Obtain the address to map to. we verify (or select) it and ensure
1192         * that it represents a valid section of the address space.
1193         */
1194        addr = get_unmapped_area(file, addr, len, pgoff, flags);
1195        if (addr & ~PAGE_MASK)
1196                return addr;
1197
1198        /* Do simple checking here so the lower-level routines won't have
1199         * to. we assume access permissions have been handled by the open
1200         * of the memory object, so we don't do any here.
1201         */
1202        vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1203                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1204
1205        if (flags & MAP_LOCKED)
1206                if (!can_do_mlock())
1207                        return -EPERM;
1208
1209        /* mlock MCL_FUTURE? */
1210        if (vm_flags & VM_LOCKED) {
1211                unsigned long locked, lock_limit;
1212                locked = len >> PAGE_SHIFT;
1213                locked += mm->locked_vm;
1214                lock_limit = rlimit(RLIMIT_MEMLOCK);
1215                lock_limit >>= PAGE_SHIFT;
1216                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1217                        return -EAGAIN;
1218        }
1219
1220        inode = file ? file->f_path.dentry->d_inode : NULL;
1221
1222        if (file) {
1223                switch (flags & MAP_TYPE) {
1224                case MAP_SHARED:
1225                        if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
1226                                return -EACCES;
1227
1228                        /*
1229                         * Make sure we don't allow writing to an append-only
1230                         * file..
1231                         */
1232                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1233                                return -EACCES;
1234
1235                        /*
1236                         * Make sure there are no mandatory locks on the file.
1237                         */
1238                        if (locks_verify_locked(inode))
1239                                return -EAGAIN;
1240
1241                        vm_flags |= VM_SHARED | VM_MAYSHARE;
1242                        if (!(file->f_mode & FMODE_WRITE))
1243                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1244
1245                        /* fall through */
1246                case MAP_PRIVATE:
1247                        if (!(file->f_mode & FMODE_READ))
1248                                return -EACCES;
1249                        if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1250                                if (vm_flags & VM_EXEC)
1251                                        return -EPERM;
1252                                vm_flags &= ~VM_MAYEXEC;
1253                        }
1254
1255                        if (!file->f_op || !file->f_op->mmap)
1256                                return -ENODEV;
1257                        break;
1258
1259                default:
1260                        return -EINVAL;
1261                }
1262        } else {
1263                switch (flags & MAP_TYPE) {
1264                case MAP_SHARED:
1265                        /*
1266                         * Ignore pgoff.
1267                         */
1268                        pgoff = 0;
1269                        vm_flags |= VM_SHARED | VM_MAYSHARE;
1270                        break;
1271                case MAP_PRIVATE:
1272                        /*
1273                         * Set pgoff according to addr for anon_vma.
1274                         */
1275                        pgoff = addr >> PAGE_SHIFT;
1276                        break;
1277                default:
1278                        return -EINVAL;
1279                }
1280        }
1281
1282        return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1283}
1284
1285SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1286                unsigned long, prot, unsigned long, flags,
1287                unsigned long, fd, unsigned long, pgoff)
1288{
1289        struct file *file = NULL;
1290        unsigned long retval = -EBADF;
1291
1292        if (!(flags & MAP_ANONYMOUS)) {
1293                audit_mmap_fd(fd, flags);
1294                if (unlikely(flags & MAP_HUGETLB))
1295                        return -EINVAL;
1296                file = fget(fd);
1297                if (!file)
1298                        goto out;
1299        } else if (flags & MAP_HUGETLB) {
1300                struct user_struct *user = NULL;
1301                /*
1302                 * VM_NORESERVE is used because the reservations will be
1303                 * taken when vm_ops->mmap() is called
1304                 * A dummy user value is used because we are not locking
1305                 * memory so no accounting is necessary
1306                 */
1307                file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1308                                VM_NORESERVE,
1309                                &user, HUGETLB_ANONHUGE_INODE,
1310                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1311                if (IS_ERR(file))
1312                        return PTR_ERR(file);
1313        }
1314
1315        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1316
1317        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1318        if (file)
1319                fput(file);
1320out:
1321        return retval;
1322}
1323
1324#ifdef __ARCH_WANT_SYS_OLD_MMAP
1325struct mmap_arg_struct {
1326        unsigned long addr;
1327        unsigned long len;
1328        unsigned long prot;
1329        unsigned long flags;
1330        unsigned long fd;
1331        unsigned long offset;
1332};
1333
1334SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1335{
1336        struct mmap_arg_struct a;
1337
1338        if (copy_from_user(&a, arg, sizeof(a)))
1339                return -EFAULT;
1340        if (a.offset & ~PAGE_MASK)
1341                return -EINVAL;
1342
1343        return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1344                              a.offset >> PAGE_SHIFT);
1345}
1346#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1347
1348/*
1349 * Some shared mappigns will want the pages marked read-only
1350 * to track write events. If so, we'll downgrade vm_page_prot
1351 * to the private version (using protection_map[] without the
1352 * VM_SHARED bit).
1353 */
1354int vma_wants_writenotify(struct vm_area_struct *vma)
1355{
1356        vm_flags_t vm_flags = vma->vm_flags;
1357
1358        /* If it was private or non-writable, the write bit is already clear */
1359        if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1360                return 0;
1361
1362        /* The backer wishes to know when pages are first written to? */
1363        if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1364                return 1;
1365
1366        /* The open routine did something to the protections already? */
1367        if (pgprot_val(vma->vm_page_prot) !=
1368            pgprot_val(vm_get_page_prot(vm_flags)))
1369                return 0;
1370
1371        /* Specialty mapping? */
1372        if (vm_flags & VM_PFNMAP)
1373                return 0;
1374
1375        /* Can the mapping track the dirty pages? */
1376        return vma->vm_file && vma->vm_file->f_mapping &&
1377                mapping_cap_account_dirty(vma->vm_file->f_mapping);
1378}
1379
1380/*
1381 * We account for memory if it's a private writeable mapping,
1382 * not hugepages and VM_NORESERVE wasn't set.
1383 */
1384static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1385{
1386        /*
1387         * hugetlb has its own accounting separate from the core VM
1388         * VM_HUGETLB may not be set yet so we cannot check for that flag.
1389         */
1390        if (file && is_file_hugepages(file))
1391                return 0;
1392
1393        return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1394}
1395
1396unsigned long mmap_region(struct file *file, unsigned long addr,
1397                          unsigned long len, unsigned long flags,
1398                          vm_flags_t vm_flags, unsigned long pgoff)
1399{
1400        struct mm_struct *mm = current->mm;
1401        struct vm_area_struct *vma, *prev;
1402        int correct_wcount = 0;
1403        int error;
1404        struct rb_node **rb_link, *rb_parent;
1405        unsigned long charged = 0;
1406        struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;
1407
1408        /* Clear old maps */
1409        error = -ENOMEM;
1410munmap_back:
1411        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1412                if (do_munmap(mm, addr, len))
1413                        return -ENOMEM;
1414                goto munmap_back;
1415        }
1416
1417        /* Check against address space limit. */
1418        if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1419                return -ENOMEM;
1420
1421        /*
1422         * Set 'VM_NORESERVE' if we should not account for the
1423         * memory use of this mapping.
1424         */
1425        if ((flags & MAP_NORESERVE)) {
1426                /* We honor MAP_NORESERVE if allowed to overcommit */
1427                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1428                        vm_flags |= VM_NORESERVE;
1429
1430                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1431                if (file && is_file_hugepages(file))
1432                        vm_flags |= VM_NORESERVE;
1433        }
1434
1435        /*
1436         * Private writable mapping: check memory availability
1437         */
1438        if (accountable_mapping(file, vm_flags)) {
1439                charged = len >> PAGE_SHIFT;
1440                if (security_vm_enough_memory_mm(mm, charged))
1441                        return -ENOMEM;
1442                vm_flags |= VM_ACCOUNT;
1443        }
1444
1445        /*
1446         * Can we just expand an old mapping?
1447         */
1448        vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1449        if (vma)
1450                goto out;
1451
1452        /*
1453         * Determine the object being mapped and call the appropriate
1454         * specific mapper. the address has already been validated, but
1455         * not unmapped, but the maps are removed from the list.
1456         */
1457        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1458        if (!vma) {
1459                error = -ENOMEM;
1460                goto unacct_error;
1461        }
1462
1463        vma->vm_mm = mm;
1464        vma->vm_start = addr;
1465        vma->vm_end = addr + len;
1466        vma->vm_flags = vm_flags;
1467        vma->vm_page_prot = vm_get_page_prot(vm_flags);
1468        vma->vm_pgoff = pgoff;
1469        INIT_LIST_HEAD(&vma->anon_vma_chain);
1470
1471        error = -EINVAL;        /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1472
1473        if (file) {
1474                if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1475                        goto free_vma;
1476                if (vm_flags & VM_DENYWRITE) {
1477                        error = deny_write_access(file);
1478                        if (error)
1479                                goto free_vma;
1480                        correct_wcount = 1;
1481                }
1482                vma->vm_file = get_file(file);
1483                error = file->f_op->mmap(file, vma);
1484                if (error)
1485                        goto unmap_and_free_vma;
1486
1487                /* Can addr have changed??
1488                 *
1489                 * Answer: Yes, several device drivers can do it in their
1490                 *         f_op->mmap method. -DaveM
1491                 * Bug: If addr is changed, prev, rb_link, rb_parent should
1492                 *      be updated for vma_link()
1493                 */
1494                WARN_ON_ONCE(addr != vma->vm_start);
1495
1496                addr = vma->vm_start;
1497                pgoff = vma->vm_pgoff;
1498                vm_flags = vma->vm_flags;
1499        } else if (vm_flags & VM_SHARED) {
1500                if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1501                        goto free_vma;
1502                error = shmem_zero_setup(vma);
1503                if (error)
1504                        goto free_vma;
1505        }
1506
1507        if (vma_wants_writenotify(vma)) {
1508                pgprot_t pprot = vma->vm_page_prot;
1509
1510                /* Can vma->vm_page_prot have changed??
1511                 *
1512                 * Answer: Yes, drivers may have changed it in their
1513                 *         f_op->mmap method.
1514                 *
1515                 * Ensures that vmas marked as uncached stay that way.
1516                 */
1517                vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1518                if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
1519                        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1520        }
1521
1522        vma_link(mm, vma, prev, rb_link, rb_parent);
1523        file = vma->vm_file;
1524
1525        /* Once vma denies write, undo our temporary denial count */
1526        if (correct_wcount)
1527                atomic_inc(&inode->i_writecount);
1528out:
1529        perf_event_mmap(vma);
1530
1531        vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1532        if (vm_flags & VM_LOCKED) {
1533                if (!mlock_vma_pages_range(vma, addr, addr + len))
1534                        mm->locked_vm += (len >> PAGE_SHIFT);
1535        } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1536                make_pages_present(addr, addr + len);
1537
1538        if (file)
1539                uprobe_mmap(vma);
1540
1541        return addr;
1542
1543unmap_and_free_vma:
1544        if (correct_wcount)
1545                atomic_inc(&inode->i_writecount);
1546        vma->vm_file = NULL;
1547        fput(file);
1548
1549        /* Undo any partial mapping done by a device driver. */
1550        unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1551        charged = 0;
1552free_vma:
1553        kmem_cache_free(vm_area_cachep, vma);
1554unacct_error:
1555        if (charged)
1556                vm_unacct_memory(charged);
1557        return error;
1558}
1559
1560unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1561{
1562        /*
1563         * We implement the search by looking for an rbtree node that
1564         * immediately follows a suitable gap. That is,
1565         * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1566         * - gap_end   = vma->vm_start        >= info->low_limit  + length;
1567         * - gap_end - gap_start >= length
1568         */
1569
1570        struct mm_struct *mm = current->mm;
1571        struct vm_area_struct *vma;
1572        unsigned long length, low_limit, high_limit, gap_start, gap_end;
1573
1574        /* Adjust search length to account for worst case alignment overhead */
1575        length = info->length + info->align_mask;
1576        if (length < info->length)
1577                return -ENOMEM;
1578
1579        /* Adjust search limits by the desired length */
1580        if (info->high_limit < length)
1581                return -ENOMEM;
1582        high_limit = info->high_limit - length;
1583
1584        if (info->low_limit > high_limit)
1585                return -ENOMEM;
1586        low_limit = info->low_limit + length;
1587
1588        /* Check if rbtree root looks promising */
1589        if (RB_EMPTY_ROOT(&mm->mm_rb))
1590                goto check_highest;
1591        vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1592        if (vma->rb_subtree_gap < length)
1593                goto check_highest;
1594
1595        while (true) {
1596                /* Visit left subtree if it looks promising */
1597                gap_end = vma->vm_start;
1598                if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1599                        struct vm_area_struct *left =
1600                                rb_entry(vma->vm_rb.rb_left,
1601                                         struct vm_area_struct, vm_rb);
1602                        if (left->rb_subtree_gap >= length) {
1603                                vma = left;
1604                                continue;
1605                        }
1606                }
1607
1608                gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1609check_current:
1610                /* Check if current node has a suitable gap */
1611                if (gap_start > high_limit)
1612                        return -ENOMEM;
1613                if (gap_end >= low_limit && gap_end - gap_start >= length)
1614                        goto found;
1615
1616                /* Visit right subtree if it looks promising */
1617                if (vma->vm_rb.rb_right) {
1618                        struct vm_area_struct *right =
1619                                rb_entry(vma->vm_rb.rb_right,
1620                                         struct vm_area_struct, vm_rb);
1621                        if (right->rb_subtree_gap >= length) {
1622                                vma = right;
1623                                continue;
1624                        }
1625                }
1626
1627                /* Go back up the rbtree to find next candidate node */
1628                while (true) {
1629                        struct rb_node *prev = &vma->vm_rb;
1630                        if (!rb_parent(prev))
1631                                goto check_highest;
1632                        vma = rb_entry(rb_parent(prev),
1633                                       struct vm_area_struct, vm_rb);
1634                        if (prev == vma->vm_rb.rb_left) {
1635                                gap_start = vma->vm_prev->vm_end;
1636                                gap_end = vma->vm_start;
1637                                goto check_current;
1638                        }
1639                }
1640        }
1641
1642check_highest:
1643        /* Check highest gap, which does not precede any rbtree node */
1644        gap_start = mm->highest_vm_end;
1645        gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
1646        if (gap_start > high_limit)
1647                return -ENOMEM;
1648
1649found:
1650        /* We found a suitable gap. Clip it with the original low_limit. */
1651        if (gap_start < info->low_limit)
1652                gap_start = info->low_limit;
1653
1654        /* Adjust gap address to the desired alignment */
1655        gap_start += (info->align_offset - gap_start) & info->align_mask;
1656
1657        VM_BUG_ON(gap_start + info->length > info->high_limit);
1658        VM_BUG_ON(gap_start + info->length > gap_end);
1659        return gap_start;
1660}
1661
1662unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1663{
1664        struct mm_struct *mm = current->mm;
1665        struct vm_area_struct *vma;
1666        unsigned long length, low_limit, high_limit, gap_start, gap_end;
1667
1668        /* Adjust search length to account for worst case alignment overhead */
1669        length = info->length + info->align_mask;
1670        if (length < info->length)
1671                return -ENOMEM;
1672
1673        /*
1674         * Adjust search limits by the desired length.
1675         * See implementation comment at top of unmapped_area().
1676         */
1677        gap_end = info->high_limit;
1678        if (gap_end < length)
1679                return -ENOMEM;
1680        high_limit = gap_end - length;
1681
1682        if (info->low_limit > high_limit)
1683                return -ENOMEM;
1684        low_limit = info->low_limit + length;
1685
1686        /* Check highest gap, which does not precede any rbtree node */
1687        gap_start = mm->highest_vm_end;
1688        if (gap_start <= high_limit)
1689                goto found_highest;
1690
1691        /* Check if rbtree root looks promising */
1692        if (RB_EMPTY_ROOT(&mm->mm_rb))
1693                return -ENOMEM;
1694        vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1695        if (vma->rb_subtree_gap < length)
1696                return -ENOMEM;
1697
1698        while (true) {
1699                /* Visit right subtree if it looks promising */
1700                gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1701                if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1702                        struct vm_area_struct *right =
1703                                rb_entry(vma->vm_rb.rb_right,
1704                                         struct vm_area_struct, vm_rb);
1705                        if (right->rb_subtree_gap >= length) {
1706                                vma = right;
1707                                continue;
1708                        }
1709                }
1710
1711check_current:
1712                /* Check if current node has a suitable gap */
1713                gap_end = vma->vm_start;
1714                if (gap_end < low_limit)
1715                        return -ENOMEM;
1716                if (gap_start <= high_limit && gap_end - gap_start >= length)
1717                        goto found;
1718
1719                /* Visit left subtree if it looks promising */
1720                if (vma->vm_rb.rb_left) {
1721                        struct vm_area_struct *left =
1722                                rb_entry(vma->vm_rb.rb_left,
1723                                         struct vm_area_struct, vm_rb);
1724                        if (left->rb_subtree_gap >= length) {
1725                                vma = left;
1726                                continue;
1727                        }
1728                }
1729
1730                /* Go back up the rbtree to find next candidate node */
1731                while (true) {
1732                        struct rb_node *prev = &vma->vm_rb;
1733                        if (!rb_parent(prev))
1734                                return -ENOMEM;
1735                        vma = rb_entry(rb_parent(prev),
1736                                       struct vm_area_struct, vm_rb);
1737                        if (prev == vma->vm_rb.rb_right) {
1738                                gap_start = vma->vm_prev ?
1739                                        vma->vm_prev->vm_end : 0;
1740                                goto check_current;
1741                        }
1742                }
1743        }
1744
1745found:
1746        /* We found a suitable gap. Clip it with the original high_limit. */
1747        if (gap_end > info->high_limit)
1748                gap_end = info->high_limit;
1749
1750found_highest:
1751        /* Compute highest gap address at the desired alignment */
1752        gap_end -= info->length;
1753        gap_end -= (gap_end - info->align_offset) & info->align_mask;
1754
1755        VM_BUG_ON(gap_end < info->low_limit);
1756        VM_BUG_ON(gap_end < gap_start);
1757        return gap_end;
1758}
1759
1760/* Get an address range which is currently unmapped.
1761 * For shmat() with addr=0.
1762 *
1763 * Ugly calling convention alert:
1764 * Return value with the low bits set means error value,
1765 * ie
1766 *      if (ret & ~PAGE_MASK)
1767 *              error = ret;
1768 *
1769 * This function "knows" that -ENOMEM has the bits set.
1770 */
1771#ifndef HAVE_ARCH_UNMAPPED_AREA
1772unsigned long
1773arch_get_unmapped_area(struct file *filp, unsigned long addr,
1774                unsigned long len, unsigned long pgoff, unsigned long flags)
1775{
1776        struct mm_struct *mm = current->mm;
1777        struct vm_area_struct *vma;
1778        struct vm_unmapped_area_info info;
1779
1780        if (len > TASK_SIZE)
1781                return -ENOMEM;
1782
1783        if (flags & MAP_FIXED)
1784                return addr;
1785
1786        if (addr) {
1787                addr = PAGE_ALIGN(addr);
1788                vma = find_vma(mm, addr);
1789                if (TASK_SIZE - len >= addr &&
1790                    (!vma || addr + len <= vma->vm_start))
1791                        return addr;
1792        }
1793
1794        info.flags = 0;
1795        info.length = len;
1796        info.low_limit = TASK_UNMAPPED_BASE;
1797        info.high_limit = TASK_SIZE;
1798        info.align_mask = 0;
1799        return vm_unmapped_area(&info);
1800}
1801#endif  
1802
1803void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1804{
1805        /*
1806         * Is this a new hole at the lowest possible address?
1807         */
1808        if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1809                mm->free_area_cache = addr;
1810}
1811
1812/*
1813 * This mmap-allocator allocates new areas top-down from below the
1814 * stack's low limit (the base):
1815 */
1816#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1817unsigned long
1818arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1819                          const unsigned long len, const unsigned long pgoff,
1820                          const unsigned long flags)
1821{
1822        struct vm_area_struct *vma;
1823        struct mm_struct *mm = current->mm;
1824        unsigned long addr = addr0;
1825        struct vm_unmapped_area_info info;
1826
1827        /* requested length too big for entire address space */
1828        if (len > TASK_SIZE)
1829                return -ENOMEM;
1830
1831        if (flags & MAP_FIXED)
1832                return addr;
1833
1834        /* requesting a specific address */
1835        if (addr) {
1836                addr = PAGE_ALIGN(addr);
1837                vma = find_vma(mm, addr);
1838                if (TASK_SIZE - len >= addr &&
1839                                (!vma || addr + len <= vma->vm_start))
1840                        return addr;
1841        }
1842
1843        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1844        info.length = len;
1845        info.low_limit = PAGE_SIZE;
1846        info.high_limit = mm->mmap_base;
1847        info.align_mask = 0;
1848        addr = vm_unmapped_area(&info);
1849
1850        /*
1851         * A failed mmap() very likely causes application failure,
1852         * so fall back to the bottom-up function here. This scenario
1853         * can happen with large stack limits and large mmap()
1854         * allocations.
1855         */
1856        if (addr & ~PAGE_MASK) {
1857                VM_BUG_ON(addr != -ENOMEM);
1858                info.flags = 0;
1859                info.low_limit = TASK_UNMAPPED_BASE;
1860                info.high_limit = TASK_SIZE;
1861                addr = vm_unmapped_area(&info);
1862        }
1863
1864        return addr;
1865}
1866#endif
1867
1868void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1869{
1870        /*
1871         * Is this a new hole at the highest possible address?
1872         */
1873        if (addr > mm->free_area_cache)
1874                mm->free_area_cache = addr;
1875
1876        /* dont allow allocations above current base */
1877        if (mm->free_area_cache > mm->mmap_base)
1878                mm->free_area_cache = mm->mmap_base;
1879}
1880
1881unsigned long
1882get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1883                unsigned long pgoff, unsigned long flags)
1884{
1885        unsigned long (*get_area)(struct file *, unsigned long,
1886                                  unsigned long, unsigned long, unsigned long);
1887
1888        unsigned long error = arch_mmap_check(addr, len, flags);
1889        if (error)
1890                return error;
1891
1892        /* Careful about overflows.. */
1893        if (len > TASK_SIZE)
1894                return -ENOMEM;
1895
1896        get_area = current->mm->get_unmapped_area;
1897        if (file && file->f_op && file->f_op->get_unmapped_area)
1898                get_area = file->f_op->get_unmapped_area;
1899        addr = get_area(file, addr, len, pgoff, flags);
1900        if (IS_ERR_VALUE(addr))
1901                return addr;
1902
1903        if (addr > TASK_SIZE - len)
1904                return -ENOMEM;
1905        if (addr & ~PAGE_MASK)
1906                return -EINVAL;
1907
1908        addr = arch_rebalance_pgtables(addr, len);
1909        error = security_mmap_addr(addr);
1910        return error ? error : addr;
1911}
1912
1913EXPORT_SYMBOL(get_unmapped_area);
1914
1915/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
1916struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1917{
1918        struct vm_area_struct *vma = NULL;
1919
1920        if (WARN_ON_ONCE(!mm))          /* Remove this in linux-3.6 */
1921                return NULL;
1922
1923        /* Check the cache first. */
1924        /* (Cache hit rate is typically around 35%.) */
1925        vma = mm->mmap_cache;
1926        if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1927                struct rb_node *rb_node;
1928
1929                rb_node = mm->mm_rb.rb_node;
1930                vma = NULL;
1931
1932                while (rb_node) {
1933                        struct vm_area_struct *vma_tmp;
1934
1935                        vma_tmp = rb_entry(rb_node,
1936                                           struct vm_area_struct, vm_rb);
1937
1938                        if (vma_tmp->vm_end > addr) {
1939                                vma = vma_tmp;
1940                                if (vma_tmp->vm_start <= addr)
1941                                        break;
1942                                rb_node = rb_node->rb_left;
1943                        } else
1944                                rb_node = rb_node->rb_right;
1945                }
1946                if (vma)
1947                        mm->mmap_cache = vma;
1948        }
1949        return vma;
1950}
1951
1952EXPORT_SYMBOL(find_vma);
1953
1954/*
1955 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
1956 */
1957struct vm_area_struct *
1958find_vma_prev(struct mm_struct *mm, unsigned long addr,
1959                        struct vm_area_struct **pprev)
1960{
1961        struct vm_area_struct *vma;
1962
1963        vma = find_vma(mm, addr);
1964        if (vma) {
1965                *pprev = vma->vm_prev;
1966        } else {
1967                struct rb_node *rb_node = mm->mm_rb.rb_node;
1968                *pprev = NULL;
1969                while (rb_node) {
1970                        *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1971                        rb_node = rb_node->rb_right;
1972                }
1973        }
1974        return vma;
1975}
1976
1977/*
1978 * Verify that the stack growth is acceptable and
1979 * update accounting. This is shared with both the
1980 * grow-up and grow-down cases.
1981 */
1982static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
1983{
1984        struct mm_struct *mm = vma->vm_mm;
1985        struct rlimit *rlim = current->signal->rlim;
1986        unsigned long new_start;
1987
1988        /* address space limit tests */
1989        if (!may_expand_vm(mm, grow))
1990                return -ENOMEM;
1991
1992        /* Stack limit test */
1993        if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
1994                return -ENOMEM;
1995
1996        /* mlock limit tests */
1997        if (vma->vm_flags & VM_LOCKED) {
1998                unsigned long locked;
1999                unsigned long limit;
2000                locked = mm->locked_vm + grow;
2001                limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2002                limit >>= PAGE_SHIFT;
2003                if (locked > limit && !capable(CAP_IPC_LOCK))
2004                        return -ENOMEM;
2005        }
2006
2007        /* Check to ensure the stack will not grow into a hugetlb-only region */
2008        new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2009                        vma->vm_end - size;
2010        if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2011                return -EFAULT;
2012
2013        /*
2014         * Overcommit..  This must be the final test, as it will
2015         * update security statistics.
2016         */
2017        if (security_vm_enough_memory_mm(mm, grow))
2018                return -ENOMEM;
2019
2020        /* Ok, everything looks good - let it rip */
2021        if (vma->vm_flags & VM_LOCKED)
2022                mm->locked_vm += grow;
2023        vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
2024        return 0;
2025}
2026
2027#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
2028/*
2029 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
2030 * vma is the last one with address > vma->vm_end.  Have to extend vma.
2031 */
2032int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2033{
2034        int error;
2035
2036        if (!(vma->vm_flags & VM_GROWSUP))
2037                return -EFAULT;
2038
2039        /*
2040         * We must make sure the anon_vma is allocated
2041         * so that the anon_vma locking is not a noop.
2042         */
2043        if (unlikely(anon_vma_prepare(vma)))
2044                return -ENOMEM;
2045        vma_lock_anon_vma(vma);
2046
2047        /*
2048         * vma->vm_start/vm_end cannot change under us because the caller
2049         * is required to hold the mmap_sem in read mode.  We need the
2050         * anon_vma lock to serialize against concurrent expand_stacks.
2051         * Also guard against wrapping around to address 0.
2052         */
2053        if (address < PAGE_ALIGN(address+4))
2054                address = PAGE_ALIGN(address+4);
2055        else {
2056                vma_unlock_anon_vma(vma);
2057                return -ENOMEM;
2058        }
2059        error = 0;
2060
2061        /* Somebody else might have raced and expanded it already */
2062        if (address > vma->vm_end) {
2063                unsigned long size, grow;
2064
2065                size = address - vma->vm_start;
2066                grow = (address - vma->vm_end) >> PAGE_SHIFT;
2067
2068                error = -ENOMEM;
2069                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2070                        error = acct_stack_growth(vma, size, grow);
2071                        if (!error) {
2072                                /*
2073                                 * vma_gap_update() doesn't support concurrent
2074                                 * updates, but we only hold a shared mmap_sem
2075                                 * lock here, so we need to protect against
2076                                 * concurrent vma expansions.
2077                                 * vma_lock_anon_vma() doesn't help here, as
2078                                 * we don't guarantee that all growable vmas
2079                                 * in a mm share the same root anon vma.
2080                                 * So, we reuse mm->page_table_lock to guard
2081                                 * against concurrent vma expansions.
2082                                 */
2083                                spin_lock(&vma->vm_mm->page_table_lock);
2084                                anon_vma_interval_tree_pre_update_vma(vma);
2085                                vma->vm_end = address;
2086                                anon_vma_interval_tree_post_update_vma(vma);
2087                                if (vma->vm_next)
2088                                        vma_gap_update(vma->vm_next);
2089                                else
2090                                        vma->vm_mm->highest_vm_end = address;
2091                                spin_unlock(&vma->vm_mm->page_table_lock);
2092
2093                                perf_event_mmap(vma);
2094                        }
2095                }
2096        }
2097        vma_unlock_anon_vma(vma);
2098        khugepaged_enter_vma_merge(vma);
2099        validate_mm(vma->vm_mm);
2100        return error;
2101}
2102#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2103
2104/*
2105 * vma is the first one with address < vma->vm_start.  Have to extend vma.
2106 */
2107int expand_downwards(struct vm_area_struct *vma,
2108                                   unsigned long address)
2109{
2110        int error;
2111
2112        /*
2113         * We must make sure the anon_vma is allocated
2114         * so that the anon_vma locking is not a noop.
2115         */
2116        if (unlikely(anon_vma_prepare(vma)))
2117                return -ENOMEM;
2118
2119        address &= PAGE_MASK;
2120        error = security_mmap_addr(address);
2121        if (error)
2122                return error;
2123
2124        vma_lock_anon_vma(vma);
2125
2126        /*
2127         * vma->vm_start/vm_end cannot change under us because the caller
2128         * is required to hold the mmap_sem in read mode.  We need the
2129         * anon_vma lock to serialize against concurrent expand_stacks.
2130         */
2131
2132        /* Somebody else might have raced and expanded it already */
2133        if (address < vma->vm_start) {
2134                unsigned long size, grow;
2135
2136                size = vma->vm_end - address;
2137                grow = (vma->vm_start - address) >> PAGE_SHIFT;
2138
2139                error = -ENOMEM;
2140                if (grow <= vma->vm_pgoff) {
2141                        error = acct_stack_growth(vma, size, grow);
2142                        if (!error) {
2143                                /*
2144                                 * vma_gap_update() doesn't support concurrent
2145                                 * updates, but we only hold a shared mmap_sem
2146                                 * lock here, so we need to protect against
2147                                 * concurrent vma expansions.
2148                                 * vma_lock_anon_vma() doesn't help here, as
2149                                 * we don't guarantee that all growable vmas
2150                                 * in a mm share the same root anon vma.
2151                                 * So, we reuse mm->page_table_lock to guard
2152                                 * against concurrent vma expansions.
2153                                 */
2154                                spin_lock(&vma->vm_mm->page_table_lock);
2155                                anon_vma_interval_tree_pre_update_vma(vma);
2156                                vma->vm_start = address;
2157                                vma->vm_pgoff -= grow;
2158                                anon_vma_interval_tree_post_update_vma(vma);
2159                                vma_gap_update(vma);
2160                                spin_unlock(&vma->vm_mm->page_table_lock);
2161
2162                                perf_event_mmap(vma);
2163                        }
2164                }
2165        }
2166        vma_unlock_anon_vma(vma);
2167        khugepaged_enter_vma_merge(vma);
2168        validate_mm(vma->vm_mm);
2169        return error;
2170}
2171
2172/*
2173 * Note how expand_stack() refuses to expand the stack all the way to
2174 * abut the next virtual mapping, *unless* that mapping itself is also
2175 * a stack mapping. We want to leave room for a guard page, after all
2176 * (the guard page itself is not added here, that is done by the
2177 * actual page faulting logic)
2178 *
2179 * This matches the behavior of the guard page logic (see mm/memory.c:
2180 * check_stack_guard_page()), which only allows the guard page to be
2181 * removed under these circumstances.
2182 */
2183#ifdef CONFIG_STACK_GROWSUP
2184int expand_stack(struct vm_area_struct *vma, unsigned long address)
2185{
2186        struct vm_area_struct *next;
2187
2188        address &= PAGE_MASK;
2189        next = vma->vm_next;
2190        if (next && next->vm_start == address + PAGE_SIZE) {
2191                if (!(next->vm_flags & VM_GROWSUP))
2192                        return -ENOMEM;
2193        }
2194        return expand_upwards(vma, address);
2195}
2196
2197struct vm_area_struct *
2198find_extend_vma(struct mm_struct *mm, unsigned long addr)
2199{
2200        struct vm_area_struct *vma, *prev;
2201
2202        addr &= PAGE_MASK;
2203        vma = find_vma_prev(mm, addr, &prev);
2204        if (vma && (vma->vm_start <= addr))
2205                return vma;
2206        if (!prev || expand_stack(prev, addr))
2207                return NULL;
2208        if (prev->vm_flags & VM_LOCKED) {
2209                mlock_vma_pages_range(prev, addr, prev->vm_end);
2210        }
2211        return prev;
2212}
2213#else
2214int expand_stack(struct vm_area_struct *vma, unsigned long address)
2215{
2216        struct vm_area_struct *prev;
2217
2218        address &= PAGE_MASK;
2219        prev = vma->vm_prev;
2220        if (prev && prev->vm_end == address) {
2221                if (!(prev->vm_flags & VM_GROWSDOWN))
2222                        return -ENOMEM;
2223        }
2224        return expand_downwards(vma, address);
2225}
2226
2227struct vm_area_struct *
2228find_extend_vma(struct mm_struct * mm, unsigned long addr)
2229{
2230        struct vm_area_struct * vma;
2231        unsigned long start;
2232
2233        addr &= PAGE_MASK;
2234        vma = find_vma(mm,addr);
2235        if (!vma)
2236                return NULL;
2237        if (vma->vm_start <= addr)
2238                return vma;
2239        if (!(vma->vm_flags & VM_GROWSDOWN))
2240                return NULL;
2241        start = vma->vm_start;
2242        if (expand_stack(vma, addr))
2243                return NULL;
2244        if (vma->vm_flags & VM_LOCKED) {
2245                mlock_vma_pages_range(vma, addr, start);
2246        }
2247        return vma;
2248}
2249#endif
2250
2251/*
2252 * Ok - we have the memory areas we should free on the vma list,
2253 * so release them, and do the vma updates.
2254 *
2255 * Called with the mm semaphore held.
2256 */
2257static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2258{
2259        unsigned long nr_accounted = 0;
2260
2261        /* Update high watermark before we lower total_vm */
2262        update_hiwater_vm(mm);
2263        do {
2264                long nrpages = vma_pages(vma);
2265
2266                if (vma->vm_flags & VM_ACCOUNT)
2267                        nr_accounted += nrpages;
2268                vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
2269                vma = remove_vma(vma);
2270        } while (vma);
2271        vm_unacct_memory(nr_accounted);
2272        validate_mm(mm);
2273}
2274
2275/*
2276 * Get rid of page table information in the indicated region.
2277 *
2278 * Called with the mm semaphore held.
2279 */
2280static void unmap_region(struct mm_struct *mm,
2281                struct vm_area_struct *vma, struct vm_area_struct *prev,
2282                unsigned long start, unsigned long end)
2283{
2284        struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
2285        struct mmu_gather tlb;
2286
2287        lru_add_drain();
2288        tlb_gather_mmu(&tlb, mm, 0);
2289        update_hiwater_rss(mm);
2290        unmap_vmas(&tlb, vma, start, end);
2291        free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2292                                 next ? next->vm_start : 0);
2293        tlb_finish_mmu(&tlb, start, end);
2294}
2295
2296/*
2297 * Create a list of vma's touched by the unmap, removing them from the mm's
2298 * vma list as we go..
2299 */
2300static void
2301detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2302        struct vm_area_struct *prev, unsigned long end)
2303{
2304        struct vm_area_struct **insertion_point;
2305        struct vm_area_struct *tail_vma = NULL;
2306        unsigned long addr;
2307
2308        insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2309        vma->vm_prev = NULL;
2310        do {
2311                vma_rb_erase(vma, &mm->mm_rb);
2312                mm->map_count--;
2313                tail_vma = vma;
2314                vma = vma->vm_next;
2315        } while (vma && vma->vm_start < end);
2316        *insertion_point = vma;
2317        if (vma) {
2318                vma->vm_prev = prev;
2319                vma_gap_update(vma);
2320        } else
2321                mm->highest_vm_end = prev ? prev->vm_end : 0;
2322        tail_vma->vm_next = NULL;
2323        if (mm->unmap_area == arch_unmap_area)
2324                addr = prev ? prev->vm_end : mm->mmap_base;
2325        else
2326                addr = vma ?  vma->vm_start : mm->mmap_base;
2327        mm->unmap_area(mm, addr);
2328        mm->mmap_cache = NULL;          /* Kill the cache. */
2329}
2330
2331/*
2332 * __split_vma() bypasses sysctl_max_map_count checking.  We use this on the
2333 * munmap path where it doesn't make sense to fail.
2334 */
2335static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2336              unsigned long addr, int new_below)
2337{
2338        struct mempolicy *pol;
2339        struct vm_area_struct *new;
2340        int err = -ENOMEM;
2341
2342        if (is_vm_hugetlb_page(vma) && (addr &
2343                                        ~(huge_page_mask(hstate_vma(vma)))))
2344                return -EINVAL;
2345
2346        new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2347        if (!new)
2348                goto out_err;
2349
2350        /* most fields are the same, copy all, and then fixup */
2351        *new = *vma;
2352
2353        INIT_LIST_HEAD(&new->anon_vma_chain);
2354
2355        if (new_below)
2356                new->vm_end = addr;
2357        else {
2358                new->vm_start = addr;
2359                new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2360        }
2361
2362        pol = mpol_dup(vma_policy(vma));
2363        if (IS_ERR(pol)) {
2364                err = PTR_ERR(pol);
2365                goto out_free_vma;
2366        }
2367        vma_set_policy(new, pol);
2368
2369        if (anon_vma_clone(new, vma))
2370                goto out_free_mpol;
2371
2372        if (new->vm_file)
2373                get_file(new->vm_file);
2374
2375        if (new->vm_ops && new->vm_ops->open)
2376                new->vm_ops->open(new);
2377
2378        if (new_below)
2379                err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2380                        ((addr - new->vm_start) >> PAGE_SHIFT), new);
2381        else
2382                err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2383
2384        /* Success. */
2385        if (!err)
2386                return 0;
2387
2388        /* Clean everything up if vma_adjust failed. */
2389        if (new->vm_ops && new->vm_ops->close)
2390                new->vm_ops->close(new);
2391        if (new->vm_file)
2392                fput(new->vm_file);
2393        unlink_anon_vmas(new);
2394 out_free_mpol:
2395        mpol_put(pol);
2396 out_free_vma:
2397        kmem_cache_free(vm_area_cachep, new);
2398 out_err:
2399        return err;
2400}
2401
2402/*
2403 * Split a vma into two pieces at address 'addr', a new vma is allocated
2404 * either for the first part or the tail.
2405 */
2406int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2407              unsigned long addr, int new_below)
2408{
2409        if (mm->map_count >= sysctl_max_map_count)
2410                return -ENOMEM;
2411
2412        return __split_vma(mm, vma, addr, new_below);
2413}
2414
2415/* Munmap is split into 2 main parts -- this part which finds
2416 * what needs doing, and the areas themselves, which do the
2417 * work.  This now handles partial unmappings.
2418 * Jeremy Fitzhardinge <jeremy@goop.org>
2419 */
2420int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2421{
2422        unsigned long end;
2423        struct vm_area_struct *vma, *prev, *last;
2424
2425        if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2426                return -EINVAL;
2427
2428        if ((len = PAGE_ALIGN(len)) == 0)
2429                return -EINVAL;
2430
2431        /* Find the first overlapping VMA */
2432        vma = find_vma(mm, start);
2433        if (!vma)
2434                return 0;
2435        prev = vma->vm_prev;
2436        /* we have  start < vma->vm_end  */
2437
2438        /* if it doesn't overlap, we have nothing.. */
2439        end = start + len;
2440        if (vma->vm_start >= end)
2441                return 0;
2442
2443        /*
2444         * If we need to split any vma, do it now to save pain later.
2445         *
2446         * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2447         * unmapped vm_area_struct will remain in use: so lower split_vma
2448         * places tmp vma above, and higher split_vma places tmp vma below.
2449         */
2450        if (start > vma->vm_start) {
2451                int error;
2452
2453                /*
2454                 * Make sure that map_count on return from munmap() will
2455                 * not exceed its limit; but let map_count go just above
2456                 * its limit temporarily, to help free resources as expected.
2457                 */
2458                if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2459                        return -ENOMEM;
2460
2461                error = __split_vma(mm, vma, start, 0);
2462                if (error)
2463                        return error;
2464                prev = vma;
2465        }
2466
2467        /* Does it split the last one? */
2468        last = find_vma(mm, end);
2469        if (last && end > last->vm_start) {
2470                int error = __split_vma(mm, last, end, 1);
2471                if (error)
2472                        return error;
2473        }
2474        vma = prev? prev->vm_next: mm->mmap;
2475
2476        /*
2477         * unlock any mlock()ed ranges before detaching vmas
2478         */
2479        if (mm->locked_vm) {
2480                struct vm_area_struct *tmp = vma;
2481                while (tmp && tmp->vm_start < end) {
2482                        if (tmp->vm_flags & VM_LOCKED) {
2483                                mm->locked_vm -= vma_pages(tmp);
2484                                munlock_vma_pages_all(tmp);
2485                        }
2486                        tmp = tmp->vm_next;
2487                }
2488        }
2489
2490        /*
2491         * Remove the vma's, and unmap the actual pages
2492         */
2493        detach_vmas_to_be_unmapped(mm, vma, prev, end);
2494        unmap_region(mm, vma, prev, start, end);
2495
2496        /* Fix up all other VM information */
2497        remove_vma_list(mm, vma);
2498
2499        return 0;
2500}
2501
2502int vm_munmap(unsigned long start, size_t len)
2503{
2504        int ret;
2505        struct mm_struct *mm = current->mm;
2506
2507        down_write(&mm->mmap_sem);
2508        ret = do_munmap(mm, start, len);
2509        up_write(&mm->mmap_sem);
2510        return ret;
2511}
2512EXPORT_SYMBOL(vm_munmap);
2513
2514SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2515{
2516        profile_munmap(addr);
2517        return vm_munmap(addr, len);
2518}
2519
2520static inline void verify_mm_writelocked(struct mm_struct *mm)
2521{
2522#ifdef CONFIG_DEBUG_VM
2523        if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2524                WARN_ON(1);
2525                up_read(&mm->mmap_sem);
2526        }
2527#endif
2528}
2529
2530/*
2531 *  this is really a simplified "do_mmap".  it only handles
2532 *  anonymous maps.  eventually we may be able to do some
2533 *  brk-specific accounting here.
2534 */
2535static unsigned long do_brk(unsigned long addr, unsigned long len)
2536{
2537        struct mm_struct * mm = current->mm;
2538        struct vm_area_struct * vma, * prev;
2539        unsigned long flags;
2540        struct rb_node ** rb_link, * rb_parent;
2541        pgoff_t pgoff = addr >> PAGE_SHIFT;
2542        int error;
2543
2544        len = PAGE_ALIGN(len);
2545        if (!len)
2546                return addr;
2547
2548        flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2549
2550        error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2551        if (error & ~PAGE_MASK)
2552                return error;
2553
2554        /*
2555         * mlock MCL_FUTURE?
2556         */
2557        if (mm->def_flags & VM_LOCKED) {
2558                unsigned long locked, lock_limit;
2559                locked = len >> PAGE_SHIFT;
2560                locked += mm->locked_vm;
2561                lock_limit = rlimit(RLIMIT_MEMLOCK);
2562                lock_limit >>= PAGE_SHIFT;
2563                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2564                        return -EAGAIN;
2565        }
2566
2567        /*
2568         * mm->mmap_sem is required to protect against another thread
2569         * changing the mappings in case we sleep.
2570         */
2571        verify_mm_writelocked(mm);
2572
2573        /*
2574         * Clear old maps.  this also does some error checking for us
2575         */
2576 munmap_back:
2577        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2578                if (do_munmap(mm, addr, len))
2579                        return -ENOMEM;
2580                goto munmap_back;
2581        }
2582
2583        /* Check against address space limits *after* clearing old maps... */
2584        if (!may_expand_vm(mm, len >> PAGE_SHIFT))
2585                return -ENOMEM;
2586
2587        if (mm->map_count > sysctl_max_map_count)
2588                return -ENOMEM;
2589
2590        if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2591                return -ENOMEM;
2592
2593        /* Can we just expand an old private anonymous mapping? */
2594        vma = vma_merge(mm, prev, addr, addr + len, flags,
2595                                        NULL, NULL, pgoff, NULL);
2596        if (vma)
2597                goto out;
2598
2599        /*
2600         * create a vma struct for an anonymous mapping
2601         */
2602        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2603        if (!vma) {
2604                vm_unacct_memory(len >> PAGE_SHIFT);
2605                return -ENOMEM;
2606        }
2607
2608        INIT_LIST_HEAD(&vma->anon_vma_chain);
2609        vma->vm_mm = mm;
2610        vma->vm_start = addr;
2611        vma->vm_end = addr + len;
2612        vma->vm_pgoff = pgoff;
2613        vma->vm_flags = flags;
2614        vma->vm_page_prot = vm_get_page_prot(flags);
2615        vma_link(mm, vma, prev, rb_link, rb_parent);
2616out:
2617        perf_event_mmap(vma);
2618        mm->total_vm += len >> PAGE_SHIFT;
2619        if (flags & VM_LOCKED) {
2620                if (!mlock_vma_pages_range(vma, addr, addr + len))
2621                        mm->locked_vm += (len >> PAGE_SHIFT);
2622        }
2623        return addr;
2624}
2625
2626unsigned long vm_brk(unsigned long addr, unsigned long len)
2627{
2628        struct mm_struct *mm = current->mm;
2629        unsigned long ret;
2630
2631        down_write(&mm->mmap_sem);
2632        ret = do_brk(addr, len);
2633        up_write(&mm->mmap_sem);
2634        return ret;
2635}
2636EXPORT_SYMBOL(vm_brk);
2637
2638/* Release all mmaps. */
2639void exit_mmap(struct mm_struct *mm)
2640{
2641        struct mmu_gather tlb;
2642        struct vm_area_struct *vma;
2643        unsigned long nr_accounted = 0;
2644
2645        /* mm's last user has gone, and its about to be pulled down */
2646        mmu_notifier_release(mm);
2647
2648        if (mm->locked_vm) {
2649                vma = mm->mmap;
2650                while (vma) {
2651                        if (vma->vm_flags & VM_LOCKED)
2652                                munlock_vma_pages_all(vma);
2653                        vma = vma->vm_next;
2654                }
2655        }
2656
2657        arch_exit_mmap(mm);
2658
2659        vma = mm->mmap;
2660        if (!vma)       /* Can happen if dup_mmap() received an OOM */
2661                return;
2662
2663        lru_add_drain();
2664        flush_cache_mm(mm);
2665        tlb_gather_mmu(&tlb, mm, 1);
2666        /* update_hiwater_rss(mm) here? but nobody should be looking */
2667        /* Use -1 here to ensure all VMAs in the mm are unmapped */
2668        unmap_vmas(&tlb, vma, 0, -1);
2669
2670        free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2671        tlb_finish_mmu(&tlb, 0, -1);
2672
2673        /*
2674         * Walk the list again, actually closing and freeing it,
2675         * with preemption enabled, without holding any MM locks.
2676         */
2677        while (vma) {
2678                if (vma->vm_flags & VM_ACCOUNT)
2679                        nr_accounted += vma_pages(vma);
2680                vma = remove_vma(vma);
2681        }
2682        vm_unacct_memory(nr_accounted);
2683
2684        WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2685}
2686
2687/* Insert vm structure into process list sorted by address
2688 * and into the inode's i_mmap tree.  If vm_file is non-NULL
2689 * then i_mmap_mutex is taken here.
2690 */
2691int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2692{
2693        struct vm_area_struct *prev;
2694        struct rb_node **rb_link, *rb_parent;
2695
2696        /*
2697         * The vm_pgoff of a purely anonymous vma should be irrelevant
2698         * until its first write fault, when page's anon_vma and index
2699         * are set.  But now set the vm_pgoff it will almost certainly
2700         * end up with (unless mremap moves it elsewhere before that
2701         * first wfault), so /proc/pid/maps tells a consistent story.
2702         *
2703         * By setting it to reflect the virtual start address of the
2704         * vma, merges and splits can happen in a seamless way, just
2705         * using the existing file pgoff checks and manipulations.
2706         * Similarly in do_mmap_pgoff and in do_brk.
2707         */
2708        if (!vma->vm_file) {
2709                BUG_ON(vma->anon_vma);
2710                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2711        }
2712        if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2713                           &prev, &rb_link, &rb_parent))
2714                return -ENOMEM;
2715        if ((vma->vm_flags & VM_ACCOUNT) &&
2716             security_vm_enough_memory_mm(mm, vma_pages(vma)))
2717                return -ENOMEM;
2718
2719        vma_link(mm, vma, prev, rb_link, rb_parent);
2720        return 0;
2721}
2722
2723/*
2724 * Copy the vma structure to a new location in the same mm,
2725 * prior to moving page table entries, to effect an mremap move.
2726 */
2727struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2728        unsigned long addr, unsigned long len, pgoff_t pgoff,
2729        bool *need_rmap_locks)
2730{
2731        struct vm_area_struct *vma = *vmap;
2732        unsigned long vma_start = vma->vm_start;
2733        struct mm_struct *mm = vma->vm_mm;
2734        struct vm_area_struct *new_vma, *prev;
2735        struct rb_node **rb_link, *rb_parent;
2736        struct mempolicy *pol;
2737        bool faulted_in_anon_vma = true;
2738
2739        /*
2740         * If anonymous vma has not yet been faulted, update new pgoff
2741         * to match new location, to increase its chance of merging.
2742         */
2743        if (unlikely(!vma->vm_file && !vma->anon_vma)) {
2744                pgoff = addr >> PAGE_SHIFT;
2745                faulted_in_anon_vma = false;
2746        }
2747
2748        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2749                return NULL;    /* should never get here */
2750        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2751                        vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2752        if (new_vma) {
2753                /*
2754                 * Source vma may have been merged into new_vma
2755                 */
2756                if (unlikely(vma_start >= new_vma->vm_start &&
2757                             vma_start < new_vma->vm_end)) {
2758                        /*
2759                         * The only way we can get a vma_merge with
2760                         * self during an mremap is if the vma hasn't
2761                         * been faulted in yet and we were allowed to
2762                         * reset the dst vma->vm_pgoff to the
2763                         * destination address of the mremap to allow
2764                         * the merge to happen. mremap must change the
2765                         * vm_pgoff linearity between src and dst vmas
2766                         * (in turn preventing a vma_merge) to be
2767                         * safe. It is only safe to keep the vm_pgoff
2768                         * linear if there are no pages mapped yet.
2769                         */
2770                        VM_BUG_ON(faulted_in_anon_vma);
2771                        *vmap = vma = new_vma;
2772                }
2773                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2774        } else {
2775                new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2776                if (new_vma) {
2777                        *new_vma = *vma;
2778                        new_vma->vm_start = addr;
2779                        new_vma->vm_end = addr + len;
2780                        new_vma->vm_pgoff = pgoff;
2781                        pol = mpol_dup(vma_policy(vma));
2782                        if (IS_ERR(pol))
2783                                goto out_free_vma;
2784                        vma_set_policy(new_vma, pol);
2785                        INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2786                        if (anon_vma_clone(new_vma, vma))
2787                                goto out_free_mempol;
2788                        if (new_vma->vm_file)
2789                                get_file(new_vma->vm_file);
2790                        if (new_vma->vm_ops && new_vma->vm_ops->open)
2791                                new_vma->vm_ops->open(new_vma);
2792                        vma_link(mm, new_vma, prev, rb_link, rb_parent);
2793                        *need_rmap_locks = false;
2794                }
2795        }
2796        return new_vma;
2797
2798 out_free_mempol:
2799        mpol_put(pol);
2800 out_free_vma:
2801        kmem_cache_free(vm_area_cachep, new_vma);
2802        return NULL;
2803}
2804
2805/*
2806 * Return true if the calling process may expand its vm space by the passed
2807 * number of pages
2808 */
2809int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2810{
2811        unsigned long cur = mm->total_vm;       /* pages */
2812        unsigned long lim;
2813
2814        lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2815
2816        if (cur + npages > lim)
2817                return 0;
2818        return 1;
2819}
2820
2821
2822static int special_mapping_fault(struct vm_area_struct *vma,
2823                                struct vm_fault *vmf)
2824{
2825        pgoff_t pgoff;
2826        struct page **pages;
2827
2828        /*
2829         * special mappings have no vm_file, and in that case, the mm
2830         * uses vm_pgoff internally. So we have to subtract it from here.
2831         * We are allowed to do this because we are the mm; do not copy
2832         * this code into drivers!
2833         */
2834        pgoff = vmf->pgoff - vma->vm_pgoff;
2835
2836        for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
2837                pgoff--;
2838
2839        if (*pages) {
2840                struct page *page = *pages;
2841                get_page(page);
2842                vmf->page = page;
2843                return 0;
2844        }
2845
2846        return VM_FAULT_SIGBUS;
2847}
2848
2849/*
2850 * Having a close hook prevents vma merging regardless of flags.
2851 */
2852static void special_mapping_close(struct vm_area_struct *vma)
2853{
2854}
2855
2856static const struct vm_operations_struct special_mapping_vmops = {
2857        .close = special_mapping_close,
2858        .fault = special_mapping_fault,
2859};
2860
2861/*
2862 * Called with mm->mmap_sem held for writing.
2863 * Insert a new vma covering the given region, with the given flags.
2864 * Its pages are supplied by the given array of struct page *.
2865 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
2866 * The region past the last page supplied will always produce SIGBUS.
2867 * The array pointer and the pages it points to are assumed to stay alive
2868 * for as long as this mapping might exist.
2869 */
2870int install_special_mapping(struct mm_struct *mm,
2871                            unsigned long addr, unsigned long len,
2872                            unsigned long vm_flags, struct page **pages)
2873{
2874        int ret;
2875        struct vm_area_struct *vma;
2876
2877        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2878        if (unlikely(vma == NULL))
2879                return -ENOMEM;
2880
2881        INIT_LIST_HEAD(&vma->anon_vma_chain);
2882        vma->vm_mm = mm;
2883        vma->vm_start = addr;
2884        vma->vm_end = addr + len;
2885
2886        vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
2887        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2888
2889        vma->vm_ops = &special_mapping_vmops;
2890        vma->vm_private_data = pages;
2891
2892        ret = insert_vm_struct(mm, vma);
2893        if (ret)
2894                goto out;
2895
2896        mm->total_vm += len >> PAGE_SHIFT;
2897
2898        perf_event_mmap(vma);
2899
2900        return 0;
2901
2902out:
2903        kmem_cache_free(vm_area_cachep, vma);
2904        return ret;
2905}
2906
2907static DEFINE_MUTEX(mm_all_locks_mutex);
2908
2909static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2910{
2911        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2912                /*
2913                 * The LSB of head.next can't change from under us
2914                 * because we hold the mm_all_locks_mutex.
2915                 */
2916                down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
2917                /*
2918                 * We can safely modify head.next after taking the
2919                 * anon_vma->root->rwsem. If some other vma in this mm shares
2920                 * the same anon_vma we won't take it again.
2921                 *
2922                 * No need of atomic instructions here, head.next
2923                 * can't change from under us thanks to the
2924                 * anon_vma->root->rwsem.
2925                 */
2926                if (__test_and_set_bit(0, (unsigned long *)
2927                                       &anon_vma->root->rb_root.rb_node))
2928                        BUG();
2929        }
2930}
2931
2932static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2933{
2934        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2935                /*
2936                 * AS_MM_ALL_LOCKS can't change from under us because
2937                 * we hold the mm_all_locks_mutex.
2938                 *
2939                 * Operations on ->flags have to be atomic because
2940                 * even if AS_MM_ALL_LOCKS is stable thanks to the
2941                 * mm_all_locks_mutex, there may be other cpus
2942                 * changing other bitflags in parallel to us.
2943                 */
2944                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2945                        BUG();
2946                mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
2947        }
2948}
2949
2950/*
2951 * This operation locks against the VM for all pte/vma/mm related
2952 * operations that could ever happen on a certain mm. This includes
2953 * vmtruncate, try_to_unmap, and all page faults.
2954 *
2955 * The caller must take the mmap_sem in write mode before calling
2956 * mm_take_all_locks(). The caller isn't allowed to release the
2957 * mmap_sem until mm_drop_all_locks() returns.
2958 *
2959 * mmap_sem in write mode is required in order to block all operations
2960 * that could modify pagetables and free pages without need of
2961 * altering the vma layout (for example populate_range() with
2962 * nonlinear vmas). It's also needed in write mode to avoid new
2963 * anon_vmas to be associated with existing vmas.
2964 *
2965 * A single task can't take more than one mm_take_all_locks() in a row
2966 * or it would deadlock.
2967 *
2968 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2969 * mapping->flags avoid to take the same lock twice, if more than one
2970 * vma in this mm is backed by the same anon_vma or address_space.
2971 *
2972 * We can take all the locks in random order because the VM code
2973 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
2974 * takes more than one of them in a row. Secondly we're protected
2975 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2976 *
2977 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
2978 * that may have to take thousand of locks.
2979 *
2980 * mm_take_all_locks() can fail if it's interrupted by signals.
2981 */
2982int mm_take_all_locks(struct mm_struct *mm)
2983{
2984        struct vm_area_struct *vma;
2985        struct anon_vma_chain *avc;
2986
2987        BUG_ON(down_read_trylock(&mm->mmap_sem));
2988
2989        mutex_lock(&mm_all_locks_mutex);
2990
2991        for (vma = mm->mmap; vma; vma = vma->vm_next) {
2992                if (signal_pending(current))
2993                        goto out_unlock;
2994                if (vma->vm_file && vma->vm_file->f_mapping)
2995                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
2996        }
2997
2998        for (vma = mm->mmap; vma; vma = vma->vm_next) {
2999                if (signal_pending(current))
3000                        goto out_unlock;
3001                if (vma->anon_vma)
3002                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3003                                vm_lock_anon_vma(mm, avc->anon_vma);
3004        }
3005
3006        return 0;
3007
3008out_unlock:
3009        mm_drop_all_locks(mm);
3010        return -EINTR;
3011}
3012
3013static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3014{
3015        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3016                /*
3017                 * The LSB of head.next can't change to 0 from under
3018                 * us because we hold the mm_all_locks_mutex.
3019                 *
3020                 * We must however clear the bitflag before unlocking
3021                 * the vma so the users using the anon_vma->rb_root will
3022                 * never see our bitflag.
3023                 *
3024                 * No need of atomic instructions here, head.next
3025                 * can't change from under us until we release the
3026                 * anon_vma->root->rwsem.
3027                 */
3028                if (!__test_and_clear_bit(0, (unsigned long *)
3029                                          &anon_vma->root->rb_root.rb_node))
3030                        BUG();
3031                anon_vma_unlock(anon_vma);
3032        }
3033}
3034
3035static void vm_unlock_mapping(struct address_space *mapping)
3036{
3037        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3038                /*
3039                 * AS_MM_ALL_LOCKS can't change to 0 from under us
3040                 * because we hold the mm_all_locks_mutex.
3041                 */
3042                mutex_unlock(&mapping->i_mmap_mutex);
3043                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3044                                        &mapping->flags))
3045                        BUG();
3046        }
3047}
3048
3049/*
3050 * The mmap_sem cannot be released by the caller until
3051 * mm_drop_all_locks() returns.
3052 */
3053void mm_drop_all_locks(struct mm_struct *mm)
3054{
3055        struct vm_area_struct *vma;
3056        struct anon_vma_chain *avc;
3057
3058        BUG_ON(down_read_trylock(&mm->mmap_sem));
3059        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3060
3061        for (vma = mm->mmap; vma; vma = vma->vm_next) {
3062                if (vma->anon_vma)
3063                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3064                                vm_unlock_anon_vma(avc->anon_vma);
3065                if (vma->vm_file && vma->vm_file->f_mapping)
3066                        vm_unlock_mapping(vma->vm_file->f_mapping);
3067        }
3068
3069        mutex_unlock(&mm_all_locks_mutex);
3070}
3071
3072/*
3073 * initialise the VMA slab
3074 */
3075void __init mmap_init(void)
3076{
3077        int ret;
3078
3079        ret = percpu_counter_init(&vm_committed_as, 0);
3080        VM_BUG_ON(ret);
3081}
3082
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.