linux/mm/madvise.c
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
                return 1;
        }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end, int behavior)
{
        struct mm_struct *mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO) {
                        error = -EINVAL;
                        goto out;
                }
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out;
                break;
        }

        if (new_flags == vma->vm_flags) {
                *prev = vma;
                goto out;
        }

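        /*
         * Compute the page offset that 'start' would have within the file,
         * so vma_merge() checks mergeability as if the vma began at start.
         */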
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                                vma->vm_file, pgoff, vma_policy(vma));
        if (*prev) {
                vma = *prev;
                goto success;
        }

        *prev = vma;

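        /* No merge was possible: split so that [start, end) gets its own vma. */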
        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
                        goto out;
        }

        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
                        goto out;
        }

success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
        vma->vm_flags = new_flags;

out:
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct file *file = vma->vm_file;

        if (!file)
                return -EBADF;

        if (file->f_mapping->a_ops->get_xip_mem) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        *prev = vma;
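        /*
         * Convert [start, end) from virtual addresses into page offsets
         * within the file, clamping end to this vma, before readahead.
         */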
        start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        if (end > vma->vm_end)
                end = vma->vm_end;
        end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

        force_page_cache_readahead(file->f_mapping, file, start, end - start);
        return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        *prev = vma;
        if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
                return -EINVAL;

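        /*
         * For nonlinear mappings, pass zap_details so the zapping code can
         * preserve each pte's nonlinear file offset while dropping the pages.
         */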
        if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
                struct zap_details details = {
                        .nonlinear_vma = vma,
                        .last_index = ULONG_MAX,
                };
                zap_page_range(vma, start, end - start, &details);
        } else
                zap_page_range(vma, start, end - start, NULL);
        return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        struct address_space *mapping;
        loff_t offset, endoff;
        int error;

        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */

        if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
                return -EINVAL;

        if (!vma->vm_file || !vma->vm_file->f_mapping ||
            !vma->vm_file->f_mapping->host)
                return -EINVAL;

        if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
                return -EACCES;

        mapping = vma->vm_file->f_mapping;

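        /* Translate [start, end) into byte offsets within the backing file. */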
        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
        endoff = (loff_t)(end - vma->vm_start - 1)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /* vmtruncate_range needs to take i_mutex */
        up_read(&current->mm->mmap_sem);
        error = vmtruncate_range(mapping->host, offset, endoff);
        down_read(&current->mm->mmap_sem);
        return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
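/*
 * Illustrative only (not part of the original file): with
 * CONFIG_MEMORY_FAILURE enabled, a CAP_SYS_ADMIN test program can poison
 * one of its own pages with
 *
 *        madvise(addr, getpagesize(), MADV_HWPOISON);
 *
 * after which the kernel handles that page as if a real memory error
 * had been reported on it.
 */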
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        /* Poison (or soft-offline) the range one page at a time. */
        for (; start < end; start += PAGE_SIZE) {
                struct page *p;
                int ret;

                ret = get_user_pages_fast(start, 1, 0, &p);
                if (ret != 1)
                        return ret;
                if (bhv == MADV_SOFT_OFFLINE) {
                        printk(KERN_INFO "Soft offlining page %lx at %lx\n",
                                page_to_pfn(p), start);
                        ret = soft_offline_page(p, MF_COUNT_INCREASED);
                        if (ret)
                                return ret;
                        continue;
                }
                printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
                       page_to_pfn(p), start);
                /* Ignore return value for now */
                __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
        }
        return 0;
}
#endif

static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
{
        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
                return madvise_behavior(vma, prev, start, end, behavior);
        }
}

static int
madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
#endif
                return 1;

        default:
                return 0;
        }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *              results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *              on any access, since it is unlikely that the appli-
 *              cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *              once, so they can be aggressively read ahead, and
 *              can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *              some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *              so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *              pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *              typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *              this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants the given range to be backed by
 *              transparent huge pages where possible.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *              transparent huge pages.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *              "behavior" is not a valid value, or application
 *              is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *              mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
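/*
 * Illustrative userspace sketch (not part of the original file): hint
 * sequential access over a file mapping, then drop the pages when done.
 *
 *        char *buf = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *        if (buf != MAP_FAILED) {
 *                madvise(buf, len, MADV_SEQUENTIAL);
 *                ... read through buf ...
 *                madvise(buf, len, MADV_DONTNEED);
 *                munmap(buf, len);
 *        }
 */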
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
        int unmapped_error = 0;
        int error = -EINVAL;
        int write;
        size_t len;

#ifdef CONFIG_MEMORY_FAILURE
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_hwpoison(behavior, start, start + len_in);
#endif
        if (!madvise_behavior_valid(behavior))
                return error;

        write = madvise_need_mmap_write(behavior);
        if (write)
                down_write(&current->mm->mmap_sem);
        else
                down_read(&current->mm->mmap_sem);

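        /* start must be page-aligned; round len up to a whole page. */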
        if (start & ~PAGE_MASK)
                goto out;
        len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len_in && !len)
                goto out;

        end = start + len;
        if (end < start)
                goto out;

        error = 0;
        if (end == start)
                goto out;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
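        /* If start lies inside vma, vma is the predecessor of the range we act on. */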
        vma = find_vma_prev(current->mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        for (;;) {
                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = madvise_vma(vma, &prev, start, tmp, behavior);
                if (error)
                        goto out;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                error = unmapped_error;
                if (start >= end)
                        goto out;
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_sem */
                        vma = find_vma(current->mm, start);
        }
out:
        if (write)
                up_write(&current->mm->mmap_sem);
        else
                up_read(&current->mm->mmap_sem);

        return error;
}