linux/mm/filemap.c
<<
>>
Prefs
   1/*
   2 *      linux/mm/filemap.c
   3 *
   4 * Copyright (C) 1994-1999  Linus Torvalds
   5 */
   6
   7/*
   8 * This file handles the generic file mmap semantics used by
   9 * most "normal" filesystems (but you don't /have/ to use this:
  10 * the NFS filesystem used to do this differently, for example)
  11 */
  12#include <linux/module.h>
  13#include <linux/slab.h>
  14#include <linux/compiler.h>
  15#include <linux/fs.h>
  16#include <linux/uaccess.h>
  17#include <linux/aio.h>
  18#include <linux/capability.h>
  19#include <linux/kernel_stat.h>
  20#include <linux/mm.h>
  21#include <linux/swap.h>
  22#include <linux/mman.h>
  23#include <linux/pagemap.h>
  24#include <linux/file.h>
  25#include <linux/uio.h>
  26#include <linux/hash.h>
  27#include <linux/writeback.h>
  28#include <linux/backing-dev.h>
  29#include <linux/pagevec.h>
  30#include <linux/blkdev.h>
  31#include <linux/security.h>
  32#include <linux/syscalls.h>
  33#include <linux/cpuset.h>
  34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
  35#include <linux/memcontrol.h>
  36#include "internal.h"
  37
  38/*
  39 * FIXME: remove all knowledge of the buffer layer from the core VM
  40 */
  41#include <linux/buffer_head.h> /* for generic_osync_inode */
  42
  43#include <asm/mman.h>
  44
  45static ssize_t
  46generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  47        loff_t offset, unsigned long nr_segs);
  48
  49/*
  50 * Shared mappings implemented 30.11.1994. It's not fully working yet,
  51 * though.
  52 *
  53 * Shared mappings now work. 15.8.1995  Bruno.
  54 *
  55 * finished 'unifying' the page and buffer cache and SMP-threaded the
  56 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  57 *
  58 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  59 */
  60
  61/*
  62 * Lock ordering:
  63 *
  64 *  ->i_mmap_lock               (vmtruncate)
  65 *    ->private_lock            (__free_pte->__set_page_dirty_buffers)
  66 *      ->swap_lock             (exclusive_swap_page, others)
  67 *        ->mapping->tree_lock
  68 *
  69 *  ->i_mutex
  70 *    ->i_mmap_lock             (truncate->unmap_mapping_range)
  71 *
  72 *  ->mmap_sem
  73 *    ->i_mmap_lock
  74 *      ->page_table_lock or pte_lock   (various, mainly in memory.c)
  75 *        ->mapping->tree_lock  (arch-dependent flush_dcache_mmap_lock)
  76 *
  77 *  ->mmap_sem
  78 *    ->lock_page               (access_process_vm)
  79 *
  80 *  ->i_mutex                   (generic_file_buffered_write)
  81 *    ->mmap_sem                (fault_in_pages_readable->do_page_fault)
  82 *
  83 *  ->i_mutex
  84 *    ->i_alloc_sem             (various)
  85 *
  86 *  ->inode_lock
  87 *    ->sb_lock                 (fs/fs-writeback.c)
  88 *    ->mapping->tree_lock      (__sync_single_inode)
  89 *
  90 *  ->i_mmap_lock
  91 *    ->anon_vma.lock           (vma_adjust)
  92 *
  93 *  ->anon_vma.lock
  94 *    ->page_table_lock or pte_lock     (anon_vma_prepare and various)
  95 *
  96 *  ->page_table_lock or pte_lock
  97 *    ->swap_lock               (try_to_unmap_one)
  98 *    ->private_lock            (try_to_unmap_one)
  99 *    ->tree_lock               (try_to_unmap_one)
 100 *    ->zone.lru_lock           (follow_page->mark_page_accessed)
 101 *    ->zone.lru_lock           (check_pte_range->isolate_lru_page)
 102 *    ->private_lock            (page_remove_rmap->set_page_dirty)
 103 *    ->tree_lock               (page_remove_rmap->set_page_dirty)
 104 *    ->inode_lock              (page_remove_rmap->set_page_dirty)
 105 *    ->inode_lock              (zap_pte_range->set_page_dirty)
 106 *    ->private_lock            (zap_pte_range->__set_page_dirty_buffers)
 107 *
 108 *  ->task->proc_lock
 109 *    ->dcache_lock             (proc_pid_lookup)
 110 */
 111
 112/*
 113 * Remove a page from the page cache and free it. Caller has to make
 114 * sure the page is locked and that nobody else uses it - or that usage
 115 * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
 116 */
 117void __remove_from_page_cache(struct page *page)
 118{
 119        struct address_space *mapping = page->mapping;
 120
 121        mem_cgroup_uncharge_page(page);
 122        radix_tree_delete(&mapping->page_tree, page->index);
 123        page->mapping = NULL;
 124        mapping->nrpages--;
 125        __dec_zone_page_state(page, NR_FILE_PAGES);
 126        BUG_ON(page_mapped(page));
 127
 128        /*
 129         * Some filesystems seem to re-dirty the page even after
 130         * the VM has canceled the dirty bit (eg ext3 journaling).
 131         *
 132         * Fix it up by doing a final dirty accounting check after
 133         * having removed the page entirely.
 134         */
 135        if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 136                dec_zone_page_state(page, NR_FILE_DIRTY);
 137                dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 138        }
 139}
 140
 141void remove_from_page_cache(struct page *page)
 142{
 143        struct address_space *mapping = page->mapping;
 144
 145        BUG_ON(!PageLocked(page));
 146
 147        write_lock_irq(&mapping->tree_lock);
 148        __remove_from_page_cache(page);
 149        write_unlock_irq(&mapping->tree_lock);
 150}
 151
 152static int sync_page(void *word)
 153{
 154        struct address_space *mapping;
 155        struct page *page;
 156
 157        page = container_of((unsigned long *)word, struct page, flags);
 158
 159        /*
 160         * page_mapping() is being called without PG_locked held.
 161         * Some knowledge of the state and use of the page is used to
 162         * reduce the requirements down to a memory barrier.
 163         * The danger here is of a stale page_mapping() return value
 164         * indicating a struct address_space different from the one it's
 165         * associated with when it is associated with one.
 166         * After smp_mb(), it's either the correct page_mapping() for
 167         * the page, or an old page_mapping() and the page's own
 168         * page_mapping() has gone NULL.
 169         * The ->sync_page() address_space operation must tolerate
 170         * page_mapping() going NULL. By an amazing coincidence,
 171         * this comes about because none of the users of the page
 172         * in the ->sync_page() methods make essential use of the
 173         * page_mapping(), merely passing the page down to the backing
 174         * device's unplug functions when it's non-NULL, which in turn
 175         * ignore it for all cases but swap, where only page_private(page) is
 176         * of interest. When page_mapping() does go NULL, the entire
 177         * call stack gracefully ignores the page and returns.
 178         * -- wli
 179         */
 180        smp_mb();
 181        mapping = page_mapping(page);
 182        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 183                mapping->a_ops->sync_page(page);
 184        io_schedule();
 185        return 0;
 186}
 187
 188static int sync_page_killable(void *word)
 189{
 190        sync_page(word);
 191        return fatal_signal_pending(current) ? -EINTR : 0;
 192}
 193
 194/**
 195 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 196 * @mapping:    address space structure to write
 197 * @start:      offset in bytes where the range starts
 198 * @end:        offset in bytes where the range ends (inclusive)
 199 * @sync_mode:  enable synchronous operation
 200 *
 201 * Start writeback against all of a mapping's dirty pages that lie
 202 * within the byte offsets <start, end> inclusive.
 203 *
 204 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 205 * opposed to a regular memory cleansing writeback.  The difference between
 206 * these two operations is that if a dirty page/buffer is encountered, it must
 207 * be waited upon, and not just skipped over.
 208 */
 209int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 210                                loff_t end, int sync_mode)
 211{
 212        int ret;
 213        struct writeback_control wbc = {
 214                .sync_mode = sync_mode,
 215                .nr_to_write = mapping->nrpages * 2,
 216                .range_start = start,
 217                .range_end = end,
 218        };
 219
 220        if (!mapping_cap_writeback_dirty(mapping))
 221                return 0;
 222
 223        ret = do_writepages(mapping, &wbc);
 224        return ret;
 225}
 226
 227static inline int __filemap_fdatawrite(struct address_space *mapping,
 228        int sync_mode)
 229{
 230        return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
 231}
 232
 233int filemap_fdatawrite(struct address_space *mapping)
 234{
 235        return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 236}
 237EXPORT_SYMBOL(filemap_fdatawrite);
 238
 239static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 240                                loff_t end)
 241{
 242        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 243}
 244
 245/**
 246 * filemap_flush - mostly a non-blocking flush
 247 * @mapping:    target address_space
 248 *
 249 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 250 * purposes - I/O may not be started against all dirty pages.
 251 */
 252int filemap_flush(struct address_space *mapping)
 253{
 254        return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
 255}
 256EXPORT_SYMBOL(filemap_flush);
 257
 258/**
 259 * wait_on_page_writeback_range - wait for writeback to complete
 260 * @mapping:    target address_space
 261 * @start:      beginning page index
 262 * @end:        ending page index
 263 *
 264 * Wait for writeback to complete against pages indexed by start->end
 265 * inclusive
 266 */
 267int wait_on_page_writeback_range(struct address_space *mapping,
 268                                pgoff_t start, pgoff_t end)
 269{
 270        struct pagevec pvec;
 271        int nr_pages;
 272        int ret = 0;
 273        pgoff_t index;
 274
 275        if (end < start)
 276                return 0;
 277
 278        pagevec_init(&pvec, 0);
 279        index = start;
 280        while ((index <= end) &&
 281                        (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 282                        PAGECACHE_TAG_WRITEBACK,
 283                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
 284                unsigned i;
 285
 286                for (i = 0; i < nr_pages; i++) {
 287                        struct page *page = pvec.pages[i];
 288
 289                        /* until radix tree lookup accepts end_index */
 290                        if (page->index > end)
 291                                continue;
 292
 293                        wait_on_page_writeback(page);
 294                        if (PageError(page))
 295                                ret = -EIO;
 296                }
 297                pagevec_release(&pvec);
 298                cond_resched();
 299        }
 300
 301        /* Check for outstanding write errors */
 302        if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
 303                ret = -ENOSPC;
 304        if (test_and_clear_bit(AS_EIO, &mapping->flags))
 305                ret = -EIO;
 306
 307        return ret;
 308}
 309
 310/**
 311 * sync_page_range - write and wait on all pages in the passed range
 312 * @inode:      target inode
 313 * @mapping:    target address_space
 314 * @pos:        beginning offset in pages to write
 315 * @count:      number of bytes to write
 316 *
 317 * Write and wait upon all the pages in the passed range.  This is a "data
 318 * integrity" operation.  It waits upon in-flight writeout before starting and
 319 * waiting upon new writeout.  If there was an IO error, return it.
 320 *
 321 * We need to re-take i_mutex during the generic_osync_inode list walk because
 322 * it is otherwise livelockable.
 323 */
 324int sync_page_range(struct inode *inode, struct address_space *mapping,
 325                        loff_t pos, loff_t count)
 326{
 327        pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 328        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 329        int ret;
 330
 331        if (!mapping_cap_writeback_dirty(mapping) || !count)
 332                return 0;
 333        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 334        if (ret == 0) {
 335                mutex_lock(&inode->i_mutex);
 336                ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 337                mutex_unlock(&inode->i_mutex);
 338        }
 339        if (ret == 0)
 340                ret = wait_on_page_writeback_range(mapping, start, end);
 341        return ret;
 342}
 343EXPORT_SYMBOL(sync_page_range);
 344
 345/**
 346 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
 347 * @inode:      target inode
 348 * @mapping:    target address_space
 349 * @pos:        beginning offset in pages to write
 350 * @count:      number of bytes to write
 351 *
 352 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
 353 * as it forces O_SYNC writers to different parts of the same file
 354 * to be serialised right until io completion.
 355 */
 356int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
 357                           loff_t pos, loff_t count)
 358{
 359        pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 360        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 361        int ret;
 362
 363        if (!mapping_cap_writeback_dirty(mapping) || !count)
 364                return 0;
 365        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 366        if (ret == 0)
 367                ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 368        if (ret == 0)
 369                ret = wait_on_page_writeback_range(mapping, start, end);
 370        return ret;
 371}
 372EXPORT_SYMBOL(sync_page_range_nolock);
 373
 374/**
 375 * filemap_fdatawait - wait for all under-writeback pages to complete
 376 * @mapping: address space structure to wait for
 377 *
 378 * Walk the list of under-writeback pages of the given address space
 379 * and wait for all of them.
 380 */
 381int filemap_fdatawait(struct address_space *mapping)
 382{
 383        loff_t i_size = i_size_read(mapping->host);
 384
 385        if (i_size == 0)
 386                return 0;
 387
 388        return wait_on_page_writeback_range(mapping, 0,
 389                                (i_size - 1) >> PAGE_CACHE_SHIFT);
 390}
 391EXPORT_SYMBOL(filemap_fdatawait);
 392
 393int filemap_write_and_wait(struct address_space *mapping)
 394{
 395        int err = 0;
 396
 397        if (mapping->nrpages) {
 398                err = filemap_fdatawrite(mapping);
 399                /*
 400                 * Even if the above returned error, the pages may be
 401                 * written partially (e.g. -ENOSPC), so we wait for it.
 402                 * But the -EIO is special case, it may indicate the worst
 403                 * thing (e.g. bug) happened, so we avoid waiting for it.
 404                 */
 405                if (err != -EIO) {
 406                        int err2 = filemap_fdatawait(mapping);
 407                        if (!err)
 408                                err = err2;
 409                }
 410        }
 411        return err;
 412}
 413EXPORT_SYMBOL(filemap_write_and_wait);
 414
 415/**
 416 * filemap_write_and_wait_range - write out & wait on a file range
 417 * @mapping:    the address_space for the pages
 418 * @lstart:     offset in bytes where the range starts
 419 * @lend:       offset in bytes where the range ends (inclusive)
 420 *
 421 * Write out and wait upon file offsets lstart->lend, inclusive.
 422 *
 423 * Note that `lend' is inclusive (describes the last byte to be written) so
 424 * that this function can be used to write to the very end-of-file (end = -1).
 425 */
 426int filemap_write_and_wait_range(struct address_space *mapping,
 427                                 loff_t lstart, loff_t lend)
 428{
 429        int err = 0;
 430
 431        if (mapping->nrpages) {
 432                err = __filemap_fdatawrite_range(mapping, lstart, lend,
 433                                                 WB_SYNC_ALL);
 434                /* See comment of filemap_write_and_wait() */
 435                if (err != -EIO) {
 436                        int err2 = wait_on_page_writeback_range(mapping,
 437                                                lstart >> PAGE_CACHE_SHIFT,
 438                                                lend >> PAGE_CACHE_SHIFT);
 439                        if (!err)
 440                                err = err2;
 441                }
 442        }
 443        return err;
 444}
 445
 446/**
 447 * add_to_page_cache - add newly allocated pagecache pages
 448 * @page:       page to add
 449 * @mapping:    the page's address_space
 450 * @offset:     page index
 451 * @gfp_mask:   page allocation mode
 452 *
 453 * This function is used to add newly allocated pagecache pages;
 454 * the page is new, so we can just run SetPageLocked() against it.
 455 * The other page state flags were set by rmqueue().
 456 *
 457 * This function does not add the page to the LRU.  The caller must do that.
 458 */
 459int add_to_page_cache(struct page *page, struct address_space *mapping,
 460                pgoff_t offset, gfp_t gfp_mask)
 461{
 462        int error = mem_cgroup_cache_charge(page, current->mm,
 463                                        gfp_mask & ~__GFP_HIGHMEM);
 464        if (error)
 465                goto out;
 466
 467        error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 468        if (error == 0) {
 469                write_lock_irq(&mapping->tree_lock);
 470                error = radix_tree_insert(&mapping->page_tree, offset, page);
 471                if (!error) {
 472                        page_cache_get(page);
 473                        SetPageLocked(page);
 474                        page->mapping = mapping;
 475                        page->index = offset;
 476                        mapping->nrpages++;
 477                        __inc_zone_page_state(page, NR_FILE_PAGES);
 478                } else
 479                        mem_cgroup_uncharge_page(page);
 480
 481                write_unlock_irq(&mapping->tree_lock);
 482                radix_tree_preload_end();
 483        } else
 484                mem_cgroup_uncharge_page(page);
 485out:
 486        return error;
 487}
 488EXPORT_SYMBOL(add_to_page_cache);
 489
 490int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 491                                pgoff_t offset, gfp_t gfp_mask)
 492{
 493        int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
 494        if (ret == 0)
 495                lru_cache_add(page);
 496        return ret;
 497}
 498
 499#ifdef CONFIG_NUMA
 500struct page *__page_cache_alloc(gfp_t gfp)
 501{
 502        if (cpuset_do_page_mem_spread()) {
 503                int n = cpuset_mem_spread_node();
 504                return alloc_pages_node(n, gfp, 0);
 505        }
 506        return alloc_pages(gfp, 0);
 507}
 508EXPORT_SYMBOL(__page_cache_alloc);
 509#endif
 510
 511static int __sleep_on_page_lock(void *word)
 512{
 513        io_schedule();
 514        return 0;
 515}
 516
 517/*
 518 * In order to wait for pages to become available there must be
 519 * waitqueues associated with pages. By using a hash table of
 520 * waitqueues where the bucket discipline is to maintain all
 521 * waiters on the same queue and wake all when any of the pages
 522 * become available, and for the woken contexts to check to be
 523 * sure the appropriate page became available, this saves space
 524 * at a cost of "thundering herd" phenomena during rare hash
 525 * collisions.
 526 */
 527static wait_queue_head_t *page_waitqueue(struct page *page)
 528{
 529        const struct zone *zone = page_zone(page);
 530
 531        return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 532}
 533
 534static inline void wake_up_page(struct page *page, int bit)
 535{
 536        __wake_up_bit(page_waitqueue(page), &page->flags, bit);
 537}
 538
 539void wait_on_page_bit(struct page *page, int bit_nr)
 540{
 541        DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 542
 543        if (test_bit(bit_nr, &page->flags))
 544                __wait_on_bit(page_waitqueue(page), &wait, sync_page,
 545                                                        TASK_UNINTERRUPTIBLE);
 546}
 547EXPORT_SYMBOL(wait_on_page_bit);
 548
 549/**
 550 * unlock_page - unlock a locked page
 551 * @page: the page
 552 *
 553 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 554 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 555 * mechananism between PageLocked pages and PageWriteback pages is shared.
 556 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 557 *
 558 * The first mb is necessary to safely close the critical section opened by the
 559 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
 560 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
 561 * parallel wait_on_page_locked()).
 562 */
 563void unlock_page(struct page *page)
 564{
 565        smp_mb__before_clear_bit();
 566        if (!TestClearPageLocked(page))
 567                BUG();
 568        smp_mb__after_clear_bit(); 
 569        wake_up_page(page, PG_locked);
 570}
 571EXPORT_SYMBOL(unlock_page);
 572
 573/**
 574 * end_page_writeback - end writeback against a page
 575 * @page: the page
 576 */
 577void end_page_writeback(struct page *page)
 578{
 579        if (TestClearPageReclaim(page))
 580                rotate_reclaimable_page(page);
 581
 582        if (!test_clear_page_writeback(page))
 583                BUG();
 584
 585        smp_mb__after_clear_bit();
 586        wake_up_page(page, PG_writeback);
 587}
 588EXPORT_SYMBOL(end_page_writeback);
 589
 590/**
 591 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 592 * @page: the page to lock
 593 *
 594 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 595 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 596 * chances are that on the second loop, the block layer's plug list is empty,
 597 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 598 */
 599void __lock_page(struct page *page)
 600{
 601        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 602
 603        __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
 604                                                        TASK_UNINTERRUPTIBLE);
 605}
 606EXPORT_SYMBOL(__lock_page);
 607
 608int __lock_page_killable(struct page *page)
 609{
 610        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 611
 612        return __wait_on_bit_lock(page_waitqueue(page), &wait,
 613                                        sync_page_killable, TASK_KILLABLE);
 614}
 615
 616/**
 617 * __lock_page_nosync - get a lock on the page, without calling sync_page()
 618 * @page: the page to lock
 619 *
 620 * Variant of lock_page that does not require the caller to hold a reference
 621 * on the page's mapping.
 622 */
 623void __lock_page_nosync(struct page *page)
 624{
 625        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 626        __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
 627                                                        TASK_UNINTERRUPTIBLE);
 628}
 629
 630/**
 631 * find_get_page - find and get a page reference
 632 * @mapping: the address_space to search
 633 * @offset: the page index
 634 *
 635 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 636 * If yes, increment its refcount and return it; if no, return NULL.
 637 */
 638struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
 639{
 640        struct page *page;
 641
 642        read_lock_irq(&mapping->tree_lock);
 643        page = radix_tree_lookup(&mapping->page_tree, offset);
 644        if (page)
 645                page_cache_get(page);
 646        read_unlock_irq(&mapping->tree_lock);
 647        return page;
 648}
 649EXPORT_SYMBOL(find_get_page);
 650
 651/**
 652 * find_lock_page - locate, pin and lock a pagecache page
 653 * @mapping: the address_space to search
 654 * @offset: the page index
 655 *
 656 * Locates the desired pagecache page, locks it, increments its reference
 657 * count and returns its address.
 658 *
 659 * Returns zero if the page was not present. find_lock_page() may sleep.
 660 */
 661struct page *find_lock_page(struct address_space *mapping,
 662                                pgoff_t offset)
 663{
 664        struct page *page;
 665
 666repeat:
 667        read_lock_irq(&mapping->tree_lock);
 668        page = radix_tree_lookup(&mapping->page_tree, offset);
 669        if (page) {
 670                page_cache_get(page);
 671                if (TestSetPageLocked(page)) {
 672                        read_unlock_irq(&mapping->tree_lock);
 673                        __lock_page(page);
 674
 675                        /* Has the page been truncated while we slept? */
 676                        if (unlikely(page->mapping != mapping)) {
 677                                unlock_page(page);
 678                                page_cache_release(page);
 679                                goto repeat;
 680                        }
 681                        VM_BUG_ON(page->index != offset);
 682                        goto out;
 683                }
 684        }
 685        read_unlock_irq(&mapping->tree_lock);
 686out:
 687        return page;
 688}
 689EXPORT_SYMBOL(find_lock_page);
 690
 691/**
 692 * find_or_create_page - locate or add a pagecache page
 693 * @mapping: the page's address_space
 694 * @index: the page's index into the mapping
 695 * @gfp_mask: page allocation mode
 696 *
 697 * Locates a page in the pagecache.  If the page is not present, a new page
 698 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 699 * LRU list.  The returned page is locked and has its reference count
 700 * incremented.
 701 *
 702 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
 703 * allocation!
 704 *
 705 * find_or_create_page() returns the desired page's address, or zero on
 706 * memory exhaustion.
 707 */
 708struct page *find_or_create_page(struct address_space *mapping,
 709                pgoff_t index, gfp_t gfp_mask)
 710{
 711        struct page *page;
 712        int err;
 713repeat:
 714        page = find_lock_page(mapping, index);
 715        if (!page) {
 716                page = __page_cache_alloc(gfp_mask);
 717                if (!page)
 718                        return NULL;
 719                err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
 720                if (unlikely(err)) {
 721                        page_cache_release(page);
 722                        page = NULL;
 723                        if (err == -EEXIST)
 724                                goto repeat;
 725                }
 726        }
 727        return page;
 728}
 729EXPORT_SYMBOL(find_or_create_page);
 730
 731/**
 732 * find_get_pages - gang pagecache lookup
 733 * @mapping:    The address_space to search
 734 * @start:      The starting page index
 735 * @nr_pages:   The maximum number of pages
 736 * @pages:      Where the resulting pages are placed
 737 *
 738 * find_get_pages() will search for and return a group of up to
 739 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 740 * find_get_pages() takes a reference against the returned pages.
 741 *
 742 * The search returns a group of mapping-contiguous pages with ascending
 743 * indexes.  There may be holes in the indices due to not-present pages.
 744 *
 745 * find_get_pages() returns the number of pages which were found.
 746 */
 747unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 748                            unsigned int nr_pages, struct page **pages)
 749{
 750        unsigned int i;
 751        unsigned int ret;
 752
 753        read_lock_irq(&mapping->tree_lock);
 754        ret = radix_tree_gang_lookup(&mapping->page_tree,
 755                                (void **)pages, start, nr_pages);
 756        for (i = 0; i < ret; i++)
 757                page_cache_get(pages[i]);
 758        read_unlock_irq(&mapping->tree_lock);
 759        return ret;
 760}
 761
 762/**
 763 * find_get_pages_contig - gang contiguous pagecache lookup
 764 * @mapping:    The address_space to search
 765 * @index:      The starting page index
 766 * @nr_pages:   The maximum number of pages
 767 * @pages:      Where the resulting pages are placed
 768 *
 769 * find_get_pages_contig() works exactly like find_get_pages(), except
 770 * that the returned number of pages are guaranteed to be contiguous.
 771 *
 772 * find_get_pages_contig() returns the number of pages which were found.
 773 */
 774unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 775                               unsigned int nr_pages, struct page **pages)
 776{
 777        unsigned int i;
 778        unsigned int ret;
 779
 780        read_lock_irq(&mapping->tree_lock);
 781        ret = radix_tree_gang_lookup(&mapping->page_tree,
 782                                (void **)pages, index, nr_pages);
 783        for (i = 0; i < ret; i++) {
 784                if (pages[i]->mapping == NULL || pages[i]->index != index)
 785                        break;
 786
 787                page_cache_get(pages[i]);
 788                index++;
 789        }
 790        read_unlock_irq(&mapping->tree_lock);
 791        return i;
 792}
 793EXPORT_SYMBOL(find_get_pages_contig);
 794
 795/**
 796 * find_get_pages_tag - find and return pages that match @tag
 797 * @mapping:    the address_space to search
 798 * @index:      the starting page index
 799 * @tag:        the tag index
 800 * @nr_pages:   the maximum number of pages
 801 * @pages:      where the resulting pages are placed
 802 *
 803 * Like find_get_pages, except we only return pages which are tagged with
 804 * @tag.   We update @index to index the next page for the traversal.
 805 */
 806unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 807                        int tag, unsigned int nr_pages, struct page **pages)
 808{
 809        unsigned int i;
 810        unsigned int ret;
 811
 812        read_lock_irq(&mapping->tree_lock);
 813        ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
 814                                (void **)pages, *index, nr_pages, tag);
 815        for (i = 0; i < ret; i++)
 816                page_cache_get(pages[i]);
 817        if (ret)
 818                *index = pages[ret - 1]->index + 1;
 819        read_unlock_irq(&mapping->tree_lock);
 820        return ret;
 821}
 822EXPORT_SYMBOL(find_get_pages_tag);
 823
 824/**
 825 * grab_cache_page_nowait - returns locked page at given index in given cache
 826 * @mapping: target address_space
 827 * @index: the page index
 828 *
 829 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 830 * This is intended for speculative data generators, where the data can
 831 * be regenerated if the page couldn't be grabbed.  This routine should
 832 * be safe to call while holding the lock for another page.
 833 *
 834 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 835 * and deadlock against the caller's locked page.
 836 */
 837struct page *
 838grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 839{
 840        struct page *page = find_get_page(mapping, index);
 841
 842        if (page) {
 843                if (!TestSetPageLocked(page))
 844                        return page;
 845                page_cache_release(page);
 846                return NULL;
 847        }
 848        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
 849        if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
 850                page_cache_release(page);
 851                page = NULL;
 852        }
 853        return page;
 854}
 855EXPORT_SYMBOL(grab_cache_page_nowait);
 856
 857/*
 858 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 859 * a _large_ part of the i/o request. Imagine the worst scenario:
 860 *
 861 *      ---R__________________________________________B__________
 862 *         ^ reading here                             ^ bad block(assume 4k)
 863 *
 864 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 865 * => failing the whole request => read(R) => read(R+1) =>
 866 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 867 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 868 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 869 *
 870 * It is going insane. Fix it by quickly scaling down the readahead size.
 871 */
 872static void shrink_readahead_size_eio(struct file *filp,
 873                                        struct file_ra_state *ra)
 874{
 875        if (!ra->ra_pages)
 876                return;
 877
 878        ra->ra_pages /= 4;
 879}
 880
 881/**
 882 * do_generic_file_read - generic file read routine
 883 * @filp:       the file to read
 884 * @ppos:       current file position
 885 * @desc:       read_descriptor
 886 * @actor:      read method
 887 *
 888 * This is a generic file read routine, and uses the
 889 * mapping->a_ops->readpage() function for the actual low-level stuff.
 890 *
 891 * This is really ugly. But the goto's actually try to clarify some
 892 * of the logic when it comes to error handling etc.
 893 */
 894static void do_generic_file_read(struct file *filp, loff_t *ppos,
 895                read_descriptor_t *desc, read_actor_t actor)
 896{
 897        struct address_space *mapping = filp->f_mapping;
 898        struct inode *inode = mapping->host;
 899        struct file_ra_state *ra = &filp->f_ra;
 900        pgoff_t index;
 901        pgoff_t last_index;
 902        pgoff_t prev_index;
 903        unsigned long offset;      /* offset into pagecache page */
 904        unsigned int prev_offset;
 905        int error;
 906
 907        index = *ppos >> PAGE_CACHE_SHIFT;
 908        prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
 909        prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
 910        last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 911        offset = *ppos & ~PAGE_CACHE_MASK;
 912
 913        for (;;) {
 914                struct page *page;
 915                pgoff_t end_index;
 916                loff_t isize;
 917                unsigned long nr, ret;
 918
 919                cond_resched();
 920find_page:
 921                page = find_get_page(mapping, index);
 922                if (!page) {
 923                        page_cache_sync_readahead(mapping,
 924                                        ra, filp,
 925                                        index, last_index - index);
 926                        page = find_get_page(mapping, index);
 927                        if (unlikely(page == NULL))
 928                                goto no_cached_page;
 929                }
 930                if (PageReadahead(page)) {
 931                        page_cache_async_readahead(mapping,
 932                                        ra, filp, page,
 933                                        index, last_index - index);
 934                }
 935                if (!PageUptodate(page))
 936                        goto page_not_up_to_date;
 937page_ok:
 938                /*
 939                 * i_size must be checked after we know the page is Uptodate.
 940                 *
 941                 * Checking i_size after the check allows us to calculate
 942                 * the correct value for "nr", which means the zero-filled
 943                 * part of the page is not copied back to userspace (unless
 944                 * another truncate extends the file - this is desired though).
 945                 */
 946
 947                isize = i_size_read(inode);
 948                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 949                if (unlikely(!isize || index > end_index)) {
 950                        page_cache_release(page);
 951                        goto out;
 952                }
 953
 954                /* nr is the maximum number of bytes to copy from this page */
 955                nr = PAGE_CACHE_SIZE;
 956                if (index == end_index) {
 957                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 958                        if (nr <= offset) {
 959                                page_cache_release(page);
 960                                goto out;
 961                        }
 962                }
 963                nr = nr - offset;
 964
 965                /* If users can be writing to this page using arbitrary
 966                 * virtual addresses, take care about potential aliasing
 967                 * before reading the page on the kernel side.
 968                 */
 969                if (mapping_writably_mapped(mapping))
 970                        flush_dcache_page(page);
 971
 972                /*
 973                 * When a sequential read accesses a page several times,
 974                 * only mark it as accessed the first time.
 975                 */
 976                if (prev_index != index || offset != prev_offset)
 977                        mark_page_accessed(page);
 978                prev_index = index;
 979
 980                /*
 981                 * Ok, we have the page, and it's up-to-date, so
 982                 * now we can copy it to user space...
 983                 *
 984                 * The actor routine returns how many bytes were actually used..
 985                 * NOTE! This may not be the same as how much of a user buffer
 986                 * we filled up (we may be padding etc), so we can only update
 987                 * "pos" here (the actor routine has to update the user buffer
 988                 * pointers and the remaining count).
 989                 */
 990                ret = actor(desc, page, offset, nr);
 991                offset += ret;
 992                index += offset >> PAGE_CACHE_SHIFT;
 993                offset &= ~PAGE_CACHE_MASK;
 994                prev_offset = offset;
 995
 996                page_cache_release(page);
 997                if (ret == nr && desc->count)
 998                        continue;
 999                goto out;
1000
1001page_not_up_to_date:
1002                /* Get exclusive access to the page ... */
1003                if (lock_page_killable(page))
1004                        goto readpage_eio;
1005
1006                /* Did it get truncated before we got the lock? */
1007                if (!page->mapping) {
1008                        unlock_page(page);
1009                        page_cache_release(page);
1010                        continue;
1011                }
1012
1013                /* Did somebody else fill it already? */
1014                if (PageUptodate(page)) {
1015                        unlock_page(page);
1016                        goto page_ok;
1017                }
1018
1019readpage:
1020                /* Start the actual read. The read will unlock the page. */
1021                error = mapping->a_ops->readpage(filp, page);
1022
1023                if (unlikely(error)) {
1024                        if (error == AOP_TRUNCATED_PAGE) {
1025                                page_cache_release(page);
1026                                goto find_page;
1027                        }
1028                        goto readpage_error;
1029                }
1030
1031                if (!PageUptodate(page)) {
1032                        if (lock_page_killable(page))
1033                                goto readpage_eio;
1034                        if (!PageUptodate(page)) {
1035                                if (page->mapping == NULL) {
1036                                        /*
1037                                         * invalidate_inode_pages got it
1038                                         */
1039                                        unlock_page(page);
1040                                        page_cache_release(page);
1041                                        goto find_page;
1042                                }
1043                                unlock_page(page);
1044                                shrink_readahead_size_eio(filp, ra);
1045                                goto readpage_eio;
1046                        }
1047                        unlock_page(page);
1048                }
1049
1050                goto page_ok;
1051
1052readpage_eio:
1053                error = -EIO;
1054readpage_error:
1055                /* UHHUH! A synchronous read error occurred. Report it */
1056                desc->error = error;
1057                page_cache_release(page);
1058                goto out;
1059
1060no_cached_page:
1061                /*
1062                 * Ok, it wasn't cached, so we need to create a new
1063                 * page..
1064                 */
1065                page = page_cache_alloc_cold(mapping);
1066                if (!page) {
1067                        desc->error = -ENOMEM;
1068                        goto out;
1069                }
1070                error = add_to_page_cache_lru(page, mapping,
1071                                                index, GFP_KERNEL);
1072                if (error) {
1073                        page_cache_release(page);
1074                        if (error == -EEXIST)
1075                                goto find_page;
1076                        desc->error = error;
1077                        goto out;
1078                }
1079                goto readpage;
1080        }
1081
1082out:
1083        ra->prev_pos = prev_index;
1084        ra->prev_pos <<= PAGE_CACHE_SHIFT;
1085        ra->prev_pos |= prev_offset;
1086
1087        *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1088        if (filp)
1089                file_accessed(filp);
1090}
1091
1092int file_read_actor(read_descriptor_t *desc, struct page *page,
1093                        unsigned long offset, unsigned long size)
1094{
1095        char *kaddr;
1096        unsigned long left, count = desc->count;
1097
1098        if (size > count)
1099                size = count;
1100
1101        /*
1102         * Faults on the destination of a read are common, so do it before
1103         * taking the kmap.
1104         */
1105        if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1106                kaddr = kmap_atomic(page, KM_USER0);
1107                left = __copy_to_user_inatomic(desc->arg.buf,
1108                                                kaddr + offset, size);
1109                kunmap_atomic(kaddr, KM_USER0);
1110                if (left == 0)
1111                        goto success;
1112        }
1113
1114        /* Do it the slow way */
1115        kaddr = kmap(page);
1116        left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1117        kunmap(page);
1118
1119        if (left) {
1120                size -= left;
1121                desc->error = -EFAULT;
1122        }
1123success:
1124        desc->count = count - size;
1125        desc->written += size;
1126        desc->arg.buf += size;
1127        return size;
1128}
1129
1130/*
1131 * Performs necessary checks before doing a write
1132 * @iov:        io vector request
1133 * @nr_segs:    number of segments in the iovec
1134 * @count:      number of bytes to write
1135 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1136 *
1137 * Adjust number of segments and amount of bytes to write (nr_segs should be
1138 * properly initialized first). Returns appropriate error code that caller
1139 * should return or zero in case that write should be allowed.
1140 */
1141int generic_segment_checks(const struct iovec *iov,
1142                        unsigned long *nr_segs, size_t *count, int access_flags)
1143{
1144        unsigned long   seg;
1145        size_t cnt = 0;
1146        for (seg = 0; seg < *nr_segs; seg++) {
1147                const struct iovec *iv = &iov[seg];
1148
1149                /*
1150                 * If any segment has a negative length, or the cumulative
1151                 * length ever wraps negative then return -EINVAL.
1152                 */
1153                cnt += iv->iov_len;
1154                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1155                        return -EINVAL;
1156                if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1157                        continue;
1158                if (seg == 0)
1159                        return -EFAULT;
1160                *nr_segs = seg;
1161                cnt -= iv->iov_len;     /* This segment is no good */
1162                break;
1163        }
1164        *count = cnt;
1165        return 0;
1166}
1167EXPORT_SYMBOL(generic_segment_checks);
1168
1169/**
1170 * generic_file_aio_read - generic filesystem read routine
1171 * @iocb:       kernel I/O control block
1172 * @iov:        io vector request
1173 * @nr_segs:    number of segments in the iovec
1174 * @pos:        current file position
1175 *
1176 * This is the "read()" routine for all filesystems
1177 * that can use the page cache directly.
1178 */
1179ssize_t
1180generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1181                unsigned long nr_segs, loff_t pos)
1182{
1183        struct file *filp = iocb->ki_filp;
1184        ssize_t retval;
1185        unsigned long seg;
1186        size_t count;
1187        loff_t *ppos = &iocb->ki_pos;
1188
1189        count = 0;
1190        retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1191        if (retval)
1192                return retval;
1193
1194        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1195        if (filp->f_flags & O_DIRECT) {
1196                loff_t size;
1197                struct address_space *mapping;
1198                struct inode *inode;
1199
1200                mapping = filp->f_mapping;
1201                inode = mapping->host;
1202                retval = 0;
1203                if (!count)
1204                        goto out; /* skip atime */
1205                size = i_size_read(inode);
1206                if (pos < size) {
1207                        retval = generic_file_direct_IO(READ, iocb,
1208                                                iov, pos, nr_segs);
1209                        if (retval > 0)
1210                                *ppos = pos + retval;
1211                }
1212                if (likely(retval != 0)) {
1213                        file_accessed(filp);
1214                        goto out;
1215                }
1216        }
1217
1218        retval = 0;
1219        if (count) {
1220                for (seg = 0; seg < nr_segs; seg++) {
1221                        read_descriptor_t desc;
1222
1223                        desc.written = 0;
1224                        desc.arg.buf = iov[seg].iov_base;
1225                        desc.count = iov[seg].iov_len;
1226                        if (desc.count == 0)
1227                                continue;
1228                        desc.error = 0;
1229                        do_generic_file_read(filp,ppos,&desc,file_read_actor);
1230                        retval += desc.written;
1231                        if (desc.error) {
1232                                retval = retval ?: desc.error;
1233                                break;
1234                        }
1235                        if (desc.count > 0)
1236                                break;
1237                }
1238        }
1239out:
1240        return retval;
1241}
1242EXPORT_SYMBOL(generic_file_aio_read);
1243
1244static ssize_t
1245do_readahead(struct address_space *mapping, struct file *filp,
1246             pgoff_t index, unsigned long nr)
1247{
1248        if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1249                return -EINVAL;
1250
1251        force_page_cache_readahead(mapping, filp, index,
1252                                        max_sane_readahead(nr));
1253        return 0;
1254}
1255
1256asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1257{
1258        ssize_t ret;
1259        struct file *file;
1260
1261        ret = -EBADF;
1262        file = fget(fd);
1263        if (file) {
1264                if (file->f_mode & FMODE_READ) {
1265                        struct address_space *mapping = file->f_mapping;
1266                        pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1267                        pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1268                        unsigned long len = end - start + 1;
1269                        ret = do_readahead(mapping, file, start, len);
1270                }
1271                fput(file);
1272        }
1273        return ret;
1274}
1275
1276#ifdef CONFIG_MMU
1277/**
1278 * page_cache_read - adds requested page to the page cache if not already there
1279 * @file:       file to read
1280 * @offset:     page index
1281 *
1282 * This adds the requested page to the page cache if it isn't already there,
1283 * and schedules an I/O to read in its contents from disk.
1284 */
1285static int page_cache_read(struct file *file, pgoff_t offset)
1286{
1287        struct address_space *mapping = file->f_mapping;
1288        struct page *page; 
1289        int ret;
1290
1291        do {
1292                page = page_cache_alloc_cold(mapping);
1293                if (!page)
1294                        return -ENOMEM;
1295
1296                ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1297                if (ret == 0)
1298                        ret = mapping->a_ops->readpage(file, page);
1299                else if (ret == -EEXIST)
1300                        ret = 0; /* losing race to add is OK */
1301
1302                page_cache_release(page);
1303
1304        } while (ret == AOP_TRUNCATED_PAGE);
1305                
1306        return ret;
1307}
1308
/* Miss count above which filemap_fault() stops doing read-around. */
#define MMAP_LOTSAMISS  (100)

/**
 * filemap_fault - read in file data for page fault handling
 * @vma:        vma in which the fault was taken
 * @vmf:        struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * On success the locked page is stored in @vmf->page and the return value
 * has VM_FAULT_LOCKED set (plus VM_FAULT_MAJOR if read-around or a re-read
 * was needed).  Returns VM_FAULT_SIGBUS if the offset lies beyond i_size or
 * the page could not be read, and VM_FAULT_OOM if page_cache_read() failed
 * with -ENOMEM.
 */
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        int error;
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        struct file_ra_state *ra = &file->f_ra;
        struct inode *inode = mapping->host;
        struct page *page;
        pgoff_t size;
        int did_readaround = 0;
        int ret = 0;

        /* Number of pages in the file, rounding a partial last page up. */
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (vmf->pgoff >= size)
                return VM_FAULT_SIGBUS;

        /* If we don't want any read-ahead, don't bother */
        if (VM_RandomReadHint(vma))
                goto no_cached_page;

        /*
         * Do we have something in the page cache already?
         */
retry_find:
        page = find_lock_page(mapping, vmf->pgoff);
        /*
         * For sequential accesses, we use the generic readahead logic.
         */
        if (VM_SequentialReadHint(vma)) {
                if (!page) {
                        page_cache_sync_readahead(mapping, ra, file,
                                                           vmf->pgoff, 1);
                        page = find_lock_page(mapping, vmf->pgoff);
                        if (!page)
                                goto no_cached_page;
                }
                if (PageReadahead(page)) {
                        page_cache_async_readahead(mapping, ra, file, page,
                                                           vmf->pgoff, 1);
                }
        }

        if (!page) {
                unsigned long ra_pages;

                ra->mmap_miss++;

                /*
                 * Do we miss much more than hit in this file? If so,
                 * stop bothering with read-ahead. It will only hurt.
                 */
                if (ra->mmap_miss > MMAP_LOTSAMISS)
                        goto no_cached_page;

                /*
                 * To keep the pgmajfault counter straight, we need to
                 * check did_readaround, as this is an inner loop.
                 */
                if (!did_readaround) {
                        ret = VM_FAULT_MAJOR;
                        count_vm_event(PGMAJFAULT);
                }
                did_readaround = 1;
                ra_pages = max_sane_readahead(file->f_ra.ra_pages);
                if (ra_pages) {
                        pgoff_t start = 0;

                        /* Centre the window on the fault, clamped at page 0. */
                        if (vmf->pgoff > ra_pages / 2)
                                start = vmf->pgoff - ra_pages / 2;
                        do_page_cache_readahead(mapping, file, start, ra_pages);
                }
                page = find_lock_page(mapping, vmf->pgoff);
                if (!page)
                        goto no_cached_page;
        }

        /* Found without read-around: count it as a hit. */
        if (!did_readaround)
                ra->mmap_miss--;

        /*
         * We have a locked page in the page cache, now we need to check
         * that it's up-to-date. If not, it is going to be due to an error.
         */
        if (unlikely(!PageUptodate(page)))
                goto page_not_uptodate;

        /* Must recheck i_size under page lock */
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (unlikely(vmf->pgoff >= size)) {
                unlock_page(page);
                page_cache_release(page);
                return VM_FAULT_SIGBUS;
        }

        /*
         * Found the page and have a reference on it.
         */
        mark_page_accessed(page);
        ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
        vmf->page = page;
        return ret | VM_FAULT_LOCKED;

no_cached_page:
        /*
         * We're only likely to ever get here if MADV_RANDOM is in
         * effect.
         */
        error = page_cache_read(file, vmf->pgoff);

        /*
         * The page we want has now been added to the page cache.
         * In the unlikely event that someone removed it in the
         * meantime, we'll just come back here and read it again.
         */
        if (error >= 0)
                goto retry_find;

        /*
         * An error return from page_cache_read can result if the
         * system is low on memory, or a problem occurs while trying
         * to schedule I/O.
         */
        if (error == -ENOMEM)
                return VM_FAULT_OOM;
        return VM_FAULT_SIGBUS;

page_not_uptodate:
        /* IO error path */
        if (!did_readaround) {
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
        }

        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
         * because there really aren't any performance issues here
         * and we need to check for errors.
         */
        ClearPageError(page);
        error = mapping->a_ops->readpage(file, page);
        if (!error) {
                wait_on_page_locked(page);
                if (!PageUptodate(page))
                        error = -EIO;
        }
        page_cache_release(page);

        /* Re-read succeeded, or the page was truncated: look it up again. */
        if (!error || error == AOP_TRUNCATED_PAGE)
                goto retry_find;

        /*
         * Things didn't work out. Shrink the readahead window and report
         * the failure to the mm layer as SIGBUS.
         */
        shrink_readahead_size_eio(file, ra);
        return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL(filemap_fault);
1479
/* vm operations installed by generic_file_mmap(): faults go to filemap_fault(). */
struct vm_operations_struct generic_file_vm_ops = {
        .fault          = filemap_fault,
};
1483
1484/* This is used for a general mmap of a disk file */
1485
1486int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1487{
1488        struct address_space *mapping = file->f_mapping;
1489
1490        if (!mapping->a_ops->readpage)
1491                return -ENOEXEC;
1492        file_accessed(file);
1493        vma->vm_ops = &generic_file_vm_ops;
1494        vma->vm_flags |= VM_CAN_NONLINEAR;
1495        return 0;
1496}
1497
1498/*
1499 * This is for filesystems which do not implement ->writepage.
1500 */
1501int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1502{
1503        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1504                return -EINVAL;
1505        return generic_file_mmap(file, vma);
1506}
1507#else
/* Built without CONFIG_MMU: this generic mmap helper always fails with -ENOSYS. */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
        return -ENOSYS;
}
/* Built without CONFIG_MMU: the read-only variant always fails with -ENOSYS too. */
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
        return -ENOSYS;
}
1516#endif /* CONFIG_MMU */
1517
1518EXPORT_SYMBOL(generic_file_mmap);
1519EXPORT_SYMBOL(generic_file_readonly_mmap);
1520
1521static struct page *__read_cache_page(struct address_space *mapping,
1522                                pgoff_t index,
1523                                int (*filler)(void *,struct page*),
1524                                void *data)
1525{
1526        struct page *page;
1527        int err;
1528repeat:
1529        page = find_get_page(mapping, index);
1530        if (!page) {
1531                page = page_cache_alloc_cold(mapping);
1532                if (!page)
1533                        return ERR_PTR(-ENOMEM);
1534                err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1535                if (unlikely(err)) {
1536                        page_cache_release(page);
1537                        if (err == -EEXIST)
1538                                goto repeat;
1539                        /* Presumably ENOMEM for radix tree node */
1540                        return ERR_PTR(err);
1541                }
1542                err = filler(data, page);
1543                if (err < 0) {
1544                        page_cache_release(page);
1545                        page = ERR_PTR(err);
1546                }
1547        }
1548        return page;
1549}
1550
1551/**
1552 * read_cache_page_async - read into page cache, fill it if needed
1553 * @mapping:    the page's address_space
1554 * @index:      the page index
1555 * @filler:     function to perform the read
1556 * @data:       destination for read data
1557 *
1558 * Same as read_cache_page, but don't wait for page to become unlocked
1559 * after submitting it to the filler.
1560 *
1561 * Read into the page cache. If a page already exists, and PageUptodate() is
1562 * not set, try to fill the page but don't wait for it to become unlocked.
1563 *
1564 * If the page does not get brought uptodate, return -EIO.
1565 */
1566struct page *read_cache_page_async(struct address_space *mapping,
1567                                pgoff_t index,
1568                                int (*filler)(void *,struct page*),
1569                                void *data)
1570{
1571        struct page *page;
1572        int err;
1573
1574retry:
1575        page = __read_cache_page(mapping, index, filler, data);
1576        if (IS_ERR(page))
1577                return page;
1578        if (PageUptodate(page))
1579                goto out;
1580
1581        lock_page(page);
1582        if (!page->mapping) {
1583                unlock_page(page);
1584                page_cache_release(page);
1585                goto retry;
1586        }
1587        if (PageUptodate(page)) {
1588                unlock_page(page);
1589                goto out;
1590        }
1591        err = filler(data, page);
1592        if (err < 0) {
1593                page_cache_release(page);
1594                return ERR_PTR(err);
1595        }
1596out:
1597        mark_page_accessed(page);
1598        return page;
1599}
1600EXPORT_SYMBOL(read_cache_page_async);
1601
1602/**
1603 * read_cache_page - read into page cache, fill it if needed
1604 * @mapping:    the page's address_space
1605 * @index:      the page index
1606 * @filler:     function to perform the read
1607 * @data:       destination for read data
1608 *
1609 * Read into the page cache. If a page already exists, and PageUptodate() is
1610 * not set, try to fill the page then wait for it to become unlocked.
1611 *
1612 * If the page does not get brought uptodate, return -EIO.
1613 */
1614struct page *read_cache_page(struct address_space *mapping,
1615                                pgoff_t index,
1616                                int (*filler)(void *,struct page*),
1617                                void *data)
1618{
1619        struct page *page;
1620
1621        page = read_cache_page_async(mapping, index, filler, data);
1622        if (IS_ERR(page))
1623                goto out;
1624        wait_on_page_locked(page);
1625        if (!PageUptodate(page)) {
1626                page_cache_release(page);
1627                page = ERR_PTR(-EIO);
1628        }
1629 out:
1630        return page;
1631}
1632EXPORT_SYMBOL(read_cache_page);
1633
1634/*
1635 * The logic we want is
1636 *
1637 *      if suid or (sgid and xgrp)
1638 *              remove privs
1639 */
1640int should_remove_suid(struct dentry *dentry)
1641{
1642        mode_t mode = dentry->d_inode->i_mode;
1643        int kill = 0;
1644
1645        /* suid always must be killed */
1646        if (unlikely(mode & S_ISUID))
1647                kill = ATTR_KILL_SUID;
1648
1649        /*
1650         * sgid without any exec bits is just a mandatory locking mark; leave
1651         * it alone.  If some exec bits are set, it's a real sgid; kill it.
1652         */
1653        if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1654                kill |= ATTR_KILL_SGID;
1655
1656        if (unlikely(kill && !capable(CAP_FSETID)))
1657                return kill;
1658
1659        return 0;
1660}
1661EXPORT_SYMBOL(should_remove_suid);
1662
1663static int __remove_suid(struct dentry *dentry, int kill)
1664{
1665        struct iattr newattrs;
1666
1667        newattrs.ia_valid = ATTR_FORCE | kill;
1668        return notify_change(dentry, &newattrs);
1669}
1670
/*
 * Strip privileges from @dentry: first whatever the security module wants
 * killed, then the suid/sgid bits selected by should_remove_suid().
 * Returns 0 on success or the first error encountered.
 */
int remove_suid(struct dentry *dentry)
{
        int suid_bits = should_remove_suid(dentry);
        int need_killpriv = security_inode_need_killpriv(dentry);
        int err;

        if (need_killpriv < 0)
                return need_killpriv;

        err = need_killpriv ? security_inode_killpriv(dentry) : 0;
        if (err || !suid_bits)
                return err;

        return __remove_suid(dentry, suid_bits);
}
EXPORT_SYMBOL(remove_suid);
1687
1688static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1689                        const struct iovec *iov, size_t base, size_t bytes)
1690{
1691        size_t copied = 0, left = 0;
1692
1693        while (bytes) {
1694                char __user *buf = iov->iov_base + base;
1695                int copy = min(bytes, iov->iov_len - base);
1696
1697                base = 0;
1698                left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1699                copied += copy;
1700                bytes -= copy;
1701                vaddr += copy;
1702                iov++;
1703
1704                if (unlikely(left))
1705                        break;
1706        }
1707        return copied - left;
1708}
1709
1710/*
1711 * Copy as much as we can into the page and return the number of bytes which
1712 * were sucessfully copied.  If a fault is encountered then return the number of
1713 * bytes which were copied.
1714 */
1715size_t iov_iter_copy_from_user_atomic(struct page *page,
1716                struct iov_iter *i, unsigned long offset, size_t bytes)
1717{
1718        char *kaddr;
1719        size_t copied;
1720
1721        BUG_ON(!in_atomic());
1722        kaddr = kmap_atomic(page, KM_USER0);
1723        if (likely(i->nr_segs == 1)) {
1724                int left;
1725                char __user *buf = i->iov->iov_base + i->iov_offset;
1726                left = __copy_from_user_inatomic_nocache(kaddr + offset,
1727                                                        buf, bytes);
1728                copied = bytes - left;
1729        } else {
1730                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1731                                                i->iov, i->iov_offset, bytes);
1732        }
1733        kunmap_atomic(kaddr, KM_USER0);
1734
1735        return copied;
1736}
1737EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1738
1739/*
1740 * This has the same sideeffects and return value as
1741 * iov_iter_copy_from_user_atomic().
1742 * The difference is that it attempts to resolve faults.
1743 * Page must not be locked.
1744 */
1745size_t iov_iter_copy_from_user(struct page *page,
1746                struct iov_iter *i, unsigned long offset, size_t bytes)
1747{
1748        char *kaddr;
1749        size_t copied;
1750
1751        kaddr = kmap(page);
1752        if (likely(i->nr_segs == 1)) {
1753                int left;
1754                char __user *buf = i->iov->iov_base + i->iov_offset;
1755                left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
1756                copied = bytes - left;
1757        } else {
1758                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1759                                                i->iov, i->iov_offset, bytes);
1760        }
1761        kunmap(page);
1762        return copied;
1763}
1764EXPORT_SYMBOL(iov_iter_copy_from_user);
1765
1766void iov_iter_advance(struct iov_iter *i, size_t bytes)
1767{
1768        BUG_ON(i->count < bytes);
1769
1770        if (likely(i->nr_segs == 1)) {
1771                i->iov_offset += bytes;
1772                i->count -= bytes;
1773        } else {
1774                const struct iovec *iov = i->iov;
1775                size_t base = i->iov_offset;
1776
1777                /*
1778                 * The !iov->iov_len check ensures we skip over unlikely
1779                 * zero-length segments (without overruning the iovec).
1780                 */
1781                while (bytes || unlikely(i->count && !iov->iov_len)) {
1782                        int copy;
1783
1784                        copy = min(bytes, iov->iov_len - base);
1785                        BUG_ON(!i->count || i->count < copy);
1786                        i->count -= copy;
1787                        bytes -= copy;
1788                        base += copy;
1789                        if (iov->iov_len == base) {
1790                                iov++;
1791                                base = 0;
1792                        }
1793                }
1794                i->iov = iov;
1795                i->iov_offset = base;
1796        }
1797}
1798EXPORT_SYMBOL(iov_iter_advance);
1799
1800/*
1801 * Fault in the first iovec of the given iov_iter, to a maximum length
1802 * of bytes. Returns 0 on success, or non-zero if the memory could not be
1803 * accessed (ie. because it is an invalid address).
1804 *
1805 * writev-intensive code may want this to prefault several iovecs -- that
1806 * would be possible (callers must not rely on the fact that _only_ the
1807 * first iovec will be faulted with the current implementation).
1808 */
1809int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1810{
1811        char __user *buf = i->iov->iov_base + i->iov_offset;
1812        bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1813        return fault_in_pages_readable(buf, bytes);
1814}
1815EXPORT_SYMBOL(iov_iter_fault_in_readable);
1816
1817/*
1818 * Return the count of just the current iov_iter segment.
1819 */
1820size_t iov_iter_single_seg_count(struct iov_iter *i)
1821{
1822        const struct iovec *iov = i->iov;
1823        if (i->nr_segs == 1)
1824                return i->count;
1825        else
1826                return min(i->count, iov->iov_len - i->iov_offset);
1827}
1828EXPORT_SYMBOL(iov_iter_single_seg_count);
1829
1830/*
1831 * Performs necessary checks before doing a write
1832 *
1833 * Can adjust writing position or amount of bytes to write.
1834 * Returns appropriate error code that caller should return or
1835 * zero in case that write should be allowed.
1836 */
1837inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1838{
1839        struct inode *inode = file->f_mapping->host;
        /* RLIMIT_FSIZE (rlim_cur) of the current task. */
1840        unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1841
        /* A negative file position is rejected outright. */
1842        if (unlikely(*pos < 0))
1843                return -EINVAL;
1844
        /* O_APPEND and the size rlimit are only applied when !isblk. */
1845        if (!isblk) {
1846                /* FIXME: this is for backwards compatibility with 2.4 */
1847                if (file->f_flags & O_APPEND)
1848                        *pos = i_size_read(inode);
1849
1850                if (limit != RLIM_INFINITY) {
                        /* Starting at or past the limit: SIGXFSZ plus -EFBIG. */
1851                        if (*pos >= limit) {
1852                                send_sig(SIGXFSZ, current, 0);
1853                                return -EFBIG;
1854                        }
                        /* Otherwise shorten the write so it stops at the limit. */
1855                        if (*count > limit - (typeof(limit))*pos) {
1856                                *count = limit - (typeof(limit))*pos;
1857                        }
1858                }
1859        }
1860
1861        /*
1862         * LFS rule
         *
         * Without O_LARGEFILE the write may not reach past MAX_NON_LFS:
         * fail if it starts there, otherwise shorten it.
1863         */
1864        if (unlikely(*pos + *count > MAX_NON_LFS &&
1865                                !(file->f_flags & O_LARGEFILE))) {
1866                if (*pos >= MAX_NON_LFS) {
1867                        return -EFBIG;
1868                }
1869                if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1870                        *count = MAX_NON_LFS - (unsigned long)*pos;
1871                }
1872        }
1873
1874        /*
1875         * Are we about to exceed the fs block limit ?
1876         *
1877         * If we have written data it becomes a short write.  If we have
1878         * exceeded without writing data we send a signal and return EFBIG.
1879         * Linus frestrict idea will clean these up nicely..
1880         */
1881        if (likely(!isblk)) {
1882                if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1883                        if (*count || *pos > inode->i_sb->s_maxbytes) {
1884                                return -EFBIG;
1885                        }
1886                        /* zero-length writes at ->s_maxbytes are OK */
1887                }
1888
                /* Shorten the write so that it ends at ->s_maxbytes. */
1889                if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1890                        *count = inode->i_sb->s_maxbytes - *pos;
1891        } else {
1892#ifdef CONFIG_BLOCK
1893                loff_t isize;
                /* isblk case: a read-only block device cannot be written. */
1894                if (bdev_read_only(I_BDEV(inode)))
1895                        return -EPERM;
                /* The inode size bounds the write; nothing may go past it. */
1896                isize = i_size_read(inode);
1897                if (*pos >= isize) {
                        /* Only a zero-length write exactly at isize passes. */
1898                        if (*count || *pos > isize)
1899                                return -ENOSPC;
1900                }
1901
1902                if (*pos + *count > isize)
1903                        *count = isize - *pos;
1904#else
                /* Built without CONFIG_BLOCK: isblk writes always fail. */
1905                return -EPERM;
1906#endif
1907        }
1908        return 0;
1909}
1910EXPORT_SYMBOL(generic_write_checks);
1911
/*
 * Start a pagecache write of @len bytes at file position @pos.
 *
 * If the address space implements ->write_begin, the call is forwarded to
 * it unchanged.  Otherwise the ->prepare_write interface is driven on a
 * page obtained from __grab_cache_page(); @fsdata is not used on that
 * path.
 *
 * On the fallback path a return of 0 leaves *pagep holding the page that
 * was prepared; it has been neither unlocked nor released here.  On
 * failure the page is released before returning (*pagep has still been
 * assigned).  Returns 0, -ENOMEM, or the error from the address space
 * operation.
 */
1912int pagecache_write_begin(struct file *file, struct address_space *mapping,
1913                                loff_t pos, unsigned len, unsigned flags,
1914                                struct page **pagep, void **fsdata)
1915{
1916        const struct address_space_operations *aops = mapping->a_ops;
1917
1918        if (aops->write_begin) {
1919                return aops->write_begin(file, mapping, pos, len, flags,
1920                                                        pagep, fsdata);
1921        } else {
1922                int ret;
                /* Split pos into a pagecache index and an offset in the page. */
1923                pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1924                unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1925                struct inode *inode = mapping->host;
1926                struct page *page;
1927again:
1928                page = __grab_cache_page(mapping, index);
1929                *pagep = page;
1930                if (!page)
1931                        return -ENOMEM;
1932
1933                if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
1934                        /*
1935                         * There is no way to resolve a short write situation
1936                         * for a !Uptodate page (except by double copying in
1937                         * the caller done by generic_perform_write_2copy).
1938                         *
1939                         * Instead, we have to bring it uptodate here.
1940                         */
1941                        ret = aops->readpage(file, page);
                        /*
                         * Only the reference is dropped here, not the lock.
                         * NOTE(review): presumably ->readpage unlocks the
                         * page itself -- confirm against the aops contract.
                         */
1942                        page_cache_release(page);
1943                        if (ret) {
1944                                if (ret == AOP_TRUNCATED_PAGE)
1945                                        goto again;
1946                                return ret;
1947                        }
                        /* Look the page up again and re-check its state. */
1948                        goto again;
1949                }
1950
1951                ret = aops->prepare_write(file, page, offset, offset+len);
1952                if (ret) {
1953                        unlock_page(page);
1954                        page_cache_release(page);
                        /*
                         * The failed write would have extended the file:
                         * truncate back to the current i_size.
                         */
1955                        if (pos + len > inode->i_size)
1956                                vmtruncate(inode, inode->i_size);
1957                }
1958                return ret;
1959        }
1960}
1961EXPORT_SYMBOL(pagecache_write_begin);
1962
1963int pagecache_write_end(struct file *file, struct address_space *mapping,
1964                                loff_t pos, unsigned len, unsigned copied,
1965                                struct page *page, void *fsdata)
1966{
1967        const struct address_space_operations *aops = mapping->a_ops;
1968        int ret;
1969
1970        if (aops->write_end) {
1971                mark_page_accessed(page);
1972                ret = aops->write_end(file, mapping, pos, len, copied,
1973                                                        page, fsdata);
1974        } else {
1975                unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1976                struct inode *inode = mapping->host;
1977
1978                flush_dcache_page(page);
1979                ret = aops->commit_write(file, page, offset, offset+len);
1980                unlock_page(page);
1981                mark_page_accessed(page);
1982                page_cache_release(page);
1983
1984                if (ret < 0) {
1985                        if (pos + len > inode->i_size)
1986                                vmtruncate(inode, inode->i_size);
1987                } else if (ret > 0)
1988                        ret = min_t(size_t, copied, ret);
1989                else
1990                        ret = copied;
1991        }
1992
1993        return ret;
1994}
1995EXPORT_SYMBOL(pagecache_write_end);
1996
1997ssize_t
1998generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1999                unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2000                size_t count, size_t ocount)
2001{
2002        struct file     *file = iocb->ki_filp;
2003        struct address_space *mapping = file->f_mapping;
2004        struct inode    *inode = mapping->host;
2005        ssize_t         written;
2006
2007        if (count != ocount)
2008                *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2009
2010        written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2011        if (written > 0) {
2012                loff_t end = pos + written;
2013                if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2014                        i_size_write(inode,  end);
2015                        mark_inode_dirty(inode);
2016                }
2017                *ppos = end;
2018        }
2019
2020        /*
2021         * Sync the fs metadata but not the minor inode changes and
2022         * of course not the data as we did direct DMA for the IO.
2023         * i_mutex is held, which protects generic_osync_inode() from
2024         * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
2025         */
2026        if ((written >= 0 || written == -EIOCBQUEUED) &&
2027            ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2028                int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2029                if (err < 0)
2030                        written = err;
2031        }
2032        return written;
2033}
2034EXPORT_SYMBOL(generic_file_direct_write);
2035
2036/*
2037 * Find or create a page at the given pagecache position. Return the locked
2038 * page. This function is specifically for buffered writes.
2039 */
2040struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
2041{
2042        int status;
2043        struct page *page;
2044repeat:
2045        page = find_lock_page(mapping, index);
2046        if (likely(page))
2047                return page;
2048
2049        page = page_cache_alloc(mapping);
2050        if (!page)
2051                return NULL;
2052        status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
2053        if (unlikely(status)) {
2054                page_cache_release(page);
2055                if (status == -EEXIST)
2056                        goto repeat;
2057                return NULL;
2058        }
2059        return page;
2060}
2061EXPORT_SYMBOL(__grab_cache_page);
2062
/*
 * Buffered-write loop for address spaces that do not provide ->write_begin
 * (generic_file_buffered_write() selects it in that case).  It drives
 * ->prepare_write/->commit_write, one pagecache page per iteration.
 *
 * When the destination page is not uptodate and the source is not kernel
 * memory, the data is first copied into a temporary page while the
 * destination is unlocked, and then memcpy()ed across under the page
 * lock -- hence the two copies.
 *
 * Returns the number of bytes written, or the last status if nothing was
 * written.
 */
2063static ssize_t generic_perform_write_2copy(struct file *file,
2064                                struct iov_iter *i, loff_t pos)
2065{
2066        struct address_space *mapping = file->f_mapping;
2067        const struct address_space_operations *a_ops = mapping->a_ops;
2068        struct inode *inode = mapping->host;
2069        long status = 0;
2070        ssize_t written = 0;
2071
2072        do {
2073                struct page *src_page;
2074                struct page *page;
2075                pgoff_t index;          /* Pagecache index for current page */
2076                unsigned long offset;   /* Offset into pagecache page */
2077                unsigned long bytes;    /* Bytes to write to page */
2078                size_t copied;          /* Bytes copied from user */
2079
                /* Never cross a page boundary within one iteration. */
2080                offset = (pos & (PAGE_CACHE_SIZE - 1));
2081                index = pos >> PAGE_CACHE_SHIFT;
2082                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2083                                                iov_iter_count(i));
2084
2085                /*
2086                 * a non-NULL src_page indicates that we're doing the
2087                 * copy via get_user_pages and kmap.
2088                 */
2089                src_page = NULL;
2090
2091                /*
2092                 * Bring in the user page that we will copy from _first_.
2093                 * Otherwise there's a nasty deadlock on copying from the
2094                 * same page as we're writing to, without it being marked
2095                 * up-to-date.
2096                 *
2097                 * Not only is this an optimisation, but it is also required
2098                 * to check that the address is actually valid, when atomic
2099                 * usercopies are used, below.
2100                 */
2101                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2102                        status = -EFAULT;
2103                        break;
2104                }
2105
2106                page = __grab_cache_page(mapping, index);
2107                if (!page) {
2108                        status = -ENOMEM;
2109                        break;
2110                }
2111
2112                /*
2113                 * non-uptodate pages cannot cope with short copies, and we
2114                 * cannot take a pagefault with the destination page locked.
2115                 * So pin the source page to copy it.
2116                 */
2117                if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
2118                        unlock_page(page);
2119
2120                        src_page = alloc_page(GFP_KERNEL);
2121                        if (!src_page) {
2122                                page_cache_release(page);
2123                                status = -ENOMEM;
2124                                break;
2125                        }
2126
2127                        /*
2128                         * Cannot get_user_pages with a page locked for the
2129                         * same reason as we can't take a page fault with a
2130                         * page locked (as explained below).
2131                         */
2132                        copied = iov_iter_copy_from_user(src_page, i,
2133                                                                offset, bytes);
2134                        if (unlikely(copied == 0)) {
2135                                status = -EFAULT;
2136                                page_cache_release(page);
2137                                page_cache_release(src_page);
2138                                break;
2139                        }
                        /* Write only what actually reached the temporary page. */
2140                        bytes = copied;
2141
2142                        lock_page(page);
2143                        /*
2144                         * Can't handle the page going uptodate here, because
2145                         * that means we would use non-atomic usercopies, which
2146                         * zero out the tail of the page, which can cause
2147                         * zeroes to become transiently visible. We could just
2148                         * use a non-zeroing copy, but the APIs aren't too
2149                         * consistent.
2150                         */
2151                        if (unlikely(!page->mapping || PageUptodate(page))) {
                                /*
                                 * Drop both pages and redo this iteration;
                                 * neither pos nor the iterator has moved.
                                 */
2152                                unlock_page(page);
2153                                page_cache_release(page);
2154                                page_cache_release(src_page);
2155                                continue;
2156                        }
2157                }
2158
2159                status = a_ops->prepare_write(file, page, offset, offset+bytes);
2160                if (unlikely(status))
2161                        goto fs_write_aop_error;
2162
2163                if (!src_page) {
2164                        /*
2165                         * Must not enter the pagefault handler here, because
2166                         * we hold the page lock, so we might recursively
2167                         * deadlock on the same lock, or get an ABBA deadlock
2168                         * against a different lock, or against the mmap_sem
2169                         * (which nests outside the page lock).  So increment
2170                         * preempt count, and use _atomic usercopies.
2171                         *
2172                         * The page is uptodate so we are OK to encounter a
2173                         * short copy: if unmodified parts of the page are
2174                         * marked dirty and written out to disk, it doesn't
2175                         * really matter.
2176                         */
2177                        pagefault_disable();
2178                        copied = iov_iter_copy_from_user_atomic(page, i,
2179                                                                offset, bytes);
2180                        pagefault_enable();
2181                } else {
                        /*
                         * Second copy: temporary page to pagecache page, at
                         * the same in-page offset.
                         */
2182                        void *src, *dst;
2183                        src = kmap_atomic(src_page, KM_USER0);
2184                        dst = kmap_atomic(page, KM_USER1);
2185                        memcpy(dst + offset, src + offset, bytes);
2186                        kunmap_atomic(dst, KM_USER1);
2187                        kunmap_atomic(src, KM_USER0);
2188                        copied = bytes;
2189                }
2190                flush_dcache_page(page);
2191
2192                status = a_ops->commit_write(file, page, offset, offset+bytes);
2193                if (unlikely(status < 0))
2194                        goto fs_write_aop_error;
2195                if (unlikely(status > 0)) /* filesystem did partial write */
2196                        copied = min_t(size_t, copied, status);
2197
2198                unlock_page(page);
2199                mark_page_accessed(page);
2200                page_cache_release(page);
2201                if (src_page)
2202                        page_cache_release(src_page);
2203
                /* Account for the bytes that made it into the pagecache. */
2204                iov_iter_advance(i, copied);
2205                pos += copied;
2206                written += copied;
2207
2208                balance_dirty_pages_ratelimited(mapping);
2209                cond_resched();
2210                continue;
2211
2212fs_write_aop_error:
                /* Reached only by goto, with the page locked and referenced. */
2213                unlock_page(page);
2214                page_cache_release(page);
2215                if (src_page)
2216                        page_cache_release(src_page);
2217
2218                /*
2219                 * prepare_write() may have instantiated a few blocks
2220                 * outside i_size.  Trim these off again. Don't need
2221                 * i_size_read because we hold i_mutex.
2222                 */
2223                if (pos + bytes > inode->i_size)
2224                        vmtruncate(inode, inode->i_size);
2225                break;
2226        } while (iov_iter_count(i));
2227
        /* Progress takes precedence over a late error. */
2228        return written ? written : status;
2229}
2230
/*
 * Buffered-write loop for address spaces that provide
 * ->write_begin/->write_end.  Each iteration handles at most one
 * pagecache page: fault in the source, ->write_begin, atomic copy from
 * the iterator, ->write_end.
 *
 * Returns the number of bytes written, or the last status if nothing was
 * written.
 */
2231static ssize_t generic_perform_write(struct file *file,
2232                                struct iov_iter *i, loff_t pos)
2233{
2234        struct address_space *mapping = file->f_mapping;
2235        const struct address_space_operations *a_ops = mapping->a_ops;
2236        long status = 0;
2237        ssize_t written = 0;
2238        unsigned int flags = 0;
2239
2240        /*
2241         * Copies from kernel address space cannot fail (NFSD is a big user).
2242         */
2243        if (segment_eq(get_fs(), KERNEL_DS))
2244                flags |= AOP_FLAG_UNINTERRUPTIBLE;
2245
2246        do {
2247                struct page *page;
2248                pgoff_t index;          /* Pagecache index for current page */
2249                unsigned long offset;   /* Offset into pagecache page */
2250                unsigned long bytes;    /* Bytes to write to page */
2251                size_t copied;          /* Bytes copied from user */
2252                void *fsdata;
2253
                /* Never cross a page boundary within one iteration. */
2254                offset = (pos & (PAGE_CACHE_SIZE - 1));
2255                index = pos >> PAGE_CACHE_SHIFT;
2256                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2257                                                iov_iter_count(i));
2258
2259again:
2260
2261                /*
2262                 * Bring in the user page that we will copy from _first_.
2263                 * Otherwise there's a nasty deadlock on copying from the
2264                 * same page as we're writing to, without it being marked
2265                 * up-to-date.
2266                 *
2267                 * Not only is this an optimisation, but it is also required
2268                 * to check that the address is actually valid, when atomic
2269                 * usercopies are used, below.
2270                 */
2271                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2272                        status = -EFAULT;
2273                        break;
2274                }
2275
2276                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2277                                                &page, &fsdata);
2278                if (unlikely(status))
2279                        break;
2280
                /* Copy with pagefaults disabled; the result may be short. */
2281                pagefault_disable();
2282                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2283                pagefault_enable();
2284                flush_dcache_page(page);
2285
2286                status = a_ops->write_end(file, mapping, pos, bytes, copied,
2287                                                page, fsdata);
2288                if (unlikely(status < 0))
2289                        break;
                /* A non-negative ->write_end result is the byte count used. */
2290                copied = status;
2291
2292                cond_resched();
2293
2294                iov_iter_advance(i, copied);
2295                if (unlikely(copied == 0)) {
2296                        /*
2297                         * If we were unable to copy any data at all, we must
2298                         * fall back to a single segment length write.
2299                         *
2300                         * If we didn't fallback here, we could livelock
2301                         * because not all segments in the iov can be copied at
2302                         * once without a pagefault.
2303                         */
2304                        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2305                                                iov_iter_single_seg_count(i));
                        /* Retry at the same pos with the smaller length. */
2306                        goto again;
2307                }
2308                pos += copied;
2309                written += copied;
2310
2311                balance_dirty_pages_ratelimited(mapping);
2312
2313        } while (iov_iter_count(i));
2314
        /* Progress takes precedence over a late error. */
2315        return written ? written : status;
2316}
2317
2318ssize_t
2319generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2320                unsigned long nr_segs, loff_t pos, loff_t *ppos,
2321                size_t count, ssize_t written)
2322{
2323        struct file *file = iocb->ki_filp;
2324        struct address_space *mapping = file->f_mapping;
2325        const struct address_space_operations *a_ops = mapping->a_ops;
2326        struct inode *inode = mapping->host;
2327        ssize_t status;
2328        struct iov_iter i;
2329
2330        iov_iter_init(&i, iov, nr_segs, count, written);
2331        if (a_ops->write_begin)
2332                status = generic_perform_write(file, &i, pos);
2333        else
2334                status = generic_perform_write_2copy(file, &i, pos);
2335
2336        if (likely(status >= 0)) {
2337                written += status;
2338                *ppos = pos + status;
2339
2340                /*
2341                 * For now, when the user asks for O_SYNC, we'll actually give
2342                 * O_DSYNC
2343                 */
2344                if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2345                        if (!a_ops->writepage || !is_sync_kiocb(iocb))
2346                                status = generic_osync_inode(inode, mapping,
2347                                                OSYNC_METADATA|OSYNC_DATA);
2348                }
2349        }
2350        
2351        /*
2352         * If we get here for O_DIRECT writes then we must have fallen through
2353         * to buffered writes (block instantiation inside i_size).  So we sync
2354         * the file data here, to try to honour O_DIRECT expectations.
2355         */
2356        if (unlikely(file->f_flags & O_DIRECT) && written)
2357                status = filemap_write_and_wait(mapping);
2358
2359        return written ? written : status;
2360}
2361EXPORT_SYMBOL(generic_file_buffered_write);
2362
/*
 * Helper behind generic_file_aio_write_nolock().
 *
 * Validates the iovec, applies generic_write_checks(), strips suid/priv
 * attributes, updates the file times, and then writes: an O_DIRECT write
 * with buffered fallback for the remainder when the file has O_DIRECT,
 * otherwise a plain buffered write.  The starting position is read from
 * *@ppos.
 *
 * Returns the number of bytes written, or a negative error if nothing was
 * written.
 */
2363static ssize_t
2364__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2365                                unsigned long nr_segs, loff_t *ppos)
2366{
2367        struct file *file = iocb->ki_filp;
2368        struct address_space * mapping = file->f_mapping;
2369        size_t ocount;          /* original count */
2370        size_t count;           /* after file limit checks */
2371        struct inode    *inode = mapping->host;
2372        loff_t          pos;
2373        ssize_t         written;
2374        ssize_t         err;
2375
2376        ocount = 0;
2377        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2378        if (err)
2379                return err;
2380
2381        count = ocount;
2382        pos = *ppos;
2383
2384        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2385
2386        /* We can write back this queue in page reclaim */
2387        current->backing_dev_info = mapping->backing_dev_info;
2388        written = 0;
2389
        /* May adjust both pos and count. */
2390        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2391        if (err)
2392                goto out;
2393
        /* Nothing left to write: err is 0 here, so 0 is returned. */
2394        if (count == 0)
2395                goto out;
2396
2397        err = remove_suid(file->f_path.dentry);
2398        if (err)
2399                goto out;
2400
2401        file_update_time(file);
2402
2403        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2404        if (unlikely(file->f_flags & O_DIRECT)) {
2405                loff_t endbyte;
2406                ssize_t written_buffered;
2407
2408                written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2409                                                        ppos, count, ocount);
                /* Error, or the whole request went out directly: done. */
2410                if (written < 0 || written == count)
2411                        goto out;
2412                /*
2413                 * direct-io write to a hole: fall through to buffered I/O
2414                 * for completing the rest of the request.
2415                 */
2416                pos += written;
2417                count -= written;
2418                written_buffered = generic_file_buffered_write(iocb, iov,
2419                                                nr_segs, pos, ppos, count,
2420                                                written);
2421                /*
2422                 * If generic_file_buffered_write() returned a synchronous error
2423                 * then we want to return the number of bytes which were
2424                 * direct-written, or the error code if that was zero.  Note
2425                 * that this differs from normal direct-io semantics, which
2426                 * will return -EFOO even if some bytes were written.
2427                 */
2428                if (written_buffered < 0) {
2429                        err = written_buffered;
2430                        goto out;
2431                }
2432
2433                /*
2434                 * We need to ensure that the page cache pages are written to
2435                 * disk and invalidated to preserve the expected O_DIRECT
2436                 * semantics.
2437                 */
                /*
                 * written_buffered was seeded with the direct-written count,
                 * so subtracting `written` leaves the length of the buffered
                 * part, which starts at pos.
                 */
2438                endbyte = pos + written_buffered - written - 1;
2439                err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
2440                                            SYNC_FILE_RANGE_WAIT_BEFORE|
2441                                            SYNC_FILE_RANGE_WRITE|
2442                                            SYNC_FILE_RANGE_WAIT_AFTER);
2443                if (err == 0) {
2444                        written = written_buffered;
2445                        invalidate_mapping_pages(mapping,
2446                                                 pos >> PAGE_CACHE_SHIFT,
2447                                                 endbyte >> PAGE_CACHE_SHIFT);
2448                } else {
2449                        /*
2450                         * We don't know how much we wrote, so just return
2451                         * the number of bytes which were direct-written
2452                         */
2453                }
2454        } else {
2455                written = generic_file_buffered_write(iocb, iov, nr_segs,
2456                                pos, ppos, count, written);
2457        }
2458out:
2459        current->backing_dev_info = NULL;
        /* Progress takes precedence over an error. */
2460        return written ? written : err;
2461}
2462
2463ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
2464                const struct iovec *iov, unsigned long nr_segs, loff_t pos)
2465{
2466        struct file *file = iocb->ki_filp;
2467        struct address_space *mapping = file->f_mapping;
2468        struct inode *inode = mapping->host;
2469        ssize_t ret;
2470
2471        BUG_ON(iocb->ki_pos != pos);
2472
2473        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2474                        &iocb->ki_pos);
2475
2476        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2477                ssize_t err;
2478
2479                err = sync_page_range_nolock(inode, mapping, pos, ret);
2480                if (err < 0)
2481                        ret = err;
2482        }
2483        return ret;
2484}
2485EXPORT_SYMBOL(generic_file_aio_write_nolock);
2486
2487ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2488                unsigned long nr_segs, loff_t pos)
2489{
2490        struct file *file = iocb->ki_filp;
2491        struct address_space *mapping = file->f_mapping;
2492        struct inode *inode = mapping->host;
2493        ssize_t ret;
2494
2495        BUG_ON(iocb->ki_pos != pos);
2496
2497        mutex_lock(&inode->i_mutex);
2498        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2499                        &iocb->ki_pos);
2500        mutex_unlock(&inode->i_mutex);
2501
2502        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2503                ssize_t err;
2504
2505                err = sync_page_range(inode, mapping, pos, ret);
2506                if (err < 0)
2507                        ret = err;
2508        }
2509        return ret;
2510}
2511EXPORT_SYMBOL(generic_file_aio_write);
2512
2513/*
2514 * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
2515 * went wrong during pagecache shootdown.
2516 */
2517static ssize_t
2518generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2519        loff_t offset, unsigned long nr_segs)
2520{
2521        struct file *file = iocb->ki_filp;
2522        struct address_space *mapping = file->f_mapping;
2523        ssize_t retval;
2524        size_t write_len;
2525        pgoff_t end = 0; /* silence gcc */
2526
2527        /*
2528         * If it's a write, unmap all mmappings of the file up-front.  This
2529         * will cause any pte dirty bits to be propagated into the pageframes
2530         * for the subsequent filemap_write_and_wait().
2531         */
2532        if (rw == WRITE) {
2533                write_len = iov_length(iov, nr_segs);
2534                end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2535                if (mapping_mapped(mapping))
2536                        unmap_mapping_range(mapping, offset, write_len, 0);
2537        }
2538
2539        retval = filemap_write_and_wait(mapping);
2540        if (retval)
2541                goto out;
2542
2543        /*
2544         * After a write we want buffered reads to be sure to go to disk to get
2545         * the new data.  We invalidate clean cached page from the region we're
2546         * about to write.  We do this *before* the write so that we can return
2547         * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2548         */
2549        if (rw == WRITE && mapping->nrpages) {
2550                retval = invalidate_inode_pages2_range(mapping,
2551                                        offset >> PAGE_CACHE_SHIFT, end);
2552                if (retval)
2553                        goto out;
2554        }
2555
2556        retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2557
2558        /*
2559         * Finally, try again to invalidate clean pages which might have been
2560         * cached by non-direct readahead, or faulted in by get_user_pages()
2561         * if the source of the write was an mmap'ed region of the file
2562         * we're writing.  Either one is a pretty crazy thing to do,
2563         * so we don't support it 100%.  If this invalidation
2564         * fails, tough, the write still worked...
2565         */
2566        if (rw == WRITE && mapping->nrpages) {
2567                invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2568        }
2569out:
2570        return retval;
2571}
2572
2573/**
2574 * try_to_release_page() - release old fs-specific metadata on a page
2575 *
2576 * @page: the page which the kernel is trying to free
2577 * @gfp_mask: memory allocation flags (and I/O mode)
2578 *
2579 * The address_space is to try to release any data against the page
2580 * (presumably at page->private).  If the release was successful, return `1'.
2581 * Otherwise return zero.
2582 *
2583 * The @gfp_mask argument specifies whether I/O may be performed to release
2584 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2585 *
2586 */
2587int try_to_release_page(struct page *page, gfp_t gfp_mask)
2588{
2589        struct address_space * const mapping = page->mapping;
2590
2591        BUG_ON(!PageLocked(page));
2592        if (PageWriteback(page))
2593                return 0;
2594
2595        if (mapping && mapping->a_ops->releasepage)
2596                return mapping->a_ops->releasepage(page, gfp_mask);
2597        return try_to_free_buffers(page);
2598}
2599
2600EXPORT_SYMBOL(try_to_release_page);
2601
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.