linux/mm/filemap.c
<<
>>
Prefs
   1/*
   2 *      linux/mm/filemap.c
   3 *
   4 * Copyright (C) 1994-1999  Linus Torvalds
   5 */
   6
   7/*
   8 * This file handles the generic file mmap semantics used by
   9 * most "normal" filesystems (but you don't /have/ to use this:
  10 * the NFS filesystem used to do this differently, for example)
  11 */
  12#include <linux/module.h>
  13#include <linux/slab.h>
  14#include <linux/compiler.h>
  15#include <linux/fs.h>
  16#include <linux/uaccess.h>
  17#include <linux/aio.h>
  18#include <linux/capability.h>
  19#include <linux/kernel_stat.h>
  20#include <linux/mm.h>
  21#include <linux/swap.h>
  22#include <linux/mman.h>
  23#include <linux/pagemap.h>
  24#include <linux/file.h>
  25#include <linux/uio.h>
  26#include <linux/hash.h>
  27#include <linux/writeback.h>
  28#include <linux/pagevec.h>
  29#include <linux/blkdev.h>
  30#include <linux/security.h>
  31#include <linux/syscalls.h>
  32#include <linux/cpuset.h>
  33#include "filemap.h"
  34#include "internal.h"
  35
  36/*
  37 * FIXME: remove all knowledge of the buffer layer from the core VM
  38 */
  39#include <linux/buffer_head.h> /* for generic_osync_inode */
  40
  41#include <asm/mman.h>
  42
  43static ssize_t
  44generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  45        loff_t offset, unsigned long nr_segs);
  46
  47/*
  48 * Shared mappings implemented 30.11.1994. It's not fully working yet,
  49 * though.
  50 *
  51 * Shared mappings now work. 15.8.1995  Bruno.
  52 *
  53 * finished 'unifying' the page and buffer cache and SMP-threaded the
  54 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  55 *
  56 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  57 */
  58
  59/*
  60 * Lock ordering:
  61 *
  62 *  ->i_mmap_lock               (vmtruncate)
  63 *    ->private_lock            (__free_pte->__set_page_dirty_buffers)
  64 *      ->swap_lock             (exclusive_swap_page, others)
  65 *        ->mapping->tree_lock
  66 *
  67 *  ->i_mutex
  68 *    ->i_mmap_lock             (truncate->unmap_mapping_range)
  69 *
  70 *  ->mmap_sem
  71 *    ->i_mmap_lock
  72 *      ->page_table_lock or pte_lock   (various, mainly in memory.c)
  73 *        ->mapping->tree_lock  (arch-dependent flush_dcache_mmap_lock)
  74 *
  75 *  ->mmap_sem
  76 *    ->lock_page               (access_process_vm)
  77 *
  78 *  ->i_mutex                   (generic_file_buffered_write)
  79 *    ->mmap_sem                (fault_in_pages_readable->do_page_fault)
  80 *
  81 *  ->i_mutex
  82 *    ->i_alloc_sem             (various)
  83 *
  84 *  ->inode_lock
  85 *    ->sb_lock                 (fs/fs-writeback.c)
  86 *    ->mapping->tree_lock      (__sync_single_inode)
  87 *
  88 *  ->i_mmap_lock
  89 *    ->anon_vma.lock           (vma_adjust)
  90 *
  91 *  ->anon_vma.lock
  92 *    ->page_table_lock or pte_lock     (anon_vma_prepare and various)
  93 *
  94 *  ->page_table_lock or pte_lock
  95 *    ->swap_lock               (try_to_unmap_one)
  96 *    ->private_lock            (try_to_unmap_one)
  97 *    ->tree_lock               (try_to_unmap_one)
  98 *    ->zone.lru_lock           (follow_page->mark_page_accessed)
  99 *    ->zone.lru_lock           (check_pte_range->isolate_lru_page)
 100 *    ->private_lock            (page_remove_rmap->set_page_dirty)
 101 *    ->tree_lock               (page_remove_rmap->set_page_dirty)
 102 *    ->inode_lock              (page_remove_rmap->set_page_dirty)
 103 *    ->inode_lock              (zap_pte_range->set_page_dirty)
 104 *    ->private_lock            (zap_pte_range->__set_page_dirty_buffers)
 105 *
 106 *  ->task->proc_lock
 107 *    ->dcache_lock             (proc_pid_lookup)
 108 */
 109
 110/*
 111 * Remove a page from the page cache and free it. Caller has to make
 112 * sure the page is locked and that nobody else uses it - or that usage
 113 * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
 114 */
 115void __remove_from_page_cache(struct page *page)
 116{
 117        struct address_space *mapping = page->mapping;
 118
 119        radix_tree_delete(&mapping->page_tree, page->index);
 120        page->mapping = NULL;
 121        mapping->nrpages--;
 122        __dec_zone_page_state(page, NR_FILE_PAGES);
 123}
 124
 125void remove_from_page_cache(struct page *page)
 126{
 127        struct address_space *mapping = page->mapping;
 128
 129        BUG_ON(!PageLocked(page));
 130
 131        write_lock_irq(&mapping->tree_lock);
 132        __remove_from_page_cache(page);
 133        write_unlock_irq(&mapping->tree_lock);
 134}
 135
 136static int sync_page(void *word)
 137{
 138        struct address_space *mapping;
 139        struct page *page;
 140
 141        page = container_of((unsigned long *)word, struct page, flags);
 142
 143        /*
 144         * page_mapping() is being called without PG_locked held.
 145         * Some knowledge of the state and use of the page is used to
 146         * reduce the requirements down to a memory barrier.
 147         * The danger here is of a stale page_mapping() return value
 148         * indicating a struct address_space different from the one it's
 149         * associated with when it is associated with one.
 150         * After smp_mb(), it's either the correct page_mapping() for
 151         * the page, or an old page_mapping() and the page's own
 152         * page_mapping() has gone NULL.
 153         * The ->sync_page() address_space operation must tolerate
 154         * page_mapping() going NULL. By an amazing coincidence,
 155         * this comes about because none of the users of the page
 156         * in the ->sync_page() methods make essential use of the
 157         * page_mapping(), merely passing the page down to the backing
 158         * device's unplug functions when it's non-NULL, which in turn
 159         * ignore it for all cases but swap, where only page_private(page) is
 160         * of interest. When page_mapping() does go NULL, the entire
 161         * call stack gracefully ignores the page and returns.
 162         * -- wli
 163         */
 164        smp_mb();
 165        mapping = page_mapping(page);
 166        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 167                mapping->a_ops->sync_page(page);
 168        io_schedule();
 169        return 0;
 170}
 171
 172/**
 173 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 174 * @mapping:    address space structure to write
 175 * @start:      offset in bytes where the range starts
 176 * @end:        offset in bytes where the range ends (inclusive)
 177 * @sync_mode:  enable synchronous operation
 178 *
 179 * Start writeback against all of a mapping's dirty pages that lie
 180 * within the byte offsets <start, end> inclusive.
 181 *
 182 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 183 * opposed to a regular memory cleansing writeback.  The difference between
 184 * these two operations is that if a dirty page/buffer is encountered, it must
 185 * be waited upon, and not just skipped over.
 186 */
 187int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 188                                loff_t end, int sync_mode)
 189{
 190        int ret;
 191        struct writeback_control wbc = {
 192                .sync_mode = sync_mode,
 193                .nr_to_write = mapping->nrpages * 2,
 194                .range_start = start,
 195                .range_end = end,
 196        };
 197
 198        if (!mapping_cap_writeback_dirty(mapping))
 199                return 0;
 200
 201        ret = do_writepages(mapping, &wbc);
 202        return ret;
 203}
 204
 205static inline int __filemap_fdatawrite(struct address_space *mapping,
 206        int sync_mode)
 207{
 208        return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
 209}
 210
 211int filemap_fdatawrite(struct address_space *mapping)
 212{
 213        return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 214}
 215EXPORT_SYMBOL(filemap_fdatawrite);
 216
 217static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 218                                loff_t end)
 219{
 220        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 221}
 222
 223/**
 224 * filemap_flush - mostly a non-blocking flush
 225 * @mapping:    target address_space
 226 *
 227 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 228 * purposes - I/O may not be started against all dirty pages.
 229 */
 230int filemap_flush(struct address_space *mapping)
 231{
 232        return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
 233}
 234EXPORT_SYMBOL(filemap_flush);
 235
 236/**
 237 * wait_on_page_writeback_range - wait for writeback to complete
 238 * @mapping:    target address_space
 239 * @start:      beginning page index
 240 * @end:        ending page index
 241 *
 242 * Wait for writeback to complete against pages indexed by start->end
 243 * inclusive
 244 */
 245int wait_on_page_writeback_range(struct address_space *mapping,
 246                                pgoff_t start, pgoff_t end)
 247{
 248        struct pagevec pvec;
 249        int nr_pages;
 250        int ret = 0;
 251        pgoff_t index;
 252
 253        if (end < start)
 254                return 0;
 255
 256        pagevec_init(&pvec, 0);
 257        index = start;
 258        while ((index <= end) &&
 259                        (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 260                        PAGECACHE_TAG_WRITEBACK,
 261                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
 262                unsigned i;
 263
 264                for (i = 0; i < nr_pages; i++) {
 265                        struct page *page = pvec.pages[i];
 266
 267                        /* until radix tree lookup accepts end_index */
 268                        if (page->index > end)
 269                                continue;
 270
 271                        wait_on_page_writeback(page);
 272                        if (PageError(page))
 273                                ret = -EIO;
 274                }
 275                pagevec_release(&pvec);
 276                cond_resched();
 277        }
 278
 279        /* Check for outstanding write errors */
 280        if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
 281                ret = -ENOSPC;
 282        if (test_and_clear_bit(AS_EIO, &mapping->flags))
 283                ret = -EIO;
 284
 285        return ret;
 286}
 287
 288/**
 289 * sync_page_range - write and wait on all pages in the passed range
 290 * @inode:      target inode
 291 * @mapping:    target address_space
 292 * @pos:        beginning offset in pages to write
 293 * @count:      number of bytes to write
 294 *
 295 * Write and wait upon all the pages in the passed range.  This is a "data
 296 * integrity" operation.  It waits upon in-flight writeout before starting and
 297 * waiting upon new writeout.  If there was an IO error, return it.
 298 *
 299 * We need to re-take i_mutex during the generic_osync_inode list walk because
 300 * it is otherwise livelockable.
 301 */
 302int sync_page_range(struct inode *inode, struct address_space *mapping,
 303                        loff_t pos, loff_t count)
 304{
 305        pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 306        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 307        int ret;
 308
 309        if (!mapping_cap_writeback_dirty(mapping) || !count)
 310                return 0;
 311        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 312        if (ret == 0) {
 313                mutex_lock(&inode->i_mutex);
 314                ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 315                mutex_unlock(&inode->i_mutex);
 316        }
 317        if (ret == 0)
 318                ret = wait_on_page_writeback_range(mapping, start, end);
 319        return ret;
 320}
 321EXPORT_SYMBOL(sync_page_range);
 322
 323/**
 324 * sync_page_range_nolock
 325 * @inode:      target inode
 326 * @mapping:    target address_space
 327 * @pos:        beginning offset in pages to write
 328 * @count:      number of bytes to write
 329 *
 330 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
 331 * as it forces O_SYNC writers to different parts of the same file
 332 * to be serialised right until io completion.
 333 */
 334int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
 335                           loff_t pos, loff_t count)
 336{
 337        pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 338        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 339        int ret;
 340
 341        if (!mapping_cap_writeback_dirty(mapping) || !count)
 342                return 0;
 343        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 344        if (ret == 0)
 345                ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 346        if (ret == 0)
 347                ret = wait_on_page_writeback_range(mapping, start, end);
 348        return ret;
 349}
 350EXPORT_SYMBOL(sync_page_range_nolock);
 351
 352/**
 353 * filemap_fdatawait - wait for all under-writeback pages to complete
 354 * @mapping: address space structure to wait for
 355 *
 356 * Walk the list of under-writeback pages of the given address space
 357 * and wait for all of them.
 358 */
 359int filemap_fdatawait(struct address_space *mapping)
 360{
 361        loff_t i_size = i_size_read(mapping->host);
 362
 363        if (i_size == 0)
 364                return 0;
 365
 366        return wait_on_page_writeback_range(mapping, 0,
 367                                (i_size - 1) >> PAGE_CACHE_SHIFT);
 368}
 369EXPORT_SYMBOL(filemap_fdatawait);
 370
 371int filemap_write_and_wait(struct address_space *mapping)
 372{
 373        int err = 0;
 374
 375        if (mapping->nrpages) {
 376                err = filemap_fdatawrite(mapping);
 377                /*
 378                 * Even if the above returned error, the pages may be
 379                 * written partially (e.g. -ENOSPC), so we wait for it.
 380                 * But the -EIO is special case, it may indicate the worst
 381                 * thing (e.g. bug) happened, so we avoid waiting for it.
 382                 */
 383                if (err != -EIO) {
 384                        int err2 = filemap_fdatawait(mapping);
 385                        if (!err)
 386                                err = err2;
 387                }
 388        }
 389        return err;
 390}
 391EXPORT_SYMBOL(filemap_write_and_wait);
 392
 393/**
 394 * filemap_write_and_wait_range - write out & wait on a file range
 395 * @mapping:    the address_space for the pages
 396 * @lstart:     offset in bytes where the range starts
 397 * @lend:       offset in bytes where the range ends (inclusive)
 398 *
 399 * Write out and wait upon file offsets lstart->lend, inclusive.
 400 *
 401 * Note that `lend' is inclusive (describes the last byte to be written) so
 402 * that this function can be used to write to the very end-of-file (end = -1).
 403 */
 404int filemap_write_and_wait_range(struct address_space *mapping,
 405                                 loff_t lstart, loff_t lend)
 406{
 407        int err = 0;
 408
 409        if (mapping->nrpages) {
 410                err = __filemap_fdatawrite_range(mapping, lstart, lend,
 411                                                 WB_SYNC_ALL);
 412                /* See comment of filemap_write_and_wait() */
 413                if (err != -EIO) {
 414                        int err2 = wait_on_page_writeback_range(mapping,
 415                                                lstart >> PAGE_CACHE_SHIFT,
 416                                                lend >> PAGE_CACHE_SHIFT);
 417                        if (!err)
 418                                err = err2;
 419                }
 420        }
 421        return err;
 422}
 423
 424/**
 425 * add_to_page_cache - add newly allocated pagecache pages
 426 * @page:       page to add
 427 * @mapping:    the page's address_space
 428 * @offset:     page index
 429 * @gfp_mask:   page allocation mode
 430 *
 431 * This function is used to add newly allocated pagecache pages;
 432 * the page is new, so we can just run SetPageLocked() against it.
 433 * The other page state flags were set by rmqueue().
 434 *
 435 * This function does not add the page to the LRU.  The caller must do that.
 436 */
 437int add_to_page_cache(struct page *page, struct address_space *mapping,
 438                pgoff_t offset, gfp_t gfp_mask)
 439{
 440        int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 441
 442        if (error == 0) {
 443                write_lock_irq(&mapping->tree_lock);
 444                error = radix_tree_insert(&mapping->page_tree, offset, page);
 445                if (!error) {
 446                        page_cache_get(page);
 447                        SetPageLocked(page);
 448                        page->mapping = mapping;
 449                        page->index = offset;
 450                        mapping->nrpages++;
 451                        __inc_zone_page_state(page, NR_FILE_PAGES);
 452                }
 453                write_unlock_irq(&mapping->tree_lock);
 454                radix_tree_preload_end();
 455        }
 456        return error;
 457}
 458EXPORT_SYMBOL(add_to_page_cache);
 459
 460int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 461                                pgoff_t offset, gfp_t gfp_mask)
 462{
 463        int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
 464        if (ret == 0)
 465                lru_cache_add(page);
 466        return ret;
 467}
 468
 469#ifdef CONFIG_NUMA
 470struct page *__page_cache_alloc(gfp_t gfp)
 471{
 472        if (cpuset_do_page_mem_spread()) {
 473                int n = cpuset_mem_spread_node();
 474                return alloc_pages_node(n, gfp, 0);
 475        }
 476        return alloc_pages(gfp, 0);
 477}
 478EXPORT_SYMBOL(__page_cache_alloc);
 479#endif
 480
 481static int __sleep_on_page_lock(void *word)
 482{
 483        io_schedule();
 484        return 0;
 485}
 486
 487/*
 488 * In order to wait for pages to become available there must be
 489 * waitqueues associated with pages. By using a hash table of
 490 * waitqueues where the bucket discipline is to maintain all
 491 * waiters on the same queue and wake all when any of the pages
 492 * become available, and for the woken contexts to check to be
 493 * sure the appropriate page became available, this saves space
 494 * at a cost of "thundering herd" phenomena during rare hash
 495 * collisions.
 496 */
 497static wait_queue_head_t *page_waitqueue(struct page *page)
 498{
 499        const struct zone *zone = page_zone(page);
 500
 501        return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 502}
 503
 504static inline void wake_up_page(struct page *page, int bit)
 505{
 506        __wake_up_bit(page_waitqueue(page), &page->flags, bit);
 507}
 508
 509void fastcall wait_on_page_bit(struct page *page, int bit_nr)
 510{
 511        DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 512
 513        if (test_bit(bit_nr, &page->flags))
 514                __wait_on_bit(page_waitqueue(page), &wait, sync_page,
 515                                                        TASK_UNINTERRUPTIBLE);
 516}
 517EXPORT_SYMBOL(wait_on_page_bit);
 518
 519/**
 520 * unlock_page - unlock a locked page
 521 * @page: the page
 522 *
 523 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 524 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 525 * mechananism between PageLocked pages and PageWriteback pages is shared.
 526 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 527 *
 528 * The first mb is necessary to safely close the critical section opened by the
 529 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
 530 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
 531 * parallel wait_on_page_locked()).
 532 */
 533void fastcall unlock_page(struct page *page)
 534{
 535        smp_mb__before_clear_bit();
 536        if (!TestClearPageLocked(page))
 537                BUG();
 538        smp_mb__after_clear_bit(); 
 539        wake_up_page(page, PG_locked);
 540}
 541EXPORT_SYMBOL(unlock_page);
 542
 543/**
 544 * end_page_writeback - end writeback against a page
 545 * @page: the page
 546 */
 547void end_page_writeback(struct page *page)
 548{
 549        if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
 550                if (!test_clear_page_writeback(page))
 551                        BUG();
 552        }
 553        smp_mb__after_clear_bit();
 554        wake_up_page(page, PG_writeback);
 555}
 556EXPORT_SYMBOL(end_page_writeback);
 557
 558/**
 559 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 560 * @page: the page to lock
 561 *
 562 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 563 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 564 * chances are that on the second loop, the block layer's plug list is empty,
 565 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 566 */
 567void fastcall __lock_page(struct page *page)
 568{
 569        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 570
 571        __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
 572                                                        TASK_UNINTERRUPTIBLE);
 573}
 574EXPORT_SYMBOL(__lock_page);
 575
 576/*
 577 * Variant of lock_page that does not require the caller to hold a reference
 578 * on the page's mapping.
 579 */
 580void fastcall __lock_page_nosync(struct page *page)
 581{
 582        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 583        __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
 584                                                        TASK_UNINTERRUPTIBLE);
 585}
 586
 587/**
 588 * find_get_page - find and get a page reference
 589 * @mapping: the address_space to search
 590 * @offset: the page index
 591 *
 592 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 593 * If yes, increment its refcount and return it; if no, return NULL.
 594 */
 595struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 596{
 597        struct page *page;
 598
 599        read_lock_irq(&mapping->tree_lock);
 600        page = radix_tree_lookup(&mapping->page_tree, offset);
 601        if (page)
 602                page_cache_get(page);
 603        read_unlock_irq(&mapping->tree_lock);
 604        return page;
 605}
 606EXPORT_SYMBOL(find_get_page);
 607
 608/**
 609 * find_lock_page - locate, pin and lock a pagecache page
 610 * @mapping: the address_space to search
 611 * @offset: the page index
 612 *
 613 * Locates the desired pagecache page, locks it, increments its reference
 614 * count and returns its address.
 615 *
 616 * Returns zero if the page was not present. find_lock_page() may sleep.
 617 */
 618struct page *find_lock_page(struct address_space *mapping,
 619                                unsigned long offset)
 620{
 621        struct page *page;
 622
 623        read_lock_irq(&mapping->tree_lock);
 624repeat:
 625        page = radix_tree_lookup(&mapping->page_tree, offset);
 626        if (page) {
 627                page_cache_get(page);
 628                if (TestSetPageLocked(page)) {
 629                        read_unlock_irq(&mapping->tree_lock);
 630                        __lock_page(page);
 631                        read_lock_irq(&mapping->tree_lock);
 632
 633                        /* Has the page been truncated while we slept? */
 634                        if (unlikely(page->mapping != mapping ||
 635                                     page->index != offset)) {
 636                                unlock_page(page);
 637                                page_cache_release(page);
 638                                goto repeat;
 639                        }
 640                }
 641        }
 642        read_unlock_irq(&mapping->tree_lock);
 643        return page;
 644}
 645EXPORT_SYMBOL(find_lock_page);
 646
 647/**
 648 * find_or_create_page - locate or add a pagecache page
 649 * @mapping: the page's address_space
 650 * @index: the page's index into the mapping
 651 * @gfp_mask: page allocation mode
 652 *
 653 * Locates a page in the pagecache.  If the page is not present, a new page
 654 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 655 * LRU list.  The returned page is locked and has its reference count
 656 * incremented.
 657 *
 658 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
 659 * allocation!
 660 *
 661 * find_or_create_page() returns the desired page's address, or zero on
 662 * memory exhaustion.
 663 */
 664struct page *find_or_create_page(struct address_space *mapping,
 665                unsigned long index, gfp_t gfp_mask)
 666{
 667        struct page *page, *cached_page = NULL;
 668        int err;
 669repeat:
 670        page = find_lock_page(mapping, index);
 671        if (!page) {
 672                if (!cached_page) {
 673                        cached_page =
 674                                __page_cache_alloc(gfp_mask);
 675                        if (!cached_page)
 676                                return NULL;
 677                }
 678                err = add_to_page_cache_lru(cached_page, mapping,
 679                                        index, gfp_mask);
 680                if (!err) {
 681                        page = cached_page;
 682                        cached_page = NULL;
 683                } else if (err == -EEXIST)
 684                        goto repeat;
 685        }
 686        if (cached_page)
 687                page_cache_release(cached_page);
 688        return page;
 689}
 690EXPORT_SYMBOL(find_or_create_page);
 691
 692/**
 693 * find_get_pages - gang pagecache lookup
 694 * @mapping:    The address_space to search
 695 * @start:      The starting page index
 696 * @nr_pages:   The maximum number of pages
 697 * @pages:      Where the resulting pages are placed
 698 *
 699 * find_get_pages() will search for and return a group of up to
 700 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 701 * find_get_pages() takes a reference against the returned pages.
 702 *
 703 * The search returns a group of mapping-contiguous pages with ascending
 704 * indexes.  There may be holes in the indices due to not-present pages.
 705 *
 706 * find_get_pages() returns the number of pages which were found.
 707 */
 708unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 709                            unsigned int nr_pages, struct page **pages)
 710{
 711        unsigned int i;
 712        unsigned int ret;
 713
 714        read_lock_irq(&mapping->tree_lock);
 715        ret = radix_tree_gang_lookup(&mapping->page_tree,
 716                                (void **)pages, start, nr_pages);
 717        for (i = 0; i < ret; i++)
 718                page_cache_get(pages[i]);
 719        read_unlock_irq(&mapping->tree_lock);
 720        return ret;
 721}
 722
 723/**
 724 * find_get_pages_contig - gang contiguous pagecache lookup
 725 * @mapping:    The address_space to search
 726 * @index:      The starting page index
 727 * @nr_pages:   The maximum number of pages
 728 * @pages:      Where the resulting pages are placed
 729 *
 730 * find_get_pages_contig() works exactly like find_get_pages(), except
 731 * that the returned number of pages are guaranteed to be contiguous.
 732 *
 733 * find_get_pages_contig() returns the number of pages which were found.
 734 */
 735unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 736                               unsigned int nr_pages, struct page **pages)
 737{
 738        unsigned int i;
 739        unsigned int ret;
 740
 741        read_lock_irq(&mapping->tree_lock);
 742        ret = radix_tree_gang_lookup(&mapping->page_tree,
 743                                (void **)pages, index, nr_pages);
 744        for (i = 0; i < ret; i++) {
 745                if (pages[i]->mapping == NULL || pages[i]->index != index)
 746                        break;
 747
 748                page_cache_get(pages[i]);
 749                index++;
 750        }
 751        read_unlock_irq(&mapping->tree_lock);
 752        return i;
 753}
 754EXPORT_SYMBOL(find_get_pages_contig);
 755
 756/**
 757 * find_get_pages_tag - find and return pages that match @tag
 758 * @mapping:    the address_space to search
 759 * @index:      the starting page index
 760 * @tag:        the tag index
 761 * @nr_pages:   the maximum number of pages
 762 * @pages:      where the resulting pages are placed
 763 *
 764 * Like find_get_pages, except we only return pages which are tagged with
 765 * @tag.   We update @index to index the next page for the traversal.
 766 */
 767unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 768                        int tag, unsigned int nr_pages, struct page **pages)
 769{
 770        unsigned int i;
 771        unsigned int ret;
 772
 773        read_lock_irq(&mapping->tree_lock);
 774        ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
 775                                (void **)pages, *index, nr_pages, tag);
 776        for (i = 0; i < ret; i++)
 777                page_cache_get(pages[i]);
 778        if (ret)
 779                *index = pages[ret - 1]->index + 1;
 780        read_unlock_irq(&mapping->tree_lock);
 781        return ret;
 782}
 783EXPORT_SYMBOL(find_get_pages_tag);
 784
 785/**
 786 * grab_cache_page_nowait - returns locked page at given index in given cache
 787 * @mapping: target address_space
 788 * @index: the page index
 789 *
 790 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 791 * This is intended for speculative data generators, where the data can
 792 * be regenerated if the page couldn't be grabbed.  This routine should
 793 * be safe to call while holding the lock for another page.
 794 *
 795 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 796 * and deadlock against the caller's locked page.
 797 */
 798struct page *
 799grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 800{
 801        struct page *page = find_get_page(mapping, index);
 802
 803        if (page) {
 804                if (!TestSetPageLocked(page))
 805                        return page;
 806                page_cache_release(page);
 807                return NULL;
 808        }
 809        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
 810        if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
 811                page_cache_release(page);
 812                page = NULL;
 813        }
 814        return page;
 815}
 816EXPORT_SYMBOL(grab_cache_page_nowait);
 817
 818/*
 819 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 820 * a _large_ part of the i/o request. Imagine the worst scenario:
 821 *
 822 *      ---R__________________________________________B__________
 823 *         ^ reading here                             ^ bad block(assume 4k)
 824 *
 825 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 826 * => failing the whole request => read(R) => read(R+1) =>
 827 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 828 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 829 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 830 *
 831 * It is going insane. Fix it by quickly scaling down the readahead size.
 832 */
 833static void shrink_readahead_size_eio(struct file *filp,
 834                                        struct file_ra_state *ra)
 835{
 836        if (!ra->ra_pages)
 837                return;
 838
 839        ra->ra_pages /= 4;
 840}
 841
 842/**
 843 * do_generic_mapping_read - generic file read routine
 844 * @mapping:    address_space to be read
 845 * @_ra:        file's readahead state
 846 * @filp:       the file to read
 847 * @ppos:       current file position
 848 * @desc:       read_descriptor
 849 * @actor:      read method
 850 *
 851 * This is a generic file read routine, and uses the
 852 * mapping->a_ops->readpage() function for the actual low-level stuff.
 853 *
 854 * This is really ugly. But the goto's actually try to clarify some
 855 * of the logic when it comes to error handling etc.
 856 *
 857 * Note the struct file* is handed on to readpage and to the readahead
 858 * helpers.  It may be NULL, in which case file_accessed() is skipped.
     *
     * The readahead state is copied into a local at entry and written back
     * to @_ra on exit.  On exit *@ppos is recomputed from the final page
     * index and in-page offset.  Nothing is returned; failures are reported
     * through desc->error (-ENOMEM, -EIO, or the error from readpage or
     * add_to_page_cache_lru()).
 859 */
 860void do_generic_mapping_read(struct address_space *mapping,
 861                             struct file_ra_state *_ra,
 862                             struct file *filp,
 863                             loff_t *ppos,
 864                             read_descriptor_t *desc,
 865                             read_actor_t actor)
 866{
 867        struct inode *inode = mapping->host;
 868        unsigned long index;
 869        unsigned long end_index;
 870        unsigned long offset;
 871        unsigned long last_index;
 872        unsigned long next_index;
 873        unsigned long prev_index;
 874        unsigned int prev_offset;
 875        loff_t isize;
 876        struct page *cached_page;
 877        int error;
 878        struct file_ra_state ra = *_ra; /* local working copy, stored back at out: */
 879
 880        cached_page = NULL;     /* spare page kept across -EEXIST retries */
 881        index = *ppos >> PAGE_CACHE_SHIFT;      /* page index of the start position */
 882        next_index = index;
 883        prev_index = ra.prev_index;
 884        prev_offset = ra.prev_offset;
 885        last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;     /* first page index past the request */
 886        offset = *ppos & ~PAGE_CACHE_MASK;      /* byte offset inside the first page */
 887
 888        isize = i_size_read(inode);
 889        if (!isize)
 890                goto out;       /* empty file: nothing to copy */
 891
 892        end_index = (isize - 1) >> PAGE_CACHE_SHIFT;    /* page holding the last byte of the file */
 893        for (;;) {
 894                struct page *page;
 895                unsigned long nr, ret;
 896
 897                /* nr is the maximum number of bytes to copy from this page */
 898                nr = PAGE_CACHE_SIZE;
 899                if (index >= end_index) {
 900                        if (index > end_index)
 901                                goto out;
 902                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;      /* valid bytes in the last page */
 903                        if (nr <= offset) {
 904                                goto out;
 905                        }
 906                }
 907                nr = nr - offset;
 908
 909                cond_resched();
 910                if (index == next_index)        /* reached the index readahead last handed back */
 911                        next_index = page_cache_readahead(mapping, &ra, filp,
 912                                        index, last_index - index);
 913
 914find_page:
 915                page = find_get_page(mapping, index);
 916                if (unlikely(page == NULL)) {
 917                        handle_ra_miss(mapping, &ra, index);
 918                        goto no_cached_page;
 919                }
 920                if (!PageUptodate(page))
 921                        goto page_not_up_to_date;
 922page_ok:
 923
 924                /* If users can be writing to this page using arbitrary
 925                 * virtual addresses, take care about potential aliasing
 926                 * before reading the page on the kernel side.
 927                 */
 928                if (mapping_writably_mapped(mapping))
 929                        flush_dcache_page(page);
 930
 931                /*
 932                 * When a sequential read accesses a page several times,
 933                 * only mark it as accessed the first time.
 934                 */
 935                if (prev_index != index || offset != prev_offset)
 936                        mark_page_accessed(page);
 937                prev_index = index;
 938
 939                /*
 940                 * Ok, we have the page, and it's up-to-date, so
 941                 * now we can copy it to user space...
 942                 *
 943                 * The actor routine returns how many bytes were actually used..
 944                 * NOTE! This may not be the same as how much of a user buffer
 945                 * we filled up (we may be padding etc), so we can only update
 946                 * "pos" here (the actor routine has to update the user buffer
 947                 * pointers and the remaining count).
 948                 */
 949                ret = actor(desc, page, offset, nr);
 950                offset += ret;
 951                index += offset >> PAGE_CACHE_SHIFT;    /* advance to the next page once offset reaches a full page */
 952                offset &= ~PAGE_CACHE_MASK;
 953                prev_offset = offset;
 954                ra.prev_offset = offset;
 955
 956                page_cache_release(page);
 957                if (ret == nr && desc->count)   /* whole chunk consumed and more is wanted */
 958                        continue;
 959                goto out;
 960
 961page_not_up_to_date:
 962                /* Get exclusive access to the page ... */
 963                lock_page(page);
 964
 965                /* Did it get truncated before we got the lock? */
 966                if (!page->mapping) {
 967                        unlock_page(page);
 968                        page_cache_release(page);
 969                        continue;       /* restart this index from the top of the loop */
 970                }
 971
 972                /* Did somebody else fill it already? */
 973                if (PageUptodate(page)) {
 974                        unlock_page(page);
 975                        goto page_ok;
 976                }
 977
 978readpage:
 979                /* Start the actual read. The read will unlock the page. */
 980                error = mapping->a_ops->readpage(filp, page);
 981
 982                if (unlikely(error)) {
 983                        if (error == AOP_TRUNCATED_PAGE) {
 984                                page_cache_release(page);
 985                                goto find_page;
 986                        }
 987                        goto readpage_error;
 988                }
 989
 990                if (!PageUptodate(page)) {
 991                        lock_page(page);
 992                        if (!PageUptodate(page)) {
 993                                if (page->mapping == NULL) {
 994                                        /*
 995                                         * invalidate_inode_pages got it
 996                                         */
 997                                        unlock_page(page);
 998                                        page_cache_release(page);
 999                                        goto find_page;
1000                                }
1001                                unlock_page(page);
1002                                error = -EIO;   /* still attached to the mapping but not uptodate: report an I/O error */
1003                                shrink_readahead_size_eio(filp, &ra);
1004                                goto readpage_error;
1005                        }
1006                        unlock_page(page);
1007                }
1008
1009                /*
1010                 * i_size must be checked after we have done ->readpage.
1011                 *
1012                 * Checking i_size after the readpage allows us to calculate
1013                 * the correct value for "nr", which means the zero-filled
1014                 * part of the page is not copied back to userspace (unless
1015                 * another truncate extends the file - this is desired though).
1016                 */
1017                isize = i_size_read(inode);
1018                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1019                if (unlikely(!isize || index > end_index)) {
1020                        page_cache_release(page);
1021                        goto out;
1022                }
1023
1024                /* nr is the maximum number of bytes to copy from this page */
1025                nr = PAGE_CACHE_SIZE;
1026                if (index == end_index) {
1027                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1028                        if (nr <= offset) {
1029                                page_cache_release(page);
1030                                goto out;
1031                        }
1032                }
1033                nr = nr - offset;
1034                goto page_ok;
1035
1036readpage_error:
1037                /* UHHUH! A synchronous read error occurred. Report it */
1038                desc->error = error;
1039                page_cache_release(page);
1040                goto out;
1041
1042no_cached_page:
1043                /*
1044                 * Ok, it wasn't cached, so we need to create a new
1045                 * page..
1046                 */
1047                if (!cached_page) {
1048                        cached_page = page_cache_alloc_cold(mapping);
1049                        if (!cached_page) {
1050                                desc->error = -ENOMEM;
1051                                goto out;
1052                        }
1053                }
1054                error = add_to_page_cache_lru(cached_page, mapping,
1055                                                index, GFP_KERNEL);
1056                if (error) {
1057                        if (error == -EEXIST)   /* somebody added the page first; keep the spare and look again */
1058                                goto find_page;
1059                        desc->error = error;
1060                        goto out;       /* the unused cached_page is released at out: */
1061                }
1062                page = cached_page;
1063                cached_page = NULL;     /* ownership moved to 'page' */
1064                goto readpage;
1065        }
1066
1067out:
1068        *_ra = ra;      /* publish the updated readahead state */
1069
1070        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1071        if (cached_page)
1072                page_cache_release(cached_page);
1073        if (filp)
1074                file_accessed(filp);
1075}
1076EXPORT_SYMBOL(do_generic_mapping_read);
1077
1078int file_read_actor(read_descriptor_t *desc, struct page *page,
1079                        unsigned long offset, unsigned long size)
1080{
1081        char *kaddr;
1082        unsigned long left, count = desc->count;
1083
1084        if (size > count)
1085                size = count;
1086
1087        /*
1088         * Faults on the destination of a read are common, so do it before
1089         * taking the kmap.
1090         */
1091        if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1092                kaddr = kmap_atomic(page, KM_USER0);
1093                left = __copy_to_user_inatomic(desc->arg.buf,
1094                                                kaddr + offset, size);
1095                kunmap_atomic(kaddr, KM_USER0);
1096                if (left == 0)
1097                        goto success;
1098        }
1099
1100        /* Do it the slow way */
1101        kaddr = kmap(page);
1102        left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1103        kunmap(page);
1104
1105        if (left) {
1106                size -= left;
1107                desc->error = -EFAULT;
1108        }
1109success:
1110        desc->count = count - size;
1111        desc->written += size;
1112        desc->arg.buf += size;
1113        return size;
1114}
1115
1116/*
1117 * Performs necessary checks before doing a write
1118 * @iov:        io vector request
1119 * @nr_segs:    number of segments in the iovec
1120 * @count:      number of bytes to write
1121 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1122 *
1123 * Adjust number of segments and amount of bytes to write (nr_segs should be
1124 * properly initialized first). Returns appropriate error code that caller
1125 * should return or zero in case that write should be allowed.
1126 */
1127int generic_segment_checks(const struct iovec *iov,
1128                        unsigned long *nr_segs, size_t *count, int access_flags)
1129{
1130        unsigned long   seg;
1131        size_t cnt = 0;
1132        for (seg = 0; seg < *nr_segs; seg++) {
1133                const struct iovec *iv = &iov[seg];
1134
1135                /*
1136                 * If any segment has a negative length, or the cumulative
1137                 * length ever wraps negative then return -EINVAL.
1138                 */
1139                cnt += iv->iov_len;
1140                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1141                        return -EINVAL;
1142                if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1143                        continue;
1144                if (seg == 0)
1145                        return -EFAULT;
1146                *nr_segs = seg;
1147                cnt -= iv->iov_len;     /* This segment is no good */
1148                break;
1149        }
1150        *count = cnt;
1151        return 0;
1152}
1153EXPORT_SYMBOL(generic_segment_checks);
1154
1155/**
1156 * generic_file_aio_read - generic filesystem read routine
1157 * @iocb:       kernel I/O control block
1158 * @iov:        io vector request
1159 * @nr_segs:    number of segments in the iovec
1160 * @pos:        current file position
1161 *
1162 * This is the "read()" routine for all filesystems
1163 * that can use the page cache directly.
1164 */
1165ssize_t
1166generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1167                unsigned long nr_segs, loff_t pos)
1168{
1169        struct file *filp = iocb->ki_filp;
1170        ssize_t retval;
1171        unsigned long seg;
1172        size_t count;
1173        loff_t *ppos = &iocb->ki_pos;
1174
1175        count = 0;
1176        retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1177        if (retval)
1178                return retval;
1179
1180        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1181        if (filp->f_flags & O_DIRECT) {
1182                loff_t size;
1183                struct address_space *mapping;
1184                struct inode *inode;
1185
1186                mapping = filp->f_mapping;
1187                inode = mapping->host;
1188                retval = 0;
1189                if (!count)
1190                        goto out; /* skip atime */
1191                size = i_size_read(inode);
1192                if (pos < size) {
1193                        retval = generic_file_direct_IO(READ, iocb,
1194                                                iov, pos, nr_segs);
1195                        if (retval > 0)
1196                                *ppos = pos + retval;
1197                }
1198                if (likely(retval != 0)) {
1199                        file_accessed(filp);
1200                        goto out;
1201                }
1202        }
1203
1204        retval = 0;
1205        if (count) {
1206                for (seg = 0; seg < nr_segs; seg++) {
1207                        read_descriptor_t desc;
1208
1209                        desc.written = 0;
1210                        desc.arg.buf = iov[seg].iov_base;
1211                        desc.count = iov[seg].iov_len;
1212                        if (desc.count == 0)
1213                                continue;
1214                        desc.error = 0;
1215                        do_generic_file_read(filp,ppos,&desc,file_read_actor);
1216                        retval += desc.written;
1217                        if (desc.error) {
1218                                retval = retval ?: desc.error;
1219                                break;
1220                        }
1221                }
1222        }
1223out:
1224        return retval;
1225}
1226EXPORT_SYMBOL(generic_file_aio_read);
1227
1228int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1229{
1230        ssize_t written;
1231        unsigned long count = desc->count;
1232        struct file *file = desc->arg.data;
1233
1234        if (size > count)
1235                size = count;
1236
1237        written = file->f_op->sendpage(file, page, offset,
1238                                       size, &file->f_pos, size<count);
1239        if (written < 0) {
1240                desc->error = written;
1241                written = 0;
1242        }
1243        desc->count = count - written;
1244        desc->written += written;
1245        return written;
1246}
1247
1248ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1249                         size_t count, read_actor_t actor, void *target)
1250{
1251        read_descriptor_t desc;
1252
1253        if (!count)
1254                return 0;
1255
1256        desc.written = 0;
1257        desc.count = count;
1258        desc.arg.data = target;
1259        desc.error = 0;
1260
1261        do_generic_file_read(in_file, ppos, &desc, actor);
1262        if (desc.written)
1263                return desc.written;
1264        return desc.error;
1265}
1266EXPORT_SYMBOL(generic_file_sendfile);
1267
1268static ssize_t
1269do_readahead(struct address_space *mapping, struct file *filp,
1270             unsigned long index, unsigned long nr)
1271{
1272        if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1273                return -EINVAL;
1274
1275        force_page_cache_readahead(mapping, filp, index,
1276                                        max_sane_readahead(nr));
1277        return 0;
1278}
1279
1280asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1281{
1282        ssize_t ret;
1283        struct file *file;
1284
1285        ret = -EBADF;
1286        file = fget(fd);
1287        if (file) {
1288                if (file->f_mode & FMODE_READ) {
1289                        struct address_space *mapping = file->f_mapping;
1290                        unsigned long start = offset >> PAGE_CACHE_SHIFT;
1291                        unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1292                        unsigned long len = end - start + 1;
1293                        ret = do_readahead(mapping, file, start, len);
1294                }
1295                fput(file);
1296        }
1297        return ret;
1298}
1299
1300#ifdef CONFIG_MMU
1301static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1302/**
1303 * page_cache_read - adds requested page to the page cache if not already there
1304 * @file:       file to read
1305 * @offset:     page index
1306 *
1307 * This adds the requested page to the page cache if it isn't already there,
1308 * and schedules an I/O to read in its contents from disk.
1309 */
1310static int fastcall page_cache_read(struct file * file, unsigned long offset)
1311{
1312        struct address_space *mapping = file->f_mapping;
1313        struct page *page; 
1314        int ret;
1315
1316        do {
1317                page = page_cache_alloc_cold(mapping);
1318                if (!page)
1319                        return -ENOMEM;
1320
1321                ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1322                if (ret == 0)
1323                        ret = mapping->a_ops->readpage(file, page);
1324                else if (ret == -EEXIST)
1325                        ret = 0; /* losing race to add is OK */
1326
1327                page_cache_release(page);
1328
1329        } while (ret == AOP_TRUNCATED_PAGE);
1330                
1331        return ret;
1332}
1333
1334#define MMAP_LOTSAMISS  (100)   /* allowed excess of mmap_miss over mmap_hit before read-around is skipped */
1335
1336/**
1337 * filemap_nopage - read in file data for page fault handling
1338 * @area:       the applicable vm_area
1339 * @address:    target address to read in
1340 * @type:       returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
1341 *
1342 * filemap_nopage() is invoked via the vma operations vector for a
1343 * mapped memory region to read in file data during a page fault.
1344 *
1345 * The goto's are kind of ugly, but this streamlines the normal case of having
1346 * it in the page cache, and handles the special cases reasonably without
1347 * having a lot of duplicated code.
     *
     * Returns the page, with a reference held, on success.  Returns
     * NOPAGE_SIGBUS when the offset is beyond the end of the file and the
     * vma belongs to the current task's own mm, or when reading the page
     * fails.  Returns NOPAGE_OOM when page_cache_read() reports -ENOMEM.
1348 */
1349struct page *filemap_nopage(struct vm_area_struct *area,
1350                                unsigned long address, int *type)
1351{
1352        int error;
1353        struct file *file = area->vm_file;
1354        struct address_space *mapping = file->f_mapping;
1355        struct file_ra_state *ra = &file->f_ra;
1356        struct inode *inode = mapping->host;
1357        struct page *page;
1358        unsigned long size, pgoff;
1359        int did_readaround = 0, majmin = VM_FAULT_MINOR;
1360
1361        pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;        /* file page index of the faulting address */
1362
1363retry_all:
1364        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;  /* file size in pages, rounded up */
1365        if (pgoff >= size)
1366                goto outside_data_content;
1367
1368        /* If we don't want any read-ahead, don't bother */
1369        if (VM_RandomReadHint(area))
1370                goto no_cached_page;
1371
1372        /*
1373         * The readahead code wants to be told about each and every page
1374         * so it can build and shrink its windows appropriately
1375         *
1376         * For sequential accesses, we use the generic readahead logic.
1377         */
1378        if (VM_SequentialReadHint(area))
1379                page_cache_readahead(mapping, ra, file, pgoff, 1);
1380
1381        /*
1382         * Do we have something in the page cache already?
1383         */
1384retry_find:
1385        page = find_get_page(mapping, pgoff);
1386        if (!page) {
1387                unsigned long ra_pages;
1388
1389                if (VM_SequentialReadHint(area)) {
1390                        handle_ra_miss(mapping, ra, pgoff);
1391                        goto no_cached_page;
1392                }
1393                ra->mmap_miss++;
1394
1395                /*
1396                 * Do we miss much more than hit in this file? If so,
1397                 * stop bothering with read-ahead. It will only hurt.
1398                 */
1399                if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
1400                        goto no_cached_page;
1401
1402                /*
1403                 * To keep the pgmajfault counter straight, we need to
1404                 * check did_readaround, as this is an inner loop.
1405                 */
1406                if (!did_readaround) {
1407                        majmin = VM_FAULT_MAJOR;
1408                        count_vm_event(PGMAJFAULT);
1409                }
1410                did_readaround = 1;
1411                ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1412                if (ra_pages) {
1413                        pgoff_t start = 0;
1414
1415                        if (pgoff > ra_pages / 2)       /* centre the window on pgoff, clamped at page 0 */
1416                                start = pgoff - ra_pages / 2;
1417                        do_page_cache_readahead(mapping, file, start, ra_pages);
1418                }
1419                page = find_get_page(mapping, pgoff);
1420                if (!page)
1421                        goto no_cached_page;
1422        }
1423
1424        if (!did_readaround)
1425                ra->mmap_hit++;
1426
1427        /*
1428         * Ok, found a page in the page cache, now we need to check
1429         * that it's up-to-date.
1430         */
1431        if (!PageUptodate(page))
1432                goto page_not_uptodate;
1433
1434success:
1435        /*
1436         * Found the page and have a reference on it.
1437         */
1438        mark_page_accessed(page);
1439        if (type)
1440                *type = majmin;
1441        return page;
1442
1443outside_data_content:
1444        /*
1445         * An external ptracer can access pages that normally aren't
1446         * accessible..
1447         */
1448        if (area->vm_mm == current->mm)
1449                return NOPAGE_SIGBUS;
1450        /* Fall through to the non-read-ahead case */
1451no_cached_page:
1452        /*
1453         * We're only likely to ever get here if MADV_RANDOM is in
1454         * effect.
1455         */
1456        error = page_cache_read(file, pgoff);
1457
1458        /*
1459         * The page we want has now been added to the page cache.
1460         * In the unlikely event that someone removed it in the
1461         * meantime, we'll just come back here and read it again.
1462         */
1463        if (error >= 0)
1464                goto retry_find;
1465
1466        /*
1467         * An error return from page_cache_read can result if the
1468         * system is low on memory, or a problem occurs while trying
1469         * to schedule I/O.
1470         */
1471        if (error == -ENOMEM)
1472                return NOPAGE_OOM;
1473        return NOPAGE_SIGBUS;
1474
1475page_not_uptodate:
1476        if (!did_readaround) {
1477                majmin = VM_FAULT_MAJOR;
1478                count_vm_event(PGMAJFAULT);
1479        }
1480
1481        /*
1482         * Umm, take care of errors if the page isn't up-to-date.
1483         * Try to re-read it _once_. We do this synchronously,
1484         * because there really aren't any performance issues here
1485         * and we need to check for errors.
1486         */
1487        lock_page(page);
1488
1489        /* Somebody truncated the page on us? */
1490        if (!page->mapping) {
1491                unlock_page(page);
1492                page_cache_release(page);
1493                goto retry_all; /* i_size may have changed: redo the size check too */
1494        }
1495
1496        /* Somebody else successfully read it in? */
1497        if (PageUptodate(page)) {
1498                unlock_page(page);
1499                goto success;
1500        }
1501        ClearPageError(page);
1502        error = mapping->a_ops->readpage(file, page);
1503        if (!error) {
1504                wait_on_page_locked(page);
1505                if (PageUptodate(page))
1506                        goto success;
1507        } else if (error == AOP_TRUNCATED_PAGE) {
1508                page_cache_release(page);
1509                goto retry_find;
1510        }
1511
1512        /*
1513         * Things didn't work out. Shrink the readahead window, drop our
1514         * reference on the page and report NOPAGE_SIGBUS to the mm layer.
1515         */
1516        shrink_readahead_size_eio(file, ra);
1517        page_cache_release(page);
1518        return NOPAGE_SIGBUS;
1519}
1520EXPORT_SYMBOL(filemap_nopage);
1521
/*
 * filemap_getpage - get an up-to-date page-cache page of a file
 * @file:       file whose mapping is searched
 * @pgoff:      page index within the file
 * @nonblock:   if non-zero, never read the page in: only return a page
 *              that is already cached and up to date
 *
 * Returns the page with a reference held (after mark_page_accessed()), or
 * NULL.  With @nonblock set, NULL means the page is not cached or not up
 * to date.  Otherwise NULL means page_cache_read() failed, the page was
 * truncated while it was being locked, or it was still not up to date
 * after ->readpage() had been tried twice.
 */
1522static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
1523                                        int nonblock)
1524{
1525        struct address_space *mapping = file->f_mapping;
1526        struct page *page;
1527        int error;
1528
1529        /*
1530         * Do we have something in the page cache already?
1531         */
1532retry_find:
1533        page = find_get_page(mapping, pgoff);
1534        if (!page) {
1535                if (nonblock)
1536                        return NULL;
1537                goto no_cached_page;
1538        }
1539
1540        /*
1541         * Ok, found a page in the page cache, now we need to check
1542         * that it's up-to-date.
1543         */
1544        if (!PageUptodate(page)) {
1545                if (nonblock) {
1546                        page_cache_release(page);
1547                        return NULL;
1548                }
1549                goto page_not_uptodate;
1550        }
1551
1552success:
1553        /*
1554         * Found the page and have a reference on it.
1555         */
1556        mark_page_accessed(page);
1557        return page;
1558
1559no_cached_page:
1560        error = page_cache_read(file, pgoff);
1561
1562        /*
1563         * The page we want has now been added to the page cache.
1564         * In the unlikely event that someone removed it in the
1565         * meantime, we'll just come back here and read it again.
1566         */
1567        if (error >= 0)
1568                goto retry_find;
1569
1570        /*
1571         * An error return from page_cache_read can result if the
1572         * system is low on memory, or a problem occurs while trying
1573         * to schedule I/O.
1574         */
1575        return NULL;
1576
1577page_not_uptodate:
1578        lock_page(page);
1579
1580        /* Did it get truncated while we waited for it? */
1581        if (!page->mapping) {
1582                unlock_page(page);
1583                goto err;
1584        }
1585
1586        /* Did somebody else get it up-to-date? */
1587        if (PageUptodate(page)) {
1588                unlock_page(page);
1589                goto success;
1590        }
1591
1592        error = mapping->a_ops->readpage(file, page);   /* first read attempt */
1593        if (!error) {
1594                wait_on_page_locked(page);
1595                if (PageUptodate(page))
1596                        goto success;
1597        } else if (error == AOP_TRUNCATED_PAGE) {
1598                page_cache_release(page);
1599                goto retry_find;
1600        }
1601
1602        /*
1603         * Umm, take care of errors if the page isn't up-to-date.
1604         * Try to re-read it _once_. We do this synchronously,
1605         * because there really aren't any performance issues here
1606         * and we need to check for errors.
1607         */
1608        lock_page(page);
1609
1610        /* Somebody truncated the page on us? */
1611        if (!page->mapping) {
1612                unlock_page(page);
1613                goto err;
1614        }
1615        /* Somebody else successfully read it in? */
1616        if (PageUptodate(page)) {
1617                unlock_page(page);
1618                goto success;
1619        }
1620
1621        ClearPageError(page);
1622        error = mapping->a_ops->readpage(file, page);   /* second and last read attempt */
1623        if (!error) {
1624                wait_on_page_locked(page);
1625                if (PageUptodate(page))
1626                        goto success;
1627        } else if (error == AOP_TRUNCATED_PAGE) {
1628                page_cache_release(page);
1629                goto retry_find;
1630        }
1631
1632        /*
1633         * Things didn't work out. Drop our reference on the page and
1634         * return NULL to tell the caller so.
1635         */
1636err:
1637        page_cache_release(page);
1638
1639        return NULL;
1640}
1641
1642int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
1643                unsigned long len, pgprot_t prot, unsigned long pgoff,
1644                int nonblock)
1645{
1646        struct file *file = vma->vm_file;
1647        struct address_space *mapping = file->f_mapping;
1648        struct inode *inode = mapping->host;
1649        unsigned long size;
1650        struct mm_struct *mm = vma->vm_mm;
1651        struct page *page;
1652        int err;
1653
1654        if (!nonblock)
1655                force_page_cache_readahead(mapping, vma->vm_file,
1656                                        pgoff, len >> PAGE_CACHE_SHIFT);
1657
1658repeat:
1659        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1660        if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
1661                return -EINVAL;
1662
1663        page = filemap_getpage(file, pgoff, nonblock);
1664
1665        /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
1666         * done in shmem_populate calling shmem_getpage */
1667        if (!page && !nonblock)
1668                return -ENOMEM;
1669
1670        if (page) {
1671                err = install_page(mm, vma, addr, page, prot);
1672                if (err) {
1673                        page_cache_release(page);
1674                        return err;
1675                }
1676        } else if (vma->vm_flags & VM_NONLINEAR) {
1677                /* No page was found just because we can't read it in now (being
1678                 * here implies nonblock != 0), but the page may exist, so set
1679                 * the PTE to fault it in later. */
1680                err = install_file_pte(mm, vma, addr, pgoff, prot);
1681                if (err)
1682                        return err;
1683        }
1684
1685        len -= PAGE_SIZE;
1686        addr += PAGE_SIZE;
1687        pgoff++;
1688        if (len)
1689                goto repeat;
1690
1691        return 0;
1692}
1693EXPORT_SYMBOL(filemap_populate);
1694
1695struct vm_operations_struct generic_file_vm_ops = {
1696        .nopage         = filemap_nopage,
1697        .populate       = filemap_populate,
1698};
1699
1700/* This is used for a general mmap of a disk file */
1701
1702int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1703{
1704        struct address_space *mapping = file->f_mapping;
1705
1706        if (!mapping->a_ops->readpage)
1707                return -ENOEXEC;
1708        file_accessed(file);
1709        vma->vm_ops = &generic_file_vm_ops;
1710        return 0;
1711}
1712
1713/*
1714 * This is for filesystems which do not implement ->writepage.
1715 */
1716int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1717{
1718        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1719                return -EINVAL;
1720        return generic_file_mmap(file, vma);
1721}
1722#else
1723int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1724{
1725        return -ENOSYS;
1726}
1727int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1728{
1729        return -ENOSYS;
1730}
1731#endif /* CONFIG_MMU */
1732
1733EXPORT_SYMBOL(generic_file_mmap);
1734EXPORT_SYMBOL(generic_file_readonly_mmap);
1735
1736static struct page *__read_cache_page(struct address_space *mapping,
1737                                unsigned long index,
1738                                int (*filler)(void *,struct page*),
1739                                void *data)
1740{
1741        struct page *page, *cached_page = NULL;
1742        int err;
1743repeat:
1744        page = find_get_page(mapping, index);
1745        if (!page) {
1746                if (!cached_page) {
1747                        cached_page = page_cache_alloc_cold(mapping);
1748                        if (!cached_page)
1749                                return ERR_PTR(-ENOMEM);
1750                }
1751                err = add_to_page_cache_lru(cached_page, mapping,
1752                                        index, GFP_KERNEL);
1753                if (err == -EEXIST)
1754                        goto repeat;
1755                if (err < 0) {
1756                        /* Presumably ENOMEM for radix tree node */
1757                        page_cache_release(cached_page);
1758                        return ERR_PTR(err);
1759                }
1760                page = cached_page;
1761                cached_page = NULL;
1762                err = filler(data, page);
1763                if (err < 0) {
1764                        page_cache_release(page);
1765                        page = ERR_PTR(err);
1766                }
1767        }
1768        if (cached_page)
1769                page_cache_release(cached_page);
1770        return page;
1771}
1772
1773/*
1774 * Same as read_cache_page, but don't wait for page to become unlocked
1775 * after submitting it to the filler.
1776 */
1777struct page *read_cache_page_async(struct address_space *mapping,
1778                                unsigned long index,
1779                                int (*filler)(void *,struct page*),
1780                                void *data)
1781{
1782        struct page *page;
1783        int err;
1784
1785retry:
1786        page = __read_cache_page(mapping, index, filler, data);
1787        if (IS_ERR(page))
1788                return page;
1789        if (PageUptodate(page))
1790                goto out;
1791
1792        lock_page(page);
1793        if (!page->mapping) {
1794                unlock_page(page);
1795                page_cache_release(page);
1796                goto retry;
1797        }
1798        if (PageUptodate(page)) {
1799                unlock_page(page);
1800                goto out;
1801        }
1802        err = filler(data, page);
1803        if (err < 0) {
1804                page_cache_release(page);
1805                return ERR_PTR(err);
1806        }
1807out:
1808        mark_page_accessed(page);
1809        return page;
1810}
1811EXPORT_SYMBOL(read_cache_page_async);
1812
1813/**
1814 * read_cache_page - read into page cache, fill it if needed
1815 * @mapping:    the page's address_space
1816 * @index:      the page index
1817 * @filler:     function to perform the read
1818 * @data:       destination for read data
1819 *
1820 * Read into the page cache. If a page already exists, and PageUptodate() is
1821 * not set, try to fill the page then wait for it to become unlocked.
1822 *
1823 * If the page does not get brought uptodate, return -EIO.
1824 */
1825struct page *read_cache_page(struct address_space *mapping,
1826                                unsigned long index,
1827                                int (*filler)(void *,struct page*),
1828                                void *data)
1829{
1830        struct page *page;
1831
1832        page = read_cache_page_async(mapping, index, filler, data);
1833        if (IS_ERR(page))
1834                goto out;
1835        wait_on_page_locked(page);
1836        if (!PageUptodate(page)) {
1837                page_cache_release(page);
1838                page = ERR_PTR(-EIO);
1839        }
1840 out:
1841        return page;
1842}
1843EXPORT_SYMBOL(read_cache_page);
1844
1845/*
1846 * If the page was newly created, increment its refcount and add it to the
1847 * caller's lru-buffering pagevec.  This function is specifically for
1848 * generic_file_write().
1849 */
1850static inline struct page *
1851__grab_cache_page(struct address_space *mapping, unsigned long index,
1852                        struct page **cached_page, struct pagevec *lru_pvec)
1853{
1854        int err;
1855        struct page *page;
1856repeat:
1857        page = find_lock_page(mapping, index);
1858        if (!page) {
1859                if (!*cached_page) {
1860                        *cached_page = page_cache_alloc(mapping);
1861                        if (!*cached_page)
1862                                return NULL;
1863                }
1864                err = add_to_page_cache(*cached_page, mapping,
1865                                        index, GFP_KERNEL);
1866                if (err == -EEXIST)
1867                        goto repeat;
1868                if (err == 0) {
1869                        page = *cached_page;
1870                        page_cache_get(page);
1871                        if (!pagevec_add(lru_pvec, page))
1872                                __pagevec_lru_add(lru_pvec);
1873                        *cached_page = NULL;
1874                }
1875        }
1876        return page;
1877}
1878
1879/*
1880 * The logic we want is
1881 *
1882 *      if suid or (sgid and xgrp)
1883 *              remove privs
1884 */
1885int should_remove_suid(struct dentry *dentry)
1886{
1887        mode_t mode = dentry->d_inode->i_mode;
1888        int kill = 0;
1889
1890        /* suid always must be killed */
1891        if (unlikely(mode & S_ISUID))
1892                kill = ATTR_KILL_SUID;
1893
1894        /*
1895         * sgid without any exec bits is just a mandatory locking mark; leave
1896         * it alone.  If some exec bits are set, it's a real sgid; kill it.
1897         */
1898        if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1899                kill |= ATTR_KILL_SGID;
1900
1901        if (unlikely(kill && !capable(CAP_FSETID)))
1902                return kill;
1903
1904        return 0;
1905}
1906EXPORT_SYMBOL(should_remove_suid);
1907
1908int __remove_suid(struct dentry *dentry, int kill)
1909{
1910        struct iattr newattrs;
1911
1912        newattrs.ia_valid = ATTR_FORCE | kill;
1913        return notify_change(dentry, &newattrs);
1914}
1915
/*
 * Strip suid/sgid from @dentry if should_remove_suid() says so.
 * Returns 0 when nothing had to be done, else the result of __remove_suid().
 */
int remove_suid(struct dentry *dentry)
{
	int kill = should_remove_suid(dentry);

	if (likely(!kill))
		return 0;

	return __remove_suid(dentry, kill);
}
1925EXPORT_SYMBOL(remove_suid);
1926
1927size_t
1928__filemap_copy_from_user_iovec_inatomic(char *vaddr,
1929                        const struct iovec *iov, size_t base, size_t bytes)
1930{
1931        size_t copied = 0, left = 0;
1932
1933        while (bytes) {
1934                char __user *buf = iov->iov_base + base;
1935                int copy = min(bytes, iov->iov_len - base);
1936
1937                base = 0;
1938                left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1939                copied += copy;
1940                bytes -= copy;
1941                vaddr += copy;
1942                iov++;
1943
1944                if (unlikely(left))
1945                        break;
1946        }
1947        return copied - left;
1948}
1949
1950/*
1951 * Performs necessary checks before doing a write
1952 *
1953 * Can adjust writing position or amount of bytes to write.
1954 * Returns appropriate error code that caller should return or
1955 * zero in case that write should be allowed.
1956 */
1957inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1958{
1959        struct inode *inode = file->f_mapping->host;
1960        unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1961
1962        if (unlikely(*pos < 0))
1963                return -EINVAL;
1964
1965        if (!isblk) {
1966                /* FIXME: this is for backwards compatibility with 2.4 */
1967                if (file->f_flags & O_APPEND)
1968                        *pos = i_size_read(inode);
1969
1970                if (limit != RLIM_INFINITY) {
1971                        if (*pos >= limit) {
1972                                send_sig(SIGXFSZ, current, 0);
1973                                return -EFBIG;
1974                        }
1975                        if (*count > limit - (typeof(limit))*pos) {
1976                                *count = limit - (typeof(limit))*pos;
1977                        }
1978                }
1979        }
1980
1981        /*
1982         * LFS rule
1983         */
1984        if (unlikely(*pos + *count > MAX_NON_LFS &&
1985                                !(file->f_flags & O_LARGEFILE))) {
1986                if (*pos >= MAX_NON_LFS) {
1987                        send_sig(SIGXFSZ, current, 0);
1988                        return -EFBIG;
1989                }
1990                if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1991                        *count = MAX_NON_LFS - (unsigned long)*pos;
1992                }
1993        }
1994
1995        /*
1996         * Are we about to exceed the fs block limit ?
1997         *
1998         * If we have written data it becomes a short write.  If we have
1999         * exceeded without writing data we send a signal and return EFBIG.
2000         * Linus frestrict idea will clean these up nicely..
2001         */
2002        if (likely(!isblk)) {
2003                if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
2004                        if (*count || *pos > inode->i_sb->s_maxbytes) {
2005                                send_sig(SIGXFSZ, current, 0);
2006                                return -EFBIG;
2007                        }
2008                        /* zero-length writes at ->s_maxbytes are OK */
2009                }
2010
2011                if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2012                        *count = inode->i_sb->s_maxbytes - *pos;
2013        } else {
2014#ifdef CONFIG_BLOCK
2015                loff_t isize;
2016                if (bdev_read_only(I_BDEV(inode)))
2017                        return -EPERM;
2018                isize = i_size_read(inode);
2019                if (*pos >= isize) {
2020                        if (*count || *pos > isize)
2021                                return -ENOSPC;
2022                }
2023
2024                if (*pos + *count > isize)
2025                        *count = isize - *pos;
2026#else
2027                return -EPERM;
2028#endif
2029        }
2030        return 0;
2031}
2032EXPORT_SYMBOL(generic_write_checks);
2033
2034ssize_t
2035generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2036                unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2037                size_t count, size_t ocount)
2038{
2039        struct file     *file = iocb->ki_filp;
2040        struct address_space *mapping = file->f_mapping;
2041        struct inode    *inode = mapping->host;
2042        ssize_t         written;
2043
2044        if (count != ocount)
2045                *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2046
2047        written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2048        if (written > 0) {
2049                loff_t end = pos + written;
2050                if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2051                        i_size_write(inode,  end);
2052                        mark_inode_dirty(inode);
2053                }
2054                *ppos = end;
2055        }
2056
2057        /*
2058         * Sync the fs metadata but not the minor inode changes and
2059         * of course not the data as we did direct DMA for the IO.
2060         * i_mutex is held, which protects generic_osync_inode() from
2061         * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
2062         */
2063        if ((written >= 0 || written == -EIOCBQUEUED) &&
2064            ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2065                int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2066                if (err < 0)
2067                        written = err;
2068        }
2069        return written;
2070}
2071EXPORT_SYMBOL(generic_file_direct_write);
2072
2073ssize_t
2074generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2075                unsigned long nr_segs, loff_t pos, loff_t *ppos,
2076                size_t count, ssize_t written)
2077{
2078        struct file *file = iocb->ki_filp;
2079        struct address_space * mapping = file->f_mapping;
2080        const struct address_space_operations *a_ops = mapping->a_ops;
2081        struct inode    *inode = mapping->host;
2082        long            status = 0;
2083        struct page     *page;
2084        struct page     *cached_page = NULL;
2085        size_t          bytes;
2086        struct pagevec  lru_pvec;
2087        const struct iovec *cur_iov = iov; /* current iovec */
2088        size_t          iov_base = 0;      /* offset in the current iovec */
2089        char __user     *buf;
2090
2091        pagevec_init(&lru_pvec, 0);
2092
2093        /*
2094         * handle partial DIO write.  Adjust cur_iov if needed.
2095         */
2096        if (likely(nr_segs == 1))
2097                buf = iov->iov_base + written;
2098        else {
2099                filemap_set_next_iovec(&cur_iov, &iov_base, written);
2100                buf = cur_iov->iov_base + iov_base;
2101        }
2102
2103        do {
2104                unsigned long index;
2105                unsigned long offset;
2106                size_t copied;
2107
2108                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
2109                index = pos >> PAGE_CACHE_SHIFT;
2110                bytes = PAGE_CACHE_SIZE - offset;
2111
2112                /* Limit the size of the copy to the caller's write size */
2113                bytes = min(bytes, count);
2114
2115                /* We only need to worry about prefaulting when writes are from
2116                 * user-space.  NFSd uses vfs_writev with several non-aligned
2117                 * segments in the vector, and limiting to one segment a time is
2118                 * a noticeable performance for re-write
2119                 */
2120                if (!segment_eq(get_fs(), KERNEL_DS)) {
2121                        /*
2122                         * Limit the size of the copy to that of the current
2123                         * segment, because fault_in_pages_readable() doesn't
2124                         * know how to walk segments.
2125                         */
2126                        bytes = min(bytes, cur_iov->iov_len - iov_base);
2127
2128                        /*
2129                         * Bring in the user page that we will copy from
2130                         * _first_.  Otherwise there's a nasty deadlock on
2131                         * copying from the same page as we're writing to,
2132                         * without it being marked up-to-date.
2133                         */
2134                        fault_in_pages_readable(buf, bytes);
2135                }
2136                page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
2137                if (!page) {
2138                        status = -ENOMEM;
2139                        break;
2140                }
2141
2142                if (unlikely(bytes == 0)) {
2143                        status = 0;
2144                        copied = 0;
2145                        goto zero_length_segment;
2146                }
2147
2148                status = a_ops->prepare_write(file, page, offset, offset+bytes);
2149                if (unlikely(status)) {
2150                        loff_t isize = i_size_read(inode);
2151
2152                        if (status != AOP_TRUNCATED_PAGE)
2153                                unlock_page(page);
2154                        page_cache_release(page);
2155                        if (status == AOP_TRUNCATED_PAGE)
2156                                continue;
2157                        /*
2158                         * prepare_write() may have instantiated a few blocks
2159                         * outside i_size.  Trim these off again.
2160                         */
2161                        if (pos + bytes > isize)
2162                                vmtruncate(inode, isize);
2163                        break;
2164                }
2165                if (likely(nr_segs == 1))
2166                        copied = filemap_copy_from_user(page, offset,
2167                                                        buf, bytes);
2168                else
2169                        copied = filemap_copy_from_user_iovec(page, offset,
2170                                                cur_iov, iov_base, bytes);
2171                flush_dcache_page(page);
2172                status = a_ops->commit_write(file, page, offset, offset+bytes);
2173                if (status == AOP_TRUNCATED_PAGE) {
2174                        page_cache_release(page);
2175                        continue;
2176                }
2177zero_length_segment:
2178                if (likely(copied >= 0)) {
2179                        if (!status)
2180                                status = copied;
2181
2182                        if (status >= 0) {
2183                                written += status;
2184                                count -= status;
2185                                pos += status;
2186                                buf += status;
2187                                if (unlikely(nr_segs > 1)) {
2188                                        filemap_set_next_iovec(&cur_iov,
2189                                                        &iov_base, status);
2190                                        if (count)
2191                                                buf = cur_iov->iov_base +
2192                                                        iov_base;
2193                                } else {
2194                                        iov_base += status;
2195                                }
2196                        }
2197                }
2198                if (unlikely(copied != bytes))
2199                        if (status >= 0)
2200                                status = -EFAULT;
2201                unlock_page(page);
2202                mark_page_accessed(page);
2203                page_cache_release(page);
2204                if (status < 0)
2205                        break;
2206                balance_dirty_pages_ratelimited(mapping);
2207                cond_resched();
2208        } while (count);
2209        *ppos = pos;
2210
2211        if (cached_page)
2212                page_cache_release(cached_page);
2213
2214        /*
2215         * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
2216         */
2217        if (likely(status >= 0)) {
2218                if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2219                        if (!a_ops->writepage || !is_sync_kiocb(iocb))
2220                                status = generic_osync_inode(inode, mapping,
2221                                                OSYNC_METADATA|OSYNC_DATA);
2222                }
2223        }
2224        
2225        /*
2226         * If we get here for O_DIRECT writes then we must have fallen through
2227         * to buffered writes (block instantiation inside i_size).  So we sync
2228         * the file data here, to try to honour O_DIRECT expectations.
2229         */
2230        if (unlikely(file->f_flags & O_DIRECT) && written)
2231                status = filemap_write_and_wait(mapping);
2232
2233        pagevec_lru_add(&lru_pvec);
2234        return written ? written : status;
2235}
2236EXPORT_SYMBOL(generic_file_buffered_write);
2237
2238static ssize_t
2239__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2240                                unsigned long nr_segs, loff_t *ppos)
2241{
2242        struct file *file = iocb->ki_filp;
2243        struct address_space * mapping = file->f_mapping;
2244        size_t ocount;          /* original count */
2245        size_t count;           /* after file limit checks */
2246        struct inode    *inode = mapping->host;
2247        loff_t          pos;
2248        ssize_t         written;
2249        ssize_t         err;
2250
2251        ocount = 0;
2252        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2253        if (err)
2254                return err;
2255
2256        count = ocount;
2257        pos = *ppos;
2258
2259        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2260
2261        /* We can write back this queue in page reclaim */
2262        current->backing_dev_info = mapping->backing_dev_info;
2263        written = 0;
2264
2265        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2266        if (err)
2267                goto out;
2268
2269        if (count == 0)
2270                goto out;
2271
2272        err = remove_suid(file->f_path.dentry);
2273        if (err)
2274                goto out;
2275
2276        file_update_time(file);
2277
2278        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2279        if (unlikely(file->f_flags & O_DIRECT)) {
2280                loff_t endbyte;
2281                ssize_t written_buffered;
2282
2283                written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2284                                                        ppos, count, ocount);
2285                if (written < 0 || written == count)
2286                        goto out;
2287                /*
2288                 * direct-io write to a hole: fall through to buffered I/O
2289                 * for completing the rest of the request.
2290                 */
2291                pos += written;
2292                count -= written;
2293                written_buffered = generic_file_buffered_write(iocb, iov,
2294                                                nr_segs, pos, ppos, count,
2295                                                written);
2296                /*
2297                 * If generic_file_buffered_write() retuned a synchronous error
2298                 * then we want to return the number of bytes which were
2299                 * direct-written, or the error code if that was zero.  Note
2300                 * that this differs from normal direct-io semantics, which
2301                 * will return -EFOO even if some bytes were written.
2302                 */
2303                if (written_buffered < 0) {
2304                        err = written_buffered;
2305                        goto out;
2306                }
2307
2308                /*
2309                 * We need to ensure that the page cache pages are written to
2310                 * disk and invalidated to preserve the expected O_DIRECT
2311                 * semantics.
2312                 */
2313                endbyte = pos + written_buffered - written - 1;
2314                err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
2315                                            SYNC_FILE_RANGE_WAIT_BEFORE|
2316                                            SYNC_FILE_RANGE_WRITE|
2317                                            SYNC_FILE_RANGE_WAIT_AFTER);
2318                if (err == 0) {
2319                        written = written_buffered;
2320                        invalidate_mapping_pages(mapping,
2321                                                 pos >> PAGE_CACHE_SHIFT,
2322                                                 endbyte >> PAGE_CACHE_SHIFT);
2323                } else {
2324                        /*
2325                         * We don't know how much we wrote, so just return
2326                         * the number of bytes which were direct-written
2327                         */
2328                }
2329        } else {
2330                written = generic_file_buffered_write(iocb, iov, nr_segs,
2331                                pos, ppos, count, written);
2332        }
2333out:
2334        current->backing_dev_info = NULL;
2335        return written ? written : err;
2336}
2337
2338ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
2339                const struct iovec *iov, unsigned long nr_segs, loff_t pos)
2340{
2341        struct file *file = iocb->ki_filp;
2342        struct address_space *mapping = file->f_mapping;
2343        struct inode *inode = mapping->host;
2344        ssize_t ret;
2345
2346        BUG_ON(iocb->ki_pos != pos);
2347
2348        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2349                        &iocb->ki_pos);
2350
2351        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2352                ssize_t err;
2353
2354                err = sync_page_range_nolock(inode, mapping, pos, ret);
2355                if (err < 0)
2356                        ret = err;
2357        }
2358        return ret;
2359}
2360EXPORT_SYMBOL(generic_file_aio_write_nolock);
2361
2362ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2363                unsigned long nr_segs, loff_t pos)
2364{
2365        struct file *file = iocb->ki_filp;
2366        struct address_space *mapping = file->f_mapping;
2367        struct inode *inode = mapping->host;
2368        ssize_t ret;
2369
2370        BUG_ON(iocb->ki_pos != pos);
2371
2372        mutex_lock(&inode->i_mutex);
2373        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2374                        &iocb->ki_pos);
2375        mutex_unlock(&inode->i_mutex);
2376
2377        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2378                ssize_t err;
2379
2380                err = sync_page_range(inode, mapping, pos, ret);
2381                if (err < 0)
2382                        ret = err;
2383        }
2384        return ret;
2385}
2386EXPORT_SYMBOL(generic_file_aio_write);
2387
2388/*
2389 * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
2390 * went wrong during pagecache shootdown.
2391 */
2392static ssize_t
2393generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2394        loff_t offset, unsigned long nr_segs)
2395{
2396        struct file *file = iocb->ki_filp;
2397        struct address_space *mapping = file->f_mapping;
2398        ssize_t retval;
2399        size_t write_len;
2400        pgoff_t end = 0; /* silence gcc */
2401
2402        /*
2403         * If it's a write, unmap all mmappings of the file up-front.  This
2404         * will cause any pte dirty bits to be propagated into the pageframes
2405         * for the subsequent filemap_write_and_wait().
2406         */
2407        if (rw == WRITE) {
2408                write_len = iov_length(iov, nr_segs);
2409                end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2410                if (mapping_mapped(mapping))
2411                        unmap_mapping_range(mapping, offset, write_len, 0);
2412        }
2413
2414        retval = filemap_write_and_wait(mapping);
2415        if (retval)
2416                goto out;
2417
2418        /*
2419         * After a write we want buffered reads to be sure to go to disk to get
2420         * the new data.  We invalidate clean cached page from the region we're
2421         * about to write.  We do this *before* the write so that we can return
2422         * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2423         */
2424        if (rw == WRITE && mapping->nrpages) {
2425                retval = invalidate_inode_pages2_range(mapping,
2426                                        offset >> PAGE_CACHE_SHIFT, end);
2427                if (retval)
2428                        goto out;
2429        }
2430
2431        retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2432        if (retval)
2433                goto out;
2434
2435        /*
2436         * Finally, try again to invalidate clean pages which might have been
2437         * faulted in by get_user_pages() if the source of the write was an
2438         * mmap()ed region of the file we're writing.  That's a pretty crazy
2439         * thing to do, so we don't support it 100%.  If this invalidation
2440         * fails and we have -EIOCBQUEUED we ignore the failure.
2441         */
2442        if (rw == WRITE && mapping->nrpages) {
2443                int err = invalidate_inode_pages2_range(mapping,
2444                                              offset >> PAGE_CACHE_SHIFT, end);
2445                if (err && retval >= 0)
2446                        retval = err;
2447        }
2448out:
2449        return retval;
2450}
2451
2452/**
2453 * try_to_release_page() - release old fs-specific metadata on a page
2454 *
2455 * @page: the page which the kernel is trying to free
2456 * @gfp_mask: memory allocation flags (and I/O mode)
2457 *
2458 * The address_space is to try to release any data against the page
2459 * (presumably at page->private).  If the release was successful, return `1'.
2460 * Otherwise return zero.
2461 *
2462 * The @gfp_mask argument specifies whether I/O may be performed to release
2463 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
2464 *
2465 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2466 */
2467int try_to_release_page(struct page *page, gfp_t gfp_mask)
2468{
2469        struct address_space * const mapping = page->mapping;
2470
2471        BUG_ON(!PageLocked(page));
2472        if (PageWriteback(page))
2473                return 0;
2474
2475        if (mapping && mapping->a_ops->releasepage)
2476                return mapping->a_ops->releasepage(page, gfp_mask);
2477        return try_to_free_buffers(page);
2478}
2479
2480EXPORT_SYMBOL(try_to_release_page);
2481
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.