linux/mm/filemap.c
<<
>>
Prefs
   1/*
   2 *      linux/mm/filemap.c
   3 *
   4 * Copyright (C) 1994-1999  Linus Torvalds
   5 */
   6
   7/*
   8 * This file handles the generic file mmap semantics used by
   9 * most "normal" filesystems (but you don't /have/ to use this:
  10 * the NFS filesystem used to do this differently, for example)
  11 */
  12#include <linux/module.h>
  13#include <linux/slab.h>
  14#include <linux/compiler.h>
  15#include <linux/fs.h>
  16#include <linux/uaccess.h>
  17#include <linux/aio.h>
  18#include <linux/capability.h>
  19#include <linux/kernel_stat.h>
  20#include <linux/mm.h>
  21#include <linux/swap.h>
  22#include <linux/mman.h>
  23#include <linux/pagemap.h>
  24#include <linux/file.h>
  25#include <linux/uio.h>
  26#include <linux/hash.h>
  27#include <linux/writeback.h>
  28#include <linux/backing-dev.h>
  29#include <linux/pagevec.h>
  30#include <linux/blkdev.h>
  31#include <linux/security.h>
  32#include <linux/syscalls.h>
  33#include <linux/cpuset.h>
  34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
  35#include <linux/memcontrol.h>
  36#include <linux/mm_inline.h> /* for page_is_file_cache() */
  37#include "internal.h"
  38
  39/*
  40 * FIXME: remove all knowledge of the buffer layer from the core VM
  41 */
  42#include <linux/buffer_head.h> /* for generic_osync_inode */
  43
  44#include <asm/mman.h>
  45
  46
  47/*
  48 * Shared mappings implemented 30.11.1994. It's not fully working yet,
  49 * though.
  50 *
  51 * Shared mappings now work. 15.8.1995  Bruno.
  52 *
  53 * finished 'unifying' the page and buffer cache and SMP-threaded the
  54 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  55 *
  56 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  57 */
  58
  59/*
  60 * Lock ordering:
  61 *
  62 *  ->i_mmap_lock               (vmtruncate)
  63 *    ->private_lock            (__free_pte->__set_page_dirty_buffers)
  64 *      ->swap_lock             (exclusive_swap_page, others)
  65 *        ->mapping->tree_lock
  66 *
  67 *  ->i_mutex
  68 *    ->i_mmap_lock             (truncate->unmap_mapping_range)
  69 *
  70 *  ->mmap_sem
  71 *    ->i_mmap_lock
  72 *      ->page_table_lock or pte_lock   (various, mainly in memory.c)
  73 *        ->mapping->tree_lock  (arch-dependent flush_dcache_mmap_lock)
  74 *
  75 *  ->mmap_sem
  76 *    ->lock_page               (access_process_vm)
  77 *
  78 *  ->i_mutex                   (generic_file_buffered_write)
  79 *    ->mmap_sem                (fault_in_pages_readable->do_page_fault)
  80 *
  81 *  ->i_mutex
  82 *    ->i_alloc_sem             (various)
  83 *
  84 *  ->inode_lock
  85 *    ->sb_lock                 (fs/fs-writeback.c)
  86 *    ->mapping->tree_lock      (__sync_single_inode)
  87 *
  88 *  ->i_mmap_lock
  89 *    ->anon_vma.lock           (vma_adjust)
  90 *
  91 *  ->anon_vma.lock
  92 *    ->page_table_lock or pte_lock     (anon_vma_prepare and various)
  93 *
  94 *  ->page_table_lock or pte_lock
  95 *    ->swap_lock               (try_to_unmap_one)
  96 *    ->private_lock            (try_to_unmap_one)
  97 *    ->tree_lock               (try_to_unmap_one)
  98 *    ->zone.lru_lock           (follow_page->mark_page_accessed)
  99 *    ->zone.lru_lock           (check_pte_range->isolate_lru_page)
 100 *    ->private_lock            (page_remove_rmap->set_page_dirty)
 101 *    ->tree_lock               (page_remove_rmap->set_page_dirty)
 102 *    ->inode_lock              (page_remove_rmap->set_page_dirty)
 103 *    ->inode_lock              (zap_pte_range->set_page_dirty)
 104 *    ->private_lock            (zap_pte_range->__set_page_dirty_buffers)
 105 *
 106 *  ->task->proc_lock
 107 *    ->dcache_lock             (proc_pid_lookup)
 108 */
 109
 110/*
 111 * Remove a page from the page cache and free it. Caller has to make
 112 * sure the page is locked and that nobody else uses it - or that usage
 113 * is safe.  The caller must hold the mapping's tree_lock.
 114 */
 115void __remove_from_page_cache(struct page *page)
 116{
 117        struct address_space *mapping = page->mapping;
 118
 119        radix_tree_delete(&mapping->page_tree, page->index);
 120        page->mapping = NULL;
 121        mapping->nrpages--;
 122        __dec_zone_page_state(page, NR_FILE_PAGES);
 123        BUG_ON(page_mapped(page));
 124
 125        /*
 126         * Some filesystems seem to re-dirty the page even after
 127         * the VM has canceled the dirty bit (eg ext3 journaling).
 128         *
 129         * Fix it up by doing a final dirty accounting check after
 130         * having removed the page entirely.
 131         */
 132        if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 133                dec_zone_page_state(page, NR_FILE_DIRTY);
 134                dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 135        }
 136}
 137
 138void remove_from_page_cache(struct page *page)
 139{
 140        struct address_space *mapping = page->mapping;
 141
 142        BUG_ON(!PageLocked(page));
 143
 144        spin_lock_irq(&mapping->tree_lock);
 145        __remove_from_page_cache(page);
 146        spin_unlock_irq(&mapping->tree_lock);
 147        mem_cgroup_uncharge_cache_page(page);
 148}
 149
 150static int sync_page(void *word)
 151{
 152        struct address_space *mapping;
 153        struct page *page;
 154
 155        page = container_of((unsigned long *)word, struct page, flags);
 156
 157        /*
 158         * page_mapping() is being called without PG_locked held.
 159         * Some knowledge of the state and use of the page is used to
 160         * reduce the requirements down to a memory barrier.
 161         * The danger here is of a stale page_mapping() return value
 162         * indicating a struct address_space different from the one it's
 163         * associated with when it is associated with one.
 164         * After smp_mb(), it's either the correct page_mapping() for
 165         * the page, or an old page_mapping() and the page's own
 166         * page_mapping() has gone NULL.
 167         * The ->sync_page() address_space operation must tolerate
 168         * page_mapping() going NULL. By an amazing coincidence,
 169         * this comes about because none of the users of the page
 170         * in the ->sync_page() methods make essential use of the
 171         * page_mapping(), merely passing the page down to the backing
 172         * device's unplug functions when it's non-NULL, which in turn
 173         * ignore it for all cases but swap, where only page_private(page) is
 174         * of interest. When page_mapping() does go NULL, the entire
 175         * call stack gracefully ignores the page and returns.
 176         * -- wli
 177         */
 178        smp_mb();
 179        mapping = page_mapping(page);
 180        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 181                mapping->a_ops->sync_page(page);
 182        io_schedule();
 183        return 0;
 184}
 185
 186static int sync_page_killable(void *word)
 187{
 188        sync_page(word);
 189        return fatal_signal_pending(current) ? -EINTR : 0;
 190}
 191
 192/**
 193 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 194 * @mapping:    address space structure to write
 195 * @start:      offset in bytes where the range starts
 196 * @end:        offset in bytes where the range ends (inclusive)
 197 * @sync_mode:  enable synchronous operation
 198 *
 199 * Start writeback against all of a mapping's dirty pages that lie
 200 * within the byte offsets <start, end> inclusive.
 201 *
 202 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 203 * opposed to a regular memory cleansing writeback.  The difference between
 204 * these two operations is that if a dirty page/buffer is encountered, it must
 205 * be waited upon, and not just skipped over.
 206 */
 207int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 208                                loff_t end, int sync_mode)
 209{
 210        int ret;
 211        struct writeback_control wbc = {
 212                .sync_mode = sync_mode,
 213                .nr_to_write = LONG_MAX,
 214                .range_start = start,
 215                .range_end = end,
 216        };
 217
 218        if (!mapping_cap_writeback_dirty(mapping))
 219                return 0;
 220
 221        ret = do_writepages(mapping, &wbc);
 222        return ret;
 223}
 224
 225static inline int __filemap_fdatawrite(struct address_space *mapping,
 226        int sync_mode)
 227{
 228        return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
 229}
 230
 231int filemap_fdatawrite(struct address_space *mapping)
 232{
 233        return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 234}
 235EXPORT_SYMBOL(filemap_fdatawrite);
 236
 237int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 238                                loff_t end)
 239{
 240        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 241}
 242EXPORT_SYMBOL(filemap_fdatawrite_range);
 243
 244/**
 245 * filemap_flush - mostly a non-blocking flush
 246 * @mapping:    target address_space
 247 *
 248 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 249 * purposes - I/O may not be started against all dirty pages.
 250 */
 251int filemap_flush(struct address_space *mapping)
 252{
 253        return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
 254}
 255EXPORT_SYMBOL(filemap_flush);
 256
 257/**
 258 * wait_on_page_writeback_range - wait for writeback to complete
 259 * @mapping:    target address_space
 260 * @start:      beginning page index
 261 * @end:        ending page index
 262 *
 263 * Wait for writeback to complete against pages indexed by start->end
 264 * inclusive
 265 */
 266int wait_on_page_writeback_range(struct address_space *mapping,
 267                                pgoff_t start, pgoff_t end)
 268{
 269        struct pagevec pvec;
 270        int nr_pages;
 271        int ret = 0;
 272        pgoff_t index;
 273
 274        if (end < start)
 275                return 0;
 276
 277        pagevec_init(&pvec, 0);
 278        index = start;
 279        while ((index <= end) &&
 280                        (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 281                        PAGECACHE_TAG_WRITEBACK,
 282                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
 283                unsigned i;
 284
 285                for (i = 0; i < nr_pages; i++) {
 286                        struct page *page = pvec.pages[i];
 287
 288                        /* until radix tree lookup accepts end_index */
 289                        if (page->index > end)
 290                                continue;
 291
 292                        wait_on_page_writeback(page);
 293                        if (PageError(page))
 294                                ret = -EIO;
 295                }
 296                pagevec_release(&pvec);
 297                cond_resched();
 298        }
 299
 300        /* Check for outstanding write errors */
 301        if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
 302                ret = -ENOSPC;
 303        if (test_and_clear_bit(AS_EIO, &mapping->flags))
 304                ret = -EIO;
 305
 306        return ret;
 307}
 308
 309/**
 310 * sync_page_range - write and wait on all pages in the passed range
 311 * @inode:      target inode
 312 * @mapping:    target address_space
 313 * @pos:        beginning offset in pages to write
 314 * @count:      number of bytes to write
 315 *
 316 * Write and wait upon all the pages in the passed range.  This is a "data
 317 * integrity" operation.  It waits upon in-flight writeout before starting and
 318 * waiting upon new writeout.  If there was an IO error, return it.
 319 *
 320 * We need to re-take i_mutex during the generic_osync_inode list walk because
 321 * it is otherwise livelockable.
 322 */
 323int sync_page_range(struct inode *inode, struct address_space *mapping,
 324                        loff_t pos, loff_t count)
 325{
 326        pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 327        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 328        int ret;
 329
 330        if (!mapping_cap_writeback_dirty(mapping) || !count)
 331                return 0;
 332        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 333        if (ret == 0) {
 334                mutex_lock(&inode->i_mutex);
 335                ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 336                mutex_unlock(&inode->i_mutex);
 337        }
 338        if (ret == 0)
 339                ret = wait_on_page_writeback_range(mapping, start, end);
 340        return ret;
 341}
 342EXPORT_SYMBOL(sync_page_range);
 343
 344/**
 345 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
 346 * @inode:      target inode
 347 * @mapping:    target address_space
 348 * @pos:        beginning offset in pages to write
 349 * @count:      number of bytes to write
 350 *
 351 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
 352 * as it forces O_SYNC writers to different parts of the same file
 353 * to be serialised right until io completion.
 354 */
 355int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
 356                           loff_t pos, loff_t count)
 357{
 358        pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 359        pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 360        int ret;
 361
 362        if (!mapping_cap_writeback_dirty(mapping) || !count)
 363                return 0;
 364        ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 365        if (ret == 0)
 366                ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 367        if (ret == 0)
 368                ret = wait_on_page_writeback_range(mapping, start, end);
 369        return ret;
 370}
 371EXPORT_SYMBOL(sync_page_range_nolock);
 372
 373/**
 374 * filemap_fdatawait - wait for all under-writeback pages to complete
 375 * @mapping: address space structure to wait for
 376 *
 377 * Walk the list of under-writeback pages of the given address space
 378 * and wait for all of them.
 379 */
 380int filemap_fdatawait(struct address_space *mapping)
 381{
 382        loff_t i_size = i_size_read(mapping->host);
 383
 384        if (i_size == 0)
 385                return 0;
 386
 387        return wait_on_page_writeback_range(mapping, 0,
 388                                (i_size - 1) >> PAGE_CACHE_SHIFT);
 389}
 390EXPORT_SYMBOL(filemap_fdatawait);
 391
 392int filemap_write_and_wait(struct address_space *mapping)
 393{
 394        int err = 0;
 395
 396        if (mapping->nrpages) {
 397                err = filemap_fdatawrite(mapping);
 398                /*
 399                 * Even if the above returned error, the pages may be
 400                 * written partially (e.g. -ENOSPC), so we wait for it.
 401                 * But the -EIO is special case, it may indicate the worst
 402                 * thing (e.g. bug) happened, so we avoid waiting for it.
 403                 */
 404                if (err != -EIO) {
 405                        int err2 = filemap_fdatawait(mapping);
 406                        if (!err)
 407                                err = err2;
 408                }
 409        }
 410        return err;
 411}
 412EXPORT_SYMBOL(filemap_write_and_wait);
 413
 414/**
 415 * filemap_write_and_wait_range - write out & wait on a file range
 416 * @mapping:    the address_space for the pages
 417 * @lstart:     offset in bytes where the range starts
 418 * @lend:       offset in bytes where the range ends (inclusive)
 419 *
 420 * Write out and wait upon file offsets lstart->lend, inclusive.
 421 *
 422 * Note that `lend' is inclusive (describes the last byte to be written) so
 423 * that this function can be used to write to the very end-of-file (end = -1).
 424 */
 425int filemap_write_and_wait_range(struct address_space *mapping,
 426                                 loff_t lstart, loff_t lend)
 427{
 428        int err = 0;
 429
 430        if (mapping->nrpages) {
 431                err = __filemap_fdatawrite_range(mapping, lstart, lend,
 432                                                 WB_SYNC_ALL);
 433                /* See comment of filemap_write_and_wait() */
 434                if (err != -EIO) {
 435                        int err2 = wait_on_page_writeback_range(mapping,
 436                                                lstart >> PAGE_CACHE_SHIFT,
 437                                                lend >> PAGE_CACHE_SHIFT);
 438                        if (!err)
 439                                err = err2;
 440                }
 441        }
 442        return err;
 443}
 444EXPORT_SYMBOL(filemap_write_and_wait_range);
 445
 446/**
 447 * add_to_page_cache_locked - add a locked page to the pagecache
 448 * @page:       page to add
 449 * @mapping:    the page's address_space
 450 * @offset:     page index
 451 * @gfp_mask:   page allocation mode
 452 *
 453 * This function is used to add a page to the pagecache. It must be locked.
 454 * This function does not add the page to the LRU.  The caller must do that.
 455 */
 456int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 457                pgoff_t offset, gfp_t gfp_mask)
 458{
 459        int error;
 460
 461        VM_BUG_ON(!PageLocked(page));
 462
 463        error = mem_cgroup_cache_charge(page, current->mm,
 464                                        gfp_mask & GFP_RECLAIM_MASK);
 465        if (error)
 466                goto out;
 467
 468        error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 469        if (error == 0) {
 470                page_cache_get(page);
 471                page->mapping = mapping;
 472                page->index = offset;
 473
 474                spin_lock_irq(&mapping->tree_lock);
 475                error = radix_tree_insert(&mapping->page_tree, offset, page);
 476                if (likely(!error)) {
 477                        mapping->nrpages++;
 478                        __inc_zone_page_state(page, NR_FILE_PAGES);
 479                        spin_unlock_irq(&mapping->tree_lock);
 480                } else {
 481                        page->mapping = NULL;
 482                        spin_unlock_irq(&mapping->tree_lock);
 483                        mem_cgroup_uncharge_cache_page(page);
 484                        page_cache_release(page);
 485                }
 486                radix_tree_preload_end();
 487        } else
 488                mem_cgroup_uncharge_cache_page(page);
 489out:
 490        return error;
 491}
 492EXPORT_SYMBOL(add_to_page_cache_locked);
 493
 494int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 495                                pgoff_t offset, gfp_t gfp_mask)
 496{
 497        int ret;
 498
 499        /*
 500         * Splice_read and readahead add shmem/tmpfs pages into the page cache
 501         * before shmem_readpage has a chance to mark them as SwapBacked: they
 502         * need to go on the active_anon lru below, and mem_cgroup_cache_charge
 503         * (called in add_to_page_cache) needs to know where they're going too.
 504         */
 505        if (mapping_cap_swap_backed(mapping))
 506                SetPageSwapBacked(page);
 507
 508        ret = add_to_page_cache(page, mapping, offset, gfp_mask);
 509        if (ret == 0) {
 510                if (page_is_file_cache(page))
 511                        lru_cache_add_file(page);
 512                else
 513                        lru_cache_add_active_anon(page);
 514        }
 515        return ret;
 516}
 517EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
 518
 519#ifdef CONFIG_NUMA
 520struct page *__page_cache_alloc(gfp_t gfp)
 521{
 522        if (cpuset_do_page_mem_spread()) {
 523                int n = cpuset_mem_spread_node();
 524                return alloc_pages_exact_node(n, gfp, 0);
 525        }
 526        return alloc_pages(gfp, 0);
 527}
 528EXPORT_SYMBOL(__page_cache_alloc);
 529#endif
 530
 531static int __sleep_on_page_lock(void *word)
 532{
 533        io_schedule();
 534        return 0;
 535}
 536
 537/*
 538 * In order to wait for pages to become available there must be
 539 * waitqueues associated with pages. By using a hash table of
 540 * waitqueues where the bucket discipline is to maintain all
 541 * waiters on the same queue and wake all when any of the pages
 542 * become available, and for the woken contexts to check to be
 543 * sure the appropriate page became available, this saves space
 544 * at a cost of "thundering herd" phenomena during rare hash
 545 * collisions.
 546 */
 547static wait_queue_head_t *page_waitqueue(struct page *page)
 548{
 549        const struct zone *zone = page_zone(page);
 550
 551        return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 552}
 553
 554static inline void wake_up_page(struct page *page, int bit)
 555{
 556        __wake_up_bit(page_waitqueue(page), &page->flags, bit);
 557}
 558
 559void wait_on_page_bit(struct page *page, int bit_nr)
 560{
 561        DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 562
 563        if (test_bit(bit_nr, &page->flags))
 564                __wait_on_bit(page_waitqueue(page), &wait, sync_page,
 565                                                        TASK_UNINTERRUPTIBLE);
 566}
 567EXPORT_SYMBOL(wait_on_page_bit);
 568
 569/**
 570 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
 571 * @page: Page defining the wait queue of interest
 572 * @waiter: Waiter to add to the queue
 573 *
 574 * Add an arbitrary @waiter to the wait queue for the nominated @page.
 575 */
 576void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
 577{
 578        wait_queue_head_t *q = page_waitqueue(page);
 579        unsigned long flags;
 580
 581        spin_lock_irqsave(&q->lock, flags);
 582        __add_wait_queue(q, waiter);
 583        spin_unlock_irqrestore(&q->lock, flags);
 584}
 585EXPORT_SYMBOL_GPL(add_page_wait_queue);
 586
 587/**
 588 * unlock_page - unlock a locked page
 589 * @page: the page
 590 *
 591 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 592 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 593 * mechananism between PageLocked pages and PageWriteback pages is shared.
 594 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 595 *
 596 * The mb is necessary to enforce ordering between the clear_bit and the read
 597 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
 598 */
 599void unlock_page(struct page *page)
 600{
 601        VM_BUG_ON(!PageLocked(page));
 602        clear_bit_unlock(PG_locked, &page->flags);
 603        smp_mb__after_clear_bit();
 604        wake_up_page(page, PG_locked);
 605}
 606EXPORT_SYMBOL(unlock_page);
 607
 608/**
 609 * end_page_writeback - end writeback against a page
 610 * @page: the page
 611 */
 612void end_page_writeback(struct page *page)
 613{
 614        if (TestClearPageReclaim(page))
 615                rotate_reclaimable_page(page);
 616
 617        if (!test_clear_page_writeback(page))
 618                BUG();
 619
 620        smp_mb__after_clear_bit();
 621        wake_up_page(page, PG_writeback);
 622}
 623EXPORT_SYMBOL(end_page_writeback);
 624
 625/**
 626 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 627 * @page: the page to lock
 628 *
 629 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 630 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 631 * chances are that on the second loop, the block layer's plug list is empty,
 632 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 633 */
 634void __lock_page(struct page *page)
 635{
 636        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 637
 638        __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
 639                                                        TASK_UNINTERRUPTIBLE);
 640}
 641EXPORT_SYMBOL(__lock_page);
 642
 643int __lock_page_killable(struct page *page)
 644{
 645        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 646
 647        return __wait_on_bit_lock(page_waitqueue(page), &wait,
 648                                        sync_page_killable, TASK_KILLABLE);
 649}
 650EXPORT_SYMBOL_GPL(__lock_page_killable);
 651
 652/**
 653 * __lock_page_nosync - get a lock on the page, without calling sync_page()
 654 * @page: the page to lock
 655 *
 656 * Variant of lock_page that does not require the caller to hold a reference
 657 * on the page's mapping.
 658 */
 659void __lock_page_nosync(struct page *page)
 660{
 661        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 662        __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
 663                                                        TASK_UNINTERRUPTIBLE);
 664}
 665
 666/**
 667 * find_get_page - find and get a page reference
 668 * @mapping: the address_space to search
 669 * @offset: the page index
 670 *
 671 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 672 * If yes, increment its refcount and return it; if no, return NULL.
 673 */
 674struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
 675{
 676        void **pagep;
 677        struct page *page;
 678
 679        rcu_read_lock();
 680repeat:
 681        page = NULL;
 682        pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
 683        if (pagep) {
 684                page = radix_tree_deref_slot(pagep);
 685                if (unlikely(!page || page == RADIX_TREE_RETRY))
 686                        goto repeat;
 687
 688                if (!page_cache_get_speculative(page))
 689                        goto repeat;
 690
 691                /*
 692                 * Has the page moved?
 693                 * This is part of the lockless pagecache protocol. See
 694                 * include/linux/pagemap.h for details.
 695                 */
 696                if (unlikely(page != *pagep)) {
 697                        page_cache_release(page);
 698                        goto repeat;
 699                }
 700        }
 701        rcu_read_unlock();
 702
 703        return page;
 704}
 705EXPORT_SYMBOL(find_get_page);
 706
 707/**
 708 * find_lock_page - locate, pin and lock a pagecache page
 709 * @mapping: the address_space to search
 710 * @offset: the page index
 711 *
 712 * Locates the desired pagecache page, locks it, increments its reference
 713 * count and returns its address.
 714 *
 715 * Returns zero if the page was not present. find_lock_page() may sleep.
 716 */
 717struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 718{
 719        struct page *page;
 720
 721repeat:
 722        page = find_get_page(mapping, offset);
 723        if (page) {
 724                lock_page(page);
 725                /* Has the page been truncated? */
 726                if (unlikely(page->mapping != mapping)) {
 727                        unlock_page(page);
 728                        page_cache_release(page);
 729                        goto repeat;
 730                }
 731                VM_BUG_ON(page->index != offset);
 732        }
 733        return page;
 734}
 735EXPORT_SYMBOL(find_lock_page);
 736
 737/**
 738 * find_or_create_page - locate or add a pagecache page
 739 * @mapping: the page's address_space
 740 * @index: the page's index into the mapping
 741 * @gfp_mask: page allocation mode
 742 *
 743 * Locates a page in the pagecache.  If the page is not present, a new page
 744 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 745 * LRU list.  The returned page is locked and has its reference count
 746 * incremented.
 747 *
 748 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
 749 * allocation!
 750 *
 751 * find_or_create_page() returns the desired page's address, or zero on
 752 * memory exhaustion.
 753 */
 754struct page *find_or_create_page(struct address_space *mapping,
 755                pgoff_t index, gfp_t gfp_mask)
 756{
 757        struct page *page;
 758        int err;
 759repeat:
 760        page = find_lock_page(mapping, index);
 761        if (!page) {
 762                page = __page_cache_alloc(gfp_mask);
 763                if (!page)
 764                        return NULL;
 765                /*
 766                 * We want a regular kernel memory (not highmem or DMA etc)
 767                 * allocation for the radix tree nodes, but we need to honour
 768                 * the context-specific requirements the caller has asked for.
 769                 * GFP_RECLAIM_MASK collects those requirements.
 770                 */
 771                err = add_to_page_cache_lru(page, mapping, index,
 772                        (gfp_mask & GFP_RECLAIM_MASK));
 773                if (unlikely(err)) {
 774                        page_cache_release(page);
 775                        page = NULL;
 776                        if (err == -EEXIST)
 777                                goto repeat;
 778                }
 779        }
 780        return page;
 781}
 782EXPORT_SYMBOL(find_or_create_page);
 783
 784/**
 785 * find_get_pages - gang pagecache lookup
 786 * @mapping:    The address_space to search
 787 * @start:      The starting page index
 788 * @nr_pages:   The maximum number of pages
 789 * @pages:      Where the resulting pages are placed
 790 *
 791 * find_get_pages() will search for and return a group of up to
 792 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 793 * find_get_pages() takes a reference against the returned pages.
 794 *
 795 * The search returns a group of mapping-contiguous pages with ascending
 796 * indexes.  There may be holes in the indices due to not-present pages.
 797 *
 798 * find_get_pages() returns the number of pages which were found.
 799 */
 800unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 801                            unsigned int nr_pages, struct page **pages)
 802{
 803        unsigned int i;
 804        unsigned int ret;
 805        unsigned int nr_found;
 806
 807        rcu_read_lock();
 808restart:
 809        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
 810                                (void ***)pages, start, nr_pages);
 811        ret = 0;
 812        for (i = 0; i < nr_found; i++) {
 813                struct page *page;
 814repeat:
 815                page = radix_tree_deref_slot((void **)pages[i]);
 816                if (unlikely(!page))
 817                        continue;
 818                /*
 819                 * this can only trigger if nr_found == 1, making livelock
 820                 * a non issue.
 821                 */
 822                if (unlikely(page == RADIX_TREE_RETRY))
 823                        goto restart;
 824
 825                if (!page_cache_get_speculative(page))
 826                        goto repeat;
 827
 828                /* Has the page moved? */
 829                if (unlikely(page != *((void **)pages[i]))) {
 830                        page_cache_release(page);
 831                        goto repeat;
 832                }
 833
 834                pages[ret] = page;
 835                ret++;
 836        }
 837        rcu_read_unlock();
 838        return ret;
 839}
 840
 841/**
 842 * find_get_pages_contig - gang contiguous pagecache lookup
 843 * @mapping:    The address_space to search
 844 * @index:      The starting page index
 845 * @nr_pages:   The maximum number of pages
 846 * @pages:      Where the resulting pages are placed
 847 *
 848 * find_get_pages_contig() works exactly like find_get_pages(), except
 849 * that the returned number of pages are guaranteed to be contiguous.
 850 *
 851 * find_get_pages_contig() returns the number of pages which were found.
 852 */
 853unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 854                               unsigned int nr_pages, struct page **pages)
 855{
 856        unsigned int i;
 857        unsigned int ret;
 858        unsigned int nr_found;
 859
 860        rcu_read_lock();
 861restart:
 862        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
 863                                (void ***)pages, index, nr_pages);
 864        ret = 0;
 865        for (i = 0; i < nr_found; i++) {
 866                struct page *page;
 867repeat:
 868                page = radix_tree_deref_slot((void **)pages[i]);
 869                if (unlikely(!page))
 870                        continue;
 871                /*
 872                 * this can only trigger if nr_found == 1, making livelock
 873                 * a non issue.
 874                 */
 875                if (unlikely(page == RADIX_TREE_RETRY))
 876                        goto restart;
 877
 878                if (page->mapping == NULL || page->index != index)
 879                        break;
 880
 881                if (!page_cache_get_speculative(page))
 882                        goto repeat;
 883
 884                /* Has the page moved? */
 885                if (unlikely(page != *((void **)pages[i]))) {
 886                        page_cache_release(page);
 887                        goto repeat;
 888                }
 889
 890                pages[ret] = page;
 891                ret++;
 892                index++;
 893        }
 894        rcu_read_unlock();
 895        return ret;
 896}
 897EXPORT_SYMBOL(find_get_pages_contig);
 898
 899/**
 900 * find_get_pages_tag - find and return pages that match @tag
 901 * @mapping:    the address_space to search
 902 * @index:      the starting page index
 903 * @tag:        the tag index
 904 * @nr_pages:   the maximum number of pages
 905 * @pages:      where the resulting pages are placed
 906 *
 907 * Like find_get_pages, except we only return pages which are tagged with
 908 * @tag.   We update @index to index the next page for the traversal.
 909 */
 910unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 911                        int tag, unsigned int nr_pages, struct page **pages)
 912{
 913        unsigned int i;
 914        unsigned int ret;
 915        unsigned int nr_found;
 916
 917        rcu_read_lock();
 918restart:
 919        nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
 920                                (void ***)pages, *index, nr_pages, tag);
 921        ret = 0;
 922        for (i = 0; i < nr_found; i++) {
 923                struct page *page;
 924repeat:
 925                page = radix_tree_deref_slot((void **)pages[i]);
 926                if (unlikely(!page))
 927                        continue;
 928                /*
 929                 * this can only trigger if nr_found == 1, making livelock
 930                 * a non issue.
 931                 */
 932                if (unlikely(page == RADIX_TREE_RETRY))
 933                        goto restart;
 934
 935                if (!page_cache_get_speculative(page))
 936                        goto repeat;
 937
 938                /* Has the page moved? */
 939                if (unlikely(page != *((void **)pages[i]))) {
 940                        page_cache_release(page);
 941                        goto repeat;
 942                }
 943
 944                pages[ret] = page;
 945                ret++;
 946        }
 947        rcu_read_unlock();
 948
 949        if (ret)
 950                *index = pages[ret - 1]->index + 1;
 951
 952        return ret;
 953}
 954EXPORT_SYMBOL(find_get_pages_tag);
 955
 956/**
 957 * grab_cache_page_nowait - returns locked page at given index in given cache
 958 * @mapping: target address_space
 959 * @index: the page index
 960 *
 961 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 962 * This is intended for speculative data generators, where the data can
 963 * be regenerated if the page couldn't be grabbed.  This routine should
 964 * be safe to call while holding the lock for another page.
 965 *
 966 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 967 * and deadlock against the caller's locked page.
 968 */
 969struct page *
 970grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 971{
 972        struct page *page = find_get_page(mapping, index);
 973
 974        if (page) {
 975                if (trylock_page(page))
 976                        return page;
 977                page_cache_release(page);
 978                return NULL;
 979        }
 980        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
 981        if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
 982                page_cache_release(page);
 983                page = NULL;
 984        }
 985        return page;
 986}
 987EXPORT_SYMBOL(grab_cache_page_nowait);
 988
 989/*
 990 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 991 * a _large_ part of the i/o request. Imagine the worst scenario:
 992 *
 993 *      ---R__________________________________________B__________
 994 *         ^ reading here                             ^ bad block(assume 4k)
 995 *
 996 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 997 * => failing the whole request => read(R) => read(R+1) =>
 998 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 999 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
1000 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
1001 *
1002 * It is going insane. Fix it by quickly scaling down the readahead size.
1003 */
1004static void shrink_readahead_size_eio(struct file *filp,
1005                                        struct file_ra_state *ra)
1006{
1007        ra->ra_pages /= 4;
1008}
1009
1010/**
1011 * do_generic_file_read - generic file read routine
1012 * @filp:       the file to read
1013 * @ppos:       current file position
1014 * @desc:       read_descriptor
1015 * @actor:      read method
1016 *
1017 * This is a generic file read routine, and uses the
1018 * mapping->a_ops->readpage() function for the actual low-level stuff.
1019 *
1020 * This is really ugly. But the goto's actually try to clarify some
1021 * of the logic when it comes to error handling etc.
1022 */
1023static void do_generic_file_read(struct file *filp, loff_t *ppos,
1024                read_descriptor_t *desc, read_actor_t actor)
1025{
1026        struct address_space *mapping = filp->f_mapping;
1027        struct inode *inode = mapping->host;
1028        struct file_ra_state *ra = &filp->f_ra;
1029        pgoff_t index;
1030        pgoff_t last_index;
1031        pgoff_t prev_index;
1032        unsigned long offset;      /* offset into pagecache page */
1033        unsigned int prev_offset;
1034        int error;
1035
1036        index = *ppos >> PAGE_CACHE_SHIFT;
1037        prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
1038        prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
1039        last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
1040        offset = *ppos & ~PAGE_CACHE_MASK;
1041
1042        for (;;) {
1043                struct page *page;
1044                pgoff_t end_index;
1045                loff_t isize;
1046                unsigned long nr, ret;
1047
1048                cond_resched();
1049find_page:
1050                page = find_get_page(mapping, index);
1051                if (!page) {
1052                        page_cache_sync_readahead(mapping,
1053                                        ra, filp,
1054                                        index, last_index - index);
1055                        page = find_get_page(mapping, index);
1056                        if (unlikely(page == NULL))
1057                                goto no_cached_page;
1058                }
1059                if (PageReadahead(page)) {
1060                        page_cache_async_readahead(mapping,
1061                                        ra, filp, page,
1062                                        index, last_index - index);
1063                }
1064                if (!PageUptodate(page)) {
1065                        if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1066                                        !mapping->a_ops->is_partially_uptodate)
1067                                goto page_not_up_to_date;
1068                        if (!trylock_page(page))
1069                                goto page_not_up_to_date;
1070                        if (!mapping->a_ops->is_partially_uptodate(page,
1071                                                                desc, offset))
1072                                goto page_not_up_to_date_locked;
1073                        unlock_page(page);
1074                }
1075page_ok:
1076                /*
1077                 * i_size must be checked after we know the page is Uptodate.
1078                 *
1079                 * Checking i_size after the check allows us to calculate
1080                 * the correct value for "nr", which means the zero-filled
1081                 * part of the page is not copied back to userspace (unless
1082                 * another truncate extends the file - this is desired though).
1083                 */
1084
1085                isize = i_size_read(inode);
1086                end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1087                if (unlikely(!isize || index > end_index)) {
1088                        page_cache_release(page);
1089                        goto out;
1090                }
1091
1092                /* nr is the maximum number of bytes to copy from this page */
1093                nr = PAGE_CACHE_SIZE;
1094                if (index == end_index) {
1095                        nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1096                        if (nr <= offset) {
1097                                page_cache_release(page);
1098                                goto out;
1099                        }
1100                }
1101                nr = nr - offset;
1102
1103                /* If users can be writing to this page using arbitrary
1104                 * virtual addresses, take care about potential aliasing
1105                 * before reading the page on the kernel side.
1106                 */
1107                if (mapping_writably_mapped(mapping))
1108                        flush_dcache_page(page);
1109
1110                /*
1111                 * When a sequential read accesses a page several times,
1112                 * only mark it as accessed the first time.
1113                 */
1114                if (prev_index != index || offset != prev_offset)
1115                        mark_page_accessed(page);
1116                prev_index = index;
1117
1118                /*
1119                 * Ok, we have the page, and it's up-to-date, so
1120                 * now we can copy it to user space...
1121                 *
1122                 * The actor routine returns how many bytes were actually used..
1123                 * NOTE! This may not be the same as how much of a user buffer
1124                 * we filled up (we may be padding etc), so we can only update
1125                 * "pos" here (the actor routine has to update the user buffer
1126                 * pointers and the remaining count).
1127                 */
1128                ret = actor(desc, page, offset, nr);
1129                offset += ret;
1130                index += offset >> PAGE_CACHE_SHIFT;
1131                offset &= ~PAGE_CACHE_MASK;
1132                prev_offset = offset;
1133
1134                page_cache_release(page);
1135                if (ret == nr && desc->count)
1136                        continue;
1137                goto out;
1138
1139page_not_up_to_date:
1140                /* Get exclusive access to the page ... */
1141                error = lock_page_killable(page);
1142                if (unlikely(error))
1143                        goto readpage_error;
1144
1145page_not_up_to_date_locked:
1146                /* Did it get truncated before we got the lock? */
1147                if (!page->mapping) {
1148                        unlock_page(page);
1149                        page_cache_release(page);
1150                        continue;
1151                }
1152
1153                /* Did somebody else fill it already? */
1154                if (PageUptodate(page)) {
1155                        unlock_page(page);
1156                        goto page_ok;
1157                }
1158
1159readpage:
1160                /* Start the actual read. The read will unlock the page. */
1161                error = mapping->a_ops->readpage(filp, page);
1162
1163                if (unlikely(error)) {
1164                        if (error == AOP_TRUNCATED_PAGE) {
1165                                page_cache_release(page);
1166                                goto find_page;
1167                        }
1168                        goto readpage_error;
1169                }
1170
1171                if (!PageUptodate(page)) {
1172                        error = lock_page_killable(page);
1173                        if (unlikely(error))
1174                                goto readpage_error;
1175                        if (!PageUptodate(page)) {
1176                                if (page->mapping == NULL) {
1177                                        /*
1178                                         * invalidate_inode_pages got it
1179                                         */
1180                                        unlock_page(page);
1181                                        page_cache_release(page);
1182                                        goto find_page;
1183                                }
1184                                unlock_page(page);
1185                                shrink_readahead_size_eio(filp, ra);
1186                                error = -EIO;
1187                                goto readpage_error;
1188                        }
1189                        unlock_page(page);
1190                }
1191
1192                goto page_ok;
1193
1194readpage_error:
1195                /* UHHUH! A synchronous read error occurred. Report it */
1196                desc->error = error;
1197                page_cache_release(page);
1198                goto out;
1199
1200no_cached_page:
1201                /*
1202                 * Ok, it wasn't cached, so we need to create a new
1203                 * page..
1204                 */
1205                page = page_cache_alloc_cold(mapping);
1206                if (!page) {
1207                        desc->error = -ENOMEM;
1208                        goto out;
1209                }
1210                error = add_to_page_cache_lru(page, mapping,
1211                                                index, GFP_KERNEL);
1212                if (error) {
1213                        page_cache_release(page);
1214                        if (error == -EEXIST)
1215                                goto find_page;
1216                        desc->error = error;
1217                        goto out;
1218                }
1219                goto readpage;
1220        }
1221
1222out:
1223        ra->prev_pos = prev_index;
1224        ra->prev_pos <<= PAGE_CACHE_SHIFT;
1225        ra->prev_pos |= prev_offset;
1226
1227        *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1228        file_accessed(filp);
1229}
1230
1231int file_read_actor(read_descriptor_t *desc, struct page *page,
1232                        unsigned long offset, unsigned long size)
1233{
1234        char *kaddr;
1235        unsigned long left, count = desc->count;
1236
1237        if (size > count)
1238                size = count;
1239
1240        /*
1241         * Faults on the destination of a read are common, so do it before
1242         * taking the kmap.
1243         */
1244        if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1245                kaddr = kmap_atomic(page, KM_USER0);
1246                left = __copy_to_user_inatomic(desc->arg.buf,
1247                                                kaddr + offset, size);
1248                kunmap_atomic(kaddr, KM_USER0);
1249                if (left == 0)
1250                        goto success;
1251        }
1252
1253        /* Do it the slow way */
1254        kaddr = kmap(page);
1255        left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1256        kunmap(page);
1257
1258        if (left) {
1259                size -= left;
1260                desc->error = -EFAULT;
1261        }
1262success:
1263        desc->count = count - size;
1264        desc->written += size;
1265        desc->arg.buf += size;
1266        return size;
1267}
1268
1269/*
1270 * Performs necessary checks before doing a write
1271 * @iov:        io vector request
1272 * @nr_segs:    number of segments in the iovec
1273 * @count:      number of bytes to write
1274 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1275 *
1276 * Adjust number of segments and amount of bytes to write (nr_segs should be
1277 * properly initialized first). Returns appropriate error code that caller
1278 * should return or zero in case that write should be allowed.
1279 */
1280int generic_segment_checks(const struct iovec *iov,
1281                        unsigned long *nr_segs, size_t *count, int access_flags)
1282{
1283        unsigned long   seg;
1284        size_t cnt = 0;
1285        for (seg = 0; seg < *nr_segs; seg++) {
1286                const struct iovec *iv = &iov[seg];
1287
1288                /*
1289                 * If any segment has a negative length, or the cumulative
1290                 * length ever wraps negative then return -EINVAL.
1291                 */
1292                cnt += iv->iov_len;
1293                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1294                        return -EINVAL;
1295                if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1296                        continue;
1297                if (seg == 0)
1298                        return -EFAULT;
1299                *nr_segs = seg;
1300                cnt -= iv->iov_len;     /* This segment is no good */
1301                break;
1302        }
1303        *count = cnt;
1304        return 0;
1305}
1306EXPORT_SYMBOL(generic_segment_checks);
1307
1308/**
1309 * generic_file_aio_read - generic filesystem read routine
1310 * @iocb:       kernel I/O control block
1311 * @iov:        io vector request
1312 * @nr_segs:    number of segments in the iovec
1313 * @pos:        current file position
1314 *
1315 * This is the "read()" routine for all filesystems
1316 * that can use the page cache directly.
1317 */
1318ssize_t
1319generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1320                unsigned long nr_segs, loff_t pos)
1321{
1322        struct file *filp = iocb->ki_filp;
1323        ssize_t retval;
1324        unsigned long seg;
1325        size_t count;
1326        loff_t *ppos = &iocb->ki_pos;
1327
1328        count = 0;
1329        retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1330        if (retval)
1331                return retval;
1332
1333        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1334        if (filp->f_flags & O_DIRECT) {
1335                loff_t size;
1336                struct address_space *mapping;
1337                struct inode *inode;
1338
1339                mapping = filp->f_mapping;
1340                inode = mapping->host;
1341                if (!count)
1342                        goto out; /* skip atime */
1343                size = i_size_read(inode);
1344                if (pos < size) {
1345                        retval = filemap_write_and_wait_range(mapping, pos,
1346                                        pos + iov_length(iov, nr_segs) - 1);
1347                        if (!retval) {
1348                                retval = mapping->a_ops->direct_IO(READ, iocb,
1349                                                        iov, pos, nr_segs);
1350                        }
1351                        if (retval > 0)
1352                                *ppos = pos + retval;
1353                        if (retval) {
1354                                file_accessed(filp);
1355                                goto out;
1356                        }
1357                }
1358        }
1359
1360        for (seg = 0; seg < nr_segs; seg++) {
1361                read_descriptor_t desc;
1362
1363                desc.written = 0;
1364                desc.arg.buf = iov[seg].iov_base;
1365                desc.count = iov[seg].iov_len;
1366                if (desc.count == 0)
1367                        continue;
1368                desc.error = 0;
1369                do_generic_file_read(filp, ppos, &desc, file_read_actor);
1370                retval += desc.written;
1371                if (desc.error) {
1372                        retval = retval ?: desc.error;
1373                        break;
1374                }
1375                if (desc.count > 0)
1376                        break;
1377        }
1378out:
1379        return retval;
1380}
1381EXPORT_SYMBOL(generic_file_aio_read);
1382
1383static ssize_t
1384do_readahead(struct address_space *mapping, struct file *filp,
1385             pgoff_t index, unsigned long nr)
1386{
1387        if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1388                return -EINVAL;
1389
1390        force_page_cache_readahead(mapping, filp, index, nr);
1391        return 0;
1392}
1393
1394SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1395{
1396        ssize_t ret;
1397        struct file *file;
1398
1399        ret = -EBADF;
1400        file = fget(fd);
1401        if (file) {
1402                if (file->f_mode & FMODE_READ) {
1403                        struct address_space *mapping = file->f_mapping;
1404                        pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1405                        pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1406                        unsigned long len = end - start + 1;
1407                        ret = do_readahead(mapping, file, start, len);
1408                }
1409                fput(file);
1410        }
1411        return ret;
1412}
1413#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1414asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1415{
1416        return SYSC_readahead((int) fd, offset, (size_t) count);
1417}
1418SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1419#endif
1420
1421#ifdef CONFIG_MMU
1422/**
1423 * page_cache_read - adds requested page to the page cache if not already there
1424 * @file:       file to read
1425 * @offset:     page index
1426 *
1427 * This adds the requested page to the page cache if it isn't already there,
1428 * and schedules an I/O to read in its contents from disk.
1429 */
1430static int page_cache_read(struct file *file, pgoff_t offset)
1431{
1432        struct address_space *mapping = file->f_mapping;
1433        struct page *page; 
1434        int ret;
1435
1436        do {
1437                page = page_cache_alloc_cold(mapping);
1438                if (!page)
1439                        return -ENOMEM;
1440
1441                ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1442                if (ret == 0)
1443                        ret = mapping->a_ops->readpage(file, page);
1444                else if (ret == -EEXIST)
1445                        ret = 0; /* losing race to add is OK */
1446
1447                page_cache_release(page);
1448
1449        } while (ret == AOP_TRUNCATED_PAGE);
1450                
1451        return ret;
1452}
1453
1454#define MMAP_LOTSAMISS  (100)
1455
1456/*
1457 * Synchronous readahead happens when we don't even find
1458 * a page in the page cache at all.
1459 */
1460static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1461                                   struct file_ra_state *ra,
1462                                   struct file *file,
1463                                   pgoff_t offset)
1464{
1465        unsigned long ra_pages;
1466        struct address_space *mapping = file->f_mapping;
1467
1468        /* If we don't want any read-ahead, don't bother */
1469        if (VM_RandomReadHint(vma))
1470                return;
1471
1472        if (VM_SequentialReadHint(vma) ||
1473                        offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1474                page_cache_sync_readahead(mapping, ra, file, offset,
1475                                          ra->ra_pages);
1476                return;
1477        }
1478
1479        if (ra->mmap_miss < INT_MAX)
1480                ra->mmap_miss++;
1481
1482        /*
1483         * Do we miss much more than hit in this file? If so,
1484         * stop bothering with read-ahead. It will only hurt.
1485         */
1486        if (ra->mmap_miss > MMAP_LOTSAMISS)
1487                return;
1488
1489        /*
1490         * mmap read-around
1491         */
1492        ra_pages = max_sane_readahead(ra->ra_pages);
1493        if (ra_pages) {
1494                ra->start = max_t(long, 0, offset - ra_pages/2);
1495                ra->size = ra_pages;
1496                ra->async_size = 0;
1497                ra_submit(ra, mapping, file);
1498        }
1499}
1500
1501/*
1502 * Asynchronous readahead happens when we find the page and PG_readahead,
1503 * so we want to possibly extend the readahead further..
1504 */
1505static void do_async_mmap_readahead(struct vm_area_struct *vma,
1506                                    struct file_ra_state *ra,
1507                                    struct file *file,
1508                                    struct page *page,
1509                                    pgoff_t offset)
1510{
1511        struct address_space *mapping = file->f_mapping;
1512
1513        /* If we don't want any read-ahead, don't bother */
1514        if (VM_RandomReadHint(vma))
1515                return;
1516        if (ra->mmap_miss > 0)
1517                ra->mmap_miss--;
1518        if (PageReadahead(page))
1519                page_cache_async_readahead(mapping, ra, file,
1520                                           page, offset, ra->ra_pages);
1521}
1522
1523/**
1524 * filemap_fault - read in file data for page fault handling
1525 * @vma:        vma in which the fault was taken
1526 * @vmf:        struct vm_fault containing details of the fault
1527 *
1528 * filemap_fault() is invoked via the vma operations vector for a
1529 * mapped memory region to read in file data during a page fault.
1530 *
1531 * The goto's are kind of ugly, but this streamlines the normal case of having
1532 * it in the page cache, and handles the special cases reasonably without
1533 * having a lot of duplicated code.
1534 */
1535int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1536{
1537        int error;
1538        struct file *file = vma->vm_file;
1539        struct address_space *mapping = file->f_mapping;
1540        struct file_ra_state *ra = &file->f_ra;
1541        struct inode *inode = mapping->host;
1542        pgoff_t offset = vmf->pgoff;
1543        struct page *page;
1544        pgoff_t size;
1545        int ret = 0;
1546
1547        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1548        if (offset >= size)
1549                return VM_FAULT_SIGBUS;
1550
1551        /*
1552         * Do we have something in the page cache already?
1553         */
1554        page = find_get_page(mapping, offset);
1555        if (likely(page)) {
1556                /*
1557                 * We found the page, so try async readahead before
1558                 * waiting for the lock.
1559                 */
1560                do_async_mmap_readahead(vma, ra, file, page, offset);
1561                lock_page(page);
1562
1563                /* Did it get truncated? */
1564                if (unlikely(page->mapping != mapping)) {
1565                        unlock_page(page);
1566                        put_page(page);
1567                        goto no_cached_page;
1568                }
1569        } else {
1570                /* No page in the page cache at all */
1571                do_sync_mmap_readahead(vma, ra, file, offset);
1572                count_vm_event(PGMAJFAULT);
1573                ret = VM_FAULT_MAJOR;
1574retry_find:
1575                page = find_lock_page(mapping, offset);
1576                if (!page)
1577                        goto no_cached_page;
1578        }
1579
1580        /*
1581         * We have a locked page in the page cache, now we need to check
1582         * that it's up-to-date. If not, it is going to be due to an error.
1583         */
1584        if (unlikely(!PageUptodate(page)))
1585                goto page_not_uptodate;
1586
1587        /*
1588         * Found the page and have a reference on it.
1589         * We must recheck i_size under page lock.
1590         */
1591        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1592        if (unlikely(offset >= size)) {
1593                unlock_page(page);
1594                page_cache_release(page);
1595                return VM_FAULT_SIGBUS;
1596        }
1597
1598        ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1599        vmf->page = page;
1600        return ret | VM_FAULT_LOCKED;
1601
1602no_cached_page:
1603        /*
1604         * We're only likely to ever get here if MADV_RANDOM is in
1605         * effect.
1606         */
1607        error = page_cache_read(file, offset);
1608
1609        /*
1610         * The page we want has now been added to the page cache.
1611         * In the unlikely event that someone removed it in the
1612         * meantime, we'll just come back here and read it again.
1613         */
1614        if (error >= 0)
1615                goto retry_find;
1616
1617        /*
1618         * An error return from page_cache_read can result if the
1619         * system is low on memory, or a problem occurs while trying
1620         * to schedule I/O.
1621         */
1622        if (error == -ENOMEM)
1623                return VM_FAULT_OOM;
1624        return VM_FAULT_SIGBUS;
1625
1626page_not_uptodate:
1627        /*
1628         * Umm, take care of errors if the page isn't up-to-date.
1629         * Try to re-read it _once_. We do this synchronously,
1630         * because there really aren't any performance issues here
1631         * and we need to check for errors.
1632         */
1633        ClearPageError(page);
1634        error = mapping->a_ops->readpage(file, page);
1635        if (!error) {
1636                wait_on_page_locked(page);
1637                if (!PageUptodate(page))
1638                        error = -EIO;
1639        }
1640        page_cache_release(page);
1641
1642        if (!error || error == AOP_TRUNCATED_PAGE)
1643                goto retry_find;
1644
1645        /* Things didn't work out. Return zero to tell the mm layer so. */
1646        shrink_readahead_size_eio(file, ra);
1647        return VM_FAULT_SIGBUS;
1648}
1649EXPORT_SYMBOL(filemap_fault);
1650
1651struct vm_operations_struct generic_file_vm_ops = {
1652        .fault          = filemap_fault,
1653};
1654
1655/* This is used for a general mmap of a disk file */
1656
1657int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1658{
1659        struct address_space *mapping = file->f_mapping;
1660
1661        if (!mapping->a_ops->readpage)
1662                return -ENOEXEC;
1663        file_accessed(file);
1664        vma->vm_ops = &generic_file_vm_ops;
1665        vma->vm_flags |= VM_CAN_NONLINEAR;
1666        return 0;
1667}
1668
1669/*
1670 * This is for filesystems which do not implement ->writepage.
1671 */
1672int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1673{
1674        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1675                return -EINVAL;
1676        return generic_file_mmap(file, vma);
1677}
1678#else
/*
 * Stub used in the #else arm of the CONFIG_MMU conditional: generic file
 * mmap is not provided in that configuration, so always fail with -ENOSYS.
 */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
/*
 * Stub used in the #else arm of the CONFIG_MMU conditional: always fails
 * with -ENOSYS, mirroring the generic_file_mmap() stub next to it.
 */
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
1687#endif /* CONFIG_MMU */
1688
/*
 * These exports sit after the #endif so that they apply to whichever
 * variant of the two functions was compiled above.
 */
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
1691
1692static struct page *__read_cache_page(struct address_space *mapping,
1693                                pgoff_t index,
1694                                int (*filler)(void *,struct page*),
1695                                void *data)
1696{
1697        struct page *page;
1698        int err;
1699repeat:
1700        page = find_get_page(mapping, index);
1701        if (!page) {
1702                page = page_cache_alloc_cold(mapping);
1703                if (!page)
1704                        return ERR_PTR(-ENOMEM);
1705                err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1706                if (unlikely(err)) {
1707                        page_cache_release(page);
1708                        if (err == -EEXIST)
1709                                goto repeat;
1710                        /* Presumably ENOMEM for radix tree node */
1711                        return ERR_PTR(err);
1712                }
1713                err = filler(data, page);
1714                if (err < 0) {
1715                        page_cache_release(page);
1716                        page = ERR_PTR(err);
1717                }
1718        }
1719        return page;
1720}
1721
1722/**
1723 * read_cache_page_async - read into page cache, fill it if needed
1724 * @mapping:    the page's address_space
1725 * @index:      the page index
1726 * @filler:     function to perform the read
 * @data:       opaque argument passed to @filler as its first parameter
1728 *
1729 * Same as read_cache_page, but don't wait for page to become unlocked
1730 * after submitting it to the filler.
1731 *
1732 * Read into the page cache. If a page already exists, and PageUptodate() is
1733 * not set, try to fill the page but don't wait for it to become unlocked.
1734 *
1735 * If the page does not get brought uptodate, return -EIO.
1736 */
1737struct page *read_cache_page_async(struct address_space *mapping,
1738                                pgoff_t index,
1739                                int (*filler)(void *,struct page*),
1740                                void *data)
1741{
1742        struct page *page;
1743        int err;
1744
1745retry:
1746        page = __read_cache_page(mapping, index, filler, data);
1747        if (IS_ERR(page))
1748                return page;
1749        if (PageUptodate(page))
1750                goto out;
1751
1752        lock_page(page);
1753        if (!page->mapping) {
1754                unlock_page(page);
1755                page_cache_release(page);
1756                goto retry;
1757        }
1758        if (PageUptodate(page)) {
1759                unlock_page(page);
1760                goto out;
1761        }
1762        err = filler(data, page);
1763        if (err < 0) {
1764                page_cache_release(page);
1765                return ERR_PTR(err);
1766        }
1767out:
1768        mark_page_accessed(page);
1769        return page;
1770}
1771EXPORT_SYMBOL(read_cache_page_async);
1772
1773/**
1774 * read_cache_page - read into page cache, fill it if needed
1775 * @mapping:    the page's address_space
1776 * @index:      the page index
1777 * @filler:     function to perform the read
 * @data:       opaque argument passed to @filler as its first parameter
1779 *
1780 * Read into the page cache. If a page already exists, and PageUptodate() is
1781 * not set, try to fill the page then wait for it to become unlocked.
1782 *
1783 * If the page does not get brought uptodate, return -EIO.
1784 */
1785struct page *read_cache_page(struct address_space *mapping,
1786                                pgoff_t index,
1787                                int (*filler)(void *,struct page*),
1788                                void *data)
1789{
1790        struct page *page;
1791
1792        page = read_cache_page_async(mapping, index, filler, data);
1793        if (IS_ERR(page))
1794                goto out;
1795        wait_on_page_locked(page);
1796        if (!PageUptodate(page)) {
1797                page_cache_release(page);
1798                page = ERR_PTR(-EIO);
1799        }
1800 out:
1801        return page;
1802}
1803EXPORT_SYMBOL(read_cache_page);
1804
1805/*
1806 * The logic we want is
1807 *
1808 *      if suid or (sgid and xgrp)
1809 *              remove privs
1810 */
1811int should_remove_suid(struct dentry *dentry)
1812{
1813        mode_t mode = dentry->d_inode->i_mode;
1814        int kill = 0;
1815
1816        /* suid always must be killed */
1817        if (unlikely(mode & S_ISUID))
1818                kill = ATTR_KILL_SUID;
1819
1820        /*
1821         * sgid without any exec bits is just a mandatory locking mark; leave
1822         * it alone.  If some exec bits are set, it's a real sgid; kill it.
1823         */
1824        if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1825                kill |= ATTR_KILL_SGID;
1826
1827        if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1828                return kill;
1829
1830        return 0;
1831}
1832EXPORT_SYMBOL(should_remove_suid);
1833
1834static int __remove_suid(struct dentry *dentry, int kill)
1835{
1836        struct iattr newattrs;
1837
1838        newattrs.ia_valid = ATTR_FORCE | kill;
1839        return notify_change(dentry, &newattrs);
1840}
1841
1842int file_remove_suid(struct file *file)
1843{
1844        struct dentry *dentry = file->f_path.dentry;
1845        int killsuid = should_remove_suid(dentry);
1846        int killpriv = security_inode_need_killpriv(dentry);
1847        int error = 0;
1848
1849        if (killpriv < 0)
1850                return killpriv;
1851        if (killpriv)
1852                error = security_inode_killpriv(dentry);
1853        if (!error && killsuid)
1854                error = __remove_suid(dentry, killsuid);
1855
1856        return error;
1857}
1858EXPORT_SYMBOL(file_remove_suid);
1859
1860static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1861                        const struct iovec *iov, size_t base, size_t bytes)
1862{
1863        size_t copied = 0, left = 0;
1864
1865        while (bytes) {
1866                char __user *buf = iov->iov_base + base;
1867                int copy = min(bytes, iov->iov_len - base);
1868
1869                base = 0;
1870                left = __copy_from_user_inatomic(vaddr, buf, copy);
1871                copied += copy;
1872                bytes -= copy;
1873                vaddr += copy;
1874                iov++;
1875
1876                if (unlikely(left))
1877                        break;
1878        }
1879        return copied - left;
1880}
1881
/*
 * Copy as much as we can into the page and return the number of bytes which
 * were successfully copied.  If a fault is encountered then return the number
 * of bytes which were copied.
 */
1887size_t iov_iter_copy_from_user_atomic(struct page *page,
1888                struct iov_iter *i, unsigned long offset, size_t bytes)
1889{
1890        char *kaddr;
1891        size_t copied;
1892
1893        BUG_ON(!in_atomic());
1894        kaddr = kmap_atomic(page, KM_USER0);
1895        if (likely(i->nr_segs == 1)) {
1896                int left;
1897                char __user *buf = i->iov->iov_base + i->iov_offset;
1898                left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1899                copied = bytes - left;
1900        } else {
1901                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1902                                                i->iov, i->iov_offset, bytes);
1903        }
1904        kunmap_atomic(kaddr, KM_USER0);
1905
1906        return copied;
1907}
1908EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1909
1910/*
1911 * This has the same sideeffects and return value as
1912 * iov_iter_copy_from_user_atomic().
1913 * The difference is that it attempts to resolve faults.
1914 * Page must not be locked.
1915 */
1916size_t iov_iter_copy_from_user(struct page *page,
1917                struct iov_iter *i, unsigned long offset, size_t bytes)
1918{
1919        char *kaddr;
1920        size_t copied;
1921
1922        kaddr = kmap(page);
1923        if (likely(i->nr_segs == 1)) {
1924                int left;
1925                char __user *buf = i->iov->iov_base + i->iov_offset;
1926                left = __copy_from_user(kaddr + offset, buf, bytes);
1927                copied = bytes - left;
1928        } else {
1929                copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1930                                                i->iov, i->iov_offset, bytes);
1931        }
1932        kunmap(page);
1933        return copied;
1934}
1935EXPORT_SYMBOL(iov_iter_copy_from_user);
1936
1937void iov_iter_advance(struct iov_iter *i, size_t bytes)
1938{
1939        BUG_ON(i->count < bytes);
1940
1941        if (likely(i->nr_segs == 1)) {
1942                i->iov_offset += bytes;
1943                i->count -= bytes;
1944        } else {
1945                const struct iovec *iov = i->iov;
1946                size_t base = i->iov_offset;
1947
1948                /*
1949                 * The !iov->iov_len check ensures we skip over unlikely
1950                 * zero-length segments (without overruning the iovec).
1951                 */
1952                while (bytes || unlikely(i->count && !iov->iov_len)) {
1953                        int copy;
1954
1955                        copy = min(bytes, iov->iov_len - base);
1956                        BUG_ON(!i->count || i->count < copy);
1957                        i->count -= copy;
1958                        bytes -= copy;
1959                        base += copy;
1960                        if (iov->iov_len == base) {
1961                                iov++;
1962                                base = 0;
1963                        }
1964                }
1965                i->iov = iov;
1966                i->iov_offset = base;
1967        }
1968}
1969EXPORT_SYMBOL(iov_iter_advance);
1970
1971/*
1972 * Fault in the first iovec of the given iov_iter, to a maximum length
1973 * of bytes. Returns 0 on success, or non-zero if the memory could not be
1974 * accessed (ie. because it is an invalid address).
1975 *
1976 * writev-intensive code may want this to prefault several iovecs -- that
1977 * would be possible (callers must not rely on the fact that _only_ the
1978 * first iovec will be faulted with the current implementation).
1979 */
1980int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1981{
1982        char __user *buf = i->iov->iov_base + i->iov_offset;
1983        bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1984        return fault_in_pages_readable(buf, bytes);
1985}
1986EXPORT_SYMBOL(iov_iter_fault_in_readable);
1987
1988/*
1989 * Return the count of just the current iov_iter segment.
1990 */
1991size_t iov_iter_single_seg_count(struct iov_iter *i)
1992{
1993        const struct iovec *iov = i->iov;
1994        if (i->nr_segs == 1)
1995                return i->count;
1996        else
1997                return min(i->count, iov->iov_len - i->iov_offset);
1998}
1999EXPORT_SYMBOL(iov_iter_single_seg_count);
2000
2001/*
2002 * Performs necessary checks before doing a write
2003 *
2004 * Can adjust writing position or amount of bytes to write.
2005 * Returns appropriate error code that caller should return or
2006 * zero in case that write should be allowed.
2007 */
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
	struct inode *inode = file->f_mapping->host;
	/* Soft RLIMIT_FSIZE of the calling task. */
	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;

	/* Negative file positions are never valid. */
	if (unlikely(*pos < 0))
		return -EINVAL;

	if (!isblk) {
		/* FIXME: this is for backwards compatibility with 2.4 */
		/* O_APPEND writes always start at the current end of file. */
		if (file->f_flags & O_APPEND)
			*pos = i_size_read(inode);

		if (limit != RLIM_INFINITY) {
			/*
			 * Starting at or beyond the limit: signal the task
			 * with SIGXFSZ and fail.
			 */
			if (*pos >= limit) {
				send_sig(SIGXFSZ, current, 0);
				return -EFBIG;
			}
			/* Otherwise shorten the write so it ends at the limit. */
			if (*count > limit - (typeof(limit))*pos) {
				*count = limit - (typeof(limit))*pos;
			}
		}
	}

	/*
	 * LFS rule
	 *
	 * Files opened without O_LARGEFILE may not be written past
	 * MAX_NON_LFS: fail if the write starts there, otherwise trim it.
	 */
	if (unlikely(*pos + *count > MAX_NON_LFS &&
				!(file->f_flags & O_LARGEFILE))) {
		if (*pos >= MAX_NON_LFS) {
			return -EFBIG;
		}
		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
			*count = MAX_NON_LFS - (unsigned long)*pos;
		}
	}

	/*
	 * Are we about to exceed the fs block limit ?
	 *
	 * If we have written data it becomes a short write.  If we have
	 * exceeded without writing data we send a signal and return EFBIG.
	 * Linus frestrict idea will clean these up nicely..
	 */
	if (likely(!isblk)) {
		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
			if (*count || *pos > inode->i_sb->s_maxbytes) {
				return -EFBIG;
			}
			/* zero-length writes at ->s_maxbytes are OK */
		}

		/* Trim the write so it ends at ->s_maxbytes. */
		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
			*count = inode->i_sb->s_maxbytes - *pos;
	} else {
#ifdef CONFIG_BLOCK
		loff_t isize;
		/* Read-only block devices cannot be written at all. */
		if (bdev_read_only(I_BDEV(inode)))
			return -EPERM;
		isize = i_size_read(inode);
		/*
		 * At or past the end of the device: only a zero-length
		 * write exactly at the end is accepted.
		 */
		if (*pos >= isize) {
			if (*count || *pos > isize)
				return -ENOSPC;
		}

		/* Trim the write so it ends at the device size. */
		if (*pos + *count > isize)
			*count = isize - *pos;
#else
		/* Block-device writes are rejected without CONFIG_BLOCK. */
		return -EPERM;
#endif
	}
	return 0;
}
EXPORT_SYMBOL(generic_write_checks);
2082
2083int pagecache_write_begin(struct file *file, struct address_space *mapping,
2084                                loff_t pos, unsigned len, unsigned flags,
2085                                struct page **pagep, void **fsdata)
2086{
2087        const struct address_space_operations *aops = mapping->a_ops;
2088
2089        return aops->write_begin(file, mapping, pos, len, flags,
2090                                                        pagep, fsdata);
2091}
2092EXPORT_SYMBOL(pagecache_write_begin);
2093
2094int pagecache_write_end(struct file *file, struct address_space *mapping,
2095                                loff_t pos, unsigned len, unsigned copied,
2096                                struct page *page, void *fsdata)
2097{
2098        const struct address_space_operations *aops = mapping->a_ops;
2099
2100        mark_page_accessed(page);
2101        return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2102}
2103EXPORT_SYMBOL(pagecache_write_end);
2104
2105ssize_t
2106generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2107                unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2108                size_t count, size_t ocount)
2109{
2110        struct file     *file = iocb->ki_filp;
2111        struct address_space *mapping = file->f_mapping;
2112        struct inode    *inode = mapping->host;
2113        ssize_t         written;
2114        size_t          write_len;
2115        pgoff_t         end;
2116
2117        if (count != ocount)
2118                *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2119
2120        write_len = iov_length(iov, *nr_segs);
2121        end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2122
2123        written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
2124        if (written)
2125                goto out;
2126
2127        /*
2128         * After a write we want buffered reads to be sure to go to disk to get
2129         * the new data.  We invalidate clean cached page from the region we're
2130         * about to write.  We do this *before* the write so that we can return
2131         * without clobbering -EIOCBQUEUED from ->direct_IO().
2132         */
2133        if (mapping->nrpages) {
2134                written = invalidate_inode_pages2_range(mapping,
2135                                        pos >> PAGE_CACHE_SHIFT, end);
2136                /*
2137                 * If a page can not be invalidated, return 0 to fall back
2138                 * to buffered write.
2139                 */
2140                if (written) {
2141                        if (written == -EBUSY)
2142                                return 0;
2143                        goto out;
2144                }
2145        }
2146
2147        written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2148
2149        /*
2150         * Finally, try again to invalidate clean pages which might have been
2151         * cached by non-direct readahead, or faulted in by get_user_pages()
2152         * if the source of the write was an mmap'ed region of the file
2153         * we're writing.  Either one is a pretty crazy thing to do,
2154         * so we don't support it 100%.  If this invalidation
2155         * fails, tough, the write still worked...
2156         */
2157        if (mapping->nrpages) {
2158                invalidate_inode_pages2_range(mapping,
2159                                              pos >> PAGE_CACHE_SHIFT, end);
2160        }
2161
2162        if (written > 0) {
2163                loff_t end = pos + written;
2164                if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2165                        i_size_write(inode,  end);
2166                        mark_inode_dirty(inode);
2167                }
2168                *ppos = end;
2169        }
2170
2171        /*
2172         * Sync the fs metadata but not the minor inode changes and
2173         * of course not the data as we did direct DMA for the IO.
2174         * i_mutex is held, which protects generic_osync_inode() from
2175         * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
2176         */
2177out:
2178        if ((written >= 0 || written == -EIOCBQUEUED) &&
2179            ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2180                int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2181                if (err < 0)
2182                        written = err;
2183        }
2184        return written;
2185}
2186EXPORT_SYMBOL(generic_file_direct_write);
2187
2188/*
2189 * Find or create a page at the given pagecache position. Return the locked
2190 * page. This function is specifically for buffered writes.
2191 */
2192struct page *grab_cache_page_write_begin(struct address_space *mapping,
2193                                        pgoff_t index, unsigned flags)
2194{
2195        int status;
2196        struct page *page;
2197        gfp_t gfp_notmask = 0;
2198        if (flags & AOP_FLAG_NOFS)
2199                gfp_notmask = __GFP_FS;
2200repeat:
2201        page = find_lock_page(mapping, index);
2202        if (likely(page))
2203                return page;
2204
2205        page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
2206        if (!page)
2207                return NULL;
2208        status = add_to_page_cache_lru(page, mapping, index,
2209                                                GFP_KERNEL & ~gfp_notmask);
2210        if (unlikely(status)) {
2211                page_cache_release(page);
2212                if (status == -EEXIST)
2213                        goto repeat;
2214                return NULL;
2215        }
2216        return page;
2217}
2218EXPORT_SYMBOL(grab_cache_page_write_begin);
2219
/*
 * Copy the data described by @i into the page cache of @file's mapping,
 * starting at file offset @pos.  The work is done in pieces that never
 * cross a page boundary, each bracketed by the mapping's ->write_begin()
 * and ->write_end() methods.
 *
 * Returns the number of bytes written if that is non-zero, otherwise the
 * last status value (0 or a negative error).
 */
static ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		pgoff_t index;		/* Pagecache index for current page */
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE - 1));
		index = pos >> PAGE_CACHE_SHIFT;
		/* Limit this pass to the rest of the page or the rest of the data. */
		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_count(i));

again:

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status))
			break;

		/* The copy itself runs with page faults disabled. */
		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		pagefault_enable();
		flush_dcache_page(page);

		mark_page_accessed(page);
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;
		/* From here on, count what ->write_end() reported, not what we copied. */
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;

		balance_dirty_pages_ratelimited(mapping);

	} while (iov_iter_count(i));

	return written ? written : status;
}
2307
2308ssize_t
2309generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2310                unsigned long nr_segs, loff_t pos, loff_t *ppos,
2311                size_t count, ssize_t written)
2312{
2313        struct file *file = iocb->ki_filp;
2314        struct address_space *mapping = file->f_mapping;
2315        const struct address_space_operations *a_ops = mapping->a_ops;
2316        struct inode *inode = mapping->host;
2317        ssize_t status;
2318        struct iov_iter i;
2319
2320        iov_iter_init(&i, iov, nr_segs, count, written);
2321        status = generic_perform_write(file, &i, pos);
2322
2323        if (likely(status >= 0)) {
2324                written += status;
2325                *ppos = pos + status;
2326
2327                /*
2328                 * For now, when the user asks for O_SYNC, we'll actually give
2329                 * O_DSYNC
2330                 */
2331                if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2332                        if (!a_ops->writepage || !is_sync_kiocb(iocb))
2333                                status = generic_osync_inode(inode, mapping,
2334                                                OSYNC_METADATA|OSYNC_DATA);
2335                }
2336        }
2337        
2338        /*
2339         * If we get here for O_DIRECT writes then we must have fallen through
2340         * to buffered writes (block instantiation inside i_size).  So we sync
2341         * the file data here, to try to honour O_DIRECT expectations.
2342         */
2343        if (unlikely(file->f_flags & O_DIRECT) && written)
2344                status = filemap_write_and_wait_range(mapping,
2345                                        pos, pos + written - 1);
2346
2347        return written ? written : status;
2348}
2349EXPORT_SYMBOL(generic_file_buffered_write);
2350
/*
 * Common write path behind generic_file_aio_write() and
 * generic_file_aio_write_nolock().  It takes no lock itself;
 * generic_file_aio_write() calls it with inode->i_mutex held.
 *
 * Validates the iovec, applies generic_write_checks(), strips suid state
 * and updates the file times, then writes either directly (O_DIRECT, with
 * a buffered fallback for the remainder) or through the page cache.
 *
 * Returns the number of bytes written if non-zero, otherwise the error (or
 * 0) recorded in 'err'.
 */
static ssize_t
__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t *ppos)
{
	struct file *file = iocb->ki_filp;
	struct address_space * mapping = file->f_mapping;
	size_t ocount;		/* original count */
	size_t count;		/* after file limit checks */
	struct inode	*inode = mapping->host;
	loff_t		pos;
	ssize_t		written;
	ssize_t		err;

	ocount = 0;
	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err)
		return err;

	count = ocount;
	pos = *ppos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;
	written = 0;

	/* May move 'pos' and shrink 'count'; 'ocount' keeps the original size. */
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out;

	/* Nothing left to write after the checks. */
	if (count == 0)
		goto out;

	err = file_remove_suid(file);
	if (err)
		goto out;

	file_update_time(file);

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (unlikely(file->f_flags & O_DIRECT)) {
		loff_t endbyte;
		ssize_t written_buffered;

		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
							ppos, count, ocount);
		/* Done on error, or when the direct write covered everything. */
		if (written < 0 || written == count)
			goto out;
		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		pos += written;
		count -= written;
		written_buffered = generic_file_buffered_write(iocb, iov,
						nr_segs, pos, ppos, count,
						written);
		/*
		 * If generic_file_buffered_write() returned a synchronous error
		 * then we want to return the number of bytes which were
		 * direct-written, or the error code if that was zero.  Note
		 * that this differs from normal direct-io semantics, which
		 * will return -EFOO even if some bytes were written.
		 */
		if (written_buffered < 0) {
			err = written_buffered;
			goto out;
		}

		/*
		 * We need to ensure that the page cache pages are written to
		 * disk and invalidated to preserve the expected O_DIRECT
		 * semantics.
		 */
		/*
		 * written_buffered includes the direct-written bytes, so
		 * subtract them to get the end of the buffered part.
		 */
		endbyte = pos + written_buffered - written - 1;
		err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
					    SYNC_FILE_RANGE_WAIT_BEFORE|
					    SYNC_FILE_RANGE_WRITE|
					    SYNC_FILE_RANGE_WAIT_AFTER);
		if (err == 0) {
			written = written_buffered;
			invalidate_mapping_pages(mapping,
						 pos >> PAGE_CACHE_SHIFT,
						 endbyte >> PAGE_CACHE_SHIFT);
		} else {
			/*
			 * We don't know how much we wrote, so just return
			 * the number of bytes which were direct-written
			 */
		}
	} else {
		written = generic_file_buffered_write(iocb, iov, nr_segs,
				pos, ppos, count, written);
	}
out:
	/* Undo the backing_dev_info assignment made above. */
	current->backing_dev_info = NULL;
	return written ? written : err;
}
2450
2451ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
2452                const struct iovec *iov, unsigned long nr_segs, loff_t pos)
2453{
2454        struct file *file = iocb->ki_filp;
2455        struct address_space *mapping = file->f_mapping;
2456        struct inode *inode = mapping->host;
2457        ssize_t ret;
2458
2459        BUG_ON(iocb->ki_pos != pos);
2460
2461        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2462                        &iocb->ki_pos);
2463
2464        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2465                ssize_t err;
2466
2467                err = sync_page_range_nolock(inode, mapping, pos, ret);
2468                if (err < 0)
2469                        ret = err;
2470        }
2471        return ret;
2472}
2473EXPORT_SYMBOL(generic_file_aio_write_nolock);
2474
2475ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2476                unsigned long nr_segs, loff_t pos)
2477{
2478        struct file *file = iocb->ki_filp;
2479        struct address_space *mapping = file->f_mapping;
2480        struct inode *inode = mapping->host;
2481        ssize_t ret;
2482
2483        BUG_ON(iocb->ki_pos != pos);
2484
2485        mutex_lock(&inode->i_mutex);
2486        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2487                        &iocb->ki_pos);
2488        mutex_unlock(&inode->i_mutex);
2489
2490        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2491                ssize_t err;
2492
2493                err = sync_page_range(inode, mapping, pos, ret);
2494                if (err < 0)
2495                        ret = err;
2496        }
2497        return ret;
2498}
2499EXPORT_SYMBOL(generic_file_aio_write);
2500
2501/**
2502 * try_to_release_page() - release old fs-specific metadata on a page
2503 *
2504 * @page: the page which the kernel is trying to free
2505 * @gfp_mask: memory allocation flags (and I/O mode)
2506 *
2507 * The address_space is to try to release any data against the page
2508 * (presumably at page->private).  If the release was successful, return `1'.
2509 * Otherwise return zero.
2510 *
2511 * This may also be called if PG_fscache is set on a page, indicating that the
2512 * page is known to the local caching routines.
2513 *
2514 * The @gfp_mask argument specifies whether I/O may be performed to release
2515 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2516 *
2517 */
2518int try_to_release_page(struct page *page, gfp_t gfp_mask)
2519{
2520        struct address_space * const mapping = page->mapping;
2521
2522        BUG_ON(!PageLocked(page));
2523        if (PageWriteback(page))
2524                return 0;
2525
2526        if (mapping && mapping->a_ops->releasepage)
2527                return mapping->a_ops->releasepage(page, gfp_mask);
2528        return try_to_free_buffers(page);
2529}
2530
2531EXPORT_SYMBOL(try_to_release_page);
2532
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.