linux-bk/mm/filemap.c
<<
>>
Prefs
   1/*
   2 *      linux/mm/filemap.c
   3 *
   4 * Copyright (C) 1994-1999  Linus Torvalds
   5 */
   6
   7/*
   8 * This file handles the generic file mmap semantics used by
   9 * most "normal" filesystems (but you don't /have/ to use this:
  10 * the NFS filesystem used to do this differently, for example)
  11 */
  12#include <linux/module.h>
  13#include <linux/slab.h>
  14#include <linux/compiler.h>
  15#include <linux/fs.h>
  16#include <linux/aio.h>
  17#include <linux/kernel_stat.h>
  18#include <linux/mm.h>
  19#include <linux/mman.h>
  20#include <linux/pagemap.h>
  21#include <linux/file.h>
  22#include <linux/uio.h>
  23#include <linux/iobuf.h>
  24#include <linux/hash.h>
  25#include <linux/writeback.h>
  26#include <linux/pagevec.h>
  27#include <linux/security.h>
  28/*
  29 * This is needed for the following functions:
  30 *  - try_to_release_page
  31 *  - block_invalidatepage
  32 *  - page_has_buffers
  33 *  - generic_osync_inode
  34 *
  35 * FIXME: remove all knowledge of the buffer layer from this file
  36 */
  37#include <linux/buffer_head.h>
  38
  39#include <asm/uaccess.h>
  40#include <asm/mman.h>
  41
  42/*
  43 * Shared mappings implemented 30.11.1994. It's not fully working yet,
  44 * though.
  45 *
  46 * Shared mappings now work. 15.8.1995  Bruno.
  47 *
  48 * finished 'unifying' the page and buffer cache and SMP-threaded the
  49 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  50 *
  51 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  52 */
  53
  54
  55/*
  56 * Lock ordering:
  57 *
  58 *  ->i_shared_lock             (vmtruncate)
  59 *    ->private_lock            (__free_pte->__set_page_dirty_buffers)
  60 *      ->swap_list_lock
  61 *        ->swap_device_lock    (exclusive_swap_page, others)
  62 *          ->mapping->page_lock
  63 *      ->inode_lock            (__mark_inode_dirty)
  64 *        ->sb_lock             (fs/fs-writeback.c)
  65 */
  66
  67/*
  68 * Remove a page from the page cache and free it. Caller has to make
  69 * sure the page is locked and that nobody else uses it - or that usage
  70 * is safe.  The caller must hold a write_lock on the mapping's page_lock.
  71 */
  72void __remove_from_page_cache(struct page *page)
  73{
  74        struct address_space *mapping = page->mapping;
  75
  76        if (unlikely(PageDirty(page)) && !PageSwapCache(page))
  77                BUG();
  78
  79        radix_tree_delete(&page->mapping->page_tree, page->index);
  80        list_del(&page->list);
  81        page->mapping = NULL;
  82
  83        mapping->nrpages--;
  84        dec_page_state(nr_pagecache);
  85}
  86
  87void remove_from_page_cache(struct page *page)
  88{
  89        struct address_space *mapping = page->mapping;
  90
  91        if (unlikely(!PageLocked(page)))
  92                PAGE_BUG(page);
  93
  94        write_lock(&mapping->page_lock);
  95        __remove_from_page_cache(page);
  96        write_unlock(&mapping->page_lock);
  97}
  98
  99static inline int sync_page(struct page *page)
 100{
 101        struct address_space *mapping = page->mapping;
 102
 103        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 104                return mapping->a_ops->sync_page(page);
 105        return 0;
 106}
 107
 108/**
 109 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 110 * @inode: the inode which pages we want to invalidate
 111 *
 112 * This function only removes the unlocked pages, if you want to
 113 * remove all the pages of one inode, you must call truncate_inode_pages.
 114 */
 115
 116void invalidate_inode_pages(struct inode * inode)
 117{
 118        struct list_head *head, *curr;
 119        struct page * page;
 120        struct address_space *mapping = inode->i_mapping;
 121        struct pagevec pvec;
 122
 123        head = &mapping->clean_pages;
 124        pagevec_init(&pvec);
 125        write_lock(&mapping->page_lock);
 126        curr = head->next;
 127
 128        while (curr != head) {
 129                page = list_entry(curr, struct page, list);
 130                curr = curr->next;
 131
 132                /* We cannot invalidate something in dirty.. */
 133                if (PageDirty(page))
 134                        continue;
 135
 136                /* ..or locked */
 137                if (TestSetPageLocked(page))
 138                        continue;
 139
 140                if (PagePrivate(page) && !try_to_release_page(page, 0))
 141                        goto unlock;
 142
 143                if (page_count(page) != 1)
 144                        goto unlock;
 145
 146                __remove_from_page_cache(page);
 147                unlock_page(page);
 148                if (!pagevec_add(&pvec, page))
 149                        __pagevec_release(&pvec);
 150                continue;
 151unlock:
 152                unlock_page(page);
 153                continue;
 154        }
 155
 156        write_unlock(&mapping->page_lock);
 157        pagevec_release(&pvec);
 158}
 159
 160static int do_invalidatepage(struct page *page, unsigned long offset)
 161{
 162        int (*invalidatepage)(struct page *, unsigned long);
 163        invalidatepage = page->mapping->a_ops->invalidatepage;
 164        if (invalidatepage)
 165                return (*invalidatepage)(page, offset);
 166        return block_invalidatepage(page, offset);
 167}
 168
 169static inline void truncate_partial_page(struct page *page, unsigned partial)
 170{
 171        memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
 172        if (PagePrivate(page))
 173                do_invalidatepage(page, partial);
 174}
 175
 176/*
 177 * If truncate cannot remove the fs-private metadata from the page, the page
 178 * becomes anonymous.  It will be left on the LRU and may even be mapped into
 179 * user pagetables if we're racing with filemap_nopage().
 180 */
 181static void truncate_complete_page(struct page *page)
 182{
 183        if (PagePrivate(page))
 184                do_invalidatepage(page, 0);
 185
 186        clear_page_dirty(page);
 187        ClearPageUptodate(page);
 188        remove_from_page_cache(page);
 189        page_cache_release(page);
 190}
 191
  192/*
  193 * Writeback walks the page list in ->prev order, which is low-to-high file
  194 * offsets in the common case where the file was written linearly. So truncate
  195 * walks the page list in the opposite (->next) direction, to avoid getting
  196 * into lockstep with writeback's cursor.  The aim is to prune as many pages
  197 * as possible before the truncate cursor collides with the writeback cursor.
      *
      * Handles every page on @head whose index is >= @start, plus the page at
      * index @start - 1 while *@partial is nonzero; that page goes to
      * truncate_partial_page() and *@partial is then reset to 0.
      *
      * Called with mapping->page_lock write-held and returns with it held.  The
      * lock is dropped around each page operation; after retaking it the scan
      * restarts from head->next, with the list head re-parked next to the page
      * just handled.  Returns nonzero if the lock was dropped at any point, zero
      * if the whole list was walked without dropping it.
  198 */
  199static int truncate_list_pages(struct address_space *mapping,
  200        struct list_head *head, unsigned long start, unsigned *partial)
  201{
  202        struct list_head *curr;
  203        struct page * page;
  204        int unlocked = 0;
             /* Page references taken below are batched here and dropped later. */
  205        struct pagevec release_pvec;
  206
  207        pagevec_init(&release_pvec);
  208restart:
  209        curr = head->next;
  210        while (curr != head) {
  211                unsigned long offset;
  212
  213                page = list_entry(curr, struct page, list);
  214                offset = page->index;
  215
  216                /* Is one of the pages to truncate? */
  217                if ((offset >= start) || (*partial && (offset + 1) == start)) {
  218                        int failed;
  219
                             /* Take a reference before page_lock is dropped below. */
  220                        page_cache_get(page);
                             /* failed != 0: somebody else holds the page lock. */
  221                        failed = TestSetPageLocked(page);
  222                        if (!failed && PageWriteback(page)) {
                                     /*
                                      * We own the page lock but the page is under
                                      * writeback.  Give the lock back, park the list
                                      * head just before this page so the rescan starts
                                      * on it, and wait with page_lock dropped.
                                      */
  223                                unlock_page(page);
  224                                list_del(head);
  225                                list_add_tail(head, curr);
  226                                write_unlock(&mapping->page_lock);
  227                                wait_on_page_writeback(page);
  228                                if (!pagevec_add(&release_pvec, page))
  229                                        __pagevec_release(&release_pvec);
  230                                unlocked = 1;
  231                                write_lock(&mapping->page_lock);
  232                                goto restart;
  233                        }
  234
                             /* Re-park the list head to mark where the rescan begins. */
  235                        list_del(head);
  236                        if (!failed)            /* Restart after this page */
  237                                list_add(head, curr);
  238                        else                    /* Restart on this page */
  239                                list_add_tail(head, curr);
  240
  241                        write_unlock(&mapping->page_lock);
  242                        unlocked = 1;
  243
  244                        if (!failed) {
  245                                if (*partial && (offset + 1) == start) {
  246                                        truncate_partial_page(page, *partial);
                                             /* Only one partial page exists; do not redo it. */
  247                                        *partial = 0;
  248                                } else {
  249                                        truncate_complete_page(page);
  250                                }
  251                                unlock_page(page);
  252                        } else {
                                     /* Let the current lock holder finish, then retry. */
  253                                wait_on_page_locked(page);
  254                        }
  255                        if (!pagevec_add(&release_pvec, page))
  256                                __pagevec_release(&release_pvec);
  257                        cond_resched();
  258                        write_lock(&mapping->page_lock);
  259                        goto restart;
  260                }
  261                curr = curr->next;
  262        }
             /* Drop any still-batched references; this needs page_lock released. */
  263        if (pagevec_count(&release_pvec)) {
  264                write_unlock(&mapping->page_lock);
  265                pagevec_release(&release_pvec);
  266                write_lock(&mapping->page_lock);
  267                unlocked = 1;
  268        }
  269        return unlocked;
  270}
 271
 272/*
 273 * Unconditionally clean all pages outside `start'.  The mapping lock
 274 * must be held.
 275 */
 276static void clean_list_pages(struct address_space *mapping,
 277                struct list_head *head, unsigned long start)
 278{
 279        struct page *page;
 280        struct list_head *curr;
 281
 282        for (curr = head->next; curr != head; curr = curr->next) {
 283                page = list_entry(curr, struct page, list);
 284                if (page->index > start)
 285                        clear_page_dirty(page);
 286        }
 287}
 288
 289/**
 290 * truncate_inode_pages - truncate *all* the pages from an offset
 291 * @mapping: mapping to truncate
 292 * @lstart: offset from with to truncate
 293 *
 294 * Truncate the page cache at a set offset, removing the pages
 295 * that are beyond that offset (and zeroing out partial pages).
 296 * If any page is locked we wait for it to become unlocked.
 297 */
 298void truncate_inode_pages(struct address_space * mapping, loff_t lstart) 
 299{
 300        unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 301        unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 302        int unlocked;
 303
 304        write_lock(&mapping->page_lock);
 305        clean_list_pages(mapping, &mapping->io_pages, start);
 306        clean_list_pages(mapping, &mapping->dirty_pages, start);
 307        do {
 308                unlocked = truncate_list_pages(mapping,
 309                                &mapping->io_pages, start, &partial);
 310                unlocked |= truncate_list_pages(mapping,
 311                                &mapping->dirty_pages, start, &partial);
 312                unlocked |= truncate_list_pages(mapping,
 313                                &mapping->clean_pages, start, &partial);
 314                unlocked |= truncate_list_pages(mapping,
 315                                &mapping->locked_pages, start, &partial);
 316        } while (unlocked);
 317        /* Traversed all three lists without dropping the lock */
 318        write_unlock(&mapping->page_lock);
 319}
 320
 321static inline int invalidate_this_page2(struct address_space * mapping,
 322                                        struct page * page,
 323                                        struct list_head * curr,
 324                                        struct list_head * head)
 325{
 326        int unlocked = 1;
 327
 328        /*
 329         * The page is locked and we hold the mapping lock as well
 330         * so both page_count(page) and page_buffers stays constant here.
 331         * AKPM: fixme: No global lock any more.  Is this still OK?
 332         */
 333        if (page_count(page) == 1 + !!page_has_buffers(page)) {
 334                /* Restart after this page */
 335                list_del(head);
 336                list_add_tail(head, curr);
 337
 338                page_cache_get(page);
 339                write_unlock(&mapping->page_lock);
 340                truncate_complete_page(page);
 341        } else {
 342                if (page_has_buffers(page)) {
 343                        /* Restart after this page */
 344                        list_del(head);
 345                        list_add_tail(head, curr);
 346
 347                        page_cache_get(page);
 348                        write_unlock(&mapping->page_lock);
 349                        do_invalidatepage(page, 0);
 350                } else
 351                        unlocked = 0;
 352
 353                clear_page_dirty(page);
 354                ClearPageUptodate(page);
 355        }
 356
 357        return unlocked;
 358}
 359
  360static int invalidate_list_pages2(struct address_space * mapping,
  361                                  struct list_head * head)
  362{
             /*
              * Invalidate every page on @head, walking the list in ->prev order.
              *
              * The function drops and retakes mapping->page_lock internally and
              * returns with it held, so it must be entered with the lock
              * write-held.  Whenever the lock has been dropped the scan restarts
              * from head->prev.  Returns nonzero if the lock was dropped at any
              * point, zero if the whole list was walked under the lock.
              */
  363        struct list_head *curr;
  364        struct page * page;
  365        int unlocked = 0;
             /* Page references taken along the way are batched here. */
  366        struct pagevec release_pvec;
  367
  368        pagevec_init(&release_pvec);
  369restart:
  370        curr = head->prev;
  371        while (curr != head) {
  372                page = list_entry(curr, struct page, list);
  373
  374                if (!TestSetPageLocked(page)) {
                             /* We now own the page lock. */
  375                        int __unlocked;
  376
  377                        if (PageWriteback(page)) {
                                     /*
                                      * Wait for writeback with page_lock dropped,
                                      * then release the page lock and rescan.
                                      */
  378                                write_unlock(&mapping->page_lock);
  379                                wait_on_page_writeback(page);
  380                                unlocked = 1;
  381                                write_lock(&mapping->page_lock);
  382                                unlock_page(page);
  383                                goto restart;
  384                        }
  385
  386                        __unlocked = invalidate_this_page2(mapping,
  387                                                page, curr, head);
  388                        unlock_page(page);
  389                        unlocked |= __unlocked;
  390                        if (!__unlocked) {
                                     /* page_lock never dropped: just step on. */
  391                                curr = curr->prev;
  392                                continue;
  393                        }
                             /*
                              * Otherwise the helper dropped page_lock and took a
                              * page reference; fall through to queue it for release.
                              */
  394                } else {
                             /* Somebody else holds the page lock: wait for them. */
  395                        /* Restart on this page */
  396                        list_del(head);
  397                        list_add(head, curr);
  398
  399                        page_cache_get(page);
  400                        write_unlock(&mapping->page_lock);
  401                        unlocked = 1;
  402                        wait_on_page_locked(page);
  403                }
  404
                     /* Reached only with page_lock dropped and a reference held. */
  405                if (!pagevec_add(&release_pvec, page))
  406                        __pagevec_release(&release_pvec);
  407                cond_resched();
  408                write_lock(&mapping->page_lock);
  409                goto restart;
  410        }
             /* Drop any still-batched references; this needs page_lock released. */
  411        if (pagevec_count(&release_pvec)) {
  412                write_unlock(&mapping->page_lock);
  413                pagevec_release(&release_pvec);
  414                write_lock(&mapping->page_lock);
  415                unlocked = 1;
  416        }
  417        return unlocked;
  418}
 419
 420/**
 421 * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
 422 * free the pages because they're mapped.
 423 * @mapping: the address_space which pages we want to invalidate
 424 */
 425void invalidate_inode_pages2(struct address_space *mapping)
 426{
 427        int unlocked;
 428
 429        write_lock(&mapping->page_lock);
 430        do {
 431                unlocked = invalidate_list_pages2(mapping,
 432                                &mapping->clean_pages);
 433                unlocked |= invalidate_list_pages2(mapping,
 434                                &mapping->dirty_pages);
 435                unlocked |= invalidate_list_pages2(mapping,
 436                                &mapping->io_pages);
 437                unlocked |= invalidate_list_pages2(mapping,
 438                                &mapping->locked_pages);
 439        } while (unlocked);
 440        write_unlock(&mapping->page_lock);
 441}
 442
 443/*
 444 * In-memory filesystems have to fail their
 445 * writepage function - and this has to be
 446 * worked around in the VM layer..
 447 *
 448 * We
 449 *  - mark the page dirty again (but do NOT
 450 *    add it back to the inode dirty list, as
 451 *    that would livelock in fdatasync)
 452 *  - activate the page so that the page stealer
 453 *    doesn't try to write it out over and over
 454 *    again.
 455 *
 456 * NOTE!  The livelock in fdatasync went away, due to io_pages.
 457 * So this function can now call set_page_dirty().
 458 */
 459int fail_writepage(struct page *page)
 460{
 461        /* Only activate on memory-pressure, not fsync.. */
 462        if (current->flags & PF_MEMALLOC) {
 463                if (!PageActive(page))
 464                        activate_page(page);
 465                if (!PageReferenced(page))
 466                        SetPageReferenced(page);
 467        }
 468
 469        unlock_page(page);
 470        return -EAGAIN;         /* It will be set dirty again */
 471}
 472EXPORT_SYMBOL(fail_writepage);
 473
 474/**
 475 * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
 476 * @mapping: address space structure to write
 477 *
 478 * This is a "data integrity" operation, as opposed to a regular memory
 479 * cleansing writeback.  The difference between these two operations is that
 480 * if a dirty page/buffer is encountered, it must be waited upon, and not just
 481 * skipped over.
 482 *
 483 * The PF_SYNC flag is set across this operation and the various functions
 484 * which care about this distinction must use called_for_sync() to find out
 485 * which behaviour they should implement.
 486 */
 487int filemap_fdatawrite(struct address_space *mapping)
 488{
 489        int ret;
 490        struct writeback_control wbc = {
 491                .sync_mode = WB_SYNC_ALL,
 492                .nr_to_write = mapping->nrpages * 2,
 493        };
 494
 495        current->flags |= PF_SYNC;
 496        ret = do_writepages(mapping, &wbc);
 497        current->flags &= ~PF_SYNC;
 498        return ret;
 499}
 500
 501/**
 502 * filemap_fdatawait - walk the list of locked pages of the given address
 503 *                     space and wait for all of them.
 504 * @mapping: address space structure to wait for
 505 */
 506int filemap_fdatawait(struct address_space * mapping)
 507{
 508        int ret = 0;
 509
 510        write_lock(&mapping->page_lock);
 511
 512        while (!list_empty(&mapping->locked_pages)) {
 513                struct page *page;
 514
 515                page = list_entry(mapping->locked_pages.next,struct page,list);
 516                list_del(&page->list);
 517                if (PageDirty(page))
 518                        list_add(&page->list, &mapping->dirty_pages);
 519                else
 520                        list_add(&page->list, &mapping->clean_pages);
 521
 522                if (!PageWriteback(page))
 523                        continue;
 524
 525                page_cache_get(page);
 526                write_unlock(&mapping->page_lock);
 527
 528                wait_on_page_writeback(page);
 529                if (PageError(page))
 530                        ret = -EIO;
 531
 532                page_cache_release(page);
 533                write_lock(&mapping->page_lock);
 534        }
 535        write_unlock(&mapping->page_lock);
 536        return ret;
 537}
 538
 539/*
 540 * This adds a page to the page cache, starting out as locked, unreferenced,
 541 * not uptodate and with no errors.
 542 *
 543 * This function is used for two things: adding newly allocated pagecache
 544 * pages and for moving existing anon pages into swapcache.
 545 *
 546 * In the case of pagecache pages, the page is new, so we can just run
 547 * SetPageLocked() against it.  The other page state flags were set by
 548 * rmqueue()
 549 *
 550 * In the case of swapcache, try_to_swap_out() has already locked the page, so
 551 * SetPageLocked() is ugly-but-OK there too.  The required page state has been
 552 * set up by swap_out_add_to_swap_cache().
 553 *
 554 * This function does not add the page to the LRU.  The caller must do that.
 555 */
 556int add_to_page_cache(struct page *page,
 557                struct address_space *mapping, pgoff_t offset)
 558{
 559        int error;
 560
 561        page_cache_get(page);
 562        write_lock(&mapping->page_lock);
 563        error = radix_tree_insert(&mapping->page_tree, offset, page);
 564        if (!error) {
 565                SetPageLocked(page);
 566                ___add_to_page_cache(page, mapping, offset);
 567                ClearPageDirty(page);
 568        } else {
 569                page_cache_release(page);
 570        }
 571        write_unlock(&mapping->page_lock);
 572        return error;
 573}
 574
 575int add_to_page_cache_lru(struct page *page,
 576                struct address_space *mapping, pgoff_t offset)
 577{
 578        int ret = add_to_page_cache(page, mapping, offset);
 579        if (ret == 0)
 580                lru_cache_add(page);
 581        return ret;
 582}
 583
 584/*
 585 * This adds the requested page to the page cache if it isn't already there,
 586 * and schedules an I/O to read in its contents from disk.
 587 */
 588static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
 589static int page_cache_read(struct file * file, unsigned long offset)
 590{
 591        struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 592        struct page *page; 
 593        int error;
 594
 595        page = page_cache_alloc(mapping);
 596        if (!page)
 597                return -ENOMEM;
 598
 599        error = add_to_page_cache_lru(page, mapping, offset);
 600        if (!error) {
 601                error = mapping->a_ops->readpage(file, page);
 602                page_cache_release(page);
 603                return error;
 604        }
 605
 606        /*
 607         * We arrive here in the unlikely event that someone 
 608         * raced with us and added our page to the cache first
 609         * or we are out of memory for radix-tree nodes.
 610         */
 611        page_cache_release(page);
 612        return error == -EEXIST ? 0 : error;
 613}
 614
 615/*
 616 * In order to wait for pages to become available there must be
 617 * waitqueues associated with pages. By using a hash table of
 618 * waitqueues where the bucket discipline is to maintain all
 619 * waiters on the same queue and wake all when any of the pages
 620 * become available, and for the woken contexts to check to be
 621 * sure the appropriate page became available, this saves space
 622 * at a cost of "thundering herd" phenomena during rare hash
 623 * collisions.
 624 */
 625static inline wait_queue_head_t *page_waitqueue(struct page *page)
 626{
 627        const struct zone *zone = page_zone(page);
 628
 629        return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 630}
 631
 632void wait_on_page_bit(struct page *page, int bit_nr)
 633{
 634        wait_queue_head_t *waitqueue = page_waitqueue(page);
 635        struct task_struct *tsk = current;
 636        DECLARE_WAITQUEUE(wait, tsk);
 637
 638        add_wait_queue(waitqueue, &wait);
 639        do {
 640                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 641                if (!test_bit(bit_nr, &page->flags))
 642                        break;
 643                sync_page(page);
 644                schedule();
 645        } while (test_bit(bit_nr, &page->flags));
 646        __set_task_state(tsk, TASK_RUNNING);
 647        remove_wait_queue(waitqueue, &wait);
 648}
 649EXPORT_SYMBOL(wait_on_page_bit);
 650
 651/**
 652 * unlock_page() - unlock a locked page
 653 *
 654 * @page: the page
 655 *
 656 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 657 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 658 * mechananism between PageLocked pages and PageWriteback pages is shared.
 659 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 660 *
 661 * The first mb is necessary to safely close the critical section opened by the
 662 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
 663 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
 664 * parallel wait_on_page_locked()).
 665 */
 666void unlock_page(struct page *page)
 667{
 668        wait_queue_head_t *waitqueue = page_waitqueue(page);
 669        smp_mb__before_clear_bit();
 670        if (!TestClearPageLocked(page))
 671                BUG();
 672        smp_mb__after_clear_bit(); 
 673        if (waitqueue_active(waitqueue))
 674                wake_up_all(waitqueue);
 675}
 676
 677/*
 678 * End writeback against a page.
 679 */
 680void end_page_writeback(struct page *page)
 681{
 682        wait_queue_head_t *waitqueue = page_waitqueue(page);
 683        smp_mb__before_clear_bit();
 684        if (!TestClearPageWriteback(page))
 685                BUG();
 686        smp_mb__after_clear_bit(); 
 687        if (waitqueue_active(waitqueue))
 688                wake_up_all(waitqueue);
 689}
 690EXPORT_SYMBOL(end_page_writeback);
 691
 692/*
 693 * Get a lock on the page, assuming we need to sleep
 694 * to get it..
 695 */
 696static void __lock_page(struct page *page)
 697{
 698        wait_queue_head_t *waitqueue = page_waitqueue(page);
 699        struct task_struct *tsk = current;
 700        DECLARE_WAITQUEUE(wait, tsk);
 701
 702        add_wait_queue_exclusive(waitqueue, &wait);
 703        for (;;) {
 704                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 705                if (PageLocked(page)) {
 706                        sync_page(page);
 707                        schedule();
 708                }
 709                if (!TestSetPageLocked(page))
 710                        break;
 711        }
 712        __set_task_state(tsk, TASK_RUNNING);
 713        remove_wait_queue(waitqueue, &wait);
 714}
 715
 716/*
 717 * Get an exclusive lock on the page, optimistically
 718 * assuming it's not locked..
 719 */
 720void lock_page(struct page *page)
 721{
 722        if (TestSetPageLocked(page))
 723                __lock_page(page);
 724}
 725
 726/*
 727 * a rather lightweight function, finding and getting a reference to a
 728 * hashed page atomically.
 729 */
 730struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 731{
 732        struct page *page;
 733
 734        /*
 735         * We scan the hash list read-only. Addition to and removal from
 736         * the hash-list needs a held write-lock.
 737         */
 738        read_lock(&mapping->page_lock);
 739        page = radix_tree_lookup(&mapping->page_tree, offset);
 740        if (page)
 741                page_cache_get(page);
 742        read_unlock(&mapping->page_lock);
 743        return page;
 744}
 745
 746/*
 747 * Same as above, but trylock it instead of incrementing the count.
 748 */
 749struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
 750{
 751        struct page *page;
 752
 753        read_lock(&mapping->page_lock);
 754        page = radix_tree_lookup(&mapping->page_tree, offset);
 755        if (page && TestSetPageLocked(page))
 756                page = NULL;
 757        read_unlock(&mapping->page_lock);
 758        return page;
 759}
 760
 761/**
 762 * find_lock_page - locate, pin and lock a pagecache page
 763 *
 764 * @mapping - the address_space to search
 765 * @offset - the page index
 766 *
 767 * Locates the desired pagecache page, locks it, increments its reference
 768 * count and returns its address.
 769 *
 770 * Returns zero if the page was not present. find_lock_page() may sleep.
 771 */
 772struct page *find_lock_page(struct address_space *mapping,
 773                                unsigned long offset)
 774{
 775        struct page *page;
 776
 777        read_lock(&mapping->page_lock);
 778repeat:
 779        page = radix_tree_lookup(&mapping->page_tree, offset);
 780        if (page) {
 781                page_cache_get(page);
 782                if (TestSetPageLocked(page)) {
 783                        read_unlock(&mapping->page_lock);
 784                        lock_page(page);
 785                        read_lock(&mapping->page_lock);
 786
 787                        /* Has the page been truncated while we slept? */
 788                        if (page->mapping != mapping || page->index != offset) {
 789                                unlock_page(page);
 790                                page_cache_release(page);
 791                                goto repeat;
 792                        }
 793                }
 794        }
 795        read_unlock(&mapping->page_lock);
 796        return page;
 797}
 798
 799/**
 800 * find_or_create_page - locate or add a pagecache page
 801 *
 802 * @mapping - the page's address_space
 803 * @index - the page's index into the mapping
 804 * @gfp_mask - page allocation mode
 805 *
 806 * Locates a page in the pagecache.  If the page is not present, a new page
 807 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 808 * LRU list.  The returned page is locked and has its reference count
 809 * incremented.
 810 *
 811 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
 812 * allocation!
 813 *
 814 * find_or_create_page() returns the desired page's address, or zero on
 815 * memory exhaustion.
 816 */
 817struct page *find_or_create_page(struct address_space *mapping,
 818                unsigned long index, unsigned int gfp_mask)
 819{
 820        struct page *page, *cached_page = NULL;
 821        int err;
 822repeat:
 823        page = find_lock_page(mapping, index);
 824        if (!page) {
 825                if (!cached_page) {
 826                        cached_page = alloc_page(gfp_mask);
 827                        if (!cached_page)
 828                                return NULL;
 829                }
 830                err = add_to_page_cache_lru(cached_page, mapping, index);
 831                if (!err) {
 832                        page = cached_page;
 833                        cached_page = NULL;
 834                } else if (err == -EEXIST)
 835                        goto repeat;
 836        }
 837        if (cached_page)
 838                page_cache_release(cached_page);
 839        return page;
 840}
 841
 842/*
 843 * Same as grab_cache_page, but do not wait if the page is unavailable.
 844 * This is intended for speculative data generators, where the data can
 845 * be regenerated if the page couldn't be grabbed.  This routine should
 846 * be safe to call while holding the lock for another page.
 847 *
 848 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 849 * and deadlock against the caller's locked page.
 850 */
 851struct page *
 852grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 853{
 854        struct page *page = find_get_page(mapping, index);
 855
 856        if (page) {
 857                if (!TestSetPageLocked(page))
 858                        return page;
 859                page_cache_release(page);
 860                return NULL;
 861        }
 862        page = alloc_pages(mapping->gfp_mask & ~__GFP_FS, 0);
 863        if (page && add_to_page_cache_lru(page, mapping, index)) {
 864                page_cache_release(page);
 865                page = NULL;
 866        }
 867        return page;
 868}
 869
 870/*
 871 * Mark a page as having seen activity.
 872 *
 873 * inactive,unreferenced        ->      inactive,referenced
 874 * inactive,referenced          ->      active,unreferenced
 875 * active,unreferenced          ->      active,referenced
 876 */
 877void mark_page_accessed(struct page *page)
 878{
 879        if (!PageActive(page) && PageReferenced(page)) {
 880                activate_page(page);
 881                ClearPageReferenced(page);
 882                return;
 883        } else if (!PageReferenced(page)) {
 884                SetPageReferenced(page);
 885        }
 886}
 887
 888/*
 889 * This is a generic file read routine, and uses the
 890 * mapping->a_ops->readpage() function for the actual low-level
 891 * stuff.
 892 *
 893 * This is really ugly. But the goto's actually try to clarify some
 894 * of the logic when it comes to error handling etc.
 895 */
/*
 * @filp:  file being read; its mapping's page cache is walked
 * @ppos:  in/out file position; on return it points just past the last
 *         byte the actor consumed
 * @desc:  read descriptor; desc->count says how much is still wanted and
 *         desc->error receives any error (-ENOMEM, the readpage() or
 *         add_to_page_cache_lru() error, or -EIO when a page is still not
 *         up to date after a read that reported success)
 * @actor: callback that consumes up to @nr bytes of an up-to-date page and
 *         returns how many bytes it used
 *
 * The loop stops at end of file (as given by inode->i_size), when the actor
 * uses less than it was offered, when desc->count drops to zero, or on
 * error.  UPDATE_ATIME() is applied to the inode in all cases.
 */
 896void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 897{
 898        struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
 899        struct inode *inode = mapping->host;
 900        unsigned long index, offset;
 901        struct page *cached_page;
 902        int error;
 903
 904        cached_page = NULL;
            /* Split the position into a page index and an offset within that page. */
 905        index = *ppos >> PAGE_CACHE_SHIFT;
 906        offset = *ppos & ~PAGE_CACHE_MASK;
 907
 908        for (;;) {
 909                struct page *page;
 910                unsigned long end_index, nr, ret;
 911
                    /* i_size is re-read on every pass through the loop. */
 912                end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 913                        
 914                if (index > end_index)
 915                        break;
 916                nr = PAGE_CACHE_SIZE;
 917                if (index == end_index) {
                            /* Last page: only the bytes below i_size are valid. */
 918                        nr = inode->i_size & ~PAGE_CACHE_MASK;
 919                        if (nr <= offset)
 920                                break;
 921                }
 922
 923                page_cache_readahead(filp, index);
 924
                    /* nr becomes the number of bytes available from offset in this page. */
 925                nr = nr - offset;
 926
 927                /*
 928                 * Try to find the data in the page cache..
 929                 */
 930find_page:
 931                read_lock(&mapping->page_lock);
 932                page = radix_tree_lookup(&mapping->page_tree, index);
 933                if (!page) {
 934                        read_unlock(&mapping->page_lock);
 935                        handle_ra_miss(filp);
 936                        goto no_cached_page;
 937                }
 938                page_cache_get(page);
 939                read_unlock(&mapping->page_lock);
 940
 941                if (!PageUptodate(page))
 942                        goto page_not_up_to_date;
 943page_ok:
 944                /* If users can be writing to this page using arbitrary
 945                 * virtual addresses, take care about potential aliasing
 946                 * before reading the page on the kernel side.
 947                 */
 948                if (!list_empty(&mapping->i_mmap_shared))
 949                        flush_dcache_page(page);
 950
 951                /*
 952                 * Mark the page accessed if we read the beginning.
 953                 */
 954                if (!offset)
 955                        mark_page_accessed(page);
 956
 957                /*
 958                 * Ok, we have the page, and it's up-to-date, so
 959                 * now we can copy it to user space...
 960                 *
 961                 * The actor routine returns how many bytes were actually used..
 962                 * NOTE! This may not be the same as how much of a user buffer
 963                 * we filled up (we may be padding etc), so we can only update
 964                 * "pos" here (the actor routine has to update the user buffer
 965                 * pointers and the remaining count).
 966                 */
 967                ret = actor(desc, page, offset, nr);
                    /* Advance; offset may reach PAGE_CACHE_SIZE, which carries into index. */
 968                offset += ret;
 969                index += offset >> PAGE_CACHE_SHIFT;
 970                offset &= ~PAGE_CACHE_MASK;
 971
 972                page_cache_release(page);
                    /* Go on only if the actor used everything offered and wants more. */
 973                if (ret == nr && desc->count)
 974                        continue;
 975                break;
 976
 977page_not_up_to_date:
 978                if (PageUptodate(page))
 979                        goto page_ok;
 980
 981                /* Get exclusive access to the page ... */
 982                lock_page(page);
 983
 984                /* Did it get unhashed before we got the lock? */
 985                if (!page->mapping) {
 986                        unlock_page(page);
 987                        page_cache_release(page);
                            /* Restart the loop body for the same index. */
 988                        continue;
 989                }
 990
 991                /* Did somebody else fill it already? */
 992                if (PageUptodate(page)) {
 993                        unlock_page(page);
 994                        goto page_ok;
 995                }
 996
 997readpage:
 998                /* ... and start the actual read. The read will unlock the page. */
 999                error = mapping->a_ops->readpage(filp, page);
1000
1001                if (!error) {
1002                        if (PageUptodate(page))
1003                                goto page_ok;
1004                        wait_on_page_locked(page);
1005                        if (PageUptodate(page))
1006                                goto page_ok;
                            /* readpage() reported success but the page never became up to date. */
1007                        error = -EIO;
1008                }
1009
1010                /* UHHUH! A synchronous read error occurred. Report it */
1011                desc->error = error;
1012                page_cache_release(page);
1013                break;
1014
1015no_cached_page:
1016                /*
1017                 * Ok, it wasn't cached, so we need to create a new
1018                 * page..
1019                 */
1020                if (!cached_page) {
1021                        cached_page = page_cache_alloc(mapping);
1022                        if (!cached_page) {
1023                                desc->error = -ENOMEM;
1024                                break;
1025                        }
1026                }
1027                error = add_to_page_cache_lru(cached_page, mapping, index);
1028                if (error) {
                            /* -EEXIST: a page showed up at this index meanwhile; look it up again. */
1029                        if (error == -EEXIST)
1030                                goto find_page;
1031                        desc->error = error;
1032                        break;
1033                }
                    /* The preallocated page is now in the cache; go read into it. */
1034                page = cached_page;
1035                cached_page = NULL;
1036                goto readpage;
1037        }
1038
            /* Publish the new file position and drop any unused preallocated page. */
1039        *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1040        if (cached_page)
1041                page_cache_release(cached_page);
1042        UPDATE_ATIME(inode);
1043}
1044
1045/*
1046 * Fault a userspace page into pagetables.  Return non-zero on a fault.
1047 *
1048 * FIXME: this assumes that two userspace pages are always sufficient.  That's
1049 * not true if PAGE_CACHE_SIZE > PAGE_SIZE.
1050 */
1051static inline int fault_in_pages_writeable(char *uaddr, int size)
1052{
1053        int ret;
1054
1055        /*
1056         * Writing zeroes into userspace here is OK, because we know that if
1057         * the zero gets there, we'll be overwriting it.
1058         */
1059        ret = __put_user(0, uaddr);
1060        if (ret == 0) {
1061                char *end = uaddr + size - 1;
1062
1063                /*
1064                 * If the page was already mapped, this will get a cache miss
1065                 * for sure, so try to avoid doing it.
1066                 */
1067                if (((unsigned long)uaddr & PAGE_MASK) !=
1068                                ((unsigned long)end & PAGE_MASK))
1069                        ret = __put_user(0, end);
1070        }
1071        return ret;
1072}
1073
1074static inline void fault_in_pages_readable(const char *uaddr, int size)
1075{
1076        volatile char c;
1077        int ret;
1078
1079        ret = __get_user(c, (char *)uaddr);
1080        if (ret == 0) {
1081                const char *end = uaddr + size - 1;
1082
1083                if (((unsigned long)uaddr & PAGE_MASK) !=
1084                                ((unsigned long)end & PAGE_MASK))
1085                        __get_user(c, (char *)end);
1086        }
1087}
1088
1089int file_read_actor(read_descriptor_t *desc, struct page *page,
1090                        unsigned long offset, unsigned long size)
1091{
1092        char *kaddr;
1093        unsigned long left, count = desc->count;
1094
1095        if (size > count)
1096                size = count;
1097
1098        /*
1099         * Faults on the destination of a read are common, so do it before
1100         * taking the kmap.
1101         */
1102        if (!fault_in_pages_writeable(desc->buf, size)) {
1103                kaddr = kmap_atomic(page, KM_USER0);
1104                left = __copy_to_user(desc->buf, kaddr + offset, size);
1105                kunmap_atomic(kaddr, KM_USER0);
1106                if (left == 0)
1107                        goto success;
1108        }
1109
1110        /* Do it the slow way */
1111        kaddr = kmap(page);
1112        left = __copy_to_user(desc->buf, kaddr + offset, size);
1113        kunmap(page);
1114
1115        if (left) {
1116                size -= left;
1117                desc->error = -EFAULT;
1118        }
1119success:
1120        desc->count = count - size;
1121        desc->written += size;
1122        desc->buf += size;
1123        return size;
1124}
1125
1126/*
1127 * This is the "read()" routine for all filesystems
1128 * that can use the page cache directly.
1129 */
1130static ssize_t
1131__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1132                unsigned long nr_segs, loff_t *ppos)
1133{
1134        struct file *filp = iocb->ki_filp;
1135        ssize_t retval;
1136        unsigned long seg;
1137        size_t count;
1138
1139        count = 0;
1140        for (seg = 0; seg < nr_segs; seg++) {
1141                const struct iovec *iv = &iov[seg];
1142
1143                /*
1144                 * If any segment has a negative length, or the cumulative
1145                 * length ever wraps negative then return -EINVAL.
1146                 */
1147                count += iv->iov_len;
1148                if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1149                        return -EINVAL;
1150                if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1151                        continue;
1152                if (seg == 0)
1153                        return -EFAULT;
1154                nr_segs = seg;
1155                break;
1156        }
1157
1158        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1159        if (filp->f_flags & O_DIRECT) {
1160                loff_t pos = *ppos, size;
1161                struct address_space *mapping;
1162                struct inode *inode;
1163
1164                mapping = filp->f_dentry->d_inode->i_mapping;
1165                inode = mapping->host;
1166                retval = 0;
1167                if (!count)
1168                        goto out; /* skip atime */
1169                size = inode->i_size;
1170                if (pos < size) {
1171                        if (pos + count > size) {
1172                                count = size - pos;
1173                                nr_segs = iov_shorten((struct iovec *)iov,
1174                                                        nr_segs, count);
1175                        }
1176                        retval = generic_file_direct_IO(READ, inode, 
1177                                        iov, pos, nr_segs);
1178                        if (retval > 0)
1179                                *ppos = pos + retval;
1180                }
1181                UPDATE_ATIME(filp->f_dentry->d_inode);
1182                goto out;
1183        }
1184
1185        retval = 0;
1186        if (count) {
1187                for (seg = 0; seg < nr_segs; seg++) {
1188                        read_descriptor_t desc;
1189
1190                        desc.written = 0;
1191                        desc.buf = iov[seg].iov_base;
1192                        desc.count = iov[seg].iov_len;
1193                        if (desc.count == 0)
1194                                continue;
1195                        desc.error = 0;
1196                        do_generic_file_read(filp,ppos,&desc,file_read_actor);
1197                        retval += desc.written;
1198                        if (!retval) {
1199                                retval = desc.error;
1200                                break;
1201                        }
1202                }
1203        }
1204out:
1205        return retval;
1206}
1207
1208ssize_t
1209generic_file_aio_read(struct kiocb *iocb, char *buf, size_t count, loff_t *ppos)
1210{
1211        struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1212
1213        return __generic_file_aio_read(iocb, &local_iov, 1, ppos);
1214}
1215
1216ssize_t
1217generic_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
1218{
1219        struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1220        struct kiocb kiocb;
1221        ssize_t ret;
1222
1223        init_sync_kiocb(&kiocb, filp);
1224        ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1225        if (-EIOCBQUEUED == ret)
1226                ret = wait_on_sync_kiocb(&kiocb);
1227        return ret;
1228}
1229
1230static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1231{
1232        ssize_t written;
1233        unsigned long count = desc->count;
1234        struct file *file = (struct file *) desc->buf;
1235
1236        if (size > count)
1237                size = count;
1238
1239        written = file->f_op->sendpage(file, page, offset,
1240                                       size, &file->f_pos, size<count);
1241        if (written < 0) {
1242                desc->error = written;
1243                written = 0;
1244        }
1245        desc->count = count - written;
1246        desc->written += written;
1247        return written;
1248}
1249
1250ssize_t generic_file_sendfile(struct file *out_file, struct file *in_file,
1251                              loff_t *ppos, size_t count)
1252{
1253        read_descriptor_t desc;
1254
1255        if (!count)
1256                return 0;
1257
1258        desc.written = 0;
1259        desc.count = count;
1260        desc.buf = (char *)out_file;
1261        desc.error = 0;
1262
1263        do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1264        if (desc.written)
1265                return desc.written;
1266        return desc.error;
1267}
1268
1269static ssize_t
1270do_readahead(struct file *file, unsigned long index, unsigned long nr)
1271{
1272        struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1273        unsigned long max;
1274        unsigned long active;
1275        unsigned long inactive;
1276
1277        if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1278                return -EINVAL;
1279
1280        /* Limit it to a sane percentage of the inactive list.. */
1281        get_zone_counts(&active, &inactive);
1282        max = inactive / 2;
1283        if (nr > max)
1284                nr = max;
1285
1286        do_page_cache_readahead(file, index, nr);
1287        return 0;
1288}
1289
1290asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1291{
1292        ssize_t ret;
1293        struct file *file;
1294
1295        ret = -EBADF;
1296        file = fget(fd);
1297        if (file) {
1298                if (file->f_mode & FMODE_READ) {
1299                        unsigned long start = offset >> PAGE_CACHE_SHIFT;
1300                        unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1301                        unsigned long len = end - start + 1;
1302                        ret = do_readahead(file, start, len);
1303                }
1304                fput(file);
1305        }
1306        return ret;
1307}
1308
1309/*
1310 * filemap_nopage() is invoked via the vma operations vector for a
1311 * mapped memory region to read in file data during a page fault.
1312 *
1313 * The goto's are kind of ugly, but this streamlines the normal case of having
1314 * it in the page cache, and handles the special cases reasonably without
1315 * having a lot of duplicated code.
1316 */
/*
 * @area:    the vma that faulted; area->vm_file supplies the file and mapping
 * @address: faulting address, converted below into a page-cache index
 * @unused:  not referenced by this function
 *
 * Returns the up-to-date page with a reference held, NOPAGE_OOM when
 * page_cache_read() fails with -ENOMEM, or NULL when the offset lies beyond
 * the file size (for a fault by the owning mm) or the page could not be read.
 */
1317
1318struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
1319{
1320        int error;
1321        struct file *file = area->vm_file;
1322        struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1323        struct inode *inode = mapping->host;
1324        struct page *page;
1325        unsigned long size, pgoff, endoff;
1326        int did_readahead;
1327
            /* pgoff: page index of the fault; endoff: page index just past the vma. */
1328        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1329        endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1330
1331retry_all:
1332        /*
1333         * An external ptracer can access pages that normally aren't
1334         * accessible..
1335         */
            /* size: file length in pages, rounded up. */
1336        size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1337        if ((pgoff >= size) && (area->vm_mm == current->mm))
1338                return NULL;
1339
1340        /*
1341         * The "size" of the file, as far as mmap is concerned, isn't bigger
1342         * than the mapping
1343         */
1344        if (size > endoff)
1345                size = endoff;
1346
1347        did_readahead = 0;
1348
1349        /*
1350         * The readahead code wants to be told about each and every page
1351         * so it can build and shrink its windows appropriately
1352         */
1353        if (VM_SequentialReadHint(area)) {
1354                did_readahead = 1;
1355                page_cache_readahead(area->vm_file, pgoff);
1356        }
1357
1358        /*
1359         * Unless the vma carries the random-read hint, read in the pages
1360         * around the faulting offset, provided that offset lies within
1361         * the (clamped) size computed above.
             */
1362        if ((pgoff < size) && !VM_RandomReadHint(area)) {
1363                did_readahead = 1;
1364                page_cache_readaround(file, pgoff);
1365        }
1366
1367        /*
1368         * Do we have something in the page cache already?
1369         */
1370retry_find:
1371        page = find_get_page(mapping, pgoff);
1372        if (!page) {
                    /* Report the miss to the readahead code at most once per attempt. */
1373                if (did_readahead) {
1374                        handle_ra_miss(file);
1375                        did_readahead = 0;
1376                }
1377                goto no_cached_page;
1378        }
1379
1380        /*
1381         * Ok, found a page in the page cache, now we need to check
1382         * that it's up-to-date.
1383         */
1384        if (!PageUptodate(page))
1385                goto page_not_uptodate;
1386
1387success:
1388        /*
1389         * Found the page and have a reference on it, need to check sharing
1390         * and possibly copy it over to another page..
1391         */
1392        mark_page_accessed(page);
1393        flush_page_to_ram(page);
1394        return page;
1395
1396no_cached_page:
1397        /*
1398         * We're only likely to ever get here if MADV_RANDOM is in
1399         * effect.
1400         */
1401        error = page_cache_read(file, pgoff);
1402
1403        /*
1404         * The page we want has now been added to the page cache.
1405         * In the unlikely event that someone removed it in the
1406         * meantime, we'll just come back here and read it again.
1407         */
1408        if (error >= 0)
1409                goto retry_find;
1410
1411        /*
1412         * An error return from page_cache_read can result if the
1413         * system is low on memory, or a problem occurs while trying
1414         * to schedule I/O.
1415         */
1416        if (error == -ENOMEM)
1417                return NOPAGE_OOM;
1418        return NULL;
1419
1420page_not_uptodate:
            /* The page was cached but not up to date: accounted as a major fault. */
1421        KERNEL_STAT_INC(pgmajfault);
1422        lock_page(page);
1423
1424        /* Did it get unhashed while we waited for it? */
1425        if (!page->mapping) {
1426                unlock_page(page);
1427                page_cache_release(page);
1428                goto retry_all;
1429        }
1430
1431        /* Did somebody else get it up-to-date? */
1432        if (PageUptodate(page)) {
1433                unlock_page(page);
1434                goto success;
1435        }
1436
            /* First read attempt; a zero return means the read was started. */
1437        if (!mapping->a_ops->readpage(file, page)) {
1438                wait_on_page_locked(page);
1439                if (PageUptodate(page))
1440                        goto success;
1441        }
1442
1443        /*
1444         * Umm, take care of errors if the page isn't up-to-date.
1445         * Try to re-read it _once_. We do this synchronously,
1446         * because there really aren't any performance issues here
1447         * and we need to check for errors.
1448         */
1449        lock_page(page);
1450
1451        /* Somebody truncated the page on us? */
1452        if (!page->mapping) {
1453                unlock_page(page);
1454                page_cache_release(page);
1455                goto retry_all;
1456        }
1457
1458        /* Somebody else successfully read it in? */
1459        if (PageUptodate(page)) {
1460                unlock_page(page);
1461                goto success;
1462        }
            /* Clear the error flag left by the failed attempt before retrying. */
1463        ClearPageError(page);
1464        if (!mapping->a_ops->readpage(file, page)) {
1465                wait_on_page_locked(page);
1466                if (PageUptodate(page))
1467                        goto success;
1468        }
1469
1470        /*
1471         * Things didn't work out. Return zero to tell the
1472         * mm layer so, possibly freeing the page cache page first.
1473         */
1474        page_cache_release(page);
1475        return NULL;
1476}
1477
/*
 * VM operations installed by generic_file_mmap(): only a nopage handler
 * (filemap_nopage) is provided.
 */
1478static struct vm_operations_struct generic_file_vm_ops = {
1479        .nopage         = filemap_nopage,
1480};
1481
1482/* This is used for a general mmap of a disk file */
1483
1484int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1485{
1486        struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1487        struct inode *inode = mapping->host;
1488
1489        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1490                if (!mapping->a_ops->writepage)
1491                        return -EINVAL;
1492        }
1493        if (!mapping->a_ops->readpage)
1494                return -ENOEXEC;
1495        UPDATE_ATIME(inode);
1496        vma->vm_ops = &generic_file_vm_ops;
1497        return 0;
1498}
1499
1500static inline struct page *__read_cache_page(struct address_space *mapping,
1501                                unsigned long index,
1502                                int (*filler)(void *,struct page*),
1503                                void *data)
1504{
1505        struct page *page, *cached_page = NULL;
1506        int err;
1507repeat:
1508        page = find_get_page(mapping, index);
1509        if (!page) {
1510                if (!cached_page) {
1511                        cached_page = page_cache_alloc(mapping);
1512                        if (!cached_page)
1513                                return ERR_PTR(-ENOMEM);
1514                }
1515                err = add_to_page_cache_lru(cached_page, mapping, index);
1516                if (err == -EEXIST)
1517                        goto repeat;
1518                if (err < 0) {
1519                        /* Presumably ENOMEM for radix tree node */
1520                        page_cache_release(cached_page);
1521                        return ERR_PTR(err);
1522                }
1523                page = cached_page;
1524                cached_page = NULL;
1525                err = filler(data, page);
1526                if (err < 0) {
1527                        page_cache_release(page);
1528                        page = ERR_PTR(err);
1529                }
1530        }
1531        if (cached_page)
1532                page_cache_release(cached_page);
1533        return page;
1534}
1535
1536/*
1537 * Read into the page cache. If a page already exists,
1538 * and PageUptodate() is not set, try to fill the page.
1539 */
1540struct page *read_cache_page(struct address_space *mapping,
1541                                unsigned long index,
1542                                int (*filler)(void *,struct page*),
1543                                void *data)
1544{
1545        struct page *page;
1546        int err;
1547
1548retry:
1549        page = __read_cache_page(mapping, index, filler, data);
1550        if (IS_ERR(page))
1551                goto out;
1552        mark_page_accessed(page);
1553        if (PageUptodate(page))
1554                goto out;
1555
1556        lock_page(page);
1557        if (!page->mapping) {
1558                unlock_page(page);
1559                page_cache_release(page);
1560                goto retry;
1561        }
1562        if (PageUptodate(page)) {
1563                unlock_page(page);
1564                goto out;
1565        }
1566        err = filler(data, page);
1567        if (err < 0) {
1568                page_cache_release(page);
1569                page = ERR_PTR(err);
1570        }
1571 out:
1572        return page;
1573}
1574
1575/*
1576 * If the page was newly created, increment its refcount and add it to the
1577 * caller's lru-buffering pagevec.  This function is specifically for
1578 * generic_file_write().
1579 */
1580static inline struct page *
1581__grab_cache_page(struct address_space *mapping, unsigned long index,
1582                        struct page **cached_page, struct pagevec *lru_pvec)
1583{
1584        int err;
1585        struct page *page;
1586repeat:
1587        page = find_lock_page(mapping, index);
1588        if (!page) {
1589                if (!*cached_page) {
1590                        *cached_page = page_cache_alloc(mapping);
1591                        if (!*cached_page)
1592                                return NULL;
1593                }
1594                err = add_to_page_cache(*cached_page, mapping, index);
1595                if (err == -EEXIST)
1596                        goto repeat;
1597                if (err == 0) {
1598                        page = *cached_page;
1599                        page_cache_get(page);
1600                        if (!pagevec_add(lru_pvec, page))
1601                                __pagevec_lru_add(lru_pvec);
1602                        *cached_page = NULL;
1603                }
1604        }
1605        return page;
1606}
1607
1608inline void remove_suid(struct dentry *dentry)
1609{
1610        struct iattr newattrs;
1611        struct inode *inode = dentry->d_inode;
1612        unsigned int mode = inode->i_mode & (S_ISUID|S_ISGID|S_IXGRP);
1613
1614        if (!(mode & S_IXGRP))
1615                mode &= S_ISUID;
1616
1617        /* was any of the uid bits set? */
1618        if (mode && !capable(CAP_FSETID)) {
1619                newattrs.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
1620                notify_change(dentry, &newattrs);
1621        }
1622}
1623
1624static inline int
1625filemap_copy_from_user(struct page *page, unsigned long offset,
1626                        const char *buf, unsigned bytes)
1627{
1628        char *kaddr;
1629        int left;
1630
1631        kaddr = kmap_atomic(page, KM_USER0);
1632        left = __copy_from_user(kaddr + offset, buf, bytes);
1633        kunmap_atomic(kaddr, KM_USER0);
1634
1635        if (left != 0) {
1636                /* Do it the slow way */
1637                kaddr = kmap(page);
1638                left = __copy_from_user(kaddr + offset, buf, bytes);
1639                kunmap(page);
1640        }
1641        return left;
1642}
1643
1644static inline int
1645__filemap_copy_from_user_iovec(char *vaddr, 
1646                        const struct iovec *iov, size_t base, unsigned bytes)
1647{
1648        int left = 0;
1649
1650        while (bytes) {
1651                char *buf = iov->iov_base + base;
1652                int copy = min(bytes, iov->iov_len - base);
1653                base = 0;
1654                if ((left = __copy_from_user(vaddr, buf, copy)))
1655                        break;
1656                bytes -= copy;
1657                vaddr += copy;
1658                iov++;
1659        }
1660        return left;
1661}
1662
1663static inline int
1664filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
1665                        const struct iovec *iov, size_t base, unsigned bytes)
1666{
1667        char *kaddr;
1668        int left;
1669
1670        kaddr = kmap_atomic(page, KM_USER0);
1671        left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes);
1672        kunmap_atomic(kaddr, KM_USER0);
1673        if (left != 0) {
1674                kaddr = kmap(page);
1675                left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes);
1676                kunmap(page);
1677        }
1678        return left;
1679}
1680
1681static inline void
1682filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, unsigned bytes)
1683{
1684        const struct iovec *iov = *iovp;
1685        size_t base = *basep;
1686
1687        while (bytes) {
1688                int copy = min(bytes, iov->iov_len - base);
1689                bytes -= copy;
1690                base += copy;
1691                if (iov->iov_len == base) {
1692                        iov++;
1693                        base = 0;
1694                }
1695        }
1696        *iovp = iov;
1697        *basep = base;
1698}
1699
1700
1701/*
1702 * Write to a file through the page cache. 
1703 *
1704 * We put everything into the page cache prior to writing it. This is not a
1705 * problem when writing full pages. With partial pages, however, we first have
1706 * to read the data into the cache, then dirty the page, and finally schedule
1707 * it for writing by marking it dirty.
1708 *                                                      okir@monad.swb.de
1709 */
1710ssize_t
1711generic_file_write_nolock(struct file *file, const struct iovec *iov,
1712                                unsigned long nr_segs, loff_t *ppos)
1713{
1714        struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
1715        struct address_space_operations *a_ops = mapping->a_ops;
1716        size_t ocount;          /* original count */
1717        size_t count;           /* after file limit checks */
1718        struct inode    *inode = mapping->host;
1719        unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1720        long            status = 0;
1721        loff_t          pos;
1722        struct page     *page;
1723        struct page     *cached_page = NULL;
1724        ssize_t         written;
1725        int             err;
1726        unsigned        bytes;
1727        time_t          time_now;
1728        struct pagevec  lru_pvec;
1729        const struct iovec *cur_iov = iov; /* current iovec */
1730        unsigned        iov_base = 0;      /* offset in the current iovec */
1731        unsigned long   seg;
1732        char            *buf;
1733
1734        ocount = 0;
1735        for (seg = 0; seg < nr_segs; seg++) {
1736                const struct iovec *iv = &iov[seg];
1737
1738                /*
1739                 * If any segment has a negative length, or the cumulative
1740                 * length ever wraps negative then return -EINVAL.
1741                 */
1742                ocount += iv->iov_len;
1743                if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
1744                        return -EINVAL;
1745                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1746                        continue;
1747                if (seg == 0)
1748                        return -EFAULT;
1749                nr_segs = seg;
1750                break;
1751        }
1752        count = ocount;
1753
1754        pos = *ppos;
1755        if (unlikely(pos < 0))
1756                return -EINVAL;
1757
1758        pagevec_init(&lru_pvec);
1759
1760        if (unlikely(file->f_error)) {
1761                err = file->f_error;
1762                file->f_error = 0;
1763                goto out;
1764        }
1765
1766        written = 0;
1767
1768        /* FIXME: this is for backwards compatibility with 2.4 */
1769        if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
1770                pos = inode->i_size;
1771
1772        /*
1773         * Check whether we've reached the file size limit.
1774         */
1775        if (unlikely(limit != RLIM_INFINITY)) {
1776                if (pos >= limit) {
1777                        send_sig(SIGXFSZ, current, 0);
1778                        err = -EFBIG;
1779                        goto out;
1780                }
1781                if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
1782                        /* send_sig(SIGXFSZ, current, 0); */
1783                        count = limit - (u32)pos;
1784                }
1785        }
1786
1787        /*
1788         * LFS rule
1789         */
1790        if (unlikely(pos + count > MAX_NON_LFS &&
1791                                !(file->f_flags & O_LARGEFILE))) {
1792                if (pos >= MAX_NON_LFS) {
1793                        send_sig(SIGXFSZ, current, 0);
1794                        err = -EFBIG;
1795                        goto out;
1796                }
1797                if (count > MAX_NON_LFS - (u32)pos) {
1798                        /* send_sig(SIGXFSZ, current, 0); */
1799                        count = MAX_NON_LFS - (u32)pos;
1800                }
1801        }
1802
1803        /*
1804         * Are we about to exceed the fs block limit ?
1805         *
1806         * If we have written data it becomes a short write.  If we have
1807         * exceeded without writing data we send a signal and return EFBIG.
1808         * Linus frestrict idea will clean these up nicely..
1809         */
1810        if (likely(!S_ISBLK(inode->i_mode))) {
1811                if (unlikely(pos >= inode->i_sb->s_maxbytes)) {
1812                        if (count || pos > inode->i_sb->s_maxbytes) {
1813                                send_sig(SIGXFSZ, current, 0);
1814                                err = -EFBIG;
1815                                goto out;
1816                        }
1817                        /* zero-length writes at ->s_maxbytes are OK */
1818                }
1819
1820                if (unlikely(pos + count > inode->i_sb->s_maxbytes))
1821                        count = inode->i_sb->s_maxbytes - pos;
1822        } else {
1823                if (bdev_read_only(inode->i_bdev)) {
1824                        err = -EPERM;
1825                        goto out;
1826                }
1827                if (pos >= inode->i_size) {
1828                        if (count || pos > inode->i_size) {
1829                                err = -ENOSPC;
1830                                goto out;
1831                        }
1832                }
1833
1834                if (pos + count > inode->i_size)
1835                        count = inode->i_size - pos;
1836        }
1837
1838        err = 0;
1839        if (count == 0)
1840                goto out;
1841
1842        remove_suid(file->f_dentry);
1843        time_now = CURRENT_TIME;
1844        if (inode->i_ctime != time_now || inode->i_mtime != time_now) {
1845                inode->i_ctime = time_now;
1846                inode->i_mtime = time_now;
1847                mark_inode_dirty_sync(inode);
1848        }
1849
1850        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1851        if (unlikely(file->f_flags & O_DIRECT)) {
1852                if (count != ocount)
1853                        nr_segs = iov_shorten((struct iovec *)iov,
1854                                                nr_segs, count);
1855                written = generic_file_direct_IO(WRITE, inode, 
1856                                        iov, pos, nr_segs);
1857                if (written > 0) {
1858                        loff_t end = pos + written;
1859                        if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
1860                                inode->i_size = end;
1861                                mark_inode_dirty(inode);
1862                        }
1863                        *ppos = end;
1864                }
1865                /*
1866                 * Sync the fs metadata but not the minor inode changes and
1867                 * of course not the data as we did direct DMA for the IO.
1868                 */
1869                if (written >= 0 && file->f_flags & O_SYNC)
1870                        status = generic_osync_inode(inode, OSYNC_METADATA);
1871                goto out_status;
1872        }
1873
1874        buf = iov->iov_base;
1875        do {
1876                unsigned long index;
1877                unsigned long offset;
1878                long page_fault;
1879
1880                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1881                index = pos >> PAGE_CACHE_SHIFT;
1882                bytes = PAGE_CACHE_SIZE - offset;
1883                if (bytes > count)
1884                        bytes = count;
1885
1886                /*
1887                 * Bring in the user page that we will copy from _first_.
1888                 * Otherwise there's a nasty deadlock on copying from the
1889                 * same page as we're writing to, without it being marked
1890                 * up-to-date.
1891                 */
1892                fault_in_pages_readable(buf, bytes);
1893
1894                page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
1895                if (!page) {
1896                        status = -ENOMEM;
1897                        break;
1898                }
1899
1900                status = a_ops->prepare_write(file, page, offset, offset+bytes);
1901                if (unlikely(status)) {
1902                        /*
1903                         * prepare_write() may have instantiated a few blocks
1904                         * outside i_size.  Trim these off again.
1905                         */
1906                        unlock_page(page);
1907                        page_cache_release(page);
1908                        if (pos + bytes > inode->i_size)
1909                                vmtruncate(inode, inode->i_size);
1910                        break;
1911                }
1912                if (likely(nr_segs == 1))
1913                        page_fault = filemap_copy_from_user(page, offset,
1914                                                        buf, bytes);
1915                else
1916                        page_fault = filemap_copy_from_user_iovec(page, offset,
1917                                                cur_iov, iov_base, bytes);
1918                flush_dcache_page(page);
1919                status = a_ops->commit_write(file, page, offset, offset+bytes);
1920                if (unlikely(page_fault)) {
1921                        status = -EFAULT;
1922                } else {
1923                        if (!status)
1924                                status = bytes;
1925
1926                        if (status >= 0) {
1927                                written += status;
1928                                count -= status;
1929                                pos += status;
1930                                buf += status;
1931                                if (unlikely(nr_segs > 1))
1932                                        filemap_set_next_iovec(&cur_iov,
1933                                                        &iov_base, status);
1934                        }
1935                }
1936                if (!PageReferenced(page))
1937                        SetPageReferenced(page);
1938                unlock_page(page);
1939                page_cache_release(page);
1940                if (status < 0)
1941                        break;
1942                balance_dirty_pages_ratelimited(mapping);
1943        } while (count);
1944        *ppos = pos;
1945
1946        if (cached_page)
1947                page_cache_release(cached_page);
1948
1949        /*
1950         * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
1951         */
1952        if (status >= 0) {
1953                if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
1954                        status = generic_osync_inode(inode,
1955                                        OSYNC_METADATA|OSYNC_DATA);
1956        }
1957        
1958out_status:     
1959        err = written ? written : status;
1960out:
1961        pagevec_lru_add(&lru_pvec);
1962        return err;
1963}
1964
1965ssize_t generic_file_write(struct file *file, const char *buf,
1966                           size_t count, loff_t *ppos)
1967{
1968        struct inode    *inode = file->f_dentry->d_inode->i_mapping->host;
1969        int             err;
1970        struct iovec local_iov = { .iov_base = (void *)buf, .iov_len = count };
1971
1972        down(&inode->i_sem);
1973        err = generic_file_write_nolock(file, &local_iov, 1, ppos);
1974        up(&inode->i_sem);
1975
1976        return err;
1977}
1978
1979ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
1980                        unsigned long nr_segs, loff_t *ppos)
1981{
1982        struct kiocb kiocb;
1983        ssize_t ret;
1984
1985        init_sync_kiocb(&kiocb, filp);
1986        ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
1987        if (-EIOCBQUEUED == ret)
1988                ret = wait_on_sync_kiocb(&kiocb);
1989        return ret;
1990}
1991
1992ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
1993                        unsigned long nr_segs, loff_t * ppos) 
1994{
1995        struct inode *inode = file->f_dentry->d_inode;
1996        ssize_t ret;
1997
1998        down(&inode->i_sem);
1999        ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
2000        up(&inode->i_sem);
2001        return ret;
2002}
2003
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.