linux-old/mm/filemap.c
<<
>>
Prefs
   1/*
   2 *      linux/mm/filemap.c
   3 *
   4 * Copyright (C) 1994, 1995  Linus Torvalds
   5 */
   6
   7/*
   8 * This file handles the generic file mmap semantics used by
   9 * most "normal" filesystems (but you don't /have/ to use this:
  10 * the NFS filesystem used to do this differently, for example)
  11 */
  12#include <linux/malloc.h>
  13#include <linux/shm.h>
  14#include <linux/mman.h>
  15#include <linux/locks.h>
  16#include <linux/pagemap.h>
  17#include <linux/swap.h>
  18#include <linux/smp_lock.h>
  19#include <linux/blkdev.h>
  20#include <linux/file.h>
  21#include <linux/swapctl.h>
  22#include <linux/init.h>
  23
  24#include <asm/pgtable.h>
  25#include <asm/uaccess.h>
  26
  27/*
  28 * Shared mappings implemented 30.11.1994. It's not fully working yet,
  29 * though.
  30 *
  31 * Shared mappings now work. 15.8.1995  Bruno.
  32 */
  33
  /* NOTE(review): presumably the number of pages currently held in the page cache; nothing in this part of the file updates it - confirm. */
  34unsigned long page_cache_size = 0;
  /* NOTE(review): by their names, the bit width and index mask of page_hash_table; confirm against page_hash(). */
  35unsigned int page_hash_bits, page_hash_mask;
  /* Array of page pointers; presumably the table that page_hash() indexes into - confirm. */
  36struct page **page_hash_table;
  37
  38static inline int sync_page(struct page *page)
  39{
  40        struct inode *inode = page->inode;
  41
  42        if (inode && inode->i_op && inode->i_op->sync_page)
  43                return inode->i_op->sync_page(page);
  44        run_task_queue(&tq_disk);
  45        return 0;
  46}
  47
  48/*
  49 * Invalidate the pages of an inode, removing all pages that aren't
  50 * locked down (those are sure to be up-to-date anyway, so we shouldn't
  51 * invalidate them).
  52 */
  53void invalidate_inode_pages(struct inode * inode)
  54{
  55        struct page ** p;
  56        struct page * page;
  57
  58        p = &inode->i_pages;
  59        while ((page = *p) != NULL) {
  60                if (PageLocked(page)) {
  61                        p = &page->next;
  62                        continue;
  63                }
  64                inode->i_nrpages--;
  65                if ((*p = page->next) != NULL)
  66                        (*p)->prev = page->prev;
  67                page->next = NULL;
  68                page->prev = NULL;
  69                remove_page_from_hash_queue(page);
  70                page->inode = NULL;
  71                page_cache_release(page);
  72                continue;
  73        }
  74}
  75
  76/*
  77 * Truncate the page cache at a set offset, removing the pages
  78 * that are beyond that offset (and zeroing out partial pages).
  79 */
  80void truncate_inode_pages(struct inode * inode, unsigned long start)
  81{
  82        struct page ** p;
  83        struct page * page;
  84
  85repeat:
  86        p = &inode->i_pages;
  87        while ((page = *p) != NULL) {
  88                unsigned long offset = page->offset;
  89
  90                /* page wholly truncated - free it */
  91                if (offset >= start) {
  92                        if (PageLocked(page)) {
  93                                wait_on_page(page);
  94                                goto repeat;
  95                        }
  96                        inode->i_nrpages--;
  97                        if ((*p = page->next) != NULL)
  98                                (*p)->prev = page->prev;
  99                        page->next = NULL;
 100                        page->prev = NULL;
 101                        remove_page_from_hash_queue(page);
 102                        page->inode = NULL;
 103                        page_cache_release(page);
 104                        continue;
 105                }
 106                p = &page->next;
 107                offset = start - offset;
 108                /* partial truncate, clear end of page */
 109                if (offset < PAGE_CACHE_SIZE) {
 110                        unsigned long address = page_address(page);
 111                        memset((void *) (offset + address), 0, PAGE_CACHE_SIZE - offset);
 112                        flush_page_to_ram(address);
 113                }
 114        }
 115}
 116
 117/*
 118 * Remove a page from the page cache and free it.
 119 */
 120void remove_inode_page(struct page *page)
 121{
 122        remove_page_from_hash_queue(page);
 123        remove_page_from_inode_queue(page);
 124        page_cache_release(page);
 125}
 126
 127int shrink_mmap(int priority, int gfp_mask)
 128{
 129        static unsigned long clock = 0;
 130        unsigned long limit = num_physpages;
 131        struct page * page;
 132        int count;
 133
 134        /* Make sure we scan all pages twice at priority 0. */
 135        count = limit / priority;
 136
 137 refresh_clock:
 138        page = mem_map + clock;
 139        do {
 140                int referenced;
 141
 142                if (current->need_resched) {
 143                        current->state = TASK_RUNNING;
 144                        schedule();
 145                        goto refresh_clock;
 146                }
 147                
 148                /* This works even in the presence of PageSkip because
 149                 * the first two entries at the beginning of a hole will
 150                 * be marked, not just the first.
 151                 */
 152                page++;
 153                clock++;
 154                if (clock >= max_mapnr) {
 155                        clock = 0;
 156                        page = mem_map;
 157                }
 158                if (PageSkip(page)) {
 159                        /* next_hash is overloaded for PageSkip */
 160                        page = page->next_hash;
 161                        clock = page - mem_map;
 162                }
 163                
 164                count--;
 165
 166                /* We can't free pages unless there's just one user */
 167                if (atomic_read(&page->count) != 1)
 168                        continue;
 169
 170                referenced = test_and_clear_bit(PG_referenced, &page->flags);
 171
 172                if (PageLocked(page))
 173                        continue;
 174
 175                if ((gfp_mask & __GFP_DMA) && !PageDMA(page)) {
 176                        count++;
 177                        continue;
 178                }
 179
 180                /*
 181                 * Is it a page swap page? If so, we want to
 182                 * drop it if it is no longer used, even if it
 183                 * were to be marked referenced..
 184                 */
 185                if (PageSwapCache(page)) {
 186                        if (referenced && swap_count(page->offset) != 1)
 187                                continue;
 188                        delete_from_swap_cache(page);
 189                        return 1;
 190                }       
 191
 192                if (referenced)
 193                        continue;
 194
 195                /* Is it a buffer page? */
 196                if (page->buffers) {
 197                        if (buffer_under_min())
 198                                continue;
 199                        /*
 200                         * We can sleep if we need to do some write
 201                         * throttling.
 202                         */
 203
 204                        if (!try_to_free_buffers(page, gfp_mask))
 205                                goto refresh_clock;
 206                        return 1;
 207                }
 208
 209                /* is it a page-cache page? */
 210                if (page->inode) {
 211                        if (pgcache_under_min())
 212                                continue;
 213                        remove_inode_page(page);
 214                        return 1;
 215                }
 216        } while (count > 0);
 217        return 0;
 218}
 219
 220/*
 221 * Update a page cache copy, when we're doing a "write()" system call
 222 * See also "update_vm_cache()".
 223 *
 224 * This function is conditional in that it checks whether the original
 225 * source of the data is the same as the ultimate destination, and
 226 * aborts the update if so.  
 227 *
 228 * The "source_address" is the virtual address of the original location
 229 * of the data we are injecting.  For writes from user mode, it is the
 230 * user VA.  However, for filemap_sync writes, "source_address", it is
 231 * the page cache address.  In both cases, "buf" points to the copy we
 232 * have already made in kernel space and we use that pointer for the
 233 * transfer.  source_address just allows us to detect an update_vm_cache
 234 * which is being sourced from the copy of the data already in the page
 235 * cache.  
 236 * 
 237 * This prevents munmap() and msync() from stomping all over shared
 238 * memory maps.  --sct
 239 */
 240
 241void update_vm_cache_conditional(struct inode * inode, unsigned long pos, const char * buf, int count, unsigned long source_address)
 242{
 243        unsigned long offset, len;
 244
 245        offset = (pos & ~PAGE_CACHE_MASK);
 246        pos = pos & PAGE_CACHE_MASK;
 247        len = PAGE_CACHE_SIZE - offset;
 248        do {
 249                struct page * page;
 250
 251                if (len > count)
 252                        len = count;
 253                page = find_page(inode, pos);
 254                if (page) {
 255                        char *dest = (char*) (offset + page_address(page));
 256
 257                        if ((unsigned long)dest != source_address 
 258                                || !segment_eq(get_fs(), KERNEL_DS)) {
 259                                wait_on_page(page);
 260                                memcpy(dest, buf, len);
 261                                flush_dcache_page(page_address(page));
 262                        }
 263                        page_cache_release(page);
 264                }
 265                count -= len;
 266                buf += len;
 267                len = PAGE_CACHE_SIZE;
 268                offset = 0;
 269                pos += PAGE_CACHE_SIZE;
 270        } while (count);
 271}
 272
 273void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
 274{
 275        update_vm_cache_conditional(inode, pos, buf, count, 0);
 276}
 277
 278
 279static inline void add_to_page_cache(struct page * page,
 280        struct inode * inode, unsigned long offset,
 281        struct page **hash)
 282{
 283        atomic_inc(&page->count);
 284        page->flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
 285        page->offset = offset;
 286        add_page_to_inode_queue(inode, page);
 287        __add_page_to_hash_queue(page, hash);
 288}
 289
 290/*
 291 * Try to read ahead in the file. "page_cache" is a potentially free page
 292 * that we could use for the cache (if it is 0 we can try to create one,
 293 * this is all overlapped with the IO on the previous page finishing anyway)
 294 */
 295static unsigned long try_to_read_ahead(struct file * file,
 296                                unsigned long offset, unsigned long page_cache)
 297{
 298        struct inode *inode = file->f_dentry->d_inode;
 299        struct page * page;
 300        struct page ** hash;
 301
 302        offset &= PAGE_CACHE_MASK;
 303        switch (page_cache) {
 304        case 0:
 305                page_cache = page_cache_alloc();
 306                if (!page_cache)
 307                        break;
 308        default:
 309                if (offset >= inode->i_size)
 310                        break;
 311                hash = page_hash(inode, offset);
 312                page = __find_page(inode, offset, *hash);
 313                if (!page) {
 314                        /*
 315                         * Ok, add the new page to the hash-queues...
 316                         */
 317                        page = page_cache_entry(page_cache);
 318                        add_to_page_cache(page, inode, offset, hash);
 319                        inode->i_op->readpage(file, page);
 320                        page_cache = 0;
 321                }
 322                page_cache_release(page);
 323        }
 324        return page_cache;
 325}
 326
 327/* 
 328 * Wait for IO to complete on a locked page.
 329 *
 330 * This must be called with the caller "holding" the page,
 331 * ie with increased "page->count" so that the page won't
 332 * go away during the wait..
 333 */
 334void __wait_on_page(struct page *page)
 335{
 336        struct task_struct *tsk = current;
 337        struct wait_queue wait;
 338
 339        wait.task = tsk;
 340        add_wait_queue(&page->wait, &wait);
 341repeat:
 342        tsk->state = TASK_UNINTERRUPTIBLE;
 343        sync_page(page);
 344        if (PageLocked(page)) {
 345                schedule();
 346                goto repeat;
 347        }
 348        tsk->state = TASK_RUNNING;
 349        remove_wait_queue(&page->wait, &wait);
 350}
 351
 352#if 0
 353#define PROFILE_READAHEAD
 354#define DEBUG_READAHEAD
 355#endif
 356
 357/*
 358 * Read-ahead profiling information
 359 * --------------------------------
 360 * Every PROFILE_MAXREADCOUNT, the following information is written 
 361 * to the syslog:
 362 *   Percentage of asynchronous read-ahead.
 363 *   Average of read-ahead fields context value.
 364 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
 365 * to the syslog.
 366 */
 367
 368#ifdef PROFILE_READAHEAD
 369
 370#define PROFILE_MAXREADCOUNT 1000
 371
 372static unsigned long total_reada;
 373static unsigned long total_async;
 374static unsigned long total_ramax;
 375static unsigned long total_ralen;
 376static unsigned long total_rawin;
 377
 378static void profile_readahead(int async, struct file *filp)
 379{
 380        unsigned long flags;
 381
 382        ++total_reada;
 383        if (async)
 384                ++total_async;
 385
 386        total_ramax     += filp->f_ramax;
 387        total_ralen     += filp->f_ralen;
 388        total_rawin     += filp->f_rawin;
 389
 390        if (total_reada > PROFILE_MAXREADCOUNT) {
 391                save_flags(flags);
 392                cli();
 393                if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 394                        restore_flags(flags);
 395                        return;
 396                }
 397
 398                printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 399                        total_ramax/total_reada,
 400                        total_ralen/total_reada,
 401                        total_rawin/total_reada,
 402                        (total_async*100)/total_reada);
 403#ifdef DEBUG_READAHEAD
 404                printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
 405                        filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 406#endif
 407
 408                total_reada     = 0;
 409                total_async     = 0;
 410                total_ramax     = 0;
 411                total_ralen     = 0;
 412                total_rawin     = 0;
 413
 414                restore_flags(flags);
 415        }
 416}
 417#endif  /* defined PROFILE_READAHEAD */
 418
 419/*
 420 * Read-ahead context:
 421 * -------------------
 422 * The read ahead context fields of the "struct file" are the following:
 423 * - f_raend : position of the first byte after the last page we tried to
 424 *             read ahead.
 425 * - f_ramax : current read-ahead maximum size.
 426 * - f_ralen : length of the current IO read block we tried to read-ahead.
 427 * - f_rawin : length of the current read-ahead window.
 428 *             if last read-ahead was synchronous then
 429 *                  f_rawin = f_ralen
 430 *             otherwise (was asynchronous)
 431 *                  f_rawin = previous value of f_ralen + f_ralen
 432 *
 433 * Read-ahead limits:
 434 * ------------------
 435 * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 436 * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 437 *
 438 * Synchronous read-ahead benefits:
 439 * --------------------------------
 440 * Using a reasonable IO transfer length from peripheral devices increases
 441 * system performance.
 442 * Reasonable means, in this context, not too large but not too small.
 443 * The actual maximum value is:
 444 *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 445 *      and 32k if defined (4K page size assumed).
 446 *
 447 * Asynchronous read-ahead benefits:
 448 * ---------------------------------
 449 * Overlapping the next read request with user process execution increases
 450 * system performance.
 451 *
 452 * Read-ahead risks:
 453 * -----------------
 454 * We have to guess which further data will be needed by the user process.
 455 * If that data is often not really needed, read-ahead is bad for system
 456 * performance.
 457 * However, we know that files are often accessed sequentially by
 458 * application programs, and it seems possible to have a good strategy
 459 * for that guessing.
 460 * We only try to read ahead files that seem to be read sequentially.
 461 *
 462 * Asynchronous read-ahead risks:
 463 * ------------------------------
 464 * In order to maximize overlapping, we must start some asynchronous read 
 465 * request from the device, as soon as possible.
 466 * We must be very careful about:
 467 * - The number of effective pending IO read requests.
 468 *   ONE seems to be the only reasonable value.
 469 * - The total memory pool usage for the file access stream.
 470 *   This maximum memory usage is implicitly 2 IO read chunks:
 471 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 152K if CONFIG_READA_SMALL is undefined,
 472 *   64k if defined (4K page size assumed).
 473 */
 474
 475static inline int get_max_readahead(struct inode * inode)
 476{
 477        if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
 478                return MAX_READAHEAD;
 479        return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 480}
 481
 /*
  * Decide whether to read ahead around the current page, and do it.
  *
  *   reada_ok   - nonzero when the caller considers the access pattern
  *                sequential; asynchronous read-ahead is only tried then.
  *                It is set to 2 locally once an asynchronous pass is chosen.
  *   filp       - file whose f_raend/f_ralen/f_ramax/f_rawin context is
  *                read and updated.
  *   inode      - supplies i_size (read-ahead is not started at or past it)
  *                and the per-device cap from get_max_readahead().
  *   ppos       - position of the current page (the caller visible in this
  *                file passes pos & PAGE_CACHE_MASK).
  *   page       - the current page; PageLocked(page) selects between the
  *                synchronous and the asynchronous branch.
  *   page_cache - spare page (or 0) threaded through try_to_read_ahead().
  *
  * Returns the page_cache value left over after the last
  * try_to_read_ahead() call (unchanged if none was made).
  */
 482static inline unsigned long generic_file_readahead(int reada_ok,
 483        struct file * filp, struct inode * inode,
 484        unsigned long ppos, struct page * page, unsigned long page_cache)
 485{
 486        unsigned long max_ahead, ahead;
 487        unsigned long raend;
 488        int max_readahead = get_max_readahead(inode);   /* local cap; shadows the max_readahead[] table indexed in get_max_readahead() */
 489
 490        raend = filp->f_raend & PAGE_CACHE_MASK;
 491        max_ahead = 0;  /* stays 0 unless one of the branches below decides to read ahead */
 492
 493/*
 494 * The current page is locked.
 495 * If the current position is inside the previous read IO request, do not
 496 * try to reread previously read ahead pages.
 497 * Otherwise decide or not to read ahead some pages synchronously.
 498 * If we are not going to read ahead, set the read ahead context for this 
 499 * page only.
 500 */
 501        if (PageLocked(page)) {
 502                if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
 503                        raend = ppos;
 504                        if (raend < inode->i_size)
 505                                max_ahead = filp->f_ramax;
 506                        filp->f_rawin = 0;
 507                        filp->f_ralen = PAGE_CACHE_SIZE;
 508                        if (!max_ahead) {
 509                                filp->f_raend  = ppos + filp->f_ralen;
 510                                filp->f_rawin += filp->f_ralen;
 511                        }
 512                }
 513        }
 514/*
 515 * The current page is not locked.
 516 * If we were reading ahead and,
 517 * if the current max read ahead size is not zero and,
 518 * if the current position is inside the last read-ahead IO request,
 519 *   it is the moment to try to read ahead asynchronously.
 520 * We will later force unplug device in order to force asynchronous read IO.
 521 */
 522        else if (reada_ok && filp->f_ramax && raend >= PAGE_CACHE_SIZE &&
 523                 ppos <= raend && ppos + filp->f_ralen >= raend) {
 524/*
 525 * Add ONE page to max_ahead in order to try to have about the same IO max size
 526 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 527 * Compute the position of the last page we have tried to read in order to 
 528 * begin to read ahead just at the next page.
 529 */
 530                raend -= PAGE_CACHE_SIZE;
 531                if (raend < inode->i_size)
 532                        max_ahead = filp->f_ramax + PAGE_CACHE_SIZE;
 533
 534                if (max_ahead) {
 535                        filp->f_rawin = filp->f_ralen;
 536                        filp->f_ralen = 0;
 537                        reada_ok      = 2;      /* 2 marks this pass as asynchronous; tested below */
 538                }
 539        }
 540/*
 541 * Try to read ahead pages.
 542 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 543 * scheduler, will work enough for us to avoid too bad actuals IO requests.
 544 */
 545        ahead = 0;
 546        while (ahead < max_ahead) {
 547                ahead += PAGE_CACHE_SIZE;
 548                page_cache = try_to_read_ahead(filp, raend + ahead,
 549                                                page_cache);
 550        }
 551/*
 552 * If we tried to read ahead some pages,
 553 * If we tried to read ahead asynchronously,
 554 *   Try to force unplug of the device in order to start an asynchronous
 555 *   read IO request.
 556 * Update the read-ahead context.
 557 * Store the length of the current read-ahead window.
 558 * Double the current max read ahead size.
 559 *   That heuristic avoid to do some large IO for files that are not really
 560 *   accessed sequentially.
 561 */
 562        if (ahead) {
 563                if (reada_ok == 2) {
 564                        run_task_queue(&tq_disk);
 565                }
 566
 567                filp->f_ralen += ahead;
 568                filp->f_rawin += filp->f_ralen;
 569                filp->f_raend = raend + ahead + PAGE_CACHE_SIZE;
 570
 571                filp->f_ramax += filp->f_ramax;
 572
 573                if (filp->f_ramax > max_readahead)
 574                        filp->f_ramax = max_readahead;
 575
 576#ifdef PROFILE_READAHEAD
 577                profile_readahead((reada_ok == 2), filp);
 578#endif
 579        }
 580
 581        return page_cache;
 582}
 583
 584/*
 585 * "descriptor" for what we're up to with a read.
 586 * This allows us to use the same read code yet
 587 * have multiple different users of the data that
 588 * we read from a file.
 589 *
 590 * The simplest case just copies the data to user
 591 * mode.
 592 */
 593typedef struct {
 594        size_t written;         /* running total of bytes the actors have consumed */
 595        size_t count;           /* bytes still wanted; each actor subtracts what it consumed */
 596        char * buf;             /* file_read_actor(): user destination, advanced as data is copied; file_send_actor(): really a struct file * */
 597        int error;              /* negative error code recorded by an actor or the read loop; initialised to 0 by generic_file_read() */
 598} read_descriptor_t;
 599
 /*
  * Actor callback: receives the descriptor, a pointer to the data inside
  * the page, and the number of bytes available there; returns how many
  * bytes it actually used.
  */
 600typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
 601
 602/*
 603 * This is a generic file read routine, and uses the
 604 * inode->i_op->readpage() function for the actual low-level
 605 * stuff.
 606 *
 607 * This is really ugly. But the goto's actually try to clarify some
 608 * of the logic when it comes to error handling etc.
 609 */
 /*
  * Parameters:
  *   filp  - open file; its read-ahead context fields (f_raend, f_ralen,
  *           f_ramax, f_rawin) are consulted and updated, and f_reada is
  *           set to 1 on exit.
  *   ppos  - in: position to start reading at; out: position after the
  *           bytes the actor reported as used.
  *   desc  - read descriptor handed to the actor for every page; the loop
  *           stops once desc->count is 0, and desc->error receives
  *           -ENOMEM, a readpage() error, or -EIO.
  *   actor - called with the data of each up-to-date page; returns how
  *           many bytes it used (a return of 0 ends the loop).
  *
  * The loop also ends when pos reaches inode->i_size.  A spare page still
  * held in page_cache is freed before returning, and UPDATE_ATIME() is
  * applied to the inode.
  */
 610static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 611{
 612        struct dentry *dentry = filp->f_dentry;
 613        struct inode *inode = dentry->d_inode;
 614        unsigned long page_cache;       /* spare page for the cache, 0 when none is held */
 615        size_t pos, pgpos;              /* NOTE(review): *ppos is a loff_t but is held in a size_t here - confirm the narrowing is intended */
 616        int reada_ok;                   /* 1 while the access pattern still looks sequential, else 0 */
 617        int max_readahead = get_max_readahead(inode);
 618
 619        page_cache = 0;
 620
 621        pos = *ppos;
 622        pgpos = pos & PAGE_CACHE_MASK;
 623/*
 624 * If the current position is outside the previous read-ahead window, 
 625 * we reset the current read-ahead context and set read ahead max to zero
 626 * (will be set to just needed value later),
 627 * otherwise, we assume that the file accesses are sequential enough to
 628 * continue read-ahead.
 629 */
 630        if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
 631                reada_ok = 0;
 632                filp->f_raend = 0;
 633                filp->f_ralen = 0;
 634                filp->f_ramax = 0;
 635                filp->f_rawin = 0;
 636        } else {
 637                reada_ok = 1;
 638        }
 639/*
 640 * Adjust the current value of read-ahead max.
 641 * If the read operation stay in the first half page, force no readahead.
 642 * Otherwise try to increase read ahead max just enough to do the read request.
 643 * Then, at least MIN_READAHEAD if read ahead is ok,
 644 * and at most MAX_READAHEAD in all cases.
 645 */
 646        if (pos + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
 647                filp->f_ramax = 0;
 648        } else {
 649                unsigned long needed;
 650
 651                needed = ((pos + desc->count) & PAGE_CACHE_MASK) - pgpos;
 652
 653                if (filp->f_ramax < needed)
 654                        filp->f_ramax = needed;
 655
 656                if (reada_ok && filp->f_ramax < MIN_READAHEAD)
 657                                filp->f_ramax = MIN_READAHEAD;
 658                if (filp->f_ramax > max_readahead)
 659                        filp->f_ramax = max_readahead;
 660        }
 661
 662        for (;;) {
 663                struct page *page, **hash;
 664
 665                if (pos >= inode->i_size)
 666                        break;
 667
 668                /*
 669                 * Try to find the data in the page cache..
 670                 */
 671                hash = page_hash(inode, pos & PAGE_CACHE_MASK);
 672                page = __find_page(inode, pos & PAGE_CACHE_MASK, *hash);
 673                if (!page)
 674                        goto no_cached_page;
 675
 676found_page:
 677/*
 678 * Try to read ahead only if the current page is filled or being filled.
 679 * Otherwise, if we were reading ahead, decrease max read ahead size to
 680 * the minimum value.
 681 * In this context, that seems to happen only on some read error or if
 682 * the page has been rewritten.
 683 */
 684                if (PageUptodate(page) || PageLocked(page))
 685                        page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
 686                else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
 687                                filp->f_ramax = MIN_READAHEAD;
 688
 689                wait_on_page(page);
 690
 691                if (!PageUptodate(page))
 692                        goto page_read_error;
 693
 694success:
 695                /*
 696                 * Ok, we have the page, it's up-to-date and ok,
 697                 * so now we can finally copy it to user space...
 698                 */
 699        {
 700                unsigned long offset, nr;
 701
 702                /* If users can be writing to this page using arbitrary
 703                 * virtual addresses, take care about potential aliasing
 704                 * before reading the page on the kernel side.
 705                 */
 706                if (inode->i_mmap_shared != NULL)
 707                        flush_dcache_page(page_address(page));
 708
 709                offset = pos & ~PAGE_CACHE_MASK;
 710                nr = PAGE_CACHE_SIZE - offset;
                 /* Never hand out bytes beyond the end of the file. */
 711                if (nr > inode->i_size - pos)
 712                        nr = inode->i_size - pos;
 713
 714                /*
 715                 * The actor routine returns how many bytes were actually used..
 716                 * NOTE! This may not be the same as how much of a user buffer
 717                 * we filled up (we may be padding etc), so we can only update
 718                 * "pos" here (the actor routine has to update the user buffer
 719                 * pointers and the remaining count).
 720                 */
 721                nr = actor(desc, (const char *) (page_address(page) + offset), nr);
 722                pos += nr;
 723                page_cache_release(page);
 724                if (nr && desc->count)
 725                        continue;
 726                break;
 727        }
 728
 729no_cached_page:
 730                /*
 731                 * Ok, it wasn't cached, so we need to create a new
 732                 * page..
 733                 */
 734                if (!page_cache) {
 735                        page_cache = page_cache_alloc();
 736                        /*
 737                         * That could have slept, so go around to the
 738                         * very beginning..
 739                         */
 740                        if (page_cache)
 741                                continue;
 742                        desc->error = -ENOMEM;
 743                        break;
 744                }
 745
 746                /*
 747                 * Ok, add the new page to the hash-queues...
 748                 */
 749                page = page_cache_entry(page_cache);
 750                page_cache = 0;
 751                add_to_page_cache(page, inode, pos & PAGE_CACHE_MASK, hash);
 752
 753                /*
 754                 * Error handling is tricky. If we get a read error,
 755                 * the cached page stays in the cache (but uptodate=0),
 756                 * and the next process that accesses it will try to
 757                 * re-read it. This is needed for NFS etc, where the
 758                 * identity of the reader can decide if we can read the
 759                 * page or not..
 760                 */
 761/*
 762 * We have to read the page.
 763 * If we were reading ahead, we had previously tried to read this page,
 764 * That means that the page has probably been removed from the cache before 
 765 * the application process needs it, or has been rewritten.
 766 * Decrease max readahead size to the minimum value in that situation.
 767 */
 768                if (reada_ok && filp->f_ramax > MIN_READAHEAD)
 769                        filp->f_ramax = MIN_READAHEAD;
 770
 771                {
 772                        int error = inode->i_op->readpage(filp, page);
 773                        if (!error)
 774                                goto found_page;
 775                        desc->error = error;
 776                        page_cache_release(page);
 777                        break;
 778                }
 779
 780page_read_error:
 781                /*
 782                 * We found the page, but it wasn't up-to-date.
 783                 * Try to re-read it _once_. We do this synchronously,
 784                 * because this happens only if there were errors.
 785                 */
 786                {
 787                        int error = inode->i_op->readpage(filp, page);
 788                        if (!error) {
 789                                wait_on_page(page);
 790                                if (PageUptodate(page) && !PageError(page))
 791                                        goto success;
 792                                error = -EIO; /* Some unspecified error occurred.. */
 793                        }
 794                        desc->error = error;
 795                        page_cache_release(page);
 796                        break;
 797                }
 798        }
 799
 800        *ppos = pos;
 801        filp->f_reada = 1;
         /* Give back a spare page that was allocated but never inserted. */
 802        if (page_cache)
 803                page_cache_free(page_cache);
 804        UPDATE_ATIME(inode);
 805}
 806
 807static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
 808{
 809        unsigned long left;
 810        unsigned long count = desc->count;
 811
 812        if (size > count)
 813                size = count;
 814        left = __copy_to_user(desc->buf, area, size);
 815        if (left) {
 816                size -= left;
 817                desc->error = -EFAULT;
 818        }
 819        desc->count = count - size;
 820        desc->written += size;
 821        desc->buf += size;
 822        return size;
 823}
 824
 825/*
 826 * This is the "read()" routine for all filesystems
 827 * that can use the page cache directly.
 828 */
 829ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
 830{
 831        ssize_t retval;
 832
 833        retval = -EFAULT;
 834        if (access_ok(VERIFY_WRITE, buf, count)) {
 835                retval = 0;
 836                if (count) {
 837                        read_descriptor_t desc;
 838
 839                        desc.written = 0;
 840                        desc.count = count;
 841                        desc.buf = buf;
 842                        desc.error = 0;
 843                        do_generic_file_read(filp, ppos, &desc, file_read_actor);
 844
 845                        retval = desc.written;
 846                        if (!retval)
 847                                retval = desc.error;
 848                }
 849        }
 850        return retval;
 851}
 852
 853static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
 854{
 855        ssize_t written;
 856        unsigned long count = desc->count;
 857        struct file *file = (struct file *) desc->buf;
 858        struct inode *inode = file->f_dentry->d_inode;
 859        mm_segment_t old_fs;
 860
 861        if (size > count)
 862                size = count;
 863        fs_down(&inode->i_sem);
 864        old_fs = get_fs();
 865        set_fs(KERNEL_DS);
 866        written = file->f_op->write(file, area, size, &file->f_pos);
 867        set_fs(old_fs);
 868        fs_up(&inode->i_sem);
 869        if (written < 0) {
 870                desc->error = written;
 871                written = 0;
 872        }
 873        desc->count = count - written;
 874        desc->written += written;
 875        return written;
 876}
 877
 878asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
 879{
 880        ssize_t retval;
 881        struct file * in_file, * out_file;
 882        struct inode * in_inode, * out_inode;
 883
 884        lock_kernel();
 885
 886        /*
 887         * Get input file, and verify that it is ok..
 888         */
 889        retval = -EBADF;
 890        in_file = fget(in_fd);
 891        if (!in_file)
 892                goto out;
 893        if (!(in_file->f_mode & FMODE_READ))
 894                goto fput_in;
 895        retval = -EINVAL;
 896        in_inode = in_file->f_dentry->d_inode;
 897        if (!in_inode)
 898                goto fput_in;
 899        if (!in_inode->i_op || !in_inode->i_op->readpage)
 900                goto fput_in;
 901        retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
 902        if (retval)
 903                goto fput_in;
 904
 905        /*
 906         * Get output file, and verify that it is ok..
 907         */
 908        retval = -EBADF;
 909        out_file = fget(out_fd);
 910        if (!out_file)
 911                goto fput_in;
 912        if (!(out_file->f_mode & FMODE_WRITE))
 913                goto fput_out;
 914        retval = -EINVAL;
 915        if (!out_file->f_op || !out_file->f_op->write)
 916                goto fput_out;
 917        out_inode = out_file->f_dentry->d_inode;
 918        if (!out_inode)
 919                goto fput_out;
 920        retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
 921        if (retval)
 922                goto fput_out;
 923
 924        retval = 0;
 925        if (count) {
 926                read_descriptor_t desc;
 927                loff_t pos = 0, *ppos;
 928
 929                retval = -EFAULT;
 930                ppos = &in_file->f_pos;
 931                if (offset) {
 932                        if (get_user(pos, offset))
 933                                goto fput_out;
 934                        ppos = &pos;
 935                }
 936
 937                desc.written = 0;
 938                desc.count = count;
 939                desc.buf = (char *) out_file;
 940                desc.error = 0;
 941                do_generic_file_read(in_file, ppos, &desc, file_send_actor);
 942
 943                retval = desc.written;
 944                if (!retval)
 945                        retval = desc.error;
 946                if (offset)
 947                        put_user(pos, offset);
 948        }
 949
 950
 951fput_out:
 952        fput(out_file);
 953fput_in:
 954        fput(in_file);
 955out:
 956        unlock_kernel();
 957        return retval;
 958}
 959
 960/*
 961 * Semantics for shared and private memory areas are different past the end
 962 * of the file. A shared mapping past the last page of the file is an error
 963 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 964 *
 965 * The goto's are kind of ugly, but this streamlines the normal case of having
 966 * it in the page cache, and handles the special cases reasonably without
 967 * having a lot of duplicated code.
 968 *
 969 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 970 * ahead of the wait if we're sure to need it.
 971 */
 972static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
 973{
 974        struct file * file = area->vm_file;
 975        struct dentry * dentry = file->f_dentry;
 976        struct inode * inode = dentry->d_inode;
 977        unsigned long offset, reada, i;
 978        struct page * page, **hash;
 979        unsigned long old_page, new_page;
 980
 981        new_page = 0;
 982        offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
 983        if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
 984                goto no_page;
 985
 986        /*
 987         * Do we have something in the page cache already?
 988         */
 989        hash = page_hash(inode, offset);
 990        page = __find_page(inode, offset, *hash);
 991        if (!page)
 992                goto no_cached_page;
 993
 994found_page:
 995        /*
 996         * Ok, found a page in the page cache, now we need to check
 997         * that it's up-to-date.  First check whether we'll need an
 998         * extra page -- better to overlap the allocation with the I/O.
 999         */
1000        if (no_share && !new_page) {
1001                new_page = page_cache_alloc();
1002                if (!new_page)
1003                        goto release_and_oom;
1004        }
1005
1006        if (PageLocked(page))
1007                goto page_locked_wait;
1008        if (!PageUptodate(page))
1009                goto page_read_error;
1010
1011success:
1012        /*
1013         * Found the page, need to check sharing and possibly
1014         * copy it over to another page..
1015         */
1016        old_page = page_address(page);
1017        if (!no_share) {
1018                /*
1019                 * Ok, we can share the cached page directly.. Get rid
1020                 * of any potential extra pages.
1021                 */
1022                if (new_page)
1023                        page_cache_free(new_page);
1024
1025                flush_page_to_ram(old_page);
1026                return old_page;
1027        }
1028
1029        /*
1030         * No sharing ... copy to the new page.
1031         */
1032        copy_page(new_page, old_page);
1033        flush_page_to_ram(new_page);
1034        page_cache_release(page);
1035        return new_page;
1036
1037no_cached_page:
1038        /*
1039         * Try to read in an entire cluster at once.
1040         */
1041        reada   = offset;
1042        reada >>= PAGE_CACHE_SHIFT + page_cluster;
1043        reada <<= PAGE_CACHE_SHIFT + page_cluster;
1044
1045        for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
1046                new_page = try_to_read_ahead(file, reada, new_page);
1047
1048        if (!new_page)
1049                new_page = page_cache_alloc();
1050        if (!new_page)
1051                goto oom;
1052
1053        /*
1054         * During getting the above page we might have slept,
1055         * so we need to re-check the situation with the page
1056         * cache.. The page we just got may be useful if we
1057         * can't share, so don't get rid of it here.
1058         */
1059        page = find_page(inode, offset);
1060        if (page)
1061                goto found_page;
1062
1063        /*
1064         * Now, create a new page-cache page from the page we got
1065         */
1066        page = page_cache_entry(new_page);
1067        new_page = 0;
1068        add_to_page_cache(page, inode, offset, hash);
1069
1070        if (inode->i_op->readpage(file, page) != 0)
1071                goto failure;
1072
1073        goto found_page;
1074
1075page_locked_wait:
1076        __wait_on_page(page);
1077        if (PageUptodate(page))
1078                goto success;
1079        
1080page_read_error:
1081        /*
1082         * Umm, take care of errors if the page isn't up-to-date.
1083         * Try to re-read it _once_. We do this synchronously,
1084         * because there really aren't any performance issues here
1085         * and we need to check for errors.
1086         */
1087        if (inode->i_op->readpage(file, page) != 0)
1088                goto failure;
1089        wait_on_page(page);
1090        if (PageError(page))
1091                goto failure;
1092        if (PageUptodate(page))
1093                goto success;
1094
1095        /*
1096         * Things didn't work out. Return zero to tell the
1097         * mm layer so, possibly freeing the page cache page first.
1098         */
1099failure:
1100        page_cache_release(page);
1101        if (new_page)
1102                page_cache_free(new_page);
1103no_page:
1104        return 0;
1105
1106release_and_oom:
1107        page_cache_release(page);
1108oom:
1109        return -1;
1110}
1111
1112/*
1113 * Tries to write a shared mapped page to its backing store. May return -EIO
1114 * if the disk is full.
1115 */
1116static inline int do_write_page(struct inode * inode, struct file * file,
1117        const char * page, unsigned long offset)
1118{
1119        int retval;
1120        unsigned long size;
1121        loff_t loff = offset;
1122        mm_segment_t old_fs;
1123
1124        size = offset + PAGE_SIZE;
1125        /* refuse to extend file size.. */
1126        if (S_ISREG(inode->i_mode)) {
1127                if (size > inode->i_size)
1128                        size = inode->i_size;
1129                /* Ho humm.. We should have tested for this earlier */
1130                if (size < offset)
1131                        return -EIO;
1132        }
1133        size -= offset;
1134        old_fs = get_fs();
1135        set_fs(KERNEL_DS);
1136        retval = -EIO;
1137        if (size == file->f_op->write(file, (const char *) page, size, &loff))
1138                retval = 0;
1139        set_fs(old_fs);
1140        return retval;
1141}
1142
1143static int filemap_write_page(struct vm_area_struct * vma,
1144                              unsigned long offset,
1145                              unsigned long page)
1146{
1147        int result;
1148        struct file * file;
1149        struct dentry * dentry;
1150        struct inode * inode;
1151
1152        file = vma->vm_file;
1153        dentry = file->f_dentry;
1154        inode = dentry->d_inode;
1155        if (!file->f_op->write)
1156                return -EIO;
1157
1158        /*
1159         * If a task terminates while we're swapping the page, the vma and
1160         * and file could be released ... increment the count to be safe.
1161         */
1162        file->f_count++;
1163        fs_down(&inode->i_sem);
1164        result = do_write_page(inode, file, (const char *) page, offset);
1165        fs_up(&inode->i_sem);
1166        fput(file);
1167        return result;
1168}
1169
1170
1171/*
1172 * The page cache takes care of races between somebody
1173 * trying to swap something out and swap something in
1174 * at the same time..
1175 */
1176int filemap_swapout(struct vm_area_struct * vma, struct page * page)
1177{
1178        return filemap_write_page(vma, page->offset, page_address(page));
1179}
1180
1181static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1182        unsigned long address, unsigned int flags)
1183{
1184        pte_t pte = *ptep;
1185        unsigned long page;
1186        int error;
1187
1188        if (!(flags & MS_INVALIDATE)) {
1189                if (!pte_present(pte))
1190                        return 0;
1191                if (!pte_dirty(pte))
1192                        return 0;
1193                flush_page_to_ram(pte_page(pte));
1194                flush_cache_page(vma, address);
1195                set_pte(ptep, pte_mkclean(pte));
1196                flush_tlb_page(vma, address);
1197                page = pte_page(pte);
1198                atomic_inc(&page_cache_entry(page)->count);
1199        } else {
1200                if (pte_none(pte))
1201                        return 0;
1202                flush_cache_page(vma, address);
1203                pte_clear(ptep);
1204                flush_tlb_page(vma, address);
1205                if (!pte_present(pte)) {
1206                        swap_free(pte_val(pte));
1207                        return 0;
1208                }
1209                page = pte_page(pte);
1210                if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1211                        page_cache_free(page);
1212                        return 0;
1213                }
1214        }
1215        error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
1216        page_cache_free(page);
1217        return error;
1218}
1219
1220static inline int filemap_sync_pte_range(pmd_t * pmd,
1221        unsigned long address, unsigned long size, 
1222        struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1223{
1224        pte_t * pte;
1225        unsigned long end;
1226        int error;
1227
1228        if (pmd_none(*pmd))
1229                return 0;
1230        if (pmd_bad(*pmd)) {
1231                printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1232                pmd_clear(pmd);
1233                return 0;
1234        }
1235        pte = pte_offset(pmd, address);
1236        offset += address & PMD_MASK;
1237        address &= ~PMD_MASK;
1238        end = address + size;
1239        if (end > PMD_SIZE)
1240                end = PMD_SIZE;
1241        error = 0;
1242        do {
1243                error |= filemap_sync_pte(pte, vma, address + offset, flags);
1244                address += PAGE_SIZE;
1245                pte++;
1246        } while (address < end);
1247        return error;
1248}
1249
1250static inline int filemap_sync_pmd_range(pgd_t * pgd,
1251        unsigned long address, unsigned long size, 
1252        struct vm_area_struct *vma, unsigned int flags)
1253{
1254        pmd_t * pmd;
1255        unsigned long offset, end;
1256        int error;
1257
1258        if (pgd_none(*pgd))
1259                return 0;
1260        if (pgd_bad(*pgd)) {
1261                printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1262                pgd_clear(pgd);
1263                return 0;
1264        }
1265        pmd = pmd_offset(pgd, address);
1266        offset = address & PGDIR_MASK;
1267        address &= ~PGDIR_MASK;
1268        end = address + size;
1269        if (end > PGDIR_SIZE)
1270                end = PGDIR_SIZE;
1271        error = 0;
1272        do {
1273                error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1274                address = (address + PMD_SIZE) & PMD_MASK;
1275                pmd++;
1276        } while (address < end);
1277        return error;
1278}
1279
1280static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1281        size_t size, unsigned int flags)
1282{
1283        pgd_t * dir;
1284        unsigned long end = address + size;
1285        int error = 0;
1286
1287        dir = pgd_offset(vma->vm_mm, address);
1288        flush_cache_range(vma->vm_mm, end - size, end);
1289        while (address < end) {
1290                error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1291                address = (address + PGDIR_SIZE) & PGDIR_MASK;
1292                dir++;
1293        }
1294        flush_tlb_range(vma->vm_mm, end - size, end);
1295        return error;
1296}
1297
1298/*
1299 * This handles (potentially partial) area unmaps..
1300 */
1301static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1302{
1303        filemap_sync(vma, start, len, MS_ASYNC);
1304}
1305
1306/*
1307 * Shared mappings need to be able to do the right thing at
1308 * close/unmap/sync. They will also use the private file as
1309 * backing-store for swapping..
1310 */
1311static struct vm_operations_struct file_shared_mmap = {
1312        NULL,                   /* no special open */
1313        NULL,                   /* no special close */
1314        filemap_unmap,          /* unmap - we need to sync the pages */
1315        NULL,                   /* no special protect */
1316        filemap_sync,           /* sync */
1317        NULL,                   /* advise */
1318        filemap_nopage,         /* nopage */
1319        NULL,                   /* wppage */
1320        filemap_swapout,        /* swapout */
1321        NULL,                   /* swapin */
1322};
1323
1324/*
1325 * Private mappings just need to be able to load in the map.
1326 *
1327 * (This is actually used for shared mappings as well, if we
1328 * know they can't ever get write permissions..)
1329 */
1330static struct vm_operations_struct file_private_mmap = {
1331        NULL,                   /* open */
1332        NULL,                   /* close */
1333        NULL,                   /* unmap */
1334        NULL,                   /* protect */
1335        NULL,                   /* sync */
1336        NULL,                   /* advise */
1337        filemap_nopage,         /* nopage */
1338        NULL,                   /* wppage */
1339        NULL,                   /* swapout */
1340        NULL,                   /* swapin */
1341};
1342
1343/* This is used for a general mmap of a disk file */
1344
1345int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1346{
1347        struct vm_operations_struct * ops;
1348        struct inode *inode = file->f_dentry->d_inode;
1349
1350        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1351                ops = &file_shared_mmap;
1352                /* share_page() can only guarantee proper page sharing if
1353                 * the offsets are all page aligned. */
1354                if (vma->vm_offset & (PAGE_SIZE - 1))
1355                        return -EINVAL;
1356        } else {
1357                ops = &file_private_mmap;
1358                if (inode->i_op && inode->i_op->bmap &&
1359                    (vma->vm_offset & (inode->i_sb->s_blocksize - 1)))
1360                        return -EINVAL;
1361        }
1362        if (!inode->i_sb || !S_ISREG(inode->i_mode))
1363                return -EACCES;
1364        if (!inode->i_op || !inode->i_op->readpage)
1365                return -ENOEXEC;
1366        UPDATE_ATIME(inode);
1367        vma->vm_ops = ops;
1368        return 0;
1369}
1370
1371
1372/*
1373 * The msync() system call.
1374 */
1375
1376static int msync_interval(struct vm_area_struct * vma,
1377        unsigned long start, unsigned long end, int flags)
1378{
1379        if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
1380                int error;
1381                error = vma->vm_ops->sync(vma, start, end-start, flags);
1382                if (!error && (flags & MS_SYNC)) {
1383                        struct file * file = vma->vm_file;
1384                        if (file) {
1385                                struct dentry * dentry = file->f_dentry;
1386                                struct inode * inode = dentry->d_inode;
1387                                fs_down(&inode->i_sem);
1388                                error = file_fsync(file, dentry);
1389                                fs_up(&inode->i_sem);
1390                        }
1391                }
1392                return error;
1393        }
1394        return 0;
1395}
1396
1397asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1398{
1399        unsigned long end;
1400        struct vm_area_struct * vma;
1401        int unmapped_error, error = -EINVAL;
1402
1403        down(&current->mm->mmap_sem);
1404        lock_kernel();
1405        if (start & ~PAGE_MASK)
1406                goto out;
1407        len = (len + ~PAGE_MASK) & PAGE_MASK;
1408        end = start + len;
1409        if (end < start)
1410                goto out;
1411        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1412                goto out;
1413        error = 0;
1414        if (end == start)
1415                goto out;
1416        /*
1417         * If the interval [start,end) covers some unmapped address ranges,
1418         * just ignore them, but return -EFAULT at the end.
1419         */
1420        vma = find_vma(current->mm, start);
1421        unmapped_error = 0;
1422        for (;;) {
1423                /* Still start < end. */
1424                error = -EFAULT;
1425                if (!vma)
1426                        goto out;
1427                /* Here start < vma->vm_end. */
1428                if (start < vma->vm_start) {
1429                        unmapped_error = -EFAULT;
1430                        start = vma->vm_start;
1431                }
1432                /* Here vma->vm_start <= start < vma->vm_end. */
1433                if (end <= vma->vm_end) {
1434                        if (start < end) {
1435                                error = msync_interval(vma, start, end, flags);
1436                                if (error)
1437                                        goto out;
1438                        }
1439                        error = unmapped_error;
1440                        goto out;
1441                }
1442                /* Here vma->vm_start <= start < vma->vm_end < end. */
1443                error = msync_interval(vma, start, vma->vm_end, flags);
1444                if (error)
1445                        goto out;
1446                start = vma->vm_end;
1447                vma = vma->vm_next;
1448        }
1449out:
1450        unlock_kernel();
1451        up(&current->mm->mmap_sem);
1452        return error;
1453}
1454
1455static inline
1456struct page *__read_cache_page(struct inode *inode, 
1457                               unsigned long offset,
1458                               int (*filler)(void *,struct page*),
1459                               void *data)
1460{
1461        struct page **hash = page_hash(inode, offset);
1462        struct page *page;
1463        unsigned long cached_page = 0;
1464        int err;
1465
1466        offset &= PAGE_CACHE_MASK;
1467repeat:
1468        page = __find_page(inode, offset, *hash);
1469        if (!page) {
1470                if (!cached_page) {
1471                        cached_page = page_cache_alloc();
1472                        if (!cached_page)
1473                                return ERR_PTR(-ENOMEM);
1474                        goto repeat;
1475                }
1476                page = page_cache_entry(cached_page);
1477                cached_page = 0;
1478                add_to_page_cache(page, inode, offset, hash);
1479                set_bit(PG_locked, &page->flags);
1480                err = filler(data, page);
1481                if (err < 0) {
1482                        page_cache_release(page);
1483                        page = ERR_PTR(err);
1484                }
1485        }
1486        if (cached_page)
1487                page_cache_free(cached_page);
1488        return page;
1489}
1490
1491/*
1492 * Read into the page cache. If a page already exists,
1493 * and Page_Uptodate() is not set, try to fill the page.
1494 */
1495struct page *read_cache_page(struct inode *inode,
1496                                unsigned long offset,
1497                                int (*filler)(void *,struct page*),
1498                                void *data)
1499{
1500        struct page *page = __read_cache_page(inode, offset, filler, data);
1501        int err;
1502
1503        if (IS_ERR(page) || PageUptodate(page))
1504                goto out;
1505
1506        wait_on_page(page);
1507        if (PageUptodate(page))
1508                goto out;
1509
1510        set_bit(PG_locked, &page->flags);
1511        err = filler(data, page);
1512        if (err < 0) {
1513                page_cache_release(page);
1514                page = ERR_PTR(err);
1515        }
1516 out:
1517        return page;
1518}
1519
1520/*
1521 * Write to a file through the page cache. This is mainly for the
1522 * benefit of NFS and possibly other network-based file systems.
1523 *
1524 * We currently put everything into the page cache prior to writing it.
1525 * This is not a problem when writing full pages. With partial pages,
1526 * however, we first have to read the data into the cache, then
1527 * dirty the page, and finally schedule it for writing. Alternatively, we
1528 * could write-through just the portion of data that would go into that
1529 * page, but that would kill performance for applications that write data
1530 * line by line, and it's prone to race conditions.
1531 *
1532 * Note that this routine doesn't try to keep track of dirty pages. Each
1533 * file system has to do this all by itself, unfortunately.
1534 *                                                      okir@monad.swb.de
1535 */
1536ssize_t
1537generic_file_write(struct file *file, const char *buf,
1538                   size_t count, loff_t *ppos)
1539{
1540        struct dentry   *dentry = file->f_dentry; 
1541        struct inode    *inode = dentry->d_inode; 
1542        unsigned long   pos = *ppos;
1543        unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1544        struct page     *page, **hash;
1545        unsigned long   page_cache = 0;
1546        unsigned long   written;
1547        long            status, sync;
1548
1549        if (!inode->i_op || !inode->i_op->updatepage)
1550                return -EIO;
1551
1552        if (file->f_error) {
1553                int error = file->f_error;
1554                file->f_error = 0;
1555                return error;
1556        }
1557
1558        sync    = file->f_flags & O_SYNC;
1559        written = 0;
1560
1561        if (file->f_flags & O_APPEND)
1562                pos = inode->i_size;
1563
1564        /*
1565         * Check whether we've reached the file size limit.
1566         */
1567        status = -EFBIG;
1568        if (pos >= limit) {
1569                send_sig(SIGXFSZ, current, 0);
1570                goto out;
1571        }
1572
1573        status  = 0;
1574        /*
1575         * Check whether to truncate the write,
1576         * and send the signal if we do.
1577         */
1578        if (count > limit - pos) {
1579                send_sig(SIGXFSZ, current, 0);
1580                count = limit - pos;
1581        }
1582
1583        while (count) {
1584                unsigned long bytes, pgpos, offset;
1585                char * dest;
1586
1587                /*
1588                 * Try to find the page in the cache. If it isn't there,
1589                 * allocate a free page.
1590                 */
1591                offset = (pos & ~PAGE_CACHE_MASK);
1592                pgpos = pos & PAGE_CACHE_MASK;
1593                bytes = PAGE_CACHE_SIZE - offset;
1594                if (bytes > count)
1595                        bytes = count;
1596
1597                hash = page_hash(inode, pgpos);
1598                page = __find_page(inode, pgpos, *hash);
1599                if (!page) {
1600                        if (!page_cache) {
1601                                page_cache = page_cache_alloc();
1602                                if (page_cache)
1603                                        continue;
1604                                status = -ENOMEM;
1605                                break;
1606                        }
1607                        page = page_cache_entry(page_cache);
1608                        add_to_page_cache(page, inode, pgpos, hash);
1609                        page_cache = 0;
1610                }
1611
1612                /* Get exclusive IO access to the page.. */
1613                wait_on_page(page);
1614                set_bit(PG_locked, &page->flags);
1615
1616                if (inode->i_op->prepare_write)
1617                        status = inode->i_op->prepare_write(file, page, offset, bytes);
1618                if (status < 0)
1619                        goto unlock;
1620
1621                /*
1622                 * Do the real work.. If the writer ends up delaying the write,
1623                 * the writer needs to increment the page use counts until he
1624                 * is done with the page.
1625                 */
1626                dest = (char *) page_address(page) + offset;
1627                if (dest != buf) { /* See comment in update_vm_cache_cond. */
1628                        bytes -= copy_from_user(dest, buf, bytes);
1629                        flush_dcache_page(page_address(page));
1630                }
1631                status = -EFAULT;
1632                if (bytes)
1633                        status = inode->i_op->updatepage(file, page, offset, bytes, sync);
1634
1635 unlock:
1636                /* Mark it unlocked again and drop the page.. */
1637                clear_bit(PG_locked, &page->flags);
1638                wake_up(&page->wait);
1639                page_cache_release(page);
1640
1641                if (status < 0)
1642                        break;
1643
1644                written += status;
1645                count -= status;
1646                pos += status;
1647                buf += status;
1648        }
1649        *ppos = pos;
1650        if (pos > inode->i_size)
1651                inode->i_size = pos;
1652
1653        if (page_cache)
1654                page_cache_free(page_cache);
1655out:
1656        return written ? written : status;
1657}
1658
1659/*
1660 * Support routines for directory cacheing using the page cache.
1661 */
1662
1663/*
1664 * Finds the page at the specified offset, installing a new page
1665 * if requested.  The count is incremented and the page is locked.
1666 *
1667 * Note: we don't have to worry about races here, as the caller
1668 * is holding the inode semaphore.
1669 */
1670unsigned long get_cached_page(struct inode * inode, unsigned long offset,
1671                                int new)
1672{
1673        struct page * page;
1674        struct page ** hash;
1675        unsigned long page_cache = 0;
1676
1677        hash = page_hash(inode, offset);
1678        page = __find_page(inode, offset, *hash);
1679        if (!page) {
1680                if (!new)
1681                        goto out;
1682                page_cache = page_cache_alloc();
1683                if (!page_cache)
1684                        goto out;
1685                clear_page(page_cache);
1686                page = page_cache_entry(page_cache);
1687                add_to_page_cache(page, inode, offset, hash);
1688        }
1689        if (atomic_read(&page->count) != 2)
1690                printk(KERN_ERR "get_cached_page: page count=%d\n",
1691                        atomic_read(&page->count));
1692        if (test_bit(PG_locked, &page->flags))
1693                printk(KERN_ERR "get_cached_page: page already locked!\n");
1694        set_bit(PG_locked, &page->flags);
1695        page_cache = page_address(page);
1696
1697out:
1698        return page_cache;
1699}
1700
1701/*
1702 * Unlock and free a page.
1703 */
1704void put_cached_page(unsigned long addr)
1705{
1706        struct page * page = page_cache_entry(addr);
1707
1708        if (!test_bit(PG_locked, &page->flags))
1709                printk("put_cached_page: page not locked!\n");
1710        if (atomic_read(&page->count) != 2)
1711                printk("put_cached_page: page count=%d\n", 
1712                        atomic_read(&page->count));
1713        clear_bit(PG_locked, &page->flags);
1714        wake_up(&page->wait);
1715        page_cache_release(page);
1716}
1717
1718void __init page_cache_init(unsigned long memory_size)
1719{
1720        unsigned long htable_size;
1721        long order;
1722
1723        htable_size  = memory_size >> PAGE_SHIFT;
1724        htable_size *= sizeof(struct page *);
1725        for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
1726                ;
1727
1728        do {
1729                unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
1730
1731                page_hash_mask = (tmp - 1UL);
1732
1733                page_hash_bits = 0;
1734                while((tmp >>= 1UL) != 0UL)
1735                        page_hash_bits++;
1736
1737                page_hash_table = (struct page **)
1738                        __get_free_pages(GFP_ATOMIC, order);
1739        } while(page_hash_table == NULL && --order >= 0L);
1740
1741        printk("Page cache hash table entries: %d (order %ld, %ldk)\n",
1742               (1 << page_hash_bits), order, (1UL << order) * PAGE_SIZE / 1024);
1743        if (!page_hash_table)
1744                panic("Failed to allocate page hash table\n");
1745        memset(page_hash_table, 0,
1746               (PAGE_HASH_MASK + 1UL) * sizeof(struct page *));
1747}
1748
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.