/* Source listing: linux-old/mm/filemap.c */
/*
 *      linux/mm/filemap.c
 *
 * Copyright (C) 1994, 1995  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem does this differently, for example)
 */
  12#include <linux/stat.h>
  13#include <linux/sched.h>
  14#include <linux/kernel.h>
  15#include <linux/mm.h>
  16#include <linux/shm.h>
  17#include <linux/errno.h>
  18#include <linux/mman.h>
  19#include <linux/string.h>
  20#include <linux/malloc.h>
  21#include <linux/fs.h>
  22#include <linux/locks.h>
  23#include <linux/pagemap.h>
  24#include <linux/swap.h>
  25#include <linux/smp.h>
  26#include <linux/smp_lock.h>
  27
  28#include <asm/system.h>
  29#include <asm/pgtable.h>
  30#include <asm/uaccess.h>
  31
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */

  39unsigned long page_cache_size = 0;
  40struct page * page_hash_table[PAGE_HASH_SIZE];
  41
  42/*
  43 * Simple routines for both non-shared and shared mappings.
  44 */
  45
  46#define release_page(page) __free_page((page))
  47
  48/*
  49 * Invalidate the pages of an inode, removing all pages that aren't
  50 * locked down (those are sure to be up-to-date anyway, so we shouldn't
  51 * invalidate them).
  52 */
  53void invalidate_inode_pages(struct inode * inode)
  54{
  55        struct page ** p;
  56        struct page * page;
  57
  58        p = &inode->i_pages;
  59        while ((page = *p) != NULL) {
  60                if (PageLocked(page)) {
  61                        p = &page->next;
  62                        continue;
  63                }
  64                inode->i_nrpages--;
  65                if ((*p = page->next) != NULL)
  66                        (*p)->prev = page->prev;
  67                page->next = NULL;
  68                page->prev = NULL;
  69                remove_page_from_hash_queue(page);
  70                page->inode = NULL;
  71                __free_page(page);
  72                continue;
  73        }
  74}
  75
  76/*
  77 * Truncate the page cache at a set offset, removing the pages
  78 * that are beyond that offset (and zeroing out partial pages).
  79 */
  80void truncate_inode_pages(struct inode * inode, unsigned long start)
  81{
  82        struct page ** p;
  83        struct page * page;
  84
  85repeat:
  86        p = &inode->i_pages;
  87        while ((page = *p) != NULL) {
  88                unsigned long offset = page->offset;
  89
  90                /* page wholly truncated - free it */
  91                if (offset >= start) {
  92                        if (PageLocked(page)) {
  93                                wait_on_page(page);
  94                                goto repeat;
  95                        }
  96                        inode->i_nrpages--;
  97                        if ((*p = page->next) != NULL)
  98                                (*p)->prev = page->prev;
  99                        page->next = NULL;
 100                        page->prev = NULL;
 101                        remove_page_from_hash_queue(page);
 102                        page->inode = NULL;
 103                        __free_page(page);
 104                        continue;
 105                }
 106                p = &page->next;
 107                offset = start - offset;
 108                /* partial truncate, clear end of page */
 109                if (offset < PAGE_SIZE) {
 110                        unsigned long address = page_address(page);
 111                        memset((void *) (offset + address), 0, PAGE_SIZE - offset);
 112                        flush_page_to_ram(address);
 113                }
 114        }
 115}
 116
 117int shrink_mmap(int priority, int dma)
 118{
 119        static unsigned long clock = 0;
 120        struct page * page;
 121        unsigned long limit = num_physpages;
 122        struct buffer_head *tmp, *bh;
 123        int count_max, count_min;
 124
 125        count_max = (limit<<1) >> (priority>>1);
 126        count_min = (limit<<1) >> (priority);
 127
 128        page = mem_map + clock;
 129        do {
 130                count_max--;
 131                if (page->inode || page->buffers)
 132                        count_min--;
 133
 134                if (PageLocked(page))
 135                        goto next;
 136                if (dma && !PageDMA(page))
 137                        goto next;
 138                /* First of all, regenerate the page's referenced bit
 139                   from any buffers in the page */
 140                bh = page->buffers;
 141                if (bh) {
 142                        tmp = bh;
 143                        do {
 144                                if (buffer_touched(tmp)) {
 145                                        clear_bit(BH_Touched, &tmp->b_state);
 146                                        set_bit(PG_referenced, &page->flags);
 147                                }
 148                                tmp = tmp->b_this_page;
 149                        } while (tmp != bh);
 150                }
 151
 152                /* We can't throw away shared pages, but we do mark
 153                   them as referenced.  This relies on the fact that
 154                   no page is currently in both the page cache and the
 155                   buffer cache; we'd have to modify the following
 156                   test to allow for that case. */
 157
 158                switch (atomic_read(&page->count)) {
 159                        case 1:
 160                                /* If it has been referenced recently, don't free it */
 161                                if (test_and_clear_bit(PG_referenced, &page->flags))
 162                                        break;
 163
 164                                /* is it a page cache page? */
 165                                if (page->inode) {
 166                                        remove_page_from_hash_queue(page);
 167                                        remove_page_from_inode_queue(page);
 168                                        __free_page(page);
 169                                        return 1;
 170                                }
 171
 172                                /* is it a buffer cache page? */
 173                                if (bh && try_to_free_buffer(bh, &bh, 6))
 174                                        return 1;
 175                                break;
 176
 177                        default:
 178                                /* more than one users: we can't throw it away */
 179                                set_bit(PG_referenced, &page->flags);
 180                                /* fall through */
 181                        case 0:
 182                                /* nothing */
 183                }
 184next:
 185                page++;
 186                clock++;
 187                if (clock >= limit) {
 188                        clock = 0;
 189                        page = mem_map;
 190                }
 191        } while (count_max > 0 && count_min > 0);
 192        return 0;
 193}
 194
 195/*
 196 * This is called from try_to_swap_out() when we try to get rid of some
 197 * pages..  If we're unmapping the last occurrence of this page, we also
 198 * free it from the page hash-queues etc, as we don't want to keep it
 199 * in-core unnecessarily.
 200 */
 201unsigned long page_unuse(unsigned long page)
 202{
 203        struct page * p = mem_map + MAP_NR(page);
 204        int count = atomic_read(&p->count);
 205
 206        if (count != 2)
 207                return count;
 208        if (!p->inode)
 209                return count;
 210        remove_page_from_hash_queue(p);
 211        remove_page_from_inode_queue(p);
 212        free_page(page);
 213        return 1;
 214}
 215
 216/*
 217 * Update a page cache copy, when we're doing a "write()" system call
 218 * See also "update_vm_cache()".
 219 */
 220void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
 221{
 222        unsigned long offset, len;
 223
 224        offset = (pos & ~PAGE_MASK);
 225        pos = pos & PAGE_MASK;
 226        len = PAGE_SIZE - offset;
 227        do {
 228                struct page * page;
 229
 230                if (len > count)
 231                        len = count;
 232                page = find_page(inode, pos);
 233                if (page) {
 234                        wait_on_page(page);
 235                        memcpy((void *) (offset + page_address(page)), buf, len);
 236                        release_page(page);
 237                }
 238                count -= len;
 239                buf += len;
 240                len = PAGE_SIZE;
 241                offset = 0;
 242                pos += PAGE_SIZE;
 243        } while (count);
 244}
 245
 246static inline void add_to_page_cache(struct page * page,
 247        struct inode * inode, unsigned long offset,
 248        struct page **hash)
 249{
 250        atomic_inc(&page->count);
 251        page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
 252        page->offset = offset;
 253        add_page_to_inode_queue(inode, page);
 254        __add_page_to_hash_queue(page, hash);
 255}
 256
 257/*
 258 * Try to read ahead in the file. "page_cache" is a potentially free page
 259 * that we could use for the cache (if it is 0 we can try to create one,
 260 * this is all overlapped with the IO on the previous page finishing anyway)
 261 */
 262static unsigned long try_to_read_ahead(struct inode * inode, unsigned long offset, unsigned long page_cache)
 263{
 264        struct page * page;
 265        struct page ** hash;
 266
 267        offset &= PAGE_MASK;
 268        switch (page_cache) {
 269        case 0:
 270                page_cache = __get_free_page(GFP_KERNEL);
 271                if (!page_cache)
 272                        break;
 273        default:
 274                if (offset >= inode->i_size)
 275                        break;
 276                hash = page_hash(inode, offset);
 277                page = __find_page(inode, offset, *hash);
 278                if (!page) {
 279                        /*
 280                         * Ok, add the new page to the hash-queues...
 281                         */
 282                        page = mem_map + MAP_NR(page_cache);
 283                        add_to_page_cache(page, inode, offset, hash);
 284                        inode->i_op->readpage(inode, page);
 285                        page_cache = 0;
 286                }
 287                release_page(page);
 288        }
 289        return page_cache;
 290}
 291
 292/* 
 293 * Wait for IO to complete on a locked page.
 294 *
 295 * This must be called with the caller "holding" the page,
 296 * ie with increased "page->count" so that the page won't
 297 * go away during the wait..
 298 */
 299void __wait_on_page(struct page *page)
 300{
 301        struct wait_queue wait = { current, NULL };
 302
 303        add_wait_queue(&page->wait, &wait);
 304repeat:
 305        run_task_queue(&tq_disk);
 306        current->state = TASK_UNINTERRUPTIBLE;
 307        if (PageLocked(page)) {
 308                schedule();
 309                goto repeat;
 310        }
 311        remove_wait_queue(&page->wait, &wait);
 312        current->state = TASK_RUNNING;
 313}
 314
 315#if 0
 316#define PROFILE_READAHEAD
 317#define DEBUG_READAHEAD
 318#endif
 319
 320/*
 321 * Read-ahead profiling information
 322 * --------------------------------
 323 * Every PROFILE_MAXREADCOUNT, the following information is written 
 324 * to the syslog:
 325 *   Percentage of asynchronous read-ahead.
 326 *   Average of read-ahead fields context value.
 327 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
 328 * to the syslog.
 329 */
 330
 331#ifdef PROFILE_READAHEAD
 332
 333#define PROFILE_MAXREADCOUNT 1000
 334
 335static unsigned long total_reada;
 336static unsigned long total_async;
 337static unsigned long total_ramax;
 338static unsigned long total_ralen;
 339static unsigned long total_rawin;
 340
 341static void profile_readahead(int async, struct file *filp)
 342{
 343        unsigned long flags;
 344
 345        ++total_reada;
 346        if (async)
 347                ++total_async;
 348
 349        total_ramax     += filp->f_ramax;
 350        total_ralen     += filp->f_ralen;
 351        total_rawin     += filp->f_rawin;
 352
 353        if (total_reada > PROFILE_MAXREADCOUNT) {
 354                save_flags(flags);
 355                cli();
 356                if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 357                        restore_flags(flags);
 358                        return;
 359                }
 360
 361                printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 362                        total_ramax/total_reada,
 363                        total_ralen/total_reada,
 364                        total_rawin/total_reada,
 365                        (total_async*100)/total_reada);
 366#ifdef DEBUG_READAHEAD
 367                printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
 368                        filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 369#endif
 370
 371                total_reada     = 0;
 372                total_async     = 0;
 373                total_ramax     = 0;
 374                total_ralen     = 0;
 375                total_rawin     = 0;
 376
 377                restore_flags(flags);
 378        }
 379}
 380#endif  /* defined PROFILE_READAHEAD */
 381
 382/*
 383 * Read-ahead context:
 384 * -------------------
 385 * The read ahead context fields of the "struct file" are the following:
 386 * - f_raend : position of the first byte after the last page we tried to
 387 *             read ahead.
 388 * - f_ramax : current read-ahead maximum size.
 389 * - f_ralen : length of the current IO read block we tried to read-ahead.
 390 * - f_rawin : length of the current read-ahead window.
 391 *             if last read-ahead was synchronous then
 392 *                  f_rawin = f_ralen
 393 *             otherwise (was asynchronous)
 394 *                  f_rawin = previous value of f_ralen + f_ralen
 395 *
 396 * Read-ahead limits:
 397 * ------------------
 398 * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 399 * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 400 *
 401 * Synchronous read-ahead benefits:
 402 * --------------------------------
 403 * Using reasonable IO xfer length from peripheral devices increase system 
 404 * performances.
 405 * Reasonable means, in this context, not too large but not too small.
 406 * The actual maximum value is:
 407 *      MAX_READAHEAD + PAGE_SIZE = 76k is CONFIG_READA_SMALL is undefined
 408 *      and 32K if defined (4K page size assumed).
 409 *
 410 * Asynchronous read-ahead benefits:
 411 * ---------------------------------
 412 * Overlapping next read request and user process execution increase system 
 413 * performance.
 414 *
 415 * Read-ahead risks:
 416 * -----------------
 417 * We have to guess which further data are needed by the user process.
 418 * If these data are often not really needed, it's bad for system 
 419 * performances.
 420 * However, we know that files are often accessed sequentially by 
 421 * application programs and it seems that it is possible to have some good 
 422 * strategy in that guessing.
 423 * We only try to read-ahead files that seems to be read sequentially.
 424 *
 425 * Asynchronous read-ahead risks:
 426 * ------------------------------
 427 * In order to maximize overlapping, we must start some asynchronous read 
 428 * request from the device, as soon as possible.
 429 * We must be very careful about:
 430 * - The number of effective pending IO read requests.
 431 *   ONE seems to be the only reasonable value.
 432 * - The total memory pool usage for the file access stream.
 433 *   This maximum memory usage is implicitly 2 IO read chunks:
 434 *   2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 435 *   64k if defined (4K page size assumed).
 436 */
 437
 438#define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK)
 439
 440#if 0  /* small readahead */
 441#define MAX_READAHEAD PageAlignSize(4096*7)
 442#define MIN_READAHEAD PageAlignSize(4096*2)
 443#else /* large readahead */
 444#define MAX_READAHEAD PageAlignSize(4096*18)
 445#define MIN_READAHEAD PageAlignSize(4096*3)
 446#endif
 447
 448static inline unsigned long generic_file_readahead(int reada_ok, struct file * filp, struct inode * inode,
 449        unsigned long ppos, struct page * page,
 450        unsigned long page_cache)
 451{
 452        unsigned long max_ahead, ahead;
 453        unsigned long raend;
 454
 455        raend = filp->f_raend & PAGE_MASK;
 456        max_ahead = 0;
 457
 458/*
 459 * The current page is locked.
 460 * If the current position is inside the previous read IO request, do not
 461 * try to reread previously read ahead pages.
 462 * Otherwise decide or not to read ahead some pages synchronously.
 463 * If we are not going to read ahead, set the read ahead context for this 
 464 * page only.
 465 */
 466        if (PageLocked(page)) {
 467                if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
 468                        raend = ppos;
 469                        if (raend < inode->i_size)
 470                                max_ahead = filp->f_ramax;
 471                        filp->f_rawin = 0;
 472                        filp->f_ralen = PAGE_SIZE;
 473                        if (!max_ahead) {
 474                                filp->f_raend  = ppos + filp->f_ralen;
 475                                filp->f_rawin += filp->f_ralen;
 476                        }
 477                }
 478        }
 479/*
 480 * The current page is not locked.
 481 * If we were reading ahead and,
 482 * if the current max read ahead size is not zero and,
 483 * if the current position is inside the last read-ahead IO request,
 484 *   it is the moment to try to read ahead asynchronously.
 485 * We will later force unplug device in order to force asynchronous read IO.
 486 */
 487        else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
 488                 ppos <= raend && ppos + filp->f_ralen >= raend) {
 489/*
 490 * Add ONE page to max_ahead in order to try to have about the same IO max size
 491 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
 492 * Compute the position of the last page we have tried to read in order to 
 493 * begin to read ahead just at the next page.
 494 */
 495                raend -= PAGE_SIZE;
 496                if (raend < inode->i_size)
 497                        max_ahead = filp->f_ramax + PAGE_SIZE;
 498
 499                if (max_ahead) {
 500                        filp->f_rawin = filp->f_ralen;
 501                        filp->f_ralen = 0;
 502                        reada_ok      = 2;
 503                }
 504        }
 505/*
 506 * Try to read ahead pages.
 507 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 508 * scheduler, will work enough for us to avoid too bad actuals IO requests.
 509 */
 510        ahead = 0;
 511        while (ahead < max_ahead) {
 512                ahead += PAGE_SIZE;
 513                page_cache = try_to_read_ahead(inode, raend + ahead, page_cache);
 514        }
 515/*
 516 * If we tried to read ahead some pages,
 517 * If we tried to read ahead asynchronously,
 518 *   Try to force unplug of the device in order to start an asynchronous
 519 *   read IO request.
 520 * Update the read-ahead context.
 521 * Store the length of the current read-ahead window.
 522 * Double the current max read ahead size.
 523 *   That heuristic avoid to do some large IO for files that are not really
 524 *   accessed sequentially.
 525 */
 526        if (ahead) {
 527                if (reada_ok == 2) {
 528                        run_task_queue(&tq_disk);
 529                }
 530
 531                filp->f_ralen += ahead;
 532                filp->f_rawin += filp->f_ralen;
 533                filp->f_raend = raend + ahead + PAGE_SIZE;
 534
 535                filp->f_ramax += filp->f_ramax;
 536
 537                if (filp->f_ramax > MAX_READAHEAD)
 538                        filp->f_ramax = MAX_READAHEAD;
 539
 540#ifdef PROFILE_READAHEAD
 541                profile_readahead((reada_ok == 2), filp);
 542#endif
 543        }
 544
 545        return page_cache;
 546}
 547
 548
 549/*
 550 * This is a generic file read routine, and uses the
 551 * inode->i_op->readpage() function for the actual low-level
 552 * stuff.
 553 *
 554 * This is really ugly. But the goto's actually try to clarify some
 555 * of the logic when it comes to error handling etc.
 556 */
 557
 558long generic_file_read(struct inode * inode, struct file * filp,
 559        char * buf, unsigned long count)
 560{
 561        int error, read;
 562        unsigned long pos, ppos, page_cache;
 563        int reada_ok;
 564
 565        if (!access_ok(VERIFY_WRITE, buf, count))
 566                return -EFAULT;
 567        if (!count)
 568                return 0;
 569        error = 0;
 570        read = 0;
 571        page_cache = 0;
 572
 573        pos = filp->f_pos;
 574        ppos = pos & PAGE_MASK;
 575/*
 576 * If the current position is outside the previous read-ahead window, 
 577 * we reset the current read-ahead context and set read ahead max to zero
 578 * (will be set to just needed value later),
 579 * otherwise, we assume that the file accesses are sequential enough to
 580 * continue read-ahead.
 581 */
 582        if (ppos > filp->f_raend || ppos + filp->f_rawin < filp->f_raend) {
 583                reada_ok = 0;
 584                filp->f_raend = 0;
 585                filp->f_ralen = 0;
 586                filp->f_ramax = 0;
 587                filp->f_rawin = 0;
 588        } else {
 589                reada_ok = 1;
 590        }
 591/*
 592 * Adjust the current value of read-ahead max.
 593 * If the read operation stay in the first half page, force no readahead.
 594 * Otherwise try to increase read ahead max just enough to do the read request.
 595 * Then, at least MIN_READAHEAD if read ahead is ok,
 596 * and at most MAX_READAHEAD in all cases.
 597 */
 598        if (pos + count <= (PAGE_SIZE >> 1)) {
 599                filp->f_ramax = 0;
 600        } else {
 601                unsigned long needed;
 602
 603                needed = ((pos + count) & PAGE_MASK) - ppos;
 604
 605                if (filp->f_ramax < needed)
 606                        filp->f_ramax = needed;
 607
 608                if (reada_ok && filp->f_ramax < MIN_READAHEAD)
 609                                filp->f_ramax = MIN_READAHEAD;
 610                if (filp->f_ramax > MAX_READAHEAD)
 611                        filp->f_ramax = MAX_READAHEAD;
 612        }
 613
 614        for (;;) {
 615                struct page *page, **hash;
 616
 617                if (pos >= inode->i_size)
 618                        break;
 619
 620                /*
 621                 * Try to find the data in the page cache..
 622                 */
 623                hash = page_hash(inode, pos & PAGE_MASK);
 624                page = __find_page(inode, pos & PAGE_MASK, *hash);
 625                if (!page)
 626                        goto no_cached_page;
 627
 628found_page:
 629/*
 630 * Try to read ahead only if the current page is filled or being filled.
 631 * Otherwise, if we were reading ahead, decrease max read ahead size to
 632 * the minimum value.
 633 * In this context, that seems to may happen only on some read error or if 
 634 * the page has been rewritten.
 635 */
 636                if (PageUptodate(page) || PageLocked(page))
 637                        page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
 638                else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
 639                                filp->f_ramax = MIN_READAHEAD;
 640
 641                wait_on_page(page);
 642
 643                if (!PageUptodate(page))
 644                        goto page_read_error;
 645
 646success:
 647                /*
 648                 * Ok, we have the page, it's up-to-date and ok,
 649                 * so now we can finally copy it to user space...
 650                 */
 651        {
 652                unsigned long offset, nr;
 653
 654                offset = pos & ~PAGE_MASK;
 655                nr = PAGE_SIZE - offset;
 656                if (nr > count)
 657                        nr = count;
 658                if (nr > inode->i_size - pos)
 659                        nr = inode->i_size - pos;
 660                nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr);
 661                release_page(page);
 662                error = -EFAULT;
 663                if (!nr)
 664                        break;
 665                buf += nr;
 666                pos += nr;
 667                read += nr;
 668                count -= nr;
 669                if (count)
 670                        continue;
 671                break;
 672        }
 673
 674no_cached_page:
 675                /*
 676                 * Ok, it wasn't cached, so we need to create a new
 677                 * page..
 678                 */
 679                if (!page_cache) {
 680                        page_cache = __get_free_page(GFP_KERNEL);
 681                        /*
 682                         * That could have slept, so go around to the
 683                         * very beginning..
 684                         */
 685                        if (page_cache)
 686                                continue;
 687                        error = -ENOMEM;
 688                        break;
 689                }
 690
 691                /*
 692                 * Ok, add the new page to the hash-queues...
 693                 */
 694                page = mem_map + MAP_NR(page_cache);
 695                page_cache = 0;
 696                add_to_page_cache(page, inode, pos & PAGE_MASK, hash);
 697
 698                /*
 699                 * Error handling is tricky. If we get a read error,
 700                 * the cached page stays in the cache (but uptodate=0),
 701                 * and the next process that accesses it will try to
 702                 * re-read it. This is needed for NFS etc, where the
 703                 * identity of the reader can decide if we can read the
 704                 * page or not..
 705                 */
 706/*
 707 * We have to read the page.
 708 * If we were reading ahead, we had previously tried to read this page,
 709 * That means that the page has probably been removed from the cache before 
 710 * the application process needs it, or has been rewritten.
 711 * Decrease max readahead size to the minimum value in that situation.
 712 */
 713                if (reada_ok && filp->f_ramax > MIN_READAHEAD)
 714                        filp->f_ramax = MIN_READAHEAD;
 715
 716                error = inode->i_op->readpage(inode, page);
 717                if (!error)
 718                        goto found_page;
 719                release_page(page);
 720                break;
 721
 722page_read_error:
 723                /*
 724                 * We found the page, but it wasn't up-to-date.
 725                 * Try to re-read it _once_. We do this synchronously,
 726                 * because this happens only if there were errors.
 727                 */
 728                error = inode->i_op->readpage(inode, page);
 729                if (!error) {
 730                        wait_on_page(page);
 731                        if (PageUptodate(page) && !PageError(page))
 732                                goto success;
 733                        error = -EIO; /* Some unspecified error occurred.. */
 734                }
 735                release_page(page);
 736                break;
 737        }
 738
 739        filp->f_pos = pos;
 740        filp->f_reada = 1;
 741        if (page_cache)
 742                free_page(page_cache);
 743        UPDATE_ATIME(inode)
 744        if (!read)
 745                read = error;
 746        return read;
 747}
 748
 749/*
 750 * Semantics for shared and private memory areas are different past the end
 751 * of the file. A shared mapping past the last page of the file is an error
 752 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 753 *
 754 * The goto's are kind of ugly, but this streamlines the normal case of having
 755 * it in the page cache, and handles the special cases reasonably without
 756 * having a lot of duplicated code.
 757 *
 758 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 759 * ahead of the wait if we're sure to need it.
 760 */
 761static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
 762{
 763        unsigned long offset;
 764        struct page * page, **hash;
 765        struct inode * inode = area->vm_dentry->d_inode;
 766        unsigned long old_page, new_page;
 767
 768        new_page = 0;
 769        offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
 770        if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
 771                goto no_page;
 772
 773        /*
 774         * Do we have something in the page cache already?
 775         */
 776        hash = page_hash(inode, offset);
 777        page = __find_page(inode, offset, *hash);
 778        if (!page)
 779                goto no_cached_page;
 780
 781found_page:
 782        /*
 783         * Ok, found a page in the page cache, now we need to check
 784         * that it's up-to-date.  First check whether we'll need an
 785         * extra page -- better to overlap the allocation with the I/O.
 786         */
 787        if (no_share && !new_page) {
 788                new_page = __get_free_page(GFP_KERNEL);
 789                if (!new_page)
 790                        goto failure;
 791        }
 792
 793        if (PageLocked(page))
 794                goto page_locked_wait;
 795        if (!PageUptodate(page))
 796                goto page_read_error;
 797
 798success:
 799        /*
 800         * Found the page, need to check sharing and possibly
 801         * copy it over to another page..
 802         */
 803        old_page = page_address(page);
 804        if (!no_share) {
 805                /*
 806                 * Ok, we can share the cached page directly.. Get rid
 807                 * of any potential extra pages.
 808                 */
 809                if (new_page)
 810                        free_page(new_page);
 811
 812                flush_page_to_ram(old_page);
 813                return old_page;
 814        }
 815
 816        /*
 817         * No sharing ... copy to the new page.
 818         */
 819        copy_page(new_page, old_page);
 820        flush_page_to_ram(new_page);
 821        release_page(page);
 822        return new_page;
 823
 824no_cached_page:
 825        new_page = __get_free_page(GFP_KERNEL);
 826        if (!new_page)
 827                goto no_page;
 828
 829        /*
 830         * During getting the above page we might have slept,
 831         * so we need to re-check the situation with the page
 832         * cache.. The page we just got may be useful if we
 833         * can't share, so don't get rid of it here.
 834         */
 835        page = find_page(inode, offset);
 836        if (page)
 837                goto found_page;
 838
 839        /*
 840         * Now, create a new page-cache page from the page we got
 841         */
 842        page = mem_map + MAP_NR(new_page);
 843        new_page = 0;
 844        add_to_page_cache(page, inode, offset, hash);
 845
 846        if (inode->i_op->readpage(inode, page) != 0)
 847                goto failure;
 848
 849        /*
 850         * Do a very limited read-ahead if appropriate
 851         */
 852        if (PageLocked(page))
 853                new_page = try_to_read_ahead(inode, offset + PAGE_SIZE, 0);
 854        goto found_page;
 855
 856page_locked_wait:
 857        __wait_on_page(page);
 858        if (PageUptodate(page))
 859                goto success;
 860        
 861page_read_error:
 862        /*
 863         * Umm, take care of errors if the page isn't up-to-date.
 864         * Try to re-read it _once_. We do this synchronously,
 865         * because there really aren't any performance issues here
 866         * and we need to check for errors.
 867         */
 868        if (inode->i_op->readpage(inode, page) != 0)
 869                goto failure;
 870        wait_on_page(page);
 871        if (PageError(page))
 872                goto failure;
 873        if (PageUptodate(page))
 874                goto success;
 875
 876        /*
 877         * Uhhuh.. Things didn't work out. Return zero to tell the
 878         * mm layer so, possibly freeing the page cache page first.
 879         */
 880failure:
 881        release_page(page);
 882        if (new_page)
 883                free_page(new_page);
 884no_page:
 885        return 0;
 886}
 887
 888/*
 889 * Tries to write a shared mapped page to its backing store. May return -EIO
 890 * if the disk is full.
 891 */
 892static inline int do_write_page(struct inode * inode, struct file * file,
 893        const char * page, unsigned long offset)
 894{
 895        int retval;
 896        unsigned long size;
 897        unsigned long old_fs;
 898
 899        size = offset + PAGE_SIZE;
 900        /* refuse to extend file size.. */
 901        if (S_ISREG(inode->i_mode)) {
 902                if (size > inode->i_size)
 903                        size = inode->i_size;
 904                /* Ho humm.. We should have tested for this earlier */
 905                if (size < offset)
 906                        return -EIO;
 907        }
 908        size -= offset;
 909        old_fs = get_fs();
 910        set_fs(KERNEL_DS);
 911        retval = -EIO;
 912        if (size == file->f_op->write(inode, file, (const char *) page, size))
 913                retval = 0;
 914        set_fs(old_fs);
 915        return retval;
 916}
 917
 918static int filemap_write_page(struct vm_area_struct * vma,
 919        unsigned long offset,
 920        unsigned long page)
 921{
 922        int result;
 923        struct file file;
 924        struct dentry * dentry;
 925        struct inode * inode;
 926        struct buffer_head * bh;
 927
 928        bh = mem_map[MAP_NR(page)].buffers;
 929        if (bh) {
 930                /* whee.. just mark the buffer heads dirty */
 931                struct buffer_head * tmp = bh;
 932                do {
 933                        /*
 934                         * WSH: There's a race here: mark_buffer_dirty()
 935                         * could block, and the buffers aren't pinned down.
 936                         */
 937                        mark_buffer_dirty(tmp, 0);
 938                        tmp = tmp->b_this_page;
 939                } while (tmp != bh);
 940                return 0;
 941        }
 942
 943        dentry = vma->vm_dentry;
 944        inode = dentry->d_inode;
 945        file.f_op = inode->i_op->default_file_ops;
 946        if (!file.f_op->write)
 947                return -EIO;
 948        file.f_mode = 3;
 949        file.f_flags = 0;
 950        file.f_count = 1;
 951        file.f_dentry = dentry;
 952        file.f_pos = offset;
 953        file.f_reada = 0;
 954
 955        /*
 956         * If a task terminates while we're swapping the page, the vma and
 957         * and dentry could be released ... increment the count to be safe.
 958         */
 959        dget(dentry);
 960        down(&inode->i_sem);
 961        result = do_write_page(inode, &file, (const char *) page, offset);
 962        up(&inode->i_sem);
 963        dput(dentry);
 964        return result;
 965}
 966
 967
 968/*
 969 * Swapping to a shared file: while we're busy writing out the page
 970 * (and the page still exists in memory), we save the page information
 971 * in the page table, so that "filemap_swapin()" can re-use the page
 972 * immediately if it is called while we're busy swapping it out..
 973 *
 974 * Once we've written it all out, we mark the page entry "empty", which
 975 * will result in a normal page-in (instead of a swap-in) from the now
 976 * up-to-date disk file.
 977 */
 978int filemap_swapout(struct vm_area_struct * vma,
 979        unsigned long offset,
 980        pte_t *page_table)
 981{
 982        int error;
 983        unsigned long page = pte_page(*page_table);
 984        unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));
 985
 986        flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
 987        set_pte(page_table, __pte(entry));
 988        flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
 989        error = filemap_write_page(vma, offset, page);
 990        if (pte_val(*page_table) == entry)
 991                pte_clear(page_table);
 992        return error;
 993}
 994
 995/*
 996 * filemap_swapin() is called only if we have something in the page
 997 * tables that is non-zero (but not present), which we know to be the
 998 * page index of a page that is busy being swapped out (see above).
 999 * So we just use it directly..
1000 */
1001static pte_t filemap_swapin(struct vm_area_struct * vma,
1002        unsigned long offset,
1003        unsigned long entry)
1004{
1005        unsigned long page = SWP_OFFSET(entry);
1006
1007        atomic_inc(&mem_map[page].count);
1008        page = (page << PAGE_SHIFT) + PAGE_OFFSET;
1009        return mk_pte(page,vma->vm_page_prot);
1010}
1011
1012
1013static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
1014        unsigned long address, unsigned int flags)
1015{
1016        pte_t pte = *ptep;
1017        unsigned long page;
1018        int error;
1019
1020        if (!(flags & MS_INVALIDATE)) {
1021                if (!pte_present(pte))
1022                        return 0;
1023                if (!pte_dirty(pte))
1024                        return 0;
1025                flush_page_to_ram(pte_page(pte));
1026                flush_cache_page(vma, address);
1027                set_pte(ptep, pte_mkclean(pte));
1028                flush_tlb_page(vma, address);
1029                page = pte_page(pte);
1030                atomic_inc(&mem_map[MAP_NR(page)].count);
1031        } else {
1032                if (pte_none(pte))
1033                        return 0;
1034                flush_cache_page(vma, address);
1035                pte_clear(ptep);
1036                flush_tlb_page(vma, address);
1037                if (!pte_present(pte)) {
1038                        swap_free(pte_val(pte));
1039                        return 0;
1040                }
1041                page = pte_page(pte);
1042                if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
1043                        free_page(page);
1044                        return 0;
1045                }
1046        }
1047        error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
1048        free_page(page);
1049        return error;
1050}
1051
1052static inline int filemap_sync_pte_range(pmd_t * pmd,
1053        unsigned long address, unsigned long size, 
1054        struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
1055{
1056        pte_t * pte;
1057        unsigned long end;
1058        int error;
1059
1060        if (pmd_none(*pmd))
1061                return 0;
1062        if (pmd_bad(*pmd)) {
1063                printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
1064                pmd_clear(pmd);
1065                return 0;
1066        }
1067        pte = pte_offset(pmd, address);
1068        offset += address & PMD_MASK;
1069        address &= ~PMD_MASK;
1070        end = address + size;
1071        if (end > PMD_SIZE)
1072                end = PMD_SIZE;
1073        error = 0;
1074        do {
1075                error |= filemap_sync_pte(pte, vma, address + offset, flags);
1076                address += PAGE_SIZE;
1077                pte++;
1078        } while (address < end);
1079        return error;
1080}
1081
1082static inline int filemap_sync_pmd_range(pgd_t * pgd,
1083        unsigned long address, unsigned long size, 
1084        struct vm_area_struct *vma, unsigned int flags)
1085{
1086        pmd_t * pmd;
1087        unsigned long offset, end;
1088        int error;
1089
1090        if (pgd_none(*pgd))
1091                return 0;
1092        if (pgd_bad(*pgd)) {
1093                printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
1094                pgd_clear(pgd);
1095                return 0;
1096        }
1097        pmd = pmd_offset(pgd, address);
1098        offset = address & PGDIR_MASK;
1099        address &= ~PGDIR_MASK;
1100        end = address + size;
1101        if (end > PGDIR_SIZE)
1102                end = PGDIR_SIZE;
1103        error = 0;
1104        do {
1105                error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
1106                address = (address + PMD_SIZE) & PMD_MASK;
1107                pmd++;
1108        } while (address < end);
1109        return error;
1110}
1111
1112static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
1113        size_t size, unsigned int flags)
1114{
1115        pgd_t * dir;
1116        unsigned long end = address + size;
1117        int error = 0;
1118
1119        dir = pgd_offset(vma->vm_mm, address);
1120        flush_cache_range(vma->vm_mm, end - size, end);
1121        while (address < end) {
1122                error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
1123                address = (address + PGDIR_SIZE) & PGDIR_MASK;
1124                dir++;
1125        }
1126        flush_tlb_range(vma->vm_mm, end - size, end);
1127        return error;
1128}
1129
1130/*
1131 * This handles (potentially partial) area unmaps..
1132 */
1133static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
1134{
1135        filemap_sync(vma, start, len, MS_ASYNC);
1136}
1137
1138/*
1139 * Shared mappings need to be able to do the right thing at
1140 * close/unmap/sync. They will also use the private file as
1141 * backing-store for swapping..
1142 */
1143static struct vm_operations_struct file_shared_mmap = {
1144        NULL,                   /* no special open */
1145        NULL,                   /* no special close */
1146        filemap_unmap,          /* unmap - we need to sync the pages */
1147        NULL,                   /* no special protect */
1148        filemap_sync,           /* sync */
1149        NULL,                   /* advise */
1150        filemap_nopage,         /* nopage */
1151        NULL,                   /* wppage */
1152        filemap_swapout,        /* swapout */
1153        filemap_swapin,         /* swapin */
1154};
1155
1156/*
1157 * Private mappings just need to be able to load in the map.
1158 *
1159 * (This is actually used for shared mappings as well, if we
1160 * know they can't ever get write permissions..)
1161 */
1162static struct vm_operations_struct file_private_mmap = {
1163        NULL,                   /* open */
1164        NULL,                   /* close */
1165        NULL,                   /* unmap */
1166        NULL,                   /* protect */
1167        NULL,                   /* sync */
1168        NULL,                   /* advise */
1169        filemap_nopage,         /* nopage */
1170        NULL,                   /* wppage */
1171        NULL,                   /* swapout */
1172        NULL,                   /* swapin */
1173};
1174
1175/* This is used for a general mmap of a disk file */
1176
1177int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1178{
1179        struct vm_operations_struct * ops;
1180        struct inode *inode = file->f_dentry->d_inode;
1181
1182        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
1183                ops = &file_shared_mmap;
1184                /* share_page() can only guarantee proper page sharing if
1185                 * the offsets are all page aligned. */
1186                if (vma->vm_offset & (PAGE_SIZE - 1))
1187                        return -EINVAL;
1188        } else {
1189                ops = &file_private_mmap;
1190                if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
1191                        return -EINVAL;
1192        }
1193        if (!inode->i_sb || !S_ISREG(inode->i_mode))
1194                return -EACCES;
1195        if (!inode->i_op || !inode->i_op->readpage)
1196                return -ENOEXEC;
1197        UPDATE_ATIME(inode);
1198        vma->vm_dentry = dget(file->f_dentry);
1199        vma->vm_ops = ops;
1200        return 0;
1201}
1202
1203
1204/*
1205 * The msync() system call.
1206 */
1207
1208static int msync_interval(struct vm_area_struct * vma,
1209        unsigned long start, unsigned long end, int flags)
1210{
1211        if (!vma->vm_dentry)
1212                return 0;
1213        if (vma->vm_ops->sync) {
1214                int error;
1215                error = vma->vm_ops->sync(vma, start, end-start, flags);
1216                if (!error && (flags & MS_SYNC)) {
1217                        struct dentry * dentry = vma->vm_dentry;
1218                        if (dentry) {
1219                                struct inode * inode = dentry->d_inode;
1220                                down(&inode->i_sem);
1221                                error = file_fsync(NULL,dentry);
1222                                up(&inode->i_sem);
1223                        }
1224                }
1225                return error;
1226        }
1227        return 0;
1228}
1229
1230asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
1231{
1232        unsigned long end;
1233        struct vm_area_struct * vma;
1234        int unmapped_error, error = -EINVAL;
1235
1236        lock_kernel();
1237        if (start & ~PAGE_MASK)
1238                goto out;
1239        len = (len + ~PAGE_MASK) & PAGE_MASK;
1240        end = start + len;
1241        if (end < start)
1242                goto out;
1243        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
1244                goto out;
1245        error = 0;
1246        if (end == start)
1247                goto out;
1248        /*
1249         * If the interval [start,end) covers some unmapped address ranges,
1250         * just ignore them, but return -EFAULT at the end.
1251         */
1252        vma = find_vma(current->mm, start);
1253        unmapped_error = 0;
1254        for (;;) {
1255                /* Still start < end. */
1256                error = -EFAULT;
1257                if (!vma)
1258                        goto out;
1259                /* Here start < vma->vm_end. */
1260                if (start < vma->vm_start) {
1261                        unmapped_error = -EFAULT;
1262                        start = vma->vm_start;
1263                }
1264                /* Here vma->vm_start <= start < vma->vm_end. */
1265                if (end <= vma->vm_end) {
1266                        if (start < end) {
1267                                error = msync_interval(vma, start, end, flags);
1268                                if (error)
1269                                        goto out;
1270                        }
1271                        error = unmapped_error;
1272                        goto out;
1273                }
1274                /* Here vma->vm_start <= start < vma->vm_end < end. */
1275                error = msync_interval(vma, start, vma->vm_end, flags);
1276                if (error)
1277                        goto out;
1278                start = vma->vm_end;
1279                vma = vma->vm_next;
1280        }
1281out:
1282        unlock_kernel();
1283        return error;
1284}
1285
1286/*
1287 * Write to a file through the page cache. This is mainly for the
1288 * benefit of NFS and possibly other network-based file systems.
1289 *
1290 * We currently put everything into the page cache prior to writing it.
1291 * This is not a problem when writing full pages. With partial pages,
1292 * however, we first have to read the data into the cache, then
1293 * dirty the page, and finally schedule it for writing. Alternatively, we
1294 * could write-through just the portion of data that would go into that
1295 * page, but that would kill performance for applications that write data
1296 * line by line, and it's prone to race conditions.
1297 *
1298 * Note that this routine doesn't try to keep track of dirty pages. Each
1299 * file system has to do this all by itself, unfortunately.
1300 *                                                      okir@monad.swb.de
1301 */
1302long
1303generic_file_write(struct inode *inode, struct file *file, const char *buf, unsigned long count)
1304{
1305        struct page     *page, **hash;
1306        unsigned long   page_cache = 0;
1307        unsigned long   ppos, offset;
1308        unsigned int    bytes, written;
1309        unsigned long   pos;
1310        int             status, sync, didread;
1311
1312        if (!inode->i_op || !inode->i_op->updatepage)
1313                return -EIO;
1314
1315        sync    = file->f_flags & O_SYNC;
1316        pos     = file->f_pos;
1317        written = 0;
1318        status  = 0;
1319
1320        if (file->f_flags & O_APPEND)
1321                pos = inode->i_size;
1322
1323        while (count) {
1324                /*
1325                 * Try to find the page in the cache. If it isn't there,
1326                 * allocate a free page.
1327                 */
1328                offset = (pos & ~PAGE_MASK);
1329                ppos = pos & PAGE_MASK;
1330
1331                if ((bytes = PAGE_SIZE - offset) > count)
1332                        bytes = count;
1333
1334                hash = page_hash(inode, ppos);
1335                if (!(page = __find_page(inode, ppos, *hash))) {
1336                        if (!page_cache) {
1337                                page_cache = __get_free_page(GFP_KERNEL);
1338                                if (!page_cache) {
1339                                        status = -ENOMEM;
1340                                        break;
1341                                }
1342                                continue;
1343                        }
1344                        page = mem_map + MAP_NR(page_cache);
1345                        add_to_page_cache(page, inode, ppos, hash);
1346                        page_cache = 0;
1347                }
1348
1349                /*
1350                 * WSH 06/05/97: restructured slightly to make sure we release
1351                 * the page on an error exit.  Removed explicit setting of
1352                 * PG_locked, as that's handled below the i_op->xxx interface.
1353                 */
1354                didread = 0;
1355page_wait:
1356                wait_on_page(page);
1357
1358                /*
1359                 * If the page is not uptodate, and we're writing less
1360                 * than a full page of data, we may have to read it first.
1361                 * However, don't bother with reading the page when it's
1362                 * after the current end of file.
1363                 */
1364                if (!PageUptodate(page)) {
1365                        if (bytes < PAGE_SIZE && ppos < inode->i_size) {
1366                                if (didread < 2)
1367                                    status = inode->i_op->readpage(inode, page);
1368                                else 
1369                                    status = -EIO; /* two tries ... error out */
1370                                if (status < 0)
1371                                        goto done_with_page;
1372                                didread++;
1373                                goto page_wait;
1374                        }
1375                        set_bit(PG_uptodate, &page->flags);
1376                }
1377
1378                /* Alright, the page is there.  Now update it. */
1379                status = inode->i_op->updatepage(inode, page, buf,
1380                                                        offset, bytes, sync);
1381done_with_page:
1382                __free_page(page);
1383                if (status < 0)
1384                        break;
1385
1386                written += status;
1387                count -= status;
1388                pos += status;
1389                buf += status;
1390        }
1391        file->f_pos = pos;
1392        if (pos > inode->i_size)
1393                inode->i_size = pos;
1394
1395        if (page_cache)
1396                free_page(page_cache);
1397        if (written)
1398                return written;
1399        return status;
1400}
1401
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.