linux/fs/buffer.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
  16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/config.h>
  22#include <linux/kernel.h>
  23#include <linux/syscalls.h>
  24#include <linux/fs.h>
  25#include <linux/mm.h>
  26#include <linux/percpu.h>
  27#include <linux/slab.h>
  28#include <linux/smp_lock.h>
  29#include <linux/capability.h>
  30#include <linux/blkdev.h>
  31#include <linux/file.h>
  32#include <linux/quotaops.h>
  33#include <linux/highmem.h>
  34#include <linux/module.h>
  35#include <linux/writeback.h>
  36#include <linux/hash.h>
  37#include <linux/suspend.h>
  38#include <linux/buffer_head.h>
  39#include <linux/bio.h>
  40#include <linux/notifier.h>
  41#include <linux/cpu.h>
  42#include <linux/bitops.h>
  43#include <linux/mpage.h>
  44#include <linux/bit_spinlock.h>
  45
  46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  47static void invalidate_bh_lrus(void);
  48
  49#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  50
  51inline void
  52init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  53{
  54        bh->b_end_io = handler;
  55        bh->b_private = private;
  56}
  57
  58static int sync_buffer(void *word)
  59{
  60        struct block_device *bd;
  61        struct buffer_head *bh
  62                = container_of(word, struct buffer_head, b_state);
  63
  64        smp_mb();
  65        bd = bh->b_bdev;
  66        if (bd)
  67                blk_run_address_space(bd->bd_inode->i_mapping);
  68        io_schedule();
  69        return 0;
  70}
  71
  72void fastcall __lock_buffer(struct buffer_head *bh)
  73{
  74        wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
  75                                                        TASK_UNINTERRUPTIBLE);
  76}
  77EXPORT_SYMBOL(__lock_buffer);
  78
  79void fastcall unlock_buffer(struct buffer_head *bh)
  80{
  81        clear_buffer_locked(bh);
  82        smp_mb__after_clear_bit();
  83        wake_up_bit(&bh->b_state, BH_Lock);
  84}
  85
  86/*
  87 * Block until a buffer comes unlocked.  This doesn't stop it
  88 * from becoming locked again - you have to lock it yourself
  89 * if you want to preserve its state.
  90 */
  91void __wait_on_buffer(struct buffer_head * bh)
  92{
  93        wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
  94}
  95
  96static void
  97__clear_page_buffers(struct page *page)
  98{
  99        ClearPagePrivate(page);
 100        set_page_private(page, 0);
 101        page_cache_release(page);
 102}
 103
 104static void buffer_io_error(struct buffer_head *bh)
 105{
 106        char b[BDEVNAME_SIZE];
 107
 108        printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 109                        bdevname(bh->b_bdev, b),
 110                        (unsigned long long)bh->b_blocknr);
 111}
 112
 113/*
 114 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 115 * unlock the buffer. This is what ll_rw_block uses too.
 116 */
 117void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 118{
 119        if (uptodate) {
 120                set_buffer_uptodate(bh);
 121        } else {
 122                /* This happens, due to failed READA attempts. */
 123                clear_buffer_uptodate(bh);
 124        }
 125        unlock_buffer(bh);
 126        put_bh(bh);
 127}
 128
 129void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 130{
 131        char b[BDEVNAME_SIZE];
 132
 133        if (uptodate) {
 134                set_buffer_uptodate(bh);
 135        } else {
 136                if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 137                        buffer_io_error(bh);
 138                        printk(KERN_WARNING "lost page write due to "
 139                                        "I/O error on %s\n",
 140                                       bdevname(bh->b_bdev, b));
 141                }
 142                set_buffer_write_io_error(bh);
 143                clear_buffer_uptodate(bh);
 144        }
 145        unlock_buffer(bh);
 146        put_bh(bh);
 147}
 148
 149/*
 150 * Write out and wait upon all the dirty data associated with a block
 151 * device via its mapping.  Does not take the superblock lock.
 152 */
 153int sync_blockdev(struct block_device *bdev)
 154{
 155        int ret = 0;
 156
 157        if (bdev)
 158                ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
 159        return ret;
 160}
 161EXPORT_SYMBOL(sync_blockdev);
 162
 163static void __fsync_super(struct super_block *sb)
 164{
 165        sync_inodes_sb(sb, 0);
 166        DQUOT_SYNC(sb);
 167        lock_super(sb);
 168        if (sb->s_dirt && sb->s_op->write_super)
 169                sb->s_op->write_super(sb);
 170        unlock_super(sb);
 171        if (sb->s_op->sync_fs)
 172                sb->s_op->sync_fs(sb, 1);
 173        sync_blockdev(sb->s_bdev);
 174        sync_inodes_sb(sb, 1);
 175}
 176
 177/*
 178 * Write out and wait upon all dirty data associated with this
 179 * superblock.  Filesystem data as well as the underlying block
 180 * device.  Takes the superblock lock.
 181 */
 182int fsync_super(struct super_block *sb)
 183{
 184        __fsync_super(sb);
 185        return sync_blockdev(sb->s_bdev);
 186}
 187
 188/*
 189 * Write out and wait upon all dirty data associated with this
 190 * device.   Filesystem data as well as the underlying block
 191 * device.  Takes the superblock lock.
 192 */
 193int fsync_bdev(struct block_device *bdev)
 194{
 195        struct super_block *sb = get_super(bdev);
 196        if (sb) {
 197                int res = fsync_super(sb);
 198                drop_super(sb);
 199                return res;
 200        }
 201        return sync_blockdev(bdev);
 202}
 203
 204/**
 205 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 206 * @bdev:       blockdevice to lock
 207 *
 208 * This takes the block device bd_mount_mutex to make sure no new mounts
 209 * happen on bdev until thaw_bdev() is called.
 210 * If a superblock is found on this device, we take the s_umount semaphore
 211 * on it to make sure nobody unmounts until the snapshot creation is done.
 212 */
 213struct super_block *freeze_bdev(struct block_device *bdev)
 214{
 215        struct super_block *sb;
 216
 217        mutex_lock(&bdev->bd_mount_mutex);
 218        sb = get_super(bdev);
 219        if (sb && !(sb->s_flags & MS_RDONLY)) {
 220                sb->s_frozen = SB_FREEZE_WRITE;
 221                smp_wmb();
 222
 223                __fsync_super(sb);
 224
 225                sb->s_frozen = SB_FREEZE_TRANS;
 226                smp_wmb();
 227
 228                sync_blockdev(sb->s_bdev);
 229
 230                if (sb->s_op->write_super_lockfs)
 231                        sb->s_op->write_super_lockfs(sb);
 232        }
 233
 234        sync_blockdev(bdev);
 235        return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
 236}
 237EXPORT_SYMBOL(freeze_bdev);
 238
 239/**
 240 * thaw_bdev  -- unlock filesystem
 241 * @bdev:       blockdevice to unlock
 242 * @sb:         associated superblock
 243 *
 244 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 245 */
 246void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 247{
 248        if (sb) {
 249                BUG_ON(sb->s_bdev != bdev);
 250
 251                if (sb->s_op->unlockfs)
 252                        sb->s_op->unlockfs(sb);
 253                sb->s_frozen = SB_UNFROZEN;
 254                smp_wmb();
 255                wake_up(&sb->s_wait_unfrozen);
 256                drop_super(sb);
 257        }
 258
 259        mutex_unlock(&bdev->bd_mount_mutex);
 260}
 261EXPORT_SYMBOL(thaw_bdev);
 262
 263/*
 264 * sync everything.  Start out by waking pdflush, because that writes back
 265 * all queues in parallel.
 266 */
 267static void do_sync(unsigned long wait)
 268{
 269        wakeup_pdflush(0);
 270        sync_inodes(0);         /* All mappings, inodes and their blockdevs */
 271        DQUOT_SYNC(NULL);
 272        sync_supers();          /* Write the superblocks */
 273        sync_filesystems(0);    /* Start syncing the filesystems */
 274        sync_filesystems(wait); /* Waitingly sync the filesystems */
 275        sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
 276        if (!wait)
 277                printk("Emergency Sync complete\n");
 278        if (unlikely(laptop_mode))
 279                laptop_sync_completion();
 280}
 281
 282asmlinkage long sys_sync(void)
 283{
 284        do_sync(1);
 285        return 0;
 286}
 287
 288void emergency_sync(void)
 289{
 290        pdflush_operation(do_sync, 0);
 291}
 292
 293/*
 294 * Generic function to fsync a file.
 295 *
 296 * filp may be NULL if called via the msync of a vma.
 297 */
 298 
 299int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 300{
 301        struct inode * inode = dentry->d_inode;
 302        struct super_block * sb;
 303        int ret, err;
 304
 305        /* sync the inode to buffers */
 306        ret = write_inode_now(inode, 0);
 307
 308        /* sync the superblock to buffers */
 309        sb = inode->i_sb;
 310        lock_super(sb);
 311        if (sb->s_op->write_super)
 312                sb->s_op->write_super(sb);
 313        unlock_super(sb);
 314
 315        /* .. finally sync the buffers to disk */
 316        err = sync_blockdev(sb->s_bdev);
 317        if (!ret)
 318                ret = err;
 319        return ret;
 320}
 321
 322long do_fsync(struct file *file, int datasync)
 323{
 324        int ret;
 325        int err;
 326        struct address_space *mapping = file->f_mapping;
 327
 328        if (!file->f_op || !file->f_op->fsync) {
 329                /* Why?  We can still call filemap_fdatawrite */
 330                ret = -EINVAL;
 331                goto out;
 332        }
 333
 334        current->flags |= PF_SYNCWRITE;
 335        ret = filemap_fdatawrite(mapping);
 336
 337        /*
 338         * We need to protect against concurrent writers, which could cause
 339         * livelocks in fsync_buffers_list().
 340         */
 341        mutex_lock(&mapping->host->i_mutex);
 342        err = file->f_op->fsync(file, file->f_dentry, datasync);
 343        if (!ret)
 344                ret = err;
 345        mutex_unlock(&mapping->host->i_mutex);
 346        err = filemap_fdatawait(mapping);
 347        if (!ret)
 348                ret = err;
 349        current->flags &= ~PF_SYNCWRITE;
 350out:
 351        return ret;
 352}
 353
 354static long __do_fsync(unsigned int fd, int datasync)
 355{
 356        struct file *file;
 357        int ret = -EBADF;
 358
 359        file = fget(fd);
 360        if (file) {
 361                ret = do_fsync(file, datasync);
 362                fput(file);
 363        }
 364        return ret;
 365}
 366
 367asmlinkage long sys_fsync(unsigned int fd)
 368{
 369        return __do_fsync(fd, 0);
 370}
 371
 372asmlinkage long sys_fdatasync(unsigned int fd)
 373{
 374        return __do_fsync(fd, 1);
 375}
 376
 377/*
 378 * Various filesystems appear to want __find_get_block to be non-blocking.
 379 * But it's the page lock which protects the buffers.  To get around this,
 380 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 381 * private_lock.
 382 *
 383 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 384 * may be quite high.  This code could TryLock the page, and if that
 385 * succeeds, there is no need to take private_lock. (But if
 386 * private_lock is contended then so is mapping->tree_lock).
 387 */
 388static struct buffer_head *
 389__find_get_block_slow(struct block_device *bdev, sector_t block)
 390{
 391        struct inode *bd_inode = bdev->bd_inode;
 392        struct address_space *bd_mapping = bd_inode->i_mapping;
 393        struct buffer_head *ret = NULL;
 394        pgoff_t index;
 395        struct buffer_head *bh;
 396        struct buffer_head *head;
 397        struct page *page;
 398        int all_mapped = 1;
 399
 400        index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 401        page = find_get_page(bd_mapping, index);
 402        if (!page)
 403                goto out;
 404
 405        spin_lock(&bd_mapping->private_lock);
 406        if (!page_has_buffers(page))
 407                goto out_unlock;
 408        head = page_buffers(page);
 409        bh = head;
 410        do {
 411                if (bh->b_blocknr == block) {
 412                        ret = bh;
 413                        get_bh(bh);
 414                        goto out_unlock;
 415                }
 416                if (!buffer_mapped(bh))
 417                        all_mapped = 0;
 418                bh = bh->b_this_page;
 419        } while (bh != head);
 420
 421        /* we might be here because some of the buffers on this page are
 422         * not mapped.  This is due to various races between
 423         * file io on the block device and getblk.  It gets dealt with
 424         * elsewhere, don't buffer_error if we had some unmapped buffers
 425         */
 426        if (all_mapped) {
 427                printk("__find_get_block_slow() failed. "
 428                        "block=%llu, b_blocknr=%llu\n",
 429                        (unsigned long long)block,
 430                        (unsigned long long)bh->b_blocknr);
 431                printk("b_state=0x%08lx, b_size=%zu\n",
 432                        bh->b_state, bh->b_size);
 433                printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
 434        }
 435out_unlock:
 436        spin_unlock(&bd_mapping->private_lock);
 437        page_cache_release(page);
 438out:
 439        return ret;
 440}
 441
 442/* If invalidate_buffers() will trash dirty buffers, it means some kind
 443   of fs corruption is going on. Trashing dirty data always imply losing
 444   information that was supposed to be just stored on the physical layer
 445   by the user.
 446
  447   Thus invalidate_buffers in general usage is not allowed to trash
 448   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 449   be preserved.  These buffers are simply skipped.
 450  
 451   We also skip buffers which are still in use.  For example this can
 452   happen if a userspace program is reading the block device.
 453
 454   NOTE: In the case where the user removed a removable-media-disk even if
 455   there's still dirty data not synced on disk (due a bug in the device driver
 456   or due an error of the user), by not destroying the dirty buffers we could
 457   generate corruption also on the next media inserted, thus a parameter is
 458   necessary to handle this case in the most safe way possible (trying
 459   to not corrupt also the new disk inserted with the data belonging to
 460   the old now corrupted disk). Also for the ramdisk the natural thing
 461   to do in order to release the ramdisk memory is to destroy dirty buffers.
 462
 463   These are two special cases. Normal usage imply the device driver
 464   to issue a sync on the device (without waiting I/O completion) and
 465   then an invalidate_buffers call that doesn't trash dirty buffers.
 466
 467   For handling cache coherency with the blkdev pagecache the 'update' case
  468   has been introduced. It is needed to re-read from disk any pinned
 469   buffer. NOTE: re-reading from disk is destructive so we can do it only
 470   when we assume nobody is changing the buffercache under our I/O and when
 471   we think the disk contains more recent information than the buffercache.
 472   The update == 1 pass marks the buffers we need to update, the update == 2
 473   pass does the actual I/O. */
 474void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 475{
 476        struct address_space *mapping = bdev->bd_inode->i_mapping;
 477
 478        if (mapping->nrpages == 0)
 479                return;
 480
 481        invalidate_bh_lrus();
 482        /*
 483         * FIXME: what about destroy_dirty_buffers?
 484         * We really want to use invalidate_inode_pages2() for
 485         * that, but not until that's cleaned up.
 486         */
 487        invalidate_inode_pages(mapping);
 488}
 489
 490/*
 491 * Kick pdflush then try to free up some ZONE_NORMAL memory.
 492 */
 493static void free_more_memory(void)
 494{
 495        struct zone **zones;
 496        pg_data_t *pgdat;
 497
 498        wakeup_pdflush(1024);
 499        yield();
 500
 501        for_each_online_pgdat(pgdat) {
 502                zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
 503                if (*zones)
 504                        try_to_free_pages(zones, GFP_NOFS);
 505        }
 506}
 507
 508/*
 509 * I/O completion handler for block_read_full_page() - pages
 510 * which come unlocked at the end of I/O.
 511 */
 512static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 513{
 514        unsigned long flags;
 515        struct buffer_head *first;
 516        struct buffer_head *tmp;
 517        struct page *page;
 518        int page_uptodate = 1;
 519
 520        BUG_ON(!buffer_async_read(bh));
 521
 522        page = bh->b_page;
 523        if (uptodate) {
 524                set_buffer_uptodate(bh);
 525        } else {
 526                clear_buffer_uptodate(bh);
 527                if (printk_ratelimit())
 528                        buffer_io_error(bh);
 529                SetPageError(page);
 530        }
 531
 532        /*
 533         * Be _very_ careful from here on. Bad things can happen if
 534         * two buffer heads end IO at almost the same time and both
 535         * decide that the page is now completely done.
 536         */
 537        first = page_buffers(page);
 538        local_irq_save(flags);
 539        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 540        clear_buffer_async_read(bh);
 541        unlock_buffer(bh);
 542        tmp = bh;
 543        do {
 544                if (!buffer_uptodate(tmp))
 545                        page_uptodate = 0;
 546                if (buffer_async_read(tmp)) {
 547                        BUG_ON(!buffer_locked(tmp));
 548                        goto still_busy;
 549                }
 550                tmp = tmp->b_this_page;
 551        } while (tmp != bh);
 552        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 553        local_irq_restore(flags);
 554
 555        /*
 556         * If none of the buffers had errors and they are all
 557         * uptodate then we can set the page uptodate.
 558         */
 559        if (page_uptodate && !PageError(page))
 560                SetPageUptodate(page);
 561        unlock_page(page);
 562        return;
 563
 564still_busy:
 565        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 566        local_irq_restore(flags);
 567        return;
 568}
 569
 570/*
 571 * Completion handler for block_write_full_page() - pages which are unlocked
 572 * during I/O, and which have PageWriteback cleared upon I/O completion.
 573 */
 574void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 575{
 576        char b[BDEVNAME_SIZE];
 577        unsigned long flags;
 578        struct buffer_head *first;
 579        struct buffer_head *tmp;
 580        struct page *page;
 581
 582        BUG_ON(!buffer_async_write(bh));
 583
 584        page = bh->b_page;
 585        if (uptodate) {
 586                set_buffer_uptodate(bh);
 587        } else {
 588                if (printk_ratelimit()) {
 589                        buffer_io_error(bh);
 590                        printk(KERN_WARNING "lost page write due to "
 591                                        "I/O error on %s\n",
 592                               bdevname(bh->b_bdev, b));
 593                }
 594                set_bit(AS_EIO, &page->mapping->flags);
 595                clear_buffer_uptodate(bh);
 596                SetPageError(page);
 597        }
 598
 599        first = page_buffers(page);
 600        local_irq_save(flags);
 601        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 602
 603        clear_buffer_async_write(bh);
 604        unlock_buffer(bh);
 605        tmp = bh->b_this_page;
 606        while (tmp != bh) {
 607                if (buffer_async_write(tmp)) {
 608                        BUG_ON(!buffer_locked(tmp));
 609                        goto still_busy;
 610                }
 611                tmp = tmp->b_this_page;
 612        }
 613        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 614        local_irq_restore(flags);
 615        end_page_writeback(page);
 616        return;
 617
 618still_busy:
 619        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 620        local_irq_restore(flags);
 621        return;
 622}
 623
 624/*
 625 * If a page's buffers are under async readin (end_buffer_async_read
 626 * completion) then there is a possibility that another thread of
 627 * control could lock one of the buffers after it has completed
 628 * but while some of the other buffers have not completed.  This
 629 * locked buffer would confuse end_buffer_async_read() into not unlocking
 630 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 631 * that this buffer is not under async I/O.
 632 *
 633 * The page comes unlocked when it has no locked buffer_async buffers
 634 * left.
 635 *
  636 * PageLocked prevents anyone starting new async I/O reads against any of
 637 * the buffers.
 638 *
 639 * PageWriteback is used to prevent simultaneous writeout of the same
 640 * page.
 641 *
 642 * PageLocked prevents anyone from starting writeback of a page which is
 643 * under read I/O (PageWriteback is only ever set against a locked page).
 644 */
 645static void mark_buffer_async_read(struct buffer_head *bh)
 646{
 647        bh->b_end_io = end_buffer_async_read;
 648        set_buffer_async_read(bh);
 649}
 650
 651void mark_buffer_async_write(struct buffer_head *bh)
 652{
 653        bh->b_end_io = end_buffer_async_write;
 654        set_buffer_async_write(bh);
 655}
 656EXPORT_SYMBOL(mark_buffer_async_write);
 657
 658
 659/*
 660 * fs/buffer.c contains helper functions for buffer-backed address space's
 661 * fsync functions.  A common requirement for buffer-based filesystems is
 662 * that certain data from the backing blockdev needs to be written out for
 663 * a successful fsync().  For example, ext2 indirect blocks need to be
 664 * written back and waited upon before fsync() returns.
 665 *
 666 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 667 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 668 * management of a list of dependent buffers at ->i_mapping->private_list.
 669 *
 670 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 671 * from their controlling inode's queue when they are being freed.  But
 672 * try_to_free_buffers() will be operating against the *blockdev* mapping
 673 * at the time, not against the S_ISREG file which depends on those buffers.
 674 * So the locking for private_list is via the private_lock in the address_space
 675 * which backs the buffers.  Which is different from the address_space 
 676 * against which the buffers are listed.  So for a particular address_space,
 677 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 678 * mapping->private_list will always be protected by the backing blockdev's
 679 * ->private_lock.
 680 *
 681 * Which introduces a requirement: all buffers on an address_space's
 682 * ->private_list must be from the same address_space: the blockdev's.
 683 *
 684 * address_spaces which do not place buffers at ->private_list via these
 685 * utility functions are free to use private_lock and private_list for
 686 * whatever they want.  The only requirement is that list_empty(private_list)
 687 * be true at clear_inode() time.
 688 *
 689 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 690 * filesystems should do that.  invalidate_inode_buffers() should just go
 691 * BUG_ON(!list_empty).
 692 *
 693 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 694 * take an address_space, not an inode.  And it should be called
 695 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 696 * queued up.
 697 *
 698 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 699 * list if it is already on a list.  Because if the buffer is on a list,
 700 * it *must* already be on the right one.  If not, the filesystem is being
 701 * silly.  This will save a ton of locking.  But first we have to ensure
 702 * that buffers are taken *off* the old inode's list when they are freed
 703 * (presumably in truncate).  That requires careful auditing of all
 704 * filesystems (do it inside bforget()).  It could also be done by bringing
 705 * b_inode back.
 706 */
 707
 708/*
 709 * The buffer's backing address_space's private_lock must be held
 710 */
 711static inline void __remove_assoc_queue(struct buffer_head *bh)
 712{
 713        list_del_init(&bh->b_assoc_buffers);
 714}
 715
 716int inode_has_buffers(struct inode *inode)
 717{
 718        return !list_empty(&inode->i_data.private_list);
 719}
 720
 721/*
 722 * osync is designed to support O_SYNC io.  It waits synchronously for
 723 * all already-submitted IO to complete, but does not queue any new
 724 * writes to the disk.
 725 *
 726 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 727 * you dirty the buffers, and then use osync_inode_buffers to wait for
 728 * completion.  Any other dirty buffers which are not yet queued for
 729 * write will not be flushed to disk by the osync.
 730 */
 731static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 732{
 733        struct buffer_head *bh;
 734        struct list_head *p;
 735        int err = 0;
 736
 737        spin_lock(lock);
 738repeat:
 739        list_for_each_prev(p, list) {
 740                bh = BH_ENTRY(p);
 741                if (buffer_locked(bh)) {
 742                        get_bh(bh);
 743                        spin_unlock(lock);
 744                        wait_on_buffer(bh);
 745                        if (!buffer_uptodate(bh))
 746                                err = -EIO;
 747                        brelse(bh);
 748                        spin_lock(lock);
 749                        goto repeat;
 750                }
 751        }
 752        spin_unlock(lock);
 753        return err;
 754}
 755
 756/**
 757 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 758 *                        buffers
 759 * @mapping: the mapping which wants those buffers written
 760 *
 761 * Starts I/O against the buffers at mapping->private_list, and waits upon
 762 * that I/O.
 763 *
 764 * Basically, this is a convenience function for fsync().
 765 * @mapping is a file or directory which needs those buffers to be written for
 766 * a successful fsync().
 767 */
 768int sync_mapping_buffers(struct address_space *mapping)
 769{
 770        struct address_space *buffer_mapping = mapping->assoc_mapping;
 771
 772        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 773                return 0;
 774
 775        return fsync_buffers_list(&buffer_mapping->private_lock,
 776                                        &mapping->private_list);
 777}
 778EXPORT_SYMBOL(sync_mapping_buffers);
 779
 780/*
 781 * Called when we've recently written block `bblock', and it is known that
 782 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 783 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 784 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 785 */
 786void write_boundary_block(struct block_device *bdev,
 787                        sector_t bblock, unsigned blocksize)
 788{
 789        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 790        if (bh) {
 791                if (buffer_dirty(bh))
 792                        ll_rw_block(WRITE, 1, &bh);
 793                put_bh(bh);
 794        }
 795}
 796
 797void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 798{
 799        struct address_space *mapping = inode->i_mapping;
 800        struct address_space *buffer_mapping = bh->b_page->mapping;
 801
 802        mark_buffer_dirty(bh);
 803        if (!mapping->assoc_mapping) {
 804                mapping->assoc_mapping = buffer_mapping;
 805        } else {
 806                BUG_ON(mapping->assoc_mapping != buffer_mapping);
 807        }
 808        if (list_empty(&bh->b_assoc_buffers)) {
 809                spin_lock(&buffer_mapping->private_lock);
 810                list_move_tail(&bh->b_assoc_buffers,
 811                                &mapping->private_list);
 812                spin_unlock(&buffer_mapping->private_lock);
 813        }
 814}
 815EXPORT_SYMBOL(mark_buffer_dirty_inode);
 816
 817/*
 818 * Add a page to the dirty page list.
 819 *
 820 * It is a sad fact of life that this function is called from several places
 821 * deeply under spinlocking.  It may not sleep.
 822 *
 823 * If the page has buffers, the uptodate buffers are set dirty, to preserve
  824 * dirty-state coherency between the page and the buffers.  If the page does
 825 * not have buffers then when they are later attached they will all be set
 826 * dirty.
 827 *
 828 * The buffers are dirtied before the page is dirtied.  There's a small race
 829 * window in which a writepage caller may see the page cleanness but not the
 830 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 831 * before the buffers, a concurrent writepage caller could clear the page dirty
 832 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 833 * page on the dirty page list.
 834 *
 835 * We use private_lock to lock against try_to_free_buffers while using the
 836 * page's buffer list.  Also use this to protect against clean buffers being
 837 * added to the page after it was set dirty.
 838 *
 839 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 840 * address_space though.
 841 */
 842int __set_page_dirty_buffers(struct page *page)
 843{
 844        struct address_space * const mapping = page->mapping;
 845
 846        spin_lock(&mapping->private_lock);
 847        if (page_has_buffers(page)) {
 848                struct buffer_head *head = page_buffers(page);
 849                struct buffer_head *bh = head;
 850
 851                do {
 852                        set_buffer_dirty(bh);
 853                        bh = bh->b_this_page;
 854                } while (bh != head);
 855        }
 856        spin_unlock(&mapping->private_lock);
 857
 858        if (!TestSetPageDirty(page)) {
 859                write_lock_irq(&mapping->tree_lock);
 860                if (page->mapping) {    /* Race with truncate? */
 861                        if (mapping_cap_account_dirty(mapping))
 862                                inc_page_state(nr_dirty);
 863                        radix_tree_tag_set(&mapping->page_tree,
 864                                                page_index(page),
 865                                                PAGECACHE_TAG_DIRTY);
 866                }
 867                write_unlock_irq(&mapping->tree_lock);
 868                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 869                return 1;
 870        }
 871        return 0;
 872}
 873EXPORT_SYMBOL(__set_page_dirty_buffers);
 874
 875/*
 876 * Write out and wait upon a list of buffers.
 877 *
 878 * We have conflicting pressures: we want to make sure that all
 879 * initially dirty buffers get waited on, but that any subsequently
 880 * dirtied buffers don't.  After all, we don't want fsync to last
 881 * forever if somebody is actively writing to the file.
 882 *
 883 * Do this in two main stages: first we copy dirty buffers to a
 884 * temporary inode list, queueing the writes as we go.  Then we clean
 885 * up, waiting for those writes to complete.
 886 * 
 887 * During this second stage, any subsequent updates to the file may end
 888 * up refiling the buffer on the original inode's dirty list again, so
 889 * there is a chance we will end up with a buffer queued for write but
 890 * not yet completed on that list.  So, as a final cleanup we go through
 891 * the osync code to catch these locked, dirty buffers without requeuing
 892 * any newly dirty buffers for write.
 893 */
 894static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 895{
 896        struct buffer_head *bh;
 897        struct list_head tmp;
 898        int err = 0, err2;
 899
 900        INIT_LIST_HEAD(&tmp);
 901
 902        spin_lock(lock);
 903        while (!list_empty(list)) {
 904                bh = BH_ENTRY(list->next);
 905                list_del_init(&bh->b_assoc_buffers);
 906                if (buffer_dirty(bh) || buffer_locked(bh)) {
 907                        list_add(&bh->b_assoc_buffers, &tmp);
 908                        if (buffer_dirty(bh)) {
 909                                get_bh(bh);
 910                                spin_unlock(lock);
 911                                /*
 912                                 * Ensure any pending I/O completes so that
 913                                 * ll_rw_block() actually writes the current
 914                                 * contents - it is a noop if I/O is still in
 915                                 * flight on potentially older contents.
 916                                 */
 917                                ll_rw_block(SWRITE, 1, &bh);
 918                                brelse(bh);
 919                                spin_lock(lock);
 920                        }
 921                }
 922        }
 923
 924        while (!list_empty(&tmp)) {
 925                bh = BH_ENTRY(tmp.prev);
 926                __remove_assoc_queue(bh);
 927                get_bh(bh);
 928                spin_unlock(lock);
 929                wait_on_buffer(bh);
 930                if (!buffer_uptodate(bh))
 931                        err = -EIO;
 932                brelse(bh);
 933                spin_lock(lock);
 934        }
 935        
 936        spin_unlock(lock);
 937        err2 = osync_buffers_list(lock, list);
 938        if (err)
 939                return err;
 940        else
 941                return err2;
 942}
 943
 944/*
 945 * Invalidate any and all dirty buffers on a given inode.  We are
 946 * probably unmounting the fs, but that doesn't mean we have already
 947 * done a sync().  Just drop the buffers from the inode list.
 948 *
 949 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 950 * assumes that all the buffers are against the blockdev.  Not true
 951 * for reiserfs.
 952 */
 953void invalidate_inode_buffers(struct inode *inode)
 954{
 955        if (inode_has_buffers(inode)) {
 956                struct address_space *mapping = &inode->i_data;
 957                struct list_head *list = &mapping->private_list;
 958                struct address_space *buffer_mapping = mapping->assoc_mapping;
 959
 960                spin_lock(&buffer_mapping->private_lock);
 961                while (!list_empty(list))
 962                        __remove_assoc_queue(BH_ENTRY(list->next));
 963                spin_unlock(&buffer_mapping->private_lock);
 964        }
 965}
 966
 967/*
 968 * Remove any clean buffers from the inode's buffer list.  This is called
 969 * when we're trying to free the inode itself.  Those buffers can pin it.
 970 *
 971 * Returns true if all buffers were removed.
 972 */
 973int remove_inode_buffers(struct inode *inode)
 974{
 975        int ret = 1;
 976
 977        if (inode_has_buffers(inode)) {
 978                struct address_space *mapping = &inode->i_data;
 979                struct list_head *list = &mapping->private_list;
 980                struct address_space *buffer_mapping = mapping->assoc_mapping;
 981
 982                spin_lock(&buffer_mapping->private_lock);
 983                while (!list_empty(list)) {
 984                        struct buffer_head *bh = BH_ENTRY(list->next);
 985                        if (buffer_dirty(bh)) {
 986                                ret = 0;
 987                                break;
 988                        }
 989                        __remove_assoc_queue(bh);
 990                }
 991                spin_unlock(&buffer_mapping->private_lock);
 992        }
 993        return ret;
 994}
 995
 996/*
 997 * Create the appropriate buffers when given a page for data area and
 998 * the size of each buffer.. Use the bh->b_this_page linked list to
 999 * follow the buffers created.  Return NULL if unable to create more
1000 * buffers.
1001 *
1002 * The retry flag is used to differentiate async IO (paging, swapping)
1003 * which may not fail from ordinary buffer allocations.
1004 */
1005struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
1006                int retry)
1007{
1008        struct buffer_head *bh, *head;
1009        long offset;
1010
1011try_again:
1012        head = NULL;
1013        offset = PAGE_SIZE;
1014        while ((offset -= size) >= 0) {
1015                bh = alloc_buffer_head(GFP_NOFS);
1016                if (!bh)
1017                        goto no_grow;
1018
1019                bh->b_bdev = NULL;
1020                bh->b_this_page = head;
1021                bh->b_blocknr = -1;
1022                head = bh;
1023
1024                bh->b_state = 0;
1025                atomic_set(&bh->b_count, 0);
1026                bh->b_private = NULL;
1027                bh->b_size = size;
1028
1029                /* Link the buffer to its page */
1030                set_bh_page(bh, page, offset);
1031
1032                init_buffer(bh, NULL, NULL);
1033        }
1034        return head;
1035/*
1036 * In case anything failed, we just free everything we got.
1037 */
1038no_grow:
1039        if (head) {
1040                do {
1041                        bh = head;
1042                        head = head->b_this_page;
1043                        free_buffer_head(bh);
1044                } while (head);
1045        }
1046
1047        /*
1048         * Return failure for non-async IO requests.  Async IO requests
1049         * are not allowed to fail, so we have to wait until buffer heads
1050         * become available.  But we don't want tasks sleeping with 
1051         * partially complete buffers, so all were released above.
1052         */
1053        if (!retry)
1054                return NULL;
1055
1056        /* We're _really_ low on memory. Now we just
1057         * wait for old buffer heads to become free due to
1058         * finishing IO.  Since this is an async request and
1059         * the reserve list is empty, we're sure there are 
1060         * async buffer heads in use.
1061         */
1062        free_more_memory();
1063        goto try_again;
1064}
1065EXPORT_SYMBOL_GPL(alloc_page_buffers);
1066
1067static inline void
1068link_dev_buffers(struct page *page, struct buffer_head *head)
1069{
1070        struct buffer_head *bh, *tail;
1071
1072        bh = head;
1073        do {
1074                tail = bh;
1075                bh = bh->b_this_page;
1076        } while (bh);
1077        tail->b_this_page = head;
1078        attach_page_buffers(page, head);
1079}
1080
1081/*
1082 * Initialise the state of a blockdev page's buffers.
1083 */ 
1084static void
1085init_page_buffers(struct page *page, struct block_device *bdev,
1086                        sector_t block, int size)
1087{
1088        struct buffer_head *head = page_buffers(page);
1089        struct buffer_head *bh = head;
1090        int uptodate = PageUptodate(page);
1091
1092        do {
1093                if (!buffer_mapped(bh)) {
1094                        init_buffer(bh, NULL, NULL);
1095                        bh->b_bdev = bdev;
1096                        bh->b_blocknr = block;
1097                        if (uptodate)
1098                                set_buffer_uptodate(bh);
1099                        set_buffer_mapped(bh);
1100                }
1101                block++;
1102                bh = bh->b_this_page;
1103        } while (bh != head);
1104}
1105
1106/*
1107 * Create the page-cache page that contains the requested block.
1108 *
1109 * This is user purely for blockdev mappings.
1110 */
1111static struct page *
1112grow_dev_page(struct block_device *bdev, sector_t block,
1113                pgoff_t index, int size)
1114{
1115        struct inode *inode = bdev->bd_inode;
1116        struct page *page;
1117        struct buffer_head *bh;
1118
1119        page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1120        if (!page)
1121                return NULL;
1122
1123        BUG_ON(!PageLocked(page));
1124
1125        if (page_has_buffers(page)) {
1126                bh = page_buffers(page);
1127                if (bh->b_size == size) {
1128                        init_page_buffers(page, bdev, block, size);
1129                        return page;
1130                }
1131                if (!try_to_free_buffers(page))
1132                        goto failed;
1133        }
1134
1135        /*
1136         * Allocate some buffers for this page
1137         */
1138        bh = alloc_page_buffers(page, size, 0);
1139        if (!bh)
1140                goto failed;
1141
1142        /*
1143         * Link the page to the buffers and initialise them.  Take the
1144         * lock to be atomic wrt __find_get_block(), which does not
1145         * run under the page lock.
1146         */
1147        spin_lock(&inode->i_mapping->private_lock);
1148        link_dev_buffers(page, bh);
1149        init_page_buffers(page, bdev, block, size);
1150        spin_unlock(&inode->i_mapping->private_lock);
1151        return page;
1152
1153failed:
1154        BUG();
1155        unlock_page(page);
1156        page_cache_release(page);
1157        return NULL;
1158}
1159
1160/*
1161 * Create buffers for the specified block device block's page.  If
1162 * that page was dirty, the buffers are set dirty also.
1163 *
1164 * Except that's a bug.  Attaching dirty buffers to a dirty
1165 * blockdev's page can result in filesystem corruption, because
1166 * some of those buffers may be aliases of filesystem data.
1167 * grow_dev_page() will go BUG() if this happens.
1168 */
1169static int
1170grow_buffers(struct block_device *bdev, sector_t block, int size)
1171{
1172        struct page *page;
1173        pgoff_t index;
1174        int sizebits;
1175
1176        sizebits = -1;
1177        do {
1178                sizebits++;
1179        } while ((size << sizebits) < PAGE_SIZE);
1180
1181        index = block >> sizebits;
1182        block = index << sizebits;
1183
1184        /* Create a page with the proper size buffers.. */
1185        page = grow_dev_page(bdev, block, index, size);
1186        if (!page)
1187                return 0;
1188        unlock_page(page);
1189        page_cache_release(page);
1190        return 1;
1191}
1192
1193static struct buffer_head *
1194__getblk_slow(struct block_device *bdev, sector_t block, int size)
1195{
1196        /* Size must be multiple of hard sectorsize */
1197        if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1198                        (size < 512 || size > PAGE_SIZE))) {
1199                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1200                                        size);
1201                printk(KERN_ERR "hardsect size: %d\n",
1202                                        bdev_hardsect_size(bdev));
1203
1204                dump_stack();
1205                return NULL;
1206        }
1207
1208        for (;;) {
1209                struct buffer_head * bh;
1210
1211                bh = __find_get_block(bdev, block, size);
1212                if (bh)
1213                        return bh;
1214
1215                if (!grow_buffers(bdev, block, size))
1216                        free_more_memory();
1217        }
1218}
1219
1220/*
1221 * The relationship between dirty buffers and dirty pages:
1222 *
1223 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1224 * the page is tagged dirty in its radix tree.
1225 *
1226 * At all times, the dirtiness of the buffers represents the dirtiness of
1227 * subsections of the page.  If the page has buffers, the page dirty bit is
1228 * merely a hint about the true dirty state.
1229 *
1230 * When a page is set dirty in its entirety, all its buffers are marked dirty
1231 * (if the page has buffers).
1232 *
1233 * When a buffer is marked dirty, its page is dirtied, but the page's other
1234 * buffers are not.
1235 *
1236 * Also.  When blockdev buffers are explicitly read with bread(), they
1237 * individually become uptodate.  But their backing page remains not
1238 * uptodate - even if all of its buffers are uptodate.  A subsequent
1239 * block_read_full_page() against that page will discover all the uptodate
1240 * buffers, will set the page uptodate and will perform no I/O.
1241 */
1242
1243/**
1244 * mark_buffer_dirty - mark a buffer_head as needing writeout
1245 * @bh: the buffer_head to mark dirty
1246 *
1247 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1248 * backing page dirty, then tag the page as dirty in its address_space's radix
1249 * tree and then attach the address_space's inode to its superblock's dirty
1250 * inode list.
1251 *
1252 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1253 * mapping->tree_lock and the global inode_lock.
1254 */
1255void fastcall mark_buffer_dirty(struct buffer_head *bh)
1256{
1257        if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1258                __set_page_dirty_nobuffers(bh->b_page);
1259}
1260
1261/*
1262 * Decrement a buffer_head's reference count.  If all buffers against a page
1263 * have zero reference count, are clean and unlocked, and if the page is clean
1264 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1265 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1266 * a page but it ends up not being freed, and buffers may later be reattached).
1267 */
1268void __brelse(struct buffer_head * buf)
1269{
1270        if (atomic_read(&buf->b_count)) {
1271                put_bh(buf);
1272                return;
1273        }
1274        printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1275        WARN_ON(1);
1276}
1277
1278/*
1279 * bforget() is like brelse(), except it discards any
1280 * potentially dirty data.
1281 */
1282void __bforget(struct buffer_head *bh)
1283{
1284        clear_buffer_dirty(bh);
1285        if (!list_empty(&bh->b_assoc_buffers)) {
1286                struct address_space *buffer_mapping = bh->b_page->mapping;
1287
1288                spin_lock(&buffer_mapping->private_lock);
1289                list_del_init(&bh->b_assoc_buffers);
1290                spin_unlock(&buffer_mapping->private_lock);
1291        }
1292        __brelse(bh);
1293}
1294
1295static struct buffer_head *__bread_slow(struct buffer_head *bh)
1296{
1297        lock_buffer(bh);
1298        if (buffer_uptodate(bh)) {
1299                unlock_buffer(bh);
1300                return bh;
1301        } else {
1302                get_bh(bh);
1303                bh->b_end_io = end_buffer_read_sync;
1304                submit_bh(READ, bh);
1305                wait_on_buffer(bh);
1306                if (buffer_uptodate(bh))
1307                        return bh;
1308        }
1309        brelse(bh);
1310        return NULL;
1311}
1312
1313/*
1314 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1315 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1316 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1317 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1318 * CPU's LRUs at the same time.
1319 *
1320 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1321 * sb_find_get_block().
1322 *
1323 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1324 * a local interrupt disable for that.
1325 */
1326
1327#define BH_LRU_SIZE     8
1328
1329struct bh_lru {
1330        struct buffer_head *bhs[BH_LRU_SIZE];
1331};
1332
1333static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1334
1335#ifdef CONFIG_SMP
1336#define bh_lru_lock()   local_irq_disable()
1337#define bh_lru_unlock() local_irq_enable()
1338#else
1339#define bh_lru_lock()   preempt_disable()
1340#define bh_lru_unlock() preempt_enable()
1341#endif
1342
/*
 * Sanity check for the LRU helpers: BUG if called with interrupts
 * disabled.  Compiles to nothing where irqs_disabled is not a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1349
1350/*
1351 * The LRU management algorithm is dopey-but-simple.  Sorry.
1352 */
1353static void bh_lru_install(struct buffer_head *bh)
1354{
1355        struct buffer_head *evictee = NULL;
1356        struct bh_lru *lru;
1357
1358        check_irqs_on();
1359        bh_lru_lock();
1360        lru = &__get_cpu_var(bh_lrus);
1361        if (lru->bhs[0] != bh) {
1362                struct buffer_head *bhs[BH_LRU_SIZE];
1363                int in;
1364                int out = 0;
1365
1366                get_bh(bh);
1367                bhs[out++] = bh;
1368                for (in = 0; in < BH_LRU_SIZE; in++) {
1369                        struct buffer_head *bh2 = lru->bhs[in];
1370
1371                        if (bh2 == bh) {
1372                                __brelse(bh2);
1373                        } else {
1374                                if (out >= BH_LRU_SIZE) {
1375                                        BUG_ON(evictee != NULL);
1376                                        evictee = bh2;
1377                                } else {
1378                                        bhs[out++] = bh2;
1379                                }
1380                        }
1381                }
1382                while (out < BH_LRU_SIZE)
1383                        bhs[out++] = NULL;
1384                memcpy(lru->bhs, bhs, sizeof(bhs));
1385        }
1386        bh_lru_unlock();
1387
1388        if (evictee)
1389                __brelse(evictee);
1390}
1391
1392/*
1393 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1394 */
1395static struct buffer_head *
1396lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1397{
1398        struct buffer_head *ret = NULL;
1399        struct bh_lru *lru;
1400        int i;
1401
1402        check_irqs_on();
1403        bh_lru_lock();
1404        lru = &__get_cpu_var(bh_lrus);
1405        for (i = 0; i < BH_LRU_SIZE; i++) {
1406                struct buffer_head *bh = lru->bhs[i];
1407
1408                if (bh && bh->b_bdev == bdev &&
1409                                bh->b_blocknr == block && bh->b_size == size) {
1410                        if (i) {
1411                                while (i) {
1412                                        lru->bhs[i] = lru->bhs[i - 1];
1413                                        i--;
1414                                }
1415                                lru->bhs[0] = bh;
1416                        }
1417                        get_bh(bh);
1418                        ret = bh;
1419                        break;
1420                }
1421        }
1422        bh_lru_unlock();
1423        return ret;
1424}
1425
1426/*
1427 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1428 * it in the LRU and mark it as accessed.  If it is not present then return
1429 * NULL
1430 */
1431struct buffer_head *
1432__find_get_block(struct block_device *bdev, sector_t block, int size)
1433{
1434        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1435
1436        if (bh == NULL) {
1437                bh = __find_get_block_slow(bdev, block);
1438                if (bh)
1439                        bh_lru_install(bh);
1440        }
1441        if (bh)
1442                touch_buffer(bh);
1443        return bh;
1444}
1445EXPORT_SYMBOL(__find_get_block);
1446
1447/*
1448 * __getblk will locate (and, if necessary, create) the buffer_head
1449 * which corresponds to the passed block_device, block and size. The
1450 * returned buffer has its reference count incremented.
1451 *
1452 * __getblk() cannot fail - it just keeps trying.  If you pass it an
1453 * illegal block number, __getblk() will happily return a buffer_head
1454 * which represents the non-existent block.  Very weird.
1455 *
1456 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1457 * attempt is failing.  FIXME, perhaps?
1458 */
1459struct buffer_head *
1460__getblk(struct block_device *bdev, sector_t block, int size)
1461{
1462        struct buffer_head *bh = __find_get_block(bdev, block, size);
1463
1464        might_sleep();
1465        if (bh == NULL)
1466                bh = __getblk_slow(bdev, block, size);
1467        return bh;
1468}
1469EXPORT_SYMBOL(__getblk);
1470
1471/*
1472 * Do async read-ahead on a buffer..
1473 */
1474void __breadahead(struct block_device *bdev, sector_t block, int size)
1475{
1476        struct buffer_head *bh = __getblk(bdev, block, size);
1477        if (likely(bh)) {
1478                ll_rw_block(READA, 1, &bh);
1479                brelse(bh);
1480        }
1481}
1482EXPORT_SYMBOL(__breadahead);
1483
1484/**
1485 *  __bread() - reads a specified block and returns the bh
1486 *  @bdev: the block_device to read from
1487 *  @block: number of block
1488 *  @size: size (in bytes) to read
1489 * 
1490 *  Reads a specified block, and returns buffer head that contains it.
1491 *  It returns NULL if the block was unreadable.
1492 */
1493struct buffer_head *
1494__bread(struct block_device *bdev, sector_t block, int size)
1495{
1496        struct buffer_head *bh = __getblk(bdev, block, size);
1497
1498        if (likely(bh) && !buffer_uptodate(bh))
1499                bh = __bread_slow(bh);
1500        return bh;
1501}
1502EXPORT_SYMBOL(__bread);
1503
1504/*
1505 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1506 * This doesn't race because it runs in each cpu either in irq
1507 * or with preempt disabled.
1508 */
1509static void invalidate_bh_lru(void *arg)
1510{
1511        struct bh_lru *b = &get_cpu_var(bh_lrus);
1512        int i;
1513
1514        for (i = 0; i < BH_LRU_SIZE; i++) {
1515                brelse(b->bhs[i]);
1516                b->bhs[i] = NULL;
1517        }
1518        put_cpu_var(bh_lrus);
1519}
1520        
1521static void invalidate_bh_lrus(void)
1522{
1523        on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1524}
1525
1526void set_bh_page(struct buffer_head *bh,
1527                struct page *page, unsigned long offset)
1528{
1529        bh->b_page = page;
1530        BUG_ON(offset >= PAGE_SIZE);
1531        if (PageHighMem(page))
1532                /*
1533                 * This catches illegal uses and preserves the offset:
1534                 */
1535                bh->b_data = (char *)(0 + offset);
1536        else
1537                bh->b_data = page_address(page) + offset;
1538}
1539EXPORT_SYMBOL(set_bh_page);
1540
1541/*
1542 * Called when truncating a buffer on a page completely.
1543 */
1544static void discard_buffer(struct buffer_head * bh)
1545{
1546        lock_buffer(bh);
1547        clear_buffer_dirty(bh);
1548        bh->b_bdev = NULL;
1549        clear_buffer_mapped(bh);
1550        clear_buffer_req(bh);
1551        clear_buffer_new(bh);
1552        clear_buffer_delay(bh);
1553        unlock_buffer(bh);
1554}
1555
1556/**
1557 * try_to_release_page() - release old fs-specific metadata on a page
1558 *
1559 * @page: the page which the kernel is trying to free
1560 * @gfp_mask: memory allocation flags (and I/O mode)
1561 *
1562 * The address_space is to try to release any data against the page
1563 * (presumably at page->private).  If the release was successful, return `1'.
1564 * Otherwise return zero.
1565 *
1566 * The @gfp_mask argument specifies whether I/O may be performed to release
1567 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1568 *
1569 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1570 */
1571int try_to_release_page(struct page *page, gfp_t gfp_mask)
1572{
1573        struct address_space * const mapping = page->mapping;
1574
1575        BUG_ON(!PageLocked(page));
1576        if (PageWriteback(page))
1577                return 0;
1578        
1579        if (mapping && mapping->a_ops->releasepage)
1580                return mapping->a_ops->releasepage(page, gfp_mask);
1581        return try_to_free_buffers(page);
1582}
1583EXPORT_SYMBOL(try_to_release_page);
1584
1585/**
1586 * block_invalidatepage - invalidate part of all of a buffer-backed page
1587 *
1588 * @page: the page which is affected
1589 * @offset: the index of the truncation point
1590 *
1591 * block_invalidatepage() is called when all or part of the page has become
1592 * invalidatedby a truncate operation.
1593 *
1594 * block_invalidatepage() does not have to release all buffers, but it must
1595 * ensure that no dirty buffer is left outside @offset and that no I/O
1596 * is underway against any of the blocks which are outside the truncation
1597 * point.  Because the caller is about to free (and possibly reuse) those
1598 * blocks on-disk.
1599 */
1600void block_invalidatepage(struct page *page, unsigned long offset)
1601{
1602        struct buffer_head *head, *bh, *next;
1603        unsigned int curr_off = 0;
1604
1605        BUG_ON(!PageLocked(page));
1606        if (!page_has_buffers(page))
1607                goto out;
1608
1609        head = page_buffers(page);
1610        bh = head;
1611        do {
1612                unsigned int next_off = curr_off + bh->b_size;
1613                next = bh->b_this_page;
1614
1615                /*
1616                 * is this block fully invalidated?
1617                 */
1618                if (offset <= curr_off)
1619                        discard_buffer(bh);
1620                curr_off = next_off;
1621                bh = next;
1622        } while (bh != head);
1623
1624        /*
1625         * We release buffers only if the entire page is being invalidated.
1626         * The get_block cached value has been unconditionally invalidated,
1627         * so real IO is not possible anymore.
1628         */
1629        if (offset == 0)
1630                try_to_release_page(page, 0);
1631out:
1632        return;
1633}
1634EXPORT_SYMBOL(block_invalidatepage);
1635
1636void do_invalidatepage(struct page *page, unsigned long offset)
1637{
1638        void (*invalidatepage)(struct page *, unsigned long);
1639        invalidatepage = page->mapping->a_ops->invalidatepage ? :
1640                block_invalidatepage;
1641        (*invalidatepage)(page, offset);
1642}
1643
1644/*
1645 * We attach and possibly dirty the buffers atomically wrt
1646 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1647 * is already excluded via the page lock.
1648 */
1649void create_empty_buffers(struct page *page,
1650                        unsigned long blocksize, unsigned long b_state)
1651{
1652        struct buffer_head *bh, *head, *tail;
1653
1654        head = alloc_page_buffers(page, blocksize, 1);
1655        bh = head;
1656        do {
1657                bh->b_state |= b_state;
1658                tail = bh;
1659                bh = bh->b_this_page;
1660        } while (bh);
1661        tail->b_this_page = head;
1662
1663        spin_lock(&page->mapping->private_lock);
1664        if (PageUptodate(page) || PageDirty(page)) {
1665                bh = head;
1666                do {
1667                        if (PageDirty(page))
1668                                set_buffer_dirty(bh);
1669                        if (PageUptodate(page))
1670                                set_buffer_uptodate(bh);
1671                        bh = bh->b_this_page;
1672                } while (bh != head);
1673        }
1674        attach_page_buffers(page, head);
1675        spin_unlock(&page->mapping->private_lock);
1676}
1677EXPORT_SYMBOL(create_empty_buffers);
1678
1679/*
1680 * We are taking a block for data and we don't want any output from any
1681 * buffer-cache aliases starting from return from that function and
1682 * until the moment when something will explicitly mark the buffer
1683 * dirty (hopefully that will not happen until we will free that block ;-)
1684 * We don't even need to mark it not-uptodate - nobody can expect
1685 * anything from a newly allocated buffer anyway. We used to used
1686 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1687 * don't want to mark the alias unmapped, for example - it would confuse
1688 * anyone who might pick it with bread() afterwards...
1689 *
1690 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1691 * be writeout I/O going on against recently-freed buffers.  We don't
1692 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1693 * only if we really need to.  That happens here.
1694 */
1695void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1696{
1697        struct buffer_head *old_bh;
1698
1699        might_sleep();
1700
1701        old_bh = __find_get_block_slow(bdev, block);
1702        if (old_bh) {
1703                clear_buffer_dirty(old_bh);
1704                wait_on_buffer(old_bh);
1705                clear_buffer_req(old_bh);
1706                __brelse(old_bh);
1707        }
1708}
1709EXPORT_SYMBOL(unmap_underlying_metadata);
1710
1711/*
1712 * NOTE! All mapped/uptodate combinations are valid:
1713 *
1714 *      Mapped  Uptodate        Meaning
1715 *
1716 *      No      No              "unknown" - must do get_block()
1717 *      No      Yes             "hole" - zero-filled
1718 *      Yes     No              "allocated" - allocated on disk, not read in
1719 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1720 *
1721 * "Dirty" is valid only with the last case (mapped+uptodate).
1722 */
1723
/*
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * The function makes three passes over the page's ring of buffers:
 *   1. map dirty-but-unmapped buffers inside i_size with get_block() and
 *      clean any buffer that lies beyond i_size;
 *   2. lock every mapped buffer and tag the dirty ones for async write;
 *   3. put the page under writeback, submit the tagged buffers and unlock
 *      the page.
 *
 * @inode:     inode owning the page; supplies i_blkbits and i_size
 * @page:      page to write; must be locked on entry (BUG_ON otherwise) and
 *             is unlocked on both the normal and the recovery path
 * @get_block: filesystem callback used to map unmapped dirty buffers
 * @wbc:       writeback control; sync_mode/nonblocking select blocking vs.
 *             trylock behaviour, pages_skipped is bumped if nothing was
 *             submitted
 *
 * Returns 0, or the error returned by get_block().  On a get_block()
 * failure the buffers that are already mapped and dirty are still
 * submitted and the page is flagged with SetPageError().
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc)
{
        int err;
        sector_t block;
        sector_t last_block;
        struct buffer_head *bh, *head;
        const unsigned blocksize = 1 << inode->i_blkbits;
        int nr_underway = 0;    /* number of buffers handed to submit_bh() */

        BUG_ON(!PageLocked(page));

        /* File-relative number of the block containing the last byte */
        last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;

        if (!page_has_buffers(page)) {
                create_empty_buffers(page, blocksize,
                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
        }

        /*
         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the page stays dirty.
         *
         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
         * handle that here by just cleaning them.
         */

        /* File-relative number of the first block in this page */
        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
        head = page_buffers(page);
        bh = head;

        /*
         * Get all the dirty buffers mapped to disk addresses and
         * handle any aliases from the underlying blockdev's mapping.
         */
        do {
                if (block > last_block) {
                        /*
                         * mapped buffers outside i_size will occur, because
                         * this page can be outside i_size when there is a
                         * truncate in progress.
                         */
                        /*
                         * The buffer was zeroed by block_write_full_page()
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
                        }
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);

        /* Second pass: bh is back at head.  Lock and tag the buffers. */
        do {
                if (!buffer_mapped(bh))
                        continue;
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the page.  Note that this can
                 * potentially cause a busy-wait loop from pdflush and kswapd
                 * activity, but those code paths have their own higher-level
                 * throttling.
                 */
                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
                        lock_buffer(bh);
                } else if (test_set_buffer_locked(bh)) {
                        redirty_page_for_writepage(wbc, page);
                        continue;
                }
                /* Dirty state is only trusted now that the buffer is locked */
                if (test_clear_buffer_dirty(bh)) {
                        mark_buffer_async_write(bh);
                } else {
                        unlock_buffer(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        /*
         * The page and its buffers are protected by PageWriteback(), so we can
         * drop the bh refcounts early.
         */
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);

        /*
         * Third pass: submit.  b_this_page is sampled before submit_bh()
         * so the walk never reads the buffer after handing it to I/O.
         */
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh(WRITE, bh);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        unlock_page(page);

        err = 0;
done:
        /* Reached from both the normal and the recovery path; bh == head */
        if (nr_underway == 0) {
                /*
                 * The page was marked dirty, but the buffers were
                 * clean.  Someone wrote them back by hand with
                 * ll_rw_block/submit_bh.  A rare case.
                 */
                int uptodate = 1;
                do {
                        if (!buffer_uptodate(bh)) {
                                uptodate = 0;
                                break;
                        }
                        bh = bh->b_this_page;
                } while (bh != head);
                if (uptodate)
                        SetPageUptodate(page);
                /* No I/O completion will do this for us, so end it here */
                end_page_writeback(page);
                /*
                 * The page and buffer_heads can be released at any time from
                 * here on.
                 */
                wbc->pages_skipped++;   /* We didn't write this page */
        }
        return err;

recover:
        /*
         * ENOSPC, or some other error.  We may already have added some
         * blocks to the file, so we need to write these out to avoid
         * exposing stale data.
         * The page is currently locked and not marked for writeback
         */
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write(bh);
                } else {
                        /*
                         * The buffer may have been set dirty during
                         * attachment to a dirty page.
                         */
                        clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);
        SetPageError(page);
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);
        unlock_page(page);
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
                        submit_bh(WRITE, bh);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        /* err still holds the get_block() failure; share the common tail */
        goto done;
}
1904
1905static int __block_prepare_write(struct inode *inode, struct page *page,
1906                unsigned from, unsigned to, get_block_t *get_block)
1907{
1908        unsigned block_start, block_end;
1909        sector_t block;
1910        int err = 0;
1911        unsigned blocksize, bbits;
1912        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1913
1914        BUG_ON(!PageLocked(page));
1915        BUG_ON(from > PAGE_CACHE_SIZE);
1916        BUG_ON(to > PAGE_CACHE_SIZE);
1917        BUG_ON(from > to);
1918
1919        blocksize = 1 << inode->i_blkbits;
1920        if (!page_has_buffers(page))
1921                create_empty_buffers(page, blocksize, 0);
1922        head = page_buffers(page);
1923
1924        bbits = inode->i_blkbits;
1925        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1926
1927        for(bh = head, block_start = 0; bh != head || !block_start;
1928            block++, block_start=block_end, bh = bh->b_this_page) {
1929                block_end = block_start + blocksize;
1930                if (block_end <= from || block_start >= to) {
1931                        if (PageUptodate(page)) {
1932                                if (!buffer_uptodate(bh))
1933                                        set_buffer_uptodate(bh);
1934                        }
1935                        continue;
1936                }
1937                if (buffer_new(bh))
1938                        clear_buffer_new(bh);
1939                if (!buffer_mapped(bh)) {
1940                        WARN_ON(bh->b_size != blocksize);
1941                        err = get_block(inode, block, bh, 1);
1942                        if (err)
1943                                break;
1944                        if (buffer_new(bh)) {
1945                                unmap_underlying_metadata(bh->b_bdev,
1946                                                        bh->b_blocknr);
1947                                if (PageUptodate(page)) {
1948                                        set_buffer_uptodate(bh);
1949                                        continue;
1950                                }
1951                                if (block_end > to || block_start < from) {
1952                                        void *kaddr;
1953
1954                                        kaddr = kmap_atomic(page, KM_USER0);
1955                                        if (block_end > to)
1956                                                memset(kaddr+to, 0,
1957                                                        block_end-to);
1958                                        if (block_start < from)
1959                                                memset(kaddr+block_start,
1960                                                        0, from-block_start);
1961                                        flush_dcache_page(page);
1962                                        kunmap_atomic(kaddr, KM_USER0);
1963                                }
1964                                continue;
1965                        }
1966                }
1967                if (PageUptodate(page)) {
1968                        if (!buffer_uptodate(bh))
1969                                set_buffer_uptodate(bh);
1970                        continue; 
1971                }
1972                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1973                     (block_start < from || block_end > to)) {
1974                        ll_rw_block(READ, 1, &bh);
1975                        *wait_bh++=bh;
1976                }
1977        }
1978        /*
1979         * If we issued read requests - let them complete.
1980         */
1981        while(wait_bh > wait) {
1982                wait_on_buffer(*--wait_bh);
1983                if (!buffer_uptodate(*wait_bh))
1984                        err = -EIO;
1985        }
1986        if (!err) {
1987                bh = head;
1988                do {
1989                        if (buffer_new(bh))
1990                                clear_buffer_new(bh);
1991                } while ((bh = bh->b_this_page) != head);
1992                return 0;
1993        }
1994        /* Error case: */
1995        /*
1996         * Zero out any newly allocated blocks to avoid exposing stale
1997         * data.  If BH_New is set, we know that the block was newly
1998         * allocated in the above loop.
1999         */
2000        bh = head;
2001        block_start = 0;
2002        do {
2003                block_end = block_start+blocksize;
2004                if (block_end <= from)
2005                        goto next_bh;
2006                if (block_start >= to)
2007                        break;
2008                if (buffer_new(bh)) {
2009                        void *kaddr;
2010
2011                        clear_buffer_new(bh);
2012                        kaddr = kmap_atomic(page, KM_USER0);
2013                        memset(kaddr+block_start, 0, bh->b_size);
2014                        kunmap_atomic(kaddr, KM_USER0);
2015                        set_buffer_uptodate(bh);
2016                        mark_buffer_dirty(bh);
2017                }
2018next_bh:
2019                block_start = block_end;
2020                bh = bh->b_this_page;
2021        } while (bh != head);
2022        return err;
2023}
2024
2025static int __block_commit_write(struct inode *inode, struct page *page,
2026                unsigned from, unsigned to)
2027{
2028        unsigned block_start, block_end;
2029        int partial = 0;
2030        unsigned blocksize;
2031        struct buffer_head *bh, *head;
2032
2033        blocksize = 1 << inode->i_blkbits;
2034
2035        for(bh = head = page_buffers(page), block_start = 0;
2036            bh != head || !block_start;
2037            block_start=block_end, bh = bh->b_this_page) {
2038                block_end = block_start + blocksize;
2039                if (block_end <= from || block_start >= to) {
2040                        if (!buffer_uptodate(bh))
2041                                partial = 1;
2042                } else {
2043                        set_buffer_uptodate(bh);
2044                        mark_buffer_dirty(bh);
2045                }
2046        }
2047
2048        /*
2049         * If this is a partial write which happened to make all buffers
2050         * uptodate then we can optimize away a bogus readpage() for
2051         * the next read(). Here we 'discover' whether the page went
2052         * uptodate as a result of this (potentially partial) write.
2053         */
2054        if (!partial)
2055                SetPageUptodate(page);
2056        return 0;
2057}
2058
2059/*
2060 * Generic "read page" function for block devices that have the normal
2061 * get_block functionality. This is most of the block device filesystems.
2062 * Reads the page asynchronously --- the unlock_buffer() and
2063 * set/clear_buffer_uptodate() functions propagate buffer state into the
2064 * page struct once IO has completed.
2065 */
2066int block_read_full_page(struct page *page, get_block_t *get_block)
2067{
2068        struct inode *inode = page->mapping->host;
2069        sector_t iblock, lblock;
2070        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2071        unsigned int blocksize;
2072        int nr, i;
2073        int fully_mapped = 1;
2074
2075        BUG_ON(!PageLocked(page));
2076        blocksize = 1 << inode->i_blkbits;
2077        if (!page_has_buffers(page))
2078                create_empty_buffers(page, blocksize, 0);
2079        head = page_buffers(page);
2080
2081        iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2082        lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2083        bh = head;
2084        nr = 0;
2085        i = 0;
2086
2087        do {
2088                if (buffer_uptodate(bh))
2089                        continue;
2090
2091                if (!buffer_mapped(bh)) {
2092                        int err = 0;
2093
2094                        fully_mapped = 0;
2095                        if (iblock < lblock) {
2096                                WARN_ON(bh->b_size != blocksize);
2097                                err = get_block(inode, iblock, bh, 0);
2098                                if (err)
2099                                        SetPageError(page);
2100                        }
2101                        if (!buffer_mapped(bh)) {
2102                                void *kaddr = kmap_atomic(page, KM_USER0);
2103                                memset(kaddr + i * blocksize, 0, blocksize);
2104                                flush_dcache_page(page);
2105                                kunmap_atomic(kaddr, KM_USER0);
2106                                if (!err)
2107                                        set_buffer_uptodate(bh);
2108                                continue;
2109                        }
2110                        /*
2111                         * get_block() might have updated the buffer
2112                         * synchronously
2113                         */
2114                        if (buffer_uptodate(bh))
2115                                continue;
2116                }
2117                arr[nr++] = bh;
2118        } while (i++, iblock++, (bh = bh->b_this_page) != head);
2119
2120        if (fully_mapped)
2121                SetPageMappedToDisk(page);
2122
2123        if (!nr) {
2124                /*
2125                 * All buffers are uptodate - we can set the page uptodate
2126                 * as well. But not if get_block() returned an error.
2127                 */
2128                if (!PageError(page))
2129                        SetPageUptodate(page);
2130                unlock_page(page);
2131                return 0;
2132        }
2133
2134        /* Stage two: lock the buffers */
2135        for (i = 0; i < nr; i++) {
2136                bh = arr[i];
2137                lock_buffer(bh);
2138                mark_buffer_async_read(bh);
2139        }
2140
2141        /*
2142         * Stage 3: start the IO.  Check for uptodateness
2143         * inside the buffer lock in case another process reading
2144         * the underlying blockdev brought it uptodate (the sct fix).
2145         */
2146        for (i = 0; i < nr; i++) {
2147                bh = arr[i];
2148                if (buffer_uptodate(bh))
2149                        end_buffer_async_read(bh, 1);
2150                else
2151                        submit_bh(READ, bh);
2152        }
2153        return 0;
2154}
2155
2156/* utility function for filesystems that need to do work on expanding
2157 * truncates.  Uses prepare/commit_write to allow the filesystem to
2158 * deal with the hole.  
2159 */
2160static int __generic_cont_expand(struct inode *inode, loff_t size,
2161                                 pgoff_t index, unsigned int offset)
2162{
2163        struct address_space *mapping = inode->i_mapping;
2164        struct page *page;
2165        unsigned long limit;
2166        int err;
2167
2168        err = -EFBIG;
2169        limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2170        if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2171                send_sig(SIGXFSZ, current, 0);
2172                goto out;
2173        }
2174        if (size > inode->i_sb->s_maxbytes)
2175                goto out;
2176
2177        err = -ENOMEM;
2178        page = grab_cache_page(mapping, index);
2179        if (!page)
2180                goto out;
2181        err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2182        if (err) {
2183                /*
2184                 * ->prepare_write() may have instantiated a few blocks
2185                 * outside i_size.  Trim these off again.
2186                 */
2187                unlock_page(page);
2188                page_cache_release(page);
2189                vmtruncate(inode, inode->i_size);
2190                goto out;
2191        }
2192
2193        err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2194
2195        unlock_page(page);
2196        page_cache_release(page);
2197        if (err > 0)
2198                err = 0;
2199out:
2200        return err;
2201}
2202
2203int generic_cont_expand(struct inode *inode, loff_t size)
2204{
2205        pgoff_t index;
2206        unsigned int offset;
2207
2208        offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2209
2210        /* ugh.  in prepare/commit_write, if from==to==start of block, we
2211        ** skip the prepare.  make sure we never send an offset for the start
2212        ** of a block
2213        */
2214        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2215                /* caller must handle this extra byte. */
2216                offset++;
2217        }
2218        index = size >> PAGE_CACHE_SHIFT;
2219
2220        return __generic_cont_expand(inode, size, index, offset);
2221}
2222
2223int generic_cont_expand_simple(struct inode *inode, loff_t size)
2224{
2225        loff_t pos = size - 1;
2226        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2227        unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2228
2229        /* prepare/commit_write can handle even if from==to==start of block. */
2230        return __generic_cont_expand(inode, size, index, offset);
2231}
2232
2233/*
2234 * For moronic filesystems that do not allow holes in file.
2235 * We may have to extend the file.
2236 */
2237
2238int cont_prepare_write(struct page *page, unsigned offset,
2239                unsigned to, get_block_t *get_block, loff_t *bytes)
2240{
2241        struct address_space *mapping = page->mapping;
2242        struct inode *inode = mapping->host;
2243        struct page *new_page;
2244        pgoff_t pgpos;
2245        long status;
2246        unsigned zerofrom;
2247        unsigned blocksize = 1 << inode->i_blkbits;
2248        void *kaddr;
2249
2250        while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2251                status = -ENOMEM;
2252                new_page = grab_cache_page(mapping, pgpos);
2253                if (!new_page)
2254                        goto out;
2255                /* we might sleep */
2256                if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2257                        unlock_page(new_page);
2258                        page_cache_release(new_page);
2259                        continue;
2260                }
2261                zerofrom = *bytes & ~PAGE_CACHE_MASK;
2262                if (zerofrom & (blocksize-1)) {
2263                        *bytes |= (blocksize-1);
2264                        (*bytes)++;
2265                }
2266                status = __block_prepare_write(inode, new_page, zerofrom,
2267                                                PAGE_CACHE_SIZE, get_block);
2268                if (status)
2269                        goto out_unmap;
2270                kaddr = kmap_atomic(new_page, KM_USER0);
2271                memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2272                flush_dcache_page(new_page);
2273                kunmap_atomic(kaddr, KM_USER0);
2274                generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2275                unlock_page(new_page);
2276                page_cache_release(new_page);
2277        }
2278
2279        if (page->index < pgpos) {
2280                /* completely inside the area */
2281                zerofrom = offset;
2282        } else {
2283                /* page covers the boundary, find the boundary offset */
2284                zerofrom = *bytes & ~PAGE_CACHE_MASK;
2285
2286                /* if we will expand the thing last block will be filled */
2287                if (to > zerofrom && (zerofrom & (blocksize-1))) {
2288                        *bytes |= (blocksize-1);
2289                        (*bytes)++;
2290                }
2291
2292                /* starting below the boundary? Nothing to zero out */
2293                if (offset <= zerofrom)
2294                        zerofrom = offset;
2295        }
2296        status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2297        if (status)
2298                goto out1;
2299        if (zerofrom < offset) {
2300                kaddr = kmap_atomic(page, KM_USER0);
2301                memset(kaddr+zerofrom, 0, offset-zerofrom);
2302                flush_dcache_page(page);
2303                kunmap_atomic(kaddr, KM_USER0);
2304                __block_commit_write(inode, page, zerofrom, offset);
2305        }
2306        return 0;
2307out1:
2308        ClearPageUptodate(page);
2309        return status;
2310
2311out_unmap:
2312        ClearPageUptodate(new_page);
2313        unlock_page(new_page);
2314        page_cache_release(new_page);
2315out:
2316        return status;
2317}
2318
2319int block_prepare_write(struct page *page, unsigned from, unsigned to,
2320                        get_block_t *get_block)
2321{
2322        struct inode *inode = page->mapping->host;
2323        int err = __block_prepare_write(inode, page, from, to, get_block);
2324        if (err)
2325                ClearPageUptodate(page);
2326        return err;
2327}
2328
2329int block_commit_write(struct page *page, unsigned from, unsigned to)
2330{
2331        struct inode *inode = page->mapping->host;
2332        __block_commit_write(inode,page,from,to);
2333        return 0;
2334}
2335
2336int generic_commit_write(struct file *file, struct page *page,
2337                unsigned from, unsigned to)
2338{
2339        struct inode *inode = page->mapping->host;
2340        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2341        __block_commit_write(inode,page,from,to);
2342        /*
2343         * No need to use i_size_read() here, the i_size
2344         * cannot change under us because we hold i_mutex.
2345         */
2346        if (pos > inode->i_size) {
2347                i_size_write(inode, pos);
2348                mark_inode_dirty(inode);
2349        }
2350        return 0;
2351}
2352
2353
/*
 * I/O completion handler for the prereads issued by nobh_prepare_write().
 * Those buffer_heads are freed immediately, while still under the page
 * lock, so the handler must not touch the bh once it has been unlocked.
 *
 * Note: unlock_buffer() does sort-of touch the bh after unlocking it, but
 * that race is benign: after the unlock it only uses the bh's address for
 * hashing, so it never actually accesses the bh itself.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                /* This happens, due to failed READA attempts. */
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
}
2374
2375/*
2376 * On entry, the page is fully not uptodate.
2377 * On exit the page is fully uptodate in the areas outside (from,to)
2378 */
2379int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2380                        get_block_t *get_block)
2381{
2382        struct inode *inode = page->mapping->host;
2383        const unsigned blkbits = inode->i_blkbits;
2384        const unsigned blocksize = 1 << blkbits;
2385        struct buffer_head map_bh;
2386        struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2387        unsigned block_in_page;
2388        unsigned block_start;
2389        sector_t block_in_file;
2390        char *kaddr;
2391        int nr_reads = 0;
2392        int i;
2393        int ret = 0;
2394        int is_mapped_to_disk = 1;
2395        int dirtied_it = 0;
2396
2397        if (PageMappedToDisk(page))
2398                return 0;
2399
2400        block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2401        map_bh.b_page = page;
2402
2403        /*
2404         * We loop across all blocks in the page, whether or not they are
2405         * part of the affected region.  This is so we can discover if the
2406         * page is fully mapped-to-disk.
2407         */
2408        for (block_start = 0, block_in_page = 0;
2409                  block_start < PAGE_CACHE_SIZE;
2410                  block_in_page++, block_start += blocksize) {
2411                unsigned block_end = block_start + blocksize;
2412                int create;
2413
2414                map_bh.b_state = 0;
2415                create = 1;
2416                if (block_start >= to)
2417                        create = 0;
2418                map_bh.b_size = blocksize;
2419                ret = get_block(inode, block_in_file + block_in_page,
2420                                        &map_bh, create);
2421                if (ret)
2422                        goto failed;
2423                if (!buffer_mapped(&map_bh))
2424                        is_mapped_to_disk = 0;
2425                if (buffer_new(&map_bh))
2426                        unmap_underlying_metadata(map_bh.b_bdev,
2427                                                        map_bh.b_blocknr);
2428                if (PageUptodate(page))
2429                        continue;
2430                if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2431                        kaddr = kmap_atomic(page, KM_USER0);
2432                        if (block_start < from) {
2433                                memset(kaddr+block_start, 0, from-block_start);
2434                                dirtied_it = 1;
2435                        }
2436                        if (block_end > to) {
2437                                memset(kaddr + to, 0, block_end - to);
2438                                dirtied_it = 1;
2439                        }
2440                        flush_dcache_page(page);
2441                        kunmap_atomic(kaddr, KM_USER0);
2442                        continue;
2443                }
2444                if (buffer_uptodate(&map_bh))
2445                        continue;       /* reiserfs does this */
2446                if (block_start < from || block_end > to) {
2447                        struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2448
2449                        if (!bh) {
2450                                ret = -ENOMEM;
2451                                goto failed;
2452                        }
2453                        bh->b_state = map_bh.b_state;
2454                        atomic_set(&bh->b_count, 0);
2455                        bh->b_this_page = NULL;
2456                        bh->b_page = page;
2457                        bh->b_blocknr = map_bh.b_blocknr;
2458                        bh->b_size = blocksize;
2459                        bh->b_data = (char *)(long)block_start;
2460                        bh->b_bdev = map_bh.b_bdev;
2461                        bh->b_private = NULL;
2462                        read_bh[nr_reads++] = bh;
2463                }
2464        }
2465
2466        if (nr_reads) {
2467                struct buffer_head *bh;
2468
2469                /*
2470                 * The page is locked, so these buffers are protected from
2471                 * any VM or truncate activity.  Hence we don't need to care
2472                 * for the buffer_head refcounts.
2473                 */
2474                for (i = 0; i < nr_reads; i++) {
2475                        bh = read_bh[i];
2476                        lock_buffer(bh);
2477                        bh->b_end_io = end_buffer_read_nobh;
2478                        submit_bh(READ, bh);
2479                }
2480                for (i = 0; i < nr_reads; i++) {
2481                        bh = read_bh[i];
2482                        wait_on_buffer(bh);
2483                        if (!buffer_uptodate(bh))
2484                                ret = -EIO;
2485                        free_buffer_head(bh);
2486                        read_bh[i] = NULL;
2487                }
2488                if (ret)
2489                        goto failed;
2490        }
2491
2492        if (is_mapped_to_disk)
2493                SetPageMappedToDisk(page);
2494        SetPageUptodate(page);
2495
2496        /*
2497         * Setting the page dirty here isn't necessary for the prepare_write
2498         * function - commit_write will do that.  But if/when this function is
2499         * used within the pagefault handler to ensure that all mmapped pages
2500         * have backing space in the filesystem, we will need to dirty the page
2501         * if its contents were altered.
2502         */
2503        if (dirtied_it)
2504                set_page_dirty(page);
2505
2506        return 0;
2507
2508failed:
2509        for (i = 0; i < nr_reads; i++) {
2510                if (read_bh[i])
2511                        free_buffer_head(read_bh[i]);
2512        }
2513
2514        /*
2515         * Error recovery is pretty slack.  Clear the page and mark it dirty
2516         * so we'll later zero out any blocks which _were_ allocated.
2517         */
2518        kaddr = kmap_atomic(page, KM_USER0);
2519        memset(kaddr, 0, PAGE_CACHE_SIZE);
2520        kunmap_atomic(kaddr, KM_USER0);
2521        SetPageUptodate(page);
2522        set_page_dirty(page);
2523        return ret;
2524}
2525EXPORT_SYMBOL(nobh_prepare_write);
2526
2527int nobh_commit_write(struct file *file, struct page *page,
2528                unsigned from, unsigned to)
2529{
2530        struct inode *inode = page->mapping->host;
2531        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2532
2533        set_page_dirty(page);
2534        if (pos > inode->i_size) {
2535                i_size_write(inode, pos);
2536                mark_inode_dirty(inode);
2537        }
2538        return 0;
2539}
2540EXPORT_SYMBOL(nobh_commit_write);
2541
2542/*
2543 * nobh_writepage() - based on block_full_write_page() except
2544 * that it tries to operate without attaching bufferheads to
2545 * the page.
2546 */
2547int nobh_writepage(struct page *page, get_block_t *get_block,
2548                        struct writeback_control *wbc)
2549{
2550        struct inode * const inode = page->mapping->host;
2551        loff_t i_size = i_size_read(inode);
2552        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2553        unsigned offset;
2554        void *kaddr;
2555        int ret;
2556
2557        /* Is the page fully inside i_size? */
2558        if (page->index < end_index)
2559                goto out;
2560
2561        /* Is the page fully outside i_size? (truncate in progress) */
2562        offset = i_size & (PAGE_CACHE_SIZE-1);
2563        if (page->index >= end_index+1 || !offset) {
2564                /*
2565                 * The page may have dirty, unmapped buffers.  For example,
2566                 * they may have been added in ext3_writepage().  Make them
2567                 * freeable here, so the page does not leak.
2568                 */
2569#if 0
2570                /* Not really sure about this  - do we need this ? */
2571                if (page->mapping->a_ops->invalidatepage)
2572                        page->mapping->a_ops->invalidatepage(page, offset);
2573#endif
2574                unlock_page(page);
2575                return 0; /* don't care */
2576        }
2577
2578        /*
2579         * The page straddles i_size.  It must be zeroed out on each and every
2580         * writepage invocation because it may be mmapped.  "A file is mapped
2581         * in multiples of the page size.  For a file that is not a multiple of
2582         * the  page size, the remaining memory is zeroed when mapped, and
2583         * writes to that region are not written out to the file."
2584         */
2585        kaddr = kmap_atomic(page, KM_USER0);
2586        memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2587        flush_dcache_page(page);
2588        kunmap_atomic(kaddr, KM_USER0);
2589out:
2590        ret = mpage_writepage(page, get_block, wbc);
2591        if (ret == -EAGAIN)
2592                ret = __block_write_full_page(inode, page, get_block, wbc);
2593        return ret;
2594}
2595EXPORT_SYMBOL(nobh_writepage);
2596
2597/*
2598 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2599 */
2600int nobh_truncate_page(struct address_space *mapping, loff_t from)
2601{
2602        struct inode *inode = mapping->host;
2603        unsigned blocksize = 1 << inode->i_blkbits;
2604        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2605        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2606        unsigned to;
2607        struct page *page;
2608        struct address_space_operations *a_ops = mapping->a_ops;
2609        char *kaddr;
2610        int ret = 0;
2611
2612        if ((offset & (blocksize - 1)) == 0)
2613                goto out;
2614
2615        ret = -ENOMEM;
2616        page = grab_cache_page(mapping, index);
2617        if (!page)
2618                goto out;
2619
2620        to = (offset + blocksize) & ~(blocksize - 1);
2621        ret = a_ops->prepare_write(NULL, page, offset, to);
2622        if (ret == 0) {
2623                kaddr = kmap_atomic(page, KM_USER0);
2624                memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2625                flush_dcache_page(page);
2626                kunmap_atomic(kaddr, KM_USER0);
2627                set_page_dirty(page);
2628        }
2629        unlock_page(page);
2630        page_cache_release(page);
2631out:
2632        return ret;
2633}
2634EXPORT_SYMBOL(nobh_truncate_page);
2635
2636int block_truncate_page(struct address_space *mapping,
2637                        loff_t from, get_block_t *get_block)
2638{
2639        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2640        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2641        unsigned blocksize;
2642        sector_t iblock;
2643        unsigned length, pos;
2644        struct inode *inode = mapping->host;
2645        struct page *page;
2646        struct buffer_head *bh;
2647        void *kaddr;
2648        int err;
2649
2650        blocksize = 1 << inode->i_blkbits;
2651        length = offset & (blocksize - 1);
2652
2653        /* Block boundary? Nothing to do */
2654        if (!length)
2655                return 0;
2656
2657        length = blocksize - length;
2658        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2659        
2660        page = grab_cache_page(mapping, index);
2661        err = -ENOMEM;
2662        if (!page)
2663                goto out;
2664
2665        if (!page_has_buffers(page))
2666                create_empty_buffers(page, blocksize, 0);
2667
2668        /* Find the buffer that contains "offset" */
2669        bh = page_buffers(page);
2670        pos = blocksize;
2671        while (offset >= pos) {
2672                bh = bh->b_this_page;
2673                iblock++;
2674                pos += blocksize;
2675        }
2676
2677        err = 0;
2678        if (!buffer_mapped(bh)) {
2679                WARN_ON(bh->b_size != blocksize);
2680                err = get_block(inode, iblock, bh, 0);
2681                if (err)
2682                        goto unlock;
2683                /* unmapped? It's a hole - nothing to do */
2684                if (!buffer_mapped(bh))
2685                        goto unlock;
2686        }
2687
2688        /* Ok, it's mapped. Make sure it's up-to-date */
2689        if (PageUptodate(page))
2690                set_buffer_uptodate(bh);
2691
2692        if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2693                err = -EIO;
2694                ll_rw_block(READ, 1, &bh);
2695                wait_on_buffer(bh);
2696                /* Uhhuh. Read error. Complain and punt. */
2697                if (!buffer_uptodate(bh))
2698                        goto unlock;
2699        }
2700
2701        kaddr = kmap_atomic(page, KM_USER0);
2702        memset(kaddr + offset, 0, length);
2703        flush_dcache_page(page);
2704        kunmap_atomic(kaddr, KM_USER0);
2705
2706        mark_buffer_dirty(bh);
2707        err = 0;
2708
2709unlock:
2710        unlock_page(page);
2711        page_cache_release(page);
2712out:
2713        return err;
2714}
2715
2716/*
2717 * The generic ->writepage function for buffer-backed address_spaces
2718 */
2719int block_write_full_page(struct page *page, get_block_t *get_block,
2720                        struct writeback_control *wbc)
2721{
2722        struct inode * const inode = page->mapping->host;
2723        loff_t i_size = i_size_read(inode);
2724        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2725        unsigned offset;
2726        void *kaddr;
2727
2728        /* Is the page fully inside i_size? */
2729        if (page->index < end_index)
2730                return __block_write_full_page(inode, page, get_block, wbc);
2731
2732        /* Is the page fully outside i_size? (truncate in progress) */
2733        offset = i_size & (PAGE_CACHE_SIZE-1);
2734        if (page->index >= end_index+1 || !offset) {
2735                /*
2736                 * The page may have dirty, unmapped buffers.  For example,
2737                 * they may have been added in ext3_writepage().  Make them
2738                 * freeable here, so the page does not leak.
2739                 */
2740                do_invalidatepage(page, 0);
2741                unlock_page(page);
2742                return 0; /* don't care */
2743        }
2744
2745        /*
2746         * The page straddles i_size.  It must be zeroed out on each and every
2747         * writepage invokation because it may be mmapped.  "A file is mapped
2748         * in multiples of the page size.  For a file that is not a multiple of
2749         * the  page size, the remaining memory is zeroed when mapped, and
2750         * writes to that region are not written out to the file."
2751         */
2752        kaddr = kmap_atomic(page, KM_USER0);
2753        memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2754        flush_dcache_page(page);
2755        kunmap_atomic(kaddr, KM_USER0);
2756        return __block_write_full_page(inode, page, get_block, wbc);
2757}
2758
2759sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2760                            get_block_t *get_block)
2761{
2762        struct buffer_head tmp;
2763        struct inode *inode = mapping->host;
2764        tmp.b_state = 0;
2765        tmp.b_blocknr = 0;
2766        tmp.b_size = 1 << inode->i_blkbits;
2767        get_block(inode, block, &tmp, 0);
2768        return tmp.b_blocknr;
2769}
2770
2771static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2772{
2773        struct buffer_head *bh = bio->bi_private;
2774
2775        if (bio->bi_size)
2776                return 1;
2777
2778        if (err == -EOPNOTSUPP) {
2779                set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2780                set_bit(BH_Eopnotsupp, &bh->b_state);
2781        }
2782
2783        bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2784        bio_put(bio);
2785        return 0;
2786}
2787
2788int submit_bh(int rw, struct buffer_head * bh)
2789{
2790        struct bio *bio;
2791        int ret = 0;
2792
2793        BUG_ON(!buffer_locked(bh));
2794        BUG_ON(!buffer_mapped(bh));
2795        BUG_ON(!bh->b_end_io);
2796
2797        if (buffer_ordered(bh) && (rw == WRITE))
2798                rw = WRITE_BARRIER;
2799
2800        /*
2801         * Only clear out a write error when rewriting, should this
2802         * include WRITE_SYNC as well?
2803         */
2804        if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2805                clear_buffer_write_io_error(bh);
2806
2807        /*
2808         * from here on down, it's all bio -- do the initial mapping,
2809         * submit_bio -> generic_make_request may further map this bio around
2810         */
2811        bio = bio_alloc(GFP_NOIO, 1);
2812
2813        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2814        bio->bi_bdev = bh->b_bdev;
2815        bio->bi_io_vec[0].bv_page = bh->b_page;
2816        bio->bi_io_vec[0].bv_len = bh->b_size;
2817        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2818
2819        bio->bi_vcnt = 1;
2820        bio->bi_idx = 0;
2821        bio->bi_size = bh->b_size;
2822
2823        bio->bi_end_io = end_bio_bh_io_sync;
2824        bio->bi_private = bh;
2825
2826        bio_get(bio);
2827        submit_bio(rw, bio);
2828
2829        if (bio_flagged(bio, BIO_EOPNOTSUPP))
2830                ret = -EOPNOTSUPP;
2831
2832        bio_put(bio);
2833        return ret;
2834}
2835
2836/**
2837 * ll_rw_block: low-level access to block devices (DEPRECATED)
2838 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2839 * @nr: number of &struct buffer_heads in the array
2840 * @bhs: array of pointers to &struct buffer_head
2841 *
2842 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2843 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2844 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2845 * are sent to disk. The fourth %READA option is described in the documentation
2846 * for generic_make_request() which ll_rw_block() calls.
2847 *
2848 * This function drops any buffer that it cannot get a lock on (with the
2849 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2850 * clean when doing a write request, and any buffer that appears to be
2851 * up-to-date when doing read request.  Further it marks as clean buffers that
2852 * are processed for writing (the buffer cache won't assume that they are
2853 * actually clean until the buffer gets unlocked).
2854 *
2855 * ll_rw_block sets b_end_io to simple completion handler that marks
2856 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2857 * any waiters. 
2858 *
2859 * All of the buffers must be for the same device, and must also be a
2860 * multiple of the current approved size for the device.
2861 */
2862void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2863{
2864        int i;
2865
2866        for (i = 0; i < nr; i++) {
2867                struct buffer_head *bh = bhs[i];
2868
2869                if (rw == SWRITE)
2870                        lock_buffer(bh);
2871                else if (test_set_buffer_locked(bh))
2872                        continue;
2873
2874                if (rw == WRITE || rw == SWRITE) {
2875                        if (test_clear_buffer_dirty(bh)) {
2876                                bh->b_end_io = end_buffer_write_sync;
2877                                get_bh(bh);
2878                                submit_bh(WRITE, bh);
2879                                continue;
2880                        }
2881                } else {
2882                        if (!buffer_uptodate(bh)) {
2883                                bh->b_end_io = end_buffer_read_sync;
2884                                get_bh(bh);
2885                                submit_bh(rw, bh);
2886                                continue;
2887                        }
2888                }
2889                unlock_buffer(bh);
2890        }
2891}
2892
2893/*
2894 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2895 * and then start new I/O and then wait upon it.  The caller must have a ref on
2896 * the buffer_head.
2897 */
2898int sync_dirty_buffer(struct buffer_head *bh)
2899{
2900        int ret = 0;
2901
2902        WARN_ON(atomic_read(&bh->b_count) < 1);
2903        lock_buffer(bh);
2904        if (test_clear_buffer_dirty(bh)) {
2905                get_bh(bh);
2906                bh->b_end_io = end_buffer_write_sync;
2907                ret = submit_bh(WRITE, bh);
2908                wait_on_buffer(bh);
2909                if (buffer_eopnotsupp(bh)) {
2910                        clear_buffer_eopnotsupp(bh);
2911                        ret = -EOPNOTSUPP;
2912                }
2913                if (!ret && !buffer_uptodate(bh))
2914                        ret = -EIO;
2915        } else {
2916                unlock_buffer(bh);
2917        }
2918        return ret;
2919}
2920
2921/*
2922 * try_to_free_buffers() checks if all the buffers on this particular page
2923 * are unused, and releases them if so.
2924 *
2925 * Exclusion against try_to_free_buffers may be obtained by either
2926 * locking the page or by holding its mapping's private_lock.
2927 *
2928 * If the page is dirty but all the buffers are clean then we need to
2929 * be sure to mark the page clean as well.  This is because the page
2930 * may be against a block device, and a later reattachment of buffers
2931 * to a dirty page will set *all* buffers dirty.  Which would corrupt
2932 * filesystem data on the same device.
2933 *
2934 * The same applies to regular filesystem pages: if all the buffers are
2935 * clean then we set the page clean and proceed.  To do that, we require
2936 * total exclusion from __set_page_dirty_buffers().  That is obtained with
2937 * private_lock.
2938 *
2939 * try_to_free_buffers() is non-blocking.
2940 */
2941static inline int buffer_busy(struct buffer_head *bh)
2942{
2943        return atomic_read(&bh->b_count) |
2944                (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2945}
2946
2947static int
2948drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2949{
2950        struct buffer_head *head = page_buffers(page);
2951        struct buffer_head *bh;
2952
2953        bh = head;
2954        do {
2955                if (buffer_write_io_error(bh) && page->mapping)
2956                        set_bit(AS_EIO, &page->mapping->flags);
2957                if (buffer_busy(bh))
2958                        goto failed;
2959                bh = bh->b_this_page;
2960        } while (bh != head);
2961
2962        do {
2963                struct buffer_head *next = bh->b_this_page;
2964
2965                if (!list_empty(&bh->b_assoc_buffers))
2966                        __remove_assoc_queue(bh);
2967                bh = next;
2968        } while (bh != head);
2969        *buffers_to_free = head;
2970        __clear_page_buffers(page);
2971        return 1;
2972failed:
2973        return 0;
2974}
2975
2976int try_to_free_buffers(struct page *page)
2977{
2978        struct address_space * const mapping = page->mapping;
2979        struct buffer_head *buffers_to_free = NULL;
2980        int ret = 0;
2981
2982        BUG_ON(!PageLocked(page));
2983        if (PageWriteback(page))
2984                return 0;
2985
2986        if (mapping == NULL) {          /* can this still happen? */
2987                ret = drop_buffers(page, &buffers_to_free);
2988                goto out;
2989        }
2990
2991        spin_lock(&mapping->private_lock);
2992        ret = drop_buffers(page, &buffers_to_free);
2993        if (ret) {
2994                /*
2995                 * If the filesystem writes its buffers by hand (eg ext3)
2996                 * then we can have clean buffers against a dirty page.  We
2997                 * clean the page here; otherwise later reattachment of buffers
2998                 * could encounter a non-uptodate page, which is unresolvable.
2999                 * This only applies in the rare case where try_to_free_buffers
3000                 * succeeds but the page is not freed.
3001                 */
3002                clear_page_dirty(page);
3003        }
3004        spin_unlock(&mapping->private_lock);
3005out:
3006        if (buffers_to_free) {
3007                struct buffer_head *bh = buffers_to_free;
3008
3009                do {
3010                        struct buffer_head *next = bh->b_this_page;
3011                        free_buffer_head(bh);
3012                        bh = next;
3013                } while (bh != buffers_to_free);
3014        }
3015        return ret;
3016}
3017EXPORT_SYMBOL(try_to_free_buffers);
3018
3019void block_sync_page(struct page *page)
3020{
3021        struct address_space *mapping;
3022
3023        smp_mb();
3024        mapping = page_mapping(page);
3025        if (mapping)
3026                blk_run_backing_dev(mapping->backing_dev_info, page);
3027}
3028
3029/*
3030 * There are no bdflush tunables left.  But distributions are
3031 * still running obsolete flush daemons, so we terminate them here.
3032 *
3033 * Use of bdflush() is deprecated and will be removed in a future kernel.
3034 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3035 */
3036asmlinkage long sys_bdflush(int func, long data)
3037{
3038        static int msg_count;
3039
3040        if (!capable(CAP_SYS_ADMIN))
3041                return -EPERM;
3042
3043        if (msg_count < 5) {
3044                msg_count++;
3045                printk(KERN_INFO
3046                        "warning: process `%s' used the obsolete bdflush"
3047                        " system call\n", current->comm);
3048                printk(KERN_INFO "Fix your initscripts?\n");
3049        }
3050
3051        if (func == 1)
3052                do_exit(0);
3053        return 0;
3054}
3055
3056/*
3057 * Buffer-head allocation
3058 */
3059static kmem_cache_t *bh_cachep;
3060
3061/*
3062 * Once the number of bh's in the machine exceeds this level, we start
3063 * stripping them in writeback.
3064 */
3065static int max_buffer_heads;
3066
3067int buffer_heads_over_limit;
3068
3069struct bh_accounting {
3070        int nr;                 /* Number of live bh's */
3071        int ratelimit;          /* Limit cacheline bouncing */
3072};
3073
3074static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3075
3076static void recalc_bh_state(void)
3077{
3078        int i;
3079        int tot = 0;
3080
3081        if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3082                return;
3083        __get_cpu_var(bh_accounting).ratelimit = 0;
3084        for_each_online_cpu(i)
3085                tot += per_cpu(bh_accounting, i).nr;
3086        buffer_heads_over_limit = (tot > max_buffer_heads);
3087}
3088        
3089struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3090{
3091        struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3092        if (ret) {
3093                get_cpu_var(bh_accounting).nr++;
3094                recalc_bh_state();
3095                put_cpu_var(bh_accounting);
3096        }
3097        return ret;
3098}
3099EXPORT_SYMBOL(alloc_buffer_head);
3100
3101void free_buffer_head(struct buffer_head *bh)
3102{
3103        BUG_ON(!list_empty(&bh->b_assoc_buffers));
3104        kmem_cache_free(bh_cachep, bh);
3105        get_cpu_var(bh_accounting).nr--;
3106        recalc_bh_state();
3107        put_cpu_var(bh_accounting);
3108}
3109EXPORT_SYMBOL(free_buffer_head);
3110
3111static void
3112init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3113{
3114        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3115                            SLAB_CTOR_CONSTRUCTOR) {
3116                struct buffer_head * bh = (struct buffer_head *)data;
3117
3118                memset(bh, 0, sizeof(*bh));
3119                INIT_LIST_HEAD(&bh->b_assoc_buffers);
3120        }
3121}
3122
3123#ifdef CONFIG_HOTPLUG_CPU
3124static void buffer_exit_cpu(int cpu)
3125{
3126        int i;
3127        struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3128
3129        for (i = 0; i < BH_LRU_SIZE; i++) {
3130                brelse(b->bhs[i]);
3131                b->bhs[i] = NULL;
3132        }
3133        get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3134        per_cpu(bh_accounting, cpu).nr = 0;
3135        put_cpu_var(bh_accounting);
3136}
3137
3138static int buffer_cpu_notify(struct notifier_block *self,
3139                              unsigned long action, void *hcpu)
3140{
3141        if (action == CPU_DEAD)
3142                buffer_exit_cpu((unsigned long)hcpu);
3143        return NOTIFY_OK;
3144}
3145#endif /* CONFIG_HOTPLUG_CPU */
3146
3147void __init buffer_init(void)
3148{
3149        int nrpages;
3150
3151        bh_cachep = kmem_cache_create("buffer_head",
3152                                        sizeof(struct buffer_head), 0,
3153                                        (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3154                                        SLAB_MEM_SPREAD),
3155                                        init_buffer_head,
3156                                        NULL);
3157
3158        /*
3159         * Limit the bh occupancy to 10% of ZONE_NORMAL
3160         */
3161        nrpages = (nr_free_buffer_pages() * 10) / 100;
3162        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3163        hotcpu_notifier(buffer_cpu_notify, 0);
3164}
3165
/*
 * Symbols exported from this file, kept in alphabetical order.  A few other
 * functions (e.g. the nobh_* helpers, try_to_free_buffers and
 * alloc/free_buffer_head) carry their EXPORT_SYMBOL() directly after their
 * definition instead of being listed here.
 */
3166EXPORT_SYMBOL(__bforget);
3167EXPORT_SYMBOL(__brelse);
3168EXPORT_SYMBOL(__wait_on_buffer);
3169EXPORT_SYMBOL(block_commit_write);
3170EXPORT_SYMBOL(block_prepare_write);
3171EXPORT_SYMBOL(block_read_full_page);
3172EXPORT_SYMBOL(block_sync_page);
3173EXPORT_SYMBOL(block_truncate_page);
3174EXPORT_SYMBOL(block_write_full_page);
3175EXPORT_SYMBOL(cont_prepare_write);
3176EXPORT_SYMBOL(end_buffer_async_write);
3177EXPORT_SYMBOL(end_buffer_read_sync);
3178EXPORT_SYMBOL(end_buffer_write_sync);
3179EXPORT_SYMBOL(file_fsync);
3180EXPORT_SYMBOL(fsync_bdev);
3181EXPORT_SYMBOL(generic_block_bmap);
3182EXPORT_SYMBOL(generic_commit_write);
3183EXPORT_SYMBOL(generic_cont_expand);
3184EXPORT_SYMBOL(generic_cont_expand_simple);
3185EXPORT_SYMBOL(init_buffer);
3186EXPORT_SYMBOL(invalidate_bdev);
3187EXPORT_SYMBOL(ll_rw_block);
3188EXPORT_SYMBOL(mark_buffer_dirty);
3189EXPORT_SYMBOL(submit_bh);
3190EXPORT_SYMBOL(sync_dirty_buffer);
3191EXPORT_SYMBOL(unlock_buffer);
3192
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.