linux-bk/fs/buffer.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
  16 * Added 32k buffer block sizes - these are required by older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/config.h>
  22#include <linux/kernel.h>
  23#include <linux/fs.h>
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/smp_lock.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/iobuf.h>
  32#include <linux/module.h>
  33#include <linux/writeback.h>
  34#include <linux/mempool.h>
  35#include <linux/hash.h>
  36#include <linux/suspend.h>
  37#include <linux/buffer_head.h>
  38#include <linux/bio.h>
  39#include <asm/bitops.h>
  40
  41static void invalidate_bh_lrus(void);
  42
  43#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  44
  45/*
  46 * Hashed waitqueue_head's for wait_on_buffer()
  47 */
  48#define BH_WAIT_TABLE_ORDER     7
  49static struct bh_wait_queue_head {
  50        wait_queue_head_t wqh;
  51} ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
  52
  53/*
  54 * Debug/devel support stuff
  55 */
  56
  57void __buffer_error(char *file, int line)
  58{
  59        static int enough;
  60
  61        if (enough > 10)
  62                return;
  63        enough++;
  64        printk("buffer layer error at %s:%d\n", file, line);
  65        printk("Pass this trace through ksymoops for reporting\n");
  66        dump_stack();
  67}
  68EXPORT_SYMBOL(__buffer_error);
  69
  70inline void
  71init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  72{
  73        bh->b_end_io = handler;
  74        bh->b_private = private;
  75}
  76
  77/*
  78 * Return the address of the waitqueue_head to be used for this
  79 * buffer_head
  80 */
  81static wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
  82{
  83        return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
  84}
  85
  86/*
  87 * Wait on a buffer until someone does a wakeup on it.  Needs
  88 * lots of external locking.  ext3 uses this.  Fix it.
  89 */
  90void sleep_on_buffer(struct buffer_head *bh)
  91{
  92        wait_queue_head_t *wq = bh_waitq_head(bh);
  93        sleep_on(wq);
  94}
  95EXPORT_SYMBOL(sleep_on_buffer);
  96
  97void wake_up_buffer(struct buffer_head *bh)
  98{
  99        wait_queue_head_t *wq = bh_waitq_head(bh);
 100
 101        if (waitqueue_active(wq))
 102                wake_up_all(wq);
 103}
 104EXPORT_SYMBOL(wake_up_buffer);
 105
 106void unlock_buffer(struct buffer_head *bh)
 107{
 108        /*
 109         * unlock_buffer against a zero-count bh is a bug, if the page
 110         * is not locked.  Because then nothing protects the buffer's
 111         * waitqueue, which is used here. (Well.  Other locked buffers
 112         * against the page will pin it.  But complain anyway).
 113         */
 114        if (atomic_read(&bh->b_count) == 0 &&
 115                        !PageLocked(bh->b_page) &&
 116                        !PageWriteback(bh->b_page))
 117                buffer_error();
 118
 119        clear_buffer_locked(bh);
 120        smp_mb__after_clear_bit();
 121        wake_up_buffer(bh);
 122}
 123
 124/*
 125 * Block until a buffer comes unlocked.  This doesn't stop it
 126 * from becoming locked again - you have to lock it yourself
 127 * if you want to preserve its state.
 128 */
 129void __wait_on_buffer(struct buffer_head * bh)
 130{
 131        wait_queue_head_t *wq = bh_waitq_head(bh);
 132        struct task_struct *tsk = current;
 133        DECLARE_WAITQUEUE(wait, tsk);
 134
 135        get_bh(bh);
 136        add_wait_queue(wq, &wait);
 137        do {
 138                blk_run_queues();
 139                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 140                if (!buffer_locked(bh))
 141                        break;
 142                schedule();
 143        } while (buffer_locked(bh));
 144        tsk->state = TASK_RUNNING;
 145        remove_wait_queue(wq, &wait);
 146        put_bh(bh);
 147}
 148
/*
 * Attach the buffer_head list starting at @head to @page: take a
 * page-cache reference with page_cache_get(), set PagePrivate and store
 * the head pointer in page->private.
 *
 * A page which already has buffers is reported through buffer_error(),
 * but the attach is carried out regardless.
 */
static inline void
__set_page_buffers(struct page *page, struct buffer_head *head)
{
        if (page_has_buffers(page))
                buffer_error();
        page_cache_get(page);
        SetPagePrivate(page);
        page->private = (unsigned long)head;
}
 158
/*
 * Undo __set_page_buffers(): clear PagePrivate, zero page->private and
 * release the page-cache reference with page_cache_release().
 */
static inline void
__clear_page_buffers(struct page *page)
{
        ClearPagePrivate(page);
        page->private = 0;
        page_cache_release(page);
}
 166
 167static void buffer_io_error(struct buffer_head *bh)
 168{
 169        printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 170                        bdevname(bh->b_bdev),
 171                        (unsigned long long)bh->b_blocknr);
 172}
 173
 174/*
 175 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 176 * unlock the buffer. This is what ll_rw_block uses too.
 177 */
 178void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 179{
 180        if (uptodate) {
 181                set_buffer_uptodate(bh);
 182        } else {
 183                /*
 184                 * This happens, due to failed READA attempts.
 185                 * buffer_io_error(bh);
 186                 */
 187                clear_buffer_uptodate(bh);
 188        }
 189        unlock_buffer(bh);
 190        put_bh(bh);
 191}
 192
 193/*
 194 * Write out and wait upon all the dirty data associated with a block
 195 * device via its mapping.  Does not take the superblock lock.
 196 */
 197int sync_blockdev(struct block_device *bdev)
 198{
 199        int ret = 0;
 200
 201        if (bdev) {
 202                int err;
 203
 204                ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
 205                err = filemap_fdatawait(bdev->bd_inode->i_mapping);
 206                if (!ret)
 207                        ret = err;
 208        }
 209        return ret;
 210}
 211EXPORT_SYMBOL(sync_blockdev);
 212
 213/*
 214 * Write out and wait upon all dirty data associated with this
 215 * superblock.  Filesystem data as well as the underlying block
 216 * device.  Takes the superblock lock.
 217 */
 218int fsync_super(struct super_block *sb)
 219{
 220        sync_inodes_sb(sb, 0);
 221        DQUOT_SYNC(sb);
 222        lock_super(sb);
 223        if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
 224                sb->s_op->write_super(sb);
 225        unlock_super(sb);
 226        sync_blockdev(sb->s_bdev);
 227        sync_inodes_sb(sb, 1);
 228
 229        return sync_blockdev(sb->s_bdev);
 230}
 231
 232/*
 233 * Write out and wait upon all dirty data associated with this
 234 * device.   Filesystem data as well as the underlying block
 235 * device.  Takes the superblock lock.
 236 */
 237int fsync_bdev(struct block_device *bdev)
 238{
 239        struct super_block *sb = get_super(bdev);
 240        if (sb) {
 241                int res = fsync_super(sb);
 242                drop_super(sb);
 243                return res;
 244        }
 245        return sync_blockdev(bdev);
 246}
 247
/*
 * sync everything.
 *
 * The sync(2) system call: inodes are synced twice (first with argument
 * 0, then with argument 1), with quota and superblock write-out in
 * between.  Always returns 0; no error from the individual steps is
 * reported.
 */
asmlinkage long sys_sync(void)
{
        sync_inodes(0); /* All mappings and inodes, including block devices */
        DQUOT_SYNC(NULL);
        sync_supers();  /* Write the superblocks */
        sync_inodes(1); /* All the mappings and inodes, again. */
        return 0;
}
 259
 260/*
 261 * Generic function to fsync a file.
 262 *
 263 * filp may be NULL if called via the msync of a vma.
 264 */
 265 
 266int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 267{
 268        struct inode * inode = dentry->d_inode;
 269        struct super_block * sb;
 270        int ret;
 271
 272        /* sync the inode to buffers */
 273        write_inode_now(inode, 0);
 274
 275        /* sync the superblock to buffers */
 276        sb = inode->i_sb;
 277        lock_super(sb);
 278        if (sb->s_op && sb->s_op->write_super)
 279                sb->s_op->write_super(sb);
 280        unlock_super(sb);
 281
 282        /* .. finally sync the buffers to disk */
 283        ret = sync_blockdev(sb->s_bdev);
 284        return ret;
 285}
 286
 287asmlinkage long sys_fsync(unsigned int fd)
 288{
 289        struct file * file;
 290        struct dentry * dentry;
 291        struct inode * inode;
 292        int ret, err;
 293
 294        ret = -EBADF;
 295        file = fget(fd);
 296        if (!file)
 297                goto out;
 298
 299        dentry = file->f_dentry;
 300        inode = dentry->d_inode;
 301
 302        ret = -EINVAL;
 303        if (!file->f_op || !file->f_op->fsync) {
 304                /* Why?  We can still call filemap_fdatawrite */
 305                goto out_putf;
 306        }
 307
 308        /* We need to protect against concurrent writers.. */
 309        down(&inode->i_sem);
 310        ret = filemap_fdatawrite(inode->i_mapping);
 311        err = file->f_op->fsync(file, dentry, 0);
 312        if (!ret)
 313                ret = err;
 314        err = filemap_fdatawait(inode->i_mapping);
 315        if (!ret)
 316                ret = err;
 317        up(&inode->i_sem);
 318
 319out_putf:
 320        fput(file);
 321out:
 322        return ret;
 323}
 324
 325asmlinkage long sys_fdatasync(unsigned int fd)
 326{
 327        struct file * file;
 328        struct dentry * dentry;
 329        struct inode * inode;
 330        int ret, err;
 331
 332        ret = -EBADF;
 333        file = fget(fd);
 334        if (!file)
 335                goto out;
 336
 337        dentry = file->f_dentry;
 338        inode = dentry->d_inode;
 339
 340        ret = -EINVAL;
 341        if (!file->f_op || !file->f_op->fsync)
 342                goto out_putf;
 343
 344        down(&inode->i_sem);
 345        ret = filemap_fdatawrite(inode->i_mapping);
 346        err = file->f_op->fsync(file, dentry, 1);
 347        if (!ret)
 348                ret = err;
 349        err = filemap_fdatawait(inode->i_mapping);
 350        if (!ret)
 351                ret = err;
 352        up(&inode->i_sem);
 353
 354out_putf:
 355        fput(file);
 356out:
 357        return ret;
 358}
 359
 360/*
 361 * Various filesystems appear to want __find_get_block to be non-blocking.
 362 * But it's the page lock which protects the buffers.  To get around this,
 363 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 364 * private_lock.
 365 *
 366 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 367 * may be quite high.  This code could TryLock the page, and if that
 368 * succeeds, there is no need to take private_lock. (But if
 369 * private_lock is contended then so is mapping->page_lock).
 370 */
 371struct buffer_head *
 372__find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
 373{
 374        struct inode *bd_inode = bdev->bd_inode;
 375        struct address_space *bd_mapping = bd_inode->i_mapping;
 376        struct buffer_head *ret = NULL;
 377        unsigned long index;
 378        struct buffer_head *bh;
 379        struct buffer_head *head;
 380        struct page *page;
 381
 382        index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 383        page = find_get_page(bd_mapping, index);
 384        if (!page)
 385                goto out;
 386
 387        spin_lock(&bd_mapping->private_lock);
 388        if (!page_has_buffers(page))
 389                goto out_unlock;
 390        head = page_buffers(page);
 391        bh = head;
 392        do {
 393                if (bh->b_blocknr == block) {
 394                        ret = bh;
 395                        get_bh(bh);
 396                        goto out_unlock;
 397                }
 398                bh = bh->b_this_page;
 399        } while (bh != head);
 400        buffer_error();
 401out_unlock:
 402        spin_unlock(&bd_mapping->private_lock);
 403        page_cache_release(page);
 404out:
 405        return ret;
 406}
 407
 408/* If invalidate_buffers() will trash dirty buffers, it means some kind
 409   of fs corruption is going on. Trashing dirty data always imply losing
 410   information that was supposed to be just stored on the physical layer
 411   by the user.
 412
 413   Thus invalidate_buffers in general usage is not allwowed to trash
 414   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 415   be preserved.  These buffers are simply skipped.
 416  
 417   We also skip buffers which are still in use.  For example this can
 418   happen if a userspace program is reading the block device.
 419
 420   NOTE: In the case where the user removed a removable-media-disk even if
 421   there's still dirty data not synced on disk (due a bug in the device driver
 422   or due an error of the user), by not destroying the dirty buffers we could
 423   generate corruption also on the next media inserted, thus a parameter is
 424   necessary to handle this case in the most safe way possible (trying
 425   to not corrupt also the new disk inserted with the data belonging to
 426   the old now corrupted disk). Also for the ramdisk the natural thing
 427   to do in order to release the ramdisk memory is to destroy dirty buffers.
 428
 429   These are two special cases. Normal usage imply the device driver
 430   to issue a sync on the device (without waiting I/O completion) and
 431   then an invalidate_buffers call that doesn't trash dirty buffers.
 432
 433   For handling cache coherency with the blkdev pagecache the 'update' case
 434   is been introduced. It is needed to re-read from disk any pinned
 435   buffer. NOTE: re-reading from disk is destructive so we can do it only
 436   when we assume nobody is changing the buffercache under our I/O and when
 437   we think the disk contains more recent information than the buffercache.
 438   The update == 1 pass marks the buffers we need to update, the update == 2
 439   pass does the actual I/O. */
 440void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 441{
 442        invalidate_bh_lrus();
 443        /*
 444         * FIXME: what about destroy_dirty_buffers?
 445         * We really want to use invalidate_inode_pages2() for
 446         * that, but not until that's cleaned up.
 447         */
 448        invalidate_inode_pages(bdev->bd_inode);
 449}
 450
 451void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 452{
 453        struct block_device *bdev = bdget(kdev_t_to_nr(dev));
 454        if (bdev) {
 455                invalidate_bdev(bdev, destroy_dirty_buffers);
 456                bdput(bdev);
 457        }
 458}
 459
 460/*
 461 * FIXME: What is this function actually trying to do?  Why "zones[0]"?
 462 * Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
 463 */
 464static void free_more_memory(void)
 465{
 466        struct zone *zone;
 467
 468        zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
 469
 470        wakeup_bdflush();
 471        try_to_free_pages(zone, GFP_NOFS, 0);
 472        blk_run_queues();
 473        yield();
 474}
 475
 476/*
 477 * I/O completion handler for block_read_full_page() - pages
 478 * which come unlocked at the end of I/O.
 479 */
 480static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 481{
 482        static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 483        unsigned long flags;
 484        struct buffer_head *tmp;
 485        struct page *page;
 486        int page_uptodate = 1;
 487
 488        BUG_ON(!buffer_async_read(bh));
 489
 490        page = bh->b_page;
 491        if (uptodate) {
 492                set_buffer_uptodate(bh);
 493        } else {
 494                clear_buffer_uptodate(bh);
 495                buffer_io_error(bh);
 496                SetPageError(page);
 497        }
 498
 499        /*
 500         * Be _very_ careful from here on. Bad things can happen if
 501         * two buffer heads end IO at almost the same time and both
 502         * decide that the page is now completely done.
 503         */
 504        spin_lock_irqsave(&page_uptodate_lock, flags);
 505        clear_buffer_async_read(bh);
 506        unlock_buffer(bh);
 507        tmp = bh;
 508        do {
 509                if (!buffer_uptodate(tmp))
 510                        page_uptodate = 0;
 511                if (buffer_async_read(tmp)) {
 512                        BUG_ON(!buffer_locked(tmp));
 513                        goto still_busy;
 514                }
 515                tmp = tmp->b_this_page;
 516        } while (tmp != bh);
 517        spin_unlock_irqrestore(&page_uptodate_lock, flags);
 518
 519        /*
 520         * If none of the buffers had errors and they are all
 521         * uptodate then we can set the page uptodate.
 522         */
 523        if (page_uptodate && !PageError(page))
 524                SetPageUptodate(page);
 525        unlock_page(page);
 526        return;
 527
 528still_busy:
 529        spin_unlock_irqrestore(&page_uptodate_lock, flags);
 530        return;
 531}
 532
 533/*
 534 * Completion handler for block_write_full_page() - pages which are unlocked
 535 * during I/O, and which have PageWriteback cleared upon I/O completion.
 536 */
 537static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 538{
 539        static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 540        unsigned long flags;
 541        struct buffer_head *tmp;
 542        struct page *page;
 543
 544        BUG_ON(!buffer_async_write(bh));
 545
 546        page = bh->b_page;
 547        if (uptodate) {
 548                set_buffer_uptodate(bh);
 549        } else {
 550                buffer_io_error(bh);
 551                clear_buffer_uptodate(bh);
 552                SetPageError(page);
 553        }
 554
 555        spin_lock_irqsave(&page_uptodate_lock, flags);
 556        clear_buffer_async_write(bh);
 557        unlock_buffer(bh);
 558        tmp = bh->b_this_page;
 559        while (tmp != bh) {
 560                if (buffer_async_write(tmp)) {
 561                        BUG_ON(!buffer_locked(tmp));
 562                        goto still_busy;
 563                }
 564                tmp = tmp->b_this_page;
 565        }
 566        spin_unlock_irqrestore(&page_uptodate_lock, flags);
 567        end_page_writeback(page);
 568        return;
 569
 570still_busy:
 571        spin_unlock_irqrestore(&page_uptodate_lock, flags);
 572        return;
 573}
 574
/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async read I/O against
 * any of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */

/*
 * Prepare @bh for async read-in: install end_buffer_async_read() as its
 * completion handler and set its async-read flag.
 */
inline void mark_buffer_async_read(struct buffer_head *bh)
{
        bh->b_end_io = end_buffer_async_read;
        set_buffer_async_read(bh);
}
EXPORT_SYMBOL(mark_buffer_async_read);
 602
/*
 * Prepare @bh for async write-out: install end_buffer_async_write() as
 * its completion handler and set its async-write flag.
 */
inline void mark_buffer_async_write(struct buffer_head *bh)
{
        bh->b_end_io = end_buffer_async_write;
        set_buffer_async_write(bh);
}
EXPORT_SYMBOL(mark_buffer_async_write);
 609
 610
 611/*
 612 * fs/buffer.c contains helper functions for buffer-backed address space's
 613 * fsync functions.  A common requirement for buffer-based filesystems is
 614 * that certain data from the backing blockdev needs to be written out for
 615 * a successful fsync().  For example, ext2 indirect blocks need to be
 616 * written back and waited upon before fsync() returns.
 617 *
 618 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
 619 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 620 * management of a list of dependent buffers at ->i_mapping->private_list.
 621 *
 622 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 623 * from their controlling inode's queue when they are being freed.  But
 624 * try_to_free_buffers() will be operating against the *blockdev* mapping
 625 * at the time, not against the S_ISREG file which depends on those buffers.
 626 * So the locking for private_list is via the private_lock in the address_space
 627 * which backs the buffers.  Which is different from the address_space 
 628 * against which the buffers are listed.  So for a particular address_space,
 629 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 630 * mapping->private_list will always be protected by the backing blockdev's
 631 * ->private_lock.
 632 *
 633 * Which introduces a requirement: all buffers on an address_space's
 634 * ->private_list must be from the same address_space: the blockdev's.
 635 *
 636 * address_spaces which do not place buffers at ->private_list via these
 637 * utility functions are free to use private_lock and private_list for
 638 * whatever they want.  The only requirement is that list_empty(private_list)
 639 * be true at clear_inode() time.
 640 *
 641 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 642 * filesystems should do that.  invalidate_inode_buffers() should just go
 643 * BUG_ON(!list_empty).
 644 *
 645 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 646 * take an address_space, not an inode.  And it should be called
 647 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 648 * queued up.
 649 *
 650 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 651 * list if it is already on a list.  Because if the buffer is on a list,
 652 * it *must* already be on the right one.  If not, the filesystem is being
 653 * silly.  This will save a ton of locking.  But first we have to ensure
 654 * that buffers are taken *off* the old inode's list when they are freed
 655 * (presumably in truncate).  That requires careful auditing of all
 656 * filesystems (do it inside bforget()).  It could also be done by bringing
 657 * b_inode back.
 658 */
 659
 660void buffer_insert_list(spinlock_t *lock,
 661                struct buffer_head *bh, struct list_head *list)
 662{
 663        spin_lock(lock);
 664        list_del(&bh->b_assoc_buffers);
 665        list_add(&bh->b_assoc_buffers, list);
 666        spin_unlock(lock);
 667}
 668
/*
 * Unlink @bh from whatever list its b_assoc_buffers link is on and
 * re-initialise the link (so a later list_empty() test on it is true).
 *
 * The buffer's backing address_space's private_lock must be held
 */
static inline void __remove_assoc_queue(struct buffer_head *bh)
{
        list_del_init(&bh->b_assoc_buffers);
}
 676
 677int inode_has_buffers(struct inode *inode)
 678{
 679        return !list_empty(&inode->i_mapping->private_list);
 680}
 681
 682/*
 683 * osync is designed to support O_SYNC io.  It waits synchronously for
 684 * all already-submitted IO to complete, but does not queue any new
 685 * writes to the disk.
 686 *
 687 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 688 * you dirty the buffers, and then use osync_inode_buffers to wait for
 689 * completion.  Any other dirty buffers which are not yet queued for
 690 * write will not be flushed to disk by the osync.
 691 */
 692static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 693{
 694        struct buffer_head *bh;
 695        struct list_head *p;
 696        int err = 0;
 697
 698        spin_lock(lock);
 699repeat:
 700        list_for_each_prev(p, list) {
 701                bh = BH_ENTRY(p);
 702                if (buffer_locked(bh)) {
 703                        get_bh(bh);
 704                        spin_unlock(lock);
 705                        wait_on_buffer(bh);
 706                        if (!buffer_uptodate(bh))
 707                                err = -EIO;
 708                        brelse(bh);
 709                        spin_lock(lock);
 710                        goto repeat;
 711                }
 712        }
 713        spin_unlock(lock);
 714        return err;
 715}
 716
 717/**
 718 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 719 *                        buffers
 720 * @buffer_mapping - the mapping which backs the buffers' data
 721 * @mapping - the mapping which wants those buffers written
 722 *
 723 * Starts I/O against the buffers at mapping->private_list, and waits upon
 724 * that I/O.
 725 *
 726 * Basically, this is a convenience function for fsync().  @buffer_mapping is
 727 * the blockdev which "owns" the buffers and @mapping is a file or directory
 728 * which needs those buffers to be written for a successful fsync().
 729 */
 730int sync_mapping_buffers(struct address_space *mapping)
 731{
 732        struct address_space *buffer_mapping = mapping->assoc_mapping;
 733
 734        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 735                return 0;
 736
 737        return fsync_buffers_list(&buffer_mapping->private_lock,
 738                                        &mapping->private_list);
 739}
 740EXPORT_SYMBOL(sync_mapping_buffers);
 741
 742/**
 743 * write_mapping_buffers - Start writeout of a mapping's "associated" buffers.
 744 * @mapping - the mapping which wants those buffers written.
 745 *
 746 * Starts I/O against dirty buffers which are on @mapping->private_list.
 747 * Those buffers must be backed by @mapping->assoc_mapping.
 748 *
 749 * The private_list buffers generally contain filesystem indirect blocks.
 750 * The idea is that the filesystem can start I/O against the indirects at
 751 * the same time as running generic_writepages(), so the indirect's
 752 * I/O will be merged with the data.
 753 *
 754 * We sneakliy write the buffers in probable tail-to-head order.  This is
 755 * because generic_writepages() writes in probable head-to-tail
 756 * order.  If the file is so huge that the data or the indirects overflow
 757 * the request queue we will at least get some merging this way.
 758 *
 759 * Any clean+unlocked buffers are de-listed.  clean/locked buffers must be
 760 * left on the list for an fsync() to wait on.
 761 *
 762 * Couldn't think of a smart way of avoiding livelock, so chose the dumb
 763 * way instead.
 764 *
 765 * FIXME: duplicates fsync_inode_buffers() functionality a bit.
 766 */
 767int write_mapping_buffers(struct address_space *mapping)
 768{
 769        spinlock_t *lock;
 770        struct address_space *buffer_mapping;
 771        unsigned nr_to_write;   /* livelock avoidance */
 772        struct list_head *lh;
 773        int ret = 0;
 774
 775        if (list_empty(&mapping->private_list))
 776                goto out;
 777
 778        buffer_mapping = mapping->assoc_mapping;
 779        lock = &buffer_mapping->private_lock;
 780        spin_lock(lock);
 781        nr_to_write = 0;
 782        lh = mapping->private_list.next;
 783        while (lh != &mapping->private_list) {
 784                lh = lh->next;
 785                nr_to_write++;
 786        }
 787        nr_to_write *= 2;       /* Allow for some late additions */
 788
 789        while (nr_to_write-- && !list_empty(&mapping->private_list)) {
 790                struct buffer_head *bh;
 791
 792                bh = BH_ENTRY(mapping->private_list.prev);
 793                list_del_init(&bh->b_assoc_buffers);
 794                if (!buffer_dirty(bh) && !buffer_locked(bh))
 795                        continue;
 796                /* Stick it on the far end of the list. Order is preserved. */
 797                list_add(&bh->b_assoc_buffers, &mapping->private_list);
 798                if (test_set_buffer_locked(bh))
 799                        continue;
 800                get_bh(bh);
 801                spin_unlock(lock);
 802                if (test_clear_buffer_dirty(bh)) {
 803                        bh->b_end_io = end_buffer_io_sync;
 804                        submit_bh(WRITE, bh);
 805                } else {
 806                        unlock_buffer(bh);
 807                        put_bh(bh);
 808                }
 809                spin_lock(lock);
 810        }
 811        spin_unlock(lock);
 812out:
 813        return ret;
 814}
 815EXPORT_SYMBOL(write_mapping_buffers);
 816
 817void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 818{
 819        struct address_space *mapping = inode->i_mapping;
 820        struct address_space *buffer_mapping = bh->b_page->mapping;
 821
 822        mark_buffer_dirty(bh);
 823        if (!mapping->assoc_mapping) {
 824                mapping->assoc_mapping = buffer_mapping;
 825        } else {
 826                if (mapping->assoc_mapping != buffer_mapping)
 827                        BUG();
 828        }
 829        if (list_empty(&bh->b_assoc_buffers))
 830                buffer_insert_list(&buffer_mapping->private_lock,
 831                                bh, &mapping->private_list);
 832}
 833EXPORT_SYMBOL(mark_buffer_dirty_inode);
 834
 835/*
 836 * Write out and wait upon a list of buffers.
 837 *
 838 * We have conflicting pressures: we want to make sure that all
 839 * initially dirty buffers get waited on, but that any subsequently
 840 * dirtied buffers don't.  After all, we don't want fsync to last
 841 * forever if somebody is actively writing to the file.
 842 *
 843 * Do this in two main stages: first we copy dirty buffers to a
 844 * temporary inode list, queueing the writes as we go.  Then we clean
 845 * up, waiting for those writes to complete.
 846 * 
 847 * During this second stage, any subsequent updates to the file may end
 848 * up refiling the buffer on the original inode's dirty list again, so
 849 * there is a chance we will end up with a buffer queued for write but
 850 * not yet completed on that list.  So, as a final cleanup we go through
 851 * the osync code to catch these locked, dirty buffers without requeuing
 852 * any newly dirty buffers for write.
 853 */
 854int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 855{
 856        struct buffer_head *bh;
 857        struct list_head tmp;
 858        int err = 0, err2;
 859
 860        INIT_LIST_HEAD(&tmp);
 861
 862        spin_lock(lock);
 863        while (!list_empty(list)) {
 864                bh = BH_ENTRY(list->next);
 865                list_del_init(&bh->b_assoc_buffers);
 866                if (buffer_dirty(bh) || buffer_locked(bh)) {
 867                        list_add(&bh->b_assoc_buffers, &tmp);
 868                        if (buffer_dirty(bh)) {
 869                                get_bh(bh);
 870                                spin_unlock(lock);
 871                                ll_rw_block(WRITE, 1, &bh);
 872                                brelse(bh);
 873                                spin_lock(lock);
 874                        }
 875                }
 876        }
 877
 878        while (!list_empty(&tmp)) {
 879                bh = BH_ENTRY(tmp.prev);
 880                __remove_assoc_queue(bh);
 881                get_bh(bh);
 882                spin_unlock(lock);
 883                wait_on_buffer(bh);
 884                if (!buffer_uptodate(bh))
 885                        err = -EIO;
 886                brelse(bh);
 887                spin_lock(lock);
 888        }
 889        
 890        spin_unlock(lock);
 891        err2 = osync_buffers_list(lock, list);
 892        if (err)
 893                return err;
 894        else
 895                return err2;
 896}
 897
 898/*
 899 * Invalidate any and all dirty buffers on a given inode.  We are
 900 * probably unmounting the fs, but that doesn't mean we have already
 901 * done a sync().  Just drop the buffers from the inode list.
 902 *
 903 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 904 * assumes that all the buffers are against the blockdev.  Not true
 905 * for reiserfs.
 906 */
 907void invalidate_inode_buffers(struct inode *inode)
 908{
 909        if (inode_has_buffers(inode)) {
 910                struct address_space *mapping = inode->i_mapping;
 911                struct list_head *list = &mapping->private_list;
 912                struct address_space *buffer_mapping = mapping->assoc_mapping;
 913
 914                spin_lock(&buffer_mapping->private_lock);
 915                while (!list_empty(list))
 916                        __remove_assoc_queue(BH_ENTRY(list->next));
 917                spin_unlock(&buffer_mapping->private_lock);
 918        }
 919}
 920
 921/*
 922 * Create the appropriate buffers when given a page for data area and
 923 * the size of each buffer.. Use the bh->b_this_page linked list to
 924 * follow the buffers created.  Return NULL if unable to create more
 925 * buffers.
 926 *
 927 * The retry flag is used to differentiate async IO (paging, swapping)
 928 * which may not fail from ordinary buffer allocations.
 929 */
 930static struct buffer_head *
 931create_buffers(struct page * page, unsigned long size, int retry)
 932{
 933        struct buffer_head *bh, *head;
 934        long offset;
 935
 936try_again:
 937        head = NULL;
 938        offset = PAGE_SIZE;
 939        while ((offset -= size) >= 0) {
 940                int pf_flags = current->flags;
 941
 942                current->flags |= PF_NOWARN;
 943                bh = alloc_buffer_head();
 944                current->flags = pf_flags;
 945                if (!bh)
 946                        goto no_grow;
 947
 948                bh->b_bdev = NULL;
 949                bh->b_this_page = head;
 950                bh->b_blocknr = -1;
 951                head = bh;
 952
 953                bh->b_state = 0;
 954                atomic_set(&bh->b_count, 0);
 955                bh->b_size = size;
 956
 957                /* Link the buffer to its page */
 958                set_bh_page(bh, page, offset);
 959
 960                bh->b_end_io = NULL;
 961        }
 962        return head;
 963/*
 964 * In case anything failed, we just free everything we got.
 965 */
 966no_grow:
 967        if (head) {
 968                do {
 969                        bh = head;
 970                        head = head->b_this_page;
 971                        free_buffer_head(bh);
 972                } while (head);
 973        }
 974
 975        /*
 976         * Return failure for non-async IO requests.  Async IO requests
 977         * are not allowed to fail, so we have to wait until buffer heads
 978         * become available.  But we don't want tasks sleeping with 
 979         * partially complete buffers, so all were released above.
 980         */
 981        if (!retry)
 982                return NULL;
 983
 984        /* We're _really_ low on memory. Now we just
 985         * wait for old buffer heads to become free due to
 986         * finishing IO.  Since this is an async request and
 987         * the reserve list is empty, we're sure there are 
 988         * async buffer heads in use.
 989         */
 990        blk_run_queues();
 991
 992        free_more_memory();
 993        goto try_again;
 994}
 995
 996static inline void
 997link_dev_buffers(struct page *page, struct buffer_head *head)
 998{
 999        struct buffer_head *bh, *tail;
1000
1001        bh = head;
1002        do {
1003                tail = bh;
1004                bh = bh->b_this_page;
1005        } while (bh);
1006        tail->b_this_page = head;
1007        __set_page_buffers(page, head);
1008}
1009
1010/*
1011 * Initialise the state of a blockdev page's buffers.
1012 */ 
1013static /*inline*/ void
1014init_page_buffers(struct page *page, struct block_device *bdev,
1015                        int block, int size)
1016{
1017        struct buffer_head *head = page_buffers(page);
1018        struct buffer_head *bh = head;
1019        unsigned int b_state;
1020
1021        b_state = 1 << BH_Mapped;
1022        if (PageUptodate(page))
1023                b_state |= 1 << BH_Uptodate;
1024
1025        do {
1026                if (!(bh->b_state & (1 << BH_Mapped))) {
1027                        init_buffer(bh, NULL, NULL);
1028                        bh->b_bdev = bdev;
1029                        bh->b_blocknr = block;
1030                        bh->b_state = b_state;
1031                }
1032                block++;
1033                bh = bh->b_this_page;
1034        } while (bh != head);
1035}
1036
1037/*
1038 * Create the page-cache page that contains the requested block.
1039 *
1040 * This is user purely for blockdev mappings.
1041 */
1042static /*inline*/ struct page *
1043grow_dev_page(struct block_device *bdev, unsigned long block,
1044                        unsigned long index, int size)
1045{
1046        struct inode *inode = bdev->bd_inode;
1047        struct page *page;
1048        struct buffer_head *bh;
1049
1050        page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1051        if (!page)
1052                return NULL;
1053
1054        if (!PageLocked(page))
1055                BUG();
1056
1057        if (page_has_buffers(page)) {
1058                bh = page_buffers(page);
1059                if (bh->b_size == size)
1060                        return page;
1061                if (!try_to_free_buffers(page))
1062                        goto failed;
1063        }
1064
1065        /*
1066         * Allocate some buffers for this page
1067         */
1068        bh = create_buffers(page, size, 0);
1069        if (!bh)
1070                goto failed;
1071
1072        /*
1073         * Link the page to the buffers and initialise them.  Take the
1074         * lock to be atomic wrt __find_get_block(), which does not
1075         * run under the page lock.
1076         */
1077        spin_lock(&inode->i_mapping->private_lock);
1078        link_dev_buffers(page, bh);
1079        init_page_buffers(page, bdev, block, size);
1080        spin_unlock(&inode->i_mapping->private_lock);
1081        return page;
1082
1083failed:
1084        buffer_error();
1085        unlock_page(page);
1086        page_cache_release(page);
1087        return NULL;
1088}
1089
1090/*
1091 * Create buffers for the specified block device block's page.  If
1092 * that page was dirty, the buffers are set dirty also.
1093 *
1094 * Except that's a bug.  Attaching dirty buffers to a dirty
1095 * blockdev's page can result in filesystem corruption, because
1096 * some of those buffers may be aliases of filesystem data.
1097 * grow_dev_page() will go BUG() if this happens.
1098 */
1099static inline int
1100grow_buffers(struct block_device *bdev, unsigned long block, int size)
1101{
1102        struct page *page;
1103        unsigned long index;
1104        int sizebits;
1105
1106        /* Size must be multiple of hard sectorsize */
1107        if (size & (bdev_hardsect_size(bdev)-1))
1108                BUG();
1109        if (size < 512 || size > PAGE_SIZE)
1110                BUG();
1111
1112        sizebits = -1;
1113        do {
1114                sizebits++;
1115        } while ((size << sizebits) < PAGE_SIZE);
1116
1117        index = block >> sizebits;
1118        block = index << sizebits;
1119
1120        /* Create a page with the proper size buffers.. */
1121        page = grow_dev_page(bdev, block, index, size);
1122        if (!page)
1123                return 0;
1124        unlock_page(page);
1125        page_cache_release(page);
1126        return 1;
1127}
1128
1129/*
1130 * __getblk will locate (and, if necessary, create) the buffer_head
1131 * which corresponds to the passed block_device, block and size. The
1132 * returned buffer has its reference count incremented.
1133 *
1134 * __getblk() cannot fail - it just keeps trying.  If you pass it an
1135 * illegal block number, __getblk() will happily return a buffer_head
1136 * which represents the non-existent block.  Very weird.
1137 *
1138 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1139 * attempt is failing.  FIXME, perhaps?
1140 */
1141struct buffer_head *
1142__getblk_slow(struct block_device *bdev, sector_t block, int size)
1143{
1144        for (;;) {
1145                struct buffer_head * bh;
1146
1147                bh = __find_get_block(bdev, block, size);
1148                if (bh) {
1149                        touch_buffer(bh);
1150                        return bh;
1151                }
1152
1153                if (!grow_buffers(bdev, block, size))
1154                        free_more_memory();
1155        }
1156}
1157
1158/*
1159 * The relationship between dirty buffers and dirty pages:
1160 *
1161 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1162 * the page appears on its address_space.dirty_pages list.
1163 *
1164 * At all times, the dirtiness of the buffers represents the dirtiness of
1165 * subsections of the page.  If the page has buffers, the page dirty bit is
1166 * merely a hint about the true dirty state.
1167 *
1168 * When a page is set dirty in its entirety, all its buffers are marked dirty
1169 * (if the page has buffers).
1170 *
1171 * When a buffer is marked dirty, its page is dirtied, but the page's other
1172 * buffers are not.
1173 *
1174 * Also.  When blockdev buffers are explicitly read with bread(), they
1175 * individually become uptodate.  But their backing page remains not
1176 * uptodate - even if all of its buffers are uptodate.  A subsequent
1177 * block_read_full_page() against that page will discover all the uptodate
1178 * buffers, will set the page uptodate and will perform no I/O.
1179 */
1180
1181/**
1182 * mark_buffer_dirty - mark a buffer_head as needing writeout
1183 *
1184 * mark_buffer_dirty() will set the dirty bit against the buffer,
1185 * then set its backing page dirty, then attach the page to its
1186 * address_space's dirty_pages list and then attach the address_space's
1187 * inode to its superblock's dirty inode list.
1188 *
1189 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1190 * mapping->page_lock and the global inode_lock.
1191 */
1192void mark_buffer_dirty(struct buffer_head *bh)
1193{
1194        if (!buffer_uptodate(bh))
1195                buffer_error();
1196        if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1197                __set_page_dirty_nobuffers(bh->b_page);
1198}
1199
1200/*
1201 * Decrement a buffer_head's reference count.  If all buffers against a page
1202 * have zero reference count, are clean and unlocked, and if the page is clean
1203 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1204 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1205 * a page but it ends up not being freed, and buffers may later be reattached).
1206 */
1207void __brelse(struct buffer_head * buf)
1208{
1209        if (atomic_read(&buf->b_count)) {
1210                put_bh(buf);
1211                return;
1212        }
1213        printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1214        buffer_error();         /* For the stack backtrace */
1215}
1216
1217/*
1218 * bforget() is like brelse(), except it discards any
1219 * potentially dirty data.
1220 */
1221void __bforget(struct buffer_head *bh)
1222{
1223        clear_buffer_dirty(bh);
1224        if (!list_empty(&bh->b_assoc_buffers)) {
1225                struct address_space *buffer_mapping = bh->b_page->mapping;
1226
1227                spin_lock(&buffer_mapping->private_lock);
1228                list_del_init(&bh->b_assoc_buffers);
1229                spin_unlock(&buffer_mapping->private_lock);
1230        }
1231        __brelse(bh);
1232}
1233
1234/**
1235 *  __bread() - reads a specified block and returns the bh
1236 *  @block: number of block
1237 *  @size: size (in bytes) to read
1238 * 
1239 *  Reads a specified block, and returns buffer head that contains it.
1240 *  It returns NULL if the block was unreadable.
1241 */
1242struct buffer_head *
1243__bread_slow(struct block_device *bdev, sector_t block, int size)
1244{
1245        struct buffer_head *bh = __getblk(bdev, block, size);
1246
1247        if (buffer_uptodate(bh))
1248                return bh;
1249        lock_buffer(bh);
1250        if (buffer_uptodate(bh)) {
1251                unlock_buffer(bh);
1252                return bh;
1253        } else {
1254                if (buffer_dirty(bh))
1255                        buffer_error();
1256                get_bh(bh);
1257                bh->b_end_io = end_buffer_io_sync;
1258                submit_bh(READ, bh);
1259                wait_on_buffer(bh);
1260                if (buffer_uptodate(bh))
1261                        return bh;
1262        }
1263        brelse(bh);
1264        return NULL;
1265}
1266
1267/*
1268 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1269 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1270 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1271 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1272 * CPU's LRUs at the same time.
1273 *
1274 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1275 * sb_find_get_block().
1276 *
1277 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1278 * a local interrupt disable for that.
1279 */
1280
1281#define BH_LRU_SIZE     8
1282
1283static struct bh_lru {
1284        struct buffer_head *bhs[BH_LRU_SIZE];
1285} ____cacheline_aligned_in_smp bh_lrus[NR_CPUS];
1286
1287#ifdef CONFIG_SMP
1288#define bh_lru_lock()   local_irq_disable()
1289#define bh_lru_unlock() local_irq_enable()
1290#else
1291#define bh_lru_lock()   preempt_disable()
1292#define bh_lru_unlock() preempt_enable()
1293#endif
1294
/*
 * Debug check: the LRU code must be entered with interrupts enabled.
 * Compiles to nothing when irqs_disabled is not available as a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
        if (irqs_disabled())
                BUG();
#endif
}
1301
1302/*
1303 * The LRU management algorithm is dopey-but-simple.  Sorry.
1304 */
1305static void bh_lru_install(struct buffer_head *bh)
1306{
1307        struct buffer_head *evictee = NULL;
1308        struct bh_lru *lru;
1309
1310        if (bh == NULL)
1311                return;
1312
1313        check_irqs_on();
1314        bh_lru_lock();
1315        lru = &bh_lrus[smp_processor_id()];
1316        if (lru->bhs[0] != bh) {
1317                struct buffer_head *bhs[BH_LRU_SIZE];
1318                int in;
1319                int out = 0;
1320
1321                get_bh(bh);
1322                bhs[out++] = bh;
1323                for (in = 0; in < BH_LRU_SIZE; in++) {
1324                        struct buffer_head *bh2 = lru->bhs[in];
1325
1326                        if (bh2 == bh) {
1327                                __brelse(bh2);
1328                        } else {
1329                                if (out >= BH_LRU_SIZE) {
1330                                        BUG_ON(evictee != NULL);
1331                                        evictee = bh2;
1332                                } else {
1333                                        bhs[out++] = bh2;
1334                                }
1335                        }
1336                }
1337                while (out < BH_LRU_SIZE)
1338                        bhs[out++] = NULL;
1339                memcpy(lru->bhs, bhs, sizeof(bhs));
1340        }
1341        bh_lru_unlock();
1342
1343        if (evictee) {
1344                touch_buffer(evictee);
1345                __brelse(evictee);
1346        }
1347}
1348
1349static inline struct buffer_head *
1350lookup_bh(struct block_device *bdev, sector_t block, int size)
1351{
1352        struct buffer_head *ret = NULL;
1353        struct bh_lru *lru;
1354        int i;
1355
1356        check_irqs_on();
1357        bh_lru_lock();
1358        lru = &bh_lrus[smp_processor_id()];
1359        for (i = 0; i < BH_LRU_SIZE; i++) {
1360                struct buffer_head *bh = lru->bhs[i];
1361
1362                if (bh && bh->b_bdev == bdev &&
1363                                bh->b_blocknr == block && bh->b_size == size) {
1364                        if (i) {
1365                                while (i) {
1366                                        lru->bhs[i] = lru->bhs[i - 1];
1367                                        i--;
1368                                }
1369                                lru->bhs[0] = bh;
1370                        }
1371                        get_bh(bh);
1372                        ret = bh;
1373                        break;
1374                }
1375        }
1376        bh_lru_unlock();
1377        return ret;
1378}
1379
1380struct buffer_head *
1381__find_get_block(struct block_device *bdev, sector_t block, int size)
1382{
1383        struct buffer_head *bh = lookup_bh(bdev, block, size);
1384
1385        if (bh == NULL) {
1386                bh = __find_get_block_slow(bdev, block, size);
1387                bh_lru_install(bh);
1388        }
1389        return bh;
1390}
1391EXPORT_SYMBOL(__find_get_block);
1392
1393struct buffer_head *
1394__getblk(struct block_device *bdev, sector_t block, int size)
1395{
1396        struct buffer_head *bh = __find_get_block(bdev, block, size);
1397
1398        if (bh == NULL) {
1399                bh = __getblk_slow(bdev, block, size);
1400                bh_lru_install(bh);
1401        }
1402        return bh;
1403}
1404EXPORT_SYMBOL(__getblk);
1405
1406struct buffer_head *
1407__bread(struct block_device *bdev, sector_t block, int size)
1408{
1409        struct buffer_head *bh = __getblk(bdev, block, size);
1410
1411        if (bh) {
1412                if (buffer_uptodate(bh))
1413                        return bh;
1414                __brelse(bh);
1415        }
1416        bh = __bread_slow(bdev, block, size);
1417        bh_lru_install(bh);
1418        return bh;
1419}
1420EXPORT_SYMBOL(__bread);
1421
1422/*
1423 * invalidate_bh_lrus() is called rarely - at unmount.  Because it is only for
1424 * unmount it only needs to ensure that all buffers from the target device are
1425 * invalidated on return and it doesn't need to worry about new buffers from
1426 * that device being added - the unmount code has to prevent that.
1427 */
1428static void invalidate_bh_lru(void *arg)
1429{
1430        const int cpu = get_cpu();
1431        int i;
1432
1433        for (i = 0; i < BH_LRU_SIZE; i++) {
1434                brelse(bh_lrus[cpu].bhs[i]);
1435                bh_lrus[cpu].bhs[i] = NULL;
1436        }
1437        put_cpu();
1438}
1439        
1440static void invalidate_bh_lrus(void)
1441{
1442        preempt_disable();
1443        invalidate_bh_lru(NULL);
1444        smp_call_function(invalidate_bh_lru, NULL, 1, 1);
1445        preempt_enable();
1446}
1447
1448
1449
1450void set_bh_page(struct buffer_head *bh,
1451                struct page *page, unsigned long offset)
1452{
1453        bh->b_page = page;
1454        if (offset >= PAGE_SIZE)
1455                BUG();
1456        if (PageHighMem(page))
1457                /*
1458                 * This catches illegal uses and preserves the offset:
1459                 */
1460                bh->b_data = (char *)(0 + offset);
1461        else
1462                bh->b_data = page_address(page) + offset;
1463}
1464EXPORT_SYMBOL(set_bh_page);
1465
1466/*
1467 * Called when truncating a buffer on a page completely.
1468 */
1469static /* inline */ void discard_buffer(struct buffer_head * bh)
1470{
1471        lock_buffer(bh);
1472        clear_buffer_dirty(bh);
1473        bh->b_bdev = NULL;
1474        clear_buffer_mapped(bh);
1475        clear_buffer_req(bh);
1476        clear_buffer_new(bh);
1477        unlock_buffer(bh);
1478}
1479
1480/**
1481 * try_to_release_page() - release old fs-specific metadata on a page
1482 *
1483 * @page: the page which the kernel is trying to free
1484 * @gfp_mask: memory allocation flags (and I/O mode)
1485 *
1486 * The address_space is to try to release any data against the page
1487 * (presumably at page->private).  If the release was successful, return `1'.
1488 * Otherwise return zero.
1489 *
1490 * The @gfp_mask argument specifies whether I/O may be performed to release
1491 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1492 *
1493 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1494 */
1495int try_to_release_page(struct page *page, int gfp_mask)
1496{
1497        struct address_space * const mapping = page->mapping;
1498
1499        if (!PageLocked(page))
1500                BUG();
1501        if (PageWriteback(page))
1502                return 0;
1503        
1504        if (mapping && mapping->a_ops->releasepage)
1505                return mapping->a_ops->releasepage(page, gfp_mask);
1506        return try_to_free_buffers(page);
1507}
1508
1509/**
1510 * block_invalidatepage - invalidate part of all of a buffer-backed page
1511 *
1512 * @page: the page which is affected
1513 * @offset: the index of the truncation point
1514 *
1515 * block_invalidatepage() is called when all or part of the page has become
1516 * invalidatedby a truncate operation.
1517 *
1518 * block_invalidatepage() does not have to release all buffers, but it must
1519 * ensure that no dirty buffer is left outside @offset and that no I/O
1520 * is underway against any of the blocks which are outside the truncation
1521 * point.  Because the caller is about to free (and possibly reuse) those
1522 * blocks on-disk.
1523 */
1524int block_invalidatepage(struct page *page, unsigned long offset)
1525{
1526        struct buffer_head *head, *bh, *next;
1527        unsigned int curr_off = 0;
1528        int ret = 1;
1529
1530        BUG_ON(!PageLocked(page));
1531        if (!page_has_buffers(page))
1532                goto out;
1533
1534        head = page_buffers(page);
1535        bh = head;
1536        do {
1537                unsigned int next_off = curr_off + bh->b_size;
1538                next = bh->b_this_page;
1539
1540                /*
1541                 * is this block fully invalidated?
1542                 */
1543                if (offset <= curr_off)
1544                        discard_buffer(bh);
1545                curr_off = next_off;
1546                bh = next;
1547        } while (bh != head);
1548
1549        /*
1550         * We release buffers only if the entire page is being invalidated.
1551         * The get_block cached value has been unconditionally invalidated,
1552         * so real IO is not possible anymore.
1553         */
1554        if (offset == 0)
1555                ret = try_to_release_page(page, 0);
1556out:
1557        return ret;
1558}
1559EXPORT_SYMBOL(block_invalidatepage);
1560
1561/*
1562 * We attach and possibly dirty the buffers atomically wrt
1563 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1564 * is already excluded via the page lock.
1565 */
1566void create_empty_buffers(struct page *page,
1567                        unsigned long blocksize, unsigned long b_state)
1568{
1569        struct buffer_head *bh, *head, *tail;
1570
1571        head = create_buffers(page, blocksize, 1);
1572        bh = head;
1573        do {
1574                bh->b_state |= b_state;
1575                tail = bh;
1576                bh = bh->b_this_page;
1577        } while (bh);
1578        tail->b_this_page = head;
1579
1580        spin_lock(&page->mapping->private_lock);
1581        if (PageUptodate(page) || PageDirty(page)) {
1582                bh = head;
1583                do {
1584                        if (PageDirty(page))
1585                                set_buffer_dirty(bh);
1586                        if (PageUptodate(page))
1587                                set_buffer_uptodate(bh);
1588                        bh = bh->b_this_page;
1589                } while (bh != head);
1590        }
1591        __set_page_buffers(page, head);
1592        spin_unlock(&page->mapping->private_lock);
1593}
1594EXPORT_SYMBOL(create_empty_buffers);
1595
1596/*
1597 * We are taking a block for data and we don't want any output from any
1598 * buffer-cache aliases starting from return from that function and
1599 * until the moment when something will explicitly mark the buffer
1600 * dirty (hopefully that will not happen until we will free that block ;-)
1601 * We don't even need to mark it not-uptodate - nobody can expect
1602 * anything from a newly allocated buffer anyway. We used to used
1603 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1604 * don't want to mark the alias unmapped, for example - it would confuse
1605 * anyone who might pick it with bread() afterwards...
1606 *
1607 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1608 * be writeout I/O going on against recently-freed buffers.  We don't
1609 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1610 * only if we really need to.  That happens here.
1611 */
1612void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1613{
1614        struct buffer_head *old_bh;
1615
1616        old_bh = __find_get_block(bdev, block, 0);
1617        if (old_bh) {
1618#if 0   /* This happens.  Later. */
1619                if (buffer_dirty(old_bh))
1620                        buffer_error();
1621#endif
1622                clear_buffer_dirty(old_bh);
1623                wait_on_buffer(old_bh);
1624                clear_buffer_req(old_bh);
1625                __brelse(old_bh);
1626        }
1627}
1628EXPORT_SYMBOL(unmap_underlying_metadata);
1629
1630/*
1631 * NOTE! All mapped/uptodate combinations are valid:
1632 *
1633 *      Mapped  Uptodate        Meaning
1634 *
1635 *      No      No              "unknown" - must do get_block()
1636 *      No      Yes             "hole" - zero-filled
1637 *      Yes     No              "allocated" - allocated on disk, not read in
1638 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1639 *
1640 * "Dirty" is valid only with the last case (mapped+uptodate).
1641 */
1642
1643/*
1644 * While block_write_full_page is writing back the dirty buffers under
1645 * the page lock, whoever dirtied the buffers may decide to clean them
1646 * again at any time.  We handle that by only looking at the buffer
1647 * state inside lock_buffer().
1648 *
1649 * If block_write_full_page() is called for regular writeback
1650 * (called_for_sync() is false) then it will return -EAGAIN for a locked
1651 * buffer.   This only can happen if someone has written the buffer directly,
1652 * with submit_bh().  At the address_space level PageWriteback prevents this
1653 * contention from occurring.
1654 */
1655static int __block_write_full_page(struct inode *inode,
1656                        struct page *page, get_block_t *get_block)
1657{
1658        int err;
1659        int ret = 0;
1660        unsigned long block;
1661        unsigned long last_block;
1662        struct buffer_head *bh, *head;
1663        int nr_underway = 0;
1664
1665        BUG_ON(!PageLocked(page));
1666
1667        last_block = (inode->i_size - 1) >> inode->i_blkbits;
1668
1669        if (!page_has_buffers(page)) {
1670                if (S_ISBLK(inode->i_mode))
1671                        buffer_error();
1672                if (!PageUptodate(page))
1673                        buffer_error();
1674                create_empty_buffers(page, 1 << inode->i_blkbits,
1675                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
1676        }
1677
1678        /*
1679         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1680         * here, and the (potentially unmapped) buffers may become dirty at
1681         * any time.  If a buffer becomes dirty here after we've inspected it
1682         * then we just miss that fact, and the page stays dirty.
1683         *
1684         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1685         * handle that here by just cleaning them.
1686         */
1687
1688        block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1689        head = page_buffers(page);
1690        bh = head;
1691
1692        /*
1693         * Get all the dirty buffers mapped to disk addresses and
1694         * handle any aliases from the underlying blockdev's mapping.
1695         */
1696        do {
1697                if (block > last_block) {
1698                        /*
1699                         * mapped buffers outside i_size will occur, because
1700                         * this page can be outside i_size when there is a
1701                         * truncate in progress.
1702                         *
1703                         * if (buffer_mapped(bh))
1704                         *      buffer_error();
1705                         */
1706                        /*
1707                         * The buffer was zeroed by block_write_full_page()
1708                         */
1709                        clear_buffer_dirty(bh);
1710                        set_buffer_uptodate(bh);
1711                } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1712                        if (buffer_new(bh))
1713                                buffer_error();
1714                        err = get_block(inode, block, bh, 1);
1715                        if (err)
1716                                goto recover;
1717                        if (buffer_new(bh)) {
1718                                /* blockdev mappings never come here */
1719                                clear_buffer_new(bh);
1720                                unmap_underlying_metadata(bh->b_bdev,
1721                                                        bh->b_blocknr);
1722                        }
1723                }
1724                bh = bh->b_this_page;
1725                block++;
1726        } while (bh != head);
1727
1728        do {
1729                get_bh(bh);
1730                if (buffer_mapped(bh) && buffer_dirty(bh)) {
1731                        if (called_for_sync()) {
1732                                lock_buffer(bh);
1733                        } else {
1734                                if (test_set_buffer_locked(bh)) {
1735                                        ret = -EAGAIN;
1736                                        continue;
1737                                }
1738                        }
1739                        if (test_clear_buffer_dirty(bh)) {
1740                                if (!buffer_uptodate(bh))
1741                                        buffer_error();
1742                                mark_buffer_async_write(bh);
1743                        } else {
1744                                unlock_buffer(bh);
1745                        }
1746                }
1747        } while ((bh = bh->b_this_page) != head);
1748
1749        BUG_ON(PageWriteback(page));
1750        SetPageWriteback(page);         /* Keeps try_to_free_buffers() away */
1751        unlock_page(page);
1752
1753        /*
1754         * The page may come unlocked any time after the *first* submit_bh()
1755         * call.  Be careful with its buffers.
1756         */
1757        do {
1758                struct buffer_head *next = bh->b_this_page;
1759                if (buffer_async_write(bh)) {
1760                        submit_bh(WRITE, bh);
1761                        nr_underway++;
1762                }
1763                put_bh(bh);
1764                bh = next;
1765        } while (bh != head);
1766
1767        err = 0;
1768done:
1769        if (nr_underway == 0) {
1770                /*
1771                 * The page was marked dirty, but the buffers were
1772                 * clean.  Someone wrote them back by hand with
1773                 * ll_rw_block/submit_bh.  A rare case.
1774                 */
1775                int uptodate = 1;
1776                do {
1777                        if (!buffer_uptodate(bh)) {
1778                                uptodate = 0;
1779                                break;
1780                        }
1781                        bh = bh->b_this_page;
1782                } while (bh != head);
1783                if (uptodate)
1784                        SetPageUptodate(page);
1785                end_page_writeback(page);
1786        }
1787        if (err == 0)
1788                return ret;
1789        return err;
1790
1791recover:
1792        /*
1793         * ENOSPC, or some other error.  We may already have added some
1794         * blocks to the file, so we need to write these out to avoid
1795         * exposing stale data.
1796         * The page is currently locked and not marked for writeback
1797         */
1798        ClearPageUptodate(page);
1799        bh = head;
1800        /* Recovery: lock and submit the mapped buffers */
1801        do {
1802                get_bh(bh);
1803                if (buffer_mapped(bh) && buffer_dirty(bh)) {
1804                        lock_buffer(bh);
1805                        mark_buffer_async_write(bh);
1806                } else {
1807                        /*
1808                         * The buffer may have been set dirty during
1809                         * attachment to a dirty page.
1810                         */
1811                        clear_buffer_dirty(bh);
1812                }
1813        } while ((bh = bh->b_this_page) != head);
1814        SetPageError(page);
1815        BUG_ON(PageWriteback(page));
1816        SetPageWriteback(page);
1817        unlock_page(page);
1818        do {
1819                struct buffer_head *next = bh->b_this_page;
1820                if (buffer_async_write(bh)) {
1821                        clear_buffer_dirty(bh);
1822                        submit_bh(WRITE, bh);
1823                        nr_underway++;
1824                }
1825                put_bh(bh);
1826                bh = next;
1827        } while (bh != head);
1828        goto done;
1829}
1830
1831static int __block_prepare_write(struct inode *inode, struct page *page,
1832                unsigned from, unsigned to, get_block_t *get_block)
1833{
1834        unsigned block_start, block_end;
1835        unsigned long block;
1836        int err = 0;
1837        unsigned blocksize, bbits;
1838        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1839
1840        BUG_ON(!PageLocked(page));
1841        BUG_ON(from > PAGE_CACHE_SIZE);
1842        BUG_ON(to > PAGE_CACHE_SIZE);
1843        BUG_ON(from > to);
1844
1845        blocksize = 1 << inode->i_blkbits;
1846        if (!page_has_buffers(page))
1847                create_empty_buffers(page, blocksize, 0);
1848        head = page_buffers(page);
1849
1850        bbits = inode->i_blkbits;
1851        block = page->index << (PAGE_CACHE_SHIFT - bbits);
1852
1853        for(bh = head, block_start = 0; bh != head || !block_start;
1854            block++, block_start=block_end, bh = bh->b_this_page) {
1855                block_end = block_start + blocksize;
1856                if (block_end <= from || block_start >= to) {
1857                        if (PageUptodate(page)) {
1858                                if (!buffer_uptodate(bh))
1859                                        set_buffer_uptodate(bh);
1860                        }
1861                        continue;
1862                }
1863                if (buffer_new(bh))
1864                        clear_buffer_new(bh);
1865                if (!buffer_mapped(bh)) {
1866                        err = get_block(inode, block, bh, 1);
1867                        if (err)
1868                                goto out;
1869                        if (buffer_new(bh)) {
1870                                clear_buffer_new(bh);
1871                                unmap_underlying_metadata(bh->b_bdev,
1872                                                        bh->b_blocknr);
1873                                if (PageUptodate(page)) {
1874                                        if (!buffer_mapped(bh))
1875                                                buffer_error();
1876                                        set_buffer_uptodate(bh);
1877                                        continue;
1878                                }
1879                                if (block_end > to || block_start < from) {
1880                                        void *kaddr;
1881
1882                                        kaddr = kmap_atomic(page, KM_USER0);
1883                                        if (block_end > to)
1884                                                memset(kaddr+to, 0,
1885                                                        block_end-to);
1886                                        if (block_start < from)
1887                                                memset(kaddr+block_start,
1888                                                        0, from-block_start);
1889                                        flush_dcache_page(page);
1890                                        kunmap_atomic(kaddr, KM_USER0);
1891                                }
1892                                continue;
1893                        }
1894                }
1895                if (PageUptodate(page)) {
1896                        if (!buffer_uptodate(bh))
1897                                set_buffer_uptodate(bh);
1898                        continue; 
1899                }
1900                if (!buffer_uptodate(bh) &&
1901                     (block_start < from || block_end > to)) {
1902                        ll_rw_block(READ, 1, &bh);
1903                        *wait_bh++=bh;
1904                }
1905        }
1906        /*
1907         * If we issued read requests - let them complete.
1908         */
1909        while(wait_bh > wait) {
1910                wait_on_buffer(*--wait_bh);
1911                if (!buffer_uptodate(*wait_bh))
1912                        return -EIO;
1913        }
1914        return 0;
1915out:
1916        /*
1917         * Zero out any newly allocated blocks to avoid exposing stale
1918         * data.  If BH_New is set, we know that the block was newly
1919         * allocated in the above loop.
1920         */
1921        bh = head;
1922        block_start = 0;
1923        do {
1924                block_end = block_start+blocksize;
1925                if (block_end <= from)
1926                        goto next_bh;
1927                if (block_start >= to)
1928                        break;
1929                if (buffer_new(bh)) {
1930                        void *kaddr;
1931
1932                        clear_buffer_new(bh);
1933                        if (buffer_uptodate(bh))
1934                                buffer_error();
1935                        kaddr = kmap_atomic(page, KM_USER0);
1936                        memset(kaddr+block_start, 0, bh->b_size);
1937                        kunmap_atomic(kaddr, KM_USER0);
1938                        set_buffer_uptodate(bh);
1939                        mark_buffer_dirty(bh);
1940                }
1941next_bh:
1942                block_start = block_end;
1943                bh = bh->b_this_page;
1944        } while (bh != head);
1945        return err;
1946}
1947
1948static int __block_commit_write(struct inode *inode, struct page *page,
1949                unsigned from, unsigned to)
1950{
1951        unsigned block_start, block_end;
1952        int partial = 0;
1953        unsigned blocksize;
1954        struct buffer_head *bh, *head;
1955
1956        blocksize = 1 << inode->i_blkbits;
1957
1958        for(bh = head = page_buffers(page), block_start = 0;
1959            bh != head || !block_start;
1960            block_start=block_end, bh = bh->b_this_page) {
1961                block_end = block_start + blocksize;
1962                if (block_end <= from || block_start >= to) {
1963                        if (!buffer_uptodate(bh))
1964                                partial = 1;
1965                } else {
1966                        set_buffer_uptodate(bh);
1967                        mark_buffer_dirty(bh);
1968                }
1969        }
1970
1971        /*
1972         * If this is a partial write which happened to make all buffers
1973         * uptodate then we can optimize away a bogus readpage() for
1974         * the next read(). Here we 'discover' whether the page went
1975         * uptodate as a result of this (potentially partial) write.
1976         */
1977        if (!partial)
1978                SetPageUptodate(page);
1979        return 0;
1980}
1981
1982/*
1983 * Generic "read page" function for block devices that have the normal
1984 * get_block functionality. This is most of the block device filesystems.
1985 * Reads the page asynchronously --- the unlock_buffer() and
1986 * set/clear_buffer_uptodate() functions propagate buffer state into the
1987 * page struct once IO has completed.
1988 */
1989int block_read_full_page(struct page *page, get_block_t *get_block)
1990{
1991        struct inode *inode = page->mapping->host;
1992        unsigned long iblock, lblock;
1993        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1994        unsigned int blocksize, blocks;
1995        int nr, i;
1996
1997        if (!PageLocked(page))
1998                PAGE_BUG(page);
1999        if (PageUptodate(page))
2000                buffer_error();
2001        blocksize = 1 << inode->i_blkbits;
2002        if (!page_has_buffers(page))
2003                create_empty_buffers(page, blocksize, 0);
2004        head = page_buffers(page);
2005
2006        blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
2007        iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2008        lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
2009        bh = head;
2010        nr = 0;
2011        i = 0;
2012
2013        do {
2014                if (buffer_uptodate(bh))
2015                        continue;
2016
2017                if (!buffer_mapped(bh)) {
2018                        if (iblock < lblock) {
2019                                if (get_block(inode, iblock, bh, 0))
2020                                        SetPageError(page);
2021                        }
2022                        if (!buffer_mapped(bh)) {
2023                                void *kaddr = kmap_atomic(page, KM_USER0);
2024                                memset(kaddr + i * blocksize, 0, blocksize);
2025                                flush_dcache_page(page);
2026                                kunmap_atomic(kaddr, KM_USER0);
2027                                set_buffer_uptodate(bh);
2028                                continue;
2029                        }
2030                        /*
2031                         * get_block() might have updated the buffer
2032                         * synchronously
2033                         */
2034                        if (buffer_uptodate(bh))
2035                                continue;
2036                }
2037                arr[nr++] = bh;
2038        } while (i++, iblock++, (bh = bh->b_this_page) != head);
2039
2040        if (!nr) {
2041                /*
2042                 * All buffers are uptodate - we can set the page uptodate
2043                 * as well. But not if get_block() returned an error.
2044                 */
2045                if (!PageError(page))
2046                        SetPageUptodate(page);
2047                unlock_page(page);
2048                return 0;
2049        }
2050
2051        /* Stage two: lock the buffers */
2052        for (i = 0; i < nr; i++) {
2053                bh = arr[i];
2054                lock_buffer(bh);
2055                mark_buffer_async_read(bh);
2056        }
2057
2058        /*
2059         * Stage 3: start the IO.  Check for uptodateness
2060         * inside the buffer lock in case another process reading
2061         * the underlying blockdev brought it uptodate (the sct fix).
2062         */
2063        for (i = 0; i < nr; i++) {
2064                bh = arr[i];
2065                if (buffer_uptodate(bh))
2066                        end_buffer_async_read(bh, 1);
2067                else
2068                        submit_bh(READ, bh);
2069        }
2070        return 0;
2071}
2072
2073/* utility function for filesystems that need to do work on expanding
2074 * truncates.  Uses prepare/commit_write to allow the filesystem to
2075 * deal with the hole.  
2076 */
2077int generic_cont_expand(struct inode *inode, loff_t size)
2078{
2079        struct address_space *mapping = inode->i_mapping;
2080        struct page *page;
2081        unsigned long index, offset, limit;
2082        int err;
2083
2084        err = -EFBIG;
2085        limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2086        if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2087                send_sig(SIGXFSZ, current, 0);
2088                goto out;
2089        }
2090        if (size > inode->i_sb->s_maxbytes)
2091                goto out;
2092
2093        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2094
2095        /* ugh.  in prepare/commit_write, if from==to==start of block, we 
2096        ** skip the prepare.  make sure we never send an offset for the start
2097        ** of a block
2098        */
2099        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2100                offset++;
2101        }
2102        index = size >> PAGE_CACHE_SHIFT;
2103        err = -ENOMEM;
2104        page = grab_cache_page(mapping, index);
2105        if (!page)
2106                goto out;
2107        err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2108        if (!err) {
2109                err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2110        }
2111        unlock_page(page);
2112        page_cache_release(page);
2113        if (err > 0)
2114                err = 0;
2115out:
2116        return err;
2117}
2118
2119/*
2120 * For moronic filesystems that do not allow holes in file.
2121 * We may have to extend the file.
2122 */
2123
2124int cont_prepare_write(struct page *page, unsigned offset,
2125                unsigned to, get_block_t *get_block, loff_t *bytes)
2126{
2127        struct address_space *mapping = page->mapping;
2128        struct inode *inode = mapping->host;
2129        struct page *new_page;
2130        unsigned long pgpos;
2131        long status;
2132        unsigned zerofrom;
2133        unsigned blocksize = 1 << inode->i_blkbits;
2134        void *kaddr;
2135
2136        while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2137                status = -ENOMEM;
2138                new_page = grab_cache_page(mapping, pgpos);
2139                if (!new_page)
2140                        goto out;
2141                /* we might sleep */
2142                if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2143                        unlock_page(new_page);
2144                        page_cache_release(new_page);
2145                        continue;
2146                }
2147                zerofrom = *bytes & ~PAGE_CACHE_MASK;
2148                if (zerofrom & (blocksize-1)) {
2149                        *bytes |= (blocksize-1);
2150                        (*bytes)++;
2151                }
2152                status = __block_prepare_write(inode, new_page, zerofrom,
2153                                                PAGE_CACHE_SIZE, get_block);
2154                if (status)
2155                        goto out_unmap;
2156                kaddr = kmap_atomic(new_page, KM_USER0);
2157                memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2158                flush_dcache_page(new_page);
2159                kunmap_atomic(kaddr, KM_USER0);
2160                __block_commit_write(inode, new_page,
2161                                zerofrom, PAGE_CACHE_SIZE);
2162                unlock_page(new_page);
2163                page_cache_release(new_page);
2164        }
2165
2166        if (page->index < pgpos) {
2167                /* completely inside the area */
2168                zerofrom = offset;
2169        } else {
2170                /* page covers the boundary, find the boundary offset */
2171                zerofrom = *bytes & ~PAGE_CACHE_MASK;
2172
2173                /* if we will expand the thing last block will be filled */
2174                if (to > zerofrom && (zerofrom & (blocksize-1))) {
2175                        *bytes |= (blocksize-1);
2176                        (*bytes)++;
2177                }
2178
2179                /* starting below the boundary? Nothing to zero out */
2180                if (offset <= zerofrom)
2181                        zerofrom = offset;
2182        }
2183        status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2184        if (status)
2185                goto out1;
2186        if (zerofrom < offset) {
2187                kaddr = kmap_atomic(page, KM_USER0);
2188                memset(kaddr+zerofrom, 0, offset-zerofrom);
2189                flush_dcache_page(page);
2190                kunmap_atomic(kaddr, KM_USER0);
2191                __block_commit_write(inode, page, zerofrom, offset);
2192        }
2193        return 0;
2194out1:
2195        ClearPageUptodate(page);
2196        return status;
2197
2198out_unmap:
2199        ClearPageUptodate(new_page);
2200        unlock_page(new_page);
2201        page_cache_release(new_page);
2202out:
2203        return status;
2204}
2205
2206int block_prepare_write(struct page *page, unsigned from, unsigned to,
2207                        get_block_t *get_block)
2208{
2209        struct inode *inode = page->mapping->host;
2210        int err = __block_prepare_write(inode, page, from, to, get_block);
2211        if (err)
2212                ClearPageUptodate(page);
2213        return err;
2214}
2215
2216int block_commit_write(struct page *page, unsigned from, unsigned to)
2217{
2218        struct inode *inode = page->mapping->host;
2219        __block_commit_write(inode,page,from,to);
2220        return 0;
2221}
2222
2223int generic_commit_write(struct file *file, struct page *page,
2224                unsigned from, unsigned to)
2225{
2226        struct inode *inode = page->mapping->host;
2227        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2228        __block_commit_write(inode,page,from,to);
2229        if (pos > inode->i_size) {
2230                inode->i_size = pos;
2231                mark_inode_dirty(inode);
2232        }
2233        return 0;
2234}
2235
2236int block_truncate_page(struct address_space *mapping,
2237                        loff_t from, get_block_t *get_block)
2238{
2239        unsigned long index = from >> PAGE_CACHE_SHIFT;
2240        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2241        unsigned blocksize, iblock, length, pos;
2242        struct inode *inode = mapping->host;
2243        struct page *page;
2244        struct buffer_head *bh;
2245        void *kaddr;
2246        int err;
2247
2248        blocksize = 1 << inode->i_blkbits;
2249        length = offset & (blocksize - 1);
2250
2251        /* Block boundary? Nothing to do */
2252        if (!length)
2253                return 0;
2254
2255        length = blocksize - length;
2256        iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2257        
2258        page = grab_cache_page(mapping, index);
2259        err = -ENOMEM;
2260        if (!page)
2261                goto out;
2262
2263        if (!page_has_buffers(page))
2264                create_empty_buffers(page, blocksize, 0);
2265
2266        /* Find the buffer that contains "offset" */
2267        bh = page_buffers(page);
2268        pos = blocksize;
2269        while (offset >= pos) {
2270                bh = bh->b_this_page;
2271                iblock++;
2272                pos += blocksize;
2273        }
2274
2275        err = 0;
2276        if (!buffer_mapped(bh)) {
2277                err = get_block(inode, iblock, bh, 0);
2278                if (err)
2279                        goto unlock;
2280                /* unmapped? It's a hole - nothing to do */
2281                if (!buffer_mapped(bh))
2282                        goto unlock;
2283        }
2284
2285        /* Ok, it's mapped. Make sure it's up-to-date */
2286        if (PageUptodate(page))
2287                set_buffer_uptodate(bh);
2288
2289        if (!buffer_uptodate(bh)) {
2290                err = -EIO;
2291                ll_rw_block(READ, 1, &bh);
2292                wait_on_buffer(bh);
2293                /* Uhhuh. Read error. Complain and punt. */
2294                if (!buffer_uptodate(bh))
2295                        goto unlock;
2296        }
2297
2298        kaddr = kmap_atomic(page, KM_USER0);
2299        memset(kaddr + offset, 0, length);
2300        flush_dcache_page(page);
2301        kunmap_atomic(kaddr, KM_USER0);
2302
2303        mark_buffer_dirty(bh);
2304        err = 0;
2305
2306unlock:
2307        unlock_page(page);
2308        page_cache_release(page);
2309out:
2310        return err;
2311}
2312
2313/*
2314 * The generic ->writepage function for buffer-backed address_spaces
2315 */
2316int block_write_full_page(struct page *page, get_block_t *get_block)
2317{
2318        struct inode * const inode = page->mapping->host;
2319        const unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2320        unsigned offset;
2321        void *kaddr;
2322
2323        /* Is the page fully inside i_size? */
2324        if (page->index < end_index)
2325                return __block_write_full_page(inode, page, get_block);
2326
2327        /* Is the page fully outside i_size? (truncate in progress) */
2328        offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2329        if (page->index >= end_index+1 || !offset) {
2330                unlock_page(page);
2331                return -EIO;
2332        }
2333
2334        /*
2335         * The page straddles i_size.  It must be zeroed out on each and every
2336         * writepage invokation because it may be mmapped.  "A file is mapped
2337         * in multiples of the page size.  For a file that is not a multiple of
2338         * the  page size, the remaining memory is zeroed when mapped, and
2339         * writes to that region are not written out to the file."
2340         */
2341        kaddr = kmap_atomic(page, KM_USER0);
2342        memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2343        flush_dcache_page(page);
2344        kunmap_atomic(kaddr, KM_USER0);
2345        return __block_write_full_page(inode, page, get_block);
2346}
2347
2348sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2349                            get_block_t *get_block)
2350{
2351        struct buffer_head tmp;
2352        struct inode *inode = mapping->host;
2353        tmp.b_state = 0;
2354        tmp.b_blocknr = 0;
2355        get_block(inode, block, &tmp, 0);
2356        return tmp.b_blocknr;
2357}
2358
2359/*
2360 * Start I/O on a physical range of kernel memory, defined by a vector
2361 * of kiobuf structs (much like a user-space iovec list).
2362 *
2363 * The kiobuf must already be locked for IO.  IO is submitted
2364 * asynchronously: you need to check page->locked and page->uptodate.
2365 *
2366 * It is up to the caller to make sure that there are enough blocks
2367 * passed in to completely map the iobufs to disk.
2368 */
2369int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2370               struct block_device *bdev, sector_t b[], int size)
2371{
2372        int             transferred;
2373        int             i;
2374        int             err;
2375        struct kiobuf * iobuf;
2376
2377        if (!nr)
2378                return 0;
2379        
2380        /* 
2381         * First, do some alignment and validity checks 
2382         */
2383        for (i = 0; i < nr; i++) {
2384                iobuf = iovec[i];
2385                if ((iobuf->offset & (size-1)) || (iobuf->length & (size-1)))
2386                        return -EINVAL;
2387                if (!iobuf->nr_pages)
2388                        panic("brw_kiovec: iobuf not initialised");
2389        }
2390
2391        /* 
2392         * OK to walk down the iovec doing page IO on each page we find. 
2393         */
2394        for (i = 0; i < nr; i++) {
2395                iobuf = iovec[i];
2396                iobuf->errno = 0;
2397
2398                ll_rw_kio(rw, iobuf, bdev, b[i] * (size >> 9));
2399        }
2400
2401        /*
2402         * now they are all submitted, wait for completion
2403         */
2404        transferred = 0;
2405        err = 0;
2406        for (i = 0; i < nr; i++) {
2407                iobuf = iovec[i];
2408                kiobuf_wait_for_io(iobuf);
2409                if (iobuf->errno && !err)
2410                        err = iobuf->errno;
2411                if (!err)
2412                        transferred += iobuf->length;
2413        }
2414
2415        return err ? err : transferred;
2416}
2417
2418static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2419{
2420        struct buffer_head *bh = bio->bi_private;
2421
2422        if (bio->bi_size)
2423                return 1;
2424
2425        bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2426        bio_put(bio);
2427        return 0;
2428}
2429
2430int submit_bh(int rw, struct buffer_head * bh)
2431{
2432        struct bio *bio;
2433
2434        BUG_ON(!buffer_locked(bh));
2435        BUG_ON(!buffer_mapped(bh));
2436        BUG_ON(!bh->b_end_io);
2437
2438        if ((rw == READ || rw == READA) && buffer_uptodate(bh))
2439                buffer_error();
2440        if (rw == WRITE && !buffer_uptodate(bh))
2441                buffer_error();
2442        if (rw == READ && buffer_dirty(bh))
2443                buffer_error();
2444                                
2445        set_buffer_req(bh);
2446
2447        /*
2448         * from here on down, it's all bio -- do the initial mapping,
2449         * submit_bio -> generic_make_request may further map this bio around
2450         */
2451        bio = bio_alloc(GFP_NOIO, 1);
2452
2453        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2454        bio->bi_bdev = bh->b_bdev;
2455        bio->bi_io_vec[0].bv_page = bh->b_page;
2456        bio->bi_io_vec[0].bv_len = bh->b_size;
2457        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2458
2459        bio->bi_vcnt = 1;
2460        bio->bi_idx = 0;
2461        bio->bi_size = bh->b_size;
2462
2463        bio->bi_end_io = end_bio_bh_io_sync;
2464        bio->bi_private = bh;
2465
2466        return submit_bio(rw, bio);
2467}
2468
2469/**
2470 * ll_rw_block: low-level access to block devices (DEPRECATED)
2471 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2472 * @nr: number of &struct buffer_heads in the array
2473 * @bhs: array of pointers to &struct buffer_head
2474 *
2475 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2476 * and requests an I/O operation on them, either a %READ or a %WRITE.
2477 * The third %READA option is described in the documentation for
2478 * generic_make_request() which ll_rw_block() calls.
2479 *
2480 * This function drops any buffer that it cannot get a lock on (with the
2481 * BH_Lock state bit), any buffer that appears to be clean when doing a
2482 * write request, and any buffer that appears to be up-to-date when doing
2483 * read request.  Further it marks as clean buffers that are processed for
2484 * writing (the buffer cache wont assume that they are actually clean until
2485 * the buffer gets unlocked).
2486 *
2487 * ll_rw_block sets b_end_io to simple completion handler that marks
2488 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2489 * any waiters. 
2490 *
2491 * All of the buffers must be for the same device, and must also be a
2492 * multiple of the current approved size for the device.
2493 */
2494void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
2495{
2496        unsigned int major;
2497        int correct_size;
2498        int i;
2499
2500        if (!nr)
2501                return;
2502
2503        major = major(to_kdev_t(bhs[0]->b_bdev->bd_dev));
2504
2505        /* Determine correct block size for this device. */
2506        correct_size = bdev_hardsect_size(bhs[0]->b_bdev);
2507
2508        /* Verify requested block sizes. */
2509        for (i = 0; i < nr; i++) {
2510                struct buffer_head *bh = bhs[i];
2511                if (bh->b_size & (correct_size - 1)) {
2512                        printk(KERN_NOTICE "ll_rw_block: device %s: "
2513                               "only %d-char blocks implemented (%u)\n",
2514                               bdevname(bhs[0]->b_bdev),
2515                               correct_size, bh->b_size);
2516                        goto sorry;
2517                }
2518        }
2519
2520        if ((rw & WRITE) && bdev_read_only(bhs[0]->b_bdev)) {
2521                printk(KERN_NOTICE "Can't write to read-only device %s\n",
2522                       bdevname(bhs[0]->b_bdev));
2523                goto sorry;
2524        }
2525
2526        for (i = 0; i < nr; i++) {
2527                struct buffer_head *bh = bhs[i];
2528
2529                /* Only one thread can actually submit the I/O. */
2530                if (test_set_buffer_locked(bh))
2531                        continue;
2532
2533                /* We have the buffer lock */
2534                atomic_inc(&bh->b_count);
2535                bh->b_end_io = end_buffer_io_sync;
2536
2537                switch(rw) {
2538                case WRITE:
2539                        if (!test_clear_buffer_dirty(bh))
2540                                /* Hmmph! Nothing to write */
2541                                goto end_io;
2542                        break;
2543
2544                case READA:
2545                case READ:
2546                        if (buffer_uptodate(bh))
2547                                /* Hmmph! Already have it */
2548                                goto end_io;
2549                        break;
2550                default:
2551                        BUG();
2552        end_io:
2553                        bh->b_end_io(bh, buffer_uptodate(bh));
2554                        continue;
2555                }
2556
2557                submit_bh(rw, bh);
2558        }
2559        return;
2560
2561sorry:
2562        /* Make sure we don't get infinite dirty retries.. */
2563        for (i = 0; i < nr; i++)
2564                clear_buffer_dirty(bhs[i]);
2565}
2566
2567/*
2568 * Sanity checks for try_to_free_buffers.
2569 */
2570static void check_ttfb_buffer(struct page *page, struct buffer_head *bh)
2571{
2572        if (!buffer_uptodate(bh)) {
2573                if (PageUptodate(page) && page->mapping
2574                        && buffer_mapped(bh)    /* discard_buffer */
2575                        && S_ISBLK(page->mapping->host->i_mode))
2576                {
2577                        buffer_error();
2578                }
2579        }
2580}
2581
2582/*
2583 * try_to_free_buffers() checks if all the buffers on this particular page
2584 * are unused, and releases them if so.
2585 *
2586 * Exclusion against try_to_free_buffers may be obtained by either
2587 * locking the page or by holding its mapping's private_lock.
2588 *
2589 * If the page is dirty but all the buffers are clean then we need to
2590 * be sure to mark the page clean as well.  This is because the page
2591 * may be against a block device, and a later reattachment of buffers
2592 * to a dirty page will set *all* buffers dirty.  Which would corrupt
2593 * filesystem data on the same device.
2594 *
2595 * The same applies to regular filesystem pages: if all the buffers are
2596 * clean then we set the page clean and proceed.  To do that, we require
2597 * total exclusion from __set_page_dirty_buffers().  That is obtained with
2598 * private_lock.
2599 *
2600 * try_to_free_buffers() is non-blocking.
2601 */
2602static inline int buffer_busy(struct buffer_head *bh)
2603{
2604        return atomic_read(&bh->b_count) |
2605                (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2606}
2607
2608static inline int
2609drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2610{
2611        struct buffer_head *head = page_buffers(page);
2612        struct buffer_head *bh;
2613        int was_uptodate = 1;
2614
2615        bh = head;
2616        do {
2617                check_ttfb_buffer(page, bh);
2618                if (buffer_busy(bh))
2619                        goto failed;
2620                if (!buffer_uptodate(bh))
2621                        was_uptodate = 0;
2622                bh = bh->b_this_page;
2623        } while (bh != head);
2624
2625        if (!was_uptodate && PageUptodate(page))
2626                buffer_error();
2627
2628        do {
2629                struct buffer_head *next = bh->b_this_page;
2630
2631                if (!list_empty(&bh->b_assoc_buffers))
2632                        __remove_assoc_queue(bh);
2633                bh = next;
2634        } while (bh != head);
2635        *buffers_to_free = head;
2636        __clear_page_buffers(page);
2637        return 1;
2638failed:
2639        return 0;
2640}
2641
2642int try_to_free_buffers(struct page *page)
2643{
2644        struct address_space * const mapping = page->mapping;
2645        struct buffer_head *buffers_to_free = NULL;
2646        int ret = 0;
2647
2648        BUG_ON(!PageLocked(page));
2649        if (PageWriteback(page))
2650                return 0;
2651
2652        if (mapping == NULL) {          /* swapped-in anon page */
2653                ret = drop_buffers(page, &buffers_to_free);
2654                goto out;
2655        }
2656
2657        spin_lock(&mapping->private_lock);
2658        ret = drop_buffers(page, &buffers_to_free);
2659        if (ret && !PageSwapCache(page)) {
2660                /*
2661                 * If the filesystem writes its buffers by hand (eg ext3)
2662                 * then we can have clean buffers against a dirty page.  We
2663                 * clean the page here; otherwise later reattachment of buffers
2664                 * could encounter a non-uptodate page, which is unresolvable.
2665                 * This only applies in the rare case where try_to_free_buffers
2666                 * succeeds but the page is not freed.
2667                 */
2668                clear_page_dirty(page);
2669        }
2670        spin_unlock(&mapping->private_lock);
2671out:
2672        if (buffers_to_free) {
2673                struct buffer_head *bh = buffers_to_free;
2674
2675                do {
2676                        struct buffer_head *next = bh->b_this_page;
2677                        free_buffer_head(bh);
2678                        bh = next;
2679                } while (bh != buffers_to_free);
2680        }
2681        return ret;
2682}
2683EXPORT_SYMBOL(try_to_free_buffers);
2684
/*
 * Calls blk_run_queues() and always returns 0.  @page itself is not
 * examined.
 */
int block_sync_page(struct page *page)
{
        blk_run_queues();

        return 0;
}
2690
2691/*
2692 * There are no bdflush tunables left.  But distributions are
2693 * still running obsolete flush daemons, so we terminate them here.
2694 */
2695asmlinkage long sys_bdflush(int func, long data)
2696{
2697        if (!capable(CAP_SYS_ADMIN))
2698                return -EPERM;
2699        if (func == 1)
2700                do_exit(0);
2701        return 0;
2702}
2703
2704/*
2705 * Buffer-head allocation
2706 */
2707static kmem_cache_t *bh_cachep;
2708static mempool_t *bh_mempool;
2709
2710/*
2711 * Once the number of bh's in the machine exceeds this level, we start
2712 * stripping them in writeback.
2713 */
2714static int max_buffer_heads;
2715
2716int buffer_heads_over_limit;
2717
2718struct bh_accounting {
2719        int nr;                 /* Number of live bh's */
2720        int ratelimit;          /* Limit cacheline bouncing */
2721};
2722
2723static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2724
2725static void recalc_bh_state(void)
2726{
2727        int i;
2728        int tot = 0;
2729
2730        if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2731                return;
2732        __get_cpu_var(bh_accounting).ratelimit = 0;
2733        for (i = 0; i < NR_CPUS; i++)
2734                tot += per_cpu(bh_accounting, i).nr;
2735        buffer_heads_over_limit = (tot > max_buffer_heads);
2736}
2737        
2738struct buffer_head *alloc_buffer_head(void)
2739{
2740        struct buffer_head *ret = mempool_alloc(bh_mempool, GFP_NOFS);
2741        if (ret) {
2742                preempt_disable();
2743                __get_cpu_var(bh_accounting).nr++;
2744                recalc_bh_state();
2745                preempt_enable();
2746        }
2747        return ret;
2748}
2749EXPORT_SYMBOL(alloc_buffer_head);
2750
2751void free_buffer_head(struct buffer_head *bh)
2752{
2753        BUG_ON(!list_empty(&bh->b_assoc_buffers));
2754        mempool_free(bh, bh_mempool);
2755        preempt_disable();
2756        __get_cpu_var(bh_accounting).nr--;
2757        recalc_bh_state();
2758        preempt_enable();
2759}
2760EXPORT_SYMBOL(free_buffer_head);
2761
2762static void init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
2763{
2764        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2765                            SLAB_CTOR_CONSTRUCTOR) {
2766                struct buffer_head * bh = (struct buffer_head *)data;
2767
2768                memset(bh, 0, sizeof(*bh));
2769                INIT_LIST_HEAD(&bh->b_assoc_buffers);
2770        }
2771}
2772
2773static void *bh_mempool_alloc(int gfp_mask, void *pool_data)
2774{
2775        return kmem_cache_alloc(bh_cachep, gfp_mask);
2776}
2777
2778static void bh_mempool_free(void *element, void *pool_data)
2779{
2780        return kmem_cache_free(bh_cachep, element);
2781}
2782
/*
 * Sizing for bh_mempool: buffer_init() passes MAX_UNUSED_BUFFERS as the
 * first argument of mempool_create().  The expansion is parenthesized
 * so the macro evaluates as one value inside any larger expression.
 */
#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS (NR_RESERVED+20)
2785
2786void __init buffer_init(void)
2787{
2788        int i;
2789        int nrpages;
2790
2791        bh_cachep = kmem_cache_create("buffer_head",
2792                        sizeof(struct buffer_head), 0,
2793                        0, init_buffer_head, NULL);
2794        bh_mempool = mempool_create(MAX_UNUSED_BUFFERS, bh_mempool_alloc,
2795                                bh_mempool_free, NULL);
2796        for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
2797                init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
2798
2799        /*
2800         * Limit the bh occupancy to 10% of ZONE_NORMAL
2801         */
2802        nrpages = (nr_free_buffer_pages() * 10) / 100;
2803        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
2804}
2805
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.