linux/fs/buffer.c
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5 */
   6
   7/*
   8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9 *
  10 * Removed a lot of unnecessary code and simplified things now that
  11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12 *
  13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15 *
   16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
  17 *
  18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19 */
  20
  21#include <linux/kernel.h>
  22#include <linux/syscalls.h>
  23#include <linux/fs.h>
  24#include <linux/mm.h>
  25#include <linux/percpu.h>
  26#include <linux/slab.h>
  27#include <linux/capability.h>
  28#include <linux/blkdev.h>
  29#include <linux/file.h>
  30#include <linux/quotaops.h>
  31#include <linux/highmem.h>
  32#include <linux/export.h>
  33#include <linux/writeback.h>
  34#include <linux/hash.h>
  35#include <linux/suspend.h>
  36#include <linux/buffer_head.h>
  37#include <linux/task_io_accounting_ops.h>
  38#include <linux/bio.h>
  39#include <linux/notifier.h>
  40#include <linux/cpu.h>
  41#include <linux/bitops.h>
  42#include <linux/mpage.h>
  43#include <linux/bit_spinlock.h>
  44#include <trace/events/block.h>
  45
  46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  47
  48#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  49
  50void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  51{
  52        bh->b_end_io = handler;
  53        bh->b_private = private;
  54}
  55EXPORT_SYMBOL(init_buffer);
  56
  57inline void touch_buffer(struct buffer_head *bh)
  58{
  59        trace_block_touch_buffer(bh);
  60        mark_page_accessed(bh->b_page);
  61}
  62EXPORT_SYMBOL(touch_buffer);
  63
  64static int sleep_on_buffer(void *word)
  65{
  66        io_schedule();
  67        return 0;
  68}
  69
  70void __lock_buffer(struct buffer_head *bh)
  71{
  72        wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
  73                                                        TASK_UNINTERRUPTIBLE);
  74}
  75EXPORT_SYMBOL(__lock_buffer);
  76
  77void unlock_buffer(struct buffer_head *bh)
  78{
  79        clear_bit_unlock(BH_Lock, &bh->b_state);
  80        smp_mb__after_clear_bit();
  81        wake_up_bit(&bh->b_state, BH_Lock);
  82}
  83EXPORT_SYMBOL(unlock_buffer);
  84
   85/*
   86 * Returns whether the page has dirty or writeback buffers. If all the
   87 * buffers are unlocked and clean then the PageDirty information is stale.
   88 * If any of the buffers are locked, it is assumed they are locked for IO.
   89 */
  90void buffer_check_dirty_writeback(struct page *page,
  91                                     bool *dirty, bool *writeback)
  92{
  93        struct buffer_head *head, *bh;
  94        *dirty = false;
  95        *writeback = false;
  96
  97        BUG_ON(!PageLocked(page));
  98
  99        if (!page_has_buffers(page))
 100                return;
 101
 102        if (PageWriteback(page))
 103                *writeback = true;
 104
 105        head = page_buffers(page);
 106        bh = head;
 107        do {
 108                if (buffer_locked(bh))
 109                        *writeback = true;
 110
 111                if (buffer_dirty(bh))
 112                        *dirty = true;
 113
 114                bh = bh->b_this_page;
 115        } while (bh != head);
 116}
 117EXPORT_SYMBOL(buffer_check_dirty_writeback);
 118
 119/*
 120 * Block until a buffer comes unlocked.  This doesn't stop it
 121 * from becoming locked again - you have to lock it yourself
 122 * if you want to preserve its state.
 123 */
 124void __wait_on_buffer(struct buffer_head * bh)
 125{
 126        wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
 127}
 128EXPORT_SYMBOL(__wait_on_buffer);
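
/*
 * Example: a minimal sketch of the distinction described above.  A caller
 * that only needs to wait for in-flight I/O can use wait_on_buffer(); a
 * caller that must keep the buffer stable has to take the lock itself.
 * The function name is made up and the block is kept under #if 0 as an
 * illustration only.
 */
#if 0
static void example_wait_vs_lock(struct buffer_head *bh)
{
	/* Wait for any current I/O; the buffer may be re-locked right after. */
	wait_on_buffer(bh);

	/* To examine or modify the buffer safely, hold the lock yourself. */
	lock_buffer(bh);
	/* ... look at bh->b_data while nobody else can start I/O on it ... */
	unlock_buffer(bh);
}
#endif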
 129
 130static void
 131__clear_page_buffers(struct page *page)
 132{
 133        ClearPagePrivate(page);
 134        set_page_private(page, 0);
 135        page_cache_release(page);
 136}
 137
 138
 139static int quiet_error(struct buffer_head *bh)
 140{
 141        if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
 142                return 0;
 143        return 1;
 144}
 145
 146
 147static void buffer_io_error(struct buffer_head *bh)
 148{
 149        char b[BDEVNAME_SIZE];
 150        printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 151                        bdevname(bh->b_bdev, b),
 152                        (unsigned long long)bh->b_blocknr);
 153}
 154
 155/*
 156 * End-of-IO handler helper function which does not touch the bh after
 157 * unlocking it.
 158 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 159 * a race there is benign: unlock_buffer() only use the bh's address for
 160 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 161 * itself.
 162 */
 163static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 164{
 165        if (uptodate) {
 166                set_buffer_uptodate(bh);
 167        } else {
  168                /* This happens due to failed READA attempts. */
 169                clear_buffer_uptodate(bh);
 170        }
 171        unlock_buffer(bh);
 172}
 173
 174/*
  175 * Default synchronous end-of-IO handler.  Just mark it up-to-date and
 176 * unlock the buffer. This is what ll_rw_block uses too.
 177 */
 178void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 179{
 180        __end_buffer_read_notouch(bh, uptodate);
 181        put_bh(bh);
 182}
 183EXPORT_SYMBOL(end_buffer_read_sync);
 184
 185void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 186{
 187        char b[BDEVNAME_SIZE];
 188
 189        if (uptodate) {
 190                set_buffer_uptodate(bh);
 191        } else {
 192                if (!quiet_error(bh)) {
 193                        buffer_io_error(bh);
 194                        printk(KERN_WARNING "lost page write due to "
 195                                        "I/O error on %s\n",
 196                                       bdevname(bh->b_bdev, b));
 197                }
 198                set_buffer_write_io_error(bh);
 199                clear_buffer_uptodate(bh);
 200        }
 201        unlock_buffer(bh);
 202        put_bh(bh);
 203}
 204EXPORT_SYMBOL(end_buffer_write_sync);
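
/*
 * Example: the usual pattern for writing a single dirty buffer synchronously
 * with end_buffer_write_sync() as the completion handler (roughly what
 * sync_dirty_buffer() does).  The function name is made up and the block is
 * kept under #if 0 as an illustration only.
 */
#if 0
static int example_write_one_buffer(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);
		bh->b_end_io = end_buffer_write_sync;	/* unlocks + puts bh */
		submit_bh(WRITE, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			return -EIO;
	} else {
		unlock_buffer(bh);
	}
	return 0;
}
#endif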
 205
 206/*
 207 * Various filesystems appear to want __find_get_block to be non-blocking.
 208 * But it's the page lock which protects the buffers.  To get around this,
 209 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 210 * private_lock.
 211 *
  212 * Hack idea: for the blockdev mapping, private_lock contention
 213 * may be quite high.  This code could TryLock the page, and if that
 214 * succeeds, there is no need to take private_lock. (But if
 215 * private_lock is contended then so is mapping->tree_lock).
 216 */
 217static struct buffer_head *
 218__find_get_block_slow(struct block_device *bdev, sector_t block)
 219{
 220        struct inode *bd_inode = bdev->bd_inode;
 221        struct address_space *bd_mapping = bd_inode->i_mapping;
 222        struct buffer_head *ret = NULL;
 223        pgoff_t index;
 224        struct buffer_head *bh;
 225        struct buffer_head *head;
 226        struct page *page;
 227        int all_mapped = 1;
 228
 229        index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 230        page = find_get_page(bd_mapping, index);
 231        if (!page)
 232                goto out;
 233
 234        spin_lock(&bd_mapping->private_lock);
 235        if (!page_has_buffers(page))
 236                goto out_unlock;
 237        head = page_buffers(page);
 238        bh = head;
 239        do {
 240                if (!buffer_mapped(bh))
 241                        all_mapped = 0;
 242                else if (bh->b_blocknr == block) {
 243                        ret = bh;
 244                        get_bh(bh);
 245                        goto out_unlock;
 246                }
 247                bh = bh->b_this_page;
 248        } while (bh != head);
 249
  250        /* We might be here because some of the buffers on this page are
  251         * not mapped.  This is due to various races between
  252         * file I/O on the block device and getblk.  It gets dealt with
  253         * elsewhere, so don't complain if we had some unmapped buffers.
  254         */
 255        if (all_mapped) {
 256                char b[BDEVNAME_SIZE];
 257
 258                printk("__find_get_block_slow() failed. "
 259                        "block=%llu, b_blocknr=%llu\n",
 260                        (unsigned long long)block,
 261                        (unsigned long long)bh->b_blocknr);
 262                printk("b_state=0x%08lx, b_size=%zu\n",
 263                        bh->b_state, bh->b_size);
 264                printk("device %s blocksize: %d\n", bdevname(bdev, b),
 265                        1 << bd_inode->i_blkbits);
 266        }
 267out_unlock:
 268        spin_unlock(&bd_mapping->private_lock);
 269        page_cache_release(page);
 270out:
 271        return ret;
 272}
 273
 274/*
 275 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 276 */
 277static void free_more_memory(void)
 278{
 279        struct zone *zone;
 280        int nid;
 281
 282        wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
 283        yield();
 284
 285        for_each_online_node(nid) {
 286                (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
 287                                                gfp_zone(GFP_NOFS), NULL,
 288                                                &zone);
 289                if (zone)
 290                        try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 291                                                GFP_NOFS, NULL);
 292        }
 293}
 294
 295/*
 296 * I/O completion handler for block_read_full_page() - pages
 297 * which come unlocked at the end of I/O.
 298 */
 299static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 300{
 301        unsigned long flags;
 302        struct buffer_head *first;
 303        struct buffer_head *tmp;
 304        struct page *page;
 305        int page_uptodate = 1;
 306
 307        BUG_ON(!buffer_async_read(bh));
 308
 309        page = bh->b_page;
 310        if (uptodate) {
 311                set_buffer_uptodate(bh);
 312        } else {
 313                clear_buffer_uptodate(bh);
 314                if (!quiet_error(bh))
 315                        buffer_io_error(bh);
 316                SetPageError(page);
 317        }
 318
 319        /*
 320         * Be _very_ careful from here on. Bad things can happen if
 321         * two buffer heads end IO at almost the same time and both
 322         * decide that the page is now completely done.
 323         */
 324        first = page_buffers(page);
 325        local_irq_save(flags);
 326        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 327        clear_buffer_async_read(bh);
 328        unlock_buffer(bh);
 329        tmp = bh;
 330        do {
 331                if (!buffer_uptodate(tmp))
 332                        page_uptodate = 0;
 333                if (buffer_async_read(tmp)) {
 334                        BUG_ON(!buffer_locked(tmp));
 335                        goto still_busy;
 336                }
 337                tmp = tmp->b_this_page;
 338        } while (tmp != bh);
 339        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 340        local_irq_restore(flags);
 341
 342        /*
 343         * If none of the buffers had errors and they are all
 344         * uptodate then we can set the page uptodate.
 345         */
 346        if (page_uptodate && !PageError(page))
 347                SetPageUptodate(page);
 348        unlock_page(page);
 349        return;
 350
 351still_busy:
 352        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 353        local_irq_restore(flags);
 354        return;
 355}
 356
 357/*
 358 * Completion handler for block_write_full_page() - pages which are unlocked
 359 * during I/O, and which have PageWriteback cleared upon I/O completion.
 360 */
 361void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 362{
 363        char b[BDEVNAME_SIZE];
 364        unsigned long flags;
 365        struct buffer_head *first;
 366        struct buffer_head *tmp;
 367        struct page *page;
 368
 369        BUG_ON(!buffer_async_write(bh));
 370
 371        page = bh->b_page;
 372        if (uptodate) {
 373                set_buffer_uptodate(bh);
 374        } else {
 375                if (!quiet_error(bh)) {
 376                        buffer_io_error(bh);
 377                        printk(KERN_WARNING "lost page write due to "
 378                                        "I/O error on %s\n",
 379                               bdevname(bh->b_bdev, b));
 380                }
 381                set_bit(AS_EIO, &page->mapping->flags);
 382                set_buffer_write_io_error(bh);
 383                clear_buffer_uptodate(bh);
 384                SetPageError(page);
 385        }
 386
 387        first = page_buffers(page);
 388        local_irq_save(flags);
 389        bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 390
 391        clear_buffer_async_write(bh);
 392        unlock_buffer(bh);
 393        tmp = bh->b_this_page;
 394        while (tmp != bh) {
 395                if (buffer_async_write(tmp)) {
 396                        BUG_ON(!buffer_locked(tmp));
 397                        goto still_busy;
 398                }
 399                tmp = tmp->b_this_page;
 400        }
 401        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 402        local_irq_restore(flags);
 403        end_page_writeback(page);
 404        return;
 405
 406still_busy:
 407        bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 408        local_irq_restore(flags);
 409        return;
 410}
 411EXPORT_SYMBOL(end_buffer_async_write);
 412
 413/*
  414 * If a page's buffers are under async read (end_buffer_async_read
 415 * completion) then there is a possibility that another thread of
 416 * control could lock one of the buffers after it has completed
 417 * but while some of the other buffers have not completed.  This
 418 * locked buffer would confuse end_buffer_async_read() into not unlocking
 419 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 420 * that this buffer is not under async I/O.
 421 *
 422 * The page comes unlocked when it has no locked buffer_async buffers
 423 * left.
 424 *
  425 * PageLocked prevents anyone from starting new async I/O against any of
  426 * the buffers.
 427 *
 428 * PageWriteback is used to prevent simultaneous writeout of the same
 429 * page.
 430 *
 431 * PageLocked prevents anyone from starting writeback of a page which is
 432 * under read I/O (PageWriteback is only ever set against a locked page).
 433 */
 434static void mark_buffer_async_read(struct buffer_head *bh)
 435{
 436        bh->b_end_io = end_buffer_async_read;
 437        set_buffer_async_read(bh);
 438}
 439
 440static void mark_buffer_async_write_endio(struct buffer_head *bh,
 441                                          bh_end_io_t *handler)
 442{
 443        bh->b_end_io = handler;
 444        set_buffer_async_write(bh);
 445}
 446
 447void mark_buffer_async_write(struct buffer_head *bh)
 448{
 449        mark_buffer_async_write_endio(bh, end_buffer_async_write);
 450}
 451EXPORT_SYMBOL(mark_buffer_async_write);
 452
 453
 454/*
  455 * fs/buffer.c contains helper functions for buffer-backed address_space's
 456 * fsync functions.  A common requirement for buffer-based filesystems is
 457 * that certain data from the backing blockdev needs to be written out for
 458 * a successful fsync().  For example, ext2 indirect blocks need to be
 459 * written back and waited upon before fsync() returns.
 460 *
  461 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
 462 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 463 * management of a list of dependent buffers at ->i_mapping->private_list.
 464 *
 465 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 466 * from their controlling inode's queue when they are being freed.  But
 467 * try_to_free_buffers() will be operating against the *blockdev* mapping
 468 * at the time, not against the S_ISREG file which depends on those buffers.
 469 * So the locking for private_list is via the private_lock in the address_space
 470 * which backs the buffers.  Which is different from the address_space 
 471 * against which the buffers are listed.  So for a particular address_space,
 472 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 473 * mapping->private_list will always be protected by the backing blockdev's
 474 * ->private_lock.
 475 *
 476 * Which introduces a requirement: all buffers on an address_space's
 477 * ->private_list must be from the same address_space: the blockdev's.
 478 *
 479 * address_spaces which do not place buffers at ->private_list via these
 480 * utility functions are free to use private_lock and private_list for
 481 * whatever they want.  The only requirement is that list_empty(private_list)
 482 * be true at clear_inode() time.
 483 *
 484 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 485 * filesystems should do that.  invalidate_inode_buffers() should just go
 486 * BUG_ON(!list_empty).
 487 *
 488 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 489 * take an address_space, not an inode.  And it should be called
 490 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 491 * queued up.
 492 *
 493 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 494 * list if it is already on a list.  Because if the buffer is on a list,
 495 * it *must* already be on the right one.  If not, the filesystem is being
 496 * silly.  This will save a ton of locking.  But first we have to ensure
 497 * that buffers are taken *off* the old inode's list when they are freed
 498 * (presumably in truncate).  That requires careful auditing of all
 499 * filesystems (do it inside bforget()).  It could also be done by bringing
 500 * b_inode back.
 501 */
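
/*
 * Example: a rough sketch of how a filesystem might use these helpers.
 * Metadata buffers (e.g. indirect blocks) are queued on the inode's
 * ->private_list with mark_buffer_dirty_inode(), and the fsync path writes
 * and waits on that list with sync_mapping_buffers().  All example_* names
 * are made up and the block is kept under #if 0 as an illustration only.
 */
#if 0
static void example_dirty_indirect(struct inode *inode, struct buffer_head *bh)
{
	/* Queue this metadata buffer so a later fsync() knows to write it. */
	mark_buffer_dirty_inode(bh, inode);
}

static int example_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct address_space *mapping = file->f_mapping;
	int err, err2;

	/* Write and wait upon the file's data pages. */
	err = filemap_write_and_wait_range(mapping, start, end);

	/* Then write and wait upon the associated metadata buffers. */
	err2 = sync_mapping_buffers(mapping);
	return err ? err : err2;
}
#endif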
 502
 503/*
 504 * The buffer's backing address_space's private_lock must be held
 505 */
 506static void __remove_assoc_queue(struct buffer_head *bh)
 507{
 508        list_del_init(&bh->b_assoc_buffers);
 509        WARN_ON(!bh->b_assoc_map);
 510        if (buffer_write_io_error(bh))
 511                set_bit(AS_EIO, &bh->b_assoc_map->flags);
 512        bh->b_assoc_map = NULL;
 513}
 514
 515int inode_has_buffers(struct inode *inode)
 516{
 517        return !list_empty(&inode->i_data.private_list);
 518}
 519
 520/*
 521 * osync is designed to support O_SYNC io.  It waits synchronously for
 522 * all already-submitted IO to complete, but does not queue any new
 523 * writes to the disk.
 524 *
 525 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 526 * you dirty the buffers, and then use osync_inode_buffers to wait for
 527 * completion.  Any other dirty buffers which are not yet queued for
 528 * write will not be flushed to disk by the osync.
 529 */
 530static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 531{
 532        struct buffer_head *bh;
 533        struct list_head *p;
 534        int err = 0;
 535
 536        spin_lock(lock);
 537repeat:
 538        list_for_each_prev(p, list) {
 539                bh = BH_ENTRY(p);
 540                if (buffer_locked(bh)) {
 541                        get_bh(bh);
 542                        spin_unlock(lock);
 543                        wait_on_buffer(bh);
 544                        if (!buffer_uptodate(bh))
 545                                err = -EIO;
 546                        brelse(bh);
 547                        spin_lock(lock);
 548                        goto repeat;
 549                }
 550        }
 551        spin_unlock(lock);
 552        return err;
 553}
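
/*
 * Example: the O_SYNC usage pattern described above, sketched from the
 * caller's side.  Buffers are queued for write with ll_rw_block() as they
 * are dirtied and then waited upon; anything dirtied afterwards is not
 * flushed.  The function name is made up and the block is kept under #if 0
 * as an illustration only.
 */
#if 0
static int example_osync_write(struct buffer_head **bhs, int nr)
{
	int i, err = 0;

	/* Queue the writes... */
	ll_rw_block(WRITE, nr, bhs);

	/* ...then wait for just those writes to complete. */
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}
#endif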
 554
 555static void do_thaw_one(struct super_block *sb, void *unused)
 556{
 557        char b[BDEVNAME_SIZE];
 558        while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 559                printk(KERN_WARNING "Emergency Thaw on %s\n",
 560                       bdevname(sb->s_bdev, b));
 561}
 562
 563static void do_thaw_all(struct work_struct *work)
 564{
 565        iterate_supers(do_thaw_one, NULL);
 566        kfree(work);
 567        printk(KERN_WARNING "Emergency Thaw complete\n");
 568}
 569
 570/**
 571 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 572 *
 573 * Used for emergency unfreeze of all filesystems via SysRq
 574 */
 575void emergency_thaw_all(void)
 576{
 577        struct work_struct *work;
 578
 579        work = kmalloc(sizeof(*work), GFP_ATOMIC);
 580        if (work) {
 581                INIT_WORK(work, do_thaw_all);
 582                schedule_work(work);
 583        }
 584}
 585
 586/**
 587 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 588 * @mapping: the mapping which wants those buffers written
 589 *
 590 * Starts I/O against the buffers at mapping->private_list, and waits upon
 591 * that I/O.
 592 *
 593 * Basically, this is a convenience function for fsync().
 594 * @mapping is a file or directory which needs those buffers to be written for
 595 * a successful fsync().
 596 */
 597int sync_mapping_buffers(struct address_space *mapping)
 598{
 599        struct address_space *buffer_mapping = mapping->private_data;
 600
 601        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 602                return 0;
 603
 604        return fsync_buffers_list(&buffer_mapping->private_lock,
 605                                        &mapping->private_list);
 606}
 607EXPORT_SYMBOL(sync_mapping_buffers);
 608
 609/*
 610 * Called when we've recently written block `bblock', and it is known that
 611 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 612 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 613 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 614 */
 615void write_boundary_block(struct block_device *bdev,
 616                        sector_t bblock, unsigned blocksize)
 617{
 618        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 619        if (bh) {
 620                if (buffer_dirty(bh))
 621                        ll_rw_block(WRITE, 1, &bh);
 622                put_bh(bh);
 623        }
 624}
 625
 626void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 627{
 628        struct address_space *mapping = inode->i_mapping;
 629        struct address_space *buffer_mapping = bh->b_page->mapping;
 630
 631        mark_buffer_dirty(bh);
 632        if (!mapping->private_data) {
 633                mapping->private_data = buffer_mapping;
 634        } else {
 635                BUG_ON(mapping->private_data != buffer_mapping);
 636        }
 637        if (!bh->b_assoc_map) {
 638                spin_lock(&buffer_mapping->private_lock);
 639                list_move_tail(&bh->b_assoc_buffers,
 640                                &mapping->private_list);
 641                bh->b_assoc_map = mapping;
 642                spin_unlock(&buffer_mapping->private_lock);
 643        }
 644}
 645EXPORT_SYMBOL(mark_buffer_dirty_inode);
 646
 647/*
  648 * Mark the page dirty, tag it dirty in its mapping's radix tree, and mark
  649 * the inode dirty.
 650 *
 651 * If warn is true, then emit a warning if the page is not uptodate and has
 652 * not been truncated.
 653 */
 654static void __set_page_dirty(struct page *page,
 655                struct address_space *mapping, int warn)
 656{
 657        spin_lock_irq(&mapping->tree_lock);
 658        if (page->mapping) {    /* Race with truncate? */
 659                WARN_ON_ONCE(warn && !PageUptodate(page));
 660                account_page_dirtied(page, mapping);
 661                radix_tree_tag_set(&mapping->page_tree,
 662                                page_index(page), PAGECACHE_TAG_DIRTY);
 663        }
 664        spin_unlock_irq(&mapping->tree_lock);
 665        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 666}
 667
 668/*
 669 * Add a page to the dirty page list.
 670 *
 671 * It is a sad fact of life that this function is called from several places
 672 * deeply under spinlocking.  It may not sleep.
 673 *
 674 * If the page has buffers, the uptodate buffers are set dirty, to preserve
  675 * dirty-state coherency between the page and the buffers.  If the page does
 676 * not have buffers then when they are later attached they will all be set
 677 * dirty.
 678 *
 679 * The buffers are dirtied before the page is dirtied.  There's a small race
 680 * window in which a writepage caller may see the page cleanness but not the
 681 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 682 * before the buffers, a concurrent writepage caller could clear the page dirty
 683 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 684 * page on the dirty page list.
 685 *
 686 * We use private_lock to lock against try_to_free_buffers while using the
 687 * page's buffer list.  Also use this to protect against clean buffers being
 688 * added to the page after it was set dirty.
 689 *
 690 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 691 * address_space though.
 692 */
 693int __set_page_dirty_buffers(struct page *page)
 694{
 695        int newly_dirty;
 696        struct address_space *mapping = page_mapping(page);
 697
 698        if (unlikely(!mapping))
 699                return !TestSetPageDirty(page);
 700
 701        spin_lock(&mapping->private_lock);
 702        if (page_has_buffers(page)) {
 703                struct buffer_head *head = page_buffers(page);
 704                struct buffer_head *bh = head;
 705
 706                do {
 707                        set_buffer_dirty(bh);
 708                        bh = bh->b_this_page;
 709                } while (bh != head);
 710        }
 711        newly_dirty = !TestSetPageDirty(page);
 712        spin_unlock(&mapping->private_lock);
 713
 714        if (newly_dirty)
 715                __set_page_dirty(page, mapping, 1);
 716        return newly_dirty;
 717}
 718EXPORT_SYMBOL(__set_page_dirty_buffers);
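
/*
 * Example: __set_page_dirty_buffers() is the ->set_page_dirty implementation
 * used by most buffer-backed address_spaces (it is also the fallback when an
 * aops supplies none), and can be wired up explicitly, roughly as below.
 * The structure name is made up and the block is kept under #if 0 as an
 * illustration only.
 */
#if 0
static const struct address_space_operations example_dirty_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,
	/* .readpage, .writepage, ... */
};
#endif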
 719
 720/*
 721 * Write out and wait upon a list of buffers.
 722 *
 723 * We have conflicting pressures: we want to make sure that all
 724 * initially dirty buffers get waited on, but that any subsequently
 725 * dirtied buffers don't.  After all, we don't want fsync to last
 726 * forever if somebody is actively writing to the file.
 727 *
 728 * Do this in two main stages: first we copy dirty buffers to a
 729 * temporary inode list, queueing the writes as we go.  Then we clean
 730 * up, waiting for those writes to complete.
 731 * 
 732 * During this second stage, any subsequent updates to the file may end
 733 * up refiling the buffer on the original inode's dirty list again, so
 734 * there is a chance we will end up with a buffer queued for write but
 735 * not yet completed on that list.  So, as a final cleanup we go through
 736 * the osync code to catch these locked, dirty buffers without requeuing
 737 * any newly dirty buffers for write.
 738 */
 739static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 740{
 741        struct buffer_head *bh;
 742        struct list_head tmp;
 743        struct address_space *mapping;
 744        int err = 0, err2;
 745        struct blk_plug plug;
 746
 747        INIT_LIST_HEAD(&tmp);
 748        blk_start_plug(&plug);
 749
 750        spin_lock(lock);
 751        while (!list_empty(list)) {
 752                bh = BH_ENTRY(list->next);
 753                mapping = bh->b_assoc_map;
 754                __remove_assoc_queue(bh);
 755                /* Avoid race with mark_buffer_dirty_inode() which does
  756                 * a lockless check; we rely on seeing the dirty bit */
 757                smp_mb();
 758                if (buffer_dirty(bh) || buffer_locked(bh)) {
 759                        list_add(&bh->b_assoc_buffers, &tmp);
 760                        bh->b_assoc_map = mapping;
 761                        if (buffer_dirty(bh)) {
 762                                get_bh(bh);
 763                                spin_unlock(lock);
 764                                /*
 765                                 * Ensure any pending I/O completes so that
 766                                 * write_dirty_buffer() actually writes the
 767                                 * current contents - it is a noop if I/O is
 768                                 * still in flight on potentially older
 769                                 * contents.
 770                                 */
 771                                write_dirty_buffer(bh, WRITE_SYNC);
 772
 773                                /*
 774                                 * Kick off IO for the previous mapping. Note
 775                                 * that we will not run the very last mapping,
 776                                 * wait_on_buffer() will do that for us
  777                                 * through sleep_on_buffer().
 778                                 */
 779                                brelse(bh);
 780                                spin_lock(lock);
 781                        }
 782                }
 783        }
 784
 785        spin_unlock(lock);
 786        blk_finish_plug(&plug);
 787        spin_lock(lock);
 788
 789        while (!list_empty(&tmp)) {
 790                bh = BH_ENTRY(tmp.prev);
 791                get_bh(bh);
 792                mapping = bh->b_assoc_map;
 793                __remove_assoc_queue(bh);
 794                /* Avoid race with mark_buffer_dirty_inode() which does
  795                 * a lockless check; we rely on seeing the dirty bit */
 796                smp_mb();
 797                if (buffer_dirty(bh)) {
 798                        list_add(&bh->b_assoc_buffers,
 799                                 &mapping->private_list);
 800                        bh->b_assoc_map = mapping;
 801                }
 802                spin_unlock(lock);
 803                wait_on_buffer(bh);
 804                if (!buffer_uptodate(bh))
 805                        err = -EIO;
 806                brelse(bh);
 807                spin_lock(lock);
 808        }
 809        
 810        spin_unlock(lock);
 811        err2 = osync_buffers_list(lock, list);
 812        if (err)
 813                return err;
 814        else
 815                return err2;
 816}
 817
 818/*
 819 * Invalidate any and all dirty buffers on a given inode.  We are
 820 * probably unmounting the fs, but that doesn't mean we have already
 821 * done a sync().  Just drop the buffers from the inode list.
 822 *
 823 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 824 * assumes that all the buffers are against the blockdev.  Not true
 825 * for reiserfs.
 826 */
 827void invalidate_inode_buffers(struct inode *inode)
 828{
 829        if (inode_has_buffers(inode)) {
 830                struct address_space *mapping = &inode->i_data;
 831                struct list_head *list = &mapping->private_list;
 832                struct address_space *buffer_mapping = mapping->private_data;
 833
 834                spin_lock(&buffer_mapping->private_lock);
 835                while (!list_empty(list))
 836                        __remove_assoc_queue(BH_ENTRY(list->next));
 837                spin_unlock(&buffer_mapping->private_lock);
 838        }
 839}
 840EXPORT_SYMBOL(invalidate_inode_buffers);
 841
 842/*
 843 * Remove any clean buffers from the inode's buffer list.  This is called
 844 * when we're trying to free the inode itself.  Those buffers can pin it.
 845 *
 846 * Returns true if all buffers were removed.
 847 */
 848int remove_inode_buffers(struct inode *inode)
 849{
 850        int ret = 1;
 851
 852        if (inode_has_buffers(inode)) {
 853                struct address_space *mapping = &inode->i_data;
 854                struct list_head *list = &mapping->private_list;
 855                struct address_space *buffer_mapping = mapping->private_data;
 856
 857                spin_lock(&buffer_mapping->private_lock);
 858                while (!list_empty(list)) {
 859                        struct buffer_head *bh = BH_ENTRY(list->next);
 860                        if (buffer_dirty(bh)) {
 861                                ret = 0;
 862                                break;
 863                        }
 864                        __remove_assoc_queue(bh);
 865                }
 866                spin_unlock(&buffer_mapping->private_lock);
 867        }
 868        return ret;
 869}
 870
 871/*
 872 * Create the appropriate buffers when given a page for data area and
  873 * the size of each buffer.  Use the bh->b_this_page linked list to
 874 * follow the buffers created.  Return NULL if unable to create more
 875 * buffers.
 876 *
  877 * The retry flag is used to differentiate async IO (paging, swapping),
  878 * which may not fail, from ordinary buffer allocations.
 879 */
 880struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 881                int retry)
 882{
 883        struct buffer_head *bh, *head;
 884        long offset;
 885
 886try_again:
 887        head = NULL;
 888        offset = PAGE_SIZE;
 889        while ((offset -= size) >= 0) {
 890                bh = alloc_buffer_head(GFP_NOFS);
 891                if (!bh)
 892                        goto no_grow;
 893
 894                bh->b_this_page = head;
 895                bh->b_blocknr = -1;
 896                head = bh;
 897
 898                bh->b_size = size;
 899
 900                /* Link the buffer to its page */
 901                set_bh_page(bh, page, offset);
 902        }
 903        return head;
 904/*
 905 * In case anything failed, we just free everything we got.
 906 */
 907no_grow:
 908        if (head) {
 909                do {
 910                        bh = head;
 911                        head = head->b_this_page;
 912                        free_buffer_head(bh);
 913                } while (head);
 914        }
 915
 916        /*
 917         * Return failure for non-async IO requests.  Async IO requests
 918         * are not allowed to fail, so we have to wait until buffer heads
 919         * become available.  But we don't want tasks sleeping with 
 920         * partially complete buffers, so all were released above.
 921         */
 922        if (!retry)
 923                return NULL;
 924
 925        /* We're _really_ low on memory. Now we just
 926         * wait for old buffer heads to become free due to
 927         * finishing IO.  Since this is an async request and
 928         * the reserve list is empty, we're sure there are 
 929         * async buffer heads in use.
 930         */
 931        free_more_memory();
 932        goto try_again;
 933}
 934EXPORT_SYMBOL_GPL(alloc_page_buffers);
 935
 936static inline void
 937link_dev_buffers(struct page *page, struct buffer_head *head)
 938{
 939        struct buffer_head *bh, *tail;
 940
 941        bh = head;
 942        do {
 943                tail = bh;
 944                bh = bh->b_this_page;
 945        } while (bh);
 946        tail->b_this_page = head;
 947        attach_page_buffers(page, head);
 948}
 949
 950static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
 951{
 952        sector_t retval = ~((sector_t)0);
 953        loff_t sz = i_size_read(bdev->bd_inode);
 954
 955        if (sz) {
 956                unsigned int sizebits = blksize_bits(size);
 957                retval = (sz >> sizebits);
 958        }
 959        return retval;
 960}
 961
 962/*
 963 * Initialise the state of a blockdev page's buffers.
 964 */ 
 965static sector_t
 966init_page_buffers(struct page *page, struct block_device *bdev,
 967                        sector_t block, int size)
 968{
 969        struct buffer_head *head = page_buffers(page);
 970        struct buffer_head *bh = head;
 971        int uptodate = PageUptodate(page);
 972        sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 973
 974        do {
 975                if (!buffer_mapped(bh)) {
 976                        init_buffer(bh, NULL, NULL);
 977                        bh->b_bdev = bdev;
 978                        bh->b_blocknr = block;
 979                        if (uptodate)
 980                                set_buffer_uptodate(bh);
 981                        if (block < end_block)
 982                                set_buffer_mapped(bh);
 983                }
 984                block++;
 985                bh = bh->b_this_page;
 986        } while (bh != head);
 987
 988        /*
 989         * Caller needs to validate requested block against end of device.
 990         */
 991        return end_block;
 992}
 993
 994/*
 995 * Create the page-cache page that contains the requested block.
 996 *
 997 * This is used purely for blockdev mappings.
 998 */
 999static int
1000grow_dev_page(struct block_device *bdev, sector_t block,
1001                pgoff_t index, int size, int sizebits)
1002{
1003        struct inode *inode = bdev->bd_inode;
1004        struct page *page;
1005        struct buffer_head *bh;
1006        sector_t end_block;
1007        int ret = 0;            /* Will call free_more_memory() */
1008        gfp_t gfp_mask;
1009
1010        gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
1011        gfp_mask |= __GFP_MOVABLE;
1012        /*
1013         * XXX: __getblk_slow() can not really deal with failure and
1014         * will endlessly loop on improvised global reclaim.  Prefer
1015         * looping in the allocator rather than here, at least that
1016         * code knows what it's doing.
1017         */
1018        gfp_mask |= __GFP_NOFAIL;
1019
1020        page = find_or_create_page(inode->i_mapping, index, gfp_mask);
1021        if (!page)
1022                return ret;
1023
1024        BUG_ON(!PageLocked(page));
1025
1026        if (page_has_buffers(page)) {
1027                bh = page_buffers(page);
1028                if (bh->b_size == size) {
1029                        end_block = init_page_buffers(page, bdev,
1030                                                index << sizebits, size);
1031                        goto done;
1032                }
1033                if (!try_to_free_buffers(page))
1034                        goto failed;
1035        }
1036
1037        /*
1038         * Allocate some buffers for this page
1039         */
1040        bh = alloc_page_buffers(page, size, 0);
1041        if (!bh)
1042                goto failed;
1043
1044        /*
1045         * Link the page to the buffers and initialise them.  Take the
1046         * lock to be atomic wrt __find_get_block(), which does not
1047         * run under the page lock.
1048         */
1049        spin_lock(&inode->i_mapping->private_lock);
1050        link_dev_buffers(page, bh);
1051        end_block = init_page_buffers(page, bdev, index << sizebits, size);
1052        spin_unlock(&inode->i_mapping->private_lock);
1053done:
1054        ret = (block < end_block) ? 1 : -ENXIO;
1055failed:
1056        unlock_page(page);
1057        page_cache_release(page);
1058        return ret;
1059}
1060
1061/*
1062 * Create buffers for the specified block device block's page.  If
1063 * that page was dirty, the buffers are set dirty also.
1064 */
1065static int
1066grow_buffers(struct block_device *bdev, sector_t block, int size)
1067{
1068        pgoff_t index;
1069        int sizebits;
1070
1071        sizebits = -1;
1072        do {
1073                sizebits++;
1074        } while ((size << sizebits) < PAGE_SIZE);
1075
1076        index = block >> sizebits;
1077
1078        /*
1079         * Check for a block which wants to lie outside our maximum possible
1080         * pagecache index.  (this comparison is done using sector_t types).
1081         */
1082        if (unlikely(index != block >> sizebits)) {
1083                char b[BDEVNAME_SIZE];
1084
1085                printk(KERN_ERR "%s: requested out-of-range block %llu for "
1086                        "device %s\n",
1087                        __func__, (unsigned long long)block,
1088                        bdevname(bdev, b));
1089                return -EIO;
1090        }
1091
1092        /* Create a page with the proper size buffers.. */
1093        return grow_dev_page(bdev, block, index, size, sizebits);
1094}
1095
1096static struct buffer_head *
1097__getblk_slow(struct block_device *bdev, sector_t block, int size)
1098{
 1099        /* Size must be a multiple of the hardware sector size */
1100        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1101                        (size < 512 || size > PAGE_SIZE))) {
1102                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1103                                        size);
1104                printk(KERN_ERR "logical block size: %d\n",
1105                                        bdev_logical_block_size(bdev));
1106
1107                dump_stack();
1108                return NULL;
1109        }
1110
1111        for (;;) {
1112                struct buffer_head *bh;
1113                int ret;
1114
1115                bh = __find_get_block(bdev, block, size);
1116                if (bh)
1117                        return bh;
1118
1119                ret = grow_buffers(bdev, block, size);
1120                if (ret < 0)
1121                        return NULL;
1122                if (ret == 0)
1123                        free_more_memory();
1124        }
1125}
1126
1127/*
1128 * The relationship between dirty buffers and dirty pages:
1129 *
1130 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1131 * the page is tagged dirty in its radix tree.
1132 *
1133 * At all times, the dirtiness of the buffers represents the dirtiness of
1134 * subsections of the page.  If the page has buffers, the page dirty bit is
1135 * merely a hint about the true dirty state.
1136 *
1137 * When a page is set dirty in its entirety, all its buffers are marked dirty
1138 * (if the page has buffers).
1139 *
1140 * When a buffer is marked dirty, its page is dirtied, but the page's other
1141 * buffers are not.
1142 *
1143 * Also.  When blockdev buffers are explicitly read with bread(), they
1144 * individually become uptodate.  But their backing page remains not
1145 * uptodate - even if all of its buffers are uptodate.  A subsequent
1146 * block_read_full_page() against that page will discover all the uptodate
1147 * buffers, will set the page uptodate and will perform no I/O.
1148 */
1149
1150/**
1151 * mark_buffer_dirty - mark a buffer_head as needing writeout
1152 * @bh: the buffer_head to mark dirty
1153 *
1154 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1155 * backing page dirty, then tag the page as dirty in its address_space's radix
1156 * tree and then attach the address_space's inode to its superblock's dirty
1157 * inode list.
1158 *
1159 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1160 * mapping->tree_lock and mapping->host->i_lock.
1161 */
1162void mark_buffer_dirty(struct buffer_head *bh)
1163{
1164        WARN_ON_ONCE(!buffer_uptodate(bh));
1165
1166        trace_block_dirty_buffer(bh);
1167
1168        /*
1169         * Very *carefully* optimize the it-is-already-dirty case.
1170         *
1171         * Don't let the final "is it dirty" escape to before we
1172         * perhaps modified the buffer.
1173         */
1174        if (buffer_dirty(bh)) {
1175                smp_mb();
1176                if (buffer_dirty(bh))
1177                        return;
1178        }
1179
1180        if (!test_set_buffer_dirty(bh)) {
1181                struct page *page = bh->b_page;
1182                if (!TestSetPageDirty(page)) {
1183                        struct address_space *mapping = page_mapping(page);
1184                        if (mapping)
1185                                __set_page_dirty(page, mapping, 0);
1186                }
1187        }
1188}
1189EXPORT_SYMBOL(mark_buffer_dirty);
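
/*
 * Example: the typical metadata read-modify-write cycle that ends in
 * mark_buffer_dirty().  sb_bread() returns an uptodate buffer (or NULL),
 * the caller modifies b_data under the buffer lock, marks it dirty and
 * drops its reference; writeback happens later.  The function name and
 * offset handling are made up and the block is kept under #if 0 as an
 * illustration only.
 */
#if 0
static int example_update_block(struct super_block *sb, sector_t block,
				unsigned offset, u8 value)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;

	lock_buffer(bh);
	((u8 *)bh->b_data)[offset] = value;	/* modify the cached block */
	unlock_buffer(bh);

	mark_buffer_dirty(bh);			/* buffer, page and inode dirty */
	brelse(bh);				/* drop our reference */
	return 0;
}
#endif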
1190
1191/*
1192 * Decrement a buffer_head's reference count.  If all buffers against a page
1193 * have zero reference count, are clean and unlocked, and if the page is clean
1194 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1195 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1196 * a page but it ends up not being freed, and buffers may later be reattached).
1197 */
1198void __brelse(struct buffer_head * buf)
1199{
1200        if (atomic_read(&buf->b_count)) {
1201                put_bh(buf);
1202                return;
1203        }
1204        WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1205}
1206EXPORT_SYMBOL(__brelse);
1207
1208/*
1209 * bforget() is like brelse(), except it discards any
1210 * potentially dirty data.
1211 */
1212void __bforget(struct buffer_head *bh)
1213{
1214        clear_buffer_dirty(bh);
1215        if (bh->b_assoc_map) {
1216                struct address_space *buffer_mapping = bh->b_page->mapping;
1217
1218                spin_lock(&buffer_mapping->private_lock);
1219                list_del_init(&bh->b_assoc_buffers);
1220                bh->b_assoc_map = NULL;
1221                spin_unlock(&buffer_mapping->private_lock);
1222        }
1223        __brelse(bh);
1224}
1225EXPORT_SYMBOL(__bforget);
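
/*
 * Example: the difference between brelse() and bforget() in a sketch.
 * brelse() just drops the reference and lets any dirty data reach disk
 * later; bforget() additionally throws the dirty data away, which is only
 * safe when the block is known to be stale (e.g. it has just been freed).
 * The function name is made up and the block is kept under #if 0 as an
 * illustration only.
 */
#if 0
static void example_release(struct buffer_head *bh, bool block_was_freed)
{
	if (block_was_freed)
		bforget(bh);	/* discard dirty data, drop reference */
	else
		brelse(bh);	/* keep dirty data, drop reference */
}
#endif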
1226
1227static struct buffer_head *__bread_slow(struct buffer_head *bh)
1228{
1229        lock_buffer(bh);
1230        if (buffer_uptodate(bh)) {
1231                unlock_buffer(bh);
1232                return bh;
1233        } else {
1234                get_bh(bh);
1235                bh->b_end_io = end_buffer_read_sync;
1236                submit_bh(READ, bh);
1237                wait_on_buffer(bh);
1238                if (buffer_uptodate(bh))
1239                        return bh;
1240        }
1241        brelse(bh);
1242        return NULL;
1243}
1244
1245/*
 1246 * Per-cpu buffer LRU implementation.  It reduces the cost of __find_get_block().
1247 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1248 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1249 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1250 * CPU's LRUs at the same time.
1251 *
1252 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1253 * sb_find_get_block().
1254 *
1255 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1256 * a local interrupt disable for that.
1257 */
1258
1259#define BH_LRU_SIZE     8
1260
1261struct bh_lru {
1262        struct buffer_head *bhs[BH_LRU_SIZE];
1263};
1264
1265static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1266
1267#ifdef CONFIG_SMP
1268#define bh_lru_lock()   local_irq_disable()
1269#define bh_lru_unlock() local_irq_enable()
1270#else
1271#define bh_lru_lock()   preempt_disable()
1272#define bh_lru_unlock() preempt_enable()
1273#endif
1274
1275static inline void check_irqs_on(void)
1276{
1277#ifdef irqs_disabled
1278        BUG_ON(irqs_disabled());
1279#endif
1280}
1281
1282/*
1283 * The LRU management algorithm is dopey-but-simple.  Sorry.
1284 */
1285static void bh_lru_install(struct buffer_head *bh)
1286{
1287        struct buffer_head *evictee = NULL;
1288
1289        check_irqs_on();
1290        bh_lru_lock();
1291        if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1292                struct buffer_head *bhs[BH_LRU_SIZE];
1293                int in;
1294                int out = 0;
1295
1296                get_bh(bh);
1297                bhs[out++] = bh;
1298                for (in = 0; in < BH_LRU_SIZE; in++) {
1299                        struct buffer_head *bh2 =
1300                                __this_cpu_read(bh_lrus.bhs[in]);
1301
1302                        if (bh2 == bh) {
1303                                __brelse(bh2);
1304                        } else {
1305                                if (out >= BH_LRU_SIZE) {
1306                                        BUG_ON(evictee != NULL);
1307                                        evictee = bh2;
1308                                } else {
1309                                        bhs[out++] = bh2;
1310                                }
1311                        }
1312                }
1313                while (out < BH_LRU_SIZE)
1314                        bhs[out++] = NULL;
1315                memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1316        }
1317        bh_lru_unlock();
1318
1319        if (evictee)
1320                __brelse(evictee);
1321}
1322
1323/*
1324 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1325 */
1326static struct buffer_head *
1327lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1328{
1329        struct buffer_head *ret = NULL;
1330        unsigned int i;
1331
1332        check_irqs_on();
1333        bh_lru_lock();
1334        for (i = 0; i < BH_LRU_SIZE; i++) {
1335                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1336
1337                if (bh && bh->b_bdev == bdev &&
1338                                bh->b_blocknr == block && bh->b_size == size) {
1339                        if (i) {
1340                                while (i) {
1341                                        __this_cpu_write(bh_lrus.bhs[i],
1342                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
1343                                        i--;
1344                                }
1345                                __this_cpu_write(bh_lrus.bhs[0], bh);
1346                        }
1347                        get_bh(bh);
1348                        ret = bh;
1349                        break;
1350                }
1351        }
1352        bh_lru_unlock();
1353        return ret;
1354}
1355
1356/*
1357 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1358 * it in the LRU and mark it as accessed.  If it is not present then return
 1359 * NULL.
1360 */
1361struct buffer_head *
1362__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1363{
1364        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1365
1366        if (bh == NULL) {
1367                bh = __find_get_block_slow(bdev, block);
1368                if (bh)
1369                        bh_lru_install(bh);
1370        }
1371        if (bh)
1372                touch_buffer(bh);
1373        return bh;
1374}
1375EXPORT_SYMBOL(__find_get_block);
1376
1377/*
1378 * __getblk will locate (and, if necessary, create) the buffer_head
1379 * which corresponds to the passed block_device, block and size. The
1380 * returned buffer has its reference count incremented.
1381 *
1382 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1383 * attempt is failing.  FIXME, perhaps?
1384 */
1385struct buffer_head *
1386__getblk(struct block_device *bdev, sector_t block, unsigned size)
1387{
1388        struct buffer_head *bh = __find_get_block(bdev, block, size);
1389
1390        might_sleep();
1391        if (bh == NULL)
1392                bh = __getblk_slow(bdev, block, size);
1393        return bh;
1394}
1395EXPORT_SYMBOL(__getblk);
1396
1397/*
1398 * Do async read-ahead on a buffer..
1399 */
1400void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1401{
1402        struct buffer_head *bh = __getblk(bdev, block, size);
1403        if (likely(bh)) {
1404                ll_rw_block(READA, 1, &bh);
1405                brelse(bh);
1406        }
1407}
1408EXPORT_SYMBOL(__breadahead);
1409
1410/**
1411 *  __bread() - reads a specified block and returns the bh
1412 *  @bdev: the block_device to read from
1413 *  @block: number of block
1414 *  @size: size (in bytes) to read
1415 * 
 1416 *  Reads a specified block, and returns the buffer head that contains it.
1417 *  It returns NULL if the block was unreadable.
1418 */
1419struct buffer_head *
1420__bread(struct block_device *bdev, sector_t block, unsigned size)
1421{
1422        struct buffer_head *bh = __getblk(bdev, block, size);
1423
1424        if (likely(bh) && !buffer_uptodate(bh))
1425                bh = __bread_slow(bh);
1426        return bh;
1427}
1428EXPORT_SYMBOL(__bread);
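
/*
 * Example: pairing __breadahead() with __bread() (or their sb_* wrappers).
 * Readahead is started early without waiting; the later __bread() then
 * usually finds the block already uptodate in the cache.  The function name
 * is made up and the block is kept under #if 0 as an illustration only.
 */
#if 0
static struct buffer_head *
example_read_with_readahead(struct block_device *bdev, sector_t block,
			    unsigned size)
{
	/* Hint that the next block will be wanted soon. */
	__breadahead(bdev, block + 1, size);

	/* Synchronously read the block we need right now. */
	return __bread(bdev, block, size);	/* NULL on I/O error */
}
#endif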
1429
1430/*
1431 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1432 * This doesn't race because it runs in each cpu either in irq
1433 * or with preempt disabled.
1434 */
1435static void invalidate_bh_lru(void *arg)
1436{
1437        struct bh_lru *b = &get_cpu_var(bh_lrus);
1438        int i;
1439
1440        for (i = 0; i < BH_LRU_SIZE; i++) {
1441                brelse(b->bhs[i]);
1442                b->bhs[i] = NULL;
1443        }
1444        put_cpu_var(bh_lrus);
1445}
1446
1447static bool has_bh_in_lru(int cpu, void *dummy)
1448{
1449        struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1450        int i;
1451        
1452        for (i = 0; i < BH_LRU_SIZE; i++) {
1453                if (b->bhs[i])
1454                        return 1;
1455        }
1456
1457        return 0;
1458}
1459
1460void invalidate_bh_lrus(void)
1461{
1462        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
1463}
1464EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1465
1466void set_bh_page(struct buffer_head *bh,
1467                struct page *page, unsigned long offset)
1468{
1469        bh->b_page = page;
1470        BUG_ON(offset >= PAGE_SIZE);
1471        if (PageHighMem(page))
1472                /*
1473                 * This catches illegal uses and preserves the offset:
1474                 */
1475                bh->b_data = (char *)(0 + offset);
1476        else
1477                bh->b_data = page_address(page) + offset;
1478}
1479EXPORT_SYMBOL(set_bh_page);
1480
1481/*
1482 * Called when truncating a buffer on a page completely.
1483 */
1484static void discard_buffer(struct buffer_head * bh)
1485{
1486        lock_buffer(bh);
1487        clear_buffer_dirty(bh);
1488        bh->b_bdev = NULL;
1489        clear_buffer_mapped(bh);
1490        clear_buffer_req(bh);
1491        clear_buffer_new(bh);
1492        clear_buffer_delay(bh);
1493        clear_buffer_unwritten(bh);
1494        unlock_buffer(bh);
1495}
1496
1497/**
1498 * block_invalidatepage - invalidate part or all of a buffer-backed page
1499 *
1500 * @page: the page which is affected
1501 * @offset: start of the range to invalidate
1502 * @length: length of the range to invalidate
1503 *
1504 * block_invalidatepage() is called when all or part of the page has become
1505 * invalidated by a truncate operation.
1506 *
1507 * block_invalidatepage() does not have to release all buffers, but it must
1508 * ensure that no dirty buffer is left outside @offset and that no I/O
1509 * is underway against any of the blocks which are outside the truncation
 1510 * point, because the caller is about to free (and possibly reuse) those
1511 * blocks on-disk.
1512 */
1513void block_invalidatepage(struct page *page, unsigned int offset,
1514                          unsigned int length)
1515{
1516        struct buffer_head *head, *bh, *next;
1517        unsigned int curr_off = 0;
1518        unsigned int stop = length + offset;
1519
1520        BUG_ON(!PageLocked(page));
1521        if (!page_has_buffers(page))
1522                goto out;
1523
1524        /*
1525         * Check for overflow
1526         */
1527        BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1528
1529        head = page_buffers(page);
1530        bh = head;
1531        do {
1532                unsigned int next_off = curr_off + bh->b_size;
1533                next = bh->b_this_page;
1534
1535                /*
 1536                 * Are we still fully in range?
1537                 */
1538                if (next_off > stop)
1539                        goto out;
1540
1541                /*
1542                 * is this block fully invalidated?
1543                 */
1544                if (offset <= curr_off)
1545                        discard_buffer(bh);
1546                curr_off = next_off;
1547                bh = next;
1548        } while (bh != head);
1549
1550        /*
1551         * We release buffers only if the entire page is being invalidated.
1552         * The get_block cached value has been unconditionally invalidated,
1553         * so real IO is not possible anymore.
1554         */
1555        if (offset == 0)
1556                try_to_release_page(page, 0);
1557out:
1558        return;
1559}
1560EXPORT_SYMBOL(block_invalidatepage);
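
/*
 * Illustrative sketch (not part of this file): a filesystem that needs no
 * private bookkeeping on truncation can point ->invalidatepage straight at
 * block_invalidatepage(); one that does can wrap it as below.  All
 * "examplefs_*" identifiers in these sketches are hypothetical.
 */
static void examplefs_invalidatepage(struct page *page, unsigned int offset,
                                     unsigned int length)
{
        /* filesystem-private cleanup for the invalidated range goes here */
        block_invalidatepage(page, offset, length);
}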
1561
1562
1563/*
1564 * We attach and possibly dirty the buffers atomically wrt
1565 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1566 * is already excluded via the page lock.
1567 */
1568void create_empty_buffers(struct page *page,
1569                        unsigned long blocksize, unsigned long b_state)
1570{
1571        struct buffer_head *bh, *head, *tail;
1572
1573        head = alloc_page_buffers(page, blocksize, 1);
1574        bh = head;
1575        do {
1576                bh->b_state |= b_state;
1577                tail = bh;
1578                bh = bh->b_this_page;
1579        } while (bh);
1580        tail->b_this_page = head;
1581
1582        spin_lock(&page->mapping->private_lock);
1583        if (PageUptodate(page) || PageDirty(page)) {
1584                bh = head;
1585                do {
1586                        if (PageDirty(page))
1587                                set_buffer_dirty(bh);
1588                        if (PageUptodate(page))
1589                                set_buffer_uptodate(bh);
1590                        bh = bh->b_this_page;
1591                } while (bh != head);
1592        }
1593        attach_page_buffers(page, head);
1594        spin_unlock(&page->mapping->private_lock);
1595}
1596EXPORT_SYMBOL(create_empty_buffers);
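
/*
 * Illustrative sketch (not part of this file): the usual calling pattern is
 * "attach buffers if the locked page has none, then walk them".  Only the
 * create_empty_buffers()/page_buffers() calls are real library usage; the
 * surrounding function is hypothetical.
 */
static void examplefs_walk_page_buffers(struct inode *inode, struct page *page)
{
        struct buffer_head *bh, *head;

        BUG_ON(!PageLocked(page));
        if (!page_has_buffers(page))
                create_empty_buffers(page, 1 << inode->i_blkbits, 0);

        bh = head = page_buffers(page);
        do {
                /* per-buffer work would go here */
                bh = bh->b_this_page;
        } while (bh != head);
}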
1597
1598/*
1599 * We are taking a block for data and we don't want any output from any
1600 * buffer-cache aliases starting from the return of this function and
1601 * until the moment when something explicitly marks the buffer
1602 * dirty (hopefully that will not happen until we free that block ;-)
1603 * We don't even need to mark it not-uptodate - nobody can expect
1604 * anything from a newly allocated buffer anyway. We used to use
1605 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1606 * don't want to mark the alias unmapped, for example - it would confuse
1607 * anyone who might pick it up with bread() afterwards...
1608 *
1609 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1610 * be writeout I/O going on against recently-freed buffers.  We don't
1611 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1612 * only if we really need to.  That happens here.
1613 */
1614void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1615{
1616        struct buffer_head *old_bh;
1617
1618        might_sleep();
1619
1620        old_bh = __find_get_block_slow(bdev, block);
1621        if (old_bh) {
1622                clear_buffer_dirty(old_bh);
1623                wait_on_buffer(old_bh);
1624                clear_buffer_req(old_bh);
1625                __brelse(old_bh);
1626        }
1627}
1628EXPORT_SYMBOL(unmap_underlying_metadata);
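
/*
 * Illustrative sketch (not part of this file): the callers of the helper
 * above rely on the filesystem's get_block() marking freshly allocated
 * blocks with set_buffer_new().  map_bh() and set_buffer_new() are the real
 * helpers from <linux/buffer_head.h>; examplefs_find_or_alloc_block() is a
 * hypothetical block mapper.
 */
static int examplefs_get_block(struct inode *inode, sector_t iblock,
                               struct buffer_head *bh_result, int create)
{
        sector_t phys;
        int new = 0;

        phys = examplefs_find_or_alloc_block(inode, iblock, create, &new);
        if (!phys)
                return create ? -ENOSPC : 0;    /* unmapped bh + 0 == hole */

        map_bh(bh_result, inode->i_sb, phys);
        if (new)
                /* new block: callers then unmap_underlying_metadata() it */
                set_buffer_new(bh_result);
        return 0;
}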
1629
1630/*
1631 * Size is a power-of-two in the range 512..PAGE_SIZE,
1632 * and the case we care about most is PAGE_SIZE.
1633 *
1634 * So this *could* possibly be written with those
1635 * constraints in mind (relevant mostly if some
1636 * architecture has a slow bit-scan instruction)
1637 */
1638static inline int block_size_bits(unsigned int blocksize)
1639{
1640        return ilog2(blocksize);
1641}
1642
1643static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1644{
1645        BUG_ON(!PageLocked(page));
1646
1647        if (!page_has_buffers(page))
1648                create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1649        return page_buffers(page);
1650}
1651
1652/*
1653 * NOTE! All mapped/uptodate combinations are valid:
1654 *
1655 *      Mapped  Uptodate        Meaning
1656 *
1657 *      No      No              "unknown" - must do get_block()
1658 *      No      Yes             "hole" - zero-filled
1659 *      Yes     No              "allocated" - allocated on disk, not read in
1660 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1661 *
1662 * "Dirty" is valid only with the last case (mapped+uptodate).
1663 */
1664
1665/*
1666 * While block_write_full_page is writing back the dirty buffers under
1667 * the page lock, whoever dirtied the buffers may decide to clean them
1668 * again at any time.  We handle that by only looking at the buffer
1669 * state inside lock_buffer().
1670 *
1671 * If block_write_full_page() is called for regular writeback
1672 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1673 * locked buffer.  This can only happen if someone has written the buffer
1674 * directly, with submit_bh().  At the address_space level PageWriteback
1675 * prevents this contention from occurring.
1676 *
1677 * If block_write_full_page() is called with wbc->sync_mode ==
1678 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1679 * causes the writes to be flagged as synchronous writes.
1680 */
1681static int __block_write_full_page(struct inode *inode, struct page *page,
1682                        get_block_t *get_block, struct writeback_control *wbc,
1683                        bh_end_io_t *handler)
1684{
1685        int err;
1686        sector_t block;
1687        sector_t last_block;
1688        struct buffer_head *bh, *head;
1689        unsigned int blocksize, bbits;
1690        int nr_underway = 0;
1691        int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1692                        WRITE_SYNC : WRITE);
1693
1694        head = create_page_buffers(page, inode,
1695                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
1696
1697        /*
1698         * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1699         * here, and the (potentially unmapped) buffers may become dirty at
1700         * any time.  If a buffer becomes dirty here after we've inspected it
1701         * then we just miss that fact, and the page stays dirty.
1702         *
1703         * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1704         * handle that here by just cleaning them.
1705         */
1706
1707        bh = head;
1708        blocksize = bh->b_size;
1709        bbits = block_size_bits(blocksize);
1710
1711        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1712        last_block = (i_size_read(inode) - 1) >> bbits;
1713
1714        /*
1715         * Get all the dirty buffers mapped to disk addresses and
1716         * handle any aliases from the underlying blockdev's mapping.
1717         */
1718        do {
1719                if (block > last_block) {
1720                        /*
1721                         * mapped buffers outside i_size will occur, because
1722                         * this page can be outside i_size when there is a
1723                         * truncate in progress.
1724                         */
1725                        /*
1726                         * The buffer was zeroed by block_write_full_page()
1727                         */
1728                        clear_buffer_dirty(bh);
1729                        set_buffer_uptodate(bh);
1730                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1731                           buffer_dirty(bh)) {
1732                        WARN_ON(bh->b_size != blocksize);
1733                        err = get_block(inode, block, bh, 1);
1734                        if (err)
1735                                goto recover;
1736                        clear_buffer_delay(bh);
1737                        if (buffer_new(bh)) {
1738                                /* blockdev mappings never come here */
1739                                clear_buffer_new(bh);
1740                                unmap_underlying_metadata(bh->b_bdev,
1741                                                        bh->b_blocknr);
1742                        }
1743                }
1744                bh = bh->b_this_page;
1745                block++;
1746        } while (bh != head);
1747
1748        do {
1749                if (!buffer_mapped(bh))
1750                        continue;
1751                /*
1752                 * If it's a fully non-blocking write attempt and we cannot
1753                 * lock the buffer then redirty the page.  Note that this can
1754                 * potentially cause a busy-wait loop from writeback threads
1755                 * and kswapd activity, but those code paths have their own
1756                 * higher-level throttling.
1757                 */
1758                if (wbc->sync_mode != WB_SYNC_NONE) {
1759                        lock_buffer(bh);
1760                } else if (!trylock_buffer(bh)) {
1761                        redirty_page_for_writepage(wbc, page);
1762                        continue;
1763                }
1764                if (test_clear_buffer_dirty(bh)) {
1765                        mark_buffer_async_write_endio(bh, handler);
1766                } else {
1767                        unlock_buffer(bh);
1768                }
1769        } while ((bh = bh->b_this_page) != head);
1770
1771        /*
1772         * The page and its buffers are protected by PageWriteback(), so we can
1773         * drop the bh refcounts early.
1774         */
1775        BUG_ON(PageWriteback(page));
1776        set_page_writeback(page);
1777
1778        do {
1779                struct buffer_head *next = bh->b_this_page;
1780                if (buffer_async_write(bh)) {
1781                        submit_bh(write_op, bh);
1782                        nr_underway++;
1783                }
1784                bh = next;
1785        } while (bh != head);
1786        unlock_page(page);
1787
1788        err = 0;
1789done:
1790        if (nr_underway == 0) {
1791                /*
1792                 * The page was marked dirty, but the buffers were
1793                 * clean.  Someone wrote them back by hand with
1794                 * ll_rw_block/submit_bh.  A rare case.
1795                 */
1796                end_page_writeback(page);
1797
1798                /*
1799                 * The page and buffer_heads can be released at any time from
1800                 * here on.
1801                 */
1802        }
1803        return err;
1804
1805recover:
1806        /*
1807         * ENOSPC, or some other error.  We may already have added some
1808         * blocks to the file, so we need to write these out to avoid
1809         * exposing stale data.
1810         * The page is currently locked and not marked for writeback
1811         */
1812        bh = head;
1813        /* Recovery: lock and submit the mapped buffers */
1814        do {
1815                if (buffer_mapped(bh) && buffer_dirty(bh) &&
1816                    !buffer_delay(bh)) {
1817                        lock_buffer(bh);
1818                        mark_buffer_async_write_endio(bh, handler);
1819                } else {
1820                        /*
1821                         * The buffer may have been set dirty during
1822                         * attachment to a dirty page.
1823                         */
1824                        clear_buffer_dirty(bh);
1825                }
1826        } while ((bh = bh->b_this_page) != head);
1827        SetPageError(page);
1828        BUG_ON(PageWriteback(page));
1829        mapping_set_error(page->mapping, err);
1830        set_page_writeback(page);
1831        do {
1832                struct buffer_head *next = bh->b_this_page;
1833                if (buffer_async_write(bh)) {
1834                        clear_buffer_dirty(bh);
1835                        submit_bh(write_op, bh);
1836                        nr_underway++;
1837                }
1838                bh = next;
1839        } while (bh != head);
1840        unlock_page(page);
1841        goto done;
1842}
1843
1844/*
1845 * If a page has any new buffers, zero them out here, and mark them uptodate
1846 * and dirty so they'll be written out (in order to prevent uninitialised
1847 * block data from leaking). And clear the new bit.
1848 */
1849void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1850{
1851        unsigned int block_start, block_end;
1852        struct buffer_head *head, *bh;
1853
1854        BUG_ON(!PageLocked(page));
1855        if (!page_has_buffers(page))
1856                return;
1857
1858        bh = head = page_buffers(page);
1859        block_start = 0;
1860        do {
1861                block_end = block_start + bh->b_size;
1862
1863                if (buffer_new(bh)) {
1864                        if (block_end > from && block_start < to) {
1865                                if (!PageUptodate(page)) {
1866                                        unsigned start, size;
1867
1868                                        start = max(from, block_start);
1869                                        size = min(to, block_end) - start;
1870
1871                                        zero_user(page, start, size);
1872                                        set_buffer_uptodate(bh);
1873                                }
1874
1875                                clear_buffer_new(bh);
1876                                mark_buffer_dirty(bh);
1877                        }
1878                }
1879
1880                block_start = block_end;
1881                bh = bh->b_this_page;
1882        } while (bh != head);
1883}
1884EXPORT_SYMBOL(page_zero_new_buffers);
1885
1886int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1887                get_block_t *get_block)
1888{
1889        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1890        unsigned to = from + len;
1891        struct inode *inode = page->mapping->host;
1892        unsigned block_start, block_end;
1893        sector_t block;
1894        int err = 0;
1895        unsigned blocksize, bbits;
1896        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1897
1898        BUG_ON(!PageLocked(page));
1899        BUG_ON(from > PAGE_CACHE_SIZE);
1900        BUG_ON(to > PAGE_CACHE_SIZE);
1901        BUG_ON(from > to);
1902
1903        head = create_page_buffers(page, inode, 0);
1904        blocksize = head->b_size;
1905        bbits = block_size_bits(blocksize);
1906
1907        block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1908
1909        for(bh = head, block_start = 0; bh != head || !block_start;
1910            block++, block_start=block_end, bh = bh->b_this_page) {
1911                block_end = block_start + blocksize;
1912                if (block_end <= from || block_start >= to) {
1913                        if (PageUptodate(page)) {
1914                                if (!buffer_uptodate(bh))
1915                                        set_buffer_uptodate(bh);
1916                        }
1917                        continue;
1918                }
1919                if (buffer_new(bh))
1920                        clear_buffer_new(bh);
1921                if (!buffer_mapped(bh)) {
1922                        WARN_ON(bh->b_size != blocksize);
1923                        err = get_block(inode, block, bh, 1);
1924                        if (err)
1925                                break;
1926                        if (buffer_new(bh)) {
1927                                unmap_underlying_metadata(bh->b_bdev,
1928                                                        bh->b_blocknr);
1929                                if (PageUptodate(page)) {
1930                                        clear_buffer_new(bh);
1931                                        set_buffer_uptodate(bh);
1932                                        mark_buffer_dirty(bh);
1933                                        continue;
1934                                }
1935                                if (block_end > to || block_start < from)
1936                                        zero_user_segments(page,
1937                                                to, block_end,
1938                                                block_start, from);
1939                                continue;
1940                        }
1941                }
1942                if (PageUptodate(page)) {
1943                        if (!buffer_uptodate(bh))
1944                                set_buffer_uptodate(bh);
1945                        continue; 
1946                }
1947                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1948                    !buffer_unwritten(bh) &&
1949                     (block_start < from || block_end > to)) {
1950                        ll_rw_block(READ, 1, &bh);
1951                        *wait_bh++=bh;
1952                }
1953        }
1954        /*
1955         * If we issued read requests - let them complete.
1956         */
1957        while(wait_bh > wait) {
1958                wait_on_buffer(*--wait_bh);
1959                if (!buffer_uptodate(*wait_bh))
1960                        err = -EIO;
1961        }
1962        if (unlikely(err))
1963                page_zero_new_buffers(page, from, to);
1964        return err;
1965}
1966EXPORT_SYMBOL(__block_write_begin);
1967
1968static int __block_commit_write(struct inode *inode, struct page *page,
1969                unsigned from, unsigned to)
1970{
1971        unsigned block_start, block_end;
1972        int partial = 0;
1973        unsigned blocksize;
1974        struct buffer_head *bh, *head;
1975
1976        bh = head = page_buffers(page);
1977        blocksize = bh->b_size;
1978
1979        block_start = 0;
1980        do {
1981                block_end = block_start + blocksize;
1982                if (block_end <= from || block_start >= to) {
1983                        if (!buffer_uptodate(bh))
1984                                partial = 1;
1985                } else {
1986                        set_buffer_uptodate(bh);
1987                        mark_buffer_dirty(bh);
1988                }
1989                clear_buffer_new(bh);
1990
1991                block_start = block_end;
1992                bh = bh->b_this_page;
1993        } while (bh != head);
1994
1995        /*
1996         * If this is a partial write which happened to make all buffers
1997         * uptodate then we can optimize away a bogus readpage() for
1998         * the next read(). Here we 'discover' whether the page went
1999         * uptodate as a result of this (potentially partial) write.
2000         */
2001        if (!partial)
2002                SetPageUptodate(page);
2003        return 0;
2004}
2005
2006/*
2007 * block_write_begin takes care of the basic task of block allocation and
2008 * bringing partial write blocks uptodate first.
2009 *
2010 * The filesystem needs to handle block truncation upon failure.
2011 */
2012int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2013                unsigned flags, struct page **pagep, get_block_t *get_block)
2014{
2015        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2016        struct page *page;
2017        int status;
2018
2019        page = grab_cache_page_write_begin(mapping, index, flags);
2020        if (!page)
2021                return -ENOMEM;
2022
2023        status = __block_write_begin(page, pos, len, get_block);
2024        if (unlikely(status)) {
2025                unlock_page(page);
2026                page_cache_release(page);
2027                page = NULL;
2028        }
2029
2030        *pagep = page;
2031        return status;
2032}
2033EXPORT_SYMBOL(block_write_begin);
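
/*
 * Illustrative sketch (not part of this file): a typical ->write_begin is a
 * thin wrapper that feeds the filesystem's get_block to the helper above,
 * much as ext2 does.  A real implementation would also truncate any blocks
 * instantiated beyond i_size when this fails, as noted above.
 */
static int examplefs_write_begin(struct file *file,
                                 struct address_space *mapping,
                                 loff_t pos, unsigned len, unsigned flags,
                                 struct page **pagep, void **fsdata)
{
        return block_write_begin(mapping, pos, len, flags, pagep,
                                 examplefs_get_block);
}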
2034
2035int block_write_end(struct file *file, struct address_space *mapping,
2036                        loff_t pos, unsigned len, unsigned copied,
2037                        struct page *page, void *fsdata)
2038{
2039        struct inode *inode = mapping->host;
2040        unsigned start;
2041
2042        start = pos & (PAGE_CACHE_SIZE - 1);
2043
2044        if (unlikely(copied < len)) {
2045                /*
2046                 * The buffers that were written will now be uptodate, so we
2047                 * don't have to worry about a readpage reading them and
2048                 * overwriting a partial write. However if we have encountered
2049                 * a short write and only partially written into a buffer, it
2050                 * will not be marked uptodate, so a readpage might come in and
2051                 * destroy our partial write.
2052                 *
2053                 * Do the simplest thing, and just treat any short write to a
2054                 * non uptodate page as a zero-length write, and force the
2055                 * caller to redo the whole thing.
2056                 */
2057                if (!PageUptodate(page))
2058                        copied = 0;
2059
2060                page_zero_new_buffers(page, start+copied, start+len);
2061        }
2062        flush_dcache_page(page);
2063
2064        /* This could be a short (even 0-length) commit */
2065        __block_commit_write(inode, page, start, start+copied);
2066
2067        return copied;
2068}
2069EXPORT_SYMBOL(block_write_end);
2070
2071int generic_write_end(struct file *file, struct address_space *mapping,
2072                        loff_t pos, unsigned len, unsigned copied,
2073                        struct page *page, void *fsdata)
2074{
2075        struct inode *inode = mapping->host;
2076        int i_size_changed = 0;
2077
2078        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2079
2080        /*
2081         * No need to use i_size_read() here, the i_size
2082         * cannot change under us because we hold i_mutex.
2083         *
2084         * But it's important to update i_size while still holding page lock:
2085         * page writeout could otherwise come in and zero beyond i_size.
2086         */
2087        if (pos+copied > inode->i_size) {
2088                i_size_write(inode, pos+copied);
2089                i_size_changed = 1;
2090        }
2091
2092        unlock_page(page);
2093        page_cache_release(page);
2094
2095        /*
2096         * Don't mark the inode dirty under page lock. First, it unnecessarily
2097         * makes the holding time of page lock longer. Second, it forces lock
2098         * ordering of page lock and transaction start for journaling
2099         * filesystems.
2100         */
2101        if (i_size_changed)
2102                mark_inode_dirty(inode);
2103
2104        return copied;
2105}
2106EXPORT_SYMBOL(generic_write_end);
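
/*
 * Illustrative sketch (not part of this file): generic_write_end() can be
 * used as a filesystem's ->write_end as-is, paired with a ->write_begin like
 * the examplefs_write_begin() sketch above.  Other methods omitted.
 */
static const struct address_space_operations examplefs_aops = {
        .write_begin    = examplefs_write_begin,
        .write_end      = generic_write_end,
};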
2107
2108/*
2109 * block_is_partially_uptodate checks whether buffers within a page are
2110 * uptodate or not.
2111 *
2112 * Returns true if all buffers which correspond to a file portion
2113 * we want to read are uptodate.
2114 */
2115int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2116                                        unsigned long from)
2117{
2118        unsigned block_start, block_end, blocksize;
2119        unsigned to;
2120        struct buffer_head *bh, *head;
2121        int ret = 1;
2122
2123        if (!page_has_buffers(page))
2124                return 0;
2125
2126        head = page_buffers(page);
2127        blocksize = head->b_size;
2128        to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2129        to = from + to;
2130        if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2131                return 0;
2132
2133        bh = head;
2134        block_start = 0;
2135        do {
2136                block_end = block_start + blocksize;
2137                if (block_end > from && block_start < to) {
2138                        if (!buffer_uptodate(bh)) {
2139                                ret = 0;
2140                                break;
2141                        }
2142                        if (block_end >= to)
2143                                break;
2144                }
2145                block_start = block_end;
2146                bh = bh->b_this_page;
2147        } while (bh != head);
2148
2149        return ret;
2150}
2151EXPORT_SYMBOL(block_is_partially_uptodate);
2152
2153/*
2154 * Generic "read page" function for block devices that have the normal
2155 * get_block functionality. This is most of the block device filesystems.
2156 * Reads the page asynchronously --- the unlock_buffer() and
2157 * set/clear_buffer_uptodate() functions propagate buffer state into the
2158 * page struct once IO has completed.
2159 */
2160int block_read_full_page(struct page *page, get_block_t *get_block)
2161{
2162        struct inode *inode = page->mapping->host;
2163        sector_t iblock, lblock;
2164        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2165        unsigned int blocksize, bbits;
2166        int nr, i;
2167        int fully_mapped = 1;
2168
2169        head = create_page_buffers(page, inode, 0);
2170        blocksize = head->b_size;
2171        bbits = block_size_bits(blocksize);
2172
2173        iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2174        lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2175        bh = head;
2176        nr = 0;
2177        i = 0;
2178
2179        do {
2180                if (buffer_uptodate(bh))
2181                        continue;
2182
2183                if (!buffer_mapped(bh)) {
2184                        int err = 0;
2185
2186                        fully_mapped = 0;
2187                        if (iblock < lblock) {
2188                                WARN_ON(bh->b_size != blocksize);
2189                                err = get_block(inode, iblock, bh, 0);
2190                                if (err)
2191                                        SetPageError(page);
2192                        }
2193                        if (!buffer_mapped(bh)) {
2194                                zero_user(page, i * blocksize, blocksize);
2195                                if (!err)
2196                                        set_buffer_uptodate(bh);
2197                                continue;
2198                        }
2199                        /*
2200                         * get_block() might have updated the buffer
2201                         * synchronously
2202                         */
2203                        if (buffer_uptodate(bh))
2204                                continue;
2205                }
2206                arr[nr++] = bh;
2207        } while (i++, iblock++, (bh = bh->b_this_page) != head);
2208
2209        if (fully_mapped)
2210                SetPageMappedToDisk(page);
2211
2212        if (!nr) {
2213                /*
2214                 * All buffers are uptodate - we can set the page uptodate
2215                 * as well. But not if get_block() returned an error.
2216                 */
2217                if (!PageError(page))
2218                        SetPageUptodate(page);
2219                unlock_page(page);
2220                return 0;
2221        }
2222
2223        /* Stage two: lock the buffers */
2224        for (i = 0; i < nr; i++) {
2225                bh = arr[i];
2226                lock_buffer(bh);
2227                mark_buffer_async_read(bh);
2228        }
2229
2230        /*
2231         * Stage 3: start the IO.  Check for uptodateness
2232         * inside the buffer lock in case another process reading
2233         * the underlying blockdev brought it uptodate (the sct fix).
2234         */
2235        for (i = 0; i < nr; i++) {
2236                bh = arr[i];
2237                if (buffer_uptodate(bh))
2238                        end_buffer_async_read(bh, 1);
2239                else
2240                        submit_bh(READ, bh);
2241        }
2242        return 0;
2243}
2244EXPORT_SYMBOL(block_read_full_page);
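
/*
 * Illustrative sketch (not part of this file): the usual ->readpage for a
 * block-mapped filesystem simply passes its get_block to the helper above.
 */
static int examplefs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, examplefs_get_block);
}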
2245
2246/* utility function for filesystems that need to do work on expanding
2247 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2248 * deal with the hole.  
2249 */
2250int generic_cont_expand_simple(struct inode *inode, loff_t size)
2251{
2252        struct address_space *mapping = inode->i_mapping;
2253        struct page *page;
2254        void *fsdata;
2255        int err;
2256
2257        err = inode_newsize_ok(inode, size);
2258        if (err)
2259                goto out;
2260
2261        err = pagecache_write_begin(NULL, mapping, size, 0,
2262                                AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2263                                &page, &fsdata);
2264        if (err)
2265                goto out;
2266
2267        err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2268        BUG_ON(err > 0);
2269
2270out:
2271        return err;
2272}
2273EXPORT_SYMBOL(generic_cont_expand_simple);
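
/*
 * Illustrative sketch (not part of this file): an expanding truncate in a
 * filesystem's ->setattr path can use the helper above to instantiate the
 * new tail through the pagecache.  A real implementation also handles
 * shrinking truncates, permission checks and so on.
 */
static int examplefs_expand(struct inode *inode, loff_t newsize)
{
        int err = 0;

        if (newsize > i_size_read(inode))
                err = generic_cont_expand_simple(inode, newsize);
        return err;
}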
2274
2275static int cont_expand_zero(struct file *file, struct address_space *mapping,
2276                            loff_t pos, loff_t *bytes)
2277{
2278        struct inode *inode = mapping->host;
2279        unsigned blocksize = 1 << inode->i_blkbits;
2280        struct page *page;
2281        void *fsdata;
2282        pgoff_t index, curidx;
2283        loff_t curpos;
2284        unsigned zerofrom, offset, len;
2285        int err = 0;
2286
2287        index = pos >> PAGE_CACHE_SHIFT;
2288        offset = pos & ~PAGE_CACHE_MASK;
2289
2290        while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2291                zerofrom = curpos & ~PAGE_CACHE_MASK;
2292                if (zerofrom & (blocksize-1)) {
2293                        *bytes |= (blocksize-1);
2294                        (*bytes)++;
2295                }
2296                len = PAGE_CACHE_SIZE - zerofrom;
2297
2298                err = pagecache_write_begin(file, mapping, curpos, len,
2299                                                AOP_FLAG_UNINTERRUPTIBLE,
2300                                                &page, &fsdata);
2301                if (err)
2302                        goto out;
2303                zero_user(page, zerofrom, len);
2304                err = pagecache_write_end(file, mapping, curpos, len, len,
2305                                                page, fsdata);
2306                if (err < 0)
2307                        goto out;
2308                BUG_ON(err != len);
2309                err = 0;
2310
2311                balance_dirty_pages_ratelimited(mapping);
2312        }
2313
2314        /* page covers the boundary, find the boundary offset */
2315        if (index == curidx) {
2316                zerofrom = curpos & ~PAGE_CACHE_MASK;
2317                /* if we are expanding the file, the last block will be filled */
2318                if (offset <= zerofrom) {
2319                        goto out;
2320                }
2321                if (zerofrom & (blocksize-1)) {
2322                        *bytes |= (blocksize-1);
2323                        (*bytes)++;
2324                }
2325                len = offset - zerofrom;
2326
2327                err = pagecache_write_begin(file, mapping, curpos, len,
2328                                                AOP_FLAG_UNINTERRUPTIBLE,
2329                                                &page, &fsdata);
2330                if (err)
2331                        goto out;
2332                zero_user(page, zerofrom, len);
2333                err = pagecache_write_end(file, mapping, curpos, len, len,
2334                                                page, fsdata);
2335                if (err < 0)
2336                        goto out;
2337                BUG_ON(err != len);
2338                err = 0;
2339        }
2340out:
2341        return err;
2342}
2343
2344/*
2345 * For moronic filesystems that do not allow holes in files.
2346 * We may have to extend the file.
2347 */
2348int cont_write_begin(struct file *file, struct address_space *mapping,
2349                        loff_t pos, unsigned len, unsigned flags,
2350                        struct page **pagep, void **fsdata,
2351                        get_block_t *get_block, loff_t *bytes)
2352{
2353        struct inode *inode = mapping->host;
2354        unsigned blocksize = 1 << inode->i_blkbits;
2355        unsigned zerofrom;
2356        int err;
2357
2358        err = cont_expand_zero(file, mapping, pos, bytes);
2359        if (err)
2360                return err;
2361
2362        zerofrom = *bytes & ~PAGE_CACHE_MASK;
2363        if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2364                *bytes |= (blocksize-1);
2365                (*bytes)++;
2366        }
2367
2368        return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2369}
2370EXPORT_SYMBOL(cont_write_begin);
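
/*
 * Illustrative sketch (not part of this file): a no-holes filesystem keeps a
 * per-inode "bytes allocated so far" value (here a hypothetical
 * EXAMPLEFS_I(inode)->mmu_private field, in the spirit of what FAT does) and
 * hands its address to the helper above so the gap up to @pos is zero-filled.
 */
static int examplefs_cont_write_begin(struct file *file,
                                      struct address_space *mapping,
                                      loff_t pos, unsigned len, unsigned flags,
                                      struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;

        return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                examplefs_get_block,
                                &EXAMPLEFS_I(inode)->mmu_private);
}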
2371
2372int block_commit_write(struct page *page, unsigned from, unsigned to)
2373{
2374        struct inode *inode = page->mapping->host;
2375        __block_commit_write(inode,page,from,to);
2376        return 0;
2377}
2378EXPORT_SYMBOL(block_commit_write);
2379
2380/*
2381 * block_page_mkwrite() is not allowed to change the file size as it gets
2382 * called from a page fault handler when a page is first dirtied. Hence we must
2383 * be careful to check for EOF conditions here. We set the page up correctly
2384 * for a written page which means we get ENOSPC checking when writing into
2385 * holes and correct delalloc and unwritten extent mapping on filesystems that
2386 * support these features.
2387 *
2388 * We are not allowed to take the i_mutex here so we have to play games to
2389 * protect against truncate races as the page could now be beyond EOF.  Because
2390 * truncate writes the inode size before removing pages, once we have the
2391 * page lock we can determine safely if the page is beyond EOF. If it is not
2392 * beyond EOF, then the page is guaranteed safe against truncation until we
2393 * unlock the page.
2394 *
2395 * Direct callers of this function should protect against filesystem freezing
2396 * using sb_start_write() - sb_end_write() functions.
2397 */
2398int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2399                         get_block_t get_block)
2400{
2401        struct page *page = vmf->page;
2402        struct inode *inode = file_inode(vma->vm_file);
2403        unsigned long end;
2404        loff_t size;
2405        int ret;
2406
2407        lock_page(page);
2408        size = i_size_read(inode);
2409        if ((page->mapping != inode->i_mapping) ||
2410            (page_offset(page) > size)) {
2411                /* We overload EFAULT to mean page got truncated */
2412                ret = -EFAULT;
2413                goto out_unlock;
2414        }
2415
2416        /* page is wholly or partially inside EOF */
2417        if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2418                end = size & ~PAGE_CACHE_MASK;
2419        else
2420                end = PAGE_CACHE_SIZE;
2421
2422        ret = __block_write_begin(page, 0, end, get_block);
2423        if (!ret)
2424                ret = block_commit_write(page, 0, end);
2425
2426        if (unlikely(ret < 0))
2427                goto out_unlock;
2428        set_page_dirty(page);
2429        wait_for_stable_page(page);
2430        return 0;
2431out_unlock:
2432        unlock_page(page);
2433        return ret;
2434}
2435EXPORT_SYMBOL(__block_page_mkwrite);
2436
2437int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2438                   get_block_t get_block)
2439{
2440        int ret;
2441        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
2442
2443        sb_start_pagefault(sb);
2444
2445        /*
2446         * Update file times before taking page lock. We may end up failing the
2447         * fault so this update may be superfluous but who really cares...
2448         */
2449        file_update_time(vma->vm_file);
2450
2451        ret = __block_page_mkwrite(vma, vmf, get_block);
2452        sb_end_pagefault(sb);
2453        return block_page_mkwrite_return(ret);
2454}
2455EXPORT_SYMBOL(block_page_mkwrite);
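
/*
 * Illustrative sketch (not part of this file): block_page_mkwrite() already
 * returns a VM_FAULT_* code, so a filesystem's ->page_mkwrite can be a thin
 * wrapper around it, wired into its file vm_operations_struct.
 */
static int examplefs_page_mkwrite(struct vm_area_struct *vma,
                                  struct vm_fault *vmf)
{
        return block_page_mkwrite(vma, vmf, examplefs_get_block);
}

static const struct vm_operations_struct examplefs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = examplefs_page_mkwrite,
};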
2456
2457/*
2458 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2459 * immediately, while under the page lock.  So it needs a special end_io
2460 * handler which does not touch the bh after unlocking it.
2461 */
2462static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2463{
2464        __end_buffer_read_notouch(bh, uptodate);
2465}
2466
2467/*
2468 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2469 * the page (converting it to circular linked list and taking care of page
2470 * dirty races).
2471 */
2472static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2473{
2474        struct buffer_head *bh;
2475
2476        BUG_ON(!PageLocked(page));
2477
2478        spin_lock(&page->mapping->private_lock);
2479        bh = head;
2480        do {
2481                if (PageDirty(page))
2482                        set_buffer_dirty(bh);
2483                if (!bh->b_this_page)
2484                        bh->b_this_page = head;
2485                bh = bh->b_this_page;
2486        } while (bh != head);
2487        attach_page_buffers(page, head);
2488        spin_unlock(&page->mapping->private_lock);
2489}
2490
2491/*
2492 * On entry, the page is not uptodate at all.
2493 * On exit, the page is fully uptodate in the areas outside (from,to).
2494 * The filesystem needs to handle block truncation upon failure.
2495 */
2496int nobh_write_begin(struct address_space *mapping,
2497                        loff_t pos, unsigned len, unsigned flags,
2498                        struct page **pagep, void **fsdata,
2499                        get_block_t *get_block)
2500{
2501        struct inode *inode = mapping->host;
2502        const unsigned blkbits = inode->i_blkbits;
2503        const unsigned blocksize = 1 << blkbits;
2504        struct buffer_head *head, *bh;
2505        struct page *page;
2506        pgoff_t index;
2507        unsigned from, to;
2508        unsigned block_in_page;
2509        unsigned block_start, block_end;
2510        sector_t block_in_file;
2511        int nr_reads = 0;
2512        int ret = 0;
2513        int is_mapped_to_disk = 1;
2514
2515        index = pos >> PAGE_CACHE_SHIFT;
2516        from = pos & (PAGE_CACHE_SIZE - 1);
2517        to = from + len;
2518
2519        page = grab_cache_page_write_begin(mapping, index, flags);
2520        if (!page)
2521                return -ENOMEM;
2522        *pagep = page;
2523        *fsdata = NULL;
2524
2525        if (page_has_buffers(page)) {
2526                ret = __block_write_begin(page, pos, len, get_block);
2527                if (unlikely(ret))
2528                        goto out_release;
2529                return ret;
2530        }
2531
2532        if (PageMappedToDisk(page))
2533                return 0;
2534
2535        /*
2536         * Allocate buffers so that we can keep track of state, and potentially
2537         * attach them to the page if an error occurs. In the common case of
2538         * no error, they will just be freed again without ever being attached
2539         * to the page (which is all OK, because we're under the page lock).
2540         *
2541         * Be careful: the buffer linked list is a NULL terminated one, rather
2542         * than the circular one we're used to.
2543         */
2544        head = alloc_page_buffers(page, blocksize, 0);
2545        if (!head) {
2546                ret = -ENOMEM;
2547                goto out_release;
2548        }
2549
2550        block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2551
2552        /*
2553         * We loop across all blocks in the page, whether or not they are
2554         * part of the affected region.  This is so we can discover if the
2555         * page is fully mapped-to-disk.
2556         */
2557        for (block_start = 0, block_in_page = 0, bh = head;
2558                  block_start < PAGE_CACHE_SIZE;
2559                  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2560                int create;
2561
2562                block_end = block_start + blocksize;
2563                bh->b_state = 0;
2564                create = 1;
2565                if (block_start >= to)
2566                        create = 0;
2567                ret = get_block(inode, block_in_file + block_in_page,
2568                                        bh, create);
2569                if (ret)
2570                        goto failed;
2571                if (!buffer_mapped(bh))
2572                        is_mapped_to_disk = 0;
2573                if (buffer_new(bh))
2574                        unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2575                if (PageUptodate(page)) {
2576                        set_buffer_uptodate(bh);
2577                        continue;
2578                }
2579                if (buffer_new(bh) || !buffer_mapped(bh)) {
2580                        zero_user_segments(page, block_start, from,
2581                                                        to, block_end);
2582                        continue;
2583                }
2584                if (buffer_uptodate(bh))
2585                        continue;       /* reiserfs does this */
2586                if (block_start < from || block_end > to) {
2587                        lock_buffer(bh);
2588                        bh->b_end_io = end_buffer_read_nobh;
2589                        submit_bh(READ, bh);
2590                        nr_reads++;
2591                }
2592        }
2593
2594        if (nr_reads) {
2595                /*
2596                 * The page is locked, so these buffers are protected from
2597                 * any VM or truncate activity.  Hence we don't need to care
2598                 * for the buffer_head refcounts.
2599                 */
2600                for (bh = head; bh; bh = bh->b_this_page) {
2601                        wait_on_buffer(bh);
2602                        if (!buffer_uptodate(bh))
2603                                ret = -EIO;
2604                }
2605                if (ret)
2606                        goto failed;
2607        }
2608
2609        if (is_mapped_to_disk)
2610                SetPageMappedToDisk(page);
2611
2612        *fsdata = head; /* to be released by nobh_write_end */
2613
2614        return 0;
2615
2616failed:
2617        BUG_ON(!ret);
2618        /*
2619         * Error recovery is a bit difficult. We need to zero out blocks that
2620         * were newly allocated, and dirty them to ensure they get written out.
2621         * Buffers need to be attached to the page at this point, otherwise
2622         * the handling of potential IO errors during writeout would be hard
2623         * (could try doing synchronous writeout, but what if that fails too?)
2624         */
2625        attach_nobh_buffers(page, head);
2626        page_zero_new_buffers(page, from, to);
2627
2628out_release:
2629        unlock_page(page);
2630        page_cache_release(page);
2631        *pagep = NULL;
2632
2633        return ret;
2634}
2635EXPORT_SYMBOL(nobh_write_begin);
2636
2637int nobh_write_end(struct file *file, struct address_space *mapping,
2638                        loff_t pos, unsigned len, unsigned copied,
2639                        struct page *page, void *fsdata)
2640{
2641        struct inode *inode = page->mapping->host;
2642        struct buffer_head *head = fsdata;
2643        struct buffer_head *bh;
2644        BUG_ON(fsdata != NULL && page_has_buffers(page));
2645
2646        if (unlikely(copied < len) && head)
2647                attach_nobh_buffers(page, head);
2648        if (page_has_buffers(page))
2649                return generic_write_end(file, mapping, pos, len,
2650                                        copied, page, fsdata);
2651
2652        SetPageUptodate(page);
2653        set_page_dirty(page);
2654        if (pos+copied > inode->i_size) {
2655                i_size_write(inode, pos+copied);
2656                mark_inode_dirty(inode);
2657        }
2658
2659        unlock_page(page);
2660        page_cache_release(page);
2661
2662        while (head) {
2663                bh = head;
2664                head = head->b_this_page;
2665                free_buffer_head(bh);
2666        }
2667
2668        return copied;
2669}
2670EXPORT_SYMBOL(nobh_write_end);
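
/*
 * Illustrative sketch (not part of this file): a filesystem opting out of
 * long-lived buffer_heads wires the nobh variants in pairs; fsdata carries
 * the temporary buffer list from nobh_write_begin() to nobh_write_end().
 */
static int examplefs_nobh_write_begin(struct file *file,
                                      struct address_space *mapping,
                                      loff_t pos, unsigned len, unsigned flags,
                                      struct page **pagep, void **fsdata)
{
        return nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
                                examplefs_get_block);
}

static const struct address_space_operations examplefs_nobh_aops = {
        .write_begin    = examplefs_nobh_write_begin,
        .write_end      = nobh_write_end,
};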
2671
2672/*
2673 * nobh_writepage() - based on block_write_full_page() except
2674 * that it tries to operate without attaching bufferheads to
2675 * the page.
2676 */
2677int nobh_writepage(struct page *page, get_block_t *get_block,
2678                        struct writeback_control *wbc)
2679{
2680        struct inode * const inode = page->mapping->host;
2681        loff_t i_size = i_size_read(inode);
2682        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2683        unsigned offset;
2684        int ret;
2685
2686        /* Is the page fully inside i_size? */
2687        if (page->index < end_index)
2688                goto out;
2689
2690        /* Is the page fully outside i_size? (truncate in progress) */
2691        offset = i_size & (PAGE_CACHE_SIZE-1);
2692        if (page->index >= end_index+1 || !offset) {
2693                /*
2694                 * The page may have dirty, unmapped buffers.  For example,
2695                 * they may have been added in ext3_writepage().  Make them
2696                 * freeable here, so the page does not leak.
2697                 */
2698#if 0
2699                /* Not really sure about this  - do we need this ? */
2700                if (page->mapping->a_ops->invalidatepage)
2701                        page->mapping->a_ops->invalidatepage(page, offset);
2702#endif
2703                unlock_page(page);
2704                return 0; /* don't care */
2705        }
2706
2707        /*
2708         * The page straddles i_size.  It must be zeroed out on each and every
2709         * writepage invocation because it may be mmapped.  "A file is mapped
2710         * in multiples of the page size.  For a file that is not a multiple of
2711         * the  page size, the remaining memory is zeroed when mapped, and
2712         * writes to that region are not written out to the file."
2713         */
2714        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2715out:
2716        ret = mpage_writepage(page, get_block, wbc);
2717        if (ret == -EAGAIN)
2718                ret = __block_write_full_page(inode, page, get_block, wbc,
2719                                              end_buffer_async_write);
2720        return ret;
2721}
2722EXPORT_SYMBOL(nobh_writepage);
2723
2724int nobh_truncate_page(struct address_space *mapping,
2725                        loff_t from, get_block_t *get_block)
2726{
2727        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2728        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2729        unsigned blocksize;
2730        sector_t iblock;
2731        unsigned length, pos;
2732        struct inode *inode = mapping->host;
2733        struct page *page;
2734        struct buffer_head map_bh;
2735        int err;
2736
2737        blocksize = 1 << inode->i_blkbits;
2738        length = offset & (blocksize - 1);
2739
2740        /* Block boundary? Nothing to do */
2741        if (!length)
2742                return 0;
2743
2744        length = blocksize - length;
2745        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2746
2747        page = grab_cache_page(mapping, index);
2748        err = -ENOMEM;
2749        if (!page)
2750                goto out;
2751
2752        if (page_has_buffers(page)) {
2753has_buffers:
2754                unlock_page(page);
2755                page_cache_release(page);
2756                return block_truncate_page(mapping, from, get_block);
2757        }
2758
2759        /* Find the buffer that contains "offset" */
2760        pos = blocksize;
2761        while (offset >= pos) {
2762                iblock++;
2763                pos += blocksize;
2764        }
2765
2766        map_bh.b_size = blocksize;
2767        map_bh.b_state = 0;
2768        err = get_block(inode, iblock, &map_bh, 0);
2769        if (err)
2770                goto unlock;
2771        /* unmapped? It's a hole - nothing to do */
2772        if (!buffer_mapped(&map_bh))
2773                goto unlock;
2774
2775        /* Ok, it's mapped. Make sure it's up-to-date */
2776        if (!PageUptodate(page)) {
2777                err = mapping->a_ops->readpage(NULL, page);
2778                if (err) {
2779                        page_cache_release(page);
2780                        goto out;
2781                }
2782                lock_page(page);
2783                if (!PageUptodate(page)) {
2784                        err = -EIO;
2785                        goto unlock;
2786                }
2787                if (page_has_buffers(page))
2788                        goto has_buffers;
2789        }
2790        zero_user(page, offset, length);
2791        set_page_dirty(page);
2792        err = 0;
2793
2794unlock:
2795        unlock_page(page);
2796        page_cache_release(page);
2797out:
2798        return err;
2799}
2800EXPORT_SYMBOL(nobh_truncate_page);
2801
2802int block_truncate_page(struct address_space *mapping,
2803                        loff_t from, get_block_t *get_block)
2804{
2805        pgoff_t index = from >> PAGE_CACHE_SHIFT;
2806        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2807        unsigned blocksize;
2808        sector_t iblock;
2809        unsigned length, pos;
2810        struct inode *inode = mapping->host;
2811        struct page *page;
2812        struct buffer_head *bh;
2813        int err;
2814
2815        blocksize = 1 << inode->i_blkbits;
2816        length = offset & (blocksize - 1);
2817
2818        /* Block boundary? Nothing to do */
2819        if (!length)
2820                return 0;
2821
2822        length = blocksize - length;
2823        iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2824        
2825        page = grab_cache_page(mapping, index);
2826        err = -ENOMEM;
2827        if (!page)
2828                goto out;
2829
2830        if (!page_has_buffers(page))
2831                create_empty_buffers(page, blocksize, 0);
2832
2833        /* Find the buffer that contains "offset" */
2834        bh = page_buffers(page);
2835        pos = blocksize;
2836        while (offset >= pos) {
2837                bh = bh->b_this_page;
2838                iblock++;
2839                pos += blocksize;
2840        }
2841
2842        err = 0;
2843        if (!buffer_mapped(bh)) {
2844                WARN_ON(bh->b_size != blocksize);
2845                err = get_block(inode, iblock, bh, 0);
2846                if (err)
2847                        goto unlock;
2848                /* unmapped? It's a hole - nothing to do */
2849                if (!buffer_mapped(bh))
2850                        goto unlock;
2851        }
2852
2853        /* Ok, it's mapped. Make sure it's up-to-date */
2854        if (PageUptodate(page))
2855                set_buffer_uptodate(bh);
2856
2857        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2858                err = -EIO;
2859                ll_rw_block(READ, 1, &bh);
2860                wait_on_buffer(bh);
2861                /* Uhhuh. Read error. Complain and punt. */
2862                if (!buffer_uptodate(bh))
2863                        goto unlock;
2864        }
2865
2866        zero_user(page, offset, length);
2867        mark_buffer_dirty(bh);
2868        err = 0;
2869
2870unlock:
2871        unlock_page(page);
2872        page_cache_release(page);
2873out:
2874        return err;
2875}
2876EXPORT_SYMBOL(block_truncate_page);
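
/*
 * Illustrative sketch (not part of this file): on a shrinking truncate a
 * block-mapped filesystem typically zeroes the tail of the last remaining
 * block with the helper above before updating i_size and freeing blocks.
 */
static int examplefs_truncate(struct inode *inode, loff_t newsize)
{
        int err;

        err = block_truncate_page(inode->i_mapping, newsize,
                                  examplefs_get_block);
        if (err)
                return err;

        truncate_setsize(inode, newsize);
        /* filesystem-specific block freeing would follow here */
        return 0;
}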
2877
2878/*
2879 * The generic ->writepage function for buffer-backed address_spaces
2880 * this form passes in the end_io handler used to finish the IO.
2881 */
2882int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2883                        struct writeback_control *wbc, bh_end_io_t *handler)
2884{
2885        struct inode * const inode = page->mapping->host;
2886        loff_t i_size = i_size_read(inode);
2887        const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2888        unsigned offset;
2889
2890        /* Is the page fully inside i_size? */
2891        if (page->index < end_index)
2892                return __block_write_full_page(inode, page, get_block, wbc,
2893                                               handler);
2894
2895        /* Is the page fully outside i_size? (truncate in progress) */
2896        offset = i_size & (PAGE_CACHE_SIZE-1);
2897        if (page->index >= end_index+1 || !offset) {
2898                /*
2899                 * The page may have dirty, unmapped buffers.  For example,
2900                 * they may have been added in ext3_writepage().  Make them
2901                 * freeable here, so the page does not leak.
2902                 */
2903                do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
2904                unlock_page(page);
2905                return 0; /* don't care */
2906        }
2907
2908        /*
2909         * The page straddles i_size.  It must be zeroed out on each and every
2910         * writepage invocation because it may be mmapped.  "A file is mapped
2911         * in multiples of the page size.  For a file that is not a multiple of
2912         * the  page size, the remaining memory is zeroed when mapped, and
2913         * writes to that region are not written out to the file."
2914         */
2915        zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2916        return __block_write_full_page(inode, page, get_block, wbc, handler);
2917}
2918EXPORT_SYMBOL(block_write_full_page_endio);
2919
2920/*
2921 * The generic ->writepage function for buffer-backed address_spaces
2922 */
2923int block_write_full_page(struct page *page, get_block_t *get_block,
2924                        struct writeback_control *wbc)
2925{
2926        return block_write_full_page_endio(page, get_block, wbc,
2927                                           end_buffer_async_write);
2928}
2929EXPORT_SYMBOL(block_write_full_page);
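
/*
 * Illustrative sketch (not part of this file): the matching ->writepage is
 * again a thin wrapper supplying the filesystem's get_block.
 */
static int examplefs_writepage(struct page *page,
                               struct writeback_control *wbc)
{
        return block_write_full_page(page, examplefs_get_block, wbc);
}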
2930
2931sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2932                            get_block_t *get_block)
2933{
2934        struct buffer_head tmp;
2935        struct inode *inode = mapping->host;
2936        tmp.b_state = 0;
2937        tmp.b_blocknr = 0;
2938        tmp.b_size = 1 << inode->i_blkbits;
2939        get_block(inode, block, &tmp, 0);
2940        return tmp.b_blocknr;
2941}
2942EXPORT_SYMBOL(generic_block_bmap);
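
/*
 * Illustrative sketch (not part of this file): ->bmap (the FIBMAP ioctl)
 * for a block-mapped filesystem is usually just this wrapper.
 */
static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, examplefs_get_block);
}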
2943
2944static void end_bio_bh_io_sync(struct bio *bio, int err)
2945{
2946        struct buffer_head *bh = bio->bi_private;
2947
2948        if (err == -EOPNOTSUPP) {
2949                set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2950        }
2951
2952        if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2953                set_bit(BH_Quiet, &bh->b_state);
2954
2955        bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2956        bio_put(bio);
2957}
2958
2959/*
2960 * This allows us to do IO even on the odd last sectors
2961 * of a device, even if the bh block size is some multiple
2962 * of the physical sector size.
2963 *
2964 * We'll just truncate the bio to the size of the device,
2965 * and clear the end of the buffer head manually.
2966 *
2967 * Truly out-of-range accesses will turn into actual IO
2968 * errors, this only handles the "we need to be able to
2969 * do IO at the final sector" case.
2970 */
2971static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2972{
2973        sector_t maxsector;
2974        unsigned bytes;
2975
2976        maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2977        if (!maxsector)
2978                return;
2979
2980        /*
2981         * If the *whole* IO is past the end of the device,
2982         * let it through, and the IO layer will turn it into
2983         * an EIO.
2984         */
2985        if (unlikely(bio->bi_sector >= maxsector))
2986                return;
2987
2988        maxsector -= bio->bi_sector;
2989        bytes = bio->bi_size;
2990        if (likely((bytes >> 9) <= maxsector))
2991                return;
2992
2993        /* Uhhuh. We've got a bh that straddles the device size! */
2994        bytes = maxsector << 9;
2995
2996        /* Truncate the bio.. */
2997        bio->bi_size = bytes;
2998        bio->bi_io_vec[0].bv_len = bytes;
2999
3000        /* ..and clear the end of the buffer for reads */
3001        if ((rw & RW_MASK) == READ) {
3002                void *kaddr = kmap_atomic(bh->b_page);
3003                memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
3004                kunmap_atomic(kaddr);
3005                flush_dcache_page(bh->b_page);
3006        }
3007}
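
/*
 * Editor's note with illustrative numbers -- not part of buffer.c.
 * Suppose a 1000-sector device (512-byte sectors) and a 4KB buffer_head
 * whose first sector is 998: maxsector = 1000 and bi_sector = 998, so
 * only 2 sectors (1024 bytes) remain on the device while bi_size is
 * 4096.  The bio is truncated to 1024 bytes and, for a READ, the
 * trailing 3072 bytes of the buffer are zeroed by the memset() above.
 */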
3008
3009int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3010{
3011        struct bio *bio;
3012        int ret = 0;
3013
3014        BUG_ON(!buffer_locked(bh));
3015        BUG_ON(!buffer_mapped(bh));
3016        BUG_ON(!bh->b_end_io);
3017        BUG_ON(buffer_delay(bh));
3018        BUG_ON(buffer_unwritten(bh));
3019
3020        /*
3021         * Only clear out a write error when rewriting
3022         */
3023        if (test_set_buffer_req(bh) && (rw & WRITE))
3024                clear_buffer_write_io_error(bh);
3025
3026        /*
3027         * from here on down, it's all bio -- do the initial mapping,
3028         * submit_bio -> generic_make_request may further map this bio around
3029         */
3030        bio = bio_alloc(GFP_NOIO, 1);
3031
3032        bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3033        bio->bi_bdev = bh->b_bdev;
3034        bio->bi_io_vec[0].bv_page = bh->b_page;
3035        bio->bi_io_vec[0].bv_len = bh->b_size;
3036        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3037
3038        bio->bi_vcnt = 1;
3039        bio->bi_size = bh->b_size;
3040
3041        bio->bi_end_io = end_bio_bh_io_sync;
3042        bio->bi_private = bh;
3043        bio->bi_flags |= bio_flags;
3044
3045        /* Take care of bh's that straddle the end of the device */
3046        guard_bh_eod(rw, bio, bh);
3047
3048        if (buffer_meta(bh))
3049                rw |= REQ_META;
3050        if (buffer_prio(bh))
3051                rw |= REQ_PRIO;
3052
3053        bio_get(bio);
3054        submit_bio(rw, bio);
3055
3056        if (bio_flagged(bio, BIO_EOPNOTSUPP))
3057                ret = -EOPNOTSUPP;
3058
3059        bio_put(bio);
3060        return ret;
3061}
3062EXPORT_SYMBOL_GPL(_submit_bh);
3063
3064int submit_bh(int rw, struct buffer_head *bh)
3065{
3066        return _submit_bh(rw, bh, 0);
3067}
3068EXPORT_SYMBOL(submit_bh);
3069
3070/**
3071 * ll_rw_block: low-level access to block devices (DEPRECATED)
3072 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3073 * @nr: number of &struct buffer_heads in the array
3074 * @bhs: array of pointers to &struct buffer_head
3075 *
3076 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3077 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
3078 * %READA option is described in the documentation for generic_make_request()
3079 * which ll_rw_block() calls.
3080 *
3081 * This function drops any buffer that it cannot get a lock on (with the
3082 * BH_Lock state bit), any buffer that appears to be clean when doing a write
3083 * request, and any buffer that appears to be up-to-date when doing a read
3084 * request.  Further, it marks as clean the buffers that are processed for
3085 * writing (the buffer cache won't assume that they are actually clean
3086 * until the buffer gets unlocked).
3087 *
3088 * ll_rw_block sets b_end_io to a simple completion handler that marks
3089 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3090 * any waiters.
3091 *
3092 * All of the buffers must be for the same device, and must also be a
3093 * multiple of the current approved size for the device.
3094 */
3095void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3096{
3097        int i;
3098
3099        for (i = 0; i < nr; i++) {
3100                struct buffer_head *bh = bhs[i];
3101
3102                if (!trylock_buffer(bh))
3103                        continue;
3104                if (rw == WRITE) {
3105                        if (test_clear_buffer_dirty(bh)) {
3106                                bh->b_end_io = end_buffer_write_sync;
3107                                get_bh(bh);
3108                                submit_bh(WRITE, bh);
3109                                continue;
3110                        }
3111                } else {
3112                        if (!buffer_uptodate(bh)) {
3113                                bh->b_end_io = end_buffer_read_sync;
3114                                get_bh(bh);
3115                                submit_bh(rw, bh);
3116                                continue;
3117                        }
3118                }
3119                unlock_buffer(bh);
3120        }
3121}
3122EXPORT_SYMBOL(ll_rw_block);
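
/*
 * Editor's illustrative sketch -- not part of buffer.c.  The classic
 * ll_rw_block() read pattern for a hypothetical helper: because
 * ll_rw_block() silently skips any buffer it cannot lock, callers that
 * need the data must wait and then re-check buffer_uptodate().
 */
static int myfs_read_bh(struct buffer_head *bh)
{
        if (buffer_uptodate(bh))
                return 0;

        ll_rw_block(READ, 1, &bh);
        wait_on_buffer(bh);
        if (!buffer_uptodate(bh))
                return -EIO;    /* I/O failed or was never submitted */
        return 0;
}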
3123
3124void write_dirty_buffer(struct buffer_head *bh, int rw)
3125{
3126        lock_buffer(bh);
3127        if (!test_clear_buffer_dirty(bh)) {
3128                unlock_buffer(bh);
3129                return;
3130        }
3131        bh->b_end_io = end_buffer_write_sync;
3132        get_bh(bh);
3133        submit_bh(rw, bh);
3134}
3135EXPORT_SYMBOL(write_dirty_buffer);
3136
3137/*
3138 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3139 * and then start new I/O and then wait upon it.  The caller must have a ref on
3140 * the buffer_head.
3141 */
3142int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3143{
3144        int ret = 0;
3145
3146        WARN_ON(atomic_read(&bh->b_count) < 1);
3147        lock_buffer(bh);
3148        if (test_clear_buffer_dirty(bh)) {
3149                get_bh(bh);
3150                bh->b_end_io = end_buffer_write_sync;
3151                ret = submit_bh(rw, bh);
3152                wait_on_buffer(bh);
3153                if (!ret && !buffer_uptodate(bh))
3154                        ret = -EIO;
3155        } else {
3156                unlock_buffer(bh);
3157        }
3158        return ret;
3159}
3160EXPORT_SYMBOL(__sync_dirty_buffer);
3161
3162int sync_dirty_buffer(struct buffer_head *bh)
3163{
3164        return __sync_dirty_buffer(bh, WRITE_SYNC);
3165}
3166EXPORT_SYMBOL(sync_dirty_buffer);
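
/*
 * Editor's illustrative sketch -- not part of buffer.c.  Synchronously
 * writing out a modified metadata block, in the way simple filesystems
 * flush their superblock; "sbh" is assumed to be a buffer_head the
 * caller already holds a reference on (e.g. from sb_bread()).
 */
static int myfs_commit_super(struct buffer_head *sbh)
{
        mark_buffer_dirty(sbh);
        return sync_dirty_buffer(sbh);  /* 0 on success, -EIO on failure */
}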
3167
3168/*
3169 * try_to_free_buffers() checks if all the buffers on this particular page
3170 * are unused, and releases them if so.
3171 *
3172 * Exclusion against try_to_free_buffers may be obtained by either
3173 * locking the page or by holding its mapping's private_lock.
3174 *
3175 * If the page is dirty but all the buffers are clean then we need to
3176 * be sure to mark the page clean as well.  This is because the page
3177 * may be against a block device, and a later reattachment of buffers
3178 * to a dirty page will set *all* buffers dirty, which would corrupt
3179 * filesystem data on the same device.
3180 *
3181 * The same applies to regular filesystem pages: if all the buffers are
3182 * clean then we set the page clean and proceed.  To do that, we require
3183 * total exclusion from __set_page_dirty_buffers().  That is obtained with
3184 * private_lock.
3185 *
3186 * try_to_free_buffers() is non-blocking.
3187 */
3188static inline int buffer_busy(struct buffer_head *bh)
3189{
3190        return atomic_read(&bh->b_count) |
3191                (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3192}
3193
3194static int
3195drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3196{
3197        struct buffer_head *head = page_buffers(page);
3198        struct buffer_head *bh;
3199
3200        bh = head;
3201        do {
3202                if (buffer_write_io_error(bh) && page->mapping)
3203                        set_bit(AS_EIO, &page->mapping->flags);
3204                if (buffer_busy(bh))
3205                        goto failed;
3206                bh = bh->b_this_page;
3207        } while (bh != head);
3208
3209        do {
3210                struct buffer_head *next = bh->b_this_page;
3211
3212                if (bh->b_assoc_map)
3213                        __remove_assoc_queue(bh);
3214                bh = next;
3215        } while (bh != head);
3216        *buffers_to_free = head;
3217        __clear_page_buffers(page);
3218        return 1;
3219failed:
3220        return 0;
3221}
3222
3223int try_to_free_buffers(struct page *page)
3224{
3225        struct address_space * const mapping = page->mapping;
3226        struct buffer_head *buffers_to_free = NULL;
3227        int ret = 0;
3228
3229        BUG_ON(!PageLocked(page));
3230        if (PageWriteback(page))
3231                return 0;
3232
3233        if (mapping == NULL) {          /* can this still happen? */
3234                ret = drop_buffers(page, &buffers_to_free);
3235                goto out;
3236        }
3237
3238        spin_lock(&mapping->private_lock);
3239        ret = drop_buffers(page, &buffers_to_free);
3240
3241        /*
3242         * If the filesystem writes its buffers by hand (eg ext3)
3243         * then we can have clean buffers against a dirty page.  We
3244         * clean the page here; otherwise the VM will never notice
3245         * that the filesystem did any IO at all.
3246         *
3247         * Also, during truncate, discard_buffer will have marked all
3248         * the page's buffers clean.  We discover that here and clean
3249         * the page also.
3250         *
3251         * private_lock must be held over this entire operation in order
3252         * to synchronise against __set_page_dirty_buffers and prevent the
3253         * dirty bit from being lost.
3254         */
3255        if (ret)
3256                cancel_dirty_page(page, PAGE_CACHE_SIZE);
3257        spin_unlock(&mapping->private_lock);
3258out:
3259        if (buffers_to_free) {
3260                struct buffer_head *bh = buffers_to_free;
3261
3262                do {
3263                        struct buffer_head *next = bh->b_this_page;
3264                        free_buffer_head(bh);
3265                        bh = next;
3266                } while (bh != buffers_to_free);
3267        }
3268        return ret;
3269}
3270EXPORT_SYMBOL(try_to_free_buffers);
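
/*
 * Editor's illustrative sketch -- not part of buffer.c.  For most
 * buffer-backed address_spaces, ->releasepage is little more than a
 * call to try_to_free_buffers(); filesystems with private page state
 * (journalled buffers, delayed allocation, ...) do extra checks first.
 */
static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
{
        return try_to_free_buffers(page);
}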
3271
3272/*
3273 * There are no bdflush tunables left.  But distributions are
3274 * still running obsolete flush daemons, so we terminate them here.
3275 *
3276 * Use of bdflush() is deprecated and will be removed in a future kernel.
3277 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3278 */
3279SYSCALL_DEFINE2(bdflush, int, func, long, data)
3280{
3281        static int msg_count;
3282
3283        if (!capable(CAP_SYS_ADMIN))
3284                return -EPERM;
3285
3286        if (msg_count < 5) {
3287                msg_count++;
3288                printk(KERN_INFO
3289                        "warning: process `%s' used the obsolete bdflush"
3290                        " system call\n", current->comm);
3291                printk(KERN_INFO "Fix your initscripts?\n");
3292        }
3293
3294        if (func == 1)
3295                do_exit(0);
3296        return 0;
3297}
3298
3299/*
3300 * Buffer-head allocation
3301 */
3302static struct kmem_cache *bh_cachep __read_mostly;
3303
3304/*
3305 * Once the number of bh's in the machine exceeds this level, we start
3306 * stripping them in writeback.
3307 */
3308static unsigned long max_buffer_heads;
3309
3310int buffer_heads_over_limit;
3311
3312struct bh_accounting {
3313        int nr;                 /* Number of live bh's */
3314        int ratelimit;          /* Limit cacheline bouncing */
3315};
3316
3317static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3318
3319static void recalc_bh_state(void)
3320{
3321        int i;
3322        int tot = 0;
3323
3324        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3325                return;
3326        __this_cpu_write(bh_accounting.ratelimit, 0);
3327        for_each_online_cpu(i)
3328                tot += per_cpu(bh_accounting, i).nr;
3329        buffer_heads_over_limit = (tot > max_buffer_heads);
3330}
3331
3332struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3333{
3334        struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3335        if (ret) {
3336                INIT_LIST_HEAD(&ret->b_assoc_buffers);
3337                preempt_disable();
3338                __this_cpu_inc(bh_accounting.nr);
3339                recalc_bh_state();
3340                preempt_enable();
3341        }
3342        return ret;
3343}
3344EXPORT_SYMBOL(alloc_buffer_head);
3345
3346void free_buffer_head(struct buffer_head *bh)
3347{
3348        BUG_ON(!list_empty(&bh->b_assoc_buffers));
3349        kmem_cache_free(bh_cachep, bh);
3350        preempt_disable();
3351        __this_cpu_dec(bh_accounting.nr);
3352        recalc_bh_state();
3353        preempt_enable();
3354}
3355EXPORT_SYMBOL(free_buffer_head);
3356
3357static void buffer_exit_cpu(int cpu)
3358{
3359        int i;
3360        struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3361
3362        for (i = 0; i < BH_LRU_SIZE; i++) {
3363                brelse(b->bhs[i]);
3364                b->bhs[i] = NULL;
3365        }
3366        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3367        per_cpu(bh_accounting, cpu).nr = 0;
3368}
3369
3370static int buffer_cpu_notify(struct notifier_block *self,
3371                              unsigned long action, void *hcpu)
3372{
3373        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3374                buffer_exit_cpu((unsigned long)hcpu);
3375        return NOTIFY_OK;
3376}
3377
3378/**
3379 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3380 * @bh: struct buffer_head
3381 *
3382 * Return 1 if the buffer is up-to-date; otherwise return 0
3383 * with the buffer locked.
3384 */
3385int bh_uptodate_or_lock(struct buffer_head *bh)
3386{
3387        if (!buffer_uptodate(bh)) {
3388                lock_buffer(bh);
3389                if (!buffer_uptodate(bh))
3390                        return 0;
3391                unlock_buffer(bh);
3392        }
3393        return 1;
3394}
3395EXPORT_SYMBOL(bh_uptodate_or_lock);
3396
3397/**
3398 * bh_submit_read - Submit a locked buffer for reading
3399 * @bh: struct buffer_head
3400 *
3401 * Returns zero on success and -EIO on error.
3402 */
3403int bh_submit_read(struct buffer_head *bh)
3404{
3405        BUG_ON(!buffer_locked(bh));
3406
3407        if (buffer_uptodate(bh)) {
3408                unlock_buffer(bh);
3409                return 0;
3410        }
3411
3412        get_bh(bh);
3413        bh->b_end_io = end_buffer_read_sync;
3414        submit_bh(READ, bh);
3415        wait_on_buffer(bh);
3416        if (buffer_uptodate(bh))
3417                return 0;
3418        return -EIO;
3419}
3420EXPORT_SYMBOL(bh_submit_read);
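
/*
 * Editor's illustrative sketch -- not part of buffer.c.  The idiomatic
 * pairing of bh_uptodate_or_lock() and bh_submit_read(): nothing is
 * submitted when the buffer is already up to date, otherwise the buffer
 * comes back locked and bh_submit_read() issues the read, waits for it
 * and unlocks the buffer via end_buffer_read_sync().
 */
static int myfs_read_buffer(struct buffer_head *bh)
{
        if (bh_uptodate_or_lock(bh))
                return 0;               /* already up to date */
        return bh_submit_read(bh);      /* 0 on success, -EIO on error */
}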
3421
3422void __init buffer_init(void)
3423{
3424        unsigned long nrpages;
3425
3426        bh_cachep = kmem_cache_create("buffer_head",
3427                        sizeof(struct buffer_head), 0,
3428                                (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3429                                SLAB_MEM_SPREAD),
3430                                NULL);
3431
3432        /*
3433         * Limit the bh occupancy to 10% of ZONE_NORMAL
3434         */
3435        nrpages = (nr_free_buffer_pages() * 10) / 100;
3436        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3437        hotcpu_notifier(buffer_cpu_notify, 0);
3438}
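
/*
 * Editor's note with illustrative numbers -- not part of buffer.c, and
 * the sizes are configuration dependent.  Assuming 4KB pages, roughly
 * 100 bytes per struct buffer_head and 1,000,000 free buffer pages,
 * nrpages is 100,000 and max_buffer_heads is about
 * 100,000 * (4096 / ~100), i.e. around 4 million buffer heads.  Beyond
 * that, buffer_heads_over_limit is set and buffer heads start being
 * stripped during writeback, as the comment above recalc_bh_state()
 * describes.
 */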
3439