linux-old/fs/buffer.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9 * been avoided by NEVER letting an interrupt change a buffer (except for the
  10 * data, of course), but instead letting the caller do it.
  11 */
  12
  13/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15/* Removed a lot of unnecessary code and simplified things now that
  16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17 */
  18
  19/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20 * hash table, use SLAB cache for buffer heads. -DaveM
  21 */
  22
  23/* Added 32k buffer block sizes - these are required on older ARM systems.
  24 * - RMK
  25 */
  26
  27/* Thread it... -DaveM */
  28
  29/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
  30
  31#include <linux/config.h>
  32#include <linux/sched.h>
  33#include <linux/fs.h>
  34#include <linux/slab.h>
  35#include <linux/locks.h>
  36#include <linux/errno.h>
  37#include <linux/swap.h>
  38#include <linux/swapctl.h>
  39#include <linux/smp_lock.h>
  40#include <linux/vmalloc.h>
  41#include <linux/blkdev.h>
  42#include <linux/sysrq.h>
  43#include <linux/file.h>
  44#include <linux/init.h>
  45#include <linux/quotaops.h>
  46#include <linux/iobuf.h>
  47#include <linux/highmem.h>
  48#include <linux/module.h>
  49#include <linux/completion.h>
  50
  51#include <asm/uaccess.h>
  52#include <asm/io.h>
  53#include <asm/bitops.h>
  54#include <asm/mmu_context.h>
  55
  56#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
  57#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
  58                                             number of unused buffer heads */
  59
  60/* Anti-deadlock ordering:
  61 *      lru_list_lock > hash_table_lock > unused_list_lock
  62 */
  63
  64#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
  65
  66/*
  67 * Hash table gook..
  68 */
  69static unsigned int bh_hash_mask;
  70static unsigned int bh_hash_shift;
  71static struct buffer_head **hash_table;
  72static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
  73
  74static struct buffer_head *lru_list[NR_LIST];
  75
  76static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
  77#define lru_list_lock  lru_list_lock_cacheline.lock
  78
  79static int nr_buffers_type[NR_LIST];
  80static unsigned long size_buffers_type[NR_LIST];
  81
  82static struct buffer_head * unused_list;
  83static int nr_unused_buffer_heads;
  84static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  85static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  86
  87static int grow_buffers(kdev_t dev, unsigned long block, int size);
  88static int osync_buffers_list(struct list_head *);
  89static void __refile_buffer(struct buffer_head *);
  90
  91/*
  92 * A global sysctl-controlled flag which puts the machine into "laptop mode"
  93 */
  94int laptop_mode;
  95
  96static DECLARE_WAIT_QUEUE_HEAD(kupdate_wait);
  97
  98/* This is used by some architectures to estimate available memory. */
  99atomic_t buffermem_pages = ATOMIC_INIT(0);
 100
 101/* Here is the parameter block for the bdflush process. If you add or
 102 * remove any of the parameters, make sure to update kernel/sysctl.c
 103 * and the documentation at linux/Documentation/sysctl/vm.txt.
 104 */
 105
 106#define N_PARAM 9
 107
 108/* The dummy values in this structure are left in there for compatibility
 109 * with old programs that play with the /proc entries.
 110 */
 111union bdflush_param {
 112        struct {
 113                int nfract;     /* Percentage of buffer cache dirty to 
 114                                   activate bdflush */
 115                int ndirty;     /* Maximum number of dirty blocks to write out per
 116                                   wake-cycle */
 117                int dummy2;     /* old "nrefill" */
 118                int dummy3;     /* unused */
 119                int interval;   /* jiffies delay between kupdate flushes */
 120                int age_buffer; /* Time for normal buffer to age before we flush it */
 121                int nfract_sync;/* Percentage of buffer cache dirty to 
 122                                   activate bdflush synchronously */
 123                int nfract_stop_bdflush; /* Percetange of buffer cache dirty to stop bdflush */
 124                int dummy5;     /* unused */
 125        } b_un;
 126        unsigned int data[N_PARAM];
 127} bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
 128
 129/* These are the min and max parameter values that we will allow to be assigned */
 130int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
 131int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
 132
 133static inline int write_buffer_delay(struct buffer_head *bh)
 134{
 135        struct page *page = bh->b_page;
 136
 137        if (!TryLockPage(page)) {
 138                spin_unlock(&lru_list_lock);
 139                unlock_buffer(bh);
 140                page->mapping->a_ops->writepage(page);
 141                return 1;
 142        }
 143
 144        return 0;
 145}
 146
 147static inline void write_buffer(struct buffer_head *bh)
 148{
 149        if (buffer_delay(bh)) {
 150                struct page *page = bh->b_page;
 151
 152                lock_page(page);
 153                if (buffer_delay(bh)) {
 154                        page->mapping->a_ops->writepage(page);
 155                        return;
 156                }
 157                unlock_page(page);
 158        }
 159
 160        ll_rw_block(WRITE, 1, &bh);
 161}
 162
 163void fastcall unlock_buffer(struct buffer_head *bh)
 164{
 165        clear_bit(BH_Wait_IO, &bh->b_state);
 166        clear_bit(BH_Launder, &bh->b_state);
 167        /*
 168         * When a locked buffer is visible to the I/O layer BH_Launder
 169         * is set. This means before unlocking we must clear BH_Launder,
 170         * mb() on alpha and then clear BH_Lock, so no reader can see
 171         * BH_Launder set on an unlocked buffer and then risk to deadlock.
 172         */
 173        smp_mb__after_clear_bit();
 174        clear_bit(BH_Lock, &bh->b_state);
 175        smp_mb__after_clear_bit();
 176        if (waitqueue_active(&bh->b_wait))
 177                wake_up(&bh->b_wait);
 178}
 179
 180/*
 181 * Note that the real wait_on_buffer() is an inline function that checks
 182 * that the buffer is locked before calling this, so that unnecessary disk
 183 * unplugging does not occur.
 184 */
 185void __wait_on_buffer(struct buffer_head * bh)
 186{
 187        struct task_struct *tsk = current;
 188        DECLARE_WAITQUEUE(wait, tsk);
 189
 190        get_bh(bh);
 191        add_wait_queue(&bh->b_wait, &wait);
 192        do {
 193                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 194                if (!buffer_locked(bh))
 195                        break;
 196                /*
 197                 * We must read tq_disk in TQ_ACTIVE after the
 198                 * add_wait_queue effect is visible to other cpus.
 199                 * We could unplug some line above it wouldn't matter
 200                 * but we can't do that right after add_wait_queue
 201                 * without an smp_mb() in between because spin_unlock
 202                 * has inclusive semantics.
 203                 * Doing it here is the most efficient place so we
 204                 * don't do a suprious unplug if we get a racy
 205                 * wakeup that make buffer_locked to return 0, and
 206                 * doing it here avoids an explicit smp_mb() we
 207                 * rely on the implicit one in set_task_state.
 208                 */
 209                run_task_queue(&tq_disk);
 210                schedule();
 211        } while (buffer_locked(bh));
 212        tsk->state = TASK_RUNNING;
 213        remove_wait_queue(&bh->b_wait, &wait);
 214        put_bh(bh);
 215}
 216
 217/*
 218 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 219 * unlock the buffer. This is what ll_rw_block uses too.
 220 */
 221void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 222{
 223        mark_buffer_uptodate(bh, uptodate);
 224        unlock_buffer(bh);
 225        put_bh(bh);
 226}
 227
 228/*
 229 * The buffers have been marked clean and locked.  Just submit the dang
 230 * things.. 
 231 */
 232static void write_locked_buffers(struct buffer_head **array, unsigned int count)
 233{
 234        do {
 235                struct buffer_head * bh = *array++;
 236                bh->b_end_io = end_buffer_io_sync;
 237                submit_bh(WRITE, bh);
 238        } while (--count);
 239}
 240
 241/*
 242 * Write some buffers from the head of the dirty queue.
 243 *
 244 * This must be called with the LRU lock held, and will
 245 * return without it!
 246 */
 247#define NRSYNC (32)
 248static int write_some_buffers(kdev_t dev)
 249{
 250        struct buffer_head *next;
 251        struct buffer_head *array[NRSYNC];
 252        unsigned int count;
 253        int nr;
 254
 255        next = lru_list[BUF_DIRTY];
 256        nr = nr_buffers_type[BUF_DIRTY];
 257        count = 0;
 258        while (next && --nr >= 0) {
 259                struct buffer_head * bh = next;
 260                next = bh->b_next_free;
 261
 262                if (dev != NODEV && bh->b_dev != dev)
 263                        continue;
 264                if (test_and_set_bit(BH_Lock, &bh->b_state))
 265                        continue;
 266                if (buffer_delay(bh)) {
 267                        if (write_buffer_delay(bh)) {
 268                                if (count)
 269                                        write_locked_buffers(array, count);
 270                                return -EAGAIN;
 271                        }
 272                } else if (atomic_set_buffer_clean(bh)) {
 273                        __refile_buffer(bh);
 274                        get_bh(bh);
 275                        array[count++] = bh;
 276                        if (count < NRSYNC)
 277                                continue;
 278
 279                        spin_unlock(&lru_list_lock);
 280                        write_locked_buffers(array, count);
 281                        return -EAGAIN;
 282                }
 283                unlock_buffer(bh);
 284                __refile_buffer(bh);
 285        }
 286        spin_unlock(&lru_list_lock);
 287
 288        if (count)
 289                write_locked_buffers(array, count);
 290        return 0;
 291}
 292
 293/*
 294 * Write out all buffers on the dirty list.
 295 */
 296static void write_unlocked_buffers(kdev_t dev)
 297{
 298        do
 299                spin_lock(&lru_list_lock);
 300        while (write_some_buffers(dev));
 301}
 302
 303/*
 304 * Wait for a buffer on the proper list.
 305 *
 306 * This must be called with the LRU lock held, and
 307 * will return with it released.
 308 */
 309static int wait_for_buffers(kdev_t dev, int index, int refile)
 310{
 311        struct buffer_head * next;
 312        int nr;
 313
 314        next = lru_list[index];
 315        nr = nr_buffers_type[index];
 316        while (next && --nr >= 0) {
 317                struct buffer_head *bh = next;
 318                next = bh->b_next_free;
 319
 320                if (!buffer_locked(bh)) {
 321                        if (refile)
 322                                __refile_buffer(bh);
 323                        continue;
 324                }
 325                if (dev != NODEV && bh->b_dev != dev)
 326                        continue;
 327
 328                get_bh(bh);
 329                spin_unlock(&lru_list_lock);
 330                wait_on_buffer (bh);
 331                put_bh(bh);
 332                return -EAGAIN;
 333        }
 334        spin_unlock(&lru_list_lock);
 335        return 0;
 336}
 337
 338static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
 339{
 340        do {
 341                spin_lock(&lru_list_lock);
 342        } while (wait_for_buffers(dev, index, refile));
 343        return 0;
 344}
 345
 346/* Call sync_buffers with wait!=0 to ensure that the call does not
 347 * return until all buffer writes have completed.  Sync() may return
 348 * before the writes have finished; fsync() may not.
 349 */
 350
 351/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 352 * spontaneously dirty themselves without ever brelse being called.
 353 * We will ultimately want to put these in a separate list, but for
 354 * now we search all of the lists for dirty buffers.
 355 */
 356int sync_buffers(kdev_t dev, int wait)
 357{
 358        int err = 0;
 359
 360        /* One pass for no-wait, three for wait:
 361         * 0) write out all dirty, unlocked buffers;
 362         * 1) wait for all dirty locked buffers;
 363         * 2) write out all dirty, unlocked buffers;
 364         * 2) wait for completion by waiting for all buffers to unlock.
 365         */
 366        write_unlocked_buffers(dev);
 367        if (wait) {
 368                err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
 369                write_unlocked_buffers(dev);
 370                err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
 371        }
 372        return err;
 373}
 374EXPORT_SYMBOL(sync_buffers);
 375
 376int fsync_super(struct super_block *sb)
 377{
 378        kdev_t dev = sb->s_dev;
 379        sync_buffers(dev, 0);
 380
 381        lock_kernel();
 382        sync_inodes_sb(sb);
 383        DQUOT_SYNC_SB(sb);
 384        lock_super(sb);
 385        if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
 386                sb->s_op->write_super(sb);
 387        unlock_super(sb);
 388        if (sb->s_op && sb->s_op->sync_fs)
 389                sb->s_op->sync_fs(sb);
 390        unlock_kernel();
 391
 392        return sync_buffers(dev, 1);
 393}
 394
 395int fsync_no_super(kdev_t dev)
 396{
 397        sync_buffers(dev, 0);
 398        return sync_buffers(dev, 1);
 399}
 400
 401int fsync_dev(kdev_t dev)
 402{
 403        sync_buffers(dev, 0);
 404
 405        lock_kernel();
 406        sync_inodes(dev);
 407        DQUOT_SYNC_DEV(dev);
 408        sync_supers(dev, 1);
 409        unlock_kernel();
 410
 411        return sync_buffers(dev, 1);
 412}
 413
 414/*
 415 * There's no real reason to pretend we should
 416 * ever do anything differently
 417 */
 418void sync_dev(kdev_t dev)
 419{
 420        fsync_dev(dev);
 421}
 422
 423asmlinkage long sys_sync(void)
 424{
 425        fsync_dev(0);
 426        return 0;
 427}
 428
 429/*
 430 *      filp may be NULL if called via the msync of a vma.
 431 */
 432 
 433int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 434{
 435        struct inode * inode = dentry->d_inode;
 436        struct super_block * sb;
 437        kdev_t dev;
 438        int ret;
 439
 440        lock_kernel();
 441        /* sync the inode to buffers */
 442        write_inode_now(inode, 0);
 443
 444        /* sync the superblock to buffers */
 445        sb = inode->i_sb;
 446        lock_super(sb);
 447        if (sb->s_op && sb->s_op->write_super)
 448                sb->s_op->write_super(sb);
 449        unlock_super(sb);
 450
 451        /* .. finally sync the buffers to disk */
 452        dev = inode->i_dev;
 453        ret = sync_buffers(dev, 1);
 454        unlock_kernel();
 455        return ret;
 456}
 457
 458asmlinkage long sys_fsync(unsigned int fd)
 459{
 460        struct file * file;
 461        struct dentry * dentry;
 462        struct inode * inode;
 463        int ret, err;
 464
 465        ret = -EBADF;
 466        file = fget(fd);
 467        if (!file)
 468                goto out;
 469
 470        dentry = file->f_dentry;
 471        inode = dentry->d_inode;
 472
 473        ret = -EINVAL;
 474        if (!file->f_op || !file->f_op->fsync) {
 475                /* Why?  We can still call filemap_fdatasync */
 476                goto out_putf;
 477        }
 478
 479        /* We need to protect against concurrent writers.. */
 480        down(&inode->i_sem);
 481        ret = filemap_fdatasync(inode->i_mapping);
 482        err = file->f_op->fsync(file, dentry, 0);
 483        if (err && !ret)
 484                ret = err;
 485        err = filemap_fdatawait(inode->i_mapping);
 486        if (err && !ret)
 487                ret = err;
 488        up(&inode->i_sem);
 489
 490out_putf:
 491        fput(file);
 492out:
 493        return ret;
 494}
 495
 496int do_fdatasync(struct file *file)
 497{
 498        int ret, err;
 499        struct dentry *dentry;
 500        struct inode *inode;
 501
 502        if (unlikely(!file->f_op || !file->f_op->fsync))
 503                return -EINVAL;
 504        
 505        dentry = file->f_dentry;
 506        inode = dentry->d_inode;
 507
 508        ret = filemap_fdatasync(inode->i_mapping);
 509        err = file->f_op->fsync(file, dentry, 1);
 510        if (err && !ret)
 511                ret = err;
 512        err = filemap_fdatawait(inode->i_mapping);
 513        if (err && !ret)
 514                ret = err;
 515        return ret;
 516}
 517
 518asmlinkage long sys_fdatasync(unsigned int fd)
 519{
 520        struct file * file;
 521        struct inode *inode;
 522        int ret;
 523
 524        ret = -EBADF;
 525        file = fget(fd);
 526        if (!file)
 527                goto out;
 528
 529        inode = file->f_dentry->d_inode;
 530        down(&inode->i_sem);
 531        ret = do_fdatasync(file);
 532        up(&inode->i_sem);
 533
 534        fput(file);
 535out:
 536        return ret;
 537}
 538
 539/* After several hours of tedious analysis, the following hash
 540 * function won.  Do not mess with it... -DaveM
 541 */
 542#define _hashfn(dev,block)      \
 543        ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
 544         (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
 545          ((block) << (bh_hash_shift - 12))))
 546#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
 547
 548static inline void __insert_into_hash_list(struct buffer_head *bh)
 549{
 550        struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 551        struct buffer_head *next = *head;
 552
 553        *head = bh;
 554        bh->b_pprev = head;
 555        bh->b_next = next;
 556        if (next != NULL)
 557                next->b_pprev = &bh->b_next;
 558}
 559
 560static __inline__ void __hash_unlink(struct buffer_head *bh)
 561{
 562        struct buffer_head **pprev = bh->b_pprev;
 563        if (pprev) {
 564                struct buffer_head *next = bh->b_next;
 565                if (next)
 566                        next->b_pprev = pprev;
 567                *pprev = next;
 568                bh->b_pprev = NULL;
 569        }
 570}
 571
 572static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 573{
 574        struct buffer_head **bhp = &lru_list[blist];
 575
 576        if (bh->b_prev_free || bh->b_next_free) BUG();
 577
 578        if(!*bhp) {
 579                *bhp = bh;
 580                bh->b_prev_free = bh;
 581        }
 582        bh->b_next_free = *bhp;
 583        bh->b_prev_free = (*bhp)->b_prev_free;
 584        (*bhp)->b_prev_free->b_next_free = bh;
 585        (*bhp)->b_prev_free = bh;
 586        nr_buffers_type[blist]++;
 587        size_buffers_type[blist] += bh->b_size;
 588}
 589
 590static void __remove_from_lru_list(struct buffer_head * bh)
 591{
 592        struct buffer_head *next = bh->b_next_free;
 593        if (next) {
 594                struct buffer_head *prev = bh->b_prev_free;
 595                int blist = bh->b_list;
 596
 597                prev->b_next_free = next;
 598                next->b_prev_free = prev;
 599                if (lru_list[blist] == bh) {
 600                        if (next == bh)
 601                                next = NULL;
 602                        lru_list[blist] = next;
 603                }
 604                bh->b_next_free = NULL;
 605                bh->b_prev_free = NULL;
 606                nr_buffers_type[blist]--;
 607                size_buffers_type[blist] -= bh->b_size;
 608        }
 609}
 610
 611/* must be called with both the hash_table_lock and the lru_list_lock
 612   held */
 613static void __remove_from_queues(struct buffer_head *bh)
 614{
 615        __hash_unlink(bh);
 616        __remove_from_lru_list(bh);
 617}
 618
 619static void remove_from_queues(struct buffer_head *bh)
 620{
 621        spin_lock(&lru_list_lock);
 622        write_lock(&hash_table_lock);
 623        __remove_from_queues(bh);
 624        write_unlock(&hash_table_lock); 
 625        spin_unlock(&lru_list_lock);
 626}
 627
 628struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 629{
 630        struct buffer_head *bh, **p = &hash(dev, block);
 631
 632        read_lock(&hash_table_lock);
 633
 634        for (;;) {
 635                bh = *p;
 636                if (!bh)
 637                        break;
 638                p = &bh->b_next;
 639                if (bh->b_blocknr != block)
 640                        continue;
 641                if (bh->b_size != size)
 642                        continue;
 643                if (bh->b_dev != dev)
 644                        continue;
 645                get_bh(bh);
 646                break;
 647        }
 648
 649        read_unlock(&hash_table_lock);
 650        return bh;
 651}
 652
 653void fastcall buffer_insert_list(struct buffer_head *bh, struct list_head *list)
 654{
 655        spin_lock(&lru_list_lock);
 656        if (buffer_attached(bh))
 657                list_del(&bh->b_inode_buffers);
 658        set_buffer_attached(bh);
 659        list_add_tail(&bh->b_inode_buffers, list);
 660        spin_unlock(&lru_list_lock);
 661}
 662
 663/*
 664 * The caller must have the lru_list lock before calling the 
 665 * remove_inode_queue functions.
 666 */
 667static void __remove_inode_queue(struct buffer_head *bh)
 668{
 669        list_del(&bh->b_inode_buffers);
 670        clear_buffer_attached(bh);
 671}
 672
 673static inline void remove_inode_queue(struct buffer_head *bh)
 674{
 675        if (buffer_attached(bh))
 676                __remove_inode_queue(bh);
 677}
 678
 679int inode_has_buffers(struct inode *inode)
 680{
 681        int ret;
 682        
 683        spin_lock(&lru_list_lock);
 684        ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
 685        spin_unlock(&lru_list_lock);
 686        
 687        return ret;
 688}
 689
 690/* If invalidate_buffers() trashes dirty buffers, it means some kind
 691   of fs corruption is going on. Trashing dirty data always implies losing
 692   information that was supposed to be just stored on the physical layer
 693   by the user.
 694
 695   Thus invalidate_buffers in general usage is not allowed to trash
 696   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 697   be preserved.  These buffers are simply skipped.
 698  
 699   We also skip buffers which are still in use.  For example this can
 700   happen if a userspace program is reading the block device.
 701
 702   NOTE: In the case where the user removed a removable-media-disk even if
 703   there's still dirty data not synced on disk (due a bug in the device driver
 704   or due an error of the user), by not destroying the dirty buffers we could
 705   generate corruption also on the next media inserted, thus a parameter is
 706   necessary to handle this case in the most safe way possible (trying
 707   to not corrupt also the new disk inserted with the data belonging to
 708   the old now corrupted disk). Also for the ramdisk the natural thing
 709   to do in order to release the ramdisk memory is to destroy dirty buffers.
 710
 711   These are two special cases. Normal usage imply the device driver
 712   to issue a sync on the device (without waiting I/O completion) and
 713   then an invalidate_buffers call that doesn't trash dirty buffers.
 714
 715   For handling cache coherency with the blkdev pagecache the 'update' case
 716   has been introduced. It is needed to re-read from disk any pinned
 717   buffer. NOTE: re-reading from disk is destructive so we can do it only
 718   when we assume nobody is changing the buffercache under our I/O and when
 719   we think the disk contains more recent information than the buffercache.
 720   The update == 1 pass marks the buffers we need to update, the update == 2
 721   pass does the actual I/O. */
 722void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 723{
 724        int i, nlist, slept;
 725        struct buffer_head * bh, * bh_next;
 726        kdev_t dev = to_kdev_t(bdev->bd_dev);   /* will become bdev */
 727
 728 retry:
 729        slept = 0;
 730        spin_lock(&lru_list_lock);
 731        for(nlist = 0; nlist < NR_LIST; nlist++) {
 732                bh = lru_list[nlist];
 733                if (!bh)
 734                        continue;
 735                for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 736                        bh_next = bh->b_next_free;
 737
 738                        /* Another device? */
 739                        if (bh->b_dev != dev)
 740                                continue;
 741                        /* Not hashed? */
 742                        if (!bh->b_pprev)
 743                                continue;
 744                        if (buffer_locked(bh)) {
 745                                get_bh(bh);
 746                                spin_unlock(&lru_list_lock);
 747                                wait_on_buffer(bh);
 748                                slept = 1;
 749                                spin_lock(&lru_list_lock);
 750                                put_bh(bh);
 751                        }
 752
 753                        write_lock(&hash_table_lock);
 754                        /* All buffers in the lru lists are mapped */
 755                        if (!buffer_mapped(bh))
 756                                BUG();
 757                        if (buffer_dirty(bh) && destroy_dirty_buffers)
 758                                printk("invalidate: dirty buffer\n");
 759                        if (!atomic_read(&bh->b_count)) {
 760                                if (destroy_dirty_buffers || !buffer_dirty(bh)) {
 761                                        remove_inode_queue(bh);
 762                                }
 763                        } else if (!bdev->bd_openers)
 764                                printk("invalidate: busy buffer\n");
 765
 766                        write_unlock(&hash_table_lock);
 767                        if (slept)
 768                                goto out;
 769                }
 770        }
 771out:
 772        spin_unlock(&lru_list_lock);
 773        if (slept)
 774                goto retry;
 775
 776        /* Get rid of the page cache */
 777        invalidate_inode_pages(bdev->bd_inode);
 778}
 779
 780void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 781{
 782        struct block_device *bdev = bdget(dev);
 783        if (bdev) {
 784                invalidate_bdev(bdev, destroy_dirty_buffers);
 785                bdput(bdev);
 786        }
 787}
 788
 789static void free_more_memory(void)
 790{
 791        balance_dirty();
 792        wakeup_bdflush();
 793        try_to_free_pages(GFP_NOIO);
 794        run_task_queue(&tq_disk);
 795        yield();
 796}
 797
 798void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 799{
 800        bh->b_list = BUF_CLEAN;
 801        bh->b_end_io = handler;
 802        bh->b_private = private;
 803}
 804
 805void end_buffer_io_async(struct buffer_head * bh, int uptodate)
 806{
 807        static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 808        unsigned long flags;
 809        struct buffer_head *tmp;
 810        struct page *page;
 811        int fullup = 1;
 812
 813        mark_buffer_uptodate(bh, uptodate);
 814
 815        /* This is a temporary buffer used for page I/O. */
 816        page = bh->b_page;
 817
 818        if (!uptodate)
 819                SetPageError(page);
 820
 821        /*
 822         * Be _very_ careful from here on. Bad things can happen if
 823         * two buffer heads end IO at almost the same time and both
 824         * decide that the page is now completely done.
 825         *
 826         * Async buffer_heads are here only as labels for IO, and get
 827         * thrown away once the IO for this page is complete.  IO is
 828         * deemed complete once all buffers have been visited
 829         * (b_count==0) and are now unlocked. We must make sure that
 830         * only the _last_ buffer that decrements its count is the one
 831         * that unlock the page..
 832         */
 833        spin_lock_irqsave(&page_uptodate_lock, flags);
 834        mark_buffer_async(bh, 0);
 835        unlock_buffer(bh);
 836        tmp = bh->b_this_page;
 837        while (tmp != bh) {
 838                if (buffer_locked(tmp)) {
 839                        if (buffer_async(tmp))
 840                                goto still_busy;
 841                } else if (!buffer_uptodate(tmp))
 842                        fullup = 0;
 843                tmp = tmp->b_this_page;
 844        }
 845
 846        /* OK, the async IO on this page is complete. */
 847        spin_unlock_irqrestore(&page_uptodate_lock, flags);
 848
 849        /*
 850         * If none of the buffers had errors and all were uptodate
 851         * then we can set the page uptodate:
 852         */
 853        if (fullup && !PageError(page))
 854                SetPageUptodate(page);
 855
 856        UnlockPage(page);
 857
 858        return;
 859
 860still_busy:
 861        spin_unlock_irqrestore(&page_uptodate_lock, flags);
 862        return;
 863}
 864
 865inline void set_buffer_async_io(struct buffer_head *bh)
 866{
 867        bh->b_end_io = end_buffer_io_async;
 868        mark_buffer_async(bh, 1);
 869}
 870
 871/*
 872 * Synchronise all the inode's dirty buffers to the disk.
 873 *
 874 * We have conflicting pressures: we want to make sure that all
 875 * initially dirty buffers get waited on, but that any subsequently
 876 * dirtied buffers don't.  After all, we don't want fsync to last
 877 * forever if somebody is actively writing to the file.
 878 *
 879 * Do this in two main stages: first we copy dirty buffers to a
 880 * temporary inode list, queueing the writes as we go.  Then we clean
 881 * up, waiting for those writes to complete.
 882 * 
 883 * During this second stage, any subsequent updates to the file may end
 884 * up refiling the buffer on the original inode's dirty list again, so
 885 * there is a chance we will end up with a buffer queued for write but
 886 * not yet completed on that list.  So, as a final cleanup we go through
 887 * the osync code to catch these locked, dirty buffers without requeuing
 888 * any newly dirty buffers for write.
 889 */
 890int fsync_buffers_list(struct list_head *list)
 891{
 892        struct buffer_head *bh;
 893        struct list_head tmp;
 894        int err = 0, err2;
 895        
 896        INIT_LIST_HEAD(&tmp);
 897        
 898        spin_lock(&lru_list_lock);
 899
 900        while (!list_empty(list)) {
 901                bh = BH_ENTRY(list->next);
 902                list_del(&bh->b_inode_buffers);
 903                if (!buffer_dirty(bh) && !buffer_locked(bh))
 904                        clear_buffer_attached(bh);
 905                else {
 906                        set_buffer_attached(bh);
 907                        list_add(&bh->b_inode_buffers, &tmp);
 908                        if (buffer_dirty(bh)) {
 909                                get_bh(bh);
 910                                spin_unlock(&lru_list_lock);
 911                        /*
 912                         * Wait I/O completion before submitting
 913                         * the buffer, to be sure the write will
 914                         * be effective on the latest data in
 915                         * the buffer. (otherwise - if there's old
 916                         * I/O in flight - write_buffer would become
 917                         * a noop)
 918                         */
 919                                wait_on_buffer(bh);
 920                                write_buffer(bh);
 921                                brelse(bh);
 922                                spin_lock(&lru_list_lock);
 923                        }
 924                }
 925        }
 926
 927        while (!list_empty(&tmp)) {
 928                bh = BH_ENTRY(tmp.prev);
 929                remove_inode_queue(bh);
 930                get_bh(bh);
 931                spin_unlock(&lru_list_lock);
 932                wait_on_buffer(bh);
 933                if (!buffer_uptodate(bh))
 934                        err = -EIO;
 935                brelse(bh);
 936                spin_lock(&lru_list_lock);
 937        }
 938        
 939        spin_unlock(&lru_list_lock);
 940        err2 = osync_buffers_list(list);
 941
 942        if (err)
 943                return err;
 944        else
 945                return err2;
 946}
 947
 948/*
 949 * osync is designed to support O_SYNC io.  It waits synchronously for
 950 * all already-submitted IO to complete, but does not queue any new
 951 * writes to the disk.
 952 *
 953 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 954 * you dirty the buffers, and then use osync_buffers_list to wait for
 955 * completion.  Any other dirty buffers which are not yet queued for
 956 * write will not be flushed to disk by the osync.
 957 */
 958static int osync_buffers_list(struct list_head *list)
 959{
 960        struct buffer_head *bh;
 961        struct list_head *p;
 962        int err = 0;
 963
 964        spin_lock(&lru_list_lock);
 965        
 966 repeat:
 967        list_for_each_prev(p, list) {
 968                bh = BH_ENTRY(p);
 969                if (buffer_locked(bh)) {
 970                        get_bh(bh);
 971                        spin_unlock(&lru_list_lock);
 972                        wait_on_buffer(bh);
 973                        if (!buffer_uptodate(bh))
 974                                err = -EIO;
 975                        brelse(bh);
 976                        spin_lock(&lru_list_lock);
 977                        goto repeat;
 978                }
 979        }
 980
 981        spin_unlock(&lru_list_lock);
 982        return err;
 983}
 984
 985/*
 986 * Invalidate any and all dirty buffers on a given inode.  We are
 987 * probably unmounting the fs, but that doesn't mean we have already
 988 * done a sync().  Just drop the buffers from the inode list.
 989 */
 990void invalidate_inode_buffers(struct inode *inode)
 991{
 992        struct list_head * entry;
 993        
 994        spin_lock(&lru_list_lock);
 995        while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
 996                remove_inode_queue(BH_ENTRY(entry));
 997        while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
 998                remove_inode_queue(BH_ENTRY(entry));
 999        spin_unlock(&lru_list_lock);
1000}
1001
1002
1003/*
1004 * Ok, this is getblk, and it isn't very clear, again to hinder
1005 * race-conditions. Most of the code is seldom used, (ie repeating),
1006 * so it should be much more efficient than it looks.
1007 *
1008 * The algorithm is changed: hopefully better, and an elusive bug removed.
1009 *
1010 * 14.02.92: changed it to sync dirty buffers a bit: better performance
1011 * when the filesystem starts to get full of dirty blocks (I hope).
1012 */
1013struct buffer_head * getblk(kdev_t dev, int block, int size)
1014{
1015        for (;;) {
1016                struct buffer_head * bh;
1017
1018                bh = get_hash_table(dev, block, size);
1019                if (bh) {
1020                        touch_buffer(bh);
1021                        return bh;
1022                }
1023
1024                if (!grow_buffers(dev, block, size))
1025                        free_more_memory();
1026        }
1027}
1028
1029/* -1 -> no need to flush
1030    0 -> async flush
1031    1 -> sync flush (wait for I/O completion) */
1032static int balance_dirty_state(void)
1033{
1034        unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1035
1036        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1037        tot = nr_free_buffer_pages();
1038
1039        dirty *= 100;
1040        soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1041        hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1042
1043        /* First, check for the "real" dirty limit. */
1044        if (dirty > soft_dirty_limit) {
1045                if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
1046                        return 1;
1047                return 0;
1048        }
1049
1050        return -1;
1051}
1052
1053static int bdflush_stop(void)
1054{
1055        unsigned long dirty, tot, dirty_limit;
1056
1057        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1058        tot = nr_free_buffer_pages();
1059
1060        dirty *= 100;
1061        dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
1062
1063        if (!laptop_mode && dirty > dirty_limit)
1064                return 0;
1065        return 1;
1066}
1067
1068/*
1069 * if a new dirty buffer is created we need to balance bdflush.
1070 *
1071 * in the future we might want to make bdflush aware of different
1072 * pressures on different devices - thus the (currently unused)
1073 * 'dev' parameter.
1074 */
1075void balance_dirty(void)
1076{
1077        int state = balance_dirty_state();
1078
1079        if (state < 0)
1080                return;
1081
1082        wakeup_bdflush();
1083
1084        /*
1085         * And if we're _really_ out of balance, wait for
1086         * some of the dirty/locked buffers ourselves.
1087         * This will throttle heavy writers.
1088         */
1089        if (state > 0) {
1090                spin_lock(&lru_list_lock);
1091                write_some_buffers(NODEV);
1092        }
1093}
1094EXPORT_SYMBOL(balance_dirty);
1095
1096inline void fastcall __mark_dirty(struct buffer_head *bh)
1097{
1098        bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1099        refile_buffer(bh);
1100}
1101
1102/* atomic version, the user must call balance_dirty() by hand
1103   as soon as it become possible to block */
1104void fastcall __mark_buffer_dirty(struct buffer_head *bh)
1105{
1106        if (!atomic_set_buffer_dirty(bh))
1107                __mark_dirty(bh);
1108}
1109
1110void fastcall mark_buffer_dirty(struct buffer_head *bh)
1111{
1112        if (!atomic_set_buffer_dirty(bh)) {
1113                if (block_dump)
1114                        printk("%s: dirtied buffer\n", current->comm);
1115                __mark_dirty(bh);
1116                balance_dirty();
1117        }
1118}
1119
1120void set_buffer_flushtime(struct buffer_head *bh)
1121{
1122        bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1123}
1124EXPORT_SYMBOL(set_buffer_flushtime);
1125
1126int get_buffer_flushtime(void)
1127{
1128        return bdf_prm.b_un.interval;
1129}
1130EXPORT_SYMBOL(get_buffer_flushtime);
1131
1132/*
1133 * A buffer may need to be moved from one buffer list to another
1134 * (e.g. in case it is not shared any more). Handle this.
1135 */
1136static void __refile_buffer(struct buffer_head *bh)
1137{
1138        int dispose = BUF_CLEAN;
1139        if (buffer_locked(bh))
1140                dispose = BUF_LOCKED;
1141        if (buffer_dirty(bh))
1142                dispose = BUF_DIRTY;
1143        if (dispose != bh->b_list) {
1144                __remove_from_lru_list(bh);
1145                bh->b_list = dispose;
1146                if (dispose == BUF_CLEAN)
1147                        remove_inode_queue(bh);
1148                __insert_into_lru_list(bh, dispose);
1149        }
1150}
1151
1152void refile_buffer(struct buffer_head *bh)
1153{
1154        spin_lock(&lru_list_lock);
1155        __refile_buffer(bh);
1156        spin_unlock(&lru_list_lock);
1157}
1158
1159/*
1160 * Release a buffer head
1161 */
1162void __brelse(struct buffer_head * buf)
1163{
1164        if (atomic_read(&buf->b_count)) {
1165                put_bh(buf);
1166                return;
1167        }
1168        printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1169}
1170
/*
 * bforget() is like brelse(), except that it discards any potentially
 * dirty data: the buffer is marked clean before it is released.
 */
void __bforget(struct buffer_head * buf)
{
	/* Drop the dirty state first, then give up the reference. */
	mark_buffer_clean(buf);

	__brelse(buf);
}
1180
1181/**
1182 *      bread() - reads a specified block and returns the bh
1183 *      @block: number of block
1184 *      @size: size (in bytes) to read
1185 * 
1186 *      Reads a specified block, and returns buffer head that
1187 *      contains it. It returns NULL if the block was unreadable.
1188 */
1189struct buffer_head * bread(kdev_t dev, int block, int size)
1190{
1191        struct buffer_head * bh;
1192
1193        bh = getblk(dev, block, size);
1194        if (buffer_uptodate(bh))
1195                return bh;
1196        set_bit(BH_Sync, &bh->b_state);
1197        ll_rw_block(READ, 1, &bh);
1198        wait_on_buffer(bh);
1199        if (buffer_uptodate(bh))
1200                return bh;
1201        brelse(bh);
1202        return NULL;
1203}
1204
1205/*
1206 * Note: the caller should wake up the buffer_wait list if needed.
1207 */
1208static void __put_unused_buffer_head(struct buffer_head * bh)
1209{
1210        if (unlikely(buffer_attached(bh)))
1211                BUG();
1212        if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1213                kmem_cache_free(bh_cachep, bh);
1214        } else {
1215                bh->b_dev = B_FREE;
1216                bh->b_blocknr = -1;
1217                bh->b_this_page = NULL;
1218
1219                nr_unused_buffer_heads++;
1220                bh->b_next_free = unused_list;
1221                unused_list = bh;
1222        }
1223}
1224
1225void put_unused_buffer_head(struct buffer_head *bh)
1226{
1227        spin_lock(&unused_list_lock);
1228        __put_unused_buffer_head(bh);
1229        spin_unlock(&unused_list_lock);
1230}
1231EXPORT_SYMBOL(put_unused_buffer_head);
1232
1233/*
1234 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1235 * no-buffer-head deadlock.  Return NULL on failure; waiting for
1236 * buffer heads is now handled in create_buffers().
1237 */ 
1238struct buffer_head * get_unused_buffer_head(int async)
1239{
1240        struct buffer_head * bh;
1241
1242        spin_lock(&unused_list_lock);
1243        if (nr_unused_buffer_heads > NR_RESERVED) {
1244                bh = unused_list;
1245                unused_list = bh->b_next_free;
1246                nr_unused_buffer_heads--;
1247                spin_unlock(&unused_list_lock);
1248                return bh;
1249        }
1250        spin_unlock(&unused_list_lock);
1251
1252        /* This is critical.  We can't call out to the FS
1253         * to get more buffer heads, because the FS may need
1254         * more buffer-heads itself.  Thus SLAB_NOFS.
1255         */
1256        if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1257                bh->b_blocknr = -1;
1258                bh->b_this_page = NULL;
1259                return bh;
1260        }
1261
1262        /*
1263         * If we need an async buffer, use the reserved buffer heads.
1264         * Non-PF_MEMALLOC tasks can just loop in create_buffers().
1265         */
1266        if (async && (current->flags & PF_MEMALLOC)) {
1267                spin_lock(&unused_list_lock);
1268                if (unused_list) {
1269                        bh = unused_list;
1270                        unused_list = bh->b_next_free;
1271                        nr_unused_buffer_heads--;
1272                        spin_unlock(&unused_list_lock);
1273                        return bh;
1274                }
1275                spin_unlock(&unused_list_lock);
1276        }
1277
1278        return NULL;
1279}
1280EXPORT_SYMBOL(get_unused_buffer_head);
1281
1282void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1283{
1284        if (offset >= PAGE_SIZE)
1285                BUG();
1286
1287        if (PageHighMem(page)) {
1288                bh->b_data = (char *)offset;
1289        } else {
1290                bh->b_data = page_address(page) + offset;
1291        }
1292        bh->b_page = page;
1293}
1294EXPORT_SYMBOL(set_bh_page);
1295
1296/*
1297 * Create the appropriate buffers when given a page for data area and
1298 * the size of each buffer.. Use the bh->b_this_page linked list to
1299 * follow the buffers created.  Return NULL if unable to create more
1300 * buffers.
1301 * The async flag is used to differentiate async IO (paging, swapping)
1302 * from ordinary buffer allocations, and only async requests are allowed
1303 * to sleep waiting for buffer heads. 
1304 */
1305static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1306{
1307        struct buffer_head *bh, *head;
1308        long offset;
1309
1310try_again:
1311        head = NULL;
1312        offset = PAGE_SIZE;
1313        while ((offset -= size) >= 0) {
1314                bh = get_unused_buffer_head(async);
1315                if (!bh)
1316                        goto no_grow;
1317
1318                bh->b_dev = NODEV;
1319                bh->b_this_page = head;
1320                head = bh;
1321
1322                bh->b_state = 0;
1323                bh->b_next_free = NULL;
1324                bh->b_pprev = NULL;
1325                atomic_set(&bh->b_count, 0);
1326                bh->b_size = size;
1327
1328                set_bh_page(bh, page, offset);
1329
1330                bh->b_list = BUF_CLEAN;
1331                bh->b_end_io = NULL;
1332        }
1333        return head;
1334/*
1335 * In case anything failed, we just free everything we got.
1336 */
1337no_grow:
1338        if (head) {
1339                spin_lock(&unused_list_lock);
1340                do {
1341                        bh = head;
1342                        head = head->b_this_page;
1343                        __put_unused_buffer_head(bh);
1344                } while (head);
1345                spin_unlock(&unused_list_lock);
1346
1347                /* Wake up any waiters ... */
1348                wake_up(&buffer_wait);
1349        }
1350
1351        /*
1352         * Return failure for non-async IO requests.  Async IO requests
1353         * are not allowed to fail, so we have to wait until buffer heads
1354         * become available.  But we don't want tasks sleeping with 
1355         * partially complete buffers, so all were released above.
1356         */
1357        if (!async)
1358                return NULL;
1359
1360        /* We're _really_ low on memory. Now we just
1361         * wait for old buffer heads to become free due to
1362         * finishing IO.  Since this is an async request and
1363         * the reserve list is empty, we're sure there are 
1364         * async buffer heads in use.
1365         */
1366        run_task_queue(&tq_disk);
1367
1368        free_more_memory();
1369        goto try_again;
1370}
1371
1372/*
1373 * Called when truncating a buffer on a page completely.
1374 */
1375static void discard_buffer(struct buffer_head * bh)
1376{
1377        if (buffer_mapped(bh) || buffer_delay(bh)) {
1378                mark_buffer_clean(bh);
1379                lock_buffer(bh);
1380                clear_bit(BH_Uptodate, &bh->b_state);
1381                clear_bit(BH_Mapped, &bh->b_state);
1382                clear_bit(BH_Req, &bh->b_state);
1383                clear_bit(BH_New, &bh->b_state);
1384                clear_bit(BH_Delay, &bh->b_state);
1385                remove_from_queues(bh);
1386                unlock_buffer(bh);
1387        }
1388}
1389
1390/**
1391 * try_to_release_page - release old fs-specific metadata on a page
1392 *
1393 */
1394
1395int try_to_release_page(struct page * page, int gfp_mask)
1396{
1397        if (!PageLocked(page))
1398                BUG();
1399        
1400        if (!page->mapping)
1401                goto try_to_free;
1402        if (!page->mapping->a_ops->releasepage)
1403                goto try_to_free;
1404        if (page->mapping->a_ops->releasepage(page, gfp_mask))
1405                goto try_to_free;
1406        /*
1407         * We couldn't release buffer metadata; don't even bother trying
1408         * to release buffers.
1409         */
1410        return 0;
1411try_to_free:    
1412        return try_to_free_buffers(page, gfp_mask);
1413}
1414
1415/*
1416 * We don't have to release all buffers here, but
1417 * we have to be sure that no dirty buffer is left
1418 * and no IO is going on (no buffer is locked), because
1419 * we have truncated the file and are going to free the
1420 * blocks on-disk..
1421 */
1422int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1423{
1424        struct buffer_head *head, *bh, *next;
1425        unsigned int curr_off = 0;
1426
1427        if (!PageLocked(page))
1428                BUG();
1429        if (!page->buffers)
1430                return 1;
1431
1432        head = page->buffers;
1433        bh = head;
1434        do {
1435                unsigned int next_off = curr_off + bh->b_size;
1436                next = bh->b_this_page;
1437
1438                /*
1439                 * is this block fully flushed?
1440                 */
1441                if (offset <= curr_off)
1442                        discard_buffer(bh);
1443                curr_off = next_off;
1444                bh = next;
1445        } while (bh != head);
1446
1447        /*
1448         * subtle. We release buffer-heads only if this is
1449         * the 'final' flushpage. We have invalidated the get_block
1450         * cached value unconditionally, so real IO is not
1451         * possible anymore.
1452         *
1453         * If the free doesn't work out, the buffers can be
1454         * left around - they just turn into anonymous buffers
1455         * instead.
1456         */
1457        if (!offset) {
1458                if (!try_to_release_page(page, 0))
1459                        return 0;
1460        }
1461
1462        return 1;
1463}
1464
1465void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1466{
1467        struct buffer_head *bh, *head, *tail;
1468
1469        /* FIXME: create_buffers should fail if there's no enough memory */
1470        head = create_buffers(page, blocksize, 1);
1471        if (page->buffers)
1472                BUG();
1473
1474        bh = head;
1475        do {
1476                bh->b_dev = dev;
1477                bh->b_blocknr = 0;
1478                bh->b_end_io = NULL;
1479                tail = bh;
1480                bh = bh->b_this_page;
1481        } while (bh);
1482        tail->b_this_page = head;
1483        page->buffers = head;
1484        page_cache_get(page);
1485}
1486EXPORT_SYMBOL(create_empty_buffers);
1487
1488/*
1489 * We are taking a block for data and we don't want any output from any
1490 * buffer-cache aliases starting from return from that function and
1491 * until the moment when something will explicitly mark the buffer
1492 * dirty (hopefully that will not happen until we will free that block ;-)
1493 * We don't even need to mark it not-uptodate - nobody can expect
1494 * anything from a newly allocated buffer anyway. We used to used
1495 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1496 * don't want to mark the alias unmapped, for example - it would confuse
1497 * anyone who might pick it with bread() afterwards...
1498 */
1499
1500static void unmap_underlying_metadata(struct buffer_head * bh)
1501{
1502        struct buffer_head *old_bh;
1503
1504        old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1505        if (old_bh) {
1506                mark_buffer_clean(old_bh);
1507                wait_on_buffer(old_bh);
1508                clear_bit(BH_Req, &old_bh->b_state);
1509                __brelse(old_bh);
1510        }
1511}
1512
1513/*
1514 * NOTE! All mapped/uptodate combinations are valid:
1515 *
1516 *      Mapped  Uptodate        Meaning
1517 *
1518 *      No      No              "unknown" - must do get_block()
1519 *      No      Yes             "hole" - zero-filled
1520 *      Yes     No              "allocated" - allocated on disk, not read in
1521 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1522 *
1523 * "Dirty" is valid only with the last case (mapped+uptodate).
1524 */
1525
1526/*
1527 * block_write_full_page() is SMP threaded - the kernel lock is not held.
1528 */
1529static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1530{
1531        int err, i;
1532        unsigned long block;
1533        struct buffer_head *bh, *head;
1534        int need_unlock;
1535
1536        if (!PageLocked(page))
1537                BUG();
1538
1539        if (!page->buffers)
1540                create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1541        head = page->buffers;
1542
1543        block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1544
1545        bh = head;
1546        i = 0;
1547
1548        /* Stage 1: make sure we have all the buffers mapped! */
1549        do {
1550                /*
1551                 * If the buffer isn't up-to-date, we can't be sure
1552                 * that the buffer has been initialized with the proper
1553                 * block number information etc..
1554                 *
1555                 * Leave it to the low-level FS to make all those
1556                 * decisions (block #0 may actually be a valid block)
1557                 */
1558                if (!buffer_mapped(bh)) {
1559                        err = get_block(inode, block, bh, 1);
1560                        if (err)
1561                                goto out;
1562                        if (buffer_new(bh))
1563                                unmap_underlying_metadata(bh);
1564                }
1565                bh = bh->b_this_page;
1566                block++;
1567        } while (bh != head);
1568
1569        /* Stage 2: lock the buffers, mark them clean */
1570        do {
1571                lock_buffer(bh);
1572                set_buffer_async_io(bh);
1573                set_bit(BH_Uptodate, &bh->b_state);
1574                clear_bit(BH_Dirty, &bh->b_state);
1575                bh = bh->b_this_page;
1576        } while (bh != head);
1577
1578        /* Stage 3: submit the IO */
1579        do {
1580                struct buffer_head *next = bh->b_this_page;
1581                submit_bh(WRITE, bh);
1582                bh = next;
1583        } while (bh != head);
1584
1585        /* Done - end_buffer_io_async will unlock */
1586        SetPageUptodate(page);
1587
1588        wakeup_page_waiters(page);
1589
1590        return 0;
1591
1592out:
1593        /*
1594         * ENOSPC, or some other error.  We may already have added some
1595         * blocks to the file, so we need to write these out to avoid
1596         * exposing stale data.
1597         */
1598        ClearPageUptodate(page);
1599        bh = head;
1600        need_unlock = 1;
1601        /* Recovery: lock and submit the mapped buffers */
1602        do {
1603                if (buffer_mapped(bh)) {
1604                        lock_buffer(bh);
1605                        set_buffer_async_io(bh);
1606                        need_unlock = 0;
1607                }
1608                bh = bh->b_this_page;
1609        } while (bh != head);
1610        do {
1611                struct buffer_head *next = bh->b_this_page;
1612                if (buffer_mapped(bh)) {
1613                        set_bit(BH_Uptodate, &bh->b_state);
1614                        clear_bit(BH_Dirty, &bh->b_state);
1615                        submit_bh(WRITE, bh);
1616                }
1617                bh = next;
1618        } while (bh != head);
1619        if (need_unlock)
1620                UnlockPage(page);
1621        wakeup_page_waiters(page);
1622        return err;
1623}
1624
1625static int __block_prepare_write(struct inode *inode, struct page *page,
1626                unsigned from, unsigned to, get_block_t *get_block)
1627{
1628        unsigned block_start, block_end;
1629        unsigned long block;
1630        int err = 0;
1631        unsigned blocksize, bbits;
1632        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1633        char *kaddr = kmap(page);
1634
1635        blocksize = 1 << inode->i_blkbits;
1636        if (!page->buffers)
1637                create_empty_buffers(page, inode->i_dev, blocksize);
1638        head = page->buffers;
1639
1640        bbits = inode->i_blkbits;
1641        block = page->index << (PAGE_CACHE_SHIFT - bbits);
1642
1643        for(bh = head, block_start = 0; bh != head || !block_start;
1644            block++, block_start=block_end, bh = bh->b_this_page) {
1645                if (!bh)
1646                        BUG();
1647                block_end = block_start+blocksize;
1648                if (block_end <= from)
1649                        continue;
1650                if (block_start >= to)
1651                        break;
1652                clear_bit(BH_New, &bh->b_state);
1653                if (!buffer_mapped(bh)) {
1654                        err = get_block(inode, block, bh, 1);
1655                        if (err)
1656                                goto out;
1657                        if (buffer_new(bh)) {
1658                                unmap_underlying_metadata(bh);
1659                                if (Page_Uptodate(page)) {
1660                                        set_bit(BH_Uptodate, &bh->b_state);
1661                                        continue;
1662                                }
1663                                if (block_end > to)
1664                                        memset(kaddr+to, 0, block_end-to);
1665                                if (block_start < from)
1666                                        memset(kaddr+block_start, 0, from-block_start);
1667                                if (block_end > to || block_start < from)
1668                                        flush_dcache_page(page);
1669                                continue;
1670                        }
1671                }
1672                if (Page_Uptodate(page)) {
1673                        set_bit(BH_Uptodate, &bh->b_state);
1674                        continue; 
1675                }
1676                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1677                     (block_start < from || block_end > to)) {
1678                        ll_rw_block(READ, 1, &bh);
1679                        *wait_bh++=bh;
1680                }
1681        }
1682        /*
1683         * If we issued read requests - let them complete.
1684         */
1685        while(wait_bh > wait) {
1686                wait_on_buffer(*--wait_bh);
1687                if (!buffer_uptodate(*wait_bh))
1688                        return -EIO;
1689        }
1690        return 0;
1691out:
1692        /*
1693         * Zero out any newly allocated blocks to avoid exposing stale
1694         * data.  If BH_New is set, we know that the block was newly
1695         * allocated in the above loop.
1696         *
1697         * Details the buffer can be new and uptodate because:
1698         * 1) hole in uptodate page, get_block(create) allocate the block,
1699         *    so the buffer is new and additionally we also mark it uptodate
1700         * 2) The buffer is not mapped and uptodate due a previous partial read.
1701         *
1702         * We can always ignore uptodate buffers here, if you mark a buffer
1703         * uptodate you must make sure it contains the right data first.
1704         *
1705         * We must stop the "undo/clear" fixup pass not at the caller "to"
1706         * but at the last block that we successfully arrived in the main loop.
1707         */
1708        bh = head;
1709        to = block_start; /* stop at the last successfully handled block */
1710        block_start = 0;
1711        do {
1712                block_end = block_start+blocksize;
1713                if (block_end <= from)
1714                        goto next_bh;
1715                if (block_start >= to)
1716                        break;
1717                if (buffer_new(bh) && !buffer_uptodate(bh)) {
1718                        memset(kaddr+block_start, 0, bh->b_size);
1719                        flush_dcache_page(page);
1720                        set_bit(BH_Uptodate, &bh->b_state);
1721                        mark_buffer_dirty(bh);
1722                }
1723next_bh:
1724                block_start = block_end;
1725                bh = bh->b_this_page;
1726        } while (bh != head);
1727        return err;
1728}
1729
1730static int __block_commit_write(struct inode *inode, struct page *page,
1731                unsigned from, unsigned to)
1732{
1733        unsigned block_start, block_end;
1734        int partial = 0, need_balance_dirty = 0;
1735        unsigned blocksize;
1736        struct buffer_head *bh, *head;
1737
1738        blocksize = 1 << inode->i_blkbits;
1739
1740        for(bh = head = page->buffers, block_start = 0;
1741            bh != head || !block_start;
1742            block_start=block_end, bh = bh->b_this_page) {
1743                block_end = block_start + blocksize;
1744                if (block_end <= from || block_start >= to) {
1745                        if (!buffer_uptodate(bh))
1746                                partial = 1;
1747                } else {
1748                        set_bit(BH_Uptodate, &bh->b_state);
1749                        if (!atomic_set_buffer_dirty(bh)) {
1750                                __mark_dirty(bh);
1751                                buffer_insert_inode_data_queue(bh, inode);
1752                                need_balance_dirty = 1;
1753                        }
1754                }
1755        }
1756
1757        if (need_balance_dirty)
1758                balance_dirty();
1759        /*
1760         * is this a partial write that happened to make all buffers
1761         * uptodate then we can optimize away a bogus readpage() for
1762         * the next read(). Here we 'discover' wether the page went
1763         * uptodate as a result of this (potentially partial) write.
1764         */
1765        if (!partial)
1766                SetPageUptodate(page);
1767        return 0;
1768}
1769
1770/*
1771 * Generic "read page" function for block devices that have the normal
1772 * get_block functionality. This is most of the block device filesystems.
1773 * Reads the page asynchronously --- the unlock_buffer() and
1774 * mark_buffer_uptodate() functions propagate buffer state into the
1775 * page struct once IO has completed.
1776 */
1777int block_read_full_page(struct page *page, get_block_t *get_block)
1778{
1779        struct inode *inode = page->mapping->host;
1780        unsigned long iblock, lblock;
1781        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1782        unsigned int blocksize, blocks;
1783        int nr, i;
1784
1785        if (!PageLocked(page))
1786                PAGE_BUG(page);
1787        blocksize = 1 << inode->i_blkbits;
1788        if (!page->buffers)
1789                create_empty_buffers(page, inode->i_dev, blocksize);
1790        head = page->buffers;
1791
1792        blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1793        iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794        lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1795        bh = head;
1796        nr = 0;
1797        i = 0;
1798
1799        do {
1800                if (buffer_uptodate(bh))
1801                        continue;
1802
1803                if (!buffer_mapped(bh)) {
1804                        if (iblock < lblock) {
1805                                if (get_block(inode, iblock, bh, 0))
1806                                        SetPageError(page);
1807                        }
1808                        if (!buffer_mapped(bh)) {
1809                                memset(kmap(page) + i*blocksize, 0, blocksize);
1810                                flush_dcache_page(page);
1811                                kunmap(page);
1812                                set_bit(BH_Uptodate, &bh->b_state);
1813                                continue;
1814                        }
1815                        /* get_block() might have updated the buffer synchronously */
1816                        if (buffer_uptodate(bh))
1817                                continue;
1818                }
1819
1820                arr[nr] = bh;
1821                nr++;
1822        } while (i++, iblock++, (bh = bh->b_this_page) != head);
1823
1824        if (!nr) {
1825                /*
1826                 * All buffers are uptodate - we can set the page uptodate
1827                 * as well. But not if get_block() returned an error.
1828                 */
1829                if (!PageError(page))
1830                        SetPageUptodate(page);
1831                UnlockPage(page);
1832                return 0;
1833        }
1834
1835        /* Stage two: lock the buffers */
1836        for (i = 0; i < nr; i++) {
1837                struct buffer_head * bh = arr[i];
1838                lock_buffer(bh);
1839                set_buffer_async_io(bh);
1840        }
1841
1842        /* Stage 3: start the IO */
1843        for (i = 0; i < nr; i++) {
1844                struct buffer_head * bh = arr[i];
1845                if (buffer_uptodate(bh))
1846                        end_buffer_io_async(bh, 1);
1847                else
1848                        submit_bh(READ, bh);
1849        }
1850
1851        wakeup_page_waiters(page);
1852        
1853        return 0;
1854}
1855
1856/* utility function for filesystems that need to do work on expanding
1857 * truncates.  Uses prepare/commit_write to allow the filesystem to
1858 * deal with the hole.  
1859 */
1860int generic_cont_expand(struct inode *inode, loff_t size)
1861{
1862        struct address_space *mapping = inode->i_mapping;
1863        struct page *page;
1864        unsigned long index, offset, limit;
1865        int err;
1866
1867        err = -EFBIG;
1868        limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1869        if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1870                send_sig(SIGXFSZ, current, 0);
1871                goto out;
1872        }
1873        if (size > inode->i_sb->s_maxbytes)
1874                goto out;
1875
1876        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1877
1878        /* ugh.  in prepare/commit_write, if from==to==start of block, we 
1879        ** skip the prepare.  make sure we never send an offset for the start
1880        ** of a block
1881        */
1882        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1883                offset++;
1884        }
1885        index = size >> PAGE_CACHE_SHIFT;
1886        err = -ENOMEM;
1887        page = grab_cache_page(mapping, index);
1888        if (!page)
1889                goto out;
1890        err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1891        if (!err) {
1892                err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1893        }
1894        UnlockPage(page);
1895        page_cache_release(page);
1896        if (err > 0)
1897                err = 0;
1898out:
1899        return err;
1900}
1901
1902/*
1903 * For moronic filesystems that do not allow holes in file.
1904 * We may have to extend the file.
1905 */
1906
1907int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1908{
1909        struct address_space *mapping = page->mapping;
1910        struct inode *inode = mapping->host;
1911        struct page *new_page;
1912        unsigned long pgpos;
1913        long status;
1914        unsigned zerofrom;
1915        unsigned blocksize = 1 << inode->i_blkbits;
1916        char *kaddr;
1917
1918        while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1919                status = -ENOMEM;
1920                new_page = grab_cache_page(mapping, pgpos);
1921                if (!new_page)
1922                        goto out;
1923                /* we might sleep */
1924                if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1925                        UnlockPage(new_page);
1926                        page_cache_release(new_page);
1927                        continue;
1928                }
1929                zerofrom = *bytes & ~PAGE_CACHE_MASK;
1930                if (zerofrom & (blocksize-1)) {
1931                        *bytes |= (blocksize-1);
1932                        (*bytes)++;
1933                }
1934                status = __block_prepare_write(inode, new_page, zerofrom,
1935                                                PAGE_CACHE_SIZE, get_block);
1936                if (status)
1937                        goto out_unmap;
1938                kaddr = page_address(new_page);
1939                memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1940                flush_dcache_page(new_page);
1941                __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1942                kunmap(new_page);
1943                UnlockPage(new_page);
1944                page_cache_release(new_page);
1945        }
1946
1947        if (page->index < pgpos) {
1948                /* completely inside the area */
1949                zerofrom = offset;
1950        } else {
1951                /* page covers the boundary, find the boundary offset */
1952                zerofrom = *bytes & ~PAGE_CACHE_MASK;
1953
1954                /* if we will expand the thing last block will be filled */
1955                if (to > zerofrom && (zerofrom & (blocksize-1))) {
1956                        *bytes |= (blocksize-1);
1957                        (*bytes)++;
1958                }
1959
1960                /* starting below the boundary? Nothing to zero out */
1961                if (offset <= zerofrom)
1962                        zerofrom = offset;
1963        }
1964        status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1965        if (status)
1966                goto out1;
1967        kaddr = page_address(page);
1968        if (zerofrom < offset) {
1969                memset(kaddr+zerofrom, 0, offset-zerofrom);
1970                flush_dcache_page(page);
1971                __block_commit_write(inode, page, zerofrom, offset);
1972        }
1973        return 0;
1974out1:
1975        ClearPageUptodate(page);
1976        kunmap(page);
1977        return status;
1978
1979out_unmap:
1980        ClearPageUptodate(new_page);
1981        kunmap(new_page);
1982        UnlockPage(new_page);
1983        page_cache_release(new_page);
1984out:
1985        return status;
1986}
1987
1988int block_prepare_write(struct page *page, unsigned from, unsigned to,
1989                        get_block_t *get_block)
1990{
1991        struct inode *inode = page->mapping->host;
1992        int err = __block_prepare_write(inode, page, from, to, get_block);
1993        if (err) {
1994                ClearPageUptodate(page);
1995                kunmap(page);
1996        }
1997        return err;
1998}
1999
2000int block_commit_write(struct page *page, unsigned from, unsigned to)
2001{
2002        struct inode *inode = page->mapping->host;
2003        __block_commit_write(inode,page,from,to);
2004        kunmap(page);
2005        return 0;
2006}
2007
2008int generic_commit_write(struct file *file, struct page *page,
2009                unsigned from, unsigned to)
2010{
2011        struct inode *inode = page->mapping->host;
2012        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2013        __block_commit_write(inode,page,from,to);
2014        kunmap(page);
2015        if (pos > inode->i_size) {
2016                inode->i_size = pos;
2017                mark_inode_dirty(inode);
2018        }
2019        return 0;
2020}
2021
2022int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2023{
2024        unsigned long index = from >> PAGE_CACHE_SHIFT;
2025        unsigned offset = from & (PAGE_CACHE_SIZE-1);
2026        unsigned blocksize, iblock, length, pos;
2027        struct inode *inode = mapping->host;
2028        struct page *page;
2029        struct buffer_head *bh;
2030        int err;
2031
2032        blocksize = 1 << inode->i_blkbits;
2033        length = offset & (blocksize - 1);
2034
2035        /* Block boundary? Nothing to do */
2036        if (!length)
2037                return 0;
2038
2039        length = blocksize - length;
2040        iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2041        
2042        page = grab_cache_page(mapping, index);
2043        err = -ENOMEM;
2044        if (!page)
2045                goto out;
2046
2047        if (!page->buffers)
2048                create_empty_buffers(page, inode->i_dev, blocksize);
2049
2050        /* Find the buffer that contains "offset" */
2051        bh = page->buffers;
2052        pos = blocksize;
2053        while (offset >= pos) {
2054                bh = bh->b_this_page;
2055                iblock++;
2056                pos += blocksize;
2057        }
2058
2059        err = 0;
2060        if (!buffer_mapped(bh)) {
2061                /* Hole? Nothing to do */
2062                if (buffer_uptodate(bh))
2063                        goto unlock;
2064                get_block(inode, iblock, bh, 0);
2065                /* Still unmapped? Nothing to do */
2066                if (!buffer_mapped(bh))
2067                        goto unlock;
2068        }
2069
2070        /* Ok, it's mapped. Make sure it's up-to-date */
2071        if (Page_Uptodate(page))
2072                set_bit(BH_Uptodate, &bh->b_state);
2073
2074        if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2075                err = -EIO;
2076                ll_rw_block(READ, 1, &bh);
2077                wait_on_buffer(bh);
2078                /* Uhhuh. Read error. Complain and punt. */
2079                if (!buffer_uptodate(bh))
2080                        goto unlock;
2081        }
2082
2083        memset(kmap(page) + offset, 0, length);
2084        flush_dcache_page(page);
2085        kunmap(page);
2086
2087        if (!atomic_set_buffer_dirty(bh)) {
2088                __mark_dirty(bh);
2089                buffer_insert_inode_data_queue(bh, inode);
2090                balance_dirty();
2091        }
2092
2093        err = 0;
2094
2095unlock:
2096        UnlockPage(page);
2097        page_cache_release(page);
2098out:
2099        return err;
2100}
2101
2102int block_write_full_page(struct page *page, get_block_t *get_block)
2103{
2104        struct inode *inode = page->mapping->host;
2105        unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2106        unsigned offset;
2107        int err;
2108
2109        /* easy case */
2110        if (page->index < end_index)
2111                return __block_write_full_page(inode, page, get_block);
2112
2113        /* things got complicated... */
2114        offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2115        /* OK, are we completely out? */
2116        if (page->index >= end_index+1 || !offset) {
2117                UnlockPage(page);
2118                return -EIO;
2119        }
2120
2121        /* Sigh... will have to work, then... */
2122        err = __block_prepare_write(inode, page, 0, offset, get_block);
2123        if (!err) {
2124                memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2125                flush_dcache_page(page);
2126                __block_commit_write(inode,page,0,offset);
2127done:
2128                kunmap(page);
2129                UnlockPage(page);
2130                return err;
2131        }
2132        ClearPageUptodate(page);
2133        goto done;
2134}
2135
2136/*
2137 * Commence writeout of all the buffers against a page.  The
2138 * page must be locked.   Returns zero on success or a negative
2139 * errno.
2140 */
2141int writeout_one_page(struct page *page)
2142{
2143        struct buffer_head *bh, *head = page->buffers;
2144
2145        if (!PageLocked(page))
2146                BUG();
2147        bh = head;
2148        do {
2149                if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2150                        continue;
2151
2152                bh->b_flushtime = jiffies;
2153                ll_rw_block(WRITE, 1, &bh);     
2154        } while ((bh = bh->b_this_page) != head);
2155        return 0;
2156}
2157EXPORT_SYMBOL(writeout_one_page);
2158
2159/*
2160 * Wait for completion of I/O of all buffers against a page.  The page
2161 * must be locked.  Returns zero on success or a negative errno.
2162 */
2163int waitfor_one_page(struct page *page)
2164{
2165        int error = 0;
2166        struct buffer_head *bh, *head = page->buffers;
2167
2168        bh = head;
2169        do {
2170                wait_on_buffer(bh);
2171                if (buffer_req(bh) && !buffer_uptodate(bh))
2172                        error = -EIO;
2173        } while ((bh = bh->b_this_page) != head);
2174        return error;
2175}
2176EXPORT_SYMBOL(waitfor_one_page);
2177
2178int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2179{
2180        struct buffer_head tmp;
2181        struct inode *inode = mapping->host;
2182        tmp.b_state = 0;
2183        tmp.b_blocknr = 0;
2184        get_block(inode, block, &tmp, 0);
2185        return tmp.b_blocknr;
2186}
2187
2188int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2189{
2190        int i, nr_blocks, retval;
2191        unsigned long * blocks = iobuf->blocks;
2192        int length;
2193        int beyond_eof = 0;
2194        
2195        length = iobuf->length;
2196        nr_blocks = length / blocksize;
2197        /* build the blocklist */
2198        for (i = 0; i < nr_blocks; i++, blocknr++) {
2199                struct buffer_head bh;
2200
2201                bh.b_state = 0;
2202                bh.b_dev = inode->i_dev;
2203                bh.b_size = blocksize;
2204                bh.b_page = NULL;
2205
2206                if (((loff_t) blocknr) * blocksize >= inode->i_size)
2207                        beyond_eof = 1;
2208
2209                /* Only allow get_block to create new blocks if we are safely
2210                   beyond EOF.  O_DIRECT is unsafe inside sparse files. */
2211                retval = get_block(inode, blocknr, &bh, 
2212                                   ((rw != READ) && beyond_eof));
2213
2214                if (retval) {
2215                        if (!i)
2216                                /* report error to userspace */
2217                                goto out;
2218                        else
2219                                /* do short I/O until 'i' */
2220                                break;
2221                }
2222
2223                if (rw == READ) {
2224                        if (buffer_new(&bh))
2225                                BUG();
2226                        if (!buffer_mapped(&bh)) {
2227                                /* there was an hole in the filesystem */
2228                                blocks[i] = -1UL;
2229                                continue;
2230                        }
2231                } else {
2232                        if (buffer_new(&bh))
2233                                unmap_underlying_metadata(&bh);
2234                        if (!buffer_mapped(&bh))
2235                                /* upper layers need to pass the error on or
2236                                 * fall back to buffered IO. */
2237                                return -ENOTBLK;
2238                }
2239                blocks[i] = bh.b_blocknr;
2240        }
2241
2242        /* patch length to handle short I/O */
2243        iobuf->length = i * blocksize;
2244        if (!beyond_eof)
2245                up(&inode->i_sem);
2246        retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2247        if (!beyond_eof)
2248                down(&inode->i_sem);
2249        /* restore orig length */
2250        iobuf->length = length;
2251 out:
2252
2253        return retval;
2254}
2255
2256/*
2257 * IO completion routine for a buffer_head being used for kiobuf IO: we
2258 * can't dispatch the kiobuf callback until io_count reaches 0.  
2259 */
2260
2261static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2262{
2263        struct kiobuf *kiobuf;
2264        
2265        mark_buffer_uptodate(bh, uptodate);
2266
2267        kiobuf = bh->b_private;
2268        end_kio_request(kiobuf, uptodate);
2269        unlock_buffer(bh);
2270}
2271
2272/*
2273 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2274 * for them to complete.  Clean up the buffer_heads afterwards.  
2275 */
2276
2277static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2278{
2279        int iosize, err;
2280        int i;
2281        struct buffer_head *tmp;
2282
2283        iosize = 0;
2284        err = 0;
2285
2286        for (i = nr; --i >= 0; ) {
2287                iosize += size;
2288                tmp = bh[i];
2289                wait_on_buffer(tmp);
2290                
2291                if (!buffer_uptodate(tmp)) {
2292                        /* We are traversing bh'es in reverse order so
2293                           clearing iosize on error calculates the
2294                           amount of IO before the first error. */
2295                        iosize = 0;
2296                        err = -EIO;
2297                }
2298        }
2299        
2300        if (iosize)
2301                return iosize;
2302        return err;
2303}
2304
2305/*
2306 * Start I/O on a physical range of kernel memory, defined by a vector
2307 * of kiobuf structs (much like a user-space iovec list).
2308 *
2309 * The kiobuf must already be locked for IO.  IO is submitted
2310 * asynchronously: you need to check page->locked and page->uptodate.
2311 *
2312 * It is up to the caller to make sure that there are enough blocks
2313 * passed in to completely map the iobufs to disk.
2314 */
2315
2316int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
2317               kdev_t dev, unsigned long b[], int size)
2318{
2319        int             err;
2320        int             length;
2321        int             transferred;
2322        int             i;
2323        int             bufind;
2324        int             pageind;
2325        int             bhind;
2326        int             offset;
2327        unsigned long   blocknr;
2328        struct kiobuf * iobuf = NULL;
2329        struct page *   map;
2330        struct buffer_head *tmp, **bhs = NULL;
2331
2332        if (!nr)
2333                return 0;
2334        
2335        /* 
2336         * First, do some alignment and validity checks 
2337         */
2338        for (i = 0; i < nr; i++) {
2339                iobuf = iovec[i];
2340                if ((iobuf->offset & (size-1)) ||
2341                    (iobuf->length & (size-1)))
2342                        return -EINVAL;
2343                if (!iobuf->nr_pages)
2344                        panic("brw_kiovec: iobuf not initialised");
2345        }
2346
2347        /* 
2348         * OK to walk down the iovec doing page IO on each page we find. 
2349         */
2350        bufind = bhind = transferred = err = 0;
2351        for (i = 0; i < nr; i++) {
2352                iobuf = iovec[i];
2353                offset = iobuf->offset;
2354                length = iobuf->length;
2355                iobuf->errno = 0;
2356                if (!bhs)
2357                        bhs = iobuf->bh;
2358                
2359                for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2360                        map  = iobuf->maplist[pageind];
2361                        if (!map) {
2362                                err = -EFAULT;
2363                                goto finished;
2364                        }
2365                        
2366                        while (length > 0) {
2367                                blocknr = b[bufind++];
2368                                if (blocknr == -1UL) {
2369                                        if (rw == READ) {
2370                                                /* there was an hole in the filesystem */
2371                                                memset(kmap(map) + offset, 0, size);
2372                                                flush_dcache_page(map);
2373                                                kunmap(map);
2374
2375                                                transferred += size;
2376                                                goto skip_block;
2377                                        } else
2378                                                BUG();
2379                                }
2380                                tmp = bhs[bhind++];
2381
2382                                tmp->b_size = size;
2383                                set_bh_page(tmp, map, offset);
2384                                tmp->b_this_page = tmp;
2385
2386                                init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2387                                tmp->b_dev = dev;
2388                                tmp->b_blocknr = blocknr;
2389                                tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2390
2391                                if (rw == WRITE) {
2392                                        set_bit(BH_Uptodate, &tmp->b_state);
2393                                        clear_bit(BH_Dirty, &tmp->b_state);
2394                                } else
2395                                        set_bit(BH_Uptodate, &tmp->b_state);
2396
2397                                atomic_inc(&iobuf->io_count);
2398                                submit_bh(rw, tmp);
2399                                /* 
2400                                 * Wait for IO if we have got too much 
2401                                 */
2402                                if (bhind >= KIO_MAX_SECTORS) {
2403                                        kiobuf_wait_for_io(iobuf); /* wake-one */
2404                                        err = wait_kio(rw, bhind, bhs, size);
2405                                        if (err >= 0)
2406                                                transferred += err;
2407                                        else
2408                                                goto finished;
2409                                        bhind = 0;
2410                                }
2411
2412                        skip_block:
2413                                length -= size;
2414                                offset += size;
2415
2416                                if (offset >= PAGE_SIZE) {
2417                                        offset = 0;
2418                                        break;
2419                                }
2420                        } /* End of block loop */
2421                } /* End of page loop */                
2422        } /* End of iovec loop */
2423
2424        /* Is there any IO still left to submit? */
2425        if (bhind) {
2426                kiobuf_wait_for_io(iobuf); /* wake-one */
2427                err = wait_kio(rw, bhind, bhs, size);
2428                if (err >= 0)
2429                        transferred += err;
2430                else
2431                        goto finished;
2432        }
2433
2434 finished:
2435        if (transferred)
2436                return transferred;
2437        return err;
2438}
2439
2440/*
2441 * Start I/O on a page.
2442 * This function expects the page to be locked and may return
2443 * before I/O is complete. You then have to check page->locked
2444 * and page->uptodate.
2445 *
2446 * brw_page() is SMP-safe, although it's being called with the
2447 * kernel lock held - but the code is ready.
2448 *
2449 * FIXME: we need a swapper_inode->get_block function to remove
2450 *        some of the bmap kludges and interface ugliness here.
2451 */
2452int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2453{
2454        struct buffer_head *head, *bh;
2455
2456        if (!PageLocked(page))
2457                panic("brw_page: page not locked for I/O");
2458
2459        if (!page->buffers)
2460                create_empty_buffers(page, dev, size);
2461        head = bh = page->buffers;
2462
2463        /* Stage 1: lock all the buffers */
2464        do {
2465                lock_buffer(bh);
2466                bh->b_blocknr = *(b++);
2467                set_bit(BH_Mapped, &bh->b_state);
2468                set_buffer_async_io(bh);
2469                bh = bh->b_this_page;
2470        } while (bh != head);
2471
2472        /* Stage 2: start the IO */
2473        do {
2474                struct buffer_head *next = bh->b_this_page;
2475                submit_bh(rw, bh);
2476                bh = next;
2477        } while (bh != head);
2478        wakeup_page_waiters(page);
2479        return 0;
2480}
2481
2482int block_symlink(struct inode *inode, const char *symname, int len)
2483{
2484        struct address_space *mapping = inode->i_mapping;
2485        struct page *page = grab_cache_page(mapping, 0);
2486        int err = -ENOMEM;
2487        char *kaddr;
2488
2489        if (!page)
2490                goto fail;
2491        err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2492        if (err)
2493                goto fail_map;
2494        kaddr = page_address(page);
2495        memcpy(kaddr, symname, len-1);
2496        mapping->a_ops->commit_write(NULL, page, 0, len-1);
2497        /*
2498         * Notice that we are _not_ going to block here - end of page is
2499         * unmapped, so this will only try to map the rest of page, see
2500         * that it is unmapped (typically even will not look into inode -
2501         * ->i_size will be enough for everything) and zero it out.
2502         * OTOH it's obviously correct and should make the page up-to-date.
2503         */
2504        err = mapping->a_ops->readpage(NULL, page);
2505        wait_on_page(page);
2506        page_cache_release(page);
2507        if (err < 0)
2508                goto fail;
2509        mark_inode_dirty(inode);
2510        return 0;
2511fail_map:
2512        UnlockPage(page);
2513        page_cache_release(page);
2514fail:
2515        return err;
2516}
2517
2518static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2519{
2520        struct buffer_head *bh, *tail;
2521
2522        bh = head;
2523        do {
2524                tail = bh;
2525                bh = bh->b_this_page;
2526        } while (bh);
2527        tail->b_this_page = head;
2528        page->buffers = head;
2529        page_cache_get(page);
2530}
2531
2532/*
2533 * Create the page-cache page that contains the requested block
2534 */
2535static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2536{
2537        struct page * page;
2538        struct buffer_head *bh;
2539
2540        page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2541        if (!page)
2542                return NULL;
2543
2544        if (!PageLocked(page))
2545                BUG();
2546
2547        bh = page->buffers;
2548        if (bh) {
2549                if (bh->b_size == size)
2550                        return page;
2551                if (!try_to_free_buffers(page, GFP_NOFS))
2552                        goto failed;
2553        }
2554
2555        bh = create_buffers(page, size, 0);
2556        if (!bh)
2557                goto failed;
2558        link_dev_buffers(page, bh);
2559        return page;
2560
2561failed:
2562        UnlockPage(page);
2563        page_cache_release(page);
2564        return NULL;
2565}
2566
2567static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2568{
2569        struct buffer_head *head = page->buffers;
2570        struct buffer_head *bh = head;
2571        unsigned int uptodate;
2572
2573        uptodate = 1 << BH_Mapped;
2574        if (Page_Uptodate(page))
2575                uptodate |= 1 << BH_Uptodate;
2576
2577        write_lock(&hash_table_lock);
2578        do {
2579                if (!(bh->b_state & (1 << BH_Mapped))) {
2580                        init_buffer(bh, NULL, NULL);
2581                        bh->b_dev = dev;
2582                        bh->b_blocknr = block;
2583                        bh->b_state = uptodate;
2584                }
2585
2586                /* Insert the buffer into the hash lists if necessary */
2587                if (!bh->b_pprev)
2588                        __insert_into_hash_list(bh);
2589
2590                block++;
2591                bh = bh->b_this_page;
2592        } while (bh != head);
2593        write_unlock(&hash_table_lock);
2594}
2595
2596/*
2597 * Try to increase the number of buffers available: the size argument
2598 * is used to determine what kind of buffers we want.
2599 */
2600static int grow_buffers(kdev_t dev, unsigned long block, int size)
2601{
2602        struct page * page;
2603        struct block_device *bdev;
2604        unsigned long index;
2605        int sizebits;
2606
2607        /* Size must be multiple of hard sectorsize */
2608        if (size & (get_hardsect_size(dev)-1))
2609                BUG();
2610        /* Size must be within 512 bytes and PAGE_SIZE */
2611        if (size < 512 || size > PAGE_SIZE)
2612                BUG();
2613
2614        sizebits = -1;
2615        do {
2616                sizebits++;
2617        } while ((size << sizebits) < PAGE_SIZE);
2618
2619        index = block >> sizebits;
2620        block = index << sizebits;
2621
2622        bdev = bdget(kdev_t_to_nr(dev));
2623        if (!bdev) {
2624                printk("No block device for %s\n", kdevname(dev));
2625                BUG();
2626        }
2627
2628        /* Create a page with the proper size buffers.. */
2629        page = grow_dev_page(bdev, index, size);
2630
2631        /* This is "wrong" - talk to Al Viro */
2632        atomic_dec(&bdev->bd_count);
2633        if (!page)
2634                return 0;
2635
2636        /* Hash in the buffers on the hash list */
2637        hash_page_buffers(page, dev, block, size);
2638        UnlockPage(page);
2639        page_cache_release(page);
2640
2641        /* We hashed up this page, so increment buffermem */
2642        atomic_inc(&buffermem_pages);
2643        return 1;
2644}
2645
2646/*
2647 * The first time the VM inspects a page which has locked buffers, it
2648 * will just mark it as needing waiting upon on the scan of the page LRU.
2649 * BH_Wait_IO is used for this.
2650 *
2651 * The second time the VM visits the page, if it still has locked
2652 * buffers, it is time to start writing them out.  (BH_Wait_IO was set).
2653 *
2654 * The third time the VM visits the page, if the I/O hasn't completed
2655 * then it's time to wait upon writeout.  BH_Lock and BH_Launder are
2656 * used for this.
2657 *
2658 * There is also the case of buffers which were locked by someone else
2659 * - write(2) callers, bdflush, etc.  There can be a huge number of these
2660 * and we don't want to just skip them all and fail the page allocation. 
2661 * We want to be able to wait on these buffers as well.
2662 *
2663 * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2664 * underway against the buffer, doesn't matter who started it - we know
2665 * that the buffer will eventually come unlocked, and so it's safe to
2666 * wait on it.
2667 *
2668 * The caller holds the page lock and the caller will free this page
2669 * into current->local_page, so by waiting on the page's buffers the
2670 * caller is guaranteed to obtain this page.
2671 *
2672 * sync_page_buffers() will sort-of return true if all the buffers
2673 * against this page are freeable, so try_to_free_buffers() should
2674 * try to free the page's buffers a second time.  This is a bit
2675 * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly.
2676 */
2677static int sync_page_buffers(struct buffer_head *head)
2678{
2679        struct buffer_head * bh = head;
2680        int tryagain = 1;
2681
2682        do {
2683                if (!buffer_dirty(bh) && !buffer_locked(bh))
2684                        continue;
2685
2686                /* Don't start IO first time around.. */
2687                if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2688                        tryagain = 0;
2689                        continue;
2690                }
2691
2692                /* Second time through we start actively writing out.. */
2693                if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2694                        if (unlikely(!buffer_launder(bh))) {
2695                                tryagain = 0;
2696                                continue;
2697                        }
2698                        wait_on_buffer(bh);
2699                        tryagain = 1;
2700                        continue;
2701                }
2702
2703                if (!atomic_set_buffer_clean(bh)) {
2704                        unlock_buffer(bh);
2705                        continue;
2706                }
2707
2708                __mark_buffer_clean(bh);
2709                get_bh(bh);
2710                bh->b_end_io = end_buffer_io_sync;
2711                submit_bh(WRITE, bh);
2712                tryagain = 0;
2713        } while ((bh = bh->b_this_page) != head);
2714
2715        return tryagain;
2716}
2717
/*
 * Can the buffer be thrown out?
 *
 * buffer_busy() is non-zero when b_count is non-zero or when either of
 * the BH_Dirty / BH_Lock state bits is set.
 */
#define BUFFER_BUSY_BITS	((1<<BH_Lock) | (1<<BH_Dirty))
#define buffer_busy(bh)		(((bh)->b_state & BUFFER_BUSY_BITS) | atomic_read(&(bh)->b_count))
2723
2724/*
2725 * try_to_free_buffers() checks if all the buffers on this particular page
2726 * are unused, and free's the page if so.
2727 *
2728 * Wake up bdflush() if this fails - if we're running low on memory due
2729 * to dirty buffers, we need to flush them out as quickly as possible.
2730 *
2731 * NOTE: There are quite a number of ways that threads of control can
2732 *       obtain a reference to a buffer head within a page.  So we must
2733 *       lock out all of these paths to cleanly toss the page.
2734 */
2735int fastcall try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2736{
2737        struct buffer_head * tmp, * bh = page->buffers;
2738
2739cleaned_buffers_try_again:
2740        spin_lock(&lru_list_lock);
2741        write_lock(&hash_table_lock);
2742        tmp = bh;
2743        do {
2744                if (buffer_busy(tmp))
2745                        goto busy_buffer_page;
2746                tmp = tmp->b_this_page;
2747        } while (tmp != bh);
2748
2749        spin_lock(&unused_list_lock);
2750        tmp = bh;
2751
2752        /* if this buffer was hashed, this page counts as buffermem */
2753        if (bh->b_pprev)
2754                atomic_dec(&buffermem_pages);
2755        do {
2756                struct buffer_head * p = tmp;
2757                tmp = tmp->b_this_page;
2758
2759                if (p->b_dev == B_FREE) BUG();
2760
2761                remove_inode_queue(p);
2762                __remove_from_queues(p);
2763                __put_unused_buffer_head(p);
2764        } while (tmp != bh);
2765        spin_unlock(&unused_list_lock);
2766
2767        /* Wake up anyone waiting for buffer heads */
2768        wake_up(&buffer_wait);
2769
2770        /* And free the page */
2771        page->buffers = NULL;
2772        page_cache_release(page);
2773        write_unlock(&hash_table_lock);
2774        spin_unlock(&lru_list_lock);
2775        return 1;
2776
2777busy_buffer_page:
2778        /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2779        write_unlock(&hash_table_lock);
2780        spin_unlock(&lru_list_lock);
2781        gfp_mask = pf_gfp_mask(gfp_mask);
2782        if (gfp_mask & __GFP_IO) {
2783                if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2784                        if (sync_page_buffers(bh)) {
2785                                /* no IO or waiting next time */
2786                                gfp_mask = 0;
2787                                goto cleaned_buffers_try_again;
2788                        }
2789                }
2790        }
2791        if (balance_dirty_state() >= 0)
2792                wakeup_bdflush();
2793        return 0;
2794}
2795EXPORT_SYMBOL(try_to_free_buffers);
2796
2797/* ================== Debugging =================== */
2798
2799void show_buffers(void)
2800{
2801#ifdef CONFIG_SMP
2802        struct buffer_head * bh;
2803        int delalloc = 0, found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2804        int nlist;
2805        static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2806#endif
2807
2808        printk("Buffer memory:   %6dkB\n",
2809                atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2810
2811        printk("Cache memory:   %6ldkB\n",
2812                (page_cache_size - atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2813
2814#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2815        if (!spin_trylock(&lru_list_lock))
2816                return;
2817        for(nlist = 0; nlist < NR_LIST; nlist++) {
2818                delalloc = found = locked = dirty = used = lastused = 0;
2819                bh = lru_list[nlist];
2820                if(!bh) continue;
2821
2822                do {
2823                        found++;
2824                        if (buffer_locked(bh))
2825                                locked++;
2826                        if (buffer_dirty(bh))
2827                                dirty++;
2828                        if (buffer_delay(bh))
2829                                delalloc++;
2830                        if (atomic_read(&bh->b_count))
2831                                used++, lastused = found;
2832                        bh = bh->b_next_free;
2833                } while (bh != lru_list[nlist]);
2834                {
2835                        int tmp = nr_buffers_type[nlist];
2836                        if (found != tmp)
2837                                printk("%9s: BUG -> found %d, reported %d\n",
2838                                       buf_types[nlist], found, tmp);
2839                }
2840                printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2841                       "%d locked, %d dirty, %d delay\n",
2842                       buf_types[nlist], found, size_buffers_type[nlist]>>10,
2843                       used, lastused, locked, dirty, delalloc);
2844        }
2845        spin_unlock(&lru_list_lock);
2846#endif
2847}
2848
2849/* ===================== Init ======================= */
2850
2851/*
2852 * allocate the hash table and init the free list
2853 * Use gfp() for the hash table to decrease TLB misses, use
2854 * SLAB cache for buffer heads.
2855 */
2856void __init buffer_init(unsigned long mempages)
2857{
2858        int order, i;
2859        unsigned int nr_hash;
2860
2861        /* The buffer cache hash table is less important these days,
2862         * trim it a bit.
2863         */
2864        mempages >>= 14;
2865
2866        mempages *= sizeof(struct buffer_head *);
2867
2868        for (order = 0; (1 << order) < mempages; order++)
2869                ;
2870
2871        /* try to allocate something until we get it or we're asking
2872           for something that is really too small */
2873
2874        do {
2875                unsigned long tmp;
2876
2877                nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2878                bh_hash_mask = (nr_hash - 1);
2879
2880                tmp = nr_hash;
2881                bh_hash_shift = 0;
2882                while((tmp >>= 1UL) != 0UL)
2883                        bh_hash_shift++;
2884
2885                hash_table = (struct buffer_head **)
2886                    __get_free_pages(GFP_ATOMIC, order);
2887        } while (hash_table == NULL && --order > 0);
2888        printk(KERN_INFO "Buffer cache hash table entries: %d (order: %d, %ld bytes)\n",
2889               nr_hash, order, (PAGE_SIZE << order));
2890
2891        if (!hash_table)
2892                panic("Failed to allocate buffer hash table\n");
2893
2894        /* Setup hash chains. */
2895        for(i = 0; i < nr_hash; i++)
2896                hash_table[i] = NULL;
2897
2898        /* Setup lru lists. */
2899        for(i = 0; i < NR_LIST; i++)
2900                lru_list[i] = NULL;
2901
2902}
2903
2904
2905/* ====================== bdflush support =================== */
2906
2907/* This is a simple kernel daemon, whose job it is to provide a dynamic
2908 * response to dirty buffers.  Once this process is activated, we write back
2909 * a limited number of buffers to the disks and then go back to sleep again.
2910 */
2911
2912DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2913
2914void wakeup_bdflush(void)
2915{
2916        wake_up_interruptible(&bdflush_wait);
2917}
2918
2919void wakeup_kupdate(void)
2920{
2921        if (waitqueue_active(&kupdate_wait))
2922                wake_up(&kupdate_wait);
2923}
2924
2925/* 
2926 * Here we attempt to write back old buffers.  We also try to flush inodes 
2927 * and supers as well, since this function is essentially "update", and 
2928 * otherwise there would be no way of ensuring that these quantities ever 
2929 * get written back.  Ideally, we would have a timestamp on the inodes
2930 * and superblocks so that we could write back only the old ones as well
2931 */
2932
2933static int sync_old_buffers(void)
2934{
2935        lock_kernel();
2936        sync_unlocked_inodes();
2937        sync_supers(0, 0);
2938        unlock_kernel();
2939
2940        for (;;) {
2941                struct buffer_head *bh;
2942
2943                spin_lock(&lru_list_lock);
2944                bh = lru_list[BUF_DIRTY];
2945                if (!bh)
2946                        break;
2947                if (time_before(jiffies, bh->b_flushtime) && !laptop_mode)
2948                        break;
2949                if (write_some_buffers(NODEV))
2950                        continue;
2951                return 0;
2952        }
2953        spin_unlock(&lru_list_lock);
2954        return 0;
2955}
2956
2957int block_sync_page(struct page *page)
2958{
2959        run_task_queue(&tq_disk);
2960        return 0;
2961}
2962
2963/* This is the interface to bdflush.  As we get more sophisticated, we can
2964 * pass tuning parameters to this "process", to adjust how it behaves. 
2965 * We would want to verify each parameter, however, to make sure that it 
2966 * is reasonable. */
2967
2968asmlinkage long sys_bdflush(int func, long data)
2969{
2970        if (!capable(CAP_SYS_ADMIN))
2971                return -EPERM;
2972
2973        if (func == 1) {
2974                /* do_exit directly and let kupdate to do its work alone. */
2975                do_exit(0);
2976#if 0 /* left here as it's the only example of lazy-mm-stuff used from
2977         a syscall that doesn't care about the current mm context. */
2978                int error;
2979                struct mm_struct *user_mm;
2980
2981                /*
2982                 * bdflush will spend all of it's time in kernel-space,
2983                 * without touching user-space, so we can switch it into
2984                 * 'lazy TLB mode' to reduce the cost of context-switches
2985                 * to and from bdflush.
2986                 */
2987                user_mm = start_lazy_tlb();
2988                error = sync_old_buffers();
2989                end_lazy_tlb(user_mm);
2990                return error;
2991#endif
2992        }
2993
2994        /* Basically func 1 means read param 1, 2 means write param 1, etc */
2995        if (func >= 2) {
2996                int i = (func-2) >> 1;
2997                if (i >= 0 && i < N_PARAM) {
2998                        if ((func & 1) == 0)
2999                                return put_user(bdf_prm.data[i], (int*)data);
3000
3001                        if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
3002                                bdf_prm.data[i] = data;
3003                                return 0;
3004                        }
3005                }
3006                return -EINVAL;
3007        }
3008
3009        /* Having func 0 used to launch the actual bdflush and then never
3010         * return (unless explicitly killed). We return zero here to 
3011         * remain semi-compatible with present update(8) programs.
3012         */
3013        return 0;
3014}
3015
3016/*
3017 * This is the actual bdflush daemon itself. It used to be started from
3018 * the syscall above, but now we launch it ourselves internally with
3019 * kernel_thread(...)  directly after the first thread in init/main.c
3020 */
3021int bdflush(void *startup)
3022{
3023        struct task_struct *tsk = current;
3024
3025        /*
3026         *      We have a bare-bones task_struct, and really should fill
3027         *      in a few more things so "top" and /proc/2/{exe,root,cwd}
3028         *      display semi-sane things. Not real crucial though...  
3029         */
3030
3031        tsk->session = 1;
3032        tsk->pgrp = 1;
3033        strcpy(tsk->comm, "bdflush");
3034
3035        /* avoid getting signals */
3036        spin_lock_irq(&tsk->sigmask_lock);
3037        flush_signals(tsk);
3038        sigfillset(&tsk->blocked);
3039        recalc_sigpending(tsk);
3040        spin_unlock_irq(&tsk->sigmask_lock);
3041
3042        complete((struct completion *)startup);
3043
3044        /*
3045         * FIXME: The ndirty logic here is wrong.  It's supposed to
3046         * send bdflush back to sleep after writing ndirty buffers.
3047         * In fact, the test is wrong so bdflush will in fact
3048         * sleep when bdflush_stop() returns true.
3049         *
3050         * FIXME: If it proves useful to implement ndirty properly,
3051         * then perhaps the value of ndirty should be scaled by the
3052         * amount of memory in the machine.
3053         */
3054        for (;;) {
3055                int ndirty = bdf_prm.b_un.ndirty;
3056
3057                CHECK_EMERGENCY_SYNC
3058
3059                while (ndirty > 0) {
3060                        spin_lock(&lru_list_lock);
3061                        if (!write_some_buffers(NODEV))
3062                                break;
3063                        ndirty -= NRSYNC;
3064                }
3065                if (ndirty > 0 || bdflush_stop())
3066                        interruptible_sleep_on(&bdflush_wait);
3067        }
3068}
3069
3070/*
3071 * This is the kernel update daemon. It was used to live in userspace
3072 * but since it's need to run safely we want it unkillable by mistake.
3073 * You don't need to change your userspace configuration since
3074 * the userspace `update` will do_exit(0) at the first sys_bdflush().
3075 */
3076int kupdate(void *startup)
3077{
3078        struct task_struct * tsk = current;
3079        int interval;
3080
3081        tsk->session = 1;
3082        tsk->pgrp = 1;
3083        strcpy(tsk->comm, "kupdated");
3084
3085        /* sigstop and sigcont will stop and wakeup kupdate */
3086        spin_lock_irq(&tsk->sigmask_lock);
3087        sigfillset(&tsk->blocked);
3088        siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
3089        recalc_sigpending(tsk);
3090        spin_unlock_irq(&tsk->sigmask_lock);
3091
3092        complete((struct completion *)startup);
3093
3094        for (;;) {
3095                DECLARE_WAITQUEUE(wait, tsk);
3096
3097                add_wait_queue(&kupdate_wait, &wait);
3098
3099                /* update interval */
3100                interval = bdf_prm.b_un.interval;
3101                if (interval) {
3102                        tsk->state = TASK_INTERRUPTIBLE;
3103                        schedule_timeout(interval);
3104                } else {
3105                        tsk->state = TASK_STOPPED;
3106                        schedule(); /* wait for SIGCONT */
3107                }
3108                remove_wait_queue(&kupdate_wait, &wait);
3109                /* check for sigstop */
3110                if (signal_pending(tsk)) {
3111                        int sig, stopped = 0;
3112                        struct siginfo info;
3113
3114                        spin_lock_irq(&tsk->sigmask_lock);
3115                        sig = dequeue_signal(&current->blocked, &info);
3116                        if (sig == SIGSTOP)
3117                                stopped = 1;
3118                        spin_unlock_irq(&tsk->sigmask_lock);
3119                        if (stopped) {
3120                                tsk->state = TASK_STOPPED;
3121                                schedule(); /* wait for SIGCONT */
3122                        }
3123                }
3124#ifdef DEBUG
3125                printk(KERN_DEBUG "kupdate() activated...\n");
3126#endif
3127                sync_old_buffers();
3128                if (laptop_mode)
3129                        fsync_dev(NODEV);
3130                run_task_queue(&tq_disk);
3131        }
3132}
3133
3134static int __init bdflush_init(void)
3135{
3136        static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
3137
3138        kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3139        wait_for_completion(&startup);
3140        kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3141        wait_for_completion(&startup);
3142        return 0;
3143}
3144
3145module_init(bdflush_init)
3146
3147
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.