linux-old/fs/buffer.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9 * been avoided by NEVER letting an interrupt change a buffer (except for the
  10 * data, of course), but instead letting the caller do it.
  11 */
  12
  13/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15/* Removed a lot of unnecessary code and simplified things now that
  16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17 */
  18
  19/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20 * hash table, use SLAB cache for buffer heads. -DaveM
  21 */
  22
   23/* Added 32k buffer block sizes - these are required by older ARM systems.
  24 * - RMK
  25 */
  26
  27/* Thread it... -DaveM */
  28
  29/* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
  30
  31#include <linux/config.h>
  32#include <linux/sched.h>
  33#include <linux/fs.h>
  34#include <linux/slab.h>
  35#include <linux/locks.h>
  36#include <linux/errno.h>
  37#include <linux/swap.h>
  38#include <linux/swapctl.h>
  39#include <linux/smp_lock.h>
  40#include <linux/vmalloc.h>
  41#include <linux/blkdev.h>
  42#include <linux/sysrq.h>
  43#include <linux/file.h>
  44#include <linux/init.h>
  45#include <linux/quotaops.h>
  46#include <linux/iobuf.h>
  47#include <linux/highmem.h>
  48#include <linux/module.h>
  49#include <linux/completion.h>
  50
  51#include <asm/uaccess.h>
  52#include <asm/io.h>
  53#include <asm/bitops.h>
  54#include <asm/mmu_context.h>
  55
  56#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
  57#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
  58                                             number of unused buffer heads */
  59
  60/* Anti-deadlock ordering:
  61 *      lru_list_lock > hash_table_lock > unused_list_lock
  62 */
  63
  64#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
  65
  66/*
  67 * Hash table gook..
  68 */
  69static unsigned int bh_hash_mask;
  70static unsigned int bh_hash_shift;
  71static struct buffer_head **hash_table;
  72static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
  73
  74static struct buffer_head *lru_list[NR_LIST];
  75
  76static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
  77#define lru_list_lock  lru_list_lock_cacheline.lock
  78
  79static int nr_buffers_type[NR_LIST];
  80static unsigned long size_buffers_type[NR_LIST];
  81
  82static struct buffer_head * unused_list;
  83static int nr_unused_buffer_heads;
  84static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  85static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  86
  87static int grow_buffers(kdev_t dev, unsigned long block, int size);
  88static int osync_buffers_list(struct list_head *);
  89static void __refile_buffer(struct buffer_head *);
  90
  91/* This is used by some architectures to estimate available memory. */
  92atomic_t buffermem_pages = ATOMIC_INIT(0);
  93
  94/* Here is the parameter block for the bdflush process. If you add or
  95 * remove any of the parameters, make sure to update kernel/sysctl.c
  96 * and the documentation at linux/Documentation/sysctl/vm.txt.
  97 */
  98
  99#define N_PARAM 9
 100
 101/* The dummy values in this structure are left in there for compatibility
 102 * with old programs that play with the /proc entries.
 103 */
 104union bdflush_param {
 105        struct {
 106                int nfract;     /* Percentage of buffer cache dirty to 
 107                                   activate bdflush */
 108                int ndirty;     /* Maximum number of dirty blocks to write out per
 109                                   wake-cycle */
 110                int dummy2;     /* old "nrefill" */
 111                int dummy3;     /* unused */
 112                int interval;   /* jiffies delay between kupdate flushes */
 113                int age_buffer; /* Time for normal buffer to age before we flush it */
 114                int nfract_sync;/* Percentage of buffer cache dirty to 
 115                                   activate bdflush synchronously */
 116                int nfract_stop_bdflush; /* Percetange of buffer cache dirty to stop bdflush */
 117                int dummy5;     /* unused */
 118        } b_un;
 119        unsigned int data[N_PARAM];
 120} bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
 121
 122/* These are the min and max parameter values that we will allow to be assigned */
 123int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
 124int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
 125
 126void unlock_buffer(struct buffer_head *bh)
 127{
 128        clear_bit(BH_Wait_IO, &bh->b_state);
 129        clear_bit(BH_Launder, &bh->b_state);
 130        /*
 131         * When a locked buffer is visible to the I/O layer BH_Launder
 132         * is set. This means before unlocking we must clear BH_Launder,
 133         * mb() on alpha and then clear BH_Lock, so no reader can see
 134         * BH_Launder set on an unlocked buffer and then risk to deadlock.
 135         */
 136        smp_mb__after_clear_bit();
 137        clear_bit(BH_Lock, &bh->b_state);
 138        smp_mb__after_clear_bit();
 139        if (waitqueue_active(&bh->b_wait))
 140                wake_up(&bh->b_wait);
 141}
 142
 143/*
 144 * Note that the real wait_on_buffer() is an inline function that checks
 145 * that the buffer is locked before calling this, so that unnecessary disk
 146 * unplugging does not occur.
 147 */
 148void __wait_on_buffer(struct buffer_head * bh)
 149{
 150        struct task_struct *tsk = current;
 151        DECLARE_WAITQUEUE(wait, tsk);
 152
 153        get_bh(bh);
 154        add_wait_queue(&bh->b_wait, &wait);
 155        do {
 156                run_task_queue(&tq_disk);
 157                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 158                if (!buffer_locked(bh))
 159                        break;
 160                schedule();
 161        } while (buffer_locked(bh));
 162        tsk->state = TASK_RUNNING;
 163        remove_wait_queue(&bh->b_wait, &wait);
 164        put_bh(bh);
 165}
 166
 167/*
 168 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 169 * unlock the buffer. This is what ll_rw_block uses too.
 170 */
 171void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 172{
 173        mark_buffer_uptodate(bh, uptodate);
 174        unlock_buffer(bh);
 175        put_bh(bh);
 176}
 177
 178/*
 179 * The buffers have been marked clean and locked.  Just submit the dang
 180 * things.. 
 181 */
 182static void write_locked_buffers(struct buffer_head **array, unsigned int count)
 183{
 184        do {
 185                struct buffer_head * bh = *array++;
 186                bh->b_end_io = end_buffer_io_sync;
 187                submit_bh(WRITE, bh);
 188        } while (--count);
 189}
 190
 191/*
 192 * Write some buffers from the head of the dirty queue.
 193 *
 194 * This must be called with the LRU lock held, and will
 195 * return without it!
 196 */
 197#define NRSYNC (32)
 198static int write_some_buffers(kdev_t dev)
 199{
 200        struct buffer_head *next;
 201        struct buffer_head *array[NRSYNC];
 202        unsigned int count;
 203        int nr;
 204
 205        next = lru_list[BUF_DIRTY];
 206        nr = nr_buffers_type[BUF_DIRTY];
 207        count = 0;
 208        while (next && --nr >= 0) {
 209                struct buffer_head * bh = next;
 210                next = bh->b_next_free;
 211
 212                if (dev != NODEV && bh->b_dev != dev)
 213                        continue;
 214                if (test_and_set_bit(BH_Lock, &bh->b_state))
 215                        continue;
 216                if (atomic_set_buffer_clean(bh)) {
 217                        __refile_buffer(bh);
 218                        get_bh(bh);
 219                        array[count++] = bh;
 220                        if (count < NRSYNC)
 221                                continue;
 222
 223                        spin_unlock(&lru_list_lock);
 224                        write_locked_buffers(array, count);
 225                        return -EAGAIN;
 226                }
 227                unlock_buffer(bh);
 228                __refile_buffer(bh);
 229        }
 230        spin_unlock(&lru_list_lock);
 231
 232        if (count)
 233                write_locked_buffers(array, count);
 234        return 0;
 235}
 236
 237/*
 238 * Write out all buffers on the dirty list.
 239 */
 240static void write_unlocked_buffers(kdev_t dev)
 241{
 242        do
 243                spin_lock(&lru_list_lock);
 244        while (write_some_buffers(dev));
 245}
 246
 247/*
 248 * Wait for a buffer on the proper list.
 249 *
 250 * This must be called with the LRU lock held, and
 251 * will return with it released.
 252 */
 253static int wait_for_buffers(kdev_t dev, int index, int refile)
 254{
 255        struct buffer_head * next;
 256        int nr;
 257
 258        next = lru_list[index];
 259        nr = nr_buffers_type[index];
 260        while (next && --nr >= 0) {
 261                struct buffer_head *bh = next;
 262                next = bh->b_next_free;
 263
 264                if (!buffer_locked(bh)) {
 265                        if (refile)
 266                                __refile_buffer(bh);
 267                        continue;
 268                }
 269                if (dev != NODEV && bh->b_dev != dev)
 270                        continue;
 271
 272                get_bh(bh);
 273                spin_unlock(&lru_list_lock);
 274                wait_on_buffer (bh);
 275                put_bh(bh);
 276                return -EAGAIN;
 277        }
 278        spin_unlock(&lru_list_lock);
 279        return 0;
 280}
 281
 282static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
 283{
 284        do {
 285                spin_lock(&lru_list_lock);
 286        } while (wait_for_buffers(dev, index, refile));
 287        return 0;
 288}
 289
 290/* Call sync_buffers with wait!=0 to ensure that the call does not
 291 * return until all buffer writes have completed.  Sync() may return
 292 * before the writes have finished; fsync() may not.
 293 */
 294
 295/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 296 * spontaneously dirty themselves without ever brelse being called.
 297 * We will ultimately want to put these in a separate list, but for
 298 * now we search all of the lists for dirty buffers.
 299 */
 300int sync_buffers(kdev_t dev, int wait)
 301{
 302        int err = 0;
 303
 304        /* One pass for no-wait, three for wait:
 305         * 0) write out all dirty, unlocked buffers;
 306         * 1) wait for all dirty locked buffers;
 307         * 2) write out all dirty, unlocked buffers;
 308         * 2) wait for completion by waiting for all buffers to unlock.
 309         */
 310        write_unlocked_buffers(dev);
 311        if (wait) {
 312                err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
 313                write_unlocked_buffers(dev);
 314                err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
 315        }
 316        return err;
 317}
 318
 319int fsync_super(struct super_block *sb)
 320{
 321        kdev_t dev = sb->s_dev;
 322        sync_buffers(dev, 0);
 323
 324        lock_kernel();
 325        sync_inodes_sb(sb);
 326        DQUOT_SYNC(dev);
 327        lock_super(sb);
 328        if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
 329                sb->s_op->write_super(sb);
 330        unlock_super(sb);
 331        unlock_kernel();
 332
 333        return sync_buffers(dev, 1);
 334}
 335
 336int fsync_no_super(kdev_t dev)
 337{
 338        sync_buffers(dev, 0);
 339        return sync_buffers(dev, 1);
 340}
 341
 342int fsync_dev(kdev_t dev)
 343{
 344        sync_buffers(dev, 0);
 345
 346        lock_kernel();
 347        sync_inodes(dev);
 348        DQUOT_SYNC(dev);
 349        sync_supers(dev);
 350        unlock_kernel();
 351
 352        return sync_buffers(dev, 1);
 353}
 354
 355/*
 356 * There's no real reason to pretend we should
 357 * ever do anything differently
 358 */
 359void sync_dev(kdev_t dev)
 360{
 361        fsync_dev(dev);
 362}
 363
 364asmlinkage long sys_sync(void)
 365{
 366        fsync_dev(0);
 367        return 0;
 368}
 369
 370/*
 371 *      filp may be NULL if called via the msync of a vma.
 372 */
 373 
 374int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 375{
 376        struct inode * inode = dentry->d_inode;
 377        struct super_block * sb;
 378        kdev_t dev;
 379        int ret;
 380
 381        lock_kernel();
 382        /* sync the inode to buffers */
 383        write_inode_now(inode, 0);
 384
 385        /* sync the superblock to buffers */
 386        sb = inode->i_sb;
 387        lock_super(sb);
 388        if (sb->s_op && sb->s_op->write_super)
 389                sb->s_op->write_super(sb);
 390        unlock_super(sb);
 391
 392        /* .. finally sync the buffers to disk */
 393        dev = inode->i_dev;
 394        ret = sync_buffers(dev, 1);
 395        unlock_kernel();
 396        return ret;
 397}
 398
 399asmlinkage long sys_fsync(unsigned int fd)
 400{
 401        struct file * file;
 402        struct dentry * dentry;
 403        struct inode * inode;
 404        int ret, err;
 405
 406        ret = -EBADF;
 407        file = fget(fd);
 408        if (!file)
 409                goto out;
 410
 411        dentry = file->f_dentry;
 412        inode = dentry->d_inode;
 413
 414        ret = -EINVAL;
 415        if (!file->f_op || !file->f_op->fsync) {
 416                /* Why?  We can still call filemap_fdatasync */
 417                goto out_putf;
 418        }
 419
 420        /* We need to protect against concurrent writers.. */
 421        down(&inode->i_sem);
 422        ret = filemap_fdatasync(inode->i_mapping);
 423        err = file->f_op->fsync(file, dentry, 0);
 424        if (err && !ret)
 425                ret = err;
 426        err = filemap_fdatawait(inode->i_mapping);
 427        if (err && !ret)
 428                ret = err;
 429        up(&inode->i_sem);
 430
 431out_putf:
 432        fput(file);
 433out:
 434        return ret;
 435}
 436
 437asmlinkage long sys_fdatasync(unsigned int fd)
 438{
 439        struct file * file;
 440        struct dentry * dentry;
 441        struct inode * inode;
 442        int ret, err;
 443
 444        ret = -EBADF;
 445        file = fget(fd);
 446        if (!file)
 447                goto out;
 448
 449        dentry = file->f_dentry;
 450        inode = dentry->d_inode;
 451
 452        ret = -EINVAL;
 453        if (!file->f_op || !file->f_op->fsync)
 454                goto out_putf;
 455
 456        down(&inode->i_sem);
 457        ret = filemap_fdatasync(inode->i_mapping);
 458        err = file->f_op->fsync(file, dentry, 1);
 459        if (err && !ret)
 460                ret = err;
 461        err = filemap_fdatawait(inode->i_mapping);
 462        if (err && !ret)
 463                ret = err;
 464        up(&inode->i_sem);
 465
 466out_putf:
 467        fput(file);
 468out:
 469        return ret;
 470}
 471
 472/* After several hours of tedious analysis, the following hash
 473 * function won.  Do not mess with it... -DaveM
 474 */
 475#define _hashfn(dev,block)      \
 476        ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
 477         (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
 478          ((block) << (bh_hash_shift - 12))))
 479#define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
 480
 481static inline void __insert_into_hash_list(struct buffer_head *bh)
 482{
 483        struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 484        struct buffer_head *next = *head;
 485
 486        *head = bh;
 487        bh->b_pprev = head;
 488        bh->b_next = next;
 489        if (next != NULL)
 490                next->b_pprev = &bh->b_next;
 491}
 492
 493static __inline__ void __hash_unlink(struct buffer_head *bh)
 494{
 495        struct buffer_head **pprev = bh->b_pprev;
 496        if (pprev) {
 497                struct buffer_head *next = bh->b_next;
 498                if (next)
 499                        next->b_pprev = pprev;
 500                *pprev = next;
 501                bh->b_pprev = NULL;
 502        }
 503}
 504
 505static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 506{
 507        struct buffer_head **bhp = &lru_list[blist];
 508
 509        if (bh->b_prev_free || bh->b_next_free) BUG();
 510
 511        if(!*bhp) {
 512                *bhp = bh;
 513                bh->b_prev_free = bh;
 514        }
 515        bh->b_next_free = *bhp;
 516        bh->b_prev_free = (*bhp)->b_prev_free;
 517        (*bhp)->b_prev_free->b_next_free = bh;
 518        (*bhp)->b_prev_free = bh;
 519        nr_buffers_type[blist]++;
 520        size_buffers_type[blist] += bh->b_size;
 521}
 522
 523static void __remove_from_lru_list(struct buffer_head * bh)
 524{
 525        struct buffer_head *next = bh->b_next_free;
 526        if (next) {
 527                struct buffer_head *prev = bh->b_prev_free;
 528                int blist = bh->b_list;
 529
 530                prev->b_next_free = next;
 531                next->b_prev_free = prev;
 532                if (lru_list[blist] == bh) {
 533                        if (next == bh)
 534                                next = NULL;
 535                        lru_list[blist] = next;
 536                }
 537                bh->b_next_free = NULL;
 538                bh->b_prev_free = NULL;
 539                nr_buffers_type[blist]--;
 540                size_buffers_type[blist] -= bh->b_size;
 541        }
 542}
 543
 544/* must be called with both the hash_table_lock and the lru_list_lock
 545   held */
 546static void __remove_from_queues(struct buffer_head *bh)
 547{
 548        __hash_unlink(bh);
 549        __remove_from_lru_list(bh);
 550}
 551
 552static void remove_from_queues(struct buffer_head *bh)
 553{
 554        spin_lock(&lru_list_lock);
 555        write_lock(&hash_table_lock);
 556        __remove_from_queues(bh);
 557        write_unlock(&hash_table_lock); 
 558        spin_unlock(&lru_list_lock);
 559}
 560
 561struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 562{
 563        struct buffer_head *bh, **p = &hash(dev, block);
 564
 565        read_lock(&hash_table_lock);
 566
 567        for (;;) {
 568                bh = *p;
 569                if (!bh)
 570                        break;
 571                p = &bh->b_next;
 572                if (bh->b_blocknr != block)
 573                        continue;
 574                if (bh->b_size != size)
 575                        continue;
 576                if (bh->b_dev != dev)
 577                        continue;
 578                get_bh(bh);
 579                break;
 580        }
 581
 582        read_unlock(&hash_table_lock);
 583        return bh;
 584}
 585
 586void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
 587{
 588        spin_lock(&lru_list_lock);
 589        if (bh->b_inode)
 590                list_del(&bh->b_inode_buffers);
 591        bh->b_inode = inode;
 592        list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
 593        spin_unlock(&lru_list_lock);
 594}
 595
 596void buffer_insert_inode_data_queue(struct buffer_head *bh, struct inode *inode)
 597{
 598        spin_lock(&lru_list_lock);
 599        if (bh->b_inode)
 600                list_del(&bh->b_inode_buffers);
 601        bh->b_inode = inode;
 602        list_add(&bh->b_inode_buffers, &inode->i_dirty_data_buffers);
 603        spin_unlock(&lru_list_lock);
 604}
 605
 606/* The caller must have the lru_list lock before calling the 
 607   remove_inode_queue functions.  */
 608static void __remove_inode_queue(struct buffer_head *bh)
 609{
 610        bh->b_inode = NULL;
 611        list_del(&bh->b_inode_buffers);
 612}
 613
 614static inline void remove_inode_queue(struct buffer_head *bh)
 615{
 616        if (bh->b_inode)
 617                __remove_inode_queue(bh);
 618}
 619
 620int inode_has_buffers(struct inode *inode)
 621{
 622        int ret;
 623        
 624        spin_lock(&lru_list_lock);
 625        ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
 626        spin_unlock(&lru_list_lock);
 627        
 628        return ret;
 629}
 630
 631/* If invalidate_buffers() will trash dirty buffers, it means some kind
 632   of fs corruption is going on. Trashing dirty data always imply losing
 633   information that was supposed to be just stored on the physical layer
 634   by the user.
 635
  636   Thus invalidate_buffers in general usage is not allowed to trash
 637   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 638   be preserved.  These buffers are simply skipped.
 639  
 640   We also skip buffers which are still in use.  For example this can
 641   happen if a userspace program is reading the block device.
 642
 643   NOTE: In the case where the user removed a removable-media-disk even if
 644   there's still dirty data not synced on disk (due a bug in the device driver
 645   or due an error of the user), by not destroying the dirty buffers we could
 646   generate corruption also on the next media inserted, thus a parameter is
 647   necessary to handle this case in the most safe way possible (trying
 648   to not corrupt also the new disk inserted with the data belonging to
 649   the old now corrupted disk). Also for the ramdisk the natural thing
 650   to do in order to release the ramdisk memory is to destroy dirty buffers.
 651
 652   These are two special cases. Normal usage imply the device driver
 653   to issue a sync on the device (without waiting I/O completion) and
 654   then an invalidate_buffers call that doesn't trash dirty buffers.
 655
 656   For handling cache coherency with the blkdev pagecache the 'update' case
  657   has been introduced. It is needed to re-read from disk any pinned
 658   buffer. NOTE: re-reading from disk is destructive so we can do it only
 659   when we assume nobody is changing the buffercache under our I/O and when
 660   we think the disk contains more recent information than the buffercache.
 661   The update == 1 pass marks the buffers we need to update, the update == 2
 662   pass does the actual I/O. */
 663void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 664{
 665        int i, nlist, slept;
 666        struct buffer_head * bh, * bh_next;
 667        kdev_t dev = to_kdev_t(bdev->bd_dev);   /* will become bdev */
 668
 669 retry:
 670        slept = 0;
 671        spin_lock(&lru_list_lock);
 672        for(nlist = 0; nlist < NR_LIST; nlist++) {
 673                bh = lru_list[nlist];
 674                if (!bh)
 675                        continue;
 676                for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
 677                        bh_next = bh->b_next_free;
 678
 679                        /* Another device? */
 680                        if (bh->b_dev != dev)
 681                                continue;
 682                        /* Not hashed? */
 683                        if (!bh->b_pprev)
 684                                continue;
 685                        if (buffer_locked(bh)) {
 686                                get_bh(bh);
 687                                spin_unlock(&lru_list_lock);
 688                                wait_on_buffer(bh);
 689                                slept = 1;
 690                                spin_lock(&lru_list_lock);
 691                                put_bh(bh);
 692                        }
 693
 694                        write_lock(&hash_table_lock);
 695                        /* All buffers in the lru lists are mapped */
 696                        if (!buffer_mapped(bh))
 697                                BUG();
 698                        if (buffer_dirty(bh))
 699                                printk("invalidate: dirty buffer\n");
 700                        if (!atomic_read(&bh->b_count)) {
 701                                if (destroy_dirty_buffers || !buffer_dirty(bh)) {
 702                                        remove_inode_queue(bh);
 703                                }
 704                        } else
 705                                printk("invalidate: busy buffer\n");
 706
 707                        write_unlock(&hash_table_lock);
 708                        if (slept)
 709                                goto out;
 710                }
 711        }
 712out:
 713        spin_unlock(&lru_list_lock);
 714        if (slept)
 715                goto retry;
 716
 717        /* Get rid of the page cache */
 718        invalidate_inode_pages(bdev->bd_inode);
 719}
 720
 721void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 722{
 723        struct block_device *bdev = bdget(dev);
 724        if (bdev) {
 725                invalidate_bdev(bdev, destroy_dirty_buffers);
 726                bdput(bdev);
 727        }
 728}
 729
 730static void free_more_memory(void)
 731{
 732        balance_dirty();
 733        wakeup_bdflush();
 734        try_to_free_pages(GFP_NOIO);
 735        run_task_queue(&tq_disk);
 736        yield();
 737}
 738
 739void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 740{
 741        bh->b_list = BUF_CLEAN;
 742        bh->b_end_io = handler;
 743        bh->b_private = private;
 744}
 745
 746static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
 747{
 748        static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 749        unsigned long flags;
 750        struct buffer_head *tmp;
 751        struct page *page;
 752        int fullup = 1;
 753
 754        mark_buffer_uptodate(bh, uptodate);
 755
 756        /* This is a temporary buffer used for page I/O. */
 757        page = bh->b_page;
 758
 759        if (!uptodate)
 760                SetPageError(page);
 761
 762        /*
 763         * Be _very_ careful from here on. Bad things can happen if
 764         * two buffer heads end IO at almost the same time and both
 765         * decide that the page is now completely done.
 766         *
 767         * Async buffer_heads are here only as labels for IO, and get
 768         * thrown away once the IO for this page is complete.  IO is
 769         * deemed complete once all buffers have been visited
 770         * (b_count==0) and are now unlocked. We must make sure that
 771         * only the _last_ buffer that decrements its count is the one
 772         * that unlock the page..
 773         */
 774        spin_lock_irqsave(&page_uptodate_lock, flags);
 775        mark_buffer_async(bh, 0);
 776        unlock_buffer(bh);
 777        tmp = bh->b_this_page;
 778        while (tmp != bh) {
 779                if (buffer_locked(tmp)) {
 780                        if (buffer_async(tmp))
 781                                goto still_busy;
 782                } else if (!buffer_uptodate(tmp))
 783                        fullup = 0;
 784                tmp = tmp->b_this_page;
 785        }
 786
 787        /* OK, the async IO on this page is complete. */
 788        spin_unlock_irqrestore(&page_uptodate_lock, flags);
 789
 790        /*
 791         * If none of the buffers had errors and all were uptodate
 792         * then we can set the page uptodate:
 793         */
 794        if (fullup && !PageError(page))
 795                SetPageUptodate(page);
 796
 797        UnlockPage(page);
 798
 799        return;
 800
 801still_busy:
 802        spin_unlock_irqrestore(&page_uptodate_lock, flags);
 803        return;
 804}
 805
 806inline void set_buffer_async_io(struct buffer_head *bh)
 807{
 808        bh->b_end_io = end_buffer_io_async;
 809        mark_buffer_async(bh, 1);
 810}
 811
 812/*
 813 * Synchronise all the inode's dirty buffers to the disk.
 814 *
 815 * We have conflicting pressures: we want to make sure that all
 816 * initially dirty buffers get waited on, but that any subsequently
 817 * dirtied buffers don't.  After all, we don't want fsync to last
 818 * forever if somebody is actively writing to the file.
 819 *
 820 * Do this in two main stages: first we copy dirty buffers to a
 821 * temporary inode list, queueing the writes as we go.  Then we clean
 822 * up, waiting for those writes to complete.
 823 * 
 824 * During this second stage, any subsequent updates to the file may end
 825 * up refiling the buffer on the original inode's dirty list again, so
 826 * there is a chance we will end up with a buffer queued for write but
 827 * not yet completed on that list.  So, as a final cleanup we go through
 828 * the osync code to catch these locked, dirty buffers without requeuing
 829 * any newly dirty buffers for write.
 830 */
 831int fsync_buffers_list(struct list_head *list)
 832{
 833        struct buffer_head *bh;
 834        struct inode tmp;
 835        int err = 0, err2;
 836        
 837        INIT_LIST_HEAD(&tmp.i_dirty_buffers);
 838        
 839        spin_lock(&lru_list_lock);
 840
 841        while (!list_empty(list)) {
 842                bh = BH_ENTRY(list->next);
 843                list_del(&bh->b_inode_buffers);
 844                if (!buffer_dirty(bh) && !buffer_locked(bh))
 845                        bh->b_inode = NULL;
 846                else {
 847                        bh->b_inode = &tmp;
 848                        list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
 849                        if (buffer_dirty(bh)) {
 850                                get_bh(bh);
 851                                spin_unlock(&lru_list_lock);
 852                        /*
 853                         * Wait I/O completion before submitting
 854                         * the buffer, to be sure the write will
 855                         * be effective on the latest data in
 856                         * the buffer. (otherwise - if there's old
 857                         * I/O in flight - write_buffer would become
 858                         * a noop)
 859                         */
 860                                wait_on_buffer(bh);
 861                                ll_rw_block(WRITE, 1, &bh);
 862                                brelse(bh);
 863                                spin_lock(&lru_list_lock);
 864                        }
 865                }
 866        }
 867
 868        while (!list_empty(&tmp.i_dirty_buffers)) {
 869                bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
 870                remove_inode_queue(bh);
 871                get_bh(bh);
 872                spin_unlock(&lru_list_lock);
 873                wait_on_buffer(bh);
 874                if (!buffer_uptodate(bh))
 875                        err = -EIO;
 876                brelse(bh);
 877                spin_lock(&lru_list_lock);
 878        }
 879        
 880        spin_unlock(&lru_list_lock);
 881        err2 = osync_buffers_list(list);
 882
 883        if (err)
 884                return err;
 885        else
 886                return err2;
 887}
 888
 889/*
 890 * osync is designed to support O_SYNC io.  It waits synchronously for
 891 * all already-submitted IO to complete, but does not queue any new
 892 * writes to the disk.
 893 *
 894 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 895 * you dirty the buffers, and then use osync_buffers_list to wait for
 896 * completion.  Any other dirty buffers which are not yet queued for
 897 * write will not be flushed to disk by the osync.
 898 */
 899static int osync_buffers_list(struct list_head *list)
 900{
 901        struct buffer_head *bh;
 902        struct list_head *p;
 903        int err = 0;
 904
 905        spin_lock(&lru_list_lock);
 906        
 907 repeat:
 908        list_for_each_prev(p, list) {
 909                bh = BH_ENTRY(p);
 910                if (buffer_locked(bh)) {
 911                        get_bh(bh);
 912                        spin_unlock(&lru_list_lock);
 913                        wait_on_buffer(bh);
 914                        if (!buffer_uptodate(bh))
 915                                err = -EIO;
 916                        brelse(bh);
 917                        spin_lock(&lru_list_lock);
 918                        goto repeat;
 919                }
 920        }
 921
 922        spin_unlock(&lru_list_lock);
 923        return err;
 924}
 925
 926/*
 927 * Invalidate any and all dirty buffers on a given inode.  We are
 928 * probably unmounting the fs, but that doesn't mean we have already
 929 * done a sync().  Just drop the buffers from the inode list.
 930 */
 931void invalidate_inode_buffers(struct inode *inode)
 932{
 933        struct list_head * entry;
 934        
 935        spin_lock(&lru_list_lock);
 936        while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
 937                remove_inode_queue(BH_ENTRY(entry));
 938        while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
 939                remove_inode_queue(BH_ENTRY(entry));
 940        spin_unlock(&lru_list_lock);
 941}
 942
 943
 944/*
 945 * Ok, this is getblk, and it isn't very clear, again to hinder
 946 * race-conditions. Most of the code is seldom used, (ie repeating),
 947 * so it should be much more efficient than it looks.
 948 *
 949 * The algorithm is changed: hopefully better, and an elusive bug removed.
 950 *
 951 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 952 * when the filesystem starts to get full of dirty blocks (I hope).
 953 */
 954struct buffer_head * getblk(kdev_t dev, int block, int size)
 955{
 956        for (;;) {
 957                struct buffer_head * bh;
 958
 959                bh = get_hash_table(dev, block, size);
 960                if (bh) {
 961                        touch_buffer(bh);
 962                        return bh;
 963                }
 964
 965                if (!grow_buffers(dev, block, size))
 966                        free_more_memory();
 967        }
 968}
 969
 970/* -1 -> no need to flush
 971    0 -> async flush
 972    1 -> sync flush (wait for I/O completion) */
 973static int balance_dirty_state(void)
 974{
 975        unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
 976
 977        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 978        tot = nr_free_buffer_pages();
 979
 980        dirty *= 100;
 981        soft_dirty_limit = tot * bdf_prm.b_un.nfract;
 982        hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
 983
 984        /* First, check for the "real" dirty limit. */
 985        if (dirty > soft_dirty_limit) {
 986                if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
 987                        return 1;
 988                return 0;
 989        }
 990
 991        return -1;
 992}
 993
 994static int bdflush_stop(void)
 995{
 996        unsigned long dirty, tot, dirty_limit;
 997
 998        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 999        tot = nr_free_buffer_pages();
1000
1001        dirty *= 100;
1002        dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
1003
1004        if (dirty > dirty_limit)
1005                return 0;
1006        return 1;
1007}
1008
1009/*
1010 * if a new dirty buffer is created we need to balance bdflush.
1011 *
1012 * in the future we might want to make bdflush aware of different
1013 * pressures on different devices - thus the (currently unused)
1014 * 'dev' parameter.
1015 */
1016void balance_dirty(void)
1017{
1018        int state = balance_dirty_state();
1019
1020        if (state < 0)
1021                return;
1022
1023        wakeup_bdflush();
1024
1025        /*
1026         * And if we're _really_ out of balance, wait for
1027         * some of the dirty/locked buffers ourselves.
1028         * This will throttle heavy writers.
1029         */
1030        if (state > 0) {
1031                spin_lock(&lru_list_lock);
1032                write_some_buffers(NODEV);
1033        }
1034}
1035
1036inline void __mark_dirty(struct buffer_head *bh)
1037{
1038        bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1039        refile_buffer(bh);
1040}
1041
/*
 * Atomic version of mark_buffer_dirty(): it does not call balance_dirty(),
 * so the user must call balance_dirty() by hand as soon as it becomes
 * possible to block.  Nothing is done when atomic_set_buffer_dirty()
 * returns non-zero.
 */
void __mark_buffer_dirty(struct buffer_head *bh)
{
        if (atomic_set_buffer_dirty(bh))
                return;

        __mark_dirty(bh);
}
1049
/*
 * Mark a buffer dirty.  When atomic_set_buffer_dirty() returns zero the
 * buffer is stamped and refiled via __mark_dirty(), and balance_dirty()
 * is called afterwards.  Use __mark_buffer_dirty() instead if that call
 * has to be deferred.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
        if (atomic_set_buffer_dirty(bh))
                return;

        __mark_dirty(bh);
        balance_dirty();
}
1057
1058void set_buffer_flushtime(struct buffer_head *bh)
1059{
1060        bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1061}
1062EXPORT_SYMBOL(set_buffer_flushtime);
1063
1064/*
1065 * A buffer may need to be moved from one buffer list to another
1066 * (e.g. in case it is not shared any more). Handle this.
1067 */
1068static void __refile_buffer(struct buffer_head *bh)
1069{
1070        int dispose = BUF_CLEAN;
1071        if (buffer_locked(bh))
1072                dispose = BUF_LOCKED;
1073        if (buffer_dirty(bh))
1074                dispose = BUF_DIRTY;
1075        if (dispose != bh->b_list) {
1076                __remove_from_lru_list(bh);
1077                bh->b_list = dispose;
1078                if (dispose == BUF_CLEAN)
1079                        remove_inode_queue(bh);
1080                __insert_into_lru_list(bh, dispose);
1081        }
1082}
1083
1084void refile_buffer(struct buffer_head *bh)
1085{
1086        spin_lock(&lru_list_lock);
1087        __refile_buffer(bh);
1088        spin_unlock(&lru_list_lock);
1089}
1090
1091/*
1092 * Release a buffer head
1093 */
1094void __brelse(struct buffer_head * buf)
1095{
1096        if (atomic_read(&buf->b_count)) {
1097                put_bh(buf);
1098                return;
1099        }
1100        printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1101}
1102
/*
 * bforget() is like brelse(), except it discards any potentially dirty
 * data: the buffer is marked clean before the reference is released.
 */
void __bforget(struct buffer_head * buf)
{
        mark_buffer_clean(buf);
        __brelse(buf);
}
1112
1113/**
1114 *      bread() - reads a specified block and returns the bh
1115 *      @block: number of block
1116 *      @size: size (in bytes) to read
1117 * 
1118 *      Reads a specified block, and returns buffer head that
1119 *      contains it. It returns NULL if the block was unreadable.
1120 */
1121struct buffer_head * bread(kdev_t dev, int block, int size)
1122{
1123        struct buffer_head * bh;
1124
1125        bh = getblk(dev, block, size);
1126        if (buffer_uptodate(bh))
1127                return bh;
1128        ll_rw_block(READ, 1, &bh);
1129        wait_on_buffer(bh);
1130        if (buffer_uptodate(bh))
1131                return bh;
1132        brelse(bh);
1133        return NULL;
1134}
1135
1136/*
1137 * Note: the caller should wake up the buffer_wait list if needed.
1138 */
1139static void __put_unused_buffer_head(struct buffer_head * bh)
1140{
1141        if (bh->b_inode)
1142                BUG();
1143        if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1144                kmem_cache_free(bh_cachep, bh);
1145        } else {
1146                bh->b_dev = B_FREE;
1147                bh->b_blocknr = -1;
1148                bh->b_this_page = NULL;
1149
1150                nr_unused_buffer_heads++;
1151                bh->b_next_free = unused_list;
1152                unused_list = bh;
1153        }
1154}
1155
1156void put_unused_buffer_head(struct buffer_head *bh)
1157{
1158        spin_lock(&unused_list_lock);
1159        __put_unused_buffer_head(bh);
1160        spin_unlock(&unused_list_lock);
1161}
1162EXPORT_SYMBOL(put_unused_buffer_head);
1163
1164/*
1165 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1166 * no-buffer-head deadlock.  Return NULL on failure; waiting for
1167 * buffer heads is now handled in create_buffers().
1168 */ 
1169struct buffer_head * get_unused_buffer_head(int async)
1170{
1171        struct buffer_head * bh;
1172
1173        spin_lock(&unused_list_lock);
1174        if (nr_unused_buffer_heads > NR_RESERVED) {
1175                bh = unused_list;
1176                unused_list = bh->b_next_free;
1177                nr_unused_buffer_heads--;
1178                spin_unlock(&unused_list_lock);
1179                return bh;
1180        }
1181        spin_unlock(&unused_list_lock);
1182
1183        /* This is critical.  We can't call out to the FS
1184         * to get more buffer heads, because the FS may need
1185         * more buffer-heads itself.  Thus SLAB_NOFS.
1186         */
1187        if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1188                bh->b_blocknr = -1;
1189                bh->b_this_page = NULL;
1190                return bh;
1191        }
1192
1193        /*
1194         * If we need an async buffer, use the reserved buffer heads.
1195         */
1196        if (async) {
1197                spin_lock(&unused_list_lock);
1198                if (unused_list) {
1199                        bh = unused_list;
1200                        unused_list = bh->b_next_free;
1201                        nr_unused_buffer_heads--;
1202                        spin_unlock(&unused_list_lock);
1203                        return bh;
1204                }
1205                spin_unlock(&unused_list_lock);
1206        }
1207
1208        return NULL;
1209}
1210EXPORT_SYMBOL(get_unused_buffer_head);
1211
1212void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1213{
1214        if (offset >= PAGE_SIZE)
1215                BUG();
1216
1217        /*
1218         * page_address will return NULL anyways for highmem pages
1219         */
1220        bh->b_data = page_address(page) + offset;
1221        bh->b_page = page;
1222}
1223EXPORT_SYMBOL(set_bh_page);
1224
1225/*
1226 * Create the appropriate buffers when given a page for data area and
1227 * the size of each buffer.. Use the bh->b_this_page linked list to
1228 * follow the buffers created.  Return NULL if unable to create more
1229 * buffers.
1230 * The async flag is used to differentiate async IO (paging, swapping)
1231 * from ordinary buffer allocations, and only async requests are allowed
1232 * to sleep waiting for buffer heads. 
1233 */
1234static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1235{
1236        struct buffer_head *bh, *head;
1237        long offset;
1238
1239try_again:
1240        head = NULL;
1241        offset = PAGE_SIZE;
1242        while ((offset -= size) >= 0) {
1243                bh = get_unused_buffer_head(async);
1244                if (!bh)
1245                        goto no_grow;
1246
1247                bh->b_dev = NODEV;
1248                bh->b_this_page = head;
1249                head = bh;
1250
1251                bh->b_state = 0;
1252                bh->b_next_free = NULL;
1253                bh->b_pprev = NULL;
1254                atomic_set(&bh->b_count, 0);
1255                bh->b_size = size;
1256
1257                set_bh_page(bh, page, offset);
1258
1259                bh->b_list = BUF_CLEAN;
1260                bh->b_end_io = NULL;
1261        }
1262        return head;
1263/*
1264 * In case anything failed, we just free everything we got.
1265 */
1266no_grow:
1267        if (head) {
1268                spin_lock(&unused_list_lock);
1269                do {
1270                        bh = head;
1271                        head = head->b_this_page;
1272                        __put_unused_buffer_head(bh);
1273                } while (head);
1274                spin_unlock(&unused_list_lock);
1275
1276                /* Wake up any waiters ... */
1277                wake_up(&buffer_wait);
1278        }
1279
1280        /*
1281         * Return failure for non-async IO requests.  Async IO requests
1282         * are not allowed to fail, so we have to wait until buffer heads
1283         * become available.  But we don't want tasks sleeping with 
1284         * partially complete buffers, so all were released above.
1285         */
1286        if (!async)
1287                return NULL;
1288
1289        /* We're _really_ low on memory. Now we just
1290         * wait for old buffer heads to become free due to
1291         * finishing IO.  Since this is an async request and
1292         * the reserve list is empty, we're sure there are 
1293         * async buffer heads in use.
1294         */
1295        run_task_queue(&tq_disk);
1296
1297        free_more_memory();
1298        goto try_again;
1299}
1300
1301/*
1302 * Called when truncating a buffer on a page completely.
1303 */
1304static void discard_buffer(struct buffer_head * bh)
1305{
1306        if (buffer_mapped(bh)) {
1307                mark_buffer_clean(bh);
1308                lock_buffer(bh);
1309                clear_bit(BH_Uptodate, &bh->b_state);
1310                clear_bit(BH_Mapped, &bh->b_state);
1311                clear_bit(BH_Req, &bh->b_state);
1312                clear_bit(BH_New, &bh->b_state);
1313                remove_from_queues(bh);
1314                unlock_buffer(bh);
1315        }
1316}
1317
1318/**
1319 * try_to_release_page - release old fs-specific metadata on a page
1320 *
1321 */
1322
1323int try_to_release_page(struct page * page, int gfp_mask)
1324{
1325        if (!PageLocked(page))
1326                BUG();
1327        
1328        if (!page->mapping)
1329                goto try_to_free;
1330        if (!page->mapping->a_ops->releasepage)
1331                goto try_to_free;
1332        if (page->mapping->a_ops->releasepage(page, gfp_mask))
1333                goto try_to_free;
1334        /*
1335         * We couldn't release buffer metadata; don't even bother trying
1336         * to release buffers.
1337         */
1338        return 0;
1339try_to_free:    
1340        return try_to_free_buffers(page, gfp_mask);
1341}
1342
1343/*
1344 * We don't have to release all buffers here, but
1345 * we have to be sure that no dirty buffer is left
1346 * and no IO is going on (no buffer is locked), because
1347 * we have truncated the file and are going to free the
1348 * blocks on-disk..
1349 */
1350int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1351{
1352        struct buffer_head *head, *bh, *next;
1353        unsigned int curr_off = 0;
1354
1355        if (!PageLocked(page))
1356                BUG();
1357        if (!page->buffers)
1358                return 1;
1359
1360        head = page->buffers;
1361        bh = head;
1362        do {
1363                unsigned int next_off = curr_off + bh->b_size;
1364                next = bh->b_this_page;
1365
1366                /*
1367                 * is this block fully flushed?
1368                 */
1369                if (offset <= curr_off)
1370                        discard_buffer(bh);
1371                curr_off = next_off;
1372                bh = next;
1373        } while (bh != head);
1374
1375        /*
1376         * subtle. We release buffer-heads only if this is
1377         * the 'final' flushpage. We have invalidated the get_block
1378         * cached value unconditionally, so real IO is not
1379         * possible anymore.
1380         *
1381         * If the free doesn't work out, the buffers can be
1382         * left around - they just turn into anonymous buffers
1383         * instead.
1384         */
1385        if (!offset) {
1386                if (!try_to_release_page(page, 0))
1387                        return 0;
1388        }
1389
1390        return 1;
1391}
1392
1393void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1394{
1395        struct buffer_head *bh, *head, *tail;
1396
1397        /* FIXME: create_buffers should fail if there's no enough memory */
1398        head = create_buffers(page, blocksize, 1);
1399        if (page->buffers)
1400                BUG();
1401
1402        bh = head;
1403        do {
1404                bh->b_dev = dev;
1405                bh->b_blocknr = 0;
1406                bh->b_end_io = NULL;
1407                tail = bh;
1408                bh = bh->b_this_page;
1409        } while (bh);
1410        tail->b_this_page = head;
1411        page->buffers = head;
1412        page_cache_get(page);
1413}
1414EXPORT_SYMBOL(create_empty_buffers);
1415
1416/*
1417 * We are taking a block for data and we don't want any output from any
1418 * buffer-cache aliases starting from return from that function and
1419 * until the moment when something will explicitly mark the buffer
1420 * dirty (hopefully that will not happen until we will free that block ;-)
1421 * We don't even need to mark it not-uptodate - nobody can expect
1422 * anything from a newly allocated buffer anyway. We used to used
1423 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1424 * don't want to mark the alias unmapped, for example - it would confuse
1425 * anyone who might pick it with bread() afterwards...
1426 */
1427
1428static void unmap_underlying_metadata(struct buffer_head * bh)
1429{
1430        struct buffer_head *old_bh;
1431
1432        old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1433        if (old_bh) {
1434                mark_buffer_clean(old_bh);
1435                wait_on_buffer(old_bh);
1436                clear_bit(BH_Req, &old_bh->b_state);
1437                __brelse(old_bh);
1438        }
1439}
1440
1441/*
1442 * NOTE! All mapped/uptodate combinations are valid:
1443 *
1444 *      Mapped  Uptodate        Meaning
1445 *
1446 *      No      No              "unknown" - must do get_block()
1447 *      No      Yes             "hole" - zero-filled
1448 *      Yes     No              "allocated" - allocated on disk, not read in
1449 *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1450 *
1451 * "Dirty" is valid only with the last case (mapped+uptodate).
1452 */
1453
1454/*
1455 * block_write_full_page() is SMP threaded - the kernel lock is not held.
1456 */
1457static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1458{
1459        int err, i;
1460        unsigned long block;
1461        struct buffer_head *bh, *head;
1462        int need_unlock;
1463
1464        if (!PageLocked(page))
1465                BUG();
1466
1467        if (!page->buffers)
1468                create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1469        head = page->buffers;
1470
1471        block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1472
1473        bh = head;
1474        i = 0;
1475
1476        /* Stage 1: make sure we have all the buffers mapped! */
1477        do {
1478                /*
1479                 * If the buffer isn't up-to-date, we can't be sure
1480                 * that the buffer has been initialized with the proper
1481                 * block number information etc..
1482                 *
1483                 * Leave it to the low-level FS to make all those
1484                 * decisions (block #0 may actually be a valid block)
1485                 */
1486                if (!buffer_mapped(bh)) {
1487                        err = get_block(inode, block, bh, 1);
1488                        if (err)
1489                                goto out;
1490                        if (buffer_new(bh))
1491                                unmap_underlying_metadata(bh);
1492                }
1493                bh = bh->b_this_page;
1494                block++;
1495        } while (bh != head);
1496
1497        /* Stage 2: lock the buffers, mark them clean */
1498        do {
1499                lock_buffer(bh);
1500                set_buffer_async_io(bh);
1501                set_bit(BH_Uptodate, &bh->b_state);
1502                clear_bit(BH_Dirty, &bh->b_state);
1503                bh = bh->b_this_page;
1504        } while (bh != head);
1505
1506        /* Stage 3: submit the IO */
1507        do {
1508                struct buffer_head *next = bh->b_this_page;
1509                submit_bh(WRITE, bh);
1510                bh = next;
1511        } while (bh != head);
1512
1513        /* Done - end_buffer_io_async will unlock */
1514        SetPageUptodate(page);
1515        return 0;
1516
1517out:
1518        /*
1519         * ENOSPC, or some other error.  We may already have added some
1520         * blocks to the file, so we need to write these out to avoid
1521         * exposing stale data.
1522         */
1523        ClearPageUptodate(page);
1524        bh = head;
1525        need_unlock = 1;
1526        /* Recovery: lock and submit the mapped buffers */
1527        do {
1528                if (buffer_mapped(bh)) {
1529                        lock_buffer(bh);
1530                        set_buffer_async_io(bh);
1531                        need_unlock = 0;
1532                }
1533                bh = bh->b_this_page;
1534        } while (bh != head);
1535        do {
1536                struct buffer_head *next = bh->b_this_page;
1537                if (buffer_mapped(bh)) {
1538                        set_bit(BH_Uptodate, &bh->b_state);
1539                        clear_bit(BH_Dirty, &bh->b_state);
1540                        submit_bh(WRITE, bh);
1541                }
1542                bh = next;
1543        } while (bh != head);
1544        if (need_unlock)
1545                UnlockPage(page);
1546        return err;
1547}
1548
1549static int __block_prepare_write(struct inode *inode, struct page *page,
1550                unsigned from, unsigned to, get_block_t *get_block)
1551{
1552        unsigned block_start, block_end;
1553        unsigned long block;
1554        int err = 0;
1555        unsigned blocksize, bbits;
1556        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1557        char *kaddr = kmap(page);
1558
1559        blocksize = 1 << inode->i_blkbits;
1560        if (!page->buffers)
1561                create_empty_buffers(page, inode->i_dev, blocksize);
1562        head = page->buffers;
1563
1564        bbits = inode->i_blkbits;
1565        block = page->index << (PAGE_CACHE_SHIFT - bbits);
1566
1567        for(bh = head, block_start = 0; bh != head || !block_start;
1568            block++, block_start=block_end, bh = bh->b_this_page) {
1569                if (!bh)
1570                        BUG();
1571                block_end = block_start+blocksize;
1572                if (block_end <= from)
1573                        continue;
1574                if (block_start >= to)
1575                        break;
1576                clear_bit(BH_New, &bh->b_state);
1577                if (!buffer_mapped(bh)) {
1578                        err = get_block(inode, block, bh, 1);
1579                        if (err)
1580                                goto out;
1581                        if (buffer_new(bh)) {
1582                                unmap_underlying_metadata(bh);
1583                                if (Page_Uptodate(page)) {
1584                                        set_bit(BH_Uptodate, &bh->b_state);
1585                                        continue;
1586                                }
1587                                if (block_end > to)
1588                                        memset(kaddr+to, 0, block_end-to);
1589                                if (block_start < from)
1590                                        memset(kaddr+block_start, 0, from-block_start);
1591                                if (block_end > to || block_start < from)
1592                                        flush_dcache_page(page);
1593                                continue;
1594                        }
1595                }
1596                if (Page_Uptodate(page)) {
1597                        set_bit(BH_Uptodate, &bh->b_state);
1598                        continue; 
1599                }
1600                if (!buffer_uptodate(bh) &&
1601                     (block_start < from || block_end > to)) {
1602                        ll_rw_block(READ, 1, &bh);
1603                        *wait_bh++=bh;
1604                }
1605        }
1606        /*
1607         * If we issued read requests - let them complete.
1608         */
1609        while(wait_bh > wait) {
1610                wait_on_buffer(*--wait_bh);
1611                if (!buffer_uptodate(*wait_bh))
1612                        return -EIO;
1613        }
1614        return 0;
1615out:
1616        /*
1617         * Zero out any newly allocated blocks to avoid exposing stale
1618         * data.  If BH_New is set, we know that the block was newly
1619         * allocated in the above loop.
1620         *
1621         * Details the buffer can be new and uptodate because:
1622         * 1) hole in uptodate page, get_block(create) allocate the block,
1623         *    so the buffer is new and additionally we also mark it uptodate
1624         * 2) The buffer is not mapped and uptodate due a previous partial read.
1625         *
1626         * We can always ignore uptodate buffers here, if you mark a buffer
1627         * uptodate you must make sure it contains the right data first.
1628         *
1629         * We must stop the "undo/clear" fixup pass not at the caller "to"
1630         * but at the last block that we successfully arrived in the main loop.
1631         */
1632        bh = head;
1633        to = block_start; /* stop at the last successfully handled block */
1634        block_start = 0;
1635        do {
1636                block_end = block_start+blocksize;
1637                if (block_end <= from)
1638                        goto next_bh;
1639                if (block_start >= to)
1640                        break;
1641                if (buffer_new(bh) && !buffer_uptodate(bh)) {
1642                        memset(kaddr+block_start, 0, bh->b_size);
1643                        flush_dcache_page(page);
1644                        set_bit(BH_Uptodate, &bh->b_state);
1645                        mark_buffer_dirty(bh);
1646                }
1647next_bh:
1648                block_start = block_end;
1649                bh = bh->b_this_page;
1650        } while (bh != head);
1651        return err;
1652}
1653
1654static int __block_commit_write(struct inode *inode, struct page *page,
1655                unsigned from, unsigned to)
1656{
1657        unsigned block_start, block_end;
1658        int partial = 0, need_balance_dirty = 0;
1659        unsigned blocksize;
1660        struct buffer_head *bh, *head;
1661
1662        blocksize = 1 << inode->i_blkbits;
1663
1664        for(bh = head = page->buffers, block_start = 0;
1665            bh != head || !block_start;
1666            block_start=block_end, bh = bh->b_this_page) {
1667                block_end = block_start + blocksize;
1668                if (block_end <= from || block_start >= to) {
1669                        if (!buffer_uptodate(bh))
1670                                partial = 1;
1671                } else {
1672                        set_bit(BH_Uptodate, &bh->b_state);
1673                        if (!atomic_set_buffer_dirty(bh)) {
1674                                __mark_dirty(bh);
1675                                buffer_insert_inode_data_queue(bh, inode);
1676                                need_balance_dirty = 1;
1677                        }
1678                }
1679        }
1680
1681        if (need_balance_dirty)
1682                balance_dirty();
1683        /*
1684         * is this a partial write that happened to make all buffers
1685         * uptodate then we can optimize away a bogus readpage() for
1686         * the next read(). Here we 'discover' wether the page went
1687         * uptodate as a result of this (potentially partial) write.
1688         */
1689        if (!partial)
1690                SetPageUptodate(page);
1691        return 0;
1692}
1693
1694/*
1695 * Generic "read page" function for block devices that have the normal
1696 * get_block functionality. This is most of the block device filesystems.
1697 * Reads the page asynchronously --- the unlock_buffer() and
1698 * mark_buffer_uptodate() functions propagate buffer state into the
1699 * page struct once IO has completed.
1700 */
1701int block_read_full_page(struct page *page, get_block_t *get_block)
1702{
1703        struct inode *inode = page->mapping->host;
1704        unsigned long iblock, lblock;
1705        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1706        unsigned int blocksize, blocks;
1707        int nr, i;
1708
1709        if (!PageLocked(page))
1710                PAGE_BUG(page);
1711        blocksize = 1 << inode->i_blkbits;
1712        if (!page->buffers)
1713                create_empty_buffers(page, inode->i_dev, blocksize);
1714        head = page->buffers;
1715
1716        blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1717        iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1718        lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1719        bh = head;
1720        nr = 0;
1721        i = 0;
1722
1723        do {
1724                if (buffer_uptodate(bh))
1725                        continue;
1726
1727                if (!buffer_mapped(bh)) {
1728                        if (iblock < lblock) {
1729                                if (get_block(inode, iblock, bh, 0))
1730                                        continue;
1731                        }
1732                        if (!buffer_mapped(bh)) {
1733                                memset(kmap(page) + i*blocksize, 0, blocksize);
1734                                flush_dcache_page(page);
1735                                kunmap(page);
1736                                set_bit(BH_Uptodate, &bh->b_state);
1737                                continue;
1738                        }
1739                        /* get_block() might have updated the buffer synchronously */
1740                        if (buffer_uptodate(bh))
1741                                continue;
1742                }
1743
1744                arr[nr] = bh;
1745                nr++;
1746        } while (i++, iblock++, (bh = bh->b_this_page) != head);
1747
1748        if (!nr) {
1749                /*
1750                 * all buffers are uptodate - we can set the page
1751                 * uptodate as well.
1752                 */
1753                SetPageUptodate(page);
1754                UnlockPage(page);
1755                return 0;
1756        }
1757
1758        /* Stage two: lock the buffers */
1759        for (i = 0; i < nr; i++) {
1760                struct buffer_head * bh = arr[i];
1761                lock_buffer(bh);
1762                set_buffer_async_io(bh);
1763        }
1764
1765        /* Stage 3: start the IO */
1766        for (i = 0; i < nr; i++) {
1767                struct buffer_head * bh = arr[i];
1768                if (buffer_uptodate(bh))
1769                        end_buffer_io_async(bh, 1);
1770                else
1771                        submit_bh(READ, bh);
1772        }
1773        
1774        return 0;
1775}
1776
1777/* utility function for filesystems that need to do work on expanding
1778 * truncates.  Uses prepare/commit_write to allow the filesystem to
1779 * deal with the hole.  
1780 */
1781int generic_cont_expand(struct inode *inode, loff_t size)
1782{
1783        struct address_space *mapping = inode->i_mapping;
1784        struct page *page;
1785        unsigned long index, offset, limit;
1786        int err;
1787
1788        err = -EFBIG;
1789        limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1790        if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1791                send_sig(SIGXFSZ, current, 0);
1792                goto out;
1793        }
1794        if (size > inode->i_sb->s_maxbytes)
1795                goto out;
1796
1797        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1798
1799        /* ugh.  in prepare/commit_write, if from==to==start of block, we 
1800        ** skip the prepare.  make sure we never send an offset for the start
1801        ** of a block
1802        */
1803        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1804                offset++;
1805        }
1806        index = size >> PAGE_CACHE_SHIFT;
1807        err = -ENOMEM;
1808        page = grab_cache_page(mapping, index);
1809        if (!page)
1810                goto out;
1811        err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1812        if (!err) {
1813                err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1814        }
1815        UnlockPage(page);
1816        page_cache_release(page);
1817        if (err > 0)
1818                err = 0;
1819out:
1820        return err;
1821}
1822
1823/*
1824 * For moronic filesystems that do not allow holes in file.
1825 * We may have to extend the file.
1826 */
1827
1828int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1829{
1830        struct address_space *mapping = page->mapping;
1831        struct inode *inode = mapping->host;
1832        struct page *new_page;
1833        unsigned long pgpos;
1834        long status;
1835        unsigned zerofrom;
1836        unsigned blocksize = 1 << inode->i_blkbits;
1837        char *kaddr;
1838
1839        while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1840                status = -ENOMEM;
1841                new_page = grab_cache_page(mapping, pgpos);
1842                if (!new_page)
1843                        goto out;
1844                /* we might sleep */
1845                if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1846                        UnlockPage(new_page);
1847                        page_cache_release(new_page);
1848                        continue;
1849                }
1850                zerofrom = *bytes & ~PAGE_CACHE_MASK;
1851                if (zerofrom & (blocksize-1)) {
1852                        *bytes |= (blocksize-1);
1853                        (*bytes)++;
1854                }
1855                status = __block_prepare_write(inode, new_page, zerofrom,
1856                                                PAGE_CACHE_SIZE, get_block);
1857                if (status)
1858                        goto out_unmap;
1859                kaddr = page_address(new_page);
1860                memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1861                flush_dcache_page(new_page);
1862                __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1863                kunmap(new_page);
1864                UnlockPage(new_page);
1865                page_cache_release(new_page);
1866        }
1867
1868        if (page->index < pgpos) {
1869                /* completely inside the area */
1870                zerofrom = offset;
1871        } else {
1872                /* page covers the boundary, find the boundary offset */
1873                zerofrom = *bytes & ~PAGE_CACHE_MASK;
1874
1875                /* if we will expand the thing last block will be filled */
1876                if (to > zerofrom && (zerofrom & (blocksize-1))) {
1877                        *bytes |= (blocksize-1);
1878                        (*bytes)++;
1879                }
1880
1881                /* starting below the boundary? Nothing to zero out */
1882                if (offset <= zerofrom)
1883                        zerofrom = offset;
1884        }
1885        status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1886        if (status)
1887                goto out1;
1888        kaddr = page_address(page);
1889        if (zerofrom < offset) {
1890                memset(kaddr+zerofrom, 0, offset-zerofrom);
1891                flush_dcache_page(page);
1892                __block_commit_write(inode, page, zerofrom, offset);
1893        }
1894        return 0;
1895out1:
1896        ClearPageUptodate(page);
1897        kunmap(page);
1898        return status;
1899
1900out_unmap:
1901        ClearPageUptodate(new_page);
1902        kunmap(new_page);
1903        UnlockPage(new_page);
1904        page_cache_release(new_page);
1905out:
1906        return status;
1907}
1908
1909int block_prepare_write(struct page *page, unsigned from, unsigned to,
1910                        get_block_t *get_block)
1911{
1912        struct inode *inode = page->mapping->host;
1913        int err = __block_prepare_write(inode, page, from, to, get_block);
1914        if (err) {
1915                ClearPageUptodate(page);
1916                kunmap(page);
1917        }
1918        return err;
1919}
1920
1921int block_commit_write(struct page *page, unsigned from, unsigned to)
1922{
1923        struct inode *inode = page->mapping->host;
1924        __block_commit_write(inode,page,from,to);
1925        kunmap(page);
1926        return 0;
1927}
1928
1929int generic_commit_write(struct file *file, struct page *page,
1930                unsigned from, unsigned to)
1931{
1932        struct inode *inode = page->mapping->host;
1933        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1934        __block_commit_write(inode,page,from,to);
1935        kunmap(page);
1936        if (pos > inode->i_size) {
1937                inode->i_size = pos;
1938                mark_inode_dirty(inode);
1939        }
1940        return 0;
1941}
1942
1943int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1944{
1945        unsigned long index = from >> PAGE_CACHE_SHIFT;
1946        unsigned offset = from & (PAGE_CACHE_SIZE-1);
1947        unsigned blocksize, iblock, length, pos;
1948        struct inode *inode = mapping->host;
1949        struct page *page;
1950        struct buffer_head *bh;
1951        int err;
1952
1953        blocksize = 1 << inode->i_blkbits;
1954        length = offset & (blocksize - 1);
1955
1956        /* Block boundary? Nothing to do */
1957        if (!length)
1958                return 0;
1959
1960        length = blocksize - length;
1961        iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1962        
1963        page = grab_cache_page(mapping, index);
1964        err = -ENOMEM;
1965        if (!page)
1966                goto out;
1967
1968        if (!page->buffers)
1969                create_empty_buffers(page, inode->i_dev, blocksize);
1970
1971        /* Find the buffer that contains "offset" */
1972        bh = page->buffers;
1973        pos = blocksize;
1974        while (offset >= pos) {
1975                bh = bh->b_this_page;
1976                iblock++;
1977                pos += blocksize;
1978        }
1979
1980        err = 0;
1981        if (!buffer_mapped(bh)) {
1982                /* Hole? Nothing to do */
1983                if (buffer_uptodate(bh))
1984                        goto unlock;
1985                get_block(inode, iblock, bh, 0);
1986                /* Still unmapped? Nothing to do */
1987                if (!buffer_mapped(bh))
1988                        goto unlock;
1989        }
1990
1991        /* Ok, it's mapped. Make sure it's up-to-date */
1992        if (Page_Uptodate(page))
1993                set_bit(BH_Uptodate, &bh->b_state);
1994
1995        if (!buffer_uptodate(bh)) {
1996                err = -EIO;
1997                ll_rw_block(READ, 1, &bh);
1998                wait_on_buffer(bh);
1999                /* Uhhuh. Read error. Complain and punt. */
2000                if (!buffer_uptodate(bh))
2001                        goto unlock;
2002        }
2003
2004        memset(kmap(page) + offset, 0, length);
2005        flush_dcache_page(page);
2006        kunmap(page);
2007
2008        if (!atomic_set_buffer_dirty(bh)) {
2009                __mark_dirty(bh);
2010                buffer_insert_inode_data_queue(bh, inode);
2011                balance_dirty();
2012        }
2013
2014        err = 0;
2015
2016unlock:
2017        UnlockPage(page);
2018        page_cache_release(page);
2019out:
2020        return err;
2021}
2022
2023int block_write_full_page(struct page *page, get_block_t *get_block)
2024{
2025        struct inode *inode = page->mapping->host;
2026        unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2027        unsigned offset;
2028        int err;
2029
2030        /* easy case */
2031        if (page->index < end_index)
2032                return __block_write_full_page(inode, page, get_block);
2033
2034        /* things got complicated... */
2035        offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2036        /* OK, are we completely out? */
2037        if (page->index >= end_index+1 || !offset) {
2038                UnlockPage(page);
2039                return -EIO;
2040        }
2041
2042        /* Sigh... will have to work, then... */
2043        err = __block_prepare_write(inode, page, 0, offset, get_block);
2044        if (!err) {
2045                memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2046                flush_dcache_page(page);
2047                __block_commit_write(inode,page,0,offset);
2048done:
2049                kunmap(page);
2050                UnlockPage(page);
2051                return err;
2052        }
2053        ClearPageUptodate(page);
2054        goto done;
2055}
2056
2057/*
2058 * Commence writeout of all the buffers against a page.  The
2059 * page must be locked.   Returns zero on success or a negative
2060 * errno.
2061 */
2062int writeout_one_page(struct page *page)
2063{
2064        struct buffer_head *bh, *head = page->buffers;
2065
2066        if (!PageLocked(page))
2067                BUG();
2068        bh = head;
2069        do {
2070                if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2071                        continue;
2072
2073                bh->b_flushtime = jiffies;
2074                ll_rw_block(WRITE, 1, &bh);     
2075        } while ((bh = bh->b_this_page) != head);
2076        return 0;
2077}
2078EXPORT_SYMBOL(writeout_one_page);
2079
2080/*
2081 * Wait for completion of I/O of all buffers against a page.  The page
2082 * must be locked.  Returns zero on success or a negative errno.
2083 */
2084int waitfor_one_page(struct page *page)
2085{
2086        int error = 0;
2087        struct buffer_head *bh, *head = page->buffers;
2088
2089        bh = head;
2090        do {
2091                wait_on_buffer(bh);
2092                if (buffer_req(bh) && !buffer_uptodate(bh))
2093                        error = -EIO;
2094        } while ((bh = bh->b_this_page) != head);
2095        return error;
2096}
2097EXPORT_SYMBOL(waitfor_one_page);
2098
2099int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2100{
2101        struct buffer_head tmp;
2102        struct inode *inode = mapping->host;
2103        tmp.b_state = 0;
2104        tmp.b_blocknr = 0;
2105        get_block(inode, block, &tmp, 0);
2106        return tmp.b_blocknr;
2107}
2108
2109int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2110{
2111        int i, nr_blocks, retval;
2112        unsigned long * blocks = iobuf->blocks;
2113        int length;
2114
2115        length = iobuf->length;
2116        nr_blocks = length / blocksize;
2117        /* build the blocklist */
2118        for (i = 0; i < nr_blocks; i++, blocknr++) {
2119                struct buffer_head bh;
2120
2121                bh.b_state = 0;
2122                bh.b_dev = inode->i_dev;
2123                bh.b_size = blocksize;
2124
2125                retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1);
2126                if (retval) {
2127                        if (!i)
2128                                /* report error to userspace */
2129                                goto out;
2130                        else
2131                                /* do short I/O utill 'i' */
2132                                break;
2133                }
2134
2135                if (rw == READ) {
2136                        if (buffer_new(&bh))
2137                                BUG();
2138                        if (!buffer_mapped(&bh)) {
2139                                /* there was an hole in the filesystem */
2140                                blocks[i] = -1UL;
2141                                continue;
2142                        }
2143                } else {
2144                        if (buffer_new(&bh))
2145                                unmap_underlying_metadata(&bh);
2146                        if (!buffer_mapped(&bh))
2147                                BUG();
2148                }
2149                blocks[i] = bh.b_blocknr;
2150        }
2151
2152        /* patch length to handle short I/O */
2153        iobuf->length = i * blocksize;
2154        retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2155        /* restore orig length */
2156        iobuf->length = length;
2157 out:
2158
2159        return retval;
2160}
2161
2162/*
2163 * IO completion routine for a buffer_head being used for kiobuf IO: we
2164 * can't dispatch the kiobuf callback until io_count reaches 0.  
2165 */
2166
2167static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2168{
2169        struct kiobuf *kiobuf;
2170        
2171        mark_buffer_uptodate(bh, uptodate);
2172
2173        kiobuf = bh->b_private;
2174        unlock_buffer(bh);
2175        end_kio_request(kiobuf, uptodate);
2176}
2177
2178/*
2179 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2180 * for them to complete.  Clean up the buffer_heads afterwards.  
2181 */
2182
2183static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2184{
2185        int iosize, err;
2186        int i;
2187        struct buffer_head *tmp;
2188
2189        iosize = 0;
2190        err = 0;
2191
2192        for (i = nr; --i >= 0; ) {
2193                iosize += size;
2194                tmp = bh[i];
2195                if (buffer_locked(tmp)) {
2196                        wait_on_buffer(tmp);
2197                }
2198                
2199                if (!buffer_uptodate(tmp)) {
2200                        /* We are traversing bh'es in reverse order so
2201                           clearing iosize on error calculates the
2202                           amount of IO before the first error. */
2203                        iosize = 0;
2204                        err = -EIO;
2205                }
2206        }
2207        
2208        if (iosize)
2209                return iosize;
2210        return err;
2211}
2212
2213/*
2214 * Start I/O on a physical range of kernel memory, defined by a vector
2215 * of kiobuf structs (much like a user-space iovec list).
2216 *
2217 * The kiobuf must already be locked for IO.  IO is submitted
2218 * asynchronously: you need to check page->locked and page->uptodate.
2219 *
2220 * It is up to the caller to make sure that there are enough blocks
2221 * passed in to completely map the iobufs to disk.
2222 */
2223
2224int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], 
2225               kdev_t dev, unsigned long b[], int size)
2226{
2227        int             err;
2228        int             length;
2229        int             transferred;
2230        int             i;
2231        int             bufind;
2232        int             pageind;
2233        int             bhind;
2234        int             offset;
2235        unsigned long   blocknr;
2236        struct kiobuf * iobuf = NULL;
2237        struct page *   map;
2238        struct buffer_head *tmp, **bhs = NULL;
2239
2240        if (!nr)
2241                return 0;
2242        
2243        /* 
2244         * First, do some alignment and validity checks 
2245         */
2246        for (i = 0; i < nr; i++) {
2247                iobuf = iovec[i];
2248                if ((iobuf->offset & (size-1)) ||
2249                    (iobuf->length & (size-1)))
2250                        return -EINVAL;
2251                if (!iobuf->nr_pages)
2252                        panic("brw_kiovec: iobuf not initialised");
2253        }
2254
2255        /* 
2256         * OK to walk down the iovec doing page IO on each page we find. 
2257         */
2258        bufind = bhind = transferred = err = 0;
2259        for (i = 0; i < nr; i++) {
2260                iobuf = iovec[i];
2261                offset = iobuf->offset;
2262                length = iobuf->length;
2263                iobuf->errno = 0;
2264                if (!bhs)
2265                        bhs = iobuf->bh;
2266                
2267                for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2268                        map  = iobuf->maplist[pageind];
2269                        if (!map) {
2270                                err = -EFAULT;
2271                                goto finished;
2272                        }
2273                        
2274                        while (length > 0) {
2275                                blocknr = b[bufind++];
2276                                if (blocknr == -1UL) {
2277                                        if (rw == READ) {
2278                                                /* there was an hole in the filesystem */
2279                                                memset(kmap(map) + offset, 0, size);
2280                                                flush_dcache_page(map);
2281                                                kunmap(map);
2282
2283                                                transferred += size;
2284                                                goto skip_block;
2285                                        } else
2286                                                BUG();
2287                                }
2288                                tmp = bhs[bhind++];
2289
2290                                tmp->b_size = size;
2291                                set_bh_page(tmp, map, offset);
2292                                tmp->b_this_page = tmp;
2293
2294                                init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2295                                tmp->b_dev = dev;
2296                                tmp->b_blocknr = blocknr;
2297                                tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2298
2299                                if (rw == WRITE) {
2300                                        set_bit(BH_Uptodate, &tmp->b_state);
2301                                        clear_bit(BH_Dirty, &tmp->b_state);
2302                                } else
2303                                        set_bit(BH_Uptodate, &tmp->b_state);
2304
2305                                atomic_inc(&iobuf->io_count);
2306                                submit_bh(rw, tmp);
2307                                /* 
2308                                 * Wait for IO if we have got too much 
2309                                 */
2310                                if (bhind >= KIO_MAX_SECTORS) {
2311                                        kiobuf_wait_for_io(iobuf); /* wake-one */
2312                                        err = wait_kio(rw, bhind, bhs, size);
2313                                        if (err >= 0)
2314                                                transferred += err;
2315                                        else
2316                                                goto finished;
2317                                        bhind = 0;
2318                                }
2319
2320                        skip_block:
2321                                length -= size;
2322                                offset += size;
2323
2324                                if (offset >= PAGE_SIZE) {
2325                                        offset = 0;
2326                                        break;
2327                                }
2328                        } /* End of block loop */
2329                } /* End of page loop */                
2330        } /* End of iovec loop */
2331
2332        /* Is there any IO still left to submit? */
2333        if (bhind) {
2334                kiobuf_wait_for_io(iobuf); /* wake-one */
2335                err = wait_kio(rw, bhind, bhs, size);
2336                if (err >= 0)
2337                        transferred += err;
2338                else
2339                        goto finished;
2340        }
2341
2342 finished:
2343        if (transferred)
2344                return transferred;
2345        return err;
2346}
2347
2348/*
2349 * Start I/O on a page.
2350 * This function expects the page to be locked and may return
2351 * before I/O is complete. You then have to check page->locked
2352 * and page->uptodate.
2353 *
2354 * brw_page() is SMP-safe, although it's being called with the
2355 * kernel lock held - but the code is ready.
2356 *
2357 * FIXME: we need a swapper_inode->get_block function to remove
2358 *        some of the bmap kludges and interface ugliness here.
2359 */
2360int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2361{
2362        struct buffer_head *head, *bh;
2363
2364        if (!PageLocked(page))
2365                panic("brw_page: page not locked for I/O");
2366
2367        if (!page->buffers)
2368                create_empty_buffers(page, dev, size);
2369        head = bh = page->buffers;
2370
2371        /* Stage 1: lock all the buffers */
2372        do {
2373                lock_buffer(bh);
2374                bh->b_blocknr = *(b++);
2375                set_bit(BH_Mapped, &bh->b_state);
2376                set_buffer_async_io(bh);
2377                bh = bh->b_this_page;
2378        } while (bh != head);
2379
2380        /* Stage 2: start the IO */
2381        do {
2382                struct buffer_head *next = bh->b_this_page;
2383                submit_bh(rw, bh);
2384                bh = next;
2385        } while (bh != head);
2386        return 0;
2387}
2388
2389int block_symlink(struct inode *inode, const char *symname, int len)
2390{
2391        struct address_space *mapping = inode->i_mapping;
2392        struct page *page = grab_cache_page(mapping, 0);
2393        int err = -ENOMEM;
2394        char *kaddr;
2395
2396        if (!page)
2397                goto fail;
2398        err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2399        if (err)
2400                goto fail_map;
2401        kaddr = page_address(page);
2402        memcpy(kaddr, symname, len-1);
2403        mapping->a_ops->commit_write(NULL, page, 0, len-1);
2404        /*
2405         * Notice that we are _not_ going to block here - end of page is
2406         * unmapped, so this will only try to map the rest of page, see
2407         * that it is unmapped (typically even will not look into inode -
2408         * ->i_size will be enough for everything) and zero it out.
2409         * OTOH it's obviously correct and should make the page up-to-date.
2410         */
2411        err = mapping->a_ops->readpage(NULL, page);
2412        wait_on_page(page);
2413        page_cache_release(page);
2414        if (err < 0)
2415                goto fail;
2416        mark_inode_dirty(inode);
2417        return 0;
2418fail_map:
2419        UnlockPage(page);
2420        page_cache_release(page);
2421fail:
2422        return err;
2423}
2424
2425static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2426{
2427        struct buffer_head *bh, *tail;
2428
2429        bh = head;
2430        do {
2431                tail = bh;
2432                bh = bh->b_this_page;
2433        } while (bh);
2434        tail->b_this_page = head;
2435        page->buffers = head;
2436        page_cache_get(page);
2437}
2438
2439/*
2440 * Create the page-cache page that contains the requested block
2441 */
2442static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2443{
2444        struct page * page;
2445        struct buffer_head *bh;
2446
2447        page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2448        if (!page)
2449                return NULL;
2450
2451        if (!PageLocked(page))
2452                BUG();
2453
2454        bh = page->buffers;
2455        if (bh) {
2456                if (bh->b_size == size)
2457                        return page;
2458                if (!try_to_free_buffers(page, GFP_NOFS))
2459                        goto failed;
2460        }
2461
2462        bh = create_buffers(page, size, 0);
2463        if (!bh)
2464                goto failed;
2465        link_dev_buffers(page, bh);
2466        return page;
2467
2468failed:
2469        UnlockPage(page);
2470        page_cache_release(page);
2471        return NULL;
2472}
2473
2474static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2475{
2476        struct buffer_head *head = page->buffers;
2477        struct buffer_head *bh = head;
2478        unsigned int uptodate;
2479
2480        uptodate = 1 << BH_Mapped;
2481        if (Page_Uptodate(page))
2482                uptodate |= 1 << BH_Uptodate;
2483
2484        write_lock(&hash_table_lock);
2485        do {
2486                if (!(bh->b_state & (1 << BH_Mapped))) {
2487                        init_buffer(bh, NULL, NULL);
2488                        bh->b_dev = dev;
2489                        bh->b_blocknr = block;
2490                        bh->b_state = uptodate;
2491                }
2492
2493                /* Insert the buffer into the hash lists if necessary */
2494                if (!bh->b_pprev)
2495                        __insert_into_hash_list(bh);
2496
2497                block++;
2498                bh = bh->b_this_page;
2499        } while (bh != head);
2500        write_unlock(&hash_table_lock);
2501}
2502
2503/*
2504 * Try to increase the number of buffers available: the size argument
2505 * is used to determine what kind of buffers we want.
2506 */
2507static int grow_buffers(kdev_t dev, unsigned long block, int size)
2508{
2509        struct page * page;
2510        struct block_device *bdev;
2511        unsigned long index;
2512        int sizebits;
2513
2514        /* Size must be multiple of hard sectorsize */
2515        if (size & (get_hardsect_size(dev)-1))
2516                BUG();
2517        /* Size must be within 512 bytes and PAGE_SIZE */
2518        if (size < 512 || size > PAGE_SIZE)
2519                BUG();
2520
2521        sizebits = -1;
2522        do {
2523                sizebits++;
2524        } while ((size << sizebits) < PAGE_SIZE);
2525
2526        index = block >> sizebits;
2527        block = index << sizebits;
2528
2529        bdev = bdget(kdev_t_to_nr(dev));
2530        if (!bdev) {
2531                printk("No block device for %s\n", kdevname(dev));
2532                BUG();
2533        }
2534
2535        /* Create a page with the proper size buffers.. */
2536        page = grow_dev_page(bdev, index, size);
2537
2538        /* This is "wrong" - talk to Al Viro */
2539        atomic_dec(&bdev->bd_count);
2540        if (!page)
2541                return 0;
2542
2543        /* Hash in the buffers on the hash list */
2544        hash_page_buffers(page, dev, block, size);
2545        UnlockPage(page);
2546        page_cache_release(page);
2547
2548        /* We hashed up this page, so increment buffermem */
2549        atomic_inc(&buffermem_pages);
2550        return 1;
2551}
2552
2553/*
2554 * The first time the VM inspects a page which has locked buffers, it
2555 * will just mark it as needing waiting upon on the scan of the page LRU.
2556 * BH_Wait_IO is used for this.
2557 *
2558 * The second time the VM visits the page, if it still has locked
2559 * buffers, it is time to start writing them out.  (BH_Wait_IO was set).
2560 *
2561 * The third time the VM visits the page, if the I/O hasn't completed
2562 * then it's time to wait upon writeout.  BH_Lock and BH_Launder are
2563 * used for this.
2564 *
2565 * There is also the case of buffers which were locked by someone else
2566 * - write(2) callers, bdflush, etc.  There can be a huge number of these
2567 * and we don't want to just skip them all and fail the page allocation. 
2568 * We want to be able to wait on these buffers as well.
2569 *
2570 * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2571 * underway against the buffer, doesn't matter who started it - we know
2572 * that the buffer will eventually come unlocked, and so it's safe to
2573 * wait on it.
2574 *
2575 * The caller holds the page lock and the caller will free this page
2576 * into current->local_page, so by waiting on the page's buffers the
2577 * caller is guaranteed to obtain this page.
2578 *
2579 * sync_page_buffers() will sort-of return true if all the buffers
2580 * against this page are freeable, so try_to_free_buffers() should
2581 * try to free the page's buffers a second time.  This is a bit
2582 * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly.
2583 */
2584static int sync_page_buffers(struct buffer_head *head)
2585{
2586        struct buffer_head * bh = head;
2587        int tryagain = 1;
2588
2589        do {
2590                if (!buffer_dirty(bh) && !buffer_locked(bh))
2591                        continue;
2592
2593                /* Don't start IO first time around.. */
2594                if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2595                        tryagain = 0;
2596                        continue;
2597                }
2598
2599                /* Second time through we start actively writing out.. */
2600                if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2601                        if (unlikely(!buffer_launder(bh))) {
2602                                tryagain = 0;
2603                                continue;
2604                        }
2605                        wait_on_buffer(bh);
2606                        tryagain = 1;
2607                        continue;
2608                }
2609
2610                if (!atomic_set_buffer_clean(bh)) {
2611                        unlock_buffer(bh);
2612                        continue;
2613                }
2614
2615                __mark_buffer_clean(bh);
2616                get_bh(bh);
2617                bh->b_end_io = end_buffer_io_sync;
2618                submit_bh(WRITE, bh);
2619                tryagain = 0;
2620        } while ((bh = bh->b_this_page) != head);
2621
2622        return tryagain;
2623}
2624
/*
 * Can the buffer be thrown out?  Not while it is dirty or locked, and
 * not while its reference count is non-zero: buffer_busy() is non-zero
 * in any of those cases.
 */
#define BUFFER_BUSY_BITS        ((1<<BH_Lock) | (1<<BH_Dirty))
#define buffer_busy(bh) \
        (((bh)->b_state & BUFFER_BUSY_BITS) | atomic_read(&(bh)->b_count))
2630
2631/*
2632 * try_to_free_buffers() checks if all the buffers on this particular page
2633 * are unused, and free's the page if so.
2634 *
2635 * Wake up bdflush() if this fails - if we're running low on memory due
2636 * to dirty buffers, we need to flush them out as quickly as possible.
2637 *
2638 * NOTE: There are quite a number of ways that threads of control can
2639 *       obtain a reference to a buffer head within a page.  So we must
2640 *       lock out all of these paths to cleanly toss the page.
2641 */
2642int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2643{
2644        struct buffer_head * tmp, * bh = page->buffers;
2645
2646cleaned_buffers_try_again:
2647        spin_lock(&lru_list_lock);
2648        write_lock(&hash_table_lock);
2649        tmp = bh;
2650        do {
2651                if (buffer_busy(tmp))
2652                        goto busy_buffer_page;
2653                tmp = tmp->b_this_page;
2654        } while (tmp != bh);
2655
2656        spin_lock(&unused_list_lock);
2657        tmp = bh;
2658
2659        /* if this buffer was hashed, this page counts as buffermem */
2660        if (bh->b_pprev)
2661                atomic_dec(&buffermem_pages);
2662        do {
2663                struct buffer_head * p = tmp;
2664                tmp = tmp->b_this_page;
2665
2666                if (p->b_dev == B_FREE) BUG();
2667
2668                remove_inode_queue(p);
2669                __remove_from_queues(p);
2670                __put_unused_buffer_head(p);
2671        } while (tmp != bh);
2672        spin_unlock(&unused_list_lock);
2673
2674        /* Wake up anyone waiting for buffer heads */
2675        wake_up(&buffer_wait);
2676
2677        /* And free the page */
2678        page->buffers = NULL;
2679        page_cache_release(page);
2680        write_unlock(&hash_table_lock);
2681        spin_unlock(&lru_list_lock);
2682        return 1;
2683
2684busy_buffer_page:
2685        /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2686        write_unlock(&hash_table_lock);
2687        spin_unlock(&lru_list_lock);
2688        gfp_mask = pf_gfp_mask(gfp_mask);
2689        if (gfp_mask & __GFP_IO) {
2690                if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2691                        if (sync_page_buffers(bh)) {
2692                                /* no IO or waiting next time */
2693                                gfp_mask = 0;
2694                                goto cleaned_buffers_try_again;
2695                        }
2696                }
2697        }
2698        if (balance_dirty_state() >= 0)
2699                wakeup_bdflush();
2700        return 0;
2701}
2702EXPORT_SYMBOL(try_to_free_buffers);
2703
2704/* ================== Debugging =================== */
2705
2706void show_buffers(void)
2707{
2708#ifdef CONFIG_SMP
2709        struct buffer_head * bh;
2710        int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2711        int nlist;
2712        static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2713#endif
2714
2715        printk("Buffer memory:   %6dkB\n",
2716                        atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2717
2718        printk("Cache memory:   %6dkB\n",
2719                        (atomic_read(&page_cache_size)- atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2720
2721#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2722        if (!spin_trylock(&lru_list_lock))
2723                return;
2724        for(nlist = 0; nlist < NR_LIST; nlist++) {
2725                found = locked = dirty = used = lastused = 0;
2726                bh = lru_list[nlist];
2727                if(!bh) continue;
2728
2729                do {
2730                        found++;
2731                        if (buffer_locked(bh))
2732                                locked++;
2733                        if (buffer_dirty(bh))
2734                                dirty++;
2735                        if (atomic_read(&bh->b_count))
2736                                used++, lastused = found;
2737                        bh = bh->b_next_free;
2738                } while (bh != lru_list[nlist]);
2739                {
2740                        int tmp = nr_buffers_type[nlist];
2741                        if (found != tmp)
2742                                printk("%9s: BUG -> found %d, reported %d\n",
2743                                       buf_types[nlist], found, tmp);
2744                }
2745                printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2746                       "%d locked, %d dirty\n",
2747                       buf_types[nlist], found, size_buffers_type[nlist]>>10,
2748                       used, lastused, locked, dirty);
2749        }
2750        spin_unlock(&lru_list_lock);
2751#endif
2752}
2753
2754/* ===================== Init ======================= */
2755
2756/*
2757 * allocate the hash table and init the free list
2758 * Use gfp() for the hash table to decrease TLB misses, use
2759 * SLAB cache for buffer heads.
2760 */
2761void __init buffer_init(unsigned long mempages)
2762{
2763        int order, i;
2764        unsigned int nr_hash;
2765
2766        /* The buffer cache hash table is less important these days,
2767         * trim it a bit.
2768         */
2769        mempages >>= 14;
2770
2771        mempages *= sizeof(struct buffer_head *);
2772
2773        for (order = 0; (1 << order) < mempages; order++)
2774                ;
2775
2776        /* try to allocate something until we get it or we're asking
2777           for something that is really too small */
2778
2779        do {
2780                unsigned long tmp;
2781
2782                nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2783                bh_hash_mask = (nr_hash - 1);
2784
2785                tmp = nr_hash;
2786                bh_hash_shift = 0;
2787                while((tmp >>= 1UL) != 0UL)
2788                        bh_hash_shift++;
2789
2790                hash_table = (struct buffer_head **)
2791                    __get_free_pages(GFP_ATOMIC, order);
2792        } while (hash_table == NULL && --order > 0);
2793        printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2794               nr_hash, order, (PAGE_SIZE << order));
2795
2796        if (!hash_table)
2797                panic("Failed to allocate buffer hash table\n");
2798
2799        /* Setup hash chains. */
2800        for(i = 0; i < nr_hash; i++)
2801                hash_table[i] = NULL;
2802
2803        /* Setup lru lists. */
2804        for(i = 0; i < NR_LIST; i++)
2805                lru_list[i] = NULL;
2806
2807}
2808
2809
2810/* ====================== bdflush support =================== */
2811
2812/* This is a simple kernel daemon, whose job it is to provide a dynamic
2813 * response to dirty buffers.  Once this process is activated, we write back
2814 * a limited number of buffers to the disks and then go back to sleep again.
2815 */
2816
2817DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2818
2819void wakeup_bdflush(void)
2820{
2821        wake_up_interruptible(&bdflush_wait);
2822}
2823
2824/* 
2825 * Here we attempt to write back old buffers.  We also try to flush inodes 
2826 * and supers as well, since this function is essentially "update", and 
2827 * otherwise there would be no way of ensuring that these quantities ever 
2828 * get written back.  Ideally, we would have a timestamp on the inodes
2829 * and superblocks so that we could write back only the old ones as well
2830 */
2831
2832static int sync_old_buffers(void)
2833{
2834        lock_kernel();
2835        sync_unlocked_inodes();
2836        sync_supers(0);
2837        unlock_kernel();
2838
2839        for (;;) {
2840                struct buffer_head *bh;
2841
2842                spin_lock(&lru_list_lock);
2843                bh = lru_list[BUF_DIRTY];
2844                if (!bh || time_before(jiffies, bh->b_flushtime))
2845                        break;
2846                if (write_some_buffers(NODEV))
2847                        continue;
2848                return 0;
2849        }
2850        spin_unlock(&lru_list_lock);
2851        return 0;
2852}
2853
2854int block_sync_page(struct page *page)
2855{
2856        run_task_queue(&tq_disk);
2857        return 0;
2858}
2859
2860/* This is the interface to bdflush.  As we get more sophisticated, we can
2861 * pass tuning parameters to this "process", to adjust how it behaves. 
2862 * We would want to verify each parameter, however, to make sure that it 
2863 * is reasonable. */
2864
2865asmlinkage long sys_bdflush(int func, long data)
2866{
2867        if (!capable(CAP_SYS_ADMIN))
2868                return -EPERM;
2869
2870        if (func == 1) {
2871                /* do_exit directly and let kupdate to do its work alone. */
2872                do_exit(0);
2873#if 0 /* left here as it's the only example of lazy-mm-stuff used from
2874         a syscall that doesn't care about the current mm context. */
2875                int error;
2876                struct mm_struct *user_mm;
2877
2878                /*
2879                 * bdflush will spend all of it's time in kernel-space,
2880                 * without touching user-space, so we can switch it into
2881                 * 'lazy TLB mode' to reduce the cost of context-switches
2882                 * to and from bdflush.
2883                 */
2884                user_mm = start_lazy_tlb();
2885                error = sync_old_buffers();
2886                end_lazy_tlb(user_mm);
2887                return error;
2888#endif
2889        }
2890
2891        /* Basically func 1 means read param 1, 2 means write param 1, etc */
2892        if (func >= 2) {
2893                int i = (func-2) >> 1;
2894                if (i >= 0 && i < N_PARAM) {
2895                        if ((func & 1) == 0)
2896                                return put_user(bdf_prm.data[i], (int*)data);
2897
2898                        if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2899                                bdf_prm.data[i] = data;
2900                                return 0;
2901                        }
2902                }
2903                return -EINVAL;
2904        }
2905
2906        /* Having func 0 used to launch the actual bdflush and then never
2907         * return (unless explicitly killed). We return zero here to 
2908         * remain semi-compatible with present update(8) programs.
2909         */
2910        return 0;
2911}
2912
2913/*
2914 * This is the actual bdflush daemon itself. It used to be started from
2915 * the syscall above, but now we launch it ourselves internally with
2916 * kernel_thread(...)  directly after the first thread in init/main.c
2917 */
2918int bdflush(void *startup)
2919{
2920        struct task_struct *tsk = current;
2921
2922        /*
2923         *      We have a bare-bones task_struct, and really should fill
2924         *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2925         *      display semi-sane things. Not real crucial though...  
2926         */
2927
2928        tsk->session = 1;
2929        tsk->pgrp = 1;
2930        strcpy(tsk->comm, "bdflush");
2931
2932        /* avoid getting signals */
2933        spin_lock_irq(&tsk->sigmask_lock);
2934        flush_signals(tsk);
2935        sigfillset(&tsk->blocked);
2936        recalc_sigpending(tsk);
2937        spin_unlock_irq(&tsk->sigmask_lock);
2938
2939        complete((struct completion *)startup);
2940
2941        /*
2942         * FIXME: The ndirty logic here is wrong.  It's supposed to
2943         * send bdflush back to sleep after writing ndirty buffers.
2944         * In fact, the test is wrong so bdflush will in fact
2945         * sleep when bdflush_stop() returns true.
2946         *
2947         * FIXME: If it proves useful to implement ndirty properly,
2948         * then perhaps the value of ndirty should be scaled by the
2949         * amount of memory in the machine.
2950         */
2951        for (;;) {
2952                int ndirty = bdf_prm.b_un.ndirty;
2953
2954                CHECK_EMERGENCY_SYNC
2955
2956                while (ndirty > 0) {
2957                        spin_lock(&lru_list_lock);
2958                        if (!write_some_buffers(NODEV))
2959                                break;
2960                        ndirty -= NRSYNC;
2961                }
2962                if (ndirty > 0 || bdflush_stop())
2963                        interruptible_sleep_on(&bdflush_wait);
2964        }
2965}
2966
2967/*
2968 * This is the kernel update daemon. It was used to live in userspace
2969 * but since it's need to run safely we want it unkillable by mistake.
2970 * You don't need to change your userspace configuration since
2971 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2972 */
2973int kupdate(void *startup)
2974{
2975        struct task_struct * tsk = current;
2976        int interval;
2977
2978        tsk->session = 1;
2979        tsk->pgrp = 1;
2980        strcpy(tsk->comm, "kupdated");
2981
2982        /* sigstop and sigcont will stop and wakeup kupdate */
2983        spin_lock_irq(&tsk->sigmask_lock);
2984        sigfillset(&tsk->blocked);
2985        siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2986        recalc_sigpending(tsk);
2987        spin_unlock_irq(&tsk->sigmask_lock);
2988
2989        complete((struct completion *)startup);
2990
2991        for (;;) {
2992                /* update interval */
2993                interval = bdf_prm.b_un.interval;
2994                if (interval) {
2995                        tsk->state = TASK_INTERRUPTIBLE;
2996                        schedule_timeout(interval);
2997                } else {
2998                stop_kupdate:
2999                        tsk->state = TASK_STOPPED;
3000                        schedule(); /* wait for SIGCONT */
3001                }
3002                /* check for sigstop */
3003                if (signal_pending(tsk)) {
3004                        int stopped = 0;
3005                        spin_lock_irq(&tsk->sigmask_lock);
3006                        if (sigismember(&tsk->pending.signal, SIGSTOP)) {
3007                                sigdelset(&tsk->pending.signal, SIGSTOP);
3008                                stopped = 1;
3009                        }
3010                        recalc_sigpending(tsk);
3011                        spin_unlock_irq(&tsk->sigmask_lock);
3012                        if (stopped)
3013                                goto stop_kupdate;
3014                }
3015#ifdef DEBUG
3016                printk(KERN_DEBUG "kupdate() activated...\n");
3017#endif
3018                sync_old_buffers();
3019                run_task_queue(&tq_disk);
3020        }
3021}
3022
3023static int __init bdflush_init(void)
3024{
3025        static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
3026
3027        kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3028        wait_for_completion(&startup);
3029        kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3030        wait_for_completion(&startup);
3031        return 0;
3032}
3033
3034module_init(bdflush_init)
3035
3036
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.