linux-old/fs/buffer.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/buffer.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9 * been avoided by NEVER letting an interrupt change a buffer (except for the
  10 * data, of course), but instead letting the caller do it.
  11 */
  12
  13/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15/* Removed a lot of unnecessary code and simplified things now that
  16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17 */
  18
  19/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20 * hash table, use SLAB cache for buffer heads. -DaveM
  21 */
  22
/* Added 32k buffer block sizes - these are required by older ARM systems.
 * - RMK
 */
  26
  27/* invalidate_buffers/set_blocksize/sync_dev race conditions and
  28   fs corruption fixes, 1999, Andrea Arcangeli <andrea@suse.de> */
  29
  30/* Wait for dirty buffers to sync in sync_page_buffers.
  31 * 2000, Marcelo Tosatti <marcelo@conectiva.com.br>
  32 */
  33
  34#include <linux/malloc.h>
  35#include <linux/locks.h>
  36#include <linux/errno.h>
  37#include <linux/swap.h>
  38#include <linux/swapctl.h>
  39#include <linux/smp_lock.h>
  40#include <linux/vmalloc.h>
  41#include <linux/blkdev.h>
  42#include <linux/sysrq.h>
  43#include <linux/file.h>
  44#include <linux/init.h>
  45#include <linux/quotaops.h>
  46
  47#include <asm/uaccess.h>
  48#include <asm/io.h>
  49#include <asm/bitops.h>
  50#include <asm/pgtable.h>
  51
  52#define NR_SIZES 7
  53static char buffersize_index[65] =
  54{-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  55  4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  56  5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  57 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  58  6};
  59
  60#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  61#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
  62#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  63#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
  64                                             number of unused buffer heads */
  65
/*
 * Hash table mask..
 *
 * hash_table is the bucket array; the hash() macro further down shifts by
 * amounts derived from bh_hash_shift and masks the result with
 * bh_hash_mask to select a bucket.
 */
static unsigned int bh_hash_mask = 0;
static unsigned int bh_hash_shift = 0;
static struct buffer_head ** hash_table = NULL;

/* Defined later in the file; refill_freelist() treats a zero return as failure. */
static int grow_buffers(int size);

/*
 * Heads of the circular, doubly linked buffer rings: one LRU ring per
 * buffer type (indexed by b_list) and one free ring per buffer size
 * (indexed by BUFSIZE_INDEX(b_size)).
 */
static struct buffer_head * lru_list[NR_LIST] = {NULL, };
static struct buffer_head * free_list[NR_SIZES] = {NULL, };

/* SLAB cache for buffer heads (see the -DaveM note at the top of the file). */
static kmem_cache_t *bh_cachep;

/* NOTE(review): not referenced in this part of the file; names suggest
 * lists of spare buffer heads and a queue for tasks waiting on one. */
static struct buffer_head * unused_list = NULL;
static struct buffer_head * reuse_list = NULL;
static struct wait_queue * buffer_wait = NULL;

/*
 * Bookkeeping.  nr_buffers_type[] and size_buffers_type[] (bytes) are
 * adjusted per LRU list by insert_into_queues()/remove_from_queues();
 * nr_hashed_buffers follows hash-queue insertions and removals.
 */
static int nr_buffers = 0;
static int nr_buffers_type[NR_LIST] = {0,};
static unsigned long size_buffers_type[NR_LIST];
static int nr_buffer_heads = 0;
static int nr_unused_buffer_heads = 0;
static int nr_hashed_buffers = 0;

/* This is used by some architectures to estimate available memory. */
long buffermem = 0;
  93
/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

/* Number of tunables; must match the field count of b_un below. */
#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 *
 * The union overlays the named fields with data[], so data[i] refers to
 * the i-th field of b_un in declaration order.  The initializer of
 * bdf_prm is positional and follows that same order.
 */
union bdflush_param{
        struct {
                int nfract;  /* Percentage of buffer cache dirty to 
                                activate bdflush */
                int ndirty;  /* Maximum number of dirty blocks to write out per
                                wake-cycle */
                int nrefill; /* Number of clean buffers to try to obtain
                                each time we call refill */
                int nref_dirt; /* Dirty buffer threshold for activating bdflush
                                  when trying to refill buffers. */
                int interval; /* jiffies delay between kupdate flushes */
                int age_buffer;  /* Time for normal buffer to age before 
                                    we flush it */
                int age_super;  /* Time for superblock to age before we 
                                   flush it */
                int dummy2;    /* unused */
                int dummy3;    /* unused */
        } b_un;
        unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};

/* These are the min and max parameter values that we will allow to be assigned.
 * Entry i of each array bounds parameter i, in the field order of b_un:
 * nfract, ndirty, nrefill, nref_dirt, interval, age_buffer, age_super,
 * dummy2, dummy3.
 */
int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,60*HZ, 600*HZ, 600*HZ, 2047, 5};

/* Forward declaration; called below by refill_freelist(), balance_dirty()
 * and refile_buffer().  Judging by balance_dirty_state(), the argument is
 * 0 for an asynchronous flush and 1 for a synchronous one.
 */
void wakeup_bdflush(int);
 129
 130/*
 131 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 132 * and getting rid of the cli-sti pairs. The wait-queue routines still
 133 * need cli-sti, but now it's just a couple of 386 instructions or so.
 134 *
 135 * Note that the real wait_on_buffer() is an inline function that checks
 136 * if 'b_wait' is set before calling this, so that the queues aren't set
 137 * up unnecessarily.
 138 */
 139void __wait_on_buffer(struct buffer_head * bh)
 140{
 141        struct task_struct *tsk = current;
 142        struct wait_queue wait;
 143
 144        bh->b_count++;
 145        wait.task = tsk;
 146        add_wait_queue(&bh->b_wait, &wait);
 147repeat:
 148        tsk->state = TASK_UNINTERRUPTIBLE;
 149        run_task_queue(&tq_disk);
 150        if (buffer_locked(bh)) {
 151                schedule();
 152                goto repeat;
 153        }
 154        tsk->state = TASK_RUNNING;
 155        remove_wait_queue(&bh->b_wait, &wait);
 156        bh->b_count--;
 157}
 158
/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */

/*
 * Write out the dirty buffers of a device and optionally wait for them.
 *
 * dev:  device whose buffers are considered; 0 matches every device.
 * wait: zero for a single pass that only starts writes; non-zero to run
 *       up to three passes and wait for locked buffers.
 *
 * Returns 0, or -EIO if (with wait set) a buffer on the dirty list was
 * found requested, unlocked, clean and yet not uptodate.
 */
static int sync_buffers(kdev_t dev, int wait)
{
        int i, retry, pass = 0, err = 0;
        struct buffer_head * bh, *next;

        /* One pass for no-wait, three for wait:
         * 0) write out all dirty, unlocked buffers;
         * 1) write out all dirty buffers, waiting if locked;
         * 2) wait for completion by waiting for all buffers to unlock.
         */
        do {
                retry = 0;
repeat:
                /* We search all lists as a failsafe mechanism, not because we expect
                 * there to be dirty buffers on any of the other lists.
                 */
                bh = lru_list[BUF_DIRTY];
                if (!bh)
                        goto repeat2;
                /* The walk is capped at twice the list length; presumably so
                 * that a ring which changes under us cannot keep us here forever.
                 */
                for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
                        /* bh has left the dirty list: restart from its head. */
                        if (bh->b_list != BUF_DIRTY)
                                goto repeat;
                        next = bh->b_next_free;
                        if (!lru_list[BUF_DIRTY])
                                break;
                        if (dev && bh->b_dev != dev)
                                continue;
                        if (buffer_locked(bh)) {
                                /* Buffer is locked; skip it unless wait is
                                 * requested AND pass > 0.
                                 */
                                if (!wait || !pass) {
                                        retry = 1;
                                        continue;
                                }
                                /* We may have slept, so rescan from the head. */
                                wait_on_buffer (bh);
                                goto repeat;
                        }

                        /* If an unlocked buffer is not uptodate, there has
                         * been an IO error. Skip it.
                         */
                        if (wait && buffer_req(bh) && !buffer_locked(bh) &&
                            !buffer_dirty(bh) && !buffer_uptodate(bh)) {
                                err = -EIO;
                                continue;
                        }

                        /* Don't write clean buffers.  Don't write ANY buffers
                         * on the third pass.
                         */
                        if (!buffer_dirty(bh) || pass >= 2)
                                continue;

                        /* Don't bother about locked buffers.
                         *
                         * XXX We checked if it was locked above and there is no
                         * XXX way we could have slept in between. -DaveM
                         */
                        if (buffer_locked(bh))
                                continue;
                        /* Hold a reference on both bh and its successor across
                         * the write request; presumably so that 'next' is still
                         * valid when the loop advances to it.
                         */
                        bh->b_count++;
                        next->b_count++;
                        bh->b_flushtime = 0;
                        ll_rw_block(WRITE, 1, &bh);
                        bh->b_count--;
                        next->b_count--;
                        retry = 1;
                }

    repeat2:
                /* Second list: buffers on the locked list are only waited
                 * for here, never written.
                 */
                bh = lru_list[BUF_LOCKED];
                if (!bh)
                        break;
                for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
                        /* bh has left the locked list: restart from its head. */
                        if (bh->b_list != BUF_LOCKED)
                                goto repeat2;
                        next = bh->b_next_free;
                        if (!lru_list[BUF_LOCKED])
                                break;
                        if (dev && bh->b_dev != dev)
                                continue;
                        if (buffer_locked(bh)) {
                                /* Buffer is locked; skip it unless wait is
                                 * requested AND pass > 0.
                                 */
                                if (!wait || !pass) {
                                        retry = 1;
                                        continue;
                                }
                                wait_on_buffer (bh);
                                goto repeat2;
                        }
                }

                /* If we are waiting for the sync to succeed, and if any dirty
                 * blocks were written, then repeat; on the final pass, only
                 * wait for buffers being written (do not try to write any
                 * more buffers on that pass).
                 */
        } while (wait && retry && ++pass<=2);
        return err;
}
 272
/*
 * Push the cached state of a device towards disk without waiting for it.
 *
 * Superblocks, inodes and quota data are synced first, then the dirty
 * buffers are submitted.  sync_buffers() is called with wait == 0, so
 * the writes may still be in flight when this returns.
 */
void sync_dev(kdev_t dev)
{
        sync_supers(dev);
        sync_inodes(dev);
        DQUOT_SYNC(dev);
        /* sync all the dirty buffers out to disk only _after_ all the
           high level layers finished generated buffer dirty data
           (or we'll return with some buffer still dirty on the blockdevice
           so breaking the semantics of this call) */
        sync_buffers(dev, 0);
        /*
         * FIXME(eric) we need to sync the physical devices here.
         * This is because some (scsi) controllers have huge amounts of
         * cache onboard (hundreds of Mb), and we need to instruct
         * them to commit all of the dirty memory to disk, and we should
         * not return until this has happened.
         *
         * This would need to get implemented by going through the assorted
         * layers so that each block major number can be synced, and this
         * would call down into the upper and mid-layer scsi.
         */
}
 295
/*
 * Sync a device and wait for the buffer writes to complete.
 *
 * Returns the result of the final, waiting sync_buffers() call:
 * 0 on success or -EIO.
 */
int fsync_dev(kdev_t dev)
{
        /* Start writing what is already dirty, without waiting. */
        sync_buffers(dev, 0);
        /* Sync the higher layers; this may dirty more buffers. */
        sync_supers(dev);
        sync_inodes(dev);
        DQUOT_SYNC(dev);
        /* Write whatever is dirty now and wait for it. */
        return sync_buffers(dev, 1);
}
 304
/*
 * sync system call: runs fsync_dev() with dev 0 (which sync_buffers()
 * treats as "every device") under the big kernel lock.
 *
 * Always returns 0; the result of fsync_dev() is discarded.
 */
asmlinkage int sys_sync(void)
{
        lock_kernel();
        fsync_dev(0);
        unlock_kernel();
        return 0;
}
 312
 313/*
 314 *      filp may be NULL if called via the msync of a vma.
 315 */
 316 
 317int file_fsync(struct file *filp, struct dentry *dentry)
 318{
 319        struct inode * inode = dentry->d_inode;
 320        struct super_block * sb;
 321        kdev_t dev;
 322
 323        /* sync the inode to buffers */
 324        write_inode_now(inode);
 325
 326        /* sync the superblock to buffers */
 327        sb = inode->i_sb;
 328        wait_on_super(sb);
 329        if (sb->s_op && sb->s_op->write_super)
 330                sb->s_op->write_super(sb);
 331
 332        /* .. finally sync the buffers to disk */
 333        dev = inode->i_dev;
 334        return sync_buffers(dev, 1);
 335}
 336
 337asmlinkage int sys_fsync(unsigned int fd)
 338{
 339        struct file * file;
 340        struct dentry * dentry;
 341        struct inode * inode;
 342        int err;
 343
 344        lock_kernel();
 345        err = -EBADF;
 346        file = fget(fd);
 347        if (!file)
 348                goto out;
 349
 350        dentry = file->f_dentry;
 351        if (!dentry)
 352                goto out_putf;
 353
 354        inode = dentry->d_inode;
 355        if (!inode)
 356                goto out_putf;
 357
 358        err = -EINVAL;
 359        if (!file->f_op || !file->f_op->fsync)
 360                goto out_putf;
 361
 362        /* We need to protect against concurrent writers.. */
 363        fs_down(&inode->i_sem);
 364        err = file->f_op->fsync(file, dentry);
 365        fs_up(&inode->i_sem);
 366
 367out_putf:
 368        fput(file);
 369out:
 370        unlock_kernel();
 371        return err;
 372}
 373
 374asmlinkage int sys_fdatasync(unsigned int fd)
 375{
 376        struct file * file;
 377        struct dentry * dentry;
 378        struct inode * inode;
 379        int err;
 380
 381        lock_kernel();
 382        err = -EBADF;
 383        file = fget(fd);
 384        if (!file)
 385                goto out;
 386
 387        dentry = file->f_dentry;
 388        if (!dentry)
 389                goto out_putf;
 390
 391        inode = dentry->d_inode;
 392        if (!inode)
 393                goto out_putf;
 394
 395        err = -EINVAL;
 396        if (!file->f_op || !file->f_op->fsync)
 397                goto out_putf;
 398
 399        /* this needs further work, at the moment it is identical to fsync() */
 400        fs_down(&inode->i_sem);
 401        err = file->f_op->fsync(file, dentry);
 402        fs_up(&inode->i_sem);
 403
 404out_putf:
 405        fput(file);
 406out:
 407        unlock_kernel();
 408        return err;
 409}
 410
/* After several hours of tedious analysis, the following hash
 * function won.  Do not mess with it... -DaveM
 *
 * _hashfn() mixes shifted copies of the device and block numbers; the
 * shift amounts are relative to bh_hash_shift.
 * NOTE(review): the expressions subtract up to 12 from bh_hash_shift,
 * which is unsigned and starts at 0 - presumably it is set to a large
 * enough value before the first lookup; confirm at the initialization
 * site.
 *
 * hash() masks the result with bh_hash_mask and yields the bucket head
 * in hash_table as an lvalue (insert_into_queues() takes its address).
 * Both macros evaluate their arguments more than once.
 */
#define _hashfn(dev,block)      \
        ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
         (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
#define hash(dev,block) hash_table[_hashfn(dev,block) & bh_hash_mask]
 418
 419static inline void remove_from_hash_queue(struct buffer_head * bh)
 420{
 421        struct buffer_head **pprev = bh->b_pprev;
 422        if (pprev) {
 423                struct buffer_head * next = bh->b_next;
 424                if (next) {
 425                        next->b_pprev = pprev;
 426                        bh->b_next = NULL;
 427                }
 428                *pprev = next;
 429                bh->b_pprev = NULL;
 430                nr_hashed_buffers--;
 431        }
 432}
 433
 434static inline void remove_from_lru_list(struct buffer_head * bh)
 435{
 436        if (!(bh->b_prev_free) || !(bh->b_next_free))
 437                panic("VFS: LRU block list corrupted");
 438        if (bh->b_dev == B_FREE)
 439                panic("LRU list corrupted");
 440        bh->b_prev_free->b_next_free = bh->b_next_free;
 441        bh->b_next_free->b_prev_free = bh->b_prev_free;
 442
 443        if (lru_list[bh->b_list] == bh)
 444                 lru_list[bh->b_list] = bh->b_next_free;
 445        if (lru_list[bh->b_list] == bh)
 446                 lru_list[bh->b_list] = NULL;
 447        bh->b_next_free = bh->b_prev_free = NULL;
 448}
 449
 450static inline void remove_from_free_list(struct buffer_head * bh)
 451{
 452        int isize = BUFSIZE_INDEX(bh->b_size);
 453        if (!(bh->b_prev_free) || !(bh->b_next_free))
 454                panic("VFS: Free block list corrupted");
 455        if(bh->b_dev != B_FREE)
 456                panic("Free list corrupted");
 457        if(!free_list[isize])
 458                panic("Free list empty");
 459        if(bh->b_next_free == bh)
 460                 free_list[isize] = NULL;
 461        else {
 462                bh->b_prev_free->b_next_free = bh->b_next_free;
 463                bh->b_next_free->b_prev_free = bh->b_prev_free;
 464                if (free_list[isize] == bh)
 465                         free_list[isize] = bh->b_next_free;
 466        }
 467        bh->b_next_free = bh->b_prev_free = NULL;
 468}
 469
 470static void remove_from_queues(struct buffer_head * bh)
 471{
 472        if(bh->b_dev == B_FREE) {
 473                remove_from_free_list(bh); /* Free list entries should not be
 474                                              in the hash queue */
 475                return;
 476        }
 477        nr_buffers_type[bh->b_list]--;
 478        size_buffers_type[bh->b_list] -= bh->b_size;
 479        remove_from_hash_queue(bh);
 480        remove_from_lru_list(bh);
 481}
 482
 483static void put_last_free(struct buffer_head * bh)
 484{
 485        if (bh) {
 486                struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
 487
 488                bh->b_count = 0;
 489                bh->b_state = 0;
 490                remove_from_queues(bh);
 491                bh->b_dev = B_FREE;  /* So it is obvious we are on the free list. */
 492
 493                /* Add to back of free list. */
 494                if(!*bhp) {
 495                        *bhp = bh;
 496                        bh->b_prev_free = bh;
 497                }
 498
 499                bh->b_next_free = *bhp;
 500                bh->b_prev_free = (*bhp)->b_prev_free;
 501                (*bhp)->b_prev_free->b_next_free = bh;
 502                (*bhp)->b_prev_free = bh;
 503        }
 504}
 505
 506static void insert_into_queues(struct buffer_head * bh)
 507{
 508        /* put at end of free list */
 509        if(bh->b_dev == B_FREE) {
 510                panic("B_FREE inserted into queues");
 511        } else {
 512                struct buffer_head **bhp = &lru_list[bh->b_list];
 513
 514                if(!*bhp) {
 515                        *bhp = bh;
 516                        bh->b_prev_free = bh;
 517                }
 518
 519                if (bh->b_next_free)
 520                        panic("VFS: buffer LRU pointers corrupted");
 521
 522                bh->b_next_free = *bhp;
 523                bh->b_prev_free = (*bhp)->b_prev_free;
 524                (*bhp)->b_prev_free->b_next_free = bh;
 525                (*bhp)->b_prev_free = bh;
 526
 527                nr_buffers_type[bh->b_list]++;
 528                size_buffers_type[bh->b_list] += bh->b_size;
 529
 530                /* Put the buffer in new hash-queue if it has a device. */
 531                bh->b_next = NULL;
 532                bh->b_pprev = NULL;
 533                if (bh->b_dev) {
 534                        struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
 535                        struct buffer_head *next = *bhp;
 536
 537                        if (next) {
 538                                bh->b_next = next;
 539                                next->b_pprev = &bh->b_next;
 540                        }
 541                        *bhp = bh;
 542                        bh->b_pprev = bhp;
 543                        nr_hashed_buffers++;
 544                }
 545        }
 546}
 547
 548struct buffer_head * find_buffer(kdev_t dev, int block, int size)
 549{               
 550        struct buffer_head * next;
 551
 552        next = hash(dev,block);
 553        for (;;) {
 554                struct buffer_head *tmp = next;
 555                if (!next)
 556                        break;
 557                next = tmp->b_next;
 558                if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
 559                        continue;
 560                next = tmp;
 561                break;
 562        }
 563        return next;
 564}
 565
 566/*
 567 * Why like this, I hear you say... The reason is race-conditions.
 568 * As we don't lock buffers (unless we are reading them, that is),
 569 * something might happen to it while we sleep (ie a read-error
 570 * will force it bad). This shouldn't really happen currently, but
 571 * the code is ready.
 572 */
 573struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 574{
 575        struct buffer_head * bh;
 576        bh = find_buffer(dev,block,size);
 577        if (bh) {
 578                bh->b_count++;
 579                touch_buffer(bh);
 580        }
 581        return bh;
 582}
 583
 584unsigned int get_hardblocksize(kdev_t dev)
 585{
 586        /*
 587         * Get the hard sector size for the given device.  If we don't know
 588         * what it is, return 0.
 589         */
 590        if (hardsect_size[MAJOR(dev)] != NULL) {
 591                int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 592                if (blksize != 0)
 593                        return blksize;
 594        }
 595
 596        /*
 597         * We don't know what the hardware sector size for this device is.
 598         * Return 0 indicating that we don't know.
 599         */
 600        return 0;
 601}
 602
/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash dirty
   buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.

   NOTE: In the case where the user removed a removable-media-disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or due to an error of the user), by not destroying the dirty
   buffers we could generate corruption also on the next media inserted,
   thus a parameter is necessary to handle this case in the safest way
   possible (trying to not corrupt also the new disk inserted with the data
   belonging to the old now corrupted disk). Also for the ramdisk the
   natural thing to do in order to release the ramdisk memory is to destroy
   dirty buffers.

   These are two special cases. Normal usage implies the device driver
   issues a sync on the device (without waiting for I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers.

   dev:                   device whose buffers are invalidated.
   destroy_dirty_buffers: non-zero to free dirty buffers as well.

   Buffers with a non-zero b_count are always left alone. */
void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
{
        int i, nlist, slept;
        struct buffer_head * bh, * bhnext;

 again:
        slept = 0;
        for(nlist = 0; nlist < NR_LIST; nlist++) {
                bh = lru_list[nlist];
                if (!bh)
                        continue;
                for (i = nr_buffers_type[nlist] ; i > 0 ;
                     bh = bhnext, i--)
                {
                        /* Fetch the successor first: put_last_free() below
                           unlinks bh from this ring. */
                        bhnext = bh->b_next_free;
                        if (bh->b_dev != dev)
                                continue;
                        if (buffer_locked(bh))
                        {
                                slept = 1;
                                __wait_on_buffer(bh);
                        }
                        if (!bh->b_count &&
                            (destroy_dirty_buffers || !buffer_dirty(bh)))
                                put_last_free(bh);
                        /* The lists may have changed while we waited, so
                           bhnext cannot be trusted: rescan from the start. */
                        if (slept)
                                goto again;
                }
        }
}
 653
/*
 * Change the block size recorded for a device and get rid of cached
 * buffers of any other size.
 *
 * dev:  device to change.
 * size: new block size; must be a power of two between 512 and
 *       PAGE_SIZE, otherwise the kernel panics.
 *
 * Does nothing when the device's major has no blksize_size table or the
 * size is unchanged.  Otherwise the device is synced, the new size is
 * recorded, and every buffer of the device with a different size is
 * either freed (when unused) or marked clean and not uptodate, with a
 * warning.
 */
void set_blocksize(kdev_t dev, int size)
{
        extern int *blksize_size[];
        int i, nlist, slept;
        struct buffer_head * bh, *bhnext;

        if (!blksize_size[MAJOR(dev)])
                return;

        /* Size must be a power of two, and between 512 and PAGE_SIZE */
        if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
                panic("Invalid blocksize passed to set_blocksize");

        /* No size recorded yet and BLOCK_SIZE requested: just record it,
         * without syncing or scanning the buffer lists.
         */
        if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
                blksize_size[MAJOR(dev)][MINOR(dev)] = size;
                return;
        }
        if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
                return;
        /* sync_buffers() only tests wait for non-zero, so 2 acts like 1. */
        sync_buffers(dev, 2);
        blksize_size[MAJOR(dev)][MINOR(dev)] = size;

        /* We need to be quite careful how we do this - we are moving entries
         * around on the free list, and we can get in a loop if we are not careful.
         */
 again:
        slept = 0;
        for(nlist = 0; nlist < NR_LIST; nlist++) {
                bh = lru_list[nlist];
                if (!bh)
                        continue;
                for (i = nr_buffers_type[nlist] ; i > 0 ;
                     bh = bhnext, i--)
                {
                        /* Fetch the successor first: put_last_free() below
                         * unlinks bh from this ring.
                         */
                        bhnext = bh->b_next_free;
                        if (bh->b_dev != dev || bh->b_size == size)
                                continue;
                        if (buffer_locked(bh))
                        {
                                slept = 1;
                                wait_on_buffer(bh);
                        }
                        if (buffer_dirty(bh))
                                printk(KERN_WARNING "set_blocksize: dev %s buffer_dirty %lu size %lu\n", kdevname(dev), bh->b_blocknr, bh->b_size);
                        if (!bh->b_count)
                                put_last_free(bh);
                        else
                        {
                                /* Still referenced: it cannot be freed, so
                                 * strip its dirty, uptodate and requested
                                 * state and report who called us.
                                 */
                                mark_buffer_clean(bh);
                                clear_bit(BH_Uptodate, &bh->b_state);
                                clear_bit(BH_Req, &bh->b_state);
                                printk(KERN_WARNING
                                       "set_blocksize: "
                                       "b_count %d, dev %s, block %lu, from %p\n",
                                       bh->b_count, bdevname(bh->b_dev),
                                       bh->b_blocknr, __builtin_return_address(0));
                        }
                        /* The lists may have changed while we waited:
                         * rescan from the start.
                         */
                        if (slept)
                                goto again;
                }
        }
}
 716
 717/*
 718 * We used to try various strange things. Let's not.
 719 */
 720static void refill_freelist(int size)
 721{
 722        if (!grow_buffers(size)) {
 723                wakeup_bdflush(1);
 724                current->policy |= SCHED_YIELD;
 725                current->state = TASK_RUNNING;
 726                schedule();
 727        }
 728}
 729
 730void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
 731                 bh_end_io_t *handler, void *dev_id)
 732{
 733        bh->b_count = 1;
 734        bh->b_list = BUF_CLEAN;
 735        bh->b_flushtime = 0;
 736        bh->b_dev = dev;
 737        bh->b_blocknr = block;
 738        bh->b_end_io = handler;
 739        bh->b_dev_id = dev_id;
 740}
 741
/*
 * I/O completion handler that getblk() installs through init_buffer().
 *
 * bh:       buffer whose I/O has finished.
 * uptodate: passed on to mark_buffer_uptodate(); presumably non-zero
 *           when the I/O succeeded.
 */
static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        /* Record the outcome before releasing the lock. */
        mark_buffer_uptodate(bh, uptodate);
        unlock_buffer(bh);
}
 747
 748/*
 749 * Ok, this is getblk, and it isn't very clear, again to hinder
 750 * race-conditions. Most of the code is seldom used, (ie repeating),
 751 * so it should be much more efficient than it looks.
 752 *
 753 * The algorithm is changed: hopefully better, and an elusive bug removed.
 754 *
 755 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 756 * when the filesystem starts to get full of dirty blocks (I hope).
 757 */
 758struct buffer_head * getblk(kdev_t dev, int block, int size)
 759{
 760        struct buffer_head * bh;
 761        int isize;
 762
 763repeat:
 764        bh = get_hash_table(dev, block, size);
 765        if (bh) {
 766                if (!buffer_dirty(bh)) {
 767                        bh->b_flushtime = 0;
 768                }
 769                return bh;
 770        }
 771
 772        isize = BUFSIZE_INDEX(size);
 773get_free:
 774        bh = free_list[isize];
 775        if (!bh)
 776                goto refill;
 777        remove_from_free_list(bh);
 778
 779        /* OK, FINALLY we know that this buffer is the only one of its kind,
 780         * and that it's unused (b_count=0), unlocked, and clean.
 781         */
 782        init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
 783        bh->b_state=0;
 784        insert_into_queues(bh);
 785        return bh;
 786
 787        /*
 788         * If we block while refilling the free list, somebody may
 789         * create the buffer first ... search the hashes again.
 790         */
 791refill:
 792        refill_freelist(size);
 793        if (!find_buffer(dev,block,size))
 794                goto get_free;
 795        goto repeat;
 796}
 797
 798void set_writetime(struct buffer_head * buf, int flag)
 799{
 800        int newtime;
 801
 802        if (buffer_dirty(buf)) {
 803                /* Move buffer to dirty list if jiffies is clear. */
 804                newtime = jiffies + (flag ? bdf_prm.b_un.age_super : 
 805                                     bdf_prm.b_un.age_buffer);
 806                if(!buf->b_flushtime || buf->b_flushtime > newtime)
 807                         buf->b_flushtime = newtime;
 808        } else {
 809                buf->b_flushtime = 0;
 810        }
 811}
 812
 813
/*
 * Put a buffer into the appropriate list, without side-effects.
 *
 * bh:   buffer to move; it must currently be on the lists described by
 *       its b_dev/b_list fields, since it is removed from them first.
 * list: index of the LRU list to move it to.
 */
static inline void file_buffer(struct buffer_head *bh, int list)
{
        remove_from_queues(bh);
        /* b_list selects the ring that insert_into_queues() appends to. */
        bh->b_list = list;
        insert_into_queues(bh);
}
 823
 824/* -1 -> no need to flush
 825    0 -> async flush
 826    1 -> sync flush (wait for I/O completation) */
 827static int balance_dirty_state(kdev_t dev)
 828{
 829        unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
 830
 831        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
 832        tot = (buffermem >> PAGE_SHIFT) + nr_free_pages;
 833        tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
 834
 835        dirty *= 200;
 836        soft_dirty_limit = tot * bdf_prm.b_un.nfract;
 837        hard_dirty_limit = soft_dirty_limit * 2;
 838
 839        if (dirty > soft_dirty_limit)
 840        {
 841                if (dirty > hard_dirty_limit)
 842                        return 1;
 843                return 0;
 844        }
 845        return -1;
 846}
 847
 848/*
 849 * if a new dirty buffer is created we need to balance bdflush.
 850 *
 851 * in the future we might want to make bdflush aware of different
 852 * pressures on different devices - thus the (currently unused)
 853 * 'dev' parameter.
 854 */
 855void balance_dirty(kdev_t dev)
 856{
 857        int state = balance_dirty_state(dev);
 858
 859        if (state < 0)
 860                return;
 861        wakeup_bdflush(state);
 862}
 863
 864/*
 865 * A buffer may need to be moved from one buffer list to another
 866 * (e.g. in case it is not shared any more). Handle this.
 867 */
 868void refile_buffer(struct buffer_head * buf)
 869{
 870        int dispose;
 871
 872        if(buf->b_dev == B_FREE) {
 873                printk("Attempt to refile free buffer\n");
 874                return;
 875        }
 876        if (buffer_protected(buf))
 877                dispose = BUF_PROTECTED;
 878        else if (buffer_dirty(buf))
 879                dispose = BUF_DIRTY;
 880        else if (buffer_locked(buf))
 881                dispose = BUF_LOCKED;
 882        else
 883                dispose = BUF_CLEAN;
 884        if(dispose != buf->b_list) {
 885                file_buffer(buf, dispose);
 886                if(dispose == BUF_DIRTY) {
 887                        balance_dirty(buf->b_dev);
 888
 889                        /* If this is a loop device, and
 890                         * more than half of the buffers are dirty...
 891                         * (Prevents no-free-buffers deadlock with loop device.)
 892                         */
 893                        if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
 894                            nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
 895                                wakeup_bdflush(1);
 896                }
 897        }
 898}
 899
 900/*
 901 * Release a buffer head
 902 */
 903void __brelse(struct buffer_head * buf)
 904{
 905        /* If dirty, mark the time this buffer should be written back. */
 906        set_writetime(buf, 0);
 907        refile_buffer(buf);
 908
 909        if (buf->b_count) {
 910                buf->b_count--;
 911                return;
 912        }
 913        printk("VFS: brelse: Trying to free free buffer\n");
 914}
 915
 916/*
 917 * bforget() is like brelse(), except it puts the buffer on the
 918 * free list if it can.. We can NOT free the buffer if:
 919 *  - there are other users of it
 920 *  - it is locked and thus can have active IO
 921 */
 922void __bforget(struct buffer_head * buf)
 923{
 924        if (buf->b_count != 1 || buffer_locked(buf)) {
 925                __brelse(buf);
 926                return;
 927        }
 928        put_last_free(buf);
 929}
 930
 931/*
 932 * bread() reads a specified block and returns the buffer that contains
 933 * it. It returns NULL if the block was unreadable.
 934 */
 935struct buffer_head * bread(kdev_t dev, int block, int size)
 936{
 937        struct buffer_head * bh;
 938
 939        bh = getblk(dev, block, size);
 940        if (buffer_uptodate(bh))
 941                return bh;
 942        ll_rw_block(READ, 1, &bh);
 943        wait_on_buffer(bh);
 944        if (buffer_uptodate(bh))
 945                return bh;
 946        brelse(bh);
 947        return NULL;
 948}
 949
 950/*
 951 * Ok, breada can be used as bread, but additionally to mark other
 952 * blocks for reading as well. End the argument list with a negative
 953 * number.
 954 */
 955
 956#define NBUF 16
 957
 958struct buffer_head * breada(kdev_t dev, int block, int bufsize,
 959        unsigned int pos, unsigned int filesize)
 960{
 961        struct buffer_head * bhlist[NBUF];
 962        unsigned int blocks;
 963        struct buffer_head * bh;
 964        int index;
 965        int i, j;
 966
 967        if (pos >= filesize)
 968                return NULL;
 969
 970        if (block < 0)
 971                return NULL;
 972
 973        bh = getblk(dev, block, bufsize);
 974        index = BUFSIZE_INDEX(bh->b_size);
 975
 976        if (buffer_uptodate(bh))
 977                return(bh);   
 978        else ll_rw_block(READ, 1, &bh);
 979
 980        blocks = (filesize - pos) >> (9+index);
 981
 982        if (blocks < (read_ahead[MAJOR(dev)] >> index))
 983                blocks = read_ahead[MAJOR(dev)] >> index;
 984        if (blocks > NBUF) 
 985                blocks = NBUF;
 986
 987/*      if (blocks) printk("breada (new) %d blocks\n",blocks); */
 988
 989
 990        bhlist[0] = bh;
 991        j = 1;
 992        for(i=1; i<blocks; i++) {
 993                bh = getblk(dev,block+i,bufsize);
 994                if (buffer_uptodate(bh)) {
 995                        brelse(bh);
 996                        break;
 997                }
 998                else bhlist[j++] = bh;
 999        }
1000
1001        /* Request the read for these buffers, and then release them. */
1002        if (j>1)  
1003                ll_rw_block(READA, (j-1), bhlist+1); 
1004        for(i=1; i<j; i++)
1005                brelse(bhlist[i]);
1006
1007        /* Wait for this buffer, and then continue on. */
1008        bh = bhlist[0];
1009        wait_on_buffer(bh);
1010        if (buffer_uptodate(bh))
1011                return bh;
1012        brelse(bh);
1013        return NULL;
1014}
1015
1016/*
1017 * Note: the caller should wake up the buffer_wait list if needed.
1018 */
1019static void put_unused_buffer_head(struct buffer_head * bh)
1020{
1021        if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1022                nr_buffer_heads--;
1023                kmem_cache_free(bh_cachep, bh);
1024                return;
1025        }
1026
1027        memset(bh,0,sizeof(*bh));
1028        nr_unused_buffer_heads++;
1029        bh->b_next_free = unused_list;
1030        unused_list = bh;
1031}
1032
1033/* 
1034 * We can't put completed temporary IO buffer_heads directly onto the
1035 * unused_list when they become unlocked, since the device driver
1036 * end_request routines still expect access to the buffer_head's
1037 * fields after the final unlock.  So, the device driver puts them on
1038 * the reuse_list instead once IO completes, and we recover these to
1039 * the unused_list here.
1040 *
1041 * Note that we don't do a wakeup here, but return a flag indicating
1042 * whether we got any buffer heads. A task ready to sleep can check
1043 * the returned value, and any tasks already sleeping will have been
1044 * awakened when the buffer heads were added to the reuse list.
1045 */
1046static inline int recover_reusable_buffer_heads(void)
1047{
1048        struct buffer_head *head = xchg(&reuse_list, NULL);
1049        int found = 0;
1050        
1051        if (head) {
1052                do {
1053                        struct buffer_head *bh = head;
1054                        head = head->b_next_free;
1055                        put_unused_buffer_head(bh);
1056                } while (head);
1057                found = 1;
1058        }
1059        return found;
1060}
1061
1062/*
1063 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1064 * no-buffer-head deadlock.  Return NULL on failure; waiting for
1065 * buffer heads is now handled in create_buffers().
1066 */ 
1067static struct buffer_head * get_unused_buffer_head(int async)
1068{
1069        struct buffer_head * bh;
1070
1071        recover_reusable_buffer_heads();
1072        if (nr_unused_buffer_heads > NR_RESERVED) {
1073                bh = unused_list;
1074                unused_list = bh->b_next_free;
1075                nr_unused_buffer_heads--;
1076                return bh;
1077        }
1078
1079        /* This is critical.  We can't swap out pages to get
1080         * more buffer heads, because the swap-out may need
1081         * more buffer-heads itself.  Thus SLAB_BUFFER.
1082         */
1083        if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1084                memset(bh, 0, sizeof(*bh));
1085                nr_buffer_heads++;
1086                return bh;
1087        }
1088
1089        /*
1090         * If we need an async buffer, use the reserved buffer heads.
1091         */
1092        if (async && unused_list) {
1093                bh = unused_list;
1094                unused_list = bh->b_next_free;
1095                nr_unused_buffer_heads--;
1096                return bh;
1097        }
1098
1099#if 0
1100        /*
1101         * (Pending further analysis ...)
1102         * Ordinary (non-async) requests can use a different memory priority
1103         * to free up pages. Any swapping thus generated will use async
1104         * buffer heads.
1105         */
1106        if(!async &&
1107           (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1108                memset(bh, 0, sizeof(*bh));
1109                nr_buffer_heads++;
1110                return bh;
1111        }
1112#endif
1113
1114        return NULL;
1115}
1116
1117/*
1118 * Create the appropriate buffers when given a page for data area and
1119 * the size of each buffer.. Use the bh->b_this_page linked list to
1120 * follow the buffers created.  Return NULL if unable to create more
1121 * buffers.
1122 * The async flag is used to differentiate async IO (paging, swapping)
1123 * from ordinary buffer allocations, and only async requests are allowed
1124 * to sleep waiting for buffer heads. 
1125 */
1126static struct buffer_head * create_buffers(unsigned long page, 
1127                                                unsigned long size, int async)
1128{
1129        struct wait_queue wait = { current, NULL };
1130        struct buffer_head *bh, *head;
1131        long offset;
1132
1133try_again:
1134        head = NULL;
1135        offset = PAGE_SIZE;
1136        while ((offset -= size) >= 0) {
1137                bh = get_unused_buffer_head(async);
1138                if (!bh)
1139                        goto no_grow;
1140
1141                bh->b_dev = B_FREE;  /* Flag as unused */
1142                bh->b_this_page = head;
1143                head = bh;
1144
1145                bh->b_state = 0;
1146                bh->b_next_free = NULL;
1147                bh->b_count = 0;
1148                bh->b_size = size;
1149
1150                bh->b_data = (char *) (page+offset);
1151                bh->b_list = 0;
1152        }
1153        return head;
1154/*
1155 * In case anything failed, we just free everything we got.
1156 */
1157no_grow:
1158        if (head) {
1159                do {
1160                        bh = head;
1161                        head = head->b_this_page;
1162                        put_unused_buffer_head(bh);
1163                } while (head);
1164
1165                /* Wake up any waiters ... */
1166                wake_up(&buffer_wait);
1167        }
1168
1169        /*
1170         * Return failure for non-async IO requests.  Async IO requests
1171         * are not allowed to fail, so we have to wait until buffer heads
1172         * become available.  But we don't want tasks sleeping with 
1173         * partially complete buffers, so all were released above.
1174         */
1175        if (!async)
1176                return NULL;
1177
1178        /* We're _really_ low on memory. Now we just
1179         * wait for old buffer heads to become free due to
1180         * finishing IO.  Since this is an async request and
1181         * the reserve list is empty, we're sure there are 
1182         * async buffer heads in use.
1183         */
1184        run_task_queue(&tq_disk);
1185
1186        /* 
1187         * Set our state for sleeping, then check again for buffer heads.
1188         * This ensures we won't miss a wake_up from an interrupt.
1189         */
1190        add_wait_queue(&buffer_wait, &wait);
1191        current->state = TASK_UNINTERRUPTIBLE;
1192        if (!recover_reusable_buffer_heads())
1193                schedule();
1194        remove_wait_queue(&buffer_wait, &wait);
1195        current->state = TASK_RUNNING;
1196        goto try_again;
1197}
1198
1199/* Run the hooks that have to be done when a page I/O has completed. */
1200static inline void after_unlock_page (struct page * page)
1201{
1202        if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1203                atomic_dec(&nr_async_pages);
1204#ifdef DEBUG_SWAP
1205                printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1206                        (char *) page_address(page), 
1207                        atomic_read(&nr_async_pages));
1208#endif
1209        }
1210        if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
1211                swap_after_unlock_page(page->offset);
1212        if (test_and_clear_bit(PG_free_after, &page->flags))
1213                __free_page(page);
1214}
1215
1216/*
1217 * Free all temporary buffers belonging to a page.
1218 * This needs to be called with interrupts disabled.
1219 */
1220static inline void free_async_buffers (struct buffer_head * bh)
1221{
1222        struct buffer_head *tmp, *tail;
1223
1224        /*
1225         * Link all the buffers into the b_next_free list,
1226         * so we only have to do one xchg() operation ...
1227         */
1228        tail = bh;
1229        while ((tmp = tail->b_this_page) != bh) {
1230                tail->b_next_free = tmp;
1231                tail = tmp;
1232        };
1233
1234        /* Update the reuse list */
1235        tail->b_next_free = xchg(&reuse_list, NULL);
1236        reuse_list = bh;
1237
1238        /* Wake up any waiters ... */
1239        wake_up(&buffer_wait);
1240}
1241
1242static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1243{
1244        unsigned long flags;
1245        struct buffer_head *tmp;
1246        struct page *page;
1247
1248        mark_buffer_uptodate(bh, uptodate);
1249        unlock_buffer(bh);
1250
1251        /* This is a temporary buffer used for page I/O. */
1252        page = mem_map + MAP_NR(bh->b_data);
1253        if (!PageLocked(page))
1254                goto not_locked;
1255        if (bh->b_count != 1)
1256                goto bad_count;
1257
1258        if (!test_bit(BH_Uptodate, &bh->b_state))
1259                set_bit(PG_error, &page->flags);
1260
1261        /*
1262         * Be _very_ careful from here on. Bad things can happen if
1263         * two buffer heads end IO at almost the same time and both
1264         * decide that the page is now completely done.
1265         *
1266         * Async buffer_heads are here only as labels for IO, and get
1267         * thrown away once the IO for this page is complete.  IO is
1268         * deemed complete once all buffers have been visited
1269         * (b_count==0) and are now unlocked. We must make sure that
1270         * only the _last_ buffer that decrements its count is the one
1271         * that free's the page..
1272         */
1273        save_flags(flags);
1274        cli();
1275        bh->b_count--;
1276        tmp = bh;
1277        do {
1278                if (tmp->b_count)
1279                        goto still_busy;
1280                tmp = tmp->b_this_page;
1281        } while (tmp != bh);
1282
1283        /* OK, the async IO on this page is complete. */
1284        free_async_buffers(bh);
1285        restore_flags(flags);
1286        clear_bit(PG_locked, &page->flags);
1287        wake_up(&page->wait);
1288        after_unlock_page(page);
1289        return;
1290
1291still_busy:
1292        restore_flags(flags);
1293        return;
1294
1295not_locked:
1296        printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1297        return;
1298
1299bad_count:
1300        printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1301        return;
1302}
1303
1304/*
1305 * Start I/O on a page.
1306 * This function expects the page to be locked and may return before I/O is complete.
1307 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1308 */
1309int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1310{
1311        struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
1312        int block, nr, need_dcache_flush;
1313
1314        if (!PageLocked(page))
1315                panic("brw_page: page not locked for I/O");
1316        clear_bit(PG_uptodate, &page->flags);
1317        clear_bit(PG_error, &page->flags);
1318        /*
1319         * Allocate async buffer heads pointing to this page, just for I/O.
1320         * They do _not_ show up in the buffer hash table!
1321         * They are _not_ registered in page->buffers either!
1322         */
1323        bh = create_buffers(page_address(page), size, 1);
1324        if (!bh) {
1325                /* WSH: exit here leaves page->count incremented */
1326                clear_bit(PG_locked, &page->flags);
1327                wake_up(&page->wait);
1328                return -ENOMEM;
1329        }
1330        nr = 0;
1331        need_dcache_flush = 0;
1332        next = bh;
1333        do {
1334                struct buffer_head * tmp;
1335                block = *(b++);
1336
1337                init_buffer(next, dev, block, end_buffer_io_async, NULL);
1338                set_bit(BH_Uptodate, &next->b_state);
1339
1340                /*
1341                 * When we use bmap, we define block zero to represent
1342                 * a hole.  ll_rw_page, however, may legitimately
1343                 * access block zero, and we need to distinguish the
1344                 * two cases.
1345                 */
1346                if (bmap && !block) {
1347                        memset(next->b_data, 0, size);
1348                        next->b_count--;
1349                        continue;
1350                }
1351                tmp = get_hash_table(dev, block, size);
1352                if (tmp) {
1353                        if (!buffer_uptodate(tmp)) {
1354                                if (rw == READ)
1355                                        ll_rw_block(READ, 1, &tmp);
1356                                wait_on_buffer(tmp);
1357                        }
1358                        if (rw == READ) {
1359                                memcpy(next->b_data, tmp->b_data, size);
1360                                need_dcache_flush = 1;
1361                        } else {
1362                                memcpy(tmp->b_data, next->b_data, size);
1363                                mark_buffer_dirty(tmp, 0);
1364                        }
1365                        brelse(tmp);
1366                        next->b_count--;
1367                        continue;
1368                }
1369                if (rw == READ)
1370                        clear_bit(BH_Uptodate, &next->b_state);
1371                else
1372                        set_bit(BH_Dirty, &next->b_state);
1373                arr[nr++] = next;
1374        } while (prev = next, (next = next->b_this_page) != NULL);
1375        if (need_dcache_flush)
1376                flush_dcache_page(page_address(page));
1377        prev->b_this_page = bh;
1378        
1379        if (nr) {
1380                ll_rw_block(rw, nr, arr);
1381                /* The rest of the work is done in mark_buffer_uptodate()
1382                 * and unlock_buffer(). */
1383        } else {
1384                unsigned long flags;
1385                clear_bit(PG_locked, &page->flags);
1386                set_bit(PG_uptodate, &page->flags);
1387                wake_up(&page->wait);
1388                save_flags(flags);
1389                cli();
1390                free_async_buffers(bh);
1391                restore_flags(flags);
1392                after_unlock_page(page);
1393        }
1394        ++current->maj_flt;
1395        return 0;
1396}
1397
1398/*
1399 * This is called by end_request() when I/O has completed.
1400 */
1401void mark_buffer_uptodate(struct buffer_head * bh, int on)
1402{
1403        if (on) {
1404                struct buffer_head *tmp = bh;
1405                set_bit(BH_Uptodate, &bh->b_state);
1406                /* If a page has buffers and all these buffers are uptodate,
1407                 * then the page is uptodate. */
1408                do {
1409                        if (!test_bit(BH_Uptodate, &tmp->b_state))
1410                                return;
1411                        tmp=tmp->b_this_page;
1412                } while (tmp && tmp != bh);
1413                set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
1414                return;
1415        }
1416        clear_bit(BH_Uptodate, &bh->b_state);
1417}
1418
1419/*
1420 * Generic "readpage" function for block devices that have the normal
1421 * bmap functionality. This is most of the block device filesystems.
1422 * Reads the page asynchronously --- the unlock_buffer() and
1423 * mark_buffer_uptodate() functions propagate buffer state into the
1424 * page struct once IO has completed.
1425 */
1426int generic_readpage(struct file * file, struct page * page)
1427{
1428        struct dentry *dentry = file->f_dentry;
1429        struct inode *inode = dentry->d_inode;
1430        unsigned long block;
1431        int *p, nr[PAGE_SIZE/512];
1432        int i;
1433
1434        atomic_inc(&page->count);
1435        set_bit(PG_locked, &page->flags);
1436        set_bit(PG_free_after, &page->flags);
1437        
1438        i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1439        block = page->offset >> inode->i_sb->s_blocksize_bits;
1440        p = nr;
1441        do {
1442                *p = inode->i_op->bmap(inode, block);
1443                i--;
1444                block++;
1445                p++;
1446        } while (i > 0);
1447
1448        /* IO start */
1449        brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1450        return 0;
1451}
1452
1453/*
1454 * Try to increase the number of buffers available: the size argument
1455 * is used to determine what kind of buffers we want.
1456 */
1457static int grow_buffers(int size)
1458{
1459        unsigned long page;
1460        struct buffer_head *bh, *tmp;
1461        struct buffer_head * insert_point;
1462        int isize;
1463
1464        if ((size & 511) || (size > PAGE_SIZE)) {
1465                printk("VFS: grow_buffers: size = %d\n",size);
1466                return 0;
1467        }
1468
1469        if (!(page = __get_free_page(GFP_BUFFER)))
1470                return 0;
1471        bh = create_buffers(page, size, 0);
1472        if (!bh) {
1473                free_page(page);
1474                return 0;
1475        }
1476
1477        isize = BUFSIZE_INDEX(size);
1478        insert_point = free_list[isize];
1479
1480        tmp = bh;
1481        while (1) {
1482                if (insert_point) {
1483                        tmp->b_next_free = insert_point->b_next_free;
1484                        tmp->b_prev_free = insert_point;
1485                        insert_point->b_next_free->b_prev_free = tmp;
1486                        insert_point->b_next_free = tmp;
1487                } else {
1488                        tmp->b_prev_free = tmp;
1489                        tmp->b_next_free = tmp;
1490                }
1491                insert_point = tmp;
1492                ++nr_buffers;
1493                if (tmp->b_this_page)
1494                        tmp = tmp->b_this_page;
1495                else
1496                        break;
1497        }
1498        tmp->b_this_page = bh;
1499        free_list[isize] = bh;
1500        mem_map[MAP_NR(page)].flags = 0;
1501        mem_map[MAP_NR(page)].buffers = bh;
1502        buffermem += PAGE_SIZE;
1503        return 1;
1504}
1505
/*
 * Can the buffer be thrown out?
 *
 * A buffer is busy while it still has users (b_count is non-zero) or
 * while any of its dirty, locked or protected state bits is set.
 */
#define BUFFER_BUSY_BITS \
	((1 << BH_Dirty) | (1 << BH_Lock) | (1 << BH_Protected))
#define buffer_busy(bh) \
	((bh)->b_count != 0 || ((bh)->b_state & BUFFER_BUSY_BITS) != 0)
1511
1512static void sync_page_buffers(struct page * page)
1513{
1514        struct buffer_head * tmp, * bh = page->buffers;
1515
1516        /*
1517         * Here we'll probably sleep and so we must make sure that
1518         * the page doesn't go away from under us. We also prefer any
1519         * concurrent try_to_free_buffers() not to work in any way on
1520         * our current page from under us since we're just working on it.
1521         * As always in 2.2.x we're serialized by the big kernel lock
1522         * during those hacky page-visibility manipulations.
1523         *
1524         * SUBTLE NOTE: for things like LVM snapshotting WRITEA will block too!
1525         */
1526        page->buffers = NULL;
1527
1528        tmp = bh;
1529        do {
1530                struct buffer_head *p = tmp;
1531                tmp = tmp->b_this_page;
1532
1533                if (buffer_dirty(p))
1534                        if (test_and_set_bit(BH_Wait_IO, &p->b_state))
1535                                ll_rw_block(WRITE, 1, &p);
1536        } while (tmp != bh);
1537
1538        /* Restore the visibility of the page before returning. */
1539        page->buffers = bh;
1540}
1541
1542/*
1543 * try_to_free_buffers() checks if all the buffers on this particular page
1544 * are unused, and free's the page if so.
1545 *
1546 * Wake up bdflush() if this fails - if we're running low on memory due
1547 * to dirty buffers, we need to flush them out as quickly as possible.
1548 */
1549int try_to_free_buffers(struct page * page_map, int gfp_mask)
1550{
1551        struct buffer_head * tmp, * bh = page_map->buffers;
1552
1553        tmp = bh;
1554        do {
1555                if (buffer_busy(tmp))
1556                        goto busy;
1557                tmp = tmp->b_this_page;
1558        } while (tmp != bh);
1559
1560        do {
1561                struct buffer_head * p = tmp;
1562                tmp = tmp->b_this_page;
1563                nr_buffers--;
1564                remove_from_queues(p);
1565                put_unused_buffer_head(p);
1566        } while (tmp != bh);
1567
1568        /* Wake up anyone waiting for buffer heads */
1569        wake_up(&buffer_wait);
1570
1571        /* And free the page */
1572        buffermem -= PAGE_SIZE;
1573        page_map->buffers = NULL;
1574        __free_page(page_map);
1575        return 1;
1576
1577 busy:
1578        if (gfp_mask & __GFP_IO)
1579                sync_page_buffers(page_map);
1580
1581        if (balance_dirty_state(NODEV) >= 0)
1582                wakeup_bdflush(0);
1583
1584        return 0;
1585}
1586
1587/* ================== Debugging =================== */
1588
1589void show_buffers(void)
1590{
1591        struct buffer_head * bh;
1592        int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1593        int protected = 0;
1594        int nlist;
1595        static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY","PROTECTED",};
1596
1597        printk("Buffer memory:   %8ldkB\n",buffermem>>10);
1598        printk("Buffer heads:    %6d\n",nr_buffer_heads);
1599        printk("Buffer blocks:   %6d\n",nr_buffers);
1600        printk("Buffer hashed:   %6d\n",nr_hashed_buffers);
1601
1602        for(nlist = 0; nlist < NR_LIST; nlist++) {
1603          found = locked = dirty = used = lastused = protected = 0;
1604          bh = lru_list[nlist];
1605          if(!bh) continue;
1606
1607          do {
1608                found++;
1609                if (buffer_locked(bh))
1610                        locked++;
1611                if (buffer_protected(bh))
1612                        protected++;
1613                if (buffer_dirty(bh))
1614                        dirty++;
1615                if (bh->b_count)
1616                        used++, lastused = found;
1617                bh = bh->b_next_free;
1618          } while (bh != lru_list[nlist]);
1619          printk("%9s: %d buffers, %d used (last=%d), "
1620                 "%d locked, %d protected, %d dirty\n",
1621                 buf_types[nlist], found, used, lastused,
1622                 locked, protected, dirty);
1623        };
1624}
1625
1626
1627/* ===================== Init ======================= */
1628
1629/*
1630 * allocate the hash table and init the free list
1631 * Use gfp() for the hash table to decrease TLB misses, use
1632 * SLAB cache for buffer heads.
1633 */
1634void __init buffer_init(unsigned long memory_size)
1635{
1636        int order;
1637        unsigned int nr_hash;
1638
1639        /* we need to guess at the right sort of size for a buffer cache.
1640           the heuristic from working with large databases and getting
1641           fsync times (ext2) manageable, is the following */
1642
1643        memory_size >>= 20;
1644        for (order = 0; (1UL << order) < memory_size; order++);
1645
1646        /* try to allocate something until we get it or we're asking
1647           for something that is really too small */
1648
1649        do {
1650                unsigned long tmp;
1651
1652                nr_hash = (1UL << order) * PAGE_SIZE /
1653                    sizeof(struct buffer_head *);
1654                bh_hash_mask = (nr_hash - 1);
1655
1656                tmp = nr_hash;
1657                bh_hash_shift = 0;
1658                while((tmp >>= 1UL) != 0UL)
1659                        bh_hash_shift++;
1660
1661                hash_table = (struct buffer_head **)
1662                    __get_free_pages(GFP_ATOMIC, order);
1663        } while (hash_table == NULL && --order >= 0);
1664        printk("Buffer cache hash table entries: %d (order %d, %ldk)\n",
1665               nr_hash, order, (1UL<<order) * PAGE_SIZE / 1024);
1666        
1667        if (!hash_table)
1668                panic("Failed to allocate buffer hash table\n");
1669        memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
1670
1671        bh_cachep = kmem_cache_create("buffer_head",
1672                                      sizeof(struct buffer_head),
1673                                      0,
1674                                      SLAB_HWCACHE_ALIGN, NULL, NULL);
1675        if(!bh_cachep)
1676                panic("Cannot create buffer head SLAB cache\n");
1677        /*
1678         * Allocate the reserved buffer heads.
1679         */
1680        while (nr_buffer_heads < NR_RESERVED) {
1681                struct buffer_head * bh;
1682
1683                bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1684                if (!bh)
1685                        break;
1686                put_unused_buffer_head(bh);
1687                nr_buffer_heads++;
1688        }
1689
1690        lru_list[BUF_CLEAN] = 0;
1691        grow_buffers(BLOCK_SIZE);
1692}
1693
1694
1695/* ====================== bdflush support =================== */
1696
1697/* This is a simple kernel daemon, whose job it is to provide a dynamic
1698 * response to dirty buffers.  Once this process is activated, we write back
1699 * a limited number of buffers to the disks and then go back to sleep again.
1700 */
1701static struct wait_queue * bdflush_wait = NULL;
1702static struct wait_queue * bdflush_done = NULL;
1703struct task_struct *bdflush_tsk = 0;
1704
1705void wakeup_bdflush(int wait)
1706{
1707        if (current == bdflush_tsk)
1708                return;
1709        wake_up(&bdflush_wait);
1710        if (wait) {
1711                sleep_on(&bdflush_done);
1712        }
1713}
1714
1715
1716/* 
1717 * Here we attempt to write back old buffers.  We also try to flush inodes 
1718 * and supers as well, since this function is essentially "update", and 
1719 * otherwise there would be no way of ensuring that these quantities ever 
1720 * get written back.  Ideally, we would have a timestamp on the inodes
1721 * and superblocks so that we could write back only the old ones as well
1722 */
1723
1724static int sync_old_buffers(void)
1725{
1726        int i;
1727        int ndirty, nwritten;
1728        int nlist;
1729        int ncount;
1730        struct buffer_head * bh, *next;
1731
1732        sync_supers(0);
1733        sync_inodes(0);
1734
1735        ncount = 0;
1736#ifdef DEBUG
1737        for(nlist = 0; nlist < NR_LIST; nlist++)
1738#else
1739        for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
1740#endif
1741        {
1742                ndirty = 0;
1743                nwritten = 0;
1744        repeat:
1745
1746                bh = lru_list[nlist];
1747                if(bh) 
1748                         for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1749                                 /* We may have stalled while waiting for I/O to complete. */
1750                                 if(bh->b_list != nlist) goto repeat;
1751                                 next = bh->b_next_free;
1752                                 if(!lru_list[nlist]) {
1753                                         printk("Dirty list empty %d\n", i);
1754                                         break;
1755                                 }
1756                                 
1757                                 /* Clean buffer on dirty list?  Refile it */
1758                                 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
1759                                         refile_buffer(bh);
1760                                         continue;
1761                                 }
1762                                  
1763                                  /* Unlocked buffer on locked list?  Refile it */
1764                                  if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
1765                                          refile_buffer(bh);
1766                                          continue;
1767                                  }
1768                                 
1769                                 if (buffer_locked(bh) || !buffer_dirty(bh))
1770                                          continue;
1771                                 ndirty++;
1772                                 if(time_before(jiffies, bh->b_flushtime))
1773                                        continue;
1774                                 nwritten++;
1775                                 next->b_count++;
1776                                 bh->b_count++;
1777                                 bh->b_flushtime = 0;
1778#ifdef DEBUG
1779                                 if(nlist != BUF_DIRTY) ncount++;
1780#endif
1781                                 ll_rw_block(WRITE, 1, &bh);
1782                                 bh->b_count--;
1783                                 next->b_count--;
1784                         }
1785        }
1786        run_task_queue(&tq_disk);
1787#ifdef DEBUG
1788        if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1789        printk("Wrote %d/%d buffers\n", nwritten, ndirty);
1790#endif
1791        run_task_queue(&tq_disk);
1792        return 0;
1793}
1794
1795
1796/* This is the interface to bdflush.  As we get more sophisticated, we can
1797 * pass tuning parameters to this "process", to adjust how it behaves. 
1798 * We would want to verify each parameter, however, to make sure that it 
1799 * is reasonable. */
1800
1801asmlinkage int sys_bdflush(int func, long data)
1802{
1803        int i, error = -EPERM;
1804
1805        lock_kernel();
1806        if (!capable(CAP_SYS_ADMIN))
1807                goto out;
1808
1809        if (func == 1) {
1810                unlock_kernel();
1811                /* do_exit directly and let kupdate to do its work alone. */
1812                do_exit(0);
1813        }
1814
1815        /* Basically func 1 means read param 1, 2 means write param 1, etc */
1816        if (func >= 2) {
1817                i = (func-2) >> 1;
1818                error = -EINVAL;
1819                if (i < 0 || i >= N_PARAM)
1820                        goto out;
1821                if((func & 1) == 0) {
1822                        error = put_user(bdf_prm.data[i], (int*)data);
1823                        goto out;
1824                }
1825                if (data < bdflush_min[i] || data > bdflush_max[i])
1826                        goto out;
1827                bdf_prm.data[i] = data;
1828                error = 0;
1829                goto out;
1830        };
1831
1832        /* Having func 0 used to launch the actual bdflush and then never
1833         * return (unless explicitly killed). We return zero here to 
1834         * remain semi-compatible with present update(8) programs.
1835         */
1836        error = 0;
1837out:
1838        unlock_kernel();
1839        return error;
1840}
1841
1842/* This is the actual bdflush daemon itself. It used to be started from
1843 * the syscall above, but now we launch it ourselves internally with
1844 * kernel_thread(...)  directly after the first thread in init/main.c */
1845
1846/* To prevent deadlocks for a loop device:
1847 * 1) Do non-blocking writes to loop (avoids deadlock with running
1848 *      out of request blocks).
1849 * 2) But do a blocking write if the only dirty buffers are loop buffers
1850 *      (otherwise we go into an infinite busy-loop).
1851 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
1852 *      with running out of free buffers for loop's "real" device).
1853*/
1854int bdflush(void * unused) 
1855{
1856        int i;
1857        int ndirty;
1858        int nlist;
1859        int ncount;
1860        struct buffer_head * bh, *next;
1861        int major;
1862        int wrta_cmd = WRITEA;  /* non-blocking write for LOOP */
1863
1864        /*
1865         *      We have a bare-bones task_struct, and really should fill
1866         *      in a few more things so "top" and /proc/2/{exe,root,cwd}
1867         *      display semi-sane things. Not real crucial though...  
1868         */
1869
1870        current->session = 1;
1871        current->pgrp = 1;
1872        sprintf(current->comm, "kflushd");
1873        bdflush_tsk = current;
1874
1875        /*
1876         *      As a kernel thread we want to tamper with system buffers
1877         *      and other internals and thus be subject to the SMP locking
1878         *      rules. (On a uniprocessor box this does nothing).
1879         */
1880        lock_kernel();
1881                 
1882        for (;;) {
1883#ifdef DEBUG
1884                printk("bdflush() activated...");
1885#endif
1886
1887                CHECK_EMERGENCY_SYNC
1888
1889                ncount = 0;
1890#ifdef DEBUG
1891                for(nlist = 0; nlist < NR_LIST; nlist++)
1892#else
1893                for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
1894#endif
1895                 {
1896                         ndirty = 0;
1897                 repeat:
1898
1899                         bh = lru_list[nlist];
1900                         if(bh) 
1901                                  for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty; 
1902                                       bh = next) {
1903                                          /* We may have stalled while waiting for I/O to complete. */
1904                                          if(bh->b_list != nlist) goto repeat;
1905                                          next = bh->b_next_free;
1906                                          if(!lru_list[nlist]) {
1907                                                  printk("Dirty list empty %d\n", i);
1908                                                  break;
1909                                          }
1910                                          
1911                                          /* Clean buffer on dirty list?  Refile it */
1912                                          if (nlist == BUF_DIRTY && !buffer_dirty(bh)) {
1913                                                  refile_buffer(bh);
1914                                                  continue;
1915                                          }
1916                                          
1917                                          /* Unlocked buffer on locked list?  Refile it */
1918                                          if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
1919                                                  refile_buffer(bh);
1920                                                  continue;
1921                                          }
1922                                          
1923                                          if (buffer_locked(bh) || !buffer_dirty(bh))
1924                                                   continue;
1925                                          major = MAJOR(bh->b_dev);
1926                                          /* Should we write back buffers that are shared or not??
1927                                             currently dirty buffers are not shared, so it does not matter */
1928                                          next->b_count++;
1929                                          bh->b_count++;
1930                                          ndirty++;
1931                                          bh->b_flushtime = 0;
1932                                          if (major == LOOP_MAJOR) {
1933                                                  ll_rw_block(wrta_cmd,1, &bh);
1934                                                  wrta_cmd = WRITEA;
1935                                                  if (buffer_dirty(bh))
1936                                                          --ndirty;
1937                                          }
1938                                          else
1939                                          ll_rw_block(WRITE, 1, &bh);
1940#ifdef DEBUG
1941                                          if(nlist != BUF_DIRTY) ncount++;
1942#endif
1943                                          bh->b_count--;
1944                                          next->b_count--;
1945                                  }
1946                 }
1947#ifdef DEBUG
1948                if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
1949                printk("sleeping again.\n");
1950#endif
1951                /* If we didn't write anything, but there are still
1952                 * dirty buffers, then make the next write to a
1953                 * loop device to be a blocking write.
1954                 * This lets us block--which we _must_ do! */
1955                if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
1956                        wrta_cmd = WRITE;
1957                        continue;
1958                }
1959                run_task_queue(&tq_disk);
1960                wake_up(&bdflush_done);
1961                
1962                /* If there are still a lot of dirty buffers around, skip the sleep
1963                   and flush some more */
1964                if (!ndirty || balance_dirty_state(NODEV) < 0)
1965                {
1966                        spin_lock_irq(&current->sigmask_lock);
1967                        flush_signals(current);
1968                        spin_unlock_irq(&current->sigmask_lock);
1969
1970                        interruptible_sleep_on(&bdflush_wait);
1971                }
1972        }
1973}
1974
1975/*
1976 * This is the kernel update daemon. It was used to live in userspace
1977 * but since it's need to run safely we want it unkillable by mistake.
1978 * You don't need to change your userspace configuration since
1979 * the userspace `update` will do_exit(0) at the first sys_bdflush().
1980 */
1981int kupdate(void * unused) 
1982{
1983        struct task_struct * tsk = current;
1984        int interval;
1985
1986        tsk->session = 1;
1987        tsk->pgrp = 1;
1988        strcpy(tsk->comm, "kupdate");
1989        sigfillset(&tsk->blocked);
1990        /* sigcont will wakeup kupdate after setting interval to 0 */
1991        sigdelset(&tsk->blocked, SIGCONT);
1992
1993        lock_kernel();
1994
1995        for (;;) {
1996                interval = bdf_prm.b_un.interval;
1997                if (interval)
1998                {
1999                        tsk->state = TASK_INTERRUPTIBLE;
2000                        schedule_timeout(interval);
2001                }
2002                else
2003                {
2004                        tsk->state = TASK_STOPPED;
2005                        schedule(); /* wait for SIGCONT */
2006                }
2007#ifdef DEBUG
2008                printk("kupdate() activated...\n");
2009#endif
2010                sync_old_buffers();
2011        }
2012}
2013
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.