linux-old/fs/ext3/inode.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/ext3/inode.c
   3 *
   4 * Copyright (C) 1992, 1993, 1994, 1995
   5 * Remy Card (card@masi.ibp.fr)
   6 * Laboratoire MASI - Institut Blaise Pascal
   7 * Universite Pierre et Marie Curie (Paris VI)
   8 *
   9 *  from
  10 *
  11 *  linux/fs/minix/inode.c
  12 *
  13 *  Copyright (C) 1991, 1992  Linus Torvalds
  14 *
  15 *  Goal-directed block allocation by Stephen Tweedie
  16 *      (sct@redhat.com), 1993, 1998
  17 *  Big-endian to little-endian byte-swapping/bitmaps by
  18 *        David S. Miller (davem@caip.rutgers.edu), 1995
  19 *  64-bit file support on 64-bit platforms by Jakub Jelinek
  20 *      (jj@sunsite.ms.mff.cuni.cz)
  21 *
  22 *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
  23 */
  24
  25#include <linux/fs.h>
  26#include <linux/sched.h>
  27#include <linux/ext3_jbd.h>
  28#include <linux/jbd.h>
  29#include <linux/locks.h>
  30#include <linux/smp_lock.h>
  31#include <linux/highuid.h>
  32#include <linux/quotaops.h>
  33#include <linux/module.h>
  34
  35/*
  36 * SEARCH_FROM_ZERO forces each block allocation to search from the start
  37 * of the filesystem.  This is to force rapid reallocation of recently-freed
  38 * blocks.  The file fragmentation is horrendous.
  39 */
  40#undef SEARCH_FROM_ZERO
  41
  42/*
  43 * Test whether an inode is a fast symlink.
  44 */
  45static inline int ext3_inode_is_fast_symlink(struct inode *inode)
  46{
  47        int ea_blocks = EXT3_I(inode)->i_file_acl ?
  48                (inode->i_sb->s_blocksize >> 9) : 0;
  49
  50        return (S_ISLNK(inode->i_mode) &&
  51                inode->i_blocks - ea_blocks == 0);
  52}
  53
  54/* The ext3 forget function must perform a revoke if we are freeing data
  55 * which has been journaled.  Metadata (eg. indirect blocks) must be
  56 * revoked in all cases. 
  57 *
  58 * "bh" may be NULL: a metadata block may have been freed from memory
  59 * but there may still be a record of it in the journal, and that record
  60 * still needs to be revoked.
  61 */
  62
  63static int ext3_forget(handle_t *handle, int is_metadata,
  64                       struct inode *inode, struct buffer_head *bh,
  65                       int blocknr)
  66{
  67        int err;
  68
  69        BUFFER_TRACE(bh, "enter");
  70
  71        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
  72                  "data mode %lx\n",
  73                  bh, is_metadata, inode->i_mode,
  74                  test_opt(inode->i_sb, DATA_FLAGS));
  75        
  76        /* Never use the revoke function if we are doing full data
  77         * journaling: there is no need to, and a V1 superblock won't
  78         * support it.  Otherwise, only skip the revoke on un-journaled
  79         * data blocks. */
  80
  81        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
  82            (!is_metadata && !ext3_should_journal_data(inode))) {
  83                if (bh) {
  84                        BUFFER_TRACE(bh, "call journal_forget");
  85                        ext3_journal_forget(handle, bh);
  86                }
  87                return 0;
  88        }
  89
  90        /*
  91         * data!=journal && (is_metadata || should_journal_data(inode))
  92         */
  93        BUFFER_TRACE(bh, "call ext3_journal_revoke");
  94        err = ext3_journal_revoke(handle, blocknr, bh);
  95        if (err)
  96                ext3_abort(inode->i_sb, __FUNCTION__,
  97                           "error %d when attempting revoke", err);
  98        BUFFER_TRACE(bh, "exit");
  99        return err;
 100}
 101
 102/*
 103 * Work out how many blocks we need to progress with the next chunk of a
 104 * truncate transaction.
 105 */
 106
 107static unsigned long blocks_for_truncate(struct inode *inode) 
 108{
 109        unsigned long needed;
 110        
 111        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
 112
 113        /* Give ourselves just enough room to cope with inodes in which
 114         * i_blocks is corrupt: we've seen disk corruptions in the past
 115         * which resulted in random data in an inode which looked enough
 116         * like a regular file for ext3 to try to delete it.  Things
 117         * will go a bit crazy if that happens, but at least we should
 118         * try not to panic the whole kernel. */
 119        if (needed < 2)
 120                needed = 2;
 121
 122        /* But we need to bound the transaction so we don't overflow the
 123         * journal. */
 124        if (needed > EXT3_MAX_TRANS_DATA) 
 125                needed = EXT3_MAX_TRANS_DATA;
 126
 127        return EXT3_DATA_TRANS_BLOCKS + needed;
 128}
 129        
 130/* 
 131 * Truncate transactions can be complex and absolutely huge.  So we need to
 132 * be able to restart the transaction at a convenient checkpoint to make
 133 * sure we don't overflow the journal.
 134 *
 135 * start_transaction gets us a new handle for a truncate transaction,
 136 * and extend_transaction tries to extend the existing one a bit.  If
 137 * extend fails, we need to propagate the failure up and restart the
 138 * transaction in the top-level truncate loop. --sct 
 139 */
 140
 141static handle_t *start_transaction(struct inode *inode) 
 142{
 143        handle_t *result;
 144        
 145        result = ext3_journal_start(inode, blocks_for_truncate(inode));
 146        if (!IS_ERR(result))
 147                return result;
 148        
 149        ext3_std_error(inode->i_sb, PTR_ERR(result));
 150        return result;
 151}
 152
 153/*
 154 * Try to extend this transaction for the purposes of truncation.
 155 *
 156 * Returns 0 if we managed to create more room.  If we can't create more
 157 * room, and the transaction must be restarted we return 1.
 158 */
 159static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 160{
 161        if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
 162                return 0;
 163        if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
 164                return 0;
 165        return 1;
 166}
 167
 168/*
 169 * Restart the transaction associated with *handle.  This does a commit,
 170 * so before we call here everything must be consistently dirtied against
 171 * this transaction.
 172 */
 173static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
 174{
 175        jbd_debug(2, "restarting handle %p\n", handle);
 176        return ext3_journal_restart(handle, blocks_for_truncate(inode));
 177}
 178
 179/*
 180 * Called at each iput()
 181 */
 182void ext3_put_inode (struct inode * inode)
 183{
 184        ext3_discard_prealloc (inode);
 185}
 186
 187/*
 188 * Called at the last iput() if i_nlink is zero.
 189 */
 190void ext3_delete_inode (struct inode * inode)
 191{
 192        handle_t *handle;
 193        
 194        if (is_bad_inode(inode) ||
 195            inode->i_ino == EXT3_ACL_IDX_INO ||
 196            inode->i_ino == EXT3_ACL_DATA_INO)
 197                goto no_delete;
 198
 199        lock_kernel();
 200        handle = start_transaction(inode);
 201        if (IS_ERR(handle)) {
 202                /* If we're going to skip the normal cleanup, we still
 203                 * need to make sure that the in-core orphan linked list
 204                 * is properly cleaned up. */
 205                ext3_orphan_del(NULL, inode);
 206
 207                ext3_std_error(inode->i_sb, PTR_ERR(handle));
 208                unlock_kernel();
 209                goto no_delete;
 210        }
 211        
 212        if (IS_SYNC(inode))
 213                handle->h_sync = 1;
 214        inode->i_size = 0;
 215        if (inode->i_blocks)
 216                ext3_truncate(inode);
 217        /*
 218         * Kill off the orphan record which ext3_truncate created.
 219         * AKPM: I think this can be inside the above `if'.
 220         * Note that ext3_orphan_del() has to be able to cope with the
 221         * deletion of a non-existent orphan - this is because we don't
 222         * know if ext3_truncate() actually created an orphan record.
 223         * (Well, we could do this if we need to, but heck - it works)
 224         */
 225        ext3_orphan_del(handle, inode);
 226        inode->u.ext3_i.i_dtime = CURRENT_TIME;
 227
 228        /* 
 229         * One subtle ordering requirement: if anything has gone wrong
 230         * (transaction abort, IO errors, whatever), then we can still
 231         * do these next steps (the fs will already have been marked as
 232         * having errors), but we can't free the inode if the mark_dirty
 233         * fails.  
 234         */
 235        if (ext3_mark_inode_dirty(handle, inode))
 236                /* If that failed, just do the required in-core inode clear. */
 237                clear_inode(inode);
 238        else
 239                ext3_free_inode(handle, inode);
 240        ext3_journal_stop(handle, inode);
 241        unlock_kernel();
 242        return;
 243no_delete:
 244        clear_inode(inode);     /* We must guarantee clearing of inode... */
 245}
 246
 247void ext3_discard_prealloc (struct inode * inode)
 248{
 249#ifdef EXT3_PREALLOCATE
 250        lock_kernel();
 251        /* Writer: ->i_prealloc* */
 252        if (inode->u.ext3_i.i_prealloc_count) {
 253                unsigned short total = inode->u.ext3_i.i_prealloc_count;
 254                unsigned long block = inode->u.ext3_i.i_prealloc_block;
 255                inode->u.ext3_i.i_prealloc_count = 0;
 256                inode->u.ext3_i.i_prealloc_block = 0;
 257                /* Writer: end */
 258                ext3_free_blocks (inode, block, total);
 259        }
 260        unlock_kernel();
 261#endif
 262}
 263
 264static int ext3_alloc_block (handle_t *handle,
 265                        struct inode * inode, unsigned long goal, int *err)
 266{
 267#ifdef EXT3FS_DEBUG
 268        static unsigned long alloc_hits = 0, alloc_attempts = 0;
 269#endif
 270        unsigned long result;
 271
 272#ifdef EXT3_PREALLOCATE
 273        /* Writer: ->i_prealloc* */
 274        if (inode->u.ext3_i.i_prealloc_count &&
 275            (goal == inode->u.ext3_i.i_prealloc_block ||
 276             goal + 1 == inode->u.ext3_i.i_prealloc_block))
 277        {
 278                result = inode->u.ext3_i.i_prealloc_block++;
 279                inode->u.ext3_i.i_prealloc_count--;
 280                /* Writer: end */
 281                ext3_debug ("preallocation hit (%lu/%lu).\n",
 282                            ++alloc_hits, ++alloc_attempts);
 283        } else {
 284                ext3_discard_prealloc (inode);
 285                ext3_debug ("preallocation miss (%lu/%lu).\n",
 286                            alloc_hits, ++alloc_attempts);
 287                if (S_ISREG(inode->i_mode))
 288                        result = ext3_new_block (inode, goal, 
 289                                 &inode->u.ext3_i.i_prealloc_count,
 290                                 &inode->u.ext3_i.i_prealloc_block, err);
 291                else
 292                        result = ext3_new_block (inode, goal, 0, 0, err);
 293                /*
 294                 * AKPM: this is somewhat sticky.  I'm not surprised it was
 295                 * disabled in 2.2's ext3.  Need to integrate b_committed_data
 296                 * guarding with preallocation, if indeed preallocation is
 297                 * effective.
 298                 */
 299        }
 300#else
 301        result = ext3_new_block (handle, inode, goal, 0, 0, err);
 302#endif
 303        return result;
 304}
 305
 306
 307typedef struct {
 308        u32     *p;
 309        u32     key;
 310        struct buffer_head *bh;
 311} Indirect;
 312
 313static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
 314{
 315        p->key = *(p->p = v);
 316        p->bh = bh;
 317}
 318
 319static inline int verify_chain(Indirect *from, Indirect *to)
 320{
 321        while (from <= to && from->key == *from->p)
 322                from++;
 323        return (from > to);
 324}
 325
 326/**
 327 *      ext3_block_to_path - parse the block number into array of offsets
 328 *      @inode: inode in question (we are only interested in its superblock)
 329 *      @i_block: block number to be parsed
 330 *      @offsets: array to store the offsets in
 331 *
 332 *      To store the locations of file's data ext3 uses a data structure common
 333 *      for UNIX filesystems - tree of pointers anchored in the inode, with
 334 *      data blocks at leaves and indirect blocks in intermediate nodes.
 335 *      This function translates the block number into path in that tree -
 336 *      return value is the path length and @offsets[n] is the offset of
 337 *      pointer to (n+1)th node in the nth one. If @block is out of range
 338 *      (negative or too large) warning is printed and zero returned.
 339 *
 340 *      Note: function doesn't find node addresses, so no IO is needed. All
 341 *      we need to know is the capacity of indirect blocks (taken from the
 342 *      inode->i_sb).
 343 */
 344
 345/*
 346 * Portability note: the last comparison (check that we fit into triple
 347 * indirect block) is spelled differently, because otherwise on an
 348 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 349 * if our filesystem had 8Kb blocks. We might use long long, but that would
 350 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 351 * i_block would have to be negative in the very beginning, so we would not
 352 * get there at all.
 353 */
 354
 355static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
 356{
 357        int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
 358        int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
 359        const long direct_blocks = EXT3_NDIR_BLOCKS,
 360                indirect_blocks = ptrs,
 361                double_blocks = (1 << (ptrs_bits * 2));
 362        int n = 0;
 363
 364        if (i_block < 0) {
 365                ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
 366        } else if (i_block < direct_blocks) {
 367                offsets[n++] = i_block;
 368        } else if ( (i_block -= direct_blocks) < indirect_blocks) {
 369                offsets[n++] = EXT3_IND_BLOCK;
 370                offsets[n++] = i_block;
 371        } else if ((i_block -= indirect_blocks) < double_blocks) {
 372                offsets[n++] = EXT3_DIND_BLOCK;
 373                offsets[n++] = i_block >> ptrs_bits;
 374                offsets[n++] = i_block & (ptrs - 1);
 375        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
 376                offsets[n++] = EXT3_TIND_BLOCK;
 377                offsets[n++] = i_block >> (ptrs_bits * 2);
 378                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
 379                offsets[n++] = i_block & (ptrs - 1);
 380        } else {
 381                ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
 382        }
 383        return n;
 384}
 385
 386/**
 387 *      ext3_get_branch - read the chain of indirect blocks leading to data
 388 *      @inode: inode in question
 389 *      @depth: depth of the chain (1 - direct pointer, etc.)
 390 *      @offsets: offsets of pointers in inode/indirect blocks
 391 *      @chain: place to store the result
 392 *      @err: here we store the error value
 393 *
 394 *      Function fills the array of triples <key, p, bh> and returns %NULL
 395 *      if everything went OK or the pointer to the last filled triple
 396 *      (incomplete one) otherwise. Upon the return chain[i].key contains
 397 *      the number of (i+1)-th block in the chain (as it is stored in memory,
 398 *      i.e. little-endian 32-bit), chain[i].p contains the address of that
 399 *      number (it points into struct inode for i==0 and into the bh->b_data
 400 *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 401 *      block for i>0 and NULL for i==0. In other words, it holds the block
 402 *      numbers of the chain, addresses they were taken from (and where we can
 403 *      verify that chain did not change) and buffer_heads hosting these
 404 *      numbers.
 405 *
 406 *      Function stops when it stumbles upon zero pointer (absent block)
 407 *              (pointer to last triple returned, *@err == 0)
 408 *      or when it gets an IO error reading an indirect block
 409 *              (ditto, *@err == -EIO)
 410 *      or when it notices that chain had been changed while it was reading
 411 *              (ditto, *@err == -EAGAIN)
 412 *      or when it reads all @depth-1 indirect blocks successfully and finds
 413 *      the whole chain, all way to the data (returns %NULL, *err == 0).
 414 */
 415static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
 416                                 Indirect chain[4], int *err)
 417{
 418        struct super_block *sb = inode->i_sb;
 419        Indirect *p = chain;
 420        struct buffer_head *bh;
 421
 422        *err = 0;
 423        /* i_data is not going away, no lock needed */
 424        add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
 425        if (!p->key)
 426                goto no_block;
 427        while (--depth) {
 428                bh = sb_bread(sb, le32_to_cpu(p->key));
 429                if (!bh)
 430                        goto failure;
 431                /* Reader: pointers */
 432                if (!verify_chain(chain, p))
 433                        goto changed;
 434                add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
 435                /* Reader: end */
 436                if (!p->key)
 437                        goto no_block;
 438        }
 439        return NULL;
 440
 441changed:
 442        brelse(bh);
 443        *err = -EAGAIN;
 444        goto no_block;
 445failure:
 446        *err = -EIO;
 447no_block:
 448        return p;
 449}
 450
 451/**
 452 *      ext3_find_near - find a place for allocation with sufficient locality
 453 *      @inode: owner
 454 *      @ind: descriptor of indirect block.
 455 *
 456 *      This function returns the prefered place for block allocation.
 457 *      It is used when heuristic for sequential allocation fails.
 458 *      Rules are:
 459 *        + if there is a block to the left of our position - allocate near it.
 460 *        + if pointer will live in indirect block - allocate near that block.
 461 *        + if pointer will live in inode - allocate in the same
 462 *          cylinder group. 
 463 *      Caller must make sure that @ind is valid and will stay that way.
 464 */
 465
 466static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 467{
 468        u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
 469        u32 *p;
 470
 471        /* Try to find previous block */
 472        for (p = ind->p - 1; p >= start; p--)
 473                if (*p)
 474                        return le32_to_cpu(*p);
 475
 476        /* No such thing, so let's try location of indirect block */
 477        if (ind->bh)
 478                return ind->bh->b_blocknr;
 479
 480        /*
 481         * It is going to be refered from inode itself? OK, just put it into
 482         * the same cylinder group then.
 483         */
 484        return (inode->u.ext3_i.i_block_group * 
 485                EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
 486               le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
 487}
 488
 489/**
 490 *      ext3_find_goal - find a prefered place for allocation.
 491 *      @inode: owner
 492 *      @block:  block we want
 493 *      @chain:  chain of indirect blocks
 494 *      @partial: pointer to the last triple within a chain
 495 *      @goal:  place to store the result.
 496 *
 497 *      Normally this function find the prefered place for block allocation,
 498 *      stores it in *@goal and returns zero. If the branch had been changed
 499 *      under us we return -EAGAIN.
 500 */
 501
 502static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
 503                          Indirect *partial, unsigned long *goal)
 504{
 505        /* Writer: ->i_next_alloc* */
 506        if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
 507                inode->u.ext3_i.i_next_alloc_block++;
 508                inode->u.ext3_i.i_next_alloc_goal++;
 509        }
 510#ifdef SEARCH_FROM_ZERO
 511        inode->u.ext3_i.i_next_alloc_block = 0;
 512        inode->u.ext3_i.i_next_alloc_goal = 0;
 513#endif
 514        /* Writer: end */
 515        /* Reader: pointers, ->i_next_alloc* */
 516        if (verify_chain(chain, partial)) {
 517                /*
 518                 * try the heuristic for sequential allocation,
 519                 * failing that at least try to get decent locality.
 520                 */
 521                if (block == inode->u.ext3_i.i_next_alloc_block)
 522                        *goal = inode->u.ext3_i.i_next_alloc_goal;
 523                if (!*goal)
 524                        *goal = ext3_find_near(inode, partial);
 525#ifdef SEARCH_FROM_ZERO
 526                *goal = 0;
 527#endif
 528                return 0;
 529        }
 530        /* Reader: end */
 531        return -EAGAIN;
 532}
 533
 534/**
 535 *      ext3_alloc_branch - allocate and set up a chain of blocks.
 536 *      @inode: owner
 537 *      @num: depth of the chain (number of blocks to allocate)
 538 *      @offsets: offsets (in the blocks) to store the pointers to next.
 539 *      @branch: place to store the chain in.
 540 *
 541 *      This function allocates @num blocks, zeroes out all but the last one,
 542 *      links them into chain and (if we are synchronous) writes them to disk.
 543 *      In other words, it prepares a branch that can be spliced onto the
 544 *      inode. It stores the information about that chain in the branch[], in
 545 *      the same format as ext3_get_branch() would do. We are calling it after
 546 *      we had read the existing part of chain and partial points to the last
 547 *      triple of that (one with zero ->key). Upon the exit we have the same
 548 *      picture as after the successful ext3_get_block(), excpet that in one
 549 *      place chain is disconnected - *branch->p is still zero (we did not
 550 *      set the last link), but branch->key contains the number that should
 551 *      be placed into *branch->p to fill that gap.
 552 *
 553 *      If allocation fails we free all blocks we've allocated (and forget
 554 *      their buffer_heads) and return the error value the from failed
 555 *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 556 *      as described above and return 0.
 557 */
 558
 559static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 560                             int num,
 561                             unsigned long goal,
 562                             int *offsets,
 563                             Indirect *branch)
 564{
 565        int blocksize = inode->i_sb->s_blocksize;
 566        int n = 0, keys = 0;
 567        int err = 0;
 568        int i;
 569        int parent = ext3_alloc_block(handle, inode, goal, &err);
 570
 571        branch[0].key = cpu_to_le32(parent);
 572        if (parent) {
 573                for (n = 1; n < num; n++) {
 574                        struct buffer_head *bh;
 575                        /* Allocate the next block */
 576                        int nr = ext3_alloc_block(handle, inode, parent, &err);
 577                        if (!nr)
 578                                break;
 579                        branch[n].key = cpu_to_le32(nr);
 580                        keys = n+1;
 581                        
 582                        /*
 583                         * Get buffer_head for parent block, zero it out
 584                         * and set the pointer to new one, then send
 585                         * parent to disk.  
 586                         */
 587                        bh = sb_getblk(inode->i_sb, parent);
 588                        branch[n].bh = bh;
 589                        lock_buffer(bh);
 590                        BUFFER_TRACE(bh, "call get_create_access");
 591                        err = ext3_journal_get_create_access(handle, bh);
 592                        if (err) {
 593                                unlock_buffer(bh);
 594                                brelse(bh);
 595                                break;
 596                        }
 597
 598                        memset(bh->b_data, 0, blocksize);
 599                        branch[n].p = (u32*) bh->b_data + offsets[n];
 600                        *branch[n].p = branch[n].key;
 601                        BUFFER_TRACE(bh, "marking uptodate");
 602                        mark_buffer_uptodate(bh, 1);
 603                        unlock_buffer(bh);
 604
 605                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 606                        err = ext3_journal_dirty_metadata(handle, bh);
 607                        if (err)
 608                                break;
 609                        
 610                        parent = nr;
 611                }
 612        }
 613        if (n == num)
 614                return 0;
 615
 616        /* Allocation failed, free what we already allocated */
 617        for (i = 1; i < keys; i++) {
 618                BUFFER_TRACE(branch[i].bh, "call journal_forget");
 619                ext3_journal_forget(handle, branch[i].bh);
 620        }
 621        for (i = 0; i < keys; i++)
 622                ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
 623        return err;
 624}
 625
 626/**
 627 *      ext3_splice_branch - splice the allocated branch onto inode.
 628 *      @inode: owner
 629 *      @block: (logical) number of block we are adding
 630 *      @chain: chain of indirect blocks (with a missing link - see
 631 *              ext3_alloc_branch)
 632 *      @where: location of missing link
 633 *      @num:   number of blocks we are adding
 634 *
 635 *      This function verifies that chain (up to the missing link) had not
 636 *      changed, fills the missing link and does all housekeeping needed in
 637 *      inode (->i_blocks, etc.). In case of success we end up with the full
 638 *      chain to new block and return 0. Otherwise (== chain had been changed)
 639 *      we free the new blocks (forgetting their buffer_heads, indeed) and
 640 *      return -EAGAIN.
 641 */
 642
 643static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
 644                              Indirect chain[4], Indirect *where, int num)
 645{
 646        int i;
 647        int err = 0;
 648
 649        /*
 650         * If we're splicing into a [td]indirect block (as opposed to the
 651         * inode) then we need to get write access to the [td]indirect block
 652         * before the splice.
 653         */
 654        if (where->bh) {
 655                BUFFER_TRACE(where->bh, "get_write_access");
 656                err = ext3_journal_get_write_access(handle, where->bh);
 657                if (err)
 658                        goto err_out;
 659        }
 660        /* Verify that place we are splicing to is still there and vacant */
 661
 662        /* Writer: pointers, ->i_next_alloc* */
 663        if (!verify_chain(chain, where-1) || *where->p)
 664                /* Writer: end */
 665                goto changed;
 666
 667        /* That's it */
 668
 669        *where->p = where->key;
 670        inode->u.ext3_i.i_next_alloc_block = block;
 671        inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
 672#ifdef SEARCH_FROM_ZERO
 673        inode->u.ext3_i.i_next_alloc_block = 0;
 674        inode->u.ext3_i.i_next_alloc_goal = 0;
 675#endif
 676        /* Writer: end */
 677
 678        /* We are done with atomic stuff, now do the rest of housekeeping */
 679
 680        inode->i_ctime = CURRENT_TIME;
 681        ext3_mark_inode_dirty(handle, inode);
 682
 683        /* had we spliced it onto indirect block? */
 684        if (where->bh) {
 685                /*
 686                 * akpm: If we spliced it onto an indirect block, we haven't
 687                 * altered the inode.  Note however that if it is being spliced
 688                 * onto an indirect block at the very end of the file (the
 689                 * file is growing) then we *will* alter the inode to reflect
 690                 * the new i_size.  But that is not done here - it is done in
 691                 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
 692                 */
 693                jbd_debug(5, "splicing indirect only\n");
 694                BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
 695                err = ext3_journal_dirty_metadata(handle, where->bh);
 696                if (err) 
 697                        goto err_out;
 698        } else {
 699                /*
 700                 * OK, we spliced it into the inode itself on a direct block.
 701                 * Inode was dirtied above.
 702                 */
 703                jbd_debug(5, "splicing direct\n");
 704        }
 705        return err;
 706
 707changed:
 708        /*
 709         * AKPM: if where[i].bh isn't part of the current updating
 710         * transaction then we explode nastily.  Test this code path.
 711         */
 712        jbd_debug(1, "the chain changed: try again\n");
 713        err = -EAGAIN;
 714        
 715err_out:
 716        for (i = 1; i < num; i++) {
 717                BUFFER_TRACE(where[i].bh, "call journal_forget");
 718                ext3_journal_forget(handle, where[i].bh);
 719        }
 720        /* For the normal collision cleanup case, we free up the blocks.
 721         * On genuine filesystem errors we don't even think about doing
 722         * that. */
 723        if (err == -EAGAIN)
 724                for (i = 0; i < num; i++)
 725                        ext3_free_blocks(handle, inode, 
 726                                         le32_to_cpu(where[i].key), 1);
 727        return err;
 728}
 729
 730/*
 731 * Allocation strategy is simple: if we have to allocate something, we will
 732 * have to go the whole way to leaf. So let's do it before attaching anything
 733 * to tree, set linkage between the newborn blocks, write them if sync is
 734 * required, recheck the path, free and repeat if check fails, otherwise
 735 * set the last missing link (that will protect us from any truncate-generated
 736 * removals - all blocks on the path are immune now) and possibly force the
 737 * write on the parent block.
 738 * That has a nice additional property: no special recovery from the failed
 739 * allocations is needed - we simply release blocks and do not touch anything
 740 * reachable from inode.
 741 *
 742 * akpm: `handle' can be NULL if create == 0.
 743 *
 744 * The BKL may not be held on entry here.  Be sure to take it early.
 745 */
 746
 747static int ext3_get_block_handle(handle_t *handle, struct inode *inode, 
 748                                 long iblock,
 749                                 struct buffer_head *bh_result, int create)
 750{
 751        int err = -EIO;
 752        int offsets[4];
 753        Indirect chain[4];
 754        Indirect *partial;
 755        unsigned long goal;
 756        int left;
 757        int depth = ext3_block_to_path(inode, iblock, offsets);
 758        loff_t new_size;
 759
 760        J_ASSERT(handle != NULL || create == 0);
 761
 762        if (depth == 0)
 763                goto out;
 764
 765        lock_kernel();
 766reread:
 767        partial = ext3_get_branch(inode, depth, offsets, chain, &err);
 768
 769        /* Simplest case - block found, no allocation needed */
 770        if (!partial) {
 771                bh_result->b_state &= ~(1UL << BH_New);
 772got_it:
 773                bh_result->b_dev = inode->i_dev;
 774                bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
 775                bh_result->b_state |= (1UL << BH_Mapped);
 776                /* Clean up and exit */
 777                partial = chain+depth-1; /* the whole chain */
 778                goto cleanup;
 779        }
 780
 781        /* Next simple case - plain lookup or failed read of indirect block */
 782        if (!create || err == -EIO) {
 783cleanup:
 784                while (partial > chain) {
 785                        BUFFER_TRACE(partial->bh, "call brelse");
 786                        brelse(partial->bh);
 787                        partial--;
 788                }
 789                BUFFER_TRACE(bh_result, "returned");
 790                unlock_kernel();
 791out:
 792                return err;
 793        }
 794
 795        /*
 796         * Indirect block might be removed by truncate while we were
 797         * reading it. Handling of that case (forget what we've got and
 798         * reread) is taken out of the main path.
 799         */
 800        if (err == -EAGAIN)
 801                goto changed;
 802
 803        goal = 0;
 804        if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
 805                goto changed;
 806
 807        left = (chain + depth) - partial;
 808
 809        /*
 810         * Block out ext3_truncate while we alter the tree
 811         */
 812        down_read(&inode->u.ext3_i.truncate_sem);
 813        err = ext3_alloc_branch(handle, inode, left, goal,
 814                                        offsets+(partial-chain), partial);
 815
 816        /* The ext3_splice_branch call will free and forget any buffers
 817         * on the new chain if there is a failure, but that risks using
 818         * up transaction credits, especially for bitmaps where the
 819         * credits cannot be returned.  Can we handle this somehow?  We
 820         * may need to return -EAGAIN upwards in the worst case.  --sct */
 821        if (!err)
 822                err = ext3_splice_branch(handle, inode, iblock, chain,
 823                                         partial, left);
 824        up_read(&inode->u.ext3_i.truncate_sem);
 825        if (err == -EAGAIN)
 826                goto changed;
 827        if (err)
 828                goto cleanup;
 829
 830        new_size = inode->i_size;
 831        /*
 832         * This is not racy against ext3_truncate's modification of i_disksize
 833         * because VM/VFS ensures that the file cannot be extended while
 834         * truncate is in progress.  It is racy between multiple parallel
 835         * instances of get_block, but we have the BKL.
 836         */
 837        if (new_size > inode->u.ext3_i.i_disksize)
 838                inode->u.ext3_i.i_disksize = new_size;
 839
 840        bh_result->b_state |= (1UL << BH_New);
 841        goto got_it;
 842
 843changed:
 844        while (partial > chain) {
 845                jbd_debug(1, "buffer chain changed, retrying\n");
 846                BUFFER_TRACE(partial->bh, "brelsing");
 847                brelse(partial->bh);
 848                partial--;
 849        }
 850        goto reread;
 851}
 852
 853/*
 854 * The BKL is not held on entry here.
 855 */
 856static int ext3_get_block(struct inode *inode, long iblock,
 857                        struct buffer_head *bh_result, int create)
 858{
 859        handle_t *handle = 0;
 860        int ret;
 861
 862        if (create) {
 863                handle = ext3_journal_current_handle();
 864                J_ASSERT(handle != 0);
 865        }
 866        ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
 867        return ret;
 868}
 869
 870/*
 871 * `handle' can be NULL if create is zero
 872 */
 873struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
 874                                long block, int create, int * errp)
 875{
 876        struct buffer_head dummy;
 877        int fatal = 0, err;
 878        
 879        J_ASSERT(handle != NULL || create == 0);
 880
 881        dummy.b_state = 0;
 882        dummy.b_blocknr = -1000;
 883        buffer_trace_init(&dummy.b_history);
 884        *errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
 885        if (!*errp && buffer_mapped(&dummy)) {
 886                struct buffer_head *bh;
 887                bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
 888                if (buffer_new(&dummy)) {
 889                        J_ASSERT(create != 0);
 890                        J_ASSERT(handle != 0);
 891
 892                        /* Now that we do not always journal data, we
 893                           should keep in mind whether this should
 894                           always journal the new buffer as metadata.
 895                           For now, regular file writes use
 896                           ext3_get_block instead, so it's not a
 897                           problem. */
 898                        lock_kernel();
 899                        lock_buffer(bh);
 900                        BUFFER_TRACE(bh, "call get_create_access");
 901                        fatal = ext3_journal_get_create_access(handle, bh);
 902                        if (!fatal) {
 903                                memset(bh->b_data, 0,
 904                                       inode->i_sb->s_blocksize);
 905                                mark_buffer_uptodate(bh, 1);
 906                        }
 907                        unlock_buffer(bh);
 908                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 909                        err = ext3_journal_dirty_metadata(handle, bh);
 910                        if (!fatal) fatal = err;
 911                        unlock_kernel();
 912                } else {
 913                        BUFFER_TRACE(bh, "not a new buffer");
 914                }
 915                if (fatal) {
 916                        *errp = fatal;
 917                        brelse(bh);
 918                        bh = NULL;
 919                }
 920                return bh;
 921        }
 922        return NULL;
 923}
 924
 925struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
 926                               int block, int create, int *err)
 927{
 928        struct buffer_head * bh;
 929        int prev_blocks;
 930
 931        prev_blocks = inode->i_blocks;
 932
 933        bh = ext3_getblk (handle, inode, block, create, err);
 934        if (!bh)
 935                return bh;
 936#ifdef EXT3_PREALLOCATE
 937        /*
 938         * If the inode has grown, and this is a directory, then use a few
 939         * more of the preallocated blocks to keep directory fragmentation
 940         * down.  The preallocated blocks are guaranteed to be contiguous.
 941         */
 942        if (create &&
 943            S_ISDIR(inode->i_mode) &&
 944            inode->i_blocks > prev_blocks &&
 945            EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
 946                                    EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
 947                int i;
 948                struct buffer_head *tmp_bh;
 949
 950                for (i = 1;
 951                     inode->u.ext3_i.i_prealloc_count &&
 952                     i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
 953                     i++) {
 954                        /*
 955                         * ext3_getblk will zero out the contents of the
 956                         * directory for us
 957                         */
 958                        tmp_bh = ext3_getblk(handle, inode,
 959                                                block+i, create, err);
 960                        if (!tmp_bh) {
 961                                brelse (bh);
 962                                return 0;
 963                        }
 964                        brelse (tmp_bh);
 965                }
 966        }
 967#endif
 968        if (buffer_uptodate(bh))
 969                return bh;
 970        ll_rw_block (READ, 1, &bh);
 971        wait_on_buffer (bh);
 972        if (buffer_uptodate(bh))
 973                return bh;
 974        brelse (bh);
 975        *err = -EIO;
 976        return NULL;
 977}
 978
 979static int walk_page_buffers(   handle_t *handle,
 980                                struct inode *inode,
 981                                struct buffer_head *head,
 982                                unsigned from,
 983                                unsigned to,
 984                                int *partial,
 985                                int (*fn)(      handle_t *handle,
 986                                                struct inode *inode,
 987                                                struct buffer_head *bh))
 988{
 989        struct buffer_head *bh;
 990        unsigned block_start, block_end;
 991        unsigned blocksize = head->b_size;
 992        int err, ret = 0;
 993
 994        for (   bh = head, block_start = 0;
 995                ret == 0 && (bh != head || !block_start);
 996                block_start = block_end, bh = bh->b_this_page)
 997        {
 998                block_end = block_start + blocksize;
 999                if (block_end <= from || block_start >= to) {
1000                        if (partial && !buffer_uptodate(bh))
1001                                *partial = 1;
1002                        continue;
1003                }
1004                err = (*fn)(handle, inode, bh);
1005                if (!ret)
1006                        ret = err;
1007        }
1008        return ret;
1009}
1010
1011/*
1012 * To preserve ordering, it is essential that the hole instantiation and
1013 * the data write be encapsulated in a single transaction.  We cannot
1014 * close off a transaction and start a new one between the ext3_get_block()
1015 * and the commit_write().  So doing the journal_start at the start of
1016 * prepare_write() is the right place.
1017 *
1018 * Also, this function can nest inside ext3_writepage() ->
1019 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1020 * has generated enough buffer credits to do the whole page.  So we won't
1021 * block on the journal in that case, which is good, because the caller may
1022 * be PF_MEMALLOC.
1023 *
1024 * By accident, ext3 can be reentered when a transaction is open via
1025 * quota file writes.  If we were to commit the transaction while thus
1026 * reentered, there can be a deadlock - we would be holding a quota
1027 * lock, and the commit would never complete if another thread had a
1028 * transaction open and was blocking on the quota lock - a ranking
1029 * violation.
1030 *
1031 * So what we do is to rely on the fact that journal_stop/journal_start
1032 * will _not_ run commit under these circumstances because handle->h_ref
1033 * is elevated.  We'll still have enough credits for the tiny quotafile
1034 * write.  
1035 */
1036
1037static int do_journal_get_write_access(handle_t *handle, struct inode *inode,
1038                                       struct buffer_head *bh)
1039{
1040        return ext3_journal_get_write_access(handle, bh);
1041}
1042
1043static int ext3_prepare_write(struct file *file, struct page *page,
1044                              unsigned from, unsigned to)
1045{
1046        struct inode *inode = page->mapping->host;
1047        int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1048        handle_t *handle;
1049
1050        lock_kernel();
1051        handle = ext3_journal_start(inode, needed_blocks);
1052        if (IS_ERR(handle)) {
1053                ret = PTR_ERR(handle);
1054                goto out;
1055        }
1056        unlock_kernel();
1057        ret = block_prepare_write(page, from, to, ext3_get_block);
1058        lock_kernel();
1059        if (ret != 0)
1060                goto prepare_write_failed;
1061
1062        if (ext3_should_journal_data(inode)) {
1063                ret = walk_page_buffers(handle, inode, page->buffers,
1064                                from, to, NULL, do_journal_get_write_access);
1065                if (ret) {
1066                        /*
1067                         * We're going to fail this prepare_write(),
1068                         * so commit_write() will not be called.
1069                         * We need to undo block_prepare_write()'s kmap().
1070                         * AKPM: Do we need to clear PageUptodate?  I don't
1071                         * think so.
1072                         */
1073                        kunmap(page);
1074                }
1075        }
1076prepare_write_failed:
1077        if (ret)
1078                ext3_journal_stop(handle, inode);
1079out:
1080        unlock_kernel();
1081        return ret;
1082}
1083
1084static int journal_dirty_sync_data(handle_t *handle, struct inode *inode,
1085                                   struct buffer_head *bh)
1086{
1087        int ret = ext3_journal_dirty_data(handle, bh, 0);
1088        buffer_insert_inode_data_queue(bh, inode);
1089        return ret;
1090}
1091
1092/*
1093 * For ext3_writepage().  We also brelse() the buffer to account for
1094 * the bget() which ext3_writepage() performs.
1095 */
1096static int journal_dirty_async_data(handle_t *handle, struct inode *inode, 
1097                                    struct buffer_head *bh)
1098{
1099        int ret = ext3_journal_dirty_data(handle, bh, 1);
1100        buffer_insert_inode_data_queue(bh, inode);
1101        __brelse(bh);
1102        return ret;
1103}
1104
1105/* For commit_write() in data=journal mode */
1106static int commit_write_fn(handle_t *handle, struct inode *inode, 
1107                           struct buffer_head *bh)
1108{
1109        set_bit(BH_Uptodate, &bh->b_state);
1110        return ext3_journal_dirty_metadata(handle, bh);
1111}
1112
1113/*
1114 * We need to pick up the new inode size which generic_commit_write gave us
1115 * `file' can be NULL - eg, when called from block_symlink().
1116 *
1117 * ext3 inode->i_dirty_buffers policy:  If we're journalling data we
1118 * definitely don't want them to appear on the inode at all - instead
1119 * we need to manage them at the JBD layer and we need to intercept
1120 * the relevant sync operations and translate them into journal operations.
1121 *
1122 * If we're not journalling data then we can just leave the buffers
1123 * on ->i_dirty_buffers.  If someone writes them out for us then thanks.
1124 * Otherwise we'll do it in commit, if we're using ordered data.
1125 */
1126
1127static int ext3_commit_write(struct file *file, struct page *page,
1128                             unsigned from, unsigned to)
1129{
1130        handle_t *handle = ext3_journal_current_handle();
1131        struct inode *inode = page->mapping->host;
1132        int ret = 0, ret2;
1133
1134        lock_kernel();
1135        if (ext3_should_journal_data(inode)) {
1136                /*
1137                 * Here we duplicate the generic_commit_write() functionality
1138                 */
1139                int partial = 0;
1140                loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1141
1142                ret = walk_page_buffers(handle, inode, page->buffers,
1143                        from, to, &partial, commit_write_fn);
1144                if (!partial)
1145                        SetPageUptodate(page);
1146                kunmap(page);
1147                if (pos > inode->i_size)
1148                        inode->i_size = pos;
1149                EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1150        } else {
1151                if (ext3_should_order_data(inode)) {
1152                        ret = walk_page_buffers(handle, inode, page->buffers,
1153                                from, to, NULL, journal_dirty_sync_data);
1154                }
1155                /* Be careful here if generic_commit_write becomes a
1156                 * required invocation after block_prepare_write. */
1157                if (ret == 0) {
1158                        ret = generic_commit_write(file, page, from, to);
1159                } else {
1160                        /*
1161                         * block_prepare_write() was called, but we're not
1162                         * going to call generic_commit_write().  So we
1163                         * need to perform generic_commit_write()'s kunmap
1164                         * by hand.
1165                         */
1166                        kunmap(page);
1167                }
1168        }
1169        if (inode->i_size > inode->u.ext3_i.i_disksize) {
1170                inode->u.ext3_i.i_disksize = inode->i_size;
1171                ret2 = ext3_mark_inode_dirty(handle, inode);
1172                if (!ret) 
1173                        ret = ret2;
1174        }
1175        ret2 = ext3_journal_stop(handle, inode);
1176        unlock_kernel();
1177        if (!ret)
1178                ret = ret2;
1179        return ret;
1180}
1181
1182/* 
1183 * bmap() is special.  It gets used by applications such as lilo and by
1184 * the swapper to find the on-disk block of a specific piece of data.
1185 *
1186 * Naturally, this is dangerous if the block concerned is still in the
1187 * journal.  If somebody makes a swapfile on an ext3 data-journaling
1188 * filesystem and enables swap, then they may get a nasty shock when the
1189 * data getting swapped to that swapfile suddenly gets overwritten by
1190 * the original zero's written out previously to the journal and
1191 * awaiting writeback in the kernel's buffer cache. 
1192 *
1193 * So, if we see any bmap calls here on a modified, data-journaled file,
1194 * take extra steps to flush any blocks which might be in the cache. 
1195 */
1196static int ext3_bmap(struct address_space *mapping, long block)
1197{
1198        struct inode *inode = mapping->host;
1199        journal_t *journal;
1200        int err;
1201        
1202        if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1203                /* 
1204                 * This is a REALLY heavyweight approach, but the use of
1205                 * bmap on dirty files is expected to be extremely rare:
1206                 * only if we run lilo or swapon on a freshly made file
1207                 * do we expect this to happen. 
1208                 *
1209                 * (bmap requires CAP_SYS_RAWIO so this does not
1210                 * represent an unprivileged user DOS attack --- we'd be
1211                 * in trouble if mortal users could trigger this path at
1212                 * will.) 
1213                 *
1214                 * NB. EXT3_STATE_JDATA is not set on files other than
1215                 * regular files.  If somebody wants to bmap a directory
1216                 * or symlink and gets confused because the buffer
1217                 * hasn't yet been flushed to disk, they deserve
1218                 * everything they get.
1219                 */
1220                
1221                EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1222                journal = EXT3_JOURNAL(inode);
1223                journal_lock_updates(journal);
1224                err = journal_flush(journal);
1225                journal_unlock_updates(journal);
1226                
1227                if (err)
1228                        return 0;
1229        }
1230        
1231        return generic_block_bmap(mapping,block,ext3_get_block);
1232}
1233
1234static int bget_one(handle_t *handle, struct inode *inode, 
1235                    struct buffer_head *bh)
1236{
1237        atomic_inc(&bh->b_count);
1238        return 0;
1239}
1240
1241/*
1242 * Note that we always start a transaction even if we're not journalling
1243 * data.  This is to preserve ordering: any hole instantiation within
1244 * __block_write_full_page -> ext3_get_block() should be journalled
1245 * along with the data so we don't crash and then get metadata which
1246 * refers to old data.
1247 *
1248 * In all journalling modes block_write_full_page() will start the I/O.
1249 *
1250 * Problem:
1251 *
1252 *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1253 *              ext3_writepage()
1254 *
1255 * Similar for:
1256 *
1257 *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1258 *
1259 * Same applies to ext3_get_block().  We will deadlock on various things like
1260 * lock_journal and i_truncate_sem.
1261 *
1262 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1263 * allocations fail.
1264 *
1265 * 16May01: If we're reentered then journal_current_handle() will be
1266 *          non-zero. We simply *return*.
1267 *
1268 * 1 July 2001: @@@ FIXME:
1269 *   In journalled data mode, a data buffer may be metadata against the
1270 *   current transaction.  But the same file is part of a shared mapping
1271 *   and someone does a writepage() on it.
1272 *
1273 *   We will move the buffer onto the async_data list, but *after* it has
1274 *   been dirtied. So there's a small window where we have dirty data on
1275 *   BJ_Metadata.
1276 *
1277 *   Note that this only applies to the last partial page in the file.  The
1278 *   bit which block_write_full_page() uses prepare/commit for.  (That's
1279 *   broken code anyway: it's wrong for msync()).
1280 *
1281 *   It's a rare case: affects the final partial page, for journalled data
1282 *   where the file is subject to bith write() and writepage() in the same
1283 *   transction.  To fix it we'll need a custom block_write_full_page().
1284 *   We'll probably need that anyway for journalling writepage() output.
1285 *
1286 * We don't honour synchronous mounts for writepage().  That would be
1287 * disastrous.  Any write() or metadata operation will sync the fs for
1288 * us.
1289 */
1290static int ext3_writepage(struct page *page)
1291{
1292        struct inode *inode = page->mapping->host;
1293        struct buffer_head *page_buffers;
1294        handle_t *handle = NULL;
1295        int ret = 0, err;
1296        int needed;
1297        int order_data;
1298
1299        J_ASSERT(PageLocked(page));
1300        
1301        /*
1302         * We give up here if we're reentered, because it might be
1303         * for a different filesystem.  One *could* look for a
1304         * nested transaction opportunity.
1305         */
1306        lock_kernel();
1307        if (ext3_journal_current_handle())
1308                goto out_fail;
1309
1310        needed = ext3_writepage_trans_blocks(inode);
1311        if (current->flags & PF_MEMALLOC)
1312                handle = ext3_journal_try_start(inode, needed);
1313        else
1314                handle = ext3_journal_start(inode, needed);
1315                                
1316        if (IS_ERR(handle)) {
1317                ret = PTR_ERR(handle);
1318                goto out_fail;
1319        }
1320
1321        order_data = ext3_should_order_data(inode) ||
1322                        ext3_should_journal_data(inode);
1323
1324        unlock_kernel();
1325
1326        page_buffers = NULL;    /* Purely to prevent compiler warning */
1327
1328        /* bget() all the buffers */
1329        if (order_data) {
1330                if (!page->buffers)
1331                        create_empty_buffers(page,
1332                                inode->i_dev, inode->i_sb->s_blocksize);
1333                page_buffers = page->buffers;
1334                walk_page_buffers(handle, inode, page_buffers, 0,
1335                                PAGE_CACHE_SIZE, NULL, bget_one);
1336        }
1337
1338        ret = block_write_full_page(page, ext3_get_block);
1339
1340        /*
1341         * The page can become unlocked at any point now, and
1342         * truncate can then come in and change things.  So we
1343         * can't touch *page from now on.  But *page_buffers is
1344         * safe due to elevated refcount.
1345         */
1346
1347        handle = ext3_journal_current_handle();
1348        lock_kernel();
1349
1350        /* And attach them to the current transaction */
1351        if (order_data) {
1352                err = walk_page_buffers(handle, inode, page_buffers,
1353                        0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
1354                if (!ret)
1355                        ret = err;
1356        }
1357
1358        err = ext3_journal_stop(handle, inode);
1359        if (!ret)
1360                ret = err;
1361        unlock_kernel();
1362        return ret;
1363
1364out_fail:
1365        
1366        unlock_kernel();
1367        SetPageDirty(page);
1368        UnlockPage(page);
1369        return ret;
1370}
1371
1372static int ext3_readpage(struct file *file, struct page *page)
1373{
1374        return block_read_full_page(page,ext3_get_block);
1375}
1376
1377
1378static int ext3_flushpage(struct page *page, unsigned long offset)
1379{
1380        journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1381        return journal_flushpage(journal, page, offset);
1382}
1383
1384static int ext3_releasepage(struct page *page, int wait)
1385{
1386        journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1387        return journal_try_to_free_buffers(journal, page, wait);
1388}
1389
1390
1391struct address_space_operations ext3_aops = {
1392        readpage:       ext3_readpage,          /* BKL not held.  Don't need */
1393        writepage:      ext3_writepage,         /* BKL not held.  We take it */
1394        sync_page:      block_sync_page,
1395        prepare_write:  ext3_prepare_write,     /* BKL not held.  We take it */
1396        commit_write:   ext3_commit_write,      /* BKL not held.  We take it */
1397        bmap:           ext3_bmap,              /* BKL held */
1398        flushpage:      ext3_flushpage,         /* BKL not held.  Don't need */
1399        releasepage:    ext3_releasepage,       /* BKL not held.  Don't need */
1400};
1401
1402/*
1403 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1404 * up to the end of the block which corresponds to `from'.
1405 * This required during truncate. We need to physically zero the tail end
1406 * of that block so it doesn't yield old data if the file is later grown.
1407 */
1408static int ext3_block_truncate_page(handle_t *handle,
1409                struct address_space *mapping, loff_t from)
1410{
1411        unsigned long index = from >> PAGE_CACHE_SHIFT;
1412        unsigned offset = from & (PAGE_CACHE_SIZE-1);
1413        unsigned blocksize, iblock, length, pos;
1414        struct inode *inode = mapping->host;
1415        struct page *page;
1416        struct buffer_head *bh;
1417        int err;
1418
1419        blocksize = inode->i_sb->s_blocksize;
1420        length = offset & (blocksize - 1);
1421
1422        /* Block boundary? Nothing to do */
1423        if (!length)
1424                return 0;
1425
1426        length = blocksize - length;
1427        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1428
1429        page = find_or_create_page(mapping, index, GFP_NOFS);
1430        err = -ENOMEM;
1431        if (!page)
1432                goto out;
1433
1434        if (!page->buffers)
1435                create_empty_buffers(page, inode->i_dev, blocksize);
1436
1437        /* Find the buffer that contains "offset" */
1438        bh = page->buffers;
1439        pos = blocksize;
1440        while (offset >= pos) {
1441                bh = bh->b_this_page;
1442                iblock++;
1443                pos += blocksize;
1444        }
1445
1446        err = 0;
1447        if (!buffer_mapped(bh)) {
1448                /* Hole? Nothing to do */
1449                if (buffer_uptodate(bh))
1450                        goto unlock;
1451                ext3_get_block(inode, iblock, bh, 0);
1452                /* Still unmapped? Nothing to do */
1453                if (!buffer_mapped(bh))
1454                        goto unlock;
1455        }
1456
1457        /* Ok, it's mapped. Make sure it's up-to-date */
1458        if (Page_Uptodate(page))
1459                set_bit(BH_Uptodate, &bh->b_state);
1460
1461        if (!buffer_uptodate(bh)) {
1462                err = -EIO;
1463                ll_rw_block(READ, 1, &bh);
1464                wait_on_buffer(bh);
1465                /* Uhhuh. Read error. Complain and punt. */
1466                if (!buffer_uptodate(bh))
1467                        goto unlock;
1468        }
1469
1470        if (ext3_should_journal_data(inode)) {
1471                BUFFER_TRACE(bh, "get write access");
1472                err = ext3_journal_get_write_access(handle, bh);
1473                if (err)
1474                        goto unlock;
1475        }
1476        
1477        memset(kmap(page) + offset, 0, length);
1478        flush_dcache_page(page);
1479        kunmap(page);
1480
1481        BUFFER_TRACE(bh, "zeroed end of block");
1482
1483        err = 0;
1484        if (ext3_should_journal_data(inode)) {
1485                err = ext3_journal_dirty_metadata(handle, bh);
1486        } else {
1487                if (ext3_should_order_data(inode))
1488                        err = ext3_journal_dirty_data(handle, bh, 0);
1489                __mark_buffer_dirty(bh);
1490        }
1491
1492unlock:
1493        UnlockPage(page);
1494        page_cache_release(page);
1495out:
1496        return err;
1497}
1498
/*
 * all_zeroes - return 1 if every word in [p, q) is zero, 0 otherwise.
 * An empty range counts as all zeroes.
 *
 * Probably it should be a library function... search for first non-zero word
 * or memcmp with zero_page, whatever is better for particular architecture.
 * Linus?
 */
static inline int all_zeroes(u32 *p, u32 *q)
{
        for (; p < q; p++) {
                if (*p)
                        return 0;
        }
        return 1;
}
1511
1512/**
1513 *      ext3_find_shared - find the indirect blocks for partial truncation.
1514 *      @inode:   inode in question
1515 *      @depth:   depth of the affected branch
1516 *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1517 *      @chain:   place to store the pointers to partial indirect blocks
1518 *      @top:     place to the (detached) top of branch
1519 *
1520 *      This is a helper function used by ext3_truncate().
1521 *
1522 *      When we do truncate() we may have to clean the ends of several
1523 *      indirect blocks but leave the blocks themselves alive. Block is
1524 *      partially truncated if some data below the new i_size is refered
1525 *      from it (and it is on the path to the first completely truncated
1526 *      data block, indeed).  We have to free the top of that path along
1527 *      with everything to the right of the path. Since no allocation
1528 *      past the truncation point is possible until ext3_truncate()
1529 *      finishes, we may safely do the latter, but top of branch may
1530 *      require special attention - pageout below the truncation point
1531 *      might try to populate it.
1532 *
1533 *      We atomically detach the top of branch from the tree, store the
1534 *      block number of its root in *@top, pointers to buffer_heads of
1535 *      partially truncated blocks - in @chain[].bh and pointers to
1536 *      their last elements that should not be removed - in
1537 *      @chain[].p. Return value is the pointer to last filled element
1538 *      of @chain.
1539 *
1540 *      The work left to caller to do the actual freeing of subtrees:
1541 *              a) free the subtree starting from *@top
1542 *              b) free the subtrees whose roots are stored in
1543 *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1544 *              c) free the subtrees growing from the inode past the @chain[0].
1545 *                      (no partially truncated stuff there).  */
1546
1547static Indirect *ext3_find_shared(struct inode *inode,
1548                                int depth,
1549                                int offsets[4],
1550                                Indirect chain[4],
1551                                u32 *top)
1552{
1553        Indirect *partial, *p;
1554        int k, err;
1555
1556        *top = 0;
1557        /* Make k index the deepest non-null offest + 1 */
1558        for (k = depth; k > 1 && !offsets[k-1]; k--)
1559                ;
1560        partial = ext3_get_branch(inode, k, offsets, chain, &err);
1561        /* Writer: pointers */
1562        if (!partial)
1563                partial = chain + k-1;
1564        /*
1565         * If the branch acquired continuation since we've looked at it -
1566         * fine, it should all survive and (new) top doesn't belong to us.
1567         */
1568        if (!partial->key && *partial->p)
1569                /* Writer: end */
1570                goto no_top;
1571        for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
1572                ;
1573        /*
1574         * OK, we've found the last block that must survive. The rest of our
1575         * branch should be detached before unlocking. However, if that rest
1576         * of branch is all ours and does not grow immediately from the inode
1577         * it's easier to cheat and just decrement partial->p.
1578         */
1579        if (p == chain + k - 1 && p > chain) {
1580                p->p--;
1581        } else {
1582                *top = *p->p;
1583                /* Nope, don't do this in ext3.  Must leave the tree intact */
1584#if 0
1585                *p->p = 0;
1586#endif
1587        }
1588        /* Writer: end */
1589
1590        while(partial > p)
1591        {
1592                brelse(partial->bh);
1593                partial--;
1594        }
1595no_top:
1596        return partial;
1597}
1598
1599/*
1600 * Zero a number of block pointers in either an inode or an indirect block.
1601 * If we restart the transaction we must again get write access to the
1602 * indirect block for further modification.
1603 *
1604 * We release `count' blocks on disk, but (last - first) may be greater
1605 * than `count' because there can be holes in there.
1606 */
1607static void
1608ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1609                unsigned long block_to_free, unsigned long count,
1610                u32 *first, u32 *last)
1611{
1612        u32 *p;
1613        if (try_to_extend_transaction(handle, inode)) {
1614                if (bh) {
1615                        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1616                        ext3_journal_dirty_metadata(handle, bh);
1617                }
1618                ext3_mark_inode_dirty(handle, inode);
1619                ext3_journal_test_restart(handle, inode);
1620                if (bh) {
1621                        BUFFER_TRACE(bh, "retaking write access");
1622                        ext3_journal_get_write_access(handle, bh);
1623                }
1624        }
1625
1626        /*
1627         * Any buffers which are on the journal will be in memory. We find
1628         * them on the hash table so journal_revoke() will run journal_forget()
1629         * on them.  We've already detached each block from the file, so
1630         * bforget() in journal_forget() should be safe.
1631         *
1632         * AKPM: turn on bforget in journal_forget()!!!
1633         */
1634        for (p = first; p < last; p++) {
1635                u32 nr = le32_to_cpu(*p);
1636                if (nr) {
1637                        struct buffer_head *bh;
1638
1639                        *p = 0;
1640                        bh = sb_get_hash_table(inode->i_sb, nr);
1641                        ext3_forget(handle, 0, inode, bh, nr);
1642                }
1643        }
1644
1645        ext3_free_blocks(handle, inode, block_to_free, count);
1646}
1647
1648/**
1649 * ext3_free_data - free a list of data blocks
1650 * @handle:     handle for this transaction
1651 * @inode:      inode we are dealing with
1652 * @this_bh:    indirect buffer_head which contains *@first and *@last
1653 * @first:      array of block numbers
1654 * @last:       points immediately past the end of array
1655 *
1656 * We are freeing all blocks refered from that array (numbers are stored as
1657 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1658 *
1659 * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1660 * blocks are contiguous then releasing them at one time will only affect one
1661 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1662 * actually use a lot of journal space.
1663 *
1664 * @this_bh will be %NULL if @first and @last point into the inode's direct
1665 * block pointers.
1666 */
1667static void ext3_free_data(handle_t *handle, struct inode *inode,
1668                           struct buffer_head *this_bh, u32 *first, u32 *last)
1669{
1670        unsigned long block_to_free = 0;    /* Starting block # of a run */
1671        unsigned long count = 0;            /* Number of blocks in the run */ 
1672        u32 *block_to_free_p = NULL;        /* Pointer into inode/ind
1673                                               corresponding to
1674                                               block_to_free */
1675        unsigned long nr;                   /* Current block # */
1676        u32 *p;                             /* Pointer into inode/ind
1677                                               for current block */
1678        int err;
1679
1680        if (this_bh) {                          /* For indirect block */
1681                BUFFER_TRACE(this_bh, "get_write_access");
1682                err = ext3_journal_get_write_access(handle, this_bh);
1683                /* Important: if we can't update the indirect pointers
1684                 * to the blocks, we can't free them. */
1685                if (err)
1686                        return;
1687        }
1688
1689        for (p = first; p < last; p++) {
1690                nr = le32_to_cpu(*p);
1691                if (nr) {
1692                        /* accumulate blocks to free if they're contiguous */
1693                        if (count == 0) {
1694                                block_to_free = nr;
1695                                block_to_free_p = p;
1696                                count = 1;
1697                        } else if (nr == block_to_free + count) {
1698                                count++;
1699                        } else {
1700                                ext3_clear_blocks(handle, inode, this_bh, 
1701                                                  block_to_free,
1702                                                  count, block_to_free_p, p);
1703                                block_to_free = nr;
1704                                block_to_free_p = p;
1705                                count = 1;
1706                        }
1707                }
1708        }
1709
1710        if (count > 0)
1711                ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1712                                  count, block_to_free_p, p);
1713
1714        if (this_bh) {
1715                BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1716                ext3_journal_dirty_metadata(handle, this_bh);
1717        }
1718}
1719
1720/**
1721 *      ext3_free_branches - free an array of branches
1722 *      @handle: JBD handle for this transaction
1723 *      @inode: inode we are dealing with
1724 *      @parent_bh: the buffer_head which contains *@first and *@last
1725 *      @first: array of block numbers
1726 *      @last:  pointer immediately past the end of array
1727 *      @depth: depth of the branches to free
1728 *
1729 *      We are freeing all blocks refered from these branches (numbers are
1730 *      stored as little-endian 32-bit) and updating @inode->i_blocks
1731 *      appropriately.
1732 */
1733static void ext3_free_branches(handle_t *handle, struct inode *inode,
1734                               struct buffer_head *parent_bh,
1735                               u32 *first, u32 *last, int depth)
1736{
1737        unsigned long nr;
1738        u32 *p;
1739
1740        if (is_handle_aborted(handle))
1741                return;
1742        
1743        if (depth--) {
1744                struct buffer_head *bh;
1745                int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1746                p = last;
1747                while (--p >= first) {
1748                        nr = le32_to_cpu(*p);
1749                        if (!nr)
1750                                continue;               /* A hole */
1751
1752                        /* Go read the buffer for the next level down */
1753                        bh = sb_bread(inode->i_sb, nr);
1754
1755                        /*
1756                         * A read failure? Report error and clear slot
1757                         * (should be rare).
1758                         */
1759                        if (!bh) {
1760                                ext3_error(inode->i_sb, "ext3_free_branches",
1761                                           "Read failure, inode=%ld, block=%ld",
1762                                           inode->i_ino, nr);
1763                                continue;
1764                        }
1765
1766                        /* This zaps the entire block.  Bottom up. */
1767                        BUFFER_TRACE(bh, "free child branches");
1768                        ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
1769                                           (u32*)bh->b_data + addr_per_block,
1770                                           depth);
1771
1772                        /*
1773                         * We've probably journalled the indirect block several
1774                         * times during the truncate.  But it's no longer
1775                         * needed and we now drop it from the transaction via
1776                         * journal_revoke().
1777                         *
1778                         * That's easy if it's exclusively part of this
1779                         * transaction.  But if it's part of the committing
1780                         * transaction then journal_forget() will simply
1781                         * brelse() it.  That means that if the underlying
1782                         * block is reallocated in ext3_get_block(),
1783                         * unmap_underlying_metadata() will find this block
1784                         * and will try to get rid of it.  damn, damn.
1785                         *
1786                         * If this block has already been committed to the
1787                         * journal, a revoke record will be written.  And
1788                         * revoke records must be emitted *before* clearing
1789                         * this block's bit in the bitmaps.
1790                         */
1791                        ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
1792
1793                        /*
1794                         * Everything below this this pointer has been
1795                         * released.  Now let this top-of-subtree go.
1796                         *
1797                         * We want the freeing of this indirect block to be
1798                         * atomic in the journal with the updating of the
1799                         * bitmap block which owns it.  So make some room in
1800                         * the journal.
1801                         *
1802                         * We zero the parent pointer *after* freeing its
1803                         * pointee in the bitmaps, so if extend_transaction()
1804                         * for some reason fails to put the bitmap changes and
1805                         * the release into the same transaction, recovery
1806                         * will merely complain about releasing a free block,
1807                         * rather than leaking blocks.
1808                         */
1809                        if (is_handle_aborted(handle))
1810                                return;
1811                        if (try_to_extend_transaction(handle, inode)) {
1812                                ext3_mark_inode_dirty(handle, inode);
1813                                ext3_journal_test_restart(handle, inode);
1814                        }
1815
1816                        ext3_free_blocks(handle, inode, nr, 1);
1817
1818                        if (parent_bh) {
1819                                /*
1820                                 * The block which we have just freed is
1821                                 * pointed to by an indirect block: journal it
1822                                 */
1823                                BUFFER_TRACE(parent_bh, "get_write_access");
1824                                if (!ext3_journal_get_write_access(handle,
1825                                                                   parent_bh)){
1826                                        *p = 0;
1827                                        BUFFER_TRACE(parent_bh,
1828                                        "call ext3_journal_dirty_metadata");
1829                                        ext3_journal_dirty_metadata(handle, 
1830                                                                    parent_bh);
1831                                }
1832                        }
1833                }
1834        } else {
1835                /* We have reached the bottom of the tree. */
1836                BUFFER_TRACE(parent_bh, "free data blocks");
1837                ext3_free_data(handle, inode, parent_bh, first, last);
1838        }
1839}
1840
1841/*
1842 * ext3_truncate()
1843 *
1844 * We block out ext3_get_block() block instantiations across the entire
1845 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
1846 * simultaneously on behalf of the same inode.
1847 *
1848 * As we work through the truncate and commmit bits of it to the journal there
1849 * is one core, guiding principle: the file's tree must always be consistent on
1850 * disk.  We must be able to restart the truncate after a crash.
1851 *
1852 * The file's tree may be transiently inconsistent in memory (although it
1853 * probably isn't), but whenever we close off and commit a journal transaction,
1854 * the contents of (the filesystem + the journal) must be consistent and
1855 * restartable.  It's pretty simple, really: bottom up, right to left (although
1856 * left-to-right works OK too).
1857 *
1858 * Note that at recovery time, journal replay occurs *before* the restart of
1859 * truncate against the orphan inode list.
1860 *
1861 * The committed inode has the new, desired i_size (which is the same as
1862 * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
1863 * that this inode's truncate did not complete and it will again call
1864 * ext3_truncate() to have another go.  So there will be instantiated blocks
1865 * to the right of the truncation point in a crashed ext3 filesystem.  But
1866 * that's fine - as long as they are linked from the inode, the post-crash
1867 * ext3_truncate() run will find them and release them.
1868 */
1869
1870void ext3_truncate(struct inode * inode)
1871{
1872        handle_t *handle;
1873        u32 *i_data = inode->u.ext3_i.i_data;
1874        int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1875        int offsets[4];
1876        Indirect chain[4];
1877        Indirect *partial;
1878        int nr = 0;
1879        int n;
1880        long last_block;
1881        unsigned blocksize;
1882
1883        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1884            S_ISLNK(inode->i_mode)))
1885                return;
1886        if (ext3_inode_is_fast_symlink(inode))
1887                return;
1888        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1889                return;
1890
1891        ext3_discard_prealloc(inode);
1892
1893        handle = start_transaction(inode);
1894        if (IS_ERR(handle))
1895                return;         /* AKPM: return what? */
1896
1897        blocksize = inode->i_sb->s_blocksize;
1898        last_block = (inode->i_size + blocksize-1)
1899                                        >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
1900
1901        ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
1902                
1903
1904        n = ext3_block_to_path(inode, last_block, offsets);
1905        if (n == 0)
1906                goto out_stop;  /* error */
1907
1908        /*
1909         * OK.  This truncate is going to happen.  We add the inode to the
1910         * orphan list, so that if this truncate spans multiple transactions,
1911         * and we crash, we will resume the truncate when the filesystem
1912         * recovers.  It also marks the inode dirty, to catch the new size.
1913         *
1914         * Implication: the file must always be in a sane, consistent
1915         * truncatable state while each transaction commits.
1916         */
1917        if (ext3_orphan_add(handle, inode))
1918                goto out_stop;
1919
1920        /*
1921         * The orphan list entry will now protect us from any crash which
1922         * occurs before the truncate completes, so it is now safe to propagate
1923         * the new, shorter inode size (held for now in i_size) into the
1924         * on-disk inode. We do this via i_disksize, which is the value which
1925         * ext3 *really* writes onto the disk inode.
1926         */
1927        inode->u.ext3_i.i_disksize = inode->i_size;
1928
1929        /*
1930         * From here we block out all ext3_get_block() callers who want to
1931         * modify the block allocation tree.
1932         */
1933        down_write(&inode->u.ext3_i.truncate_sem);
1934
1935        if (n == 1) {           /* direct blocks */
1936                ext3_free_data(handle, inode, NULL, i_data+offsets[0],
1937                               i_data + EXT3_NDIR_BLOCKS);
1938                goto do_indirects;
1939        }
1940
1941        partial = ext3_find_shared(inode, n, offsets, chain, &nr);
1942        /* Kill the top of shared branch (not detached) */
1943        if (nr) {
1944                if (partial == chain) {
1945                        /* Shared branch grows from the inode */
1946                        ext3_free_branches(handle, inode, NULL,
1947                                           &nr, &nr+1, (chain+n-1) - partial);
1948                        *partial->p = 0;
1949                        /*
1950                         * We mark the inode dirty prior to restart,
1951                         * and prior to stop.  No need for it here.
1952                         */
1953                } else {
1954                        /* Shared branch grows from an indirect block */
1955                        BUFFER_TRACE(partial->bh, "get_write_access");
1956                        ext3_free_branches(handle, inode, partial->bh,
1957                                        partial->p,
1958                                        partial->p+1, (chain+n-1) - partial);
1959                }
1960        }
1961        /* Clear the ends of indirect blocks on the shared branch */
1962        while (partial > chain) {
1963                ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
1964                                   (u32*)partial->bh->b_data + addr_per_block,
1965                                   (chain+n-1) - partial);
1966                BUFFER_TRACE(partial->bh, "call brelse");
1967                brelse (partial->bh);
1968                partial--;
1969        }
1970do_indirects:
1971        /* Kill the remaining (whole) subtrees */
1972        switch (offsets[0]) {
1973                default:
1974                        nr = i_data[EXT3_IND_BLOCK];
1975                        if (nr) {
1976                                ext3_free_branches(handle, inode, NULL,
1977                                                   &nr, &nr+1, 1);
1978                                i_data[EXT3_IND_BLOCK] = 0;
1979                        }
1980                case EXT3_IND_BLOCK:
1981                        nr = i_data[EXT3_DIND_BLOCK];
1982                        if (nr) {
1983                                ext3_free_branches(handle, inode, NULL,
1984                                                   &nr, &nr+1, 2);
1985                                i_data[EXT3_DIND_BLOCK] = 0;
1986                        }
1987                case EXT3_DIND_BLOCK:
1988                        nr = i_data[EXT3_TIND_BLOCK];
1989                        if (nr) {
1990                                ext3_free_branches(handle, inode, NULL,
1991                                                   &nr, &nr+1, 3);
1992                                i_data[EXT3_TIND_BLOCK] = 0;
1993                        }
1994                case EXT3_TIND_BLOCK:
1995                        ;
1996        }
1997        up_write(&inode->u.ext3_i.truncate_sem);
1998        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1999        ext3_mark_inode_dirty(handle, inode);
2000
2001        /* In a multi-transaction truncate, we only make the final
2002         * transaction synchronous */
2003        if (IS_SYNC(inode))
2004                handle->h_sync = 1;
2005out_stop:
2006        /*
2007         * If this was a simple ftruncate(), and the file will remain alive
2008         * then we need to clear up the orphan record which we created above.
2009         * However, if this was a real unlink then we were called by
2010         * ext3_delete_inode(), and we allow that function to clean up the
2011         * orphan info for us.
2012         */
2013        if (inode->i_nlink)
2014                ext3_orphan_del(handle, inode);
2015
2016        ext3_journal_stop(handle, inode);
2017}
2018
2019/* 
2020 * ext3_get_inode_loc returns with an extra refcount against the
2021 * inode's underlying buffer_head on success. 
2022 */
2023
2024int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
2025{
2026        struct buffer_head *bh = 0;
2027        unsigned long block;
2028        unsigned long block_group;
2029        unsigned long group_desc;
2030        unsigned long desc;
2031        unsigned long offset;
2032        struct ext3_group_desc * gdp;
2033                
2034        if ((inode->i_ino != EXT3_ROOT_INO &&
2035                inode->i_ino != EXT3_ACL_IDX_INO &&
2036                inode->i_ino != EXT3_ACL_DATA_INO &&
2037                inode->i_ino != EXT3_JOURNAL_INO &&
2038                inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
2039                inode->i_ino > le32_to_cpu(
2040                        inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
2041                ext3_error (inode->i_sb, "ext3_get_inode_loc",
2042                            "bad inode number: %lu", inode->i_ino);
2043                goto bad_inode;
2044        }
2045        block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
2046        if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
2047                ext3_error (inode->i_sb, "ext3_get_inode_loc",
2048                            "group >= groups count");
2049                goto bad_inode;
2050        }
2051        group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
2052        desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
2053        bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
2054        if (!bh) {
2055                ext3_error (inode->i_sb, "ext3_get_inode_loc",
2056                            "Descriptor not loaded");
2057                goto bad_inode;
2058        }
2059
2060        gdp = (struct ext3_group_desc *) bh->b_data;
2061        /*
2062         * Figure out the offset within the block group inode table
2063         */
2064        offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
2065                EXT3_INODE_SIZE(inode->i_sb);
2066        block = le32_to_cpu(gdp[desc].bg_inode_table) +
2067                (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
2068        if (!(bh = sb_bread(inode->i_sb, block))) {
2069                ext3_error (inode->i_sb, "ext3_get_inode_loc",
2070                            "unable to read inode block - "
2071                            "inode=%lu, block=%lu", inode->i_ino, block);
2072                goto bad_inode;
2073        }
2074        offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
2075
2076        iloc->bh = bh;
2077        iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
2078        iloc->block_group = block_group;
2079        
2080        return 0;
2081        
2082 bad_inode:
2083        return -EIO;
2084}
2085
2086void ext3_set_inode_flags(struct inode *inode)
2087{
2088        unsigned int flags = inode->u.ext3_i.i_flags;
2089
2090        inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME);
2091        if (flags & EXT3_SYNC_FL)
2092                inode->i_flags |= S_SYNC;
2093        if (flags & EXT3_APPEND_FL)
2094                inode->i_flags |= S_APPEND;
2095        if (flags & EXT3_IMMUTABLE_FL)
2096                inode->i_flags |= S_IMMUTABLE;
2097        if (flags & EXT3_NOATIME_FL)
2098                inode->i_flags |= S_NOATIME;
2099}
2100
2101
2102void ext3_read_inode(struct inode * inode)
2103{
2104        struct ext3_iloc iloc;
2105        struct ext3_inode *raw_inode;
2106        struct buffer_head *bh;
2107        int block;
2108        
2109        if(ext3_get_inode_loc(inode, &iloc))
2110                goto bad_inode;
2111        bh = iloc.bh;
2112        raw_inode = iloc.raw_inode;
2113        init_rwsem(&inode->u.ext3_i.truncate_sem);
2114        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2115        inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2116        inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2117        if(!(test_opt (inode->i_sb, NO_UID32))) {
2118                inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2119                inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2120        }
2121        inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2122        inode->i_size = le32_to_cpu(raw_inode->i_size);
2123        inode->i_atime = le32_to_cpu(raw_inode->i_atime);
2124        inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
2125        inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
2126        inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
2127        /* We now have enough fields to check if the inode was active or not.
2128         * This is needed because nfsd might try to access dead inodes
2129         * the test is that same one that e2fsck uses
2130         * NeilBrown 1999oct15
2131         */
2132        if (inode->i_nlink == 0) {
2133                if (inode->i_mode == 0 ||
2134                    !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
2135                        /* this inode is deleted */
2136                        brelse (bh);
2137                        goto bad_inode;
2138                }
2139                /* The only unlinked inodes we let through here have
2140                 * valid i_mode and are being read by the orphan
2141                 * recovery code: that's fine, we're about to complete
2142                 * the process of deleting those. */
2143        }
2144        inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2145                                         * (for stat), not the fs block
2146                                         * size */  
2147        inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2148        inode->i_version = ++event;
2149        inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
2150#ifdef EXT3_FRAGMENTS
2151        inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
2152        inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
2153        inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
2154#endif
2155        inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2156        if (!S_ISREG(inode->i_mode)) {
2157                inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2158        } else {
2159                inode->i_size |=
2160                        ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2161        }
2162        inode->u.ext3_i.i_disksize = inode->i_size;
2163        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2164#ifdef EXT3_PREALLOCATE
2165        inode->u.ext3_i.i_prealloc_count = 0;
2166#endif
2167        inode->u.ext3_i.i_block_group = iloc.block_group;
2168
2169        /*
2170         * NOTE! The in-memory inode i_data array is in little-endian order
2171         * even on big-endian machines: we do NOT byteswap the block numbers!
2172         */
2173        for (block = 0; block < EXT3_N_BLOCKS; block++)
2174                inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
2175        INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
2176
2177        if (inode->i_ino == EXT3_ACL_IDX_INO ||
2178            inode->i_ino == EXT3_ACL_DATA_INO)
2179                /* Nothing to do */ ;
2180        else if (S_ISREG(inode->i_mode)) {
2181                inode->i_op = &ext3_file_inode_operations;
2182                inode->i_fop = &ext3_file_operations;
2183                inode->i_mapping->a_ops = &ext3_aops;
2184        } else if (S_ISDIR(inode->i_mode)) {
2185                inode->i_op = &ext3_dir_inode_operations;
2186                inode->i_fop = &ext3_dir_operations;
2187        } else if (S_ISLNK(inode->i_mode)) {
2188                if (ext3_inode_is_fast_symlink(inode))
2189                        inode->i_op = &ext3_fast_symlink_inode_operations;
2190                else {
2191                        inode->i_op = &page_symlink_inode_operations;
2192                        inode->i_mapping->a_ops = &ext3_aops;
2193                }
2194        } else 
2195                init_special_inode(inode, inode->i_mode,
2196                                   le32_to_cpu(iloc.raw_inode->i_block[0]));
2197        brelse(iloc.bh);
2198        ext3_set_inode_flags(inode);
2199        return;
2200        
2201bad_inode:
2202        make_bad_inode(inode);
2203        return;
2204}
2205
2206/*
2207 * Post the struct inode info into an on-disk inode location in the
2208 * buffer-cache.  This gobbles the caller's reference to the
2209 * buffer_head in the inode location struct.  
2210 */
2211
2212static int ext3_do_update_inode(handle_t *handle, 
2213                                struct inode *inode, 
2214                                struct ext3_iloc *iloc)
2215{
2216        struct ext3_inode *raw_inode = iloc->raw_inode;
2217        struct buffer_head *bh = iloc->bh;
2218        int err = 0, rc, block;
2219
2220        if (handle) {
2221                BUFFER_TRACE(bh, "get_write_access");
2222                err = ext3_journal_get_write_access(handle, bh);
2223                if (err)
2224                        goto out_brelse;
2225        }
2226        /* For fields not not tracking in the in-memory inode,
2227         * initialise them to zero for new inodes. */
2228        if (EXT3_I(inode)->i_state & EXT3_STATE_NEW)
2229                memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2230
2231        raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2232        if(!(test_opt(inode->i_sb, NO_UID32))) {
2233                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2234                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2235/*
2236 * Fix up interoperability with old kernels. Otherwise, old inodes get
2237 * re-used with the upper 16 bits of the uid/gid intact
2238 */
2239                if(!inode->u.ext3_i.i_dtime) {
2240                        raw_inode->i_uid_high =
2241                                cpu_to_le16(high_16_bits(inode->i_uid));
2242                        raw_inode->i_gid_high =
2243                                cpu_to_le16(high_16_bits(inode->i_gid));
2244                } else {
2245                        raw_inode->i_uid_high = 0;
2246                        raw_inode->i_gid_high = 0;
2247                }
2248        } else {
2249                raw_inode->i_uid_low =
2250                        cpu_to_le16(fs_high2lowuid(inode->i_uid));
2251                raw_inode->i_gid_low =
2252                        cpu_to_le16(fs_high2lowgid(inode->i_gid));
2253                raw_inode->i_uid_high = 0;
2254                raw_inode->i_gid_high = 0;
2255        }
2256        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2257        raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
2258        raw_inode->i_atime = cpu_to_le32(inode->i_atime);
2259        raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
2260        raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
2261        raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2262        raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
2263        raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
2264#ifdef EXT3_FRAGMENTS
2265        raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
2266        raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
2267        raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
2268#endif
2269        raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
2270        if (!S_ISREG(inode->i_mode)) {
2271                raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
2272        } else {
2273                raw_inode->i_size_high =
2274                        cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
2275                if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
2276                        struct super_block *sb = inode->i_sb;
2277                        if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2278                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2279                            EXT3_SB(sb)->s_es->s_rev_level ==
2280                                        cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2281                               /* If this is the first large file
2282                                * created, add a flag to the superblock.
2283                                */
2284                                err = ext3_journal_get_write_access(handle,
2285                                                sb->u.ext3_sb.s_sbh);
2286                                if (err)
2287                                        goto out_brelse;
2288                                ext3_update_dynamic_rev(sb);
2289                                EXT3_SET_RO_COMPAT_FEATURE(sb,
2290                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2291                                sb->s_dirt = 1;
2292                                handle->h_sync = 1;
2293                                err = ext3_journal_dirty_metadata(handle,
2294                                                sb->u.ext3_sb.s_sbh);
2295                        }
2296                }
2297        }
2298        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2299        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
2300                raw_inode->i_block[0] =
2301                        cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
2302        else for (block = 0; block < EXT3_N_BLOCKS; block++)
2303                raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
2304
2305        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2306        rc = ext3_journal_dirty_metadata(handle, bh);
2307        if (!err)
2308                err = rc;
2309        EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
2310
2311out_brelse:
2312        brelse (bh);
2313        ext3_std_error(inode->i_sb, err);
2314        return err;
2315}
2316
2317/*
2318 * ext3_write_inode()
2319 *
2320 * We are called from a few places:
2321 *
2322 * - Within generic_file_write() for O_SYNC files.
2323 *   Here, there will be no transaction running. We wait for any running
2324 *   transaction to commit.
2325 *
2326 * - Within sys_sync(), kupdate and such.
2327 *   We wait on commit, if told to.
2328 *
2329 * - Within prune_icache() (PF_MEMALLOC == true)
2330 *   Here we simply return.  We can't afford to block kswapd on the
2331 *   journal commit.
2332 *
2333 * In all cases it is actually safe for us to return without doing anything,
2334 * because the inode has been copied into a raw inode buffer in
2335 * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2336 * knfsd.
2337 *
2338 * Note that we are absolutely dependent upon all inode dirtiers doing the
2339 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2340 * which we are interested.
2341 *
2342 * It would be a bug for them to not do this.  The code:
2343 *
2344 *      mark_inode_dirty(inode)
2345 *      stuff();
2346 *      inode->i_size = expr;
2347 *
2348 * is in error because a kswapd-driven write_inode() could occur while
2349 * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2350 * will no longer be on the superblock's dirty inode list.
2351 */
2352void ext3_write_inode(struct inode *inode, int wait)
2353{
2354        if (current->flags & PF_MEMALLOC)
2355                return;
2356
2357        if (ext3_journal_current_handle()) {
2358                jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2359                return;
2360        }
2361
2362        if (!wait)
2363                return;
2364
2365        ext3_force_commit(inode->i_sb); 
2366}
2367
2368/*
2369 * ext3_setattr()
2370 *
2371 * Called from notify_change.
2372 *
2373 * We want to trap VFS attempts to truncate the file as soon as
2374 * possible.  In particular, we want to make sure that when the VFS
2375 * shrinks i_size, we put the inode on the orphan list and modify
2376 * i_disksize immediately, so that during the subsequent flushing of
2377 * dirty pages and freeing of disk blocks, we can guarantee that any
2378 * commit will leave the blocks being flushed in an unused state on
2379 * disk.  (On recovery, the inode will get truncated and the blocks will
2380 * be freed, so we have a strong guarantee that no future commit will
2381 * leave these blocks visible to the user.)  
2382 *
2383 * This is only needed for regular files.  rmdir() has its own path, and
2384 * we can never truncate a directory except on final unlink (at which
2385 * point i_nlink is zero so recovery is easy.)
2386 *
2387 * Called with the BKL.  
2388 */
2389
2390int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2391{
2392        struct inode *inode = dentry->d_inode;
2393        int error, rc = 0;
2394        const unsigned int ia_valid = attr->ia_valid;
2395
2396        error = inode_change_ok(inode, attr);
2397        if (error)
2398                return error;
2399
2400        if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2401                (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2402                error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2403                if (error)
2404                        return error;
2405        }
2406
2407        if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2408                handle_t *handle;
2409
2410                handle = ext3_journal_start(inode, 3);
2411                if (IS_ERR(handle)) {
2412                        error = PTR_ERR(handle);
2413                        goto err_out;
2414                }
2415                
2416                error = ext3_orphan_add(handle, inode);
2417                inode->u.ext3_i.i_disksize = attr->ia_size;
2418                rc = ext3_mark_inode_dirty(handle, inode);
2419                if (!error)
2420                        error = rc;
2421                ext3_journal_stop(handle, inode);
2422        }
2423        
2424        rc = inode_setattr(inode, attr);
2425
2426        /* If inode_setattr's call to ext3_truncate failed to get a
2427         * transaction handle at all, we need to clean up the in-core
2428         * orphan list manually. */
2429        if (inode->i_nlink)
2430                ext3_orphan_del(NULL, inode);
2431
2432err_out:
2433        ext3_std_error(inode->i_sb, error);
2434        if (!error)
2435                error = rc;
2436        return error;
2437}
2438
2439
2440/*
2441 * akpm: how many blocks doth make a writepage()?
2442 *
2443 * With N blocks per page, it may be:
2444 * N data blocks
2445 * 2 indirect block
2446 * 2 dindirect
2447 * 1 tindirect
2448 * N+5 bitmap blocks (from the above)
2449 * N+5 group descriptor summary blocks
2450 * 1 inode block
2451 * 1 superblock.
2452 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
2453 *
2454 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2455 *
2456 * With ordered or writeback data it's the same, less the N data blocks.
2457 *
2458 * If the inode's direct blocks can hold an integral number of pages then a
2459 * page cannot straddle two indirect blocks, and we can only touch one indirect
2460 * and dindirect block, and the "5" above becomes "3".
2461 *
2462 * This still overestimates under most circumstances.  If we were to pass the
2463 * start and end offsets in here as well we could do block_to_path() on each
2464 * block and work out the exact number of indirects which are touched.  Pah.
2465 */
2466
2467int ext3_writepage_trans_blocks(struct inode *inode)
2468{
2469        int bpp = ext3_journal_blocks_per_page(inode);
2470        int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2471        int ret;
2472        
2473        if (ext3_should_journal_data(inode))
2474                ret = 3 * (bpp + indirects) + 2;
2475        else
2476                ret = 2 * (bpp + indirects) + 2;
2477
2478#ifdef CONFIG_QUOTA
2479        ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
2480#endif
2481
2482        return ret;
2483}
2484
2485int
2486ext3_mark_iloc_dirty(handle_t *handle, 
2487                     struct inode *inode,
2488                     struct ext3_iloc *iloc)
2489{
2490        int err = 0;
2491
2492        if (handle) {
2493                /* the do_update_inode consumes one bh->b_count */
2494                atomic_inc(&iloc->bh->b_count);
2495                err = ext3_do_update_inode(handle, inode, iloc);
2496                /* ext3_do_update_inode() does journal_dirty_metadata */
2497                brelse(iloc->bh);
2498        } else {
2499                printk(KERN_EMERG "%s: called with no handle!\n", __FUNCTION__);
2500        }
2501        return err;
2502}
2503
2504/* 
2505 * On success, We end up with an outstanding reference count against
2506 * iloc->bh.  This _must_ be cleaned up later. 
2507 */
2508
2509int
2510ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
2511                         struct ext3_iloc *iloc)
2512{
2513        int err = 0;
2514        if (handle) {
2515                err = ext3_get_inode_loc(inode, iloc);
2516                if (!err) {
2517                        BUFFER_TRACE(iloc->bh, "get_write_access");
2518                        err = ext3_journal_get_write_access(handle, iloc->bh);
2519                        if (err) {
2520                                brelse(iloc->bh);
2521                                iloc->bh = NULL;
2522                        }
2523                }
2524        }
2525        ext3_std_error(inode->i_sb, err);
2526        return err;
2527}
2528
2529/*
2530 * akpm: What we do here is to mark the in-core inode as clean
2531 * with respect to inode dirtiness (it may still be data-dirty).
2532 * This means that the in-core inode may be reaped by prune_icache
2533 * without having to perform any I/O.  This is a very good thing,
2534 * because *any* task may call prune_icache - even ones which
2535 * have a transaction open against a different journal.
2536 *
2537 * Is this cheating?  Not really.  Sure, we haven't written the
2538 * inode out, but prune_icache isn't a user-visible syncing function.
2539 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2540 * we start and wait on commits.
2541 *
2542 * Is this efficient/effective?  Well, we're being nice to the system
2543 * by cleaning up our inodes proactively so they can be reaped
2544 * without I/O.  But we are potentially leaving up to five seconds'
2545 * worth of inodes floating about which prune_icache wants us to
2546 * write out.  One way to fix that would be to get prune_icache()
2547 * to do a write_super() to free up some memory.  It has the desired
2548 * effect.
2549 */
2550int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2551{
2552        struct ext3_iloc iloc;
2553        int err;
2554
2555        err = ext3_reserve_inode_write(handle, inode, &iloc);
2556        if (!err)
2557                err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2558        return err;
2559}
2560
2561/*
2562 * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2563 *
2564 * We're really interested in the case where a file is being extended.
2565 * i_size has been changed by generic_commit_write() and we thus need
2566 * to include the updated inode in the current transaction.
2567 *
2568 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
2569 * are allocated to the file.
2570 *
2571 * If the inode is marked synchronous, we don't honour that here - doing
2572 * so would cause a commit on atime updates, which we don't bother doing.
2573 * We handle synchronous inodes at the highest possible level.
2574 */
2575void ext3_dirty_inode(struct inode *inode)
2576{
2577        handle_t *current_handle = ext3_journal_current_handle();
2578        handle_t *handle;
2579
2580        lock_kernel();
2581        handle = ext3_journal_start(inode, 2);
2582        if (IS_ERR(handle))
2583                goto out;
2584        if (current_handle &&
2585                current_handle->h_transaction != handle->h_transaction) {
2586                /* This task has a transaction open against a different fs */
2587                printk(KERN_EMERG "%s: transactions do not match!\n",
2588                        __FUNCTION__);
2589        } else {
2590                jbd_debug(5, "marking dirty.  outer handle=%p\n",
2591                                current_handle);
2592                ext3_mark_inode_dirty(handle, inode);
2593        }
2594        ext3_journal_stop(handle, inode);
2595out:
2596        unlock_kernel();
2597}
2598
#ifdef AKPM
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * Does nothing and returns 0 when no handle is supplied.  Note that
 * write access is requested through journal_get_write_access()
 * directly, not through the ext3_journal_get_write_access() wrapper.
 */
static inline int
ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;
	int err = 0;

	if (!handle)
		goto out;

	err = ext3_get_inode_loc(inode, &iloc);
	if (err)
		goto out;

	BUFFER_TRACE(iloc.bh, "get_write_access");
	err = journal_get_write_access(handle, iloc.bh);
	if (!err)
		err = ext3_journal_dirty_metadata(handle, iloc.bh);
	/* The buffer reference is dropped whether or not that worked */
	brelse(iloc.bh);

out:
	ext3_std_error(inode->i_sb, err);
	return err;
}
#endif
2628
2629int ext3_change_inode_journal_flag(struct inode *inode, int val)
2630{
2631        journal_t *journal;
2632        handle_t *handle;
2633        int err;
2634
2635        /*
2636         * We have to be very careful here: changing a data block's
2637         * journaling status dynamically is dangerous.  If we write a
2638         * data block to the journal, change the status and then delete
2639         * that block, we risk forgetting to revoke the old log record
2640         * from the journal and so a subsequent replay can corrupt data.
2641         * So, first we make sure that the journal is empty and that
2642         * nobody is changing anything.
2643         */
2644
2645        journal = EXT3_JOURNAL(inode);
2646        if (is_journal_aborted(journal) || IS_RDONLY(inode))
2647                return -EROFS;
2648        
2649        journal_lock_updates(journal);
2650        journal_flush(journal);
2651
2652        /*
2653         * OK, there are no updates running now, and all cached data is
2654         * synced to disk.  We are now in a completely consistent state
2655         * which doesn't have anything in the journal, and we know that
2656         * no filesystem updates are running, so it is safe to modify
2657         * the inode's in-core data-journaling state flag now.
2658         */
2659
2660        if (val)
2661                inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
2662        else
2663                inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
2664
2665        journal_unlock_updates(journal);
2666
2667        /* Finally we can mark the inode as dirty. */
2668
2669        handle = ext3_journal_start(inode, 1);
2670        if (IS_ERR(handle))
2671                return PTR_ERR(handle);
2672
2673        err = ext3_mark_inode_dirty(handle, inode);
2674        handle->h_sync = 1;
2675        ext3_journal_stop(handle, inode);
2676        ext3_std_error(inode->i_sb, err);
2677        
2678        return err;
2679}
2680
2681
2682/*
2683 * ext3_aops_journal_start().
2684 *
2685 * <This function died, but the comment lives on>
2686 *
2687 * We need to take the inode semaphore *outside* the
2688 * journal_start/journal_stop.  Otherwise, a different task could do a
2689 * wait_for_commit() while holding ->i_sem, which deadlocks.  The rule
2690 * is: transaction open/closes are considered to be a locking operation
2691 * and they nest *inside* ->i_sem.
2692 * ----------------------------------------------------------------------------
2693 * Possible problem:
2694 *      ext3_file_write()
2695 *      -> generic_file_write()
2696 *         -> __alloc_pages()
2697 *            -> page_launder()
2698 *               -> ext3_writepage()
2699 *
2700 * And the writepage can be on a different fs while we have a
2701 * transaction open against this one!  Bad.
2702 *
2703 * I tried making the task PF_MEMALLOC here, but that simply results in
2704 * 0-order allocation failures passed back to generic_file_write().
2705 * Instead, we rely on the reentrancy protection in ext3_writepage().
2706 * ----------------------------------------------------------------------------
2707 * When we do the journal_start() here we don't really need to reserve
2708 * any blocks - we won't need any until we hit ext3_prepare_write(),
2709 * which does all the needed journal extending.  However!  There is a
2710 * problem with quotas:
2711 *
2712 * Thread 1:
2713 * sys_sync
2714 * ->sync_dquots
2715 *   ->commit_dquot
2716 *     ->lock_dquot
2717 *     ->write_dquot
2718 *       ->ext3_file_write
2719 *         ->journal_start
2720 *         ->ext3_prepare_write
2721 *           ->journal_extend
2722 *           ->journal_start
2723 * Thread 2:
2724 * ext3_create          (for example)
2725 * ->ext3_new_inode
2726 *   ->dquot_initialize
2727 *     ->lock_dquot
2728 *
2729 * Deadlock.  Thread 1's journal_start blocks because thread 2 has a
2730 * transaction open.  Thread 2's transaction will never close because
2731 * thread 2 is stuck waiting for the dquot lock.
2732 *
2733 * So.  We must ensure that thread 1 *never* needs to extend the journal
2734 * for quota writes.  We do that by reserving enough journal blocks
2735 * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
2736 * need to extend" test in ext3_prepare_write() succeeds.  
2737 */
2738
lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.