/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (!trylock_page(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}

/*
 * Decrement reference counter for data buffer. If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
        if (buffer_freed(bh)) {
                clear_buffer_freed(bh);
                release_buffer_page(bh);
        } else
                put_bh(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For lock-ranking reasons we must trylock.  If we lose, schedule away
 * and return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}
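
/*
 * Sketch of the expected call-site pattern (as used in
 * journal_submit_data_buffers() below): on failure the caller must
 * re-acquire both locks itself, this time taking bh_state first so the
 * lock ranking is respected:
 *
 *	if (!inverted_lock(journal, bh)) {
 *		jbd_lock_bh_state(bh);
 *		spin_lock(&journal->j_list_lock);
 *	}
 */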

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction)
{
        struct journal_head *descriptor;
        struct buffer_head *bh;
        journal_header_t *header;
        int ret;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        header = (journal_header_t *)(bh->b_data);
        header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
        header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
        header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);

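        /*
         * Note: with barriers enabled, the commit block is issued with a
         * preceding cache flush plus FUA (WRITE_FLUSH_FUA), so it reaches
         * stable storage only after all previously submitted journal
         * blocks have done so.
         */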
        if (journal->j_flags & JFS_BARRIER)
                ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
        else
                ret = sync_dirty_buffer(bh);

        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);

        return (ret == -EIO);
}
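
/*
 * On-disk layout of the commit block written above, for reference (the
 * structures live in include/linux/jbd.h): a journal_header_t -- h_magic
 * (JFS_MAGIC_NUMBER), h_blocktype (JFS_COMMIT_BLOCK) and h_sequence (the
 * committing transaction's tid), all big-endian -- followed by padding
 * out to the journal block size.  Recovery scans for this header to
 * decide whether a transaction committed fully.
 */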

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
                                   int write_op)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use up our safety reference in submit_bh() */
                submit_bh(write_op, wbuf[i]);
        }
}

/*
 *  Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
                                       transaction_t *commit_transaction,
                                       int write_op)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;
        int err = 0;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Clean up any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (!trylock_buffer(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                trace_jbd_do_submit_data(journal,
                                                     commit_transaction);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh) || bh2jh(bh) != jh
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        release_data_buffer(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                trace_jbd_do_submit_data(journal,
                                                     commit_transaction);
                                journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        if (unlikely(!buffer_uptodate(bh)))
                                err = -EIO;
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        release_data_buffer(bh);
                }

                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        trace_jbd_do_submit_data(journal, commit_transaction);
        journal_do_submit_data(wbuf, bufs, write_op);

        return err;
}
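
/*
 * Summary of the walk above, as a sketch: dirty buffers are locked,
 * batched into wbuf[] and refiled to BJ_Locked; buffers already under
 * IO are simply refiled to BJ_Locked; clean, unlocked buffers have
 * finished writeout and are unfiled on the spot.  The BJ_Locked list
 * is then drained by the wait loop in journal_commit_transaction()
 * below.
 */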

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void journal_commit_transaction(journal_t *journal)
{
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned int blocknr;
        ktime_t start_time;
        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        struct blk_plug plug;
        int write_op = WRITE;

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

        /* Do we need to erase the effects of a prior journal_flush? */
        if (journal->j_flags & JFS_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                mutex_lock(&journal->j_checkpoint_mutex);
                /*
                 * We hold j_checkpoint_mutex so the tail cannot change
                 * under us.  We don't need any special data guarantees
                 * for writing the sb since the journal is empty and it
                 * is ok for the write to be flushed only by the
                 * transaction commit.
                 */
                journal_update_sb_log_tail(journal, journal->j_tail_sequence,
                                           journal->j_tail, WRITE_SYNC);
                mutex_unlock(&journal->j_checkpoint_mutex);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        trace_jbd_start_commit(journal, commit_transaction);
        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        trace_jbd_commit_locking(journal, commit_transaction);
        spin_lock(&commit_transaction->t_handle_lock);
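        /*
         * Wait for all handles against this transaction to be released.
         * Standard waitqueue idiom: prepare_to_wait() sets the task state
         * before t_updates is re-tested, so the wakeup issued by the
         * final journal_stop() cannot be lost between the test and the
         * schedule().
         */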
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT (commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A journal_get_undo_access()+journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it
         * potentially frees some memory.
         */
        spin_lock(&journal->j_list_lock);
        __journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug (3, "JBD: commit phase 1\n");

        /*
         * Clear the revoked flag to reflect that there are no revoked
         * buffers in the next transaction which is going to be started.
         */
        journal_clear_buffer_revoked_flags(journal);

        /*
         * Switch to a new revoke table.
         */
        journal_switch_revoke_table(journal);

        trace_jbd_commit_flushing(journal, commit_transaction);
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug (3, "JBD: commit phase 2\n");

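        /*
         * If somebody is already waiting in log_wait_commit() for this
         * transaction, issue the IO as WRITE_SYNC so the block layer
         * treats it as synchronous rather than batching it with
         * background writeback.
         */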
        if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
                write_op = WRITE_SYNC;

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        blk_start_plug(&plug);
        err = journal_submit_data_buffers(journal, commit_transaction,
                                          write_op);
        blk_finish_plug(&plug);

        /*
         * Wait for all previously submitted IO to complete.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        spin_lock(&journal->j_list_lock);
                }
                if (unlikely(!buffer_uptodate(bh))) {
                        if (!trylock_page(bh->b_page)) {
                                spin_unlock(&journal->j_list_lock);
                                lock_page(bh->b_page);
                                spin_lock(&journal->j_list_lock);
                        }
                        if (bh->b_page->mapping)
                                set_bit(AS_EIO, &bh->b_page->mapping->flags);

                        unlock_page(bh->b_page);
                        SetPageError(bh->b_page);
                        err = -EIO;
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && bh2jh(bh) == jh &&
                    jh->b_transaction == commit_transaction &&
                    jh->b_jlist == BJ_Locked)
                        __journal_unfile_buffer(jh);
                jbd_unlock_bh_state(bh);
                release_data_buffer(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);

        if (err) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD: Detected IO errors while flushing file data "
                        "on %s\n", bdevname(journal->j_fs_dev, b));
                if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
                        journal_abort(journal, err);
                err = 0;
        }

        blk_start_plug(&plug);

        journal_write_revoke_records(journal, commit_transaction, write_op);

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT (commit_transaction->t_sync_datalist == NULL);

        jbd_debug (3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        spin_unlock(&journal->j_state_lock);

        trace_jbd_commit_logging(journal, commit_transaction);
        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it. */

                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to
                 * determine the free space in the log, but this counter
                 * is also changed by journal_next_log_block().
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                get_bh(jh2bh(jh));

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer).  new_bh goes on BJ_IO. */

                set_buffer_jwrite(jh2bh(jh));
                /*
                 * akpm: journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO).
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_buffer_jwrite(jh2bh(new_jh));
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

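                /*
                 * The low bit of the returned flags means the buffer's
                 * first four bytes matched JFS_MAGIC_NUMBER and were
                 * zeroed ("escaped") in the temporary copy, so recovery
                 * cannot mistake the data block for a journal control
                 * block; JFS_FLAG_ESCAPE tells replay to restore them.
                 */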
                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JFS_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JFS_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += sizeof(journal_block_tag_t);
                space_left -= sizeof(journal_block_tag_t);

                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }
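
                /*
                 * Descriptor-space arithmetic, for reference: the block
                 * holds a journal_header_t followed by fixed-size
                 * journal_block_tag_t entries, with the 16-byte journal
                 * UUID after the first tag only.  The test below always
                 * keeps room for one more tag plus a UUID, which is
                 * slightly conservative for non-first tags.
                 */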
                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < sizeof(journal_block_tag_t) + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(write_op, bh);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        blk_finish_plug(&plug);

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_buffer_jwrite(bh);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /*
                 * Wake up any transactions which were waiting for this
                 * IO to complete. The barrier must be here so that changes
                 * by journal_file_buffer() take effect before wake_up_bit()
                 * does the waitqueue check.
                 */
                smp_mb();
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                journal_unfile_buffer(journal, jh);
                journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        if (err)
                journal_abort(journal, err);

        jbd_debug(3, "JBD: commit phase 6\n");

        /* All metadata is written, now write commit record and do cleanup */
        spin_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
        commit_transaction->t_state = T_COMMIT_RECORD;
        spin_unlock(&journal->j_state_lock);

        if (journal_write_commit_record(journal, commit_transaction))
                err = -EIO;

        if (err)
                journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list they
           were on before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
                int try_to_free = 0;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                /*
                 * Get a reference so that bh cannot be freed before we are
                 * done with it.
                 */
                get_bh(bh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        __journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future after the "add to orphan"
                 * operation has been committed.  That's not only a
                 * performance gain, it also stops aliasing problems if
                 * the buffer is left behind for writeback and gets
                 * reallocated for another use in a different page. */
                if (buffer_freed(bh) && !jh->b_next_transaction) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /*
                         * A buffer on the BJ_Forget list that is not
                         * jbddirty means it has been freed by this
                         * transaction and hence could not have been
                         * reallocated until this transaction has
                         * committed. *BUT* it could be reallocated once
                         * we have written all the data to disk and
                         * before we process the buffer on the BJ_Forget
                         * list.
                         */
                        if (!jh->b_next_transaction)
                                try_to_free = 1;
                }
                JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                __journal_refile_buffer(jh);
                jbd_unlock_bh_state(bh);
                if (try_to_free)
                        release_buffer_page(bh);
                else
                        __brelse(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

        /*
         * Weight the commit time higher than the stored average so we
         * don't react too strongly to vast changes in commit time:
         * new_average = (3 * commit_time + old_average) / 4, i.e. an
         * exponentially weighted moving average.
         */
        if (likely(journal->j_average_commit_time))
                journal->j_average_commit_time = (commit_time*3 +
                                journal->j_average_commit_time) / 4;
        else
                journal->j_average_commit_time = commit_time;

        spin_unlock(&journal->j_state_lock);

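        /*
         * If the transaction still has buffers to checkpoint, queue it
         * on the journal's circular list of checkpoint transactions
         * (inserting before the current head, i.e. at the tail);
         * otherwise it can be dropped immediately.
         */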
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        trace_jbd_end_commit(journal, commit_transaction);
        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}