linux/fs/jbd2/commit.c
<<
>>
Prefs
   1/*
   2 * linux/fs/jbd2/commit.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5 *
   6 * Copyright 1998 Red Hat corp --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Journal commit routines for the generic filesystem journaling code;
  13 * part of the ext2fs journaling system.
  14 */
  15
  16#include <linux/time.h>
  17#include <linux/fs.h>
  18#include <linux/jbd2.h>
  19#include <linux/errno.h>
  20#include <linux/slab.h>
  21#include <linux/mm.h>
  22#include <linux/pagemap.h>
  23#include <linux/jiffies.h>
  24#include <linux/crc32.h>
  25#include <linux/writeback.h>
  26#include <linux/backing-dev.h>
  27#include <linux/bio.h>
  28#include <linux/blkdev.h>
  29#include <linux/bitops.h>
  30#include <trace/events/jbd2.h>
  31#include <asm/system.h>
  32
  33/*
  34 * Default IO end handler for temporary BJ_IO buffer_heads.
  35 */
  36static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  37{
  38        BUFFER_TRACE(bh, "");
  39        if (uptodate)
  40                set_buffer_uptodate(bh);
  41        else
  42                clear_buffer_uptodate(bh);
  43        unlock_buffer(bh);
  44}
  45
  46/*
  47 * When an ext4 file is truncated, it is possible that some pages are not
  48 * successfully freed, because they are attached to a committing transaction.
  49 * After the transaction commits, these pages are left on the LRU, with no
  50 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  51 * by the VM, but their apparent absence upsets the VM accounting, and it makes
  52 * the numbers in /proc/meminfo look odd.
  53 *
  54 * So here, we have a buffer which has just come off the forget list.  Look to
  55 * see if we can strip all buffers from the backing page.
  56 *
  57 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  58 * caller provided us with a ref against the buffer, and we drop that here.
  59 */
  60static void release_buffer_page(struct buffer_head *bh)
  61{
  62        struct page *page;
  63
  64        if (buffer_dirty(bh))
  65                goto nope;
  66        if (atomic_read(&bh->b_count) != 1)
  67                goto nope;
  68        page = bh->b_page;
  69        if (!page)
  70                goto nope;
  71        if (page->mapping)
  72                goto nope;
  73
  74        /* OK, it's a truncated page */
  75        if (!trylock_page(page))
  76                goto nope;
  77
  78        page_cache_get(page);
  79        __brelse(bh);
  80        try_to_free_buffers(page);
  81        unlock_page(page);
  82        page_cache_release(page);
  83        return;
  84
  85nope:
  86        __brelse(bh);
  87}
  88
  89/*
  90 * Done it all: now submit the commit record.  We should have
  91 * cleaned up our previous buffers by now, so if we are in abort
  92 * mode we can now just skip the rest of the journal write
  93 * entirely.
  94 *
  95 * Returns 1 if the journal needs to be aborted or 0 on success
  96 */
  97static int journal_submit_commit_record(journal_t *journal,
  98                                        transaction_t *commit_transaction,
  99                                        struct buffer_head **cbh,
 100                                        __u32 crc32_sum)
 101{
 102        struct journal_head *descriptor;
 103        struct commit_header *tmp;
 104        struct buffer_head *bh;
 105        int ret;
 106        struct timespec now = current_kernel_time();
 107
 108        *cbh = NULL;
 109
 110        if (is_journal_aborted(journal))
 111                return 0;
 112
 113        descriptor = jbd2_journal_get_descriptor_buffer(journal);
 114        if (!descriptor)
 115                return 1;
 116
 117        bh = jh2bh(descriptor);
 118
 119        tmp = (struct commit_header *)bh->b_data;
 120        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 121        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 122        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 123        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 124        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 125
 126        if (JBD2_HAS_COMPAT_FEATURE(journal,
 127                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
 128                tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 129                tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 130                tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 131        }
 132
 133        JBUFFER_TRACE(descriptor, "submit commit block");
 134        lock_buffer(bh);
 135        clear_buffer_dirty(bh);
 136        set_buffer_uptodate(bh);
 137        bh->b_end_io = journal_end_buffer_io_sync;
 138
 139        if (journal->j_flags & JBD2_BARRIER &&
 140            !JBD2_HAS_INCOMPAT_FEATURE(journal,
 141                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 142                ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
 143        else
 144                ret = submit_bh(WRITE_SYNC, bh);
 145
 146        *cbh = bh;
 147        return ret;
 148}
 149
 150/*
 151 * This function along with journal_submit_commit_record
 152 * allows to write the commit record asynchronously.
 153 */
 154static int journal_wait_on_commit_record(journal_t *journal,
 155                                         struct buffer_head *bh)
 156{
 157        int ret = 0;
 158
 159        clear_buffer_dirty(bh);
 160        wait_on_buffer(bh);
 161
 162        if (unlikely(!buffer_uptodate(bh)))
 163                ret = -EIO;
 164        put_bh(bh);            /* One for getblk() */
 165        jbd2_journal_put_journal_head(bh2jh(bh));
 166
 167        return ret;
 168}
 169
 170/*
 171 * write the filemap data using writepage() address_space_operations.
 172 * We don't do block allocation here even for delalloc. We don't
 173 * use writepages() because with dealyed allocation we may be doing
 174 * block allocation in writepages().
 175 */
 176static int journal_submit_inode_data_buffers(struct address_space *mapping)
 177{
 178        int ret;
 179        struct writeback_control wbc = {
 180                .sync_mode =  WB_SYNC_ALL,
 181                .nr_to_write = mapping->nrpages * 2,
 182                .range_start = 0,
 183                .range_end = i_size_read(mapping->host),
 184        };
 185
 186        ret = generic_writepages(mapping, &wbc);
 187        return ret;
 188}
 189
 190/*
 191 * Submit all the data buffers of inode associated with the transaction to
 192 * disk.
 193 *
 194 * We are in a committing transaction. Therefore no new inode can be added to
 195 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 196 * operate on from being released while we write out pages.
 197 */
 198static int journal_submit_data_buffers(journal_t *journal,
 199                transaction_t *commit_transaction)
 200{
 201        struct jbd2_inode *jinode;
 202        int err, ret = 0;
 203        struct address_space *mapping;
 204
 205        spin_lock(&journal->j_list_lock);
 206        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 207                mapping = jinode->i_vfs_inode->i_mapping;
 208                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 209                spin_unlock(&journal->j_list_lock);
 210                /*
 211                 * submit the inode data buffers. We use writepage
 212                 * instead of writepages. Because writepages can do
 213                 * block allocation  with delalloc. We need to write
 214                 * only allocated blocks here.
 215                 */
 216                trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 217                err = journal_submit_inode_data_buffers(mapping);
 218                if (!ret)
 219                        ret = err;
 220                spin_lock(&journal->j_list_lock);
 221                J_ASSERT(jinode->i_transaction == commit_transaction);
 222                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 223                smp_mb__after_clear_bit();
 224                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 225        }
 226        spin_unlock(&journal->j_list_lock);
 227        return ret;
 228}
 229
 230/*
 231 * Wait for data submitted for writeout, refile inodes to proper
 232 * transaction if needed.
 233 *
 234 */
 235static int journal_finish_inode_data_buffers(journal_t *journal,
 236                transaction_t *commit_transaction)
 237{
 238        struct jbd2_inode *jinode, *next_i;
 239        int err, ret = 0;
 240
 241        /* For locking, see the comment in journal_submit_data_buffers() */
 242        spin_lock(&journal->j_list_lock);
 243        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 244                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 245                spin_unlock(&journal->j_list_lock);
 246                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 247                if (err) {
 248                        /*
 249                         * Because AS_EIO is cleared by
 250                         * filemap_fdatawait_range(), set it again so
 251                         * that user process can get -EIO from fsync().
 252                         */
 253                        set_bit(AS_EIO,
 254                                &jinode->i_vfs_inode->i_mapping->flags);
 255
 256                        if (!ret)
 257                                ret = err;
 258                }
 259                spin_lock(&journal->j_list_lock);
 260                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 261                smp_mb__after_clear_bit();
 262                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 263        }
 264
 265        /* Now refile inode to proper lists */
 266        list_for_each_entry_safe(jinode, next_i,
 267                                 &commit_transaction->t_inode_list, i_list) {
 268                list_del(&jinode->i_list);
 269                if (jinode->i_next_transaction) {
 270                        jinode->i_transaction = jinode->i_next_transaction;
 271                        jinode->i_next_transaction = NULL;
 272                        list_add(&jinode->i_list,
 273                                &jinode->i_transaction->t_inode_list);
 274                } else {
 275                        jinode->i_transaction = NULL;
 276                }
 277        }
 278        spin_unlock(&journal->j_list_lock);
 279
 280        return ret;
 281}
 282
 283static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 284{
 285        struct page *page = bh->b_page;
 286        char *addr;
 287        __u32 checksum;
 288
 289        addr = kmap_atomic(page, KM_USER0);
 290        checksum = crc32_be(crc32_sum,
 291                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 292        kunmap_atomic(addr, KM_USER0);
 293
 294        return checksum;
 295}
 296
 297static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 298                                   unsigned long long block)
 299{
 300        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 301        if (tag_bytes > JBD2_TAG_SIZE32)
 302                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 303}
 304
 305/*
 306 * jbd2_journal_commit_transaction
 307 *
 308 * The primary function for committing a transaction to the log.  This
 309 * function is called by the journal thread to begin a complete commit.
 310 */
 311void jbd2_journal_commit_transaction(journal_t *journal)
 312{
 313        struct transaction_stats_s stats;
 314        transaction_t *commit_transaction;
 315        struct journal_head *jh, *new_jh, *descriptor;
 316        struct buffer_head **wbuf = journal->j_wbuf;
 317        int bufs;
 318        int flags;
 319        int err;
 320        unsigned long long blocknr;
 321        ktime_t start_time;
 322        u64 commit_time;
 323        char *tagp = NULL;
 324        journal_header_t *header;
 325        journal_block_tag_t *tag = NULL;
 326        int space_left = 0;
 327        int first_tag = 0;
 328        int tag_flag;
 329        int i, to_free = 0;
 330        int tag_bytes = journal_tag_bytes(journal);
 331        struct buffer_head *cbh = NULL; /* For transactional checksums */
 332        __u32 crc32_sum = ~0;
 333        struct blk_plug plug;
 334
 335        /*
 336         * First job: lock down the current transaction and wait for
 337         * all outstanding updates to complete.
 338         */
 339
 340        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 341        if (journal->j_flags & JBD2_FLUSHED) {
 342                jbd_debug(3, "super block updated\n");
 343                jbd2_journal_update_superblock(journal, 1);
 344        } else {
 345                jbd_debug(3, "superblock not updated\n");
 346        }
 347
 348        J_ASSERT(journal->j_running_transaction != NULL);
 349        J_ASSERT(journal->j_committing_transaction == NULL);
 350
 351        commit_transaction = journal->j_running_transaction;
 352        J_ASSERT(commit_transaction->t_state == T_RUNNING);
 353
 354        trace_jbd2_start_commit(journal, commit_transaction);
 355        jbd_debug(1, "JBD2: starting commit of transaction %d\n",
 356                        commit_transaction->t_tid);
 357
 358        write_lock(&journal->j_state_lock);
 359        commit_transaction->t_state = T_LOCKED;
 360
 361        trace_jbd2_commit_locking(journal, commit_transaction);
 362        stats.run.rs_wait = commit_transaction->t_max_wait;
 363        stats.run.rs_locked = jiffies;
 364        stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 365                                              stats.run.rs_locked);
 366
 367        spin_lock(&commit_transaction->t_handle_lock);
 368        while (atomic_read(&commit_transaction->t_updates)) {
 369                DEFINE_WAIT(wait);
 370
 371                prepare_to_wait(&journal->j_wait_updates, &wait,
 372                                        TASK_UNINTERRUPTIBLE);
 373                if (atomic_read(&commit_transaction->t_updates)) {
 374                        spin_unlock(&commit_transaction->t_handle_lock);
 375                        write_unlock(&journal->j_state_lock);
 376                        schedule();
 377                        write_lock(&journal->j_state_lock);
 378                        spin_lock(&commit_transaction->t_handle_lock);
 379                }
 380                finish_wait(&journal->j_wait_updates, &wait);
 381        }
 382        spin_unlock(&commit_transaction->t_handle_lock);
 383
 384        J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 385                        journal->j_max_transaction_buffers);
 386
 387        /*
 388         * First thing we are allowed to do is to discard any remaining
 389         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 390         * that there are no such buffers: if a large filesystem
 391         * operation like a truncate needs to split itself over multiple
 392         * transactions, then it may try to do a jbd2_journal_restart() while
 393         * there are still BJ_Reserved buffers outstanding.  These must
 394         * be released cleanly from the current transaction.
 395         *
 396         * In this case, the filesystem must still reserve write access
 397         * again before modifying the buffer in the new transaction, but
 398         * we do not require it to remember exactly which old buffers it
 399         * has reserved.  This is consistent with the existing behaviour
 400         * that multiple jbd2_journal_get_write_access() calls to the same
 401         * buffer are perfectly permissible.
 402         */
 403        while (commit_transaction->t_reserved_list) {
 404                jh = commit_transaction->t_reserved_list;
 405                JBUFFER_TRACE(jh, "reserved, unused: refile");
 406                /*
 407                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 408                 * leave undo-committed data.
 409                 */
 410                if (jh->b_committed_data) {
 411                        struct buffer_head *bh = jh2bh(jh);
 412
 413                        jbd_lock_bh_state(bh);
 414                        jbd2_free(jh->b_committed_data, bh->b_size);
 415                        jh->b_committed_data = NULL;
 416                        jbd_unlock_bh_state(bh);
 417                }
 418                jbd2_journal_refile_buffer(journal, jh);
 419        }
 420
 421        /*
 422         * Now try to drop any written-back buffers from the journal's
 423         * checkpoint lists.  We do this *before* commit because it potentially
 424         * frees some memory
 425         */
 426        spin_lock(&journal->j_list_lock);
 427        __jbd2_journal_clean_checkpoint_list(journal);
 428        spin_unlock(&journal->j_list_lock);
 429
 430        jbd_debug(3, "JBD2: commit phase 1\n");
 431
 432        /*
 433         * Clear revoked flag to reflect there is no revoked buffers
 434         * in the next transaction which is going to be started.
 435         */
 436        jbd2_clear_buffer_revoked_flags(journal);
 437
 438        /*
 439         * Switch to a new revoke table.
 440         */
 441        jbd2_journal_switch_revoke_table(journal);
 442
 443        trace_jbd2_commit_flushing(journal, commit_transaction);
 444        stats.run.rs_flushing = jiffies;
 445        stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 446                                             stats.run.rs_flushing);
 447
 448        commit_transaction->t_state = T_FLUSH;
 449        journal->j_committing_transaction = commit_transaction;
 450        journal->j_running_transaction = NULL;
 451        start_time = ktime_get();
 452        commit_transaction->t_log_start = journal->j_head;
 453        wake_up(&journal->j_wait_transaction_locked);
 454        write_unlock(&journal->j_state_lock);
 455
 456        jbd_debug(3, "JBD2: commit phase 2\n");
 457
 458        /*
 459         * Now start flushing things to disk, in the order they appear
 460         * on the transaction lists.  Data blocks go first.
 461         */
 462        err = journal_submit_data_buffers(journal, commit_transaction);
 463        if (err)
 464                jbd2_journal_abort(journal, err);
 465
 466        blk_start_plug(&plug);
 467        jbd2_journal_write_revoke_records(journal, commit_transaction,
 468                                          WRITE_SYNC);
 469        blk_finish_plug(&plug);
 470
 471        jbd_debug(3, "JBD2: commit phase 2\n");
 472
 473        /*
 474         * Way to go: we have now written out all of the data for a
 475         * transaction!  Now comes the tricky part: we need to write out
 476         * metadata.  Loop over the transaction's entire buffer list:
 477         */
 478        write_lock(&journal->j_state_lock);
 479        commit_transaction->t_state = T_COMMIT;
 480        write_unlock(&journal->j_state_lock);
 481
 482        trace_jbd2_commit_logging(journal, commit_transaction);
 483        stats.run.rs_logging = jiffies;
 484        stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 485                                               stats.run.rs_logging);
 486        stats.run.rs_blocks =
 487                atomic_read(&commit_transaction->t_outstanding_credits);
 488        stats.run.rs_blocks_logged = 0;
 489
 490        J_ASSERT(commit_transaction->t_nr_buffers <=
 491                 atomic_read(&commit_transaction->t_outstanding_credits));
 492
 493        err = 0;
 494        descriptor = NULL;
 495        bufs = 0;
 496        blk_start_plug(&plug);
 497        while (commit_transaction->t_buffers) {
 498
 499                /* Find the next buffer to be journaled... */
 500
 501                jh = commit_transaction->t_buffers;
 502
 503                /* If we're in abort mode, we just un-journal the buffer and
 504                   release it. */
 505
 506                if (is_journal_aborted(journal)) {
 507                        clear_buffer_jbddirty(jh2bh(jh));
 508                        JBUFFER_TRACE(jh, "journal is aborting: refile");
 509                        jbd2_buffer_abort_trigger(jh,
 510                                                  jh->b_frozen_data ?
 511                                                  jh->b_frozen_triggers :
 512                                                  jh->b_triggers);
 513                        jbd2_journal_refile_buffer(journal, jh);
 514                        /* If that was the last one, we need to clean up
 515                         * any descriptor buffers which may have been
 516                         * already allocated, even if we are now
 517                         * aborting. */
 518                        if (!commit_transaction->t_buffers)
 519                                goto start_journal_io;
 520                        continue;
 521                }
 522
 523                /* Make sure we have a descriptor block in which to
 524                   record the metadata buffer. */
 525
 526                if (!descriptor) {
 527                        struct buffer_head *bh;
 528
 529                        J_ASSERT (bufs == 0);
 530
 531                        jbd_debug(4, "JBD2: get descriptor\n");
 532
 533                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
 534                        if (!descriptor) {
 535                                jbd2_journal_abort(journal, -EIO);
 536                                continue;
 537                        }
 538
 539                        bh = jh2bh(descriptor);
 540                        jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
 541                                (unsigned long long)bh->b_blocknr, bh->b_data);
 542                        header = (journal_header_t *)&bh->b_data[0];
 543                        header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 544                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 545                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 546
 547                        tagp = &bh->b_data[sizeof(journal_header_t)];
 548                        space_left = bh->b_size - sizeof(journal_header_t);
 549                        first_tag = 1;
 550                        set_buffer_jwrite(bh);
 551                        set_buffer_dirty(bh);
 552                        wbuf[bufs++] = bh;
 553
 554                        /* Record it so that we can wait for IO
 555                           completion later */
 556                        BUFFER_TRACE(bh, "ph3: file as descriptor");
 557                        jbd2_journal_file_buffer(descriptor, commit_transaction,
 558                                        BJ_LogCtl);
 559                }
 560
 561                /* Where is the buffer to be written? */
 562
 563                err = jbd2_journal_next_log_block(journal, &blocknr);
 564                /* If the block mapping failed, just abandon the buffer
 565                   and repeat this loop: we'll fall into the
 566                   refile-on-abort condition above. */
 567                if (err) {
 568                        jbd2_journal_abort(journal, err);
 569                        continue;
 570                }
 571
 572                /*
 573                 * start_this_handle() uses t_outstanding_credits to determine
 574                 * the free space in the log, but this counter is changed
 575                 * by jbd2_journal_next_log_block() also.
 576                 */
 577                atomic_dec(&commit_transaction->t_outstanding_credits);
 578
 579                /* Bump b_count to prevent truncate from stumbling over
 580                   the shadowed buffer!  @@@ This can go if we ever get
 581                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 582                atomic_inc(&jh2bh(jh)->b_count);
 583
 584                /* Make a temporary IO buffer with which to write it out
 585                   (this will requeue both the metadata buffer and the
 586                   temporary IO buffer). new_bh goes on BJ_IO*/
 587
 588                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 589                /*
 590                 * akpm: jbd2_journal_write_metadata_buffer() sets
 591                 * new_bh->b_transaction to commit_transaction.
 592                 * We need to clean this up before we release new_bh
 593                 * (which is of type BJ_IO)
 594                 */
 595                JBUFFER_TRACE(jh, "ph3: write metadata");
 596                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 597                                                      jh, &new_jh, blocknr);
 598                if (flags < 0) {
 599                        jbd2_journal_abort(journal, flags);
 600                        continue;
 601                }
 602                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 603                wbuf[bufs++] = jh2bh(new_jh);
 604
 605                /* Record the new block's tag in the current descriptor
 606                   buffer */
 607
 608                tag_flag = 0;
 609                if (flags & 1)
 610                        tag_flag |= JBD2_FLAG_ESCAPE;
 611                if (!first_tag)
 612                        tag_flag |= JBD2_FLAG_SAME_UUID;
 613
 614                tag = (journal_block_tag_t *) tagp;
 615                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 616                tag->t_flags = cpu_to_be32(tag_flag);
 617                tagp += tag_bytes;
 618                space_left -= tag_bytes;
 619
 620                if (first_tag) {
 621                        memcpy (tagp, journal->j_uuid, 16);
 622                        tagp += 16;
 623                        space_left -= 16;
 624                        first_tag = 0;
 625                }
 626
 627                /* If there's no more to do, or if the descriptor is full,
 628                   let the IO rip! */
 629
 630                if (bufs == journal->j_wbufsize ||
 631                    commit_transaction->t_buffers == NULL ||
 632                    space_left < tag_bytes + 16) {
 633
 634                        jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
 635
 636                        /* Write an end-of-descriptor marker before
 637                           submitting the IOs.  "tag" still points to
 638                           the last tag we set up. */
 639
 640                        tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 641
 642start_journal_io:
 643                        for (i = 0; i < bufs; i++) {
 644                                struct buffer_head *bh = wbuf[i];
 645                                /*
 646                                 * Compute checksum.
 647                                 */
 648                                if (JBD2_HAS_COMPAT_FEATURE(journal,
 649                                        JBD2_FEATURE_COMPAT_CHECKSUM)) {
 650                                        crc32_sum =
 651                                            jbd2_checksum_data(crc32_sum, bh);
 652                                }
 653
 654                                lock_buffer(bh);
 655                                clear_buffer_dirty(bh);
 656                                set_buffer_uptodate(bh);
 657                                bh->b_end_io = journal_end_buffer_io_sync;
 658                                submit_bh(WRITE_SYNC, bh);
 659                        }
 660                        cond_resched();
 661                        stats.run.rs_blocks_logged += bufs;
 662
 663                        /* Force a new descriptor to be generated next
 664                           time round the loop. */
 665                        descriptor = NULL;
 666                        bufs = 0;
 667                }
 668        }
 669
 670        err = journal_finish_inode_data_buffers(journal, commit_transaction);
 671        if (err) {
 672                printk(KERN_WARNING
 673                        "JBD2: Detected IO errors while flushing file data "
 674                       "on %s\n", journal->j_devname);
 675                if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 676                        jbd2_journal_abort(journal, err);
 677                err = 0;
 678        }
 679
 680        write_lock(&journal->j_state_lock);
 681        J_ASSERT(commit_transaction->t_state == T_COMMIT);
 682        commit_transaction->t_state = T_COMMIT_DFLUSH;
 683        write_unlock(&journal->j_state_lock);
 684        /* 
 685         * If the journal is not located on the file system device,
 686         * then we must flush the file system device before we issue
 687         * the commit record
 688         */
 689        if (commit_transaction->t_need_data_flush &&
 690            (journal->j_fs_dev != journal->j_dev) &&
 691            (journal->j_flags & JBD2_BARRIER))
 692                blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 693
 694        /* Done it all: now write the commit record asynchronously. */
 695        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 696                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 697                err = journal_submit_commit_record(journal, commit_transaction,
 698                                                 &cbh, crc32_sum);
 699                if (err)
 700                        __jbd2_journal_abort_hard(journal);
 701        }
 702
 703        blk_finish_plug(&plug);
 704
 705        /* Lo and behold: we have just managed to send a transaction to
 706           the log.  Before we can commit it, wait for the IO so far to
 707           complete.  Control buffers being written are on the
 708           transaction's t_log_list queue, and metadata buffers are on
 709           the t_iobuf_list queue.
 710
 711           Wait for the buffers in reverse order.  That way we are
 712           less likely to be woken up until all IOs have completed, and
 713           so we incur less scheduling load.
 714        */
 715
 716        jbd_debug(3, "JBD2: commit phase 3\n");
 717
 718        /*
 719         * akpm: these are BJ_IO, and j_list_lock is not needed.
 720         * See __journal_try_to_free_buffer.
 721         */
 722wait_for_iobuf:
 723        while (commit_transaction->t_iobuf_list != NULL) {
 724                struct buffer_head *bh;
 725
 726                jh = commit_transaction->t_iobuf_list->b_tprev;
 727                bh = jh2bh(jh);
 728                if (buffer_locked(bh)) {
 729                        wait_on_buffer(bh);
 730                        goto wait_for_iobuf;
 731                }
 732                if (cond_resched())
 733                        goto wait_for_iobuf;
 734
 735                if (unlikely(!buffer_uptodate(bh)))
 736                        err = -EIO;
 737
 738                clear_buffer_jwrite(bh);
 739
 740                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 741                jbd2_journal_unfile_buffer(journal, jh);
 742
 743                /*
 744                 * ->t_iobuf_list should contain only dummy buffer_heads
 745                 * which were created by jbd2_journal_write_metadata_buffer().
 746                 */
 747                BUFFER_TRACE(bh, "dumping temporary bh");
 748                jbd2_journal_put_journal_head(jh);
 749                __brelse(bh);
 750                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 751                free_buffer_head(bh);
 752
 753                /* We also have to unlock and free the corresponding
 754                   shadowed buffer */
 755                jh = commit_transaction->t_shadow_list->b_tprev;
 756                bh = jh2bh(jh);
 757                clear_bit(BH_JWrite, &bh->b_state);
 758                J_ASSERT_BH(bh, buffer_jbddirty(bh));
 759
 760                /* The metadata is now released for reuse, but we need
 761                   to remember it against this transaction so that when
 762                   we finally commit, we can do any checkpointing
 763                   required. */
 764                JBUFFER_TRACE(jh, "file as BJ_Forget");
 765                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 766                /*
 767                 * Wake up any transactions which were waiting for this IO to
 768                 * complete. The barrier must be here so that changes by
 769                 * jbd2_journal_file_buffer() take effect before wake_up_bit()
 770                 * does the waitqueue check.
 771                 */
 772                smp_mb();
 773                wake_up_bit(&bh->b_state, BH_Unshadow);
 774                JBUFFER_TRACE(jh, "brelse shadowed buffer");
 775                __brelse(bh);
 776        }
 777
 778        J_ASSERT (commit_transaction->t_shadow_list == NULL);
 779
 780        jbd_debug(3, "JBD2: commit phase 4\n");
 781
 782        /* Here we wait for the revoke record and descriptor record buffers */
 783 wait_for_ctlbuf:
 784        while (commit_transaction->t_log_list != NULL) {
 785                struct buffer_head *bh;
 786
 787                jh = commit_transaction->t_log_list->b_tprev;
 788                bh = jh2bh(jh);
 789                if (buffer_locked(bh)) {
 790                        wait_on_buffer(bh);
 791                        goto wait_for_ctlbuf;
 792                }
 793                if (cond_resched())
 794                        goto wait_for_ctlbuf;
 795
 796                if (unlikely(!buffer_uptodate(bh)))
 797                        err = -EIO;
 798
 799                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 800                clear_buffer_jwrite(bh);
 801                jbd2_journal_unfile_buffer(journal, jh);
 802                jbd2_journal_put_journal_head(jh);
 803                __brelse(bh);           /* One for getblk */
 804                /* AKPM: bforget here */
 805        }
 806
 807        if (err)
 808                jbd2_journal_abort(journal, err);
 809
 810        jbd_debug(3, "JBD2: commit phase 5\n");
 811        write_lock(&journal->j_state_lock);
 812        J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 813        commit_transaction->t_state = T_COMMIT_JFLUSH;
 814        write_unlock(&journal->j_state_lock);
 815
 816        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 817                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 818                err = journal_submit_commit_record(journal, commit_transaction,
 819                                                &cbh, crc32_sum);
 820                if (err)
 821                        __jbd2_journal_abort_hard(journal);
 822        }
 823        if (cbh)
 824                err = journal_wait_on_commit_record(journal, cbh);
 825        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 826                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 827            journal->j_flags & JBD2_BARRIER) {
 828                blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
 829        }
 830
 831        if (err)
 832                jbd2_journal_abort(journal, err);
 833
 834        /* End of a transaction!  Finally, we can do checkpoint
 835           processing: any buffers committed as a result of this
 836           transaction can be removed from any checkpoint list it was on
 837           before. */
 838
 839        jbd_debug(3, "JBD2: commit phase 6\n");
 840
 841        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 842        J_ASSERT(commit_transaction->t_buffers == NULL);
 843        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 844        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 845        J_ASSERT(commit_transaction->t_shadow_list == NULL);
 846        J_ASSERT(commit_transaction->t_log_list == NULL);
 847
 848restart_loop:
 849        /*
 850         * As there are other places (journal_unmap_buffer()) adding buffers
 851         * to this list we have to be careful and hold the j_list_lock.
 852         */
 853        spin_lock(&journal->j_list_lock);
 854        while (commit_transaction->t_forget) {
 855                transaction_t *cp_transaction;
 856                struct buffer_head *bh;
 857                int try_to_free = 0;
 858
 859                jh = commit_transaction->t_forget;
 860                spin_unlock(&journal->j_list_lock);
 861                bh = jh2bh(jh);
 862                /*
 863                 * Get a reference so that bh cannot be freed before we are
 864                 * done with it.
 865                 */
 866                get_bh(bh);
 867                jbd_lock_bh_state(bh);
 868                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 869
 870                /*
 871                 * If there is undo-protected committed data against
 872                 * this buffer, then we can remove it now.  If it is a
 873                 * buffer needing such protection, the old frozen_data
 874                 * field now points to a committed version of the
 875                 * buffer, so rotate that field to the new committed
 876                 * data.
 877                 *
 878                 * Otherwise, we can just throw away the frozen data now.
 879                 *
 880                 * We also know that the frozen data has already fired
 881                 * its triggers if they exist, so we can clear that too.
 882                 */
 883                if (jh->b_committed_data) {
 884                        jbd2_free(jh->b_committed_data, bh->b_size);
 885                        jh->b_committed_data = NULL;
 886                        if (jh->b_frozen_data) {
 887                                jh->b_committed_data = jh->b_frozen_data;
 888                                jh->b_frozen_data = NULL;
 889                                jh->b_frozen_triggers = NULL;
 890                        }
 891                } else if (jh->b_frozen_data) {
 892                        jbd2_free(jh->b_frozen_data, bh->b_size);
 893                        jh->b_frozen_data = NULL;
 894                        jh->b_frozen_triggers = NULL;
 895                }
 896
 897                spin_lock(&journal->j_list_lock);
 898                cp_transaction = jh->b_cp_transaction;
 899                if (cp_transaction) {
 900                        JBUFFER_TRACE(jh, "remove from old cp transaction");
 901                        cp_transaction->t_chp_stats.cs_dropped++;
 902                        __jbd2_journal_remove_checkpoint(jh);
 903                }
 904
 905                /* Only re-checkpoint the buffer_head if it is marked
 906                 * dirty.  If the buffer was added to the BJ_Forget list
 907                 * by jbd2_journal_forget, it may no longer be dirty and
 908                 * there's no point in keeping a checkpoint record for
 909                 * it. */
 910
 911                /* A buffer which has been freed while still being
 912                 * journaled by a previous transaction may end up still
 913                 * being dirty here, but we want to avoid writing back
 914                 * that buffer in the future after the "add to orphan"
 915                 * operation has been committed.  That's not only a performance
 916                 * gain, it also stops aliasing problems if the buffer is
 917                 * left behind for writeback and gets reallocated for another
 918                 * use in a different page. */
 919                if (buffer_freed(bh) && !jh->b_next_transaction) {
 920                        clear_buffer_freed(bh);
 921                        clear_buffer_jbddirty(bh);
 922                }
 923
 924                if (buffer_jbddirty(bh)) {
 925                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
 926                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 927                        if (is_journal_aborted(journal))
 928                                clear_buffer_jbddirty(bh);
 929                } else {
 930                        J_ASSERT_BH(bh, !buffer_dirty(bh));
 931                        /*
 932                         * The buffer on BJ_Forget list and not jbddirty means
 933                         * it has been freed by this transaction and hence it
 934                         * could not have been reallocated until this
 935                         * transaction has committed. *BUT* it could be
 936                         * reallocated once we have written all the data to
 937                         * disk and before we process the buffer on BJ_Forget
 938                         * list.
 939                         */
 940                        if (!jh->b_next_transaction)
 941                                try_to_free = 1;
 942                }
 943                JBUFFER_TRACE(jh, "refile or unfile buffer");
 944                __jbd2_journal_refile_buffer(jh);
 945                jbd_unlock_bh_state(bh);
 946                if (try_to_free)
 947                        release_buffer_page(bh);        /* Drops bh reference */
 948                else
 949                        __brelse(bh);
 950                cond_resched_lock(&journal->j_list_lock);
 951        }
 952        spin_unlock(&journal->j_list_lock);
 953        /*
 954         * This is a bit sleazy.  We use j_list_lock to protect transition
 955         * of a transaction into T_FINISHED state and calling
 956         * __jbd2_journal_drop_transaction(). Otherwise we could race with
 957         * other checkpointing code processing the transaction...
 958         */
 959        write_lock(&journal->j_state_lock);
 960        spin_lock(&journal->j_list_lock);
 961        /*
 962         * Now recheck if some buffers did not get attached to the transaction
 963         * while the lock was dropped...
 964         */
 965        if (commit_transaction->t_forget) {
 966                spin_unlock(&journal->j_list_lock);
 967                write_unlock(&journal->j_state_lock);
 968                goto restart_loop;
 969        }
 970
 971        /* Done with this transaction! */
 972
 973        jbd_debug(3, "JBD2: commit phase 7\n");
 974
 975        J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 976
 977        commit_transaction->t_start = jiffies;
 978        stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
 979                                              commit_transaction->t_start);
 980
 981        /*
 982         * File the transaction statistics
 983         */
 984        stats.ts_tid = commit_transaction->t_tid;
 985        stats.run.rs_handle_count =
 986                atomic_read(&commit_transaction->t_handle_count);
 987        trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
 988                             commit_transaction->t_tid, &stats.run);
 989
 990        /*
 991         * Calculate overall stats
 992         */
 993        spin_lock(&journal->j_history_lock);
 994        journal->j_stats.ts_tid++;
 995        journal->j_stats.run.rs_wait += stats.run.rs_wait;
 996        journal->j_stats.run.rs_running += stats.run.rs_running;
 997        journal->j_stats.run.rs_locked += stats.run.rs_locked;
 998        journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
 999        journal->j_stats.run.rs_logging += stats.run.rs_logging;
1000        journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1001        journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1002        journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1003        spin_unlock(&journal->j_history_lock);
1004
1005        commit_transaction->t_state = T_FINISHED;
1006        J_ASSERT(commit_transaction == journal->j_committing_transaction);
1007        journal->j_commit_sequence = commit_transaction->t_tid;
1008        journal->j_committing_transaction = NULL;
1009        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1010
1011        /*
1012         * weight the commit time higher than the average time so we don't
1013         * react too strongly to vast changes in the commit time
1014         */
1015        if (likely(journal->j_average_commit_time))
1016                journal->j_average_commit_time = (commit_time +
1017                                journal->j_average_commit_time*3) / 4;
1018        else
1019                journal->j_average_commit_time = commit_time;
1020        write_unlock(&journal->j_state_lock);
1021
1022        if (commit_transaction->t_checkpoint_list == NULL &&
1023            commit_transaction->t_checkpoint_io_list == NULL) {
1024                __jbd2_journal_drop_transaction(journal, commit_transaction);
1025                to_free = 1;
1026        } else {
1027                if (journal->j_checkpoint_transactions == NULL) {
1028                        journal->j_checkpoint_transactions = commit_transaction;
1029                        commit_transaction->t_cpnext = commit_transaction;
1030                        commit_transaction->t_cpprev = commit_transaction;
1031                } else {
1032                        commit_transaction->t_cpnext =
1033                                journal->j_checkpoint_transactions;
1034                        commit_transaction->t_cpprev =
1035                                commit_transaction->t_cpnext->t_cpprev;
1036                        commit_transaction->t_cpnext->t_cpprev =
1037                                commit_transaction;
1038                        commit_transaction->t_cpprev->t_cpnext =
1039                                commit_transaction;
1040                }
1041        }
1042        spin_unlock(&journal->j_list_lock);
1043
1044        if (journal->j_commit_callback)
1045                journal->j_commit_callback(journal, commit_transaction);
1046
1047        trace_jbd2_end_commit(journal, commit_transaction);
1048        jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1049                  journal->j_commit_sequence, journal->j_tail_sequence);
1050        if (to_free)
1051                kfree(commit_transaction);
1052
1053        wake_up(&journal->j_wait_done_commit);
1054}
1055